diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,70873 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.964806124969128, + "eval_steps": 500, + "global_step": 10120, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000395159298592245, + "grad_norm": 45.00149749159182, + "learning_rate": 1.0638297872340427e-08, + "loss": 5.3324, + "step": 1 + }, + { + "epoch": 0.00079031859718449, + "grad_norm": 47.65956613810862, + "learning_rate": 2.1276595744680853e-08, + "loss": 5.4979, + "step": 2 + }, + { + "epoch": 0.001185477895776735, + "grad_norm": 44.18618860656718, + "learning_rate": 3.191489361702128e-08, + "loss": 5.1904, + "step": 3 + }, + { + "epoch": 0.00158063719436898, + "grad_norm": 46.48562838932194, + "learning_rate": 4.2553191489361707e-08, + "loss": 5.3019, + "step": 4 + }, + { + "epoch": 0.001975796492961225, + "grad_norm": 46.390295089186544, + "learning_rate": 5.319148936170213e-08, + "loss": 5.3445, + "step": 5 + }, + { + "epoch": 0.00237095579155347, + "grad_norm": 46.76761442399584, + "learning_rate": 6.382978723404255e-08, + "loss": 5.3375, + "step": 6 + }, + { + "epoch": 0.002766115090145715, + "grad_norm": 48.279218165562085, + "learning_rate": 7.446808510638299e-08, + "loss": 5.3938, + "step": 7 + }, + { + "epoch": 0.00316127438873796, + "grad_norm": 49.205312835168, + "learning_rate": 8.510638297872341e-08, + "loss": 5.4594, + "step": 8 + }, + { + "epoch": 0.003556433687330205, + "grad_norm": 46.56040077093359, + "learning_rate": 9.574468085106384e-08, + "loss": 5.3369, + "step": 9 + }, + { + "epoch": 0.00395159298592245, + "grad_norm": 45.73670653221075, + "learning_rate": 1.0638297872340426e-07, + "loss": 5.28, + "step": 10 + }, + { + "epoch": 0.004346752284514695, + "grad_norm": 45.65613647161846, + "learning_rate": 1.1702127659574468e-07, + "loss": 5.4326, + "step": 11 + }, + { + "epoch": 0.00474191158310694, + "grad_norm": 48.50884423593061, + "learning_rate": 1.276595744680851e-07, + "loss": 5.4455, + "step": 12 + }, + { + "epoch": 0.005137070881699185, + "grad_norm": 48.08180573773191, + "learning_rate": 1.3829787234042553e-07, + "loss": 5.5208, + "step": 13 + }, + { + "epoch": 0.00553223018029143, + "grad_norm": 45.43412625685486, + "learning_rate": 1.4893617021276598e-07, + "loss": 5.3535, + "step": 14 + }, + { + "epoch": 0.005927389478883675, + "grad_norm": 46.86389365193829, + "learning_rate": 1.5957446808510638e-07, + "loss": 5.3197, + "step": 15 + }, + { + "epoch": 0.00632254877747592, + "grad_norm": 44.6253221018194, + "learning_rate": 1.7021276595744683e-07, + "loss": 5.4598, + "step": 16 + }, + { + "epoch": 0.006717708076068165, + "grad_norm": 45.02802440676611, + "learning_rate": 1.8085106382978722e-07, + "loss": 5.1813, + "step": 17 + }, + { + "epoch": 0.00711286737466041, + "grad_norm": 44.458003741695514, + "learning_rate": 1.9148936170212767e-07, + "loss": 5.299, + "step": 18 + }, + { + "epoch": 0.007508026673252655, + "grad_norm": 42.134533089902625, + "learning_rate": 2.0212765957446812e-07, + "loss": 5.0162, + "step": 19 + }, + { + "epoch": 0.0079031859718449, + "grad_norm": 45.3211377446349, + "learning_rate": 2.1276595744680852e-07, + "loss": 5.3317, + "step": 20 + }, + { + "epoch": 0.008298345270437145, + "grad_norm": 39.99349721590117, + "learning_rate": 2.2340425531914897e-07, + "loss": 5.1135, + "step": 21 + }, + { + "epoch": 0.00869350456902939, + "grad_norm": 38.01803171664136, + "learning_rate": 2.3404255319148937e-07, + "loss": 5.1482, + "step": 22 + }, + { + "epoch": 0.009088663867621635, + "grad_norm": 39.354061184796315, + "learning_rate": 2.446808510638298e-07, + "loss": 5.2229, + "step": 23 + }, + { + "epoch": 0.00948382316621388, + "grad_norm": 40.80324954040139, + "learning_rate": 2.553191489361702e-07, + "loss": 5.2, + "step": 24 + }, + { + "epoch": 0.009878982464806126, + "grad_norm": 38.10588841551195, + "learning_rate": 2.6595744680851066e-07, + "loss": 5.0376, + "step": 25 + }, + { + "epoch": 0.01027414176339837, + "grad_norm": 38.040044463686115, + "learning_rate": 2.7659574468085106e-07, + "loss": 5.234, + "step": 26 + }, + { + "epoch": 0.010669301061990615, + "grad_norm": 38.93658315055219, + "learning_rate": 2.872340425531915e-07, + "loss": 5.2858, + "step": 27 + }, + { + "epoch": 0.01106446036058286, + "grad_norm": 27.7819822121004, + "learning_rate": 2.9787234042553196e-07, + "loss": 4.8375, + "step": 28 + }, + { + "epoch": 0.011459619659175104, + "grad_norm": 27.368335853965625, + "learning_rate": 3.0851063829787236e-07, + "loss": 4.9516, + "step": 29 + }, + { + "epoch": 0.01185477895776735, + "grad_norm": 27.250102842852968, + "learning_rate": 3.1914893617021275e-07, + "loss": 4.926, + "step": 30 + }, + { + "epoch": 0.012249938256359595, + "grad_norm": 26.50459159867008, + "learning_rate": 3.297872340425532e-07, + "loss": 4.8223, + "step": 31 + }, + { + "epoch": 0.01264509755495184, + "grad_norm": 27.513101530119382, + "learning_rate": 3.4042553191489365e-07, + "loss": 5.0109, + "step": 32 + }, + { + "epoch": 0.013040256853544084, + "grad_norm": 26.6602907635092, + "learning_rate": 3.510638297872341e-07, + "loss": 4.8947, + "step": 33 + }, + { + "epoch": 0.01343541615213633, + "grad_norm": 23.204176582249318, + "learning_rate": 3.6170212765957445e-07, + "loss": 4.6691, + "step": 34 + }, + { + "epoch": 0.013830575450728575, + "grad_norm": 24.66170575750984, + "learning_rate": 3.723404255319149e-07, + "loss": 4.9299, + "step": 35 + }, + { + "epoch": 0.01422573474932082, + "grad_norm": 24.263834716366237, + "learning_rate": 3.8297872340425535e-07, + "loss": 4.9347, + "step": 36 + }, + { + "epoch": 0.014620894047913065, + "grad_norm": 23.965936128051734, + "learning_rate": 3.936170212765958e-07, + "loss": 4.8474, + "step": 37 + }, + { + "epoch": 0.01501605334650531, + "grad_norm": 18.157336850120906, + "learning_rate": 4.0425531914893625e-07, + "loss": 4.6366, + "step": 38 + }, + { + "epoch": 0.015411212645097556, + "grad_norm": 17.163104051613317, + "learning_rate": 4.148936170212766e-07, + "loss": 4.5056, + "step": 39 + }, + { + "epoch": 0.0158063719436898, + "grad_norm": 18.14853573769075, + "learning_rate": 4.2553191489361704e-07, + "loss": 4.5394, + "step": 40 + }, + { + "epoch": 0.016201531242282043, + "grad_norm": 17.031326941596244, + "learning_rate": 4.361702127659575e-07, + "loss": 4.5494, + "step": 41 + }, + { + "epoch": 0.01659669054087429, + "grad_norm": 16.530898546137706, + "learning_rate": 4.4680851063829794e-07, + "loss": 4.3963, + "step": 42 + }, + { + "epoch": 0.016991849839466534, + "grad_norm": 15.10305267365516, + "learning_rate": 4.574468085106383e-07, + "loss": 4.3983, + "step": 43 + }, + { + "epoch": 0.01738700913805878, + "grad_norm": 15.265493803692115, + "learning_rate": 4.6808510638297873e-07, + "loss": 4.3438, + "step": 44 + }, + { + "epoch": 0.017782168436651025, + "grad_norm": 14.891772118131469, + "learning_rate": 4.787234042553192e-07, + "loss": 4.356, + "step": 45 + }, + { + "epoch": 0.01817732773524327, + "grad_norm": 14.448417003857594, + "learning_rate": 4.893617021276596e-07, + "loss": 4.3505, + "step": 46 + }, + { + "epoch": 0.018572487033835516, + "grad_norm": 14.013807338161678, + "learning_rate": 5.000000000000001e-07, + "loss": 4.3902, + "step": 47 + }, + { + "epoch": 0.01896764633242776, + "grad_norm": 12.903706951798439, + "learning_rate": 5.106382978723404e-07, + "loss": 4.2129, + "step": 48 + }, + { + "epoch": 0.019362805631020004, + "grad_norm": 12.74392330179218, + "learning_rate": 5.212765957446809e-07, + "loss": 4.2245, + "step": 49 + }, + { + "epoch": 0.01975796492961225, + "grad_norm": 11.409859418409809, + "learning_rate": 5.319148936170213e-07, + "loss": 4.0668, + "step": 50 + }, + { + "epoch": 0.020153124228204495, + "grad_norm": 12.382991509917876, + "learning_rate": 5.425531914893618e-07, + "loss": 4.0922, + "step": 51 + }, + { + "epoch": 0.02054828352679674, + "grad_norm": 11.858955825560276, + "learning_rate": 5.531914893617021e-07, + "loss": 4.0067, + "step": 52 + }, + { + "epoch": 0.020943442825388986, + "grad_norm": 12.68072624432379, + "learning_rate": 5.638297872340426e-07, + "loss": 4.0186, + "step": 53 + }, + { + "epoch": 0.02133860212398123, + "grad_norm": 10.958508064800348, + "learning_rate": 5.74468085106383e-07, + "loss": 3.873, + "step": 54 + }, + { + "epoch": 0.021733761422573473, + "grad_norm": 10.842426212742, + "learning_rate": 5.851063829787235e-07, + "loss": 3.9817, + "step": 55 + }, + { + "epoch": 0.02212892072116572, + "grad_norm": 10.471356929993854, + "learning_rate": 5.957446808510639e-07, + "loss": 3.9189, + "step": 56 + }, + { + "epoch": 0.022524080019757965, + "grad_norm": 9.522689670857774, + "learning_rate": 6.063829787234043e-07, + "loss": 3.7485, + "step": 57 + }, + { + "epoch": 0.02291923931835021, + "grad_norm": 9.89336870076738, + "learning_rate": 6.170212765957447e-07, + "loss": 3.7502, + "step": 58 + }, + { + "epoch": 0.023314398616942456, + "grad_norm": 10.12284131423492, + "learning_rate": 6.276595744680851e-07, + "loss": 3.784, + "step": 59 + }, + { + "epoch": 0.0237095579155347, + "grad_norm": 9.549611583907577, + "learning_rate": 6.382978723404255e-07, + "loss": 3.7651, + "step": 60 + }, + { + "epoch": 0.024104717214126947, + "grad_norm": 9.025126475730096, + "learning_rate": 6.48936170212766e-07, + "loss": 3.6512, + "step": 61 + }, + { + "epoch": 0.02449987651271919, + "grad_norm": 8.597629129461852, + "learning_rate": 6.595744680851064e-07, + "loss": 3.7376, + "step": 62 + }, + { + "epoch": 0.024895035811311434, + "grad_norm": 8.345445781598853, + "learning_rate": 6.702127659574469e-07, + "loss": 3.4983, + "step": 63 + }, + { + "epoch": 0.02529019510990368, + "grad_norm": 7.990649600345105, + "learning_rate": 6.808510638297873e-07, + "loss": 3.5068, + "step": 64 + }, + { + "epoch": 0.025685354408495925, + "grad_norm": 7.513578806398596, + "learning_rate": 6.914893617021278e-07, + "loss": 3.5573, + "step": 65 + }, + { + "epoch": 0.02608051370708817, + "grad_norm": 7.9623181239477105, + "learning_rate": 7.021276595744682e-07, + "loss": 3.5052, + "step": 66 + }, + { + "epoch": 0.026475673005680416, + "grad_norm": 7.637547791371253, + "learning_rate": 7.127659574468087e-07, + "loss": 3.4709, + "step": 67 + }, + { + "epoch": 0.02687083230427266, + "grad_norm": 8.558533087051865, + "learning_rate": 7.234042553191489e-07, + "loss": 3.3483, + "step": 68 + }, + { + "epoch": 0.027265991602864904, + "grad_norm": 8.199113740225883, + "learning_rate": 7.340425531914893e-07, + "loss": 3.2996, + "step": 69 + }, + { + "epoch": 0.02766115090145715, + "grad_norm": 8.450241754459654, + "learning_rate": 7.446808510638298e-07, + "loss": 3.1916, + "step": 70 + }, + { + "epoch": 0.028056310200049395, + "grad_norm": 7.103052632657081, + "learning_rate": 7.553191489361702e-07, + "loss": 3.165, + "step": 71 + }, + { + "epoch": 0.02845146949864164, + "grad_norm": 7.397343059451777, + "learning_rate": 7.659574468085107e-07, + "loss": 3.1549, + "step": 72 + }, + { + "epoch": 0.028846628797233886, + "grad_norm": 7.532992807551275, + "learning_rate": 7.765957446808511e-07, + "loss": 3.0944, + "step": 73 + }, + { + "epoch": 0.02924178809582613, + "grad_norm": 6.784491281435092, + "learning_rate": 7.872340425531916e-07, + "loss": 3.0294, + "step": 74 + }, + { + "epoch": 0.029636947394418373, + "grad_norm": 7.083810212959109, + "learning_rate": 7.97872340425532e-07, + "loss": 3.0021, + "step": 75 + }, + { + "epoch": 0.03003210669301062, + "grad_norm": 6.443669804960258, + "learning_rate": 8.085106382978725e-07, + "loss": 2.9301, + "step": 76 + }, + { + "epoch": 0.030427265991602864, + "grad_norm": 6.650880145536403, + "learning_rate": 8.191489361702127e-07, + "loss": 2.9397, + "step": 77 + }, + { + "epoch": 0.03082242529019511, + "grad_norm": 6.36715615120205, + "learning_rate": 8.297872340425532e-07, + "loss": 2.8843, + "step": 78 + }, + { + "epoch": 0.031217584588787355, + "grad_norm": 6.238954181171991, + "learning_rate": 8.404255319148936e-07, + "loss": 2.7895, + "step": 79 + }, + { + "epoch": 0.0316127438873796, + "grad_norm": 6.232900546667891, + "learning_rate": 8.510638297872341e-07, + "loss": 2.8601, + "step": 80 + }, + { + "epoch": 0.032007903185971846, + "grad_norm": 5.726543181847973, + "learning_rate": 8.617021276595745e-07, + "loss": 2.6892, + "step": 81 + }, + { + "epoch": 0.03240306248456409, + "grad_norm": 5.502730599732169, + "learning_rate": 8.72340425531915e-07, + "loss": 2.6352, + "step": 82 + }, + { + "epoch": 0.032798221783156334, + "grad_norm": 5.739593573701677, + "learning_rate": 8.829787234042554e-07, + "loss": 2.6015, + "step": 83 + }, + { + "epoch": 0.03319338108174858, + "grad_norm": 5.6507901978379955, + "learning_rate": 8.936170212765959e-07, + "loss": 2.5505, + "step": 84 + }, + { + "epoch": 0.03358854038034083, + "grad_norm": 5.183242056909587, + "learning_rate": 9.042553191489363e-07, + "loss": 2.5001, + "step": 85 + }, + { + "epoch": 0.03398369967893307, + "grad_norm": 5.362205769595081, + "learning_rate": 9.148936170212766e-07, + "loss": 2.5267, + "step": 86 + }, + { + "epoch": 0.034378858977525316, + "grad_norm": 4.811942843640751, + "learning_rate": 9.25531914893617e-07, + "loss": 2.3826, + "step": 87 + }, + { + "epoch": 0.03477401827611756, + "grad_norm": 4.904542737672584, + "learning_rate": 9.361702127659575e-07, + "loss": 2.4234, + "step": 88 + }, + { + "epoch": 0.0351691775747098, + "grad_norm": 4.870107420594597, + "learning_rate": 9.468085106382979e-07, + "loss": 2.321, + "step": 89 + }, + { + "epoch": 0.03556433687330205, + "grad_norm": 4.581596683493078, + "learning_rate": 9.574468085106384e-07, + "loss": 2.2948, + "step": 90 + }, + { + "epoch": 0.0359594961718943, + "grad_norm": 4.661567248806833, + "learning_rate": 9.680851063829788e-07, + "loss": 2.2851, + "step": 91 + }, + { + "epoch": 0.03635465547048654, + "grad_norm": 4.1499647842616385, + "learning_rate": 9.787234042553193e-07, + "loss": 2.2358, + "step": 92 + }, + { + "epoch": 0.036749814769078785, + "grad_norm": 4.064054364491757, + "learning_rate": 9.893617021276597e-07, + "loss": 2.2021, + "step": 93 + }, + { + "epoch": 0.03714497406767103, + "grad_norm": 4.019497224170159, + "learning_rate": 1.0000000000000002e-06, + "loss": 2.1558, + "step": 94 + }, + { + "epoch": 0.03754013336626327, + "grad_norm": 3.7739064116455254, + "learning_rate": 1.0106382978723404e-06, + "loss": 2.145, + "step": 95 + }, + { + "epoch": 0.03793529266485552, + "grad_norm": 3.587043554533132, + "learning_rate": 1.0212765957446809e-06, + "loss": 2.0649, + "step": 96 + }, + { + "epoch": 0.03833045196344777, + "grad_norm": 3.5425961325072732, + "learning_rate": 1.0319148936170213e-06, + "loss": 2.0494, + "step": 97 + }, + { + "epoch": 0.03872561126204001, + "grad_norm": 3.4793118680889914, + "learning_rate": 1.0425531914893618e-06, + "loss": 1.9564, + "step": 98 + }, + { + "epoch": 0.039120770560632255, + "grad_norm": 3.480928663874862, + "learning_rate": 1.0531914893617022e-06, + "loss": 1.9516, + "step": 99 + }, + { + "epoch": 0.0395159298592245, + "grad_norm": 3.4270001249424427, + "learning_rate": 1.0638297872340427e-06, + "loss": 1.943, + "step": 100 + }, + { + "epoch": 0.03991108915781674, + "grad_norm": 3.142501786484995, + "learning_rate": 1.074468085106383e-06, + "loss": 1.858, + "step": 101 + }, + { + "epoch": 0.04030624845640899, + "grad_norm": 3.2413282091965376, + "learning_rate": 1.0851063829787236e-06, + "loss": 1.8773, + "step": 102 + }, + { + "epoch": 0.04070140775500124, + "grad_norm": 3.2180251637134223, + "learning_rate": 1.095744680851064e-06, + "loss": 1.7858, + "step": 103 + }, + { + "epoch": 0.04109656705359348, + "grad_norm": 3.0827158001171715, + "learning_rate": 1.1063829787234042e-06, + "loss": 1.8103, + "step": 104 + }, + { + "epoch": 0.041491726352185725, + "grad_norm": 3.0156067280676284, + "learning_rate": 1.1170212765957447e-06, + "loss": 1.7703, + "step": 105 + }, + { + "epoch": 0.04188688565077797, + "grad_norm": 3.1564655772648536, + "learning_rate": 1.1276595744680851e-06, + "loss": 1.7584, + "step": 106 + }, + { + "epoch": 0.04228204494937021, + "grad_norm": 2.8464417493034384, + "learning_rate": 1.1382978723404256e-06, + "loss": 1.7098, + "step": 107 + }, + { + "epoch": 0.04267720424796246, + "grad_norm": 2.979579920610495, + "learning_rate": 1.148936170212766e-06, + "loss": 1.6593, + "step": 108 + }, + { + "epoch": 0.04307236354655471, + "grad_norm": 2.7758712165539996, + "learning_rate": 1.1595744680851065e-06, + "loss": 1.6563, + "step": 109 + }, + { + "epoch": 0.04346752284514695, + "grad_norm": 2.5891620159779016, + "learning_rate": 1.170212765957447e-06, + "loss": 1.6248, + "step": 110 + }, + { + "epoch": 0.043862682143739194, + "grad_norm": 2.4867507899845473, + "learning_rate": 1.1808510638297874e-06, + "loss": 1.5999, + "step": 111 + }, + { + "epoch": 0.04425784144233144, + "grad_norm": 2.411241253335211, + "learning_rate": 1.1914893617021278e-06, + "loss": 1.6233, + "step": 112 + }, + { + "epoch": 0.04465300074092368, + "grad_norm": 2.675802892730842, + "learning_rate": 1.202127659574468e-06, + "loss": 1.5911, + "step": 113 + }, + { + "epoch": 0.04504816003951593, + "grad_norm": 2.4484713846419783, + "learning_rate": 1.2127659574468085e-06, + "loss": 1.552, + "step": 114 + }, + { + "epoch": 0.045443319338108176, + "grad_norm": 2.126895225481791, + "learning_rate": 1.223404255319149e-06, + "loss": 1.5116, + "step": 115 + }, + { + "epoch": 0.04583847863670042, + "grad_norm": 2.108035496774095, + "learning_rate": 1.2340425531914894e-06, + "loss": 1.5332, + "step": 116 + }, + { + "epoch": 0.046233637935292664, + "grad_norm": 2.170204737476494, + "learning_rate": 1.2446808510638299e-06, + "loss": 1.5099, + "step": 117 + }, + { + "epoch": 0.04662879723388491, + "grad_norm": 1.9897269791458945, + "learning_rate": 1.2553191489361701e-06, + "loss": 1.4747, + "step": 118 + }, + { + "epoch": 0.04702395653247716, + "grad_norm": 1.924882132114436, + "learning_rate": 1.2659574468085106e-06, + "loss": 1.4877, + "step": 119 + }, + { + "epoch": 0.0474191158310694, + "grad_norm": 2.1236023483825934, + "learning_rate": 1.276595744680851e-06, + "loss": 1.4593, + "step": 120 + }, + { + "epoch": 0.047814275129661646, + "grad_norm": 1.8555875106493023, + "learning_rate": 1.2872340425531915e-06, + "loss": 1.3795, + "step": 121 + }, + { + "epoch": 0.04820943442825389, + "grad_norm": 1.9400614964548064, + "learning_rate": 1.297872340425532e-06, + "loss": 1.404, + "step": 122 + }, + { + "epoch": 0.04860459372684613, + "grad_norm": 1.6699536554778585, + "learning_rate": 1.3085106382978724e-06, + "loss": 1.3696, + "step": 123 + }, + { + "epoch": 0.04899975302543838, + "grad_norm": 1.689620342600094, + "learning_rate": 1.3191489361702128e-06, + "loss": 1.344, + "step": 124 + }, + { + "epoch": 0.04939491232403063, + "grad_norm": 1.7509004836307809, + "learning_rate": 1.3297872340425533e-06, + "loss": 1.3462, + "step": 125 + }, + { + "epoch": 0.04979007162262287, + "grad_norm": 1.7664945294015308, + "learning_rate": 1.3404255319148937e-06, + "loss": 1.3477, + "step": 126 + }, + { + "epoch": 0.050185230921215115, + "grad_norm": 2.084264973245835, + "learning_rate": 1.3510638297872342e-06, + "loss": 1.3218, + "step": 127 + }, + { + "epoch": 0.05058039021980736, + "grad_norm": 1.5965337608217485, + "learning_rate": 1.3617021276595746e-06, + "loss": 1.3084, + "step": 128 + }, + { + "epoch": 0.0509755495183996, + "grad_norm": 1.619614456507732, + "learning_rate": 1.372340425531915e-06, + "loss": 1.2938, + "step": 129 + }, + { + "epoch": 0.05137070881699185, + "grad_norm": 1.8247931156223327, + "learning_rate": 1.3829787234042555e-06, + "loss": 1.2944, + "step": 130 + }, + { + "epoch": 0.0517658681155841, + "grad_norm": 1.580039976604606, + "learning_rate": 1.393617021276596e-06, + "loss": 1.2777, + "step": 131 + }, + { + "epoch": 0.05216102741417634, + "grad_norm": 1.509843143895781, + "learning_rate": 1.4042553191489364e-06, + "loss": 1.2841, + "step": 132 + }, + { + "epoch": 0.052556186712768585, + "grad_norm": 1.6756920394271753, + "learning_rate": 1.4148936170212769e-06, + "loss": 1.2419, + "step": 133 + }, + { + "epoch": 0.05295134601136083, + "grad_norm": 1.4623629043957789, + "learning_rate": 1.4255319148936173e-06, + "loss": 1.2077, + "step": 134 + }, + { + "epoch": 0.05334650530995307, + "grad_norm": 1.383626312375799, + "learning_rate": 1.4361702127659578e-06, + "loss": 1.2265, + "step": 135 + }, + { + "epoch": 0.05374166460854532, + "grad_norm": 1.7925919805177952, + "learning_rate": 1.4468085106382978e-06, + "loss": 1.2186, + "step": 136 + }, + { + "epoch": 0.05413682390713757, + "grad_norm": 1.3738854790506727, + "learning_rate": 1.4574468085106382e-06, + "loss": 1.2159, + "step": 137 + }, + { + "epoch": 0.05453198320572981, + "grad_norm": 1.4567831484040439, + "learning_rate": 1.4680851063829787e-06, + "loss": 1.1814, + "step": 138 + }, + { + "epoch": 0.054927142504322055, + "grad_norm": 1.3546122882454086, + "learning_rate": 1.4787234042553191e-06, + "loss": 1.1519, + "step": 139 + }, + { + "epoch": 0.0553223018029143, + "grad_norm": 1.3541293229496851, + "learning_rate": 1.4893617021276596e-06, + "loss": 1.1625, + "step": 140 + }, + { + "epoch": 0.05571746110150654, + "grad_norm": 1.5328490619353512, + "learning_rate": 1.5e-06, + "loss": 1.1564, + "step": 141 + }, + { + "epoch": 0.05611262040009879, + "grad_norm": 1.2335978296031311, + "learning_rate": 1.5106382978723405e-06, + "loss": 1.1294, + "step": 142 + }, + { + "epoch": 0.05650777969869104, + "grad_norm": 1.281543201298622, + "learning_rate": 1.521276595744681e-06, + "loss": 1.1539, + "step": 143 + }, + { + "epoch": 0.05690293899728328, + "grad_norm": 1.1347832342130402, + "learning_rate": 1.5319148936170214e-06, + "loss": 1.1475, + "step": 144 + }, + { + "epoch": 0.057298098295875524, + "grad_norm": 1.272401514207382, + "learning_rate": 1.5425531914893618e-06, + "loss": 1.1062, + "step": 145 + }, + { + "epoch": 0.05769325759446777, + "grad_norm": 1.1113360919213993, + "learning_rate": 1.5531914893617023e-06, + "loss": 1.0904, + "step": 146 + }, + { + "epoch": 0.05808841689306001, + "grad_norm": 1.2143209476879704, + "learning_rate": 1.5638297872340427e-06, + "loss": 1.1221, + "step": 147 + }, + { + "epoch": 0.05848357619165226, + "grad_norm": 1.1975123411456277, + "learning_rate": 1.5744680851063832e-06, + "loss": 1.1134, + "step": 148 + }, + { + "epoch": 0.058878735490244506, + "grad_norm": 1.1752282768767688, + "learning_rate": 1.5851063829787236e-06, + "loss": 1.1073, + "step": 149 + }, + { + "epoch": 0.059273894788836747, + "grad_norm": 1.1221206874706884, + "learning_rate": 1.595744680851064e-06, + "loss": 1.0863, + "step": 150 + }, + { + "epoch": 0.059669054087428994, + "grad_norm": 1.0719072351198553, + "learning_rate": 1.6063829787234045e-06, + "loss": 1.087, + "step": 151 + }, + { + "epoch": 0.06006421338602124, + "grad_norm": 1.0618749015123967, + "learning_rate": 1.617021276595745e-06, + "loss": 1.0724, + "step": 152 + }, + { + "epoch": 0.06045937268461349, + "grad_norm": 1.289175259046802, + "learning_rate": 1.6276595744680854e-06, + "loss": 1.0913, + "step": 153 + }, + { + "epoch": 0.06085453198320573, + "grad_norm": 1.1077238693214941, + "learning_rate": 1.6382978723404255e-06, + "loss": 1.0408, + "step": 154 + }, + { + "epoch": 0.061249691281797976, + "grad_norm": 1.3121363094248382, + "learning_rate": 1.648936170212766e-06, + "loss": 1.0649, + "step": 155 + }, + { + "epoch": 0.06164485058039022, + "grad_norm": 1.0047858271903711, + "learning_rate": 1.6595744680851064e-06, + "loss": 1.0339, + "step": 156 + }, + { + "epoch": 0.06204000987898246, + "grad_norm": 1.222322248701329, + "learning_rate": 1.6702127659574468e-06, + "loss": 1.0367, + "step": 157 + }, + { + "epoch": 0.06243516917757471, + "grad_norm": 0.9790824508239406, + "learning_rate": 1.6808510638297873e-06, + "loss": 1.0221, + "step": 158 + }, + { + "epoch": 0.06283032847616696, + "grad_norm": 1.067740944312297, + "learning_rate": 1.6914893617021277e-06, + "loss": 1.0334, + "step": 159 + }, + { + "epoch": 0.0632254877747592, + "grad_norm": 0.896625566421198, + "learning_rate": 1.7021276595744682e-06, + "loss": 1.0092, + "step": 160 + }, + { + "epoch": 0.06362064707335144, + "grad_norm": 0.9627618743848442, + "learning_rate": 1.7127659574468086e-06, + "loss": 1.0463, + "step": 161 + }, + { + "epoch": 0.06401580637194369, + "grad_norm": 1.0734207189542146, + "learning_rate": 1.723404255319149e-06, + "loss": 1.0392, + "step": 162 + }, + { + "epoch": 0.06441096567053593, + "grad_norm": 0.9349753594761937, + "learning_rate": 1.7340425531914895e-06, + "loss": 0.9892, + "step": 163 + }, + { + "epoch": 0.06480612496912817, + "grad_norm": 0.9980254538111385, + "learning_rate": 1.74468085106383e-06, + "loss": 0.9982, + "step": 164 + }, + { + "epoch": 0.06520128426772043, + "grad_norm": 1.0811827248370265, + "learning_rate": 1.7553191489361704e-06, + "loss": 1.0152, + "step": 165 + }, + { + "epoch": 0.06559644356631267, + "grad_norm": 0.9331981369264469, + "learning_rate": 1.7659574468085109e-06, + "loss": 0.98, + "step": 166 + }, + { + "epoch": 0.06599160286490491, + "grad_norm": 0.9206315543562278, + "learning_rate": 1.7765957446808513e-06, + "loss": 1.0212, + "step": 167 + }, + { + "epoch": 0.06638676216349716, + "grad_norm": 1.1010365241697404, + "learning_rate": 1.7872340425531918e-06, + "loss": 0.9817, + "step": 168 + }, + { + "epoch": 0.0667819214620894, + "grad_norm": 0.9070688042861361, + "learning_rate": 1.7978723404255322e-06, + "loss": 0.9974, + "step": 169 + }, + { + "epoch": 0.06717708076068166, + "grad_norm": 0.9259977243442546, + "learning_rate": 1.8085106382978727e-06, + "loss": 0.9979, + "step": 170 + }, + { + "epoch": 0.0675722400592739, + "grad_norm": 0.9016076582389854, + "learning_rate": 1.8191489361702131e-06, + "loss": 0.9735, + "step": 171 + }, + { + "epoch": 0.06796739935786614, + "grad_norm": 0.9114028424001606, + "learning_rate": 1.8297872340425531e-06, + "loss": 0.9806, + "step": 172 + }, + { + "epoch": 0.06836255865645839, + "grad_norm": 0.9002141700632048, + "learning_rate": 1.8404255319148936e-06, + "loss": 0.9922, + "step": 173 + }, + { + "epoch": 0.06875771795505063, + "grad_norm": 0.9028305681256402, + "learning_rate": 1.851063829787234e-06, + "loss": 0.9698, + "step": 174 + }, + { + "epoch": 0.06915287725364287, + "grad_norm": 0.8797232554167207, + "learning_rate": 1.8617021276595745e-06, + "loss": 0.9627, + "step": 175 + }, + { + "epoch": 0.06954803655223513, + "grad_norm": 0.8502350286865529, + "learning_rate": 1.872340425531915e-06, + "loss": 0.9788, + "step": 176 + }, + { + "epoch": 0.06994319585082737, + "grad_norm": 0.9168330903333247, + "learning_rate": 1.8829787234042554e-06, + "loss": 0.9566, + "step": 177 + }, + { + "epoch": 0.0703383551494196, + "grad_norm": 0.8633709197803816, + "learning_rate": 1.8936170212765958e-06, + "loss": 0.9403, + "step": 178 + }, + { + "epoch": 0.07073351444801186, + "grad_norm": 0.9428773416078605, + "learning_rate": 1.9042553191489363e-06, + "loss": 0.973, + "step": 179 + }, + { + "epoch": 0.0711286737466041, + "grad_norm": 0.8934209230197848, + "learning_rate": 1.9148936170212767e-06, + "loss": 0.9531, + "step": 180 + }, + { + "epoch": 0.07152383304519634, + "grad_norm": 0.891758559309114, + "learning_rate": 1.925531914893617e-06, + "loss": 0.9379, + "step": 181 + }, + { + "epoch": 0.0719189923437886, + "grad_norm": 0.8289117111620914, + "learning_rate": 1.9361702127659576e-06, + "loss": 0.9594, + "step": 182 + }, + { + "epoch": 0.07231415164238084, + "grad_norm": 1.0538786469036632, + "learning_rate": 1.946808510638298e-06, + "loss": 0.9784, + "step": 183 + }, + { + "epoch": 0.07270931094097308, + "grad_norm": 0.8224300832472103, + "learning_rate": 1.9574468085106385e-06, + "loss": 0.9571, + "step": 184 + }, + { + "epoch": 0.07310447023956533, + "grad_norm": 0.8493893795427978, + "learning_rate": 1.968085106382979e-06, + "loss": 0.9436, + "step": 185 + }, + { + "epoch": 0.07349962953815757, + "grad_norm": 0.7674702264899148, + "learning_rate": 1.9787234042553194e-06, + "loss": 0.9391, + "step": 186 + }, + { + "epoch": 0.07389478883674981, + "grad_norm": 0.7756609871194469, + "learning_rate": 1.98936170212766e-06, + "loss": 0.9097, + "step": 187 + }, + { + "epoch": 0.07428994813534207, + "grad_norm": 0.8401464399948625, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.9084, + "step": 188 + }, + { + "epoch": 0.0746851074339343, + "grad_norm": 0.800464792845907, + "learning_rate": 2.0106382978723408e-06, + "loss": 0.955, + "step": 189 + }, + { + "epoch": 0.07508026673252655, + "grad_norm": 0.7963237467038586, + "learning_rate": 2.021276595744681e-06, + "loss": 0.9077, + "step": 190 + }, + { + "epoch": 0.0754754260311188, + "grad_norm": 0.8597621463693865, + "learning_rate": 2.0319148936170213e-06, + "loss": 0.9145, + "step": 191 + }, + { + "epoch": 0.07587058532971104, + "grad_norm": 0.8089654571622219, + "learning_rate": 2.0425531914893617e-06, + "loss": 0.896, + "step": 192 + }, + { + "epoch": 0.07626574462830328, + "grad_norm": 0.8373533262314947, + "learning_rate": 2.053191489361702e-06, + "loss": 0.9412, + "step": 193 + }, + { + "epoch": 0.07666090392689554, + "grad_norm": 0.7388812923284674, + "learning_rate": 2.0638297872340426e-06, + "loss": 0.9099, + "step": 194 + }, + { + "epoch": 0.07705606322548778, + "grad_norm": 0.7430489194577857, + "learning_rate": 2.074468085106383e-06, + "loss": 0.8875, + "step": 195 + }, + { + "epoch": 0.07745122252408002, + "grad_norm": 0.9205612468817582, + "learning_rate": 2.0851063829787235e-06, + "loss": 0.8999, + "step": 196 + }, + { + "epoch": 0.07784638182267227, + "grad_norm": 0.7981256350175032, + "learning_rate": 2.095744680851064e-06, + "loss": 0.906, + "step": 197 + }, + { + "epoch": 0.07824154112126451, + "grad_norm": 0.934795287215565, + "learning_rate": 2.1063829787234044e-06, + "loss": 0.8908, + "step": 198 + }, + { + "epoch": 0.07863670041985675, + "grad_norm": 0.8225561510286254, + "learning_rate": 2.117021276595745e-06, + "loss": 0.901, + "step": 199 + }, + { + "epoch": 0.079031859718449, + "grad_norm": 0.814892859258292, + "learning_rate": 2.1276595744680853e-06, + "loss": 0.897, + "step": 200 + }, + { + "epoch": 0.07942701901704124, + "grad_norm": 0.9225501505784203, + "learning_rate": 2.1382978723404258e-06, + "loss": 0.9174, + "step": 201 + }, + { + "epoch": 0.07982217831563349, + "grad_norm": 0.7007925087166256, + "learning_rate": 2.148936170212766e-06, + "loss": 0.9093, + "step": 202 + }, + { + "epoch": 0.08021733761422574, + "grad_norm": 0.7613541888010653, + "learning_rate": 2.1595744680851067e-06, + "loss": 0.8826, + "step": 203 + }, + { + "epoch": 0.08061249691281798, + "grad_norm": 0.726817783841255, + "learning_rate": 2.170212765957447e-06, + "loss": 0.8738, + "step": 204 + }, + { + "epoch": 0.08100765621141022, + "grad_norm": 0.7037687083978766, + "learning_rate": 2.1808510638297876e-06, + "loss": 0.8989, + "step": 205 + }, + { + "epoch": 0.08140281551000247, + "grad_norm": 0.7363960547281249, + "learning_rate": 2.191489361702128e-06, + "loss": 0.8902, + "step": 206 + }, + { + "epoch": 0.08179797480859471, + "grad_norm": 0.7580280652504182, + "learning_rate": 2.2021276595744685e-06, + "loss": 0.9102, + "step": 207 + }, + { + "epoch": 0.08219313410718695, + "grad_norm": 0.7757175348061099, + "learning_rate": 2.2127659574468085e-06, + "loss": 0.8779, + "step": 208 + }, + { + "epoch": 0.08258829340577921, + "grad_norm": 0.7482271345832553, + "learning_rate": 2.223404255319149e-06, + "loss": 0.8865, + "step": 209 + }, + { + "epoch": 0.08298345270437145, + "grad_norm": 0.7398985474265629, + "learning_rate": 2.2340425531914894e-06, + "loss": 0.8801, + "step": 210 + }, + { + "epoch": 0.08337861200296369, + "grad_norm": 0.7323608656730606, + "learning_rate": 2.24468085106383e-06, + "loss": 0.9002, + "step": 211 + }, + { + "epoch": 0.08377377130155594, + "grad_norm": 0.8148667910929218, + "learning_rate": 2.2553191489361703e-06, + "loss": 0.8647, + "step": 212 + }, + { + "epoch": 0.08416893060014818, + "grad_norm": 0.7262927203241404, + "learning_rate": 2.2659574468085107e-06, + "loss": 0.8741, + "step": 213 + }, + { + "epoch": 0.08456408989874042, + "grad_norm": 0.8201240188887061, + "learning_rate": 2.276595744680851e-06, + "loss": 0.9041, + "step": 214 + }, + { + "epoch": 0.08495924919733268, + "grad_norm": 0.7492499292809485, + "learning_rate": 2.2872340425531916e-06, + "loss": 0.853, + "step": 215 + }, + { + "epoch": 0.08535440849592492, + "grad_norm": 0.7916341013954181, + "learning_rate": 2.297872340425532e-06, + "loss": 0.8871, + "step": 216 + }, + { + "epoch": 0.08574956779451716, + "grad_norm": 0.7520394996291526, + "learning_rate": 2.3085106382978725e-06, + "loss": 0.8562, + "step": 217 + }, + { + "epoch": 0.08614472709310941, + "grad_norm": 1.1281408516742564, + "learning_rate": 2.319148936170213e-06, + "loss": 0.8645, + "step": 218 + }, + { + "epoch": 0.08653988639170165, + "grad_norm": 0.8788573403694457, + "learning_rate": 2.3297872340425534e-06, + "loss": 0.8623, + "step": 219 + }, + { + "epoch": 0.0869350456902939, + "grad_norm": 0.867944142979568, + "learning_rate": 2.340425531914894e-06, + "loss": 0.8572, + "step": 220 + }, + { + "epoch": 0.08733020498888615, + "grad_norm": 0.7926936676598336, + "learning_rate": 2.3510638297872343e-06, + "loss": 0.8713, + "step": 221 + }, + { + "epoch": 0.08772536428747839, + "grad_norm": 0.76255556176971, + "learning_rate": 2.3617021276595748e-06, + "loss": 0.8494, + "step": 222 + }, + { + "epoch": 0.08812052358607063, + "grad_norm": 0.7600673269310736, + "learning_rate": 2.3723404255319152e-06, + "loss": 0.8482, + "step": 223 + }, + { + "epoch": 0.08851568288466288, + "grad_norm": 0.7112814516322407, + "learning_rate": 2.3829787234042557e-06, + "loss": 0.8533, + "step": 224 + }, + { + "epoch": 0.08891084218325512, + "grad_norm": 0.7036773729937497, + "learning_rate": 2.393617021276596e-06, + "loss": 0.847, + "step": 225 + }, + { + "epoch": 0.08930600148184736, + "grad_norm": 0.6590954821985295, + "learning_rate": 2.404255319148936e-06, + "loss": 0.8561, + "step": 226 + }, + { + "epoch": 0.08970116078043962, + "grad_norm": 0.7847232653490934, + "learning_rate": 2.4148936170212766e-06, + "loss": 0.8536, + "step": 227 + }, + { + "epoch": 0.09009632007903186, + "grad_norm": 0.6509851787949374, + "learning_rate": 2.425531914893617e-06, + "loss": 0.85, + "step": 228 + }, + { + "epoch": 0.0904914793776241, + "grad_norm": 0.648445158504499, + "learning_rate": 2.4361702127659575e-06, + "loss": 0.8012, + "step": 229 + }, + { + "epoch": 0.09088663867621635, + "grad_norm": 0.736497309836382, + "learning_rate": 2.446808510638298e-06, + "loss": 0.8639, + "step": 230 + }, + { + "epoch": 0.09128179797480859, + "grad_norm": 0.645898512592661, + "learning_rate": 2.4574468085106384e-06, + "loss": 0.8243, + "step": 231 + }, + { + "epoch": 0.09167695727340083, + "grad_norm": 0.67481419189577, + "learning_rate": 2.468085106382979e-06, + "loss": 0.8682, + "step": 232 + }, + { + "epoch": 0.09207211657199309, + "grad_norm": 0.7664263353306855, + "learning_rate": 2.4787234042553193e-06, + "loss": 0.8623, + "step": 233 + }, + { + "epoch": 0.09246727587058533, + "grad_norm": 0.6372201971565544, + "learning_rate": 2.4893617021276598e-06, + "loss": 0.8153, + "step": 234 + }, + { + "epoch": 0.09286243516917757, + "grad_norm": 0.7305559386754572, + "learning_rate": 2.5e-06, + "loss": 0.85, + "step": 235 + }, + { + "epoch": 0.09325759446776982, + "grad_norm": 0.8258720994595878, + "learning_rate": 2.5106382978723402e-06, + "loss": 0.8588, + "step": 236 + }, + { + "epoch": 0.09365275376636206, + "grad_norm": 0.762878968342536, + "learning_rate": 2.521276595744681e-06, + "loss": 0.8606, + "step": 237 + }, + { + "epoch": 0.09404791306495432, + "grad_norm": 0.7142096057042978, + "learning_rate": 2.531914893617021e-06, + "loss": 0.8337, + "step": 238 + }, + { + "epoch": 0.09444307236354656, + "grad_norm": 0.6622420389390998, + "learning_rate": 2.542553191489362e-06, + "loss": 0.8475, + "step": 239 + }, + { + "epoch": 0.0948382316621388, + "grad_norm": 0.7610481119780552, + "learning_rate": 2.553191489361702e-06, + "loss": 0.8637, + "step": 240 + }, + { + "epoch": 0.09523339096073105, + "grad_norm": 0.8093851995765269, + "learning_rate": 2.563829787234043e-06, + "loss": 0.8521, + "step": 241 + }, + { + "epoch": 0.09562855025932329, + "grad_norm": 0.6922332669648779, + "learning_rate": 2.574468085106383e-06, + "loss": 0.8318, + "step": 242 + }, + { + "epoch": 0.09602370955791553, + "grad_norm": 0.7693846197783168, + "learning_rate": 2.585106382978724e-06, + "loss": 0.8344, + "step": 243 + }, + { + "epoch": 0.09641886885650779, + "grad_norm": 0.827949068079929, + "learning_rate": 2.595744680851064e-06, + "loss": 0.8486, + "step": 244 + }, + { + "epoch": 0.09681402815510003, + "grad_norm": 0.6650613101391282, + "learning_rate": 2.6063829787234047e-06, + "loss": 0.8562, + "step": 245 + }, + { + "epoch": 0.09720918745369227, + "grad_norm": 0.8278434657863171, + "learning_rate": 2.6170212765957447e-06, + "loss": 0.8416, + "step": 246 + }, + { + "epoch": 0.09760434675228452, + "grad_norm": 0.7938328023685507, + "learning_rate": 2.6276595744680856e-06, + "loss": 0.8311, + "step": 247 + }, + { + "epoch": 0.09799950605087676, + "grad_norm": 0.6143610103260296, + "learning_rate": 2.6382978723404256e-06, + "loss": 0.8254, + "step": 248 + }, + { + "epoch": 0.098394665349469, + "grad_norm": 0.7619119256406987, + "learning_rate": 2.6489361702127665e-06, + "loss": 0.839, + "step": 249 + }, + { + "epoch": 0.09878982464806126, + "grad_norm": 0.7986196919326185, + "learning_rate": 2.6595744680851065e-06, + "loss": 0.8455, + "step": 250 + }, + { + "epoch": 0.0991849839466535, + "grad_norm": 0.6717154185196351, + "learning_rate": 2.6702127659574474e-06, + "loss": 0.8336, + "step": 251 + }, + { + "epoch": 0.09958014324524574, + "grad_norm": 0.7767129776930844, + "learning_rate": 2.6808510638297874e-06, + "loss": 0.8145, + "step": 252 + }, + { + "epoch": 0.09997530254383799, + "grad_norm": 0.6439894121807275, + "learning_rate": 2.6914893617021283e-06, + "loss": 0.8111, + "step": 253 + }, + { + "epoch": 0.10037046184243023, + "grad_norm": 0.607679779354355, + "learning_rate": 2.7021276595744683e-06, + "loss": 0.8291, + "step": 254 + }, + { + "epoch": 0.10076562114102247, + "grad_norm": 0.7464563131206438, + "learning_rate": 2.7127659574468084e-06, + "loss": 0.8017, + "step": 255 + }, + { + "epoch": 0.10116078043961473, + "grad_norm": 0.7095071407490194, + "learning_rate": 2.7234042553191492e-06, + "loss": 0.8497, + "step": 256 + }, + { + "epoch": 0.10155593973820697, + "grad_norm": 0.6459106720980804, + "learning_rate": 2.7340425531914893e-06, + "loss": 0.8405, + "step": 257 + }, + { + "epoch": 0.1019510990367992, + "grad_norm": 0.7520801305481896, + "learning_rate": 2.74468085106383e-06, + "loss": 0.8074, + "step": 258 + }, + { + "epoch": 0.10234625833539146, + "grad_norm": 0.7352594748784674, + "learning_rate": 2.75531914893617e-06, + "loss": 0.8185, + "step": 259 + }, + { + "epoch": 0.1027414176339837, + "grad_norm": 0.7017112677861721, + "learning_rate": 2.765957446808511e-06, + "loss": 0.8266, + "step": 260 + }, + { + "epoch": 0.10313657693257594, + "grad_norm": 0.761095345544042, + "learning_rate": 2.776595744680851e-06, + "loss": 0.7901, + "step": 261 + }, + { + "epoch": 0.1035317362311682, + "grad_norm": 0.7630856502083389, + "learning_rate": 2.787234042553192e-06, + "loss": 0.8038, + "step": 262 + }, + { + "epoch": 0.10392689552976044, + "grad_norm": 0.7132952976277227, + "learning_rate": 2.797872340425532e-06, + "loss": 0.8157, + "step": 263 + }, + { + "epoch": 0.10432205482835268, + "grad_norm": 0.900249895843058, + "learning_rate": 2.808510638297873e-06, + "loss": 0.8025, + "step": 264 + }, + { + "epoch": 0.10471721412694493, + "grad_norm": 0.6280451084365367, + "learning_rate": 2.819148936170213e-06, + "loss": 0.7995, + "step": 265 + }, + { + "epoch": 0.10511237342553717, + "grad_norm": 0.7227719924319213, + "learning_rate": 2.8297872340425537e-06, + "loss": 0.8428, + "step": 266 + }, + { + "epoch": 0.10550753272412941, + "grad_norm": 0.7918390149442235, + "learning_rate": 2.8404255319148938e-06, + "loss": 0.7932, + "step": 267 + }, + { + "epoch": 0.10590269202272166, + "grad_norm": 0.6180037290493834, + "learning_rate": 2.8510638297872346e-06, + "loss": 0.8234, + "step": 268 + }, + { + "epoch": 0.1062978513213139, + "grad_norm": 0.6527196921600206, + "learning_rate": 2.8617021276595747e-06, + "loss": 0.8047, + "step": 269 + }, + { + "epoch": 0.10669301061990615, + "grad_norm": 1.0085142638197158, + "learning_rate": 2.8723404255319155e-06, + "loss": 0.8164, + "step": 270 + }, + { + "epoch": 0.1070881699184984, + "grad_norm": 0.9812147195881408, + "learning_rate": 2.8829787234042556e-06, + "loss": 0.7965, + "step": 271 + }, + { + "epoch": 0.10748332921709064, + "grad_norm": 1.0448400314614967, + "learning_rate": 2.8936170212765956e-06, + "loss": 0.8107, + "step": 272 + }, + { + "epoch": 0.10787848851568288, + "grad_norm": 0.6984652148632842, + "learning_rate": 2.9042553191489365e-06, + "loss": 0.7865, + "step": 273 + }, + { + "epoch": 0.10827364781427513, + "grad_norm": 0.615065445787656, + "learning_rate": 2.9148936170212765e-06, + "loss": 0.7847, + "step": 274 + }, + { + "epoch": 0.10866880711286737, + "grad_norm": 0.8697531900111486, + "learning_rate": 2.9255319148936174e-06, + "loss": 0.8037, + "step": 275 + }, + { + "epoch": 0.10906396641145961, + "grad_norm": 0.6197520935871196, + "learning_rate": 2.9361702127659574e-06, + "loss": 0.8151, + "step": 276 + }, + { + "epoch": 0.10945912571005187, + "grad_norm": 0.8060011131830561, + "learning_rate": 2.9468085106382983e-06, + "loss": 0.7873, + "step": 277 + }, + { + "epoch": 0.10985428500864411, + "grad_norm": 0.6756833173956306, + "learning_rate": 2.9574468085106383e-06, + "loss": 0.8293, + "step": 278 + }, + { + "epoch": 0.11024944430723635, + "grad_norm": 0.6099851620784188, + "learning_rate": 2.968085106382979e-06, + "loss": 0.8241, + "step": 279 + }, + { + "epoch": 0.1106446036058286, + "grad_norm": 0.7283434437348605, + "learning_rate": 2.978723404255319e-06, + "loss": 0.7936, + "step": 280 + }, + { + "epoch": 0.11103976290442084, + "grad_norm": 0.6267015873151124, + "learning_rate": 2.98936170212766e-06, + "loss": 0.7962, + "step": 281 + }, + { + "epoch": 0.11143492220301308, + "grad_norm": 0.6116326742433275, + "learning_rate": 3e-06, + "loss": 0.8187, + "step": 282 + }, + { + "epoch": 0.11183008150160534, + "grad_norm": 0.7049851223926784, + "learning_rate": 3.010638297872341e-06, + "loss": 0.7831, + "step": 283 + }, + { + "epoch": 0.11222524080019758, + "grad_norm": 0.6257674496318532, + "learning_rate": 3.021276595744681e-06, + "loss": 0.7781, + "step": 284 + }, + { + "epoch": 0.11262040009878982, + "grad_norm": 0.6190650007298025, + "learning_rate": 3.031914893617022e-06, + "loss": 0.7847, + "step": 285 + }, + { + "epoch": 0.11301555939738207, + "grad_norm": 0.6089050965059565, + "learning_rate": 3.042553191489362e-06, + "loss": 0.7922, + "step": 286 + }, + { + "epoch": 0.11341071869597431, + "grad_norm": 0.6557347874370106, + "learning_rate": 3.0531914893617027e-06, + "loss": 0.7963, + "step": 287 + }, + { + "epoch": 0.11380587799456655, + "grad_norm": 0.605070002885156, + "learning_rate": 3.0638297872340428e-06, + "loss": 0.8013, + "step": 288 + }, + { + "epoch": 0.11420103729315881, + "grad_norm": 0.6311761000607505, + "learning_rate": 3.0744680851063836e-06, + "loss": 0.8005, + "step": 289 + }, + { + "epoch": 0.11459619659175105, + "grad_norm": 0.6053509981952528, + "learning_rate": 3.0851063829787237e-06, + "loss": 0.7998, + "step": 290 + }, + { + "epoch": 0.11499135589034329, + "grad_norm": 0.6235992193861427, + "learning_rate": 3.0957446808510637e-06, + "loss": 0.7804, + "step": 291 + }, + { + "epoch": 0.11538651518893554, + "grad_norm": 0.7433766960086654, + "learning_rate": 3.1063829787234046e-06, + "loss": 0.7867, + "step": 292 + }, + { + "epoch": 0.11578167448752778, + "grad_norm": 1.272372503947218, + "learning_rate": 3.1170212765957446e-06, + "loss": 0.7929, + "step": 293 + }, + { + "epoch": 0.11617683378612002, + "grad_norm": 0.8186585325087611, + "learning_rate": 3.1276595744680855e-06, + "loss": 0.7854, + "step": 294 + }, + { + "epoch": 0.11657199308471228, + "grad_norm": 0.6739199686171928, + "learning_rate": 3.1382978723404255e-06, + "loss": 0.805, + "step": 295 + }, + { + "epoch": 0.11696715238330452, + "grad_norm": 0.6306549835549222, + "learning_rate": 3.1489361702127664e-06, + "loss": 0.786, + "step": 296 + }, + { + "epoch": 0.11736231168189676, + "grad_norm": 0.7237829920169022, + "learning_rate": 3.1595744680851064e-06, + "loss": 0.7921, + "step": 297 + }, + { + "epoch": 0.11775747098048901, + "grad_norm": 0.6263525455488399, + "learning_rate": 3.1702127659574473e-06, + "loss": 0.7794, + "step": 298 + }, + { + "epoch": 0.11815263027908125, + "grad_norm": 0.6260917117859388, + "learning_rate": 3.1808510638297873e-06, + "loss": 0.7821, + "step": 299 + }, + { + "epoch": 0.11854778957767349, + "grad_norm": 0.6670782296472709, + "learning_rate": 3.191489361702128e-06, + "loss": 0.7889, + "step": 300 + }, + { + "epoch": 0.11894294887626575, + "grad_norm": 0.7021296376843668, + "learning_rate": 3.202127659574468e-06, + "loss": 0.7879, + "step": 301 + }, + { + "epoch": 0.11933810817485799, + "grad_norm": 0.6436206115473436, + "learning_rate": 3.212765957446809e-06, + "loss": 0.7463, + "step": 302 + }, + { + "epoch": 0.11973326747345023, + "grad_norm": 0.6581559963727542, + "learning_rate": 3.223404255319149e-06, + "loss": 0.7833, + "step": 303 + }, + { + "epoch": 0.12012842677204248, + "grad_norm": 0.6085378327556474, + "learning_rate": 3.23404255319149e-06, + "loss": 0.7761, + "step": 304 + }, + { + "epoch": 0.12052358607063472, + "grad_norm": 0.6332080516803438, + "learning_rate": 3.24468085106383e-06, + "loss": 0.7687, + "step": 305 + }, + { + "epoch": 0.12091874536922698, + "grad_norm": 0.5853426452014024, + "learning_rate": 3.255319148936171e-06, + "loss": 0.7878, + "step": 306 + }, + { + "epoch": 0.12131390466781922, + "grad_norm": 0.6223748575158505, + "learning_rate": 3.265957446808511e-06, + "loss": 0.7739, + "step": 307 + }, + { + "epoch": 0.12170906396641146, + "grad_norm": 0.6981138823006418, + "learning_rate": 3.276595744680851e-06, + "loss": 0.7695, + "step": 308 + }, + { + "epoch": 0.12210422326500371, + "grad_norm": 0.6663161701946723, + "learning_rate": 3.287234042553192e-06, + "loss": 0.8053, + "step": 309 + }, + { + "epoch": 0.12249938256359595, + "grad_norm": 0.5681680114078884, + "learning_rate": 3.297872340425532e-06, + "loss": 0.7752, + "step": 310 + }, + { + "epoch": 0.12289454186218819, + "grad_norm": 0.5858906412963043, + "learning_rate": 3.3085106382978727e-06, + "loss": 0.7916, + "step": 311 + }, + { + "epoch": 0.12328970116078045, + "grad_norm": 0.6534395144260531, + "learning_rate": 3.3191489361702127e-06, + "loss": 0.7912, + "step": 312 + }, + { + "epoch": 0.12368486045937269, + "grad_norm": 0.586198108795242, + "learning_rate": 3.3297872340425536e-06, + "loss": 0.7813, + "step": 313 + }, + { + "epoch": 0.12408001975796493, + "grad_norm": 0.647904927155932, + "learning_rate": 3.3404255319148936e-06, + "loss": 0.7936, + "step": 314 + }, + { + "epoch": 0.12447517905655718, + "grad_norm": 0.58805476345561, + "learning_rate": 3.3510638297872345e-06, + "loss": 0.7973, + "step": 315 + }, + { + "epoch": 0.12487033835514942, + "grad_norm": 0.5947946152867843, + "learning_rate": 3.3617021276595745e-06, + "loss": 0.7632, + "step": 316 + }, + { + "epoch": 0.12526549765374168, + "grad_norm": 0.6193828087421168, + "learning_rate": 3.3723404255319154e-06, + "loss": 0.7848, + "step": 317 + }, + { + "epoch": 0.12566065695233392, + "grad_norm": 0.6258831363958954, + "learning_rate": 3.3829787234042554e-06, + "loss": 0.7678, + "step": 318 + }, + { + "epoch": 0.12605581625092616, + "grad_norm": 0.5747152594546233, + "learning_rate": 3.3936170212765963e-06, + "loss": 0.7817, + "step": 319 + }, + { + "epoch": 0.1264509755495184, + "grad_norm": 0.731674333424822, + "learning_rate": 3.4042553191489363e-06, + "loss": 0.7643, + "step": 320 + }, + { + "epoch": 0.12684613484811064, + "grad_norm": 0.6529010150169795, + "learning_rate": 3.414893617021277e-06, + "loss": 0.7667, + "step": 321 + }, + { + "epoch": 0.12724129414670288, + "grad_norm": 0.7014365604795955, + "learning_rate": 3.4255319148936172e-06, + "loss": 0.7787, + "step": 322 + }, + { + "epoch": 0.12763645344529514, + "grad_norm": 0.5834784417943027, + "learning_rate": 3.436170212765958e-06, + "loss": 0.779, + "step": 323 + }, + { + "epoch": 0.12803161274388739, + "grad_norm": 0.7742370051056543, + "learning_rate": 3.446808510638298e-06, + "loss": 0.7733, + "step": 324 + }, + { + "epoch": 0.12842677204247963, + "grad_norm": 0.5865654907094333, + "learning_rate": 3.457446808510639e-06, + "loss": 0.7456, + "step": 325 + }, + { + "epoch": 0.12882193134107187, + "grad_norm": 0.682149646741674, + "learning_rate": 3.468085106382979e-06, + "loss": 0.7994, + "step": 326 + }, + { + "epoch": 0.1292170906396641, + "grad_norm": 0.6701640838897713, + "learning_rate": 3.478723404255319e-06, + "loss": 0.7757, + "step": 327 + }, + { + "epoch": 0.12961224993825635, + "grad_norm": 0.6614941446237965, + "learning_rate": 3.48936170212766e-06, + "loss": 0.7725, + "step": 328 + }, + { + "epoch": 0.13000740923684861, + "grad_norm": 0.6095415461958933, + "learning_rate": 3.5e-06, + "loss": 0.7653, + "step": 329 + }, + { + "epoch": 0.13040256853544085, + "grad_norm": 0.6190745963316048, + "learning_rate": 3.510638297872341e-06, + "loss": 0.7805, + "step": 330 + }, + { + "epoch": 0.1307977278340331, + "grad_norm": 0.6147443464940033, + "learning_rate": 3.521276595744681e-06, + "loss": 0.7763, + "step": 331 + }, + { + "epoch": 0.13119288713262534, + "grad_norm": 0.5866635120263384, + "learning_rate": 3.5319148936170217e-06, + "loss": 0.7698, + "step": 332 + }, + { + "epoch": 0.13158804643121758, + "grad_norm": 0.6186148281750584, + "learning_rate": 3.5425531914893617e-06, + "loss": 0.7771, + "step": 333 + }, + { + "epoch": 0.13198320572980982, + "grad_norm": 0.6620495658950574, + "learning_rate": 3.5531914893617026e-06, + "loss": 0.7622, + "step": 334 + }, + { + "epoch": 0.13237836502840208, + "grad_norm": 0.6402051904915239, + "learning_rate": 3.5638297872340426e-06, + "loss": 0.7734, + "step": 335 + }, + { + "epoch": 0.13277352432699432, + "grad_norm": 0.6309802907790648, + "learning_rate": 3.5744680851063835e-06, + "loss": 0.7665, + "step": 336 + }, + { + "epoch": 0.13316868362558656, + "grad_norm": 0.6408680224306361, + "learning_rate": 3.5851063829787235e-06, + "loss": 0.7686, + "step": 337 + }, + { + "epoch": 0.1335638429241788, + "grad_norm": 0.6160995866186305, + "learning_rate": 3.5957446808510644e-06, + "loss": 0.7373, + "step": 338 + }, + { + "epoch": 0.13395900222277105, + "grad_norm": 0.6840190156980204, + "learning_rate": 3.6063829787234044e-06, + "loss": 0.7648, + "step": 339 + }, + { + "epoch": 0.1343541615213633, + "grad_norm": 0.6581837356659743, + "learning_rate": 3.6170212765957453e-06, + "loss": 0.7531, + "step": 340 + }, + { + "epoch": 0.13474932081995555, + "grad_norm": 0.7148764476179186, + "learning_rate": 3.6276595744680853e-06, + "loss": 0.7819, + "step": 341 + }, + { + "epoch": 0.1351444801185478, + "grad_norm": 0.6600259458370601, + "learning_rate": 3.6382978723404262e-06, + "loss": 0.7842, + "step": 342 + }, + { + "epoch": 0.13553963941714003, + "grad_norm": 0.6126556516156064, + "learning_rate": 3.6489361702127662e-06, + "loss": 0.7639, + "step": 343 + }, + { + "epoch": 0.13593479871573227, + "grad_norm": 0.6672986551005967, + "learning_rate": 3.6595744680851063e-06, + "loss": 0.7592, + "step": 344 + }, + { + "epoch": 0.13632995801432451, + "grad_norm": 0.6223043791589471, + "learning_rate": 3.670212765957447e-06, + "loss": 0.7564, + "step": 345 + }, + { + "epoch": 0.13672511731291678, + "grad_norm": 0.8110409579655677, + "learning_rate": 3.680851063829787e-06, + "loss": 0.7651, + "step": 346 + }, + { + "epoch": 0.13712027661150902, + "grad_norm": 0.6581211301407619, + "learning_rate": 3.691489361702128e-06, + "loss": 0.7611, + "step": 347 + }, + { + "epoch": 0.13751543591010126, + "grad_norm": 0.6744339312700035, + "learning_rate": 3.702127659574468e-06, + "loss": 0.7613, + "step": 348 + }, + { + "epoch": 0.1379105952086935, + "grad_norm": 0.605969858747699, + "learning_rate": 3.712765957446809e-06, + "loss": 0.7847, + "step": 349 + }, + { + "epoch": 0.13830575450728574, + "grad_norm": 0.6796481089227777, + "learning_rate": 3.723404255319149e-06, + "loss": 0.7698, + "step": 350 + }, + { + "epoch": 0.13870091380587798, + "grad_norm": 0.6445216487073453, + "learning_rate": 3.73404255319149e-06, + "loss": 0.7452, + "step": 351 + }, + { + "epoch": 0.13909607310447025, + "grad_norm": 0.5848343681445489, + "learning_rate": 3.74468085106383e-06, + "loss": 0.7532, + "step": 352 + }, + { + "epoch": 0.1394912324030625, + "grad_norm": 0.686482496408356, + "learning_rate": 3.7553191489361707e-06, + "loss": 0.745, + "step": 353 + }, + { + "epoch": 0.13988639170165473, + "grad_norm": 0.6484269721297492, + "learning_rate": 3.7659574468085108e-06, + "loss": 0.7408, + "step": 354 + }, + { + "epoch": 0.14028155100024697, + "grad_norm": 0.5809696958364097, + "learning_rate": 3.7765957446808516e-06, + "loss": 0.7471, + "step": 355 + }, + { + "epoch": 0.1406767102988392, + "grad_norm": 0.6182389029108604, + "learning_rate": 3.7872340425531917e-06, + "loss": 0.7654, + "step": 356 + }, + { + "epoch": 0.14107186959743145, + "grad_norm": 0.6745002737038325, + "learning_rate": 3.7978723404255325e-06, + "loss": 0.7604, + "step": 357 + }, + { + "epoch": 0.14146702889602372, + "grad_norm": 0.7457299483196483, + "learning_rate": 3.8085106382978726e-06, + "loss": 0.771, + "step": 358 + }, + { + "epoch": 0.14186218819461596, + "grad_norm": 0.626223558343114, + "learning_rate": 3.819148936170213e-06, + "loss": 0.7856, + "step": 359 + }, + { + "epoch": 0.1422573474932082, + "grad_norm": 0.6297264134042032, + "learning_rate": 3.8297872340425535e-06, + "loss": 0.748, + "step": 360 + }, + { + "epoch": 0.14265250679180044, + "grad_norm": 0.5951480073799157, + "learning_rate": 3.840425531914894e-06, + "loss": 0.7782, + "step": 361 + }, + { + "epoch": 0.14304766609039268, + "grad_norm": 0.5886768213260241, + "learning_rate": 3.851063829787234e-06, + "loss": 0.7455, + "step": 362 + }, + { + "epoch": 0.14344282538898492, + "grad_norm": 0.673440091158271, + "learning_rate": 3.861702127659575e-06, + "loss": 0.7419, + "step": 363 + }, + { + "epoch": 0.1438379846875772, + "grad_norm": 0.6400671142527239, + "learning_rate": 3.872340425531915e-06, + "loss": 0.7648, + "step": 364 + }, + { + "epoch": 0.14423314398616943, + "grad_norm": 0.6378848588398233, + "learning_rate": 3.882978723404256e-06, + "loss": 0.764, + "step": 365 + }, + { + "epoch": 0.14462830328476167, + "grad_norm": 0.5855533735447821, + "learning_rate": 3.893617021276596e-06, + "loss": 0.755, + "step": 366 + }, + { + "epoch": 0.1450234625833539, + "grad_norm": 0.5988399094182794, + "learning_rate": 3.904255319148937e-06, + "loss": 0.7717, + "step": 367 + }, + { + "epoch": 0.14541862188194615, + "grad_norm": 0.5909689175761067, + "learning_rate": 3.914893617021277e-06, + "loss": 0.7305, + "step": 368 + }, + { + "epoch": 0.1458137811805384, + "grad_norm": 0.6091829357239786, + "learning_rate": 3.9255319148936175e-06, + "loss": 0.7582, + "step": 369 + }, + { + "epoch": 0.14620894047913066, + "grad_norm": 0.663665703959751, + "learning_rate": 3.936170212765958e-06, + "loss": 0.7634, + "step": 370 + }, + { + "epoch": 0.1466040997777229, + "grad_norm": 0.6131032136468707, + "learning_rate": 3.946808510638298e-06, + "loss": 0.7352, + "step": 371 + }, + { + "epoch": 0.14699925907631514, + "grad_norm": 0.6438458775790014, + "learning_rate": 3.957446808510639e-06, + "loss": 0.7558, + "step": 372 + }, + { + "epoch": 0.14739441837490738, + "grad_norm": 0.6221026846089203, + "learning_rate": 3.968085106382979e-06, + "loss": 0.7483, + "step": 373 + }, + { + "epoch": 0.14778957767349962, + "grad_norm": 0.718970949273493, + "learning_rate": 3.97872340425532e-06, + "loss": 0.7582, + "step": 374 + }, + { + "epoch": 0.14818473697209186, + "grad_norm": 0.7632958625164173, + "learning_rate": 3.98936170212766e-06, + "loss": 0.7487, + "step": 375 + }, + { + "epoch": 0.14857989627068413, + "grad_norm": 0.6975538226598186, + "learning_rate": 4.000000000000001e-06, + "loss": 0.7548, + "step": 376 + }, + { + "epoch": 0.14897505556927637, + "grad_norm": 1.011413729830943, + "learning_rate": 4.010638297872341e-06, + "loss": 0.7355, + "step": 377 + }, + { + "epoch": 0.1493702148678686, + "grad_norm": 0.6337564143926464, + "learning_rate": 4.0212765957446816e-06, + "loss": 0.7519, + "step": 378 + }, + { + "epoch": 0.14976537416646085, + "grad_norm": 0.668831153796517, + "learning_rate": 4.031914893617022e-06, + "loss": 0.7624, + "step": 379 + }, + { + "epoch": 0.1501605334650531, + "grad_norm": 0.6436926511990773, + "learning_rate": 4.042553191489362e-06, + "loss": 0.7474, + "step": 380 + }, + { + "epoch": 0.15055569276364533, + "grad_norm": 0.6272202096101458, + "learning_rate": 4.053191489361702e-06, + "loss": 0.7602, + "step": 381 + }, + { + "epoch": 0.1509508520622376, + "grad_norm": 0.6212797540013587, + "learning_rate": 4.0638297872340425e-06, + "loss": 0.7772, + "step": 382 + }, + { + "epoch": 0.15134601136082984, + "grad_norm": 0.6289946916321798, + "learning_rate": 4.074468085106383e-06, + "loss": 0.7689, + "step": 383 + }, + { + "epoch": 0.15174117065942208, + "grad_norm": 0.6380191041059411, + "learning_rate": 4.085106382978723e-06, + "loss": 0.7572, + "step": 384 + }, + { + "epoch": 0.15213632995801432, + "grad_norm": 0.5779795688082269, + "learning_rate": 4.095744680851064e-06, + "loss": 0.7546, + "step": 385 + }, + { + "epoch": 0.15253148925660656, + "grad_norm": 0.7842430533418746, + "learning_rate": 4.106382978723404e-06, + "loss": 0.7751, + "step": 386 + }, + { + "epoch": 0.1529266485551988, + "grad_norm": 0.6729893760136558, + "learning_rate": 4.117021276595745e-06, + "loss": 0.7455, + "step": 387 + }, + { + "epoch": 0.15332180785379107, + "grad_norm": 0.5882912759438818, + "learning_rate": 4.127659574468085e-06, + "loss": 0.7696, + "step": 388 + }, + { + "epoch": 0.1537169671523833, + "grad_norm": 0.6227877790472663, + "learning_rate": 4.138297872340426e-06, + "loss": 0.7608, + "step": 389 + }, + { + "epoch": 0.15411212645097555, + "grad_norm": 0.6619432056903835, + "learning_rate": 4.148936170212766e-06, + "loss": 0.7473, + "step": 390 + }, + { + "epoch": 0.1545072857495678, + "grad_norm": 0.628215165860546, + "learning_rate": 4.1595744680851066e-06, + "loss": 0.7489, + "step": 391 + }, + { + "epoch": 0.15490244504816003, + "grad_norm": 0.5841047919949067, + "learning_rate": 4.170212765957447e-06, + "loss": 0.7447, + "step": 392 + }, + { + "epoch": 0.15529760434675227, + "grad_norm": 0.5825058321729848, + "learning_rate": 4.1808510638297875e-06, + "loss": 0.7518, + "step": 393 + }, + { + "epoch": 0.15569276364534454, + "grad_norm": 0.5989180700052267, + "learning_rate": 4.191489361702128e-06, + "loss": 0.7463, + "step": 394 + }, + { + "epoch": 0.15608792294393678, + "grad_norm": 0.5485054555998863, + "learning_rate": 4.202127659574468e-06, + "loss": 0.7254, + "step": 395 + }, + { + "epoch": 0.15648308224252902, + "grad_norm": 0.646716189095031, + "learning_rate": 4.212765957446809e-06, + "loss": 0.7583, + "step": 396 + }, + { + "epoch": 0.15687824154112126, + "grad_norm": 0.6866055568311499, + "learning_rate": 4.223404255319149e-06, + "loss": 0.7372, + "step": 397 + }, + { + "epoch": 0.1572734008397135, + "grad_norm": 0.7140385676935366, + "learning_rate": 4.23404255319149e-06, + "loss": 0.739, + "step": 398 + }, + { + "epoch": 0.15766856013830574, + "grad_norm": 0.643023278201698, + "learning_rate": 4.24468085106383e-06, + "loss": 0.7522, + "step": 399 + }, + { + "epoch": 0.158063719436898, + "grad_norm": 0.6312967460011237, + "learning_rate": 4.255319148936171e-06, + "loss": 0.7676, + "step": 400 + }, + { + "epoch": 0.15845887873549025, + "grad_norm": 0.6382138110089172, + "learning_rate": 4.265957446808511e-06, + "loss": 0.7629, + "step": 401 + }, + { + "epoch": 0.1588540380340825, + "grad_norm": 0.6332459333263118, + "learning_rate": 4.2765957446808515e-06, + "loss": 0.7311, + "step": 402 + }, + { + "epoch": 0.15924919733267473, + "grad_norm": 0.6412487468227186, + "learning_rate": 4.287234042553192e-06, + "loss": 0.7567, + "step": 403 + }, + { + "epoch": 0.15964435663126697, + "grad_norm": 0.7012222995798256, + "learning_rate": 4.297872340425532e-06, + "loss": 0.7436, + "step": 404 + }, + { + "epoch": 0.1600395159298592, + "grad_norm": 0.6181464501586335, + "learning_rate": 4.308510638297873e-06, + "loss": 0.7298, + "step": 405 + }, + { + "epoch": 0.16043467522845148, + "grad_norm": 0.6207970587414706, + "learning_rate": 4.319148936170213e-06, + "loss": 0.7376, + "step": 406 + }, + { + "epoch": 0.16082983452704372, + "grad_norm": 0.6106494887539161, + "learning_rate": 4.329787234042554e-06, + "loss": 0.7613, + "step": 407 + }, + { + "epoch": 0.16122499382563596, + "grad_norm": 0.6219536291754221, + "learning_rate": 4.340425531914894e-06, + "loss": 0.7397, + "step": 408 + }, + { + "epoch": 0.1616201531242282, + "grad_norm": 0.6668879597496847, + "learning_rate": 4.351063829787235e-06, + "loss": 0.7469, + "step": 409 + }, + { + "epoch": 0.16201531242282044, + "grad_norm": 0.6048928125977155, + "learning_rate": 4.361702127659575e-06, + "loss": 0.7184, + "step": 410 + }, + { + "epoch": 0.1624104717214127, + "grad_norm": 0.6204217080705178, + "learning_rate": 4.3723404255319156e-06, + "loss": 0.7433, + "step": 411 + }, + { + "epoch": 0.16280563102000495, + "grad_norm": 0.6071870434011859, + "learning_rate": 4.382978723404256e-06, + "loss": 0.7522, + "step": 412 + }, + { + "epoch": 0.1632007903185972, + "grad_norm": 0.6179635375795429, + "learning_rate": 4.3936170212765965e-06, + "loss": 0.7521, + "step": 413 + }, + { + "epoch": 0.16359594961718943, + "grad_norm": 0.8065353676281373, + "learning_rate": 4.404255319148937e-06, + "loss": 0.7371, + "step": 414 + }, + { + "epoch": 0.16399110891578167, + "grad_norm": 0.7223783073823268, + "learning_rate": 4.414893617021277e-06, + "loss": 0.749, + "step": 415 + }, + { + "epoch": 0.1643862682143739, + "grad_norm": 0.6418436008798059, + "learning_rate": 4.425531914893617e-06, + "loss": 0.7074, + "step": 416 + }, + { + "epoch": 0.16478142751296618, + "grad_norm": 0.649001786754692, + "learning_rate": 4.436170212765957e-06, + "loss": 0.7409, + "step": 417 + }, + { + "epoch": 0.16517658681155842, + "grad_norm": 0.6488082324591959, + "learning_rate": 4.446808510638298e-06, + "loss": 0.7294, + "step": 418 + }, + { + "epoch": 0.16557174611015066, + "grad_norm": 0.6815680138497775, + "learning_rate": 4.457446808510638e-06, + "loss": 0.7516, + "step": 419 + }, + { + "epoch": 0.1659669054087429, + "grad_norm": 0.627178650684025, + "learning_rate": 4.468085106382979e-06, + "loss": 0.7462, + "step": 420 + }, + { + "epoch": 0.16636206470733514, + "grad_norm": 0.6847077790393925, + "learning_rate": 4.478723404255319e-06, + "loss": 0.7642, + "step": 421 + }, + { + "epoch": 0.16675722400592738, + "grad_norm": 0.6677432032398735, + "learning_rate": 4.48936170212766e-06, + "loss": 0.7376, + "step": 422 + }, + { + "epoch": 0.16715238330451965, + "grad_norm": 0.6622301887078457, + "learning_rate": 4.5e-06, + "loss": 0.7281, + "step": 423 + }, + { + "epoch": 0.1675475426031119, + "grad_norm": 0.7032852662429785, + "learning_rate": 4.5106382978723406e-06, + "loss": 0.7295, + "step": 424 + }, + { + "epoch": 0.16794270190170413, + "grad_norm": 0.9182307524223992, + "learning_rate": 4.521276595744681e-06, + "loss": 0.7251, + "step": 425 + }, + { + "epoch": 0.16833786120029637, + "grad_norm": 0.6549474458937882, + "learning_rate": 4.5319148936170215e-06, + "loss": 0.7393, + "step": 426 + }, + { + "epoch": 0.1687330204988886, + "grad_norm": 0.6260937965574087, + "learning_rate": 4.542553191489362e-06, + "loss": 0.7466, + "step": 427 + }, + { + "epoch": 0.16912817979748085, + "grad_norm": 0.6098749910337613, + "learning_rate": 4.553191489361702e-06, + "loss": 0.7429, + "step": 428 + }, + { + "epoch": 0.16952333909607312, + "grad_norm": 0.6258409309068056, + "learning_rate": 4.563829787234043e-06, + "loss": 0.7437, + "step": 429 + }, + { + "epoch": 0.16991849839466536, + "grad_norm": 0.6998727053444407, + "learning_rate": 4.574468085106383e-06, + "loss": 0.7349, + "step": 430 + }, + { + "epoch": 0.1703136576932576, + "grad_norm": 0.635839700174999, + "learning_rate": 4.585106382978724e-06, + "loss": 0.742, + "step": 431 + }, + { + "epoch": 0.17070881699184984, + "grad_norm": 0.6881295298901814, + "learning_rate": 4.595744680851064e-06, + "loss": 0.7635, + "step": 432 + }, + { + "epoch": 0.17110397629044208, + "grad_norm": 0.6138576739147359, + "learning_rate": 4.606382978723405e-06, + "loss": 0.7215, + "step": 433 + }, + { + "epoch": 0.17149913558903432, + "grad_norm": 0.6786451341607185, + "learning_rate": 4.617021276595745e-06, + "loss": 0.746, + "step": 434 + }, + { + "epoch": 0.1718942948876266, + "grad_norm": 0.6085640309923145, + "learning_rate": 4.6276595744680855e-06, + "loss": 0.7201, + "step": 435 + }, + { + "epoch": 0.17228945418621883, + "grad_norm": 0.6543509049521442, + "learning_rate": 4.638297872340426e-06, + "loss": 0.7376, + "step": 436 + }, + { + "epoch": 0.17268461348481107, + "grad_norm": 0.6281733392631065, + "learning_rate": 4.648936170212766e-06, + "loss": 0.7386, + "step": 437 + }, + { + "epoch": 0.1730797727834033, + "grad_norm": 0.5766921427888596, + "learning_rate": 4.659574468085107e-06, + "loss": 0.7344, + "step": 438 + }, + { + "epoch": 0.17347493208199555, + "grad_norm": 0.6238350960095946, + "learning_rate": 4.670212765957447e-06, + "loss": 0.7401, + "step": 439 + }, + { + "epoch": 0.1738700913805878, + "grad_norm": 0.6308341725603104, + "learning_rate": 4.680851063829788e-06, + "loss": 0.7311, + "step": 440 + }, + { + "epoch": 0.17426525067918006, + "grad_norm": 0.6418577607256782, + "learning_rate": 4.691489361702128e-06, + "loss": 0.7234, + "step": 441 + }, + { + "epoch": 0.1746604099777723, + "grad_norm": 0.5854790033107359, + "learning_rate": 4.702127659574469e-06, + "loss": 0.7323, + "step": 442 + }, + { + "epoch": 0.17505556927636454, + "grad_norm": 0.6823496981208785, + "learning_rate": 4.712765957446809e-06, + "loss": 0.7323, + "step": 443 + }, + { + "epoch": 0.17545072857495678, + "grad_norm": 0.6175278963352572, + "learning_rate": 4.7234042553191496e-06, + "loss": 0.7147, + "step": 444 + }, + { + "epoch": 0.17584588787354902, + "grad_norm": 0.7360888497887278, + "learning_rate": 4.73404255319149e-06, + "loss": 0.7348, + "step": 445 + }, + { + "epoch": 0.17624104717214126, + "grad_norm": 0.6230101684392804, + "learning_rate": 4.7446808510638305e-06, + "loss": 0.7119, + "step": 446 + }, + { + "epoch": 0.17663620647073353, + "grad_norm": 0.6554804476316878, + "learning_rate": 4.755319148936171e-06, + "loss": 0.7294, + "step": 447 + }, + { + "epoch": 0.17703136576932577, + "grad_norm": 0.8564206801667128, + "learning_rate": 4.765957446808511e-06, + "loss": 0.7423, + "step": 448 + }, + { + "epoch": 0.177426525067918, + "grad_norm": 1.3629336168086166, + "learning_rate": 4.776595744680852e-06, + "loss": 0.7484, + "step": 449 + }, + { + "epoch": 0.17782168436651025, + "grad_norm": 0.7691195276316329, + "learning_rate": 4.787234042553192e-06, + "loss": 0.7097, + "step": 450 + }, + { + "epoch": 0.1782168436651025, + "grad_norm": 0.6194923429303832, + "learning_rate": 4.797872340425533e-06, + "loss": 0.7285, + "step": 451 + }, + { + "epoch": 0.17861200296369473, + "grad_norm": 0.673233699921991, + "learning_rate": 4.808510638297872e-06, + "loss": 0.7278, + "step": 452 + }, + { + "epoch": 0.179007162262287, + "grad_norm": 0.6766336054718974, + "learning_rate": 4.819148936170213e-06, + "loss": 0.7356, + "step": 453 + }, + { + "epoch": 0.17940232156087924, + "grad_norm": 0.57599098500566, + "learning_rate": 4.829787234042553e-06, + "loss": 0.7115, + "step": 454 + }, + { + "epoch": 0.17979748085947148, + "grad_norm": 0.8080267118445753, + "learning_rate": 4.840425531914894e-06, + "loss": 0.7534, + "step": 455 + }, + { + "epoch": 0.18019264015806372, + "grad_norm": 0.603929876457928, + "learning_rate": 4.851063829787234e-06, + "loss": 0.7105, + "step": 456 + }, + { + "epoch": 0.18058779945665596, + "grad_norm": 0.643703372655632, + "learning_rate": 4.8617021276595746e-06, + "loss": 0.7279, + "step": 457 + }, + { + "epoch": 0.1809829587552482, + "grad_norm": 0.8198396590432799, + "learning_rate": 4.872340425531915e-06, + "loss": 0.7198, + "step": 458 + }, + { + "epoch": 0.18137811805384046, + "grad_norm": 0.65771945253315, + "learning_rate": 4.8829787234042555e-06, + "loss": 0.7113, + "step": 459 + }, + { + "epoch": 0.1817732773524327, + "grad_norm": 0.6963361768452864, + "learning_rate": 4.893617021276596e-06, + "loss": 0.74, + "step": 460 + }, + { + "epoch": 0.18216843665102495, + "grad_norm": 0.6261965671131124, + "learning_rate": 4.904255319148936e-06, + "loss": 0.7126, + "step": 461 + }, + { + "epoch": 0.18256359594961719, + "grad_norm": 0.6148512179719411, + "learning_rate": 4.914893617021277e-06, + "loss": 0.726, + "step": 462 + }, + { + "epoch": 0.18295875524820943, + "grad_norm": 0.676252763119563, + "learning_rate": 4.925531914893617e-06, + "loss": 0.73, + "step": 463 + }, + { + "epoch": 0.18335391454680167, + "grad_norm": 0.7917693042214963, + "learning_rate": 4.936170212765958e-06, + "loss": 0.745, + "step": 464 + }, + { + "epoch": 0.18374907384539393, + "grad_norm": 0.623944439707609, + "learning_rate": 4.946808510638298e-06, + "loss": 0.7178, + "step": 465 + }, + { + "epoch": 0.18414423314398617, + "grad_norm": 0.7595206103625429, + "learning_rate": 4.957446808510639e-06, + "loss": 0.7511, + "step": 466 + }, + { + "epoch": 0.18453939244257841, + "grad_norm": 0.6236195124431673, + "learning_rate": 4.968085106382979e-06, + "loss": 0.7443, + "step": 467 + }, + { + "epoch": 0.18493455174117066, + "grad_norm": 0.6344774337895079, + "learning_rate": 4.9787234042553195e-06, + "loss": 0.7502, + "step": 468 + }, + { + "epoch": 0.1853297110397629, + "grad_norm": 0.6180463187517735, + "learning_rate": 4.98936170212766e-06, + "loss": 0.7089, + "step": 469 + }, + { + "epoch": 0.18572487033835514, + "grad_norm": 0.6437262535137148, + "learning_rate": 5e-06, + "loss": 0.7118, + "step": 470 + }, + { + "epoch": 0.1861200296369474, + "grad_norm": 0.8247696667741479, + "learning_rate": 4.999999995090759e-06, + "loss": 0.7448, + "step": 471 + }, + { + "epoch": 0.18651518893553964, + "grad_norm": 0.6821659327496776, + "learning_rate": 4.9999999803630365e-06, + "loss": 0.7096, + "step": 472 + }, + { + "epoch": 0.18691034823413188, + "grad_norm": 0.6937300518785938, + "learning_rate": 4.999999955816832e-06, + "loss": 0.7505, + "step": 473 + }, + { + "epoch": 0.18730550753272412, + "grad_norm": 0.64493393853509, + "learning_rate": 4.999999921452146e-06, + "loss": 0.7246, + "step": 474 + }, + { + "epoch": 0.18770066683131637, + "grad_norm": 0.699335103430049, + "learning_rate": 4.999999877268977e-06, + "loss": 0.7362, + "step": 475 + }, + { + "epoch": 0.18809582612990863, + "grad_norm": 0.6958675091974922, + "learning_rate": 4.999999823267328e-06, + "loss": 0.7425, + "step": 476 + }, + { + "epoch": 0.18849098542850087, + "grad_norm": 0.6561148703214672, + "learning_rate": 4.999999759447197e-06, + "loss": 0.7193, + "step": 477 + }, + { + "epoch": 0.1888861447270931, + "grad_norm": 0.7332659964684728, + "learning_rate": 4.999999685808585e-06, + "loss": 0.7367, + "step": 478 + }, + { + "epoch": 0.18928130402568535, + "grad_norm": 0.6434935616449193, + "learning_rate": 4.999999602351493e-06, + "loss": 0.7241, + "step": 479 + }, + { + "epoch": 0.1896764633242776, + "grad_norm": 0.7085133055543303, + "learning_rate": 4.99999950907592e-06, + "loss": 0.7294, + "step": 480 + }, + { + "epoch": 0.19007162262286983, + "grad_norm": 0.6037858004944545, + "learning_rate": 4.999999405981868e-06, + "loss": 0.7169, + "step": 481 + }, + { + "epoch": 0.1904667819214621, + "grad_norm": 0.6353596283356293, + "learning_rate": 4.999999293069335e-06, + "loss": 0.7255, + "step": 482 + }, + { + "epoch": 0.19086194122005434, + "grad_norm": 0.6011323842788836, + "learning_rate": 4.999999170338324e-06, + "loss": 0.7098, + "step": 483 + }, + { + "epoch": 0.19125710051864658, + "grad_norm": 0.685601090106948, + "learning_rate": 4.999999037788834e-06, + "loss": 0.7333, + "step": 484 + }, + { + "epoch": 0.19165225981723882, + "grad_norm": 0.6377038979338291, + "learning_rate": 4.999998895420866e-06, + "loss": 0.7257, + "step": 485 + }, + { + "epoch": 0.19204741911583106, + "grad_norm": 0.6192918261508922, + "learning_rate": 4.9999987432344195e-06, + "loss": 0.7275, + "step": 486 + }, + { + "epoch": 0.1924425784144233, + "grad_norm": 0.6720726722815289, + "learning_rate": 4.999998581229497e-06, + "loss": 0.7209, + "step": 487 + }, + { + "epoch": 0.19283773771301557, + "grad_norm": 0.5958748564691253, + "learning_rate": 4.999998409406098e-06, + "loss": 0.7182, + "step": 488 + }, + { + "epoch": 0.1932328970116078, + "grad_norm": 0.6523018913824231, + "learning_rate": 4.999998227764223e-06, + "loss": 0.7223, + "step": 489 + }, + { + "epoch": 0.19362805631020005, + "grad_norm": 0.6969026955713506, + "learning_rate": 4.999998036303873e-06, + "loss": 0.7017, + "step": 490 + }, + { + "epoch": 0.1940232156087923, + "grad_norm": 0.6199690598086954, + "learning_rate": 4.999997835025049e-06, + "loss": 0.724, + "step": 491 + }, + { + "epoch": 0.19441837490738453, + "grad_norm": 0.6899050040288978, + "learning_rate": 4.999997623927752e-06, + "loss": 0.702, + "step": 492 + }, + { + "epoch": 0.19481353420597677, + "grad_norm": 0.6023461037422734, + "learning_rate": 4.999997403011982e-06, + "loss": 0.7192, + "step": 493 + }, + { + "epoch": 0.19520869350456904, + "grad_norm": 0.7949013705151751, + "learning_rate": 4.9999971722777395e-06, + "loss": 0.7214, + "step": 494 + }, + { + "epoch": 0.19560385280316128, + "grad_norm": 0.8406697228012115, + "learning_rate": 4.9999969317250276e-06, + "loss": 0.7511, + "step": 495 + }, + { + "epoch": 0.19599901210175352, + "grad_norm": 0.6165479711982665, + "learning_rate": 4.999996681353845e-06, + "loss": 0.7129, + "step": 496 + }, + { + "epoch": 0.19639417140034576, + "grad_norm": 0.6365598818327172, + "learning_rate": 4.999996421164194e-06, + "loss": 0.724, + "step": 497 + }, + { + "epoch": 0.196789330698938, + "grad_norm": 0.5804862939924182, + "learning_rate": 4.999996151156075e-06, + "loss": 0.7208, + "step": 498 + }, + { + "epoch": 0.19718448999753024, + "grad_norm": 0.8043069867513684, + "learning_rate": 4.9999958713294886e-06, + "loss": 0.7316, + "step": 499 + }, + { + "epoch": 0.1975796492961225, + "grad_norm": 0.7039727180630397, + "learning_rate": 4.999995581684437e-06, + "loss": 0.7273, + "step": 500 + }, + { + "epoch": 0.19797480859471475, + "grad_norm": 0.5796311427949719, + "learning_rate": 4.999995282220921e-06, + "loss": 0.7353, + "step": 501 + }, + { + "epoch": 0.198369967893307, + "grad_norm": 0.640816004878865, + "learning_rate": 4.999994972938941e-06, + "loss": 0.718, + "step": 502 + }, + { + "epoch": 0.19876512719189923, + "grad_norm": 0.6625901203049049, + "learning_rate": 4.9999946538384995e-06, + "loss": 0.717, + "step": 503 + }, + { + "epoch": 0.19916028649049147, + "grad_norm": 0.731900514664347, + "learning_rate": 4.999994324919596e-06, + "loss": 0.7154, + "step": 504 + }, + { + "epoch": 0.1995554457890837, + "grad_norm": 0.6358220991451766, + "learning_rate": 4.9999939861822345e-06, + "loss": 0.7135, + "step": 505 + }, + { + "epoch": 0.19995060508767598, + "grad_norm": 0.6772742599590151, + "learning_rate": 4.999993637626413e-06, + "loss": 0.7142, + "step": 506 + }, + { + "epoch": 0.20034576438626822, + "grad_norm": 0.6203815751394517, + "learning_rate": 4.999993279252136e-06, + "loss": 0.7094, + "step": 507 + }, + { + "epoch": 0.20074092368486046, + "grad_norm": 0.6159319745437677, + "learning_rate": 4.999992911059402e-06, + "loss": 0.7097, + "step": 508 + }, + { + "epoch": 0.2011360829834527, + "grad_norm": 0.6394572431227349, + "learning_rate": 4.999992533048216e-06, + "loss": 0.7389, + "step": 509 + }, + { + "epoch": 0.20153124228204494, + "grad_norm": 0.5730980921754206, + "learning_rate": 4.999992145218576e-06, + "loss": 0.7148, + "step": 510 + }, + { + "epoch": 0.20192640158063718, + "grad_norm": 0.642558910052563, + "learning_rate": 4.999991747570485e-06, + "loss": 0.6986, + "step": 511 + }, + { + "epoch": 0.20232156087922945, + "grad_norm": 0.7198501180731144, + "learning_rate": 4.999991340103944e-06, + "loss": 0.7169, + "step": 512 + }, + { + "epoch": 0.2027167201778217, + "grad_norm": 0.5448656307534819, + "learning_rate": 4.999990922818957e-06, + "loss": 0.6977, + "step": 513 + }, + { + "epoch": 0.20311187947641393, + "grad_norm": 0.6205152271493748, + "learning_rate": 4.999990495715522e-06, + "loss": 0.7289, + "step": 514 + }, + { + "epoch": 0.20350703877500617, + "grad_norm": 0.5899827817607862, + "learning_rate": 4.9999900587936425e-06, + "loss": 0.7269, + "step": 515 + }, + { + "epoch": 0.2039021980735984, + "grad_norm": 0.6379914033363828, + "learning_rate": 4.999989612053321e-06, + "loss": 0.7031, + "step": 516 + }, + { + "epoch": 0.20429735737219065, + "grad_norm": 0.63396345510458, + "learning_rate": 4.999989155494558e-06, + "loss": 0.7113, + "step": 517 + }, + { + "epoch": 0.20469251667078292, + "grad_norm": 0.5769043304193019, + "learning_rate": 4.999988689117355e-06, + "loss": 0.7371, + "step": 518 + }, + { + "epoch": 0.20508767596937516, + "grad_norm": 0.6508775630628256, + "learning_rate": 4.999988212921715e-06, + "loss": 0.6937, + "step": 519 + }, + { + "epoch": 0.2054828352679674, + "grad_norm": 0.6868129416605413, + "learning_rate": 4.99998772690764e-06, + "loss": 0.7383, + "step": 520 + }, + { + "epoch": 0.20587799456655964, + "grad_norm": 0.5389127646966064, + "learning_rate": 4.9999872310751305e-06, + "loss": 0.6993, + "step": 521 + }, + { + "epoch": 0.20627315386515188, + "grad_norm": 0.6060273410792214, + "learning_rate": 4.9999867254241894e-06, + "loss": 0.7118, + "step": 522 + }, + { + "epoch": 0.20666831316374412, + "grad_norm": 0.5800631862306052, + "learning_rate": 4.9999862099548186e-06, + "loss": 0.7323, + "step": 523 + }, + { + "epoch": 0.2070634724623364, + "grad_norm": 0.7534794475732837, + "learning_rate": 4.9999856846670195e-06, + "loss": 0.7228, + "step": 524 + }, + { + "epoch": 0.20745863176092863, + "grad_norm": 0.5812327339287102, + "learning_rate": 4.999985149560796e-06, + "loss": 0.7072, + "step": 525 + }, + { + "epoch": 0.20785379105952087, + "grad_norm": 0.5768618184120087, + "learning_rate": 4.999984604636148e-06, + "loss": 0.7088, + "step": 526 + }, + { + "epoch": 0.2082489503581131, + "grad_norm": 0.6399784431138891, + "learning_rate": 4.999984049893078e-06, + "loss": 0.7149, + "step": 527 + }, + { + "epoch": 0.20864410965670535, + "grad_norm": 0.5623651744286807, + "learning_rate": 4.99998348533159e-06, + "loss": 0.7226, + "step": 528 + }, + { + "epoch": 0.2090392689552976, + "grad_norm": 0.6963865673449156, + "learning_rate": 4.999982910951684e-06, + "loss": 0.7233, + "step": 529 + }, + { + "epoch": 0.20943442825388986, + "grad_norm": 0.6478334973673012, + "learning_rate": 4.999982326753363e-06, + "loss": 0.7262, + "step": 530 + }, + { + "epoch": 0.2098295875524821, + "grad_norm": 0.5449147628310747, + "learning_rate": 4.9999817327366305e-06, + "loss": 0.7072, + "step": 531 + }, + { + "epoch": 0.21022474685107434, + "grad_norm": 0.5817196907571065, + "learning_rate": 4.9999811289014876e-06, + "loss": 0.7127, + "step": 532 + }, + { + "epoch": 0.21061990614966658, + "grad_norm": 0.5908688502826027, + "learning_rate": 4.999980515247936e-06, + "loss": 0.7232, + "step": 533 + }, + { + "epoch": 0.21101506544825882, + "grad_norm": 0.5636398728531682, + "learning_rate": 4.99997989177598e-06, + "loss": 0.7351, + "step": 534 + }, + { + "epoch": 0.21141022474685106, + "grad_norm": 0.626447595665311, + "learning_rate": 4.9999792584856204e-06, + "loss": 0.7069, + "step": 535 + }, + { + "epoch": 0.21180538404544333, + "grad_norm": 0.5779788065467234, + "learning_rate": 4.999978615376861e-06, + "loss": 0.7109, + "step": 536 + }, + { + "epoch": 0.21220054334403557, + "grad_norm": 0.5720137928958787, + "learning_rate": 4.999977962449703e-06, + "loss": 0.7117, + "step": 537 + }, + { + "epoch": 0.2125957026426278, + "grad_norm": 0.5741484180144606, + "learning_rate": 4.99997729970415e-06, + "loss": 0.7004, + "step": 538 + }, + { + "epoch": 0.21299086194122005, + "grad_norm": 0.6385491107900715, + "learning_rate": 4.999976627140204e-06, + "loss": 0.7269, + "step": 539 + }, + { + "epoch": 0.2133860212398123, + "grad_norm": 0.5816500717084315, + "learning_rate": 4.999975944757868e-06, + "loss": 0.7036, + "step": 540 + }, + { + "epoch": 0.21378118053840453, + "grad_norm": 0.5659777534165991, + "learning_rate": 4.999975252557145e-06, + "loss": 0.7105, + "step": 541 + }, + { + "epoch": 0.2141763398369968, + "grad_norm": 0.5749663433605801, + "learning_rate": 4.999974550538037e-06, + "loss": 0.7264, + "step": 542 + }, + { + "epoch": 0.21457149913558904, + "grad_norm": 0.6112261810413965, + "learning_rate": 4.999973838700547e-06, + "loss": 0.7153, + "step": 543 + }, + { + "epoch": 0.21496665843418128, + "grad_norm": 0.5490028470815621, + "learning_rate": 4.999973117044678e-06, + "loss": 0.7083, + "step": 544 + }, + { + "epoch": 0.21536181773277352, + "grad_norm": 0.7622632899364211, + "learning_rate": 4.9999723855704315e-06, + "loss": 0.736, + "step": 545 + }, + { + "epoch": 0.21575697703136576, + "grad_norm": 0.596735586828792, + "learning_rate": 4.999971644277812e-06, + "loss": 0.6858, + "step": 546 + }, + { + "epoch": 0.21615213632995803, + "grad_norm": 0.6527535984449617, + "learning_rate": 4.999970893166823e-06, + "loss": 0.722, + "step": 547 + }, + { + "epoch": 0.21654729562855027, + "grad_norm": 0.652324953771554, + "learning_rate": 4.999970132237466e-06, + "loss": 0.7424, + "step": 548 + }, + { + "epoch": 0.2169424549271425, + "grad_norm": 0.5908339949824033, + "learning_rate": 4.999969361489744e-06, + "loss": 0.7227, + "step": 549 + }, + { + "epoch": 0.21733761422573475, + "grad_norm": 0.6629809077478138, + "learning_rate": 4.99996858092366e-06, + "loss": 0.6958, + "step": 550 + }, + { + "epoch": 0.217732773524327, + "grad_norm": 0.5704721132671529, + "learning_rate": 4.9999677905392185e-06, + "loss": 0.7056, + "step": 551 + }, + { + "epoch": 0.21812793282291923, + "grad_norm": 0.5883161489370882, + "learning_rate": 4.9999669903364215e-06, + "loss": 0.7131, + "step": 552 + }, + { + "epoch": 0.2185230921215115, + "grad_norm": 0.6166470675766057, + "learning_rate": 4.999966180315271e-06, + "loss": 0.7088, + "step": 553 + }, + { + "epoch": 0.21891825142010374, + "grad_norm": 0.5742237649639247, + "learning_rate": 4.999965360475773e-06, + "loss": 0.6977, + "step": 554 + }, + { + "epoch": 0.21931341071869598, + "grad_norm": 0.6751101103583117, + "learning_rate": 4.9999645308179275e-06, + "loss": 0.7072, + "step": 555 + }, + { + "epoch": 0.21970857001728822, + "grad_norm": 0.5984737347193803, + "learning_rate": 4.999963691341741e-06, + "loss": 0.7147, + "step": 556 + }, + { + "epoch": 0.22010372931588046, + "grad_norm": 0.6110992520398403, + "learning_rate": 4.999962842047214e-06, + "loss": 0.7013, + "step": 557 + }, + { + "epoch": 0.2204988886144727, + "grad_norm": 0.768651010687167, + "learning_rate": 4.999961982934352e-06, + "loss": 0.708, + "step": 558 + }, + { + "epoch": 0.22089404791306497, + "grad_norm": 0.5750080834238912, + "learning_rate": 4.999961114003156e-06, + "loss": 0.7163, + "step": 559 + }, + { + "epoch": 0.2212892072116572, + "grad_norm": 0.6908491657265623, + "learning_rate": 4.9999602352536316e-06, + "loss": 0.703, + "step": 560 + }, + { + "epoch": 0.22168436651024945, + "grad_norm": 0.6747348114487975, + "learning_rate": 4.999959346685782e-06, + "loss": 0.7156, + "step": 561 + }, + { + "epoch": 0.2220795258088417, + "grad_norm": 0.6777010126648408, + "learning_rate": 4.999958448299609e-06, + "loss": 0.7221, + "step": 562 + }, + { + "epoch": 0.22247468510743393, + "grad_norm": 0.7149124473295017, + "learning_rate": 4.9999575400951185e-06, + "loss": 0.7433, + "step": 563 + }, + { + "epoch": 0.22286984440602617, + "grad_norm": 0.5509899586140755, + "learning_rate": 4.999956622072312e-06, + "loss": 0.7023, + "step": 564 + }, + { + "epoch": 0.22326500370461844, + "grad_norm": 0.7820264763430183, + "learning_rate": 4.999955694231194e-06, + "loss": 0.7288, + "step": 565 + }, + { + "epoch": 0.22366016300321068, + "grad_norm": 0.5919945717677796, + "learning_rate": 4.999954756571769e-06, + "loss": 0.7152, + "step": 566 + }, + { + "epoch": 0.22405532230180292, + "grad_norm": 0.6535677585414817, + "learning_rate": 4.999953809094038e-06, + "loss": 0.7083, + "step": 567 + }, + { + "epoch": 0.22445048160039516, + "grad_norm": 0.7785050994873816, + "learning_rate": 4.999952851798008e-06, + "loss": 0.7062, + "step": 568 + }, + { + "epoch": 0.2248456408989874, + "grad_norm": 0.5482528691036233, + "learning_rate": 4.99995188468368e-06, + "loss": 0.6968, + "step": 569 + }, + { + "epoch": 0.22524080019757964, + "grad_norm": 0.7313401830631631, + "learning_rate": 4.99995090775106e-06, + "loss": 0.7196, + "step": 570 + }, + { + "epoch": 0.2256359594961719, + "grad_norm": 0.7311084419182904, + "learning_rate": 4.999949921000151e-06, + "loss": 0.6959, + "step": 571 + }, + { + "epoch": 0.22603111879476415, + "grad_norm": 0.640055542754839, + "learning_rate": 4.999948924430956e-06, + "loss": 0.7233, + "step": 572 + }, + { + "epoch": 0.2264262780933564, + "grad_norm": 0.7564396262457012, + "learning_rate": 4.9999479180434805e-06, + "loss": 0.7259, + "step": 573 + }, + { + "epoch": 0.22682143739194863, + "grad_norm": 0.5567836517749442, + "learning_rate": 4.999946901837727e-06, + "loss": 0.732, + "step": 574 + }, + { + "epoch": 0.22721659669054087, + "grad_norm": 0.6565826035434957, + "learning_rate": 4.999945875813701e-06, + "loss": 0.7107, + "step": 575 + }, + { + "epoch": 0.2276117559891331, + "grad_norm": 0.6555275013117167, + "learning_rate": 4.999944839971404e-06, + "loss": 0.7065, + "step": 576 + }, + { + "epoch": 0.22800691528772538, + "grad_norm": 0.5750121597094802, + "learning_rate": 4.999943794310844e-06, + "loss": 0.7168, + "step": 577 + }, + { + "epoch": 0.22840207458631762, + "grad_norm": 0.7172808104044253, + "learning_rate": 4.999942738832021e-06, + "loss": 0.7035, + "step": 578 + }, + { + "epoch": 0.22879723388490986, + "grad_norm": 0.681813584446898, + "learning_rate": 4.999941673534941e-06, + "loss": 0.7278, + "step": 579 + }, + { + "epoch": 0.2291923931835021, + "grad_norm": 0.6129736074024745, + "learning_rate": 4.999940598419609e-06, + "loss": 0.7207, + "step": 580 + }, + { + "epoch": 0.22958755248209434, + "grad_norm": 0.6921872225622865, + "learning_rate": 4.999939513486028e-06, + "loss": 0.7079, + "step": 581 + }, + { + "epoch": 0.22998271178068658, + "grad_norm": 0.6070410601571897, + "learning_rate": 4.9999384187342035e-06, + "loss": 0.7149, + "step": 582 + }, + { + "epoch": 0.23037787107927885, + "grad_norm": 0.6700679528371282, + "learning_rate": 4.999937314164138e-06, + "loss": 0.7091, + "step": 583 + }, + { + "epoch": 0.23077303037787109, + "grad_norm": 0.7175503981303464, + "learning_rate": 4.999936199775836e-06, + "loss": 0.7044, + "step": 584 + }, + { + "epoch": 0.23116818967646333, + "grad_norm": 0.5372712731465968, + "learning_rate": 4.999935075569304e-06, + "loss": 0.7185, + "step": 585 + }, + { + "epoch": 0.23156334897505557, + "grad_norm": 0.6626410452980375, + "learning_rate": 4.999933941544544e-06, + "loss": 0.7061, + "step": 586 + }, + { + "epoch": 0.2319585082736478, + "grad_norm": 0.6219349768601438, + "learning_rate": 4.999932797701563e-06, + "loss": 0.7027, + "step": 587 + }, + { + "epoch": 0.23235366757224005, + "grad_norm": 0.622944974108998, + "learning_rate": 4.999931644040363e-06, + "loss": 0.7041, + "step": 588 + }, + { + "epoch": 0.23274882687083231, + "grad_norm": 0.6403160681420452, + "learning_rate": 4.999930480560949e-06, + "loss": 0.713, + "step": 589 + }, + { + "epoch": 0.23314398616942456, + "grad_norm": 0.5810884528951928, + "learning_rate": 4.999929307263327e-06, + "loss": 0.6959, + "step": 590 + }, + { + "epoch": 0.2335391454680168, + "grad_norm": 0.5617194856604512, + "learning_rate": 4.999928124147501e-06, + "loss": 0.7134, + "step": 591 + }, + { + "epoch": 0.23393430476660904, + "grad_norm": 0.5794486892578056, + "learning_rate": 4.999926931213475e-06, + "loss": 0.6581, + "step": 592 + }, + { + "epoch": 0.23432946406520128, + "grad_norm": 0.6010638355759388, + "learning_rate": 4.999925728461254e-06, + "loss": 0.6905, + "step": 593 + }, + { + "epoch": 0.23472462336379352, + "grad_norm": 0.5698458903870885, + "learning_rate": 4.999924515890843e-06, + "loss": 0.6929, + "step": 594 + }, + { + "epoch": 0.23511978266238578, + "grad_norm": 0.6038481326467747, + "learning_rate": 4.9999232935022455e-06, + "loss": 0.7134, + "step": 595 + }, + { + "epoch": 0.23551494196097802, + "grad_norm": 0.5575437681930879, + "learning_rate": 4.9999220612954685e-06, + "loss": 0.686, + "step": 596 + }, + { + "epoch": 0.23591010125957027, + "grad_norm": 0.5582827483407896, + "learning_rate": 4.999920819270515e-06, + "loss": 0.677, + "step": 597 + }, + { + "epoch": 0.2363052605581625, + "grad_norm": 0.5581039778387297, + "learning_rate": 4.999919567427391e-06, + "loss": 0.7041, + "step": 598 + }, + { + "epoch": 0.23670041985675475, + "grad_norm": 0.5926274884146744, + "learning_rate": 4.999918305766101e-06, + "loss": 0.7131, + "step": 599 + }, + { + "epoch": 0.23709557915534699, + "grad_norm": 0.5683616438789352, + "learning_rate": 4.99991703428665e-06, + "loss": 0.7004, + "step": 600 + }, + { + "epoch": 0.23749073845393925, + "grad_norm": 0.5730277632443862, + "learning_rate": 4.999915752989042e-06, + "loss": 0.7056, + "step": 601 + }, + { + "epoch": 0.2378858977525315, + "grad_norm": 0.582494416565038, + "learning_rate": 4.999914461873283e-06, + "loss": 0.7116, + "step": 602 + }, + { + "epoch": 0.23828105705112373, + "grad_norm": 0.5966610530038993, + "learning_rate": 4.9999131609393795e-06, + "loss": 0.7284, + "step": 603 + }, + { + "epoch": 0.23867621634971597, + "grad_norm": 0.5558683547691745, + "learning_rate": 4.999911850187335e-06, + "loss": 0.6998, + "step": 604 + }, + { + "epoch": 0.23907137564830822, + "grad_norm": 0.6258177810251072, + "learning_rate": 4.999910529617154e-06, + "loss": 0.6855, + "step": 605 + }, + { + "epoch": 0.23946653494690046, + "grad_norm": 0.5318478473731086, + "learning_rate": 4.999909199228841e-06, + "loss": 0.6795, + "step": 606 + }, + { + "epoch": 0.23986169424549272, + "grad_norm": 0.5547878659686439, + "learning_rate": 4.999907859022405e-06, + "loss": 0.6953, + "step": 607 + }, + { + "epoch": 0.24025685354408496, + "grad_norm": 0.5931673987268137, + "learning_rate": 4.9999065089978485e-06, + "loss": 0.7148, + "step": 608 + }, + { + "epoch": 0.2406520128426772, + "grad_norm": 0.568231220260507, + "learning_rate": 4.9999051491551766e-06, + "loss": 0.695, + "step": 609 + }, + { + "epoch": 0.24104717214126944, + "grad_norm": 0.5808912372610399, + "learning_rate": 4.999903779494395e-06, + "loss": 0.7074, + "step": 610 + }, + { + "epoch": 0.24144233143986168, + "grad_norm": 0.5824501699853736, + "learning_rate": 4.9999024000155104e-06, + "loss": 0.7017, + "step": 611 + }, + { + "epoch": 0.24183749073845395, + "grad_norm": 0.5393723920629134, + "learning_rate": 4.9999010107185264e-06, + "loss": 0.6646, + "step": 612 + }, + { + "epoch": 0.2422326500370462, + "grad_norm": 0.5980491294226428, + "learning_rate": 4.99989961160345e-06, + "loss": 0.6794, + "step": 613 + }, + { + "epoch": 0.24262780933563843, + "grad_norm": 0.6188242993162584, + "learning_rate": 4.999898202670286e-06, + "loss": 0.705, + "step": 614 + }, + { + "epoch": 0.24302296863423067, + "grad_norm": 0.5890575366759165, + "learning_rate": 4.9998967839190384e-06, + "loss": 0.7118, + "step": 615 + }, + { + "epoch": 0.24341812793282291, + "grad_norm": 0.6031363408649272, + "learning_rate": 4.999895355349716e-06, + "loss": 0.7248, + "step": 616 + }, + { + "epoch": 0.24381328723141515, + "grad_norm": 0.5917614651923631, + "learning_rate": 4.9998939169623215e-06, + "loss": 0.696, + "step": 617 + }, + { + "epoch": 0.24420844653000742, + "grad_norm": 0.594908607458186, + "learning_rate": 4.9998924687568625e-06, + "loss": 0.7036, + "step": 618 + }, + { + "epoch": 0.24460360582859966, + "grad_norm": 0.6727052500603081, + "learning_rate": 4.999891010733343e-06, + "loss": 0.6824, + "step": 619 + }, + { + "epoch": 0.2449987651271919, + "grad_norm": 0.598235520036059, + "learning_rate": 4.99988954289177e-06, + "loss": 0.7238, + "step": 620 + }, + { + "epoch": 0.24539392442578414, + "grad_norm": 0.7176084933336286, + "learning_rate": 4.999888065232149e-06, + "loss": 0.7076, + "step": 621 + }, + { + "epoch": 0.24578908372437638, + "grad_norm": 0.5872860940803798, + "learning_rate": 4.999886577754486e-06, + "loss": 0.702, + "step": 622 + }, + { + "epoch": 0.24618424302296862, + "grad_norm": 0.6143484421432265, + "learning_rate": 4.999885080458786e-06, + "loss": 0.6923, + "step": 623 + }, + { + "epoch": 0.2465794023215609, + "grad_norm": 0.6537296010257171, + "learning_rate": 4.9998835733450564e-06, + "loss": 0.6996, + "step": 624 + }, + { + "epoch": 0.24697456162015313, + "grad_norm": 0.5652281166809588, + "learning_rate": 4.9998820564133016e-06, + "loss": 0.6926, + "step": 625 + }, + { + "epoch": 0.24736972091874537, + "grad_norm": 0.6841779465916256, + "learning_rate": 4.999880529663528e-06, + "loss": 0.708, + "step": 626 + }, + { + "epoch": 0.2477648802173376, + "grad_norm": 0.618999997435258, + "learning_rate": 4.9998789930957415e-06, + "loss": 0.7009, + "step": 627 + }, + { + "epoch": 0.24816003951592985, + "grad_norm": 0.5814213814123862, + "learning_rate": 4.999877446709949e-06, + "loss": 0.7166, + "step": 628 + }, + { + "epoch": 0.2485551988145221, + "grad_norm": 0.6410458092239222, + "learning_rate": 4.999875890506155e-06, + "loss": 0.701, + "step": 629 + }, + { + "epoch": 0.24895035811311436, + "grad_norm": 0.5643287481712287, + "learning_rate": 4.999874324484367e-06, + "loss": 0.6781, + "step": 630 + }, + { + "epoch": 0.2493455174117066, + "grad_norm": 0.719129519371788, + "learning_rate": 4.9998727486445895e-06, + "loss": 0.71, + "step": 631 + }, + { + "epoch": 0.24974067671029884, + "grad_norm": 0.643749171305181, + "learning_rate": 4.999871162986831e-06, + "loss": 0.6954, + "step": 632 + }, + { + "epoch": 0.2501358360088911, + "grad_norm": 0.5312855320703463, + "learning_rate": 4.999869567511097e-06, + "loss": 0.6619, + "step": 633 + }, + { + "epoch": 0.25053099530748335, + "grad_norm": 0.5908234255298699, + "learning_rate": 4.9998679622173925e-06, + "loss": 0.7128, + "step": 634 + }, + { + "epoch": 0.2509261546060756, + "grad_norm": 0.6073203092125553, + "learning_rate": 4.999866347105725e-06, + "loss": 0.6908, + "step": 635 + }, + { + "epoch": 0.25132131390466783, + "grad_norm": 0.5785049800687088, + "learning_rate": 4.999864722176101e-06, + "loss": 0.6863, + "step": 636 + }, + { + "epoch": 0.25171647320326007, + "grad_norm": 0.6312245040925751, + "learning_rate": 4.999863087428526e-06, + "loss": 0.6905, + "step": 637 + }, + { + "epoch": 0.2521116325018523, + "grad_norm": 0.596727469474082, + "learning_rate": 4.999861442863005e-06, + "loss": 0.6822, + "step": 638 + }, + { + "epoch": 0.25250679180044455, + "grad_norm": 0.5678026535695422, + "learning_rate": 4.999859788479548e-06, + "loss": 0.6999, + "step": 639 + }, + { + "epoch": 0.2529019510990368, + "grad_norm": 0.548613923585835, + "learning_rate": 4.99985812427816e-06, + "loss": 0.7132, + "step": 640 + }, + { + "epoch": 0.25329711039762903, + "grad_norm": 0.5873264929153031, + "learning_rate": 4.999856450258847e-06, + "loss": 0.7207, + "step": 641 + }, + { + "epoch": 0.2536922696962213, + "grad_norm": 0.5678140101223359, + "learning_rate": 4.9998547664216155e-06, + "loss": 0.7106, + "step": 642 + }, + { + "epoch": 0.2540874289948135, + "grad_norm": 0.5578970132548404, + "learning_rate": 4.999853072766473e-06, + "loss": 0.7125, + "step": 643 + }, + { + "epoch": 0.25448258829340575, + "grad_norm": 0.6041251604589954, + "learning_rate": 4.999851369293425e-06, + "loss": 0.6901, + "step": 644 + }, + { + "epoch": 0.25487774759199805, + "grad_norm": 0.5736028260838277, + "learning_rate": 4.99984965600248e-06, + "loss": 0.7116, + "step": 645 + }, + { + "epoch": 0.2552729068905903, + "grad_norm": 0.532105428102255, + "learning_rate": 4.999847932893642e-06, + "loss": 0.6927, + "step": 646 + }, + { + "epoch": 0.25566806618918253, + "grad_norm": 0.6413099649877119, + "learning_rate": 4.99984619996692e-06, + "loss": 0.6916, + "step": 647 + }, + { + "epoch": 0.25606322548777477, + "grad_norm": 0.5690991015722213, + "learning_rate": 4.9998444572223205e-06, + "loss": 0.7192, + "step": 648 + }, + { + "epoch": 0.256458384786367, + "grad_norm": 0.5557660152572236, + "learning_rate": 4.999842704659849e-06, + "loss": 0.6832, + "step": 649 + }, + { + "epoch": 0.25685354408495925, + "grad_norm": 0.5617270616167598, + "learning_rate": 4.999840942279514e-06, + "loss": 0.6878, + "step": 650 + }, + { + "epoch": 0.2572487033835515, + "grad_norm": 0.5666842673944367, + "learning_rate": 4.999839170081322e-06, + "loss": 0.6672, + "step": 651 + }, + { + "epoch": 0.25764386268214373, + "grad_norm": 0.5623407827389078, + "learning_rate": 4.999837388065279e-06, + "loss": 0.689, + "step": 652 + }, + { + "epoch": 0.25803902198073597, + "grad_norm": 0.5610766033323613, + "learning_rate": 4.999835596231392e-06, + "loss": 0.7217, + "step": 653 + }, + { + "epoch": 0.2584341812793282, + "grad_norm": 0.5941036719520371, + "learning_rate": 4.999833794579671e-06, + "loss": 0.704, + "step": 654 + }, + { + "epoch": 0.25882934057792045, + "grad_norm": 0.5527677491351838, + "learning_rate": 4.999831983110119e-06, + "loss": 0.7063, + "step": 655 + }, + { + "epoch": 0.2592244998765127, + "grad_norm": 0.5896815383284895, + "learning_rate": 4.999830161822746e-06, + "loss": 0.6823, + "step": 656 + }, + { + "epoch": 0.259619659175105, + "grad_norm": 0.5617257867146465, + "learning_rate": 4.999828330717558e-06, + "loss": 0.6952, + "step": 657 + }, + { + "epoch": 0.26001481847369723, + "grad_norm": 0.5841626881089839, + "learning_rate": 4.999826489794562e-06, + "loss": 0.6801, + "step": 658 + }, + { + "epoch": 0.26040997777228947, + "grad_norm": 0.5646406469988224, + "learning_rate": 4.9998246390537655e-06, + "loss": 0.6812, + "step": 659 + }, + { + "epoch": 0.2608051370708817, + "grad_norm": 0.5750406721528946, + "learning_rate": 4.999822778495176e-06, + "loss": 0.6943, + "step": 660 + }, + { + "epoch": 0.26120029636947395, + "grad_norm": 0.5559705390104022, + "learning_rate": 4.999820908118801e-06, + "loss": 0.6763, + "step": 661 + }, + { + "epoch": 0.2615954556680662, + "grad_norm": 0.5562879316008681, + "learning_rate": 4.999819027924646e-06, + "loss": 0.6832, + "step": 662 + }, + { + "epoch": 0.26199061496665843, + "grad_norm": 0.5921377833036137, + "learning_rate": 4.999817137912721e-06, + "loss": 0.6858, + "step": 663 + }, + { + "epoch": 0.26238577426525067, + "grad_norm": 0.567426550861614, + "learning_rate": 4.9998152380830325e-06, + "loss": 0.7036, + "step": 664 + }, + { + "epoch": 0.2627809335638429, + "grad_norm": 0.6061252730535107, + "learning_rate": 4.999813328435587e-06, + "loss": 0.6858, + "step": 665 + }, + { + "epoch": 0.26317609286243515, + "grad_norm": 0.5678734043512008, + "learning_rate": 4.999811408970392e-06, + "loss": 0.7152, + "step": 666 + }, + { + "epoch": 0.2635712521610274, + "grad_norm": 0.6033593786362805, + "learning_rate": 4.999809479687457e-06, + "loss": 0.6913, + "step": 667 + }, + { + "epoch": 0.26396641145961963, + "grad_norm": 0.5744590883863819, + "learning_rate": 4.999807540586788e-06, + "loss": 0.7029, + "step": 668 + }, + { + "epoch": 0.26436157075821193, + "grad_norm": 0.5281778655339753, + "learning_rate": 4.999805591668393e-06, + "loss": 0.7049, + "step": 669 + }, + { + "epoch": 0.26475673005680417, + "grad_norm": 0.5585259313584738, + "learning_rate": 4.99980363293228e-06, + "loss": 0.673, + "step": 670 + }, + { + "epoch": 0.2651518893553964, + "grad_norm": 0.5594694999229931, + "learning_rate": 4.999801664378455e-06, + "loss": 0.6827, + "step": 671 + }, + { + "epoch": 0.26554704865398865, + "grad_norm": 0.5996606992971393, + "learning_rate": 4.999799686006928e-06, + "loss": 0.7072, + "step": 672 + }, + { + "epoch": 0.2659422079525809, + "grad_norm": 0.5580082863684926, + "learning_rate": 4.999797697817706e-06, + "loss": 0.7039, + "step": 673 + }, + { + "epoch": 0.26633736725117313, + "grad_norm": 0.521406203903939, + "learning_rate": 4.999795699810796e-06, + "loss": 0.6886, + "step": 674 + }, + { + "epoch": 0.26673252654976537, + "grad_norm": 0.5672071403136022, + "learning_rate": 4.9997936919862065e-06, + "loss": 0.6799, + "step": 675 + }, + { + "epoch": 0.2671276858483576, + "grad_norm": 0.5568748196172835, + "learning_rate": 4.999791674343945e-06, + "loss": 0.7012, + "step": 676 + }, + { + "epoch": 0.26752284514694985, + "grad_norm": 0.5593131719229575, + "learning_rate": 4.999789646884021e-06, + "loss": 0.6841, + "step": 677 + }, + { + "epoch": 0.2679180044455421, + "grad_norm": 0.5731216777632498, + "learning_rate": 4.99978760960644e-06, + "loss": 0.7111, + "step": 678 + }, + { + "epoch": 0.26831316374413433, + "grad_norm": 0.6026088473088514, + "learning_rate": 4.999785562511212e-06, + "loss": 0.7065, + "step": 679 + }, + { + "epoch": 0.2687083230427266, + "grad_norm": 0.5516806935780276, + "learning_rate": 4.999783505598344e-06, + "loss": 0.6909, + "step": 680 + }, + { + "epoch": 0.26910348234131887, + "grad_norm": 0.5702038400823601, + "learning_rate": 4.999781438867844e-06, + "loss": 0.6734, + "step": 681 + }, + { + "epoch": 0.2694986416399111, + "grad_norm": 0.5746782762066555, + "learning_rate": 4.999779362319721e-06, + "loss": 0.7042, + "step": 682 + }, + { + "epoch": 0.26989380093850335, + "grad_norm": 0.6007328759815763, + "learning_rate": 4.9997772759539825e-06, + "loss": 0.6971, + "step": 683 + }, + { + "epoch": 0.2702889602370956, + "grad_norm": 0.5658032828435371, + "learning_rate": 4.999775179770637e-06, + "loss": 0.6741, + "step": 684 + }, + { + "epoch": 0.27068411953568783, + "grad_norm": 0.5445910362376495, + "learning_rate": 4.999773073769692e-06, + "loss": 0.6914, + "step": 685 + }, + { + "epoch": 0.27107927883428007, + "grad_norm": 0.5625355418392146, + "learning_rate": 4.9997709579511566e-06, + "loss": 0.6778, + "step": 686 + }, + { + "epoch": 0.2714744381328723, + "grad_norm": 0.5558770779852484, + "learning_rate": 4.999768832315038e-06, + "loss": 0.675, + "step": 687 + }, + { + "epoch": 0.27186959743146455, + "grad_norm": 0.5384961627107955, + "learning_rate": 4.999766696861346e-06, + "loss": 0.6881, + "step": 688 + }, + { + "epoch": 0.2722647567300568, + "grad_norm": 0.5646517085086373, + "learning_rate": 4.9997645515900895e-06, + "loss": 0.6737, + "step": 689 + }, + { + "epoch": 0.27265991602864903, + "grad_norm": 0.5769838182468013, + "learning_rate": 4.9997623965012745e-06, + "loss": 0.6718, + "step": 690 + }, + { + "epoch": 0.27305507532724127, + "grad_norm": 0.7282357435191615, + "learning_rate": 4.999760231594911e-06, + "loss": 0.7049, + "step": 691 + }, + { + "epoch": 0.27345023462583357, + "grad_norm": 0.5279126762131758, + "learning_rate": 4.999758056871007e-06, + "loss": 0.6621, + "step": 692 + }, + { + "epoch": 0.2738453939244258, + "grad_norm": 0.5706899106672201, + "learning_rate": 4.999755872329571e-06, + "loss": 0.6883, + "step": 693 + }, + { + "epoch": 0.27424055322301805, + "grad_norm": 0.5563908836582794, + "learning_rate": 4.999753677970614e-06, + "loss": 0.7143, + "step": 694 + }, + { + "epoch": 0.2746357125216103, + "grad_norm": 0.5362870289511571, + "learning_rate": 4.9997514737941396e-06, + "loss": 0.684, + "step": 695 + }, + { + "epoch": 0.2750308718202025, + "grad_norm": 0.6153860851792133, + "learning_rate": 4.9997492598001605e-06, + "loss": 0.6883, + "step": 696 + }, + { + "epoch": 0.27542603111879477, + "grad_norm": 0.546338501558778, + "learning_rate": 4.999747035988684e-06, + "loss": 0.6853, + "step": 697 + }, + { + "epoch": 0.275821190417387, + "grad_norm": 0.5411393398480797, + "learning_rate": 4.999744802359721e-06, + "loss": 0.6759, + "step": 698 + }, + { + "epoch": 0.27621634971597925, + "grad_norm": 0.5820635145009669, + "learning_rate": 4.999742558913276e-06, + "loss": 0.69, + "step": 699 + }, + { + "epoch": 0.2766115090145715, + "grad_norm": 0.519802903820158, + "learning_rate": 4.999740305649361e-06, + "loss": 0.689, + "step": 700 + }, + { + "epoch": 0.27700666831316373, + "grad_norm": 0.5335002406037318, + "learning_rate": 4.999738042567984e-06, + "loss": 0.6733, + "step": 701 + }, + { + "epoch": 0.27740182761175597, + "grad_norm": 0.6251340494736801, + "learning_rate": 4.999735769669154e-06, + "loss": 0.6947, + "step": 702 + }, + { + "epoch": 0.2777969869103482, + "grad_norm": 0.5568465012685252, + "learning_rate": 4.999733486952879e-06, + "loss": 0.6791, + "step": 703 + }, + { + "epoch": 0.2781921462089405, + "grad_norm": 0.5677313063564176, + "learning_rate": 4.9997311944191695e-06, + "loss": 0.6653, + "step": 704 + }, + { + "epoch": 0.27858730550753275, + "grad_norm": 0.5398469806124272, + "learning_rate": 4.9997288920680345e-06, + "loss": 0.6708, + "step": 705 + }, + { + "epoch": 0.278982464806125, + "grad_norm": 0.5249202618448952, + "learning_rate": 4.999726579899481e-06, + "loss": 0.6714, + "step": 706 + }, + { + "epoch": 0.2793776241047172, + "grad_norm": 0.5479464511682306, + "learning_rate": 4.99972425791352e-06, + "loss": 0.6884, + "step": 707 + }, + { + "epoch": 0.27977278340330947, + "grad_norm": 0.612951514131732, + "learning_rate": 4.99972192611016e-06, + "loss": 0.6934, + "step": 708 + }, + { + "epoch": 0.2801679427019017, + "grad_norm": 0.5472856818707729, + "learning_rate": 4.999719584489409e-06, + "loss": 0.6927, + "step": 709 + }, + { + "epoch": 0.28056310200049395, + "grad_norm": 0.5711839364829995, + "learning_rate": 4.99971723305128e-06, + "loss": 0.6993, + "step": 710 + }, + { + "epoch": 0.2809582612990862, + "grad_norm": 0.5344843617908501, + "learning_rate": 4.999714871795778e-06, + "loss": 0.6713, + "step": 711 + }, + { + "epoch": 0.2813534205976784, + "grad_norm": 0.5725098490316971, + "learning_rate": 4.9997125007229144e-06, + "loss": 0.7028, + "step": 712 + }, + { + "epoch": 0.28174857989627067, + "grad_norm": 0.5706604408107072, + "learning_rate": 4.999710119832697e-06, + "loss": 0.7134, + "step": 713 + }, + { + "epoch": 0.2821437391948629, + "grad_norm": 0.5261160242380305, + "learning_rate": 4.9997077291251366e-06, + "loss": 0.6767, + "step": 714 + }, + { + "epoch": 0.28253889849345515, + "grad_norm": 0.5469719762517126, + "learning_rate": 4.999705328600243e-06, + "loss": 0.7025, + "step": 715 + }, + { + "epoch": 0.28293405779204744, + "grad_norm": 0.5324472833124879, + "learning_rate": 4.9997029182580236e-06, + "loss": 0.6743, + "step": 716 + }, + { + "epoch": 0.2833292170906397, + "grad_norm": 0.5401067135045965, + "learning_rate": 4.99970049809849e-06, + "loss": 0.6931, + "step": 717 + }, + { + "epoch": 0.2837243763892319, + "grad_norm": 0.5544406474161367, + "learning_rate": 4.99969806812165e-06, + "loss": 0.6968, + "step": 718 + }, + { + "epoch": 0.28411953568782417, + "grad_norm": 0.5506360149437247, + "learning_rate": 4.999695628327514e-06, + "loss": 0.6981, + "step": 719 + }, + { + "epoch": 0.2845146949864164, + "grad_norm": 0.5478511076951501, + "learning_rate": 4.999693178716092e-06, + "loss": 0.6782, + "step": 720 + }, + { + "epoch": 0.28490985428500865, + "grad_norm": 0.5487601492507348, + "learning_rate": 4.999690719287392e-06, + "loss": 0.682, + "step": 721 + }, + { + "epoch": 0.2853050135836009, + "grad_norm": 0.5568074889646241, + "learning_rate": 4.999688250041426e-06, + "loss": 0.7005, + "step": 722 + }, + { + "epoch": 0.2857001728821931, + "grad_norm": 0.5211276732961236, + "learning_rate": 4.999685770978202e-06, + "loss": 0.685, + "step": 723 + }, + { + "epoch": 0.28609533218078537, + "grad_norm": 0.5238564370246029, + "learning_rate": 4.9996832820977305e-06, + "loss": 0.6733, + "step": 724 + }, + { + "epoch": 0.2864904914793776, + "grad_norm": 0.5488717039839817, + "learning_rate": 4.999680783400021e-06, + "loss": 0.6824, + "step": 725 + }, + { + "epoch": 0.28688565077796985, + "grad_norm": 0.5590783909424277, + "learning_rate": 4.999678274885082e-06, + "loss": 0.6852, + "step": 726 + }, + { + "epoch": 0.2872808100765621, + "grad_norm": 0.5718775481303447, + "learning_rate": 4.999675756552926e-06, + "loss": 0.6684, + "step": 727 + }, + { + "epoch": 0.2876759693751544, + "grad_norm": 0.5537561512785655, + "learning_rate": 4.99967322840356e-06, + "loss": 0.7021, + "step": 728 + }, + { + "epoch": 0.2880711286737466, + "grad_norm": 0.595756815576963, + "learning_rate": 4.999670690436997e-06, + "loss": 0.7033, + "step": 729 + }, + { + "epoch": 0.28846628797233886, + "grad_norm": 0.5734638717038859, + "learning_rate": 4.999668142653244e-06, + "loss": 0.6825, + "step": 730 + }, + { + "epoch": 0.2888614472709311, + "grad_norm": 0.5649785931782922, + "learning_rate": 4.999665585052314e-06, + "loss": 0.7142, + "step": 731 + }, + { + "epoch": 0.28925660656952334, + "grad_norm": 0.6530890960010839, + "learning_rate": 4.999663017634214e-06, + "loss": 0.6788, + "step": 732 + }, + { + "epoch": 0.2896517658681156, + "grad_norm": 0.6271597238579767, + "learning_rate": 4.999660440398957e-06, + "loss": 0.6949, + "step": 733 + }, + { + "epoch": 0.2900469251667078, + "grad_norm": 0.5695950102858286, + "learning_rate": 4.999657853346551e-06, + "loss": 0.6833, + "step": 734 + }, + { + "epoch": 0.29044208446530007, + "grad_norm": 0.6916391987946482, + "learning_rate": 4.999655256477006e-06, + "loss": 0.6961, + "step": 735 + }, + { + "epoch": 0.2908372437638923, + "grad_norm": 0.6074484081013837, + "learning_rate": 4.9996526497903345e-06, + "loss": 0.7025, + "step": 736 + }, + { + "epoch": 0.29123240306248455, + "grad_norm": 0.5356095341699845, + "learning_rate": 4.9996500332865445e-06, + "loss": 0.6576, + "step": 737 + }, + { + "epoch": 0.2916275623610768, + "grad_norm": 0.5270122623233058, + "learning_rate": 4.999647406965647e-06, + "loss": 0.6493, + "step": 738 + }, + { + "epoch": 0.292022721659669, + "grad_norm": 0.5844224564006655, + "learning_rate": 4.999644770827652e-06, + "loss": 0.7082, + "step": 739 + }, + { + "epoch": 0.2924178809582613, + "grad_norm": 0.5731795031934965, + "learning_rate": 4.999642124872571e-06, + "loss": 0.6796, + "step": 740 + }, + { + "epoch": 0.29281304025685356, + "grad_norm": 0.5464280422780536, + "learning_rate": 4.999639469100413e-06, + "loss": 0.6919, + "step": 741 + }, + { + "epoch": 0.2932081995554458, + "grad_norm": 0.5488021923081339, + "learning_rate": 4.99963680351119e-06, + "loss": 0.6991, + "step": 742 + }, + { + "epoch": 0.29360335885403804, + "grad_norm": 0.5999425296366285, + "learning_rate": 4.99963412810491e-06, + "loss": 0.6892, + "step": 743 + }, + { + "epoch": 0.2939985181526303, + "grad_norm": 0.5301599791894791, + "learning_rate": 4.999631442881586e-06, + "loss": 0.7089, + "step": 744 + }, + { + "epoch": 0.2943936774512225, + "grad_norm": 0.5563197028629331, + "learning_rate": 4.999628747841227e-06, + "loss": 0.6775, + "step": 745 + }, + { + "epoch": 0.29478883674981476, + "grad_norm": 0.5446511718424091, + "learning_rate": 4.999626042983845e-06, + "loss": 0.6963, + "step": 746 + }, + { + "epoch": 0.295183996048407, + "grad_norm": 0.5507487342171807, + "learning_rate": 4.9996233283094485e-06, + "loss": 0.6755, + "step": 747 + }, + { + "epoch": 0.29557915534699924, + "grad_norm": 0.5266055784439764, + "learning_rate": 4.999620603818051e-06, + "loss": 0.6929, + "step": 748 + }, + { + "epoch": 0.2959743146455915, + "grad_norm": 0.5626530509305415, + "learning_rate": 4.99961786950966e-06, + "loss": 0.6898, + "step": 749 + }, + { + "epoch": 0.2963694739441837, + "grad_norm": 0.5399861793258945, + "learning_rate": 4.999615125384289e-06, + "loss": 0.6801, + "step": 750 + }, + { + "epoch": 0.296764633242776, + "grad_norm": 0.5610645829088754, + "learning_rate": 4.999612371441947e-06, + "loss": 0.6709, + "step": 751 + }, + { + "epoch": 0.29715979254136826, + "grad_norm": 0.5394716972805861, + "learning_rate": 4.999609607682645e-06, + "loss": 0.6514, + "step": 752 + }, + { + "epoch": 0.2975549518399605, + "grad_norm": 0.5430055925792929, + "learning_rate": 4.999606834106395e-06, + "loss": 0.6849, + "step": 753 + }, + { + "epoch": 0.29795011113855274, + "grad_norm": 0.544962481828332, + "learning_rate": 4.999604050713207e-06, + "loss": 0.7015, + "step": 754 + }, + { + "epoch": 0.298345270437145, + "grad_norm": 0.5344852748367742, + "learning_rate": 4.999601257503093e-06, + "loss": 0.6949, + "step": 755 + }, + { + "epoch": 0.2987404297357372, + "grad_norm": 0.5719742664992333, + "learning_rate": 4.999598454476062e-06, + "loss": 0.6901, + "step": 756 + }, + { + "epoch": 0.29913558903432946, + "grad_norm": 0.584662964003699, + "learning_rate": 4.999595641632126e-06, + "loss": 0.7033, + "step": 757 + }, + { + "epoch": 0.2995307483329217, + "grad_norm": 0.552326717376869, + "learning_rate": 4.999592818971297e-06, + "loss": 0.6863, + "step": 758 + }, + { + "epoch": 0.29992590763151394, + "grad_norm": 0.5475655007217732, + "learning_rate": 4.999589986493585e-06, + "loss": 0.66, + "step": 759 + }, + { + "epoch": 0.3003210669301062, + "grad_norm": 0.5882240228280694, + "learning_rate": 4.999587144199001e-06, + "loss": 0.6962, + "step": 760 + }, + { + "epoch": 0.3007162262286984, + "grad_norm": 0.5699153356709309, + "learning_rate": 4.9995842920875575e-06, + "loss": 0.6803, + "step": 761 + }, + { + "epoch": 0.30111138552729066, + "grad_norm": 0.6280558268501941, + "learning_rate": 4.9995814301592635e-06, + "loss": 0.6793, + "step": 762 + }, + { + "epoch": 0.30150654482588296, + "grad_norm": 0.6650094210932691, + "learning_rate": 4.999578558414132e-06, + "loss": 0.6975, + "step": 763 + }, + { + "epoch": 0.3019017041244752, + "grad_norm": 0.5542990218134582, + "learning_rate": 4.999575676852174e-06, + "loss": 0.6897, + "step": 764 + }, + { + "epoch": 0.30229686342306744, + "grad_norm": 0.585799531977493, + "learning_rate": 4.9995727854734e-06, + "loss": 0.6721, + "step": 765 + }, + { + "epoch": 0.3026920227216597, + "grad_norm": 0.7770451245382635, + "learning_rate": 4.9995698842778225e-06, + "loss": 0.6981, + "step": 766 + }, + { + "epoch": 0.3030871820202519, + "grad_norm": 0.5471559718041984, + "learning_rate": 4.999566973265452e-06, + "loss": 0.6524, + "step": 767 + }, + { + "epoch": 0.30348234131884416, + "grad_norm": 0.5680704664062982, + "learning_rate": 4.999564052436301e-06, + "loss": 0.6481, + "step": 768 + }, + { + "epoch": 0.3038775006174364, + "grad_norm": 0.5983428486623749, + "learning_rate": 4.999561121790379e-06, + "loss": 0.6848, + "step": 769 + }, + { + "epoch": 0.30427265991602864, + "grad_norm": 0.5641224737953592, + "learning_rate": 4.9995581813276995e-06, + "loss": 0.6703, + "step": 770 + }, + { + "epoch": 0.3046678192146209, + "grad_norm": 0.5647226711303925, + "learning_rate": 4.999555231048273e-06, + "loss": 0.7158, + "step": 771 + }, + { + "epoch": 0.3050629785132131, + "grad_norm": 0.7023254822593299, + "learning_rate": 4.999552270952112e-06, + "loss": 0.684, + "step": 772 + }, + { + "epoch": 0.30545813781180536, + "grad_norm": 0.6146604133584318, + "learning_rate": 4.999549301039226e-06, + "loss": 0.7085, + "step": 773 + }, + { + "epoch": 0.3058532971103976, + "grad_norm": 0.562728967949529, + "learning_rate": 4.999546321309629e-06, + "loss": 0.6681, + "step": 774 + }, + { + "epoch": 0.3062484564089899, + "grad_norm": 1.4500324454053806, + "learning_rate": 4.999543331763332e-06, + "loss": 0.6917, + "step": 775 + }, + { + "epoch": 0.30664361570758214, + "grad_norm": 0.6576023968965049, + "learning_rate": 4.999540332400346e-06, + "loss": 0.6953, + "step": 776 + }, + { + "epoch": 0.3070387750061744, + "grad_norm": 0.5706530250137147, + "learning_rate": 4.999537323220684e-06, + "loss": 0.6794, + "step": 777 + }, + { + "epoch": 0.3074339343047666, + "grad_norm": 0.5428676827482191, + "learning_rate": 4.999534304224357e-06, + "loss": 0.6851, + "step": 778 + }, + { + "epoch": 0.30782909360335886, + "grad_norm": 0.5981739162493036, + "learning_rate": 4.999531275411377e-06, + "loss": 0.671, + "step": 779 + }, + { + "epoch": 0.3082242529019511, + "grad_norm": 0.6170758554603312, + "learning_rate": 4.999528236781757e-06, + "loss": 0.6957, + "step": 780 + }, + { + "epoch": 0.30861941220054334, + "grad_norm": 0.5823666001801212, + "learning_rate": 4.999525188335507e-06, + "loss": 0.6719, + "step": 781 + }, + { + "epoch": 0.3090145714991356, + "grad_norm": 0.5997874539667468, + "learning_rate": 4.99952213007264e-06, + "loss": 0.6917, + "step": 782 + }, + { + "epoch": 0.3094097307977278, + "grad_norm": 0.5821609633692683, + "learning_rate": 4.999519061993166e-06, + "loss": 0.6838, + "step": 783 + }, + { + "epoch": 0.30980489009632006, + "grad_norm": 0.5479337929157194, + "learning_rate": 4.999515984097101e-06, + "loss": 0.6837, + "step": 784 + }, + { + "epoch": 0.3102000493949123, + "grad_norm": 0.5937117790026203, + "learning_rate": 4.999512896384455e-06, + "loss": 0.6899, + "step": 785 + }, + { + "epoch": 0.31059520869350454, + "grad_norm": 0.5782838146122935, + "learning_rate": 4.99950979885524e-06, + "loss": 0.6802, + "step": 786 + }, + { + "epoch": 0.31099036799209684, + "grad_norm": 0.546006734321019, + "learning_rate": 4.999506691509467e-06, + "loss": 0.6776, + "step": 787 + }, + { + "epoch": 0.3113855272906891, + "grad_norm": 0.5621083736104908, + "learning_rate": 4.999503574347151e-06, + "loss": 0.679, + "step": 788 + }, + { + "epoch": 0.3117806865892813, + "grad_norm": 0.6018666658407275, + "learning_rate": 4.9995004473683016e-06, + "loss": 0.7095, + "step": 789 + }, + { + "epoch": 0.31217584588787356, + "grad_norm": 0.5665313992465836, + "learning_rate": 4.999497310572933e-06, + "loss": 0.6906, + "step": 790 + }, + { + "epoch": 0.3125710051864658, + "grad_norm": 0.5795816586087552, + "learning_rate": 4.999494163961057e-06, + "loss": 0.6792, + "step": 791 + }, + { + "epoch": 0.31296616448505804, + "grad_norm": 0.5868236665755461, + "learning_rate": 4.999491007532685e-06, + "loss": 0.6851, + "step": 792 + }, + { + "epoch": 0.3133613237836503, + "grad_norm": 0.5722398773037819, + "learning_rate": 4.999487841287831e-06, + "loss": 0.6667, + "step": 793 + }, + { + "epoch": 0.3137564830822425, + "grad_norm": 0.5946352610623089, + "learning_rate": 4.999484665226506e-06, + "loss": 0.6909, + "step": 794 + }, + { + "epoch": 0.31415164238083476, + "grad_norm": 1.2564866117315103, + "learning_rate": 4.999481479348722e-06, + "loss": 0.6852, + "step": 795 + }, + { + "epoch": 0.314546801679427, + "grad_norm": 0.5575268193016416, + "learning_rate": 4.999478283654494e-06, + "loss": 0.6742, + "step": 796 + }, + { + "epoch": 0.31494196097801924, + "grad_norm": 0.5545559884431172, + "learning_rate": 4.999475078143832e-06, + "loss": 0.6743, + "step": 797 + }, + { + "epoch": 0.3153371202766115, + "grad_norm": 0.5460260402768534, + "learning_rate": 4.999471862816751e-06, + "loss": 0.6682, + "step": 798 + }, + { + "epoch": 0.3157322795752038, + "grad_norm": 0.545727768192459, + "learning_rate": 4.9994686376732605e-06, + "loss": 0.675, + "step": 799 + }, + { + "epoch": 0.316127438873796, + "grad_norm": 0.5493756349564834, + "learning_rate": 4.999465402713376e-06, + "loss": 0.6785, + "step": 800 + }, + { + "epoch": 0.31652259817238826, + "grad_norm": 0.5521530157011729, + "learning_rate": 4.99946215793711e-06, + "loss": 0.6723, + "step": 801 + }, + { + "epoch": 0.3169177574709805, + "grad_norm": 0.6919194319875192, + "learning_rate": 4.999458903344473e-06, + "loss": 0.6801, + "step": 802 + }, + { + "epoch": 0.31731291676957274, + "grad_norm": 0.5635837786929486, + "learning_rate": 4.99945563893548e-06, + "loss": 0.6983, + "step": 803 + }, + { + "epoch": 0.317708076068165, + "grad_norm": 0.5577079529859765, + "learning_rate": 4.999452364710142e-06, + "loss": 0.6869, + "step": 804 + }, + { + "epoch": 0.3181032353667572, + "grad_norm": 0.522245723084247, + "learning_rate": 4.999449080668473e-06, + "loss": 0.6711, + "step": 805 + }, + { + "epoch": 0.31849839466534946, + "grad_norm": 0.5543341617543934, + "learning_rate": 4.999445786810486e-06, + "loss": 0.6528, + "step": 806 + }, + { + "epoch": 0.3188935539639417, + "grad_norm": 0.5404102043718793, + "learning_rate": 4.999442483136194e-06, + "loss": 0.6706, + "step": 807 + }, + { + "epoch": 0.31928871326253394, + "grad_norm": 0.5463105642491327, + "learning_rate": 4.99943916964561e-06, + "loss": 0.6675, + "step": 808 + }, + { + "epoch": 0.3196838725611262, + "grad_norm": 0.6177688829664617, + "learning_rate": 4.999435846338746e-06, + "loss": 0.6753, + "step": 809 + }, + { + "epoch": 0.3200790318597184, + "grad_norm": 0.5490308724780782, + "learning_rate": 4.999432513215616e-06, + "loss": 0.6751, + "step": 810 + }, + { + "epoch": 0.3204741911583107, + "grad_norm": 0.6090150943921979, + "learning_rate": 4.999429170276233e-06, + "loss": 0.6852, + "step": 811 + }, + { + "epoch": 0.32086935045690296, + "grad_norm": 0.5611702933211862, + "learning_rate": 4.99942581752061e-06, + "loss": 0.6857, + "step": 812 + }, + { + "epoch": 0.3212645097554952, + "grad_norm": 0.5471158991492043, + "learning_rate": 4.999422454948759e-06, + "loss": 0.668, + "step": 813 + }, + { + "epoch": 0.32165966905408744, + "grad_norm": 0.6040235246117969, + "learning_rate": 4.9994190825606955e-06, + "loss": 0.6738, + "step": 814 + }, + { + "epoch": 0.3220548283526797, + "grad_norm": 0.5461114975582116, + "learning_rate": 4.999415700356431e-06, + "loss": 0.6935, + "step": 815 + }, + { + "epoch": 0.3224499876512719, + "grad_norm": 0.5531585597793667, + "learning_rate": 4.99941230833598e-06, + "loss": 0.6964, + "step": 816 + }, + { + "epoch": 0.32284514694986416, + "grad_norm": 0.555832857990718, + "learning_rate": 4.999408906499355e-06, + "loss": 0.6878, + "step": 817 + }, + { + "epoch": 0.3232403062484564, + "grad_norm": 0.5159924465583967, + "learning_rate": 4.999405494846569e-06, + "loss": 0.6666, + "step": 818 + }, + { + "epoch": 0.32363546554704864, + "grad_norm": 0.5242278141632275, + "learning_rate": 4.999402073377637e-06, + "loss": 0.6556, + "step": 819 + }, + { + "epoch": 0.3240306248456409, + "grad_norm": 0.5485685565466117, + "learning_rate": 4.99939864209257e-06, + "loss": 0.6867, + "step": 820 + }, + { + "epoch": 0.3244257841442331, + "grad_norm": 0.5599138503166448, + "learning_rate": 4.999395200991384e-06, + "loss": 0.6741, + "step": 821 + }, + { + "epoch": 0.3248209434428254, + "grad_norm": 0.5314314737207193, + "learning_rate": 4.999391750074091e-06, + "loss": 0.6597, + "step": 822 + }, + { + "epoch": 0.32521610274141766, + "grad_norm": 0.5505001683410817, + "learning_rate": 4.999388289340705e-06, + "loss": 0.6734, + "step": 823 + }, + { + "epoch": 0.3256112620400099, + "grad_norm": 0.5483028368326526, + "learning_rate": 4.999384818791239e-06, + "loss": 0.6871, + "step": 824 + }, + { + "epoch": 0.32600642133860214, + "grad_norm": 0.5553659611188733, + "learning_rate": 4.9993813384257075e-06, + "loss": 0.6879, + "step": 825 + }, + { + "epoch": 0.3264015806371944, + "grad_norm": 0.5508245461247865, + "learning_rate": 4.999377848244123e-06, + "loss": 0.6608, + "step": 826 + }, + { + "epoch": 0.3267967399357866, + "grad_norm": 0.5468555275628424, + "learning_rate": 4.9993743482465015e-06, + "loss": 0.6885, + "step": 827 + }, + { + "epoch": 0.32719189923437886, + "grad_norm": 0.5518450089645637, + "learning_rate": 4.999370838432854e-06, + "loss": 0.6784, + "step": 828 + }, + { + "epoch": 0.3275870585329711, + "grad_norm": 0.5381677755962232, + "learning_rate": 4.999367318803197e-06, + "loss": 0.6813, + "step": 829 + }, + { + "epoch": 0.32798221783156334, + "grad_norm": 0.5400069304709476, + "learning_rate": 4.999363789357542e-06, + "loss": 0.6664, + "step": 830 + }, + { + "epoch": 0.3283773771301556, + "grad_norm": 0.6129541548516995, + "learning_rate": 4.999360250095903e-06, + "loss": 0.6813, + "step": 831 + }, + { + "epoch": 0.3287725364287478, + "grad_norm": 0.551379733965531, + "learning_rate": 4.999356701018295e-06, + "loss": 0.6698, + "step": 832 + }, + { + "epoch": 0.32916769572734006, + "grad_norm": 0.530243445703882, + "learning_rate": 4.999353142124732e-06, + "loss": 0.682, + "step": 833 + }, + { + "epoch": 0.32956285502593236, + "grad_norm": 0.5348358317715631, + "learning_rate": 4.999349573415227e-06, + "loss": 0.6489, + "step": 834 + }, + { + "epoch": 0.3299580143245246, + "grad_norm": 0.5547085181902088, + "learning_rate": 4.999345994889796e-06, + "loss": 0.6938, + "step": 835 + }, + { + "epoch": 0.33035317362311684, + "grad_norm": 0.5243281588616451, + "learning_rate": 4.99934240654845e-06, + "loss": 0.6671, + "step": 836 + }, + { + "epoch": 0.3307483329217091, + "grad_norm": 0.5493088801401717, + "learning_rate": 4.9993388083912054e-06, + "loss": 0.6706, + "step": 837 + }, + { + "epoch": 0.3311434922203013, + "grad_norm": 0.545645002523389, + "learning_rate": 4.999335200418076e-06, + "loss": 0.6901, + "step": 838 + }, + { + "epoch": 0.33153865151889356, + "grad_norm": 0.5613907384738281, + "learning_rate": 4.999331582629075e-06, + "loss": 0.6908, + "step": 839 + }, + { + "epoch": 0.3319338108174858, + "grad_norm": 0.6000984842384804, + "learning_rate": 4.999327955024217e-06, + "loss": 0.6828, + "step": 840 + }, + { + "epoch": 0.33232897011607804, + "grad_norm": 0.51885138747256, + "learning_rate": 4.9993243176035175e-06, + "loss": 0.6597, + "step": 841 + }, + { + "epoch": 0.3327241294146703, + "grad_norm": 0.6231011537404787, + "learning_rate": 4.999320670366989e-06, + "loss": 0.6571, + "step": 842 + }, + { + "epoch": 0.3331192887132625, + "grad_norm": 0.5214424709505693, + "learning_rate": 4.999317013314646e-06, + "loss": 0.673, + "step": 843 + }, + { + "epoch": 0.33351444801185476, + "grad_norm": 0.5410014959338921, + "learning_rate": 4.999313346446505e-06, + "loss": 0.6747, + "step": 844 + }, + { + "epoch": 0.333909607310447, + "grad_norm": 0.6300322367055629, + "learning_rate": 4.999309669762578e-06, + "loss": 0.6944, + "step": 845 + }, + { + "epoch": 0.3343047666090393, + "grad_norm": 0.6141171186975185, + "learning_rate": 4.99930598326288e-06, + "loss": 0.6746, + "step": 846 + }, + { + "epoch": 0.33469992590763153, + "grad_norm": 0.5592903856619154, + "learning_rate": 4.9993022869474264e-06, + "loss": 0.6846, + "step": 847 + }, + { + "epoch": 0.3350950852062238, + "grad_norm": 0.5914019521534136, + "learning_rate": 4.99929858081623e-06, + "loss": 0.6562, + "step": 848 + }, + { + "epoch": 0.335490244504816, + "grad_norm": 0.5556986261868628, + "learning_rate": 4.999294864869307e-06, + "loss": 0.695, + "step": 849 + }, + { + "epoch": 0.33588540380340826, + "grad_norm": 0.5361036550501209, + "learning_rate": 4.999291139106672e-06, + "loss": 0.6733, + "step": 850 + }, + { + "epoch": 0.3362805631020005, + "grad_norm": 0.5663681947389955, + "learning_rate": 4.9992874035283375e-06, + "loss": 0.6727, + "step": 851 + }, + { + "epoch": 0.33667572240059274, + "grad_norm": 0.5385125766471879, + "learning_rate": 4.999283658134322e-06, + "loss": 0.6717, + "step": 852 + }, + { + "epoch": 0.337070881699185, + "grad_norm": 0.58237023128874, + "learning_rate": 4.999279902924636e-06, + "loss": 0.6751, + "step": 853 + }, + { + "epoch": 0.3374660409977772, + "grad_norm": 0.6219007403124622, + "learning_rate": 4.999276137899297e-06, + "loss": 0.6515, + "step": 854 + }, + { + "epoch": 0.33786120029636946, + "grad_norm": 0.5581232382993533, + "learning_rate": 4.999272363058319e-06, + "loss": 0.6876, + "step": 855 + }, + { + "epoch": 0.3382563595949617, + "grad_norm": 0.5570097230753517, + "learning_rate": 4.999268578401717e-06, + "loss": 0.6696, + "step": 856 + }, + { + "epoch": 0.33865151889355394, + "grad_norm": 0.5502292291243254, + "learning_rate": 4.999264783929505e-06, + "loss": 0.6651, + "step": 857 + }, + { + "epoch": 0.33904667819214623, + "grad_norm": 0.5289187848980834, + "learning_rate": 4.999260979641699e-06, + "loss": 0.6586, + "step": 858 + }, + { + "epoch": 0.3394418374907385, + "grad_norm": 0.5677219337773182, + "learning_rate": 4.999257165538314e-06, + "loss": 0.6764, + "step": 859 + }, + { + "epoch": 0.3398369967893307, + "grad_norm": 0.6248356948027501, + "learning_rate": 4.999253341619363e-06, + "loss": 0.6685, + "step": 860 + }, + { + "epoch": 0.34023215608792295, + "grad_norm": 0.5245870998609313, + "learning_rate": 4.999249507884864e-06, + "loss": 0.6637, + "step": 861 + }, + { + "epoch": 0.3406273153865152, + "grad_norm": 0.5633998533768565, + "learning_rate": 4.9992456643348296e-06, + "loss": 0.6626, + "step": 862 + }, + { + "epoch": 0.34102247468510744, + "grad_norm": 0.6163731317461442, + "learning_rate": 4.999241810969276e-06, + "loss": 0.7031, + "step": 863 + }, + { + "epoch": 0.3414176339836997, + "grad_norm": 0.5319060256897468, + "learning_rate": 4.999237947788218e-06, + "loss": 0.6661, + "step": 864 + }, + { + "epoch": 0.3418127932822919, + "grad_norm": 0.5747234912624848, + "learning_rate": 4.999234074791673e-06, + "loss": 0.6637, + "step": 865 + }, + { + "epoch": 0.34220795258088416, + "grad_norm": 0.6217626123594077, + "learning_rate": 4.9992301919796515e-06, + "loss": 0.6956, + "step": 866 + }, + { + "epoch": 0.3426031118794764, + "grad_norm": 0.5291571135659683, + "learning_rate": 4.999226299352172e-06, + "loss": 0.6592, + "step": 867 + }, + { + "epoch": 0.34299827117806864, + "grad_norm": 0.5929800922433651, + "learning_rate": 4.99922239690925e-06, + "loss": 0.6758, + "step": 868 + }, + { + "epoch": 0.3433934304766609, + "grad_norm": 0.8263016280842782, + "learning_rate": 4.999218484650899e-06, + "loss": 0.6924, + "step": 869 + }, + { + "epoch": 0.3437885897752532, + "grad_norm": 0.5373424767279488, + "learning_rate": 4.999214562577137e-06, + "loss": 0.6778, + "step": 870 + }, + { + "epoch": 0.3441837490738454, + "grad_norm": 0.6161942738080999, + "learning_rate": 4.999210630687976e-06, + "loss": 0.6868, + "step": 871 + }, + { + "epoch": 0.34457890837243765, + "grad_norm": 0.5544247743850119, + "learning_rate": 4.999206688983435e-06, + "loss": 0.6472, + "step": 872 + }, + { + "epoch": 0.3449740676710299, + "grad_norm": 0.5436735454536356, + "learning_rate": 4.9992027374635265e-06, + "loss": 0.6681, + "step": 873 + }, + { + "epoch": 0.34536922696962213, + "grad_norm": 0.5870467900357642, + "learning_rate": 4.999198776128268e-06, + "loss": 0.6488, + "step": 874 + }, + { + "epoch": 0.3457643862682144, + "grad_norm": 0.5226483330016569, + "learning_rate": 4.999194804977674e-06, + "loss": 0.6474, + "step": 875 + }, + { + "epoch": 0.3461595455668066, + "grad_norm": 0.5480980063898397, + "learning_rate": 4.99919082401176e-06, + "loss": 0.6809, + "step": 876 + }, + { + "epoch": 0.34655470486539885, + "grad_norm": 0.5845327636648194, + "learning_rate": 4.999186833230542e-06, + "loss": 0.6461, + "step": 877 + }, + { + "epoch": 0.3469498641639911, + "grad_norm": 0.700267095825688, + "learning_rate": 4.999182832634036e-06, + "loss": 0.6864, + "step": 878 + }, + { + "epoch": 0.34734502346258334, + "grad_norm": 0.5704163766747798, + "learning_rate": 4.999178822222258e-06, + "loss": 0.6624, + "step": 879 + }, + { + "epoch": 0.3477401827611756, + "grad_norm": 0.5688194389353117, + "learning_rate": 4.999174801995222e-06, + "loss": 0.6597, + "step": 880 + }, + { + "epoch": 0.34813534205976787, + "grad_norm": 0.5210130312006719, + "learning_rate": 4.999170771952946e-06, + "loss": 0.6618, + "step": 881 + }, + { + "epoch": 0.3485305013583601, + "grad_norm": 0.5429969490828442, + "learning_rate": 4.999166732095445e-06, + "loss": 0.6677, + "step": 882 + }, + { + "epoch": 0.34892566065695235, + "grad_norm": 0.5679202548589564, + "learning_rate": 4.999162682422733e-06, + "loss": 0.6668, + "step": 883 + }, + { + "epoch": 0.3493208199555446, + "grad_norm": 0.5391777536262067, + "learning_rate": 4.999158622934829e-06, + "loss": 0.6704, + "step": 884 + }, + { + "epoch": 0.34971597925413683, + "grad_norm": 0.5622016243906001, + "learning_rate": 4.999154553631748e-06, + "loss": 0.6435, + "step": 885 + }, + { + "epoch": 0.3501111385527291, + "grad_norm": 0.5628428119759383, + "learning_rate": 4.999150474513504e-06, + "loss": 0.6618, + "step": 886 + }, + { + "epoch": 0.3505062978513213, + "grad_norm": 0.5303772562973009, + "learning_rate": 4.999146385580114e-06, + "loss": 0.6628, + "step": 887 + }, + { + "epoch": 0.35090145714991355, + "grad_norm": 0.5813782031762684, + "learning_rate": 4.999142286831596e-06, + "loss": 0.6876, + "step": 888 + }, + { + "epoch": 0.3512966164485058, + "grad_norm": 0.563823831771851, + "learning_rate": 4.999138178267965e-06, + "loss": 0.6604, + "step": 889 + }, + { + "epoch": 0.35169177574709803, + "grad_norm": 0.5253757499381441, + "learning_rate": 4.999134059889236e-06, + "loss": 0.6639, + "step": 890 + }, + { + "epoch": 0.3520869350456903, + "grad_norm": 0.5593726055000605, + "learning_rate": 4.9991299316954255e-06, + "loss": 0.6476, + "step": 891 + }, + { + "epoch": 0.3524820943442825, + "grad_norm": 0.5367006366591701, + "learning_rate": 4.9991257936865515e-06, + "loss": 0.6937, + "step": 892 + }, + { + "epoch": 0.3528772536428748, + "grad_norm": 0.5567610106868094, + "learning_rate": 4.999121645862628e-06, + "loss": 0.6731, + "step": 893 + }, + { + "epoch": 0.35327241294146705, + "grad_norm": 0.5324611058908338, + "learning_rate": 4.999117488223672e-06, + "loss": 0.6735, + "step": 894 + }, + { + "epoch": 0.3536675722400593, + "grad_norm": 0.845928580999717, + "learning_rate": 4.999113320769701e-06, + "loss": 0.6675, + "step": 895 + }, + { + "epoch": 0.35406273153865153, + "grad_norm": 0.5597594694408724, + "learning_rate": 4.999109143500729e-06, + "loss": 0.6982, + "step": 896 + }, + { + "epoch": 0.35445789083724377, + "grad_norm": 0.5742575981877034, + "learning_rate": 4.999104956416775e-06, + "loss": 0.6431, + "step": 897 + }, + { + "epoch": 0.354853050135836, + "grad_norm": 0.5446258761551195, + "learning_rate": 4.999100759517854e-06, + "loss": 0.6749, + "step": 898 + }, + { + "epoch": 0.35524820943442825, + "grad_norm": 0.5292659599253334, + "learning_rate": 4.999096552803983e-06, + "loss": 0.6775, + "step": 899 + }, + { + "epoch": 0.3556433687330205, + "grad_norm": 0.5292261917552322, + "learning_rate": 4.9990923362751776e-06, + "loss": 0.6716, + "step": 900 + }, + { + "epoch": 0.35603852803161273, + "grad_norm": 0.5470660498231109, + "learning_rate": 4.999088109931456e-06, + "loss": 0.6599, + "step": 901 + }, + { + "epoch": 0.356433687330205, + "grad_norm": 0.5338026521581444, + "learning_rate": 4.999083873772833e-06, + "loss": 0.6691, + "step": 902 + }, + { + "epoch": 0.3568288466287972, + "grad_norm": 0.5613881734474877, + "learning_rate": 4.9990796277993255e-06, + "loss": 0.6798, + "step": 903 + }, + { + "epoch": 0.35722400592738945, + "grad_norm": 0.5478029825120281, + "learning_rate": 4.999075372010952e-06, + "loss": 0.6623, + "step": 904 + }, + { + "epoch": 0.35761916522598175, + "grad_norm": 0.5165063149329737, + "learning_rate": 4.999071106407728e-06, + "loss": 0.661, + "step": 905 + }, + { + "epoch": 0.358014324524574, + "grad_norm": 0.5992216629583358, + "learning_rate": 4.999066830989669e-06, + "loss": 0.653, + "step": 906 + }, + { + "epoch": 0.35840948382316623, + "grad_norm": 0.5526267591056174, + "learning_rate": 4.999062545756794e-06, + "loss": 0.6657, + "step": 907 + }, + { + "epoch": 0.35880464312175847, + "grad_norm": 0.5231560128614834, + "learning_rate": 4.999058250709119e-06, + "loss": 0.6427, + "step": 908 + }, + { + "epoch": 0.3591998024203507, + "grad_norm": 0.5496671975565968, + "learning_rate": 4.99905394584666e-06, + "loss": 0.6728, + "step": 909 + }, + { + "epoch": 0.35959496171894295, + "grad_norm": 0.5814232320207219, + "learning_rate": 4.999049631169435e-06, + "loss": 0.6716, + "step": 910 + }, + { + "epoch": 0.3599901210175352, + "grad_norm": 0.5284531514261371, + "learning_rate": 4.99904530667746e-06, + "loss": 0.6784, + "step": 911 + }, + { + "epoch": 0.36038528031612743, + "grad_norm": 0.5830847990400584, + "learning_rate": 4.999040972370753e-06, + "loss": 0.6728, + "step": 912 + }, + { + "epoch": 0.3607804396147197, + "grad_norm": 0.5265753919458841, + "learning_rate": 4.999036628249331e-06, + "loss": 0.6958, + "step": 913 + }, + { + "epoch": 0.3611755989133119, + "grad_norm": 0.528588620481184, + "learning_rate": 4.99903227431321e-06, + "loss": 0.6548, + "step": 914 + }, + { + "epoch": 0.36157075821190415, + "grad_norm": 0.5182035242903082, + "learning_rate": 4.9990279105624076e-06, + "loss": 0.6598, + "step": 915 + }, + { + "epoch": 0.3619659175104964, + "grad_norm": 0.5286689369003267, + "learning_rate": 4.9990235369969406e-06, + "loss": 0.6872, + "step": 916 + }, + { + "epoch": 0.3623610768090887, + "grad_norm": 0.5386957143651905, + "learning_rate": 4.9990191536168274e-06, + "loss": 0.6491, + "step": 917 + }, + { + "epoch": 0.36275623610768093, + "grad_norm": 0.5514686370106001, + "learning_rate": 4.999014760422085e-06, + "loss": 0.6651, + "step": 918 + }, + { + "epoch": 0.36315139540627317, + "grad_norm": 0.5648373707016645, + "learning_rate": 4.999010357412729e-06, + "loss": 0.6731, + "step": 919 + }, + { + "epoch": 0.3635465547048654, + "grad_norm": 0.5152295186463809, + "learning_rate": 4.999005944588779e-06, + "loss": 0.6428, + "step": 920 + }, + { + "epoch": 0.36394171400345765, + "grad_norm": 0.5066022878135411, + "learning_rate": 4.9990015219502505e-06, + "loss": 0.658, + "step": 921 + }, + { + "epoch": 0.3643368733020499, + "grad_norm": 0.5428977376504636, + "learning_rate": 4.998997089497161e-06, + "loss": 0.6613, + "step": 922 + }, + { + "epoch": 0.36473203260064213, + "grad_norm": 0.5678413901240047, + "learning_rate": 4.998992647229529e-06, + "loss": 0.6737, + "step": 923 + }, + { + "epoch": 0.36512719189923437, + "grad_norm": 0.536401567098352, + "learning_rate": 4.9989881951473706e-06, + "loss": 0.652, + "step": 924 + }, + { + "epoch": 0.3655223511978266, + "grad_norm": 0.5768953101789457, + "learning_rate": 4.998983733250705e-06, + "loss": 0.6314, + "step": 925 + }, + { + "epoch": 0.36591751049641885, + "grad_norm": 0.5352996178968072, + "learning_rate": 4.998979261539548e-06, + "loss": 0.6683, + "step": 926 + }, + { + "epoch": 0.3663126697950111, + "grad_norm": 0.5174696386468567, + "learning_rate": 4.998974780013919e-06, + "loss": 0.6607, + "step": 927 + }, + { + "epoch": 0.36670782909360333, + "grad_norm": 0.6030535589781871, + "learning_rate": 4.998970288673833e-06, + "loss": 0.6578, + "step": 928 + }, + { + "epoch": 0.36710298839219563, + "grad_norm": 0.5500821850702614, + "learning_rate": 4.99896578751931e-06, + "loss": 0.6622, + "step": 929 + }, + { + "epoch": 0.36749814769078787, + "grad_norm": 0.5283486083712549, + "learning_rate": 4.998961276550367e-06, + "loss": 0.6518, + "step": 930 + }, + { + "epoch": 0.3678933069893801, + "grad_norm": 0.5663497776849602, + "learning_rate": 4.998956755767021e-06, + "loss": 0.6697, + "step": 931 + }, + { + "epoch": 0.36828846628797235, + "grad_norm": 0.5211239249646628, + "learning_rate": 4.998952225169291e-06, + "loss": 0.6658, + "step": 932 + }, + { + "epoch": 0.3686836255865646, + "grad_norm": 0.5375278433902325, + "learning_rate": 4.9989476847571935e-06, + "loss": 0.6672, + "step": 933 + }, + { + "epoch": 0.36907878488515683, + "grad_norm": 0.5125111796062831, + "learning_rate": 4.998943134530748e-06, + "loss": 0.6664, + "step": 934 + }, + { + "epoch": 0.36947394418374907, + "grad_norm": 0.7149723112166187, + "learning_rate": 4.9989385744899705e-06, + "loss": 0.6792, + "step": 935 + }, + { + "epoch": 0.3698691034823413, + "grad_norm": 0.5195495635576596, + "learning_rate": 4.99893400463488e-06, + "loss": 0.6424, + "step": 936 + }, + { + "epoch": 0.37026426278093355, + "grad_norm": 0.5261355467385814, + "learning_rate": 4.998929424965494e-06, + "loss": 0.6794, + "step": 937 + }, + { + "epoch": 0.3706594220795258, + "grad_norm": 0.5371603404887052, + "learning_rate": 4.99892483548183e-06, + "loss": 0.6556, + "step": 938 + }, + { + "epoch": 0.37105458137811803, + "grad_norm": 0.5288627513817326, + "learning_rate": 4.998920236183908e-06, + "loss": 0.6408, + "step": 939 + }, + { + "epoch": 0.37144974067671027, + "grad_norm": 0.5366598263301546, + "learning_rate": 4.998915627071743e-06, + "loss": 0.6745, + "step": 940 + }, + { + "epoch": 0.37184489997530257, + "grad_norm": 0.5209858322992073, + "learning_rate": 4.998911008145357e-06, + "loss": 0.6716, + "step": 941 + }, + { + "epoch": 0.3722400592738948, + "grad_norm": 0.5092473497788766, + "learning_rate": 4.998906379404764e-06, + "loss": 0.6588, + "step": 942 + }, + { + "epoch": 0.37263521857248705, + "grad_norm": 0.5301240404384194, + "learning_rate": 4.998901740849985e-06, + "loss": 0.6475, + "step": 943 + }, + { + "epoch": 0.3730303778710793, + "grad_norm": 0.5369766245496886, + "learning_rate": 4.998897092481037e-06, + "loss": 0.6704, + "step": 944 + }, + { + "epoch": 0.37342553716967153, + "grad_norm": 0.5677722223194425, + "learning_rate": 4.998892434297939e-06, + "loss": 0.6489, + "step": 945 + }, + { + "epoch": 0.37382069646826377, + "grad_norm": 0.5231100890683938, + "learning_rate": 4.998887766300708e-06, + "loss": 0.6746, + "step": 946 + }, + { + "epoch": 0.374215855766856, + "grad_norm": 0.5295898234845133, + "learning_rate": 4.998883088489365e-06, + "loss": 0.6814, + "step": 947 + }, + { + "epoch": 0.37461101506544825, + "grad_norm": 0.5785367347699937, + "learning_rate": 4.9988784008639254e-06, + "loss": 0.6677, + "step": 948 + }, + { + "epoch": 0.3750061743640405, + "grad_norm": 0.5052116039346995, + "learning_rate": 4.99887370342441e-06, + "loss": 0.6693, + "step": 949 + }, + { + "epoch": 0.37540133366263273, + "grad_norm": 0.5090885753544528, + "learning_rate": 4.998868996170835e-06, + "loss": 0.6308, + "step": 950 + }, + { + "epoch": 0.37579649296122497, + "grad_norm": 0.6095866408280952, + "learning_rate": 4.9988642791032205e-06, + "loss": 0.6728, + "step": 951 + }, + { + "epoch": 0.37619165225981727, + "grad_norm": 0.5542782397017351, + "learning_rate": 4.998859552221584e-06, + "loss": 0.6645, + "step": 952 + }, + { + "epoch": 0.3765868115584095, + "grad_norm": 0.6102273000867942, + "learning_rate": 4.9988548155259446e-06, + "loss": 0.6714, + "step": 953 + }, + { + "epoch": 0.37698197085700175, + "grad_norm": 0.5788528638282513, + "learning_rate": 4.998850069016321e-06, + "loss": 0.6676, + "step": 954 + }, + { + "epoch": 0.377377130155594, + "grad_norm": 0.5259169323825259, + "learning_rate": 4.998845312692732e-06, + "loss": 0.6705, + "step": 955 + }, + { + "epoch": 0.3777722894541862, + "grad_norm": 0.5396876703479768, + "learning_rate": 4.998840546555196e-06, + "loss": 0.6619, + "step": 956 + }, + { + "epoch": 0.37816744875277847, + "grad_norm": 0.5417300333578844, + "learning_rate": 4.9988357706037315e-06, + "loss": 0.675, + "step": 957 + }, + { + "epoch": 0.3785626080513707, + "grad_norm": 0.5212701698383503, + "learning_rate": 4.998830984838358e-06, + "loss": 0.6446, + "step": 958 + }, + { + "epoch": 0.37895776734996295, + "grad_norm": 0.5585604547421766, + "learning_rate": 4.9988261892590925e-06, + "loss": 0.6649, + "step": 959 + }, + { + "epoch": 0.3793529266485552, + "grad_norm": 0.5356740144146942, + "learning_rate": 4.998821383865956e-06, + "loss": 0.6612, + "step": 960 + }, + { + "epoch": 0.37974808594714743, + "grad_norm": 0.5389707866617842, + "learning_rate": 4.9988165686589665e-06, + "loss": 0.6612, + "step": 961 + }, + { + "epoch": 0.38014324524573967, + "grad_norm": 0.5470563785584632, + "learning_rate": 4.998811743638142e-06, + "loss": 0.6502, + "step": 962 + }, + { + "epoch": 0.3805384045443319, + "grad_norm": 0.5497560302772653, + "learning_rate": 4.998806908803504e-06, + "loss": 0.6656, + "step": 963 + }, + { + "epoch": 0.3809335638429242, + "grad_norm": 0.5504911983499313, + "learning_rate": 4.998802064155068e-06, + "loss": 0.6502, + "step": 964 + }, + { + "epoch": 0.38132872314151645, + "grad_norm": 0.5526354169398469, + "learning_rate": 4.998797209692856e-06, + "loss": 0.6807, + "step": 965 + }, + { + "epoch": 0.3817238824401087, + "grad_norm": 0.5608969389308869, + "learning_rate": 4.998792345416886e-06, + "loss": 0.6562, + "step": 966 + }, + { + "epoch": 0.3821190417387009, + "grad_norm": 0.5361562466884632, + "learning_rate": 4.998787471327177e-06, + "loss": 0.6781, + "step": 967 + }, + { + "epoch": 0.38251420103729317, + "grad_norm": 0.6165770215935255, + "learning_rate": 4.998782587423747e-06, + "loss": 0.668, + "step": 968 + }, + { + "epoch": 0.3829093603358854, + "grad_norm": 0.6202442629402494, + "learning_rate": 4.9987776937066175e-06, + "loss": 0.6746, + "step": 969 + }, + { + "epoch": 0.38330451963447765, + "grad_norm": 0.5192514425229897, + "learning_rate": 4.998772790175806e-06, + "loss": 0.6578, + "step": 970 + }, + { + "epoch": 0.3836996789330699, + "grad_norm": 0.621919978812942, + "learning_rate": 4.998767876831333e-06, + "loss": 0.6718, + "step": 971 + }, + { + "epoch": 0.38409483823166213, + "grad_norm": 0.5374983489076336, + "learning_rate": 4.998762953673216e-06, + "loss": 0.6818, + "step": 972 + }, + { + "epoch": 0.38448999753025437, + "grad_norm": 0.5336474258984356, + "learning_rate": 4.998758020701476e-06, + "loss": 0.6751, + "step": 973 + }, + { + "epoch": 0.3848851568288466, + "grad_norm": 0.5621510282866113, + "learning_rate": 4.998753077916132e-06, + "loss": 0.6844, + "step": 974 + }, + { + "epoch": 0.38528031612743885, + "grad_norm": 0.5441555376095901, + "learning_rate": 4.998748125317203e-06, + "loss": 0.6607, + "step": 975 + }, + { + "epoch": 0.38567547542603114, + "grad_norm": 0.5419600341782435, + "learning_rate": 4.998743162904709e-06, + "loss": 0.6722, + "step": 976 + }, + { + "epoch": 0.3860706347246234, + "grad_norm": 0.5739218284794436, + "learning_rate": 4.998738190678669e-06, + "loss": 0.6466, + "step": 977 + }, + { + "epoch": 0.3864657940232156, + "grad_norm": 0.5369050064214791, + "learning_rate": 4.998733208639103e-06, + "loss": 0.6476, + "step": 978 + }, + { + "epoch": 0.38686095332180787, + "grad_norm": 0.5377019972492644, + "learning_rate": 4.99872821678603e-06, + "loss": 0.6768, + "step": 979 + }, + { + "epoch": 0.3872561126204001, + "grad_norm": 0.6081827614560933, + "learning_rate": 4.99872321511947e-06, + "loss": 0.6694, + "step": 980 + }, + { + "epoch": 0.38765127191899235, + "grad_norm": 0.5640672539104963, + "learning_rate": 4.998718203639442e-06, + "loss": 0.6632, + "step": 981 + }, + { + "epoch": 0.3880464312175846, + "grad_norm": 0.5004089536671517, + "learning_rate": 4.998713182345967e-06, + "loss": 0.6379, + "step": 982 + }, + { + "epoch": 0.3884415905161768, + "grad_norm": 0.5836270463868607, + "learning_rate": 4.998708151239063e-06, + "loss": 0.658, + "step": 983 + }, + { + "epoch": 0.38883674981476907, + "grad_norm": 0.5251488931681385, + "learning_rate": 4.998703110318751e-06, + "loss": 0.696, + "step": 984 + }, + { + "epoch": 0.3892319091133613, + "grad_norm": 0.5242683309546348, + "learning_rate": 4.998698059585051e-06, + "loss": 0.6634, + "step": 985 + }, + { + "epoch": 0.38962706841195355, + "grad_norm": 0.5380044203359831, + "learning_rate": 4.998692999037982e-06, + "loss": 0.639, + "step": 986 + }, + { + "epoch": 0.3900222277105458, + "grad_norm": 0.5462572647473274, + "learning_rate": 4.998687928677563e-06, + "loss": 0.6645, + "step": 987 + }, + { + "epoch": 0.3904173870091381, + "grad_norm": 0.5464260530979489, + "learning_rate": 4.998682848503817e-06, + "loss": 0.6602, + "step": 988 + }, + { + "epoch": 0.3908125463077303, + "grad_norm": 0.535112702170672, + "learning_rate": 4.998677758516761e-06, + "loss": 0.6743, + "step": 989 + }, + { + "epoch": 0.39120770560632256, + "grad_norm": 0.5295319813464742, + "learning_rate": 4.998672658716416e-06, + "loss": 0.6655, + "step": 990 + }, + { + "epoch": 0.3916028649049148, + "grad_norm": 0.541811550848016, + "learning_rate": 4.998667549102803e-06, + "loss": 0.6758, + "step": 991 + }, + { + "epoch": 0.39199802420350705, + "grad_norm": 0.5451452325701546, + "learning_rate": 4.99866242967594e-06, + "loss": 0.6884, + "step": 992 + }, + { + "epoch": 0.3923931835020993, + "grad_norm": 0.5341028641904682, + "learning_rate": 4.998657300435849e-06, + "loss": 0.6585, + "step": 993 + }, + { + "epoch": 0.3927883428006915, + "grad_norm": 0.5871334739980254, + "learning_rate": 4.9986521613825486e-06, + "loss": 0.6512, + "step": 994 + }, + { + "epoch": 0.39318350209928377, + "grad_norm": 0.5236524401205415, + "learning_rate": 4.9986470125160605e-06, + "loss": 0.668, + "step": 995 + }, + { + "epoch": 0.393578661397876, + "grad_norm": 0.5213997123202212, + "learning_rate": 4.998641853836404e-06, + "loss": 0.6478, + "step": 996 + }, + { + "epoch": 0.39397382069646825, + "grad_norm": 0.5407197664544282, + "learning_rate": 4.9986366853436e-06, + "loss": 0.6936, + "step": 997 + }, + { + "epoch": 0.3943689799950605, + "grad_norm": 0.5280672011633306, + "learning_rate": 4.9986315070376675e-06, + "loss": 0.6498, + "step": 998 + }, + { + "epoch": 0.3947641392936527, + "grad_norm": 0.5204342792742052, + "learning_rate": 4.998626318918628e-06, + "loss": 0.6775, + "step": 999 + }, + { + "epoch": 0.395159298592245, + "grad_norm": 0.5029429193286309, + "learning_rate": 4.998621120986502e-06, + "loss": 0.6573, + "step": 1000 + }, + { + "epoch": 0.39555445789083726, + "grad_norm": 0.5297718137035743, + "learning_rate": 4.998615913241309e-06, + "loss": 0.6645, + "step": 1001 + }, + { + "epoch": 0.3959496171894295, + "grad_norm": 0.5082079772490311, + "learning_rate": 4.9986106956830705e-06, + "loss": 0.6609, + "step": 1002 + }, + { + "epoch": 0.39634477648802174, + "grad_norm": 0.5349674831031332, + "learning_rate": 4.998605468311805e-06, + "loss": 0.6563, + "step": 1003 + }, + { + "epoch": 0.396739935786614, + "grad_norm": 0.5733947545706467, + "learning_rate": 4.9986002311275365e-06, + "loss": 0.6363, + "step": 1004 + }, + { + "epoch": 0.3971350950852062, + "grad_norm": 0.5182307185611646, + "learning_rate": 4.9985949841302825e-06, + "loss": 0.6516, + "step": 1005 + }, + { + "epoch": 0.39753025438379846, + "grad_norm": 0.5062700575538346, + "learning_rate": 4.9985897273200645e-06, + "loss": 0.6529, + "step": 1006 + }, + { + "epoch": 0.3979254136823907, + "grad_norm": 0.5384297745518919, + "learning_rate": 4.998584460696904e-06, + "loss": 0.6737, + "step": 1007 + }, + { + "epoch": 0.39832057298098295, + "grad_norm": 0.5816595934297174, + "learning_rate": 4.99857918426082e-06, + "loss": 0.6739, + "step": 1008 + }, + { + "epoch": 0.3987157322795752, + "grad_norm": 0.5179842596066364, + "learning_rate": 4.998573898011835e-06, + "loss": 0.6551, + "step": 1009 + }, + { + "epoch": 0.3991108915781674, + "grad_norm": 0.5782261510920916, + "learning_rate": 4.998568601949968e-06, + "loss": 0.6653, + "step": 1010 + }, + { + "epoch": 0.39950605087675967, + "grad_norm": 0.5070377777043106, + "learning_rate": 4.998563296075241e-06, + "loss": 0.6637, + "step": 1011 + }, + { + "epoch": 0.39990121017535196, + "grad_norm": 0.5348184367082748, + "learning_rate": 4.998557980387675e-06, + "loss": 0.6573, + "step": 1012 + }, + { + "epoch": 0.4002963694739442, + "grad_norm": 0.5533134020078849, + "learning_rate": 4.99855265488729e-06, + "loss": 0.6513, + "step": 1013 + }, + { + "epoch": 0.40069152877253644, + "grad_norm": 0.5214559414090275, + "learning_rate": 4.998547319574108e-06, + "loss": 0.6607, + "step": 1014 + }, + { + "epoch": 0.4010866880711287, + "grad_norm": 0.5454794781961575, + "learning_rate": 4.998541974448149e-06, + "loss": 0.657, + "step": 1015 + }, + { + "epoch": 0.4014818473697209, + "grad_norm": 0.5323970053630921, + "learning_rate": 4.998536619509434e-06, + "loss": 0.6727, + "step": 1016 + }, + { + "epoch": 0.40187700666831316, + "grad_norm": 0.5677904629043107, + "learning_rate": 4.998531254757984e-06, + "loss": 0.6439, + "step": 1017 + }, + { + "epoch": 0.4022721659669054, + "grad_norm": 0.5203197455098383, + "learning_rate": 4.998525880193822e-06, + "loss": 0.6694, + "step": 1018 + }, + { + "epoch": 0.40266732526549764, + "grad_norm": 0.5251699628155582, + "learning_rate": 4.998520495816967e-06, + "loss": 0.6546, + "step": 1019 + }, + { + "epoch": 0.4030624845640899, + "grad_norm": 0.5539885023636764, + "learning_rate": 4.99851510162744e-06, + "loss": 0.6802, + "step": 1020 + }, + { + "epoch": 0.4034576438626821, + "grad_norm": 0.5328047319862138, + "learning_rate": 4.998509697625262e-06, + "loss": 0.65, + "step": 1021 + }, + { + "epoch": 0.40385280316127437, + "grad_norm": 0.5505034070242261, + "learning_rate": 4.998504283810457e-06, + "loss": 0.6507, + "step": 1022 + }, + { + "epoch": 0.40424796245986666, + "grad_norm": 0.571221870438257, + "learning_rate": 4.998498860183043e-06, + "loss": 0.6449, + "step": 1023 + }, + { + "epoch": 0.4046431217584589, + "grad_norm": 0.534352298409385, + "learning_rate": 4.998493426743044e-06, + "loss": 0.6806, + "step": 1024 + }, + { + "epoch": 0.40503828105705114, + "grad_norm": 0.5487946183182154, + "learning_rate": 4.9984879834904785e-06, + "loss": 0.6471, + "step": 1025 + }, + { + "epoch": 0.4054334403556434, + "grad_norm": 0.5244300620347428, + "learning_rate": 4.99848253042537e-06, + "loss": 0.6555, + "step": 1026 + }, + { + "epoch": 0.4058285996542356, + "grad_norm": 0.5408629309328332, + "learning_rate": 4.99847706754774e-06, + "loss": 0.6642, + "step": 1027 + }, + { + "epoch": 0.40622375895282786, + "grad_norm": 0.5333823797340622, + "learning_rate": 4.998471594857608e-06, + "loss": 0.6543, + "step": 1028 + }, + { + "epoch": 0.4066189182514201, + "grad_norm": 0.512514585835694, + "learning_rate": 4.998466112354998e-06, + "loss": 0.663, + "step": 1029 + }, + { + "epoch": 0.40701407755001234, + "grad_norm": 0.5066646847695025, + "learning_rate": 4.998460620039929e-06, + "loss": 0.6483, + "step": 1030 + }, + { + "epoch": 0.4074092368486046, + "grad_norm": 0.5408686381957716, + "learning_rate": 4.998455117912425e-06, + "loss": 0.6556, + "step": 1031 + }, + { + "epoch": 0.4078043961471968, + "grad_norm": 0.5118425141052046, + "learning_rate": 4.998449605972505e-06, + "loss": 0.6727, + "step": 1032 + }, + { + "epoch": 0.40819955544578906, + "grad_norm": 0.5596018207482388, + "learning_rate": 4.9984440842201935e-06, + "loss": 0.662, + "step": 1033 + }, + { + "epoch": 0.4085947147443813, + "grad_norm": 0.5081268236908408, + "learning_rate": 4.99843855265551e-06, + "loss": 0.6472, + "step": 1034 + }, + { + "epoch": 0.4089898740429736, + "grad_norm": 0.5208429105965913, + "learning_rate": 4.998433011278477e-06, + "loss": 0.6542, + "step": 1035 + }, + { + "epoch": 0.40938503334156584, + "grad_norm": 0.5476716325972726, + "learning_rate": 4.998427460089117e-06, + "loss": 0.6636, + "step": 1036 + }, + { + "epoch": 0.4097801926401581, + "grad_norm": 0.5581628717427036, + "learning_rate": 4.9984218990874504e-06, + "loss": 0.6688, + "step": 1037 + }, + { + "epoch": 0.4101753519387503, + "grad_norm": 0.5313422288525836, + "learning_rate": 4.9984163282734995e-06, + "loss": 0.6614, + "step": 1038 + }, + { + "epoch": 0.41057051123734256, + "grad_norm": 0.5152033372832628, + "learning_rate": 4.998410747647287e-06, + "loss": 0.6793, + "step": 1039 + }, + { + "epoch": 0.4109656705359348, + "grad_norm": 0.5352102398766864, + "learning_rate": 4.998405157208833e-06, + "loss": 0.6502, + "step": 1040 + }, + { + "epoch": 0.41136082983452704, + "grad_norm": 0.6112119061779702, + "learning_rate": 4.998399556958162e-06, + "loss": 0.6765, + "step": 1041 + }, + { + "epoch": 0.4117559891331193, + "grad_norm": 0.5012000160841078, + "learning_rate": 4.9983939468952945e-06, + "loss": 0.6602, + "step": 1042 + }, + { + "epoch": 0.4121511484317115, + "grad_norm": 0.5623209902661767, + "learning_rate": 4.9983883270202525e-06, + "loss": 0.6458, + "step": 1043 + }, + { + "epoch": 0.41254630773030376, + "grad_norm": 0.52453782255717, + "learning_rate": 4.998382697333058e-06, + "loss": 0.6542, + "step": 1044 + }, + { + "epoch": 0.412941467028896, + "grad_norm": 0.5822299190642851, + "learning_rate": 4.998377057833733e-06, + "loss": 0.679, + "step": 1045 + }, + { + "epoch": 0.41333662632748824, + "grad_norm": 0.5421354227434607, + "learning_rate": 4.998371408522302e-06, + "loss": 0.6717, + "step": 1046 + }, + { + "epoch": 0.41373178562608054, + "grad_norm": 0.5286757169293261, + "learning_rate": 4.998365749398783e-06, + "loss": 0.6711, + "step": 1047 + }, + { + "epoch": 0.4141269449246728, + "grad_norm": 0.5177607288874907, + "learning_rate": 4.9983600804632e-06, + "loss": 0.6686, + "step": 1048 + }, + { + "epoch": 0.414522104223265, + "grad_norm": 0.5543980251697767, + "learning_rate": 4.998354401715577e-06, + "loss": 0.6663, + "step": 1049 + }, + { + "epoch": 0.41491726352185726, + "grad_norm": 0.5459993812213396, + "learning_rate": 4.9983487131559354e-06, + "loss": 0.6407, + "step": 1050 + }, + { + "epoch": 0.4153124228204495, + "grad_norm": 0.5359069513585107, + "learning_rate": 4.998343014784296e-06, + "loss": 0.6575, + "step": 1051 + }, + { + "epoch": 0.41570758211904174, + "grad_norm": 0.5300909673600657, + "learning_rate": 4.998337306600683e-06, + "loss": 0.6667, + "step": 1052 + }, + { + "epoch": 0.416102741417634, + "grad_norm": 0.5392983958526001, + "learning_rate": 4.9983315886051185e-06, + "loss": 0.6721, + "step": 1053 + }, + { + "epoch": 0.4164979007162262, + "grad_norm": 0.5070297189799001, + "learning_rate": 4.998325860797624e-06, + "loss": 0.6561, + "step": 1054 + }, + { + "epoch": 0.41689306001481846, + "grad_norm": 0.5224307231811263, + "learning_rate": 4.998320123178223e-06, + "loss": 0.6618, + "step": 1055 + }, + { + "epoch": 0.4172882193134107, + "grad_norm": 0.604920198130287, + "learning_rate": 4.998314375746937e-06, + "loss": 0.6532, + "step": 1056 + }, + { + "epoch": 0.41768337861200294, + "grad_norm": 0.5168038782011498, + "learning_rate": 4.9983086185037896e-06, + "loss": 0.6651, + "step": 1057 + }, + { + "epoch": 0.4180785379105952, + "grad_norm": 0.5297744100346249, + "learning_rate": 4.998302851448803e-06, + "loss": 0.6608, + "step": 1058 + }, + { + "epoch": 0.4184736972091875, + "grad_norm": 0.5574740674977203, + "learning_rate": 4.998297074581999e-06, + "loss": 0.6881, + "step": 1059 + }, + { + "epoch": 0.4188688565077797, + "grad_norm": 0.521702687451242, + "learning_rate": 4.9982912879034025e-06, + "loss": 0.6839, + "step": 1060 + }, + { + "epoch": 0.41926401580637196, + "grad_norm": 0.5188090920413589, + "learning_rate": 4.9982854914130345e-06, + "loss": 0.6602, + "step": 1061 + }, + { + "epoch": 0.4196591751049642, + "grad_norm": 0.5270636307682602, + "learning_rate": 4.998279685110917e-06, + "loss": 0.6606, + "step": 1062 + }, + { + "epoch": 0.42005433440355644, + "grad_norm": 0.5366763635966227, + "learning_rate": 4.998273868997075e-06, + "loss": 0.6704, + "step": 1063 + }, + { + "epoch": 0.4204494937021487, + "grad_norm": 0.5031111535411266, + "learning_rate": 4.9982680430715305e-06, + "loss": 0.6747, + "step": 1064 + }, + { + "epoch": 0.4208446530007409, + "grad_norm": 0.5151445274947559, + "learning_rate": 4.998262207334306e-06, + "loss": 0.6645, + "step": 1065 + }, + { + "epoch": 0.42123981229933316, + "grad_norm": 0.5210208888874347, + "learning_rate": 4.998256361785424e-06, + "loss": 0.65, + "step": 1066 + }, + { + "epoch": 0.4216349715979254, + "grad_norm": 0.5169090685583989, + "learning_rate": 4.998250506424908e-06, + "loss": 0.6517, + "step": 1067 + }, + { + "epoch": 0.42203013089651764, + "grad_norm": 0.5413286895905355, + "learning_rate": 4.998244641252781e-06, + "loss": 0.6795, + "step": 1068 + }, + { + "epoch": 0.4224252901951099, + "grad_norm": 0.6433702861587849, + "learning_rate": 4.998238766269067e-06, + "loss": 0.6951, + "step": 1069 + }, + { + "epoch": 0.4228204494937021, + "grad_norm": 0.5023044231984531, + "learning_rate": 4.998232881473787e-06, + "loss": 0.6694, + "step": 1070 + }, + { + "epoch": 0.4232156087922944, + "grad_norm": 0.5994864572308269, + "learning_rate": 4.998226986866966e-06, + "loss": 0.6532, + "step": 1071 + }, + { + "epoch": 0.42361076809088666, + "grad_norm": 0.5126827044965871, + "learning_rate": 4.998221082448627e-06, + "loss": 0.6496, + "step": 1072 + }, + { + "epoch": 0.4240059273894789, + "grad_norm": 0.6035162881851404, + "learning_rate": 4.998215168218791e-06, + "loss": 0.6633, + "step": 1073 + }, + { + "epoch": 0.42440108668807114, + "grad_norm": 0.5296590133461134, + "learning_rate": 4.998209244177484e-06, + "loss": 0.6691, + "step": 1074 + }, + { + "epoch": 0.4247962459866634, + "grad_norm": 0.5435187950208628, + "learning_rate": 4.998203310324727e-06, + "loss": 0.6636, + "step": 1075 + }, + { + "epoch": 0.4251914052852556, + "grad_norm": 0.5931608594717878, + "learning_rate": 4.998197366660546e-06, + "loss": 0.6772, + "step": 1076 + }, + { + "epoch": 0.42558656458384786, + "grad_norm": 0.524768630466991, + "learning_rate": 4.9981914131849614e-06, + "loss": 0.657, + "step": 1077 + }, + { + "epoch": 0.4259817238824401, + "grad_norm": 0.5518814321211608, + "learning_rate": 4.998185449897999e-06, + "loss": 0.6483, + "step": 1078 + }, + { + "epoch": 0.42637688318103234, + "grad_norm": 0.536584834748668, + "learning_rate": 4.998179476799679e-06, + "loss": 0.6678, + "step": 1079 + }, + { + "epoch": 0.4267720424796246, + "grad_norm": 0.5621190263657815, + "learning_rate": 4.998173493890029e-06, + "loss": 0.6416, + "step": 1080 + }, + { + "epoch": 0.4271672017782168, + "grad_norm": 0.5755170038945299, + "learning_rate": 4.99816750116907e-06, + "loss": 0.6638, + "step": 1081 + }, + { + "epoch": 0.42756236107680906, + "grad_norm": 0.6441072902907758, + "learning_rate": 4.998161498636826e-06, + "loss": 0.6661, + "step": 1082 + }, + { + "epoch": 0.42795752037540136, + "grad_norm": 0.565628041779372, + "learning_rate": 4.998155486293321e-06, + "loss": 0.6417, + "step": 1083 + }, + { + "epoch": 0.4283526796739936, + "grad_norm": 1.8488838880474752, + "learning_rate": 4.9981494641385775e-06, + "loss": 0.6564, + "step": 1084 + }, + { + "epoch": 0.42874783897258584, + "grad_norm": 0.5160251022562632, + "learning_rate": 4.99814343217262e-06, + "loss": 0.6748, + "step": 1085 + }, + { + "epoch": 0.4291429982711781, + "grad_norm": 0.5216784182870835, + "learning_rate": 4.998137390395472e-06, + "loss": 0.6496, + "step": 1086 + }, + { + "epoch": 0.4295381575697703, + "grad_norm": 0.569161680789975, + "learning_rate": 4.998131338807158e-06, + "loss": 0.6629, + "step": 1087 + }, + { + "epoch": 0.42993331686836256, + "grad_norm": 0.5479677358649104, + "learning_rate": 4.9981252774077e-06, + "loss": 0.658, + "step": 1088 + }, + { + "epoch": 0.4303284761669548, + "grad_norm": 0.5267588064158593, + "learning_rate": 4.998119206197124e-06, + "loss": 0.646, + "step": 1089 + }, + { + "epoch": 0.43072363546554704, + "grad_norm": 0.6215218453962831, + "learning_rate": 4.9981131251754516e-06, + "loss": 0.6473, + "step": 1090 + }, + { + "epoch": 0.4311187947641393, + "grad_norm": 0.5829418685005663, + "learning_rate": 4.998107034342708e-06, + "loss": 0.6489, + "step": 1091 + }, + { + "epoch": 0.4315139540627315, + "grad_norm": 0.5445716063606678, + "learning_rate": 4.998100933698917e-06, + "loss": 0.6563, + "step": 1092 + }, + { + "epoch": 0.43190911336132376, + "grad_norm": 0.5717920439154024, + "learning_rate": 4.998094823244103e-06, + "loss": 0.6455, + "step": 1093 + }, + { + "epoch": 0.43230427265991606, + "grad_norm": 0.5194203985812613, + "learning_rate": 4.9980887029782895e-06, + "loss": 0.6441, + "step": 1094 + }, + { + "epoch": 0.4326994319585083, + "grad_norm": 0.5380215301586405, + "learning_rate": 4.9980825729015e-06, + "loss": 0.6592, + "step": 1095 + }, + { + "epoch": 0.43309459125710054, + "grad_norm": 0.5656627049153228, + "learning_rate": 4.998076433013758e-06, + "loss": 0.6538, + "step": 1096 + }, + { + "epoch": 0.4334897505556928, + "grad_norm": 0.5266370469773144, + "learning_rate": 4.998070283315091e-06, + "loss": 0.6631, + "step": 1097 + }, + { + "epoch": 0.433884909854285, + "grad_norm": 0.6484381198976229, + "learning_rate": 4.998064123805519e-06, + "loss": 0.6539, + "step": 1098 + }, + { + "epoch": 0.43428006915287726, + "grad_norm": 0.5474765769152128, + "learning_rate": 4.99805795448507e-06, + "loss": 0.6589, + "step": 1099 + }, + { + "epoch": 0.4346752284514695, + "grad_norm": 0.532857291701254, + "learning_rate": 4.998051775353764e-06, + "loss": 0.6592, + "step": 1100 + }, + { + "epoch": 0.43507038775006174, + "grad_norm": 0.58383601467703, + "learning_rate": 4.998045586411629e-06, + "loss": 0.6857, + "step": 1101 + }, + { + "epoch": 0.435465547048654, + "grad_norm": 0.5473181451591314, + "learning_rate": 4.998039387658686e-06, + "loss": 0.6637, + "step": 1102 + }, + { + "epoch": 0.4358607063472462, + "grad_norm": 0.5385435795744777, + "learning_rate": 4.998033179094963e-06, + "loss": 0.6259, + "step": 1103 + }, + { + "epoch": 0.43625586564583846, + "grad_norm": 0.5531552648985082, + "learning_rate": 4.998026960720483e-06, + "loss": 0.6586, + "step": 1104 + }, + { + "epoch": 0.4366510249444307, + "grad_norm": 0.528398105412856, + "learning_rate": 4.998020732535268e-06, + "loss": 0.6423, + "step": 1105 + }, + { + "epoch": 0.437046184243023, + "grad_norm": 0.5211440156999154, + "learning_rate": 4.998014494539345e-06, + "loss": 0.6316, + "step": 1106 + }, + { + "epoch": 0.43744134354161524, + "grad_norm": 0.547888131353762, + "learning_rate": 4.998008246732739e-06, + "loss": 0.6342, + "step": 1107 + }, + { + "epoch": 0.4378365028402075, + "grad_norm": 0.5557989460207667, + "learning_rate": 4.998001989115473e-06, + "loss": 0.6574, + "step": 1108 + }, + { + "epoch": 0.4382316621387997, + "grad_norm": 0.5500706961475933, + "learning_rate": 4.997995721687572e-06, + "loss": 0.6471, + "step": 1109 + }, + { + "epoch": 0.43862682143739196, + "grad_norm": 0.5504155370955836, + "learning_rate": 4.997989444449061e-06, + "loss": 0.6576, + "step": 1110 + }, + { + "epoch": 0.4390219807359842, + "grad_norm": 0.5547627447818888, + "learning_rate": 4.997983157399963e-06, + "loss": 0.6764, + "step": 1111 + }, + { + "epoch": 0.43941714003457644, + "grad_norm": 0.5804481448072446, + "learning_rate": 4.997976860540305e-06, + "loss": 0.6682, + "step": 1112 + }, + { + "epoch": 0.4398122993331687, + "grad_norm": 0.5397528058437261, + "learning_rate": 4.997970553870111e-06, + "loss": 0.6572, + "step": 1113 + }, + { + "epoch": 0.4402074586317609, + "grad_norm": 0.6196448584341162, + "learning_rate": 4.997964237389405e-06, + "loss": 0.658, + "step": 1114 + }, + { + "epoch": 0.44060261793035316, + "grad_norm": 0.6066819778157864, + "learning_rate": 4.997957911098212e-06, + "loss": 0.6799, + "step": 1115 + }, + { + "epoch": 0.4409977772289454, + "grad_norm": 0.5478230864583113, + "learning_rate": 4.997951574996558e-06, + "loss": 0.6656, + "step": 1116 + }, + { + "epoch": 0.44139293652753764, + "grad_norm": 0.6388534140119478, + "learning_rate": 4.997945229084467e-06, + "loss": 0.6733, + "step": 1117 + }, + { + "epoch": 0.44178809582612993, + "grad_norm": 0.5602979323078272, + "learning_rate": 4.997938873361964e-06, + "loss": 0.6499, + "step": 1118 + }, + { + "epoch": 0.4421832551247222, + "grad_norm": 0.6110344898706578, + "learning_rate": 4.997932507829073e-06, + "loss": 0.6486, + "step": 1119 + }, + { + "epoch": 0.4425784144233144, + "grad_norm": 0.5439975635095607, + "learning_rate": 4.997926132485821e-06, + "loss": 0.6615, + "step": 1120 + }, + { + "epoch": 0.44297357372190665, + "grad_norm": 0.5670773229357888, + "learning_rate": 4.9979197473322315e-06, + "loss": 0.6644, + "step": 1121 + }, + { + "epoch": 0.4433687330204989, + "grad_norm": 0.6135668184552048, + "learning_rate": 4.99791335236833e-06, + "loss": 0.6586, + "step": 1122 + }, + { + "epoch": 0.44376389231909114, + "grad_norm": 0.6934295929969644, + "learning_rate": 4.997906947594142e-06, + "loss": 0.6446, + "step": 1123 + }, + { + "epoch": 0.4441590516176834, + "grad_norm": 0.5521875793962009, + "learning_rate": 4.997900533009692e-06, + "loss": 0.6845, + "step": 1124 + }, + { + "epoch": 0.4445542109162756, + "grad_norm": 0.5802844349362782, + "learning_rate": 4.9978941086150055e-06, + "loss": 0.6313, + "step": 1125 + }, + { + "epoch": 0.44494937021486786, + "grad_norm": 0.5443010267154513, + "learning_rate": 4.997887674410108e-06, + "loss": 0.6602, + "step": 1126 + }, + { + "epoch": 0.4453445295134601, + "grad_norm": 0.5391556371847563, + "learning_rate": 4.997881230395024e-06, + "loss": 0.6605, + "step": 1127 + }, + { + "epoch": 0.44573968881205234, + "grad_norm": 0.6625220158972982, + "learning_rate": 4.99787477656978e-06, + "loss": 0.6627, + "step": 1128 + }, + { + "epoch": 0.4461348481106446, + "grad_norm": 0.5147363265798429, + "learning_rate": 4.9978683129344e-06, + "loss": 0.6382, + "step": 1129 + }, + { + "epoch": 0.4465300074092369, + "grad_norm": 0.5482662281477236, + "learning_rate": 4.99786183948891e-06, + "loss": 0.6558, + "step": 1130 + }, + { + "epoch": 0.4469251667078291, + "grad_norm": 0.5604183997920332, + "learning_rate": 4.997855356233337e-06, + "loss": 0.6517, + "step": 1131 + }, + { + "epoch": 0.44732032600642135, + "grad_norm": 0.5353748475279002, + "learning_rate": 4.997848863167703e-06, + "loss": 0.6341, + "step": 1132 + }, + { + "epoch": 0.4477154853050136, + "grad_norm": 0.5488398069149468, + "learning_rate": 4.997842360292036e-06, + "loss": 0.6676, + "step": 1133 + }, + { + "epoch": 0.44811064460360583, + "grad_norm": 0.5663070948772903, + "learning_rate": 4.997835847606361e-06, + "loss": 0.64, + "step": 1134 + }, + { + "epoch": 0.4485058039021981, + "grad_norm": 0.5403481201660154, + "learning_rate": 4.997829325110705e-06, + "loss": 0.6599, + "step": 1135 + }, + { + "epoch": 0.4489009632007903, + "grad_norm": 0.5222149042010814, + "learning_rate": 4.997822792805091e-06, + "loss": 0.6262, + "step": 1136 + }, + { + "epoch": 0.44929612249938256, + "grad_norm": 0.6013913868061602, + "learning_rate": 4.997816250689545e-06, + "loss": 0.6661, + "step": 1137 + }, + { + "epoch": 0.4496912817979748, + "grad_norm": 0.6787491243452577, + "learning_rate": 4.997809698764094e-06, + "loss": 0.653, + "step": 1138 + }, + { + "epoch": 0.45008644109656704, + "grad_norm": 0.6230119957241225, + "learning_rate": 4.997803137028764e-06, + "loss": 0.6444, + "step": 1139 + }, + { + "epoch": 0.4504816003951593, + "grad_norm": 0.5712264595241187, + "learning_rate": 4.9977965654835795e-06, + "loss": 0.6115, + "step": 1140 + }, + { + "epoch": 0.4508767596937515, + "grad_norm": 0.5074125803443317, + "learning_rate": 4.997789984128567e-06, + "loss": 0.6439, + "step": 1141 + }, + { + "epoch": 0.4512719189923438, + "grad_norm": 0.5525141812309801, + "learning_rate": 4.997783392963752e-06, + "loss": 0.6223, + "step": 1142 + }, + { + "epoch": 0.45166707829093605, + "grad_norm": 0.5882221184228417, + "learning_rate": 4.997776791989161e-06, + "loss": 0.6506, + "step": 1143 + }, + { + "epoch": 0.4520622375895283, + "grad_norm": 0.5063672919950463, + "learning_rate": 4.9977701812048185e-06, + "loss": 0.6345, + "step": 1144 + }, + { + "epoch": 0.45245739688812053, + "grad_norm": 0.5980686687629019, + "learning_rate": 4.997763560610752e-06, + "loss": 0.6694, + "step": 1145 + }, + { + "epoch": 0.4528525561867128, + "grad_norm": 0.5737769969500374, + "learning_rate": 4.997756930206987e-06, + "loss": 0.6576, + "step": 1146 + }, + { + "epoch": 0.453247715485305, + "grad_norm": 0.5254065661728536, + "learning_rate": 4.99775028999355e-06, + "loss": 0.6406, + "step": 1147 + }, + { + "epoch": 0.45364287478389725, + "grad_norm": 0.5556443229887936, + "learning_rate": 4.997743639970466e-06, + "loss": 0.6353, + "step": 1148 + }, + { + "epoch": 0.4540380340824895, + "grad_norm": 0.6029582429283775, + "learning_rate": 4.997736980137762e-06, + "loss": 0.6525, + "step": 1149 + }, + { + "epoch": 0.45443319338108173, + "grad_norm": 0.5498673723447964, + "learning_rate": 4.997730310495464e-06, + "loss": 0.6499, + "step": 1150 + }, + { + "epoch": 0.454828352679674, + "grad_norm": 0.5478846740663137, + "learning_rate": 4.997723631043597e-06, + "loss": 0.6598, + "step": 1151 + }, + { + "epoch": 0.4552235119782662, + "grad_norm": 0.5257136286019838, + "learning_rate": 4.997716941782189e-06, + "loss": 0.642, + "step": 1152 + }, + { + "epoch": 0.4556186712768585, + "grad_norm": 0.5545888280208816, + "learning_rate": 4.997710242711266e-06, + "loss": 0.6351, + "step": 1153 + }, + { + "epoch": 0.45601383057545075, + "grad_norm": 0.537711633021132, + "learning_rate": 4.997703533830853e-06, + "loss": 0.6504, + "step": 1154 + }, + { + "epoch": 0.456408989874043, + "grad_norm": 0.8090471632091599, + "learning_rate": 4.997696815140978e-06, + "loss": 0.6535, + "step": 1155 + }, + { + "epoch": 0.45680414917263523, + "grad_norm": 0.5180994899415312, + "learning_rate": 4.997690086641666e-06, + "loss": 0.6526, + "step": 1156 + }, + { + "epoch": 0.4571993084712275, + "grad_norm": 0.5504445838265242, + "learning_rate": 4.997683348332945e-06, + "loss": 0.6491, + "step": 1157 + }, + { + "epoch": 0.4575944677698197, + "grad_norm": 0.5189150345527932, + "learning_rate": 4.997676600214839e-06, + "loss": 0.6358, + "step": 1158 + }, + { + "epoch": 0.45798962706841195, + "grad_norm": 0.5120657160011092, + "learning_rate": 4.997669842287377e-06, + "loss": 0.6447, + "step": 1159 + }, + { + "epoch": 0.4583847863670042, + "grad_norm": 0.5051586925759178, + "learning_rate": 4.997663074550584e-06, + "loss": 0.6385, + "step": 1160 + }, + { + "epoch": 0.45877994566559643, + "grad_norm": 0.5168398913045842, + "learning_rate": 4.997656297004487e-06, + "loss": 0.6743, + "step": 1161 + }, + { + "epoch": 0.4591751049641887, + "grad_norm": 0.5043203559885473, + "learning_rate": 4.997649509649114e-06, + "loss": 0.6449, + "step": 1162 + }, + { + "epoch": 0.4595702642627809, + "grad_norm": 0.5459507585407142, + "learning_rate": 4.997642712484489e-06, + "loss": 0.6476, + "step": 1163 + }, + { + "epoch": 0.45996542356137315, + "grad_norm": 0.5245847133766435, + "learning_rate": 4.99763590551064e-06, + "loss": 0.6596, + "step": 1164 + }, + { + "epoch": 0.46036058285996545, + "grad_norm": 0.4986446564658458, + "learning_rate": 4.997629088727594e-06, + "loss": 0.6525, + "step": 1165 + }, + { + "epoch": 0.4607557421585577, + "grad_norm": 0.507187864739981, + "learning_rate": 4.997622262135379e-06, + "loss": 0.6522, + "step": 1166 + }, + { + "epoch": 0.46115090145714993, + "grad_norm": 0.5093539527497493, + "learning_rate": 4.997615425734019e-06, + "loss": 0.6483, + "step": 1167 + }, + { + "epoch": 0.46154606075574217, + "grad_norm": 0.5152474618083938, + "learning_rate": 4.997608579523543e-06, + "loss": 0.6435, + "step": 1168 + }, + { + "epoch": 0.4619412200543344, + "grad_norm": 0.5340544828015156, + "learning_rate": 4.997601723503977e-06, + "loss": 0.661, + "step": 1169 + }, + { + "epoch": 0.46233637935292665, + "grad_norm": 0.5035751988607956, + "learning_rate": 4.997594857675347e-06, + "loss": 0.6365, + "step": 1170 + }, + { + "epoch": 0.4627315386515189, + "grad_norm": 0.4881941716047279, + "learning_rate": 4.997587982037682e-06, + "loss": 0.6539, + "step": 1171 + }, + { + "epoch": 0.46312669795011113, + "grad_norm": 0.5200636790139908, + "learning_rate": 4.997581096591007e-06, + "loss": 0.66, + "step": 1172 + }, + { + "epoch": 0.4635218572487034, + "grad_norm": 0.575490383484796, + "learning_rate": 4.9975742013353515e-06, + "loss": 0.6833, + "step": 1173 + }, + { + "epoch": 0.4639170165472956, + "grad_norm": 0.5118745281561939, + "learning_rate": 4.99756729627074e-06, + "loss": 0.6538, + "step": 1174 + }, + { + "epoch": 0.46431217584588785, + "grad_norm": 0.5207712676596821, + "learning_rate": 4.997560381397201e-06, + "loss": 0.6575, + "step": 1175 + }, + { + "epoch": 0.4647073351444801, + "grad_norm": 0.5196060135878352, + "learning_rate": 4.997553456714762e-06, + "loss": 0.6606, + "step": 1176 + }, + { + "epoch": 0.4651024944430724, + "grad_norm": 0.4993473638807208, + "learning_rate": 4.997546522223449e-06, + "loss": 0.6473, + "step": 1177 + }, + { + "epoch": 0.46549765374166463, + "grad_norm": 0.6632587575022358, + "learning_rate": 4.99753957792329e-06, + "loss": 0.6479, + "step": 1178 + }, + { + "epoch": 0.46589281304025687, + "grad_norm": 0.5118246850008393, + "learning_rate": 4.997532623814312e-06, + "loss": 0.6508, + "step": 1179 + }, + { + "epoch": 0.4662879723388491, + "grad_norm": 0.5216848433144801, + "learning_rate": 4.997525659896543e-06, + "loss": 0.6507, + "step": 1180 + }, + { + "epoch": 0.46668313163744135, + "grad_norm": 0.5116873956882975, + "learning_rate": 4.99751868617001e-06, + "loss": 0.6321, + "step": 1181 + }, + { + "epoch": 0.4670782909360336, + "grad_norm": 0.5465288573676182, + "learning_rate": 4.997511702634739e-06, + "loss": 0.6723, + "step": 1182 + }, + { + "epoch": 0.46747345023462583, + "grad_norm": 0.5229836616121926, + "learning_rate": 4.997504709290759e-06, + "loss": 0.6572, + "step": 1183 + }, + { + "epoch": 0.46786860953321807, + "grad_norm": 0.5055148283070077, + "learning_rate": 4.997497706138098e-06, + "loss": 0.651, + "step": 1184 + }, + { + "epoch": 0.4682637688318103, + "grad_norm": 0.5073671397211292, + "learning_rate": 4.997490693176782e-06, + "loss": 0.6663, + "step": 1185 + }, + { + "epoch": 0.46865892813040255, + "grad_norm": 0.5187411381755869, + "learning_rate": 4.997483670406839e-06, + "loss": 0.6536, + "step": 1186 + }, + { + "epoch": 0.4690540874289948, + "grad_norm": 0.4896382189093635, + "learning_rate": 4.9974766378282964e-06, + "loss": 0.6376, + "step": 1187 + }, + { + "epoch": 0.46944924672758703, + "grad_norm": 0.5403722513951374, + "learning_rate": 4.997469595441182e-06, + "loss": 0.6384, + "step": 1188 + }, + { + "epoch": 0.46984440602617933, + "grad_norm": 0.5283649386398254, + "learning_rate": 4.9974625432455245e-06, + "loss": 0.6619, + "step": 1189 + }, + { + "epoch": 0.47023956532477157, + "grad_norm": 0.5145771307045552, + "learning_rate": 4.99745548124135e-06, + "loss": 0.6608, + "step": 1190 + }, + { + "epoch": 0.4706347246233638, + "grad_norm": 0.5088786133567975, + "learning_rate": 4.997448409428687e-06, + "loss": 0.652, + "step": 1191 + }, + { + "epoch": 0.47102988392195605, + "grad_norm": 0.5298568145169689, + "learning_rate": 4.997441327807563e-06, + "loss": 0.626, + "step": 1192 + }, + { + "epoch": 0.4714250432205483, + "grad_norm": 0.525542401872513, + "learning_rate": 4.997434236378006e-06, + "loss": 0.6535, + "step": 1193 + }, + { + "epoch": 0.47182020251914053, + "grad_norm": 0.5112712732618009, + "learning_rate": 4.997427135140045e-06, + "loss": 0.6463, + "step": 1194 + }, + { + "epoch": 0.47221536181773277, + "grad_norm": 0.5333131112367618, + "learning_rate": 4.997420024093705e-06, + "loss": 0.6479, + "step": 1195 + }, + { + "epoch": 0.472610521116325, + "grad_norm": 0.5314791312504851, + "learning_rate": 4.997412903239017e-06, + "loss": 0.6412, + "step": 1196 + }, + { + "epoch": 0.47300568041491725, + "grad_norm": 0.49879520819742884, + "learning_rate": 4.997405772576007e-06, + "loss": 0.6429, + "step": 1197 + }, + { + "epoch": 0.4734008397135095, + "grad_norm": 0.5301052895426869, + "learning_rate": 4.997398632104703e-06, + "loss": 0.662, + "step": 1198 + }, + { + "epoch": 0.47379599901210173, + "grad_norm": 0.5114526203823233, + "learning_rate": 4.997391481825135e-06, + "loss": 0.6488, + "step": 1199 + }, + { + "epoch": 0.47419115831069397, + "grad_norm": 0.7049934317889363, + "learning_rate": 4.9973843217373295e-06, + "loss": 0.6367, + "step": 1200 + }, + { + "epoch": 0.47458631760928627, + "grad_norm": 0.5070332705862394, + "learning_rate": 4.997377151841314e-06, + "loss": 0.6449, + "step": 1201 + }, + { + "epoch": 0.4749814769078785, + "grad_norm": 0.5056739665660747, + "learning_rate": 4.997369972137118e-06, + "loss": 0.6098, + "step": 1202 + }, + { + "epoch": 0.47537663620647075, + "grad_norm": 0.5196505706221262, + "learning_rate": 4.9973627826247695e-06, + "loss": 0.6412, + "step": 1203 + }, + { + "epoch": 0.475771795505063, + "grad_norm": 0.542890120550589, + "learning_rate": 4.997355583304297e-06, + "loss": 0.6798, + "step": 1204 + }, + { + "epoch": 0.47616695480365523, + "grad_norm": 0.5360546233685582, + "learning_rate": 4.997348374175727e-06, + "loss": 0.6472, + "step": 1205 + }, + { + "epoch": 0.47656211410224747, + "grad_norm": 0.5130309786882297, + "learning_rate": 4.997341155239089e-06, + "loss": 0.6628, + "step": 1206 + }, + { + "epoch": 0.4769572734008397, + "grad_norm": 0.5155911732166051, + "learning_rate": 4.997333926494412e-06, + "loss": 0.6589, + "step": 1207 + }, + { + "epoch": 0.47735243269943195, + "grad_norm": 0.5151851351284802, + "learning_rate": 4.997326687941724e-06, + "loss": 0.6426, + "step": 1208 + }, + { + "epoch": 0.4777475919980242, + "grad_norm": 0.5421565514451022, + "learning_rate": 4.997319439581053e-06, + "loss": 0.6286, + "step": 1209 + }, + { + "epoch": 0.47814275129661643, + "grad_norm": 0.5426134587047164, + "learning_rate": 4.997312181412428e-06, + "loss": 0.657, + "step": 1210 + }, + { + "epoch": 0.47853791059520867, + "grad_norm": 0.5155176144597221, + "learning_rate": 4.997304913435876e-06, + "loss": 0.6545, + "step": 1211 + }, + { + "epoch": 0.4789330698938009, + "grad_norm": 0.510072359951836, + "learning_rate": 4.997297635651428e-06, + "loss": 0.669, + "step": 1212 + }, + { + "epoch": 0.4793282291923932, + "grad_norm": 0.5446367757282072, + "learning_rate": 4.997290348059111e-06, + "loss": 0.6428, + "step": 1213 + }, + { + "epoch": 0.47972338849098545, + "grad_norm": 0.4942516591572794, + "learning_rate": 4.997283050658954e-06, + "loss": 0.6276, + "step": 1214 + }, + { + "epoch": 0.4801185477895777, + "grad_norm": 0.5495732208117189, + "learning_rate": 4.997275743450986e-06, + "loss": 0.6542, + "step": 1215 + }, + { + "epoch": 0.48051370708816993, + "grad_norm": 0.5299388692393306, + "learning_rate": 4.997268426435234e-06, + "loss": 0.6659, + "step": 1216 + }, + { + "epoch": 0.48090886638676217, + "grad_norm": 0.5053176342784106, + "learning_rate": 4.99726109961173e-06, + "loss": 0.6309, + "step": 1217 + }, + { + "epoch": 0.4813040256853544, + "grad_norm": 0.6098039985268289, + "learning_rate": 4.997253762980499e-06, + "loss": 0.667, + "step": 1218 + }, + { + "epoch": 0.48169918498394665, + "grad_norm": 0.5376800287875747, + "learning_rate": 4.9972464165415726e-06, + "loss": 0.6506, + "step": 1219 + }, + { + "epoch": 0.4820943442825389, + "grad_norm": 0.5696775883802918, + "learning_rate": 4.997239060294978e-06, + "loss": 0.6567, + "step": 1220 + }, + { + "epoch": 0.48248950358113113, + "grad_norm": 0.5370925937539444, + "learning_rate": 4.997231694240745e-06, + "loss": 0.6395, + "step": 1221 + }, + { + "epoch": 0.48288466287972337, + "grad_norm": 0.5518524860262873, + "learning_rate": 4.997224318378903e-06, + "loss": 0.6545, + "step": 1222 + }, + { + "epoch": 0.4832798221783156, + "grad_norm": 0.6934350958942014, + "learning_rate": 4.99721693270948e-06, + "loss": 0.6425, + "step": 1223 + }, + { + "epoch": 0.4836749814769079, + "grad_norm": 0.5319811886445309, + "learning_rate": 4.997209537232505e-06, + "loss": 0.6848, + "step": 1224 + }, + { + "epoch": 0.48407014077550015, + "grad_norm": 0.5133175351958729, + "learning_rate": 4.9972021319480065e-06, + "loss": 0.6422, + "step": 1225 + }, + { + "epoch": 0.4844653000740924, + "grad_norm": 0.515963610700411, + "learning_rate": 4.997194716856016e-06, + "loss": 0.6544, + "step": 1226 + }, + { + "epoch": 0.4848604593726846, + "grad_norm": 0.5152566753981194, + "learning_rate": 4.99718729195656e-06, + "loss": 0.6651, + "step": 1227 + }, + { + "epoch": 0.48525561867127687, + "grad_norm": 0.51916191994593, + "learning_rate": 4.997179857249669e-06, + "loss": 0.6539, + "step": 1228 + }, + { + "epoch": 0.4856507779698691, + "grad_norm": 0.550013608585077, + "learning_rate": 4.9971724127353725e-06, + "loss": 0.6719, + "step": 1229 + }, + { + "epoch": 0.48604593726846135, + "grad_norm": 0.5152452965696306, + "learning_rate": 4.997164958413698e-06, + "loss": 0.6558, + "step": 1230 + }, + { + "epoch": 0.4864410965670536, + "grad_norm": 0.5149055111216204, + "learning_rate": 4.997157494284677e-06, + "loss": 0.6652, + "step": 1231 + }, + { + "epoch": 0.48683625586564583, + "grad_norm": 0.5332858943526043, + "learning_rate": 4.997150020348337e-06, + "loss": 0.6394, + "step": 1232 + }, + { + "epoch": 0.48723141516423807, + "grad_norm": 0.56714602215336, + "learning_rate": 4.997142536604708e-06, + "loss": 0.6531, + "step": 1233 + }, + { + "epoch": 0.4876265744628303, + "grad_norm": 0.5545157033195995, + "learning_rate": 4.99713504305382e-06, + "loss": 0.6412, + "step": 1234 + }, + { + "epoch": 0.48802173376142255, + "grad_norm": 0.5587339548936653, + "learning_rate": 4.997127539695701e-06, + "loss": 0.6749, + "step": 1235 + }, + { + "epoch": 0.48841689306001485, + "grad_norm": 0.5115432181953685, + "learning_rate": 4.997120026530382e-06, + "loss": 0.6409, + "step": 1236 + }, + { + "epoch": 0.4888120523586071, + "grad_norm": 0.5464551003108359, + "learning_rate": 4.997112503557892e-06, + "loss": 0.6289, + "step": 1237 + }, + { + "epoch": 0.4892072116571993, + "grad_norm": 0.5486875395190994, + "learning_rate": 4.99710497077826e-06, + "loss": 0.6764, + "step": 1238 + }, + { + "epoch": 0.48960237095579157, + "grad_norm": 0.5032189382025016, + "learning_rate": 4.997097428191516e-06, + "loss": 0.658, + "step": 1239 + }, + { + "epoch": 0.4899975302543838, + "grad_norm": 0.6362327602346085, + "learning_rate": 4.99708987579769e-06, + "loss": 0.6591, + "step": 1240 + }, + { + "epoch": 0.49039268955297605, + "grad_norm": 0.5560505192991964, + "learning_rate": 4.9970823135968115e-06, + "loss": 0.6808, + "step": 1241 + }, + { + "epoch": 0.4907878488515683, + "grad_norm": 0.5180049644630822, + "learning_rate": 4.997074741588909e-06, + "loss": 0.6491, + "step": 1242 + }, + { + "epoch": 0.4911830081501605, + "grad_norm": 0.5090961137400876, + "learning_rate": 4.997067159774014e-06, + "loss": 0.6519, + "step": 1243 + }, + { + "epoch": 0.49157816744875277, + "grad_norm": 0.5358018751041574, + "learning_rate": 4.997059568152155e-06, + "loss": 0.6689, + "step": 1244 + }, + { + "epoch": 0.491973326747345, + "grad_norm": 0.5417084482265679, + "learning_rate": 4.997051966723363e-06, + "loss": 0.6412, + "step": 1245 + }, + { + "epoch": 0.49236848604593725, + "grad_norm": 0.524183120106596, + "learning_rate": 4.997044355487667e-06, + "loss": 0.6574, + "step": 1246 + }, + { + "epoch": 0.4927636453445295, + "grad_norm": 0.5104588248452702, + "learning_rate": 4.9970367344450966e-06, + "loss": 0.6336, + "step": 1247 + }, + { + "epoch": 0.4931588046431218, + "grad_norm": 0.5390658040510667, + "learning_rate": 4.997029103595682e-06, + "loss": 0.6575, + "step": 1248 + }, + { + "epoch": 0.493553963941714, + "grad_norm": 0.5085283720065422, + "learning_rate": 4.997021462939454e-06, + "loss": 0.644, + "step": 1249 + }, + { + "epoch": 0.49394912324030626, + "grad_norm": 0.5039511304830326, + "learning_rate": 4.997013812476442e-06, + "loss": 0.6587, + "step": 1250 + }, + { + "epoch": 0.4943442825388985, + "grad_norm": 0.530739785584054, + "learning_rate": 4.997006152206675e-06, + "loss": 0.6646, + "step": 1251 + }, + { + "epoch": 0.49473944183749075, + "grad_norm": 0.5419758659886152, + "learning_rate": 4.9969984821301855e-06, + "loss": 0.6505, + "step": 1252 + }, + { + "epoch": 0.495134601136083, + "grad_norm": 0.49355655865907666, + "learning_rate": 4.996990802247002e-06, + "loss": 0.6272, + "step": 1253 + }, + { + "epoch": 0.4955297604346752, + "grad_norm": 0.5263427398496581, + "learning_rate": 4.996983112557154e-06, + "loss": 0.6216, + "step": 1254 + }, + { + "epoch": 0.49592491973326747, + "grad_norm": 0.5361314380353814, + "learning_rate": 4.996975413060673e-06, + "loss": 0.6352, + "step": 1255 + }, + { + "epoch": 0.4963200790318597, + "grad_norm": 0.5697308230357576, + "learning_rate": 4.996967703757589e-06, + "loss": 0.6623, + "step": 1256 + }, + { + "epoch": 0.49671523833045195, + "grad_norm": 0.5281054703403194, + "learning_rate": 4.996959984647931e-06, + "loss": 0.6538, + "step": 1257 + }, + { + "epoch": 0.4971103976290442, + "grad_norm": 0.5251657598014784, + "learning_rate": 4.996952255731732e-06, + "loss": 0.6471, + "step": 1258 + }, + { + "epoch": 0.4975055569276364, + "grad_norm": 0.5279909788842021, + "learning_rate": 4.99694451700902e-06, + "loss": 0.6367, + "step": 1259 + }, + { + "epoch": 0.4979007162262287, + "grad_norm": 0.5181820037607305, + "learning_rate": 4.996936768479826e-06, + "loss": 0.661, + "step": 1260 + }, + { + "epoch": 0.49829587552482096, + "grad_norm": 0.526657102607614, + "learning_rate": 4.9969290101441815e-06, + "loss": 0.6481, + "step": 1261 + }, + { + "epoch": 0.4986910348234132, + "grad_norm": 0.5547575204328595, + "learning_rate": 4.996921242002115e-06, + "loss": 0.6392, + "step": 1262 + }, + { + "epoch": 0.49908619412200544, + "grad_norm": 0.5292550322998774, + "learning_rate": 4.996913464053659e-06, + "loss": 0.6462, + "step": 1263 + }, + { + "epoch": 0.4994813534205977, + "grad_norm": 0.5051213982225501, + "learning_rate": 4.996905676298843e-06, + "loss": 0.6243, + "step": 1264 + }, + { + "epoch": 0.4998765127191899, + "grad_norm": 0.5404121143898882, + "learning_rate": 4.996897878737697e-06, + "loss": 0.6508, + "step": 1265 + }, + { + "epoch": 0.5002716720177822, + "grad_norm": 0.5853244704579353, + "learning_rate": 4.996890071370253e-06, + "loss": 0.6692, + "step": 1266 + }, + { + "epoch": 0.5006668313163745, + "grad_norm": 0.49722504428159636, + "learning_rate": 4.99688225419654e-06, + "loss": 0.6241, + "step": 1267 + }, + { + "epoch": 0.5010619906149667, + "grad_norm": 0.5391014672758346, + "learning_rate": 4.996874427216591e-06, + "loss": 0.6517, + "step": 1268 + }, + { + "epoch": 0.5014571499135589, + "grad_norm": 0.5284804206913505, + "learning_rate": 4.996866590430435e-06, + "loss": 0.6511, + "step": 1269 + }, + { + "epoch": 0.5018523092121512, + "grad_norm": 0.5270981555373312, + "learning_rate": 4.996858743838103e-06, + "loss": 0.6426, + "step": 1270 + }, + { + "epoch": 0.5022474685107434, + "grad_norm": 0.5246251269006239, + "learning_rate": 4.996850887439626e-06, + "loss": 0.6558, + "step": 1271 + }, + { + "epoch": 0.5026426278093357, + "grad_norm": 0.5432048241827167, + "learning_rate": 4.996843021235035e-06, + "loss": 0.6398, + "step": 1272 + }, + { + "epoch": 0.5030377871079279, + "grad_norm": 0.5293220925783506, + "learning_rate": 4.9968351452243605e-06, + "loss": 0.6422, + "step": 1273 + }, + { + "epoch": 0.5034329464065201, + "grad_norm": 0.5133980649059295, + "learning_rate": 4.996827259407634e-06, + "loss": 0.6459, + "step": 1274 + }, + { + "epoch": 0.5038281057051124, + "grad_norm": 0.5382972682689461, + "learning_rate": 4.996819363784886e-06, + "loss": 0.6625, + "step": 1275 + }, + { + "epoch": 0.5042232650037046, + "grad_norm": 0.9692398606608912, + "learning_rate": 4.996811458356148e-06, + "loss": 0.6399, + "step": 1276 + }, + { + "epoch": 0.5046184243022969, + "grad_norm": 0.5168785866731923, + "learning_rate": 4.99680354312145e-06, + "loss": 0.6608, + "step": 1277 + }, + { + "epoch": 0.5050135836008891, + "grad_norm": 0.5770550825296323, + "learning_rate": 4.996795618080824e-06, + "loss": 0.626, + "step": 1278 + }, + { + "epoch": 0.5054087428994813, + "grad_norm": 0.5677109698383079, + "learning_rate": 4.996787683234302e-06, + "loss": 0.6436, + "step": 1279 + }, + { + "epoch": 0.5058039021980736, + "grad_norm": 0.5046529589482918, + "learning_rate": 4.9967797385819135e-06, + "loss": 0.632, + "step": 1280 + }, + { + "epoch": 0.5061990614966658, + "grad_norm": 0.5511822639718909, + "learning_rate": 4.99677178412369e-06, + "loss": 0.6504, + "step": 1281 + }, + { + "epoch": 0.5065942207952581, + "grad_norm": 0.5162230611570326, + "learning_rate": 4.996763819859663e-06, + "loss": 0.6545, + "step": 1282 + }, + { + "epoch": 0.5069893800938503, + "grad_norm": 0.5229128926155301, + "learning_rate": 4.996755845789865e-06, + "loss": 0.6812, + "step": 1283 + }, + { + "epoch": 0.5073845393924425, + "grad_norm": 0.5205567049268548, + "learning_rate": 4.9967478619143244e-06, + "loss": 0.6311, + "step": 1284 + }, + { + "epoch": 0.5077796986910348, + "grad_norm": 0.5104439487974587, + "learning_rate": 4.996739868233076e-06, + "loss": 0.6355, + "step": 1285 + }, + { + "epoch": 0.508174857989627, + "grad_norm": 0.5216314503599059, + "learning_rate": 4.996731864746148e-06, + "loss": 0.6466, + "step": 1286 + }, + { + "epoch": 0.5085700172882193, + "grad_norm": 0.6055026988137685, + "learning_rate": 4.9967238514535745e-06, + "loss": 0.6442, + "step": 1287 + }, + { + "epoch": 0.5089651765868115, + "grad_norm": 0.5106442297323203, + "learning_rate": 4.9967158283553856e-06, + "loss": 0.6419, + "step": 1288 + }, + { + "epoch": 0.5093603358854039, + "grad_norm": 0.5231952656265989, + "learning_rate": 4.996707795451612e-06, + "loss": 0.6562, + "step": 1289 + }, + { + "epoch": 0.5097554951839961, + "grad_norm": 0.5138942047359026, + "learning_rate": 4.996699752742287e-06, + "loss": 0.6576, + "step": 1290 + }, + { + "epoch": 0.5101506544825883, + "grad_norm": 0.5320229519854904, + "learning_rate": 4.996691700227441e-06, + "loss": 0.6552, + "step": 1291 + }, + { + "epoch": 0.5105458137811806, + "grad_norm": 0.523798873093951, + "learning_rate": 4.996683637907107e-06, + "loss": 0.657, + "step": 1292 + }, + { + "epoch": 0.5109409730797728, + "grad_norm": 0.4798254266039342, + "learning_rate": 4.996675565781315e-06, + "loss": 0.6393, + "step": 1293 + }, + { + "epoch": 0.5113361323783651, + "grad_norm": 0.493096648178815, + "learning_rate": 4.996667483850097e-06, + "loss": 0.6446, + "step": 1294 + }, + { + "epoch": 0.5117312916769573, + "grad_norm": 0.49595320560319645, + "learning_rate": 4.996659392113486e-06, + "loss": 0.6405, + "step": 1295 + }, + { + "epoch": 0.5121264509755495, + "grad_norm": 0.5364956991750699, + "learning_rate": 4.9966512905715135e-06, + "loss": 0.6527, + "step": 1296 + }, + { + "epoch": 0.5125216102741418, + "grad_norm": 0.5294308228967807, + "learning_rate": 4.99664317922421e-06, + "loss": 0.6481, + "step": 1297 + }, + { + "epoch": 0.512916769572734, + "grad_norm": 0.5414401474478188, + "learning_rate": 4.996635058071609e-06, + "loss": 0.6669, + "step": 1298 + }, + { + "epoch": 0.5133119288713263, + "grad_norm": 0.5036105503103926, + "learning_rate": 4.99662692711374e-06, + "loss": 0.6214, + "step": 1299 + }, + { + "epoch": 0.5137070881699185, + "grad_norm": 0.4980125974467828, + "learning_rate": 4.996618786350637e-06, + "loss": 0.6462, + "step": 1300 + }, + { + "epoch": 0.5141022474685107, + "grad_norm": 0.5214248630142702, + "learning_rate": 4.996610635782332e-06, + "loss": 0.6398, + "step": 1301 + }, + { + "epoch": 0.514497406767103, + "grad_norm": 0.500280326327944, + "learning_rate": 4.996602475408856e-06, + "loss": 0.6231, + "step": 1302 + }, + { + "epoch": 0.5148925660656952, + "grad_norm": 0.5289763112497039, + "learning_rate": 4.996594305230241e-06, + "loss": 0.6456, + "step": 1303 + }, + { + "epoch": 0.5152877253642875, + "grad_norm": 0.4963706186676594, + "learning_rate": 4.996586125246521e-06, + "loss": 0.6507, + "step": 1304 + }, + { + "epoch": 0.5156828846628797, + "grad_norm": 0.5107270935037411, + "learning_rate": 4.9965779354577254e-06, + "loss": 0.6505, + "step": 1305 + }, + { + "epoch": 0.5160780439614719, + "grad_norm": 0.5263648487154836, + "learning_rate": 4.996569735863888e-06, + "loss": 0.6441, + "step": 1306 + }, + { + "epoch": 0.5164732032600642, + "grad_norm": 0.5235885042012381, + "learning_rate": 4.9965615264650416e-06, + "loss": 0.6108, + "step": 1307 + }, + { + "epoch": 0.5168683625586564, + "grad_norm": 0.5141835232080435, + "learning_rate": 4.996553307261216e-06, + "loss": 0.6605, + "step": 1308 + }, + { + "epoch": 0.5172635218572487, + "grad_norm": 0.5388043396561775, + "learning_rate": 4.996545078252446e-06, + "loss": 0.6464, + "step": 1309 + }, + { + "epoch": 0.5176586811558409, + "grad_norm": 0.5082281116024677, + "learning_rate": 4.996536839438763e-06, + "loss": 0.6323, + "step": 1310 + }, + { + "epoch": 0.5180538404544331, + "grad_norm": 0.5635524885120492, + "learning_rate": 4.996528590820199e-06, + "loss": 0.6691, + "step": 1311 + }, + { + "epoch": 0.5184489997530254, + "grad_norm": 0.5229240130623062, + "learning_rate": 4.996520332396786e-06, + "loss": 0.6452, + "step": 1312 + }, + { + "epoch": 0.5188441590516177, + "grad_norm": 0.5812217140701571, + "learning_rate": 4.996512064168558e-06, + "loss": 0.6566, + "step": 1313 + }, + { + "epoch": 0.51923931835021, + "grad_norm": 0.5219118851284731, + "learning_rate": 4.996503786135546e-06, + "loss": 0.6663, + "step": 1314 + }, + { + "epoch": 0.5196344776488022, + "grad_norm": 0.507821654017062, + "learning_rate": 4.996495498297783e-06, + "loss": 0.6512, + "step": 1315 + }, + { + "epoch": 0.5200296369473945, + "grad_norm": 0.5307845396993616, + "learning_rate": 4.9964872006553025e-06, + "loss": 0.6269, + "step": 1316 + }, + { + "epoch": 0.5204247962459867, + "grad_norm": 0.5240077658361868, + "learning_rate": 4.996478893208135e-06, + "loss": 0.6816, + "step": 1317 + }, + { + "epoch": 0.5208199555445789, + "grad_norm": 0.5167433367959745, + "learning_rate": 4.996470575956316e-06, + "loss": 0.6555, + "step": 1318 + }, + { + "epoch": 0.5212151148431712, + "grad_norm": 0.5185457506960603, + "learning_rate": 4.996462248899876e-06, + "loss": 0.6492, + "step": 1319 + }, + { + "epoch": 0.5216102741417634, + "grad_norm": 0.5259749401931311, + "learning_rate": 4.9964539120388475e-06, + "loss": 0.6385, + "step": 1320 + }, + { + "epoch": 0.5220054334403557, + "grad_norm": 0.5363760379563364, + "learning_rate": 4.996445565373264e-06, + "loss": 0.6563, + "step": 1321 + }, + { + "epoch": 0.5224005927389479, + "grad_norm": 0.5118945344997703, + "learning_rate": 4.996437208903159e-06, + "loss": 0.6388, + "step": 1322 + }, + { + "epoch": 0.5227957520375401, + "grad_norm": 0.5105505561913978, + "learning_rate": 4.996428842628563e-06, + "loss": 0.6374, + "step": 1323 + }, + { + "epoch": 0.5231909113361324, + "grad_norm": 0.545025639173082, + "learning_rate": 4.996420466549512e-06, + "loss": 0.6304, + "step": 1324 + }, + { + "epoch": 0.5235860706347246, + "grad_norm": 0.5181486179482868, + "learning_rate": 4.996412080666036e-06, + "loss": 0.6607, + "step": 1325 + }, + { + "epoch": 0.5239812299333169, + "grad_norm": 0.5918376928214296, + "learning_rate": 4.99640368497817e-06, + "loss": 0.65, + "step": 1326 + }, + { + "epoch": 0.5243763892319091, + "grad_norm": 0.6440341218450398, + "learning_rate": 4.9963952794859475e-06, + "loss": 0.6314, + "step": 1327 + }, + { + "epoch": 0.5247715485305013, + "grad_norm": 0.506859989727924, + "learning_rate": 4.996386864189399e-06, + "loss": 0.6405, + "step": 1328 + }, + { + "epoch": 0.5251667078290936, + "grad_norm": 0.604693581937193, + "learning_rate": 4.9963784390885585e-06, + "loss": 0.6301, + "step": 1329 + }, + { + "epoch": 0.5255618671276858, + "grad_norm": 0.5388769958954528, + "learning_rate": 4.99637000418346e-06, + "loss": 0.6563, + "step": 1330 + }, + { + "epoch": 0.5259570264262781, + "grad_norm": 0.5525987392165671, + "learning_rate": 4.996361559474135e-06, + "loss": 0.6408, + "step": 1331 + }, + { + "epoch": 0.5263521857248703, + "grad_norm": 0.5438403662996905, + "learning_rate": 4.996353104960619e-06, + "loss": 0.6463, + "step": 1332 + }, + { + "epoch": 0.5267473450234625, + "grad_norm": 0.5205416563435727, + "learning_rate": 4.996344640642943e-06, + "loss": 0.6281, + "step": 1333 + }, + { + "epoch": 0.5271425043220548, + "grad_norm": 0.49717641720078637, + "learning_rate": 4.9963361665211404e-06, + "loss": 0.6438, + "step": 1334 + }, + { + "epoch": 0.527537663620647, + "grad_norm": 0.5720531214872092, + "learning_rate": 4.996327682595247e-06, + "loss": 0.6549, + "step": 1335 + }, + { + "epoch": 0.5279328229192393, + "grad_norm": 0.5083062127313356, + "learning_rate": 4.996319188865293e-06, + "loss": 0.6343, + "step": 1336 + }, + { + "epoch": 0.5283279822178316, + "grad_norm": 0.5326331393036251, + "learning_rate": 4.996310685331314e-06, + "loss": 0.6654, + "step": 1337 + }, + { + "epoch": 0.5287231415164239, + "grad_norm": 0.5263704388039211, + "learning_rate": 4.996302171993341e-06, + "loss": 0.6319, + "step": 1338 + }, + { + "epoch": 0.5291183008150161, + "grad_norm": 0.5004961835420784, + "learning_rate": 4.99629364885141e-06, + "loss": 0.6383, + "step": 1339 + }, + { + "epoch": 0.5295134601136083, + "grad_norm": 0.5474008702250441, + "learning_rate": 4.996285115905554e-06, + "loss": 0.644, + "step": 1340 + }, + { + "epoch": 0.5299086194122006, + "grad_norm": 0.5214317652571568, + "learning_rate": 4.996276573155805e-06, + "loss": 0.6278, + "step": 1341 + }, + { + "epoch": 0.5303037787107928, + "grad_norm": 0.5178793183599104, + "learning_rate": 4.996268020602198e-06, + "loss": 0.6527, + "step": 1342 + }, + { + "epoch": 0.5306989380093851, + "grad_norm": 0.5106695795609324, + "learning_rate": 4.9962594582447654e-06, + "loss": 0.6194, + "step": 1343 + }, + { + "epoch": 0.5310940973079773, + "grad_norm": 0.5553450350073675, + "learning_rate": 4.996250886083541e-06, + "loss": 0.6651, + "step": 1344 + }, + { + "epoch": 0.5314892566065695, + "grad_norm": 0.5020511547000661, + "learning_rate": 4.996242304118561e-06, + "loss": 0.6579, + "step": 1345 + }, + { + "epoch": 0.5318844159051618, + "grad_norm": 0.5056151563934147, + "learning_rate": 4.996233712349855e-06, + "loss": 0.6522, + "step": 1346 + }, + { + "epoch": 0.532279575203754, + "grad_norm": 0.5141576495869976, + "learning_rate": 4.996225110777459e-06, + "loss": 0.6233, + "step": 1347 + }, + { + "epoch": 0.5326747345023463, + "grad_norm": 0.5060024011484795, + "learning_rate": 4.996216499401408e-06, + "loss": 0.6496, + "step": 1348 + }, + { + "epoch": 0.5330698938009385, + "grad_norm": 0.513393743699311, + "learning_rate": 4.996207878221732e-06, + "loss": 0.6363, + "step": 1349 + }, + { + "epoch": 0.5334650530995307, + "grad_norm": 0.516451601423634, + "learning_rate": 4.9961992472384695e-06, + "loss": 0.6144, + "step": 1350 + }, + { + "epoch": 0.533860212398123, + "grad_norm": 0.543940796081213, + "learning_rate": 4.99619060645165e-06, + "loss": 0.6636, + "step": 1351 + }, + { + "epoch": 0.5342553716967152, + "grad_norm": 0.523839395723129, + "learning_rate": 4.996181955861311e-06, + "loss": 0.6522, + "step": 1352 + }, + { + "epoch": 0.5346505309953075, + "grad_norm": 0.5346665570634227, + "learning_rate": 4.996173295467485e-06, + "loss": 0.6247, + "step": 1353 + }, + { + "epoch": 0.5350456902938997, + "grad_norm": 0.5053515545286174, + "learning_rate": 4.996164625270206e-06, + "loss": 0.628, + "step": 1354 + }, + { + "epoch": 0.5354408495924919, + "grad_norm": 0.5385469662689469, + "learning_rate": 4.9961559452695075e-06, + "loss": 0.6403, + "step": 1355 + }, + { + "epoch": 0.5358360088910842, + "grad_norm": 0.530821893645594, + "learning_rate": 4.996147255465425e-06, + "loss": 0.6493, + "step": 1356 + }, + { + "epoch": 0.5362311681896764, + "grad_norm": 0.49520095478056697, + "learning_rate": 4.9961385558579915e-06, + "loss": 0.6391, + "step": 1357 + }, + { + "epoch": 0.5366263274882687, + "grad_norm": 0.5012636197893197, + "learning_rate": 4.996129846447241e-06, + "loss": 0.6265, + "step": 1358 + }, + { + "epoch": 0.5370214867868609, + "grad_norm": 0.5412008937924457, + "learning_rate": 4.996121127233209e-06, + "loss": 0.6583, + "step": 1359 + }, + { + "epoch": 0.5374166460854533, + "grad_norm": 0.51146837063799, + "learning_rate": 4.996112398215929e-06, + "loss": 0.6552, + "step": 1360 + }, + { + "epoch": 0.5378118053840455, + "grad_norm": 0.5164037939572933, + "learning_rate": 4.996103659395434e-06, + "loss": 0.6441, + "step": 1361 + }, + { + "epoch": 0.5382069646826377, + "grad_norm": 0.507221251387215, + "learning_rate": 4.99609491077176e-06, + "loss": 0.6183, + "step": 1362 + }, + { + "epoch": 0.53860212398123, + "grad_norm": 0.5110021327827016, + "learning_rate": 4.996086152344942e-06, + "loss": 0.6445, + "step": 1363 + }, + { + "epoch": 0.5389972832798222, + "grad_norm": 0.4950203261367757, + "learning_rate": 4.996077384115012e-06, + "loss": 0.6317, + "step": 1364 + }, + { + "epoch": 0.5393924425784145, + "grad_norm": 0.513880385000124, + "learning_rate": 4.9960686060820065e-06, + "loss": 0.6537, + "step": 1365 + }, + { + "epoch": 0.5397876018770067, + "grad_norm": 0.5047955828045061, + "learning_rate": 4.99605981824596e-06, + "loss": 0.6385, + "step": 1366 + }, + { + "epoch": 0.5401827611755989, + "grad_norm": 0.5444433922369591, + "learning_rate": 4.996051020606904e-06, + "loss": 0.6333, + "step": 1367 + }, + { + "epoch": 0.5405779204741912, + "grad_norm": 0.5391605933331952, + "learning_rate": 4.9960422131648765e-06, + "loss": 0.6553, + "step": 1368 + }, + { + "epoch": 0.5409730797727834, + "grad_norm": 0.5490109139703936, + "learning_rate": 4.996033395919911e-06, + "loss": 0.6352, + "step": 1369 + }, + { + "epoch": 0.5413682390713757, + "grad_norm": 0.5044155900096984, + "learning_rate": 4.996024568872042e-06, + "loss": 0.6328, + "step": 1370 + }, + { + "epoch": 0.5417633983699679, + "grad_norm": 0.5426896659207923, + "learning_rate": 4.9960157320213046e-06, + "loss": 0.6419, + "step": 1371 + }, + { + "epoch": 0.5421585576685601, + "grad_norm": 0.5475008282374658, + "learning_rate": 4.996006885367732e-06, + "loss": 0.661, + "step": 1372 + }, + { + "epoch": 0.5425537169671524, + "grad_norm": 0.50456290195716, + "learning_rate": 4.995998028911361e-06, + "loss": 0.6411, + "step": 1373 + }, + { + "epoch": 0.5429488762657446, + "grad_norm": 0.517404197086199, + "learning_rate": 4.995989162652224e-06, + "loss": 0.6244, + "step": 1374 + }, + { + "epoch": 0.5433440355643369, + "grad_norm": 0.52119175194665, + "learning_rate": 4.995980286590358e-06, + "loss": 0.6285, + "step": 1375 + }, + { + "epoch": 0.5437391948629291, + "grad_norm": 0.5535606068992925, + "learning_rate": 4.995971400725798e-06, + "loss": 0.6512, + "step": 1376 + }, + { + "epoch": 0.5441343541615213, + "grad_norm": 0.5096581348064602, + "learning_rate": 4.995962505058577e-06, + "loss": 0.6209, + "step": 1377 + }, + { + "epoch": 0.5445295134601136, + "grad_norm": 0.507954709115174, + "learning_rate": 4.995953599588731e-06, + "loss": 0.6347, + "step": 1378 + }, + { + "epoch": 0.5449246727587058, + "grad_norm": 0.5455172501120015, + "learning_rate": 4.995944684316295e-06, + "loss": 0.6544, + "step": 1379 + }, + { + "epoch": 0.5453198320572981, + "grad_norm": 0.5195045840736692, + "learning_rate": 4.995935759241304e-06, + "loss": 0.6308, + "step": 1380 + }, + { + "epoch": 0.5457149913558903, + "grad_norm": 0.5128980270231307, + "learning_rate": 4.995926824363793e-06, + "loss": 0.6451, + "step": 1381 + }, + { + "epoch": 0.5461101506544825, + "grad_norm": 0.5694454666753539, + "learning_rate": 4.995917879683796e-06, + "loss": 0.6544, + "step": 1382 + }, + { + "epoch": 0.5465053099530748, + "grad_norm": 0.5222383396969622, + "learning_rate": 4.995908925201351e-06, + "loss": 0.6215, + "step": 1383 + }, + { + "epoch": 0.5469004692516671, + "grad_norm": 0.5033825276170869, + "learning_rate": 4.995899960916489e-06, + "loss": 0.6485, + "step": 1384 + }, + { + "epoch": 0.5472956285502594, + "grad_norm": 0.5217250381657061, + "learning_rate": 4.9958909868292495e-06, + "loss": 0.6392, + "step": 1385 + }, + { + "epoch": 0.5476907878488516, + "grad_norm": 0.5275739257953245, + "learning_rate": 4.995882002939665e-06, + "loss": 0.6309, + "step": 1386 + }, + { + "epoch": 0.5480859471474439, + "grad_norm": 0.4888893706726107, + "learning_rate": 4.995873009247771e-06, + "loss": 0.6266, + "step": 1387 + }, + { + "epoch": 0.5484811064460361, + "grad_norm": 0.6091283696485257, + "learning_rate": 4.995864005753605e-06, + "loss": 0.6451, + "step": 1388 + }, + { + "epoch": 0.5488762657446283, + "grad_norm": 0.5292867693675212, + "learning_rate": 4.9958549924572e-06, + "loss": 0.6369, + "step": 1389 + }, + { + "epoch": 0.5492714250432206, + "grad_norm": 0.516530358022225, + "learning_rate": 4.995845969358591e-06, + "loss": 0.6427, + "step": 1390 + }, + { + "epoch": 0.5496665843418128, + "grad_norm": 0.5302839204130153, + "learning_rate": 4.995836936457816e-06, + "loss": 0.6581, + "step": 1391 + }, + { + "epoch": 0.550061743640405, + "grad_norm": 0.5392670526192246, + "learning_rate": 4.995827893754909e-06, + "loss": 0.6354, + "step": 1392 + }, + { + "epoch": 0.5504569029389973, + "grad_norm": 0.533934176126299, + "learning_rate": 4.995818841249905e-06, + "loss": 0.6514, + "step": 1393 + }, + { + "epoch": 0.5508520622375895, + "grad_norm": 0.5423235074943602, + "learning_rate": 4.99580977894284e-06, + "loss": 0.652, + "step": 1394 + }, + { + "epoch": 0.5512472215361818, + "grad_norm": 0.5548664924819643, + "learning_rate": 4.995800706833751e-06, + "loss": 0.6514, + "step": 1395 + }, + { + "epoch": 0.551642380834774, + "grad_norm": 0.5230139409187969, + "learning_rate": 4.995791624922671e-06, + "loss": 0.6449, + "step": 1396 + }, + { + "epoch": 0.5520375401333663, + "grad_norm": 0.4882535391793784, + "learning_rate": 4.995782533209638e-06, + "loss": 0.6171, + "step": 1397 + }, + { + "epoch": 0.5524326994319585, + "grad_norm": 0.5602125565323922, + "learning_rate": 4.995773431694686e-06, + "loss": 0.6535, + "step": 1398 + }, + { + "epoch": 0.5528278587305507, + "grad_norm": 0.5396171131081425, + "learning_rate": 4.995764320377852e-06, + "loss": 0.6597, + "step": 1399 + }, + { + "epoch": 0.553223018029143, + "grad_norm": 0.5233921034680511, + "learning_rate": 4.99575519925917e-06, + "loss": 0.6498, + "step": 1400 + }, + { + "epoch": 0.5536181773277352, + "grad_norm": 0.49235649050585056, + "learning_rate": 4.995746068338679e-06, + "loss": 0.6381, + "step": 1401 + }, + { + "epoch": 0.5540133366263275, + "grad_norm": 0.5307247825728258, + "learning_rate": 4.995736927616412e-06, + "loss": 0.6624, + "step": 1402 + }, + { + "epoch": 0.5544084959249197, + "grad_norm": 0.5423744236754792, + "learning_rate": 4.995727777092406e-06, + "loss": 0.6447, + "step": 1403 + }, + { + "epoch": 0.5548036552235119, + "grad_norm": 0.5784643067877503, + "learning_rate": 4.995718616766696e-06, + "loss": 0.661, + "step": 1404 + }, + { + "epoch": 0.5551988145221042, + "grad_norm": 0.5122057994466735, + "learning_rate": 4.995709446639319e-06, + "loss": 0.656, + "step": 1405 + }, + { + "epoch": 0.5555939738206964, + "grad_norm": 0.5174768644606775, + "learning_rate": 4.995700266710312e-06, + "loss": 0.6455, + "step": 1406 + }, + { + "epoch": 0.5559891331192887, + "grad_norm": 0.5496336917317313, + "learning_rate": 4.9956910769797085e-06, + "loss": 0.6422, + "step": 1407 + }, + { + "epoch": 0.556384292417881, + "grad_norm": 0.502618362332198, + "learning_rate": 4.9956818774475465e-06, + "loss": 0.6368, + "step": 1408 + }, + { + "epoch": 0.5567794517164733, + "grad_norm": 0.5691680887325605, + "learning_rate": 4.995672668113861e-06, + "loss": 0.6551, + "step": 1409 + }, + { + "epoch": 0.5571746110150655, + "grad_norm": 0.4970626341833687, + "learning_rate": 4.99566344897869e-06, + "loss": 0.6235, + "step": 1410 + }, + { + "epoch": 0.5575697703136577, + "grad_norm": 0.5189759440475544, + "learning_rate": 4.995654220042067e-06, + "loss": 0.6591, + "step": 1411 + }, + { + "epoch": 0.55796492961225, + "grad_norm": 0.5200276974925322, + "learning_rate": 4.99564498130403e-06, + "loss": 0.627, + "step": 1412 + }, + { + "epoch": 0.5583600889108422, + "grad_norm": 0.4968048227090689, + "learning_rate": 4.9956357327646155e-06, + "loss": 0.6456, + "step": 1413 + }, + { + "epoch": 0.5587552482094345, + "grad_norm": 0.5006447246560516, + "learning_rate": 4.995626474423859e-06, + "loss": 0.624, + "step": 1414 + }, + { + "epoch": 0.5591504075080267, + "grad_norm": 0.5408193413913738, + "learning_rate": 4.995617206281797e-06, + "loss": 0.6521, + "step": 1415 + }, + { + "epoch": 0.5595455668066189, + "grad_norm": 0.5102142633356043, + "learning_rate": 4.995607928338466e-06, + "loss": 0.6271, + "step": 1416 + }, + { + "epoch": 0.5599407261052112, + "grad_norm": 0.7566967338630805, + "learning_rate": 4.9955986405939025e-06, + "loss": 0.667, + "step": 1417 + }, + { + "epoch": 0.5603358854038034, + "grad_norm": 0.5002937738203951, + "learning_rate": 4.995589343048144e-06, + "loss": 0.5957, + "step": 1418 + }, + { + "epoch": 0.5607310447023957, + "grad_norm": 0.5307076787963949, + "learning_rate": 4.9955800357012245e-06, + "loss": 0.6303, + "step": 1419 + }, + { + "epoch": 0.5611262040009879, + "grad_norm": 0.5707224007581209, + "learning_rate": 4.995570718553182e-06, + "loss": 0.6358, + "step": 1420 + }, + { + "epoch": 0.5615213632995801, + "grad_norm": 0.5184302142265548, + "learning_rate": 4.995561391604054e-06, + "loss": 0.6429, + "step": 1421 + }, + { + "epoch": 0.5619165225981724, + "grad_norm": 0.5242472513502078, + "learning_rate": 4.995552054853876e-06, + "loss": 0.6352, + "step": 1422 + }, + { + "epoch": 0.5623116818967646, + "grad_norm": 0.5984667209493643, + "learning_rate": 4.995542708302684e-06, + "loss": 0.6449, + "step": 1423 + }, + { + "epoch": 0.5627068411953569, + "grad_norm": 0.5211234764982716, + "learning_rate": 4.995533351950517e-06, + "loss": 0.6336, + "step": 1424 + }, + { + "epoch": 0.5631020004939491, + "grad_norm": 0.5005552076190557, + "learning_rate": 4.9955239857974095e-06, + "loss": 0.6315, + "step": 1425 + }, + { + "epoch": 0.5634971597925413, + "grad_norm": 0.5156815083287939, + "learning_rate": 4.995514609843399e-06, + "loss": 0.6128, + "step": 1426 + }, + { + "epoch": 0.5638923190911336, + "grad_norm": 0.8025016430790596, + "learning_rate": 4.995505224088524e-06, + "loss": 0.6324, + "step": 1427 + }, + { + "epoch": 0.5642874783897258, + "grad_norm": 0.5143928596776913, + "learning_rate": 4.995495828532818e-06, + "loss": 0.6298, + "step": 1428 + }, + { + "epoch": 0.5646826376883181, + "grad_norm": 0.5420804076927369, + "learning_rate": 4.99548642317632e-06, + "loss": 0.6263, + "step": 1429 + }, + { + "epoch": 0.5650777969869103, + "grad_norm": 0.5455732492128239, + "learning_rate": 4.995477008019067e-06, + "loss": 0.6599, + "step": 1430 + }, + { + "epoch": 0.5654729562855026, + "grad_norm": 0.4976835769324547, + "learning_rate": 4.995467583061096e-06, + "loss": 0.6287, + "step": 1431 + }, + { + "epoch": 0.5658681155840949, + "grad_norm": 0.4891898546027407, + "learning_rate": 4.995458148302444e-06, + "loss": 0.6389, + "step": 1432 + }, + { + "epoch": 0.5662632748826871, + "grad_norm": 0.5330421535897775, + "learning_rate": 4.995448703743147e-06, + "loss": 0.6551, + "step": 1433 + }, + { + "epoch": 0.5666584341812794, + "grad_norm": 0.4928122884897478, + "learning_rate": 4.995439249383243e-06, + "loss": 0.6453, + "step": 1434 + }, + { + "epoch": 0.5670535934798716, + "grad_norm": 0.4982212210514846, + "learning_rate": 4.995429785222768e-06, + "loss": 0.6364, + "step": 1435 + }, + { + "epoch": 0.5674487527784638, + "grad_norm": 0.5090391879426845, + "learning_rate": 4.995420311261761e-06, + "loss": 0.6527, + "step": 1436 + }, + { + "epoch": 0.5678439120770561, + "grad_norm": 0.49179123014556764, + "learning_rate": 4.9954108275002586e-06, + "loss": 0.6198, + "step": 1437 + }, + { + "epoch": 0.5682390713756483, + "grad_norm": 0.49426838823272856, + "learning_rate": 4.9954013339382975e-06, + "loss": 0.6373, + "step": 1438 + }, + { + "epoch": 0.5686342306742406, + "grad_norm": 0.51127919787096, + "learning_rate": 4.995391830575915e-06, + "loss": 0.6447, + "step": 1439 + }, + { + "epoch": 0.5690293899728328, + "grad_norm": 0.5080627757706936, + "learning_rate": 4.995382317413149e-06, + "loss": 0.6239, + "step": 1440 + }, + { + "epoch": 0.569424549271425, + "grad_norm": 0.49636389856600227, + "learning_rate": 4.995372794450037e-06, + "loss": 0.6433, + "step": 1441 + }, + { + "epoch": 0.5698197085700173, + "grad_norm": 0.5337319531991955, + "learning_rate": 4.995363261686615e-06, + "loss": 0.6335, + "step": 1442 + }, + { + "epoch": 0.5702148678686095, + "grad_norm": 0.5283909447365106, + "learning_rate": 4.995353719122921e-06, + "loss": 0.6246, + "step": 1443 + }, + { + "epoch": 0.5706100271672018, + "grad_norm": 0.5050664542458486, + "learning_rate": 4.995344166758994e-06, + "loss": 0.6547, + "step": 1444 + }, + { + "epoch": 0.571005186465794, + "grad_norm": 0.47674622000290146, + "learning_rate": 4.99533460459487e-06, + "loss": 0.637, + "step": 1445 + }, + { + "epoch": 0.5714003457643863, + "grad_norm": 0.522395740013937, + "learning_rate": 4.995325032630588e-06, + "loss": 0.6294, + "step": 1446 + }, + { + "epoch": 0.5717955050629785, + "grad_norm": 0.5076861343641647, + "learning_rate": 4.995315450866183e-06, + "loss": 0.6478, + "step": 1447 + }, + { + "epoch": 0.5721906643615707, + "grad_norm": 0.5123340353219106, + "learning_rate": 4.995305859301695e-06, + "loss": 0.6324, + "step": 1448 + }, + { + "epoch": 0.572585823660163, + "grad_norm": 0.5116830386496904, + "learning_rate": 4.9952962579371595e-06, + "loss": 0.6541, + "step": 1449 + }, + { + "epoch": 0.5729809829587552, + "grad_norm": 0.5029055259684767, + "learning_rate": 4.995286646772616e-06, + "loss": 0.6411, + "step": 1450 + }, + { + "epoch": 0.5733761422573475, + "grad_norm": 0.54340913147798, + "learning_rate": 4.995277025808103e-06, + "loss": 0.6318, + "step": 1451 + }, + { + "epoch": 0.5737713015559397, + "grad_norm": 0.6008588857149761, + "learning_rate": 4.995267395043656e-06, + "loss": 0.6313, + "step": 1452 + }, + { + "epoch": 0.5741664608545319, + "grad_norm": 0.5046550392300445, + "learning_rate": 4.995257754479313e-06, + "loss": 0.6093, + "step": 1453 + }, + { + "epoch": 0.5745616201531242, + "grad_norm": 0.5075601586098186, + "learning_rate": 4.995248104115114e-06, + "loss": 0.6715, + "step": 1454 + }, + { + "epoch": 0.5749567794517165, + "grad_norm": 0.5175079376115269, + "learning_rate": 4.995238443951096e-06, + "loss": 0.6332, + "step": 1455 + }, + { + "epoch": 0.5753519387503088, + "grad_norm": 0.5251599308274112, + "learning_rate": 4.995228773987296e-06, + "loss": 0.6471, + "step": 1456 + }, + { + "epoch": 0.575747098048901, + "grad_norm": 0.5112664273121302, + "learning_rate": 4.995219094223753e-06, + "loss": 0.6487, + "step": 1457 + }, + { + "epoch": 0.5761422573474932, + "grad_norm": 0.5161124213557929, + "learning_rate": 4.9952094046605034e-06, + "loss": 0.6371, + "step": 1458 + }, + { + "epoch": 0.5765374166460855, + "grad_norm": 0.5122993630545928, + "learning_rate": 4.995199705297587e-06, + "loss": 0.6402, + "step": 1459 + }, + { + "epoch": 0.5769325759446777, + "grad_norm": 0.48934561452372555, + "learning_rate": 4.995189996135042e-06, + "loss": 0.6453, + "step": 1460 + }, + { + "epoch": 0.57732773524327, + "grad_norm": 0.5015789106369151, + "learning_rate": 4.995180277172905e-06, + "loss": 0.6405, + "step": 1461 + }, + { + "epoch": 0.5777228945418622, + "grad_norm": 0.4896116736893514, + "learning_rate": 4.995170548411215e-06, + "loss": 0.604, + "step": 1462 + }, + { + "epoch": 0.5781180538404544, + "grad_norm": 0.5056206160242199, + "learning_rate": 4.995160809850012e-06, + "loss": 0.6501, + "step": 1463 + }, + { + "epoch": 0.5785132131390467, + "grad_norm": 0.5271567630707523, + "learning_rate": 4.99515106148933e-06, + "loss": 0.6637, + "step": 1464 + }, + { + "epoch": 0.5789083724376389, + "grad_norm": 0.5161640907927971, + "learning_rate": 4.9951413033292115e-06, + "loss": 0.6575, + "step": 1465 + }, + { + "epoch": 0.5793035317362312, + "grad_norm": 0.5366526893281445, + "learning_rate": 4.995131535369693e-06, + "loss": 0.6442, + "step": 1466 + }, + { + "epoch": 0.5796986910348234, + "grad_norm": 0.478707795838876, + "learning_rate": 4.995121757610812e-06, + "loss": 0.6264, + "step": 1467 + }, + { + "epoch": 0.5800938503334157, + "grad_norm": 0.5109398474629543, + "learning_rate": 4.995111970052608e-06, + "loss": 0.6432, + "step": 1468 + }, + { + "epoch": 0.5804890096320079, + "grad_norm": 0.5029075819574539, + "learning_rate": 4.99510217269512e-06, + "loss": 0.641, + "step": 1469 + }, + { + "epoch": 0.5808841689306001, + "grad_norm": 0.5123704050877957, + "learning_rate": 4.995092365538385e-06, + "loss": 0.6311, + "step": 1470 + }, + { + "epoch": 0.5812793282291924, + "grad_norm": 0.5232609805684696, + "learning_rate": 4.995082548582443e-06, + "loss": 0.6722, + "step": 1471 + }, + { + "epoch": 0.5816744875277846, + "grad_norm": 0.5140040836372506, + "learning_rate": 4.995072721827331e-06, + "loss": 0.6263, + "step": 1472 + }, + { + "epoch": 0.5820696468263769, + "grad_norm": 0.5133765736650968, + "learning_rate": 4.995062885273089e-06, + "loss": 0.6297, + "step": 1473 + }, + { + "epoch": 0.5824648061249691, + "grad_norm": 0.4941943476821983, + "learning_rate": 4.995053038919755e-06, + "loss": 0.6439, + "step": 1474 + }, + { + "epoch": 0.5828599654235613, + "grad_norm": 0.49939101156839866, + "learning_rate": 4.9950431827673676e-06, + "loss": 0.6221, + "step": 1475 + }, + { + "epoch": 0.5832551247221536, + "grad_norm": 0.5284234536221161, + "learning_rate": 4.995033316815966e-06, + "loss": 0.6466, + "step": 1476 + }, + { + "epoch": 0.5836502840207458, + "grad_norm": 0.5232984307977716, + "learning_rate": 4.9950234410655886e-06, + "loss": 0.6657, + "step": 1477 + }, + { + "epoch": 0.584045443319338, + "grad_norm": 0.49111166592428446, + "learning_rate": 4.995013555516274e-06, + "loss": 0.6372, + "step": 1478 + }, + { + "epoch": 0.5844406026179304, + "grad_norm": 0.49817946819904, + "learning_rate": 4.99500366016806e-06, + "loss": 0.6564, + "step": 1479 + }, + { + "epoch": 0.5848357619165226, + "grad_norm": 0.5045229504599369, + "learning_rate": 4.994993755020989e-06, + "loss": 0.6236, + "step": 1480 + }, + { + "epoch": 0.5852309212151149, + "grad_norm": 0.5137046231454804, + "learning_rate": 4.994983840075096e-06, + "loss": 0.638, + "step": 1481 + }, + { + "epoch": 0.5856260805137071, + "grad_norm": 0.5090356242899768, + "learning_rate": 4.9949739153304224e-06, + "loss": 0.607, + "step": 1482 + }, + { + "epoch": 0.5860212398122994, + "grad_norm": 0.5086542750896299, + "learning_rate": 4.994963980787005e-06, + "loss": 0.6137, + "step": 1483 + }, + { + "epoch": 0.5864163991108916, + "grad_norm": 0.5359161859801612, + "learning_rate": 4.994954036444886e-06, + "loss": 0.6126, + "step": 1484 + }, + { + "epoch": 0.5868115584094838, + "grad_norm": 0.507773612302792, + "learning_rate": 4.994944082304102e-06, + "loss": 0.6161, + "step": 1485 + }, + { + "epoch": 0.5872067177080761, + "grad_norm": 0.4931028875320953, + "learning_rate": 4.9949341183646914e-06, + "loss": 0.6326, + "step": 1486 + }, + { + "epoch": 0.5876018770066683, + "grad_norm": 0.5370606115690266, + "learning_rate": 4.994924144626695e-06, + "loss": 0.6421, + "step": 1487 + }, + { + "epoch": 0.5879970363052606, + "grad_norm": 0.5028578672986176, + "learning_rate": 4.994914161090152e-06, + "loss": 0.6482, + "step": 1488 + }, + { + "epoch": 0.5883921956038528, + "grad_norm": 0.49503149805703456, + "learning_rate": 4.994904167755102e-06, + "loss": 0.6424, + "step": 1489 + }, + { + "epoch": 0.588787354902445, + "grad_norm": 0.52378156821829, + "learning_rate": 4.994894164621581e-06, + "loss": 0.6372, + "step": 1490 + }, + { + "epoch": 0.5891825142010373, + "grad_norm": 0.5049018438428512, + "learning_rate": 4.994884151689633e-06, + "loss": 0.6026, + "step": 1491 + }, + { + "epoch": 0.5895776734996295, + "grad_norm": 0.5056297740574257, + "learning_rate": 4.994874128959294e-06, + "loss": 0.6439, + "step": 1492 + }, + { + "epoch": 0.5899728327982218, + "grad_norm": 0.5059061924424931, + "learning_rate": 4.994864096430604e-06, + "loss": 0.6496, + "step": 1493 + }, + { + "epoch": 0.590367992096814, + "grad_norm": 0.5252330272280762, + "learning_rate": 4.994854054103604e-06, + "loss": 0.6431, + "step": 1494 + }, + { + "epoch": 0.5907631513954062, + "grad_norm": 0.5082734525840502, + "learning_rate": 4.994844001978331e-06, + "loss": 0.665, + "step": 1495 + }, + { + "epoch": 0.5911583106939985, + "grad_norm": 0.5783213762110898, + "learning_rate": 4.994833940054827e-06, + "loss": 0.6641, + "step": 1496 + }, + { + "epoch": 0.5915534699925907, + "grad_norm": 0.5579102147905942, + "learning_rate": 4.994823868333129e-06, + "loss": 0.6442, + "step": 1497 + }, + { + "epoch": 0.591948629291183, + "grad_norm": 0.49660206731752937, + "learning_rate": 4.9948137868132785e-06, + "loss": 0.6256, + "step": 1498 + }, + { + "epoch": 0.5923437885897752, + "grad_norm": 0.5338722501575794, + "learning_rate": 4.994803695495315e-06, + "loss": 0.6324, + "step": 1499 + }, + { + "epoch": 0.5927389478883675, + "grad_norm": 0.5435806297907863, + "learning_rate": 4.994793594379275e-06, + "loss": 0.642, + "step": 1500 + }, + { + "epoch": 0.5931341071869597, + "grad_norm": 0.5244058039956626, + "learning_rate": 4.9947834834652035e-06, + "loss": 0.6351, + "step": 1501 + }, + { + "epoch": 0.593529266485552, + "grad_norm": 0.5397763750210784, + "learning_rate": 4.9947733627531365e-06, + "loss": 0.5993, + "step": 1502 + }, + { + "epoch": 0.5939244257841443, + "grad_norm": 0.5288352373402283, + "learning_rate": 4.994763232243114e-06, + "loss": 0.6477, + "step": 1503 + }, + { + "epoch": 0.5943195850827365, + "grad_norm": 0.5090715743441777, + "learning_rate": 4.994753091935177e-06, + "loss": 0.645, + "step": 1504 + }, + { + "epoch": 0.5947147443813288, + "grad_norm": 0.6719556320052742, + "learning_rate": 4.994742941829364e-06, + "loss": 0.6573, + "step": 1505 + }, + { + "epoch": 0.595109903679921, + "grad_norm": 0.5147973826184885, + "learning_rate": 4.994732781925717e-06, + "loss": 0.6464, + "step": 1506 + }, + { + "epoch": 0.5955050629785132, + "grad_norm": 0.522878872773451, + "learning_rate": 4.994722612224274e-06, + "loss": 0.6592, + "step": 1507 + }, + { + "epoch": 0.5959002222771055, + "grad_norm": 0.5037339588735743, + "learning_rate": 4.9947124327250755e-06, + "loss": 0.6281, + "step": 1508 + }, + { + "epoch": 0.5962953815756977, + "grad_norm": 0.5142576687522799, + "learning_rate": 4.99470224342816e-06, + "loss": 0.6397, + "step": 1509 + }, + { + "epoch": 0.59669054087429, + "grad_norm": 0.5442114812958629, + "learning_rate": 4.99469204433357e-06, + "loss": 0.6681, + "step": 1510 + }, + { + "epoch": 0.5970857001728822, + "grad_norm": 0.49806465072349676, + "learning_rate": 4.994681835441345e-06, + "loss": 0.6411, + "step": 1511 + }, + { + "epoch": 0.5974808594714744, + "grad_norm": 0.5113177774148318, + "learning_rate": 4.994671616751524e-06, + "loss": 0.6365, + "step": 1512 + }, + { + "epoch": 0.5978760187700667, + "grad_norm": 0.5044746925510942, + "learning_rate": 4.994661388264148e-06, + "loss": 0.6245, + "step": 1513 + }, + { + "epoch": 0.5982711780686589, + "grad_norm": 0.5077014321420371, + "learning_rate": 4.994651149979257e-06, + "loss": 0.6296, + "step": 1514 + }, + { + "epoch": 0.5986663373672512, + "grad_norm": 0.5577617502488906, + "learning_rate": 4.9946409018968915e-06, + "loss": 0.6166, + "step": 1515 + }, + { + "epoch": 0.5990614966658434, + "grad_norm": 0.5454684436241605, + "learning_rate": 4.99463064401709e-06, + "loss": 0.6386, + "step": 1516 + }, + { + "epoch": 0.5994566559644356, + "grad_norm": 0.5330025000783452, + "learning_rate": 4.994620376339895e-06, + "loss": 0.6405, + "step": 1517 + }, + { + "epoch": 0.5998518152630279, + "grad_norm": 0.501164395733037, + "learning_rate": 4.994610098865346e-06, + "loss": 0.6225, + "step": 1518 + }, + { + "epoch": 0.6002469745616201, + "grad_norm": 0.49869645020020736, + "learning_rate": 4.994599811593484e-06, + "loss": 0.6316, + "step": 1519 + }, + { + "epoch": 0.6006421338602124, + "grad_norm": 0.5203023850194525, + "learning_rate": 4.9945895145243476e-06, + "loss": 0.6428, + "step": 1520 + }, + { + "epoch": 0.6010372931588046, + "grad_norm": 0.5288551722444358, + "learning_rate": 4.994579207657979e-06, + "loss": 0.6164, + "step": 1521 + }, + { + "epoch": 0.6014324524573968, + "grad_norm": 0.4919113743752538, + "learning_rate": 4.9945688909944175e-06, + "loss": 0.6213, + "step": 1522 + }, + { + "epoch": 0.6018276117559891, + "grad_norm": 0.5595834215690251, + "learning_rate": 4.994558564533705e-06, + "loss": 0.6436, + "step": 1523 + }, + { + "epoch": 0.6022227710545813, + "grad_norm": 0.5247902498260631, + "learning_rate": 4.9945482282758806e-06, + "loss": 0.638, + "step": 1524 + }, + { + "epoch": 0.6026179303531736, + "grad_norm": 0.5291480908300407, + "learning_rate": 4.994537882220985e-06, + "loss": 0.6253, + "step": 1525 + }, + { + "epoch": 0.6030130896517659, + "grad_norm": 0.501147812137641, + "learning_rate": 4.994527526369061e-06, + "loss": 0.664, + "step": 1526 + }, + { + "epoch": 0.6034082489503582, + "grad_norm": 0.505734785729107, + "learning_rate": 4.994517160720146e-06, + "loss": 0.6385, + "step": 1527 + }, + { + "epoch": 0.6038034082489504, + "grad_norm": 0.5275768661720794, + "learning_rate": 4.994506785274283e-06, + "loss": 0.6387, + "step": 1528 + }, + { + "epoch": 0.6041985675475426, + "grad_norm": 0.5056505262627228, + "learning_rate": 4.994496400031512e-06, + "loss": 0.64, + "step": 1529 + }, + { + "epoch": 0.6045937268461349, + "grad_norm": 0.4908463182821926, + "learning_rate": 4.9944860049918746e-06, + "loss": 0.621, + "step": 1530 + }, + { + "epoch": 0.6049888861447271, + "grad_norm": 0.5234617249072642, + "learning_rate": 4.99447560015541e-06, + "loss": 0.6418, + "step": 1531 + }, + { + "epoch": 0.6053840454433194, + "grad_norm": 0.501149633946628, + "learning_rate": 4.994465185522161e-06, + "loss": 0.6636, + "step": 1532 + }, + { + "epoch": 0.6057792047419116, + "grad_norm": 0.4720640412577634, + "learning_rate": 4.994454761092166e-06, + "loss": 0.6411, + "step": 1533 + }, + { + "epoch": 0.6061743640405038, + "grad_norm": 0.524342790903981, + "learning_rate": 4.994444326865469e-06, + "loss": 0.6452, + "step": 1534 + }, + { + "epoch": 0.6065695233390961, + "grad_norm": 0.5013477729936378, + "learning_rate": 4.994433882842108e-06, + "loss": 0.6304, + "step": 1535 + }, + { + "epoch": 0.6069646826376883, + "grad_norm": 0.5240831322483116, + "learning_rate": 4.994423429022126e-06, + "loss": 0.6339, + "step": 1536 + }, + { + "epoch": 0.6073598419362806, + "grad_norm": 0.4969641999990712, + "learning_rate": 4.994412965405563e-06, + "loss": 0.6443, + "step": 1537 + }, + { + "epoch": 0.6077550012348728, + "grad_norm": 0.5272037913210349, + "learning_rate": 4.9944024919924615e-06, + "loss": 0.6468, + "step": 1538 + }, + { + "epoch": 0.608150160533465, + "grad_norm": 0.529008774394857, + "learning_rate": 4.9943920087828615e-06, + "loss": 0.6362, + "step": 1539 + }, + { + "epoch": 0.6085453198320573, + "grad_norm": 0.4915768675876793, + "learning_rate": 4.994381515776804e-06, + "loss": 0.6435, + "step": 1540 + }, + { + "epoch": 0.6089404791306495, + "grad_norm": 0.5822531861044621, + "learning_rate": 4.9943710129743304e-06, + "loss": 0.6608, + "step": 1541 + }, + { + "epoch": 0.6093356384292418, + "grad_norm": 0.5146971943259008, + "learning_rate": 4.994360500375482e-06, + "loss": 0.6351, + "step": 1542 + }, + { + "epoch": 0.609730797727834, + "grad_norm": 0.5047870000075502, + "learning_rate": 4.994349977980301e-06, + "loss": 0.6387, + "step": 1543 + }, + { + "epoch": 0.6101259570264262, + "grad_norm": 0.4924385862974, + "learning_rate": 4.994339445788827e-06, + "loss": 0.6378, + "step": 1544 + }, + { + "epoch": 0.6105211163250185, + "grad_norm": 0.5195506832924177, + "learning_rate": 4.9943289038011035e-06, + "loss": 0.6422, + "step": 1545 + }, + { + "epoch": 0.6109162756236107, + "grad_norm": 0.504416370915793, + "learning_rate": 4.99431835201717e-06, + "loss": 0.6521, + "step": 1546 + }, + { + "epoch": 0.611311434922203, + "grad_norm": 0.5134028045383864, + "learning_rate": 4.9943077904370684e-06, + "loss": 0.6486, + "step": 1547 + }, + { + "epoch": 0.6117065942207952, + "grad_norm": 0.5438989106962773, + "learning_rate": 4.994297219060841e-06, + "loss": 0.6402, + "step": 1548 + }, + { + "epoch": 0.6121017535193874, + "grad_norm": 0.502459375072004, + "learning_rate": 4.994286637888528e-06, + "loss": 0.6421, + "step": 1549 + }, + { + "epoch": 0.6124969128179798, + "grad_norm": 0.5162824244852758, + "learning_rate": 4.994276046920172e-06, + "loss": 0.6403, + "step": 1550 + }, + { + "epoch": 0.612892072116572, + "grad_norm": 0.5178670603935037, + "learning_rate": 4.994265446155814e-06, + "loss": 0.6309, + "step": 1551 + }, + { + "epoch": 0.6132872314151643, + "grad_norm": 0.5117410334804755, + "learning_rate": 4.994254835595497e-06, + "loss": 0.6681, + "step": 1552 + }, + { + "epoch": 0.6136823907137565, + "grad_norm": 0.4965002156326863, + "learning_rate": 4.994244215239261e-06, + "loss": 0.6358, + "step": 1553 + }, + { + "epoch": 0.6140775500123488, + "grad_norm": 0.5151456093040961, + "learning_rate": 4.994233585087148e-06, + "loss": 0.6345, + "step": 1554 + }, + { + "epoch": 0.614472709310941, + "grad_norm": 0.5107017962950764, + "learning_rate": 4.9942229451392e-06, + "loss": 0.6224, + "step": 1555 + }, + { + "epoch": 0.6148678686095332, + "grad_norm": 0.5362359326835328, + "learning_rate": 4.99421229539546e-06, + "loss": 0.6402, + "step": 1556 + }, + { + "epoch": 0.6152630279081255, + "grad_norm": 0.5128666635843286, + "learning_rate": 4.994201635855967e-06, + "loss": 0.6359, + "step": 1557 + }, + { + "epoch": 0.6156581872067177, + "grad_norm": 0.5273421131964295, + "learning_rate": 4.994190966520765e-06, + "loss": 0.6503, + "step": 1558 + }, + { + "epoch": 0.61605334650531, + "grad_norm": 0.49230499474339184, + "learning_rate": 4.994180287389896e-06, + "loss": 0.6261, + "step": 1559 + }, + { + "epoch": 0.6164485058039022, + "grad_norm": 0.5217099067284172, + "learning_rate": 4.994169598463401e-06, + "loss": 0.6586, + "step": 1560 + }, + { + "epoch": 0.6168436651024944, + "grad_norm": 0.5102545311764888, + "learning_rate": 4.994158899741323e-06, + "loss": 0.6147, + "step": 1561 + }, + { + "epoch": 0.6172388244010867, + "grad_norm": 0.538265791682114, + "learning_rate": 4.9941481912237024e-06, + "loss": 0.6526, + "step": 1562 + }, + { + "epoch": 0.6176339836996789, + "grad_norm": 0.5087707653077916, + "learning_rate": 4.994137472910583e-06, + "loss": 0.6103, + "step": 1563 + }, + { + "epoch": 0.6180291429982712, + "grad_norm": 0.4985542462986526, + "learning_rate": 4.994126744802006e-06, + "loss": 0.6245, + "step": 1564 + }, + { + "epoch": 0.6184243022968634, + "grad_norm": 0.48942059314144304, + "learning_rate": 4.994116006898013e-06, + "loss": 0.6214, + "step": 1565 + }, + { + "epoch": 0.6188194615954556, + "grad_norm": 0.5116181294437978, + "learning_rate": 4.994105259198649e-06, + "loss": 0.622, + "step": 1566 + }, + { + "epoch": 0.6192146208940479, + "grad_norm": 0.5138679125630047, + "learning_rate": 4.994094501703951e-06, + "loss": 0.6434, + "step": 1567 + }, + { + "epoch": 0.6196097801926401, + "grad_norm": 0.5680217102402336, + "learning_rate": 4.994083734413966e-06, + "loss": 0.6421, + "step": 1568 + }, + { + "epoch": 0.6200049394912324, + "grad_norm": 0.5120382178875652, + "learning_rate": 4.9940729573287346e-06, + "loss": 0.6318, + "step": 1569 + }, + { + "epoch": 0.6204000987898246, + "grad_norm": 0.5470369708755527, + "learning_rate": 4.994062170448298e-06, + "loss": 0.6225, + "step": 1570 + }, + { + "epoch": 0.6207952580884168, + "grad_norm": 0.5586073977661113, + "learning_rate": 4.994051373772701e-06, + "loss": 0.6429, + "step": 1571 + }, + { + "epoch": 0.6211904173870091, + "grad_norm": 0.5780818912171855, + "learning_rate": 4.9940405673019844e-06, + "loss": 0.6378, + "step": 1572 + }, + { + "epoch": 0.6215855766856014, + "grad_norm": 0.5199314186964572, + "learning_rate": 4.99402975103619e-06, + "loss": 0.642, + "step": 1573 + }, + { + "epoch": 0.6219807359841937, + "grad_norm": 0.5292899229264992, + "learning_rate": 4.994018924975362e-06, + "loss": 0.638, + "step": 1574 + }, + { + "epoch": 0.6223758952827859, + "grad_norm": 0.5190979503571678, + "learning_rate": 4.994008089119542e-06, + "loss": 0.6242, + "step": 1575 + }, + { + "epoch": 0.6227710545813782, + "grad_norm": 0.575963095804118, + "learning_rate": 4.993997243468772e-06, + "loss": 0.6266, + "step": 1576 + }, + { + "epoch": 0.6231662138799704, + "grad_norm": 0.5340579778193899, + "learning_rate": 4.993986388023096e-06, + "loss": 0.6663, + "step": 1577 + }, + { + "epoch": 0.6235613731785626, + "grad_norm": 0.4959581900916024, + "learning_rate": 4.993975522782556e-06, + "loss": 0.6311, + "step": 1578 + }, + { + "epoch": 0.6239565324771549, + "grad_norm": 0.4981648627664517, + "learning_rate": 4.993964647747195e-06, + "loss": 0.6364, + "step": 1579 + }, + { + "epoch": 0.6243516917757471, + "grad_norm": 0.5042116842190024, + "learning_rate": 4.993953762917054e-06, + "loss": 0.6367, + "step": 1580 + }, + { + "epoch": 0.6247468510743394, + "grad_norm": 0.5027206712870845, + "learning_rate": 4.993942868292178e-06, + "loss": 0.6408, + "step": 1581 + }, + { + "epoch": 0.6251420103729316, + "grad_norm": 0.494384710791813, + "learning_rate": 4.993931963872608e-06, + "loss": 0.6388, + "step": 1582 + }, + { + "epoch": 0.6255371696715238, + "grad_norm": 0.48929240453544737, + "learning_rate": 4.993921049658389e-06, + "loss": 0.6359, + "step": 1583 + }, + { + "epoch": 0.6259323289701161, + "grad_norm": 0.5275911914444981, + "learning_rate": 4.993910125649561e-06, + "loss": 0.6435, + "step": 1584 + }, + { + "epoch": 0.6263274882687083, + "grad_norm": 0.5354103744385167, + "learning_rate": 4.993899191846169e-06, + "loss": 0.6214, + "step": 1585 + }, + { + "epoch": 0.6267226475673006, + "grad_norm": 0.5116129949120737, + "learning_rate": 4.9938882482482555e-06, + "loss": 0.636, + "step": 1586 + }, + { + "epoch": 0.6271178068658928, + "grad_norm": 0.5185673630960299, + "learning_rate": 4.993877294855863e-06, + "loss": 0.6366, + "step": 1587 + }, + { + "epoch": 0.627512966164485, + "grad_norm": 0.5025695303228517, + "learning_rate": 4.993866331669035e-06, + "loss": 0.6266, + "step": 1588 + }, + { + "epoch": 0.6279081254630773, + "grad_norm": 0.4939243554026485, + "learning_rate": 4.993855358687814e-06, + "loss": 0.6247, + "step": 1589 + }, + { + "epoch": 0.6283032847616695, + "grad_norm": 0.5013958381045799, + "learning_rate": 4.993844375912244e-06, + "loss": 0.6374, + "step": 1590 + }, + { + "epoch": 0.6286984440602618, + "grad_norm": 0.5165249055138371, + "learning_rate": 4.993833383342368e-06, + "loss": 0.6276, + "step": 1591 + }, + { + "epoch": 0.629093603358854, + "grad_norm": 0.5010885019204411, + "learning_rate": 4.993822380978228e-06, + "loss": 0.6273, + "step": 1592 + }, + { + "epoch": 0.6294887626574462, + "grad_norm": 0.47308571192974885, + "learning_rate": 4.993811368819869e-06, + "loss": 0.6406, + "step": 1593 + }, + { + "epoch": 0.6298839219560385, + "grad_norm": 0.5181491819308544, + "learning_rate": 4.993800346867333e-06, + "loss": 0.6577, + "step": 1594 + }, + { + "epoch": 0.6302790812546307, + "grad_norm": 0.7592473307425937, + "learning_rate": 4.993789315120663e-06, + "loss": 0.6136, + "step": 1595 + }, + { + "epoch": 0.630674240553223, + "grad_norm": 0.48322179308535596, + "learning_rate": 4.993778273579903e-06, + "loss": 0.6233, + "step": 1596 + }, + { + "epoch": 0.6310693998518153, + "grad_norm": 0.4989991809584226, + "learning_rate": 4.993767222245096e-06, + "loss": 0.615, + "step": 1597 + }, + { + "epoch": 0.6314645591504076, + "grad_norm": 0.5102524351860097, + "learning_rate": 4.993756161116287e-06, + "loss": 0.646, + "step": 1598 + }, + { + "epoch": 0.6318597184489998, + "grad_norm": 0.48881189426787136, + "learning_rate": 4.9937450901935166e-06, + "loss": 0.6309, + "step": 1599 + }, + { + "epoch": 0.632254877747592, + "grad_norm": 0.5162222090652335, + "learning_rate": 4.993734009476831e-06, + "loss": 0.6234, + "step": 1600 + }, + { + "epoch": 0.6326500370461843, + "grad_norm": 0.5219467840999582, + "learning_rate": 4.99372291896627e-06, + "loss": 0.6623, + "step": 1601 + }, + { + "epoch": 0.6330451963447765, + "grad_norm": 0.4812462236064405, + "learning_rate": 4.993711818661882e-06, + "loss": 0.6357, + "step": 1602 + }, + { + "epoch": 0.6334403556433688, + "grad_norm": 0.5121012568296751, + "learning_rate": 4.993700708563708e-06, + "loss": 0.6475, + "step": 1603 + }, + { + "epoch": 0.633835514941961, + "grad_norm": 0.5243902599376221, + "learning_rate": 4.993689588671792e-06, + "loss": 0.6204, + "step": 1604 + }, + { + "epoch": 0.6342306742405532, + "grad_norm": 0.5038240130102247, + "learning_rate": 4.9936784589861765e-06, + "loss": 0.6381, + "step": 1605 + }, + { + "epoch": 0.6346258335391455, + "grad_norm": 0.5056284357196895, + "learning_rate": 4.993667319506907e-06, + "loss": 0.6124, + "step": 1606 + }, + { + "epoch": 0.6350209928377377, + "grad_norm": 0.5210968104913557, + "learning_rate": 4.993656170234026e-06, + "loss": 0.6254, + "step": 1607 + }, + { + "epoch": 0.63541615213633, + "grad_norm": 0.5529274377061739, + "learning_rate": 4.9936450111675785e-06, + "loss": 0.6312, + "step": 1608 + }, + { + "epoch": 0.6358113114349222, + "grad_norm": 0.5464579530723207, + "learning_rate": 4.993633842307607e-06, + "loss": 0.616, + "step": 1609 + }, + { + "epoch": 0.6362064707335144, + "grad_norm": 0.5262401377076071, + "learning_rate": 4.9936226636541564e-06, + "loss": 0.6426, + "step": 1610 + }, + { + "epoch": 0.6366016300321067, + "grad_norm": 0.543994117033289, + "learning_rate": 4.993611475207269e-06, + "loss": 0.6192, + "step": 1611 + }, + { + "epoch": 0.6369967893306989, + "grad_norm": 0.5341939948990526, + "learning_rate": 4.993600276966992e-06, + "loss": 0.6271, + "step": 1612 + }, + { + "epoch": 0.6373919486292912, + "grad_norm": 0.5093829321261388, + "learning_rate": 4.993589068933366e-06, + "loss": 0.6177, + "step": 1613 + }, + { + "epoch": 0.6377871079278834, + "grad_norm": 0.6103509660198887, + "learning_rate": 4.993577851106437e-06, + "loss": 0.6292, + "step": 1614 + }, + { + "epoch": 0.6381822672264756, + "grad_norm": 0.5815804502591855, + "learning_rate": 4.993566623486247e-06, + "loss": 0.6644, + "step": 1615 + }, + { + "epoch": 0.6385774265250679, + "grad_norm": 0.5192475918715985, + "learning_rate": 4.993555386072843e-06, + "loss": 0.6439, + "step": 1616 + }, + { + "epoch": 0.6389725858236601, + "grad_norm": 0.5474735931150393, + "learning_rate": 4.993544138866266e-06, + "loss": 0.6393, + "step": 1617 + }, + { + "epoch": 0.6393677451222524, + "grad_norm": 0.5395600119026902, + "learning_rate": 4.993532881866564e-06, + "loss": 0.6386, + "step": 1618 + }, + { + "epoch": 0.6397629044208446, + "grad_norm": 0.5461307453791069, + "learning_rate": 4.993521615073777e-06, + "loss": 0.6423, + "step": 1619 + }, + { + "epoch": 0.6401580637194368, + "grad_norm": 0.5129146758889324, + "learning_rate": 4.9935103384879525e-06, + "loss": 0.6263, + "step": 1620 + }, + { + "epoch": 0.6405532230180292, + "grad_norm": 0.5338093770005661, + "learning_rate": 4.9934990521091335e-06, + "loss": 0.6464, + "step": 1621 + }, + { + "epoch": 0.6409483823166214, + "grad_norm": 0.5019435816787823, + "learning_rate": 4.993487755937363e-06, + "loss": 0.5966, + "step": 1622 + }, + { + "epoch": 0.6413435416152137, + "grad_norm": 0.520040205195016, + "learning_rate": 4.993476449972689e-06, + "loss": 0.6175, + "step": 1623 + }, + { + "epoch": 0.6417387009138059, + "grad_norm": 0.6219021349863824, + "learning_rate": 4.993465134215151e-06, + "loss": 0.635, + "step": 1624 + }, + { + "epoch": 0.6421338602123982, + "grad_norm": 0.4961034586172033, + "learning_rate": 4.993453808664798e-06, + "loss": 0.6482, + "step": 1625 + }, + { + "epoch": 0.6425290195109904, + "grad_norm": 0.5096787165995785, + "learning_rate": 4.9934424733216715e-06, + "loss": 0.6633, + "step": 1626 + }, + { + "epoch": 0.6429241788095826, + "grad_norm": 0.5237229903942505, + "learning_rate": 4.993431128185818e-06, + "loss": 0.6385, + "step": 1627 + }, + { + "epoch": 0.6433193381081749, + "grad_norm": 0.4967553534767619, + "learning_rate": 4.9934197732572794e-06, + "loss": 0.6326, + "step": 1628 + }, + { + "epoch": 0.6437144974067671, + "grad_norm": 0.49402878207474876, + "learning_rate": 4.993408408536104e-06, + "loss": 0.6437, + "step": 1629 + }, + { + "epoch": 0.6441096567053594, + "grad_norm": 0.5955393513897179, + "learning_rate": 4.993397034022333e-06, + "loss": 0.6238, + "step": 1630 + }, + { + "epoch": 0.6445048160039516, + "grad_norm": 0.5038622383402742, + "learning_rate": 4.993385649716014e-06, + "loss": 0.6286, + "step": 1631 + }, + { + "epoch": 0.6448999753025438, + "grad_norm": 0.47945185187430733, + "learning_rate": 4.9933742556171895e-06, + "loss": 0.6099, + "step": 1632 + }, + { + "epoch": 0.6452951346011361, + "grad_norm": 0.489392501913111, + "learning_rate": 4.993362851725905e-06, + "loss": 0.6319, + "step": 1633 + }, + { + "epoch": 0.6456902938997283, + "grad_norm": 0.5010165910463097, + "learning_rate": 4.993351438042204e-06, + "loss": 0.6394, + "step": 1634 + }, + { + "epoch": 0.6460854531983206, + "grad_norm": 0.4934200799303604, + "learning_rate": 4.993340014566135e-06, + "loss": 0.6357, + "step": 1635 + }, + { + "epoch": 0.6464806124969128, + "grad_norm": 0.4912427926805577, + "learning_rate": 4.993328581297738e-06, + "loss": 0.623, + "step": 1636 + }, + { + "epoch": 0.646875771795505, + "grad_norm": 0.4873510707511348, + "learning_rate": 4.993317138237062e-06, + "loss": 0.6391, + "step": 1637 + }, + { + "epoch": 0.6472709310940973, + "grad_norm": 0.4948293050023501, + "learning_rate": 4.99330568538415e-06, + "loss": 0.6412, + "step": 1638 + }, + { + "epoch": 0.6476660903926895, + "grad_norm": 0.4936219175231247, + "learning_rate": 4.993294222739047e-06, + "loss": 0.6268, + "step": 1639 + }, + { + "epoch": 0.6480612496912818, + "grad_norm": 0.49996197505527556, + "learning_rate": 4.993282750301799e-06, + "loss": 0.6429, + "step": 1640 + }, + { + "epoch": 0.648456408989874, + "grad_norm": 0.4941502207114622, + "learning_rate": 4.993271268072449e-06, + "loss": 0.6079, + "step": 1641 + }, + { + "epoch": 0.6488515682884662, + "grad_norm": 0.49029556645390376, + "learning_rate": 4.993259776051045e-06, + "loss": 0.6307, + "step": 1642 + }, + { + "epoch": 0.6492467275870585, + "grad_norm": 0.5033458347363889, + "learning_rate": 4.9932482742376295e-06, + "loss": 0.6331, + "step": 1643 + }, + { + "epoch": 0.6496418868856508, + "grad_norm": 0.4996027449996174, + "learning_rate": 4.993236762632248e-06, + "loss": 0.6174, + "step": 1644 + }, + { + "epoch": 0.6500370461842431, + "grad_norm": 0.4954371092717256, + "learning_rate": 4.993225241234949e-06, + "loss": 0.6355, + "step": 1645 + }, + { + "epoch": 0.6504322054828353, + "grad_norm": 0.525740581307838, + "learning_rate": 4.9932137100457735e-06, + "loss": 0.6318, + "step": 1646 + }, + { + "epoch": 0.6508273647814276, + "grad_norm": 0.5096572884835409, + "learning_rate": 4.993202169064769e-06, + "loss": 0.6369, + "step": 1647 + }, + { + "epoch": 0.6512225240800198, + "grad_norm": 0.8344702188550295, + "learning_rate": 4.993190618291979e-06, + "loss": 0.6473, + "step": 1648 + }, + { + "epoch": 0.651617683378612, + "grad_norm": 0.4943527469749559, + "learning_rate": 4.993179057727452e-06, + "loss": 0.6305, + "step": 1649 + }, + { + "epoch": 0.6520128426772043, + "grad_norm": 0.5123537897586123, + "learning_rate": 4.993167487371231e-06, + "loss": 0.6363, + "step": 1650 + }, + { + "epoch": 0.6524080019757965, + "grad_norm": 0.48337632188932905, + "learning_rate": 4.993155907223362e-06, + "loss": 0.6462, + "step": 1651 + }, + { + "epoch": 0.6528031612743888, + "grad_norm": 0.4941193733493147, + "learning_rate": 4.993144317283891e-06, + "loss": 0.6395, + "step": 1652 + }, + { + "epoch": 0.653198320572981, + "grad_norm": 0.5171725409360232, + "learning_rate": 4.993132717552862e-06, + "loss": 0.615, + "step": 1653 + }, + { + "epoch": 0.6535934798715732, + "grad_norm": 0.5028964520752601, + "learning_rate": 4.9931211080303225e-06, + "loss": 0.6264, + "step": 1654 + }, + { + "epoch": 0.6539886391701655, + "grad_norm": 0.5170261798292486, + "learning_rate": 4.9931094887163165e-06, + "loss": 0.6232, + "step": 1655 + }, + { + "epoch": 0.6543837984687577, + "grad_norm": 0.5393500555133617, + "learning_rate": 4.993097859610891e-06, + "loss": 0.6295, + "step": 1656 + }, + { + "epoch": 0.65477895776735, + "grad_norm": 0.5170632791663832, + "learning_rate": 4.99308622071409e-06, + "loss": 0.6457, + "step": 1657 + }, + { + "epoch": 0.6551741170659422, + "grad_norm": 0.5088037328336552, + "learning_rate": 4.993074572025961e-06, + "loss": 0.6329, + "step": 1658 + }, + { + "epoch": 0.6555692763645344, + "grad_norm": 0.5517685712418717, + "learning_rate": 4.993062913546549e-06, + "loss": 0.6151, + "step": 1659 + }, + { + "epoch": 0.6559644356631267, + "grad_norm": 0.6574193345875109, + "learning_rate": 4.9930512452758996e-06, + "loss": 0.6541, + "step": 1660 + }, + { + "epoch": 0.6563595949617189, + "grad_norm": 0.50641713145241, + "learning_rate": 4.993039567214058e-06, + "loss": 0.6439, + "step": 1661 + }, + { + "epoch": 0.6567547542603112, + "grad_norm": 0.5183226080631096, + "learning_rate": 4.993027879361072e-06, + "loss": 0.6441, + "step": 1662 + }, + { + "epoch": 0.6571499135589034, + "grad_norm": 0.5026167336939322, + "learning_rate": 4.993016181716987e-06, + "loss": 0.6335, + "step": 1663 + }, + { + "epoch": 0.6575450728574956, + "grad_norm": 0.5060260690365562, + "learning_rate": 4.993004474281846e-06, + "loss": 0.6447, + "step": 1664 + }, + { + "epoch": 0.6579402321560879, + "grad_norm": 0.5057941400086423, + "learning_rate": 4.992992757055699e-06, + "loss": 0.6291, + "step": 1665 + }, + { + "epoch": 0.6583353914546801, + "grad_norm": 0.506979357825653, + "learning_rate": 4.9929810300385894e-06, + "loss": 0.635, + "step": 1666 + }, + { + "epoch": 0.6587305507532724, + "grad_norm": 0.4818917872233048, + "learning_rate": 4.992969293230565e-06, + "loss": 0.6382, + "step": 1667 + }, + { + "epoch": 0.6591257100518647, + "grad_norm": 0.4925360002260035, + "learning_rate": 4.992957546631671e-06, + "loss": 0.6142, + "step": 1668 + }, + { + "epoch": 0.659520869350457, + "grad_norm": 0.5046203653064728, + "learning_rate": 4.992945790241952e-06, + "loss": 0.6304, + "step": 1669 + }, + { + "epoch": 0.6599160286490492, + "grad_norm": 0.49179177381723277, + "learning_rate": 4.992934024061456e-06, + "loss": 0.6293, + "step": 1670 + }, + { + "epoch": 0.6603111879476414, + "grad_norm": 0.4818457491170986, + "learning_rate": 4.9929222480902305e-06, + "loss": 0.612, + "step": 1671 + }, + { + "epoch": 0.6607063472462337, + "grad_norm": 0.4853439003519475, + "learning_rate": 4.992910462328319e-06, + "loss": 0.6331, + "step": 1672 + }, + { + "epoch": 0.6611015065448259, + "grad_norm": 0.4736906753344473, + "learning_rate": 4.99289866677577e-06, + "loss": 0.6233, + "step": 1673 + }, + { + "epoch": 0.6614966658434182, + "grad_norm": 0.4875165225064638, + "learning_rate": 4.992886861432628e-06, + "loss": 0.6267, + "step": 1674 + }, + { + "epoch": 0.6618918251420104, + "grad_norm": 0.4808588973957292, + "learning_rate": 4.99287504629894e-06, + "loss": 0.6172, + "step": 1675 + }, + { + "epoch": 0.6622869844406026, + "grad_norm": 0.4907208193211784, + "learning_rate": 4.992863221374753e-06, + "loss": 0.617, + "step": 1676 + }, + { + "epoch": 0.6626821437391949, + "grad_norm": 0.48958473331720903, + "learning_rate": 4.992851386660114e-06, + "loss": 0.6482, + "step": 1677 + }, + { + "epoch": 0.6630773030377871, + "grad_norm": 0.4746449843260302, + "learning_rate": 4.992839542155067e-06, + "loss": 0.6363, + "step": 1678 + }, + { + "epoch": 0.6634724623363794, + "grad_norm": 0.5081053936197776, + "learning_rate": 4.9928276878596605e-06, + "loss": 0.6349, + "step": 1679 + }, + { + "epoch": 0.6638676216349716, + "grad_norm": 0.5059779639168953, + "learning_rate": 4.99281582377394e-06, + "loss": 0.6479, + "step": 1680 + }, + { + "epoch": 0.6642627809335638, + "grad_norm": 0.49460659054649947, + "learning_rate": 4.992803949897954e-06, + "loss": 0.6161, + "step": 1681 + }, + { + "epoch": 0.6646579402321561, + "grad_norm": 0.48860656949577014, + "learning_rate": 4.992792066231746e-06, + "loss": 0.6147, + "step": 1682 + }, + { + "epoch": 0.6650530995307483, + "grad_norm": 0.4990949380069287, + "learning_rate": 4.992780172775366e-06, + "loss": 0.6687, + "step": 1683 + }, + { + "epoch": 0.6654482588293406, + "grad_norm": 0.50449571493179, + "learning_rate": 4.9927682695288584e-06, + "loss": 0.6234, + "step": 1684 + }, + { + "epoch": 0.6658434181279328, + "grad_norm": 0.5097265264943369, + "learning_rate": 4.992756356492271e-06, + "loss": 0.6345, + "step": 1685 + }, + { + "epoch": 0.666238577426525, + "grad_norm": 0.4883277462528845, + "learning_rate": 4.99274443366565e-06, + "loss": 0.6419, + "step": 1686 + }, + { + "epoch": 0.6666337367251173, + "grad_norm": 0.47970413042564153, + "learning_rate": 4.992732501049044e-06, + "loss": 0.6036, + "step": 1687 + }, + { + "epoch": 0.6670288960237095, + "grad_norm": 0.4935176772839213, + "learning_rate": 4.992720558642496e-06, + "loss": 0.6463, + "step": 1688 + }, + { + "epoch": 0.6674240553223018, + "grad_norm": 0.4759709026940183, + "learning_rate": 4.9927086064460575e-06, + "loss": 0.6537, + "step": 1689 + }, + { + "epoch": 0.667819214620894, + "grad_norm": 0.47430486816189027, + "learning_rate": 4.992696644459771e-06, + "loss": 0.6317, + "step": 1690 + }, + { + "epoch": 0.6682143739194862, + "grad_norm": 0.5034833730971029, + "learning_rate": 4.992684672683688e-06, + "loss": 0.627, + "step": 1691 + }, + { + "epoch": 0.6686095332180786, + "grad_norm": 0.519836264221229, + "learning_rate": 4.992672691117852e-06, + "loss": 0.6538, + "step": 1692 + }, + { + "epoch": 0.6690046925166708, + "grad_norm": 0.5050122558075558, + "learning_rate": 4.992660699762311e-06, + "loss": 0.6473, + "step": 1693 + }, + { + "epoch": 0.6693998518152631, + "grad_norm": 0.4707732129541408, + "learning_rate": 4.992648698617113e-06, + "loss": 0.6076, + "step": 1694 + }, + { + "epoch": 0.6697950111138553, + "grad_norm": 0.49511795117802215, + "learning_rate": 4.9926366876823054e-06, + "loss": 0.6271, + "step": 1695 + }, + { + "epoch": 0.6701901704124476, + "grad_norm": 0.49908332035521424, + "learning_rate": 4.992624666957932e-06, + "loss": 0.6431, + "step": 1696 + }, + { + "epoch": 0.6705853297110398, + "grad_norm": 0.48240971327251186, + "learning_rate": 4.992612636444045e-06, + "loss": 0.6218, + "step": 1697 + }, + { + "epoch": 0.670980489009632, + "grad_norm": 0.5028260213765751, + "learning_rate": 4.992600596140688e-06, + "loss": 0.6142, + "step": 1698 + }, + { + "epoch": 0.6713756483082243, + "grad_norm": 0.49372992278625305, + "learning_rate": 4.99258854604791e-06, + "loss": 0.6289, + "step": 1699 + }, + { + "epoch": 0.6717708076068165, + "grad_norm": 0.48990901770488715, + "learning_rate": 4.9925764861657575e-06, + "loss": 0.6139, + "step": 1700 + }, + { + "epoch": 0.6721659669054088, + "grad_norm": 0.5062662633187875, + "learning_rate": 4.9925644164942776e-06, + "loss": 0.6363, + "step": 1701 + }, + { + "epoch": 0.672561126204001, + "grad_norm": 0.5000686626853466, + "learning_rate": 4.992552337033519e-06, + "loss": 0.6062, + "step": 1702 + }, + { + "epoch": 0.6729562855025932, + "grad_norm": 0.49142806262561756, + "learning_rate": 4.992540247783528e-06, + "loss": 0.639, + "step": 1703 + }, + { + "epoch": 0.6733514448011855, + "grad_norm": 0.49049346875912636, + "learning_rate": 4.992528148744353e-06, + "loss": 0.6244, + "step": 1704 + }, + { + "epoch": 0.6737466040997777, + "grad_norm": 0.47745945762367137, + "learning_rate": 4.99251603991604e-06, + "loss": 0.6541, + "step": 1705 + }, + { + "epoch": 0.67414176339837, + "grad_norm": 0.4842192684853816, + "learning_rate": 4.992503921298638e-06, + "loss": 0.621, + "step": 1706 + }, + { + "epoch": 0.6745369226969622, + "grad_norm": 0.4870080088135584, + "learning_rate": 4.992491792892194e-06, + "loss": 0.6359, + "step": 1707 + }, + { + "epoch": 0.6749320819955544, + "grad_norm": 0.48850423253720654, + "learning_rate": 4.992479654696757e-06, + "loss": 0.6375, + "step": 1708 + }, + { + "epoch": 0.6753272412941467, + "grad_norm": 0.49822947926968186, + "learning_rate": 4.992467506712372e-06, + "loss": 0.6399, + "step": 1709 + }, + { + "epoch": 0.6757224005927389, + "grad_norm": 0.48888282315789844, + "learning_rate": 4.992455348939088e-06, + "loss": 0.6275, + "step": 1710 + }, + { + "epoch": 0.6761175598913312, + "grad_norm": 0.4782716794651655, + "learning_rate": 4.992443181376954e-06, + "loss": 0.6353, + "step": 1711 + }, + { + "epoch": 0.6765127191899234, + "grad_norm": 0.49789653142772966, + "learning_rate": 4.992431004026016e-06, + "loss": 0.6202, + "step": 1712 + }, + { + "epoch": 0.6769078784885156, + "grad_norm": 0.49660191988375124, + "learning_rate": 4.992418816886322e-06, + "loss": 0.6237, + "step": 1713 + }, + { + "epoch": 0.6773030377871079, + "grad_norm": 0.5012417152818854, + "learning_rate": 4.992406619957922e-06, + "loss": 0.6216, + "step": 1714 + }, + { + "epoch": 0.6776981970857002, + "grad_norm": 0.4839082971693346, + "learning_rate": 4.992394413240861e-06, + "loss": 0.6118, + "step": 1715 + }, + { + "epoch": 0.6780933563842925, + "grad_norm": 0.4965208023800787, + "learning_rate": 4.992382196735188e-06, + "loss": 0.6224, + "step": 1716 + }, + { + "epoch": 0.6784885156828847, + "grad_norm": 0.5042338428138545, + "learning_rate": 4.992369970440952e-06, + "loss": 0.643, + "step": 1717 + }, + { + "epoch": 0.678883674981477, + "grad_norm": 0.4975143700279896, + "learning_rate": 4.9923577343582e-06, + "loss": 0.6136, + "step": 1718 + }, + { + "epoch": 0.6792788342800692, + "grad_norm": 0.5281177934717292, + "learning_rate": 4.992345488486979e-06, + "loss": 0.6397, + "step": 1719 + }, + { + "epoch": 0.6796739935786614, + "grad_norm": 0.4881426918498292, + "learning_rate": 4.99233323282734e-06, + "loss": 0.6249, + "step": 1720 + }, + { + "epoch": 0.6800691528772537, + "grad_norm": 0.49409156351025163, + "learning_rate": 4.992320967379329e-06, + "loss": 0.6397, + "step": 1721 + }, + { + "epoch": 0.6804643121758459, + "grad_norm": 0.48346096985864084, + "learning_rate": 4.992308692142995e-06, + "loss": 0.633, + "step": 1722 + }, + { + "epoch": 0.6808594714744381, + "grad_norm": 0.4926531840900509, + "learning_rate": 4.992296407118385e-06, + "loss": 0.6346, + "step": 1723 + }, + { + "epoch": 0.6812546307730304, + "grad_norm": 0.4894311266855414, + "learning_rate": 4.992284112305549e-06, + "loss": 0.6086, + "step": 1724 + }, + { + "epoch": 0.6816497900716226, + "grad_norm": 0.49490153693793343, + "learning_rate": 4.992271807704534e-06, + "loss": 0.6318, + "step": 1725 + }, + { + "epoch": 0.6820449493702149, + "grad_norm": 0.4828850751455796, + "learning_rate": 4.9922594933153884e-06, + "loss": 0.632, + "step": 1726 + }, + { + "epoch": 0.6824401086688071, + "grad_norm": 0.5330475769017348, + "learning_rate": 4.992247169138161e-06, + "loss": 0.6269, + "step": 1727 + }, + { + "epoch": 0.6828352679673994, + "grad_norm": 0.48050985014818776, + "learning_rate": 4.9922348351729e-06, + "loss": 0.6349, + "step": 1728 + }, + { + "epoch": 0.6832304272659916, + "grad_norm": 0.48483280529770134, + "learning_rate": 4.992222491419655e-06, + "loss": 0.6555, + "step": 1729 + }, + { + "epoch": 0.6836255865645838, + "grad_norm": 0.5080406635977245, + "learning_rate": 4.992210137878472e-06, + "loss": 0.6359, + "step": 1730 + }, + { + "epoch": 0.6840207458631761, + "grad_norm": 0.5089264759702186, + "learning_rate": 4.9921977745494025e-06, + "loss": 0.6406, + "step": 1731 + }, + { + "epoch": 0.6844159051617683, + "grad_norm": 0.48659054045056965, + "learning_rate": 4.992185401432493e-06, + "loss": 0.602, + "step": 1732 + }, + { + "epoch": 0.6848110644603606, + "grad_norm": 0.49059051256711417, + "learning_rate": 4.992173018527791e-06, + "loss": 0.6035, + "step": 1733 + }, + { + "epoch": 0.6852062237589528, + "grad_norm": 0.5182172495516812, + "learning_rate": 4.992160625835348e-06, + "loss": 0.643, + "step": 1734 + }, + { + "epoch": 0.685601383057545, + "grad_norm": 0.49425214892911673, + "learning_rate": 4.992148223355211e-06, + "loss": 0.6326, + "step": 1735 + }, + { + "epoch": 0.6859965423561373, + "grad_norm": 0.5023535053495725, + "learning_rate": 4.9921358110874295e-06, + "loss": 0.6416, + "step": 1736 + }, + { + "epoch": 0.6863917016547295, + "grad_norm": 0.6023947929077457, + "learning_rate": 4.992123389032052e-06, + "loss": 0.6429, + "step": 1737 + }, + { + "epoch": 0.6867868609533218, + "grad_norm": 0.5037834252439649, + "learning_rate": 4.992110957189126e-06, + "loss": 0.6277, + "step": 1738 + }, + { + "epoch": 0.6871820202519141, + "grad_norm": 0.49549521269729524, + "learning_rate": 4.992098515558702e-06, + "loss": 0.6305, + "step": 1739 + }, + { + "epoch": 0.6875771795505063, + "grad_norm": 0.49551911885336786, + "learning_rate": 4.992086064140829e-06, + "loss": 0.6248, + "step": 1740 + }, + { + "epoch": 0.6879723388490986, + "grad_norm": 0.5246656926007554, + "learning_rate": 4.9920736029355544e-06, + "loss": 0.6281, + "step": 1741 + }, + { + "epoch": 0.6883674981476908, + "grad_norm": 0.5001944530177468, + "learning_rate": 4.992061131942929e-06, + "loss": 0.6261, + "step": 1742 + }, + { + "epoch": 0.6887626574462831, + "grad_norm": 0.4826806905887991, + "learning_rate": 4.992048651163e-06, + "loss": 0.6205, + "step": 1743 + }, + { + "epoch": 0.6891578167448753, + "grad_norm": 0.5077638034008282, + "learning_rate": 4.992036160595817e-06, + "loss": 0.6366, + "step": 1744 + }, + { + "epoch": 0.6895529760434675, + "grad_norm": 0.4957736986969226, + "learning_rate": 4.9920236602414295e-06, + "loss": 0.6197, + "step": 1745 + }, + { + "epoch": 0.6899481353420598, + "grad_norm": 0.4934343040818385, + "learning_rate": 4.992011150099886e-06, + "loss": 0.6381, + "step": 1746 + }, + { + "epoch": 0.690343294640652, + "grad_norm": 0.492375853768171, + "learning_rate": 4.991998630171236e-06, + "loss": 0.6313, + "step": 1747 + }, + { + "epoch": 0.6907384539392443, + "grad_norm": 0.47870636501116376, + "learning_rate": 4.991986100455529e-06, + "loss": 0.6038, + "step": 1748 + }, + { + "epoch": 0.6911336132378365, + "grad_norm": 0.5048304913807063, + "learning_rate": 4.991973560952813e-06, + "loss": 0.6293, + "step": 1749 + }, + { + "epoch": 0.6915287725364287, + "grad_norm": 0.5249637722841536, + "learning_rate": 4.991961011663139e-06, + "loss": 0.6307, + "step": 1750 + }, + { + "epoch": 0.691923931835021, + "grad_norm": 0.5208870005299038, + "learning_rate": 4.991948452586555e-06, + "loss": 0.6332, + "step": 1751 + }, + { + "epoch": 0.6923190911336132, + "grad_norm": 0.5015944555669404, + "learning_rate": 4.991935883723111e-06, + "loss": 0.5839, + "step": 1752 + }, + { + "epoch": 0.6927142504322055, + "grad_norm": 0.5163525589308585, + "learning_rate": 4.991923305072856e-06, + "loss": 0.6235, + "step": 1753 + }, + { + "epoch": 0.6931094097307977, + "grad_norm": 0.5032310673814737, + "learning_rate": 4.991910716635838e-06, + "loss": 0.6263, + "step": 1754 + }, + { + "epoch": 0.69350456902939, + "grad_norm": 0.47440600362048463, + "learning_rate": 4.991898118412109e-06, + "loss": 0.6376, + "step": 1755 + }, + { + "epoch": 0.6938997283279822, + "grad_norm": 0.47177908043565303, + "learning_rate": 4.991885510401717e-06, + "loss": 0.5965, + "step": 1756 + }, + { + "epoch": 0.6942948876265744, + "grad_norm": 0.5050010124065104, + "learning_rate": 4.991872892604713e-06, + "loss": 0.6443, + "step": 1757 + }, + { + "epoch": 0.6946900469251667, + "grad_norm": 0.4703390805427689, + "learning_rate": 4.991860265021144e-06, + "loss": 0.6371, + "step": 1758 + }, + { + "epoch": 0.6950852062237589, + "grad_norm": 0.5965162740676478, + "learning_rate": 4.991847627651062e-06, + "loss": 0.6325, + "step": 1759 + }, + { + "epoch": 0.6954803655223512, + "grad_norm": 0.4995953085902974, + "learning_rate": 4.991834980494515e-06, + "loss": 0.6124, + "step": 1760 + }, + { + "epoch": 0.6958755248209434, + "grad_norm": 0.48410450082082324, + "learning_rate": 4.991822323551554e-06, + "loss": 0.6543, + "step": 1761 + }, + { + "epoch": 0.6962706841195357, + "grad_norm": 0.5097060752514024, + "learning_rate": 4.991809656822227e-06, + "loss": 0.6341, + "step": 1762 + }, + { + "epoch": 0.696665843418128, + "grad_norm": 0.5028866911312128, + "learning_rate": 4.991796980306586e-06, + "loss": 0.6196, + "step": 1763 + }, + { + "epoch": 0.6970610027167202, + "grad_norm": 0.48181973244537724, + "learning_rate": 4.991784294004679e-06, + "loss": 0.6142, + "step": 1764 + }, + { + "epoch": 0.6974561620153125, + "grad_norm": 0.481960114220102, + "learning_rate": 4.991771597916556e-06, + "loss": 0.6064, + "step": 1765 + }, + { + "epoch": 0.6978513213139047, + "grad_norm": 0.4962930971445703, + "learning_rate": 4.9917588920422675e-06, + "loss": 0.6415, + "step": 1766 + }, + { + "epoch": 0.698246480612497, + "grad_norm": 0.49440862308297445, + "learning_rate": 4.991746176381863e-06, + "loss": 0.6169, + "step": 1767 + }, + { + "epoch": 0.6986416399110892, + "grad_norm": 0.5047352213306215, + "learning_rate": 4.991733450935393e-06, + "loss": 0.6304, + "step": 1768 + }, + { + "epoch": 0.6990367992096814, + "grad_norm": 0.4733163582966052, + "learning_rate": 4.991720715702907e-06, + "loss": 0.6272, + "step": 1769 + }, + { + "epoch": 0.6994319585082737, + "grad_norm": 0.5098227044422302, + "learning_rate": 4.991707970684455e-06, + "loss": 0.6364, + "step": 1770 + }, + { + "epoch": 0.6998271178068659, + "grad_norm": 0.49090533225232885, + "learning_rate": 4.991695215880087e-06, + "loss": 0.6415, + "step": 1771 + }, + { + "epoch": 0.7002222771054581, + "grad_norm": 0.49614105702660316, + "learning_rate": 4.991682451289853e-06, + "loss": 0.6588, + "step": 1772 + }, + { + "epoch": 0.7006174364040504, + "grad_norm": 0.47774693460503354, + "learning_rate": 4.991669676913804e-06, + "loss": 0.6316, + "step": 1773 + }, + { + "epoch": 0.7010125957026426, + "grad_norm": 0.4822380940669454, + "learning_rate": 4.991656892751989e-06, + "loss": 0.6086, + "step": 1774 + }, + { + "epoch": 0.7014077550012349, + "grad_norm": 0.472072856981578, + "learning_rate": 4.99164409880446e-06, + "loss": 0.6126, + "step": 1775 + }, + { + "epoch": 0.7018029142998271, + "grad_norm": 0.5033099153233312, + "learning_rate": 4.991631295071265e-06, + "loss": 0.6334, + "step": 1776 + }, + { + "epoch": 0.7021980735984193, + "grad_norm": 0.48177193738949203, + "learning_rate": 4.991618481552455e-06, + "loss": 0.6191, + "step": 1777 + }, + { + "epoch": 0.7025932328970116, + "grad_norm": 0.5635001881112269, + "learning_rate": 4.9916056582480805e-06, + "loss": 0.6255, + "step": 1778 + }, + { + "epoch": 0.7029883921956038, + "grad_norm": 0.497837050692373, + "learning_rate": 4.991592825158192e-06, + "loss": 0.6277, + "step": 1779 + }, + { + "epoch": 0.7033835514941961, + "grad_norm": 0.5046834438070316, + "learning_rate": 4.991579982282841e-06, + "loss": 0.6419, + "step": 1780 + }, + { + "epoch": 0.7037787107927883, + "grad_norm": 0.4902373827129985, + "learning_rate": 4.991567129622076e-06, + "loss": 0.6432, + "step": 1781 + }, + { + "epoch": 0.7041738700913805, + "grad_norm": 0.5067434826741889, + "learning_rate": 4.991554267175947e-06, + "loss": 0.6453, + "step": 1782 + }, + { + "epoch": 0.7045690293899728, + "grad_norm": 0.49398025290400144, + "learning_rate": 4.991541394944508e-06, + "loss": 0.6082, + "step": 1783 + }, + { + "epoch": 0.704964188688565, + "grad_norm": 0.500932111828932, + "learning_rate": 4.991528512927806e-06, + "loss": 0.6222, + "step": 1784 + }, + { + "epoch": 0.7053593479871573, + "grad_norm": 0.4855703145150172, + "learning_rate": 4.991515621125893e-06, + "loss": 0.6395, + "step": 1785 + }, + { + "epoch": 0.7057545072857496, + "grad_norm": 0.48895788982354876, + "learning_rate": 4.99150271953882e-06, + "loss": 0.6254, + "step": 1786 + }, + { + "epoch": 0.7061496665843419, + "grad_norm": 0.49706293010128083, + "learning_rate": 4.9914898081666375e-06, + "loss": 0.6225, + "step": 1787 + }, + { + "epoch": 0.7065448258829341, + "grad_norm": 0.5177841757480415, + "learning_rate": 4.991476887009395e-06, + "loss": 0.6477, + "step": 1788 + }, + { + "epoch": 0.7069399851815263, + "grad_norm": 0.4768336877804729, + "learning_rate": 4.991463956067145e-06, + "loss": 0.6427, + "step": 1789 + }, + { + "epoch": 0.7073351444801186, + "grad_norm": 0.48872337485295864, + "learning_rate": 4.9914510153399375e-06, + "loss": 0.6283, + "step": 1790 + }, + { + "epoch": 0.7077303037787108, + "grad_norm": 0.5233174541443238, + "learning_rate": 4.9914380648278224e-06, + "loss": 0.6483, + "step": 1791 + }, + { + "epoch": 0.7081254630773031, + "grad_norm": 0.48167766487074626, + "learning_rate": 4.991425104530852e-06, + "loss": 0.6253, + "step": 1792 + }, + { + "epoch": 0.7085206223758953, + "grad_norm": 0.49635669925532444, + "learning_rate": 4.991412134449078e-06, + "loss": 0.6354, + "step": 1793 + }, + { + "epoch": 0.7089157816744875, + "grad_norm": 0.494107036823578, + "learning_rate": 4.991399154582548e-06, + "loss": 0.618, + "step": 1794 + }, + { + "epoch": 0.7093109409730798, + "grad_norm": 0.48336185208770716, + "learning_rate": 4.991386164931316e-06, + "loss": 0.6236, + "step": 1795 + }, + { + "epoch": 0.709706100271672, + "grad_norm": 0.49749633745530736, + "learning_rate": 4.991373165495431e-06, + "loss": 0.6336, + "step": 1796 + }, + { + "epoch": 0.7101012595702643, + "grad_norm": 0.5034095057979783, + "learning_rate": 4.991360156274946e-06, + "loss": 0.647, + "step": 1797 + }, + { + "epoch": 0.7104964188688565, + "grad_norm": 0.5322413221010839, + "learning_rate": 4.9913471372699115e-06, + "loss": 0.6011, + "step": 1798 + }, + { + "epoch": 0.7108915781674487, + "grad_norm": 0.49545526697939474, + "learning_rate": 4.991334108480377e-06, + "loss": 0.6364, + "step": 1799 + }, + { + "epoch": 0.711286737466041, + "grad_norm": 0.49309817729132516, + "learning_rate": 4.9913210699063965e-06, + "loss": 0.614, + "step": 1800 + }, + { + "epoch": 0.7116818967646332, + "grad_norm": 0.5691944811153536, + "learning_rate": 4.991308021548018e-06, + "loss": 0.6186, + "step": 1801 + }, + { + "epoch": 0.7120770560632255, + "grad_norm": 0.5449350122228545, + "learning_rate": 4.9912949634052955e-06, + "loss": 0.6247, + "step": 1802 + }, + { + "epoch": 0.7124722153618177, + "grad_norm": 0.5049416194812504, + "learning_rate": 4.991281895478279e-06, + "loss": 0.5903, + "step": 1803 + }, + { + "epoch": 0.71286737466041, + "grad_norm": 0.47208342204692466, + "learning_rate": 4.9912688177670195e-06, + "loss": 0.6231, + "step": 1804 + }, + { + "epoch": 0.7132625339590022, + "grad_norm": 0.5243565371621691, + "learning_rate": 4.991255730271569e-06, + "loss": 0.6277, + "step": 1805 + }, + { + "epoch": 0.7136576932575944, + "grad_norm": 0.5670677049532014, + "learning_rate": 4.991242632991979e-06, + "loss": 0.6372, + "step": 1806 + }, + { + "epoch": 0.7140528525561867, + "grad_norm": 0.5158307483356673, + "learning_rate": 4.9912295259283015e-06, + "loss": 0.6125, + "step": 1807 + }, + { + "epoch": 0.7144480118547789, + "grad_norm": 0.508525557153672, + "learning_rate": 4.991216409080586e-06, + "loss": 0.6101, + "step": 1808 + }, + { + "epoch": 0.7148431711533711, + "grad_norm": 0.5076780173186427, + "learning_rate": 4.9912032824488855e-06, + "loss": 0.6345, + "step": 1809 + }, + { + "epoch": 0.7152383304519635, + "grad_norm": 0.5069119882804742, + "learning_rate": 4.991190146033251e-06, + "loss": 0.5988, + "step": 1810 + }, + { + "epoch": 0.7156334897505557, + "grad_norm": 0.5111101135704393, + "learning_rate": 4.991176999833734e-06, + "loss": 0.6285, + "step": 1811 + }, + { + "epoch": 0.716028649049148, + "grad_norm": 0.494023778451646, + "learning_rate": 4.991163843850388e-06, + "loss": 0.6271, + "step": 1812 + }, + { + "epoch": 0.7164238083477402, + "grad_norm": 0.49401028308719136, + "learning_rate": 4.991150678083262e-06, + "loss": 0.6162, + "step": 1813 + }, + { + "epoch": 0.7168189676463325, + "grad_norm": 0.5455894933511418, + "learning_rate": 4.99113750253241e-06, + "loss": 0.6207, + "step": 1814 + }, + { + "epoch": 0.7172141269449247, + "grad_norm": 0.5038767858357743, + "learning_rate": 4.991124317197881e-06, + "loss": 0.6508, + "step": 1815 + }, + { + "epoch": 0.7176092862435169, + "grad_norm": 0.49228914244907324, + "learning_rate": 4.991111122079729e-06, + "loss": 0.6566, + "step": 1816 + }, + { + "epoch": 0.7180044455421092, + "grad_norm": 0.4939671059206662, + "learning_rate": 4.991097917178005e-06, + "loss": 0.6153, + "step": 1817 + }, + { + "epoch": 0.7183996048407014, + "grad_norm": 0.48627356885183, + "learning_rate": 4.991084702492761e-06, + "loss": 0.6016, + "step": 1818 + }, + { + "epoch": 0.7187947641392937, + "grad_norm": 0.4940616525370284, + "learning_rate": 4.99107147802405e-06, + "loss": 0.6139, + "step": 1819 + }, + { + "epoch": 0.7191899234378859, + "grad_norm": 0.498285217304123, + "learning_rate": 4.991058243771922e-06, + "loss": 0.6108, + "step": 1820 + }, + { + "epoch": 0.7195850827364781, + "grad_norm": 0.5053900244808598, + "learning_rate": 4.9910449997364295e-06, + "loss": 0.632, + "step": 1821 + }, + { + "epoch": 0.7199802420350704, + "grad_norm": 0.5073136517973986, + "learning_rate": 4.991031745917626e-06, + "loss": 0.6369, + "step": 1822 + }, + { + "epoch": 0.7203754013336626, + "grad_norm": 0.5195708029162378, + "learning_rate": 4.991018482315561e-06, + "loss": 0.6273, + "step": 1823 + }, + { + "epoch": 0.7207705606322549, + "grad_norm": 0.48766761832670025, + "learning_rate": 4.99100520893029e-06, + "loss": 0.6362, + "step": 1824 + }, + { + "epoch": 0.7211657199308471, + "grad_norm": 0.5320190857436362, + "learning_rate": 4.990991925761862e-06, + "loss": 0.6209, + "step": 1825 + }, + { + "epoch": 0.7215608792294393, + "grad_norm": 0.5362331175084071, + "learning_rate": 4.99097863281033e-06, + "loss": 0.6234, + "step": 1826 + }, + { + "epoch": 0.7219560385280316, + "grad_norm": 0.46797239650415506, + "learning_rate": 4.990965330075746e-06, + "loss": 0.6149, + "step": 1827 + }, + { + "epoch": 0.7223511978266238, + "grad_norm": 0.49165256567192805, + "learning_rate": 4.990952017558164e-06, + "loss": 0.6197, + "step": 1828 + }, + { + "epoch": 0.7227463571252161, + "grad_norm": 0.5734478891642341, + "learning_rate": 4.9909386952576355e-06, + "loss": 0.6099, + "step": 1829 + }, + { + "epoch": 0.7231415164238083, + "grad_norm": 0.5067294740219436, + "learning_rate": 4.9909253631742115e-06, + "loss": 0.6278, + "step": 1830 + }, + { + "epoch": 0.7235366757224005, + "grad_norm": 0.5289952166160319, + "learning_rate": 4.990912021307945e-06, + "loss": 0.6258, + "step": 1831 + }, + { + "epoch": 0.7239318350209928, + "grad_norm": 0.5043539597608305, + "learning_rate": 4.990898669658889e-06, + "loss": 0.6166, + "step": 1832 + }, + { + "epoch": 0.7243269943195851, + "grad_norm": 0.5137481323998266, + "learning_rate": 4.990885308227096e-06, + "loss": 0.6349, + "step": 1833 + }, + { + "epoch": 0.7247221536181774, + "grad_norm": 0.5223800914526201, + "learning_rate": 4.9908719370126175e-06, + "loss": 0.6335, + "step": 1834 + }, + { + "epoch": 0.7251173129167696, + "grad_norm": 0.49483421508060577, + "learning_rate": 4.990858556015507e-06, + "loss": 0.6243, + "step": 1835 + }, + { + "epoch": 0.7255124722153619, + "grad_norm": 0.507533775647517, + "learning_rate": 4.990845165235816e-06, + "loss": 0.6288, + "step": 1836 + }, + { + "epoch": 0.7259076315139541, + "grad_norm": 0.5337126385462883, + "learning_rate": 4.990831764673598e-06, + "loss": 0.6422, + "step": 1837 + }, + { + "epoch": 0.7263027908125463, + "grad_norm": 0.8373622357864616, + "learning_rate": 4.9908183543289055e-06, + "loss": 0.6366, + "step": 1838 + }, + { + "epoch": 0.7266979501111386, + "grad_norm": 0.4867027048473409, + "learning_rate": 4.99080493420179e-06, + "loss": 0.6196, + "step": 1839 + }, + { + "epoch": 0.7270931094097308, + "grad_norm": 0.5143251024891161, + "learning_rate": 4.990791504292307e-06, + "loss": 0.6378, + "step": 1840 + }, + { + "epoch": 0.7274882687083231, + "grad_norm": 0.4925899162999197, + "learning_rate": 4.990778064600506e-06, + "loss": 0.6077, + "step": 1841 + }, + { + "epoch": 0.7278834280069153, + "grad_norm": 0.46958077754810756, + "learning_rate": 4.990764615126442e-06, + "loss": 0.6249, + "step": 1842 + }, + { + "epoch": 0.7282785873055075, + "grad_norm": 0.502560018511133, + "learning_rate": 4.990751155870167e-06, + "loss": 0.6106, + "step": 1843 + }, + { + "epoch": 0.7286737466040998, + "grad_norm": 0.5082154553642297, + "learning_rate": 4.990737686831734e-06, + "loss": 0.6111, + "step": 1844 + }, + { + "epoch": 0.729068905902692, + "grad_norm": 0.48176602480847014, + "learning_rate": 4.990724208011195e-06, + "loss": 0.6225, + "step": 1845 + }, + { + "epoch": 0.7294640652012843, + "grad_norm": 0.47402002408096394, + "learning_rate": 4.990710719408604e-06, + "loss": 0.6258, + "step": 1846 + }, + { + "epoch": 0.7298592244998765, + "grad_norm": 0.4718364904587041, + "learning_rate": 4.9906972210240146e-06, + "loss": 0.6206, + "step": 1847 + }, + { + "epoch": 0.7302543837984687, + "grad_norm": 0.48775817606336896, + "learning_rate": 4.990683712857479e-06, + "loss": 0.6176, + "step": 1848 + }, + { + "epoch": 0.730649543097061, + "grad_norm": 0.48862016729905866, + "learning_rate": 4.99067019490905e-06, + "loss": 0.6477, + "step": 1849 + }, + { + "epoch": 0.7310447023956532, + "grad_norm": 0.4851256824616345, + "learning_rate": 4.990656667178781e-06, + "loss": 0.6297, + "step": 1850 + }, + { + "epoch": 0.7314398616942455, + "grad_norm": 0.4900449934686768, + "learning_rate": 4.9906431296667235e-06, + "loss": 0.6225, + "step": 1851 + }, + { + "epoch": 0.7318350209928377, + "grad_norm": 0.4803057428812473, + "learning_rate": 4.9906295823729334e-06, + "loss": 0.6096, + "step": 1852 + }, + { + "epoch": 0.7322301802914299, + "grad_norm": 0.5084616847011403, + "learning_rate": 4.990616025297462e-06, + "loss": 0.6402, + "step": 1853 + }, + { + "epoch": 0.7326253395900222, + "grad_norm": 0.48222218139870393, + "learning_rate": 4.990602458440364e-06, + "loss": 0.618, + "step": 1854 + }, + { + "epoch": 0.7330204988886144, + "grad_norm": 0.4768756457104444, + "learning_rate": 4.990588881801692e-06, + "loss": 0.6277, + "step": 1855 + }, + { + "epoch": 0.7334156581872067, + "grad_norm": 0.47054304737078606, + "learning_rate": 4.990575295381499e-06, + "loss": 0.639, + "step": 1856 + }, + { + "epoch": 0.733810817485799, + "grad_norm": 0.4954513777862497, + "learning_rate": 4.990561699179838e-06, + "loss": 0.6336, + "step": 1857 + }, + { + "epoch": 0.7342059767843913, + "grad_norm": 0.4977664948935915, + "learning_rate": 4.990548093196765e-06, + "loss": 0.6408, + "step": 1858 + }, + { + "epoch": 0.7346011360829835, + "grad_norm": 0.4821488918104782, + "learning_rate": 4.9905344774323285e-06, + "loss": 0.6477, + "step": 1859 + }, + { + "epoch": 0.7349962953815757, + "grad_norm": 0.47164432904373776, + "learning_rate": 4.990520851886586e-06, + "loss": 0.6116, + "step": 1860 + }, + { + "epoch": 0.735391454680168, + "grad_norm": 0.5130522518926274, + "learning_rate": 4.990507216559591e-06, + "loss": 0.6285, + "step": 1861 + }, + { + "epoch": 0.7357866139787602, + "grad_norm": 0.5085662976075583, + "learning_rate": 4.990493571451396e-06, + "loss": 0.6263, + "step": 1862 + }, + { + "epoch": 0.7361817732773525, + "grad_norm": 0.48426222103509337, + "learning_rate": 4.990479916562053e-06, + "loss": 0.6231, + "step": 1863 + }, + { + "epoch": 0.7365769325759447, + "grad_norm": 0.5176828196274378, + "learning_rate": 4.990466251891618e-06, + "loss": 0.6608, + "step": 1864 + }, + { + "epoch": 0.7369720918745369, + "grad_norm": 0.5221468288688574, + "learning_rate": 4.990452577440144e-06, + "loss": 0.6274, + "step": 1865 + }, + { + "epoch": 0.7373672511731292, + "grad_norm": 0.47850681281032353, + "learning_rate": 4.990438893207684e-06, + "loss": 0.6229, + "step": 1866 + }, + { + "epoch": 0.7377624104717214, + "grad_norm": 0.543170513792422, + "learning_rate": 4.990425199194293e-06, + "loss": 0.6256, + "step": 1867 + }, + { + "epoch": 0.7381575697703137, + "grad_norm": 0.5436786053164533, + "learning_rate": 4.990411495400024e-06, + "loss": 0.6326, + "step": 1868 + }, + { + "epoch": 0.7385527290689059, + "grad_norm": 0.4779013731244117, + "learning_rate": 4.9903977818249305e-06, + "loss": 0.624, + "step": 1869 + }, + { + "epoch": 0.7389478883674981, + "grad_norm": 0.5107248359446434, + "learning_rate": 4.9903840584690675e-06, + "loss": 0.6465, + "step": 1870 + }, + { + "epoch": 0.7393430476660904, + "grad_norm": 0.5191444772993868, + "learning_rate": 4.990370325332488e-06, + "loss": 0.6215, + "step": 1871 + }, + { + "epoch": 0.7397382069646826, + "grad_norm": 0.5148010616996448, + "learning_rate": 4.990356582415245e-06, + "loss": 0.6254, + "step": 1872 + }, + { + "epoch": 0.7401333662632749, + "grad_norm": 0.4822825362300129, + "learning_rate": 4.990342829717394e-06, + "loss": 0.6275, + "step": 1873 + }, + { + "epoch": 0.7405285255618671, + "grad_norm": 0.5111622322537774, + "learning_rate": 4.9903290672389895e-06, + "loss": 0.6222, + "step": 1874 + }, + { + "epoch": 0.7409236848604593, + "grad_norm": 0.524971372504675, + "learning_rate": 4.990315294980083e-06, + "loss": 0.6329, + "step": 1875 + }, + { + "epoch": 0.7413188441590516, + "grad_norm": 0.46981819397533325, + "learning_rate": 4.990301512940732e-06, + "loss": 0.6255, + "step": 1876 + }, + { + "epoch": 0.7417140034576438, + "grad_norm": 0.5061645450254407, + "learning_rate": 4.990287721120988e-06, + "loss": 0.6332, + "step": 1877 + }, + { + "epoch": 0.7421091627562361, + "grad_norm": 0.5081802490588557, + "learning_rate": 4.990273919520906e-06, + "loss": 0.6223, + "step": 1878 + }, + { + "epoch": 0.7425043220548283, + "grad_norm": 0.4904066696859732, + "learning_rate": 4.990260108140541e-06, + "loss": 0.6403, + "step": 1879 + }, + { + "epoch": 0.7428994813534205, + "grad_norm": 0.4991704339399477, + "learning_rate": 4.990246286979945e-06, + "loss": 0.6378, + "step": 1880 + }, + { + "epoch": 0.7432946406520129, + "grad_norm": 0.4834570028068084, + "learning_rate": 4.9902324560391745e-06, + "loss": 0.6003, + "step": 1881 + }, + { + "epoch": 0.7436897999506051, + "grad_norm": 0.4946859173466393, + "learning_rate": 4.990218615318283e-06, + "loss": 0.609, + "step": 1882 + }, + { + "epoch": 0.7440849592491974, + "grad_norm": 0.4988829525100141, + "learning_rate": 4.990204764817326e-06, + "loss": 0.6338, + "step": 1883 + }, + { + "epoch": 0.7444801185477896, + "grad_norm": 0.5882873111602047, + "learning_rate": 4.990190904536355e-06, + "loss": 0.652, + "step": 1884 + }, + { + "epoch": 0.7448752778463819, + "grad_norm": 0.4988958396253975, + "learning_rate": 4.990177034475427e-06, + "loss": 0.626, + "step": 1885 + }, + { + "epoch": 0.7452704371449741, + "grad_norm": 0.495275464799001, + "learning_rate": 4.990163154634596e-06, + "loss": 0.6095, + "step": 1886 + }, + { + "epoch": 0.7456655964435663, + "grad_norm": 0.4919380989234139, + "learning_rate": 4.990149265013916e-06, + "loss": 0.6211, + "step": 1887 + }, + { + "epoch": 0.7460607557421586, + "grad_norm": 0.5019734847912949, + "learning_rate": 4.990135365613442e-06, + "loss": 0.6299, + "step": 1888 + }, + { + "epoch": 0.7464559150407508, + "grad_norm": 0.4828661219472996, + "learning_rate": 4.9901214564332275e-06, + "loss": 0.6202, + "step": 1889 + }, + { + "epoch": 0.7468510743393431, + "grad_norm": 0.48107070668497365, + "learning_rate": 4.990107537473329e-06, + "loss": 0.6092, + "step": 1890 + }, + { + "epoch": 0.7472462336379353, + "grad_norm": 0.4791747545680655, + "learning_rate": 4.9900936087338e-06, + "loss": 0.6447, + "step": 1891 + }, + { + "epoch": 0.7476413929365275, + "grad_norm": 0.49731269148004137, + "learning_rate": 4.990079670214696e-06, + "loss": 0.6332, + "step": 1892 + }, + { + "epoch": 0.7480365522351198, + "grad_norm": 0.49528011354972196, + "learning_rate": 4.99006572191607e-06, + "loss": 0.6231, + "step": 1893 + }, + { + "epoch": 0.748431711533712, + "grad_norm": 0.4967662330741853, + "learning_rate": 4.990051763837978e-06, + "loss": 0.635, + "step": 1894 + }, + { + "epoch": 0.7488268708323043, + "grad_norm": 0.5265304272477109, + "learning_rate": 4.990037795980475e-06, + "loss": 0.622, + "step": 1895 + }, + { + "epoch": 0.7492220301308965, + "grad_norm": 0.49557723563808614, + "learning_rate": 4.990023818343615e-06, + "loss": 0.6204, + "step": 1896 + }, + { + "epoch": 0.7496171894294887, + "grad_norm": 0.4721310887893004, + "learning_rate": 4.9900098309274544e-06, + "loss": 0.6307, + "step": 1897 + }, + { + "epoch": 0.750012348728081, + "grad_norm": 0.5090822375536443, + "learning_rate": 4.989995833732047e-06, + "loss": 0.6415, + "step": 1898 + }, + { + "epoch": 0.7504075080266732, + "grad_norm": 0.48847487647832843, + "learning_rate": 4.989981826757447e-06, + "loss": 0.6407, + "step": 1899 + }, + { + "epoch": 0.7508026673252655, + "grad_norm": 0.4756812146928004, + "learning_rate": 4.989967810003712e-06, + "loss": 0.6259, + "step": 1900 + }, + { + "epoch": 0.7511978266238577, + "grad_norm": 0.4824209290886366, + "learning_rate": 4.989953783470895e-06, + "loss": 0.6306, + "step": 1901 + }, + { + "epoch": 0.7515929859224499, + "grad_norm": 0.48714913834725626, + "learning_rate": 4.9899397471590505e-06, + "loss": 0.6251, + "step": 1902 + }, + { + "epoch": 0.7519881452210422, + "grad_norm": 0.6529166302870585, + "learning_rate": 4.9899257010682355e-06, + "loss": 0.628, + "step": 1903 + }, + { + "epoch": 0.7523833045196345, + "grad_norm": 0.4958403702141253, + "learning_rate": 4.989911645198504e-06, + "loss": 0.6198, + "step": 1904 + }, + { + "epoch": 0.7527784638182268, + "grad_norm": 0.4834392149932408, + "learning_rate": 4.989897579549912e-06, + "loss": 0.6155, + "step": 1905 + }, + { + "epoch": 0.753173623116819, + "grad_norm": 0.48216115631971956, + "learning_rate": 4.989883504122514e-06, + "loss": 0.6083, + "step": 1906 + }, + { + "epoch": 0.7535687824154113, + "grad_norm": 0.5044002777393403, + "learning_rate": 4.989869418916364e-06, + "loss": 0.6608, + "step": 1907 + }, + { + "epoch": 0.7539639417140035, + "grad_norm": 0.4915044040208007, + "learning_rate": 4.98985532393152e-06, + "loss": 0.6306, + "step": 1908 + }, + { + "epoch": 0.7543591010125957, + "grad_norm": 0.4808889993199748, + "learning_rate": 4.989841219168037e-06, + "loss": 0.6045, + "step": 1909 + }, + { + "epoch": 0.754754260311188, + "grad_norm": 0.48355433928580754, + "learning_rate": 4.989827104625969e-06, + "loss": 0.6225, + "step": 1910 + }, + { + "epoch": 0.7551494196097802, + "grad_norm": 0.4946507696546552, + "learning_rate": 4.989812980305372e-06, + "loss": 0.622, + "step": 1911 + }, + { + "epoch": 0.7555445789083725, + "grad_norm": 0.4821780404795558, + "learning_rate": 4.989798846206302e-06, + "loss": 0.6334, + "step": 1912 + }, + { + "epoch": 0.7559397382069647, + "grad_norm": 0.4729609905608424, + "learning_rate": 4.989784702328814e-06, + "loss": 0.6138, + "step": 1913 + }, + { + "epoch": 0.7563348975055569, + "grad_norm": 0.47732682551126054, + "learning_rate": 4.989770548672962e-06, + "loss": 0.6308, + "step": 1914 + }, + { + "epoch": 0.7567300568041492, + "grad_norm": 0.49741773014790336, + "learning_rate": 4.9897563852388046e-06, + "loss": 0.6096, + "step": 1915 + }, + { + "epoch": 0.7571252161027414, + "grad_norm": 0.491814907631123, + "learning_rate": 4.989742212026396e-06, + "loss": 0.6148, + "step": 1916 + }, + { + "epoch": 0.7575203754013337, + "grad_norm": 0.4953592826702089, + "learning_rate": 4.989728029035791e-06, + "loss": 0.6176, + "step": 1917 + }, + { + "epoch": 0.7579155346999259, + "grad_norm": 0.5041003634543201, + "learning_rate": 4.989713836267047e-06, + "loss": 0.6147, + "step": 1918 + }, + { + "epoch": 0.7583106939985181, + "grad_norm": 0.6158285253583131, + "learning_rate": 4.989699633720218e-06, + "loss": 0.6389, + "step": 1919 + }, + { + "epoch": 0.7587058532971104, + "grad_norm": 0.5193736067753019, + "learning_rate": 4.989685421395361e-06, + "loss": 0.6441, + "step": 1920 + }, + { + "epoch": 0.7591010125957026, + "grad_norm": 0.5216730101645742, + "learning_rate": 4.989671199292533e-06, + "loss": 0.6293, + "step": 1921 + }, + { + "epoch": 0.7594961718942949, + "grad_norm": 0.4902177383288198, + "learning_rate": 4.989656967411787e-06, + "loss": 0.6349, + "step": 1922 + }, + { + "epoch": 0.7598913311928871, + "grad_norm": 0.48667537873584354, + "learning_rate": 4.9896427257531795e-06, + "loss": 0.5971, + "step": 1923 + }, + { + "epoch": 0.7602864904914793, + "grad_norm": 0.5011754592981034, + "learning_rate": 4.9896284743167685e-06, + "loss": 0.6287, + "step": 1924 + }, + { + "epoch": 0.7606816497900716, + "grad_norm": 0.49307058655487895, + "learning_rate": 4.989614213102608e-06, + "loss": 0.6161, + "step": 1925 + }, + { + "epoch": 0.7610768090886638, + "grad_norm": 0.48115110544146544, + "learning_rate": 4.989599942110754e-06, + "loss": 0.6203, + "step": 1926 + }, + { + "epoch": 0.7614719683872561, + "grad_norm": 0.5197946460675674, + "learning_rate": 4.9895856613412645e-06, + "loss": 0.6194, + "step": 1927 + }, + { + "epoch": 0.7618671276858484, + "grad_norm": 0.4970888947426336, + "learning_rate": 4.989571370794194e-06, + "loss": 0.6471, + "step": 1928 + }, + { + "epoch": 0.7622622869844407, + "grad_norm": 0.5938997863900486, + "learning_rate": 4.989557070469598e-06, + "loss": 0.627, + "step": 1929 + }, + { + "epoch": 0.7626574462830329, + "grad_norm": 0.4912343551544669, + "learning_rate": 4.989542760367535e-06, + "loss": 0.6073, + "step": 1930 + }, + { + "epoch": 0.7630526055816251, + "grad_norm": 0.5035910915834263, + "learning_rate": 4.989528440488059e-06, + "loss": 0.6061, + "step": 1931 + }, + { + "epoch": 0.7634477648802174, + "grad_norm": 0.47771768086688043, + "learning_rate": 4.9895141108312264e-06, + "loss": 0.6155, + "step": 1932 + }, + { + "epoch": 0.7638429241788096, + "grad_norm": 0.49893837750925285, + "learning_rate": 4.9894997713970945e-06, + "loss": 0.6194, + "step": 1933 + }, + { + "epoch": 0.7642380834774019, + "grad_norm": 0.48183105529501513, + "learning_rate": 4.989485422185719e-06, + "loss": 0.6072, + "step": 1934 + }, + { + "epoch": 0.7646332427759941, + "grad_norm": 0.49437340564091187, + "learning_rate": 4.989471063197157e-06, + "loss": 0.6235, + "step": 1935 + }, + { + "epoch": 0.7650284020745863, + "grad_norm": 0.5086950926898877, + "learning_rate": 4.989456694431464e-06, + "loss": 0.6204, + "step": 1936 + }, + { + "epoch": 0.7654235613731786, + "grad_norm": 0.4769480906144647, + "learning_rate": 4.989442315888697e-06, + "loss": 0.5961, + "step": 1937 + }, + { + "epoch": 0.7658187206717708, + "grad_norm": 0.49314554262535976, + "learning_rate": 4.9894279275689124e-06, + "loss": 0.6249, + "step": 1938 + }, + { + "epoch": 0.766213879970363, + "grad_norm": 0.5154666684010718, + "learning_rate": 4.989413529472166e-06, + "loss": 0.6252, + "step": 1939 + }, + { + "epoch": 0.7666090392689553, + "grad_norm": 0.4870804225575094, + "learning_rate": 4.989399121598516e-06, + "loss": 0.6125, + "step": 1940 + }, + { + "epoch": 0.7670041985675475, + "grad_norm": 0.5588292063163853, + "learning_rate": 4.989384703948017e-06, + "loss": 0.6152, + "step": 1941 + }, + { + "epoch": 0.7673993578661398, + "grad_norm": 0.49714045565400894, + "learning_rate": 4.989370276520726e-06, + "loss": 0.6175, + "step": 1942 + }, + { + "epoch": 0.767794517164732, + "grad_norm": 0.501785140415218, + "learning_rate": 4.989355839316701e-06, + "loss": 0.6385, + "step": 1943 + }, + { + "epoch": 0.7681896764633243, + "grad_norm": 0.4727502626196803, + "learning_rate": 4.989341392335998e-06, + "loss": 0.619, + "step": 1944 + }, + { + "epoch": 0.7685848357619165, + "grad_norm": 0.5109703415941235, + "learning_rate": 4.989326935578673e-06, + "loss": 0.5984, + "step": 1945 + }, + { + "epoch": 0.7689799950605087, + "grad_norm": 0.512283569257176, + "learning_rate": 4.9893124690447835e-06, + "loss": 0.6236, + "step": 1946 + }, + { + "epoch": 0.769375154359101, + "grad_norm": 0.4941171640474088, + "learning_rate": 4.989297992734386e-06, + "loss": 0.6163, + "step": 1947 + }, + { + "epoch": 0.7697703136576932, + "grad_norm": 0.5134713647285842, + "learning_rate": 4.989283506647539e-06, + "loss": 0.6033, + "step": 1948 + }, + { + "epoch": 0.7701654729562855, + "grad_norm": 0.5065873835627485, + "learning_rate": 4.9892690107842964e-06, + "loss": 0.6306, + "step": 1949 + }, + { + "epoch": 0.7705606322548777, + "grad_norm": 0.488059732764682, + "learning_rate": 4.9892545051447175e-06, + "loss": 0.6095, + "step": 1950 + }, + { + "epoch": 0.7709557915534699, + "grad_norm": 0.5039415788029225, + "learning_rate": 4.989239989728859e-06, + "loss": 0.614, + "step": 1951 + }, + { + "epoch": 0.7713509508520623, + "grad_norm": 0.5032937006287896, + "learning_rate": 4.989225464536776e-06, + "loss": 0.6511, + "step": 1952 + }, + { + "epoch": 0.7717461101506545, + "grad_norm": 0.49467260366888127, + "learning_rate": 4.989210929568527e-06, + "loss": 0.6123, + "step": 1953 + }, + { + "epoch": 0.7721412694492468, + "grad_norm": 0.5221851652881345, + "learning_rate": 4.98919638482417e-06, + "loss": 0.6117, + "step": 1954 + }, + { + "epoch": 0.772536428747839, + "grad_norm": 0.5123539208719938, + "learning_rate": 4.989181830303761e-06, + "loss": 0.6081, + "step": 1955 + }, + { + "epoch": 0.7729315880464313, + "grad_norm": 0.49070829163767216, + "learning_rate": 4.9891672660073566e-06, + "loss": 0.6169, + "step": 1956 + }, + { + "epoch": 0.7733267473450235, + "grad_norm": 0.504303254265576, + "learning_rate": 4.989152691935015e-06, + "loss": 0.6099, + "step": 1957 + }, + { + "epoch": 0.7737219066436157, + "grad_norm": 0.4823674667046708, + "learning_rate": 4.989138108086793e-06, + "loss": 0.6057, + "step": 1958 + }, + { + "epoch": 0.774117065942208, + "grad_norm": 0.488893702740108, + "learning_rate": 4.989123514462748e-06, + "loss": 0.6215, + "step": 1959 + }, + { + "epoch": 0.7745122252408002, + "grad_norm": 0.5065060131368263, + "learning_rate": 4.989108911062938e-06, + "loss": 0.6268, + "step": 1960 + }, + { + "epoch": 0.7749073845393925, + "grad_norm": 0.46321552663150034, + "learning_rate": 4.989094297887419e-06, + "loss": 0.622, + "step": 1961 + }, + { + "epoch": 0.7753025438379847, + "grad_norm": 0.5020834149871061, + "learning_rate": 4.989079674936249e-06, + "loss": 0.6185, + "step": 1962 + }, + { + "epoch": 0.7756977031365769, + "grad_norm": 0.4981082643061992, + "learning_rate": 4.989065042209486e-06, + "loss": 0.6292, + "step": 1963 + }, + { + "epoch": 0.7760928624351692, + "grad_norm": 0.47491786674487524, + "learning_rate": 4.989050399707186e-06, + "loss": 0.6186, + "step": 1964 + }, + { + "epoch": 0.7764880217337614, + "grad_norm": 0.47555277744404306, + "learning_rate": 4.989035747429409e-06, + "loss": 0.6205, + "step": 1965 + }, + { + "epoch": 0.7768831810323537, + "grad_norm": 0.5232964357542301, + "learning_rate": 4.989021085376209e-06, + "loss": 0.6325, + "step": 1966 + }, + { + "epoch": 0.7772783403309459, + "grad_norm": 0.48296952218572786, + "learning_rate": 4.989006413547647e-06, + "loss": 0.6232, + "step": 1967 + }, + { + "epoch": 0.7776734996295381, + "grad_norm": 0.49104169294101535, + "learning_rate": 4.988991731943778e-06, + "loss": 0.6095, + "step": 1968 + }, + { + "epoch": 0.7780686589281304, + "grad_norm": 0.4706225330715028, + "learning_rate": 4.988977040564662e-06, + "loss": 0.6001, + "step": 1969 + }, + { + "epoch": 0.7784638182267226, + "grad_norm": 0.4916599710014126, + "learning_rate": 4.988962339410356e-06, + "loss": 0.6018, + "step": 1970 + }, + { + "epoch": 0.7788589775253149, + "grad_norm": 0.4930270017350692, + "learning_rate": 4.988947628480917e-06, + "loss": 0.6232, + "step": 1971 + }, + { + "epoch": 0.7792541368239071, + "grad_norm": 0.49545246948058913, + "learning_rate": 4.988932907776402e-06, + "loss": 0.6423, + "step": 1972 + }, + { + "epoch": 0.7796492961224993, + "grad_norm": 0.48258013902876534, + "learning_rate": 4.988918177296871e-06, + "loss": 0.6234, + "step": 1973 + }, + { + "epoch": 0.7800444554210916, + "grad_norm": 0.4654851797716931, + "learning_rate": 4.988903437042379e-06, + "loss": 0.6061, + "step": 1974 + }, + { + "epoch": 0.7804396147196839, + "grad_norm": 0.4709686787629398, + "learning_rate": 4.988888687012988e-06, + "loss": 0.6047, + "step": 1975 + }, + { + "epoch": 0.7808347740182762, + "grad_norm": 0.480518188036582, + "learning_rate": 4.988873927208753e-06, + "loss": 0.6272, + "step": 1976 + }, + { + "epoch": 0.7812299333168684, + "grad_norm": 0.4830740477285781, + "learning_rate": 4.9888591576297315e-06, + "loss": 0.6199, + "step": 1977 + }, + { + "epoch": 0.7816250926154606, + "grad_norm": 0.49273772037327457, + "learning_rate": 4.988844378275983e-06, + "loss": 0.6278, + "step": 1978 + }, + { + "epoch": 0.7820202519140529, + "grad_norm": 0.5246304983064629, + "learning_rate": 4.988829589147566e-06, + "loss": 0.642, + "step": 1979 + }, + { + "epoch": 0.7824154112126451, + "grad_norm": 0.5572740103595891, + "learning_rate": 4.988814790244536e-06, + "loss": 0.6414, + "step": 1980 + }, + { + "epoch": 0.7828105705112374, + "grad_norm": 0.5029929788394535, + "learning_rate": 4.988799981566954e-06, + "loss": 0.6119, + "step": 1981 + }, + { + "epoch": 0.7832057298098296, + "grad_norm": 0.5256809229609704, + "learning_rate": 4.988785163114876e-06, + "loss": 0.6411, + "step": 1982 + }, + { + "epoch": 0.7836008891084218, + "grad_norm": 0.46688383401448613, + "learning_rate": 4.988770334888362e-06, + "loss": 0.6118, + "step": 1983 + }, + { + "epoch": 0.7839960484070141, + "grad_norm": 0.4953447635940505, + "learning_rate": 4.988755496887469e-06, + "loss": 0.616, + "step": 1984 + }, + { + "epoch": 0.7843912077056063, + "grad_norm": 0.5398841768039435, + "learning_rate": 4.988740649112256e-06, + "loss": 0.6177, + "step": 1985 + }, + { + "epoch": 0.7847863670041986, + "grad_norm": 0.5019947174695433, + "learning_rate": 4.988725791562782e-06, + "loss": 0.6296, + "step": 1986 + }, + { + "epoch": 0.7851815263027908, + "grad_norm": 0.49373501939581677, + "learning_rate": 4.988710924239103e-06, + "loss": 0.6365, + "step": 1987 + }, + { + "epoch": 0.785576685601383, + "grad_norm": 0.5151169309400688, + "learning_rate": 4.988696047141278e-06, + "loss": 0.5958, + "step": 1988 + }, + { + "epoch": 0.7859718448999753, + "grad_norm": 0.4864586261342781, + "learning_rate": 4.988681160269367e-06, + "loss": 0.6389, + "step": 1989 + }, + { + "epoch": 0.7863670041985675, + "grad_norm": 0.47700282379110287, + "learning_rate": 4.988666263623428e-06, + "loss": 0.6333, + "step": 1990 + }, + { + "epoch": 0.7867621634971598, + "grad_norm": 0.5030597816770237, + "learning_rate": 4.988651357203519e-06, + "loss": 0.6201, + "step": 1991 + }, + { + "epoch": 0.787157322795752, + "grad_norm": 0.4781975328115511, + "learning_rate": 4.988636441009698e-06, + "loss": 0.6175, + "step": 1992 + }, + { + "epoch": 0.7875524820943443, + "grad_norm": 0.5174944437050455, + "learning_rate": 4.988621515042025e-06, + "loss": 0.6462, + "step": 1993 + }, + { + "epoch": 0.7879476413929365, + "grad_norm": 0.5374787592098473, + "learning_rate": 4.988606579300557e-06, + "loss": 0.634, + "step": 1994 + }, + { + "epoch": 0.7883428006915287, + "grad_norm": 0.47432495552537524, + "learning_rate": 4.988591633785354e-06, + "loss": 0.6332, + "step": 1995 + }, + { + "epoch": 0.788737959990121, + "grad_norm": 0.48163792175353165, + "learning_rate": 4.988576678496474e-06, + "loss": 0.6038, + "step": 1996 + }, + { + "epoch": 0.7891331192887132, + "grad_norm": 0.46928084110206164, + "learning_rate": 4.988561713433977e-06, + "loss": 0.6053, + "step": 1997 + }, + { + "epoch": 0.7895282785873055, + "grad_norm": 0.5029560676742602, + "learning_rate": 4.988546738597919e-06, + "loss": 0.6366, + "step": 1998 + }, + { + "epoch": 0.7899234378858978, + "grad_norm": 0.4850619096705073, + "learning_rate": 4.988531753988361e-06, + "loss": 0.6231, + "step": 1999 + }, + { + "epoch": 0.79031859718449, + "grad_norm": 0.4946994316101753, + "learning_rate": 4.988516759605363e-06, + "loss": 0.6331, + "step": 2000 + }, + { + "epoch": 0.7907137564830823, + "grad_norm": 0.5367867666042327, + "learning_rate": 4.988501755448981e-06, + "loss": 0.639, + "step": 2001 + }, + { + "epoch": 0.7911089157816745, + "grad_norm": 0.5010435528360841, + "learning_rate": 4.988486741519275e-06, + "loss": 0.62, + "step": 2002 + }, + { + "epoch": 0.7915040750802668, + "grad_norm": 0.4847944569719835, + "learning_rate": 4.988471717816305e-06, + "loss": 0.6629, + "step": 2003 + }, + { + "epoch": 0.791899234378859, + "grad_norm": 0.7692429223559273, + "learning_rate": 4.988456684340128e-06, + "loss": 0.6407, + "step": 2004 + }, + { + "epoch": 0.7922943936774512, + "grad_norm": 0.5018665724568167, + "learning_rate": 4.9884416410908055e-06, + "loss": 0.6318, + "step": 2005 + }, + { + "epoch": 0.7926895529760435, + "grad_norm": 0.47570736353057425, + "learning_rate": 4.988426588068394e-06, + "loss": 0.6203, + "step": 2006 + }, + { + "epoch": 0.7930847122746357, + "grad_norm": 0.49443151633573906, + "learning_rate": 4.988411525272954e-06, + "loss": 0.6314, + "step": 2007 + }, + { + "epoch": 0.793479871573228, + "grad_norm": 0.47649470527071924, + "learning_rate": 4.988396452704546e-06, + "loss": 0.6122, + "step": 2008 + }, + { + "epoch": 0.7938750308718202, + "grad_norm": 0.4959125816855865, + "learning_rate": 4.988381370363227e-06, + "loss": 0.6264, + "step": 2009 + }, + { + "epoch": 0.7942701901704124, + "grad_norm": 0.4670880342830564, + "learning_rate": 4.9883662782490576e-06, + "loss": 0.5956, + "step": 2010 + }, + { + "epoch": 0.7946653494690047, + "grad_norm": 0.5050700439632813, + "learning_rate": 4.988351176362095e-06, + "loss": 0.6234, + "step": 2011 + }, + { + "epoch": 0.7950605087675969, + "grad_norm": 0.48271559918802115, + "learning_rate": 4.9883360647024e-06, + "loss": 0.6295, + "step": 2012 + }, + { + "epoch": 0.7954556680661892, + "grad_norm": 0.4636363520372164, + "learning_rate": 4.988320943270034e-06, + "loss": 0.6177, + "step": 2013 + }, + { + "epoch": 0.7958508273647814, + "grad_norm": 0.6019915166951819, + "learning_rate": 4.988305812065053e-06, + "loss": 0.6307, + "step": 2014 + }, + { + "epoch": 0.7962459866633737, + "grad_norm": 0.5005076532485925, + "learning_rate": 4.988290671087517e-06, + "loss": 0.6331, + "step": 2015 + }, + { + "epoch": 0.7966411459619659, + "grad_norm": 0.5057151799248627, + "learning_rate": 4.988275520337488e-06, + "loss": 0.6397, + "step": 2016 + }, + { + "epoch": 0.7970363052605581, + "grad_norm": 0.4620738413199476, + "learning_rate": 4.988260359815022e-06, + "loss": 0.6243, + "step": 2017 + }, + { + "epoch": 0.7974314645591504, + "grad_norm": 0.48041825341679295, + "learning_rate": 4.988245189520181e-06, + "loss": 0.6347, + "step": 2018 + }, + { + "epoch": 0.7978266238577426, + "grad_norm": 0.4854380738784906, + "learning_rate": 4.9882300094530236e-06, + "loss": 0.6321, + "step": 2019 + }, + { + "epoch": 0.7982217831563349, + "grad_norm": 0.4732758526185407, + "learning_rate": 4.988214819613611e-06, + "loss": 0.613, + "step": 2020 + }, + { + "epoch": 0.7986169424549271, + "grad_norm": 0.4902286255287152, + "learning_rate": 4.988199620002e-06, + "loss": 0.642, + "step": 2021 + }, + { + "epoch": 0.7990121017535193, + "grad_norm": 0.48018089371628836, + "learning_rate": 4.988184410618252e-06, + "loss": 0.6281, + "step": 2022 + }, + { + "epoch": 0.7994072610521117, + "grad_norm": 0.48276881530797655, + "learning_rate": 4.988169191462426e-06, + "loss": 0.624, + "step": 2023 + }, + { + "epoch": 0.7998024203507039, + "grad_norm": 0.4805829544314743, + "learning_rate": 4.988153962534583e-06, + "loss": 0.6355, + "step": 2024 + }, + { + "epoch": 0.8001975796492962, + "grad_norm": 0.4650087756290116, + "learning_rate": 4.988138723834783e-06, + "loss": 0.6159, + "step": 2025 + }, + { + "epoch": 0.8005927389478884, + "grad_norm": 0.500302525183259, + "learning_rate": 4.9881234753630835e-06, + "loss": 0.6207, + "step": 2026 + }, + { + "epoch": 0.8009878982464806, + "grad_norm": 0.4633513547631363, + "learning_rate": 4.988108217119547e-06, + "loss": 0.603, + "step": 2027 + }, + { + "epoch": 0.8013830575450729, + "grad_norm": 0.4566681685008273, + "learning_rate": 4.988092949104232e-06, + "loss": 0.6074, + "step": 2028 + }, + { + "epoch": 0.8017782168436651, + "grad_norm": 0.4988424175671541, + "learning_rate": 4.988077671317198e-06, + "loss": 0.6013, + "step": 2029 + }, + { + "epoch": 0.8021733761422574, + "grad_norm": 0.4929308612546276, + "learning_rate": 4.988062383758506e-06, + "loss": 0.6244, + "step": 2030 + }, + { + "epoch": 0.8025685354408496, + "grad_norm": 0.4781963632718603, + "learning_rate": 4.988047086428217e-06, + "loss": 0.6197, + "step": 2031 + }, + { + "epoch": 0.8029636947394418, + "grad_norm": 0.6162145939572514, + "learning_rate": 4.988031779326389e-06, + "loss": 0.6274, + "step": 2032 + }, + { + "epoch": 0.8033588540380341, + "grad_norm": 0.48495525673379924, + "learning_rate": 4.988016462453082e-06, + "loss": 0.6293, + "step": 2033 + }, + { + "epoch": 0.8037540133366263, + "grad_norm": 0.5043930111987597, + "learning_rate": 4.988001135808358e-06, + "loss": 0.6138, + "step": 2034 + }, + { + "epoch": 0.8041491726352186, + "grad_norm": 0.46666576704300994, + "learning_rate": 4.987985799392277e-06, + "loss": 0.6072, + "step": 2035 + }, + { + "epoch": 0.8045443319338108, + "grad_norm": 0.48805370345241733, + "learning_rate": 4.987970453204898e-06, + "loss": 0.6249, + "step": 2036 + }, + { + "epoch": 0.804939491232403, + "grad_norm": 0.48377604545911973, + "learning_rate": 4.987955097246282e-06, + "loss": 0.617, + "step": 2037 + }, + { + "epoch": 0.8053346505309953, + "grad_norm": 0.4720053686199778, + "learning_rate": 4.987939731516489e-06, + "loss": 0.6046, + "step": 2038 + }, + { + "epoch": 0.8057298098295875, + "grad_norm": 0.5828932827171168, + "learning_rate": 4.987924356015579e-06, + "loss": 0.6315, + "step": 2039 + }, + { + "epoch": 0.8061249691281798, + "grad_norm": 0.4891162177690198, + "learning_rate": 4.987908970743614e-06, + "loss": 0.6332, + "step": 2040 + }, + { + "epoch": 0.806520128426772, + "grad_norm": 0.4827837446920491, + "learning_rate": 4.987893575700652e-06, + "loss": 0.6258, + "step": 2041 + }, + { + "epoch": 0.8069152877253642, + "grad_norm": 0.46077989778395895, + "learning_rate": 4.987878170886755e-06, + "loss": 0.5977, + "step": 2042 + }, + { + "epoch": 0.8073104470239565, + "grad_norm": 0.48314109012199535, + "learning_rate": 4.987862756301984e-06, + "loss": 0.6335, + "step": 2043 + }, + { + "epoch": 0.8077056063225487, + "grad_norm": 0.47542653192334977, + "learning_rate": 4.987847331946398e-06, + "loss": 0.6009, + "step": 2044 + }, + { + "epoch": 0.808100765621141, + "grad_norm": 0.46969812561265184, + "learning_rate": 4.987831897820059e-06, + "loss": 0.6262, + "step": 2045 + }, + { + "epoch": 0.8084959249197333, + "grad_norm": 0.4697880436971334, + "learning_rate": 4.987816453923027e-06, + "loss": 0.6212, + "step": 2046 + }, + { + "epoch": 0.8088910842183256, + "grad_norm": 0.5290819232360799, + "learning_rate": 4.987801000255362e-06, + "loss": 0.6359, + "step": 2047 + }, + { + "epoch": 0.8092862435169178, + "grad_norm": 0.4774269454534689, + "learning_rate": 4.987785536817127e-06, + "loss": 0.6069, + "step": 2048 + }, + { + "epoch": 0.80968140281551, + "grad_norm": 0.4925043786131469, + "learning_rate": 4.987770063608379e-06, + "loss": 0.6237, + "step": 2049 + }, + { + "epoch": 0.8100765621141023, + "grad_norm": 0.48913030453903555, + "learning_rate": 4.987754580629182e-06, + "loss": 0.617, + "step": 2050 + }, + { + "epoch": 0.8104717214126945, + "grad_norm": 0.4870701601969778, + "learning_rate": 4.987739087879596e-06, + "loss": 0.6152, + "step": 2051 + }, + { + "epoch": 0.8108668807112868, + "grad_norm": 0.4732126768516727, + "learning_rate": 4.987723585359681e-06, + "loss": 0.619, + "step": 2052 + }, + { + "epoch": 0.811262040009879, + "grad_norm": 0.4889602356350729, + "learning_rate": 4.987708073069498e-06, + "loss": 0.6352, + "step": 2053 + }, + { + "epoch": 0.8116571993084712, + "grad_norm": 0.48795631546959883, + "learning_rate": 4.9876925510091085e-06, + "loss": 0.6143, + "step": 2054 + }, + { + "epoch": 0.8120523586070635, + "grad_norm": 0.46539271557990114, + "learning_rate": 4.987677019178573e-06, + "loss": 0.6046, + "step": 2055 + }, + { + "epoch": 0.8124475179056557, + "grad_norm": 0.49405822426322776, + "learning_rate": 4.987661477577953e-06, + "loss": 0.6014, + "step": 2056 + }, + { + "epoch": 0.812842677204248, + "grad_norm": 0.48386934068049814, + "learning_rate": 4.98764592620731e-06, + "loss": 0.6001, + "step": 2057 + }, + { + "epoch": 0.8132378365028402, + "grad_norm": 0.5187696026651084, + "learning_rate": 4.987630365066703e-06, + "loss": 0.6317, + "step": 2058 + }, + { + "epoch": 0.8136329958014324, + "grad_norm": 0.5042378749814072, + "learning_rate": 4.987614794156196e-06, + "loss": 0.6258, + "step": 2059 + }, + { + "epoch": 0.8140281551000247, + "grad_norm": 0.4644958030378567, + "learning_rate": 4.987599213475848e-06, + "loss": 0.6111, + "step": 2060 + }, + { + "epoch": 0.8144233143986169, + "grad_norm": 0.45523155834489054, + "learning_rate": 4.98758362302572e-06, + "loss": 0.614, + "step": 2061 + }, + { + "epoch": 0.8148184736972092, + "grad_norm": 0.4798484565189588, + "learning_rate": 4.987568022805875e-06, + "loss": 0.6285, + "step": 2062 + }, + { + "epoch": 0.8152136329958014, + "grad_norm": 0.48279313141569785, + "learning_rate": 4.987552412816373e-06, + "loss": 0.6223, + "step": 2063 + }, + { + "epoch": 0.8156087922943936, + "grad_norm": 0.452316831060907, + "learning_rate": 4.9875367930572764e-06, + "loss": 0.5993, + "step": 2064 + }, + { + "epoch": 0.8160039515929859, + "grad_norm": 0.4964123620781851, + "learning_rate": 4.987521163528645e-06, + "loss": 0.6159, + "step": 2065 + }, + { + "epoch": 0.8163991108915781, + "grad_norm": 0.487979292436224, + "learning_rate": 4.9875055242305414e-06, + "loss": 0.6027, + "step": 2066 + }, + { + "epoch": 0.8167942701901704, + "grad_norm": 0.46013675854663044, + "learning_rate": 4.987489875163027e-06, + "loss": 0.6076, + "step": 2067 + }, + { + "epoch": 0.8171894294887626, + "grad_norm": 0.4929449077318516, + "learning_rate": 4.987474216326162e-06, + "loss": 0.6387, + "step": 2068 + }, + { + "epoch": 0.8175845887873548, + "grad_norm": 0.4845064197629273, + "learning_rate": 4.987458547720009e-06, + "loss": 0.624, + "step": 2069 + }, + { + "epoch": 0.8179797480859472, + "grad_norm": 0.5780674060390795, + "learning_rate": 4.987442869344629e-06, + "loss": 0.6473, + "step": 2070 + }, + { + "epoch": 0.8183749073845394, + "grad_norm": 0.5093404649783988, + "learning_rate": 4.987427181200084e-06, + "loss": 0.6399, + "step": 2071 + }, + { + "epoch": 0.8187700666831317, + "grad_norm": 0.5027970250091759, + "learning_rate": 4.987411483286436e-06, + "loss": 0.6364, + "step": 2072 + }, + { + "epoch": 0.8191652259817239, + "grad_norm": 0.4681596123762207, + "learning_rate": 4.987395775603746e-06, + "loss": 0.5992, + "step": 2073 + }, + { + "epoch": 0.8195603852803162, + "grad_norm": 0.4774118024924558, + "learning_rate": 4.987380058152076e-06, + "loss": 0.6076, + "step": 2074 + }, + { + "epoch": 0.8199555445789084, + "grad_norm": 0.4758447061485989, + "learning_rate": 4.987364330931487e-06, + "loss": 0.6348, + "step": 2075 + }, + { + "epoch": 0.8203507038775006, + "grad_norm": 0.4820431900570343, + "learning_rate": 4.9873485939420405e-06, + "loss": 0.6126, + "step": 2076 + }, + { + "epoch": 0.8207458631760929, + "grad_norm": 0.46836863605198487, + "learning_rate": 4.987332847183801e-06, + "loss": 0.6143, + "step": 2077 + }, + { + "epoch": 0.8211410224746851, + "grad_norm": 0.47555062710148005, + "learning_rate": 4.987317090656827e-06, + "loss": 0.6142, + "step": 2078 + }, + { + "epoch": 0.8215361817732774, + "grad_norm": 0.4870036833640785, + "learning_rate": 4.987301324361182e-06, + "loss": 0.6083, + "step": 2079 + }, + { + "epoch": 0.8219313410718696, + "grad_norm": 0.4844543461263999, + "learning_rate": 4.9872855482969284e-06, + "loss": 0.6168, + "step": 2080 + }, + { + "epoch": 0.8223265003704618, + "grad_norm": 0.4746621256551825, + "learning_rate": 4.987269762464127e-06, + "loss": 0.62, + "step": 2081 + }, + { + "epoch": 0.8227216596690541, + "grad_norm": 0.4718461178840169, + "learning_rate": 4.987253966862841e-06, + "loss": 0.626, + "step": 2082 + }, + { + "epoch": 0.8231168189676463, + "grad_norm": 0.47940330337486675, + "learning_rate": 4.987238161493132e-06, + "loss": 0.6384, + "step": 2083 + }, + { + "epoch": 0.8235119782662386, + "grad_norm": 0.4831189596082736, + "learning_rate": 4.987222346355061e-06, + "loss": 0.6006, + "step": 2084 + }, + { + "epoch": 0.8239071375648308, + "grad_norm": 0.5111828704378958, + "learning_rate": 4.987206521448691e-06, + "loss": 0.6275, + "step": 2085 + }, + { + "epoch": 0.824302296863423, + "grad_norm": 0.4761799453175209, + "learning_rate": 4.987190686774084e-06, + "loss": 0.6166, + "step": 2086 + }, + { + "epoch": 0.8246974561620153, + "grad_norm": 0.479237633298224, + "learning_rate": 4.987174842331303e-06, + "loss": 0.6157, + "step": 2087 + }, + { + "epoch": 0.8250926154606075, + "grad_norm": 0.4877816511875846, + "learning_rate": 4.9871589881204085e-06, + "loss": 0.633, + "step": 2088 + }, + { + "epoch": 0.8254877747591998, + "grad_norm": 0.5597118422036577, + "learning_rate": 4.987143124141465e-06, + "loss": 0.6281, + "step": 2089 + }, + { + "epoch": 0.825882934057792, + "grad_norm": 0.4808185647219305, + "learning_rate": 4.987127250394532e-06, + "loss": 0.6284, + "step": 2090 + }, + { + "epoch": 0.8262780933563842, + "grad_norm": 0.49350572331519976, + "learning_rate": 4.987111366879674e-06, + "loss": 0.6327, + "step": 2091 + }, + { + "epoch": 0.8266732526549765, + "grad_norm": 0.4950625588459782, + "learning_rate": 4.987095473596954e-06, + "loss": 0.5991, + "step": 2092 + }, + { + "epoch": 0.8270684119535687, + "grad_norm": 0.48717004314629225, + "learning_rate": 4.987079570546432e-06, + "loss": 0.6125, + "step": 2093 + }, + { + "epoch": 0.8274635712521611, + "grad_norm": 0.477632541157664, + "learning_rate": 4.987063657728172e-06, + "loss": 0.6184, + "step": 2094 + }, + { + "epoch": 0.8278587305507533, + "grad_norm": 0.490920143190879, + "learning_rate": 4.987047735142236e-06, + "loss": 0.6112, + "step": 2095 + }, + { + "epoch": 0.8282538898493456, + "grad_norm": 0.4963440592377515, + "learning_rate": 4.9870318027886874e-06, + "loss": 0.6122, + "step": 2096 + }, + { + "epoch": 0.8286490491479378, + "grad_norm": 0.4867870903292657, + "learning_rate": 4.9870158606675875e-06, + "loss": 0.5993, + "step": 2097 + }, + { + "epoch": 0.82904420844653, + "grad_norm": 0.4976198098833027, + "learning_rate": 4.986999908779e-06, + "loss": 0.6465, + "step": 2098 + }, + { + "epoch": 0.8294393677451223, + "grad_norm": 0.4870232342621309, + "learning_rate": 4.986983947122986e-06, + "loss": 0.6201, + "step": 2099 + }, + { + "epoch": 0.8298345270437145, + "grad_norm": 0.46830678342177057, + "learning_rate": 4.9869679756996105e-06, + "loss": 0.6107, + "step": 2100 + }, + { + "epoch": 0.8302296863423068, + "grad_norm": 0.6549407593688914, + "learning_rate": 4.986951994508934e-06, + "loss": 0.6266, + "step": 2101 + }, + { + "epoch": 0.830624845640899, + "grad_norm": 0.47125597170756056, + "learning_rate": 4.98693600355102e-06, + "loss": 0.6293, + "step": 2102 + }, + { + "epoch": 0.8310200049394912, + "grad_norm": 0.491814138267252, + "learning_rate": 4.9869200028259325e-06, + "loss": 0.6273, + "step": 2103 + }, + { + "epoch": 0.8314151642380835, + "grad_norm": 0.4992441103556944, + "learning_rate": 4.986903992333734e-06, + "loss": 0.6485, + "step": 2104 + }, + { + "epoch": 0.8318103235366757, + "grad_norm": 0.4838463362223243, + "learning_rate": 4.986887972074485e-06, + "loss": 0.6233, + "step": 2105 + }, + { + "epoch": 0.832205482835268, + "grad_norm": 0.5162232947613259, + "learning_rate": 4.986871942048252e-06, + "loss": 0.5937, + "step": 2106 + }, + { + "epoch": 0.8326006421338602, + "grad_norm": 0.491642550409257, + "learning_rate": 4.986855902255094e-06, + "loss": 0.6275, + "step": 2107 + }, + { + "epoch": 0.8329958014324524, + "grad_norm": 0.4697163915777976, + "learning_rate": 4.9868398526950765e-06, + "loss": 0.6012, + "step": 2108 + }, + { + "epoch": 0.8333909607310447, + "grad_norm": 0.5029059751532935, + "learning_rate": 4.986823793368263e-06, + "loss": 0.6184, + "step": 2109 + }, + { + "epoch": 0.8337861200296369, + "grad_norm": 0.49890701973097873, + "learning_rate": 4.9868077242747156e-06, + "loss": 0.6489, + "step": 2110 + }, + { + "epoch": 0.8341812793282292, + "grad_norm": 0.47086406769377864, + "learning_rate": 4.986791645414498e-06, + "loss": 0.6158, + "step": 2111 + }, + { + "epoch": 0.8345764386268214, + "grad_norm": 0.4711411719674825, + "learning_rate": 4.986775556787672e-06, + "loss": 0.6108, + "step": 2112 + }, + { + "epoch": 0.8349715979254136, + "grad_norm": 0.5170133429016198, + "learning_rate": 4.986759458394302e-06, + "loss": 0.6205, + "step": 2113 + }, + { + "epoch": 0.8353667572240059, + "grad_norm": 0.48113051440505156, + "learning_rate": 4.986743350234451e-06, + "loss": 0.5975, + "step": 2114 + }, + { + "epoch": 0.8357619165225981, + "grad_norm": 0.4867487446570137, + "learning_rate": 4.986727232308182e-06, + "loss": 0.6193, + "step": 2115 + }, + { + "epoch": 0.8361570758211904, + "grad_norm": 0.5120874526734966, + "learning_rate": 4.986711104615558e-06, + "loss": 0.6233, + "step": 2116 + }, + { + "epoch": 0.8365522351197827, + "grad_norm": 0.5108538780702159, + "learning_rate": 4.986694967156644e-06, + "loss": 0.6112, + "step": 2117 + }, + { + "epoch": 0.836947394418375, + "grad_norm": 0.5009432477935026, + "learning_rate": 4.986678819931501e-06, + "loss": 0.6129, + "step": 2118 + }, + { + "epoch": 0.8373425537169672, + "grad_norm": 0.5394279204944199, + "learning_rate": 4.986662662940193e-06, + "loss": 0.6328, + "step": 2119 + }, + { + "epoch": 0.8377377130155594, + "grad_norm": 0.5074352200988693, + "learning_rate": 4.986646496182786e-06, + "loss": 0.6297, + "step": 2120 + }, + { + "epoch": 0.8381328723141517, + "grad_norm": 0.4623558465170015, + "learning_rate": 4.98663031965934e-06, + "loss": 0.5988, + "step": 2121 + }, + { + "epoch": 0.8385280316127439, + "grad_norm": 0.5048081966048025, + "learning_rate": 4.9866141333699215e-06, + "loss": 0.6172, + "step": 2122 + }, + { + "epoch": 0.8389231909113362, + "grad_norm": 0.4969733235844862, + "learning_rate": 4.986597937314591e-06, + "loss": 0.6126, + "step": 2123 + }, + { + "epoch": 0.8393183502099284, + "grad_norm": 0.4644808410529684, + "learning_rate": 4.986581731493415e-06, + "loss": 0.6122, + "step": 2124 + }, + { + "epoch": 0.8397135095085206, + "grad_norm": 0.49621141240128863, + "learning_rate": 4.986565515906455e-06, + "loss": 0.627, + "step": 2125 + }, + { + "epoch": 0.8401086688071129, + "grad_norm": 0.5140052050923385, + "learning_rate": 4.986549290553777e-06, + "loss": 0.6076, + "step": 2126 + }, + { + "epoch": 0.8405038281057051, + "grad_norm": 0.4774110621495217, + "learning_rate": 4.986533055435442e-06, + "loss": 0.638, + "step": 2127 + }, + { + "epoch": 0.8408989874042974, + "grad_norm": 0.4831024360099805, + "learning_rate": 4.986516810551515e-06, + "loss": 0.5982, + "step": 2128 + }, + { + "epoch": 0.8412941467028896, + "grad_norm": 0.489641326798273, + "learning_rate": 4.9865005559020605e-06, + "loss": 0.6139, + "step": 2129 + }, + { + "epoch": 0.8416893060014818, + "grad_norm": 0.4700688822738762, + "learning_rate": 4.986484291487142e-06, + "loss": 0.6094, + "step": 2130 + }, + { + "epoch": 0.8420844653000741, + "grad_norm": 0.5498327067539832, + "learning_rate": 4.9864680173068215e-06, + "loss": 0.6159, + "step": 2131 + }, + { + "epoch": 0.8424796245986663, + "grad_norm": 0.5209813131829413, + "learning_rate": 4.986451733361165e-06, + "loss": 0.6058, + "step": 2132 + }, + { + "epoch": 0.8428747838972586, + "grad_norm": 0.4755760046484086, + "learning_rate": 4.986435439650236e-06, + "loss": 0.6139, + "step": 2133 + }, + { + "epoch": 0.8432699431958508, + "grad_norm": 0.4980942846797616, + "learning_rate": 4.9864191361741e-06, + "loss": 0.6253, + "step": 2134 + }, + { + "epoch": 0.843665102494443, + "grad_norm": 0.5023114342986108, + "learning_rate": 4.986402822932818e-06, + "loss": 0.6174, + "step": 2135 + }, + { + "epoch": 0.8440602617930353, + "grad_norm": 0.487624480191535, + "learning_rate": 4.986386499926456e-06, + "loss": 0.6156, + "step": 2136 + }, + { + "epoch": 0.8444554210916275, + "grad_norm": 0.47126111959332073, + "learning_rate": 4.986370167155078e-06, + "loss": 0.604, + "step": 2137 + }, + { + "epoch": 0.8448505803902198, + "grad_norm": 0.4859939062367633, + "learning_rate": 4.986353824618747e-06, + "loss": 0.6085, + "step": 2138 + }, + { + "epoch": 0.845245739688812, + "grad_norm": 0.4865870625029928, + "learning_rate": 4.9863374723175285e-06, + "loss": 0.6155, + "step": 2139 + }, + { + "epoch": 0.8456408989874042, + "grad_norm": 0.5154805544761918, + "learning_rate": 4.9863211102514855e-06, + "loss": 0.6092, + "step": 2140 + }, + { + "epoch": 0.8460360582859966, + "grad_norm": 0.479204794222624, + "learning_rate": 4.986304738420684e-06, + "loss": 0.6202, + "step": 2141 + }, + { + "epoch": 0.8464312175845888, + "grad_norm": 0.4854511339075718, + "learning_rate": 4.986288356825186e-06, + "loss": 0.5973, + "step": 2142 + }, + { + "epoch": 0.8468263768831811, + "grad_norm": 0.4947439475538887, + "learning_rate": 4.986271965465058e-06, + "loss": 0.606, + "step": 2143 + }, + { + "epoch": 0.8472215361817733, + "grad_norm": 0.4772366979723403, + "learning_rate": 4.9862555643403634e-06, + "loss": 0.6412, + "step": 2144 + }, + { + "epoch": 0.8476166954803656, + "grad_norm": 0.45202495018365624, + "learning_rate": 4.986239153451167e-06, + "loss": 0.6009, + "step": 2145 + }, + { + "epoch": 0.8480118547789578, + "grad_norm": 0.4674899554356609, + "learning_rate": 4.986222732797532e-06, + "loss": 0.6083, + "step": 2146 + }, + { + "epoch": 0.84840701407755, + "grad_norm": 0.4720229690067215, + "learning_rate": 4.986206302379524e-06, + "loss": 0.6193, + "step": 2147 + }, + { + "epoch": 0.8488021733761423, + "grad_norm": 0.5068197579171579, + "learning_rate": 4.986189862197208e-06, + "loss": 0.6188, + "step": 2148 + }, + { + "epoch": 0.8491973326747345, + "grad_norm": 0.475281381106489, + "learning_rate": 4.9861734122506475e-06, + "loss": 0.6115, + "step": 2149 + }, + { + "epoch": 0.8495924919733268, + "grad_norm": 0.46986989066929974, + "learning_rate": 4.986156952539908e-06, + "loss": 0.6269, + "step": 2150 + }, + { + "epoch": 0.849987651271919, + "grad_norm": 0.46811282834432916, + "learning_rate": 4.986140483065053e-06, + "loss": 0.6215, + "step": 2151 + }, + { + "epoch": 0.8503828105705112, + "grad_norm": 0.4910081036842562, + "learning_rate": 4.986124003826148e-06, + "loss": 0.6181, + "step": 2152 + }, + { + "epoch": 0.8507779698691035, + "grad_norm": 0.4872472385487101, + "learning_rate": 4.986107514823257e-06, + "loss": 0.6192, + "step": 2153 + }, + { + "epoch": 0.8511731291676957, + "grad_norm": 0.47883592093053695, + "learning_rate": 4.986091016056446e-06, + "loss": 0.6201, + "step": 2154 + }, + { + "epoch": 0.851568288466288, + "grad_norm": 0.485056443392643, + "learning_rate": 4.986074507525779e-06, + "loss": 0.6156, + "step": 2155 + }, + { + "epoch": 0.8519634477648802, + "grad_norm": 0.4833961803335379, + "learning_rate": 4.986057989231321e-06, + "loss": 0.5979, + "step": 2156 + }, + { + "epoch": 0.8523586070634724, + "grad_norm": 0.48491015695364686, + "learning_rate": 4.9860414611731375e-06, + "loss": 0.6379, + "step": 2157 + }, + { + "epoch": 0.8527537663620647, + "grad_norm": 0.47943852412179266, + "learning_rate": 4.986024923351292e-06, + "loss": 0.6266, + "step": 2158 + }, + { + "epoch": 0.8531489256606569, + "grad_norm": 0.4899269774275039, + "learning_rate": 4.9860083757658505e-06, + "loss": 0.6266, + "step": 2159 + }, + { + "epoch": 0.8535440849592492, + "grad_norm": 0.47032529957003527, + "learning_rate": 4.985991818416877e-06, + "loss": 0.6491, + "step": 2160 + }, + { + "epoch": 0.8539392442578414, + "grad_norm": 0.4902687851540872, + "learning_rate": 4.9859752513044375e-06, + "loss": 0.6301, + "step": 2161 + }, + { + "epoch": 0.8543344035564336, + "grad_norm": 0.48553682325045194, + "learning_rate": 4.985958674428597e-06, + "loss": 0.6061, + "step": 2162 + }, + { + "epoch": 0.8547295628550259, + "grad_norm": 0.48411182285405713, + "learning_rate": 4.98594208778942e-06, + "loss": 0.6247, + "step": 2163 + }, + { + "epoch": 0.8551247221536181, + "grad_norm": 0.47951482859948863, + "learning_rate": 4.985925491386973e-06, + "loss": 0.6312, + "step": 2164 + }, + { + "epoch": 0.8555198814522105, + "grad_norm": 0.4746537184476964, + "learning_rate": 4.98590888522132e-06, + "loss": 0.6109, + "step": 2165 + }, + { + "epoch": 0.8559150407508027, + "grad_norm": 0.4730418292508103, + "learning_rate": 4.985892269292526e-06, + "loss": 0.6161, + "step": 2166 + }, + { + "epoch": 0.856310200049395, + "grad_norm": 0.4904685249853375, + "learning_rate": 4.985875643600656e-06, + "loss": 0.6025, + "step": 2167 + }, + { + "epoch": 0.8567053593479872, + "grad_norm": 0.48706770829729257, + "learning_rate": 4.985859008145777e-06, + "loss": 0.6083, + "step": 2168 + }, + { + "epoch": 0.8571005186465794, + "grad_norm": 0.4789542146071817, + "learning_rate": 4.9858423629279525e-06, + "loss": 0.6022, + "step": 2169 + }, + { + "epoch": 0.8574956779451717, + "grad_norm": 0.4764312440150958, + "learning_rate": 4.98582570794725e-06, + "loss": 0.6269, + "step": 2170 + }, + { + "epoch": 0.8578908372437639, + "grad_norm": 0.4905768390433796, + "learning_rate": 4.985809043203732e-06, + "loss": 0.6202, + "step": 2171 + }, + { + "epoch": 0.8582859965423562, + "grad_norm": 0.48600066838590206, + "learning_rate": 4.9857923686974664e-06, + "loss": 0.6293, + "step": 2172 + }, + { + "epoch": 0.8586811558409484, + "grad_norm": 0.5278954101657808, + "learning_rate": 4.985775684428518e-06, + "loss": 0.626, + "step": 2173 + }, + { + "epoch": 0.8590763151395406, + "grad_norm": 0.4772206937537348, + "learning_rate": 4.985758990396952e-06, + "loss": 0.6127, + "step": 2174 + }, + { + "epoch": 0.8594714744381329, + "grad_norm": 0.5032599137233393, + "learning_rate": 4.985742286602834e-06, + "loss": 0.6247, + "step": 2175 + }, + { + "epoch": 0.8598666337367251, + "grad_norm": 0.4890130823058908, + "learning_rate": 4.985725573046229e-06, + "loss": 0.6138, + "step": 2176 + }, + { + "epoch": 0.8602617930353174, + "grad_norm": 0.5090384794958555, + "learning_rate": 4.985708849727205e-06, + "loss": 0.6221, + "step": 2177 + }, + { + "epoch": 0.8606569523339096, + "grad_norm": 0.5064691369911446, + "learning_rate": 4.985692116645825e-06, + "loss": 0.6365, + "step": 2178 + }, + { + "epoch": 0.8610521116325018, + "grad_norm": 0.4884256917672367, + "learning_rate": 4.985675373802155e-06, + "loss": 0.6245, + "step": 2179 + }, + { + "epoch": 0.8614472709310941, + "grad_norm": 0.48277194568900933, + "learning_rate": 4.9856586211962636e-06, + "loss": 0.6303, + "step": 2180 + }, + { + "epoch": 0.8618424302296863, + "grad_norm": 0.4796258764505248, + "learning_rate": 4.985641858828213e-06, + "loss": 0.5917, + "step": 2181 + }, + { + "epoch": 0.8622375895282786, + "grad_norm": 0.4742026777974947, + "learning_rate": 4.985625086698071e-06, + "loss": 0.6014, + "step": 2182 + }, + { + "epoch": 0.8626327488268708, + "grad_norm": 0.4798722649367731, + "learning_rate": 4.9856083048059025e-06, + "loss": 0.6058, + "step": 2183 + }, + { + "epoch": 0.863027908125463, + "grad_norm": 0.484402428507696, + "learning_rate": 4.985591513151775e-06, + "loss": 0.6244, + "step": 2184 + }, + { + "epoch": 0.8634230674240553, + "grad_norm": 0.4859733762622107, + "learning_rate": 4.985574711735752e-06, + "loss": 0.6098, + "step": 2185 + }, + { + "epoch": 0.8638182267226475, + "grad_norm": 0.4810113395505376, + "learning_rate": 4.985557900557902e-06, + "loss": 0.6142, + "step": 2186 + }, + { + "epoch": 0.8642133860212398, + "grad_norm": 0.504453340452416, + "learning_rate": 4.985541079618289e-06, + "loss": 0.6289, + "step": 2187 + }, + { + "epoch": 0.8646085453198321, + "grad_norm": 0.4855393706146718, + "learning_rate": 4.985524248916981e-06, + "loss": 0.6215, + "step": 2188 + }, + { + "epoch": 0.8650037046184244, + "grad_norm": 0.4816372880869928, + "learning_rate": 4.985507408454042e-06, + "loss": 0.6214, + "step": 2189 + }, + { + "epoch": 0.8653988639170166, + "grad_norm": 0.4971312058231278, + "learning_rate": 4.98549055822954e-06, + "loss": 0.6184, + "step": 2190 + }, + { + "epoch": 0.8657940232156088, + "grad_norm": 0.4737419973037183, + "learning_rate": 4.985473698243539e-06, + "loss": 0.6362, + "step": 2191 + }, + { + "epoch": 0.8661891825142011, + "grad_norm": 0.4819594165338205, + "learning_rate": 4.985456828496108e-06, + "loss": 0.6265, + "step": 2192 + }, + { + "epoch": 0.8665843418127933, + "grad_norm": 0.4713033614828359, + "learning_rate": 4.985439948987311e-06, + "loss": 0.6062, + "step": 2193 + }, + { + "epoch": 0.8669795011113856, + "grad_norm": 0.4731870858016755, + "learning_rate": 4.985423059717216e-06, + "loss": 0.6222, + "step": 2194 + }, + { + "epoch": 0.8673746604099778, + "grad_norm": 0.4835316201063381, + "learning_rate": 4.9854061606858875e-06, + "loss": 0.636, + "step": 2195 + }, + { + "epoch": 0.86776981970857, + "grad_norm": 0.48541889377733993, + "learning_rate": 4.985389251893393e-06, + "loss": 0.6183, + "step": 2196 + }, + { + "epoch": 0.8681649790071623, + "grad_norm": 0.49391047126615967, + "learning_rate": 4.985372333339799e-06, + "loss": 0.6044, + "step": 2197 + }, + { + "epoch": 0.8685601383057545, + "grad_norm": 0.5015005291120314, + "learning_rate": 4.985355405025172e-06, + "loss": 0.6142, + "step": 2198 + }, + { + "epoch": 0.8689552976043468, + "grad_norm": 0.4898017203240075, + "learning_rate": 4.985338466949577e-06, + "loss": 0.6299, + "step": 2199 + }, + { + "epoch": 0.869350456902939, + "grad_norm": 0.49139107555550443, + "learning_rate": 4.985321519113083e-06, + "loss": 0.5891, + "step": 2200 + }, + { + "epoch": 0.8697456162015312, + "grad_norm": 0.4653030042469201, + "learning_rate": 4.985304561515754e-06, + "loss": 0.6056, + "step": 2201 + }, + { + "epoch": 0.8701407755001235, + "grad_norm": 0.48204406635629615, + "learning_rate": 4.985287594157659e-06, + "loss": 0.6149, + "step": 2202 + }, + { + "epoch": 0.8705359347987157, + "grad_norm": 0.4944551279372508, + "learning_rate": 4.9852706170388635e-06, + "loss": 0.6122, + "step": 2203 + }, + { + "epoch": 0.870931094097308, + "grad_norm": 0.4632483556840172, + "learning_rate": 4.985253630159434e-06, + "loss": 0.6117, + "step": 2204 + }, + { + "epoch": 0.8713262533959002, + "grad_norm": 0.4758369257065419, + "learning_rate": 4.9852366335194365e-06, + "loss": 0.631, + "step": 2205 + }, + { + "epoch": 0.8717214126944924, + "grad_norm": 0.4784261261826601, + "learning_rate": 4.985219627118939e-06, + "loss": 0.6254, + "step": 2206 + }, + { + "epoch": 0.8721165719930847, + "grad_norm": 0.46820459867095077, + "learning_rate": 4.985202610958008e-06, + "loss": 0.6041, + "step": 2207 + }, + { + "epoch": 0.8725117312916769, + "grad_norm": 0.6774502372821436, + "learning_rate": 4.98518558503671e-06, + "loss": 0.6349, + "step": 2208 + }, + { + "epoch": 0.8729068905902692, + "grad_norm": 0.477847341760241, + "learning_rate": 4.985168549355113e-06, + "loss": 0.5999, + "step": 2209 + }, + { + "epoch": 0.8733020498888614, + "grad_norm": 0.4709206605453225, + "learning_rate": 4.985151503913283e-06, + "loss": 0.6219, + "step": 2210 + }, + { + "epoch": 0.8736972091874536, + "grad_norm": 0.5073393890746453, + "learning_rate": 4.985134448711285e-06, + "loss": 0.6131, + "step": 2211 + }, + { + "epoch": 0.874092368486046, + "grad_norm": 0.47895189242680536, + "learning_rate": 4.98511738374919e-06, + "loss": 0.6276, + "step": 2212 + }, + { + "epoch": 0.8744875277846382, + "grad_norm": 0.4732689476332835, + "learning_rate": 4.985100309027062e-06, + "loss": 0.6256, + "step": 2213 + }, + { + "epoch": 0.8748826870832305, + "grad_norm": 0.4682497810387466, + "learning_rate": 4.985083224544969e-06, + "loss": 0.6024, + "step": 2214 + }, + { + "epoch": 0.8752778463818227, + "grad_norm": 0.5044101641071467, + "learning_rate": 4.985066130302979e-06, + "loss": 0.6292, + "step": 2215 + }, + { + "epoch": 0.875673005680415, + "grad_norm": 0.5079557931579949, + "learning_rate": 4.985049026301158e-06, + "loss": 0.6228, + "step": 2216 + }, + { + "epoch": 0.8760681649790072, + "grad_norm": 0.4867583650129786, + "learning_rate": 4.985031912539572e-06, + "loss": 0.6211, + "step": 2217 + }, + { + "epoch": 0.8764633242775994, + "grad_norm": 0.48476520801738077, + "learning_rate": 4.985014789018291e-06, + "loss": 0.5925, + "step": 2218 + }, + { + "epoch": 0.8768584835761917, + "grad_norm": 0.49568692882589194, + "learning_rate": 4.9849976557373805e-06, + "loss": 0.6223, + "step": 2219 + }, + { + "epoch": 0.8772536428747839, + "grad_norm": 0.5244025646487809, + "learning_rate": 4.984980512696908e-06, + "loss": 0.6113, + "step": 2220 + }, + { + "epoch": 0.8776488021733762, + "grad_norm": 0.4647233758348524, + "learning_rate": 4.984963359896941e-06, + "loss": 0.6245, + "step": 2221 + }, + { + "epoch": 0.8780439614719684, + "grad_norm": 0.493296948580835, + "learning_rate": 4.984946197337548e-06, + "loss": 0.6289, + "step": 2222 + }, + { + "epoch": 0.8784391207705606, + "grad_norm": 0.48487371283817793, + "learning_rate": 4.984929025018794e-06, + "loss": 0.618, + "step": 2223 + }, + { + "epoch": 0.8788342800691529, + "grad_norm": 0.4628323988563299, + "learning_rate": 4.9849118429407486e-06, + "loss": 0.6116, + "step": 2224 + }, + { + "epoch": 0.8792294393677451, + "grad_norm": 0.4708565386034292, + "learning_rate": 4.984894651103478e-06, + "loss": 0.6073, + "step": 2225 + }, + { + "epoch": 0.8796245986663374, + "grad_norm": 0.47596980477482004, + "learning_rate": 4.98487744950705e-06, + "loss": 0.6134, + "step": 2226 + }, + { + "epoch": 0.8800197579649296, + "grad_norm": 0.48436684059520585, + "learning_rate": 4.984860238151533e-06, + "loss": 0.6306, + "step": 2227 + }, + { + "epoch": 0.8804149172635218, + "grad_norm": 2.901406553646194, + "learning_rate": 4.984843017036993e-06, + "loss": 0.6343, + "step": 2228 + }, + { + "epoch": 0.8808100765621141, + "grad_norm": 0.5052255791290899, + "learning_rate": 4.984825786163499e-06, + "loss": 0.6131, + "step": 2229 + }, + { + "epoch": 0.8812052358607063, + "grad_norm": 0.4790543312612506, + "learning_rate": 4.984808545531118e-06, + "loss": 0.6215, + "step": 2230 + }, + { + "epoch": 0.8816003951592986, + "grad_norm": 0.46921948088144955, + "learning_rate": 4.984791295139917e-06, + "loss": 0.6004, + "step": 2231 + }, + { + "epoch": 0.8819955544578908, + "grad_norm": 0.4774028382073232, + "learning_rate": 4.984774034989965e-06, + "loss": 0.6351, + "step": 2232 + }, + { + "epoch": 0.882390713756483, + "grad_norm": 0.48613593924598036, + "learning_rate": 4.98475676508133e-06, + "loss": 0.6223, + "step": 2233 + }, + { + "epoch": 0.8827858730550753, + "grad_norm": 0.47488141594904604, + "learning_rate": 4.9847394854140796e-06, + "loss": 0.6361, + "step": 2234 + }, + { + "epoch": 0.8831810323536676, + "grad_norm": 0.469545225131135, + "learning_rate": 4.984722195988281e-06, + "loss": 0.6227, + "step": 2235 + }, + { + "epoch": 0.8835761916522599, + "grad_norm": 0.5460234062255062, + "learning_rate": 4.984704896804003e-06, + "loss": 0.5994, + "step": 2236 + }, + { + "epoch": 0.8839713509508521, + "grad_norm": 0.5087695798593977, + "learning_rate": 4.984687587861311e-06, + "loss": 0.6432, + "step": 2237 + }, + { + "epoch": 0.8843665102494443, + "grad_norm": 0.49848654741613146, + "learning_rate": 4.984670269160277e-06, + "loss": 0.6081, + "step": 2238 + }, + { + "epoch": 0.8847616695480366, + "grad_norm": 0.47222113192747045, + "learning_rate": 4.984652940700966e-06, + "loss": 0.5955, + "step": 2239 + }, + { + "epoch": 0.8851568288466288, + "grad_norm": 0.47776469283687745, + "learning_rate": 4.984635602483447e-06, + "loss": 0.6107, + "step": 2240 + }, + { + "epoch": 0.8855519881452211, + "grad_norm": 0.4761875116964049, + "learning_rate": 4.984618254507788e-06, + "loss": 0.6104, + "step": 2241 + }, + { + "epoch": 0.8859471474438133, + "grad_norm": 0.4772606111305406, + "learning_rate": 4.984600896774058e-06, + "loss": 0.612, + "step": 2242 + }, + { + "epoch": 0.8863423067424056, + "grad_norm": 0.4875078890714507, + "learning_rate": 4.984583529282323e-06, + "loss": 0.6046, + "step": 2243 + }, + { + "epoch": 0.8867374660409978, + "grad_norm": 0.5351729625323042, + "learning_rate": 4.984566152032654e-06, + "loss": 0.6302, + "step": 2244 + }, + { + "epoch": 0.88713262533959, + "grad_norm": 0.5134865903542387, + "learning_rate": 4.984548765025117e-06, + "loss": 0.6398, + "step": 2245 + }, + { + "epoch": 0.8875277846381823, + "grad_norm": 0.47943572967975917, + "learning_rate": 4.984531368259782e-06, + "loss": 0.583, + "step": 2246 + }, + { + "epoch": 0.8879229439367745, + "grad_norm": 0.48674628922099783, + "learning_rate": 4.984513961736716e-06, + "loss": 0.6189, + "step": 2247 + }, + { + "epoch": 0.8883181032353668, + "grad_norm": 0.5054075145507174, + "learning_rate": 4.984496545455988e-06, + "loss": 0.6236, + "step": 2248 + }, + { + "epoch": 0.888713262533959, + "grad_norm": 0.4717072903007971, + "learning_rate": 4.984479119417666e-06, + "loss": 0.6108, + "step": 2249 + }, + { + "epoch": 0.8891084218325512, + "grad_norm": 0.4973332505120243, + "learning_rate": 4.984461683621818e-06, + "loss": 0.6141, + "step": 2250 + }, + { + "epoch": 0.8895035811311435, + "grad_norm": 0.5202503935334306, + "learning_rate": 4.984444238068515e-06, + "loss": 0.611, + "step": 2251 + }, + { + "epoch": 0.8898987404297357, + "grad_norm": 0.4868713771823043, + "learning_rate": 4.984426782757822e-06, + "loss": 0.623, + "step": 2252 + }, + { + "epoch": 0.890293899728328, + "grad_norm": 0.4797581634559235, + "learning_rate": 4.984409317689809e-06, + "loss": 0.6192, + "step": 2253 + }, + { + "epoch": 0.8906890590269202, + "grad_norm": 0.49427284931935045, + "learning_rate": 4.984391842864546e-06, + "loss": 0.6149, + "step": 2254 + }, + { + "epoch": 0.8910842183255124, + "grad_norm": 0.4869169065393114, + "learning_rate": 4.9843743582821005e-06, + "loss": 0.621, + "step": 2255 + }, + { + "epoch": 0.8914793776241047, + "grad_norm": 0.4876328379453221, + "learning_rate": 4.98435686394254e-06, + "loss": 0.5969, + "step": 2256 + }, + { + "epoch": 0.8918745369226969, + "grad_norm": 0.4832121246843812, + "learning_rate": 4.984339359845935e-06, + "loss": 0.6265, + "step": 2257 + }, + { + "epoch": 0.8922696962212892, + "grad_norm": 0.566324624003132, + "learning_rate": 4.9843218459923535e-06, + "loss": 0.6097, + "step": 2258 + }, + { + "epoch": 0.8926648555198815, + "grad_norm": 0.49414708906211646, + "learning_rate": 4.9843043223818646e-06, + "loss": 0.6336, + "step": 2259 + }, + { + "epoch": 0.8930600148184737, + "grad_norm": 0.4670689454184053, + "learning_rate": 4.984286789014536e-06, + "loss": 0.6231, + "step": 2260 + }, + { + "epoch": 0.893455174117066, + "grad_norm": 0.47425632523334776, + "learning_rate": 4.984269245890438e-06, + "loss": 0.6116, + "step": 2261 + }, + { + "epoch": 0.8938503334156582, + "grad_norm": 0.481098132153482, + "learning_rate": 4.98425169300964e-06, + "loss": 0.6339, + "step": 2262 + }, + { + "epoch": 0.8942454927142505, + "grad_norm": 0.4881412900652154, + "learning_rate": 4.984234130372209e-06, + "loss": 0.617, + "step": 2263 + }, + { + "epoch": 0.8946406520128427, + "grad_norm": 0.49122021239183733, + "learning_rate": 4.984216557978214e-06, + "loss": 0.6199, + "step": 2264 + }, + { + "epoch": 0.895035811311435, + "grad_norm": 0.4890052599536881, + "learning_rate": 4.9841989758277255e-06, + "loss": 0.6359, + "step": 2265 + }, + { + "epoch": 0.8954309706100272, + "grad_norm": 0.5014418130563002, + "learning_rate": 4.984181383920812e-06, + "loss": 0.6162, + "step": 2266 + }, + { + "epoch": 0.8958261299086194, + "grad_norm": 0.48774048149914556, + "learning_rate": 4.984163782257543e-06, + "loss": 0.6175, + "step": 2267 + }, + { + "epoch": 0.8962212892072117, + "grad_norm": 0.4619145815324718, + "learning_rate": 4.9841461708379865e-06, + "loss": 0.6129, + "step": 2268 + }, + { + "epoch": 0.8966164485058039, + "grad_norm": 0.4706359033079191, + "learning_rate": 4.9841285496622124e-06, + "loss": 0.6217, + "step": 2269 + }, + { + "epoch": 0.8970116078043961, + "grad_norm": 0.5330737136007412, + "learning_rate": 4.984110918730289e-06, + "loss": 0.6302, + "step": 2270 + }, + { + "epoch": 0.8974067671029884, + "grad_norm": 0.4929056846260133, + "learning_rate": 4.984093278042288e-06, + "loss": 0.6106, + "step": 2271 + }, + { + "epoch": 0.8978019264015806, + "grad_norm": 0.4780478045107924, + "learning_rate": 4.984075627598276e-06, + "loss": 0.6254, + "step": 2272 + }, + { + "epoch": 0.8981970857001729, + "grad_norm": 0.48039178372010166, + "learning_rate": 4.984057967398324e-06, + "loss": 0.613, + "step": 2273 + }, + { + "epoch": 0.8985922449987651, + "grad_norm": 0.4902534031318573, + "learning_rate": 4.984040297442499e-06, + "loss": 0.6196, + "step": 2274 + }, + { + "epoch": 0.8989874042973574, + "grad_norm": 0.48251041315749466, + "learning_rate": 4.9840226177308745e-06, + "loss": 0.635, + "step": 2275 + }, + { + "epoch": 0.8993825635959496, + "grad_norm": 0.5107948423795806, + "learning_rate": 4.984004928263516e-06, + "loss": 0.6371, + "step": 2276 + }, + { + "epoch": 0.8997777228945418, + "grad_norm": 0.4849099292899462, + "learning_rate": 4.983987229040495e-06, + "loss": 0.5988, + "step": 2277 + }, + { + "epoch": 0.9001728821931341, + "grad_norm": 0.45033030391921003, + "learning_rate": 4.9839695200618804e-06, + "loss": 0.5957, + "step": 2278 + }, + { + "epoch": 0.9005680414917263, + "grad_norm": 0.48511884291874885, + "learning_rate": 4.9839518013277425e-06, + "loss": 0.5963, + "step": 2279 + }, + { + "epoch": 0.9009632007903186, + "grad_norm": 0.4818554448134295, + "learning_rate": 4.983934072838149e-06, + "loss": 0.6199, + "step": 2280 + }, + { + "epoch": 0.9013583600889108, + "grad_norm": 0.4864050546230534, + "learning_rate": 4.983916334593171e-06, + "loss": 0.6268, + "step": 2281 + }, + { + "epoch": 0.901753519387503, + "grad_norm": 0.48126531375723003, + "learning_rate": 4.9838985865928794e-06, + "loss": 0.645, + "step": 2282 + }, + { + "epoch": 0.9021486786860954, + "grad_norm": 0.47513264097168434, + "learning_rate": 4.9838808288373405e-06, + "loss": 0.628, + "step": 2283 + }, + { + "epoch": 0.9025438379846876, + "grad_norm": 0.4858642717988499, + "learning_rate": 4.983863061326627e-06, + "loss": 0.6167, + "step": 2284 + }, + { + "epoch": 0.9029389972832799, + "grad_norm": 0.5024695807134403, + "learning_rate": 4.983845284060808e-06, + "loss": 0.6108, + "step": 2285 + }, + { + "epoch": 0.9033341565818721, + "grad_norm": 0.47725860181210494, + "learning_rate": 4.983827497039953e-06, + "loss": 0.5905, + "step": 2286 + }, + { + "epoch": 0.9037293158804643, + "grad_norm": 0.4725442142623689, + "learning_rate": 4.983809700264131e-06, + "loss": 0.6251, + "step": 2287 + }, + { + "epoch": 0.9041244751790566, + "grad_norm": 0.4917289285832563, + "learning_rate": 4.9837918937334125e-06, + "loss": 0.642, + "step": 2288 + }, + { + "epoch": 0.9045196344776488, + "grad_norm": 0.49254790176708696, + "learning_rate": 4.983774077447869e-06, + "loss": 0.6035, + "step": 2289 + }, + { + "epoch": 0.9049147937762411, + "grad_norm": 0.47663891087530386, + "learning_rate": 4.983756251407569e-06, + "loss": 0.6165, + "step": 2290 + }, + { + "epoch": 0.9053099530748333, + "grad_norm": 0.46771621207039893, + "learning_rate": 4.983738415612581e-06, + "loss": 0.5982, + "step": 2291 + }, + { + "epoch": 0.9057051123734255, + "grad_norm": 0.48007080844590533, + "learning_rate": 4.983720570062979e-06, + "loss": 0.6118, + "step": 2292 + }, + { + "epoch": 0.9061002716720178, + "grad_norm": 0.47976244296713044, + "learning_rate": 4.9837027147588294e-06, + "loss": 0.6042, + "step": 2293 + }, + { + "epoch": 0.90649543097061, + "grad_norm": 0.5070646462309771, + "learning_rate": 4.983684849700204e-06, + "loss": 0.6336, + "step": 2294 + }, + { + "epoch": 0.9068905902692023, + "grad_norm": 0.4746142937787273, + "learning_rate": 4.983666974887172e-06, + "loss": 0.6275, + "step": 2295 + }, + { + "epoch": 0.9072857495677945, + "grad_norm": 0.4773646681690295, + "learning_rate": 4.983649090319806e-06, + "loss": 0.6222, + "step": 2296 + }, + { + "epoch": 0.9076809088663867, + "grad_norm": 0.4870955712187313, + "learning_rate": 4.983631195998173e-06, + "loss": 0.6336, + "step": 2297 + }, + { + "epoch": 0.908076068164979, + "grad_norm": 0.5385238468634763, + "learning_rate": 4.983613291922345e-06, + "loss": 0.6218, + "step": 2298 + }, + { + "epoch": 0.9084712274635712, + "grad_norm": 0.4704713115373773, + "learning_rate": 4.983595378092393e-06, + "loss": 0.6295, + "step": 2299 + }, + { + "epoch": 0.9088663867621635, + "grad_norm": 0.5046914027772894, + "learning_rate": 4.9835774545083856e-06, + "loss": 0.6145, + "step": 2300 + }, + { + "epoch": 0.9092615460607557, + "grad_norm": 0.4839085038708857, + "learning_rate": 4.983559521170394e-06, + "loss": 0.6254, + "step": 2301 + }, + { + "epoch": 0.909656705359348, + "grad_norm": 0.5087717682587891, + "learning_rate": 4.98354157807849e-06, + "loss": 0.6161, + "step": 2302 + }, + { + "epoch": 0.9100518646579402, + "grad_norm": 0.46452927922636916, + "learning_rate": 4.983523625232741e-06, + "loss": 0.621, + "step": 2303 + }, + { + "epoch": 0.9104470239565324, + "grad_norm": 0.46904690117204156, + "learning_rate": 4.983505662633221e-06, + "loss": 0.6, + "step": 2304 + }, + { + "epoch": 0.9108421832551247, + "grad_norm": 0.5301523549946244, + "learning_rate": 4.983487690279998e-06, + "loss": 0.6086, + "step": 2305 + }, + { + "epoch": 0.911237342553717, + "grad_norm": 0.5060704545892506, + "learning_rate": 4.983469708173143e-06, + "loss": 0.6229, + "step": 2306 + }, + { + "epoch": 0.9116325018523093, + "grad_norm": 0.4813420559082125, + "learning_rate": 4.9834517163127275e-06, + "loss": 0.6211, + "step": 2307 + }, + { + "epoch": 0.9120276611509015, + "grad_norm": 0.5220914006598277, + "learning_rate": 4.983433714698821e-06, + "loss": 0.6152, + "step": 2308 + }, + { + "epoch": 0.9124228204494937, + "grad_norm": 0.49458336480253934, + "learning_rate": 4.983415703331496e-06, + "loss": 0.5969, + "step": 2309 + }, + { + "epoch": 0.912817979748086, + "grad_norm": 0.48129884758049346, + "learning_rate": 4.983397682210821e-06, + "loss": 0.621, + "step": 2310 + }, + { + "epoch": 0.9132131390466782, + "grad_norm": 0.4727468640393369, + "learning_rate": 4.983379651336869e-06, + "loss": 0.6197, + "step": 2311 + }, + { + "epoch": 0.9136082983452705, + "grad_norm": 0.49587024829033005, + "learning_rate": 4.983361610709709e-06, + "loss": 0.6351, + "step": 2312 + }, + { + "epoch": 0.9140034576438627, + "grad_norm": 0.4887985778014849, + "learning_rate": 4.983343560329413e-06, + "loss": 0.6036, + "step": 2313 + }, + { + "epoch": 0.914398616942455, + "grad_norm": 0.5094208232865344, + "learning_rate": 4.983325500196051e-06, + "loss": 0.6281, + "step": 2314 + }, + { + "epoch": 0.9147937762410472, + "grad_norm": 0.520779337465162, + "learning_rate": 4.983307430309695e-06, + "loss": 0.5966, + "step": 2315 + }, + { + "epoch": 0.9151889355396394, + "grad_norm": 0.4983724492164491, + "learning_rate": 4.983289350670415e-06, + "loss": 0.6002, + "step": 2316 + }, + { + "epoch": 0.9155840948382317, + "grad_norm": 0.4977585587251415, + "learning_rate": 4.983271261278282e-06, + "loss": 0.6048, + "step": 2317 + }, + { + "epoch": 0.9159792541368239, + "grad_norm": 0.5074344584060536, + "learning_rate": 4.983253162133368e-06, + "loss": 0.6061, + "step": 2318 + }, + { + "epoch": 0.9163744134354161, + "grad_norm": 0.47393303007323573, + "learning_rate": 4.983235053235743e-06, + "loss": 0.6107, + "step": 2319 + }, + { + "epoch": 0.9167695727340084, + "grad_norm": 0.48092444595983924, + "learning_rate": 4.983216934585478e-06, + "loss": 0.6252, + "step": 2320 + }, + { + "epoch": 0.9171647320326006, + "grad_norm": 0.5173009133356737, + "learning_rate": 4.983198806182647e-06, + "loss": 0.6342, + "step": 2321 + }, + { + "epoch": 0.9175598913311929, + "grad_norm": 0.4823899650053501, + "learning_rate": 4.983180668027317e-06, + "loss": 0.6254, + "step": 2322 + }, + { + "epoch": 0.9179550506297851, + "grad_norm": 0.48307570444067927, + "learning_rate": 4.9831625201195625e-06, + "loss": 0.6074, + "step": 2323 + }, + { + "epoch": 0.9183502099283773, + "grad_norm": 0.5114385837411004, + "learning_rate": 4.983144362459452e-06, + "loss": 0.5995, + "step": 2324 + }, + { + "epoch": 0.9187453692269696, + "grad_norm": 0.49319776581630537, + "learning_rate": 4.9831261950470595e-06, + "loss": 0.6033, + "step": 2325 + }, + { + "epoch": 0.9191405285255618, + "grad_norm": 0.48172601408960075, + "learning_rate": 4.9831080178824545e-06, + "loss": 0.6144, + "step": 2326 + }, + { + "epoch": 0.9195356878241541, + "grad_norm": 0.6998545259966715, + "learning_rate": 4.983089830965709e-06, + "loss": 0.5958, + "step": 2327 + }, + { + "epoch": 0.9199308471227463, + "grad_norm": 0.47578948749489164, + "learning_rate": 4.983071634296895e-06, + "loss": 0.6066, + "step": 2328 + }, + { + "epoch": 0.9203260064213385, + "grad_norm": 0.4924183719099034, + "learning_rate": 4.983053427876083e-06, + "loss": 0.6328, + "step": 2329 + }, + { + "epoch": 0.9207211657199309, + "grad_norm": 0.46828818805254213, + "learning_rate": 4.983035211703345e-06, + "loss": 0.6272, + "step": 2330 + }, + { + "epoch": 0.9211163250185231, + "grad_norm": 0.5014119318637013, + "learning_rate": 4.9830169857787524e-06, + "loss": 0.6223, + "step": 2331 + }, + { + "epoch": 0.9215114843171154, + "grad_norm": 0.509639756966488, + "learning_rate": 4.982998750102378e-06, + "loss": 0.6451, + "step": 2332 + }, + { + "epoch": 0.9219066436157076, + "grad_norm": 0.4874868797487242, + "learning_rate": 4.982980504674291e-06, + "loss": 0.6198, + "step": 2333 + }, + { + "epoch": 0.9223018029142999, + "grad_norm": 0.4910768108400356, + "learning_rate": 4.982962249494564e-06, + "loss": 0.6133, + "step": 2334 + }, + { + "epoch": 0.9226969622128921, + "grad_norm": 0.600038361094374, + "learning_rate": 4.98294398456327e-06, + "loss": 0.5906, + "step": 2335 + }, + { + "epoch": 0.9230921215114843, + "grad_norm": 0.4882683625038398, + "learning_rate": 4.982925709880479e-06, + "loss": 0.6216, + "step": 2336 + }, + { + "epoch": 0.9234872808100766, + "grad_norm": 0.4877944511418589, + "learning_rate": 4.982907425446264e-06, + "loss": 0.6142, + "step": 2337 + }, + { + "epoch": 0.9238824401086688, + "grad_norm": 0.47849837901974185, + "learning_rate": 4.982889131260696e-06, + "loss": 0.629, + "step": 2338 + }, + { + "epoch": 0.9242775994072611, + "grad_norm": 0.49657514695675, + "learning_rate": 4.982870827323847e-06, + "loss": 0.6344, + "step": 2339 + }, + { + "epoch": 0.9246727587058533, + "grad_norm": 0.4756867812643004, + "learning_rate": 4.98285251363579e-06, + "loss": 0.5892, + "step": 2340 + }, + { + "epoch": 0.9250679180044455, + "grad_norm": 0.4792345020023528, + "learning_rate": 4.982834190196595e-06, + "loss": 0.6405, + "step": 2341 + }, + { + "epoch": 0.9254630773030378, + "grad_norm": 0.47568126691061463, + "learning_rate": 4.982815857006336e-06, + "loss": 0.6218, + "step": 2342 + }, + { + "epoch": 0.92585823660163, + "grad_norm": 0.4622150942965156, + "learning_rate": 4.982797514065083e-06, + "loss": 0.5966, + "step": 2343 + }, + { + "epoch": 0.9262533959002223, + "grad_norm": 0.48062685913560826, + "learning_rate": 4.982779161372909e-06, + "loss": 0.6067, + "step": 2344 + }, + { + "epoch": 0.9266485551988145, + "grad_norm": 0.49832908386195485, + "learning_rate": 4.9827607989298874e-06, + "loss": 0.6213, + "step": 2345 + }, + { + "epoch": 0.9270437144974067, + "grad_norm": 0.4651264905943558, + "learning_rate": 4.982742426736088e-06, + "loss": 0.6104, + "step": 2346 + }, + { + "epoch": 0.927438873795999, + "grad_norm": 0.4627917129880916, + "learning_rate": 4.982724044791584e-06, + "loss": 0.5947, + "step": 2347 + }, + { + "epoch": 0.9278340330945912, + "grad_norm": 0.47459773400411004, + "learning_rate": 4.982705653096447e-06, + "loss": 0.6171, + "step": 2348 + }, + { + "epoch": 0.9282291923931835, + "grad_norm": 0.4754459015910213, + "learning_rate": 4.982687251650751e-06, + "loss": 0.5867, + "step": 2349 + }, + { + "epoch": 0.9286243516917757, + "grad_norm": 0.4693623306765774, + "learning_rate": 4.9826688404545655e-06, + "loss": 0.6016, + "step": 2350 + }, + { + "epoch": 0.929019510990368, + "grad_norm": 0.4595592610393575, + "learning_rate": 4.982650419507965e-06, + "loss": 0.6083, + "step": 2351 + }, + { + "epoch": 0.9294146702889602, + "grad_norm": 1.0329197407008157, + "learning_rate": 4.982631988811022e-06, + "loss": 0.6151, + "step": 2352 + }, + { + "epoch": 0.9298098295875524, + "grad_norm": 0.46434445302211896, + "learning_rate": 4.982613548363807e-06, + "loss": 0.6109, + "step": 2353 + }, + { + "epoch": 0.9302049888861448, + "grad_norm": 0.4720496810881737, + "learning_rate": 4.982595098166394e-06, + "loss": 0.6135, + "step": 2354 + }, + { + "epoch": 0.930600148184737, + "grad_norm": 0.46567296934174945, + "learning_rate": 4.982576638218855e-06, + "loss": 0.6092, + "step": 2355 + }, + { + "epoch": 0.9309953074833293, + "grad_norm": 0.4783627968108407, + "learning_rate": 4.982558168521263e-06, + "loss": 0.6226, + "step": 2356 + }, + { + "epoch": 0.9313904667819215, + "grad_norm": 0.46771411865600776, + "learning_rate": 4.982539689073689e-06, + "loss": 0.6216, + "step": 2357 + }, + { + "epoch": 0.9317856260805137, + "grad_norm": 0.4824264375712755, + "learning_rate": 4.982521199876207e-06, + "loss": 0.6331, + "step": 2358 + }, + { + "epoch": 0.932180785379106, + "grad_norm": 0.4870691371321396, + "learning_rate": 4.9825027009288896e-06, + "loss": 0.6275, + "step": 2359 + }, + { + "epoch": 0.9325759446776982, + "grad_norm": 0.48392613415399993, + "learning_rate": 4.982484192231808e-06, + "loss": 0.6143, + "step": 2360 + }, + { + "epoch": 0.9329711039762905, + "grad_norm": 0.4622678197643799, + "learning_rate": 4.9824656737850365e-06, + "loss": 0.6188, + "step": 2361 + }, + { + "epoch": 0.9333662632748827, + "grad_norm": 0.49520864864087916, + "learning_rate": 4.982447145588648e-06, + "loss": 0.5983, + "step": 2362 + }, + { + "epoch": 0.9337614225734749, + "grad_norm": 0.49238331015657344, + "learning_rate": 4.982428607642713e-06, + "loss": 0.6304, + "step": 2363 + }, + { + "epoch": 0.9341565818720672, + "grad_norm": 0.46810832966740035, + "learning_rate": 4.9824100599473065e-06, + "loss": 0.6057, + "step": 2364 + }, + { + "epoch": 0.9345517411706594, + "grad_norm": 0.4789733237570732, + "learning_rate": 4.982391502502501e-06, + "loss": 0.5931, + "step": 2365 + }, + { + "epoch": 0.9349469004692517, + "grad_norm": 0.48015565027094287, + "learning_rate": 4.98237293530837e-06, + "loss": 0.6304, + "step": 2366 + }, + { + "epoch": 0.9353420597678439, + "grad_norm": 0.4851134573753921, + "learning_rate": 4.9823543583649846e-06, + "loss": 0.6296, + "step": 2367 + }, + { + "epoch": 0.9357372190664361, + "grad_norm": 0.48585992460960437, + "learning_rate": 4.982335771672418e-06, + "loss": 0.5901, + "step": 2368 + }, + { + "epoch": 0.9361323783650284, + "grad_norm": 0.47063745016814984, + "learning_rate": 4.982317175230744e-06, + "loss": 0.6024, + "step": 2369 + }, + { + "epoch": 0.9365275376636206, + "grad_norm": 0.4799775780201179, + "learning_rate": 4.982298569040036e-06, + "loss": 0.6158, + "step": 2370 + }, + { + "epoch": 0.9369226969622129, + "grad_norm": 0.487398135567962, + "learning_rate": 4.982279953100366e-06, + "loss": 0.6207, + "step": 2371 + }, + { + "epoch": 0.9373178562608051, + "grad_norm": 0.4707569265394489, + "learning_rate": 4.9822613274118085e-06, + "loss": 0.6129, + "step": 2372 + }, + { + "epoch": 0.9377130155593973, + "grad_norm": 0.48484895295531333, + "learning_rate": 4.9822426919744355e-06, + "loss": 0.5927, + "step": 2373 + }, + { + "epoch": 0.9381081748579896, + "grad_norm": 0.5086631550313602, + "learning_rate": 4.9822240467883205e-06, + "loss": 0.6513, + "step": 2374 + }, + { + "epoch": 0.9385033341565818, + "grad_norm": 0.4854307750752647, + "learning_rate": 4.9822053918535366e-06, + "loss": 0.639, + "step": 2375 + }, + { + "epoch": 0.9388984934551741, + "grad_norm": 0.49908919962486925, + "learning_rate": 4.982186727170157e-06, + "loss": 0.6171, + "step": 2376 + }, + { + "epoch": 0.9392936527537664, + "grad_norm": 0.47151786983504285, + "learning_rate": 4.982168052738255e-06, + "loss": 0.6127, + "step": 2377 + }, + { + "epoch": 0.9396888120523587, + "grad_norm": 0.48250797464606165, + "learning_rate": 4.982149368557905e-06, + "loss": 0.636, + "step": 2378 + }, + { + "epoch": 0.9400839713509509, + "grad_norm": 0.4839100853335766, + "learning_rate": 4.982130674629179e-06, + "loss": 0.6175, + "step": 2379 + }, + { + "epoch": 0.9404791306495431, + "grad_norm": 0.46593074774551896, + "learning_rate": 4.982111970952151e-06, + "loss": 0.6, + "step": 2380 + }, + { + "epoch": 0.9408742899481354, + "grad_norm": 0.4726472043264624, + "learning_rate": 4.982093257526894e-06, + "loss": 0.6306, + "step": 2381 + }, + { + "epoch": 0.9412694492467276, + "grad_norm": 0.4707519207258955, + "learning_rate": 4.982074534353482e-06, + "loss": 0.6089, + "step": 2382 + }, + { + "epoch": 0.9416646085453199, + "grad_norm": 0.47810887989613376, + "learning_rate": 4.9820558014319895e-06, + "loss": 0.6068, + "step": 2383 + }, + { + "epoch": 0.9420597678439121, + "grad_norm": 0.46370942325328973, + "learning_rate": 4.9820370587624875e-06, + "loss": 0.5938, + "step": 2384 + }, + { + "epoch": 0.9424549271425043, + "grad_norm": 0.5035785783227827, + "learning_rate": 4.9820183063450525e-06, + "loss": 0.6265, + "step": 2385 + }, + { + "epoch": 0.9428500864410966, + "grad_norm": 0.5024383370795096, + "learning_rate": 4.981999544179756e-06, + "loss": 0.6236, + "step": 2386 + }, + { + "epoch": 0.9432452457396888, + "grad_norm": 0.45990102437488745, + "learning_rate": 4.981980772266672e-06, + "loss": 0.5894, + "step": 2387 + }, + { + "epoch": 0.9436404050382811, + "grad_norm": 0.4654023117324646, + "learning_rate": 4.981961990605876e-06, + "loss": 0.5898, + "step": 2388 + }, + { + "epoch": 0.9440355643368733, + "grad_norm": 0.4762087512767081, + "learning_rate": 4.981943199197439e-06, + "loss": 0.6056, + "step": 2389 + }, + { + "epoch": 0.9444307236354655, + "grad_norm": 0.4749986650314227, + "learning_rate": 4.981924398041436e-06, + "loss": 0.6135, + "step": 2390 + }, + { + "epoch": 0.9448258829340578, + "grad_norm": 0.4719526074802648, + "learning_rate": 4.981905587137943e-06, + "loss": 0.5893, + "step": 2391 + }, + { + "epoch": 0.94522104223265, + "grad_norm": 0.4755247323470361, + "learning_rate": 4.9818867664870306e-06, + "loss": 0.6073, + "step": 2392 + }, + { + "epoch": 0.9456162015312423, + "grad_norm": 0.48675290749713584, + "learning_rate": 4.981867936088774e-06, + "loss": 0.5924, + "step": 2393 + }, + { + "epoch": 0.9460113608298345, + "grad_norm": 0.4743633378571163, + "learning_rate": 4.9818490959432485e-06, + "loss": 0.62, + "step": 2394 + }, + { + "epoch": 0.9464065201284267, + "grad_norm": 0.49288950026353556, + "learning_rate": 4.981830246050526e-06, + "loss": 0.6045, + "step": 2395 + }, + { + "epoch": 0.946801679427019, + "grad_norm": 0.4884082163788517, + "learning_rate": 4.981811386410681e-06, + "loss": 0.6118, + "step": 2396 + }, + { + "epoch": 0.9471968387256112, + "grad_norm": 0.4818633127585905, + "learning_rate": 4.981792517023788e-06, + "loss": 0.6154, + "step": 2397 + }, + { + "epoch": 0.9475919980242035, + "grad_norm": 0.5212061592296549, + "learning_rate": 4.981773637889921e-06, + "loss": 0.5999, + "step": 2398 + }, + { + "epoch": 0.9479871573227957, + "grad_norm": 0.4921362608346308, + "learning_rate": 4.981754749009154e-06, + "loss": 0.6248, + "step": 2399 + }, + { + "epoch": 0.9483823166213879, + "grad_norm": 0.6375277486862304, + "learning_rate": 4.9817358503815616e-06, + "loss": 0.6205, + "step": 2400 + }, + { + "epoch": 0.9487774759199803, + "grad_norm": 0.4822713530796236, + "learning_rate": 4.9817169420072175e-06, + "loss": 0.6068, + "step": 2401 + }, + { + "epoch": 0.9491726352185725, + "grad_norm": 0.45735313205333067, + "learning_rate": 4.981698023886197e-06, + "loss": 0.6045, + "step": 2402 + }, + { + "epoch": 0.9495677945171648, + "grad_norm": 0.4870688284599884, + "learning_rate": 4.9816790960185725e-06, + "loss": 0.6169, + "step": 2403 + }, + { + "epoch": 0.949962953815757, + "grad_norm": 0.4508977591163079, + "learning_rate": 4.981660158404421e-06, + "loss": 0.6081, + "step": 2404 + }, + { + "epoch": 0.9503581131143493, + "grad_norm": 0.48177734751539525, + "learning_rate": 4.981641211043814e-06, + "loss": 0.6075, + "step": 2405 + }, + { + "epoch": 0.9507532724129415, + "grad_norm": 0.45900423930549045, + "learning_rate": 4.981622253936828e-06, + "loss": 0.5804, + "step": 2406 + }, + { + "epoch": 0.9511484317115337, + "grad_norm": 0.46636251306928633, + "learning_rate": 4.9816032870835366e-06, + "loss": 0.6093, + "step": 2407 + }, + { + "epoch": 0.951543591010126, + "grad_norm": 0.49173914273079045, + "learning_rate": 4.981584310484014e-06, + "loss": 0.6423, + "step": 2408 + }, + { + "epoch": 0.9519387503087182, + "grad_norm": 0.4871067267842084, + "learning_rate": 4.981565324138336e-06, + "loss": 0.6404, + "step": 2409 + }, + { + "epoch": 0.9523339096073105, + "grad_norm": 0.48801903916719275, + "learning_rate": 4.981546328046575e-06, + "loss": 0.6091, + "step": 2410 + }, + { + "epoch": 0.9527290689059027, + "grad_norm": 0.47631638584783426, + "learning_rate": 4.9815273222088075e-06, + "loss": 0.6086, + "step": 2411 + }, + { + "epoch": 0.9531242282044949, + "grad_norm": 0.45326422779082315, + "learning_rate": 4.981508306625108e-06, + "loss": 0.612, + "step": 2412 + }, + { + "epoch": 0.9535193875030872, + "grad_norm": 0.5469615293144247, + "learning_rate": 4.981489281295549e-06, + "loss": 0.5988, + "step": 2413 + }, + { + "epoch": 0.9539145468016794, + "grad_norm": 0.49738406140305297, + "learning_rate": 4.9814702462202094e-06, + "loss": 0.6225, + "step": 2414 + }, + { + "epoch": 0.9543097061002717, + "grad_norm": 0.4621955995020316, + "learning_rate": 4.98145120139916e-06, + "loss": 0.6016, + "step": 2415 + }, + { + "epoch": 0.9547048653988639, + "grad_norm": 0.46293528627278396, + "learning_rate": 4.981432146832478e-06, + "loss": 0.6214, + "step": 2416 + }, + { + "epoch": 0.9551000246974561, + "grad_norm": 0.49663994492008307, + "learning_rate": 4.981413082520236e-06, + "loss": 0.6133, + "step": 2417 + }, + { + "epoch": 0.9554951839960484, + "grad_norm": 0.4817582642896156, + "learning_rate": 4.981394008462511e-06, + "loss": 0.6238, + "step": 2418 + }, + { + "epoch": 0.9558903432946406, + "grad_norm": 0.5586123030609345, + "learning_rate": 4.981374924659377e-06, + "loss": 0.6243, + "step": 2419 + }, + { + "epoch": 0.9562855025932329, + "grad_norm": 0.4850500636019948, + "learning_rate": 4.9813558311109095e-06, + "loss": 0.622, + "step": 2420 + }, + { + "epoch": 0.9566806618918251, + "grad_norm": 0.4689441752707809, + "learning_rate": 4.9813367278171835e-06, + "loss": 0.5765, + "step": 2421 + }, + { + "epoch": 0.9570758211904173, + "grad_norm": 0.47158958076654856, + "learning_rate": 4.981317614778272e-06, + "loss": 0.6296, + "step": 2422 + }, + { + "epoch": 0.9574709804890096, + "grad_norm": 0.514101025815252, + "learning_rate": 4.981298491994252e-06, + "loss": 0.6234, + "step": 2423 + }, + { + "epoch": 0.9578661397876018, + "grad_norm": 0.48399944464009775, + "learning_rate": 4.981279359465199e-06, + "loss": 0.6107, + "step": 2424 + }, + { + "epoch": 0.9582612990861942, + "grad_norm": 0.48729938368312087, + "learning_rate": 4.981260217191187e-06, + "loss": 0.5893, + "step": 2425 + }, + { + "epoch": 0.9586564583847864, + "grad_norm": 0.49004477541134556, + "learning_rate": 4.981241065172292e-06, + "loss": 0.6008, + "step": 2426 + }, + { + "epoch": 0.9590516176833787, + "grad_norm": 0.48347415116566556, + "learning_rate": 4.9812219034085886e-06, + "loss": 0.618, + "step": 2427 + }, + { + "epoch": 0.9594467769819709, + "grad_norm": 0.4793590490738295, + "learning_rate": 4.981202731900152e-06, + "loss": 0.6251, + "step": 2428 + }, + { + "epoch": 0.9598419362805631, + "grad_norm": 0.48347957011954834, + "learning_rate": 4.9811835506470575e-06, + "loss": 0.6144, + "step": 2429 + }, + { + "epoch": 0.9602370955791554, + "grad_norm": 0.5000951926275022, + "learning_rate": 4.981164359649381e-06, + "loss": 0.6217, + "step": 2430 + }, + { + "epoch": 0.9606322548777476, + "grad_norm": 0.4853268721052706, + "learning_rate": 4.981145158907198e-06, + "loss": 0.6107, + "step": 2431 + }, + { + "epoch": 0.9610274141763399, + "grad_norm": 0.4775051457662627, + "learning_rate": 4.981125948420583e-06, + "loss": 0.619, + "step": 2432 + }, + { + "epoch": 0.9614225734749321, + "grad_norm": 0.4610581366616115, + "learning_rate": 4.981106728189612e-06, + "loss": 0.6011, + "step": 2433 + }, + { + "epoch": 0.9618177327735243, + "grad_norm": 0.46908821425073965, + "learning_rate": 4.98108749821436e-06, + "loss": 0.5991, + "step": 2434 + }, + { + "epoch": 0.9622128920721166, + "grad_norm": 0.4643140368630478, + "learning_rate": 4.981068258494903e-06, + "loss": 0.5848, + "step": 2435 + }, + { + "epoch": 0.9626080513707088, + "grad_norm": 0.48640039378791994, + "learning_rate": 4.981049009031317e-06, + "loss": 0.6175, + "step": 2436 + }, + { + "epoch": 0.9630032106693011, + "grad_norm": 0.4707625851268911, + "learning_rate": 4.9810297498236765e-06, + "loss": 0.5903, + "step": 2437 + }, + { + "epoch": 0.9633983699678933, + "grad_norm": 0.47186949155654456, + "learning_rate": 4.981010480872058e-06, + "loss": 0.6051, + "step": 2438 + }, + { + "epoch": 0.9637935292664855, + "grad_norm": 0.47253458553373584, + "learning_rate": 4.980991202176536e-06, + "loss": 0.6135, + "step": 2439 + }, + { + "epoch": 0.9641886885650778, + "grad_norm": 0.4739469724306039, + "learning_rate": 4.980971913737188e-06, + "loss": 0.5884, + "step": 2440 + }, + { + "epoch": 0.96458384786367, + "grad_norm": 0.47756433353936395, + "learning_rate": 4.980952615554089e-06, + "loss": 0.6054, + "step": 2441 + }, + { + "epoch": 0.9649790071622623, + "grad_norm": 0.46338471724667013, + "learning_rate": 4.980933307627315e-06, + "loss": 0.611, + "step": 2442 + }, + { + "epoch": 0.9653741664608545, + "grad_norm": 0.487680669856884, + "learning_rate": 4.98091398995694e-06, + "loss": 0.6192, + "step": 2443 + }, + { + "epoch": 0.9657693257594467, + "grad_norm": 0.5227924200190431, + "learning_rate": 4.9808946625430425e-06, + "loss": 0.6034, + "step": 2444 + }, + { + "epoch": 0.966164485058039, + "grad_norm": 0.49032597318513343, + "learning_rate": 4.980875325385697e-06, + "loss": 0.588, + "step": 2445 + }, + { + "epoch": 0.9665596443566312, + "grad_norm": 0.46454318608059514, + "learning_rate": 4.98085597848498e-06, + "loss": 0.5968, + "step": 2446 + }, + { + "epoch": 0.9669548036552235, + "grad_norm": 0.5078562623229934, + "learning_rate": 4.980836621840967e-06, + "loss": 0.6195, + "step": 2447 + }, + { + "epoch": 0.9673499629538158, + "grad_norm": 0.4752546102671055, + "learning_rate": 4.980817255453734e-06, + "loss": 0.6118, + "step": 2448 + }, + { + "epoch": 0.967745122252408, + "grad_norm": 0.4833480233978842, + "learning_rate": 4.980797879323357e-06, + "loss": 0.6261, + "step": 2449 + }, + { + "epoch": 0.9681402815510003, + "grad_norm": 0.4771570217763964, + "learning_rate": 4.980778493449912e-06, + "loss": 0.6318, + "step": 2450 + }, + { + "epoch": 0.9685354408495925, + "grad_norm": 0.47469969618301255, + "learning_rate": 4.980759097833476e-06, + "loss": 0.6027, + "step": 2451 + }, + { + "epoch": 0.9689306001481848, + "grad_norm": 0.4781745301282861, + "learning_rate": 4.980739692474125e-06, + "loss": 0.6172, + "step": 2452 + }, + { + "epoch": 0.969325759446777, + "grad_norm": 0.48953552287496055, + "learning_rate": 4.980720277371934e-06, + "loss": 0.6059, + "step": 2453 + }, + { + "epoch": 0.9697209187453693, + "grad_norm": 0.4697257378860957, + "learning_rate": 4.980700852526981e-06, + "loss": 0.6108, + "step": 2454 + }, + { + "epoch": 0.9701160780439615, + "grad_norm": 0.5170911041975621, + "learning_rate": 4.980681417939341e-06, + "loss": 0.6353, + "step": 2455 + }, + { + "epoch": 0.9705112373425537, + "grad_norm": 0.4765311737964586, + "learning_rate": 4.980661973609091e-06, + "loss": 0.5911, + "step": 2456 + }, + { + "epoch": 0.970906396641146, + "grad_norm": 0.4571918246118576, + "learning_rate": 4.980642519536307e-06, + "loss": 0.6204, + "step": 2457 + }, + { + "epoch": 0.9713015559397382, + "grad_norm": 0.47402704361826625, + "learning_rate": 4.980623055721065e-06, + "loss": 0.6075, + "step": 2458 + }, + { + "epoch": 0.9716967152383305, + "grad_norm": 0.4823370913476268, + "learning_rate": 4.980603582163443e-06, + "loss": 0.612, + "step": 2459 + }, + { + "epoch": 0.9720918745369227, + "grad_norm": 0.4665199872651599, + "learning_rate": 4.980584098863516e-06, + "loss": 0.6255, + "step": 2460 + }, + { + "epoch": 0.9724870338355149, + "grad_norm": 0.45850568990455864, + "learning_rate": 4.980564605821361e-06, + "loss": 0.6003, + "step": 2461 + }, + { + "epoch": 0.9728821931341072, + "grad_norm": 0.4660323061963318, + "learning_rate": 4.980545103037054e-06, + "loss": 0.6153, + "step": 2462 + }, + { + "epoch": 0.9732773524326994, + "grad_norm": 0.47426339473199003, + "learning_rate": 4.9805255905106735e-06, + "loss": 0.6203, + "step": 2463 + }, + { + "epoch": 0.9736725117312917, + "grad_norm": 0.4648762466922138, + "learning_rate": 4.9805060682422925e-06, + "loss": 0.602, + "step": 2464 + }, + { + "epoch": 0.9740676710298839, + "grad_norm": 0.4774681753525283, + "learning_rate": 4.980486536231992e-06, + "loss": 0.6199, + "step": 2465 + }, + { + "epoch": 0.9744628303284761, + "grad_norm": 0.5025939174795709, + "learning_rate": 4.980466994479845e-06, + "loss": 0.6342, + "step": 2466 + }, + { + "epoch": 0.9748579896270684, + "grad_norm": 0.46520163938175313, + "learning_rate": 4.980447442985931e-06, + "loss": 0.6082, + "step": 2467 + }, + { + "epoch": 0.9752531489256606, + "grad_norm": 0.5093555297811326, + "learning_rate": 4.980427881750325e-06, + "loss": 0.6406, + "step": 2468 + }, + { + "epoch": 0.9756483082242529, + "grad_norm": 0.4675282128894731, + "learning_rate": 4.980408310773105e-06, + "loss": 0.6013, + "step": 2469 + }, + { + "epoch": 0.9760434675228451, + "grad_norm": 0.4600464903665783, + "learning_rate": 4.980388730054347e-06, + "loss": 0.6021, + "step": 2470 + }, + { + "epoch": 0.9764386268214373, + "grad_norm": 0.4781471662113073, + "learning_rate": 4.980369139594128e-06, + "loss": 0.6302, + "step": 2471 + }, + { + "epoch": 0.9768337861200297, + "grad_norm": 0.569664962295512, + "learning_rate": 4.980349539392526e-06, + "loss": 0.6076, + "step": 2472 + }, + { + "epoch": 0.9772289454186219, + "grad_norm": 0.474728795793001, + "learning_rate": 4.980329929449616e-06, + "loss": 0.5973, + "step": 2473 + }, + { + "epoch": 0.9776241047172142, + "grad_norm": 0.466971617619074, + "learning_rate": 4.980310309765477e-06, + "loss": 0.5881, + "step": 2474 + }, + { + "epoch": 0.9780192640158064, + "grad_norm": 0.49734334610344294, + "learning_rate": 4.980290680340185e-06, + "loss": 0.6377, + "step": 2475 + }, + { + "epoch": 0.9784144233143987, + "grad_norm": 0.4660369065720144, + "learning_rate": 4.980271041173818e-06, + "loss": 0.5939, + "step": 2476 + }, + { + "epoch": 0.9788095826129909, + "grad_norm": 0.4541019720752234, + "learning_rate": 4.9802513922664506e-06, + "loss": 0.6013, + "step": 2477 + }, + { + "epoch": 0.9792047419115831, + "grad_norm": 0.46864260551647896, + "learning_rate": 4.980231733618164e-06, + "loss": 0.5934, + "step": 2478 + }, + { + "epoch": 0.9795999012101754, + "grad_norm": 0.49812016619203664, + "learning_rate": 4.980212065229032e-06, + "loss": 0.6074, + "step": 2479 + }, + { + "epoch": 0.9799950605087676, + "grad_norm": 0.47269225015752886, + "learning_rate": 4.980192387099133e-06, + "loss": 0.6075, + "step": 2480 + }, + { + "epoch": 0.9803902198073599, + "grad_norm": 0.46117462938553433, + "learning_rate": 4.980172699228545e-06, + "loss": 0.5987, + "step": 2481 + }, + { + "epoch": 0.9807853791059521, + "grad_norm": 0.47234556720051585, + "learning_rate": 4.980153001617344e-06, + "loss": 0.6008, + "step": 2482 + }, + { + "epoch": 0.9811805384045443, + "grad_norm": 0.46430799343376644, + "learning_rate": 4.980133294265608e-06, + "loss": 0.6126, + "step": 2483 + }, + { + "epoch": 0.9815756977031366, + "grad_norm": 0.48231158605504654, + "learning_rate": 4.980113577173415e-06, + "loss": 0.6166, + "step": 2484 + }, + { + "epoch": 0.9819708570017288, + "grad_norm": 0.4544084646577223, + "learning_rate": 4.980093850340842e-06, + "loss": 0.5911, + "step": 2485 + }, + { + "epoch": 0.982366016300321, + "grad_norm": 0.49807318396062805, + "learning_rate": 4.980074113767966e-06, + "loss": 0.596, + "step": 2486 + }, + { + "epoch": 0.9827611755989133, + "grad_norm": 0.47544603743432695, + "learning_rate": 4.980054367454865e-06, + "loss": 0.6095, + "step": 2487 + }, + { + "epoch": 0.9831563348975055, + "grad_norm": 0.4659987694152846, + "learning_rate": 4.9800346114016165e-06, + "loss": 0.606, + "step": 2488 + }, + { + "epoch": 0.9835514941960978, + "grad_norm": 0.47287386440876494, + "learning_rate": 4.980014845608298e-06, + "loss": 0.6352, + "step": 2489 + }, + { + "epoch": 0.98394665349469, + "grad_norm": 0.4864128975388848, + "learning_rate": 4.979995070074987e-06, + "loss": 0.6278, + "step": 2490 + }, + { + "epoch": 0.9843418127932823, + "grad_norm": 0.4544891691408379, + "learning_rate": 4.979975284801761e-06, + "loss": 0.608, + "step": 2491 + }, + { + "epoch": 0.9847369720918745, + "grad_norm": 0.4795256509587486, + "learning_rate": 4.979955489788698e-06, + "loss": 0.6273, + "step": 2492 + }, + { + "epoch": 0.9851321313904667, + "grad_norm": 0.4766741019260381, + "learning_rate": 4.979935685035876e-06, + "loss": 0.608, + "step": 2493 + }, + { + "epoch": 0.985527290689059, + "grad_norm": 0.4737813917288424, + "learning_rate": 4.979915870543372e-06, + "loss": 0.6259, + "step": 2494 + }, + { + "epoch": 0.9859224499876512, + "grad_norm": 0.4923664866297512, + "learning_rate": 4.979896046311266e-06, + "loss": 0.5989, + "step": 2495 + }, + { + "epoch": 0.9863176092862436, + "grad_norm": 0.49326185549267143, + "learning_rate": 4.979876212339632e-06, + "loss": 0.6077, + "step": 2496 + }, + { + "epoch": 0.9867127685848358, + "grad_norm": 0.4660497012513692, + "learning_rate": 4.9798563686285515e-06, + "loss": 0.6168, + "step": 2497 + }, + { + "epoch": 0.987107927883428, + "grad_norm": 0.4800478363437438, + "learning_rate": 4.979836515178101e-06, + "loss": 0.6211, + "step": 2498 + }, + { + "epoch": 0.9875030871820203, + "grad_norm": 0.47487319014526613, + "learning_rate": 4.979816651988358e-06, + "loss": 0.6136, + "step": 2499 + }, + { + "epoch": 0.9878982464806125, + "grad_norm": 0.47991505886589036, + "learning_rate": 4.979796779059401e-06, + "loss": 0.6081, + "step": 2500 + }, + { + "epoch": 0.9882934057792048, + "grad_norm": 0.472571865692289, + "learning_rate": 4.979776896391308e-06, + "loss": 0.6208, + "step": 2501 + }, + { + "epoch": 0.988688565077797, + "grad_norm": 0.5058758260925268, + "learning_rate": 4.979757003984158e-06, + "loss": 0.6318, + "step": 2502 + }, + { + "epoch": 0.9890837243763893, + "grad_norm": 0.45916735017432975, + "learning_rate": 4.979737101838028e-06, + "loss": 0.5949, + "step": 2503 + }, + { + "epoch": 0.9894788836749815, + "grad_norm": 0.46426645842628067, + "learning_rate": 4.979717189952996e-06, + "loss": 0.6169, + "step": 2504 + }, + { + "epoch": 0.9898740429735737, + "grad_norm": 0.46140290638450737, + "learning_rate": 4.9796972683291415e-06, + "loss": 0.6021, + "step": 2505 + }, + { + "epoch": 0.990269202272166, + "grad_norm": 0.4503137341547014, + "learning_rate": 4.979677336966541e-06, + "loss": 0.5937, + "step": 2506 + }, + { + "epoch": 0.9906643615707582, + "grad_norm": 0.4734628943802484, + "learning_rate": 4.9796573958652735e-06, + "loss": 0.6208, + "step": 2507 + }, + { + "epoch": 0.9910595208693505, + "grad_norm": 0.4665327413909791, + "learning_rate": 4.979637445025418e-06, + "loss": 0.5911, + "step": 2508 + }, + { + "epoch": 0.9914546801679427, + "grad_norm": 0.5283523686632157, + "learning_rate": 4.979617484447052e-06, + "loss": 0.6054, + "step": 2509 + }, + { + "epoch": 0.9918498394665349, + "grad_norm": 0.4671594121597241, + "learning_rate": 4.979597514130254e-06, + "loss": 0.6187, + "step": 2510 + }, + { + "epoch": 0.9922449987651272, + "grad_norm": 0.4893067227078538, + "learning_rate": 4.979577534075103e-06, + "loss": 0.6339, + "step": 2511 + }, + { + "epoch": 0.9926401580637194, + "grad_norm": 0.47208844535109873, + "learning_rate": 4.979557544281677e-06, + "loss": 0.6087, + "step": 2512 + }, + { + "epoch": 0.9930353173623117, + "grad_norm": 0.48119092703509836, + "learning_rate": 4.979537544750055e-06, + "loss": 0.6201, + "step": 2513 + }, + { + "epoch": 0.9934304766609039, + "grad_norm": 0.477557865345246, + "learning_rate": 4.979517535480315e-06, + "loss": 0.6143, + "step": 2514 + }, + { + "epoch": 0.9938256359594961, + "grad_norm": 0.4585124296524369, + "learning_rate": 4.979497516472535e-06, + "loss": 0.5979, + "step": 2515 + }, + { + "epoch": 0.9942207952580884, + "grad_norm": 0.47684697114164715, + "learning_rate": 4.979477487726796e-06, + "loss": 0.6031, + "step": 2516 + }, + { + "epoch": 0.9946159545566806, + "grad_norm": 0.46484256613871744, + "learning_rate": 4.979457449243174e-06, + "loss": 0.5965, + "step": 2517 + }, + { + "epoch": 0.9950111138552729, + "grad_norm": 0.46418868296419014, + "learning_rate": 4.97943740102175e-06, + "loss": 0.6187, + "step": 2518 + }, + { + "epoch": 0.9954062731538652, + "grad_norm": 0.4712799089967701, + "learning_rate": 4.9794173430626e-06, + "loss": 0.6179, + "step": 2519 + }, + { + "epoch": 0.9958014324524574, + "grad_norm": 0.4898574169755568, + "learning_rate": 4.979397275365804e-06, + "loss": 0.6182, + "step": 2520 + }, + { + "epoch": 0.9961965917510497, + "grad_norm": 0.4721726580593012, + "learning_rate": 4.979377197931442e-06, + "loss": 0.6141, + "step": 2521 + }, + { + "epoch": 0.9965917510496419, + "grad_norm": 0.4696253306480775, + "learning_rate": 4.979357110759592e-06, + "loss": 0.6024, + "step": 2522 + }, + { + "epoch": 0.9969869103482342, + "grad_norm": 0.4695198005530173, + "learning_rate": 4.979337013850332e-06, + "loss": 0.6183, + "step": 2523 + }, + { + "epoch": 0.9973820696468264, + "grad_norm": 0.46544585435480235, + "learning_rate": 4.979316907203743e-06, + "loss": 0.5947, + "step": 2524 + }, + { + "epoch": 0.9977772289454186, + "grad_norm": 0.4673922524133896, + "learning_rate": 4.979296790819901e-06, + "loss": 0.6276, + "step": 2525 + }, + { + "epoch": 0.9981723882440109, + "grad_norm": 0.4784936628964307, + "learning_rate": 4.979276664698888e-06, + "loss": 0.6163, + "step": 2526 + }, + { + "epoch": 0.9985675475426031, + "grad_norm": 0.46022244379627936, + "learning_rate": 4.979256528840782e-06, + "loss": 0.6152, + "step": 2527 + }, + { + "epoch": 0.9989627068411954, + "grad_norm": 0.46842270777733813, + "learning_rate": 4.979236383245661e-06, + "loss": 0.611, + "step": 2528 + }, + { + "epoch": 0.9993578661397876, + "grad_norm": 0.45419251266565824, + "learning_rate": 4.979216227913605e-06, + "loss": 0.601, + "step": 2529 + }, + { + "epoch": 0.9997530254383798, + "grad_norm": 0.4731512690482939, + "learning_rate": 4.9791960628446935e-06, + "loss": 0.6229, + "step": 2530 + }, + { + "epoch": 1.0003951592985922, + "grad_norm": 0.9039832839745028, + "learning_rate": 4.979175888039005e-06, + "loss": 0.5558, + "step": 2531 + }, + { + "epoch": 1.0007903185971845, + "grad_norm": 2.641950999929186, + "learning_rate": 4.979155703496619e-06, + "loss": 0.5752, + "step": 2532 + }, + { + "epoch": 1.0011854778957767, + "grad_norm": 1.076378753297078, + "learning_rate": 4.979135509217615e-06, + "loss": 0.5912, + "step": 2533 + }, + { + "epoch": 1.001580637194369, + "grad_norm": 0.8858294098499361, + "learning_rate": 4.979115305202073e-06, + "loss": 0.5723, + "step": 2534 + }, + { + "epoch": 1.0019757964929612, + "grad_norm": 0.9699126038857411, + "learning_rate": 4.9790950914500705e-06, + "loss": 0.5786, + "step": 2535 + }, + { + "epoch": 1.0023709557915534, + "grad_norm": 0.6877025575827704, + "learning_rate": 4.979074867961687e-06, + "loss": 0.5676, + "step": 2536 + }, + { + "epoch": 1.0027661150901457, + "grad_norm": 0.6507721272783759, + "learning_rate": 4.979054634737004e-06, + "loss": 0.5813, + "step": 2537 + }, + { + "epoch": 1.003161274388738, + "grad_norm": 0.7752079766225948, + "learning_rate": 4.9790343917761e-06, + "loss": 0.5741, + "step": 2538 + }, + { + "epoch": 1.0035564336873302, + "grad_norm": 0.728864402953281, + "learning_rate": 4.979014139079053e-06, + "loss": 0.5594, + "step": 2539 + }, + { + "epoch": 1.0039515929859224, + "grad_norm": 0.5774709293399436, + "learning_rate": 4.978993876645945e-06, + "loss": 0.5635, + "step": 2540 + }, + { + "epoch": 1.0043467522845146, + "grad_norm": 0.5686297060429989, + "learning_rate": 4.978973604476855e-06, + "loss": 0.5701, + "step": 2541 + }, + { + "epoch": 1.0047419115831069, + "grad_norm": 0.5887544543582653, + "learning_rate": 4.97895332257186e-06, + "loss": 0.5632, + "step": 2542 + }, + { + "epoch": 1.0051370708816991, + "grad_norm": 0.6127637272155544, + "learning_rate": 4.978933030931042e-06, + "loss": 0.5594, + "step": 2543 + }, + { + "epoch": 1.0055322301802914, + "grad_norm": 0.5483995865865325, + "learning_rate": 4.978912729554481e-06, + "loss": 0.5557, + "step": 2544 + }, + { + "epoch": 1.0059273894788836, + "grad_norm": 0.5341010434760727, + "learning_rate": 4.978892418442256e-06, + "loss": 0.5553, + "step": 2545 + }, + { + "epoch": 1.0063225487774758, + "grad_norm": 0.5396189198412058, + "learning_rate": 4.9788720975944475e-06, + "loss": 0.572, + "step": 2546 + }, + { + "epoch": 1.006717708076068, + "grad_norm": 0.5603122829844251, + "learning_rate": 4.978851767011134e-06, + "loss": 0.554, + "step": 2547 + }, + { + "epoch": 1.0071128673746603, + "grad_norm": 0.5973006671495175, + "learning_rate": 4.978831426692397e-06, + "loss": 0.573, + "step": 2548 + }, + { + "epoch": 1.0075080266732526, + "grad_norm": 0.5537555552956865, + "learning_rate": 4.9788110766383135e-06, + "loss": 0.5725, + "step": 2549 + }, + { + "epoch": 1.0079031859718448, + "grad_norm": 0.506831024700252, + "learning_rate": 4.978790716848966e-06, + "loss": 0.5605, + "step": 2550 + }, + { + "epoch": 1.008298345270437, + "grad_norm": 0.5146187226178434, + "learning_rate": 4.9787703473244346e-06, + "loss": 0.5725, + "step": 2551 + }, + { + "epoch": 1.0086935045690293, + "grad_norm": 0.584671063878508, + "learning_rate": 4.9787499680647975e-06, + "loss": 0.5535, + "step": 2552 + }, + { + "epoch": 1.0090886638676215, + "grad_norm": 0.517272250939791, + "learning_rate": 4.978729579070136e-06, + "loss": 0.5694, + "step": 2553 + }, + { + "epoch": 1.009483823166214, + "grad_norm": 0.5197158410846429, + "learning_rate": 4.978709180340529e-06, + "loss": 0.5703, + "step": 2554 + }, + { + "epoch": 1.0098789824648062, + "grad_norm": 0.5316647592420108, + "learning_rate": 4.978688771876059e-06, + "loss": 0.5614, + "step": 2555 + }, + { + "epoch": 1.0102741417633985, + "grad_norm": 0.47946328549941336, + "learning_rate": 4.978668353676804e-06, + "loss": 0.574, + "step": 2556 + }, + { + "epoch": 1.0106693010619907, + "grad_norm": 0.4799139075965923, + "learning_rate": 4.9786479257428455e-06, + "loss": 0.5638, + "step": 2557 + }, + { + "epoch": 1.011064460360583, + "grad_norm": 0.5034447217830896, + "learning_rate": 4.978627488074263e-06, + "loss": 0.576, + "step": 2558 + }, + { + "epoch": 1.0114596196591752, + "grad_norm": 0.5176884626827072, + "learning_rate": 4.978607040671136e-06, + "loss": 0.5636, + "step": 2559 + }, + { + "epoch": 1.0118547789577674, + "grad_norm": 0.5148490669814971, + "learning_rate": 4.978586583533545e-06, + "loss": 0.5838, + "step": 2560 + }, + { + "epoch": 1.0122499382563597, + "grad_norm": 0.4542757059816822, + "learning_rate": 4.978566116661573e-06, + "loss": 0.5532, + "step": 2561 + }, + { + "epoch": 1.012645097554952, + "grad_norm": 0.4811513221663893, + "learning_rate": 4.978545640055297e-06, + "loss": 0.5658, + "step": 2562 + }, + { + "epoch": 1.0130402568535442, + "grad_norm": 0.500097716390493, + "learning_rate": 4.978525153714799e-06, + "loss": 0.5744, + "step": 2563 + }, + { + "epoch": 1.0134354161521364, + "grad_norm": 0.5142140734871467, + "learning_rate": 4.978504657640159e-06, + "loss": 0.5538, + "step": 2564 + }, + { + "epoch": 1.0138305754507286, + "grad_norm": 0.47546571996143905, + "learning_rate": 4.978484151831458e-06, + "loss": 0.5587, + "step": 2565 + }, + { + "epoch": 1.0142257347493209, + "grad_norm": 0.472451374021239, + "learning_rate": 4.978463636288776e-06, + "loss": 0.5881, + "step": 2566 + }, + { + "epoch": 1.0146208940479131, + "grad_norm": 0.4541293970758113, + "learning_rate": 4.978443111012195e-06, + "loss": 0.5572, + "step": 2567 + }, + { + "epoch": 1.0150160533465054, + "grad_norm": 0.47790081006874097, + "learning_rate": 4.978422576001793e-06, + "loss": 0.552, + "step": 2568 + }, + { + "epoch": 1.0154112126450976, + "grad_norm": 0.4983815727839767, + "learning_rate": 4.978402031257653e-06, + "loss": 0.5655, + "step": 2569 + }, + { + "epoch": 1.0158063719436898, + "grad_norm": 0.484727652450049, + "learning_rate": 4.9783814767798545e-06, + "loss": 0.5726, + "step": 2570 + }, + { + "epoch": 1.016201531242282, + "grad_norm": 0.4852767066955579, + "learning_rate": 4.978360912568479e-06, + "loss": 0.5537, + "step": 2571 + }, + { + "epoch": 1.0165966905408743, + "grad_norm": 0.4485626151233385, + "learning_rate": 4.978340338623606e-06, + "loss": 0.5555, + "step": 2572 + }, + { + "epoch": 1.0169918498394666, + "grad_norm": 0.5060512578748023, + "learning_rate": 4.9783197549453164e-06, + "loss": 0.5898, + "step": 2573 + }, + { + "epoch": 1.0173870091380588, + "grad_norm": 0.4646178927626715, + "learning_rate": 4.978299161533693e-06, + "loss": 0.5744, + "step": 2574 + }, + { + "epoch": 1.017782168436651, + "grad_norm": 0.461846845018728, + "learning_rate": 4.978278558388815e-06, + "loss": 0.5654, + "step": 2575 + }, + { + "epoch": 1.0181773277352433, + "grad_norm": 0.6011786743216133, + "learning_rate": 4.978257945510764e-06, + "loss": 0.5593, + "step": 2576 + }, + { + "epoch": 1.0185724870338355, + "grad_norm": 0.523836605126601, + "learning_rate": 4.978237322899621e-06, + "loss": 0.572, + "step": 2577 + }, + { + "epoch": 1.0189676463324278, + "grad_norm": 0.4752809951230506, + "learning_rate": 4.978216690555465e-06, + "loss": 0.583, + "step": 2578 + }, + { + "epoch": 1.01936280563102, + "grad_norm": 0.45433798164574274, + "learning_rate": 4.978196048478381e-06, + "loss": 0.568, + "step": 2579 + }, + { + "epoch": 1.0197579649296122, + "grad_norm": 0.46744153312914954, + "learning_rate": 4.9781753966684455e-06, + "loss": 0.5526, + "step": 2580 + }, + { + "epoch": 1.0201531242282045, + "grad_norm": 0.47136324904331967, + "learning_rate": 4.978154735125743e-06, + "loss": 0.5734, + "step": 2581 + }, + { + "epoch": 1.0205482835267967, + "grad_norm": 0.476581814748425, + "learning_rate": 4.9781340638503536e-06, + "loss": 0.596, + "step": 2582 + }, + { + "epoch": 1.020943442825389, + "grad_norm": 0.4541300104338508, + "learning_rate": 4.9781133828423585e-06, + "loss": 0.5553, + "step": 2583 + }, + { + "epoch": 1.0213386021239812, + "grad_norm": 0.45621750533610406, + "learning_rate": 4.978092692101838e-06, + "loss": 0.5561, + "step": 2584 + }, + { + "epoch": 1.0217337614225734, + "grad_norm": 0.47098693586154, + "learning_rate": 4.978071991628875e-06, + "loss": 0.5524, + "step": 2585 + }, + { + "epoch": 1.0221289207211657, + "grad_norm": 0.4259588250799944, + "learning_rate": 4.97805128142355e-06, + "loss": 0.5235, + "step": 2586 + }, + { + "epoch": 1.022524080019758, + "grad_norm": 0.4686581079113878, + "learning_rate": 4.9780305614859435e-06, + "loss": 0.5574, + "step": 2587 + }, + { + "epoch": 1.0229192393183502, + "grad_norm": 0.46911512358386387, + "learning_rate": 4.9780098318161385e-06, + "loss": 0.5777, + "step": 2588 + }, + { + "epoch": 1.0233143986169424, + "grad_norm": 0.4642643584963139, + "learning_rate": 4.9779890924142155e-06, + "loss": 0.5726, + "step": 2589 + }, + { + "epoch": 1.0237095579155346, + "grad_norm": 0.4712873655824191, + "learning_rate": 4.977968343280256e-06, + "loss": 0.5823, + "step": 2590 + }, + { + "epoch": 1.0241047172141269, + "grad_norm": 0.4482243472573749, + "learning_rate": 4.977947584414341e-06, + "loss": 0.564, + "step": 2591 + }, + { + "epoch": 1.0244998765127191, + "grad_norm": 0.4530647693811047, + "learning_rate": 4.977926815816553e-06, + "loss": 0.5809, + "step": 2592 + }, + { + "epoch": 1.0248950358113114, + "grad_norm": 0.4479882772757105, + "learning_rate": 4.977906037486974e-06, + "loss": 0.5616, + "step": 2593 + }, + { + "epoch": 1.0252901951099036, + "grad_norm": 0.4610471046625982, + "learning_rate": 4.977885249425684e-06, + "loss": 0.5472, + "step": 2594 + }, + { + "epoch": 1.0256853544084958, + "grad_norm": 0.47862490507477046, + "learning_rate": 4.977864451632764e-06, + "loss": 0.5902, + "step": 2595 + }, + { + "epoch": 1.026080513707088, + "grad_norm": 0.5047546461386095, + "learning_rate": 4.977843644108299e-06, + "loss": 0.5884, + "step": 2596 + }, + { + "epoch": 1.0264756730056803, + "grad_norm": 0.4451423123413334, + "learning_rate": 4.977822826852369e-06, + "loss": 0.5642, + "step": 2597 + }, + { + "epoch": 1.0268708323042726, + "grad_norm": 0.4560284371925842, + "learning_rate": 4.977801999865054e-06, + "loss": 0.5606, + "step": 2598 + }, + { + "epoch": 1.0272659916028648, + "grad_norm": 0.4461067403640707, + "learning_rate": 4.977781163146438e-06, + "loss": 0.5649, + "step": 2599 + }, + { + "epoch": 1.027661150901457, + "grad_norm": 0.451597159891192, + "learning_rate": 4.977760316696603e-06, + "loss": 0.5634, + "step": 2600 + }, + { + "epoch": 1.0280563102000495, + "grad_norm": 0.4519191659487248, + "learning_rate": 4.977739460515629e-06, + "loss": 0.5796, + "step": 2601 + }, + { + "epoch": 1.0284514694986417, + "grad_norm": 0.4596070611618375, + "learning_rate": 4.977718594603599e-06, + "loss": 0.5764, + "step": 2602 + }, + { + "epoch": 1.028846628797234, + "grad_norm": 0.440619261667536, + "learning_rate": 4.977697718960595e-06, + "loss": 0.557, + "step": 2603 + }, + { + "epoch": 1.0292417880958262, + "grad_norm": 0.4585810235456444, + "learning_rate": 4.977676833586699e-06, + "loss": 0.5618, + "step": 2604 + }, + { + "epoch": 1.0296369473944185, + "grad_norm": 0.489223887277486, + "learning_rate": 4.977655938481994e-06, + "loss": 0.6054, + "step": 2605 + }, + { + "epoch": 1.0300321066930107, + "grad_norm": 0.4590644227459909, + "learning_rate": 4.97763503364656e-06, + "loss": 0.5575, + "step": 2606 + }, + { + "epoch": 1.030427265991603, + "grad_norm": 0.4676586979127894, + "learning_rate": 4.97761411908048e-06, + "loss": 0.5715, + "step": 2607 + }, + { + "epoch": 1.0308224252901952, + "grad_norm": 0.44950227149153915, + "learning_rate": 4.977593194783836e-06, + "loss": 0.5473, + "step": 2608 + }, + { + "epoch": 1.0312175845887874, + "grad_norm": 0.44607152531853783, + "learning_rate": 4.977572260756711e-06, + "loss": 0.5318, + "step": 2609 + }, + { + "epoch": 1.0316127438873797, + "grad_norm": 0.45941977696779834, + "learning_rate": 4.977551316999186e-06, + "loss": 0.5453, + "step": 2610 + }, + { + "epoch": 1.032007903185972, + "grad_norm": 0.4624807401390588, + "learning_rate": 4.977530363511344e-06, + "loss": 0.5721, + "step": 2611 + }, + { + "epoch": 1.0324030624845641, + "grad_norm": 0.4486414918582425, + "learning_rate": 4.977509400293268e-06, + "loss": 0.5634, + "step": 2612 + }, + { + "epoch": 1.0327982217831564, + "grad_norm": 0.460795352195087, + "learning_rate": 4.977488427345039e-06, + "loss": 0.5663, + "step": 2613 + }, + { + "epoch": 1.0331933810817486, + "grad_norm": 0.47567928790908864, + "learning_rate": 4.977467444666739e-06, + "loss": 0.586, + "step": 2614 + }, + { + "epoch": 1.0335885403803409, + "grad_norm": 0.4486438855377463, + "learning_rate": 4.977446452258452e-06, + "loss": 0.5904, + "step": 2615 + }, + { + "epoch": 1.033983699678933, + "grad_norm": 0.4415373719504563, + "learning_rate": 4.97742545012026e-06, + "loss": 0.5523, + "step": 2616 + }, + { + "epoch": 1.0343788589775254, + "grad_norm": 0.46461856034519633, + "learning_rate": 4.977404438252245e-06, + "loss": 0.556, + "step": 2617 + }, + { + "epoch": 1.0347740182761176, + "grad_norm": 0.45898412914751924, + "learning_rate": 4.97738341665449e-06, + "loss": 0.5868, + "step": 2618 + }, + { + "epoch": 1.0351691775747098, + "grad_norm": 0.44026432413454175, + "learning_rate": 4.977362385327077e-06, + "loss": 0.5399, + "step": 2619 + }, + { + "epoch": 1.035564336873302, + "grad_norm": 0.4536936643831469, + "learning_rate": 4.977341344270088e-06, + "loss": 0.5572, + "step": 2620 + }, + { + "epoch": 1.0359594961718943, + "grad_norm": 0.45300210621077247, + "learning_rate": 4.977320293483608e-06, + "loss": 0.5581, + "step": 2621 + }, + { + "epoch": 1.0363546554704866, + "grad_norm": 0.4768261944797516, + "learning_rate": 4.977299232967717e-06, + "loss": 0.5567, + "step": 2622 + }, + { + "epoch": 1.0367498147690788, + "grad_norm": 0.4490373651630668, + "learning_rate": 4.9772781627225e-06, + "loss": 0.5613, + "step": 2623 + }, + { + "epoch": 1.037144974067671, + "grad_norm": 0.4542757319001758, + "learning_rate": 4.977257082748038e-06, + "loss": 0.5598, + "step": 2624 + }, + { + "epoch": 1.0375401333662633, + "grad_norm": 0.4696319591373662, + "learning_rate": 4.977235993044415e-06, + "loss": 0.574, + "step": 2625 + }, + { + "epoch": 1.0379352926648555, + "grad_norm": 0.4544516411565146, + "learning_rate": 4.977214893611713e-06, + "loss": 0.5647, + "step": 2626 + }, + { + "epoch": 1.0383304519634478, + "grad_norm": 0.4636968131984274, + "learning_rate": 4.977193784450015e-06, + "loss": 0.5776, + "step": 2627 + }, + { + "epoch": 1.03872561126204, + "grad_norm": 0.4549508464045885, + "learning_rate": 4.977172665559403e-06, + "loss": 0.5721, + "step": 2628 + }, + { + "epoch": 1.0391207705606322, + "grad_norm": 0.4432377804345881, + "learning_rate": 4.9771515369399625e-06, + "loss": 0.5624, + "step": 2629 + }, + { + "epoch": 1.0395159298592245, + "grad_norm": 0.4512832309113786, + "learning_rate": 4.977130398591775e-06, + "loss": 0.5793, + "step": 2630 + }, + { + "epoch": 1.0399110891578167, + "grad_norm": 0.44957500296377784, + "learning_rate": 4.977109250514923e-06, + "loss": 0.5649, + "step": 2631 + }, + { + "epoch": 1.040306248456409, + "grad_norm": 0.45057963341150464, + "learning_rate": 4.97708809270949e-06, + "loss": 0.5607, + "step": 2632 + }, + { + "epoch": 1.0407014077550012, + "grad_norm": 0.4542174831876239, + "learning_rate": 4.977066925175559e-06, + "loss": 0.5539, + "step": 2633 + }, + { + "epoch": 1.0410965670535934, + "grad_norm": 0.46807162702065624, + "learning_rate": 4.977045747913213e-06, + "loss": 0.563, + "step": 2634 + }, + { + "epoch": 1.0414917263521857, + "grad_norm": 0.457076849040818, + "learning_rate": 4.977024560922537e-06, + "loss": 0.561, + "step": 2635 + }, + { + "epoch": 1.041886885650778, + "grad_norm": 0.45562089282056256, + "learning_rate": 4.9770033642036105e-06, + "loss": 0.5779, + "step": 2636 + }, + { + "epoch": 1.0422820449493702, + "grad_norm": 0.4525510725985377, + "learning_rate": 4.97698215775652e-06, + "loss": 0.6033, + "step": 2637 + }, + { + "epoch": 1.0426772042479624, + "grad_norm": 0.5400191406643862, + "learning_rate": 4.976960941581348e-06, + "loss": 0.5704, + "step": 2638 + }, + { + "epoch": 1.0430723635465546, + "grad_norm": 0.44608237054831834, + "learning_rate": 4.976939715678178e-06, + "loss": 0.5611, + "step": 2639 + }, + { + "epoch": 1.0434675228451469, + "grad_norm": 0.5052264389295701, + "learning_rate": 4.9769184800470915e-06, + "loss": 0.5755, + "step": 2640 + }, + { + "epoch": 1.0438626821437391, + "grad_norm": 0.44348147340006067, + "learning_rate": 4.976897234688174e-06, + "loss": 0.5598, + "step": 2641 + }, + { + "epoch": 1.0442578414423314, + "grad_norm": 0.451998380744831, + "learning_rate": 4.976875979601508e-06, + "loss": 0.5866, + "step": 2642 + }, + { + "epoch": 1.0446530007409236, + "grad_norm": 0.4450349262521982, + "learning_rate": 4.976854714787177e-06, + "loss": 0.5612, + "step": 2643 + }, + { + "epoch": 1.0450481600395158, + "grad_norm": 0.4554004140171642, + "learning_rate": 4.976833440245265e-06, + "loss": 0.5752, + "step": 2644 + }, + { + "epoch": 1.045443319338108, + "grad_norm": 0.4717355641847888, + "learning_rate": 4.976812155975855e-06, + "loss": 0.5696, + "step": 2645 + }, + { + "epoch": 1.0458384786367003, + "grad_norm": 0.451389235554898, + "learning_rate": 4.976790861979031e-06, + "loss": 0.5896, + "step": 2646 + }, + { + "epoch": 1.0462336379352926, + "grad_norm": 0.4587582612615821, + "learning_rate": 4.976769558254877e-06, + "loss": 0.5905, + "step": 2647 + }, + { + "epoch": 1.0466287972338848, + "grad_norm": 0.4587087573396345, + "learning_rate": 4.976748244803475e-06, + "loss": 0.5823, + "step": 2648 + }, + { + "epoch": 1.0470239565324773, + "grad_norm": 0.4477959820273805, + "learning_rate": 4.976726921624911e-06, + "loss": 0.5657, + "step": 2649 + }, + { + "epoch": 1.0474191158310695, + "grad_norm": 0.43767555375829137, + "learning_rate": 4.976705588719267e-06, + "loss": 0.5649, + "step": 2650 + }, + { + "epoch": 1.0478142751296617, + "grad_norm": 0.453863897792215, + "learning_rate": 4.976684246086627e-06, + "loss": 0.563, + "step": 2651 + }, + { + "epoch": 1.048209434428254, + "grad_norm": 0.44586538451216967, + "learning_rate": 4.976662893727076e-06, + "loss": 0.5541, + "step": 2652 + }, + { + "epoch": 1.0486045937268462, + "grad_norm": 0.46349135729431057, + "learning_rate": 4.9766415316406965e-06, + "loss": 0.5621, + "step": 2653 + }, + { + "epoch": 1.0489997530254385, + "grad_norm": 0.4485724204935941, + "learning_rate": 4.976620159827574e-06, + "loss": 0.5465, + "step": 2654 + }, + { + "epoch": 1.0493949123240307, + "grad_norm": 0.4388668643967193, + "learning_rate": 4.97659877828779e-06, + "loss": 0.5592, + "step": 2655 + }, + { + "epoch": 1.049790071622623, + "grad_norm": 0.46226835952230305, + "learning_rate": 4.976577387021431e-06, + "loss": 0.5814, + "step": 2656 + }, + { + "epoch": 1.0501852309212152, + "grad_norm": 0.4551414886420647, + "learning_rate": 4.97655598602858e-06, + "loss": 0.5489, + "step": 2657 + }, + { + "epoch": 1.0505803902198074, + "grad_norm": 0.4503034545836757, + "learning_rate": 4.97653457530932e-06, + "loss": 0.5847, + "step": 2658 + }, + { + "epoch": 1.0509755495183997, + "grad_norm": 0.4510190880823167, + "learning_rate": 4.976513154863735e-06, + "loss": 0.5638, + "step": 2659 + }, + { + "epoch": 1.051370708816992, + "grad_norm": 0.45601750337175206, + "learning_rate": 4.976491724691912e-06, + "loss": 0.5629, + "step": 2660 + }, + { + "epoch": 1.0517658681155841, + "grad_norm": 0.44161293110782124, + "learning_rate": 4.976470284793933e-06, + "loss": 0.5638, + "step": 2661 + }, + { + "epoch": 1.0521610274141764, + "grad_norm": 0.4971279919030968, + "learning_rate": 4.976448835169882e-06, + "loss": 0.5602, + "step": 2662 + }, + { + "epoch": 1.0525561867127686, + "grad_norm": 0.46201452125116643, + "learning_rate": 4.976427375819844e-06, + "loss": 0.5645, + "step": 2663 + }, + { + "epoch": 1.0529513460113609, + "grad_norm": 0.45839768743558873, + "learning_rate": 4.9764059067439045e-06, + "loss": 0.5729, + "step": 2664 + }, + { + "epoch": 1.053346505309953, + "grad_norm": 0.45571009890945263, + "learning_rate": 4.9763844279421444e-06, + "loss": 0.5669, + "step": 2665 + }, + { + "epoch": 1.0537416646085453, + "grad_norm": 0.4404670408454727, + "learning_rate": 4.97636293941465e-06, + "loss": 0.5537, + "step": 2666 + }, + { + "epoch": 1.0541368239071376, + "grad_norm": 0.4618688908161834, + "learning_rate": 4.976341441161507e-06, + "loss": 0.5666, + "step": 2667 + }, + { + "epoch": 1.0545319832057298, + "grad_norm": 0.4678051099127983, + "learning_rate": 4.976319933182797e-06, + "loss": 0.5852, + "step": 2668 + }, + { + "epoch": 1.054927142504322, + "grad_norm": 0.45030359679512383, + "learning_rate": 4.9762984154786075e-06, + "loss": 0.5749, + "step": 2669 + }, + { + "epoch": 1.0553223018029143, + "grad_norm": 0.4553020196001742, + "learning_rate": 4.9762768880490205e-06, + "loss": 0.5595, + "step": 2670 + }, + { + "epoch": 1.0557174611015065, + "grad_norm": 0.45256331743406897, + "learning_rate": 4.976255350894122e-06, + "loss": 0.5635, + "step": 2671 + }, + { + "epoch": 1.0561126204000988, + "grad_norm": 0.44922321136235305, + "learning_rate": 4.9762338040139965e-06, + "loss": 0.5488, + "step": 2672 + }, + { + "epoch": 1.056507779698691, + "grad_norm": 0.45045666513262506, + "learning_rate": 4.976212247408727e-06, + "loss": 0.5595, + "step": 2673 + }, + { + "epoch": 1.0569029389972833, + "grad_norm": 0.45788018345436676, + "learning_rate": 4.976190681078401e-06, + "loss": 0.5693, + "step": 2674 + }, + { + "epoch": 1.0572980982958755, + "grad_norm": 0.48385791595148814, + "learning_rate": 4.976169105023101e-06, + "loss": 0.5967, + "step": 2675 + }, + { + "epoch": 1.0576932575944678, + "grad_norm": 0.45899699704778635, + "learning_rate": 4.976147519242912e-06, + "loss": 0.5755, + "step": 2676 + }, + { + "epoch": 1.05808841689306, + "grad_norm": 0.44447734769605085, + "learning_rate": 4.976125923737919e-06, + "loss": 0.5678, + "step": 2677 + }, + { + "epoch": 1.0584835761916522, + "grad_norm": 0.4578877530549744, + "learning_rate": 4.976104318508207e-06, + "loss": 0.5781, + "step": 2678 + }, + { + "epoch": 1.0588787354902445, + "grad_norm": 0.4522345638325663, + "learning_rate": 4.9760827035538614e-06, + "loss": 0.5733, + "step": 2679 + }, + { + "epoch": 1.0592738947888367, + "grad_norm": 0.45242325394735017, + "learning_rate": 4.9760610788749665e-06, + "loss": 0.5781, + "step": 2680 + }, + { + "epoch": 1.059669054087429, + "grad_norm": 0.45775529697399175, + "learning_rate": 4.976039444471607e-06, + "loss": 0.5627, + "step": 2681 + }, + { + "epoch": 1.0600642133860212, + "grad_norm": 0.44249752472475534, + "learning_rate": 4.976017800343868e-06, + "loss": 0.5471, + "step": 2682 + }, + { + "epoch": 1.0604593726846134, + "grad_norm": 0.47201137623979633, + "learning_rate": 4.975996146491834e-06, + "loss": 0.5786, + "step": 2683 + }, + { + "epoch": 1.0608545319832057, + "grad_norm": 0.4775255993907024, + "learning_rate": 4.975974482915592e-06, + "loss": 0.5936, + "step": 2684 + }, + { + "epoch": 1.061249691281798, + "grad_norm": 0.44639736931494717, + "learning_rate": 4.975952809615225e-06, + "loss": 0.5579, + "step": 2685 + }, + { + "epoch": 1.0616448505803902, + "grad_norm": 0.44958051847488756, + "learning_rate": 4.975931126590819e-06, + "loss": 0.5706, + "step": 2686 + }, + { + "epoch": 1.0620400098789824, + "grad_norm": 0.4507031529753823, + "learning_rate": 4.975909433842459e-06, + "loss": 0.557, + "step": 2687 + }, + { + "epoch": 1.0624351691775746, + "grad_norm": 0.45147386789174, + "learning_rate": 4.975887731370229e-06, + "loss": 0.5561, + "step": 2688 + }, + { + "epoch": 1.0628303284761669, + "grad_norm": 0.45782580821366853, + "learning_rate": 4.975866019174217e-06, + "loss": 0.5511, + "step": 2689 + }, + { + "epoch": 1.0632254877747591, + "grad_norm": 0.4707517108424949, + "learning_rate": 4.975844297254506e-06, + "loss": 0.5485, + "step": 2690 + }, + { + "epoch": 1.0636206470733514, + "grad_norm": 0.451561493525649, + "learning_rate": 4.975822565611183e-06, + "loss": 0.5615, + "step": 2691 + }, + { + "epoch": 1.0640158063719436, + "grad_norm": 0.4669716121243911, + "learning_rate": 4.975800824244331e-06, + "loss": 0.5676, + "step": 2692 + }, + { + "epoch": 1.0644109656705358, + "grad_norm": 0.4657398109153327, + "learning_rate": 4.975779073154038e-06, + "loss": 0.5609, + "step": 2693 + }, + { + "epoch": 1.064806124969128, + "grad_norm": 0.4577124290606098, + "learning_rate": 4.975757312340387e-06, + "loss": 0.5649, + "step": 2694 + }, + { + "epoch": 1.0652012842677205, + "grad_norm": 0.459041880221058, + "learning_rate": 4.975735541803465e-06, + "loss": 0.5649, + "step": 2695 + }, + { + "epoch": 1.0655964435663128, + "grad_norm": 0.4665780766862844, + "learning_rate": 4.975713761543357e-06, + "loss": 0.5837, + "step": 2696 + }, + { + "epoch": 1.065991602864905, + "grad_norm": 0.45328946916325785, + "learning_rate": 4.975691971560149e-06, + "loss": 0.5687, + "step": 2697 + }, + { + "epoch": 1.0663867621634973, + "grad_norm": 0.46683872005405097, + "learning_rate": 4.975670171853926e-06, + "loss": 0.564, + "step": 2698 + }, + { + "epoch": 1.0667819214620895, + "grad_norm": 0.4490782215754304, + "learning_rate": 4.9756483624247745e-06, + "loss": 0.5694, + "step": 2699 + }, + { + "epoch": 1.0671770807606817, + "grad_norm": 0.4602344049001532, + "learning_rate": 4.975626543272779e-06, + "loss": 0.5829, + "step": 2700 + }, + { + "epoch": 1.067572240059274, + "grad_norm": 0.47778949312409547, + "learning_rate": 4.975604714398026e-06, + "loss": 0.5849, + "step": 2701 + }, + { + "epoch": 1.0679673993578662, + "grad_norm": 0.4766565902077033, + "learning_rate": 4.9755828758006e-06, + "loss": 0.5803, + "step": 2702 + }, + { + "epoch": 1.0683625586564585, + "grad_norm": 0.46556612189369995, + "learning_rate": 4.975561027480589e-06, + "loss": 0.565, + "step": 2703 + }, + { + "epoch": 1.0687577179550507, + "grad_norm": 0.4681074034544366, + "learning_rate": 4.975539169438077e-06, + "loss": 0.5826, + "step": 2704 + }, + { + "epoch": 1.069152877253643, + "grad_norm": 0.4638743898250165, + "learning_rate": 4.975517301673151e-06, + "loss": 0.5734, + "step": 2705 + }, + { + "epoch": 1.0695480365522352, + "grad_norm": 0.4791989380557464, + "learning_rate": 4.975495424185895e-06, + "loss": 0.5706, + "step": 2706 + }, + { + "epoch": 1.0699431958508274, + "grad_norm": 0.4648061747202406, + "learning_rate": 4.975473536976397e-06, + "loss": 0.5613, + "step": 2707 + }, + { + "epoch": 1.0703383551494197, + "grad_norm": 0.4758277574553136, + "learning_rate": 4.975451640044742e-06, + "loss": 0.5758, + "step": 2708 + }, + { + "epoch": 1.070733514448012, + "grad_norm": 0.46226325782703626, + "learning_rate": 4.975429733391016e-06, + "loss": 0.5685, + "step": 2709 + }, + { + "epoch": 1.0711286737466041, + "grad_norm": 0.45170576388419686, + "learning_rate": 4.975407817015306e-06, + "loss": 0.5501, + "step": 2710 + }, + { + "epoch": 1.0715238330451964, + "grad_norm": 0.4587950204803136, + "learning_rate": 4.975385890917696e-06, + "loss": 0.5627, + "step": 2711 + }, + { + "epoch": 1.0719189923437886, + "grad_norm": 0.5105258657179963, + "learning_rate": 4.975363955098273e-06, + "loss": 0.5664, + "step": 2712 + }, + { + "epoch": 1.0723141516423809, + "grad_norm": 0.49413851191960967, + "learning_rate": 4.975342009557125e-06, + "loss": 0.562, + "step": 2713 + }, + { + "epoch": 1.072709310940973, + "grad_norm": 0.45320917178937237, + "learning_rate": 4.975320054294336e-06, + "loss": 0.5385, + "step": 2714 + }, + { + "epoch": 1.0731044702395653, + "grad_norm": 0.47467856876046166, + "learning_rate": 4.975298089309993e-06, + "loss": 0.5721, + "step": 2715 + }, + { + "epoch": 1.0734996295381576, + "grad_norm": 0.4600018582339907, + "learning_rate": 4.975276114604182e-06, + "loss": 0.5653, + "step": 2716 + }, + { + "epoch": 1.0738947888367498, + "grad_norm": 0.4708300385350632, + "learning_rate": 4.97525413017699e-06, + "loss": 0.5603, + "step": 2717 + }, + { + "epoch": 1.074289948135342, + "grad_norm": 0.4506434681952806, + "learning_rate": 4.975232136028502e-06, + "loss": 0.5381, + "step": 2718 + }, + { + "epoch": 1.0746851074339343, + "grad_norm": 0.46055573294638946, + "learning_rate": 4.975210132158805e-06, + "loss": 0.5503, + "step": 2719 + }, + { + "epoch": 1.0750802667325265, + "grad_norm": 0.5855856780535553, + "learning_rate": 4.975188118567987e-06, + "loss": 0.582, + "step": 2720 + }, + { + "epoch": 1.0754754260311188, + "grad_norm": 0.48475880634450325, + "learning_rate": 4.975166095256132e-06, + "loss": 0.5735, + "step": 2721 + }, + { + "epoch": 1.075870585329711, + "grad_norm": 0.48720473179501395, + "learning_rate": 4.975144062223328e-06, + "loss": 0.5719, + "step": 2722 + }, + { + "epoch": 1.0762657446283033, + "grad_norm": 0.4703550004447043, + "learning_rate": 4.9751220194696615e-06, + "loss": 0.6014, + "step": 2723 + }, + { + "epoch": 1.0766609039268955, + "grad_norm": 0.47236077192721254, + "learning_rate": 4.975099966995218e-06, + "loss": 0.5712, + "step": 2724 + }, + { + "epoch": 1.0770560632254877, + "grad_norm": 0.46826552794346366, + "learning_rate": 4.975077904800086e-06, + "loss": 0.5742, + "step": 2725 + }, + { + "epoch": 1.07745122252408, + "grad_norm": 0.4976523466784567, + "learning_rate": 4.975055832884349e-06, + "loss": 0.5837, + "step": 2726 + }, + { + "epoch": 1.0778463818226722, + "grad_norm": 0.5218434843940184, + "learning_rate": 4.975033751248096e-06, + "loss": 0.5422, + "step": 2727 + }, + { + "epoch": 1.0782415411212645, + "grad_norm": 0.4931023411702474, + "learning_rate": 4.975011659891415e-06, + "loss": 0.5729, + "step": 2728 + }, + { + "epoch": 1.0786367004198567, + "grad_norm": 0.5049728108245582, + "learning_rate": 4.974989558814389e-06, + "loss": 0.5665, + "step": 2729 + }, + { + "epoch": 1.079031859718449, + "grad_norm": 0.49433774891751775, + "learning_rate": 4.974967448017109e-06, + "loss": 0.5706, + "step": 2730 + }, + { + "epoch": 1.0794270190170412, + "grad_norm": 0.4620589073782666, + "learning_rate": 4.974945327499658e-06, + "loss": 0.5621, + "step": 2731 + }, + { + "epoch": 1.0798221783156334, + "grad_norm": 0.48242430406086245, + "learning_rate": 4.974923197262126e-06, + "loss": 0.5584, + "step": 2732 + }, + { + "epoch": 1.0802173376142257, + "grad_norm": 0.46908301271787106, + "learning_rate": 4.974901057304598e-06, + "loss": 0.575, + "step": 2733 + }, + { + "epoch": 1.080612496912818, + "grad_norm": 0.4635792423887607, + "learning_rate": 4.974878907627161e-06, + "loss": 0.5666, + "step": 2734 + }, + { + "epoch": 1.0810076562114102, + "grad_norm": 0.45678202788616845, + "learning_rate": 4.974856748229902e-06, + "loss": 0.5513, + "step": 2735 + }, + { + "epoch": 1.0814028155100024, + "grad_norm": 0.48311485248397, + "learning_rate": 4.97483457911291e-06, + "loss": 0.5517, + "step": 2736 + }, + { + "epoch": 1.0817979748085946, + "grad_norm": 0.4636165852005326, + "learning_rate": 4.97481240027627e-06, + "loss": 0.5597, + "step": 2737 + }, + { + "epoch": 1.0821931341071869, + "grad_norm": 0.4417447450007961, + "learning_rate": 4.97479021172007e-06, + "loss": 0.5705, + "step": 2738 + }, + { + "epoch": 1.0825882934057791, + "grad_norm": 0.4518840030411349, + "learning_rate": 4.974768013444395e-06, + "loss": 0.5653, + "step": 2739 + }, + { + "epoch": 1.0829834527043714, + "grad_norm": 1.1302182473095272, + "learning_rate": 4.974745805449336e-06, + "loss": 0.5999, + "step": 2740 + }, + { + "epoch": 1.0833786120029636, + "grad_norm": 0.4646198565639412, + "learning_rate": 4.974723587734977e-06, + "loss": 0.5814, + "step": 2741 + }, + { + "epoch": 1.0837737713015558, + "grad_norm": 0.4561905295641323, + "learning_rate": 4.974701360301408e-06, + "loss": 0.5699, + "step": 2742 + }, + { + "epoch": 1.084168930600148, + "grad_norm": 0.48160057985917565, + "learning_rate": 4.974679123148713e-06, + "loss": 0.556, + "step": 2743 + }, + { + "epoch": 1.0845640898987403, + "grad_norm": 0.4611451474715039, + "learning_rate": 4.974656876276983e-06, + "loss": 0.577, + "step": 2744 + }, + { + "epoch": 1.0849592491973328, + "grad_norm": 0.4954859328073638, + "learning_rate": 4.974634619686303e-06, + "loss": 0.5785, + "step": 2745 + }, + { + "epoch": 1.085354408495925, + "grad_norm": 0.46526474266981815, + "learning_rate": 4.97461235337676e-06, + "loss": 0.5627, + "step": 2746 + }, + { + "epoch": 1.0857495677945173, + "grad_norm": 0.47088544330789683, + "learning_rate": 4.974590077348442e-06, + "loss": 0.5735, + "step": 2747 + }, + { + "epoch": 1.0861447270931095, + "grad_norm": 0.44275077749114805, + "learning_rate": 4.974567791601438e-06, + "loss": 0.56, + "step": 2748 + }, + { + "epoch": 1.0865398863917017, + "grad_norm": 0.44909868576930495, + "learning_rate": 4.974545496135834e-06, + "loss": 0.5626, + "step": 2749 + }, + { + "epoch": 1.086935045690294, + "grad_norm": 0.4435390432235627, + "learning_rate": 4.974523190951718e-06, + "loss": 0.5489, + "step": 2750 + }, + { + "epoch": 1.0873302049888862, + "grad_norm": 0.46104078067735216, + "learning_rate": 4.974500876049177e-06, + "loss": 0.5575, + "step": 2751 + }, + { + "epoch": 1.0877253642874785, + "grad_norm": 0.47232606154854884, + "learning_rate": 4.9744785514283e-06, + "loss": 0.5636, + "step": 2752 + }, + { + "epoch": 1.0881205235860707, + "grad_norm": 0.45996006809985085, + "learning_rate": 4.974456217089173e-06, + "loss": 0.5707, + "step": 2753 + }, + { + "epoch": 1.088515682884663, + "grad_norm": 0.45991550984476076, + "learning_rate": 4.9744338730318846e-06, + "loss": 0.565, + "step": 2754 + }, + { + "epoch": 1.0889108421832552, + "grad_norm": 0.45264942810273506, + "learning_rate": 4.974411519256523e-06, + "loss": 0.5611, + "step": 2755 + }, + { + "epoch": 1.0893060014818474, + "grad_norm": 0.4603092385989538, + "learning_rate": 4.974389155763175e-06, + "loss": 0.5729, + "step": 2756 + }, + { + "epoch": 1.0897011607804397, + "grad_norm": 0.4622551865908486, + "learning_rate": 4.974366782551929e-06, + "loss": 0.5706, + "step": 2757 + }, + { + "epoch": 1.090096320079032, + "grad_norm": 0.45746013561890175, + "learning_rate": 4.974344399622874e-06, + "loss": 0.5527, + "step": 2758 + }, + { + "epoch": 1.0904914793776241, + "grad_norm": 0.4503667034204893, + "learning_rate": 4.974322006976095e-06, + "loss": 0.56, + "step": 2759 + }, + { + "epoch": 1.0908866386762164, + "grad_norm": 0.45807333150826124, + "learning_rate": 4.974299604611682e-06, + "loss": 0.5475, + "step": 2760 + }, + { + "epoch": 1.0912817979748086, + "grad_norm": 0.46938442181289014, + "learning_rate": 4.974277192529723e-06, + "loss": 0.5607, + "step": 2761 + }, + { + "epoch": 1.0916769572734009, + "grad_norm": 0.4504533242644673, + "learning_rate": 4.974254770730306e-06, + "loss": 0.5817, + "step": 2762 + }, + { + "epoch": 1.092072116571993, + "grad_norm": 0.44892490581416017, + "learning_rate": 4.974232339213519e-06, + "loss": 0.5572, + "step": 2763 + }, + { + "epoch": 1.0924672758705853, + "grad_norm": 0.5598674028961942, + "learning_rate": 4.974209897979448e-06, + "loss": 0.5688, + "step": 2764 + }, + { + "epoch": 1.0928624351691776, + "grad_norm": 0.4463594082886901, + "learning_rate": 4.974187447028185e-06, + "loss": 0.5657, + "step": 2765 + }, + { + "epoch": 1.0932575944677698, + "grad_norm": 0.45947250553137825, + "learning_rate": 4.974164986359814e-06, + "loss": 0.5719, + "step": 2766 + }, + { + "epoch": 1.093652753766362, + "grad_norm": 0.4621085715770585, + "learning_rate": 4.974142515974427e-06, + "loss": 0.5801, + "step": 2767 + }, + { + "epoch": 1.0940479130649543, + "grad_norm": 0.44428474945478147, + "learning_rate": 4.97412003587211e-06, + "loss": 0.5583, + "step": 2768 + }, + { + "epoch": 1.0944430723635465, + "grad_norm": 0.4573957820670806, + "learning_rate": 4.974097546052952e-06, + "loss": 0.5731, + "step": 2769 + }, + { + "epoch": 1.0948382316621388, + "grad_norm": 0.45565450583105144, + "learning_rate": 4.9740750465170415e-06, + "loss": 0.5809, + "step": 2770 + }, + { + "epoch": 1.095233390960731, + "grad_norm": 0.4482788589021872, + "learning_rate": 4.974052537264465e-06, + "loss": 0.5792, + "step": 2771 + }, + { + "epoch": 1.0956285502593233, + "grad_norm": 0.4461820839459798, + "learning_rate": 4.974030018295314e-06, + "loss": 0.5786, + "step": 2772 + }, + { + "epoch": 1.0960237095579155, + "grad_norm": 0.44439297274093054, + "learning_rate": 4.974007489609675e-06, + "loss": 0.5887, + "step": 2773 + }, + { + "epoch": 1.0964188688565077, + "grad_norm": 0.44582844653174675, + "learning_rate": 4.973984951207638e-06, + "loss": 0.5897, + "step": 2774 + }, + { + "epoch": 1.0968140281551, + "grad_norm": 0.4553585727658302, + "learning_rate": 4.9739624030892885e-06, + "loss": 0.5661, + "step": 2775 + }, + { + "epoch": 1.0972091874536922, + "grad_norm": 0.4479235148347027, + "learning_rate": 4.973939845254718e-06, + "loss": 0.5931, + "step": 2776 + }, + { + "epoch": 1.0976043467522845, + "grad_norm": 0.4428344941159122, + "learning_rate": 4.973917277704014e-06, + "loss": 0.5598, + "step": 2777 + }, + { + "epoch": 1.0979995060508767, + "grad_norm": 0.476027880314366, + "learning_rate": 4.973894700437265e-06, + "loss": 0.5873, + "step": 2778 + }, + { + "epoch": 1.098394665349469, + "grad_norm": 0.4404118923653951, + "learning_rate": 4.97387211345456e-06, + "loss": 0.5493, + "step": 2779 + }, + { + "epoch": 1.0987898246480612, + "grad_norm": 0.4574987015710223, + "learning_rate": 4.973849516755987e-06, + "loss": 0.5498, + "step": 2780 + }, + { + "epoch": 1.0991849839466534, + "grad_norm": 0.45123369032502647, + "learning_rate": 4.973826910341636e-06, + "loss": 0.5636, + "step": 2781 + }, + { + "epoch": 1.0995801432452457, + "grad_norm": 0.4617618303547862, + "learning_rate": 4.973804294211595e-06, + "loss": 0.5835, + "step": 2782 + }, + { + "epoch": 1.099975302543838, + "grad_norm": 0.4579139636190123, + "learning_rate": 4.973781668365953e-06, + "loss": 0.5737, + "step": 2783 + }, + { + "epoch": 1.1003704618424301, + "grad_norm": 0.4771120050290431, + "learning_rate": 4.973759032804798e-06, + "loss": 0.581, + "step": 2784 + }, + { + "epoch": 1.1007656211410224, + "grad_norm": 0.4573835551232583, + "learning_rate": 4.973736387528219e-06, + "loss": 0.5536, + "step": 2785 + }, + { + "epoch": 1.1011607804396146, + "grad_norm": 0.43751748320789247, + "learning_rate": 4.973713732536307e-06, + "loss": 0.5533, + "step": 2786 + }, + { + "epoch": 1.1015559397382069, + "grad_norm": 0.4551112523172073, + "learning_rate": 4.97369106782915e-06, + "loss": 0.5727, + "step": 2787 + }, + { + "epoch": 1.101951099036799, + "grad_norm": 0.4728095976072211, + "learning_rate": 4.973668393406835e-06, + "loss": 0.5828, + "step": 2788 + }, + { + "epoch": 1.1023462583353916, + "grad_norm": 0.46554008898501564, + "learning_rate": 4.9736457092694545e-06, + "loss": 0.5754, + "step": 2789 + }, + { + "epoch": 1.1027414176339838, + "grad_norm": 0.44230286912396166, + "learning_rate": 4.9736230154170945e-06, + "loss": 0.5659, + "step": 2790 + }, + { + "epoch": 1.103136576932576, + "grad_norm": 0.44947861957297847, + "learning_rate": 4.973600311849845e-06, + "loss": 0.5709, + "step": 2791 + }, + { + "epoch": 1.1035317362311683, + "grad_norm": 0.44653133824766644, + "learning_rate": 4.973577598567797e-06, + "loss": 0.5887, + "step": 2792 + }, + { + "epoch": 1.1039268955297605, + "grad_norm": 0.46671707574431837, + "learning_rate": 4.9735548755710374e-06, + "loss": 0.5871, + "step": 2793 + }, + { + "epoch": 1.1043220548283528, + "grad_norm": 0.45380131690571734, + "learning_rate": 4.973532142859656e-06, + "loss": 0.5517, + "step": 2794 + }, + { + "epoch": 1.104717214126945, + "grad_norm": 0.47357589275123324, + "learning_rate": 4.973509400433743e-06, + "loss": 0.5503, + "step": 2795 + }, + { + "epoch": 1.1051123734255373, + "grad_norm": 0.45961644968240184, + "learning_rate": 4.9734866482933865e-06, + "loss": 0.5652, + "step": 2796 + }, + { + "epoch": 1.1055075327241295, + "grad_norm": 0.4625546092783896, + "learning_rate": 4.973463886438676e-06, + "loss": 0.5787, + "step": 2797 + }, + { + "epoch": 1.1059026920227217, + "grad_norm": 0.46020081411653113, + "learning_rate": 4.9734411148697025e-06, + "loss": 0.5609, + "step": 2798 + }, + { + "epoch": 1.106297851321314, + "grad_norm": 0.46162652087403155, + "learning_rate": 4.973418333586553e-06, + "loss": 0.5872, + "step": 2799 + }, + { + "epoch": 1.1066930106199062, + "grad_norm": 0.44195379442185856, + "learning_rate": 4.97339554258932e-06, + "loss": 0.5497, + "step": 2800 + }, + { + "epoch": 1.1070881699184985, + "grad_norm": 0.449613283566657, + "learning_rate": 4.97337274187809e-06, + "loss": 0.5603, + "step": 2801 + }, + { + "epoch": 1.1074833292170907, + "grad_norm": 0.45072734141619586, + "learning_rate": 4.973349931452953e-06, + "loss": 0.5577, + "step": 2802 + }, + { + "epoch": 1.107878488515683, + "grad_norm": 0.4674608199843895, + "learning_rate": 4.973327111314e-06, + "loss": 0.5786, + "step": 2803 + }, + { + "epoch": 1.1082736478142752, + "grad_norm": 0.4630627894242256, + "learning_rate": 4.9733042814613205e-06, + "loss": 0.5735, + "step": 2804 + }, + { + "epoch": 1.1086688071128674, + "grad_norm": 0.4583380745367057, + "learning_rate": 4.973281441895004e-06, + "loss": 0.56, + "step": 2805 + }, + { + "epoch": 1.1090639664114597, + "grad_norm": 0.4733069612546895, + "learning_rate": 4.973258592615139e-06, + "loss": 0.5684, + "step": 2806 + }, + { + "epoch": 1.109459125710052, + "grad_norm": 0.454537320554346, + "learning_rate": 4.973235733621816e-06, + "loss": 0.5611, + "step": 2807 + }, + { + "epoch": 1.1098542850086441, + "grad_norm": 0.46762331052618233, + "learning_rate": 4.973212864915126e-06, + "loss": 0.5712, + "step": 2808 + }, + { + "epoch": 1.1102494443072364, + "grad_norm": 0.46039286243703786, + "learning_rate": 4.973189986495157e-06, + "loss": 0.5706, + "step": 2809 + }, + { + "epoch": 1.1106446036058286, + "grad_norm": 0.45645844421382126, + "learning_rate": 4.973167098362e-06, + "loss": 0.5969, + "step": 2810 + }, + { + "epoch": 1.1110397629044209, + "grad_norm": 0.44815006924446793, + "learning_rate": 4.973144200515742e-06, + "loss": 0.5622, + "step": 2811 + }, + { + "epoch": 1.111434922203013, + "grad_norm": 0.45673663581682494, + "learning_rate": 4.9731212929564785e-06, + "loss": 0.5713, + "step": 2812 + }, + { + "epoch": 1.1118300815016053, + "grad_norm": 0.4410699289674135, + "learning_rate": 4.973098375684295e-06, + "loss": 0.5572, + "step": 2813 + }, + { + "epoch": 1.1122252408001976, + "grad_norm": 0.45953779864395006, + "learning_rate": 4.973075448699283e-06, + "loss": 0.5721, + "step": 2814 + }, + { + "epoch": 1.1126204000987898, + "grad_norm": 0.48969886056398254, + "learning_rate": 4.973052512001532e-06, + "loss": 0.5961, + "step": 2815 + }, + { + "epoch": 1.113015559397382, + "grad_norm": 0.4582083675335303, + "learning_rate": 4.9730295655911325e-06, + "loss": 0.5664, + "step": 2816 + }, + { + "epoch": 1.1134107186959743, + "grad_norm": 0.45873792204035635, + "learning_rate": 4.973006609468175e-06, + "loss": 0.5798, + "step": 2817 + }, + { + "epoch": 1.1138058779945665, + "grad_norm": 0.4595469818479189, + "learning_rate": 4.9729836436327486e-06, + "loss": 0.5583, + "step": 2818 + }, + { + "epoch": 1.1142010372931588, + "grad_norm": 0.4660954028217541, + "learning_rate": 4.972960668084945e-06, + "loss": 0.586, + "step": 2819 + }, + { + "epoch": 1.114596196591751, + "grad_norm": 0.45385089149228297, + "learning_rate": 4.972937682824853e-06, + "loss": 0.5617, + "step": 2820 + }, + { + "epoch": 1.1149913558903433, + "grad_norm": 0.4541868015985183, + "learning_rate": 4.972914687852564e-06, + "loss": 0.5709, + "step": 2821 + }, + { + "epoch": 1.1153865151889355, + "grad_norm": 0.4712498210514924, + "learning_rate": 4.972891683168166e-06, + "loss": 0.5643, + "step": 2822 + }, + { + "epoch": 1.1157816744875277, + "grad_norm": 0.4528912615721515, + "learning_rate": 4.9728686687717534e-06, + "loss": 0.5677, + "step": 2823 + }, + { + "epoch": 1.11617683378612, + "grad_norm": 0.5454379549098186, + "learning_rate": 4.972845644663412e-06, + "loss": 0.5848, + "step": 2824 + }, + { + "epoch": 1.1165719930847122, + "grad_norm": 0.46204902182734076, + "learning_rate": 4.972822610843236e-06, + "loss": 0.5509, + "step": 2825 + }, + { + "epoch": 1.1169671523833045, + "grad_norm": 0.45981650702917953, + "learning_rate": 4.972799567311314e-06, + "loss": 0.5589, + "step": 2826 + }, + { + "epoch": 1.1173623116818967, + "grad_norm": 0.4748451628776366, + "learning_rate": 4.9727765140677374e-06, + "loss": 0.5735, + "step": 2827 + }, + { + "epoch": 1.117757470980489, + "grad_norm": 0.43824092167576956, + "learning_rate": 4.972753451112596e-06, + "loss": 0.5595, + "step": 2828 + }, + { + "epoch": 1.1181526302790812, + "grad_norm": 0.4532662779925716, + "learning_rate": 4.97273037844598e-06, + "loss": 0.5644, + "step": 2829 + }, + { + "epoch": 1.1185477895776734, + "grad_norm": 0.45748152598351954, + "learning_rate": 4.972707296067981e-06, + "loss": 0.5677, + "step": 2830 + }, + { + "epoch": 1.1189429488762657, + "grad_norm": 0.4588021846664917, + "learning_rate": 4.972684203978689e-06, + "loss": 0.5741, + "step": 2831 + }, + { + "epoch": 1.119338108174858, + "grad_norm": 0.4571577453972498, + "learning_rate": 4.972661102178196e-06, + "loss": 0.5521, + "step": 2832 + }, + { + "epoch": 1.1197332674734501, + "grad_norm": 0.4584124940006824, + "learning_rate": 4.97263799066659e-06, + "loss": 0.555, + "step": 2833 + }, + { + "epoch": 1.1201284267720424, + "grad_norm": 0.4768334321839287, + "learning_rate": 4.972614869443965e-06, + "loss": 0.5554, + "step": 2834 + }, + { + "epoch": 1.1205235860706346, + "grad_norm": 0.45878358249431184, + "learning_rate": 4.972591738510409e-06, + "loss": 0.5652, + "step": 2835 + }, + { + "epoch": 1.1209187453692269, + "grad_norm": 0.4603002756472343, + "learning_rate": 4.972568597866014e-06, + "loss": 0.5706, + "step": 2836 + }, + { + "epoch": 1.121313904667819, + "grad_norm": 0.4331713760224906, + "learning_rate": 4.9725454475108714e-06, + "loss": 0.5544, + "step": 2837 + }, + { + "epoch": 1.1217090639664113, + "grad_norm": 0.4527006951431627, + "learning_rate": 4.972522287445072e-06, + "loss": 0.5435, + "step": 2838 + }, + { + "epoch": 1.1221042232650038, + "grad_norm": 0.4757703333155185, + "learning_rate": 4.972499117668707e-06, + "loss": 0.5851, + "step": 2839 + }, + { + "epoch": 1.122499382563596, + "grad_norm": 0.45557041972406903, + "learning_rate": 4.972475938181866e-06, + "loss": 0.5624, + "step": 2840 + }, + { + "epoch": 1.1228945418621883, + "grad_norm": 0.4512928810062557, + "learning_rate": 4.972452748984641e-06, + "loss": 0.5485, + "step": 2841 + }, + { + "epoch": 1.1232897011607805, + "grad_norm": 0.47660422203668557, + "learning_rate": 4.972429550077122e-06, + "loss": 0.5785, + "step": 2842 + }, + { + "epoch": 1.1236848604593728, + "grad_norm": 0.45195553481987427, + "learning_rate": 4.972406341459403e-06, + "loss": 0.5754, + "step": 2843 + }, + { + "epoch": 1.124080019757965, + "grad_norm": 0.4692003584527425, + "learning_rate": 4.972383123131572e-06, + "loss": 0.5713, + "step": 2844 + }, + { + "epoch": 1.1244751790565573, + "grad_norm": 0.46001788143359484, + "learning_rate": 4.9723598950937216e-06, + "loss": 0.5527, + "step": 2845 + }, + { + "epoch": 1.1248703383551495, + "grad_norm": 0.4530700630207127, + "learning_rate": 4.972336657345943e-06, + "loss": 0.5762, + "step": 2846 + }, + { + "epoch": 1.1252654976537417, + "grad_norm": 0.4814747648947358, + "learning_rate": 4.972313409888327e-06, + "loss": 0.5757, + "step": 2847 + }, + { + "epoch": 1.125660656952334, + "grad_norm": 0.4644316998225435, + "learning_rate": 4.972290152720965e-06, + "loss": 0.5932, + "step": 2848 + }, + { + "epoch": 1.1260558162509262, + "grad_norm": 0.43522985761057875, + "learning_rate": 4.97226688584395e-06, + "loss": 0.548, + "step": 2849 + }, + { + "epoch": 1.1264509755495185, + "grad_norm": 0.44885828698030505, + "learning_rate": 4.97224360925737e-06, + "loss": 0.5673, + "step": 2850 + }, + { + "epoch": 1.1268461348481107, + "grad_norm": 0.4546348191990088, + "learning_rate": 4.972220322961318e-06, + "loss": 0.5717, + "step": 2851 + }, + { + "epoch": 1.127241294146703, + "grad_norm": 0.45766425039161684, + "learning_rate": 4.972197026955888e-06, + "loss": 0.5738, + "step": 2852 + }, + { + "epoch": 1.1276364534452952, + "grad_norm": 0.48259567077817345, + "learning_rate": 4.972173721241168e-06, + "loss": 0.5711, + "step": 2853 + }, + { + "epoch": 1.1280316127438874, + "grad_norm": 0.4465075582537828, + "learning_rate": 4.972150405817251e-06, + "loss": 0.5608, + "step": 2854 + }, + { + "epoch": 1.1284267720424797, + "grad_norm": 0.4494060907226681, + "learning_rate": 4.972127080684228e-06, + "loss": 0.5597, + "step": 2855 + }, + { + "epoch": 1.128821931341072, + "grad_norm": 0.45776592151810525, + "learning_rate": 4.972103745842192e-06, + "loss": 0.5696, + "step": 2856 + }, + { + "epoch": 1.1292170906396641, + "grad_norm": 0.4517897855267896, + "learning_rate": 4.9720804012912325e-06, + "loss": 0.5597, + "step": 2857 + }, + { + "epoch": 1.1296122499382564, + "grad_norm": 0.45557106783434753, + "learning_rate": 4.9720570470314435e-06, + "loss": 0.585, + "step": 2858 + }, + { + "epoch": 1.1300074092368486, + "grad_norm": 0.44670409565234737, + "learning_rate": 4.9720336830629145e-06, + "loss": 0.5609, + "step": 2859 + }, + { + "epoch": 1.1304025685354409, + "grad_norm": 0.4616073635588755, + "learning_rate": 4.972010309385739e-06, + "loss": 0.5669, + "step": 2860 + }, + { + "epoch": 1.130797727834033, + "grad_norm": 0.48898771950037234, + "learning_rate": 4.971986926000008e-06, + "loss": 0.5781, + "step": 2861 + }, + { + "epoch": 1.1311928871326253, + "grad_norm": 0.47129122376722427, + "learning_rate": 4.971963532905812e-06, + "loss": 0.5856, + "step": 2862 + }, + { + "epoch": 1.1315880464312176, + "grad_norm": 0.4367114279377543, + "learning_rate": 4.971940130103245e-06, + "loss": 0.5498, + "step": 2863 + }, + { + "epoch": 1.1319832057298098, + "grad_norm": 0.45445984797614775, + "learning_rate": 4.9719167175924e-06, + "loss": 0.5585, + "step": 2864 + }, + { + "epoch": 1.132378365028402, + "grad_norm": 0.47288776531263177, + "learning_rate": 4.971893295373366e-06, + "loss": 0.5699, + "step": 2865 + }, + { + "epoch": 1.1327735243269943, + "grad_norm": 0.46037768185257905, + "learning_rate": 4.971869863446235e-06, + "loss": 0.555, + "step": 2866 + }, + { + "epoch": 1.1331686836255865, + "grad_norm": 0.4409672704352513, + "learning_rate": 4.971846421811101e-06, + "loss": 0.5741, + "step": 2867 + }, + { + "epoch": 1.1335638429241788, + "grad_norm": 0.4735947826477942, + "learning_rate": 4.971822970468056e-06, + "loss": 0.5751, + "step": 2868 + }, + { + "epoch": 1.133959002222771, + "grad_norm": 0.4985409552375511, + "learning_rate": 4.97179950941719e-06, + "loss": 0.5831, + "step": 2869 + }, + { + "epoch": 1.1343541615213633, + "grad_norm": 0.44396729751506975, + "learning_rate": 4.971776038658598e-06, + "loss": 0.5666, + "step": 2870 + }, + { + "epoch": 1.1347493208199555, + "grad_norm": 0.46267223426001386, + "learning_rate": 4.97175255819237e-06, + "loss": 0.5596, + "step": 2871 + }, + { + "epoch": 1.1351444801185477, + "grad_norm": 0.46568413072052744, + "learning_rate": 4.9717290680185985e-06, + "loss": 0.5686, + "step": 2872 + }, + { + "epoch": 1.13553963941714, + "grad_norm": 0.4673886583205963, + "learning_rate": 4.971705568137376e-06, + "loss": 0.5756, + "step": 2873 + }, + { + "epoch": 1.1359347987157322, + "grad_norm": 0.45942651811103113, + "learning_rate": 4.971682058548795e-06, + "loss": 0.5676, + "step": 2874 + }, + { + "epoch": 1.1363299580143245, + "grad_norm": 0.45192092503324155, + "learning_rate": 4.971658539252948e-06, + "loss": 0.5729, + "step": 2875 + }, + { + "epoch": 1.1367251173129167, + "grad_norm": 0.4428557809778483, + "learning_rate": 4.971635010249928e-06, + "loss": 0.5751, + "step": 2876 + }, + { + "epoch": 1.137120276611509, + "grad_norm": 0.4506504247352054, + "learning_rate": 4.971611471539826e-06, + "loss": 0.5628, + "step": 2877 + }, + { + "epoch": 1.1375154359101012, + "grad_norm": 0.45545331444170223, + "learning_rate": 4.971587923122734e-06, + "loss": 0.5642, + "step": 2878 + }, + { + "epoch": 1.1379105952086934, + "grad_norm": 0.479413221980225, + "learning_rate": 4.971564364998747e-06, + "loss": 0.5749, + "step": 2879 + }, + { + "epoch": 1.1383057545072857, + "grad_norm": 0.4439935558628392, + "learning_rate": 4.971540797167954e-06, + "loss": 0.5599, + "step": 2880 + }, + { + "epoch": 1.138700913805878, + "grad_norm": 0.4444069130578539, + "learning_rate": 4.971517219630451e-06, + "loss": 0.5862, + "step": 2881 + }, + { + "epoch": 1.1390960731044704, + "grad_norm": 0.4552609885225358, + "learning_rate": 4.971493632386329e-06, + "loss": 0.5575, + "step": 2882 + }, + { + "epoch": 1.1394912324030626, + "grad_norm": 0.4382492830615158, + "learning_rate": 4.97147003543568e-06, + "loss": 0.5589, + "step": 2883 + }, + { + "epoch": 1.1398863917016548, + "grad_norm": 0.4437390367384225, + "learning_rate": 4.971446428778599e-06, + "loss": 0.5714, + "step": 2884 + }, + { + "epoch": 1.140281551000247, + "grad_norm": 0.4456567659509046, + "learning_rate": 4.971422812415176e-06, + "loss": 0.5526, + "step": 2885 + }, + { + "epoch": 1.1406767102988393, + "grad_norm": 0.44730543435927383, + "learning_rate": 4.971399186345505e-06, + "loss": 0.5579, + "step": 2886 + }, + { + "epoch": 1.1410718695974316, + "grad_norm": 0.45573671820566486, + "learning_rate": 4.97137555056968e-06, + "loss": 0.56, + "step": 2887 + }, + { + "epoch": 1.1414670288960238, + "grad_norm": 0.4505940700048066, + "learning_rate": 4.971351905087791e-06, + "loss": 0.5677, + "step": 2888 + }, + { + "epoch": 1.141862188194616, + "grad_norm": 0.44421960402517957, + "learning_rate": 4.971328249899932e-06, + "loss": 0.5494, + "step": 2889 + }, + { + "epoch": 1.1422573474932083, + "grad_norm": 0.4494388053299841, + "learning_rate": 4.971304585006198e-06, + "loss": 0.5808, + "step": 2890 + }, + { + "epoch": 1.1426525067918005, + "grad_norm": 0.454206814685918, + "learning_rate": 4.971280910406679e-06, + "loss": 0.5647, + "step": 2891 + }, + { + "epoch": 1.1430476660903928, + "grad_norm": 0.4570937269415416, + "learning_rate": 4.971257226101469e-06, + "loss": 0.5645, + "step": 2892 + }, + { + "epoch": 1.143442825388985, + "grad_norm": 0.4424306427628629, + "learning_rate": 4.971233532090662e-06, + "loss": 0.5617, + "step": 2893 + }, + { + "epoch": 1.1438379846875772, + "grad_norm": 0.43897777777159125, + "learning_rate": 4.97120982837435e-06, + "loss": 0.5649, + "step": 2894 + }, + { + "epoch": 1.1442331439861695, + "grad_norm": 0.44861248994251357, + "learning_rate": 4.971186114952628e-06, + "loss": 0.565, + "step": 2895 + }, + { + "epoch": 1.1446283032847617, + "grad_norm": 0.47001095027879825, + "learning_rate": 4.971162391825586e-06, + "loss": 0.5803, + "step": 2896 + }, + { + "epoch": 1.145023462583354, + "grad_norm": 0.44454980270407224, + "learning_rate": 4.971138658993318e-06, + "loss": 0.561, + "step": 2897 + }, + { + "epoch": 1.1454186218819462, + "grad_norm": 0.4467127010573676, + "learning_rate": 4.971114916455919e-06, + "loss": 0.5531, + "step": 2898 + }, + { + "epoch": 1.1458137811805384, + "grad_norm": 0.46879881901374293, + "learning_rate": 4.9710911642134805e-06, + "loss": 0.5943, + "step": 2899 + }, + { + "epoch": 1.1462089404791307, + "grad_norm": 0.46048699268271154, + "learning_rate": 4.971067402266097e-06, + "loss": 0.5785, + "step": 2900 + }, + { + "epoch": 1.146604099777723, + "grad_norm": 0.4578268867042688, + "learning_rate": 4.971043630613861e-06, + "loss": 0.5627, + "step": 2901 + }, + { + "epoch": 1.1469992590763152, + "grad_norm": 0.44834695982191136, + "learning_rate": 4.971019849256866e-06, + "loss": 0.5596, + "step": 2902 + }, + { + "epoch": 1.1473944183749074, + "grad_norm": 0.4447032230484451, + "learning_rate": 4.970996058195206e-06, + "loss": 0.555, + "step": 2903 + }, + { + "epoch": 1.1477895776734997, + "grad_norm": 0.4655101994383645, + "learning_rate": 4.970972257428973e-06, + "loss": 0.5904, + "step": 2904 + }, + { + "epoch": 1.148184736972092, + "grad_norm": 0.48666828141831653, + "learning_rate": 4.970948446958262e-06, + "loss": 0.5872, + "step": 2905 + }, + { + "epoch": 1.1485798962706841, + "grad_norm": 0.45560221158301, + "learning_rate": 4.970924626783165e-06, + "loss": 0.572, + "step": 2906 + }, + { + "epoch": 1.1489750555692764, + "grad_norm": 0.4542327270574395, + "learning_rate": 4.970900796903778e-06, + "loss": 0.5807, + "step": 2907 + }, + { + "epoch": 1.1493702148678686, + "grad_norm": 0.45302601284820004, + "learning_rate": 4.970876957320193e-06, + "loss": 0.5556, + "step": 2908 + }, + { + "epoch": 1.1497653741664609, + "grad_norm": 0.45085641190037123, + "learning_rate": 4.970853108032503e-06, + "loss": 0.553, + "step": 2909 + }, + { + "epoch": 1.150160533465053, + "grad_norm": 0.44311699037774255, + "learning_rate": 4.970829249040803e-06, + "loss": 0.5783, + "step": 2910 + }, + { + "epoch": 1.1505556927636453, + "grad_norm": 0.4565247742626593, + "learning_rate": 4.970805380345186e-06, + "loss": 0.5668, + "step": 2911 + }, + { + "epoch": 1.1509508520622376, + "grad_norm": 0.442670058390068, + "learning_rate": 4.970781501945745e-06, + "loss": 0.5804, + "step": 2912 + }, + { + "epoch": 1.1513460113608298, + "grad_norm": 0.4587903036700737, + "learning_rate": 4.970757613842575e-06, + "loss": 0.5436, + "step": 2913 + }, + { + "epoch": 1.151741170659422, + "grad_norm": 0.4605719839378815, + "learning_rate": 4.970733716035769e-06, + "loss": 0.5708, + "step": 2914 + }, + { + "epoch": 1.1521363299580143, + "grad_norm": 0.4811516114353964, + "learning_rate": 4.970709808525423e-06, + "loss": 0.5695, + "step": 2915 + }, + { + "epoch": 1.1525314892566065, + "grad_norm": 0.4465598040548844, + "learning_rate": 4.970685891311627e-06, + "loss": 0.5585, + "step": 2916 + }, + { + "epoch": 1.1529266485551988, + "grad_norm": 0.4482126904622925, + "learning_rate": 4.970661964394479e-06, + "loss": 0.5531, + "step": 2917 + }, + { + "epoch": 1.153321807853791, + "grad_norm": 0.45465037670607433, + "learning_rate": 4.97063802777407e-06, + "loss": 0.5552, + "step": 2918 + }, + { + "epoch": 1.1537169671523833, + "grad_norm": 0.44832182812879506, + "learning_rate": 4.970614081450495e-06, + "loss": 0.5564, + "step": 2919 + }, + { + "epoch": 1.1541121264509755, + "grad_norm": 0.45584014338048706, + "learning_rate": 4.9705901254238485e-06, + "loss": 0.5569, + "step": 2920 + }, + { + "epoch": 1.1545072857495677, + "grad_norm": 0.44834971508394056, + "learning_rate": 4.970566159694224e-06, + "loss": 0.5461, + "step": 2921 + }, + { + "epoch": 1.15490244504816, + "grad_norm": 0.4578941775430072, + "learning_rate": 4.970542184261716e-06, + "loss": 0.5668, + "step": 2922 + }, + { + "epoch": 1.1552976043467522, + "grad_norm": 0.4817857268031156, + "learning_rate": 4.9705181991264185e-06, + "loss": 0.5776, + "step": 2923 + }, + { + "epoch": 1.1556927636453445, + "grad_norm": 0.4608535112462551, + "learning_rate": 4.9704942042884256e-06, + "loss": 0.5665, + "step": 2924 + }, + { + "epoch": 1.1560879229439367, + "grad_norm": 0.45851740053649265, + "learning_rate": 4.970470199747831e-06, + "loss": 0.5699, + "step": 2925 + }, + { + "epoch": 1.156483082242529, + "grad_norm": 0.47180228641422883, + "learning_rate": 4.97044618550473e-06, + "loss": 0.572, + "step": 2926 + }, + { + "epoch": 1.1568782415411212, + "grad_norm": 0.47156814207993075, + "learning_rate": 4.970422161559217e-06, + "loss": 0.5437, + "step": 2927 + }, + { + "epoch": 1.1572734008397134, + "grad_norm": 0.4543173767246186, + "learning_rate": 4.970398127911386e-06, + "loss": 0.5796, + "step": 2928 + }, + { + "epoch": 1.1576685601383057, + "grad_norm": 0.4467556140355343, + "learning_rate": 4.97037408456133e-06, + "loss": 0.5739, + "step": 2929 + }, + { + "epoch": 1.158063719436898, + "grad_norm": 0.4572687150991262, + "learning_rate": 4.970350031509146e-06, + "loss": 0.5789, + "step": 2930 + }, + { + "epoch": 1.1584588787354901, + "grad_norm": 0.46752603817680133, + "learning_rate": 4.970325968754926e-06, + "loss": 0.5902, + "step": 2931 + }, + { + "epoch": 1.1588540380340824, + "grad_norm": 0.44878914889504196, + "learning_rate": 4.970301896298767e-06, + "loss": 0.5721, + "step": 2932 + }, + { + "epoch": 1.1592491973326746, + "grad_norm": 0.4485973988983018, + "learning_rate": 4.9702778141407615e-06, + "loss": 0.5553, + "step": 2933 + }, + { + "epoch": 1.1596443566312669, + "grad_norm": 0.46028448862594296, + "learning_rate": 4.970253722281006e-06, + "loss": 0.5714, + "step": 2934 + }, + { + "epoch": 1.160039515929859, + "grad_norm": 0.43839159073806405, + "learning_rate": 4.970229620719592e-06, + "loss": 0.5682, + "step": 2935 + }, + { + "epoch": 1.1604346752284516, + "grad_norm": 0.4502203330745019, + "learning_rate": 4.970205509456617e-06, + "loss": 0.5633, + "step": 2936 + }, + { + "epoch": 1.1608298345270438, + "grad_norm": 0.4577454214170737, + "learning_rate": 4.970181388492174e-06, + "loss": 0.56, + "step": 2937 + }, + { + "epoch": 1.161224993825636, + "grad_norm": 0.5137258014666423, + "learning_rate": 4.970157257826359e-06, + "loss": 0.5807, + "step": 2938 + }, + { + "epoch": 1.1616201531242283, + "grad_norm": 0.456240539508233, + "learning_rate": 4.970133117459266e-06, + "loss": 0.5698, + "step": 2939 + }, + { + "epoch": 1.1620153124228205, + "grad_norm": 0.4571385214359614, + "learning_rate": 4.9701089673909905e-06, + "loss": 0.5712, + "step": 2940 + }, + { + "epoch": 1.1624104717214128, + "grad_norm": 0.43698412892461164, + "learning_rate": 4.970084807621627e-06, + "loss": 0.5518, + "step": 2941 + }, + { + "epoch": 1.162805631020005, + "grad_norm": 0.4508512464344563, + "learning_rate": 4.97006063815127e-06, + "loss": 0.5543, + "step": 2942 + }, + { + "epoch": 1.1632007903185972, + "grad_norm": 0.444206994150076, + "learning_rate": 4.970036458980014e-06, + "loss": 0.5581, + "step": 2943 + }, + { + "epoch": 1.1635959496171895, + "grad_norm": 0.4524722456542353, + "learning_rate": 4.9700122701079566e-06, + "loss": 0.5542, + "step": 2944 + }, + { + "epoch": 1.1639911089157817, + "grad_norm": 0.44578046719303255, + "learning_rate": 4.969988071535189e-06, + "loss": 0.5725, + "step": 2945 + }, + { + "epoch": 1.164386268214374, + "grad_norm": 0.4549968477995691, + "learning_rate": 4.969963863261808e-06, + "loss": 0.5848, + "step": 2946 + }, + { + "epoch": 1.1647814275129662, + "grad_norm": 0.4552555332588111, + "learning_rate": 4.969939645287911e-06, + "loss": 0.557, + "step": 2947 + }, + { + "epoch": 1.1651765868115584, + "grad_norm": 0.4750098537013762, + "learning_rate": 4.969915417613589e-06, + "loss": 0.5864, + "step": 2948 + }, + { + "epoch": 1.1655717461101507, + "grad_norm": 0.45082337345793494, + "learning_rate": 4.96989118023894e-06, + "loss": 0.5866, + "step": 2949 + }, + { + "epoch": 1.165966905408743, + "grad_norm": 0.4455097203171396, + "learning_rate": 4.969866933164057e-06, + "loss": 0.5607, + "step": 2950 + }, + { + "epoch": 1.1663620647073352, + "grad_norm": 0.4489253199947083, + "learning_rate": 4.969842676389038e-06, + "loss": 0.5662, + "step": 2951 + }, + { + "epoch": 1.1667572240059274, + "grad_norm": 0.46293264922861316, + "learning_rate": 4.969818409913976e-06, + "loss": 0.5551, + "step": 2952 + }, + { + "epoch": 1.1671523833045196, + "grad_norm": 0.45718754430330005, + "learning_rate": 4.969794133738967e-06, + "loss": 0.5541, + "step": 2953 + }, + { + "epoch": 1.1675475426031119, + "grad_norm": 0.45394398829100463, + "learning_rate": 4.9697698478641056e-06, + "loss": 0.5543, + "step": 2954 + }, + { + "epoch": 1.1679427019017041, + "grad_norm": 0.4580575806358461, + "learning_rate": 4.969745552289489e-06, + "loss": 0.5648, + "step": 2955 + }, + { + "epoch": 1.1683378612002964, + "grad_norm": 0.45929415970435655, + "learning_rate": 4.969721247015212e-06, + "loss": 0.5802, + "step": 2956 + }, + { + "epoch": 1.1687330204988886, + "grad_norm": 0.43706127656723726, + "learning_rate": 4.969696932041369e-06, + "loss": 0.5658, + "step": 2957 + }, + { + "epoch": 1.1691281797974808, + "grad_norm": 0.45589757018205523, + "learning_rate": 4.969672607368056e-06, + "loss": 0.5451, + "step": 2958 + }, + { + "epoch": 1.169523339096073, + "grad_norm": 0.4453274113893662, + "learning_rate": 4.969648272995368e-06, + "loss": 0.5651, + "step": 2959 + }, + { + "epoch": 1.1699184983946653, + "grad_norm": 0.4422585180450584, + "learning_rate": 4.9696239289234025e-06, + "loss": 0.5513, + "step": 2960 + }, + { + "epoch": 1.1703136576932576, + "grad_norm": 0.4609612248644218, + "learning_rate": 4.969599575152253e-06, + "loss": 0.579, + "step": 2961 + }, + { + "epoch": 1.1707088169918498, + "grad_norm": 0.46053331483814797, + "learning_rate": 4.969575211682016e-06, + "loss": 0.5456, + "step": 2962 + }, + { + "epoch": 1.171103976290442, + "grad_norm": 0.46253809484884606, + "learning_rate": 4.969550838512787e-06, + "loss": 0.5738, + "step": 2963 + }, + { + "epoch": 1.1714991355890343, + "grad_norm": 0.45757469040052795, + "learning_rate": 4.969526455644664e-06, + "loss": 0.5835, + "step": 2964 + }, + { + "epoch": 1.1718942948876265, + "grad_norm": 0.4649235194612874, + "learning_rate": 4.969502063077738e-06, + "loss": 0.5656, + "step": 2965 + }, + { + "epoch": 1.1722894541862188, + "grad_norm": 0.4529442270814776, + "learning_rate": 4.969477660812108e-06, + "loss": 0.5475, + "step": 2966 + }, + { + "epoch": 1.172684613484811, + "grad_norm": 0.46009271161789567, + "learning_rate": 4.969453248847871e-06, + "loss": 0.5638, + "step": 2967 + }, + { + "epoch": 1.1730797727834033, + "grad_norm": 0.4543848817490557, + "learning_rate": 4.96942882718512e-06, + "loss": 0.556, + "step": 2968 + }, + { + "epoch": 1.1734749320819955, + "grad_norm": 0.4486250704854233, + "learning_rate": 4.9694043958239515e-06, + "loss": 0.5475, + "step": 2969 + }, + { + "epoch": 1.1738700913805877, + "grad_norm": 0.475212385037125, + "learning_rate": 4.969379954764463e-06, + "loss": 0.5694, + "step": 2970 + }, + { + "epoch": 1.17426525067918, + "grad_norm": 0.46354530802940175, + "learning_rate": 4.969355504006749e-06, + "loss": 0.5693, + "step": 2971 + }, + { + "epoch": 1.1746604099777722, + "grad_norm": 0.4654693704390786, + "learning_rate": 4.969331043550907e-06, + "loss": 0.5821, + "step": 2972 + }, + { + "epoch": 1.1750555692763645, + "grad_norm": 0.45975428248741407, + "learning_rate": 4.969306573397031e-06, + "loss": 0.5763, + "step": 2973 + }, + { + "epoch": 1.1754507285749567, + "grad_norm": 0.4567317408231308, + "learning_rate": 4.969282093545218e-06, + "loss": 0.5785, + "step": 2974 + }, + { + "epoch": 1.175845887873549, + "grad_norm": 0.44978056274129696, + "learning_rate": 4.969257603995566e-06, + "loss": 0.5624, + "step": 2975 + }, + { + "epoch": 1.1762410471721412, + "grad_norm": 0.4535109810307646, + "learning_rate": 4.969233104748168e-06, + "loss": 0.5568, + "step": 2976 + }, + { + "epoch": 1.1766362064707336, + "grad_norm": 0.512694154610389, + "learning_rate": 4.9692085958031225e-06, + "loss": 0.5537, + "step": 2977 + }, + { + "epoch": 1.1770313657693259, + "grad_norm": 0.4657562385746197, + "learning_rate": 4.969184077160524e-06, + "loss": 0.5754, + "step": 2978 + }, + { + "epoch": 1.1774265250679181, + "grad_norm": 0.4676379011730835, + "learning_rate": 4.96915954882047e-06, + "loss": 0.5763, + "step": 2979 + }, + { + "epoch": 1.1778216843665104, + "grad_norm": 0.44433392586627835, + "learning_rate": 4.9691350107830575e-06, + "loss": 0.5544, + "step": 2980 + }, + { + "epoch": 1.1782168436651026, + "grad_norm": 0.4655458760369557, + "learning_rate": 4.9691104630483825e-06, + "loss": 0.5511, + "step": 2981 + }, + { + "epoch": 1.1786120029636948, + "grad_norm": 0.4685666784171711, + "learning_rate": 4.96908590561654e-06, + "loss": 0.5601, + "step": 2982 + }, + { + "epoch": 1.179007162262287, + "grad_norm": 0.46307465210699217, + "learning_rate": 4.969061338487627e-06, + "loss": 0.5804, + "step": 2983 + }, + { + "epoch": 1.1794023215608793, + "grad_norm": 0.4751945587922622, + "learning_rate": 4.969036761661741e-06, + "loss": 0.5761, + "step": 2984 + }, + { + "epoch": 1.1797974808594716, + "grad_norm": 0.4354064786650696, + "learning_rate": 4.969012175138978e-06, + "loss": 0.5496, + "step": 2985 + }, + { + "epoch": 1.1801926401580638, + "grad_norm": 0.4410526844854051, + "learning_rate": 4.968987578919434e-06, + "loss": 0.574, + "step": 2986 + }, + { + "epoch": 1.180587799456656, + "grad_norm": 0.5073421248174037, + "learning_rate": 4.9689629730032065e-06, + "loss": 0.5713, + "step": 2987 + }, + { + "epoch": 1.1809829587552483, + "grad_norm": 0.45094193815320693, + "learning_rate": 4.968938357390391e-06, + "loss": 0.5618, + "step": 2988 + }, + { + "epoch": 1.1813781180538405, + "grad_norm": 0.45984337128911706, + "learning_rate": 4.968913732081085e-06, + "loss": 0.5518, + "step": 2989 + }, + { + "epoch": 1.1817732773524328, + "grad_norm": 0.44962428315636743, + "learning_rate": 4.968889097075386e-06, + "loss": 0.5603, + "step": 2990 + }, + { + "epoch": 1.182168436651025, + "grad_norm": 0.4791783985253456, + "learning_rate": 4.968864452373388e-06, + "loss": 0.5695, + "step": 2991 + }, + { + "epoch": 1.1825635959496172, + "grad_norm": 0.46504234897652474, + "learning_rate": 4.968839797975192e-06, + "loss": 0.57, + "step": 2992 + }, + { + "epoch": 1.1829587552482095, + "grad_norm": 0.46532115508012195, + "learning_rate": 4.96881513388089e-06, + "loss": 0.5848, + "step": 2993 + }, + { + "epoch": 1.1833539145468017, + "grad_norm": 0.4736857697301262, + "learning_rate": 4.968790460090584e-06, + "loss": 0.5536, + "step": 2994 + }, + { + "epoch": 1.183749073845394, + "grad_norm": 0.47327160279398156, + "learning_rate": 4.968765776604366e-06, + "loss": 0.59, + "step": 2995 + }, + { + "epoch": 1.1841442331439862, + "grad_norm": 0.44968273792335417, + "learning_rate": 4.968741083422335e-06, + "loss": 0.5602, + "step": 2996 + }, + { + "epoch": 1.1845393924425784, + "grad_norm": 0.47160397918194813, + "learning_rate": 4.968716380544589e-06, + "loss": 0.578, + "step": 2997 + }, + { + "epoch": 1.1849345517411707, + "grad_norm": 0.4674565580152445, + "learning_rate": 4.968691667971224e-06, + "loss": 0.565, + "step": 2998 + }, + { + "epoch": 1.185329711039763, + "grad_norm": 0.45935108094182536, + "learning_rate": 4.968666945702338e-06, + "loss": 0.5511, + "step": 2999 + }, + { + "epoch": 1.1857248703383552, + "grad_norm": 0.4551811442826113, + "learning_rate": 4.9686422137380265e-06, + "loss": 0.5646, + "step": 3000 + }, + { + "epoch": 1.1861200296369474, + "grad_norm": 0.45211507022800734, + "learning_rate": 4.968617472078388e-06, + "loss": 0.5573, + "step": 3001 + }, + { + "epoch": 1.1865151889355396, + "grad_norm": 0.451017086480957, + "learning_rate": 4.968592720723518e-06, + "loss": 0.5851, + "step": 3002 + }, + { + "epoch": 1.1869103482341319, + "grad_norm": 0.45071015068421694, + "learning_rate": 4.968567959673515e-06, + "loss": 0.5484, + "step": 3003 + }, + { + "epoch": 1.1873055075327241, + "grad_norm": 0.47688770951933446, + "learning_rate": 4.968543188928476e-06, + "loss": 0.577, + "step": 3004 + }, + { + "epoch": 1.1877006668313164, + "grad_norm": 0.43827997699657956, + "learning_rate": 4.9685184084885e-06, + "loss": 0.5566, + "step": 3005 + }, + { + "epoch": 1.1880958261299086, + "grad_norm": 0.4566633468170921, + "learning_rate": 4.968493618353681e-06, + "loss": 0.5539, + "step": 3006 + }, + { + "epoch": 1.1884909854285008, + "grad_norm": 0.46120055132005444, + "learning_rate": 4.968468818524118e-06, + "loss": 0.5704, + "step": 3007 + }, + { + "epoch": 1.188886144727093, + "grad_norm": 0.44778171241028825, + "learning_rate": 4.968444008999909e-06, + "loss": 0.5855, + "step": 3008 + }, + { + "epoch": 1.1892813040256853, + "grad_norm": 0.4750224432409252, + "learning_rate": 4.9684191897811505e-06, + "loss": 0.5565, + "step": 3009 + }, + { + "epoch": 1.1896764633242776, + "grad_norm": 0.4458948426804506, + "learning_rate": 4.968394360867941e-06, + "loss": 0.5585, + "step": 3010 + }, + { + "epoch": 1.1900716226228698, + "grad_norm": 0.46227629169335344, + "learning_rate": 4.968369522260377e-06, + "loss": 0.5754, + "step": 3011 + }, + { + "epoch": 1.190466781921462, + "grad_norm": 0.46386108301772255, + "learning_rate": 4.968344673958556e-06, + "loss": 0.5743, + "step": 3012 + }, + { + "epoch": 1.1908619412200543, + "grad_norm": 0.45605726481124903, + "learning_rate": 4.968319815962577e-06, + "loss": 0.5698, + "step": 3013 + }, + { + "epoch": 1.1912571005186465, + "grad_norm": 0.4430878703628007, + "learning_rate": 4.968294948272535e-06, + "loss": 0.5432, + "step": 3014 + }, + { + "epoch": 1.1916522598172388, + "grad_norm": 0.45206251975568795, + "learning_rate": 4.96827007088853e-06, + "loss": 0.571, + "step": 3015 + }, + { + "epoch": 1.192047419115831, + "grad_norm": 0.4552471461255095, + "learning_rate": 4.968245183810659e-06, + "loss": 0.5691, + "step": 3016 + }, + { + "epoch": 1.1924425784144232, + "grad_norm": 0.4432473462755384, + "learning_rate": 4.968220287039021e-06, + "loss": 0.5401, + "step": 3017 + }, + { + "epoch": 1.1928377377130155, + "grad_norm": 0.44587530242670237, + "learning_rate": 4.9681953805737106e-06, + "loss": 0.5461, + "step": 3018 + }, + { + "epoch": 1.1932328970116077, + "grad_norm": 0.47050677473839214, + "learning_rate": 4.968170464414828e-06, + "loss": 0.563, + "step": 3019 + }, + { + "epoch": 1.1936280563102, + "grad_norm": 0.44130303947137217, + "learning_rate": 4.968145538562471e-06, + "loss": 0.5693, + "step": 3020 + }, + { + "epoch": 1.1940232156087922, + "grad_norm": 0.4544886703368673, + "learning_rate": 4.968120603016737e-06, + "loss": 0.5653, + "step": 3021 + }, + { + "epoch": 1.1944183749073845, + "grad_norm": 0.4896624605231788, + "learning_rate": 4.968095657777724e-06, + "loss": 0.5822, + "step": 3022 + }, + { + "epoch": 1.1948135342059767, + "grad_norm": 0.4703893538726537, + "learning_rate": 4.968070702845529e-06, + "loss": 0.5384, + "step": 3023 + }, + { + "epoch": 1.195208693504569, + "grad_norm": 0.5189597430778528, + "learning_rate": 4.968045738220252e-06, + "loss": 0.578, + "step": 3024 + }, + { + "epoch": 1.1956038528031612, + "grad_norm": 0.44433555063115776, + "learning_rate": 4.96802076390199e-06, + "loss": 0.5672, + "step": 3025 + }, + { + "epoch": 1.1959990121017534, + "grad_norm": 0.44690653875268826, + "learning_rate": 4.96799577989084e-06, + "loss": 0.5483, + "step": 3026 + }, + { + "epoch": 1.1963941714003457, + "grad_norm": 0.4456693441690437, + "learning_rate": 4.967970786186903e-06, + "loss": 0.5917, + "step": 3027 + }, + { + "epoch": 1.196789330698938, + "grad_norm": 0.4656615654209235, + "learning_rate": 4.967945782790275e-06, + "loss": 0.6074, + "step": 3028 + }, + { + "epoch": 1.1971844899975301, + "grad_norm": 0.4679283421472829, + "learning_rate": 4.967920769701053e-06, + "loss": 0.5562, + "step": 3029 + }, + { + "epoch": 1.1975796492961226, + "grad_norm": 0.4599878221931696, + "learning_rate": 4.967895746919339e-06, + "loss": 0.5898, + "step": 3030 + }, + { + "epoch": 1.1979748085947148, + "grad_norm": 0.44364183466084445, + "learning_rate": 4.967870714445227e-06, + "loss": 0.5518, + "step": 3031 + }, + { + "epoch": 1.198369967893307, + "grad_norm": 0.45964094956857143, + "learning_rate": 4.967845672278819e-06, + "loss": 0.5554, + "step": 3032 + }, + { + "epoch": 1.1987651271918993, + "grad_norm": 0.45133637183308356, + "learning_rate": 4.967820620420211e-06, + "loss": 0.5651, + "step": 3033 + }, + { + "epoch": 1.1991602864904916, + "grad_norm": 0.4427547623086265, + "learning_rate": 4.9677955588695025e-06, + "loss": 0.5748, + "step": 3034 + }, + { + "epoch": 1.1995554457890838, + "grad_norm": 0.6682331551539357, + "learning_rate": 4.967770487626791e-06, + "loss": 0.5567, + "step": 3035 + }, + { + "epoch": 1.199950605087676, + "grad_norm": 0.47306363628338366, + "learning_rate": 4.967745406692176e-06, + "loss": 0.5675, + "step": 3036 + }, + { + "epoch": 1.2003457643862683, + "grad_norm": 0.48270419923550345, + "learning_rate": 4.967720316065756e-06, + "loss": 0.5661, + "step": 3037 + }, + { + "epoch": 1.2007409236848605, + "grad_norm": 0.4423873238428253, + "learning_rate": 4.9676952157476285e-06, + "loss": 0.5681, + "step": 3038 + }, + { + "epoch": 1.2011360829834528, + "grad_norm": 0.4487466875408833, + "learning_rate": 4.967670105737892e-06, + "loss": 0.5489, + "step": 3039 + }, + { + "epoch": 1.201531242282045, + "grad_norm": 0.5042106639999401, + "learning_rate": 4.967644986036647e-06, + "loss": 0.5611, + "step": 3040 + }, + { + "epoch": 1.2019264015806372, + "grad_norm": 0.4759632066357272, + "learning_rate": 4.96761985664399e-06, + "loss": 0.5622, + "step": 3041 + }, + { + "epoch": 1.2023215608792295, + "grad_norm": 0.47451217869994206, + "learning_rate": 4.967594717560022e-06, + "loss": 0.5804, + "step": 3042 + }, + { + "epoch": 1.2027167201778217, + "grad_norm": 0.4593740341418387, + "learning_rate": 4.967569568784839e-06, + "loss": 0.5563, + "step": 3043 + }, + { + "epoch": 1.203111879476414, + "grad_norm": 0.4790716838554254, + "learning_rate": 4.967544410318541e-06, + "loss": 0.5562, + "step": 3044 + }, + { + "epoch": 1.2035070387750062, + "grad_norm": 0.44309498656578405, + "learning_rate": 4.967519242161227e-06, + "loss": 0.5516, + "step": 3045 + }, + { + "epoch": 1.2039021980735984, + "grad_norm": 0.4895189026135667, + "learning_rate": 4.967494064312996e-06, + "loss": 0.5667, + "step": 3046 + }, + { + "epoch": 1.2042973573721907, + "grad_norm": 0.4561602202621074, + "learning_rate": 4.967468876773948e-06, + "loss": 0.5629, + "step": 3047 + }, + { + "epoch": 1.204692516670783, + "grad_norm": 0.4697256795259107, + "learning_rate": 4.9674436795441795e-06, + "loss": 0.5786, + "step": 3048 + }, + { + "epoch": 1.2050876759693752, + "grad_norm": 0.5259340703235517, + "learning_rate": 4.96741847262379e-06, + "loss": 0.5583, + "step": 3049 + }, + { + "epoch": 1.2054828352679674, + "grad_norm": 0.4894487391041935, + "learning_rate": 4.967393256012879e-06, + "loss": 0.5904, + "step": 3050 + }, + { + "epoch": 1.2058779945665596, + "grad_norm": 0.46711138204567687, + "learning_rate": 4.967368029711547e-06, + "loss": 0.5702, + "step": 3051 + }, + { + "epoch": 1.2062731538651519, + "grad_norm": 0.4888902860202162, + "learning_rate": 4.96734279371989e-06, + "loss": 0.5771, + "step": 3052 + }, + { + "epoch": 1.2066683131637441, + "grad_norm": 0.5333072923347787, + "learning_rate": 4.96731754803801e-06, + "loss": 0.561, + "step": 3053 + }, + { + "epoch": 1.2070634724623364, + "grad_norm": 0.46654604201036526, + "learning_rate": 4.967292292666004e-06, + "loss": 0.5799, + "step": 3054 + }, + { + "epoch": 1.2074586317609286, + "grad_norm": 0.45887016609605846, + "learning_rate": 4.967267027603972e-06, + "loss": 0.5672, + "step": 3055 + }, + { + "epoch": 1.2078537910595208, + "grad_norm": 0.477396686518973, + "learning_rate": 4.967241752852015e-06, + "loss": 0.5846, + "step": 3056 + }, + { + "epoch": 1.208248950358113, + "grad_norm": 0.4613218129856247, + "learning_rate": 4.967216468410229e-06, + "loss": 0.5661, + "step": 3057 + }, + { + "epoch": 1.2086441096567053, + "grad_norm": 0.5240029995870954, + "learning_rate": 4.9671911742787145e-06, + "loss": 0.5709, + "step": 3058 + }, + { + "epoch": 1.2090392689552976, + "grad_norm": 0.4363976879978063, + "learning_rate": 4.967165870457573e-06, + "loss": 0.5458, + "step": 3059 + }, + { + "epoch": 1.2094344282538898, + "grad_norm": 0.43183191736087323, + "learning_rate": 4.9671405569469e-06, + "loss": 0.5615, + "step": 3060 + }, + { + "epoch": 1.209829587552482, + "grad_norm": 0.48194750159483546, + "learning_rate": 4.967115233746798e-06, + "loss": 0.5692, + "step": 3061 + }, + { + "epoch": 1.2102247468510743, + "grad_norm": 0.4403230569615395, + "learning_rate": 4.967089900857366e-06, + "loss": 0.5541, + "step": 3062 + }, + { + "epoch": 1.2106199061496665, + "grad_norm": 0.4521141643968015, + "learning_rate": 4.9670645582787025e-06, + "loss": 0.5683, + "step": 3063 + }, + { + "epoch": 1.2110150654482588, + "grad_norm": 0.4436203962884348, + "learning_rate": 4.967039206010908e-06, + "loss": 0.5293, + "step": 3064 + }, + { + "epoch": 1.211410224746851, + "grad_norm": 0.4412702021651393, + "learning_rate": 4.967013844054081e-06, + "loss": 0.5575, + "step": 3065 + }, + { + "epoch": 1.2118053840454432, + "grad_norm": 0.44313069086594686, + "learning_rate": 4.966988472408322e-06, + "loss": 0.5701, + "step": 3066 + }, + { + "epoch": 1.2122005433440355, + "grad_norm": 0.45845037915893444, + "learning_rate": 4.96696309107373e-06, + "loss": 0.5817, + "step": 3067 + }, + { + "epoch": 1.2125957026426277, + "grad_norm": 0.4574367758060889, + "learning_rate": 4.966937700050405e-06, + "loss": 0.5705, + "step": 3068 + }, + { + "epoch": 1.21299086194122, + "grad_norm": 0.4471000171164327, + "learning_rate": 4.966912299338447e-06, + "loss": 0.5413, + "step": 3069 + }, + { + "epoch": 1.2133860212398122, + "grad_norm": 0.45685994308151356, + "learning_rate": 4.966886888937955e-06, + "loss": 0.5688, + "step": 3070 + }, + { + "epoch": 1.2137811805384044, + "grad_norm": 0.4397052448052631, + "learning_rate": 4.96686146884903e-06, + "loss": 0.5555, + "step": 3071 + }, + { + "epoch": 1.214176339836997, + "grad_norm": 0.4522674726127247, + "learning_rate": 4.96683603907177e-06, + "loss": 0.5723, + "step": 3072 + }, + { + "epoch": 1.2145714991355891, + "grad_norm": 0.46257064891670174, + "learning_rate": 4.966810599606277e-06, + "loss": 0.5795, + "step": 3073 + }, + { + "epoch": 1.2149666584341814, + "grad_norm": 0.4638428914792885, + "learning_rate": 4.9667851504526495e-06, + "loss": 0.5765, + "step": 3074 + }, + { + "epoch": 1.2153618177327736, + "grad_norm": 0.4633324633840189, + "learning_rate": 4.966759691610989e-06, + "loss": 0.5901, + "step": 3075 + }, + { + "epoch": 1.2157569770313659, + "grad_norm": 0.4694538016404579, + "learning_rate": 4.966734223081392e-06, + "loss": 0.5755, + "step": 3076 + }, + { + "epoch": 1.2161521363299581, + "grad_norm": 0.44870540089281646, + "learning_rate": 4.966708744863962e-06, + "loss": 0.5664, + "step": 3077 + }, + { + "epoch": 1.2165472956285504, + "grad_norm": 0.4633771864468751, + "learning_rate": 4.966683256958799e-06, + "loss": 0.5683, + "step": 3078 + }, + { + "epoch": 1.2169424549271426, + "grad_norm": 0.4746476377257439, + "learning_rate": 4.966657759366e-06, + "loss": 0.5647, + "step": 3079 + }, + { + "epoch": 1.2173376142257348, + "grad_norm": 0.45147878211938924, + "learning_rate": 4.966632252085669e-06, + "loss": 0.5752, + "step": 3080 + }, + { + "epoch": 1.217732773524327, + "grad_norm": 0.44156118894699065, + "learning_rate": 4.966606735117902e-06, + "loss": 0.5563, + "step": 3081 + }, + { + "epoch": 1.2181279328229193, + "grad_norm": 0.45196789781220703, + "learning_rate": 4.966581208462804e-06, + "loss": 0.5475, + "step": 3082 + }, + { + "epoch": 1.2185230921215116, + "grad_norm": 0.4442256161242522, + "learning_rate": 4.966555672120472e-06, + "loss": 0.5695, + "step": 3083 + }, + { + "epoch": 1.2189182514201038, + "grad_norm": 0.44730065552547726, + "learning_rate": 4.966530126091007e-06, + "loss": 0.5599, + "step": 3084 + }, + { + "epoch": 1.219313410718696, + "grad_norm": 0.4544579855898216, + "learning_rate": 4.966504570374509e-06, + "loss": 0.5451, + "step": 3085 + }, + { + "epoch": 1.2197085700172883, + "grad_norm": 0.45542641771326065, + "learning_rate": 4.9664790049710795e-06, + "loss": 0.5691, + "step": 3086 + }, + { + "epoch": 1.2201037293158805, + "grad_norm": 0.46855478341533124, + "learning_rate": 4.966453429880818e-06, + "loss": 0.5757, + "step": 3087 + }, + { + "epoch": 1.2204988886144728, + "grad_norm": 0.43240914194877017, + "learning_rate": 4.966427845103825e-06, + "loss": 0.5719, + "step": 3088 + }, + { + "epoch": 1.220894047913065, + "grad_norm": 0.4182707219417738, + "learning_rate": 4.966402250640201e-06, + "loss": 0.5522, + "step": 3089 + }, + { + "epoch": 1.2212892072116572, + "grad_norm": 0.44846470069043676, + "learning_rate": 4.9663766464900465e-06, + "loss": 0.5701, + "step": 3090 + }, + { + "epoch": 1.2216843665102495, + "grad_norm": 0.48379385795172264, + "learning_rate": 4.966351032653463e-06, + "loss": 0.5632, + "step": 3091 + }, + { + "epoch": 1.2220795258088417, + "grad_norm": 0.4608900432943671, + "learning_rate": 4.966325409130549e-06, + "loss": 0.5672, + "step": 3092 + }, + { + "epoch": 1.222474685107434, + "grad_norm": 0.465833480879931, + "learning_rate": 4.9662997759214074e-06, + "loss": 0.5651, + "step": 3093 + }, + { + "epoch": 1.2228698444060262, + "grad_norm": 0.45118044139514435, + "learning_rate": 4.966274133026138e-06, + "loss": 0.573, + "step": 3094 + }, + { + "epoch": 1.2232650037046184, + "grad_norm": 0.4421531493404346, + "learning_rate": 4.966248480444841e-06, + "loss": 0.5657, + "step": 3095 + }, + { + "epoch": 1.2236601630032107, + "grad_norm": 0.44492466262445596, + "learning_rate": 4.966222818177617e-06, + "loss": 0.5542, + "step": 3096 + }, + { + "epoch": 1.224055322301803, + "grad_norm": 0.4389512642580722, + "learning_rate": 4.966197146224568e-06, + "loss": 0.5646, + "step": 3097 + }, + { + "epoch": 1.2244504816003952, + "grad_norm": 0.44609105957073, + "learning_rate": 4.966171464585794e-06, + "loss": 0.5512, + "step": 3098 + }, + { + "epoch": 1.2248456408989874, + "grad_norm": 0.45028716091193877, + "learning_rate": 4.966145773261396e-06, + "loss": 0.5528, + "step": 3099 + }, + { + "epoch": 1.2252408001975796, + "grad_norm": 0.4430745962507206, + "learning_rate": 4.966120072251475e-06, + "loss": 0.5486, + "step": 3100 + }, + { + "epoch": 1.2256359594961719, + "grad_norm": 0.4565725474643139, + "learning_rate": 4.966094361556132e-06, + "loss": 0.5637, + "step": 3101 + }, + { + "epoch": 1.2260311187947641, + "grad_norm": 0.4603270298388148, + "learning_rate": 4.966068641175469e-06, + "loss": 0.5742, + "step": 3102 + }, + { + "epoch": 1.2264262780933564, + "grad_norm": 0.4413064560746586, + "learning_rate": 4.966042911109584e-06, + "loss": 0.5513, + "step": 3103 + }, + { + "epoch": 1.2268214373919486, + "grad_norm": 0.45702241651020653, + "learning_rate": 4.9660171713585805e-06, + "loss": 0.5678, + "step": 3104 + }, + { + "epoch": 1.2272165966905408, + "grad_norm": 0.47067105579936214, + "learning_rate": 4.965991421922559e-06, + "loss": 0.5632, + "step": 3105 + }, + { + "epoch": 1.227611755989133, + "grad_norm": 0.5372601558221561, + "learning_rate": 4.965965662801621e-06, + "loss": 0.5575, + "step": 3106 + }, + { + "epoch": 1.2280069152877253, + "grad_norm": 0.48016720590726975, + "learning_rate": 4.965939893995867e-06, + "loss": 0.574, + "step": 3107 + }, + { + "epoch": 1.2284020745863176, + "grad_norm": 0.43658120984806364, + "learning_rate": 4.965914115505398e-06, + "loss": 0.5547, + "step": 3108 + }, + { + "epoch": 1.2287972338849098, + "grad_norm": 0.4464718194752851, + "learning_rate": 4.965888327330316e-06, + "loss": 0.5738, + "step": 3109 + }, + { + "epoch": 1.229192393183502, + "grad_norm": 0.44348094873374344, + "learning_rate": 4.9658625294707226e-06, + "loss": 0.5558, + "step": 3110 + }, + { + "epoch": 1.2295875524820943, + "grad_norm": 0.45414022219810424, + "learning_rate": 4.965836721926718e-06, + "loss": 0.5591, + "step": 3111 + }, + { + "epoch": 1.2299827117806865, + "grad_norm": 0.45465803398220034, + "learning_rate": 4.965810904698404e-06, + "loss": 0.5784, + "step": 3112 + }, + { + "epoch": 1.2303778710792788, + "grad_norm": 0.42507296664604205, + "learning_rate": 4.965785077785882e-06, + "loss": 0.5589, + "step": 3113 + }, + { + "epoch": 1.230773030377871, + "grad_norm": 0.4451171220895154, + "learning_rate": 4.965759241189254e-06, + "loss": 0.5653, + "step": 3114 + }, + { + "epoch": 1.2311681896764632, + "grad_norm": 0.44320611707236784, + "learning_rate": 4.965733394908621e-06, + "loss": 0.5764, + "step": 3115 + }, + { + "epoch": 1.2315633489750555, + "grad_norm": 0.4467513021648378, + "learning_rate": 4.965707538944085e-06, + "loss": 0.5681, + "step": 3116 + }, + { + "epoch": 1.2319585082736477, + "grad_norm": 0.4962560415068218, + "learning_rate": 4.9656816732957454e-06, + "loss": 0.5772, + "step": 3117 + }, + { + "epoch": 1.23235366757224, + "grad_norm": 0.4393887859931017, + "learning_rate": 4.965655797963707e-06, + "loss": 0.5647, + "step": 3118 + }, + { + "epoch": 1.2327488268708322, + "grad_norm": 0.464778893222507, + "learning_rate": 4.965629912948069e-06, + "loss": 0.5851, + "step": 3119 + }, + { + "epoch": 1.2331439861694244, + "grad_norm": 0.4525476192977244, + "learning_rate": 4.965604018248934e-06, + "loss": 0.578, + "step": 3120 + }, + { + "epoch": 1.2335391454680167, + "grad_norm": 0.461394081784411, + "learning_rate": 4.965578113866404e-06, + "loss": 0.561, + "step": 3121 + }, + { + "epoch": 1.233934304766609, + "grad_norm": 0.43936664461165786, + "learning_rate": 4.96555219980058e-06, + "loss": 0.5476, + "step": 3122 + }, + { + "epoch": 1.2343294640652012, + "grad_norm": 0.4509074107094845, + "learning_rate": 4.965526276051564e-06, + "loss": 0.567, + "step": 3123 + }, + { + "epoch": 1.2347246233637934, + "grad_norm": 0.45459696038357283, + "learning_rate": 4.965500342619458e-06, + "loss": 0.5763, + "step": 3124 + }, + { + "epoch": 1.2351197826623859, + "grad_norm": 0.44821748870137185, + "learning_rate": 4.965474399504364e-06, + "loss": 0.5668, + "step": 3125 + }, + { + "epoch": 1.235514941960978, + "grad_norm": 0.46197320609039444, + "learning_rate": 4.965448446706384e-06, + "loss": 0.5682, + "step": 3126 + }, + { + "epoch": 1.2359101012595703, + "grad_norm": 0.4396093700838116, + "learning_rate": 4.96542248422562e-06, + "loss": 0.5632, + "step": 3127 + }, + { + "epoch": 1.2363052605581626, + "grad_norm": 0.4308392401888905, + "learning_rate": 4.965396512062171e-06, + "loss": 0.5433, + "step": 3128 + }, + { + "epoch": 1.2367004198567548, + "grad_norm": 0.462787889383493, + "learning_rate": 4.9653705302161446e-06, + "loss": 0.5841, + "step": 3129 + }, + { + "epoch": 1.237095579155347, + "grad_norm": 0.43840660693999445, + "learning_rate": 4.965344538687638e-06, + "loss": 0.5452, + "step": 3130 + }, + { + "epoch": 1.2374907384539393, + "grad_norm": 0.4602319352251958, + "learning_rate": 4.965318537476756e-06, + "loss": 0.5811, + "step": 3131 + }, + { + "epoch": 1.2378858977525315, + "grad_norm": 0.45813906029186197, + "learning_rate": 4.9652925265836e-06, + "loss": 0.5694, + "step": 3132 + }, + { + "epoch": 1.2382810570511238, + "grad_norm": 0.4492594870763079, + "learning_rate": 4.965266506008271e-06, + "loss": 0.5884, + "step": 3133 + }, + { + "epoch": 1.238676216349716, + "grad_norm": 0.4627556622877327, + "learning_rate": 4.9652404757508726e-06, + "loss": 0.571, + "step": 3134 + }, + { + "epoch": 1.2390713756483083, + "grad_norm": 0.4629993337469515, + "learning_rate": 4.965214435811506e-06, + "loss": 0.5669, + "step": 3135 + }, + { + "epoch": 1.2394665349469005, + "grad_norm": 0.46398715244786704, + "learning_rate": 4.965188386190275e-06, + "loss": 0.5735, + "step": 3136 + }, + { + "epoch": 1.2398616942454928, + "grad_norm": 0.44440876198440876, + "learning_rate": 4.965162326887281e-06, + "loss": 0.5669, + "step": 3137 + }, + { + "epoch": 1.240256853544085, + "grad_norm": 0.49597914064913107, + "learning_rate": 4.965136257902626e-06, + "loss": 0.5807, + "step": 3138 + }, + { + "epoch": 1.2406520128426772, + "grad_norm": 0.44555439287267584, + "learning_rate": 4.965110179236412e-06, + "loss": 0.5439, + "step": 3139 + }, + { + "epoch": 1.2410471721412695, + "grad_norm": 0.45626121734971314, + "learning_rate": 4.965084090888743e-06, + "loss": 0.5689, + "step": 3140 + }, + { + "epoch": 1.2414423314398617, + "grad_norm": 0.4363299527581059, + "learning_rate": 4.96505799285972e-06, + "loss": 0.5476, + "step": 3141 + }, + { + "epoch": 1.241837490738454, + "grad_norm": 0.4671748329287142, + "learning_rate": 4.9650318851494465e-06, + "loss": 0.5817, + "step": 3142 + }, + { + "epoch": 1.2422326500370462, + "grad_norm": 0.472754493128498, + "learning_rate": 4.965005767758024e-06, + "loss": 0.5701, + "step": 3143 + }, + { + "epoch": 1.2426278093356384, + "grad_norm": 0.4760983983168848, + "learning_rate": 4.964979640685557e-06, + "loss": 0.5804, + "step": 3144 + }, + { + "epoch": 1.2430229686342307, + "grad_norm": 0.4717372783720608, + "learning_rate": 4.964953503932146e-06, + "loss": 0.5867, + "step": 3145 + }, + { + "epoch": 1.243418127932823, + "grad_norm": 0.45391804628270654, + "learning_rate": 4.964927357497894e-06, + "loss": 0.5214, + "step": 3146 + }, + { + "epoch": 1.2438132872314152, + "grad_norm": 0.4407103558661319, + "learning_rate": 4.964901201382905e-06, + "loss": 0.572, + "step": 3147 + }, + { + "epoch": 1.2442084465300074, + "grad_norm": 0.45045690316166226, + "learning_rate": 4.96487503558728e-06, + "loss": 0.5663, + "step": 3148 + }, + { + "epoch": 1.2446036058285996, + "grad_norm": 0.4434691882497208, + "learning_rate": 4.964848860111122e-06, + "loss": 0.5473, + "step": 3149 + }, + { + "epoch": 1.2449987651271919, + "grad_norm": 0.4460639321688961, + "learning_rate": 4.964822674954536e-06, + "loss": 0.5719, + "step": 3150 + }, + { + "epoch": 1.2453939244257841, + "grad_norm": 0.4709745072765176, + "learning_rate": 4.964796480117623e-06, + "loss": 0.5756, + "step": 3151 + }, + { + "epoch": 1.2457890837243764, + "grad_norm": 0.47049229358105993, + "learning_rate": 4.9647702756004855e-06, + "loss": 0.5714, + "step": 3152 + }, + { + "epoch": 1.2461842430229686, + "grad_norm": 0.4649418473083078, + "learning_rate": 4.964744061403227e-06, + "loss": 0.5749, + "step": 3153 + }, + { + "epoch": 1.2465794023215608, + "grad_norm": 0.47164232942914297, + "learning_rate": 4.964717837525951e-06, + "loss": 0.5735, + "step": 3154 + }, + { + "epoch": 1.246974561620153, + "grad_norm": 0.4529213434049233, + "learning_rate": 4.9646916039687594e-06, + "loss": 0.5836, + "step": 3155 + }, + { + "epoch": 1.2473697209187453, + "grad_norm": 0.45289976023026995, + "learning_rate": 4.964665360731757e-06, + "loss": 0.5695, + "step": 3156 + }, + { + "epoch": 1.2477648802173376, + "grad_norm": 0.4780170631528699, + "learning_rate": 4.964639107815044e-06, + "loss": 0.5602, + "step": 3157 + }, + { + "epoch": 1.2481600395159298, + "grad_norm": 0.44393148384313114, + "learning_rate": 4.964612845218726e-06, + "loss": 0.5618, + "step": 3158 + }, + { + "epoch": 1.248555198814522, + "grad_norm": 0.4486878703274451, + "learning_rate": 4.964586572942905e-06, + "loss": 0.5715, + "step": 3159 + }, + { + "epoch": 1.2489503581131143, + "grad_norm": 0.45262089196133376, + "learning_rate": 4.964560290987686e-06, + "loss": 0.5538, + "step": 3160 + }, + { + "epoch": 1.2493455174117065, + "grad_norm": 0.46646019654372667, + "learning_rate": 4.964533999353169e-06, + "loss": 0.5561, + "step": 3161 + }, + { + "epoch": 1.2497406767102988, + "grad_norm": 0.46395577187095943, + "learning_rate": 4.96450769803946e-06, + "loss": 0.5703, + "step": 3162 + }, + { + "epoch": 1.2501358360088912, + "grad_norm": 0.43225782469002116, + "learning_rate": 4.9644813870466605e-06, + "loss": 0.5475, + "step": 3163 + }, + { + "epoch": 1.2505309953074835, + "grad_norm": 0.4477412714097664, + "learning_rate": 4.9644550663748755e-06, + "loss": 0.588, + "step": 3164 + }, + { + "epoch": 1.2509261546060757, + "grad_norm": 0.4488220491257586, + "learning_rate": 4.964428736024207e-06, + "loss": 0.5456, + "step": 3165 + }, + { + "epoch": 1.251321313904668, + "grad_norm": 0.4591216638115739, + "learning_rate": 4.964402395994759e-06, + "loss": 0.5962, + "step": 3166 + }, + { + "epoch": 1.2517164732032602, + "grad_norm": 0.4496651980462876, + "learning_rate": 4.964376046286635e-06, + "loss": 0.5773, + "step": 3167 + }, + { + "epoch": 1.2521116325018524, + "grad_norm": 0.43068834285244056, + "learning_rate": 4.964349686899938e-06, + "loss": 0.5649, + "step": 3168 + }, + { + "epoch": 1.2525067918004447, + "grad_norm": 0.43656858666444814, + "learning_rate": 4.964323317834772e-06, + "loss": 0.5606, + "step": 3169 + }, + { + "epoch": 1.252901951099037, + "grad_norm": 0.4465521206805361, + "learning_rate": 4.96429693909124e-06, + "loss": 0.5581, + "step": 3170 + }, + { + "epoch": 1.2532971103976291, + "grad_norm": 0.4356583772476069, + "learning_rate": 4.964270550669447e-06, + "loss": 0.554, + "step": 3171 + }, + { + "epoch": 1.2536922696962214, + "grad_norm": 0.4538626705056506, + "learning_rate": 4.964244152569495e-06, + "loss": 0.5724, + "step": 3172 + }, + { + "epoch": 1.2540874289948136, + "grad_norm": 0.44499801851560644, + "learning_rate": 4.964217744791489e-06, + "loss": 0.5675, + "step": 3173 + }, + { + "epoch": 1.2544825882934059, + "grad_norm": 0.44634942356882007, + "learning_rate": 4.964191327335531e-06, + "loss": 0.5507, + "step": 3174 + }, + { + "epoch": 1.254877747591998, + "grad_norm": 0.47867361712053463, + "learning_rate": 4.964164900201726e-06, + "loss": 0.5587, + "step": 3175 + }, + { + "epoch": 1.2552729068905903, + "grad_norm": 0.4555734443969186, + "learning_rate": 4.964138463390178e-06, + "loss": 0.5646, + "step": 3176 + }, + { + "epoch": 1.2556680661891826, + "grad_norm": 0.4584704800532935, + "learning_rate": 4.964112016900991e-06, + "loss": 0.5517, + "step": 3177 + }, + { + "epoch": 1.2560632254877748, + "grad_norm": 0.45549434221447166, + "learning_rate": 4.964085560734267e-06, + "loss": 0.5771, + "step": 3178 + }, + { + "epoch": 1.256458384786367, + "grad_norm": 0.44914572871190456, + "learning_rate": 4.964059094890112e-06, + "loss": 0.5617, + "step": 3179 + }, + { + "epoch": 1.2568535440849593, + "grad_norm": 0.4512882813462357, + "learning_rate": 4.964032619368629e-06, + "loss": 0.5723, + "step": 3180 + }, + { + "epoch": 1.2572487033835515, + "grad_norm": 0.4509607204943829, + "learning_rate": 4.964006134169922e-06, + "loss": 0.5643, + "step": 3181 + }, + { + "epoch": 1.2576438626821438, + "grad_norm": 0.45921297434875, + "learning_rate": 4.9639796392940955e-06, + "loss": 0.5844, + "step": 3182 + }, + { + "epoch": 1.258039021980736, + "grad_norm": 0.48942904209316174, + "learning_rate": 4.963953134741253e-06, + "loss": 0.5802, + "step": 3183 + }, + { + "epoch": 1.2584341812793283, + "grad_norm": 0.45854271793023527, + "learning_rate": 4.963926620511497e-06, + "loss": 0.5815, + "step": 3184 + }, + { + "epoch": 1.2588293405779205, + "grad_norm": 0.4560649261852496, + "learning_rate": 4.963900096604936e-06, + "loss": 0.5534, + "step": 3185 + }, + { + "epoch": 1.2592244998765127, + "grad_norm": 0.4518649540196938, + "learning_rate": 4.9638735630216704e-06, + "loss": 0.5754, + "step": 3186 + }, + { + "epoch": 1.259619659175105, + "grad_norm": 0.6754753158823785, + "learning_rate": 4.963847019761806e-06, + "loss": 0.5704, + "step": 3187 + }, + { + "epoch": 1.2600148184736972, + "grad_norm": 0.452944137497009, + "learning_rate": 4.9638204668254465e-06, + "loss": 0.5644, + "step": 3188 + }, + { + "epoch": 1.2604099777722895, + "grad_norm": 0.44138818835118054, + "learning_rate": 4.9637939042126965e-06, + "loss": 0.5624, + "step": 3189 + }, + { + "epoch": 1.2608051370708817, + "grad_norm": 0.467795887447353, + "learning_rate": 4.96376733192366e-06, + "loss": 0.5909, + "step": 3190 + }, + { + "epoch": 1.261200296369474, + "grad_norm": 0.4567278109168832, + "learning_rate": 4.963740749958441e-06, + "loss": 0.5645, + "step": 3191 + }, + { + "epoch": 1.2615954556680662, + "grad_norm": 0.603665506544785, + "learning_rate": 4.9637141583171456e-06, + "loss": 0.5607, + "step": 3192 + }, + { + "epoch": 1.2619906149666584, + "grad_norm": 0.44076800996835624, + "learning_rate": 4.9636875569998756e-06, + "loss": 0.5479, + "step": 3193 + }, + { + "epoch": 1.2623857742652507, + "grad_norm": 0.4597963136042489, + "learning_rate": 4.963660946006737e-06, + "loss": 0.5677, + "step": 3194 + }, + { + "epoch": 1.262780933563843, + "grad_norm": 0.45342024851568213, + "learning_rate": 4.963634325337836e-06, + "loss": 0.5452, + "step": 3195 + }, + { + "epoch": 1.2631760928624352, + "grad_norm": 0.44313171394307665, + "learning_rate": 4.9636076949932736e-06, + "loss": 0.5699, + "step": 3196 + }, + { + "epoch": 1.2635712521610274, + "grad_norm": 0.45404430511750704, + "learning_rate": 4.9635810549731565e-06, + "loss": 0.579, + "step": 3197 + }, + { + "epoch": 1.2639664114596196, + "grad_norm": 0.4739171320560697, + "learning_rate": 4.9635544052775895e-06, + "loss": 0.5475, + "step": 3198 + }, + { + "epoch": 1.2643615707582119, + "grad_norm": 0.4528216117045259, + "learning_rate": 4.963527745906677e-06, + "loss": 0.5431, + "step": 3199 + }, + { + "epoch": 1.2647567300568041, + "grad_norm": 0.44237602152913297, + "learning_rate": 4.963501076860522e-06, + "loss": 0.5514, + "step": 3200 + }, + { + "epoch": 1.2651518893553964, + "grad_norm": 0.4472333931200183, + "learning_rate": 4.9634743981392316e-06, + "loss": 0.5597, + "step": 3201 + }, + { + "epoch": 1.2655470486539886, + "grad_norm": 0.4951585973064483, + "learning_rate": 4.9634477097429105e-06, + "loss": 0.5993, + "step": 3202 + }, + { + "epoch": 1.2659422079525808, + "grad_norm": 0.4588298365010358, + "learning_rate": 4.9634210116716606e-06, + "loss": 0.5612, + "step": 3203 + }, + { + "epoch": 1.266337367251173, + "grad_norm": 0.43949401153597856, + "learning_rate": 4.96339430392559e-06, + "loss": 0.5758, + "step": 3204 + }, + { + "epoch": 1.2667325265497653, + "grad_norm": 0.4429353608767374, + "learning_rate": 4.963367586504803e-06, + "loss": 0.5581, + "step": 3205 + }, + { + "epoch": 1.2671276858483576, + "grad_norm": 0.44990618359944873, + "learning_rate": 4.963340859409404e-06, + "loss": 0.554, + "step": 3206 + }, + { + "epoch": 1.2675228451469498, + "grad_norm": 0.46874642934363553, + "learning_rate": 4.963314122639497e-06, + "loss": 0.5549, + "step": 3207 + }, + { + "epoch": 1.267918004445542, + "grad_norm": 0.4366561437308232, + "learning_rate": 4.963287376195188e-06, + "loss": 0.568, + "step": 3208 + }, + { + "epoch": 1.2683131637441343, + "grad_norm": 0.43922781969850494, + "learning_rate": 4.963260620076582e-06, + "loss": 0.5612, + "step": 3209 + }, + { + "epoch": 1.2687083230427265, + "grad_norm": 0.46732316449861777, + "learning_rate": 4.963233854283785e-06, + "loss": 0.5675, + "step": 3210 + }, + { + "epoch": 1.2691034823413188, + "grad_norm": 0.4499324082172506, + "learning_rate": 4.9632070788169e-06, + "loss": 0.572, + "step": 3211 + }, + { + "epoch": 1.269498641639911, + "grad_norm": 0.4448079688254288, + "learning_rate": 4.9631802936760345e-06, + "loss": 0.5777, + "step": 3212 + }, + { + "epoch": 1.2698938009385032, + "grad_norm": 0.4681934528119531, + "learning_rate": 4.963153498861292e-06, + "loss": 0.5642, + "step": 3213 + }, + { + "epoch": 1.2702889602370955, + "grad_norm": 0.47693235289926394, + "learning_rate": 4.963126694372777e-06, + "loss": 0.5895, + "step": 3214 + }, + { + "epoch": 1.2706841195356877, + "grad_norm": 0.448800942766577, + "learning_rate": 4.963099880210598e-06, + "loss": 0.5754, + "step": 3215 + }, + { + "epoch": 1.27107927883428, + "grad_norm": 0.44749924388535084, + "learning_rate": 4.9630730563748575e-06, + "loss": 0.5618, + "step": 3216 + }, + { + "epoch": 1.2714744381328722, + "grad_norm": 0.45578491730563103, + "learning_rate": 4.963046222865662e-06, + "loss": 0.5689, + "step": 3217 + }, + { + "epoch": 1.2718695974314644, + "grad_norm": 0.5195796262214436, + "learning_rate": 4.963019379683116e-06, + "loss": 0.5939, + "step": 3218 + }, + { + "epoch": 1.2722647567300567, + "grad_norm": 0.47649433003286457, + "learning_rate": 4.962992526827326e-06, + "loss": 0.5771, + "step": 3219 + }, + { + "epoch": 1.272659916028649, + "grad_norm": 0.4410004496915249, + "learning_rate": 4.962965664298396e-06, + "loss": 0.5511, + "step": 3220 + }, + { + "epoch": 1.2730550753272412, + "grad_norm": 0.4636086332760471, + "learning_rate": 4.9629387920964335e-06, + "loss": 0.5684, + "step": 3221 + }, + { + "epoch": 1.2734502346258336, + "grad_norm": 0.46246815925142704, + "learning_rate": 4.962911910221543e-06, + "loss": 0.559, + "step": 3222 + }, + { + "epoch": 1.2738453939244259, + "grad_norm": 0.46339823849118905, + "learning_rate": 4.96288501867383e-06, + "loss": 0.5571, + "step": 3223 + }, + { + "epoch": 1.274240553223018, + "grad_norm": 0.4677333897938672, + "learning_rate": 4.9628581174534e-06, + "loss": 0.5793, + "step": 3224 + }, + { + "epoch": 1.2746357125216103, + "grad_norm": 0.4710260149695448, + "learning_rate": 4.962831206560358e-06, + "loss": 0.5927, + "step": 3225 + }, + { + "epoch": 1.2750308718202026, + "grad_norm": 0.45906156067530524, + "learning_rate": 4.962804285994811e-06, + "loss": 0.5508, + "step": 3226 + }, + { + "epoch": 1.2754260311187948, + "grad_norm": 0.4389981450527275, + "learning_rate": 4.962777355756865e-06, + "loss": 0.5544, + "step": 3227 + }, + { + "epoch": 1.275821190417387, + "grad_norm": 0.4566293309775638, + "learning_rate": 4.962750415846624e-06, + "loss": 0.5698, + "step": 3228 + }, + { + "epoch": 1.2762163497159793, + "grad_norm": 0.4565239233936326, + "learning_rate": 4.9627234662641965e-06, + "loss": 0.5843, + "step": 3229 + }, + { + "epoch": 1.2766115090145715, + "grad_norm": 0.4402634375950171, + "learning_rate": 4.962696507009686e-06, + "loss": 0.5716, + "step": 3230 + }, + { + "epoch": 1.2770066683131638, + "grad_norm": 0.43986945562125607, + "learning_rate": 4.962669538083198e-06, + "loss": 0.5645, + "step": 3231 + }, + { + "epoch": 1.277401827611756, + "grad_norm": 0.44151108436527936, + "learning_rate": 4.9626425594848404e-06, + "loss": 0.5351, + "step": 3232 + }, + { + "epoch": 1.2777969869103483, + "grad_norm": 0.46054451936494795, + "learning_rate": 4.962615571214718e-06, + "loss": 0.575, + "step": 3233 + }, + { + "epoch": 1.2781921462089405, + "grad_norm": 0.4832601682517206, + "learning_rate": 4.9625885732729365e-06, + "loss": 0.5724, + "step": 3234 + }, + { + "epoch": 1.2785873055075327, + "grad_norm": 0.4496650178160313, + "learning_rate": 4.962561565659603e-06, + "loss": 0.571, + "step": 3235 + }, + { + "epoch": 1.278982464806125, + "grad_norm": 0.4459537839833863, + "learning_rate": 4.962534548374823e-06, + "loss": 0.5496, + "step": 3236 + }, + { + "epoch": 1.2793776241047172, + "grad_norm": 0.4820361417897107, + "learning_rate": 4.962507521418703e-06, + "loss": 0.5488, + "step": 3237 + }, + { + "epoch": 1.2797727834033095, + "grad_norm": 0.4471059519027043, + "learning_rate": 4.962480484791348e-06, + "loss": 0.5557, + "step": 3238 + }, + { + "epoch": 1.2801679427019017, + "grad_norm": 0.44540448662756593, + "learning_rate": 4.962453438492865e-06, + "loss": 0.5403, + "step": 3239 + }, + { + "epoch": 1.280563102000494, + "grad_norm": 0.4706472390621206, + "learning_rate": 4.962426382523361e-06, + "loss": 0.575, + "step": 3240 + }, + { + "epoch": 1.2809582612990862, + "grad_norm": 0.531102576848962, + "learning_rate": 4.962399316882941e-06, + "loss": 0.5533, + "step": 3241 + }, + { + "epoch": 1.2813534205976784, + "grad_norm": 0.461239203184141, + "learning_rate": 4.962372241571711e-06, + "loss": 0.5704, + "step": 3242 + }, + { + "epoch": 1.2817485798962707, + "grad_norm": 0.43407314953672027, + "learning_rate": 4.962345156589779e-06, + "loss": 0.5503, + "step": 3243 + }, + { + "epoch": 1.282143739194863, + "grad_norm": 0.4493863750894706, + "learning_rate": 4.9623180619372505e-06, + "loss": 0.5661, + "step": 3244 + }, + { + "epoch": 1.2825388984934551, + "grad_norm": 0.4832832516667832, + "learning_rate": 4.962290957614231e-06, + "loss": 0.572, + "step": 3245 + }, + { + "epoch": 1.2829340577920474, + "grad_norm": 0.4503676002104791, + "learning_rate": 4.962263843620828e-06, + "loss": 0.5575, + "step": 3246 + }, + { + "epoch": 1.2833292170906396, + "grad_norm": 0.44527990189860395, + "learning_rate": 4.9622367199571485e-06, + "loss": 0.5598, + "step": 3247 + }, + { + "epoch": 1.2837243763892319, + "grad_norm": 0.4360326535872034, + "learning_rate": 4.962209586623298e-06, + "loss": 0.5625, + "step": 3248 + }, + { + "epoch": 1.284119535687824, + "grad_norm": 0.46369867577006174, + "learning_rate": 4.962182443619383e-06, + "loss": 0.5547, + "step": 3249 + }, + { + "epoch": 1.2845146949864163, + "grad_norm": 0.47697119598465904, + "learning_rate": 4.962155290945511e-06, + "loss": 0.5644, + "step": 3250 + }, + { + "epoch": 1.2849098542850086, + "grad_norm": 0.4572682032044699, + "learning_rate": 4.962128128601787e-06, + "loss": 0.6004, + "step": 3251 + }, + { + "epoch": 1.2853050135836008, + "grad_norm": 0.46293963615887146, + "learning_rate": 4.96210095658832e-06, + "loss": 0.5735, + "step": 3252 + }, + { + "epoch": 1.285700172882193, + "grad_norm": 0.4353007660494593, + "learning_rate": 4.962073774905216e-06, + "loss": 0.5515, + "step": 3253 + }, + { + "epoch": 1.2860953321807853, + "grad_norm": 0.4336239776816601, + "learning_rate": 4.96204658355258e-06, + "loss": 0.5479, + "step": 3254 + }, + { + "epoch": 1.2864904914793776, + "grad_norm": 0.4385631160161934, + "learning_rate": 4.962019382530521e-06, + "loss": 0.5581, + "step": 3255 + }, + { + "epoch": 1.2868856507779698, + "grad_norm": 0.47634454262632553, + "learning_rate": 4.961992171839144e-06, + "loss": 0.5794, + "step": 3256 + }, + { + "epoch": 1.287280810076562, + "grad_norm": 0.4672479339967555, + "learning_rate": 4.961964951478557e-06, + "loss": 0.5606, + "step": 3257 + }, + { + "epoch": 1.2876759693751545, + "grad_norm": 0.46957567960558677, + "learning_rate": 4.961937721448867e-06, + "loss": 0.5643, + "step": 3258 + }, + { + "epoch": 1.2880711286737467, + "grad_norm": 0.4773737016736467, + "learning_rate": 4.961910481750181e-06, + "loss": 0.5854, + "step": 3259 + }, + { + "epoch": 1.288466287972339, + "grad_norm": 0.4587288767028595, + "learning_rate": 4.961883232382604e-06, + "loss": 0.5533, + "step": 3260 + }, + { + "epoch": 1.2888614472709312, + "grad_norm": 0.4752043572517741, + "learning_rate": 4.961855973346246e-06, + "loss": 0.5837, + "step": 3261 + }, + { + "epoch": 1.2892566065695235, + "grad_norm": 0.4587769372346428, + "learning_rate": 4.961828704641212e-06, + "loss": 0.5547, + "step": 3262 + }, + { + "epoch": 1.2896517658681157, + "grad_norm": 0.4660024170483272, + "learning_rate": 4.96180142626761e-06, + "loss": 0.5511, + "step": 3263 + }, + { + "epoch": 1.290046925166708, + "grad_norm": 0.483664785494738, + "learning_rate": 4.961774138225547e-06, + "loss": 0.5766, + "step": 3264 + }, + { + "epoch": 1.2904420844653002, + "grad_norm": 0.4518446012693695, + "learning_rate": 4.96174684051513e-06, + "loss": 0.5557, + "step": 3265 + }, + { + "epoch": 1.2908372437638924, + "grad_norm": 0.44730158472491605, + "learning_rate": 4.961719533136466e-06, + "loss": 0.5657, + "step": 3266 + }, + { + "epoch": 1.2912324030624847, + "grad_norm": 0.45919328333593834, + "learning_rate": 4.961692216089663e-06, + "loss": 0.5659, + "step": 3267 + }, + { + "epoch": 1.291627562361077, + "grad_norm": 0.4670554649733668, + "learning_rate": 4.961664889374827e-06, + "loss": 0.5705, + "step": 3268 + }, + { + "epoch": 1.2920227216596691, + "grad_norm": 0.4470155299015426, + "learning_rate": 4.961637552992067e-06, + "loss": 0.5422, + "step": 3269 + }, + { + "epoch": 1.2924178809582614, + "grad_norm": 0.44944938927627665, + "learning_rate": 4.961610206941488e-06, + "loss": 0.5603, + "step": 3270 + }, + { + "epoch": 1.2928130402568536, + "grad_norm": 0.45882995864498305, + "learning_rate": 4.961582851223201e-06, + "loss": 0.5623, + "step": 3271 + }, + { + "epoch": 1.2932081995554459, + "grad_norm": 0.44864834182329033, + "learning_rate": 4.96155548583731e-06, + "loss": 0.5536, + "step": 3272 + }, + { + "epoch": 1.293603358854038, + "grad_norm": 0.4365082513173151, + "learning_rate": 4.961528110783924e-06, + "loss": 0.5492, + "step": 3273 + }, + { + "epoch": 1.2939985181526303, + "grad_norm": 0.4637763227170593, + "learning_rate": 4.961500726063151e-06, + "loss": 0.5704, + "step": 3274 + }, + { + "epoch": 1.2943936774512226, + "grad_norm": 0.4389304914923759, + "learning_rate": 4.961473331675096e-06, + "loss": 0.561, + "step": 3275 + }, + { + "epoch": 1.2947888367498148, + "grad_norm": 0.44926731822340105, + "learning_rate": 4.9614459276198705e-06, + "loss": 0.582, + "step": 3276 + }, + { + "epoch": 1.295183996048407, + "grad_norm": 0.4486660860991371, + "learning_rate": 4.961418513897579e-06, + "loss": 0.5635, + "step": 3277 + }, + { + "epoch": 1.2955791553469993, + "grad_norm": 0.4527867437707233, + "learning_rate": 4.96139109050833e-06, + "loss": 0.5733, + "step": 3278 + }, + { + "epoch": 1.2959743146455915, + "grad_norm": 0.45347608716724236, + "learning_rate": 4.961363657452232e-06, + "loss": 0.5518, + "step": 3279 + }, + { + "epoch": 1.2963694739441838, + "grad_norm": 0.4508697870043093, + "learning_rate": 4.961336214729392e-06, + "loss": 0.5548, + "step": 3280 + }, + { + "epoch": 1.296764633242776, + "grad_norm": 0.4637754516249654, + "learning_rate": 4.961308762339918e-06, + "loss": 0.5638, + "step": 3281 + }, + { + "epoch": 1.2971597925413683, + "grad_norm": 0.4574438583788397, + "learning_rate": 4.961281300283918e-06, + "loss": 0.5516, + "step": 3282 + }, + { + "epoch": 1.2975549518399605, + "grad_norm": 0.4476663326378099, + "learning_rate": 4.961253828561499e-06, + "loss": 0.5494, + "step": 3283 + }, + { + "epoch": 1.2979501111385527, + "grad_norm": 0.45758420853662424, + "learning_rate": 4.96122634717277e-06, + "loss": 0.5669, + "step": 3284 + }, + { + "epoch": 1.298345270437145, + "grad_norm": 0.4730047069561211, + "learning_rate": 4.9611988561178385e-06, + "loss": 0.5748, + "step": 3285 + }, + { + "epoch": 1.2987404297357372, + "grad_norm": 0.44003229484144685, + "learning_rate": 4.9611713553968125e-06, + "loss": 0.5465, + "step": 3286 + }, + { + "epoch": 1.2991355890343295, + "grad_norm": 0.4411529968194715, + "learning_rate": 4.9611438450098e-06, + "loss": 0.5559, + "step": 3287 + }, + { + "epoch": 1.2995307483329217, + "grad_norm": 0.4778237900402621, + "learning_rate": 4.9611163249569085e-06, + "loss": 0.5639, + "step": 3288 + }, + { + "epoch": 1.299925907631514, + "grad_norm": 0.44974909893119686, + "learning_rate": 4.961088795238247e-06, + "loss": 0.5758, + "step": 3289 + }, + { + "epoch": 1.3003210669301062, + "grad_norm": 0.4378290859300243, + "learning_rate": 4.9610612558539214e-06, + "loss": 0.5689, + "step": 3290 + }, + { + "epoch": 1.3007162262286984, + "grad_norm": 0.4429720821425789, + "learning_rate": 4.961033706804044e-06, + "loss": 0.5606, + "step": 3291 + }, + { + "epoch": 1.3011113855272907, + "grad_norm": 0.4633926009741077, + "learning_rate": 4.961006148088719e-06, + "loss": 0.5631, + "step": 3292 + }, + { + "epoch": 1.301506544825883, + "grad_norm": 0.46706115854462443, + "learning_rate": 4.960978579708058e-06, + "loss": 0.5813, + "step": 3293 + }, + { + "epoch": 1.3019017041244751, + "grad_norm": 0.47368413285383676, + "learning_rate": 4.9609510016621655e-06, + "loss": 0.5883, + "step": 3294 + }, + { + "epoch": 1.3022968634230674, + "grad_norm": 0.4737507951875103, + "learning_rate": 4.960923413951153e-06, + "loss": 0.5748, + "step": 3295 + }, + { + "epoch": 1.3026920227216596, + "grad_norm": 0.4570347719095943, + "learning_rate": 4.960895816575127e-06, + "loss": 0.5692, + "step": 3296 + }, + { + "epoch": 1.3030871820202519, + "grad_norm": 0.4577457839859938, + "learning_rate": 4.960868209534198e-06, + "loss": 0.5562, + "step": 3297 + }, + { + "epoch": 1.303482341318844, + "grad_norm": 0.47003885252402855, + "learning_rate": 4.960840592828472e-06, + "loss": 0.5734, + "step": 3298 + }, + { + "epoch": 1.3038775006174363, + "grad_norm": 0.4624743876597724, + "learning_rate": 4.960812966458058e-06, + "loss": 0.5603, + "step": 3299 + }, + { + "epoch": 1.3042726599160286, + "grad_norm": 0.48901567850108524, + "learning_rate": 4.960785330423066e-06, + "loss": 0.5893, + "step": 3300 + }, + { + "epoch": 1.3046678192146208, + "grad_norm": 0.4522014166198177, + "learning_rate": 4.960757684723603e-06, + "loss": 0.5538, + "step": 3301 + }, + { + "epoch": 1.305062978513213, + "grad_norm": 0.45770794284942456, + "learning_rate": 4.9607300293597774e-06, + "loss": 0.5656, + "step": 3302 + }, + { + "epoch": 1.3054581378118053, + "grad_norm": 0.4322173217917462, + "learning_rate": 4.960702364331699e-06, + "loss": 0.5497, + "step": 3303 + }, + { + "epoch": 1.3058532971103975, + "grad_norm": 0.4381343619517814, + "learning_rate": 4.960674689639477e-06, + "loss": 0.5591, + "step": 3304 + }, + { + "epoch": 1.3062484564089898, + "grad_norm": 0.44151218550571697, + "learning_rate": 4.960647005283217e-06, + "loss": 0.5694, + "step": 3305 + }, + { + "epoch": 1.306643615707582, + "grad_norm": 0.43450554135557107, + "learning_rate": 4.960619311263031e-06, + "loss": 0.5803, + "step": 3306 + }, + { + "epoch": 1.3070387750061743, + "grad_norm": 0.42935555212955734, + "learning_rate": 4.960591607579026e-06, + "loss": 0.5367, + "step": 3307 + }, + { + "epoch": 1.3074339343047665, + "grad_norm": 0.44216935598010304, + "learning_rate": 4.960563894231312e-06, + "loss": 0.5526, + "step": 3308 + }, + { + "epoch": 1.3078290936033587, + "grad_norm": 0.4436461836567033, + "learning_rate": 4.960536171219997e-06, + "loss": 0.5658, + "step": 3309 + }, + { + "epoch": 1.308224252901951, + "grad_norm": 0.46145987300596963, + "learning_rate": 4.96050843854519e-06, + "loss": 0.5712, + "step": 3310 + }, + { + "epoch": 1.3086194122005432, + "grad_norm": 0.4540758807723103, + "learning_rate": 4.960480696206999e-06, + "loss": 0.5551, + "step": 3311 + }, + { + "epoch": 1.3090145714991355, + "grad_norm": 0.4371619617022349, + "learning_rate": 4.960452944205535e-06, + "loss": 0.5526, + "step": 3312 + }, + { + "epoch": 1.3094097307977277, + "grad_norm": 0.44906152086035417, + "learning_rate": 4.960425182540905e-06, + "loss": 0.556, + "step": 3313 + }, + { + "epoch": 1.30980489009632, + "grad_norm": 0.6283881072933288, + "learning_rate": 4.96039741121322e-06, + "loss": 0.6207, + "step": 3314 + }, + { + "epoch": 1.3102000493949122, + "grad_norm": 0.43354415802766594, + "learning_rate": 4.960369630222588e-06, + "loss": 0.5782, + "step": 3315 + }, + { + "epoch": 1.3105952086935044, + "grad_norm": 0.4573902248334659, + "learning_rate": 4.960341839569117e-06, + "loss": 0.5657, + "step": 3316 + }, + { + "epoch": 1.310990367992097, + "grad_norm": 0.4618875741095821, + "learning_rate": 4.9603140392529185e-06, + "loss": 0.5504, + "step": 3317 + }, + { + "epoch": 1.3113855272906891, + "grad_norm": 0.4335794680087329, + "learning_rate": 4.9602862292740995e-06, + "loss": 0.547, + "step": 3318 + }, + { + "epoch": 1.3117806865892814, + "grad_norm": 0.46798359661776007, + "learning_rate": 4.960258409632771e-06, + "loss": 0.5565, + "step": 3319 + }, + { + "epoch": 1.3121758458878736, + "grad_norm": 0.44285226370838654, + "learning_rate": 4.960230580329041e-06, + "loss": 0.5641, + "step": 3320 + }, + { + "epoch": 1.3125710051864659, + "grad_norm": 0.4656857860504189, + "learning_rate": 4.960202741363018e-06, + "loss": 0.5633, + "step": 3321 + }, + { + "epoch": 1.312966164485058, + "grad_norm": 0.4524731635101246, + "learning_rate": 4.960174892734813e-06, + "loss": 0.5607, + "step": 3322 + }, + { + "epoch": 1.3133613237836503, + "grad_norm": 0.456230055946175, + "learning_rate": 4.960147034444537e-06, + "loss": 0.5584, + "step": 3323 + }, + { + "epoch": 1.3137564830822426, + "grad_norm": 0.4772423713788212, + "learning_rate": 4.960119166492295e-06, + "loss": 0.57, + "step": 3324 + }, + { + "epoch": 1.3141516423808348, + "grad_norm": 0.4586733788184475, + "learning_rate": 4.9600912888782e-06, + "loss": 0.5758, + "step": 3325 + }, + { + "epoch": 1.314546801679427, + "grad_norm": 0.5480041169557188, + "learning_rate": 4.9600634016023606e-06, + "loss": 0.5772, + "step": 3326 + }, + { + "epoch": 1.3149419609780193, + "grad_norm": 0.4415783584140013, + "learning_rate": 4.960035504664885e-06, + "loss": 0.5404, + "step": 3327 + }, + { + "epoch": 1.3153371202766115, + "grad_norm": 0.4291556202863314, + "learning_rate": 4.960007598065884e-06, + "loss": 0.5612, + "step": 3328 + }, + { + "epoch": 1.3157322795752038, + "grad_norm": 0.44774827375999054, + "learning_rate": 4.959979681805467e-06, + "loss": 0.5776, + "step": 3329 + }, + { + "epoch": 1.316127438873796, + "grad_norm": 0.44973594796897337, + "learning_rate": 4.959951755883744e-06, + "loss": 0.5768, + "step": 3330 + }, + { + "epoch": 1.3165225981723883, + "grad_norm": 0.44813772825578874, + "learning_rate": 4.959923820300824e-06, + "loss": 0.5623, + "step": 3331 + }, + { + "epoch": 1.3169177574709805, + "grad_norm": 0.44654411240174396, + "learning_rate": 4.959895875056816e-06, + "loss": 0.5655, + "step": 3332 + }, + { + "epoch": 1.3173129167695727, + "grad_norm": 0.4373133409477842, + "learning_rate": 4.959867920151832e-06, + "loss": 0.5592, + "step": 3333 + }, + { + "epoch": 1.317708076068165, + "grad_norm": 0.4363324235460165, + "learning_rate": 4.95983995558598e-06, + "loss": 0.5492, + "step": 3334 + }, + { + "epoch": 1.3181032353667572, + "grad_norm": 0.4346900282961237, + "learning_rate": 4.95981198135937e-06, + "loss": 0.5444, + "step": 3335 + }, + { + "epoch": 1.3184983946653495, + "grad_norm": 0.4523110093393848, + "learning_rate": 4.959783997472113e-06, + "loss": 0.5536, + "step": 3336 + }, + { + "epoch": 1.3188935539639417, + "grad_norm": 0.46544382830839454, + "learning_rate": 4.959756003924317e-06, + "loss": 0.5702, + "step": 3337 + }, + { + "epoch": 1.319288713262534, + "grad_norm": 0.44772145946647923, + "learning_rate": 4.959728000716094e-06, + "loss": 0.5616, + "step": 3338 + }, + { + "epoch": 1.3196838725611262, + "grad_norm": 0.4384987036176576, + "learning_rate": 4.959699987847554e-06, + "loss": 0.5685, + "step": 3339 + }, + { + "epoch": 1.3200790318597184, + "grad_norm": 0.43847040276558563, + "learning_rate": 4.9596719653188045e-06, + "loss": 0.5518, + "step": 3340 + }, + { + "epoch": 1.3204741911583107, + "grad_norm": 0.45117712104049984, + "learning_rate": 4.959643933129958e-06, + "loss": 0.5734, + "step": 3341 + }, + { + "epoch": 1.320869350456903, + "grad_norm": 0.45533362827402557, + "learning_rate": 4.9596158912811235e-06, + "loss": 0.5649, + "step": 3342 + }, + { + "epoch": 1.3212645097554951, + "grad_norm": 0.44551751419590213, + "learning_rate": 4.9595878397724106e-06, + "loss": 0.5778, + "step": 3343 + }, + { + "epoch": 1.3216596690540874, + "grad_norm": 0.4370316849726543, + "learning_rate": 4.959559778603931e-06, + "loss": 0.5602, + "step": 3344 + }, + { + "epoch": 1.3220548283526796, + "grad_norm": 0.4599955825604526, + "learning_rate": 4.959531707775793e-06, + "loss": 0.571, + "step": 3345 + }, + { + "epoch": 1.3224499876512719, + "grad_norm": 0.4424416975622541, + "learning_rate": 4.959503627288109e-06, + "loss": 0.561, + "step": 3346 + }, + { + "epoch": 1.322845146949864, + "grad_norm": 0.43661943086497584, + "learning_rate": 4.959475537140989e-06, + "loss": 0.5523, + "step": 3347 + }, + { + "epoch": 1.3232403062484563, + "grad_norm": 0.4373998487954772, + "learning_rate": 4.959447437334541e-06, + "loss": 0.5613, + "step": 3348 + }, + { + "epoch": 1.3236354655470486, + "grad_norm": 0.4494818246912831, + "learning_rate": 4.959419327868878e-06, + "loss": 0.5775, + "step": 3349 + }, + { + "epoch": 1.3240306248456408, + "grad_norm": 0.43842069386255866, + "learning_rate": 4.959391208744108e-06, + "loss": 0.5677, + "step": 3350 + }, + { + "epoch": 1.324425784144233, + "grad_norm": 0.4494362228050278, + "learning_rate": 4.959363079960344e-06, + "loss": 0.5517, + "step": 3351 + }, + { + "epoch": 1.3248209434428255, + "grad_norm": 0.45526558767555503, + "learning_rate": 4.959334941517695e-06, + "loss": 0.572, + "step": 3352 + }, + { + "epoch": 1.3252161027414178, + "grad_norm": 0.44027079643934225, + "learning_rate": 4.959306793416271e-06, + "loss": 0.5698, + "step": 3353 + }, + { + "epoch": 1.32561126204001, + "grad_norm": 0.4405471076038371, + "learning_rate": 4.959278635656184e-06, + "loss": 0.5653, + "step": 3354 + }, + { + "epoch": 1.3260064213386022, + "grad_norm": 0.45167721473900385, + "learning_rate": 4.959250468237544e-06, + "loss": 0.5757, + "step": 3355 + }, + { + "epoch": 1.3264015806371945, + "grad_norm": 0.4421108044099741, + "learning_rate": 4.959222291160461e-06, + "loss": 0.5642, + "step": 3356 + }, + { + "epoch": 1.3267967399357867, + "grad_norm": 0.4431212984296979, + "learning_rate": 4.959194104425047e-06, + "loss": 0.5505, + "step": 3357 + }, + { + "epoch": 1.327191899234379, + "grad_norm": 0.4521254913136757, + "learning_rate": 4.95916590803141e-06, + "loss": 0.5674, + "step": 3358 + }, + { + "epoch": 1.3275870585329712, + "grad_norm": 0.4738728564671993, + "learning_rate": 4.959137701979665e-06, + "loss": 0.5737, + "step": 3359 + }, + { + "epoch": 1.3279822178315634, + "grad_norm": 0.4503578411569048, + "learning_rate": 4.9591094862699185e-06, + "loss": 0.5767, + "step": 3360 + }, + { + "epoch": 1.3283773771301557, + "grad_norm": 0.4593996821030863, + "learning_rate": 4.959081260902284e-06, + "loss": 0.5819, + "step": 3361 + }, + { + "epoch": 1.328772536428748, + "grad_norm": 0.47458910560702716, + "learning_rate": 4.959053025876871e-06, + "loss": 0.5751, + "step": 3362 + }, + { + "epoch": 1.3291676957273402, + "grad_norm": 0.4601597399895584, + "learning_rate": 4.959024781193792e-06, + "loss": 0.5502, + "step": 3363 + }, + { + "epoch": 1.3295628550259324, + "grad_norm": 0.43863004190964605, + "learning_rate": 4.958996526853156e-06, + "loss": 0.552, + "step": 3364 + }, + { + "epoch": 1.3299580143245247, + "grad_norm": 0.449223468368665, + "learning_rate": 4.958968262855075e-06, + "loss": 0.5726, + "step": 3365 + }, + { + "epoch": 1.330353173623117, + "grad_norm": 0.45615386443341344, + "learning_rate": 4.958939989199659e-06, + "loss": 0.5764, + "step": 3366 + }, + { + "epoch": 1.3307483329217091, + "grad_norm": 0.44070542771087373, + "learning_rate": 4.958911705887022e-06, + "loss": 0.5687, + "step": 3367 + }, + { + "epoch": 1.3311434922203014, + "grad_norm": 0.44414182367934024, + "learning_rate": 4.958883412917271e-06, + "loss": 0.5658, + "step": 3368 + }, + { + "epoch": 1.3315386515188936, + "grad_norm": 0.44838583407126426, + "learning_rate": 4.9588551102905205e-06, + "loss": 0.5716, + "step": 3369 + }, + { + "epoch": 1.3319338108174859, + "grad_norm": 0.889969275288809, + "learning_rate": 4.9588267980068795e-06, + "loss": 0.556, + "step": 3370 + }, + { + "epoch": 1.332328970116078, + "grad_norm": 0.46358754540200775, + "learning_rate": 4.95879847606646e-06, + "loss": 0.5861, + "step": 3371 + }, + { + "epoch": 1.3327241294146703, + "grad_norm": 0.4282860491355915, + "learning_rate": 4.958770144469372e-06, + "loss": 0.5428, + "step": 3372 + }, + { + "epoch": 1.3331192887132626, + "grad_norm": 0.45729755865769983, + "learning_rate": 4.95874180321573e-06, + "loss": 0.5856, + "step": 3373 + }, + { + "epoch": 1.3335144480118548, + "grad_norm": 0.45588398035202177, + "learning_rate": 4.958713452305642e-06, + "loss": 0.5713, + "step": 3374 + }, + { + "epoch": 1.333909607310447, + "grad_norm": 0.44858104138575744, + "learning_rate": 4.958685091739221e-06, + "loss": 0.5657, + "step": 3375 + }, + { + "epoch": 1.3343047666090393, + "grad_norm": 0.44036953218064384, + "learning_rate": 4.958656721516577e-06, + "loss": 0.5587, + "step": 3376 + }, + { + "epoch": 1.3346999259076315, + "grad_norm": 0.4540748920166802, + "learning_rate": 4.958628341637823e-06, + "loss": 0.5646, + "step": 3377 + }, + { + "epoch": 1.3350950852062238, + "grad_norm": 0.4396313054654993, + "learning_rate": 4.9585999521030704e-06, + "loss": 0.5464, + "step": 3378 + }, + { + "epoch": 1.335490244504816, + "grad_norm": 0.4454412703380863, + "learning_rate": 4.958571552912429e-06, + "loss": 0.5545, + "step": 3379 + }, + { + "epoch": 1.3358854038034083, + "grad_norm": 0.45082119473828774, + "learning_rate": 4.9585431440660125e-06, + "loss": 0.561, + "step": 3380 + }, + { + "epoch": 1.3362805631020005, + "grad_norm": 0.4374755472502395, + "learning_rate": 4.958514725563931e-06, + "loss": 0.5571, + "step": 3381 + }, + { + "epoch": 1.3366757224005927, + "grad_norm": 0.46478260478991495, + "learning_rate": 4.958486297406296e-06, + "loss": 0.5891, + "step": 3382 + }, + { + "epoch": 1.337070881699185, + "grad_norm": 0.43814804662705964, + "learning_rate": 4.958457859593221e-06, + "loss": 0.565, + "step": 3383 + }, + { + "epoch": 1.3374660409977772, + "grad_norm": 0.44507180809627767, + "learning_rate": 4.958429412124816e-06, + "loss": 0.5599, + "step": 3384 + }, + { + "epoch": 1.3378612002963695, + "grad_norm": 0.441421838525371, + "learning_rate": 4.958400955001193e-06, + "loss": 0.5428, + "step": 3385 + }, + { + "epoch": 1.3382563595949617, + "grad_norm": 0.44374387122522124, + "learning_rate": 4.958372488222463e-06, + "loss": 0.5403, + "step": 3386 + }, + { + "epoch": 1.338651518893554, + "grad_norm": 0.4467964188449642, + "learning_rate": 4.958344011788739e-06, + "loss": 0.5503, + "step": 3387 + }, + { + "epoch": 1.3390466781921462, + "grad_norm": 0.4479117153175756, + "learning_rate": 4.958315525700134e-06, + "loss": 0.5775, + "step": 3388 + }, + { + "epoch": 1.3394418374907384, + "grad_norm": 0.5332367747737651, + "learning_rate": 4.958287029956757e-06, + "loss": 0.5715, + "step": 3389 + }, + { + "epoch": 1.3398369967893307, + "grad_norm": 0.45663522892567066, + "learning_rate": 4.958258524558721e-06, + "loss": 0.5881, + "step": 3390 + }, + { + "epoch": 1.340232156087923, + "grad_norm": 0.4521841960229859, + "learning_rate": 4.958230009506139e-06, + "loss": 0.5603, + "step": 3391 + }, + { + "epoch": 1.3406273153865151, + "grad_norm": 0.45825530510638524, + "learning_rate": 4.958201484799122e-06, + "loss": 0.5664, + "step": 3392 + }, + { + "epoch": 1.3410224746851074, + "grad_norm": 0.45965721161303025, + "learning_rate": 4.958172950437782e-06, + "loss": 0.5715, + "step": 3393 + }, + { + "epoch": 1.3414176339836996, + "grad_norm": 0.4412893524397775, + "learning_rate": 4.958144406422232e-06, + "loss": 0.5709, + "step": 3394 + }, + { + "epoch": 1.3418127932822919, + "grad_norm": 0.45819164319217937, + "learning_rate": 4.958115852752583e-06, + "loss": 0.5666, + "step": 3395 + }, + { + "epoch": 1.342207952580884, + "grad_norm": 0.45080426168635684, + "learning_rate": 4.958087289428948e-06, + "loss": 0.5452, + "step": 3396 + }, + { + "epoch": 1.3426031118794763, + "grad_norm": 0.46299421201178154, + "learning_rate": 4.958058716451438e-06, + "loss": 0.5788, + "step": 3397 + }, + { + "epoch": 1.3429982711780686, + "grad_norm": 0.44631538749161104, + "learning_rate": 4.9580301338201665e-06, + "loss": 0.5605, + "step": 3398 + }, + { + "epoch": 1.3433934304766608, + "grad_norm": 0.4753236947479198, + "learning_rate": 4.9580015415352446e-06, + "loss": 0.5833, + "step": 3399 + }, + { + "epoch": 1.343788589775253, + "grad_norm": 0.4543478319902544, + "learning_rate": 4.9579729395967855e-06, + "loss": 0.5459, + "step": 3400 + }, + { + "epoch": 1.3441837490738453, + "grad_norm": 0.42397174780842645, + "learning_rate": 4.957944328004902e-06, + "loss": 0.5408, + "step": 3401 + }, + { + "epoch": 1.3445789083724375, + "grad_norm": 0.44366683749260666, + "learning_rate": 4.957915706759705e-06, + "loss": 0.5672, + "step": 3402 + }, + { + "epoch": 1.3449740676710298, + "grad_norm": 0.4620326978290853, + "learning_rate": 4.957887075861308e-06, + "loss": 0.5912, + "step": 3403 + }, + { + "epoch": 1.345369226969622, + "grad_norm": 0.437254776572285, + "learning_rate": 4.957858435309822e-06, + "loss": 0.5584, + "step": 3404 + }, + { + "epoch": 1.3457643862682143, + "grad_norm": 0.44334059234747064, + "learning_rate": 4.9578297851053626e-06, + "loss": 0.5552, + "step": 3405 + }, + { + "epoch": 1.3461595455668065, + "grad_norm": 0.45277563926558906, + "learning_rate": 4.957801125248038e-06, + "loss": 0.5796, + "step": 3406 + }, + { + "epoch": 1.3465547048653987, + "grad_norm": 0.4529953395968256, + "learning_rate": 4.957772455737965e-06, + "loss": 0.5645, + "step": 3407 + }, + { + "epoch": 1.346949864163991, + "grad_norm": 0.45953659145635833, + "learning_rate": 4.9577437765752535e-06, + "loss": 0.5551, + "step": 3408 + }, + { + "epoch": 1.3473450234625832, + "grad_norm": 0.45243007027630033, + "learning_rate": 4.957715087760017e-06, + "loss": 0.562, + "step": 3409 + }, + { + "epoch": 1.3477401827611755, + "grad_norm": 0.44813612053487434, + "learning_rate": 4.9576863892923675e-06, + "loss": 0.5669, + "step": 3410 + }, + { + "epoch": 1.348135342059768, + "grad_norm": 0.4414843229708112, + "learning_rate": 4.957657681172419e-06, + "loss": 0.5564, + "step": 3411 + }, + { + "epoch": 1.3485305013583602, + "grad_norm": 0.4762734336021678, + "learning_rate": 4.957628963400283e-06, + "loss": 0.5576, + "step": 3412 + }, + { + "epoch": 1.3489256606569524, + "grad_norm": 0.4651845435138158, + "learning_rate": 4.957600235976072e-06, + "loss": 0.5471, + "step": 3413 + }, + { + "epoch": 1.3493208199555446, + "grad_norm": 0.4394461136350005, + "learning_rate": 4.957571498899901e-06, + "loss": 0.5542, + "step": 3414 + }, + { + "epoch": 1.3497159792541369, + "grad_norm": 0.44448501672351937, + "learning_rate": 4.95754275217188e-06, + "loss": 0.5635, + "step": 3415 + }, + { + "epoch": 1.3501111385527291, + "grad_norm": 0.4620591465811548, + "learning_rate": 4.9575139957921245e-06, + "loss": 0.5587, + "step": 3416 + }, + { + "epoch": 1.3505062978513214, + "grad_norm": 0.45815520871269744, + "learning_rate": 4.957485229760747e-06, + "loss": 0.5652, + "step": 3417 + }, + { + "epoch": 1.3509014571499136, + "grad_norm": 0.4493406020218002, + "learning_rate": 4.957456454077858e-06, + "loss": 0.5441, + "step": 3418 + }, + { + "epoch": 1.3512966164485058, + "grad_norm": 0.454603776283902, + "learning_rate": 4.957427668743573e-06, + "loss": 0.5598, + "step": 3419 + }, + { + "epoch": 1.351691775747098, + "grad_norm": 0.47114620064149954, + "learning_rate": 4.9573988737580045e-06, + "loss": 0.5714, + "step": 3420 + }, + { + "epoch": 1.3520869350456903, + "grad_norm": 0.4559096213569245, + "learning_rate": 4.957370069121265e-06, + "loss": 0.5616, + "step": 3421 + }, + { + "epoch": 1.3524820943442826, + "grad_norm": 0.42729193122760173, + "learning_rate": 4.95734125483347e-06, + "loss": 0.5561, + "step": 3422 + }, + { + "epoch": 1.3528772536428748, + "grad_norm": 0.46360094062631574, + "learning_rate": 4.957312430894729e-06, + "loss": 0.5475, + "step": 3423 + }, + { + "epoch": 1.353272412941467, + "grad_norm": 0.44865908216487266, + "learning_rate": 4.957283597305157e-06, + "loss": 0.5744, + "step": 3424 + }, + { + "epoch": 1.3536675722400593, + "grad_norm": 0.4527404050788081, + "learning_rate": 4.957254754064867e-06, + "loss": 0.5691, + "step": 3425 + }, + { + "epoch": 1.3540627315386515, + "grad_norm": 0.4380566310678888, + "learning_rate": 4.957225901173973e-06, + "loss": 0.5675, + "step": 3426 + }, + { + "epoch": 1.3544578908372438, + "grad_norm": 0.4480119306626472, + "learning_rate": 4.957197038632587e-06, + "loss": 0.5794, + "step": 3427 + }, + { + "epoch": 1.354853050135836, + "grad_norm": 0.44898242236087227, + "learning_rate": 4.957168166440824e-06, + "loss": 0.5749, + "step": 3428 + }, + { + "epoch": 1.3552482094344283, + "grad_norm": 0.4395620002210985, + "learning_rate": 4.9571392845987965e-06, + "loss": 0.5539, + "step": 3429 + }, + { + "epoch": 1.3556433687330205, + "grad_norm": 0.4828976972353129, + "learning_rate": 4.957110393106618e-06, + "loss": 0.5486, + "step": 3430 + }, + { + "epoch": 1.3560385280316127, + "grad_norm": 0.4340264093323101, + "learning_rate": 4.9570814919644015e-06, + "loss": 0.5528, + "step": 3431 + }, + { + "epoch": 1.356433687330205, + "grad_norm": 0.43633467432037276, + "learning_rate": 4.9570525811722604e-06, + "loss": 0.5823, + "step": 3432 + }, + { + "epoch": 1.3568288466287972, + "grad_norm": 0.44896667799179996, + "learning_rate": 4.957023660730309e-06, + "loss": 0.5674, + "step": 3433 + }, + { + "epoch": 1.3572240059273895, + "grad_norm": 0.4424965400029668, + "learning_rate": 4.9569947306386614e-06, + "loss": 0.5597, + "step": 3434 + }, + { + "epoch": 1.3576191652259817, + "grad_norm": 0.4505551734794091, + "learning_rate": 4.95696579089743e-06, + "loss": 0.5732, + "step": 3435 + }, + { + "epoch": 1.358014324524574, + "grad_norm": 0.42939268083383797, + "learning_rate": 4.956936841506729e-06, + "loss": 0.5455, + "step": 3436 + }, + { + "epoch": 1.3584094838231662, + "grad_norm": 0.4283591881731488, + "learning_rate": 4.956907882466672e-06, + "loss": 0.5599, + "step": 3437 + }, + { + "epoch": 1.3588046431217584, + "grad_norm": 0.45404223789984394, + "learning_rate": 4.956878913777373e-06, + "loss": 0.5664, + "step": 3438 + }, + { + "epoch": 1.3591998024203507, + "grad_norm": 0.4403257215242389, + "learning_rate": 4.956849935438945e-06, + "loss": 0.5615, + "step": 3439 + }, + { + "epoch": 1.359594961718943, + "grad_norm": 0.4557250216642013, + "learning_rate": 4.956820947451503e-06, + "loss": 0.569, + "step": 3440 + }, + { + "epoch": 1.3599901210175351, + "grad_norm": 0.45833724590035085, + "learning_rate": 4.956791949815159e-06, + "loss": 0.5823, + "step": 3441 + }, + { + "epoch": 1.3603852803161274, + "grad_norm": 0.4530682419897322, + "learning_rate": 4.956762942530029e-06, + "loss": 0.5707, + "step": 3442 + }, + { + "epoch": 1.3607804396147196, + "grad_norm": 0.4376158015116781, + "learning_rate": 4.956733925596227e-06, + "loss": 0.5708, + "step": 3443 + }, + { + "epoch": 1.3611755989133119, + "grad_norm": 0.4307700914051484, + "learning_rate": 4.956704899013864e-06, + "loss": 0.5522, + "step": 3444 + }, + { + "epoch": 1.361570758211904, + "grad_norm": 0.46554414982116393, + "learning_rate": 4.956675862783057e-06, + "loss": 0.5655, + "step": 3445 + }, + { + "epoch": 1.3619659175104963, + "grad_norm": 0.4671181738319043, + "learning_rate": 4.9566468169039185e-06, + "loss": 0.566, + "step": 3446 + }, + { + "epoch": 1.3623610768090888, + "grad_norm": 0.4297413408882112, + "learning_rate": 4.956617761376563e-06, + "loss": 0.5542, + "step": 3447 + }, + { + "epoch": 1.362756236107681, + "grad_norm": 0.44684012819132296, + "learning_rate": 4.956588696201105e-06, + "loss": 0.5513, + "step": 3448 + }, + { + "epoch": 1.3631513954062733, + "grad_norm": 0.4693033252542671, + "learning_rate": 4.956559621377658e-06, + "loss": 0.5557, + "step": 3449 + }, + { + "epoch": 1.3635465547048655, + "grad_norm": 0.4486147777129349, + "learning_rate": 4.956530536906337e-06, + "loss": 0.5835, + "step": 3450 + }, + { + "epoch": 1.3639417140034578, + "grad_norm": 0.4520688255329486, + "learning_rate": 4.956501442787256e-06, + "loss": 0.5975, + "step": 3451 + }, + { + "epoch": 1.36433687330205, + "grad_norm": 0.4478220251661161, + "learning_rate": 4.956472339020528e-06, + "loss": 0.5653, + "step": 3452 + }, + { + "epoch": 1.3647320326006422, + "grad_norm": 0.49319113298532397, + "learning_rate": 4.956443225606269e-06, + "loss": 0.5619, + "step": 3453 + }, + { + "epoch": 1.3651271918992345, + "grad_norm": 0.4472089262359022, + "learning_rate": 4.9564141025445924e-06, + "loss": 0.5851, + "step": 3454 + }, + { + "epoch": 1.3655223511978267, + "grad_norm": 0.45279260771258567, + "learning_rate": 4.956384969835613e-06, + "loss": 0.5825, + "step": 3455 + }, + { + "epoch": 1.365917510496419, + "grad_norm": 0.44893832422728336, + "learning_rate": 4.956355827479445e-06, + "loss": 0.54, + "step": 3456 + }, + { + "epoch": 1.3663126697950112, + "grad_norm": 0.44861167684626757, + "learning_rate": 4.9563266754762025e-06, + "loss": 0.5798, + "step": 3457 + }, + { + "epoch": 1.3667078290936034, + "grad_norm": 0.457323231515835, + "learning_rate": 4.956297513826e-06, + "loss": 0.5578, + "step": 3458 + }, + { + "epoch": 1.3671029883921957, + "grad_norm": 0.4497319765240072, + "learning_rate": 4.9562683425289535e-06, + "loss": 0.5738, + "step": 3459 + }, + { + "epoch": 1.367498147690788, + "grad_norm": 0.44725620193345855, + "learning_rate": 4.956239161585176e-06, + "loss": 0.5727, + "step": 3460 + }, + { + "epoch": 1.3678933069893802, + "grad_norm": 0.4464598621025446, + "learning_rate": 4.956209970994783e-06, + "loss": 0.5661, + "step": 3461 + }, + { + "epoch": 1.3682884662879724, + "grad_norm": 0.4446893412743254, + "learning_rate": 4.956180770757888e-06, + "loss": 0.5732, + "step": 3462 + }, + { + "epoch": 1.3686836255865646, + "grad_norm": 0.43091284408122377, + "learning_rate": 4.956151560874607e-06, + "loss": 0.5573, + "step": 3463 + }, + { + "epoch": 1.3690787848851569, + "grad_norm": 0.4503486991715494, + "learning_rate": 4.9561223413450544e-06, + "loss": 0.5629, + "step": 3464 + }, + { + "epoch": 1.3694739441837491, + "grad_norm": 0.4661947809210117, + "learning_rate": 4.956093112169343e-06, + "loss": 0.5912, + "step": 3465 + }, + { + "epoch": 1.3698691034823414, + "grad_norm": 0.4420942969976414, + "learning_rate": 4.956063873347591e-06, + "loss": 0.5605, + "step": 3466 + }, + { + "epoch": 1.3702642627809336, + "grad_norm": 0.4299631865847097, + "learning_rate": 4.956034624879911e-06, + "loss": 0.5311, + "step": 3467 + }, + { + "epoch": 1.3706594220795258, + "grad_norm": 0.444327734857436, + "learning_rate": 4.956005366766419e-06, + "loss": 0.5744, + "step": 3468 + }, + { + "epoch": 1.371054581378118, + "grad_norm": 0.42540391618117634, + "learning_rate": 4.955976099007228e-06, + "loss": 0.572, + "step": 3469 + }, + { + "epoch": 1.3714497406767103, + "grad_norm": 0.4460903620872563, + "learning_rate": 4.955946821602455e-06, + "loss": 0.5715, + "step": 3470 + }, + { + "epoch": 1.3718448999753026, + "grad_norm": 0.4464339414945937, + "learning_rate": 4.9559175345522135e-06, + "loss": 0.5477, + "step": 3471 + }, + { + "epoch": 1.3722400592738948, + "grad_norm": 0.4496737708805918, + "learning_rate": 4.95588823785662e-06, + "loss": 0.566, + "step": 3472 + }, + { + "epoch": 1.372635218572487, + "grad_norm": 0.46530073162081453, + "learning_rate": 4.955858931515789e-06, + "loss": 0.5863, + "step": 3473 + }, + { + "epoch": 1.3730303778710793, + "grad_norm": 0.4304612284801493, + "learning_rate": 4.955829615529835e-06, + "loss": 0.5608, + "step": 3474 + }, + { + "epoch": 1.3734255371696715, + "grad_norm": 0.43919382720802014, + "learning_rate": 4.955800289898874e-06, + "loss": 0.5455, + "step": 3475 + }, + { + "epoch": 1.3738206964682638, + "grad_norm": 0.4570983105615984, + "learning_rate": 4.95577095462302e-06, + "loss": 0.5623, + "step": 3476 + }, + { + "epoch": 1.374215855766856, + "grad_norm": 0.4377489629257734, + "learning_rate": 4.955741609702389e-06, + "loss": 0.5599, + "step": 3477 + }, + { + "epoch": 1.3746110150654482, + "grad_norm": 0.457732827339079, + "learning_rate": 4.9557122551370964e-06, + "loss": 0.5607, + "step": 3478 + }, + { + "epoch": 1.3750061743640405, + "grad_norm": 0.44043665757163497, + "learning_rate": 4.955682890927257e-06, + "loss": 0.5563, + "step": 3479 + }, + { + "epoch": 1.3754013336626327, + "grad_norm": 0.4583584687679824, + "learning_rate": 4.955653517072986e-06, + "loss": 0.5743, + "step": 3480 + }, + { + "epoch": 1.375796492961225, + "grad_norm": 0.45153202964049305, + "learning_rate": 4.955624133574401e-06, + "loss": 0.5677, + "step": 3481 + }, + { + "epoch": 1.3761916522598172, + "grad_norm": 0.4374756911240904, + "learning_rate": 4.955594740431613e-06, + "loss": 0.5654, + "step": 3482 + }, + { + "epoch": 1.3765868115584095, + "grad_norm": 0.4556124603164875, + "learning_rate": 4.9555653376447416e-06, + "loss": 0.5649, + "step": 3483 + }, + { + "epoch": 1.3769819708570017, + "grad_norm": 0.4757752596484646, + "learning_rate": 4.9555359252139e-06, + "loss": 0.5888, + "step": 3484 + }, + { + "epoch": 1.377377130155594, + "grad_norm": 0.4465560956225096, + "learning_rate": 4.955506503139205e-06, + "loss": 0.5591, + "step": 3485 + }, + { + "epoch": 1.3777722894541862, + "grad_norm": 0.46143614783960507, + "learning_rate": 4.955477071420771e-06, + "loss": 0.5767, + "step": 3486 + }, + { + "epoch": 1.3781674487527784, + "grad_norm": 0.4625557795823603, + "learning_rate": 4.955447630058714e-06, + "loss": 0.5471, + "step": 3487 + }, + { + "epoch": 1.3785626080513707, + "grad_norm": 0.45122028220004684, + "learning_rate": 4.95541817905315e-06, + "loss": 0.5763, + "step": 3488 + }, + { + "epoch": 1.378957767349963, + "grad_norm": 0.4442976727237174, + "learning_rate": 4.955388718404194e-06, + "loss": 0.5665, + "step": 3489 + }, + { + "epoch": 1.3793529266485551, + "grad_norm": 0.46811037604603395, + "learning_rate": 4.955359248111963e-06, + "loss": 0.5561, + "step": 3490 + }, + { + "epoch": 1.3797480859471474, + "grad_norm": 0.45401020607684417, + "learning_rate": 4.955329768176571e-06, + "loss": 0.5673, + "step": 3491 + }, + { + "epoch": 1.3801432452457396, + "grad_norm": 0.44325397259906013, + "learning_rate": 4.955300278598135e-06, + "loss": 0.5571, + "step": 3492 + }, + { + "epoch": 1.3805384045443319, + "grad_norm": 0.4477474281264573, + "learning_rate": 4.955270779376771e-06, + "loss": 0.5621, + "step": 3493 + }, + { + "epoch": 1.380933563842924, + "grad_norm": 0.4768670398130126, + "learning_rate": 4.955241270512593e-06, + "loss": 0.5839, + "step": 3494 + }, + { + "epoch": 1.3813287231415163, + "grad_norm": 0.4460401555017405, + "learning_rate": 4.955211752005719e-06, + "loss": 0.5539, + "step": 3495 + }, + { + "epoch": 1.3817238824401086, + "grad_norm": 0.4366552836817992, + "learning_rate": 4.955182223856264e-06, + "loss": 0.5556, + "step": 3496 + }, + { + "epoch": 1.3821190417387008, + "grad_norm": 0.4493868061509665, + "learning_rate": 4.955152686064344e-06, + "loss": 0.5756, + "step": 3497 + }, + { + "epoch": 1.382514201037293, + "grad_norm": 0.42939645657453296, + "learning_rate": 4.955123138630075e-06, + "loss": 0.5705, + "step": 3498 + }, + { + "epoch": 1.3829093603358853, + "grad_norm": 0.4338686438980797, + "learning_rate": 4.955093581553574e-06, + "loss": 0.5505, + "step": 3499 + }, + { + "epoch": 1.3833045196344775, + "grad_norm": 0.44361106507257514, + "learning_rate": 4.955064014834955e-06, + "loss": 0.5488, + "step": 3500 + }, + { + "epoch": 1.3836996789330698, + "grad_norm": 0.4420112874538524, + "learning_rate": 4.9550344384743365e-06, + "loss": 0.5624, + "step": 3501 + }, + { + "epoch": 1.384094838231662, + "grad_norm": 0.44997332812869023, + "learning_rate": 4.955004852471832e-06, + "loss": 0.5697, + "step": 3502 + }, + { + "epoch": 1.3844899975302543, + "grad_norm": 0.4591341499895522, + "learning_rate": 4.9549752568275605e-06, + "loss": 0.5587, + "step": 3503 + }, + { + "epoch": 1.3848851568288465, + "grad_norm": 0.460676095154433, + "learning_rate": 4.954945651541636e-06, + "loss": 0.565, + "step": 3504 + }, + { + "epoch": 1.3852803161274387, + "grad_norm": 0.4299069716032761, + "learning_rate": 4.954916036614177e-06, + "loss": 0.5545, + "step": 3505 + }, + { + "epoch": 1.3856754754260312, + "grad_norm": 0.43887083866057874, + "learning_rate": 4.954886412045298e-06, + "loss": 0.5921, + "step": 3506 + }, + { + "epoch": 1.3860706347246234, + "grad_norm": 0.42648077434777515, + "learning_rate": 4.954856777835115e-06, + "loss": 0.5456, + "step": 3507 + }, + { + "epoch": 1.3864657940232157, + "grad_norm": 0.4290139512004522, + "learning_rate": 4.954827133983746e-06, + "loss": 0.5867, + "step": 3508 + }, + { + "epoch": 1.386860953321808, + "grad_norm": 0.44830270070683675, + "learning_rate": 4.954797480491307e-06, + "loss": 0.5784, + "step": 3509 + }, + { + "epoch": 1.3872561126204002, + "grad_norm": 0.4477655067908414, + "learning_rate": 4.954767817357913e-06, + "loss": 0.5638, + "step": 3510 + }, + { + "epoch": 1.3876512719189924, + "grad_norm": 0.43658218747964356, + "learning_rate": 4.954738144583683e-06, + "loss": 0.5844, + "step": 3511 + }, + { + "epoch": 1.3880464312175846, + "grad_norm": 0.4393922211507673, + "learning_rate": 4.954708462168731e-06, + "loss": 0.5658, + "step": 3512 + }, + { + "epoch": 1.3884415905161769, + "grad_norm": 0.44104621414355816, + "learning_rate": 4.954678770113175e-06, + "loss": 0.5566, + "step": 3513 + }, + { + "epoch": 1.3888367498147691, + "grad_norm": 0.4578661431463306, + "learning_rate": 4.954649068417132e-06, + "loss": 0.581, + "step": 3514 + }, + { + "epoch": 1.3892319091133614, + "grad_norm": 0.44817762228122565, + "learning_rate": 4.954619357080717e-06, + "loss": 0.5633, + "step": 3515 + }, + { + "epoch": 1.3896270684119536, + "grad_norm": 0.4817436014812288, + "learning_rate": 4.954589636104049e-06, + "loss": 0.5447, + "step": 3516 + }, + { + "epoch": 1.3900222277105458, + "grad_norm": 0.46133823688460374, + "learning_rate": 4.954559905487242e-06, + "loss": 0.5746, + "step": 3517 + }, + { + "epoch": 1.390417387009138, + "grad_norm": 0.44329286540723245, + "learning_rate": 4.954530165230415e-06, + "loss": 0.5564, + "step": 3518 + }, + { + "epoch": 1.3908125463077303, + "grad_norm": 0.43383834817026123, + "learning_rate": 4.954500415333684e-06, + "loss": 0.5645, + "step": 3519 + }, + { + "epoch": 1.3912077056063226, + "grad_norm": 0.4378169397587514, + "learning_rate": 4.954470655797165e-06, + "loss": 0.5463, + "step": 3520 + }, + { + "epoch": 1.3916028649049148, + "grad_norm": 0.4550593061487435, + "learning_rate": 4.954440886620977e-06, + "loss": 0.5516, + "step": 3521 + }, + { + "epoch": 1.391998024203507, + "grad_norm": 0.4400286853367591, + "learning_rate": 4.9544111078052345e-06, + "loss": 0.5699, + "step": 3522 + }, + { + "epoch": 1.3923931835020993, + "grad_norm": 0.45981608819949793, + "learning_rate": 4.954381319350056e-06, + "loss": 0.569, + "step": 3523 + }, + { + "epoch": 1.3927883428006915, + "grad_norm": 0.44060684429684627, + "learning_rate": 4.9543515212555585e-06, + "loss": 0.5732, + "step": 3524 + }, + { + "epoch": 1.3931835020992838, + "grad_norm": 0.4516861113642003, + "learning_rate": 4.954321713521858e-06, + "loss": 0.5519, + "step": 3525 + }, + { + "epoch": 1.393578661397876, + "grad_norm": 0.4616718149420385, + "learning_rate": 4.954291896149072e-06, + "loss": 0.5516, + "step": 3526 + }, + { + "epoch": 1.3939738206964682, + "grad_norm": 0.4452993405905123, + "learning_rate": 4.954262069137318e-06, + "loss": 0.5572, + "step": 3527 + }, + { + "epoch": 1.3943689799950605, + "grad_norm": 0.434383788130944, + "learning_rate": 4.9542322324867136e-06, + "loss": 0.5472, + "step": 3528 + }, + { + "epoch": 1.3947641392936527, + "grad_norm": 0.44451550141787594, + "learning_rate": 4.954202386197375e-06, + "loss": 0.5587, + "step": 3529 + }, + { + "epoch": 1.395159298592245, + "grad_norm": 0.4481545332604534, + "learning_rate": 4.954172530269419e-06, + "loss": 0.5513, + "step": 3530 + }, + { + "epoch": 1.3955544578908372, + "grad_norm": 0.4396843141846436, + "learning_rate": 4.954142664702963e-06, + "loss": 0.5759, + "step": 3531 + }, + { + "epoch": 1.3959496171894294, + "grad_norm": 0.44315777199892625, + "learning_rate": 4.954112789498126e-06, + "loss": 0.5745, + "step": 3532 + }, + { + "epoch": 1.3963447764880217, + "grad_norm": 0.463836596007923, + "learning_rate": 4.9540829046550245e-06, + "loss": 0.5844, + "step": 3533 + }, + { + "epoch": 1.396739935786614, + "grad_norm": 0.527126180105154, + "learning_rate": 4.954053010173774e-06, + "loss": 0.5632, + "step": 3534 + }, + { + "epoch": 1.3971350950852062, + "grad_norm": 0.471207695771745, + "learning_rate": 4.954023106054495e-06, + "loss": 0.5727, + "step": 3535 + }, + { + "epoch": 1.3975302543837984, + "grad_norm": 0.4547374119776966, + "learning_rate": 4.953993192297303e-06, + "loss": 0.5639, + "step": 3536 + }, + { + "epoch": 1.3979254136823906, + "grad_norm": 0.4567625911659694, + "learning_rate": 4.953963268902315e-06, + "loss": 0.5772, + "step": 3537 + }, + { + "epoch": 1.398320572980983, + "grad_norm": 0.4241364653534491, + "learning_rate": 4.953933335869651e-06, + "loss": 0.5501, + "step": 3538 + }, + { + "epoch": 1.3987157322795751, + "grad_norm": 0.45962400820648397, + "learning_rate": 4.9539033931994255e-06, + "loss": 0.5597, + "step": 3539 + }, + { + "epoch": 1.3991108915781674, + "grad_norm": 0.44479747715613505, + "learning_rate": 4.953873440891758e-06, + "loss": 0.567, + "step": 3540 + }, + { + "epoch": 1.3995060508767596, + "grad_norm": 0.44139630414780884, + "learning_rate": 4.953843478946766e-06, + "loss": 0.5611, + "step": 3541 + }, + { + "epoch": 1.399901210175352, + "grad_norm": 0.44038065639815877, + "learning_rate": 4.953813507364566e-06, + "loss": 0.5572, + "step": 3542 + }, + { + "epoch": 1.4002963694739443, + "grad_norm": 0.44652707139959935, + "learning_rate": 4.9537835261452785e-06, + "loss": 0.552, + "step": 3543 + }, + { + "epoch": 1.4006915287725366, + "grad_norm": 0.447649590870608, + "learning_rate": 4.953753535289017e-06, + "loss": 0.55, + "step": 3544 + }, + { + "epoch": 1.4010866880711288, + "grad_norm": 0.4544049896666161, + "learning_rate": 4.953723534795903e-06, + "loss": 0.5504, + "step": 3545 + }, + { + "epoch": 1.401481847369721, + "grad_norm": 0.466964841617532, + "learning_rate": 4.953693524666054e-06, + "loss": 0.5722, + "step": 3546 + }, + { + "epoch": 1.4018770066683133, + "grad_norm": 0.527230905472802, + "learning_rate": 4.953663504899585e-06, + "loss": 0.5524, + "step": 3547 + }, + { + "epoch": 1.4022721659669055, + "grad_norm": 0.440070467466554, + "learning_rate": 4.953633475496615e-06, + "loss": 0.5412, + "step": 3548 + }, + { + "epoch": 1.4026673252654978, + "grad_norm": 0.4364642690771747, + "learning_rate": 4.9536034364572645e-06, + "loss": 0.5551, + "step": 3549 + }, + { + "epoch": 1.40306248456409, + "grad_norm": 0.45824115876308913, + "learning_rate": 4.953573387781649e-06, + "loss": 0.5731, + "step": 3550 + }, + { + "epoch": 1.4034576438626822, + "grad_norm": 0.4559809654130625, + "learning_rate": 4.9535433294698865e-06, + "loss": 0.5735, + "step": 3551 + }, + { + "epoch": 1.4038528031612745, + "grad_norm": 0.4437519963912747, + "learning_rate": 4.9535132615220965e-06, + "loss": 0.5715, + "step": 3552 + }, + { + "epoch": 1.4042479624598667, + "grad_norm": 0.45338834415060414, + "learning_rate": 4.953483183938395e-06, + "loss": 0.5724, + "step": 3553 + }, + { + "epoch": 1.404643121758459, + "grad_norm": 0.4568209368624167, + "learning_rate": 4.953453096718903e-06, + "loss": 0.5581, + "step": 3554 + }, + { + "epoch": 1.4050382810570512, + "grad_norm": 0.43314731805578655, + "learning_rate": 4.953422999863736e-06, + "loss": 0.5379, + "step": 3555 + }, + { + "epoch": 1.4054334403556434, + "grad_norm": 0.45449692655904694, + "learning_rate": 4.953392893373015e-06, + "loss": 0.5739, + "step": 3556 + }, + { + "epoch": 1.4058285996542357, + "grad_norm": 0.4562200435387982, + "learning_rate": 4.953362777246855e-06, + "loss": 0.5643, + "step": 3557 + }, + { + "epoch": 1.406223758952828, + "grad_norm": 0.4440850256076502, + "learning_rate": 4.953332651485375e-06, + "loss": 0.5689, + "step": 3558 + }, + { + "epoch": 1.4066189182514202, + "grad_norm": 0.43280961705826465, + "learning_rate": 4.953302516088695e-06, + "loss": 0.5452, + "step": 3559 + }, + { + "epoch": 1.4070140775500124, + "grad_norm": 0.43383955128353996, + "learning_rate": 4.953272371056933e-06, + "loss": 0.5437, + "step": 3560 + }, + { + "epoch": 1.4074092368486046, + "grad_norm": 0.46587228167336536, + "learning_rate": 4.953242216390206e-06, + "loss": 0.5626, + "step": 3561 + }, + { + "epoch": 1.4078043961471969, + "grad_norm": 0.4627748388388923, + "learning_rate": 4.953212052088634e-06, + "loss": 0.5578, + "step": 3562 + }, + { + "epoch": 1.4081995554457891, + "grad_norm": 0.4494606350210151, + "learning_rate": 4.953181878152334e-06, + "loss": 0.559, + "step": 3563 + }, + { + "epoch": 1.4085947147443814, + "grad_norm": 0.4689503889787178, + "learning_rate": 4.953151694581425e-06, + "loss": 0.5853, + "step": 3564 + }, + { + "epoch": 1.4089898740429736, + "grad_norm": 0.425074031613252, + "learning_rate": 4.953121501376027e-06, + "loss": 0.5469, + "step": 3565 + }, + { + "epoch": 1.4093850333415658, + "grad_norm": 0.43908518140110436, + "learning_rate": 4.953091298536256e-06, + "loss": 0.5573, + "step": 3566 + }, + { + "epoch": 1.409780192640158, + "grad_norm": 0.456588536080327, + "learning_rate": 4.953061086062233e-06, + "loss": 0.5738, + "step": 3567 + }, + { + "epoch": 1.4101753519387503, + "grad_norm": 0.4407899689038523, + "learning_rate": 4.953030863954075e-06, + "loss": 0.5485, + "step": 3568 + }, + { + "epoch": 1.4105705112373426, + "grad_norm": 0.44983477733657745, + "learning_rate": 4.953000632211902e-06, + "loss": 0.5587, + "step": 3569 + }, + { + "epoch": 1.4109656705359348, + "grad_norm": 0.4444516608223304, + "learning_rate": 4.952970390835831e-06, + "loss": 0.56, + "step": 3570 + }, + { + "epoch": 1.411360829834527, + "grad_norm": 0.44234362530280924, + "learning_rate": 4.952940139825982e-06, + "loss": 0.58, + "step": 3571 + }, + { + "epoch": 1.4117559891331193, + "grad_norm": 0.4303479209786135, + "learning_rate": 4.952909879182475e-06, + "loss": 0.5487, + "step": 3572 + }, + { + "epoch": 1.4121511484317115, + "grad_norm": 0.4395260223716139, + "learning_rate": 4.952879608905427e-06, + "loss": 0.5697, + "step": 3573 + }, + { + "epoch": 1.4125463077303038, + "grad_norm": 0.43025433618329334, + "learning_rate": 4.952849328994957e-06, + "loss": 0.5622, + "step": 3574 + }, + { + "epoch": 1.412941467028896, + "grad_norm": 0.4425864856013525, + "learning_rate": 4.9528190394511835e-06, + "loss": 0.5472, + "step": 3575 + }, + { + "epoch": 1.4133366263274882, + "grad_norm": 0.4515220954726901, + "learning_rate": 4.9527887402742266e-06, + "loss": 0.5593, + "step": 3576 + }, + { + "epoch": 1.4137317856260805, + "grad_norm": 0.4435809550125933, + "learning_rate": 4.952758431464206e-06, + "loss": 0.5604, + "step": 3577 + }, + { + "epoch": 1.4141269449246727, + "grad_norm": 0.45887561975872704, + "learning_rate": 4.952728113021239e-06, + "loss": 0.5573, + "step": 3578 + }, + { + "epoch": 1.414522104223265, + "grad_norm": 0.4382166236259494, + "learning_rate": 4.952697784945445e-06, + "loss": 0.5767, + "step": 3579 + }, + { + "epoch": 1.4149172635218572, + "grad_norm": 0.43576829414699486, + "learning_rate": 4.952667447236944e-06, + "loss": 0.5485, + "step": 3580 + }, + { + "epoch": 1.4153124228204494, + "grad_norm": 0.6295771726774199, + "learning_rate": 4.952637099895854e-06, + "loss": 0.5594, + "step": 3581 + }, + { + "epoch": 1.4157075821190417, + "grad_norm": 0.44289940633979036, + "learning_rate": 4.952606742922296e-06, + "loss": 0.5492, + "step": 3582 + }, + { + "epoch": 1.416102741417634, + "grad_norm": 0.44005534309853256, + "learning_rate": 4.952576376316387e-06, + "loss": 0.5488, + "step": 3583 + }, + { + "epoch": 1.4164979007162262, + "grad_norm": 0.5156052999824663, + "learning_rate": 4.952546000078247e-06, + "loss": 0.5623, + "step": 3584 + }, + { + "epoch": 1.4168930600148184, + "grad_norm": 0.4720203506988176, + "learning_rate": 4.952515614207996e-06, + "loss": 0.5709, + "step": 3585 + }, + { + "epoch": 1.4172882193134106, + "grad_norm": 0.4403789400135003, + "learning_rate": 4.952485218705753e-06, + "loss": 0.5612, + "step": 3586 + }, + { + "epoch": 1.4176833786120029, + "grad_norm": 0.4346034283155917, + "learning_rate": 4.952454813571638e-06, + "loss": 0.5532, + "step": 3587 + }, + { + "epoch": 1.4180785379105951, + "grad_norm": 0.4914958621897474, + "learning_rate": 4.952424398805769e-06, + "loss": 0.5874, + "step": 3588 + }, + { + "epoch": 1.4184736972091874, + "grad_norm": 0.44158849976736936, + "learning_rate": 4.952393974408265e-06, + "loss": 0.558, + "step": 3589 + }, + { + "epoch": 1.4188688565077796, + "grad_norm": 0.47059542127549936, + "learning_rate": 4.952363540379248e-06, + "loss": 0.56, + "step": 3590 + }, + { + "epoch": 1.4192640158063718, + "grad_norm": 0.4428834340575035, + "learning_rate": 4.952333096718837e-06, + "loss": 0.5779, + "step": 3591 + }, + { + "epoch": 1.419659175104964, + "grad_norm": 0.4318718915548664, + "learning_rate": 4.952302643427149e-06, + "loss": 0.5545, + "step": 3592 + }, + { + "epoch": 1.4200543344035563, + "grad_norm": 0.4399415267342125, + "learning_rate": 4.952272180504306e-06, + "loss": 0.5605, + "step": 3593 + }, + { + "epoch": 1.4204494937021486, + "grad_norm": 0.44720706398972543, + "learning_rate": 4.952241707950427e-06, + "loss": 0.5722, + "step": 3594 + }, + { + "epoch": 1.4208446530007408, + "grad_norm": 0.43958222609485287, + "learning_rate": 4.9522112257656315e-06, + "loss": 0.5474, + "step": 3595 + }, + { + "epoch": 1.421239812299333, + "grad_norm": 0.4335981477530248, + "learning_rate": 4.952180733950039e-06, + "loss": 0.5406, + "step": 3596 + }, + { + "epoch": 1.4216349715979253, + "grad_norm": 0.4452745792193868, + "learning_rate": 4.952150232503771e-06, + "loss": 0.5774, + "step": 3597 + }, + { + "epoch": 1.4220301308965175, + "grad_norm": 0.450111748575881, + "learning_rate": 4.952119721426945e-06, + "loss": 0.564, + "step": 3598 + }, + { + "epoch": 1.4224252901951098, + "grad_norm": 0.4514178883673883, + "learning_rate": 4.952089200719682e-06, + "loss": 0.5702, + "step": 3599 + }, + { + "epoch": 1.422820449493702, + "grad_norm": 0.44869890712620886, + "learning_rate": 4.9520586703821006e-06, + "loss": 0.5695, + "step": 3600 + }, + { + "epoch": 1.4232156087922945, + "grad_norm": 0.47920918782737604, + "learning_rate": 4.952028130414322e-06, + "loss": 0.5538, + "step": 3601 + }, + { + "epoch": 1.4236107680908867, + "grad_norm": 0.4359999152882843, + "learning_rate": 4.951997580816466e-06, + "loss": 0.5561, + "step": 3602 + }, + { + "epoch": 1.424005927389479, + "grad_norm": 0.4603603266157431, + "learning_rate": 4.951967021588654e-06, + "loss": 0.5539, + "step": 3603 + }, + { + "epoch": 1.4244010866880712, + "grad_norm": 0.4470887413164905, + "learning_rate": 4.9519364527310035e-06, + "loss": 0.557, + "step": 3604 + }, + { + "epoch": 1.4247962459866634, + "grad_norm": 0.45851610322617087, + "learning_rate": 4.9519058742436345e-06, + "loss": 0.5501, + "step": 3605 + }, + { + "epoch": 1.4251914052852557, + "grad_norm": 0.47419186727157, + "learning_rate": 4.951875286126669e-06, + "loss": 0.577, + "step": 3606 + }, + { + "epoch": 1.425586564583848, + "grad_norm": 0.4495038740005354, + "learning_rate": 4.951844688380226e-06, + "loss": 0.5761, + "step": 3607 + }, + { + "epoch": 1.4259817238824402, + "grad_norm": 0.42780708612897866, + "learning_rate": 4.951814081004426e-06, + "loss": 0.5507, + "step": 3608 + }, + { + "epoch": 1.4263768831810324, + "grad_norm": 0.4420018249267614, + "learning_rate": 4.951783463999389e-06, + "loss": 0.5659, + "step": 3609 + }, + { + "epoch": 1.4267720424796246, + "grad_norm": 0.4321949197983691, + "learning_rate": 4.951752837365236e-06, + "loss": 0.5578, + "step": 3610 + }, + { + "epoch": 1.4271672017782169, + "grad_norm": 0.45648209967593256, + "learning_rate": 4.951722201102085e-06, + "loss": 0.5829, + "step": 3611 + }, + { + "epoch": 1.4275623610768091, + "grad_norm": 0.4448948600493821, + "learning_rate": 4.9516915552100594e-06, + "loss": 0.5672, + "step": 3612 + }, + { + "epoch": 1.4279575203754014, + "grad_norm": 0.4357660265947906, + "learning_rate": 4.951660899689278e-06, + "loss": 0.5473, + "step": 3613 + }, + { + "epoch": 1.4283526796739936, + "grad_norm": 0.4697567901251499, + "learning_rate": 4.951630234539861e-06, + "loss": 0.5793, + "step": 3614 + }, + { + "epoch": 1.4287478389725858, + "grad_norm": 0.4474239868294111, + "learning_rate": 4.951599559761929e-06, + "loss": 0.5559, + "step": 3615 + }, + { + "epoch": 1.429142998271178, + "grad_norm": 0.46638942204948125, + "learning_rate": 4.951568875355603e-06, + "loss": 0.5674, + "step": 3616 + }, + { + "epoch": 1.4295381575697703, + "grad_norm": 0.46201733877960605, + "learning_rate": 4.951538181321003e-06, + "loss": 0.565, + "step": 3617 + }, + { + "epoch": 1.4299333168683626, + "grad_norm": 0.4468591448162063, + "learning_rate": 4.9515074776582495e-06, + "loss": 0.5644, + "step": 3618 + }, + { + "epoch": 1.4303284761669548, + "grad_norm": 0.4563343700830998, + "learning_rate": 4.951476764367463e-06, + "loss": 0.5749, + "step": 3619 + }, + { + "epoch": 1.430723635465547, + "grad_norm": 0.43658851767260776, + "learning_rate": 4.951446041448765e-06, + "loss": 0.5736, + "step": 3620 + }, + { + "epoch": 1.4311187947641393, + "grad_norm": 0.4349415610360004, + "learning_rate": 4.951415308902275e-06, + "loss": 0.5596, + "step": 3621 + }, + { + "epoch": 1.4315139540627315, + "grad_norm": 0.462135397048311, + "learning_rate": 4.951384566728115e-06, + "loss": 0.5595, + "step": 3622 + }, + { + "epoch": 1.4319091133613238, + "grad_norm": 0.4543875910238101, + "learning_rate": 4.951353814926405e-06, + "loss": 0.5489, + "step": 3623 + }, + { + "epoch": 1.432304272659916, + "grad_norm": 0.4297719147230641, + "learning_rate": 4.951323053497265e-06, + "loss": 0.565, + "step": 3624 + }, + { + "epoch": 1.4326994319585082, + "grad_norm": 0.45442855434813045, + "learning_rate": 4.951292282440817e-06, + "loss": 0.5698, + "step": 3625 + }, + { + "epoch": 1.4330945912571005, + "grad_norm": 0.5225354098317048, + "learning_rate": 4.951261501757182e-06, + "loss": 0.5531, + "step": 3626 + }, + { + "epoch": 1.4334897505556927, + "grad_norm": 0.4420068105595844, + "learning_rate": 4.951230711446479e-06, + "loss": 0.5766, + "step": 3627 + }, + { + "epoch": 1.433884909854285, + "grad_norm": 0.44501142719182346, + "learning_rate": 4.951199911508831e-06, + "loss": 0.5568, + "step": 3628 + }, + { + "epoch": 1.4342800691528772, + "grad_norm": 0.5349036907902572, + "learning_rate": 4.951169101944358e-06, + "loss": 0.5508, + "step": 3629 + }, + { + "epoch": 1.4346752284514694, + "grad_norm": 0.46471714809625664, + "learning_rate": 4.951138282753181e-06, + "loss": 0.579, + "step": 3630 + }, + { + "epoch": 1.4350703877500617, + "grad_norm": 0.4425488778078077, + "learning_rate": 4.951107453935421e-06, + "loss": 0.5469, + "step": 3631 + }, + { + "epoch": 1.435465547048654, + "grad_norm": 0.4395384121942282, + "learning_rate": 4.951076615491201e-06, + "loss": 0.5469, + "step": 3632 + }, + { + "epoch": 1.4358607063472462, + "grad_norm": 0.45264206492150943, + "learning_rate": 4.9510457674206385e-06, + "loss": 0.5669, + "step": 3633 + }, + { + "epoch": 1.4362558656458384, + "grad_norm": 0.4359952505534556, + "learning_rate": 4.951014909723858e-06, + "loss": 0.5581, + "step": 3634 + }, + { + "epoch": 1.4366510249444306, + "grad_norm": 0.43031272641598334, + "learning_rate": 4.950984042400978e-06, + "loss": 0.5614, + "step": 3635 + }, + { + "epoch": 1.437046184243023, + "grad_norm": 0.44952139574949684, + "learning_rate": 4.9509531654521216e-06, + "loss": 0.564, + "step": 3636 + }, + { + "epoch": 1.4374413435416153, + "grad_norm": 0.4894458991545237, + "learning_rate": 4.950922278877409e-06, + "loss": 0.5373, + "step": 3637 + }, + { + "epoch": 1.4378365028402076, + "grad_norm": 0.4435614177631397, + "learning_rate": 4.950891382676963e-06, + "loss": 0.561, + "step": 3638 + }, + { + "epoch": 1.4382316621387998, + "grad_norm": 0.4556594196014496, + "learning_rate": 4.950860476850903e-06, + "loss": 0.5612, + "step": 3639 + }, + { + "epoch": 1.438626821437392, + "grad_norm": 0.43556778440010696, + "learning_rate": 4.9508295613993515e-06, + "loss": 0.5494, + "step": 3640 + }, + { + "epoch": 1.4390219807359843, + "grad_norm": 0.4521754341667638, + "learning_rate": 4.9507986363224305e-06, + "loss": 0.5526, + "step": 3641 + }, + { + "epoch": 1.4394171400345765, + "grad_norm": 0.439981945538117, + "learning_rate": 4.950767701620259e-06, + "loss": 0.548, + "step": 3642 + }, + { + "epoch": 1.4398122993331688, + "grad_norm": 0.4351939027647017, + "learning_rate": 4.950736757292962e-06, + "loss": 0.5698, + "step": 3643 + }, + { + "epoch": 1.440207458631761, + "grad_norm": 0.43466458525745466, + "learning_rate": 4.950705803340657e-06, + "loss": 0.5595, + "step": 3644 + }, + { + "epoch": 1.4406026179303533, + "grad_norm": 0.4482718750704879, + "learning_rate": 4.9506748397634695e-06, + "loss": 0.5714, + "step": 3645 + }, + { + "epoch": 1.4409977772289455, + "grad_norm": 0.4424752556047753, + "learning_rate": 4.9506438665615195e-06, + "loss": 0.5542, + "step": 3646 + }, + { + "epoch": 1.4413929365275377, + "grad_norm": 0.4455607912735244, + "learning_rate": 4.950612883734928e-06, + "loss": 0.5591, + "step": 3647 + }, + { + "epoch": 1.44178809582613, + "grad_norm": 0.44170634526996, + "learning_rate": 4.950581891283816e-06, + "loss": 0.5546, + "step": 3648 + }, + { + "epoch": 1.4421832551247222, + "grad_norm": 0.45300602248436533, + "learning_rate": 4.950550889208308e-06, + "loss": 0.5494, + "step": 3649 + }, + { + "epoch": 1.4425784144233145, + "grad_norm": 0.45683270516581864, + "learning_rate": 4.950519877508524e-06, + "loss": 0.5805, + "step": 3650 + }, + { + "epoch": 1.4429735737219067, + "grad_norm": 0.4527318497547736, + "learning_rate": 4.950488856184585e-06, + "loss": 0.5625, + "step": 3651 + }, + { + "epoch": 1.443368733020499, + "grad_norm": 0.4384333802721848, + "learning_rate": 4.950457825236615e-06, + "loss": 0.5624, + "step": 3652 + }, + { + "epoch": 1.4437638923190912, + "grad_norm": 0.48213960406408873, + "learning_rate": 4.950426784664734e-06, + "loss": 0.5796, + "step": 3653 + }, + { + "epoch": 1.4441590516176834, + "grad_norm": 0.4656284274871252, + "learning_rate": 4.950395734469065e-06, + "loss": 0.5488, + "step": 3654 + }, + { + "epoch": 1.4445542109162757, + "grad_norm": 0.4342682580148919, + "learning_rate": 4.950364674649729e-06, + "loss": 0.5465, + "step": 3655 + }, + { + "epoch": 1.444949370214868, + "grad_norm": 0.4371079121432642, + "learning_rate": 4.9503336052068485e-06, + "loss": 0.5622, + "step": 3656 + }, + { + "epoch": 1.4453445295134602, + "grad_norm": 0.43992242202525345, + "learning_rate": 4.9503025261405455e-06, + "loss": 0.5514, + "step": 3657 + }, + { + "epoch": 1.4457396888120524, + "grad_norm": 0.4414816507834958, + "learning_rate": 4.950271437450943e-06, + "loss": 0.5563, + "step": 3658 + }, + { + "epoch": 1.4461348481106446, + "grad_norm": 0.44005325021141295, + "learning_rate": 4.950240339138161e-06, + "loss": 0.5747, + "step": 3659 + }, + { + "epoch": 1.4465300074092369, + "grad_norm": 0.4421664700998617, + "learning_rate": 4.950209231202323e-06, + "loss": 0.5498, + "step": 3660 + }, + { + "epoch": 1.4469251667078291, + "grad_norm": 0.43113979255459345, + "learning_rate": 4.950178113643551e-06, + "loss": 0.5798, + "step": 3661 + }, + { + "epoch": 1.4473203260064214, + "grad_norm": 0.4708087923129291, + "learning_rate": 4.950146986461968e-06, + "loss": 0.5421, + "step": 3662 + }, + { + "epoch": 1.4477154853050136, + "grad_norm": 0.45141956435396696, + "learning_rate": 4.9501158496576945e-06, + "loss": 0.5613, + "step": 3663 + }, + { + "epoch": 1.4481106446036058, + "grad_norm": 0.45661303582282825, + "learning_rate": 4.950084703230854e-06, + "loss": 0.5821, + "step": 3664 + }, + { + "epoch": 1.448505803902198, + "grad_norm": 0.4467554956414753, + "learning_rate": 4.9500535471815696e-06, + "loss": 0.5614, + "step": 3665 + }, + { + "epoch": 1.4489009632007903, + "grad_norm": 0.4375691215444136, + "learning_rate": 4.950022381509961e-06, + "loss": 0.5546, + "step": 3666 + }, + { + "epoch": 1.4492961224993826, + "grad_norm": 0.44417059941621506, + "learning_rate": 4.949991206216152e-06, + "loss": 0.5474, + "step": 3667 + }, + { + "epoch": 1.4496912817979748, + "grad_norm": 0.5239384393901027, + "learning_rate": 4.949960021300267e-06, + "loss": 0.5724, + "step": 3668 + }, + { + "epoch": 1.450086441096567, + "grad_norm": 0.4600101751343761, + "learning_rate": 4.949928826762425e-06, + "loss": 0.5643, + "step": 3669 + }, + { + "epoch": 1.4504816003951593, + "grad_norm": 0.4398883196726322, + "learning_rate": 4.949897622602752e-06, + "loss": 0.5645, + "step": 3670 + }, + { + "epoch": 1.4508767596937515, + "grad_norm": 0.4243916986024868, + "learning_rate": 4.949866408821368e-06, + "loss": 0.5414, + "step": 3671 + }, + { + "epoch": 1.4512719189923438, + "grad_norm": 0.46429052461409404, + "learning_rate": 4.949835185418397e-06, + "loss": 0.5675, + "step": 3672 + }, + { + "epoch": 1.451667078290936, + "grad_norm": 0.43727146868324845, + "learning_rate": 4.94980395239396e-06, + "loss": 0.5411, + "step": 3673 + }, + { + "epoch": 1.4520622375895282, + "grad_norm": 0.44296006305796626, + "learning_rate": 4.94977270974818e-06, + "loss": 0.5706, + "step": 3674 + }, + { + "epoch": 1.4524573968881205, + "grad_norm": 0.43581736362129647, + "learning_rate": 4.949741457481182e-06, + "loss": 0.5579, + "step": 3675 + }, + { + "epoch": 1.4528525561867127, + "grad_norm": 0.43935390623902243, + "learning_rate": 4.949710195593087e-06, + "loss": 0.5522, + "step": 3676 + }, + { + "epoch": 1.453247715485305, + "grad_norm": 0.4604206853638542, + "learning_rate": 4.949678924084017e-06, + "loss": 0.5861, + "step": 3677 + }, + { + "epoch": 1.4536428747838972, + "grad_norm": 0.4666269167795469, + "learning_rate": 4.949647642954096e-06, + "loss": 0.5733, + "step": 3678 + }, + { + "epoch": 1.4540380340824894, + "grad_norm": 0.4425711605975332, + "learning_rate": 4.949616352203447e-06, + "loss": 0.5599, + "step": 3679 + }, + { + "epoch": 1.4544331933810817, + "grad_norm": 0.438259767331312, + "learning_rate": 4.949585051832192e-06, + "loss": 0.5512, + "step": 3680 + }, + { + "epoch": 1.454828352679674, + "grad_norm": 0.47194140077374513, + "learning_rate": 4.949553741840455e-06, + "loss": 0.5684, + "step": 3681 + }, + { + "epoch": 1.4552235119782662, + "grad_norm": 0.4581698913385663, + "learning_rate": 4.9495224222283576e-06, + "loss": 0.5731, + "step": 3682 + }, + { + "epoch": 1.4556186712768584, + "grad_norm": 0.4729218162827143, + "learning_rate": 4.949491092996024e-06, + "loss": 0.5581, + "step": 3683 + }, + { + "epoch": 1.4560138305754506, + "grad_norm": 0.4631393154498066, + "learning_rate": 4.9494597541435764e-06, + "loss": 0.5891, + "step": 3684 + }, + { + "epoch": 1.4564089898740429, + "grad_norm": 0.44412962722285376, + "learning_rate": 4.949428405671138e-06, + "loss": 0.5669, + "step": 3685 + }, + { + "epoch": 1.4568041491726351, + "grad_norm": 0.44251521812090155, + "learning_rate": 4.949397047578833e-06, + "loss": 0.5767, + "step": 3686 + }, + { + "epoch": 1.4571993084712274, + "grad_norm": 0.5568404678574649, + "learning_rate": 4.949365679866783e-06, + "loss": 0.5695, + "step": 3687 + }, + { + "epoch": 1.4575944677698196, + "grad_norm": 0.4597237274269838, + "learning_rate": 4.9493343025351125e-06, + "loss": 0.5841, + "step": 3688 + }, + { + "epoch": 1.4579896270684118, + "grad_norm": 0.433320088836893, + "learning_rate": 4.9493029155839435e-06, + "loss": 0.5549, + "step": 3689 + }, + { + "epoch": 1.458384786367004, + "grad_norm": 0.4408848451488535, + "learning_rate": 4.949271519013401e-06, + "loss": 0.5628, + "step": 3690 + }, + { + "epoch": 1.4587799456655963, + "grad_norm": 0.45052941581991246, + "learning_rate": 4.949240112823606e-06, + "loss": 0.5777, + "step": 3691 + }, + { + "epoch": 1.4591751049641886, + "grad_norm": 0.5775749617611324, + "learning_rate": 4.949208697014685e-06, + "loss": 0.5599, + "step": 3692 + }, + { + "epoch": 1.4595702642627808, + "grad_norm": 0.44576099623969545, + "learning_rate": 4.949177271586758e-06, + "loss": 0.5525, + "step": 3693 + }, + { + "epoch": 1.459965423561373, + "grad_norm": 0.44043843064525395, + "learning_rate": 4.94914583653995e-06, + "loss": 0.5702, + "step": 3694 + }, + { + "epoch": 1.4603605828599655, + "grad_norm": 0.47947522923316277, + "learning_rate": 4.9491143918743845e-06, + "loss": 0.5558, + "step": 3695 + }, + { + "epoch": 1.4607557421585577, + "grad_norm": 0.4592839139494569, + "learning_rate": 4.949082937590185e-06, + "loss": 0.5702, + "step": 3696 + }, + { + "epoch": 1.46115090145715, + "grad_norm": 0.45188516970792914, + "learning_rate": 4.949051473687475e-06, + "loss": 0.5528, + "step": 3697 + }, + { + "epoch": 1.4615460607557422, + "grad_norm": 2.2730337039449893, + "learning_rate": 4.949020000166378e-06, + "loss": 0.5541, + "step": 3698 + }, + { + "epoch": 1.4619412200543345, + "grad_norm": 0.4339995210953989, + "learning_rate": 4.948988517027017e-06, + "loss": 0.5729, + "step": 3699 + }, + { + "epoch": 1.4623363793529267, + "grad_norm": 0.4500818289037516, + "learning_rate": 4.948957024269516e-06, + "loss": 0.5613, + "step": 3700 + }, + { + "epoch": 1.462731538651519, + "grad_norm": 0.44742467459872376, + "learning_rate": 4.948925521894e-06, + "loss": 0.5351, + "step": 3701 + }, + { + "epoch": 1.4631266979501112, + "grad_norm": 0.4499910805435301, + "learning_rate": 4.948894009900591e-06, + "loss": 0.5574, + "step": 3702 + }, + { + "epoch": 1.4635218572487034, + "grad_norm": 0.4357059717410755, + "learning_rate": 4.948862488289413e-06, + "loss": 0.552, + "step": 3703 + }, + { + "epoch": 1.4639170165472957, + "grad_norm": 0.6496169252590506, + "learning_rate": 4.948830957060591e-06, + "loss": 0.5701, + "step": 3704 + }, + { + "epoch": 1.464312175845888, + "grad_norm": 0.6005901899539847, + "learning_rate": 4.948799416214247e-06, + "loss": 0.5604, + "step": 3705 + }, + { + "epoch": 1.4647073351444801, + "grad_norm": 0.45141656213054204, + "learning_rate": 4.9487678657505065e-06, + "loss": 0.573, + "step": 3706 + }, + { + "epoch": 1.4651024944430724, + "grad_norm": 0.46166246230338953, + "learning_rate": 4.948736305669494e-06, + "loss": 0.5826, + "step": 3707 + }, + { + "epoch": 1.4654976537416646, + "grad_norm": 0.4485874822386171, + "learning_rate": 4.9487047359713304e-06, + "loss": 0.5683, + "step": 3708 + }, + { + "epoch": 1.4658928130402569, + "grad_norm": 0.4504494694062807, + "learning_rate": 4.9486731566561416e-06, + "loss": 0.5597, + "step": 3709 + }, + { + "epoch": 1.466287972338849, + "grad_norm": 0.46237624966282265, + "learning_rate": 4.948641567724053e-06, + "loss": 0.5587, + "step": 3710 + }, + { + "epoch": 1.4666831316374414, + "grad_norm": 0.4412338330627608, + "learning_rate": 4.948609969175186e-06, + "loss": 0.58, + "step": 3711 + }, + { + "epoch": 1.4670782909360336, + "grad_norm": 0.46261210643628975, + "learning_rate": 4.9485783610096664e-06, + "loss": 0.5919, + "step": 3712 + }, + { + "epoch": 1.4674734502346258, + "grad_norm": 0.4570667260557455, + "learning_rate": 4.948546743227617e-06, + "loss": 0.5623, + "step": 3713 + }, + { + "epoch": 1.467868609533218, + "grad_norm": 0.43799780214279865, + "learning_rate": 4.948515115829164e-06, + "loss": 0.5526, + "step": 3714 + }, + { + "epoch": 1.4682637688318103, + "grad_norm": 0.4867222574376945, + "learning_rate": 4.9484834788144295e-06, + "loss": 0.5734, + "step": 3715 + }, + { + "epoch": 1.4686589281304026, + "grad_norm": 0.46745491220808105, + "learning_rate": 4.948451832183539e-06, + "loss": 0.5864, + "step": 3716 + }, + { + "epoch": 1.4690540874289948, + "grad_norm": 0.45356535226498157, + "learning_rate": 4.948420175936618e-06, + "loss": 0.563, + "step": 3717 + }, + { + "epoch": 1.469449246727587, + "grad_norm": 0.46464161309085605, + "learning_rate": 4.9483885100737875e-06, + "loss": 0.5494, + "step": 3718 + }, + { + "epoch": 1.4698444060261793, + "grad_norm": 0.4296514326477871, + "learning_rate": 4.9483568345951735e-06, + "loss": 0.561, + "step": 3719 + }, + { + "epoch": 1.4702395653247715, + "grad_norm": 0.4449958968332145, + "learning_rate": 4.948325149500902e-06, + "loss": 0.5767, + "step": 3720 + }, + { + "epoch": 1.4706347246233638, + "grad_norm": 0.44304606813408115, + "learning_rate": 4.948293454791095e-06, + "loss": 0.5768, + "step": 3721 + }, + { + "epoch": 1.471029883921956, + "grad_norm": 0.4438521427352953, + "learning_rate": 4.948261750465878e-06, + "loss": 0.5467, + "step": 3722 + }, + { + "epoch": 1.4714250432205482, + "grad_norm": 0.4377012072556724, + "learning_rate": 4.948230036525375e-06, + "loss": 0.5613, + "step": 3723 + }, + { + "epoch": 1.4718202025191405, + "grad_norm": 0.43727189711397746, + "learning_rate": 4.948198312969712e-06, + "loss": 0.5428, + "step": 3724 + }, + { + "epoch": 1.4722153618177327, + "grad_norm": 0.48964762021273595, + "learning_rate": 4.948166579799013e-06, + "loss": 0.5568, + "step": 3725 + }, + { + "epoch": 1.472610521116325, + "grad_norm": 0.5323132325510923, + "learning_rate": 4.948134837013402e-06, + "loss": 0.5578, + "step": 3726 + }, + { + "epoch": 1.4730056804149172, + "grad_norm": 0.4398364066065741, + "learning_rate": 4.948103084613003e-06, + "loss": 0.5609, + "step": 3727 + }, + { + "epoch": 1.4734008397135094, + "grad_norm": 0.4310398347472198, + "learning_rate": 4.948071322597943e-06, + "loss": 0.5651, + "step": 3728 + }, + { + "epoch": 1.4737959990121017, + "grad_norm": 0.539562325199801, + "learning_rate": 4.948039550968345e-06, + "loss": 0.5635, + "step": 3729 + }, + { + "epoch": 1.474191158310694, + "grad_norm": 0.4463656077676281, + "learning_rate": 4.948007769724333e-06, + "loss": 0.55, + "step": 3730 + }, + { + "epoch": 1.4745863176092864, + "grad_norm": 0.44864795312044087, + "learning_rate": 4.947975978866034e-06, + "loss": 0.5385, + "step": 3731 + }, + { + "epoch": 1.4749814769078786, + "grad_norm": 3.1516964419751026, + "learning_rate": 4.947944178393572e-06, + "loss": 0.582, + "step": 3732 + }, + { + "epoch": 1.4753766362064709, + "grad_norm": 0.46934564073210333, + "learning_rate": 4.947912368307071e-06, + "loss": 0.555, + "step": 3733 + }, + { + "epoch": 1.475771795505063, + "grad_norm": 0.46254224035417973, + "learning_rate": 4.9478805486066575e-06, + "loss": 0.5879, + "step": 3734 + }, + { + "epoch": 1.4761669548036553, + "grad_norm": 0.4469674187179735, + "learning_rate": 4.947848719292455e-06, + "loss": 0.554, + "step": 3735 + }, + { + "epoch": 1.4765621141022476, + "grad_norm": 0.45010349935865934, + "learning_rate": 4.947816880364589e-06, + "loss": 0.5465, + "step": 3736 + }, + { + "epoch": 1.4769572734008398, + "grad_norm": 0.4671526430772705, + "learning_rate": 4.9477850318231855e-06, + "loss": 0.5768, + "step": 3737 + }, + { + "epoch": 1.477352432699432, + "grad_norm": 0.44222712097266215, + "learning_rate": 4.947753173668368e-06, + "loss": 0.5647, + "step": 3738 + }, + { + "epoch": 1.4777475919980243, + "grad_norm": 0.4346426270801945, + "learning_rate": 4.947721305900263e-06, + "loss": 0.5883, + "step": 3739 + }, + { + "epoch": 1.4781427512966165, + "grad_norm": 0.4428949633974912, + "learning_rate": 4.947689428518994e-06, + "loss": 0.5585, + "step": 3740 + }, + { + "epoch": 1.4785379105952088, + "grad_norm": 0.45290050689849504, + "learning_rate": 4.947657541524689e-06, + "loss": 0.5698, + "step": 3741 + }, + { + "epoch": 1.478933069893801, + "grad_norm": 0.46047272522893107, + "learning_rate": 4.947625644917471e-06, + "loss": 0.5853, + "step": 3742 + }, + { + "epoch": 1.4793282291923933, + "grad_norm": 0.436358562052887, + "learning_rate": 4.9475937386974645e-06, + "loss": 0.5556, + "step": 3743 + }, + { + "epoch": 1.4797233884909855, + "grad_norm": 0.44605536186917205, + "learning_rate": 4.947561822864797e-06, + "loss": 0.5691, + "step": 3744 + }, + { + "epoch": 1.4801185477895777, + "grad_norm": 0.45340872193909043, + "learning_rate": 4.947529897419593e-06, + "loss": 0.5595, + "step": 3745 + }, + { + "epoch": 1.48051370708817, + "grad_norm": 0.44732431526653443, + "learning_rate": 4.947497962361977e-06, + "loss": 0.5469, + "step": 3746 + }, + { + "epoch": 1.4809088663867622, + "grad_norm": 0.43950554191375485, + "learning_rate": 4.947466017692075e-06, + "loss": 0.5493, + "step": 3747 + }, + { + "epoch": 1.4813040256853545, + "grad_norm": 0.451698847161343, + "learning_rate": 4.947434063410014e-06, + "loss": 0.5763, + "step": 3748 + }, + { + "epoch": 1.4816991849839467, + "grad_norm": 0.4383967173667518, + "learning_rate": 4.947402099515918e-06, + "loss": 0.5574, + "step": 3749 + }, + { + "epoch": 1.482094344282539, + "grad_norm": 0.46921309981868, + "learning_rate": 4.947370126009912e-06, + "loss": 0.5698, + "step": 3750 + }, + { + "epoch": 1.4824895035811312, + "grad_norm": 0.4576340583848198, + "learning_rate": 4.947338142892123e-06, + "loss": 0.5555, + "step": 3751 + }, + { + "epoch": 1.4828846628797234, + "grad_norm": 0.45431167387415017, + "learning_rate": 4.947306150162675e-06, + "loss": 0.5584, + "step": 3752 + }, + { + "epoch": 1.4832798221783157, + "grad_norm": 0.44151975322514186, + "learning_rate": 4.947274147821694e-06, + "loss": 0.5395, + "step": 3753 + }, + { + "epoch": 1.483674981476908, + "grad_norm": 0.4508179501579237, + "learning_rate": 4.947242135869308e-06, + "loss": 0.5629, + "step": 3754 + }, + { + "epoch": 1.4840701407755001, + "grad_norm": 0.44449630571041165, + "learning_rate": 4.94721011430564e-06, + "loss": 0.5543, + "step": 3755 + }, + { + "epoch": 1.4844653000740924, + "grad_norm": 0.44088055023127704, + "learning_rate": 4.947178083130817e-06, + "loss": 0.5525, + "step": 3756 + }, + { + "epoch": 1.4848604593726846, + "grad_norm": 0.4349780866471166, + "learning_rate": 4.947146042344964e-06, + "loss": 0.5431, + "step": 3757 + }, + { + "epoch": 1.4852556186712769, + "grad_norm": 0.4690057952385603, + "learning_rate": 4.947113991948207e-06, + "loss": 0.609, + "step": 3758 + }, + { + "epoch": 1.485650777969869, + "grad_norm": 0.4570722839955001, + "learning_rate": 4.947081931940673e-06, + "loss": 0.5589, + "step": 3759 + }, + { + "epoch": 1.4860459372684613, + "grad_norm": 0.44584022143653507, + "learning_rate": 4.9470498623224875e-06, + "loss": 0.5509, + "step": 3760 + }, + { + "epoch": 1.4864410965670536, + "grad_norm": 0.44483253459798316, + "learning_rate": 4.947017783093775e-06, + "loss": 0.5671, + "step": 3761 + }, + { + "epoch": 1.4868362558656458, + "grad_norm": 0.4637492181059857, + "learning_rate": 4.946985694254662e-06, + "loss": 0.5649, + "step": 3762 + }, + { + "epoch": 1.487231415164238, + "grad_norm": 0.4443774539885124, + "learning_rate": 4.946953595805277e-06, + "loss": 0.5664, + "step": 3763 + }, + { + "epoch": 1.4876265744628303, + "grad_norm": 0.4371331634851273, + "learning_rate": 4.946921487745743e-06, + "loss": 0.5585, + "step": 3764 + }, + { + "epoch": 1.4880217337614225, + "grad_norm": 0.44138169702671043, + "learning_rate": 4.9468893700761874e-06, + "loss": 0.5576, + "step": 3765 + }, + { + "epoch": 1.4884168930600148, + "grad_norm": 0.4525359288431999, + "learning_rate": 4.946857242796737e-06, + "loss": 0.5688, + "step": 3766 + }, + { + "epoch": 1.488812052358607, + "grad_norm": 0.4488955169238718, + "learning_rate": 4.946825105907516e-06, + "loss": 0.5708, + "step": 3767 + }, + { + "epoch": 1.4892072116571993, + "grad_norm": 0.453722892192494, + "learning_rate": 4.946792959408652e-06, + "loss": 0.574, + "step": 3768 + }, + { + "epoch": 1.4896023709557915, + "grad_norm": 0.4438539782138186, + "learning_rate": 4.9467608033002715e-06, + "loss": 0.5736, + "step": 3769 + }, + { + "epoch": 1.4899975302543838, + "grad_norm": 0.4734446349651656, + "learning_rate": 4.9467286375824995e-06, + "loss": 0.5587, + "step": 3770 + }, + { + "epoch": 1.490392689552976, + "grad_norm": 0.45483078234475843, + "learning_rate": 4.946696462255464e-06, + "loss": 0.5843, + "step": 3771 + }, + { + "epoch": 1.4907878488515682, + "grad_norm": 0.43491380893057063, + "learning_rate": 4.94666427731929e-06, + "loss": 0.5568, + "step": 3772 + }, + { + "epoch": 1.4911830081501605, + "grad_norm": 0.4593799609237185, + "learning_rate": 4.946632082774105e-06, + "loss": 0.568, + "step": 3773 + }, + { + "epoch": 1.4915781674487527, + "grad_norm": 0.4756685082546809, + "learning_rate": 4.946599878620034e-06, + "loss": 0.5525, + "step": 3774 + }, + { + "epoch": 1.491973326747345, + "grad_norm": 0.4375243311510868, + "learning_rate": 4.946567664857205e-06, + "loss": 0.5688, + "step": 3775 + }, + { + "epoch": 1.4923684860459372, + "grad_norm": 0.44023853517164957, + "learning_rate": 4.946535441485744e-06, + "loss": 0.5461, + "step": 3776 + }, + { + "epoch": 1.4927636453445294, + "grad_norm": 0.4417756181030543, + "learning_rate": 4.946503208505776e-06, + "loss": 0.5483, + "step": 3777 + }, + { + "epoch": 1.4931588046431217, + "grad_norm": 0.4459801582543887, + "learning_rate": 4.94647096591743e-06, + "loss": 0.5494, + "step": 3778 + }, + { + "epoch": 1.493553963941714, + "grad_norm": 0.44654308211555016, + "learning_rate": 4.9464387137208326e-06, + "loss": 0.5711, + "step": 3779 + }, + { + "epoch": 1.4939491232403062, + "grad_norm": 0.44171711692198834, + "learning_rate": 4.946406451916108e-06, + "loss": 0.5607, + "step": 3780 + }, + { + "epoch": 1.4943442825388984, + "grad_norm": 0.4390434383203796, + "learning_rate": 4.946374180503385e-06, + "loss": 0.5462, + "step": 3781 + }, + { + "epoch": 1.4947394418374906, + "grad_norm": 0.4548707333166944, + "learning_rate": 4.94634189948279e-06, + "loss": 0.554, + "step": 3782 + }, + { + "epoch": 1.4951346011360829, + "grad_norm": 0.4499388348643594, + "learning_rate": 4.946309608854449e-06, + "loss": 0.5705, + "step": 3783 + }, + { + "epoch": 1.4955297604346751, + "grad_norm": 0.4451819872634511, + "learning_rate": 4.94627730861849e-06, + "loss": 0.5653, + "step": 3784 + }, + { + "epoch": 1.4959249197332674, + "grad_norm": 0.4429157766074593, + "learning_rate": 4.946244998775039e-06, + "loss": 0.5591, + "step": 3785 + }, + { + "epoch": 1.4963200790318596, + "grad_norm": 0.45149504764423787, + "learning_rate": 4.946212679324222e-06, + "loss": 0.5531, + "step": 3786 + }, + { + "epoch": 1.4967152383304518, + "grad_norm": 0.45311480641045254, + "learning_rate": 4.946180350266168e-06, + "loss": 0.5738, + "step": 3787 + }, + { + "epoch": 1.497110397629044, + "grad_norm": 0.5577932771269037, + "learning_rate": 4.946148011601003e-06, + "loss": 0.5918, + "step": 3788 + }, + { + "epoch": 1.4975055569276363, + "grad_norm": 0.4442006706613438, + "learning_rate": 4.9461156633288535e-06, + "loss": 0.5573, + "step": 3789 + }, + { + "epoch": 1.4979007162262288, + "grad_norm": 0.448842435666703, + "learning_rate": 4.946083305449847e-06, + "loss": 0.5694, + "step": 3790 + }, + { + "epoch": 1.498295875524821, + "grad_norm": 0.47063189404977285, + "learning_rate": 4.946050937964112e-06, + "loss": 0.5663, + "step": 3791 + }, + { + "epoch": 1.4986910348234133, + "grad_norm": 0.4222521912580929, + "learning_rate": 4.946018560871772e-06, + "loss": 0.5333, + "step": 3792 + }, + { + "epoch": 1.4990861941220055, + "grad_norm": 0.708119310733299, + "learning_rate": 4.945986174172958e-06, + "loss": 0.5816, + "step": 3793 + }, + { + "epoch": 1.4994813534205977, + "grad_norm": 0.44863172461117107, + "learning_rate": 4.9459537778677955e-06, + "loss": 0.5848, + "step": 3794 + }, + { + "epoch": 1.49987651271919, + "grad_norm": 0.4411602360525524, + "learning_rate": 4.945921371956411e-06, + "loss": 0.5733, + "step": 3795 + }, + { + "epoch": 1.5002716720177822, + "grad_norm": 0.433024812052623, + "learning_rate": 4.945888956438933e-06, + "loss": 0.5644, + "step": 3796 + }, + { + "epoch": 1.5006668313163745, + "grad_norm": 0.5014185664609002, + "learning_rate": 4.945856531315489e-06, + "loss": 0.564, + "step": 3797 + }, + { + "epoch": 1.5010619906149667, + "grad_norm": 0.46228942218632185, + "learning_rate": 4.945824096586205e-06, + "loss": 0.569, + "step": 3798 + }, + { + "epoch": 1.501457149913559, + "grad_norm": 0.4623049423084713, + "learning_rate": 4.94579165225121e-06, + "loss": 0.5614, + "step": 3799 + }, + { + "epoch": 1.5018523092121512, + "grad_norm": 0.4294050033616448, + "learning_rate": 4.945759198310629e-06, + "loss": 0.5661, + "step": 3800 + }, + { + "epoch": 1.5022474685107434, + "grad_norm": 0.4383701048448765, + "learning_rate": 4.945726734764592e-06, + "loss": 0.5541, + "step": 3801 + }, + { + "epoch": 1.5026426278093357, + "grad_norm": 0.43965998205136625, + "learning_rate": 4.945694261613225e-06, + "loss": 0.5503, + "step": 3802 + }, + { + "epoch": 1.503037787107928, + "grad_norm": 0.43862817298801543, + "learning_rate": 4.945661778856658e-06, + "loss": 0.5793, + "step": 3803 + }, + { + "epoch": 1.5034329464065201, + "grad_norm": 0.45968905394437315, + "learning_rate": 4.945629286495014e-06, + "loss": 0.5836, + "step": 3804 + }, + { + "epoch": 1.5038281057051124, + "grad_norm": 0.43146289605158555, + "learning_rate": 4.945596784528425e-06, + "loss": 0.5553, + "step": 3805 + }, + { + "epoch": 1.5042232650037046, + "grad_norm": 0.4227325494043722, + "learning_rate": 4.945564272957016e-06, + "loss": 0.5467, + "step": 3806 + }, + { + "epoch": 1.5046184243022969, + "grad_norm": 0.45665440959944387, + "learning_rate": 4.945531751780915e-06, + "loss": 0.5825, + "step": 3807 + }, + { + "epoch": 1.505013583600889, + "grad_norm": 0.4370355896617275, + "learning_rate": 4.9454992210002515e-06, + "loss": 0.5757, + "step": 3808 + }, + { + "epoch": 1.5054087428994813, + "grad_norm": 0.44183906572444237, + "learning_rate": 4.9454666806151515e-06, + "loss": 0.5619, + "step": 3809 + }, + { + "epoch": 1.5058039021980736, + "grad_norm": 0.4214971459805865, + "learning_rate": 4.945434130625744e-06, + "loss": 0.5638, + "step": 3810 + }, + { + "epoch": 1.5061990614966658, + "grad_norm": 0.43677389141426, + "learning_rate": 4.945401571032156e-06, + "loss": 0.5626, + "step": 3811 + }, + { + "epoch": 1.506594220795258, + "grad_norm": 0.43865734507834114, + "learning_rate": 4.9453690018345144e-06, + "loss": 0.5353, + "step": 3812 + }, + { + "epoch": 1.5069893800938503, + "grad_norm": 0.5342126887725891, + "learning_rate": 4.945336423032949e-06, + "loss": 0.5695, + "step": 3813 + }, + { + "epoch": 1.5073845393924425, + "grad_norm": 0.4349734124143589, + "learning_rate": 4.945303834627587e-06, + "loss": 0.5418, + "step": 3814 + }, + { + "epoch": 1.5077796986910348, + "grad_norm": 0.4385647900898778, + "learning_rate": 4.945271236618557e-06, + "loss": 0.5658, + "step": 3815 + }, + { + "epoch": 1.508174857989627, + "grad_norm": 0.42971941105827866, + "learning_rate": 4.945238629005986e-06, + "loss": 0.5727, + "step": 3816 + }, + { + "epoch": 1.5085700172882193, + "grad_norm": 0.4454445834803687, + "learning_rate": 4.945206011790002e-06, + "loss": 0.582, + "step": 3817 + }, + { + "epoch": 1.5089651765868115, + "grad_norm": 0.4534020504734814, + "learning_rate": 4.945173384970734e-06, + "loss": 0.5659, + "step": 3818 + }, + { + "epoch": 1.509360335885404, + "grad_norm": 0.4615252795337526, + "learning_rate": 4.945140748548309e-06, + "loss": 0.5762, + "step": 3819 + }, + { + "epoch": 1.5097554951839962, + "grad_norm": 0.43867764806826415, + "learning_rate": 4.945108102522858e-06, + "loss": 0.5667, + "step": 3820 + }, + { + "epoch": 1.5101506544825885, + "grad_norm": 0.43752103633853423, + "learning_rate": 4.945075446894505e-06, + "loss": 0.5648, + "step": 3821 + }, + { + "epoch": 1.5105458137811807, + "grad_norm": 0.48724137180843774, + "learning_rate": 4.945042781663381e-06, + "loss": 0.5496, + "step": 3822 + }, + { + "epoch": 1.510940973079773, + "grad_norm": 0.4386933082638948, + "learning_rate": 4.945010106829614e-06, + "loss": 0.5613, + "step": 3823 + }, + { + "epoch": 1.5113361323783652, + "grad_norm": 0.5117191506598214, + "learning_rate": 4.944977422393332e-06, + "loss": 0.5537, + "step": 3824 + }, + { + "epoch": 1.5117312916769574, + "grad_norm": 0.43364355513059377, + "learning_rate": 4.944944728354663e-06, + "loss": 0.5655, + "step": 3825 + }, + { + "epoch": 1.5121264509755497, + "grad_norm": 0.4467784125266709, + "learning_rate": 4.9449120247137365e-06, + "loss": 0.5664, + "step": 3826 + }, + { + "epoch": 1.512521610274142, + "grad_norm": 0.5300397203257904, + "learning_rate": 4.944879311470679e-06, + "loss": 0.5769, + "step": 3827 + }, + { + "epoch": 1.5129167695727341, + "grad_norm": 0.4257934452475102, + "learning_rate": 4.944846588625621e-06, + "loss": 0.5436, + "step": 3828 + }, + { + "epoch": 1.5133119288713264, + "grad_norm": 0.4455795318956715, + "learning_rate": 4.94481385617869e-06, + "loss": 0.5385, + "step": 3829 + }, + { + "epoch": 1.5137070881699186, + "grad_norm": 0.45355241250172695, + "learning_rate": 4.944781114130015e-06, + "loss": 0.5721, + "step": 3830 + }, + { + "epoch": 1.5141022474685109, + "grad_norm": 0.44014963072857977, + "learning_rate": 4.944748362479723e-06, + "loss": 0.562, + "step": 3831 + }, + { + "epoch": 1.514497406767103, + "grad_norm": 0.42555153523264766, + "learning_rate": 4.9447156012279455e-06, + "loss": 0.5624, + "step": 3832 + }, + { + "epoch": 1.5148925660656953, + "grad_norm": 0.48013397168835886, + "learning_rate": 4.944682830374809e-06, + "loss": 0.5513, + "step": 3833 + }, + { + "epoch": 1.5152877253642876, + "grad_norm": 0.45697256603832853, + "learning_rate": 4.944650049920443e-06, + "loss": 0.5629, + "step": 3834 + }, + { + "epoch": 1.5156828846628798, + "grad_norm": 0.44411668965768414, + "learning_rate": 4.944617259864976e-06, + "loss": 0.5506, + "step": 3835 + }, + { + "epoch": 1.516078043961472, + "grad_norm": 0.4416796017067383, + "learning_rate": 4.944584460208537e-06, + "loss": 0.5536, + "step": 3836 + }, + { + "epoch": 1.5164732032600643, + "grad_norm": 0.4349355230140983, + "learning_rate": 4.944551650951255e-06, + "loss": 0.5599, + "step": 3837 + }, + { + "epoch": 1.5168683625586565, + "grad_norm": 0.42051333332637836, + "learning_rate": 4.944518832093258e-06, + "loss": 0.549, + "step": 3838 + }, + { + "epoch": 1.5172635218572488, + "grad_norm": 0.43470424455730705, + "learning_rate": 4.944486003634675e-06, + "loss": 0.5705, + "step": 3839 + }, + { + "epoch": 1.517658681155841, + "grad_norm": 0.4484568492506127, + "learning_rate": 4.944453165575635e-06, + "loss": 0.5527, + "step": 3840 + }, + { + "epoch": 1.5180538404544333, + "grad_norm": 0.43871369779576824, + "learning_rate": 4.944420317916269e-06, + "loss": 0.5572, + "step": 3841 + }, + { + "epoch": 1.5184489997530255, + "grad_norm": 0.4267428186392819, + "learning_rate": 4.944387460656703e-06, + "loss": 0.5602, + "step": 3842 + }, + { + "epoch": 1.5188441590516177, + "grad_norm": 0.43416355081762653, + "learning_rate": 4.9443545937970686e-06, + "loss": 0.567, + "step": 3843 + }, + { + "epoch": 1.51923931835021, + "grad_norm": 0.49590515295094667, + "learning_rate": 4.944321717337493e-06, + "loss": 0.5598, + "step": 3844 + }, + { + "epoch": 1.5196344776488022, + "grad_norm": 0.4626680777777951, + "learning_rate": 4.9442888312781056e-06, + "loss": 0.5583, + "step": 3845 + }, + { + "epoch": 1.5200296369473945, + "grad_norm": 0.4449293851147631, + "learning_rate": 4.944255935619036e-06, + "loss": 0.562, + "step": 3846 + }, + { + "epoch": 1.5204247962459867, + "grad_norm": 0.43950123990367407, + "learning_rate": 4.944223030360414e-06, + "loss": 0.5642, + "step": 3847 + }, + { + "epoch": 1.520819955544579, + "grad_norm": 0.44149298105890766, + "learning_rate": 4.9441901155023675e-06, + "loss": 0.5796, + "step": 3848 + }, + { + "epoch": 1.5212151148431712, + "grad_norm": 0.4482301026578816, + "learning_rate": 4.944157191045027e-06, + "loss": 0.5688, + "step": 3849 + }, + { + "epoch": 1.5216102741417634, + "grad_norm": 0.5488343962637722, + "learning_rate": 4.94412425698852e-06, + "loss": 0.5546, + "step": 3850 + }, + { + "epoch": 1.5220054334403557, + "grad_norm": 0.44100420516713945, + "learning_rate": 4.944091313332978e-06, + "loss": 0.5624, + "step": 3851 + }, + { + "epoch": 1.522400592738948, + "grad_norm": 0.42706076298730056, + "learning_rate": 4.94405836007853e-06, + "loss": 0.5365, + "step": 3852 + }, + { + "epoch": 1.5227957520375401, + "grad_norm": 0.4359259767312831, + "learning_rate": 4.944025397225304e-06, + "loss": 0.5411, + "step": 3853 + }, + { + "epoch": 1.5231909113361324, + "grad_norm": 0.4388004674269903, + "learning_rate": 4.943992424773431e-06, + "loss": 0.5655, + "step": 3854 + }, + { + "epoch": 1.5235860706347246, + "grad_norm": 0.4747569771286888, + "learning_rate": 4.943959442723039e-06, + "loss": 0.5583, + "step": 3855 + }, + { + "epoch": 1.5239812299333169, + "grad_norm": 0.4673031379896023, + "learning_rate": 4.943926451074258e-06, + "loss": 0.5663, + "step": 3856 + }, + { + "epoch": 1.524376389231909, + "grad_norm": 0.45989463226172245, + "learning_rate": 4.943893449827219e-06, + "loss": 0.554, + "step": 3857 + }, + { + "epoch": 1.5247715485305013, + "grad_norm": 0.4488593649799981, + "learning_rate": 4.94386043898205e-06, + "loss": 0.5733, + "step": 3858 + }, + { + "epoch": 1.5251667078290936, + "grad_norm": 0.4485345293642178, + "learning_rate": 4.943827418538882e-06, + "loss": 0.5603, + "step": 3859 + }, + { + "epoch": 1.5255618671276858, + "grad_norm": 0.4535620010441292, + "learning_rate": 4.943794388497842e-06, + "loss": 0.5548, + "step": 3860 + }, + { + "epoch": 1.525957026426278, + "grad_norm": 0.4735064699244355, + "learning_rate": 4.943761348859063e-06, + "loss": 0.5524, + "step": 3861 + }, + { + "epoch": 1.5263521857248703, + "grad_norm": 0.4513665077962669, + "learning_rate": 4.9437282996226734e-06, + "loss": 0.5677, + "step": 3862 + }, + { + "epoch": 1.5267473450234625, + "grad_norm": 0.4556372954566632, + "learning_rate": 4.943695240788803e-06, + "loss": 0.559, + "step": 3863 + }, + { + "epoch": 1.5271425043220548, + "grad_norm": 0.4597655079944917, + "learning_rate": 4.943662172357582e-06, + "loss": 0.5529, + "step": 3864 + }, + { + "epoch": 1.527537663620647, + "grad_norm": 0.45852506910423174, + "learning_rate": 4.943629094329139e-06, + "loss": 0.556, + "step": 3865 + }, + { + "epoch": 1.5279328229192393, + "grad_norm": 0.4475865214143378, + "learning_rate": 4.9435960067036045e-06, + "loss": 0.5638, + "step": 3866 + }, + { + "epoch": 1.5283279822178315, + "grad_norm": 0.44719416313388277, + "learning_rate": 4.943562909481109e-06, + "loss": 0.5652, + "step": 3867 + }, + { + "epoch": 1.5287231415164237, + "grad_norm": 0.4288620192017668, + "learning_rate": 4.943529802661783e-06, + "loss": 0.5436, + "step": 3868 + }, + { + "epoch": 1.529118300815016, + "grad_norm": 0.45692475861207277, + "learning_rate": 4.943496686245754e-06, + "loss": 0.5723, + "step": 3869 + }, + { + "epoch": 1.5295134601136082, + "grad_norm": 0.43936779754731264, + "learning_rate": 4.943463560233155e-06, + "loss": 0.5631, + "step": 3870 + }, + { + "epoch": 1.5299086194122005, + "grad_norm": 0.45634250616291433, + "learning_rate": 4.943430424624115e-06, + "loss": 0.5876, + "step": 3871 + }, + { + "epoch": 1.5303037787107927, + "grad_norm": 0.46250472654805214, + "learning_rate": 4.943397279418764e-06, + "loss": 0.5704, + "step": 3872 + }, + { + "epoch": 1.530698938009385, + "grad_norm": 0.4445910904786816, + "learning_rate": 4.943364124617232e-06, + "loss": 0.5762, + "step": 3873 + }, + { + "epoch": 1.5310940973079772, + "grad_norm": 0.4492442500667238, + "learning_rate": 4.9433309602196494e-06, + "loss": 0.5812, + "step": 3874 + }, + { + "epoch": 1.5314892566065694, + "grad_norm": 0.4375970310778139, + "learning_rate": 4.943297786226147e-06, + "loss": 0.5673, + "step": 3875 + }, + { + "epoch": 1.5318844159051617, + "grad_norm": 0.43961087035282753, + "learning_rate": 4.9432646026368535e-06, + "loss": 0.5671, + "step": 3876 + }, + { + "epoch": 1.532279575203754, + "grad_norm": 0.4415868685793641, + "learning_rate": 4.943231409451901e-06, + "loss": 0.5559, + "step": 3877 + }, + { + "epoch": 1.5326747345023461, + "grad_norm": 0.43726516852584973, + "learning_rate": 4.943198206671419e-06, + "loss": 0.5684, + "step": 3878 + }, + { + "epoch": 1.5330698938009384, + "grad_norm": 0.5804283001744497, + "learning_rate": 4.943164994295538e-06, + "loss": 0.5731, + "step": 3879 + }, + { + "epoch": 1.5334650530995306, + "grad_norm": 0.45583485277725394, + "learning_rate": 4.943131772324388e-06, + "loss": 0.5572, + "step": 3880 + }, + { + "epoch": 1.5338602123981229, + "grad_norm": 0.4399328351020496, + "learning_rate": 4.9430985407581e-06, + "loss": 0.5427, + "step": 3881 + }, + { + "epoch": 1.534255371696715, + "grad_norm": 0.42834809808516533, + "learning_rate": 4.943065299596806e-06, + "loss": 0.5681, + "step": 3882 + }, + { + "epoch": 1.5346505309953073, + "grad_norm": 0.5826505399905085, + "learning_rate": 4.943032048840633e-06, + "loss": 0.5551, + "step": 3883 + }, + { + "epoch": 1.5350456902938996, + "grad_norm": 0.4471917315958813, + "learning_rate": 4.942998788489715e-06, + "loss": 0.5762, + "step": 3884 + }, + { + "epoch": 1.5354408495924918, + "grad_norm": 0.44668514989336283, + "learning_rate": 4.94296551854418e-06, + "loss": 0.5721, + "step": 3885 + }, + { + "epoch": 1.535836008891084, + "grad_norm": 0.4631760060913264, + "learning_rate": 4.942932239004161e-06, + "loss": 0.5684, + "step": 3886 + }, + { + "epoch": 1.5362311681896763, + "grad_norm": 0.45324498503225535, + "learning_rate": 4.942898949869787e-06, + "loss": 0.5749, + "step": 3887 + }, + { + "epoch": 1.5366263274882686, + "grad_norm": 0.43838680281308623, + "learning_rate": 4.942865651141189e-06, + "loss": 0.5723, + "step": 3888 + }, + { + "epoch": 1.5370214867868608, + "grad_norm": 0.44695717165338794, + "learning_rate": 4.942832342818499e-06, + "loss": 0.58, + "step": 3889 + }, + { + "epoch": 1.5374166460854533, + "grad_norm": 0.42767213191242315, + "learning_rate": 4.942799024901846e-06, + "loss": 0.5508, + "step": 3890 + }, + { + "epoch": 1.5378118053840455, + "grad_norm": 0.440495140617439, + "learning_rate": 4.942765697391363e-06, + "loss": 0.5735, + "step": 3891 + }, + { + "epoch": 1.5382069646826377, + "grad_norm": 0.4607800617717655, + "learning_rate": 4.942732360287179e-06, + "loss": 0.5874, + "step": 3892 + }, + { + "epoch": 1.53860212398123, + "grad_norm": 0.44999899303277263, + "learning_rate": 4.942699013589425e-06, + "loss": 0.5793, + "step": 3893 + }, + { + "epoch": 1.5389972832798222, + "grad_norm": 0.4558760646566193, + "learning_rate": 4.942665657298233e-06, + "loss": 0.5714, + "step": 3894 + }, + { + "epoch": 1.5393924425784145, + "grad_norm": 0.43776240719293746, + "learning_rate": 4.9426322914137335e-06, + "loss": 0.5676, + "step": 3895 + }, + { + "epoch": 1.5397876018770067, + "grad_norm": 0.4437159939338884, + "learning_rate": 4.942598915936058e-06, + "loss": 0.5893, + "step": 3896 + }, + { + "epoch": 1.540182761175599, + "grad_norm": 0.4374941926171507, + "learning_rate": 4.942565530865337e-06, + "loss": 0.555, + "step": 3897 + }, + { + "epoch": 1.5405779204741912, + "grad_norm": 0.4208573300497547, + "learning_rate": 4.942532136201702e-06, + "loss": 0.5565, + "step": 3898 + }, + { + "epoch": 1.5409730797727834, + "grad_norm": 0.4337013789375931, + "learning_rate": 4.942498731945283e-06, + "loss": 0.5616, + "step": 3899 + }, + { + "epoch": 1.5413682390713757, + "grad_norm": 0.4527938862232506, + "learning_rate": 4.942465318096213e-06, + "loss": 0.556, + "step": 3900 + }, + { + "epoch": 1.541763398369968, + "grad_norm": 0.4427938210330589, + "learning_rate": 4.942431894654622e-06, + "loss": 0.5716, + "step": 3901 + }, + { + "epoch": 1.5421585576685601, + "grad_norm": 0.439043234576677, + "learning_rate": 4.942398461620642e-06, + "loss": 0.5627, + "step": 3902 + }, + { + "epoch": 1.5425537169671524, + "grad_norm": 0.4556850317164177, + "learning_rate": 4.942365018994404e-06, + "loss": 0.564, + "step": 3903 + }, + { + "epoch": 1.5429488762657446, + "grad_norm": 0.5109127133225726, + "learning_rate": 4.942331566776039e-06, + "loss": 0.5574, + "step": 3904 + }, + { + "epoch": 1.5433440355643369, + "grad_norm": 0.4363940727408752, + "learning_rate": 4.942298104965679e-06, + "loss": 0.575, + "step": 3905 + }, + { + "epoch": 1.543739194862929, + "grad_norm": 0.42938184413018377, + "learning_rate": 4.942264633563455e-06, + "loss": 0.5562, + "step": 3906 + }, + { + "epoch": 1.5441343541615213, + "grad_norm": 0.44370460437623127, + "learning_rate": 4.942231152569499e-06, + "loss": 0.5753, + "step": 3907 + }, + { + "epoch": 1.5445295134601136, + "grad_norm": 0.4367799811992645, + "learning_rate": 4.94219766198394e-06, + "loss": 0.5531, + "step": 3908 + }, + { + "epoch": 1.5449246727587058, + "grad_norm": 0.436026557215798, + "learning_rate": 4.942164161806914e-06, + "loss": 0.5641, + "step": 3909 + }, + { + "epoch": 1.545319832057298, + "grad_norm": 0.43686930660689016, + "learning_rate": 4.94213065203855e-06, + "loss": 0.5514, + "step": 3910 + }, + { + "epoch": 1.5457149913558903, + "grad_norm": 0.459912181031538, + "learning_rate": 4.942097132678978e-06, + "loss": 0.5717, + "step": 3911 + }, + { + "epoch": 1.5461101506544825, + "grad_norm": 0.432296005037305, + "learning_rate": 4.942063603728332e-06, + "loss": 0.5598, + "step": 3912 + }, + { + "epoch": 1.5465053099530748, + "grad_norm": 0.44580566374151687, + "learning_rate": 4.942030065186744e-06, + "loss": 0.5649, + "step": 3913 + }, + { + "epoch": 1.5469004692516672, + "grad_norm": 0.4362823236216351, + "learning_rate": 4.941996517054344e-06, + "loss": 0.5658, + "step": 3914 + }, + { + "epoch": 1.5472956285502595, + "grad_norm": 0.42956539131805577, + "learning_rate": 4.941962959331265e-06, + "loss": 0.564, + "step": 3915 + }, + { + "epoch": 1.5476907878488517, + "grad_norm": 0.445691619032847, + "learning_rate": 4.941929392017637e-06, + "loss": 0.5615, + "step": 3916 + }, + { + "epoch": 1.548085947147444, + "grad_norm": 0.43714883646638897, + "learning_rate": 4.9418958151135946e-06, + "loss": 0.5746, + "step": 3917 + }, + { + "epoch": 1.5484811064460362, + "grad_norm": 0.4574932956702348, + "learning_rate": 4.941862228619267e-06, + "loss": 0.5527, + "step": 3918 + }, + { + "epoch": 1.5488762657446284, + "grad_norm": 0.4530888175518143, + "learning_rate": 4.941828632534789e-06, + "loss": 0.5569, + "step": 3919 + }, + { + "epoch": 1.5492714250432207, + "grad_norm": 0.44308964244366394, + "learning_rate": 4.94179502686029e-06, + "loss": 0.5578, + "step": 3920 + }, + { + "epoch": 1.549666584341813, + "grad_norm": 0.4395901315241771, + "learning_rate": 4.941761411595903e-06, + "loss": 0.5559, + "step": 3921 + }, + { + "epoch": 1.5500617436404052, + "grad_norm": 0.4506494793506838, + "learning_rate": 4.94172778674176e-06, + "loss": 0.5751, + "step": 3922 + }, + { + "epoch": 1.5504569029389974, + "grad_norm": 0.43273605402011445, + "learning_rate": 4.9416941522979926e-06, + "loss": 0.5476, + "step": 3923 + }, + { + "epoch": 1.5508520622375896, + "grad_norm": 0.43415805767715054, + "learning_rate": 4.9416605082647325e-06, + "loss": 0.5693, + "step": 3924 + }, + { + "epoch": 1.5512472215361819, + "grad_norm": 0.4434283494921018, + "learning_rate": 4.941626854642114e-06, + "loss": 0.546, + "step": 3925 + }, + { + "epoch": 1.5516423808347741, + "grad_norm": 0.443351882260677, + "learning_rate": 4.941593191430267e-06, + "loss": 0.5511, + "step": 3926 + }, + { + "epoch": 1.5520375401333664, + "grad_norm": 0.4371115950925369, + "learning_rate": 4.941559518629325e-06, + "loss": 0.5655, + "step": 3927 + }, + { + "epoch": 1.5524326994319586, + "grad_norm": 0.4500093849359018, + "learning_rate": 4.94152583623942e-06, + "loss": 0.5556, + "step": 3928 + }, + { + "epoch": 1.5528278587305508, + "grad_norm": 0.4353898954337321, + "learning_rate": 4.941492144260683e-06, + "loss": 0.5571, + "step": 3929 + }, + { + "epoch": 1.553223018029143, + "grad_norm": 0.45540296825899435, + "learning_rate": 4.941458442693249e-06, + "loss": 0.5649, + "step": 3930 + }, + { + "epoch": 1.5536181773277353, + "grad_norm": 0.43590100396307097, + "learning_rate": 4.9414247315372474e-06, + "loss": 0.5592, + "step": 3931 + }, + { + "epoch": 1.5540133366263276, + "grad_norm": 0.4273362760763734, + "learning_rate": 4.9413910107928136e-06, + "loss": 0.5559, + "step": 3932 + }, + { + "epoch": 1.5544084959249198, + "grad_norm": 0.43873328194495376, + "learning_rate": 4.941357280460076e-06, + "loss": 0.5707, + "step": 3933 + }, + { + "epoch": 1.554803655223512, + "grad_norm": 0.44105964043710244, + "learning_rate": 4.941323540539171e-06, + "loss": 0.5683, + "step": 3934 + }, + { + "epoch": 1.5551988145221043, + "grad_norm": 0.436934179465898, + "learning_rate": 4.941289791030229e-06, + "loss": 0.5757, + "step": 3935 + }, + { + "epoch": 1.5555939738206965, + "grad_norm": 0.45737730913945995, + "learning_rate": 4.9412560319333844e-06, + "loss": 0.5711, + "step": 3936 + }, + { + "epoch": 1.5559891331192888, + "grad_norm": 0.42468924058156193, + "learning_rate": 4.941222263248767e-06, + "loss": 0.5478, + "step": 3937 + }, + { + "epoch": 1.556384292417881, + "grad_norm": 0.43195983842147717, + "learning_rate": 4.941188484976512e-06, + "loss": 0.5659, + "step": 3938 + }, + { + "epoch": 1.5567794517164733, + "grad_norm": 0.44375257055854056, + "learning_rate": 4.9411546971167505e-06, + "loss": 0.5867, + "step": 3939 + }, + { + "epoch": 1.5571746110150655, + "grad_norm": 0.43710025950059056, + "learning_rate": 4.941120899669616e-06, + "loss": 0.5735, + "step": 3940 + }, + { + "epoch": 1.5575697703136577, + "grad_norm": 0.4474954937975031, + "learning_rate": 4.94108709263524e-06, + "loss": 0.602, + "step": 3941 + }, + { + "epoch": 1.55796492961225, + "grad_norm": 0.4305607613140482, + "learning_rate": 4.941053276013758e-06, + "loss": 0.5548, + "step": 3942 + }, + { + "epoch": 1.5583600889108422, + "grad_norm": 0.4398028265526488, + "learning_rate": 4.941019449805299e-06, + "loss": 0.5495, + "step": 3943 + }, + { + "epoch": 1.5587552482094345, + "grad_norm": 0.427010931690335, + "learning_rate": 4.940985614009999e-06, + "loss": 0.561, + "step": 3944 + }, + { + "epoch": 1.5591504075080267, + "grad_norm": 0.43014008416807015, + "learning_rate": 4.9409517686279895e-06, + "loss": 0.5796, + "step": 3945 + }, + { + "epoch": 1.559545566806619, + "grad_norm": 0.4473068429218742, + "learning_rate": 4.940917913659404e-06, + "loss": 0.5594, + "step": 3946 + }, + { + "epoch": 1.5599407261052112, + "grad_norm": 0.44348060278334916, + "learning_rate": 4.940884049104374e-06, + "loss": 0.5603, + "step": 3947 + }, + { + "epoch": 1.5603358854038034, + "grad_norm": 0.4254990006538986, + "learning_rate": 4.940850174963035e-06, + "loss": 0.5496, + "step": 3948 + }, + { + "epoch": 1.5607310447023957, + "grad_norm": 0.4599229925834786, + "learning_rate": 4.9408162912355185e-06, + "loss": 0.569, + "step": 3949 + }, + { + "epoch": 1.561126204000988, + "grad_norm": 0.4608183190992476, + "learning_rate": 4.940782397921957e-06, + "loss": 0.5475, + "step": 3950 + }, + { + "epoch": 1.5615213632995801, + "grad_norm": 0.4334203860785032, + "learning_rate": 4.940748495022485e-06, + "loss": 0.5617, + "step": 3951 + }, + { + "epoch": 1.5619165225981724, + "grad_norm": 0.4286064511326075, + "learning_rate": 4.940714582537235e-06, + "loss": 0.5508, + "step": 3952 + }, + { + "epoch": 1.5623116818967646, + "grad_norm": 0.4451771504215982, + "learning_rate": 4.94068066046634e-06, + "loss": 0.5679, + "step": 3953 + }, + { + "epoch": 1.5627068411953569, + "grad_norm": 0.44950544874820486, + "learning_rate": 4.940646728809933e-06, + "loss": 0.5571, + "step": 3954 + }, + { + "epoch": 1.563102000493949, + "grad_norm": 0.43332984605668107, + "learning_rate": 4.940612787568148e-06, + "loss": 0.5412, + "step": 3955 + }, + { + "epoch": 1.5634971597925413, + "grad_norm": 0.4537080751951872, + "learning_rate": 4.940578836741119e-06, + "loss": 0.5922, + "step": 3956 + }, + { + "epoch": 1.5638923190911336, + "grad_norm": 0.45065137229775004, + "learning_rate": 4.940544876328977e-06, + "loss": 0.5761, + "step": 3957 + }, + { + "epoch": 1.5642874783897258, + "grad_norm": 0.468459520127438, + "learning_rate": 4.940510906331856e-06, + "loss": 0.5766, + "step": 3958 + }, + { + "epoch": 1.564682637688318, + "grad_norm": 0.4841953450723878, + "learning_rate": 4.940476926749892e-06, + "loss": 0.5509, + "step": 3959 + }, + { + "epoch": 1.5650777969869103, + "grad_norm": 0.43421269689782305, + "learning_rate": 4.940442937583216e-06, + "loss": 0.5428, + "step": 3960 + }, + { + "epoch": 1.5654729562855025, + "grad_norm": 0.44740726473549824, + "learning_rate": 4.940408938831962e-06, + "loss": 0.5543, + "step": 3961 + }, + { + "epoch": 1.5658681155840948, + "grad_norm": 0.43972490377414003, + "learning_rate": 4.9403749304962635e-06, + "loss": 0.5711, + "step": 3962 + }, + { + "epoch": 1.566263274882687, + "grad_norm": 0.42877178820545214, + "learning_rate": 4.940340912576254e-06, + "loss": 0.5674, + "step": 3963 + }, + { + "epoch": 1.5666584341812793, + "grad_norm": 0.4180015219456009, + "learning_rate": 4.940306885072067e-06, + "loss": 0.5465, + "step": 3964 + }, + { + "epoch": 1.5670535934798715, + "grad_norm": 0.44880071394905674, + "learning_rate": 4.940272847983837e-06, + "loss": 0.5638, + "step": 3965 + }, + { + "epoch": 1.5674487527784637, + "grad_norm": 0.4470580270494271, + "learning_rate": 4.9402388013116965e-06, + "loss": 0.5577, + "step": 3966 + }, + { + "epoch": 1.567843912077056, + "grad_norm": 0.46986742626918304, + "learning_rate": 4.94020474505578e-06, + "loss": 0.5754, + "step": 3967 + }, + { + "epoch": 1.5682390713756482, + "grad_norm": 0.4719511276473468, + "learning_rate": 4.940170679216222e-06, + "loss": 0.5465, + "step": 3968 + }, + { + "epoch": 1.5686342306742405, + "grad_norm": 0.4819185669692573, + "learning_rate": 4.940136603793154e-06, + "loss": 0.5903, + "step": 3969 + }, + { + "epoch": 1.5690293899728327, + "grad_norm": 0.4596392737276118, + "learning_rate": 4.940102518786711e-06, + "loss": 0.5827, + "step": 3970 + }, + { + "epoch": 1.569424549271425, + "grad_norm": 0.4350274926677015, + "learning_rate": 4.9400684241970285e-06, + "loss": 0.5732, + "step": 3971 + }, + { + "epoch": 1.5698197085700172, + "grad_norm": 0.43547065854272937, + "learning_rate": 4.940034320024237e-06, + "loss": 0.5587, + "step": 3972 + }, + { + "epoch": 1.5702148678686094, + "grad_norm": 0.44965772836569196, + "learning_rate": 4.940000206268474e-06, + "loss": 0.5757, + "step": 3973 + }, + { + "epoch": 1.5706100271672017, + "grad_norm": 0.42960226035038346, + "learning_rate": 4.939966082929872e-06, + "loss": 0.5411, + "step": 3974 + }, + { + "epoch": 1.571005186465794, + "grad_norm": 0.43654243807808146, + "learning_rate": 4.939931950008563e-06, + "loss": 0.5681, + "step": 3975 + }, + { + "epoch": 1.5714003457643861, + "grad_norm": 0.46158348779701347, + "learning_rate": 4.939897807504684e-06, + "loss": 0.5637, + "step": 3976 + }, + { + "epoch": 1.5717955050629784, + "grad_norm": 0.4506664040924429, + "learning_rate": 4.939863655418368e-06, + "loss": 0.5716, + "step": 3977 + }, + { + "epoch": 1.5721906643615706, + "grad_norm": 0.43736830707484653, + "learning_rate": 4.939829493749749e-06, + "loss": 0.5634, + "step": 3978 + }, + { + "epoch": 1.5725858236601629, + "grad_norm": 0.47956231415468115, + "learning_rate": 4.939795322498961e-06, + "loss": 0.5768, + "step": 3979 + }, + { + "epoch": 1.572980982958755, + "grad_norm": 0.47009053670758705, + "learning_rate": 4.93976114166614e-06, + "loss": 0.5739, + "step": 3980 + }, + { + "epoch": 1.5733761422573473, + "grad_norm": 0.4519666405286993, + "learning_rate": 4.9397269512514175e-06, + "loss": 0.5548, + "step": 3981 + }, + { + "epoch": 1.5737713015559396, + "grad_norm": 0.46600888709363636, + "learning_rate": 4.939692751254929e-06, + "loss": 0.5409, + "step": 3982 + }, + { + "epoch": 1.5741664608545318, + "grad_norm": 0.4362128507908923, + "learning_rate": 4.939658541676809e-06, + "loss": 0.557, + "step": 3983 + }, + { + "epoch": 1.574561620153124, + "grad_norm": 0.45311061929248453, + "learning_rate": 4.9396243225171916e-06, + "loss": 0.5534, + "step": 3984 + }, + { + "epoch": 1.5749567794517165, + "grad_norm": 0.4344985151811654, + "learning_rate": 4.939590093776211e-06, + "loss": 0.5646, + "step": 3985 + }, + { + "epoch": 1.5753519387503088, + "grad_norm": 0.4480209913301821, + "learning_rate": 4.939555855454003e-06, + "loss": 0.5561, + "step": 3986 + }, + { + "epoch": 1.575747098048901, + "grad_norm": 0.4531075976767362, + "learning_rate": 4.9395216075507e-06, + "loss": 0.565, + "step": 3987 + }, + { + "epoch": 1.5761422573474932, + "grad_norm": 0.4291612629531522, + "learning_rate": 4.939487350066438e-06, + "loss": 0.5561, + "step": 3988 + }, + { + "epoch": 1.5765374166460855, + "grad_norm": 0.4411400070808928, + "learning_rate": 4.93945308300135e-06, + "loss": 0.5722, + "step": 3989 + }, + { + "epoch": 1.5769325759446777, + "grad_norm": 0.4411690987818465, + "learning_rate": 4.939418806355573e-06, + "loss": 0.5744, + "step": 3990 + }, + { + "epoch": 1.57732773524327, + "grad_norm": 0.4234846841720286, + "learning_rate": 4.939384520129239e-06, + "loss": 0.5242, + "step": 3991 + }, + { + "epoch": 1.5777228945418622, + "grad_norm": 0.43794297369469665, + "learning_rate": 4.9393502243224844e-06, + "loss": 0.5702, + "step": 3992 + }, + { + "epoch": 1.5781180538404544, + "grad_norm": 0.4619795012466095, + "learning_rate": 4.9393159189354435e-06, + "loss": 0.5602, + "step": 3993 + }, + { + "epoch": 1.5785132131390467, + "grad_norm": 0.4248706656017066, + "learning_rate": 4.9392816039682516e-06, + "loss": 0.5593, + "step": 3994 + }, + { + "epoch": 1.578908372437639, + "grad_norm": 0.4581079886861394, + "learning_rate": 4.939247279421041e-06, + "loss": 0.5602, + "step": 3995 + }, + { + "epoch": 1.5793035317362312, + "grad_norm": 0.4305195033029715, + "learning_rate": 4.93921294529395e-06, + "loss": 0.5596, + "step": 3996 + }, + { + "epoch": 1.5796986910348234, + "grad_norm": 0.4366623407663496, + "learning_rate": 4.9391786015871106e-06, + "loss": 0.5597, + "step": 3997 + }, + { + "epoch": 1.5800938503334157, + "grad_norm": 0.44069624259394685, + "learning_rate": 4.939144248300659e-06, + "loss": 0.5897, + "step": 3998 + }, + { + "epoch": 1.580489009632008, + "grad_norm": 0.43890619357978694, + "learning_rate": 4.939109885434731e-06, + "loss": 0.569, + "step": 3999 + }, + { + "epoch": 1.5808841689306001, + "grad_norm": 0.4364576916674103, + "learning_rate": 4.939075512989459e-06, + "loss": 0.5787, + "step": 4000 + }, + { + "epoch": 1.5812793282291924, + "grad_norm": 0.45319416604702467, + "learning_rate": 4.93904113096498e-06, + "loss": 0.5832, + "step": 4001 + }, + { + "epoch": 1.5816744875277846, + "grad_norm": 0.4408144136335299, + "learning_rate": 4.939006739361429e-06, + "loss": 0.5757, + "step": 4002 + }, + { + "epoch": 1.5820696468263769, + "grad_norm": 0.43847351893891434, + "learning_rate": 4.93897233817894e-06, + "loss": 0.57, + "step": 4003 + }, + { + "epoch": 1.582464806124969, + "grad_norm": 0.4272786242930415, + "learning_rate": 4.9389379274176485e-06, + "loss": 0.561, + "step": 4004 + }, + { + "epoch": 1.5828599654235613, + "grad_norm": 0.4273445360462422, + "learning_rate": 4.93890350707769e-06, + "loss": 0.5543, + "step": 4005 + }, + { + "epoch": 1.5832551247221536, + "grad_norm": 0.43334855462016986, + "learning_rate": 4.9388690771592e-06, + "loss": 0.5632, + "step": 4006 + }, + { + "epoch": 1.5836502840207458, + "grad_norm": 0.4412911003838902, + "learning_rate": 4.938834637662313e-06, + "loss": 0.5717, + "step": 4007 + }, + { + "epoch": 1.584045443319338, + "grad_norm": 0.4384873516991556, + "learning_rate": 4.9388001885871635e-06, + "loss": 0.5569, + "step": 4008 + }, + { + "epoch": 1.5844406026179305, + "grad_norm": 0.4357385688585848, + "learning_rate": 4.938765729933889e-06, + "loss": 0.5656, + "step": 4009 + }, + { + "epoch": 1.5848357619165228, + "grad_norm": 0.43820335981947794, + "learning_rate": 4.938731261702624e-06, + "loss": 0.5726, + "step": 4010 + }, + { + "epoch": 1.585230921215115, + "grad_norm": 0.4369840237848306, + "learning_rate": 4.938696783893502e-06, + "loss": 0.5543, + "step": 4011 + }, + { + "epoch": 1.5856260805137072, + "grad_norm": 0.4472393204243838, + "learning_rate": 4.938662296506661e-06, + "loss": 0.5565, + "step": 4012 + }, + { + "epoch": 1.5860212398122995, + "grad_norm": 0.4615011233799181, + "learning_rate": 4.938627799542235e-06, + "loss": 0.5587, + "step": 4013 + }, + { + "epoch": 1.5864163991108917, + "grad_norm": 0.42914755332494525, + "learning_rate": 4.93859329300036e-06, + "loss": 0.5551, + "step": 4014 + }, + { + "epoch": 1.586811558409484, + "grad_norm": 0.6994500397802149, + "learning_rate": 4.93855877688117e-06, + "loss": 0.5641, + "step": 4015 + }, + { + "epoch": 1.5872067177080762, + "grad_norm": 0.4539471102905313, + "learning_rate": 4.938524251184803e-06, + "loss": 0.5772, + "step": 4016 + }, + { + "epoch": 1.5876018770066684, + "grad_norm": 0.4737130066010192, + "learning_rate": 4.938489715911394e-06, + "loss": 0.5753, + "step": 4017 + }, + { + "epoch": 1.5879970363052607, + "grad_norm": 0.4459029139930523, + "learning_rate": 4.938455171061077e-06, + "loss": 0.5774, + "step": 4018 + }, + { + "epoch": 1.588392195603853, + "grad_norm": 0.4434353397507251, + "learning_rate": 4.93842061663399e-06, + "loss": 0.5581, + "step": 4019 + }, + { + "epoch": 1.5887873549024452, + "grad_norm": 0.43023249449874645, + "learning_rate": 4.938386052630267e-06, + "loss": 0.5429, + "step": 4020 + }, + { + "epoch": 1.5891825142010374, + "grad_norm": 0.45310635393738163, + "learning_rate": 4.938351479050044e-06, + "loss": 0.5674, + "step": 4021 + }, + { + "epoch": 1.5895776734996296, + "grad_norm": 0.45352464255321795, + "learning_rate": 4.938316895893458e-06, + "loss": 0.5661, + "step": 4022 + }, + { + "epoch": 1.5899728327982219, + "grad_norm": 0.4532822555467358, + "learning_rate": 4.938282303160643e-06, + "loss": 0.5515, + "step": 4023 + }, + { + "epoch": 1.5903679920968141, + "grad_norm": 0.44533687524094095, + "learning_rate": 4.938247700851735e-06, + "loss": 0.5511, + "step": 4024 + }, + { + "epoch": 1.5907631513954064, + "grad_norm": 0.45327422167542913, + "learning_rate": 4.938213088966872e-06, + "loss": 0.5691, + "step": 4025 + }, + { + "epoch": 1.5911583106939986, + "grad_norm": 0.43728559584246246, + "learning_rate": 4.938178467506187e-06, + "loss": 0.5565, + "step": 4026 + }, + { + "epoch": 1.5915534699925908, + "grad_norm": 0.4393345012809165, + "learning_rate": 4.938143836469818e-06, + "loss": 0.5432, + "step": 4027 + }, + { + "epoch": 1.591948629291183, + "grad_norm": 0.4446012831175294, + "learning_rate": 4.938109195857902e-06, + "loss": 0.5782, + "step": 4028 + }, + { + "epoch": 1.5923437885897753, + "grad_norm": 0.4476104890005792, + "learning_rate": 4.938074545670573e-06, + "loss": 0.58, + "step": 4029 + }, + { + "epoch": 1.5927389478883676, + "grad_norm": 0.4469302927676119, + "learning_rate": 4.938039885907967e-06, + "loss": 0.582, + "step": 4030 + }, + { + "epoch": 1.5931341071869598, + "grad_norm": 0.4326506856607228, + "learning_rate": 4.938005216570221e-06, + "loss": 0.5587, + "step": 4031 + }, + { + "epoch": 1.593529266485552, + "grad_norm": 0.45215823190175686, + "learning_rate": 4.9379705376574705e-06, + "loss": 0.5416, + "step": 4032 + }, + { + "epoch": 1.5939244257841443, + "grad_norm": 0.4604923926371955, + "learning_rate": 4.937935849169853e-06, + "loss": 0.5814, + "step": 4033 + }, + { + "epoch": 1.5943195850827365, + "grad_norm": 0.4483793169947381, + "learning_rate": 4.937901151107504e-06, + "loss": 0.5606, + "step": 4034 + }, + { + "epoch": 1.5947147443813288, + "grad_norm": 0.4731607560798562, + "learning_rate": 4.937866443470558e-06, + "loss": 0.5616, + "step": 4035 + }, + { + "epoch": 1.595109903679921, + "grad_norm": 0.44139056004958066, + "learning_rate": 4.9378317262591545e-06, + "loss": 0.5704, + "step": 4036 + }, + { + "epoch": 1.5955050629785132, + "grad_norm": 0.43225652789170976, + "learning_rate": 4.937796999473427e-06, + "loss": 0.545, + "step": 4037 + }, + { + "epoch": 1.5959002222771055, + "grad_norm": 0.4376950142264724, + "learning_rate": 4.937762263113515e-06, + "loss": 0.5607, + "step": 4038 + }, + { + "epoch": 1.5962953815756977, + "grad_norm": 0.43266642151648355, + "learning_rate": 4.937727517179552e-06, + "loss": 0.5734, + "step": 4039 + }, + { + "epoch": 1.59669054087429, + "grad_norm": 0.4491269605040788, + "learning_rate": 4.9376927616716764e-06, + "loss": 0.5593, + "step": 4040 + }, + { + "epoch": 1.5970857001728822, + "grad_norm": 0.4339310588220914, + "learning_rate": 4.937657996590023e-06, + "loss": 0.5706, + "step": 4041 + }, + { + "epoch": 1.5974808594714744, + "grad_norm": 0.4300915782723815, + "learning_rate": 4.93762322193473e-06, + "loss": 0.549, + "step": 4042 + }, + { + "epoch": 1.5978760187700667, + "grad_norm": 0.4349429575023993, + "learning_rate": 4.937588437705933e-06, + "loss": 0.5673, + "step": 4043 + }, + { + "epoch": 1.598271178068659, + "grad_norm": 0.43087812609014015, + "learning_rate": 4.937553643903768e-06, + "loss": 0.5514, + "step": 4044 + }, + { + "epoch": 1.5986663373672512, + "grad_norm": 0.4497997607400426, + "learning_rate": 4.937518840528373e-06, + "loss": 0.5593, + "step": 4045 + }, + { + "epoch": 1.5990614966658434, + "grad_norm": 0.44562506368324917, + "learning_rate": 4.937484027579883e-06, + "loss": 0.5587, + "step": 4046 + }, + { + "epoch": 1.5994566559644356, + "grad_norm": 0.43652547554219967, + "learning_rate": 4.937449205058438e-06, + "loss": 0.58, + "step": 4047 + }, + { + "epoch": 1.5998518152630279, + "grad_norm": 0.4240519663611341, + "learning_rate": 4.937414372964171e-06, + "loss": 0.5486, + "step": 4048 + }, + { + "epoch": 1.6002469745616201, + "grad_norm": 0.45799730349541395, + "learning_rate": 4.937379531297221e-06, + "loss": 0.5567, + "step": 4049 + }, + { + "epoch": 1.6006421338602124, + "grad_norm": 0.4292261145352039, + "learning_rate": 4.937344680057724e-06, + "loss": 0.5505, + "step": 4050 + }, + { + "epoch": 1.6010372931588046, + "grad_norm": 0.42937598054316073, + "learning_rate": 4.937309819245818e-06, + "loss": 0.5601, + "step": 4051 + }, + { + "epoch": 1.6014324524573968, + "grad_norm": 0.4380122077896125, + "learning_rate": 4.937274948861638e-06, + "loss": 0.5608, + "step": 4052 + }, + { + "epoch": 1.601827611755989, + "grad_norm": 0.4548432558319798, + "learning_rate": 4.937240068905322e-06, + "loss": 0.5685, + "step": 4053 + }, + { + "epoch": 1.6022227710545813, + "grad_norm": 0.43181199483513577, + "learning_rate": 4.937205179377008e-06, + "loss": 0.5646, + "step": 4054 + }, + { + "epoch": 1.6026179303531736, + "grad_norm": 0.43979884745381725, + "learning_rate": 4.937170280276831e-06, + "loss": 0.582, + "step": 4055 + }, + { + "epoch": 1.6030130896517658, + "grad_norm": 0.428767044050838, + "learning_rate": 4.93713537160493e-06, + "loss": 0.545, + "step": 4056 + }, + { + "epoch": 1.603408248950358, + "grad_norm": 0.4444074071525543, + "learning_rate": 4.9371004533614395e-06, + "loss": 0.5357, + "step": 4057 + }, + { + "epoch": 1.6038034082489503, + "grad_norm": 0.4305444155233475, + "learning_rate": 4.9370655255465e-06, + "loss": 0.5483, + "step": 4058 + }, + { + "epoch": 1.6041985675475425, + "grad_norm": 0.44066258955092896, + "learning_rate": 4.937030588160247e-06, + "loss": 0.5631, + "step": 4059 + }, + { + "epoch": 1.6045937268461348, + "grad_norm": 0.4487310628582567, + "learning_rate": 4.936995641202816e-06, + "loss": 0.5761, + "step": 4060 + }, + { + "epoch": 1.604988886144727, + "grad_norm": 0.44499130482437343, + "learning_rate": 4.936960684674348e-06, + "loss": 0.5692, + "step": 4061 + }, + { + "epoch": 1.6053840454433193, + "grad_norm": 0.4412506035853422, + "learning_rate": 4.9369257185749766e-06, + "loss": 0.5496, + "step": 4062 + }, + { + "epoch": 1.6057792047419115, + "grad_norm": 0.4359722621598534, + "learning_rate": 4.936890742904842e-06, + "loss": 0.5493, + "step": 4063 + }, + { + "epoch": 1.6061743640405037, + "grad_norm": 0.4340637645593551, + "learning_rate": 4.936855757664079e-06, + "loss": 0.5566, + "step": 4064 + }, + { + "epoch": 1.606569523339096, + "grad_norm": 0.4289469251048877, + "learning_rate": 4.936820762852827e-06, + "loss": 0.5362, + "step": 4065 + }, + { + "epoch": 1.6069646826376882, + "grad_norm": 0.4319109469211228, + "learning_rate": 4.936785758471223e-06, + "loss": 0.5601, + "step": 4066 + }, + { + "epoch": 1.6073598419362805, + "grad_norm": 0.4476806397110603, + "learning_rate": 4.936750744519404e-06, + "loss": 0.5868, + "step": 4067 + }, + { + "epoch": 1.6077550012348727, + "grad_norm": 0.4386152539571968, + "learning_rate": 4.936715720997508e-06, + "loss": 0.5643, + "step": 4068 + }, + { + "epoch": 1.608150160533465, + "grad_norm": 0.4279438753027654, + "learning_rate": 4.936680687905673e-06, + "loss": 0.5575, + "step": 4069 + }, + { + "epoch": 1.6085453198320572, + "grad_norm": 0.4326188532708726, + "learning_rate": 4.936645645244034e-06, + "loss": 0.5598, + "step": 4070 + }, + { + "epoch": 1.6089404791306494, + "grad_norm": 0.44654932545961695, + "learning_rate": 4.936610593012732e-06, + "loss": 0.5678, + "step": 4071 + }, + { + "epoch": 1.6093356384292417, + "grad_norm": 0.4350127525034427, + "learning_rate": 4.936575531211902e-06, + "loss": 0.552, + "step": 4072 + }, + { + "epoch": 1.609730797727834, + "grad_norm": 0.4334061095799106, + "learning_rate": 4.936540459841684e-06, + "loss": 0.5631, + "step": 4073 + }, + { + "epoch": 1.6101259570264261, + "grad_norm": 0.4380393010701646, + "learning_rate": 4.9365053789022145e-06, + "loss": 0.5505, + "step": 4074 + }, + { + "epoch": 1.6105211163250184, + "grad_norm": 0.44462390814852865, + "learning_rate": 4.936470288393631e-06, + "loss": 0.548, + "step": 4075 + }, + { + "epoch": 1.6109162756236106, + "grad_norm": 0.4391814847764043, + "learning_rate": 4.936435188316071e-06, + "loss": 0.5508, + "step": 4076 + }, + { + "epoch": 1.6113114349222029, + "grad_norm": 0.45314650675562296, + "learning_rate": 4.936400078669674e-06, + "loss": 0.5763, + "step": 4077 + }, + { + "epoch": 1.611706594220795, + "grad_norm": 0.42859513707939456, + "learning_rate": 4.936364959454577e-06, + "loss": 0.5534, + "step": 4078 + }, + { + "epoch": 1.6121017535193873, + "grad_norm": 0.43651873348895415, + "learning_rate": 4.936329830670918e-06, + "loss": 0.5539, + "step": 4079 + }, + { + "epoch": 1.6124969128179798, + "grad_norm": 0.4464140156441266, + "learning_rate": 4.9362946923188345e-06, + "loss": 0.5639, + "step": 4080 + }, + { + "epoch": 1.612892072116572, + "grad_norm": 0.43474468854326337, + "learning_rate": 4.936259544398465e-06, + "loss": 0.567, + "step": 4081 + }, + { + "epoch": 1.6132872314151643, + "grad_norm": 0.446752616186375, + "learning_rate": 4.936224386909947e-06, + "loss": 0.5701, + "step": 4082 + }, + { + "epoch": 1.6136823907137565, + "grad_norm": 0.4281198845056971, + "learning_rate": 4.93618921985342e-06, + "loss": 0.5582, + "step": 4083 + }, + { + "epoch": 1.6140775500123488, + "grad_norm": 0.4460784589294146, + "learning_rate": 4.93615404322902e-06, + "loss": 0.5565, + "step": 4084 + }, + { + "epoch": 1.614472709310941, + "grad_norm": 0.4488735337458113, + "learning_rate": 4.936118857036887e-06, + "loss": 0.5498, + "step": 4085 + }, + { + "epoch": 1.6148678686095332, + "grad_norm": 0.4364780045470906, + "learning_rate": 4.936083661277158e-06, + "loss": 0.5603, + "step": 4086 + }, + { + "epoch": 1.6152630279081255, + "grad_norm": 0.4263849821332742, + "learning_rate": 4.936048455949971e-06, + "loss": 0.5407, + "step": 4087 + }, + { + "epoch": 1.6156581872067177, + "grad_norm": 0.4400718397147201, + "learning_rate": 4.936013241055465e-06, + "loss": 0.5762, + "step": 4088 + }, + { + "epoch": 1.61605334650531, + "grad_norm": 0.4308631720264388, + "learning_rate": 4.935978016593779e-06, + "loss": 0.566, + "step": 4089 + }, + { + "epoch": 1.6164485058039022, + "grad_norm": 0.4299942853443749, + "learning_rate": 4.935942782565051e-06, + "loss": 0.5337, + "step": 4090 + }, + { + "epoch": 1.6168436651024944, + "grad_norm": 0.4366671539606536, + "learning_rate": 4.935907538969418e-06, + "loss": 0.555, + "step": 4091 + }, + { + "epoch": 1.6172388244010867, + "grad_norm": 0.48254923126585936, + "learning_rate": 4.93587228580702e-06, + "loss": 0.5595, + "step": 4092 + }, + { + "epoch": 1.617633983699679, + "grad_norm": 0.4239126920096438, + "learning_rate": 4.935837023077994e-06, + "loss": 0.5517, + "step": 4093 + }, + { + "epoch": 1.6180291429982712, + "grad_norm": 0.436879102852562, + "learning_rate": 4.93580175078248e-06, + "loss": 0.545, + "step": 4094 + }, + { + "epoch": 1.6184243022968634, + "grad_norm": 0.44498901921393713, + "learning_rate": 4.935766468920615e-06, + "loss": 0.5639, + "step": 4095 + }, + { + "epoch": 1.6188194615954556, + "grad_norm": 0.4208783410765207, + "learning_rate": 4.935731177492539e-06, + "loss": 0.5513, + "step": 4096 + }, + { + "epoch": 1.6192146208940479, + "grad_norm": 0.4403934391341918, + "learning_rate": 4.93569587649839e-06, + "loss": 0.5597, + "step": 4097 + }, + { + "epoch": 1.6196097801926401, + "grad_norm": 0.4445680674509993, + "learning_rate": 4.935660565938306e-06, + "loss": 0.5747, + "step": 4098 + }, + { + "epoch": 1.6200049394912324, + "grad_norm": 0.4542236992519073, + "learning_rate": 4.935625245812426e-06, + "loss": 0.5557, + "step": 4099 + }, + { + "epoch": 1.6204000987898246, + "grad_norm": 0.4448277099086342, + "learning_rate": 4.935589916120891e-06, + "loss": 0.5652, + "step": 4100 + }, + { + "epoch": 1.6207952580884168, + "grad_norm": 0.4383946322249754, + "learning_rate": 4.935554576863837e-06, + "loss": 0.5433, + "step": 4101 + }, + { + "epoch": 1.621190417387009, + "grad_norm": 0.43502119286992535, + "learning_rate": 4.9355192280414024e-06, + "loss": 0.5693, + "step": 4102 + }, + { + "epoch": 1.6215855766856015, + "grad_norm": 0.4316383774413981, + "learning_rate": 4.935483869653728e-06, + "loss": 0.5677, + "step": 4103 + }, + { + "epoch": 1.6219807359841938, + "grad_norm": 0.4341260683812164, + "learning_rate": 4.935448501700953e-06, + "loss": 0.5391, + "step": 4104 + }, + { + "epoch": 1.622375895282786, + "grad_norm": 0.43195315752672536, + "learning_rate": 4.935413124183212e-06, + "loss": 0.5639, + "step": 4105 + }, + { + "epoch": 1.6227710545813783, + "grad_norm": 0.43507822462287415, + "learning_rate": 4.93537773710065e-06, + "loss": 0.5576, + "step": 4106 + }, + { + "epoch": 1.6231662138799705, + "grad_norm": 0.46638962293882175, + "learning_rate": 4.935342340453402e-06, + "loss": 0.5631, + "step": 4107 + }, + { + "epoch": 1.6235613731785628, + "grad_norm": 0.4558364456192893, + "learning_rate": 4.9353069342416085e-06, + "loss": 0.5566, + "step": 4108 + }, + { + "epoch": 1.623956532477155, + "grad_norm": 0.43312180311952486, + "learning_rate": 4.935271518465408e-06, + "loss": 0.56, + "step": 4109 + }, + { + "epoch": 1.6243516917757472, + "grad_norm": 0.4365748990534713, + "learning_rate": 4.93523609312494e-06, + "loss": 0.5629, + "step": 4110 + }, + { + "epoch": 1.6247468510743395, + "grad_norm": 0.4587870833960197, + "learning_rate": 4.935200658220342e-06, + "loss": 0.5741, + "step": 4111 + }, + { + "epoch": 1.6251420103729317, + "grad_norm": 0.43339139678044175, + "learning_rate": 4.935165213751757e-06, + "loss": 0.5569, + "step": 4112 + }, + { + "epoch": 1.625537169671524, + "grad_norm": 0.4428057531936081, + "learning_rate": 4.93512975971932e-06, + "loss": 0.5683, + "step": 4113 + }, + { + "epoch": 1.6259323289701162, + "grad_norm": 0.4578729689342045, + "learning_rate": 4.935094296123172e-06, + "loss": 0.5501, + "step": 4114 + }, + { + "epoch": 1.6263274882687084, + "grad_norm": 0.45476951551439365, + "learning_rate": 4.935058822963454e-06, + "loss": 0.5709, + "step": 4115 + }, + { + "epoch": 1.6267226475673007, + "grad_norm": 0.45641536816268186, + "learning_rate": 4.935023340240301e-06, + "loss": 0.5564, + "step": 4116 + }, + { + "epoch": 1.627117806865893, + "grad_norm": 0.4488224757255472, + "learning_rate": 4.934987847953856e-06, + "loss": 0.5619, + "step": 4117 + }, + { + "epoch": 1.6275129661644852, + "grad_norm": 0.46880383951054816, + "learning_rate": 4.934952346104258e-06, + "loss": 0.5604, + "step": 4118 + }, + { + "epoch": 1.6279081254630774, + "grad_norm": 0.545037272078235, + "learning_rate": 4.9349168346916454e-06, + "loss": 0.543, + "step": 4119 + }, + { + "epoch": 1.6283032847616696, + "grad_norm": 0.4366418096115771, + "learning_rate": 4.934881313716158e-06, + "loss": 0.5722, + "step": 4120 + }, + { + "epoch": 1.6286984440602619, + "grad_norm": 0.43777846292338235, + "learning_rate": 4.934845783177935e-06, + "loss": 0.5524, + "step": 4121 + }, + { + "epoch": 1.6290936033588541, + "grad_norm": 0.422011213186665, + "learning_rate": 4.934810243077117e-06, + "loss": 0.5733, + "step": 4122 + }, + { + "epoch": 1.6294887626574464, + "grad_norm": 0.44138201150444306, + "learning_rate": 4.934774693413843e-06, + "loss": 0.5686, + "step": 4123 + }, + { + "epoch": 1.6298839219560386, + "grad_norm": 0.44921636818156546, + "learning_rate": 4.934739134188251e-06, + "loss": 0.5724, + "step": 4124 + }, + { + "epoch": 1.6302790812546308, + "grad_norm": 0.44859391331022486, + "learning_rate": 4.934703565400484e-06, + "loss": 0.5821, + "step": 4125 + }, + { + "epoch": 1.630674240553223, + "grad_norm": 0.4233228039840541, + "learning_rate": 4.934667987050678e-06, + "loss": 0.5571, + "step": 4126 + }, + { + "epoch": 1.6310693998518153, + "grad_norm": 0.4338865601440852, + "learning_rate": 4.934632399138976e-06, + "loss": 0.5675, + "step": 4127 + }, + { + "epoch": 1.6314645591504076, + "grad_norm": 0.4394241216685595, + "learning_rate": 4.934596801665515e-06, + "loss": 0.5571, + "step": 4128 + }, + { + "epoch": 1.6318597184489998, + "grad_norm": 0.43864069069871264, + "learning_rate": 4.934561194630437e-06, + "loss": 0.5381, + "step": 4129 + }, + { + "epoch": 1.632254877747592, + "grad_norm": 0.42429399086993547, + "learning_rate": 4.934525578033881e-06, + "loss": 0.5587, + "step": 4130 + }, + { + "epoch": 1.6326500370461843, + "grad_norm": 0.4404107423141753, + "learning_rate": 4.934489951875987e-06, + "loss": 0.5721, + "step": 4131 + }, + { + "epoch": 1.6330451963447765, + "grad_norm": 0.4332961401456829, + "learning_rate": 4.934454316156894e-06, + "loss": 0.5638, + "step": 4132 + }, + { + "epoch": 1.6334403556433688, + "grad_norm": 0.4695705815950691, + "learning_rate": 4.934418670876743e-06, + "loss": 0.561, + "step": 4133 + }, + { + "epoch": 1.633835514941961, + "grad_norm": 0.45507102791172976, + "learning_rate": 4.9343830160356744e-06, + "loss": 0.5713, + "step": 4134 + }, + { + "epoch": 1.6342306742405532, + "grad_norm": 0.43768991836081356, + "learning_rate": 4.934347351633827e-06, + "loss": 0.556, + "step": 4135 + }, + { + "epoch": 1.6346258335391455, + "grad_norm": 0.43283166401271617, + "learning_rate": 4.934311677671342e-06, + "loss": 0.5596, + "step": 4136 + }, + { + "epoch": 1.6350209928377377, + "grad_norm": 0.42555330658880147, + "learning_rate": 4.934275994148357e-06, + "loss": 0.5593, + "step": 4137 + }, + { + "epoch": 1.63541615213633, + "grad_norm": 0.44485035790650657, + "learning_rate": 4.934240301065016e-06, + "loss": 0.5739, + "step": 4138 + }, + { + "epoch": 1.6358113114349222, + "grad_norm": 0.4409925294439782, + "learning_rate": 4.934204598421457e-06, + "loss": 0.5643, + "step": 4139 + }, + { + "epoch": 1.6362064707335144, + "grad_norm": 0.4310591741238124, + "learning_rate": 4.934168886217821e-06, + "loss": 0.571, + "step": 4140 + }, + { + "epoch": 1.6366016300321067, + "grad_norm": 0.43546524799796604, + "learning_rate": 4.934133164454246e-06, + "loss": 0.5682, + "step": 4141 + }, + { + "epoch": 1.636996789330699, + "grad_norm": 0.42766476649034174, + "learning_rate": 4.934097433130875e-06, + "loss": 0.5587, + "step": 4142 + }, + { + "epoch": 1.6373919486292912, + "grad_norm": 0.4381721056390369, + "learning_rate": 4.934061692247847e-06, + "loss": 0.5627, + "step": 4143 + }, + { + "epoch": 1.6377871079278834, + "grad_norm": 0.4289920275995806, + "learning_rate": 4.9340259418053035e-06, + "loss": 0.547, + "step": 4144 + }, + { + "epoch": 1.6381822672264756, + "grad_norm": 0.4264307971370297, + "learning_rate": 4.933990181803383e-06, + "loss": 0.5569, + "step": 4145 + }, + { + "epoch": 1.6385774265250679, + "grad_norm": 0.4296221573612008, + "learning_rate": 4.933954412242228e-06, + "loss": 0.5523, + "step": 4146 + }, + { + "epoch": 1.6389725858236601, + "grad_norm": 0.43507977648083157, + "learning_rate": 4.933918633121978e-06, + "loss": 0.5561, + "step": 4147 + }, + { + "epoch": 1.6393677451222524, + "grad_norm": 0.42056933159661486, + "learning_rate": 4.933882844442773e-06, + "loss": 0.5636, + "step": 4148 + }, + { + "epoch": 1.6397629044208446, + "grad_norm": 0.4433116034912783, + "learning_rate": 4.933847046204754e-06, + "loss": 0.5553, + "step": 4149 + }, + { + "epoch": 1.6401580637194368, + "grad_norm": 0.4459630268656738, + "learning_rate": 4.933811238408063e-06, + "loss": 0.5577, + "step": 4150 + }, + { + "epoch": 1.640553223018029, + "grad_norm": 0.4403056379106962, + "learning_rate": 4.933775421052838e-06, + "loss": 0.5833, + "step": 4151 + }, + { + "epoch": 1.6409483823166213, + "grad_norm": 0.42886531696805363, + "learning_rate": 4.933739594139221e-06, + "loss": 0.541, + "step": 4152 + }, + { + "epoch": 1.6413435416152136, + "grad_norm": 0.4315557640640133, + "learning_rate": 4.933703757667353e-06, + "loss": 0.5575, + "step": 4153 + }, + { + "epoch": 1.6417387009138058, + "grad_norm": 0.48140901833929156, + "learning_rate": 4.933667911637375e-06, + "loss": 0.5783, + "step": 4154 + }, + { + "epoch": 1.642133860212398, + "grad_norm": 0.42997667165062453, + "learning_rate": 4.933632056049427e-06, + "loss": 0.5482, + "step": 4155 + }, + { + "epoch": 1.6425290195109903, + "grad_norm": 0.429706934348882, + "learning_rate": 4.93359619090365e-06, + "loss": 0.5721, + "step": 4156 + }, + { + "epoch": 1.6429241788095825, + "grad_norm": 0.4445206300843515, + "learning_rate": 4.933560316200185e-06, + "loss": 0.5742, + "step": 4157 + }, + { + "epoch": 1.6433193381081748, + "grad_norm": 0.433653943288636, + "learning_rate": 4.933524431939173e-06, + "loss": 0.5682, + "step": 4158 + }, + { + "epoch": 1.643714497406767, + "grad_norm": 0.43516906274908146, + "learning_rate": 4.933488538120754e-06, + "loss": 0.5658, + "step": 4159 + }, + { + "epoch": 1.6441096567053592, + "grad_norm": 0.43315071912764175, + "learning_rate": 4.933452634745071e-06, + "loss": 0.5605, + "step": 4160 + }, + { + "epoch": 1.6445048160039515, + "grad_norm": 0.4623128049190355, + "learning_rate": 4.933416721812262e-06, + "loss": 0.5451, + "step": 4161 + }, + { + "epoch": 1.6448999753025437, + "grad_norm": 0.4322419596499138, + "learning_rate": 4.933380799322471e-06, + "loss": 0.5405, + "step": 4162 + }, + { + "epoch": 1.645295134601136, + "grad_norm": 0.4353017445216181, + "learning_rate": 4.933344867275837e-06, + "loss": 0.5465, + "step": 4163 + }, + { + "epoch": 1.6456902938997282, + "grad_norm": 0.42774307582614995, + "learning_rate": 4.9333089256725034e-06, + "loss": 0.5272, + "step": 4164 + }, + { + "epoch": 1.6460854531983204, + "grad_norm": 0.43158324134092596, + "learning_rate": 4.9332729745126085e-06, + "loss": 0.5593, + "step": 4165 + }, + { + "epoch": 1.6464806124969127, + "grad_norm": 0.43107023737645744, + "learning_rate": 4.933237013796295e-06, + "loss": 0.5674, + "step": 4166 + }, + { + "epoch": 1.646875771795505, + "grad_norm": 0.470689528328689, + "learning_rate": 4.9332010435237045e-06, + "loss": 0.5735, + "step": 4167 + }, + { + "epoch": 1.6472709310940972, + "grad_norm": 0.4337591992128468, + "learning_rate": 4.933165063694978e-06, + "loss": 0.5651, + "step": 4168 + }, + { + "epoch": 1.6476660903926894, + "grad_norm": 0.433598238285656, + "learning_rate": 4.933129074310257e-06, + "loss": 0.5609, + "step": 4169 + }, + { + "epoch": 1.6480612496912816, + "grad_norm": 0.4383209332082589, + "learning_rate": 4.933093075369681e-06, + "loss": 0.5675, + "step": 4170 + }, + { + "epoch": 1.648456408989874, + "grad_norm": 0.47207585803002206, + "learning_rate": 4.933057066873394e-06, + "loss": 0.5662, + "step": 4171 + }, + { + "epoch": 1.6488515682884661, + "grad_norm": 0.472525910555258, + "learning_rate": 4.933021048821536e-06, + "loss": 0.5402, + "step": 4172 + }, + { + "epoch": 1.6492467275870584, + "grad_norm": 0.4760221302508032, + "learning_rate": 4.932985021214248e-06, + "loss": 0.5783, + "step": 4173 + }, + { + "epoch": 1.6496418868856508, + "grad_norm": 0.433597879420229, + "learning_rate": 4.932948984051673e-06, + "loss": 0.5731, + "step": 4174 + }, + { + "epoch": 1.650037046184243, + "grad_norm": 0.4402059167564898, + "learning_rate": 4.9329129373339525e-06, + "loss": 0.547, + "step": 4175 + }, + { + "epoch": 1.6504322054828353, + "grad_norm": 0.4546433582700222, + "learning_rate": 4.932876881061226e-06, + "loss": 0.5469, + "step": 4176 + }, + { + "epoch": 1.6508273647814276, + "grad_norm": 0.4316382194174931, + "learning_rate": 4.932840815233637e-06, + "loss": 0.5392, + "step": 4177 + }, + { + "epoch": 1.6512225240800198, + "grad_norm": 0.4334651261656393, + "learning_rate": 4.9328047398513265e-06, + "loss": 0.5556, + "step": 4178 + }, + { + "epoch": 1.651617683378612, + "grad_norm": 0.5167325693733595, + "learning_rate": 4.932768654914437e-06, + "loss": 0.5574, + "step": 4179 + }, + { + "epoch": 1.6520128426772043, + "grad_norm": 0.4537887100737346, + "learning_rate": 4.932732560423108e-06, + "loss": 0.5519, + "step": 4180 + }, + { + "epoch": 1.6524080019757965, + "grad_norm": 0.4407088531206237, + "learning_rate": 4.932696456377484e-06, + "loss": 0.5602, + "step": 4181 + }, + { + "epoch": 1.6528031612743888, + "grad_norm": 0.4239271803777524, + "learning_rate": 4.932660342777705e-06, + "loss": 0.5344, + "step": 4182 + }, + { + "epoch": 1.653198320572981, + "grad_norm": 0.4221587419527885, + "learning_rate": 4.932624219623913e-06, + "loss": 0.5419, + "step": 4183 + }, + { + "epoch": 1.6535934798715732, + "grad_norm": 0.45091328428360944, + "learning_rate": 4.932588086916251e-06, + "loss": 0.5569, + "step": 4184 + }, + { + "epoch": 1.6539886391701655, + "grad_norm": 0.4535113787923153, + "learning_rate": 4.93255194465486e-06, + "loss": 0.5311, + "step": 4185 + }, + { + "epoch": 1.6543837984687577, + "grad_norm": 0.4495246513741634, + "learning_rate": 4.932515792839882e-06, + "loss": 0.5459, + "step": 4186 + }, + { + "epoch": 1.65477895776735, + "grad_norm": 0.45103027154827796, + "learning_rate": 4.932479631471459e-06, + "loss": 0.5617, + "step": 4187 + }, + { + "epoch": 1.6551741170659422, + "grad_norm": 0.46161108604777257, + "learning_rate": 4.932443460549733e-06, + "loss": 0.5744, + "step": 4188 + }, + { + "epoch": 1.6555692763645344, + "grad_norm": 0.4585641137181729, + "learning_rate": 4.932407280074846e-06, + "loss": 0.5571, + "step": 4189 + }, + { + "epoch": 1.6559644356631267, + "grad_norm": 0.47002181393717396, + "learning_rate": 4.93237109004694e-06, + "loss": 0.5591, + "step": 4190 + }, + { + "epoch": 1.656359594961719, + "grad_norm": 0.45813305720090625, + "learning_rate": 4.932334890466158e-06, + "loss": 0.5509, + "step": 4191 + }, + { + "epoch": 1.6567547542603112, + "grad_norm": 0.4442101744900535, + "learning_rate": 4.932298681332641e-06, + "loss": 0.5834, + "step": 4192 + }, + { + "epoch": 1.6571499135589034, + "grad_norm": 0.43904506024703893, + "learning_rate": 4.932262462646532e-06, + "loss": 0.5628, + "step": 4193 + }, + { + "epoch": 1.6575450728574956, + "grad_norm": 0.45081395145304054, + "learning_rate": 4.932226234407973e-06, + "loss": 0.5597, + "step": 4194 + }, + { + "epoch": 1.6579402321560879, + "grad_norm": 0.45308321586467587, + "learning_rate": 4.932189996617106e-06, + "loss": 0.5424, + "step": 4195 + }, + { + "epoch": 1.6583353914546801, + "grad_norm": 0.43748276416781745, + "learning_rate": 4.932153749274074e-06, + "loss": 0.5478, + "step": 4196 + }, + { + "epoch": 1.6587305507532724, + "grad_norm": 0.4390093028892108, + "learning_rate": 4.932117492379019e-06, + "loss": 0.5576, + "step": 4197 + }, + { + "epoch": 1.6591257100518648, + "grad_norm": 0.43009631591048475, + "learning_rate": 4.932081225932084e-06, + "loss": 0.5461, + "step": 4198 + }, + { + "epoch": 1.659520869350457, + "grad_norm": 0.4421965633593312, + "learning_rate": 4.93204494993341e-06, + "loss": 0.5644, + "step": 4199 + }, + { + "epoch": 1.6599160286490493, + "grad_norm": 0.44435453925130375, + "learning_rate": 4.93200866438314e-06, + "loss": 0.5522, + "step": 4200 + }, + { + "epoch": 1.6603111879476415, + "grad_norm": 0.47458082371469706, + "learning_rate": 4.931972369281417e-06, + "loss": 0.5471, + "step": 4201 + }, + { + "epoch": 1.6607063472462338, + "grad_norm": 0.4301353638936425, + "learning_rate": 4.931936064628383e-06, + "loss": 0.5703, + "step": 4202 + }, + { + "epoch": 1.661101506544826, + "grad_norm": 0.4372380561131763, + "learning_rate": 4.931899750424182e-06, + "loss": 0.5683, + "step": 4203 + }, + { + "epoch": 1.6614966658434183, + "grad_norm": 0.46217752541691776, + "learning_rate": 4.931863426668955e-06, + "loss": 0.5559, + "step": 4204 + }, + { + "epoch": 1.6618918251420105, + "grad_norm": 0.4414697866376089, + "learning_rate": 4.931827093362844e-06, + "loss": 0.5533, + "step": 4205 + }, + { + "epoch": 1.6622869844406027, + "grad_norm": 0.44026690012982383, + "learning_rate": 4.931790750505994e-06, + "loss": 0.5473, + "step": 4206 + }, + { + "epoch": 1.662682143739195, + "grad_norm": 0.44908034456035184, + "learning_rate": 4.931754398098546e-06, + "loss": 0.5676, + "step": 4207 + }, + { + "epoch": 1.6630773030377872, + "grad_norm": 0.43506228540161684, + "learning_rate": 4.931718036140645e-06, + "loss": 0.56, + "step": 4208 + }, + { + "epoch": 1.6634724623363795, + "grad_norm": 0.4414844514234546, + "learning_rate": 4.9316816646324305e-06, + "loss": 0.5395, + "step": 4209 + }, + { + "epoch": 1.6638676216349717, + "grad_norm": 0.4353535377517495, + "learning_rate": 4.931645283574047e-06, + "loss": 0.5706, + "step": 4210 + }, + { + "epoch": 1.664262780933564, + "grad_norm": 0.4352610887294283, + "learning_rate": 4.931608892965638e-06, + "loss": 0.5529, + "step": 4211 + }, + { + "epoch": 1.6646579402321562, + "grad_norm": 0.48475567073134784, + "learning_rate": 4.931572492807346e-06, + "loss": 0.5691, + "step": 4212 + }, + { + "epoch": 1.6650530995307484, + "grad_norm": 0.43914859193682443, + "learning_rate": 4.931536083099313e-06, + "loss": 0.5447, + "step": 4213 + }, + { + "epoch": 1.6654482588293407, + "grad_norm": 0.42574696114957944, + "learning_rate": 4.931499663841683e-06, + "loss": 0.5506, + "step": 4214 + }, + { + "epoch": 1.665843418127933, + "grad_norm": 0.44003634839285843, + "learning_rate": 4.9314632350345995e-06, + "loss": 0.5714, + "step": 4215 + }, + { + "epoch": 1.6662385774265251, + "grad_norm": 0.44345794553567824, + "learning_rate": 4.931426796678204e-06, + "loss": 0.5525, + "step": 4216 + }, + { + "epoch": 1.6666337367251174, + "grad_norm": 0.44184543039510044, + "learning_rate": 4.9313903487726415e-06, + "loss": 0.5533, + "step": 4217 + }, + { + "epoch": 1.6670288960237096, + "grad_norm": 0.4482583566446144, + "learning_rate": 4.931353891318053e-06, + "loss": 0.5567, + "step": 4218 + }, + { + "epoch": 1.6674240553223019, + "grad_norm": 0.44359624620051485, + "learning_rate": 4.931317424314583e-06, + "loss": 0.5417, + "step": 4219 + }, + { + "epoch": 1.667819214620894, + "grad_norm": 0.45039139024204555, + "learning_rate": 4.9312809477623755e-06, + "loss": 0.5756, + "step": 4220 + }, + { + "epoch": 1.6682143739194863, + "grad_norm": 0.46242291351750414, + "learning_rate": 4.931244461661571e-06, + "loss": 0.5548, + "step": 4221 + }, + { + "epoch": 1.6686095332180786, + "grad_norm": 0.44761433940772233, + "learning_rate": 4.9312079660123165e-06, + "loss": 0.5545, + "step": 4222 + }, + { + "epoch": 1.6690046925166708, + "grad_norm": 0.42745307952513, + "learning_rate": 4.931171460814752e-06, + "loss": 0.5612, + "step": 4223 + }, + { + "epoch": 1.669399851815263, + "grad_norm": 0.4409759305327771, + "learning_rate": 4.9311349460690235e-06, + "loss": 0.5295, + "step": 4224 + }, + { + "epoch": 1.6697950111138553, + "grad_norm": 0.44041060145128974, + "learning_rate": 4.9310984217752725e-06, + "loss": 0.5807, + "step": 4225 + }, + { + "epoch": 1.6701901704124476, + "grad_norm": 0.45082681364452676, + "learning_rate": 4.931061887933643e-06, + "loss": 0.5525, + "step": 4226 + }, + { + "epoch": 1.6705853297110398, + "grad_norm": 0.4325179559967354, + "learning_rate": 4.931025344544279e-06, + "loss": 0.5614, + "step": 4227 + }, + { + "epoch": 1.670980489009632, + "grad_norm": 0.4491688766617088, + "learning_rate": 4.930988791607324e-06, + "loss": 0.5547, + "step": 4228 + }, + { + "epoch": 1.6713756483082243, + "grad_norm": 0.43251674883633673, + "learning_rate": 4.930952229122921e-06, + "loss": 0.5525, + "step": 4229 + }, + { + "epoch": 1.6717708076068165, + "grad_norm": 0.450173373250462, + "learning_rate": 4.930915657091213e-06, + "loss": 0.5539, + "step": 4230 + }, + { + "epoch": 1.6721659669054088, + "grad_norm": 0.452547822179787, + "learning_rate": 4.930879075512345e-06, + "loss": 0.5783, + "step": 4231 + }, + { + "epoch": 1.672561126204001, + "grad_norm": 0.456787048326293, + "learning_rate": 4.93084248438646e-06, + "loss": 0.5851, + "step": 4232 + }, + { + "epoch": 1.6729562855025932, + "grad_norm": 0.4398970129307844, + "learning_rate": 4.930805883713702e-06, + "loss": 0.5588, + "step": 4233 + }, + { + "epoch": 1.6733514448011855, + "grad_norm": 0.4344964137010482, + "learning_rate": 4.930769273494215e-06, + "loss": 0.5422, + "step": 4234 + }, + { + "epoch": 1.6737466040997777, + "grad_norm": 0.4480357286208341, + "learning_rate": 4.930732653728141e-06, + "loss": 0.5719, + "step": 4235 + }, + { + "epoch": 1.67414176339837, + "grad_norm": 0.4468533257133995, + "learning_rate": 4.930696024415626e-06, + "loss": 0.5706, + "step": 4236 + }, + { + "epoch": 1.6745369226969622, + "grad_norm": 0.43589038296389493, + "learning_rate": 4.930659385556813e-06, + "loss": 0.5718, + "step": 4237 + }, + { + "epoch": 1.6749320819955544, + "grad_norm": 0.46213277997599467, + "learning_rate": 4.9306227371518455e-06, + "loss": 0.543, + "step": 4238 + }, + { + "epoch": 1.6753272412941467, + "grad_norm": 0.4418860283426867, + "learning_rate": 4.930586079200869e-06, + "loss": 0.555, + "step": 4239 + }, + { + "epoch": 1.675722400592739, + "grad_norm": 0.41996717679739026, + "learning_rate": 4.930549411704025e-06, + "loss": 0.5357, + "step": 4240 + }, + { + "epoch": 1.6761175598913312, + "grad_norm": 0.4575919872171913, + "learning_rate": 4.930512734661459e-06, + "loss": 0.5545, + "step": 4241 + }, + { + "epoch": 1.6765127191899234, + "grad_norm": 0.44970217578452903, + "learning_rate": 4.930476048073316e-06, + "loss": 0.5634, + "step": 4242 + }, + { + "epoch": 1.6769078784885156, + "grad_norm": 0.4436734001641958, + "learning_rate": 4.930439351939738e-06, + "loss": 0.5848, + "step": 4243 + }, + { + "epoch": 1.6773030377871079, + "grad_norm": 0.43230787717115343, + "learning_rate": 4.93040264626087e-06, + "loss": 0.5758, + "step": 4244 + }, + { + "epoch": 1.6776981970857001, + "grad_norm": 0.45704860376232215, + "learning_rate": 4.9303659310368565e-06, + "loss": 0.5644, + "step": 4245 + }, + { + "epoch": 1.6780933563842924, + "grad_norm": 0.458499629507116, + "learning_rate": 4.930329206267841e-06, + "loss": 0.5714, + "step": 4246 + }, + { + "epoch": 1.6784885156828846, + "grad_norm": 0.4436057764210825, + "learning_rate": 4.930292471953969e-06, + "loss": 0.5512, + "step": 4247 + }, + { + "epoch": 1.6788836749814768, + "grad_norm": 0.4322151502298006, + "learning_rate": 4.930255728095383e-06, + "loss": 0.5475, + "step": 4248 + }, + { + "epoch": 1.679278834280069, + "grad_norm": 0.4549937231291737, + "learning_rate": 4.930218974692229e-06, + "loss": 0.5568, + "step": 4249 + }, + { + "epoch": 1.6796739935786613, + "grad_norm": 0.44063968344614446, + "learning_rate": 4.930182211744649e-06, + "loss": 0.562, + "step": 4250 + }, + { + "epoch": 1.6800691528772536, + "grad_norm": 0.4635898234904489, + "learning_rate": 4.930145439252791e-06, + "loss": 0.566, + "step": 4251 + }, + { + "epoch": 1.6804643121758458, + "grad_norm": 0.4347065121603174, + "learning_rate": 4.930108657216796e-06, + "loss": 0.5579, + "step": 4252 + }, + { + "epoch": 1.680859471474438, + "grad_norm": 0.45191051205485905, + "learning_rate": 4.930071865636811e-06, + "loss": 0.5616, + "step": 4253 + }, + { + "epoch": 1.6812546307730303, + "grad_norm": 0.5003942515624423, + "learning_rate": 4.930035064512979e-06, + "loss": 0.5544, + "step": 4254 + }, + { + "epoch": 1.6816497900716225, + "grad_norm": 0.43280130558001995, + "learning_rate": 4.929998253845444e-06, + "loss": 0.5839, + "step": 4255 + }, + { + "epoch": 1.6820449493702148, + "grad_norm": 0.4347329458274132, + "learning_rate": 4.929961433634352e-06, + "loss": 0.5512, + "step": 4256 + }, + { + "epoch": 1.682440108668807, + "grad_norm": 0.4595377167928187, + "learning_rate": 4.9299246038798474e-06, + "loss": 0.5797, + "step": 4257 + }, + { + "epoch": 1.6828352679673992, + "grad_norm": 0.4542171341695038, + "learning_rate": 4.9298877645820735e-06, + "loss": 0.5421, + "step": 4258 + }, + { + "epoch": 1.6832304272659915, + "grad_norm": 0.43845775532831804, + "learning_rate": 4.929850915741177e-06, + "loss": 0.565, + "step": 4259 + }, + { + "epoch": 1.6836255865645837, + "grad_norm": 0.44402420830903166, + "learning_rate": 4.929814057357301e-06, + "loss": 0.5605, + "step": 4260 + }, + { + "epoch": 1.684020745863176, + "grad_norm": 0.44396077168291875, + "learning_rate": 4.929777189430591e-06, + "loss": 0.5694, + "step": 4261 + }, + { + "epoch": 1.6844159051617682, + "grad_norm": 0.4413223103845851, + "learning_rate": 4.929740311961192e-06, + "loss": 0.5681, + "step": 4262 + }, + { + "epoch": 1.6848110644603604, + "grad_norm": 0.4432712724622185, + "learning_rate": 4.929703424949248e-06, + "loss": 0.5791, + "step": 4263 + }, + { + "epoch": 1.6852062237589527, + "grad_norm": 0.4291806701241977, + "learning_rate": 4.929666528394904e-06, + "loss": 0.5608, + "step": 4264 + }, + { + "epoch": 1.685601383057545, + "grad_norm": 0.44175554248806165, + "learning_rate": 4.929629622298307e-06, + "loss": 0.5367, + "step": 4265 + }, + { + "epoch": 1.6859965423561372, + "grad_norm": 0.43417237826237876, + "learning_rate": 4.929592706659599e-06, + "loss": 0.5621, + "step": 4266 + }, + { + "epoch": 1.6863917016547294, + "grad_norm": 0.436619782617058, + "learning_rate": 4.929555781478925e-06, + "loss": 0.556, + "step": 4267 + }, + { + "epoch": 1.6867868609533216, + "grad_norm": 0.4412555558446154, + "learning_rate": 4.9295188467564326e-06, + "loss": 0.5549, + "step": 4268 + }, + { + "epoch": 1.687182020251914, + "grad_norm": 0.4429817647687938, + "learning_rate": 4.929481902492265e-06, + "loss": 0.5451, + "step": 4269 + }, + { + "epoch": 1.6875771795505063, + "grad_norm": 0.42710204000708135, + "learning_rate": 4.929444948686568e-06, + "loss": 0.5574, + "step": 4270 + }, + { + "epoch": 1.6879723388490986, + "grad_norm": 0.4345670015513589, + "learning_rate": 4.929407985339486e-06, + "loss": 0.573, + "step": 4271 + }, + { + "epoch": 1.6883674981476908, + "grad_norm": 0.42754387278545575, + "learning_rate": 4.929371012451165e-06, + "loss": 0.5504, + "step": 4272 + }, + { + "epoch": 1.688762657446283, + "grad_norm": 0.44346425482357044, + "learning_rate": 4.9293340300217505e-06, + "loss": 0.566, + "step": 4273 + }, + { + "epoch": 1.6891578167448753, + "grad_norm": 0.43395132844534345, + "learning_rate": 4.929297038051386e-06, + "loss": 0.5751, + "step": 4274 + }, + { + "epoch": 1.6895529760434675, + "grad_norm": 0.4289911868889898, + "learning_rate": 4.929260036540218e-06, + "loss": 0.5545, + "step": 4275 + }, + { + "epoch": 1.6899481353420598, + "grad_norm": 0.4299481945107919, + "learning_rate": 4.929223025488393e-06, + "loss": 0.5655, + "step": 4276 + }, + { + "epoch": 1.690343294640652, + "grad_norm": 0.4315808805682797, + "learning_rate": 4.929186004896054e-06, + "loss": 0.5586, + "step": 4277 + }, + { + "epoch": 1.6907384539392443, + "grad_norm": 0.43737402235911826, + "learning_rate": 4.929148974763347e-06, + "loss": 0.5677, + "step": 4278 + }, + { + "epoch": 1.6911336132378365, + "grad_norm": 0.44163862598713893, + "learning_rate": 4.929111935090418e-06, + "loss": 0.5484, + "step": 4279 + }, + { + "epoch": 1.6915287725364287, + "grad_norm": 0.44050203475372246, + "learning_rate": 4.929074885877414e-06, + "loss": 0.5639, + "step": 4280 + }, + { + "epoch": 1.691923931835021, + "grad_norm": 0.44036371463743945, + "learning_rate": 4.929037827124477e-06, + "loss": 0.5532, + "step": 4281 + }, + { + "epoch": 1.6923190911336132, + "grad_norm": 0.42541252276330227, + "learning_rate": 4.929000758831755e-06, + "loss": 0.5541, + "step": 4282 + }, + { + "epoch": 1.6927142504322055, + "grad_norm": 0.45077617747089965, + "learning_rate": 4.928963680999393e-06, + "loss": 0.5728, + "step": 4283 + }, + { + "epoch": 1.6931094097307977, + "grad_norm": 0.43862795202138594, + "learning_rate": 4.928926593627537e-06, + "loss": 0.5609, + "step": 4284 + }, + { + "epoch": 1.69350456902939, + "grad_norm": 0.4355391950216865, + "learning_rate": 4.928889496716331e-06, + "loss": 0.5577, + "step": 4285 + }, + { + "epoch": 1.6938997283279822, + "grad_norm": 0.449424153003478, + "learning_rate": 4.928852390265923e-06, + "loss": 0.5589, + "step": 4286 + }, + { + "epoch": 1.6942948876265744, + "grad_norm": 0.42444603371479195, + "learning_rate": 4.928815274276458e-06, + "loss": 0.5542, + "step": 4287 + }, + { + "epoch": 1.6946900469251667, + "grad_norm": 0.4374300606831901, + "learning_rate": 4.928778148748081e-06, + "loss": 0.5768, + "step": 4288 + }, + { + "epoch": 1.695085206223759, + "grad_norm": 0.4267602747759363, + "learning_rate": 4.928741013680939e-06, + "loss": 0.5606, + "step": 4289 + }, + { + "epoch": 1.6954803655223512, + "grad_norm": 0.4366846729132836, + "learning_rate": 4.928703869075176e-06, + "loss": 0.5498, + "step": 4290 + }, + { + "epoch": 1.6958755248209434, + "grad_norm": 0.4435134184494798, + "learning_rate": 4.92866671493094e-06, + "loss": 0.5485, + "step": 4291 + }, + { + "epoch": 1.6962706841195359, + "grad_norm": 0.42497901408157157, + "learning_rate": 4.928629551248375e-06, + "loss": 0.5383, + "step": 4292 + }, + { + "epoch": 1.696665843418128, + "grad_norm": 0.4443410896472178, + "learning_rate": 4.928592378027628e-06, + "loss": 0.543, + "step": 4293 + }, + { + "epoch": 1.6970610027167203, + "grad_norm": 0.4546919285525924, + "learning_rate": 4.928555195268845e-06, + "loss": 0.5598, + "step": 4294 + }, + { + "epoch": 1.6974561620153126, + "grad_norm": 0.4327018959048797, + "learning_rate": 4.928518002972172e-06, + "loss": 0.5607, + "step": 4295 + }, + { + "epoch": 1.6978513213139048, + "grad_norm": 0.43177140667053937, + "learning_rate": 4.928480801137755e-06, + "loss": 0.5468, + "step": 4296 + }, + { + "epoch": 1.698246480612497, + "grad_norm": 0.43141416580935815, + "learning_rate": 4.92844358976574e-06, + "loss": 0.5663, + "step": 4297 + }, + { + "epoch": 1.6986416399110893, + "grad_norm": 0.4787754721973709, + "learning_rate": 4.928406368856273e-06, + "loss": 0.5636, + "step": 4298 + }, + { + "epoch": 1.6990367992096815, + "grad_norm": 0.442334169803762, + "learning_rate": 4.9283691384095e-06, + "loss": 0.5823, + "step": 4299 + }, + { + "epoch": 1.6994319585082738, + "grad_norm": 0.432453303132636, + "learning_rate": 4.928331898425568e-06, + "loss": 0.5589, + "step": 4300 + }, + { + "epoch": 1.699827117806866, + "grad_norm": 0.4490638732729113, + "learning_rate": 4.9282946489046235e-06, + "loss": 0.5754, + "step": 4301 + }, + { + "epoch": 1.7002222771054583, + "grad_norm": 0.4853856757096015, + "learning_rate": 4.928257389846812e-06, + "loss": 0.5503, + "step": 4302 + }, + { + "epoch": 1.7006174364040505, + "grad_norm": 0.42812505775715426, + "learning_rate": 4.92822012125228e-06, + "loss": 0.5477, + "step": 4303 + }, + { + "epoch": 1.7010125957026427, + "grad_norm": 0.43886286312335837, + "learning_rate": 4.928182843121173e-06, + "loss": 0.5716, + "step": 4304 + }, + { + "epoch": 1.701407755001235, + "grad_norm": 0.4523032752577509, + "learning_rate": 4.928145555453638e-06, + "loss": 0.5509, + "step": 4305 + }, + { + "epoch": 1.7018029142998272, + "grad_norm": 0.4517070813549554, + "learning_rate": 4.928108258249823e-06, + "loss": 0.566, + "step": 4306 + }, + { + "epoch": 1.7021980735984195, + "grad_norm": 0.5646470946712949, + "learning_rate": 4.928070951509873e-06, + "loss": 0.575, + "step": 4307 + }, + { + "epoch": 1.7025932328970117, + "grad_norm": 0.42398116604836683, + "learning_rate": 4.928033635233934e-06, + "loss": 0.5449, + "step": 4308 + }, + { + "epoch": 1.702988392195604, + "grad_norm": 0.45125784631407473, + "learning_rate": 4.927996309422154e-06, + "loss": 0.555, + "step": 4309 + }, + { + "epoch": 1.7033835514941962, + "grad_norm": 0.4516857652827014, + "learning_rate": 4.927958974074678e-06, + "loss": 0.5358, + "step": 4310 + }, + { + "epoch": 1.7037787107927884, + "grad_norm": 0.4486541477736022, + "learning_rate": 4.927921629191654e-06, + "loss": 0.5706, + "step": 4311 + }, + { + "epoch": 1.7041738700913807, + "grad_norm": 0.5504549124395234, + "learning_rate": 4.927884274773229e-06, + "loss": 0.5499, + "step": 4312 + }, + { + "epoch": 1.704569029389973, + "grad_norm": 0.4377036298330416, + "learning_rate": 4.927846910819547e-06, + "loss": 0.5624, + "step": 4313 + }, + { + "epoch": 1.7049641886885651, + "grad_norm": 0.44142723756552443, + "learning_rate": 4.9278095373307586e-06, + "loss": 0.5641, + "step": 4314 + }, + { + "epoch": 1.7053593479871574, + "grad_norm": 0.4305246703883127, + "learning_rate": 4.927772154307007e-06, + "loss": 0.5584, + "step": 4315 + }, + { + "epoch": 1.7057545072857496, + "grad_norm": 0.42333610423419793, + "learning_rate": 4.927734761748441e-06, + "loss": 0.5662, + "step": 4316 + }, + { + "epoch": 1.7061496665843419, + "grad_norm": 0.43713955439883345, + "learning_rate": 4.927697359655208e-06, + "loss": 0.5574, + "step": 4317 + }, + { + "epoch": 1.706544825882934, + "grad_norm": 0.4350887115565443, + "learning_rate": 4.927659948027453e-06, + "loss": 0.5484, + "step": 4318 + }, + { + "epoch": 1.7069399851815263, + "grad_norm": 0.4305661898067747, + "learning_rate": 4.927622526865324e-06, + "loss": 0.5721, + "step": 4319 + }, + { + "epoch": 1.7073351444801186, + "grad_norm": 0.4237425713774883, + "learning_rate": 4.927585096168967e-06, + "loss": 0.5385, + "step": 4320 + }, + { + "epoch": 1.7077303037787108, + "grad_norm": 0.42768772713800246, + "learning_rate": 4.9275476559385316e-06, + "loss": 0.5615, + "step": 4321 + }, + { + "epoch": 1.708125463077303, + "grad_norm": 0.45101823860302, + "learning_rate": 4.927510206174162e-06, + "loss": 0.5777, + "step": 4322 + }, + { + "epoch": 1.7085206223758953, + "grad_norm": 0.45186205819480785, + "learning_rate": 4.927472746876007e-06, + "loss": 0.55, + "step": 4323 + }, + { + "epoch": 1.7089157816744875, + "grad_norm": 0.4632522948748882, + "learning_rate": 4.9274352780442125e-06, + "loss": 0.5719, + "step": 4324 + }, + { + "epoch": 1.7093109409730798, + "grad_norm": 0.43100101608453223, + "learning_rate": 4.927397799678927e-06, + "loss": 0.5549, + "step": 4325 + }, + { + "epoch": 1.709706100271672, + "grad_norm": 0.434763481025286, + "learning_rate": 4.927360311780296e-06, + "loss": 0.566, + "step": 4326 + }, + { + "epoch": 1.7101012595702643, + "grad_norm": 0.43309030464048226, + "learning_rate": 4.927322814348468e-06, + "loss": 0.5345, + "step": 4327 + }, + { + "epoch": 1.7104964188688565, + "grad_norm": 0.48013463816215374, + "learning_rate": 4.92728530738359e-06, + "loss": 0.5912, + "step": 4328 + }, + { + "epoch": 1.7108915781674487, + "grad_norm": 0.47034903875978573, + "learning_rate": 4.927247790885809e-06, + "loss": 0.5778, + "step": 4329 + }, + { + "epoch": 1.711286737466041, + "grad_norm": 0.4278937802281345, + "learning_rate": 4.927210264855274e-06, + "loss": 0.5597, + "step": 4330 + }, + { + "epoch": 1.7116818967646332, + "grad_norm": 0.43626768378812447, + "learning_rate": 4.927172729292129e-06, + "loss": 0.5651, + "step": 4331 + }, + { + "epoch": 1.7120770560632255, + "grad_norm": 0.4465422521558346, + "learning_rate": 4.927135184196524e-06, + "loss": 0.5756, + "step": 4332 + }, + { + "epoch": 1.7124722153618177, + "grad_norm": 0.44974916569628687, + "learning_rate": 4.927097629568606e-06, + "loss": 0.568, + "step": 4333 + }, + { + "epoch": 1.71286737466041, + "grad_norm": 0.4472334023755386, + "learning_rate": 4.927060065408522e-06, + "loss": 0.5797, + "step": 4334 + }, + { + "epoch": 1.7132625339590022, + "grad_norm": 0.4481801045998597, + "learning_rate": 4.9270224917164204e-06, + "loss": 0.5675, + "step": 4335 + }, + { + "epoch": 1.7136576932575944, + "grad_norm": 0.4549341973795611, + "learning_rate": 4.926984908492448e-06, + "loss": 0.5803, + "step": 4336 + }, + { + "epoch": 1.7140528525561867, + "grad_norm": 0.4359842869982814, + "learning_rate": 4.9269473157367535e-06, + "loss": 0.5499, + "step": 4337 + }, + { + "epoch": 1.714448011854779, + "grad_norm": 0.5200059350206986, + "learning_rate": 4.926909713449482e-06, + "loss": 0.5577, + "step": 4338 + }, + { + "epoch": 1.7148431711533711, + "grad_norm": 0.4689154411180137, + "learning_rate": 4.926872101630784e-06, + "loss": 0.5545, + "step": 4339 + }, + { + "epoch": 1.7152383304519634, + "grad_norm": 0.45440101006285427, + "learning_rate": 4.9268344802808055e-06, + "loss": 0.5633, + "step": 4340 + }, + { + "epoch": 1.7156334897505556, + "grad_norm": 0.45895219511156227, + "learning_rate": 4.926796849399694e-06, + "loss": 0.5515, + "step": 4341 + }, + { + "epoch": 1.7160286490491479, + "grad_norm": 0.43433747556834423, + "learning_rate": 4.9267592089876e-06, + "loss": 0.55, + "step": 4342 + }, + { + "epoch": 1.71642380834774, + "grad_norm": 0.4338247101488299, + "learning_rate": 4.926721559044668e-06, + "loss": 0.5563, + "step": 4343 + }, + { + "epoch": 1.7168189676463324, + "grad_norm": 0.45944141139935163, + "learning_rate": 4.926683899571048e-06, + "loss": 0.5357, + "step": 4344 + }, + { + "epoch": 1.7172141269449246, + "grad_norm": 0.4264619359745203, + "learning_rate": 4.9266462305668876e-06, + "loss": 0.5385, + "step": 4345 + }, + { + "epoch": 1.7176092862435168, + "grad_norm": 0.43636381830305015, + "learning_rate": 4.926608552032334e-06, + "loss": 0.5781, + "step": 4346 + }, + { + "epoch": 1.718004445542109, + "grad_norm": 0.4488377673647985, + "learning_rate": 4.926570863967535e-06, + "loss": 0.559, + "step": 4347 + }, + { + "epoch": 1.7183996048407013, + "grad_norm": 0.44153153641448994, + "learning_rate": 4.926533166372639e-06, + "loss": 0.5704, + "step": 4348 + }, + { + "epoch": 1.7187947641392936, + "grad_norm": 0.43552915303217893, + "learning_rate": 4.926495459247795e-06, + "loss": 0.5564, + "step": 4349 + }, + { + "epoch": 1.7191899234378858, + "grad_norm": 0.4306421081394027, + "learning_rate": 4.9264577425931505e-06, + "loss": 0.5668, + "step": 4350 + }, + { + "epoch": 1.719585082736478, + "grad_norm": 0.43170183294012887, + "learning_rate": 4.926420016408852e-06, + "loss": 0.56, + "step": 4351 + }, + { + "epoch": 1.7199802420350703, + "grad_norm": 0.44310661756844827, + "learning_rate": 4.92638228069505e-06, + "loss": 0.5444, + "step": 4352 + }, + { + "epoch": 1.7203754013336625, + "grad_norm": 0.5473701951430664, + "learning_rate": 4.926344535451892e-06, + "loss": 0.5518, + "step": 4353 + }, + { + "epoch": 1.7207705606322548, + "grad_norm": 0.45773670023142704, + "learning_rate": 4.926306780679526e-06, + "loss": 0.5879, + "step": 4354 + }, + { + "epoch": 1.721165719930847, + "grad_norm": 0.4470158656152022, + "learning_rate": 4.926269016378099e-06, + "loss": 0.5574, + "step": 4355 + }, + { + "epoch": 1.7215608792294392, + "grad_norm": 0.4326444239390642, + "learning_rate": 4.926231242547762e-06, + "loss": 0.5575, + "step": 4356 + }, + { + "epoch": 1.7219560385280315, + "grad_norm": 0.43096022881772944, + "learning_rate": 4.926193459188662e-06, + "loss": 0.5237, + "step": 4357 + }, + { + "epoch": 1.7223511978266237, + "grad_norm": 0.4335631980043337, + "learning_rate": 4.926155666300947e-06, + "loss": 0.5508, + "step": 4358 + }, + { + "epoch": 1.722746357125216, + "grad_norm": 0.43344519231600975, + "learning_rate": 4.926117863884765e-06, + "loss": 0.5578, + "step": 4359 + }, + { + "epoch": 1.7231415164238082, + "grad_norm": 0.439357665476649, + "learning_rate": 4.926080051940267e-06, + "loss": 0.5677, + "step": 4360 + }, + { + "epoch": 1.7235366757224004, + "grad_norm": 0.45079399416037436, + "learning_rate": 4.926042230467598e-06, + "loss": 0.5653, + "step": 4361 + }, + { + "epoch": 1.7239318350209927, + "grad_norm": 0.41470626875209715, + "learning_rate": 4.9260043994669094e-06, + "loss": 0.5375, + "step": 4362 + }, + { + "epoch": 1.7243269943195851, + "grad_norm": 0.4313994825808373, + "learning_rate": 4.925966558938348e-06, + "loss": 0.5692, + "step": 4363 + }, + { + "epoch": 1.7247221536181774, + "grad_norm": 0.4321560662738567, + "learning_rate": 4.925928708882064e-06, + "loss": 0.5467, + "step": 4364 + }, + { + "epoch": 1.7251173129167696, + "grad_norm": 0.43615051233389873, + "learning_rate": 4.925890849298204e-06, + "loss": 0.561, + "step": 4365 + }, + { + "epoch": 1.7255124722153619, + "grad_norm": 0.4386066095911232, + "learning_rate": 4.925852980186918e-06, + "loss": 0.5702, + "step": 4366 + }, + { + "epoch": 1.725907631513954, + "grad_norm": 0.4294008469837109, + "learning_rate": 4.9258151015483555e-06, + "loss": 0.5611, + "step": 4367 + }, + { + "epoch": 1.7263027908125463, + "grad_norm": 0.43462199012351793, + "learning_rate": 4.925777213382663e-06, + "loss": 0.571, + "step": 4368 + }, + { + "epoch": 1.7266979501111386, + "grad_norm": 0.45940417820864915, + "learning_rate": 4.925739315689991e-06, + "loss": 0.576, + "step": 4369 + }, + { + "epoch": 1.7270931094097308, + "grad_norm": 0.43409099796596096, + "learning_rate": 4.925701408470489e-06, + "loss": 0.5583, + "step": 4370 + }, + { + "epoch": 1.727488268708323, + "grad_norm": 0.4364436014175337, + "learning_rate": 4.925663491724304e-06, + "loss": 0.5465, + "step": 4371 + }, + { + "epoch": 1.7278834280069153, + "grad_norm": 0.4242367032389472, + "learning_rate": 4.9256255654515865e-06, + "loss": 0.5566, + "step": 4372 + }, + { + "epoch": 1.7282785873055075, + "grad_norm": 0.4348950743141649, + "learning_rate": 4.925587629652483e-06, + "loss": 0.5524, + "step": 4373 + }, + { + "epoch": 1.7286737466040998, + "grad_norm": 0.440868038863271, + "learning_rate": 4.925549684327145e-06, + "loss": 0.5432, + "step": 4374 + }, + { + "epoch": 1.729068905902692, + "grad_norm": 0.4594372909416891, + "learning_rate": 4.925511729475722e-06, + "loss": 0.573, + "step": 4375 + }, + { + "epoch": 1.7294640652012843, + "grad_norm": 0.442038328983773, + "learning_rate": 4.92547376509836e-06, + "loss": 0.5436, + "step": 4376 + }, + { + "epoch": 1.7298592244998765, + "grad_norm": 0.4488082560018164, + "learning_rate": 4.925435791195211e-06, + "loss": 0.5382, + "step": 4377 + }, + { + "epoch": 1.7302543837984687, + "grad_norm": 0.4451552451369125, + "learning_rate": 4.925397807766422e-06, + "loss": 0.5708, + "step": 4378 + }, + { + "epoch": 1.730649543097061, + "grad_norm": 0.44829170627702375, + "learning_rate": 4.925359814812144e-06, + "loss": 0.5566, + "step": 4379 + }, + { + "epoch": 1.7310447023956532, + "grad_norm": 0.45319114553972123, + "learning_rate": 4.925321812332526e-06, + "loss": 0.5812, + "step": 4380 + }, + { + "epoch": 1.7314398616942455, + "grad_norm": 0.4730466509935319, + "learning_rate": 4.925283800327715e-06, + "loss": 0.5595, + "step": 4381 + }, + { + "epoch": 1.7318350209928377, + "grad_norm": 0.44863564694406577, + "learning_rate": 4.925245778797863e-06, + "loss": 0.5537, + "step": 4382 + }, + { + "epoch": 1.73223018029143, + "grad_norm": 0.449712427848464, + "learning_rate": 4.925207747743118e-06, + "loss": 0.5548, + "step": 4383 + }, + { + "epoch": 1.7326253395900222, + "grad_norm": 0.4353187498862786, + "learning_rate": 4.925169707163629e-06, + "loss": 0.5699, + "step": 4384 + }, + { + "epoch": 1.7330204988886144, + "grad_norm": 0.4552613973646662, + "learning_rate": 4.925131657059548e-06, + "loss": 0.5589, + "step": 4385 + }, + { + "epoch": 1.7334156581872067, + "grad_norm": 0.43804313299916375, + "learning_rate": 4.925093597431021e-06, + "loss": 0.5665, + "step": 4386 + }, + { + "epoch": 1.7338108174857991, + "grad_norm": 0.44336856585398643, + "learning_rate": 4.925055528278199e-06, + "loss": 0.5691, + "step": 4387 + }, + { + "epoch": 1.7342059767843914, + "grad_norm": 0.4453678495451275, + "learning_rate": 4.925017449601231e-06, + "loss": 0.5799, + "step": 4388 + }, + { + "epoch": 1.7346011360829836, + "grad_norm": 0.43883849979746115, + "learning_rate": 4.924979361400268e-06, + "loss": 0.5714, + "step": 4389 + }, + { + "epoch": 1.7349962953815758, + "grad_norm": 0.4301004660209356, + "learning_rate": 4.924941263675458e-06, + "loss": 0.5542, + "step": 4390 + }, + { + "epoch": 1.735391454680168, + "grad_norm": 0.4371671225872987, + "learning_rate": 4.924903156426952e-06, + "loss": 0.5634, + "step": 4391 + }, + { + "epoch": 1.7357866139787603, + "grad_norm": 0.4467984256797525, + "learning_rate": 4.924865039654898e-06, + "loss": 0.5741, + "step": 4392 + }, + { + "epoch": 1.7361817732773526, + "grad_norm": 0.4373830450072716, + "learning_rate": 4.9248269133594464e-06, + "loss": 0.5546, + "step": 4393 + }, + { + "epoch": 1.7365769325759448, + "grad_norm": 0.44157114771440137, + "learning_rate": 4.924788777540748e-06, + "loss": 0.5758, + "step": 4394 + }, + { + "epoch": 1.736972091874537, + "grad_norm": 0.42686715929933167, + "learning_rate": 4.9247506321989514e-06, + "loss": 0.5612, + "step": 4395 + }, + { + "epoch": 1.7373672511731293, + "grad_norm": 0.429547791413384, + "learning_rate": 4.924712477334206e-06, + "loss": 0.5474, + "step": 4396 + }, + { + "epoch": 1.7377624104717215, + "grad_norm": 0.4456343541323995, + "learning_rate": 4.924674312946663e-06, + "loss": 0.5648, + "step": 4397 + }, + { + "epoch": 1.7381575697703138, + "grad_norm": 0.4557318656179386, + "learning_rate": 4.924636139036472e-06, + "loss": 0.5811, + "step": 4398 + }, + { + "epoch": 1.738552729068906, + "grad_norm": 0.42877428492133096, + "learning_rate": 4.924597955603782e-06, + "loss": 0.5719, + "step": 4399 + }, + { + "epoch": 1.7389478883674983, + "grad_norm": 0.44201823103262, + "learning_rate": 4.924559762648744e-06, + "loss": 0.5527, + "step": 4400 + }, + { + "epoch": 1.7393430476660905, + "grad_norm": 0.4406878460734057, + "learning_rate": 4.924521560171507e-06, + "loss": 0.5506, + "step": 4401 + }, + { + "epoch": 1.7397382069646827, + "grad_norm": 0.4238997986809783, + "learning_rate": 4.924483348172222e-06, + "loss": 0.555, + "step": 4402 + }, + { + "epoch": 1.740133366263275, + "grad_norm": 0.444786923090035, + "learning_rate": 4.924445126651038e-06, + "loss": 0.5585, + "step": 4403 + }, + { + "epoch": 1.7405285255618672, + "grad_norm": 0.42484925220727704, + "learning_rate": 4.924406895608106e-06, + "loss": 0.5482, + "step": 4404 + }, + { + "epoch": 1.7409236848604595, + "grad_norm": 0.44349133709535676, + "learning_rate": 4.924368655043577e-06, + "loss": 0.5573, + "step": 4405 + }, + { + "epoch": 1.7413188441590517, + "grad_norm": 0.4227263535085141, + "learning_rate": 4.924330404957599e-06, + "loss": 0.5465, + "step": 4406 + }, + { + "epoch": 1.741714003457644, + "grad_norm": 0.44828080019324945, + "learning_rate": 4.924292145350323e-06, + "loss": 0.5723, + "step": 4407 + }, + { + "epoch": 1.7421091627562362, + "grad_norm": 0.426266339908026, + "learning_rate": 4.924253876221899e-06, + "loss": 0.5772, + "step": 4408 + }, + { + "epoch": 1.7425043220548284, + "grad_norm": 0.43419093036176604, + "learning_rate": 4.92421559757248e-06, + "loss": 0.5596, + "step": 4409 + }, + { + "epoch": 1.7428994813534207, + "grad_norm": 0.43554329163688116, + "learning_rate": 4.924177309402213e-06, + "loss": 0.5697, + "step": 4410 + }, + { + "epoch": 1.743294640652013, + "grad_norm": 0.4473385143087953, + "learning_rate": 4.9241390117112495e-06, + "loss": 0.5642, + "step": 4411 + }, + { + "epoch": 1.7436897999506051, + "grad_norm": 0.4387919180383067, + "learning_rate": 4.92410070449974e-06, + "loss": 0.5712, + "step": 4412 + }, + { + "epoch": 1.7440849592491974, + "grad_norm": 0.4440101663736749, + "learning_rate": 4.924062387767835e-06, + "loss": 0.5767, + "step": 4413 + }, + { + "epoch": 1.7444801185477896, + "grad_norm": 0.43157217680041976, + "learning_rate": 4.924024061515684e-06, + "loss": 0.5589, + "step": 4414 + }, + { + "epoch": 1.7448752778463819, + "grad_norm": 0.4207925328836993, + "learning_rate": 4.9239857257434395e-06, + "loss": 0.5494, + "step": 4415 + }, + { + "epoch": 1.745270437144974, + "grad_norm": 0.4343427301041821, + "learning_rate": 4.923947380451252e-06, + "loss": 0.5418, + "step": 4416 + }, + { + "epoch": 1.7456655964435663, + "grad_norm": 0.43915247356661835, + "learning_rate": 4.92390902563927e-06, + "loss": 0.5457, + "step": 4417 + }, + { + "epoch": 1.7460607557421586, + "grad_norm": 0.412423518922869, + "learning_rate": 4.923870661307645e-06, + "loss": 0.543, + "step": 4418 + }, + { + "epoch": 1.7464559150407508, + "grad_norm": 0.4659475964234698, + "learning_rate": 4.923832287456527e-06, + "loss": 0.5727, + "step": 4419 + }, + { + "epoch": 1.746851074339343, + "grad_norm": 0.4386733240756286, + "learning_rate": 4.92379390408607e-06, + "loss": 0.546, + "step": 4420 + }, + { + "epoch": 1.7472462336379353, + "grad_norm": 0.4566302235906808, + "learning_rate": 4.9237555111964204e-06, + "loss": 0.577, + "step": 4421 + }, + { + "epoch": 1.7476413929365275, + "grad_norm": 0.4544004401768208, + "learning_rate": 4.923717108787731e-06, + "loss": 0.5863, + "step": 4422 + }, + { + "epoch": 1.7480365522351198, + "grad_norm": 0.4364008166512911, + "learning_rate": 4.923678696860153e-06, + "loss": 0.5588, + "step": 4423 + }, + { + "epoch": 1.748431711533712, + "grad_norm": 0.44646539196245927, + "learning_rate": 4.923640275413838e-06, + "loss": 0.5797, + "step": 4424 + }, + { + "epoch": 1.7488268708323043, + "grad_norm": 0.42595898093134776, + "learning_rate": 4.923601844448934e-06, + "loss": 0.5326, + "step": 4425 + }, + { + "epoch": 1.7492220301308965, + "grad_norm": 0.43632414312178214, + "learning_rate": 4.923563403965595e-06, + "loss": 0.5477, + "step": 4426 + }, + { + "epoch": 1.7496171894294887, + "grad_norm": 0.43161605146182297, + "learning_rate": 4.923524953963969e-06, + "loss": 0.5652, + "step": 4427 + }, + { + "epoch": 1.750012348728081, + "grad_norm": 0.42112039961812653, + "learning_rate": 4.923486494444209e-06, + "loss": 0.5403, + "step": 4428 + }, + { + "epoch": 1.7504075080266732, + "grad_norm": 0.4387403266260913, + "learning_rate": 4.923448025406467e-06, + "loss": 0.5646, + "step": 4429 + }, + { + "epoch": 1.7508026673252655, + "grad_norm": 0.44195630371865374, + "learning_rate": 4.923409546850891e-06, + "loss": 0.5514, + "step": 4430 + }, + { + "epoch": 1.7511978266238577, + "grad_norm": 0.44395804443010556, + "learning_rate": 4.923371058777635e-06, + "loss": 0.5645, + "step": 4431 + }, + { + "epoch": 1.75159298592245, + "grad_norm": 0.4326703585981714, + "learning_rate": 4.923332561186849e-06, + "loss": 0.5616, + "step": 4432 + }, + { + "epoch": 1.7519881452210422, + "grad_norm": 0.41917281291116726, + "learning_rate": 4.923294054078684e-06, + "loss": 0.5365, + "step": 4433 + }, + { + "epoch": 1.7523833045196344, + "grad_norm": 0.43629197498616906, + "learning_rate": 4.923255537453292e-06, + "loss": 0.5715, + "step": 4434 + }, + { + "epoch": 1.7527784638182267, + "grad_norm": 0.43248199576929525, + "learning_rate": 4.923217011310823e-06, + "loss": 0.5339, + "step": 4435 + }, + { + "epoch": 1.753173623116819, + "grad_norm": 0.4796555742650671, + "learning_rate": 4.923178475651429e-06, + "loss": 0.5628, + "step": 4436 + }, + { + "epoch": 1.7535687824154111, + "grad_norm": 0.44016438303089994, + "learning_rate": 4.923139930475262e-06, + "loss": 0.5796, + "step": 4437 + }, + { + "epoch": 1.7539639417140034, + "grad_norm": 0.44061698759839063, + "learning_rate": 4.923101375782472e-06, + "loss": 0.5676, + "step": 4438 + }, + { + "epoch": 1.7543591010125956, + "grad_norm": 0.43587278759955905, + "learning_rate": 4.923062811573211e-06, + "loss": 0.5731, + "step": 4439 + }, + { + "epoch": 1.7547542603111879, + "grad_norm": 0.4324193780709019, + "learning_rate": 4.9230242378476325e-06, + "loss": 0.5662, + "step": 4440 + }, + { + "epoch": 1.75514941960978, + "grad_norm": 0.4330363556293551, + "learning_rate": 4.922985654605884e-06, + "loss": 0.5526, + "step": 4441 + }, + { + "epoch": 1.7555445789083723, + "grad_norm": 0.4314258458445518, + "learning_rate": 4.922947061848121e-06, + "loss": 0.5468, + "step": 4442 + }, + { + "epoch": 1.7559397382069646, + "grad_norm": 0.4367345050157589, + "learning_rate": 4.922908459574492e-06, + "loss": 0.5573, + "step": 4443 + }, + { + "epoch": 1.7563348975055568, + "grad_norm": 0.5054619145164584, + "learning_rate": 4.92286984778515e-06, + "loss": 0.5611, + "step": 4444 + }, + { + "epoch": 1.756730056804149, + "grad_norm": 0.4244999054734717, + "learning_rate": 4.922831226480247e-06, + "loss": 0.5533, + "step": 4445 + }, + { + "epoch": 1.7571252161027413, + "grad_norm": 0.4172564528587188, + "learning_rate": 4.9227925956599336e-06, + "loss": 0.5394, + "step": 4446 + }, + { + "epoch": 1.7575203754013335, + "grad_norm": 0.4487356151355405, + "learning_rate": 4.922753955324362e-06, + "loss": 0.5593, + "step": 4447 + }, + { + "epoch": 1.7579155346999258, + "grad_norm": 0.4471793604107619, + "learning_rate": 4.922715305473684e-06, + "loss": 0.545, + "step": 4448 + }, + { + "epoch": 1.758310693998518, + "grad_norm": 0.42116771593933916, + "learning_rate": 4.922676646108052e-06, + "loss": 0.5565, + "step": 4449 + }, + { + "epoch": 1.7587058532971103, + "grad_norm": 0.42986579453156515, + "learning_rate": 4.9226379772276165e-06, + "loss": 0.5542, + "step": 4450 + }, + { + "epoch": 1.7591010125957025, + "grad_norm": 0.4247741467500481, + "learning_rate": 4.922599298832531e-06, + "loss": 0.5277, + "step": 4451 + }, + { + "epoch": 1.7594961718942947, + "grad_norm": 0.43883975962836197, + "learning_rate": 4.922560610922946e-06, + "loss": 0.5554, + "step": 4452 + }, + { + "epoch": 1.759891331192887, + "grad_norm": 0.4269542206063633, + "learning_rate": 4.922521913499014e-06, + "loss": 0.55, + "step": 4453 + }, + { + "epoch": 1.7602864904914792, + "grad_norm": 0.44358761831837534, + "learning_rate": 4.922483206560888e-06, + "loss": 0.5681, + "step": 4454 + }, + { + "epoch": 1.7606816497900715, + "grad_norm": 0.41524870259004626, + "learning_rate": 4.9224444901087174e-06, + "loss": 0.5287, + "step": 4455 + }, + { + "epoch": 1.7610768090886637, + "grad_norm": 0.47449363146584195, + "learning_rate": 4.922405764142656e-06, + "loss": 0.5534, + "step": 4456 + }, + { + "epoch": 1.761471968387256, + "grad_norm": 0.4274899599226474, + "learning_rate": 4.9223670286628566e-06, + "loss": 0.5513, + "step": 4457 + }, + { + "epoch": 1.7618671276858484, + "grad_norm": 0.4229395462347126, + "learning_rate": 4.92232828366947e-06, + "loss": 0.5643, + "step": 4458 + }, + { + "epoch": 1.7622622869844407, + "grad_norm": 0.43795229928660484, + "learning_rate": 4.922289529162649e-06, + "loss": 0.5577, + "step": 4459 + }, + { + "epoch": 1.762657446283033, + "grad_norm": 0.4509336716051313, + "learning_rate": 4.922250765142546e-06, + "loss": 0.5369, + "step": 4460 + }, + { + "epoch": 1.7630526055816251, + "grad_norm": 0.4963694155527903, + "learning_rate": 4.9222119916093115e-06, + "loss": 0.5729, + "step": 4461 + }, + { + "epoch": 1.7634477648802174, + "grad_norm": 0.4311986192626347, + "learning_rate": 4.9221732085631e-06, + "loss": 0.5636, + "step": 4462 + }, + { + "epoch": 1.7638429241788096, + "grad_norm": 0.4319021698000714, + "learning_rate": 4.9221344160040626e-06, + "loss": 0.554, + "step": 4463 + }, + { + "epoch": 1.7642380834774019, + "grad_norm": 0.42976334810476374, + "learning_rate": 4.922095613932353e-06, + "loss": 0.5721, + "step": 4464 + }, + { + "epoch": 1.764633242775994, + "grad_norm": 0.4348605034262508, + "learning_rate": 4.922056802348122e-06, + "loss": 0.5481, + "step": 4465 + }, + { + "epoch": 1.7650284020745863, + "grad_norm": 0.4402318664595867, + "learning_rate": 4.9220179812515226e-06, + "loss": 0.5659, + "step": 4466 + }, + { + "epoch": 1.7654235613731786, + "grad_norm": 0.43124398530041586, + "learning_rate": 4.921979150642707e-06, + "loss": 0.5737, + "step": 4467 + }, + { + "epoch": 1.7658187206717708, + "grad_norm": 0.43936196155478946, + "learning_rate": 4.921940310521828e-06, + "loss": 0.5595, + "step": 4468 + }, + { + "epoch": 1.766213879970363, + "grad_norm": 0.43223733140815723, + "learning_rate": 4.921901460889039e-06, + "loss": 0.5488, + "step": 4469 + }, + { + "epoch": 1.7666090392689553, + "grad_norm": 0.43010489859753376, + "learning_rate": 4.921862601744491e-06, + "loss": 0.5597, + "step": 4470 + }, + { + "epoch": 1.7670041985675475, + "grad_norm": 0.4258097789495025, + "learning_rate": 4.9218237330883375e-06, + "loss": 0.5548, + "step": 4471 + }, + { + "epoch": 1.7673993578661398, + "grad_norm": 0.4441676656075857, + "learning_rate": 4.921784854920731e-06, + "loss": 0.5439, + "step": 4472 + }, + { + "epoch": 1.767794517164732, + "grad_norm": 0.429127198914778, + "learning_rate": 4.921745967241825e-06, + "loss": 0.5624, + "step": 4473 + }, + { + "epoch": 1.7681896764633243, + "grad_norm": 0.42973439941503405, + "learning_rate": 4.921707070051769e-06, + "loss": 0.5405, + "step": 4474 + }, + { + "epoch": 1.7685848357619165, + "grad_norm": 0.4368917818588069, + "learning_rate": 4.92166816335072e-06, + "loss": 0.571, + "step": 4475 + }, + { + "epoch": 1.7689799950605087, + "grad_norm": 0.4329313735901494, + "learning_rate": 4.921629247138829e-06, + "loss": 0.5425, + "step": 4476 + }, + { + "epoch": 1.769375154359101, + "grad_norm": 0.4522903482231501, + "learning_rate": 4.9215903214162485e-06, + "loss": 0.5566, + "step": 4477 + }, + { + "epoch": 1.7697703136576932, + "grad_norm": 0.4460656956099865, + "learning_rate": 4.921551386183131e-06, + "loss": 0.5558, + "step": 4478 + }, + { + "epoch": 1.7701654729562855, + "grad_norm": 0.41968856516312, + "learning_rate": 4.921512441439631e-06, + "loss": 0.558, + "step": 4479 + }, + { + "epoch": 1.7705606322548777, + "grad_norm": 0.4512166443513065, + "learning_rate": 4.9214734871859e-06, + "loss": 0.543, + "step": 4480 + }, + { + "epoch": 1.77095579155347, + "grad_norm": 0.44040585068512283, + "learning_rate": 4.921434523422093e-06, + "loss": 0.5606, + "step": 4481 + }, + { + "epoch": 1.7713509508520624, + "grad_norm": 0.41848653488225906, + "learning_rate": 4.9213955501483605e-06, + "loss": 0.5361, + "step": 4482 + }, + { + "epoch": 1.7717461101506546, + "grad_norm": 0.43807590314563183, + "learning_rate": 4.921356567364856e-06, + "loss": 0.5314, + "step": 4483 + }, + { + "epoch": 1.7721412694492469, + "grad_norm": 0.4700972659356998, + "learning_rate": 4.921317575071733e-06, + "loss": 0.5636, + "step": 4484 + }, + { + "epoch": 1.7725364287478391, + "grad_norm": 0.45042036525685325, + "learning_rate": 4.921278573269146e-06, + "loss": 0.5544, + "step": 4485 + }, + { + "epoch": 1.7729315880464314, + "grad_norm": 0.44614873313304904, + "learning_rate": 4.9212395619572474e-06, + "loss": 0.5693, + "step": 4486 + }, + { + "epoch": 1.7733267473450236, + "grad_norm": 0.44860751788633424, + "learning_rate": 4.92120054113619e-06, + "loss": 0.558, + "step": 4487 + }, + { + "epoch": 1.7737219066436158, + "grad_norm": 0.43151386401272274, + "learning_rate": 4.921161510806125e-06, + "loss": 0.5542, + "step": 4488 + }, + { + "epoch": 1.774117065942208, + "grad_norm": 0.4356644343958426, + "learning_rate": 4.92112247096721e-06, + "loss": 0.5368, + "step": 4489 + }, + { + "epoch": 1.7745122252408003, + "grad_norm": 0.4548142559509372, + "learning_rate": 4.921083421619595e-06, + "loss": 0.5736, + "step": 4490 + }, + { + "epoch": 1.7749073845393926, + "grad_norm": 0.4459993625684455, + "learning_rate": 4.921044362763436e-06, + "loss": 0.557, + "step": 4491 + }, + { + "epoch": 1.7753025438379848, + "grad_norm": 0.44551019969621913, + "learning_rate": 4.921005294398883e-06, + "loss": 0.5612, + "step": 4492 + }, + { + "epoch": 1.775697703136577, + "grad_norm": 0.4534954772992248, + "learning_rate": 4.9209662165260916e-06, + "loss": 0.5764, + "step": 4493 + }, + { + "epoch": 1.7760928624351693, + "grad_norm": 0.45806011580752654, + "learning_rate": 4.9209271291452156e-06, + "loss": 0.5565, + "step": 4494 + }, + { + "epoch": 1.7764880217337615, + "grad_norm": 0.42599165598927907, + "learning_rate": 4.920888032256408e-06, + "loss": 0.5434, + "step": 4495 + }, + { + "epoch": 1.7768831810323538, + "grad_norm": 0.427438042814662, + "learning_rate": 4.920848925859822e-06, + "loss": 0.5496, + "step": 4496 + }, + { + "epoch": 1.777278340330946, + "grad_norm": 0.4584387507023162, + "learning_rate": 4.9208098099556114e-06, + "loss": 0.5712, + "step": 4497 + }, + { + "epoch": 1.7776734996295382, + "grad_norm": 0.4355189310861454, + "learning_rate": 4.920770684543929e-06, + "loss": 0.5534, + "step": 4498 + }, + { + "epoch": 1.7780686589281305, + "grad_norm": 0.43597371943223134, + "learning_rate": 4.920731549624931e-06, + "loss": 0.565, + "step": 4499 + }, + { + "epoch": 1.7784638182267227, + "grad_norm": 0.4268466727675861, + "learning_rate": 4.920692405198769e-06, + "loss": 0.5426, + "step": 4500 + }, + { + "epoch": 1.778858977525315, + "grad_norm": 0.4224690255424684, + "learning_rate": 4.920653251265597e-06, + "loss": 0.555, + "step": 4501 + }, + { + "epoch": 1.7792541368239072, + "grad_norm": 0.42762282174827215, + "learning_rate": 4.920614087825568e-06, + "loss": 0.5525, + "step": 4502 + }, + { + "epoch": 1.7796492961224994, + "grad_norm": 0.4290253808006352, + "learning_rate": 4.9205749148788376e-06, + "loss": 0.5723, + "step": 4503 + }, + { + "epoch": 1.7800444554210917, + "grad_norm": 0.4316979082254998, + "learning_rate": 4.920535732425559e-06, + "loss": 0.555, + "step": 4504 + }, + { + "epoch": 1.780439614719684, + "grad_norm": 0.438272777807001, + "learning_rate": 4.920496540465885e-06, + "loss": 0.5421, + "step": 4505 + }, + { + "epoch": 1.7808347740182762, + "grad_norm": 0.455364594727949, + "learning_rate": 4.920457338999971e-06, + "loss": 0.5472, + "step": 4506 + }, + { + "epoch": 1.7812299333168684, + "grad_norm": 0.44459923067197527, + "learning_rate": 4.920418128027971e-06, + "loss": 0.5707, + "step": 4507 + }, + { + "epoch": 1.7816250926154606, + "grad_norm": 0.4417890387860095, + "learning_rate": 4.920378907550037e-06, + "loss": 0.5691, + "step": 4508 + }, + { + "epoch": 1.7820202519140529, + "grad_norm": 0.44085319317841215, + "learning_rate": 4.9203396775663245e-06, + "loss": 0.5439, + "step": 4509 + }, + { + "epoch": 1.7824154112126451, + "grad_norm": 0.45781235310847124, + "learning_rate": 4.920300438076989e-06, + "loss": 0.5535, + "step": 4510 + }, + { + "epoch": 1.7828105705112374, + "grad_norm": 0.42751841581481953, + "learning_rate": 4.9202611890821815e-06, + "loss": 0.5404, + "step": 4511 + }, + { + "epoch": 1.7832057298098296, + "grad_norm": 0.4341840586645018, + "learning_rate": 4.920221930582059e-06, + "loss": 0.5357, + "step": 4512 + }, + { + "epoch": 1.7836008891084218, + "grad_norm": 0.44062665450398303, + "learning_rate": 4.920182662576773e-06, + "loss": 0.5533, + "step": 4513 + }, + { + "epoch": 1.783996048407014, + "grad_norm": 0.4569945381667453, + "learning_rate": 4.920143385066479e-06, + "loss": 0.5605, + "step": 4514 + }, + { + "epoch": 1.7843912077056063, + "grad_norm": 0.5311247214308134, + "learning_rate": 4.920104098051333e-06, + "loss": 0.5786, + "step": 4515 + }, + { + "epoch": 1.7847863670041986, + "grad_norm": 0.46139739197081714, + "learning_rate": 4.920064801531486e-06, + "loss": 0.5773, + "step": 4516 + }, + { + "epoch": 1.7851815263027908, + "grad_norm": 0.43510101422224146, + "learning_rate": 4.920025495507095e-06, + "loss": 0.5375, + "step": 4517 + }, + { + "epoch": 1.785576685601383, + "grad_norm": 0.4299696722046226, + "learning_rate": 4.919986179978313e-06, + "loss": 0.5596, + "step": 4518 + }, + { + "epoch": 1.7859718448999753, + "grad_norm": 0.4245007247792506, + "learning_rate": 4.9199468549452956e-06, + "loss": 0.5575, + "step": 4519 + }, + { + "epoch": 1.7863670041985675, + "grad_norm": 0.427078611580864, + "learning_rate": 4.919907520408196e-06, + "loss": 0.5594, + "step": 4520 + }, + { + "epoch": 1.7867621634971598, + "grad_norm": 0.43840491567979, + "learning_rate": 4.919868176367168e-06, + "loss": 0.5404, + "step": 4521 + }, + { + "epoch": 1.787157322795752, + "grad_norm": 0.4569134416853893, + "learning_rate": 4.919828822822369e-06, + "loss": 0.5508, + "step": 4522 + }, + { + "epoch": 1.7875524820943443, + "grad_norm": 0.43107835382000415, + "learning_rate": 4.91978945977395e-06, + "loss": 0.5594, + "step": 4523 + }, + { + "epoch": 1.7879476413929365, + "grad_norm": 0.4361231870002236, + "learning_rate": 4.919750087222068e-06, + "loss": 0.5726, + "step": 4524 + }, + { + "epoch": 1.7883428006915287, + "grad_norm": 0.4353192973361471, + "learning_rate": 4.919710705166878e-06, + "loss": 0.5641, + "step": 4525 + }, + { + "epoch": 1.788737959990121, + "grad_norm": 0.4364332444547788, + "learning_rate": 4.919671313608533e-06, + "loss": 0.541, + "step": 4526 + }, + { + "epoch": 1.7891331192887132, + "grad_norm": 0.43409123496491997, + "learning_rate": 4.919631912547188e-06, + "loss": 0.5687, + "step": 4527 + }, + { + "epoch": 1.7895282785873055, + "grad_norm": 0.43446982982003957, + "learning_rate": 4.919592501982998e-06, + "loss": 0.5556, + "step": 4528 + }, + { + "epoch": 1.7899234378858977, + "grad_norm": 0.4345438030521792, + "learning_rate": 4.9195530819161185e-06, + "loss": 0.5474, + "step": 4529 + }, + { + "epoch": 1.79031859718449, + "grad_norm": 0.43436414603534473, + "learning_rate": 4.919513652346704e-06, + "loss": 0.5548, + "step": 4530 + }, + { + "epoch": 1.7907137564830822, + "grad_norm": 0.4320792577761697, + "learning_rate": 4.919474213274908e-06, + "loss": 0.5293, + "step": 4531 + }, + { + "epoch": 1.7911089157816744, + "grad_norm": 0.4417297782885469, + "learning_rate": 4.919434764700888e-06, + "loss": 0.5532, + "step": 4532 + }, + { + "epoch": 1.7915040750802667, + "grad_norm": 0.4491898130529519, + "learning_rate": 4.9193953066247965e-06, + "loss": 0.5691, + "step": 4533 + }, + { + "epoch": 1.791899234378859, + "grad_norm": 0.44010561319843694, + "learning_rate": 4.919355839046789e-06, + "loss": 0.5601, + "step": 4534 + }, + { + "epoch": 1.7922943936774511, + "grad_norm": 0.4326683763258569, + "learning_rate": 4.919316361967021e-06, + "loss": 0.5601, + "step": 4535 + }, + { + "epoch": 1.7926895529760434, + "grad_norm": 0.49880321858382415, + "learning_rate": 4.919276875385648e-06, + "loss": 0.5802, + "step": 4536 + }, + { + "epoch": 1.7930847122746356, + "grad_norm": 0.4402605348668624, + "learning_rate": 4.919237379302824e-06, + "loss": 0.5442, + "step": 4537 + }, + { + "epoch": 1.7934798715732279, + "grad_norm": 0.4487929856641043, + "learning_rate": 4.919197873718705e-06, + "loss": 0.5683, + "step": 4538 + }, + { + "epoch": 1.79387503087182, + "grad_norm": 0.42534492592015277, + "learning_rate": 4.919158358633445e-06, + "loss": 0.5652, + "step": 4539 + }, + { + "epoch": 1.7942701901704123, + "grad_norm": 0.4457639959321754, + "learning_rate": 4.919118834047201e-06, + "loss": 0.5658, + "step": 4540 + }, + { + "epoch": 1.7946653494690046, + "grad_norm": 0.4541593885926761, + "learning_rate": 4.919079299960127e-06, + "loss": 0.5688, + "step": 4541 + }, + { + "epoch": 1.7950605087675968, + "grad_norm": 0.42333628277611585, + "learning_rate": 4.919039756372378e-06, + "loss": 0.5474, + "step": 4542 + }, + { + "epoch": 1.795455668066189, + "grad_norm": 0.42037667148139624, + "learning_rate": 4.91900020328411e-06, + "loss": 0.5578, + "step": 4543 + }, + { + "epoch": 1.7958508273647813, + "grad_norm": 0.41437397951305344, + "learning_rate": 4.918960640695478e-06, + "loss": 0.5507, + "step": 4544 + }, + { + "epoch": 1.7962459866633735, + "grad_norm": 0.43166169225760287, + "learning_rate": 4.918921068606638e-06, + "loss": 0.5518, + "step": 4545 + }, + { + "epoch": 1.7966411459619658, + "grad_norm": 0.4327031422658885, + "learning_rate": 4.9188814870177435e-06, + "loss": 0.5559, + "step": 4546 + }, + { + "epoch": 1.797036305260558, + "grad_norm": 0.43810822912426284, + "learning_rate": 4.918841895928953e-06, + "loss": 0.5538, + "step": 4547 + }, + { + "epoch": 1.7974314645591503, + "grad_norm": 0.43509704083525685, + "learning_rate": 4.918802295340419e-06, + "loss": 0.5534, + "step": 4548 + }, + { + "epoch": 1.7978266238577425, + "grad_norm": 0.4204249306753487, + "learning_rate": 4.918762685252299e-06, + "loss": 0.555, + "step": 4549 + }, + { + "epoch": 1.7982217831563347, + "grad_norm": 0.429252379340873, + "learning_rate": 4.918723065664747e-06, + "loss": 0.5569, + "step": 4550 + }, + { + "epoch": 1.798616942454927, + "grad_norm": 0.4222980081248242, + "learning_rate": 4.918683436577921e-06, + "loss": 0.5271, + "step": 4551 + }, + { + "epoch": 1.7990121017535192, + "grad_norm": 0.4313877563692727, + "learning_rate": 4.918643797991975e-06, + "loss": 0.5702, + "step": 4552 + }, + { + "epoch": 1.7994072610521117, + "grad_norm": 0.43938081010488067, + "learning_rate": 4.918604149907064e-06, + "loss": 0.5914, + "step": 4553 + }, + { + "epoch": 1.799802420350704, + "grad_norm": 0.42842791710741546, + "learning_rate": 4.918564492323346e-06, + "loss": 0.5464, + "step": 4554 + }, + { + "epoch": 1.8001975796492962, + "grad_norm": 0.4346701478393733, + "learning_rate": 4.918524825240973e-06, + "loss": 0.5426, + "step": 4555 + }, + { + "epoch": 1.8005927389478884, + "grad_norm": 0.4300228304840291, + "learning_rate": 4.918485148660105e-06, + "loss": 0.5718, + "step": 4556 + }, + { + "epoch": 1.8009878982464806, + "grad_norm": 0.4328620116394069, + "learning_rate": 4.918445462580895e-06, + "loss": 0.567, + "step": 4557 + }, + { + "epoch": 1.8013830575450729, + "grad_norm": 0.43079626932416043, + "learning_rate": 4.9184057670035e-06, + "loss": 0.5518, + "step": 4558 + }, + { + "epoch": 1.8017782168436651, + "grad_norm": 0.43706587631711175, + "learning_rate": 4.918366061928076e-06, + "loss": 0.5492, + "step": 4559 + }, + { + "epoch": 1.8021733761422574, + "grad_norm": 0.4403528370210141, + "learning_rate": 4.918326347354778e-06, + "loss": 0.564, + "step": 4560 + }, + { + "epoch": 1.8025685354408496, + "grad_norm": 0.4265423519618484, + "learning_rate": 4.918286623283763e-06, + "loss": 0.5441, + "step": 4561 + }, + { + "epoch": 1.8029636947394418, + "grad_norm": 0.4497893586884997, + "learning_rate": 4.918246889715186e-06, + "loss": 0.5729, + "step": 4562 + }, + { + "epoch": 1.803358854038034, + "grad_norm": 0.4394008615994604, + "learning_rate": 4.918207146649204e-06, + "loss": 0.5577, + "step": 4563 + }, + { + "epoch": 1.8037540133366263, + "grad_norm": 0.4541079886766186, + "learning_rate": 4.918167394085974e-06, + "loss": 0.5629, + "step": 4564 + }, + { + "epoch": 1.8041491726352186, + "grad_norm": 0.45506948243874085, + "learning_rate": 4.91812763202565e-06, + "loss": 0.5561, + "step": 4565 + }, + { + "epoch": 1.8045443319338108, + "grad_norm": 0.4342638593346457, + "learning_rate": 4.918087860468388e-06, + "loss": 0.576, + "step": 4566 + }, + { + "epoch": 1.804939491232403, + "grad_norm": 0.4340982482700696, + "learning_rate": 4.918048079414346e-06, + "loss": 0.5554, + "step": 4567 + }, + { + "epoch": 1.8053346505309953, + "grad_norm": 0.4409861288626772, + "learning_rate": 4.91800828886368e-06, + "loss": 0.5726, + "step": 4568 + }, + { + "epoch": 1.8057298098295875, + "grad_norm": 0.4322705316149436, + "learning_rate": 4.917968488816545e-06, + "loss": 0.5329, + "step": 4569 + }, + { + "epoch": 1.8061249691281798, + "grad_norm": 0.4192848854350575, + "learning_rate": 4.917928679273098e-06, + "loss": 0.5427, + "step": 4570 + }, + { + "epoch": 1.806520128426772, + "grad_norm": 0.44939852167500016, + "learning_rate": 4.917888860233496e-06, + "loss": 0.556, + "step": 4571 + }, + { + "epoch": 1.8069152877253642, + "grad_norm": 0.5167355570733759, + "learning_rate": 4.917849031697894e-06, + "loss": 0.6005, + "step": 4572 + }, + { + "epoch": 1.8073104470239565, + "grad_norm": 0.4403454466703087, + "learning_rate": 4.91780919366645e-06, + "loss": 0.5508, + "step": 4573 + }, + { + "epoch": 1.8077056063225487, + "grad_norm": 0.4293181191267813, + "learning_rate": 4.917769346139319e-06, + "loss": 0.5607, + "step": 4574 + }, + { + "epoch": 1.808100765621141, + "grad_norm": 0.4338122274539976, + "learning_rate": 4.9177294891166585e-06, + "loss": 0.5423, + "step": 4575 + }, + { + "epoch": 1.8084959249197334, + "grad_norm": 0.43148032764038485, + "learning_rate": 4.917689622598625e-06, + "loss": 0.5718, + "step": 4576 + }, + { + "epoch": 1.8088910842183257, + "grad_norm": 0.43602229802024295, + "learning_rate": 4.917649746585374e-06, + "loss": 0.5404, + "step": 4577 + }, + { + "epoch": 1.809286243516918, + "grad_norm": 0.44355545237967436, + "learning_rate": 4.917609861077064e-06, + "loss": 0.5673, + "step": 4578 + }, + { + "epoch": 1.8096814028155102, + "grad_norm": 0.45356140574084836, + "learning_rate": 4.917569966073849e-06, + "loss": 0.5466, + "step": 4579 + }, + { + "epoch": 1.8100765621141024, + "grad_norm": 0.4133512710636827, + "learning_rate": 4.917530061575888e-06, + "loss": 0.5353, + "step": 4580 + }, + { + "epoch": 1.8104717214126946, + "grad_norm": 0.4368743815868589, + "learning_rate": 4.917490147583337e-06, + "loss": 0.5711, + "step": 4581 + }, + { + "epoch": 1.8108668807112869, + "grad_norm": 0.429475779733978, + "learning_rate": 4.917450224096353e-06, + "loss": 0.5663, + "step": 4582 + }, + { + "epoch": 1.8112620400098791, + "grad_norm": 0.4489675045116446, + "learning_rate": 4.917410291115092e-06, + "loss": 0.569, + "step": 4583 + }, + { + "epoch": 1.8116571993084714, + "grad_norm": 0.4253227142982292, + "learning_rate": 4.917370348639712e-06, + "loss": 0.5518, + "step": 4584 + }, + { + "epoch": 1.8120523586070636, + "grad_norm": 0.43792903853943244, + "learning_rate": 4.917330396670368e-06, + "loss": 0.5547, + "step": 4585 + }, + { + "epoch": 1.8124475179056558, + "grad_norm": 0.431983453096147, + "learning_rate": 4.917290435207219e-06, + "loss": 0.5622, + "step": 4586 + }, + { + "epoch": 1.812842677204248, + "grad_norm": 0.4421445444609276, + "learning_rate": 4.9172504642504204e-06, + "loss": 0.5734, + "step": 4587 + }, + { + "epoch": 1.8132378365028403, + "grad_norm": 0.4410353497824584, + "learning_rate": 4.91721048380013e-06, + "loss": 0.5692, + "step": 4588 + }, + { + "epoch": 1.8136329958014326, + "grad_norm": 0.41201401996131587, + "learning_rate": 4.917170493856504e-06, + "loss": 0.5427, + "step": 4589 + }, + { + "epoch": 1.8140281551000248, + "grad_norm": 0.4334126547698007, + "learning_rate": 4.917130494419702e-06, + "loss": 0.553, + "step": 4590 + }, + { + "epoch": 1.814423314398617, + "grad_norm": 0.4336096435165992, + "learning_rate": 4.917090485489877e-06, + "loss": 0.5694, + "step": 4591 + }, + { + "epoch": 1.8148184736972093, + "grad_norm": 0.43402803896434733, + "learning_rate": 4.91705046706719e-06, + "loss": 0.5671, + "step": 4592 + }, + { + "epoch": 1.8152136329958015, + "grad_norm": 0.43863493460328706, + "learning_rate": 4.917010439151796e-06, + "loss": 0.5469, + "step": 4593 + }, + { + "epoch": 1.8156087922943938, + "grad_norm": 0.4232411523985182, + "learning_rate": 4.916970401743852e-06, + "loss": 0.5397, + "step": 4594 + }, + { + "epoch": 1.816003951592986, + "grad_norm": 0.41773834042402014, + "learning_rate": 4.916930354843516e-06, + "loss": 0.5595, + "step": 4595 + }, + { + "epoch": 1.8163991108915782, + "grad_norm": 0.42847635907614967, + "learning_rate": 4.9168902984509456e-06, + "loss": 0.5675, + "step": 4596 + }, + { + "epoch": 1.8167942701901705, + "grad_norm": 0.43702921400183176, + "learning_rate": 4.9168502325662985e-06, + "loss": 0.544, + "step": 4597 + }, + { + "epoch": 1.8171894294887627, + "grad_norm": 0.4359050509194601, + "learning_rate": 4.91681015718973e-06, + "loss": 0.5531, + "step": 4598 + }, + { + "epoch": 1.817584588787355, + "grad_norm": 0.4238189029244474, + "learning_rate": 4.9167700723214e-06, + "loss": 0.5222, + "step": 4599 + }, + { + "epoch": 1.8179797480859472, + "grad_norm": 0.42884238541026204, + "learning_rate": 4.916729977961463e-06, + "loss": 0.5486, + "step": 4600 + }, + { + "epoch": 1.8183749073845394, + "grad_norm": 0.42474360636234726, + "learning_rate": 4.91668987411008e-06, + "loss": 0.5553, + "step": 4601 + }, + { + "epoch": 1.8187700666831317, + "grad_norm": 0.44168858012742196, + "learning_rate": 4.916649760767405e-06, + "loss": 0.5518, + "step": 4602 + }, + { + "epoch": 1.819165225981724, + "grad_norm": 0.4556359965693222, + "learning_rate": 4.916609637933598e-06, + "loss": 0.5668, + "step": 4603 + }, + { + "epoch": 1.8195603852803162, + "grad_norm": 0.43307847780187875, + "learning_rate": 4.916569505608816e-06, + "loss": 0.5498, + "step": 4604 + }, + { + "epoch": 1.8199555445789084, + "grad_norm": 0.4311936492838269, + "learning_rate": 4.916529363793216e-06, + "loss": 0.554, + "step": 4605 + }, + { + "epoch": 1.8203507038775006, + "grad_norm": 0.42973054647548137, + "learning_rate": 4.916489212486956e-06, + "loss": 0.565, + "step": 4606 + }, + { + "epoch": 1.8207458631760929, + "grad_norm": 0.42836235280682294, + "learning_rate": 4.916449051690194e-06, + "loss": 0.5599, + "step": 4607 + }, + { + "epoch": 1.8211410224746851, + "grad_norm": 0.41648067518333354, + "learning_rate": 4.916408881403087e-06, + "loss": 0.5499, + "step": 4608 + }, + { + "epoch": 1.8215361817732774, + "grad_norm": 0.4389273257682643, + "learning_rate": 4.916368701625795e-06, + "loss": 0.5582, + "step": 4609 + }, + { + "epoch": 1.8219313410718696, + "grad_norm": 0.4996066817560874, + "learning_rate": 4.916328512358472e-06, + "loss": 0.544, + "step": 4610 + }, + { + "epoch": 1.8223265003704618, + "grad_norm": 0.4320671880478789, + "learning_rate": 4.916288313601278e-06, + "loss": 0.5684, + "step": 4611 + }, + { + "epoch": 1.822721659669054, + "grad_norm": 0.47994501476908125, + "learning_rate": 4.916248105354372e-06, + "loss": 0.5627, + "step": 4612 + }, + { + "epoch": 1.8231168189676463, + "grad_norm": 0.44725918814624716, + "learning_rate": 4.91620788761791e-06, + "loss": 0.5801, + "step": 4613 + }, + { + "epoch": 1.8235119782662386, + "grad_norm": 0.41780605996678455, + "learning_rate": 4.9161676603920505e-06, + "loss": 0.5368, + "step": 4614 + }, + { + "epoch": 1.8239071375648308, + "grad_norm": 0.42149016682401924, + "learning_rate": 4.9161274236769516e-06, + "loss": 0.5417, + "step": 4615 + }, + { + "epoch": 1.824302296863423, + "grad_norm": 0.4159145131021071, + "learning_rate": 4.916087177472771e-06, + "loss": 0.5634, + "step": 4616 + }, + { + "epoch": 1.8246974561620153, + "grad_norm": 0.427772247807, + "learning_rate": 4.916046921779668e-06, + "loss": 0.5597, + "step": 4617 + }, + { + "epoch": 1.8250926154606075, + "grad_norm": 0.504175016285204, + "learning_rate": 4.916006656597799e-06, + "loss": 0.5534, + "step": 4618 + }, + { + "epoch": 1.8254877747591998, + "grad_norm": 0.43930170796960316, + "learning_rate": 4.915966381927324e-06, + "loss": 0.5625, + "step": 4619 + }, + { + "epoch": 1.825882934057792, + "grad_norm": 0.4317015969701484, + "learning_rate": 4.9159260977683986e-06, + "loss": 0.5535, + "step": 4620 + }, + { + "epoch": 1.8262780933563842, + "grad_norm": 0.4319583379497269, + "learning_rate": 4.915885804121184e-06, + "loss": 0.5443, + "step": 4621 + }, + { + "epoch": 1.8266732526549765, + "grad_norm": 0.46283454804644775, + "learning_rate": 4.915845500985836e-06, + "loss": 0.5798, + "step": 4622 + }, + { + "epoch": 1.8270684119535687, + "grad_norm": 0.4578316028623526, + "learning_rate": 4.915805188362514e-06, + "loss": 0.5873, + "step": 4623 + }, + { + "epoch": 1.827463571252161, + "grad_norm": 0.4380807378788219, + "learning_rate": 4.915764866251376e-06, + "loss": 0.5702, + "step": 4624 + }, + { + "epoch": 1.8278587305507532, + "grad_norm": 0.4635611902517724, + "learning_rate": 4.915724534652581e-06, + "loss": 0.5676, + "step": 4625 + }, + { + "epoch": 1.8282538898493454, + "grad_norm": 0.44336086993981816, + "learning_rate": 4.915684193566287e-06, + "loss": 0.571, + "step": 4626 + }, + { + "epoch": 1.8286490491479377, + "grad_norm": 0.49804959145623445, + "learning_rate": 4.915643842992652e-06, + "loss": 0.5704, + "step": 4627 + }, + { + "epoch": 1.82904420844653, + "grad_norm": 0.4484804351198767, + "learning_rate": 4.915603482931835e-06, + "loss": 0.5601, + "step": 4628 + }, + { + "epoch": 1.8294393677451222, + "grad_norm": 0.4259382481340384, + "learning_rate": 4.915563113383994e-06, + "loss": 0.5414, + "step": 4629 + }, + { + "epoch": 1.8298345270437144, + "grad_norm": 0.46410055417082385, + "learning_rate": 4.915522734349289e-06, + "loss": 0.5668, + "step": 4630 + }, + { + "epoch": 1.8302296863423066, + "grad_norm": 0.4450378060474863, + "learning_rate": 4.915482345827876e-06, + "loss": 0.5625, + "step": 4631 + }, + { + "epoch": 1.830624845640899, + "grad_norm": 0.4469682638808664, + "learning_rate": 4.915441947819916e-06, + "loss": 0.56, + "step": 4632 + }, + { + "epoch": 1.8310200049394911, + "grad_norm": 0.4386874685983757, + "learning_rate": 4.915401540325566e-06, + "loss": 0.5763, + "step": 4633 + }, + { + "epoch": 1.8314151642380834, + "grad_norm": 0.4386404133941846, + "learning_rate": 4.9153611233449864e-06, + "loss": 0.5513, + "step": 4634 + }, + { + "epoch": 1.8318103235366756, + "grad_norm": 0.45589832428091503, + "learning_rate": 4.915320696878335e-06, + "loss": 0.5545, + "step": 4635 + }, + { + "epoch": 1.8322054828352679, + "grad_norm": 0.4388397614681877, + "learning_rate": 4.91528026092577e-06, + "loss": 0.5369, + "step": 4636 + }, + { + "epoch": 1.83260064213386, + "grad_norm": 0.4220869515007803, + "learning_rate": 4.915239815487451e-06, + "loss": 0.5536, + "step": 4637 + }, + { + "epoch": 1.8329958014324523, + "grad_norm": 0.4516307562377459, + "learning_rate": 4.915199360563536e-06, + "loss": 0.5663, + "step": 4638 + }, + { + "epoch": 1.8333909607310446, + "grad_norm": 0.43082783021013155, + "learning_rate": 4.915158896154185e-06, + "loss": 0.5711, + "step": 4639 + }, + { + "epoch": 1.8337861200296368, + "grad_norm": 0.4280112586626305, + "learning_rate": 4.915118422259557e-06, + "loss": 0.5586, + "step": 4640 + }, + { + "epoch": 1.834181279328229, + "grad_norm": 0.455862847226725, + "learning_rate": 4.91507793887981e-06, + "loss": 0.5567, + "step": 4641 + }, + { + "epoch": 1.8345764386268213, + "grad_norm": 0.43432786099005233, + "learning_rate": 4.915037446015103e-06, + "loss": 0.5722, + "step": 4642 + }, + { + "epoch": 1.8349715979254135, + "grad_norm": 0.45288339991619825, + "learning_rate": 4.914996943665596e-06, + "loss": 0.5548, + "step": 4643 + }, + { + "epoch": 1.8353667572240058, + "grad_norm": 0.430304761849555, + "learning_rate": 4.914956431831447e-06, + "loss": 0.5569, + "step": 4644 + }, + { + "epoch": 1.835761916522598, + "grad_norm": 0.4516766791005426, + "learning_rate": 4.914915910512815e-06, + "loss": 0.5439, + "step": 4645 + }, + { + "epoch": 1.8361570758211903, + "grad_norm": 0.4262026911256271, + "learning_rate": 4.914875379709861e-06, + "loss": 0.5502, + "step": 4646 + }, + { + "epoch": 1.8365522351197827, + "grad_norm": 0.44982296126025034, + "learning_rate": 4.914834839422742e-06, + "loss": 0.5753, + "step": 4647 + }, + { + "epoch": 1.836947394418375, + "grad_norm": 0.4305160106289168, + "learning_rate": 4.914794289651619e-06, + "loss": 0.5476, + "step": 4648 + }, + { + "epoch": 1.8373425537169672, + "grad_norm": 0.43698124283033885, + "learning_rate": 4.91475373039665e-06, + "loss": 0.5626, + "step": 4649 + }, + { + "epoch": 1.8377377130155594, + "grad_norm": 0.4324559418236934, + "learning_rate": 4.914713161657993e-06, + "loss": 0.5465, + "step": 4650 + }, + { + "epoch": 1.8381328723141517, + "grad_norm": 0.46505067847109016, + "learning_rate": 4.914672583435811e-06, + "loss": 0.5773, + "step": 4651 + }, + { + "epoch": 1.838528031612744, + "grad_norm": 0.44054104719934584, + "learning_rate": 4.9146319957302615e-06, + "loss": 0.5462, + "step": 4652 + }, + { + "epoch": 1.8389231909113362, + "grad_norm": 0.4293700652645379, + "learning_rate": 4.914591398541503e-06, + "loss": 0.5514, + "step": 4653 + }, + { + "epoch": 1.8393183502099284, + "grad_norm": 0.45753087472514964, + "learning_rate": 4.9145507918696956e-06, + "loss": 0.5478, + "step": 4654 + }, + { + "epoch": 1.8397135095085206, + "grad_norm": 0.4461916227350306, + "learning_rate": 4.9145101757149994e-06, + "loss": 0.5548, + "step": 4655 + }, + { + "epoch": 1.8401086688071129, + "grad_norm": 0.4471190461133453, + "learning_rate": 4.914469550077573e-06, + "loss": 0.5522, + "step": 4656 + }, + { + "epoch": 1.8405038281057051, + "grad_norm": 0.44384611797235896, + "learning_rate": 4.914428914957576e-06, + "loss": 0.572, + "step": 4657 + }, + { + "epoch": 1.8408989874042974, + "grad_norm": 0.4574610337875189, + "learning_rate": 4.9143882703551685e-06, + "loss": 0.5625, + "step": 4658 + }, + { + "epoch": 1.8412941467028896, + "grad_norm": 0.5048055930740011, + "learning_rate": 4.914347616270511e-06, + "loss": 0.5431, + "step": 4659 + }, + { + "epoch": 1.8416893060014818, + "grad_norm": 0.4421724478683854, + "learning_rate": 4.914306952703761e-06, + "loss": 0.5472, + "step": 4660 + }, + { + "epoch": 1.842084465300074, + "grad_norm": 0.43918097376339305, + "learning_rate": 4.914266279655079e-06, + "loss": 0.5677, + "step": 4661 + }, + { + "epoch": 1.8424796245986663, + "grad_norm": 0.4518794942674701, + "learning_rate": 4.914225597124626e-06, + "loss": 0.5492, + "step": 4662 + }, + { + "epoch": 1.8428747838972586, + "grad_norm": 0.47641842318354, + "learning_rate": 4.9141849051125614e-06, + "loss": 0.5769, + "step": 4663 + }, + { + "epoch": 1.8432699431958508, + "grad_norm": 0.43242823970216165, + "learning_rate": 4.9141442036190435e-06, + "loss": 0.5474, + "step": 4664 + }, + { + "epoch": 1.843665102494443, + "grad_norm": 0.43975219203201016, + "learning_rate": 4.914103492644233e-06, + "loss": 0.5428, + "step": 4665 + }, + { + "epoch": 1.8440602617930353, + "grad_norm": 0.44273932980174646, + "learning_rate": 4.91406277218829e-06, + "loss": 0.5429, + "step": 4666 + }, + { + "epoch": 1.8444554210916275, + "grad_norm": 0.4371689468098226, + "learning_rate": 4.914022042251375e-06, + "loss": 0.5443, + "step": 4667 + }, + { + "epoch": 1.8448505803902198, + "grad_norm": 0.46091840391838906, + "learning_rate": 4.9139813028336465e-06, + "loss": 0.5623, + "step": 4668 + }, + { + "epoch": 1.845245739688812, + "grad_norm": 0.4515022813696254, + "learning_rate": 4.9139405539352655e-06, + "loss": 0.5615, + "step": 4669 + }, + { + "epoch": 1.8456408989874042, + "grad_norm": 0.44395923841481416, + "learning_rate": 4.913899795556391e-06, + "loss": 0.5688, + "step": 4670 + }, + { + "epoch": 1.8460360582859967, + "grad_norm": 0.44117565523570207, + "learning_rate": 4.913859027697185e-06, + "loss": 0.5654, + "step": 4671 + }, + { + "epoch": 1.846431217584589, + "grad_norm": 0.42429096303861064, + "learning_rate": 4.913818250357807e-06, + "loss": 0.5462, + "step": 4672 + }, + { + "epoch": 1.8468263768831812, + "grad_norm": 0.44958055380119594, + "learning_rate": 4.913777463538416e-06, + "loss": 0.5686, + "step": 4673 + }, + { + "epoch": 1.8472215361817734, + "grad_norm": 0.4443331190570723, + "learning_rate": 4.913736667239173e-06, + "loss": 0.5567, + "step": 4674 + }, + { + "epoch": 1.8476166954803657, + "grad_norm": 0.42627047085078384, + "learning_rate": 4.913695861460238e-06, + "loss": 0.5426, + "step": 4675 + }, + { + "epoch": 1.848011854778958, + "grad_norm": 0.4500558947718227, + "learning_rate": 4.9136550462017716e-06, + "loss": 0.5542, + "step": 4676 + }, + { + "epoch": 1.8484070140775501, + "grad_norm": 0.4414303454674209, + "learning_rate": 4.913614221463932e-06, + "loss": 0.555, + "step": 4677 + }, + { + "epoch": 1.8488021733761424, + "grad_norm": 0.43962204738708255, + "learning_rate": 4.913573387246884e-06, + "loss": 0.5704, + "step": 4678 + }, + { + "epoch": 1.8491973326747346, + "grad_norm": 0.44664322555736324, + "learning_rate": 4.9135325435507845e-06, + "loss": 0.57, + "step": 4679 + }, + { + "epoch": 1.8495924919733269, + "grad_norm": 0.4539113882483917, + "learning_rate": 4.913491690375794e-06, + "loss": 0.5679, + "step": 4680 + }, + { + "epoch": 1.849987651271919, + "grad_norm": 0.4392523191394003, + "learning_rate": 4.913450827722074e-06, + "loss": 0.5388, + "step": 4681 + }, + { + "epoch": 1.8503828105705113, + "grad_norm": 0.430956543329709, + "learning_rate": 4.913409955589785e-06, + "loss": 0.5727, + "step": 4682 + }, + { + "epoch": 1.8507779698691036, + "grad_norm": 0.4509366979993098, + "learning_rate": 4.9133690739790864e-06, + "loss": 0.5503, + "step": 4683 + }, + { + "epoch": 1.8511731291676958, + "grad_norm": 0.4412502685023789, + "learning_rate": 4.91332818289014e-06, + "loss": 0.542, + "step": 4684 + }, + { + "epoch": 1.851568288466288, + "grad_norm": 0.4421900270354689, + "learning_rate": 4.913287282323107e-06, + "loss": 0.5562, + "step": 4685 + }, + { + "epoch": 1.8519634477648803, + "grad_norm": 0.5017741843721322, + "learning_rate": 4.913246372278145e-06, + "loss": 0.5546, + "step": 4686 + }, + { + "epoch": 1.8523586070634726, + "grad_norm": 0.42598558349235277, + "learning_rate": 4.913205452755418e-06, + "loss": 0.5565, + "step": 4687 + }, + { + "epoch": 1.8527537663620648, + "grad_norm": 0.44801891691212664, + "learning_rate": 4.913164523755085e-06, + "loss": 0.5622, + "step": 4688 + }, + { + "epoch": 1.853148925660657, + "grad_norm": 0.46504021620652847, + "learning_rate": 4.9131235852773075e-06, + "loss": 0.5486, + "step": 4689 + }, + { + "epoch": 1.8535440849592493, + "grad_norm": 0.42392415227635827, + "learning_rate": 4.913082637322245e-06, + "loss": 0.5462, + "step": 4690 + }, + { + "epoch": 1.8539392442578415, + "grad_norm": 0.43104217850617627, + "learning_rate": 4.91304167989006e-06, + "loss": 0.5406, + "step": 4691 + }, + { + "epoch": 1.8543344035564338, + "grad_norm": 0.47167572808016, + "learning_rate": 4.9130007129809135e-06, + "loss": 0.5469, + "step": 4692 + }, + { + "epoch": 1.854729562855026, + "grad_norm": 0.4747344328170053, + "learning_rate": 4.912959736594963e-06, + "loss": 0.5653, + "step": 4693 + }, + { + "epoch": 1.8551247221536182, + "grad_norm": 0.4343101893171199, + "learning_rate": 4.912918750732374e-06, + "loss": 0.5373, + "step": 4694 + }, + { + "epoch": 1.8555198814522105, + "grad_norm": 0.42499504251024883, + "learning_rate": 4.9128777553933035e-06, + "loss": 0.5499, + "step": 4695 + }, + { + "epoch": 1.8559150407508027, + "grad_norm": 0.45857770081696003, + "learning_rate": 4.9128367505779165e-06, + "loss": 0.5665, + "step": 4696 + }, + { + "epoch": 1.856310200049395, + "grad_norm": 0.47188633170732824, + "learning_rate": 4.91279573628637e-06, + "loss": 0.5745, + "step": 4697 + }, + { + "epoch": 1.8567053593479872, + "grad_norm": 0.43004014361775683, + "learning_rate": 4.912754712518828e-06, + "loss": 0.5387, + "step": 4698 + }, + { + "epoch": 1.8571005186465794, + "grad_norm": 0.450368047124033, + "learning_rate": 4.912713679275451e-06, + "loss": 0.551, + "step": 4699 + }, + { + "epoch": 1.8574956779451717, + "grad_norm": 0.4561974358066812, + "learning_rate": 4.912672636556398e-06, + "loss": 0.5645, + "step": 4700 + }, + { + "epoch": 1.857890837243764, + "grad_norm": 0.4327321080207485, + "learning_rate": 4.912631584361833e-06, + "loss": 0.5449, + "step": 4701 + }, + { + "epoch": 1.8582859965423562, + "grad_norm": 0.42737763737038065, + "learning_rate": 4.912590522691917e-06, + "loss": 0.5381, + "step": 4702 + }, + { + "epoch": 1.8586811558409484, + "grad_norm": 0.43781196017871354, + "learning_rate": 4.912549451546809e-06, + "loss": 0.5615, + "step": 4703 + }, + { + "epoch": 1.8590763151395406, + "grad_norm": 0.43337841503621866, + "learning_rate": 4.912508370926672e-06, + "loss": 0.5569, + "step": 4704 + }, + { + "epoch": 1.8594714744381329, + "grad_norm": 0.451977525541082, + "learning_rate": 4.912467280831668e-06, + "loss": 0.5494, + "step": 4705 + }, + { + "epoch": 1.8598666337367251, + "grad_norm": 0.4728561044232118, + "learning_rate": 4.9124261812619566e-06, + "loss": 0.5483, + "step": 4706 + }, + { + "epoch": 1.8602617930353174, + "grad_norm": 0.4415020307643117, + "learning_rate": 4.9123850722177e-06, + "loss": 0.548, + "step": 4707 + }, + { + "epoch": 1.8606569523339096, + "grad_norm": 0.45173525627796496, + "learning_rate": 4.912343953699061e-06, + "loss": 0.5574, + "step": 4708 + }, + { + "epoch": 1.8610521116325018, + "grad_norm": 0.4267764458634889, + "learning_rate": 4.912302825706198e-06, + "loss": 0.553, + "step": 4709 + }, + { + "epoch": 1.861447270931094, + "grad_norm": 0.4357451577511683, + "learning_rate": 4.912261688239275e-06, + "loss": 0.5541, + "step": 4710 + }, + { + "epoch": 1.8618424302296863, + "grad_norm": 0.46999902555719913, + "learning_rate": 4.912220541298454e-06, + "loss": 0.5777, + "step": 4711 + }, + { + "epoch": 1.8622375895282786, + "grad_norm": 0.4574693196717579, + "learning_rate": 4.912179384883894e-06, + "loss": 0.5587, + "step": 4712 + }, + { + "epoch": 1.8626327488268708, + "grad_norm": 0.44445542433033436, + "learning_rate": 4.912138218995759e-06, + "loss": 0.5649, + "step": 4713 + }, + { + "epoch": 1.863027908125463, + "grad_norm": 0.4342553915239652, + "learning_rate": 4.9120970436342095e-06, + "loss": 0.562, + "step": 4714 + }, + { + "epoch": 1.8634230674240553, + "grad_norm": 0.4543224773995171, + "learning_rate": 4.912055858799407e-06, + "loss": 0.5718, + "step": 4715 + }, + { + "epoch": 1.8638182267226475, + "grad_norm": 0.42547741598029465, + "learning_rate": 4.912014664491514e-06, + "loss": 0.5439, + "step": 4716 + }, + { + "epoch": 1.8642133860212398, + "grad_norm": 0.45619787933952977, + "learning_rate": 4.911973460710692e-06, + "loss": 0.5515, + "step": 4717 + }, + { + "epoch": 1.864608545319832, + "grad_norm": 0.46059272985729816, + "learning_rate": 4.911932247457104e-06, + "loss": 0.5648, + "step": 4718 + }, + { + "epoch": 1.8650037046184242, + "grad_norm": 0.44116740865007975, + "learning_rate": 4.911891024730911e-06, + "loss": 0.5574, + "step": 4719 + }, + { + "epoch": 1.8653988639170165, + "grad_norm": 0.41870208648406154, + "learning_rate": 4.9118497925322725e-06, + "loss": 0.537, + "step": 4720 + }, + { + "epoch": 1.8657940232156087, + "grad_norm": 0.5142452344989176, + "learning_rate": 4.911808550861353e-06, + "loss": 0.5673, + "step": 4721 + }, + { + "epoch": 1.866189182514201, + "grad_norm": 0.4401669678300714, + "learning_rate": 4.9117672997183155e-06, + "loss": 0.5578, + "step": 4722 + }, + { + "epoch": 1.8665843418127932, + "grad_norm": 0.4455903304729346, + "learning_rate": 4.911726039103319e-06, + "loss": 0.5608, + "step": 4723 + }, + { + "epoch": 1.8669795011113854, + "grad_norm": 0.44233877100194063, + "learning_rate": 4.911684769016528e-06, + "loss": 0.5764, + "step": 4724 + }, + { + "epoch": 1.8673746604099777, + "grad_norm": 0.43688108562586403, + "learning_rate": 4.911643489458104e-06, + "loss": 0.5526, + "step": 4725 + }, + { + "epoch": 1.86776981970857, + "grad_norm": 0.4534632460594352, + "learning_rate": 4.911602200428208e-06, + "loss": 0.5542, + "step": 4726 + }, + { + "epoch": 1.8681649790071622, + "grad_norm": 0.4803823461812875, + "learning_rate": 4.911560901927003e-06, + "loss": 0.5458, + "step": 4727 + }, + { + "epoch": 1.8685601383057544, + "grad_norm": 0.4390720418788202, + "learning_rate": 4.911519593954652e-06, + "loss": 0.5585, + "step": 4728 + }, + { + "epoch": 1.8689552976043466, + "grad_norm": 0.43151939301169534, + "learning_rate": 4.9114782765113155e-06, + "loss": 0.5717, + "step": 4729 + }, + { + "epoch": 1.8693504569029389, + "grad_norm": 0.43397827245657605, + "learning_rate": 4.911436949597157e-06, + "loss": 0.5294, + "step": 4730 + }, + { + "epoch": 1.8697456162015311, + "grad_norm": 0.4348332309375543, + "learning_rate": 4.911395613212339e-06, + "loss": 0.5472, + "step": 4731 + }, + { + "epoch": 1.8701407755001234, + "grad_norm": 0.4244300368938872, + "learning_rate": 4.911354267357022e-06, + "loss": 0.565, + "step": 4732 + }, + { + "epoch": 1.8705359347987156, + "grad_norm": 0.4353867070970591, + "learning_rate": 4.911312912031371e-06, + "loss": 0.5642, + "step": 4733 + }, + { + "epoch": 1.8709310940973078, + "grad_norm": 0.42839969001511663, + "learning_rate": 4.9112715472355464e-06, + "loss": 0.5475, + "step": 4734 + }, + { + "epoch": 1.8713262533959, + "grad_norm": 0.44110960690425427, + "learning_rate": 4.911230172969711e-06, + "loss": 0.5614, + "step": 4735 + }, + { + "epoch": 1.8717214126944923, + "grad_norm": 0.4995723449596207, + "learning_rate": 4.911188789234028e-06, + "loss": 0.5662, + "step": 4736 + }, + { + "epoch": 1.8721165719930846, + "grad_norm": 0.423171830947674, + "learning_rate": 4.91114739602866e-06, + "loss": 0.5564, + "step": 4737 + }, + { + "epoch": 1.8725117312916768, + "grad_norm": 0.44108599595757897, + "learning_rate": 4.911105993353769e-06, + "loss": 0.5664, + "step": 4738 + }, + { + "epoch": 1.872906890590269, + "grad_norm": 0.4502796994319581, + "learning_rate": 4.9110645812095174e-06, + "loss": 0.5572, + "step": 4739 + }, + { + "epoch": 1.8733020498888613, + "grad_norm": 0.4245422544885071, + "learning_rate": 4.911023159596069e-06, + "loss": 0.538, + "step": 4740 + }, + { + "epoch": 1.8736972091874535, + "grad_norm": 0.41908462248992445, + "learning_rate": 4.910981728513586e-06, + "loss": 0.5383, + "step": 4741 + }, + { + "epoch": 1.874092368486046, + "grad_norm": 0.4232818477813306, + "learning_rate": 4.910940287962229e-06, + "loss": 0.5348, + "step": 4742 + }, + { + "epoch": 1.8744875277846382, + "grad_norm": 0.43217070766658694, + "learning_rate": 4.910898837942163e-06, + "loss": 0.5527, + "step": 4743 + }, + { + "epoch": 1.8748826870832305, + "grad_norm": 0.4312801007800965, + "learning_rate": 4.9108573784535515e-06, + "loss": 0.5627, + "step": 4744 + }, + { + "epoch": 1.8752778463818227, + "grad_norm": 0.44318569935703905, + "learning_rate": 4.910815909496555e-06, + "loss": 0.5666, + "step": 4745 + }, + { + "epoch": 1.875673005680415, + "grad_norm": 0.4264547180929095, + "learning_rate": 4.910774431071338e-06, + "loss": 0.5566, + "step": 4746 + }, + { + "epoch": 1.8760681649790072, + "grad_norm": 0.43447024061939904, + "learning_rate": 4.910732943178063e-06, + "loss": 0.5388, + "step": 4747 + }, + { + "epoch": 1.8764633242775994, + "grad_norm": 0.44201190438269433, + "learning_rate": 4.9106914458168934e-06, + "loss": 0.537, + "step": 4748 + }, + { + "epoch": 1.8768584835761917, + "grad_norm": 0.43667832935964324, + "learning_rate": 4.91064993898799e-06, + "loss": 0.554, + "step": 4749 + }, + { + "epoch": 1.877253642874784, + "grad_norm": 0.4332594072357512, + "learning_rate": 4.910608422691519e-06, + "loss": 0.5458, + "step": 4750 + }, + { + "epoch": 1.8776488021733762, + "grad_norm": 0.4281199350474413, + "learning_rate": 4.910566896927642e-06, + "loss": 0.5343, + "step": 4751 + }, + { + "epoch": 1.8780439614719684, + "grad_norm": 0.4300151216135207, + "learning_rate": 4.910525361696521e-06, + "loss": 0.5442, + "step": 4752 + }, + { + "epoch": 1.8784391207705606, + "grad_norm": 0.4448357946018024, + "learning_rate": 4.91048381699832e-06, + "loss": 0.5608, + "step": 4753 + }, + { + "epoch": 1.8788342800691529, + "grad_norm": 0.43367677911395175, + "learning_rate": 4.910442262833204e-06, + "loss": 0.5497, + "step": 4754 + }, + { + "epoch": 1.8792294393677451, + "grad_norm": 0.4307967775333535, + "learning_rate": 4.9104006992013335e-06, + "loss": 0.5522, + "step": 4755 + }, + { + "epoch": 1.8796245986663374, + "grad_norm": 0.44915265676312033, + "learning_rate": 4.910359126102872e-06, + "loss": 0.5541, + "step": 4756 + }, + { + "epoch": 1.8800197579649296, + "grad_norm": 0.4513270587407442, + "learning_rate": 4.910317543537984e-06, + "loss": 0.5704, + "step": 4757 + }, + { + "epoch": 1.8804149172635218, + "grad_norm": 0.42836930222487574, + "learning_rate": 4.910275951506832e-06, + "loss": 0.5455, + "step": 4758 + }, + { + "epoch": 1.880810076562114, + "grad_norm": 0.4200332636052868, + "learning_rate": 4.91023435000958e-06, + "loss": 0.5493, + "step": 4759 + }, + { + "epoch": 1.8812052358607063, + "grad_norm": 0.4322816077163206, + "learning_rate": 4.910192739046392e-06, + "loss": 0.5828, + "step": 4760 + }, + { + "epoch": 1.8816003951592986, + "grad_norm": 0.42914581453180367, + "learning_rate": 4.910151118617429e-06, + "loss": 0.5444, + "step": 4761 + }, + { + "epoch": 1.8819955544578908, + "grad_norm": 0.4164751887870663, + "learning_rate": 4.910109488722857e-06, + "loss": 0.5345, + "step": 4762 + }, + { + "epoch": 1.882390713756483, + "grad_norm": 0.4231358763864964, + "learning_rate": 4.910067849362838e-06, + "loss": 0.5464, + "step": 4763 + }, + { + "epoch": 1.8827858730550753, + "grad_norm": 0.4289620642079432, + "learning_rate": 4.910026200537535e-06, + "loss": 0.5629, + "step": 4764 + }, + { + "epoch": 1.8831810323536677, + "grad_norm": 0.4296872173261681, + "learning_rate": 4.909984542247115e-06, + "loss": 0.5323, + "step": 4765 + }, + { + "epoch": 1.88357619165226, + "grad_norm": 0.4405891662125567, + "learning_rate": 4.909942874491736e-06, + "loss": 0.5581, + "step": 4766 + }, + { + "epoch": 1.8839713509508522, + "grad_norm": 0.49421695499827306, + "learning_rate": 4.9099011972715674e-06, + "loss": 0.561, + "step": 4767 + }, + { + "epoch": 1.8843665102494445, + "grad_norm": 0.44152259881922284, + "learning_rate": 4.909859510586769e-06, + "loss": 0.55, + "step": 4768 + }, + { + "epoch": 1.8847616695480367, + "grad_norm": 0.5509522024405635, + "learning_rate": 4.909817814437506e-06, + "loss": 0.5424, + "step": 4769 + }, + { + "epoch": 1.885156828846629, + "grad_norm": 0.42019717906356363, + "learning_rate": 4.909776108823941e-06, + "loss": 0.5529, + "step": 4770 + }, + { + "epoch": 1.8855519881452212, + "grad_norm": 0.42580576090703337, + "learning_rate": 4.909734393746241e-06, + "loss": 0.5465, + "step": 4771 + }, + { + "epoch": 1.8859471474438134, + "grad_norm": 0.429426688268032, + "learning_rate": 4.909692669204565e-06, + "loss": 0.5518, + "step": 4772 + }, + { + "epoch": 1.8863423067424057, + "grad_norm": 0.4597075112764053, + "learning_rate": 4.909650935199082e-06, + "loss": 0.593, + "step": 4773 + }, + { + "epoch": 1.886737466040998, + "grad_norm": 0.4231583750958899, + "learning_rate": 4.909609191729951e-06, + "loss": 0.5623, + "step": 4774 + }, + { + "epoch": 1.8871326253395901, + "grad_norm": 0.42211619397536165, + "learning_rate": 4.90956743879734e-06, + "loss": 0.5507, + "step": 4775 + }, + { + "epoch": 1.8875277846381824, + "grad_norm": 0.42252973826533957, + "learning_rate": 4.90952567640141e-06, + "loss": 0.5614, + "step": 4776 + }, + { + "epoch": 1.8879229439367746, + "grad_norm": 0.4531797772700644, + "learning_rate": 4.909483904542327e-06, + "loss": 0.5533, + "step": 4777 + }, + { + "epoch": 1.8883181032353669, + "grad_norm": 0.4376214503057026, + "learning_rate": 4.909442123220255e-06, + "loss": 0.5564, + "step": 4778 + }, + { + "epoch": 1.888713262533959, + "grad_norm": 0.43083585059658475, + "learning_rate": 4.909400332435357e-06, + "loss": 0.5523, + "step": 4779 + }, + { + "epoch": 1.8891084218325513, + "grad_norm": 0.41258509304143803, + "learning_rate": 4.909358532187796e-06, + "loss": 0.5389, + "step": 4780 + }, + { + "epoch": 1.8895035811311436, + "grad_norm": 0.4343631923478858, + "learning_rate": 4.909316722477739e-06, + "loss": 0.5707, + "step": 4781 + }, + { + "epoch": 1.8898987404297358, + "grad_norm": 0.4432077662590313, + "learning_rate": 4.909274903305349e-06, + "loss": 0.5461, + "step": 4782 + }, + { + "epoch": 1.890293899728328, + "grad_norm": 0.43233155649566796, + "learning_rate": 4.909233074670791e-06, + "loss": 0.568, + "step": 4783 + }, + { + "epoch": 1.8906890590269203, + "grad_norm": 0.42852851485106386, + "learning_rate": 4.909191236574227e-06, + "loss": 0.5621, + "step": 4784 + }, + { + "epoch": 1.8910842183255125, + "grad_norm": 0.4322477381113751, + "learning_rate": 4.909149389015823e-06, + "loss": 0.5531, + "step": 4785 + }, + { + "epoch": 1.8914793776241048, + "grad_norm": 0.43499368654441956, + "learning_rate": 4.909107531995744e-06, + "loss": 0.5732, + "step": 4786 + }, + { + "epoch": 1.891874536922697, + "grad_norm": 0.43733806705560363, + "learning_rate": 4.909065665514152e-06, + "loss": 0.5763, + "step": 4787 + }, + { + "epoch": 1.8922696962212893, + "grad_norm": 0.45421207033142263, + "learning_rate": 4.909023789571214e-06, + "loss": 0.5762, + "step": 4788 + }, + { + "epoch": 1.8926648555198815, + "grad_norm": 0.47864113189844826, + "learning_rate": 4.908981904167094e-06, + "loss": 0.5378, + "step": 4789 + }, + { + "epoch": 1.8930600148184737, + "grad_norm": 0.4498106966078856, + "learning_rate": 4.908940009301955e-06, + "loss": 0.554, + "step": 4790 + }, + { + "epoch": 1.893455174117066, + "grad_norm": 0.47655100568930403, + "learning_rate": 4.908898104975962e-06, + "loss": 0.5498, + "step": 4791 + }, + { + "epoch": 1.8938503334156582, + "grad_norm": 0.43699221579300107, + "learning_rate": 4.908856191189281e-06, + "loss": 0.5803, + "step": 4792 + }, + { + "epoch": 1.8942454927142505, + "grad_norm": 0.44031567020576257, + "learning_rate": 4.908814267942075e-06, + "loss": 0.5603, + "step": 4793 + }, + { + "epoch": 1.8946406520128427, + "grad_norm": 0.4381512496766988, + "learning_rate": 4.908772335234509e-06, + "loss": 0.56, + "step": 4794 + }, + { + "epoch": 1.895035811311435, + "grad_norm": 0.4387758165017901, + "learning_rate": 4.9087303930667485e-06, + "loss": 0.5642, + "step": 4795 + }, + { + "epoch": 1.8954309706100272, + "grad_norm": 0.4239025821881657, + "learning_rate": 4.908688441438957e-06, + "loss": 0.5522, + "step": 4796 + }, + { + "epoch": 1.8958261299086194, + "grad_norm": 0.4273577795828686, + "learning_rate": 4.908646480351301e-06, + "loss": 0.5502, + "step": 4797 + }, + { + "epoch": 1.8962212892072117, + "grad_norm": 0.44421556427677866, + "learning_rate": 4.908604509803944e-06, + "loss": 0.5495, + "step": 4798 + }, + { + "epoch": 1.896616448505804, + "grad_norm": 0.44492416211462005, + "learning_rate": 4.908562529797051e-06, + "loss": 0.5577, + "step": 4799 + }, + { + "epoch": 1.8970116078043961, + "grad_norm": 0.4388278234952724, + "learning_rate": 4.908520540330786e-06, + "loss": 0.5615, + "step": 4800 + }, + { + "epoch": 1.8974067671029884, + "grad_norm": 0.43228612343838674, + "learning_rate": 4.908478541405316e-06, + "loss": 0.5467, + "step": 4801 + }, + { + "epoch": 1.8978019264015806, + "grad_norm": 0.4309700517194293, + "learning_rate": 4.908436533020804e-06, + "loss": 0.5321, + "step": 4802 + }, + { + "epoch": 1.8981970857001729, + "grad_norm": 0.4281007407461444, + "learning_rate": 4.908394515177416e-06, + "loss": 0.5356, + "step": 4803 + }, + { + "epoch": 1.898592244998765, + "grad_norm": 0.4323019441412228, + "learning_rate": 4.908352487875317e-06, + "loss": 0.5679, + "step": 4804 + }, + { + "epoch": 1.8989874042973574, + "grad_norm": 0.4403836139433748, + "learning_rate": 4.908310451114672e-06, + "loss": 0.5662, + "step": 4805 + }, + { + "epoch": 1.8993825635959496, + "grad_norm": 0.44165797245282984, + "learning_rate": 4.908268404895645e-06, + "loss": 0.543, + "step": 4806 + }, + { + "epoch": 1.8997777228945418, + "grad_norm": 0.4437790299847047, + "learning_rate": 4.908226349218404e-06, + "loss": 0.5511, + "step": 4807 + }, + { + "epoch": 1.900172882193134, + "grad_norm": 0.44724367559807077, + "learning_rate": 4.908184284083111e-06, + "loss": 0.5635, + "step": 4808 + }, + { + "epoch": 1.9005680414917263, + "grad_norm": 0.4365212081127553, + "learning_rate": 4.908142209489932e-06, + "loss": 0.5542, + "step": 4809 + }, + { + "epoch": 1.9009632007903186, + "grad_norm": 0.45163950732319247, + "learning_rate": 4.908100125439033e-06, + "loss": 0.5627, + "step": 4810 + }, + { + "epoch": 1.9013583600889108, + "grad_norm": 0.4316244913107137, + "learning_rate": 4.90805803193058e-06, + "loss": 0.5498, + "step": 4811 + }, + { + "epoch": 1.901753519387503, + "grad_norm": 0.445880369848173, + "learning_rate": 4.908015928964735e-06, + "loss": 0.5565, + "step": 4812 + }, + { + "epoch": 1.9021486786860953, + "grad_norm": 0.4344296261072089, + "learning_rate": 4.9079738165416676e-06, + "loss": 0.5358, + "step": 4813 + }, + { + "epoch": 1.9025438379846875, + "grad_norm": 0.4352424451727997, + "learning_rate": 4.907931694661541e-06, + "loss": 0.5495, + "step": 4814 + }, + { + "epoch": 1.9029389972832798, + "grad_norm": 0.4200141642812343, + "learning_rate": 4.907889563324521e-06, + "loss": 0.5529, + "step": 4815 + }, + { + "epoch": 1.903334156581872, + "grad_norm": 0.42328744956338943, + "learning_rate": 4.907847422530773e-06, + "loss": 0.5408, + "step": 4816 + }, + { + "epoch": 1.9037293158804642, + "grad_norm": 0.43808904720068553, + "learning_rate": 4.907805272280461e-06, + "loss": 0.5654, + "step": 4817 + }, + { + "epoch": 1.9041244751790565, + "grad_norm": 0.4345122275007242, + "learning_rate": 4.907763112573754e-06, + "loss": 0.5558, + "step": 4818 + }, + { + "epoch": 1.9045196344776487, + "grad_norm": 0.41639788021854496, + "learning_rate": 4.907720943410814e-06, + "loss": 0.5506, + "step": 4819 + }, + { + "epoch": 1.904914793776241, + "grad_norm": 0.42802462554370857, + "learning_rate": 4.90767876479181e-06, + "loss": 0.5433, + "step": 4820 + }, + { + "epoch": 1.9053099530748332, + "grad_norm": 0.43712860812027704, + "learning_rate": 4.907636576716904e-06, + "loss": 0.5482, + "step": 4821 + }, + { + "epoch": 1.9057051123734254, + "grad_norm": 0.4442981118293308, + "learning_rate": 4.9075943791862645e-06, + "loss": 0.5516, + "step": 4822 + }, + { + "epoch": 1.9061002716720177, + "grad_norm": 0.42923155588906153, + "learning_rate": 4.907552172200056e-06, + "loss": 0.5646, + "step": 4823 + }, + { + "epoch": 1.90649543097061, + "grad_norm": 0.4175409742504318, + "learning_rate": 4.907509955758444e-06, + "loss": 0.5476, + "step": 4824 + }, + { + "epoch": 1.9068905902692022, + "grad_norm": 0.43260553320884426, + "learning_rate": 4.907467729861595e-06, + "loss": 0.5576, + "step": 4825 + }, + { + "epoch": 1.9072857495677944, + "grad_norm": 0.4789723482405854, + "learning_rate": 4.907425494509675e-06, + "loss": 0.5863, + "step": 4826 + }, + { + "epoch": 1.9076809088663866, + "grad_norm": 0.43256077905417156, + "learning_rate": 4.90738324970285e-06, + "loss": 0.5648, + "step": 4827 + }, + { + "epoch": 1.9080760681649789, + "grad_norm": 0.4301753052479235, + "learning_rate": 4.907340995441284e-06, + "loss": 0.5765, + "step": 4828 + }, + { + "epoch": 1.9084712274635711, + "grad_norm": 0.4262264166103991, + "learning_rate": 4.907298731725146e-06, + "loss": 0.549, + "step": 4829 + }, + { + "epoch": 1.9088663867621634, + "grad_norm": 0.41997042490994213, + "learning_rate": 4.9072564585546e-06, + "loss": 0.558, + "step": 4830 + }, + { + "epoch": 1.9092615460607556, + "grad_norm": 0.4364550869207615, + "learning_rate": 4.9072141759298114e-06, + "loss": 0.5643, + "step": 4831 + }, + { + "epoch": 1.9096567053593478, + "grad_norm": 0.42201288173328566, + "learning_rate": 4.907171883850948e-06, + "loss": 0.534, + "step": 4832 + }, + { + "epoch": 1.91005186465794, + "grad_norm": 0.4417197054389254, + "learning_rate": 4.907129582318175e-06, + "loss": 0.5823, + "step": 4833 + }, + { + "epoch": 1.9104470239565323, + "grad_norm": 0.43489230519142597, + "learning_rate": 4.907087271331658e-06, + "loss": 0.5634, + "step": 4834 + }, + { + "epoch": 1.9108421832551246, + "grad_norm": 0.4273968780202027, + "learning_rate": 4.907044950891565e-06, + "loss": 0.5668, + "step": 4835 + }, + { + "epoch": 1.911237342553717, + "grad_norm": 0.42553680975953273, + "learning_rate": 4.907002620998061e-06, + "loss": 0.5562, + "step": 4836 + }, + { + "epoch": 1.9116325018523093, + "grad_norm": 0.45057537156888056, + "learning_rate": 4.906960281651312e-06, + "loss": 0.5906, + "step": 4837 + }, + { + "epoch": 1.9120276611509015, + "grad_norm": 0.43561521398581576, + "learning_rate": 4.906917932851484e-06, + "loss": 0.5673, + "step": 4838 + }, + { + "epoch": 1.9124228204494937, + "grad_norm": 0.41696070212722824, + "learning_rate": 4.906875574598745e-06, + "loss": 0.545, + "step": 4839 + }, + { + "epoch": 1.912817979748086, + "grad_norm": 0.4542669385242667, + "learning_rate": 4.90683320689326e-06, + "loss": 0.5732, + "step": 4840 + }, + { + "epoch": 1.9132131390466782, + "grad_norm": 0.42925333709930363, + "learning_rate": 4.906790829735195e-06, + "loss": 0.5553, + "step": 4841 + }, + { + "epoch": 1.9136082983452705, + "grad_norm": 0.4189953420798717, + "learning_rate": 4.906748443124718e-06, + "loss": 0.5412, + "step": 4842 + }, + { + "epoch": 1.9140034576438627, + "grad_norm": 0.43842151872555235, + "learning_rate": 4.906706047061994e-06, + "loss": 0.5842, + "step": 4843 + }, + { + "epoch": 1.914398616942455, + "grad_norm": 0.43303434492630205, + "learning_rate": 4.906663641547191e-06, + "loss": 0.5446, + "step": 4844 + }, + { + "epoch": 1.9147937762410472, + "grad_norm": 0.44750651700901584, + "learning_rate": 4.906621226580473e-06, + "loss": 0.5565, + "step": 4845 + }, + { + "epoch": 1.9151889355396394, + "grad_norm": 0.43669244850265104, + "learning_rate": 4.906578802162008e-06, + "loss": 0.5539, + "step": 4846 + }, + { + "epoch": 1.9155840948382317, + "grad_norm": 0.42310364393737065, + "learning_rate": 4.906536368291964e-06, + "loss": 0.5537, + "step": 4847 + }, + { + "epoch": 1.915979254136824, + "grad_norm": 0.41308731234212065, + "learning_rate": 4.9064939249705066e-06, + "loss": 0.5426, + "step": 4848 + }, + { + "epoch": 1.9163744134354161, + "grad_norm": 0.4284823800137259, + "learning_rate": 4.906451472197802e-06, + "loss": 0.5337, + "step": 4849 + }, + { + "epoch": 1.9167695727340084, + "grad_norm": 0.44109819338095785, + "learning_rate": 4.906409009974018e-06, + "loss": 0.5352, + "step": 4850 + }, + { + "epoch": 1.9171647320326006, + "grad_norm": 0.43268349819650925, + "learning_rate": 4.90636653829932e-06, + "loss": 0.5475, + "step": 4851 + }, + { + "epoch": 1.9175598913311929, + "grad_norm": 0.430803043302547, + "learning_rate": 4.906324057173875e-06, + "loss": 0.5426, + "step": 4852 + }, + { + "epoch": 1.917955050629785, + "grad_norm": 0.4266825335162365, + "learning_rate": 4.9062815665978504e-06, + "loss": 0.5547, + "step": 4853 + }, + { + "epoch": 1.9183502099283773, + "grad_norm": 0.4498735174649945, + "learning_rate": 4.906239066571413e-06, + "loss": 0.5621, + "step": 4854 + }, + { + "epoch": 1.9187453692269696, + "grad_norm": 0.4363381817986835, + "learning_rate": 4.90619655709473e-06, + "loss": 0.5578, + "step": 4855 + }, + { + "epoch": 1.9191405285255618, + "grad_norm": 0.4320421517760889, + "learning_rate": 4.906154038167968e-06, + "loss": 0.5516, + "step": 4856 + }, + { + "epoch": 1.919535687824154, + "grad_norm": 0.4377025215430826, + "learning_rate": 4.9061115097912944e-06, + "loss": 0.5528, + "step": 4857 + }, + { + "epoch": 1.9199308471227463, + "grad_norm": 0.45155283623556963, + "learning_rate": 4.906068971964876e-06, + "loss": 0.5677, + "step": 4858 + }, + { + "epoch": 1.9203260064213385, + "grad_norm": 0.4400211424480233, + "learning_rate": 4.906026424688879e-06, + "loss": 0.562, + "step": 4859 + }, + { + "epoch": 1.920721165719931, + "grad_norm": 0.4343248105860274, + "learning_rate": 4.905983867963472e-06, + "loss": 0.5668, + "step": 4860 + }, + { + "epoch": 1.9211163250185233, + "grad_norm": 0.4353078976406567, + "learning_rate": 4.905941301788821e-06, + "loss": 0.5536, + "step": 4861 + }, + { + "epoch": 1.9215114843171155, + "grad_norm": 0.44191754785521875, + "learning_rate": 4.905898726165093e-06, + "loss": 0.5397, + "step": 4862 + }, + { + "epoch": 1.9219066436157077, + "grad_norm": 0.4319364852789675, + "learning_rate": 4.905856141092457e-06, + "loss": 0.5535, + "step": 4863 + }, + { + "epoch": 1.9223018029143, + "grad_norm": 0.4303876762850279, + "learning_rate": 4.9058135465710776e-06, + "loss": 0.5388, + "step": 4864 + }, + { + "epoch": 1.9226969622128922, + "grad_norm": 0.4264199761833403, + "learning_rate": 4.9057709426011236e-06, + "loss": 0.5561, + "step": 4865 + }, + { + "epoch": 1.9230921215114845, + "grad_norm": 0.44314277804736335, + "learning_rate": 4.905728329182763e-06, + "loss": 0.5609, + "step": 4866 + }, + { + "epoch": 1.9234872808100767, + "grad_norm": 0.4404975868482517, + "learning_rate": 4.905685706316162e-06, + "loss": 0.5404, + "step": 4867 + }, + { + "epoch": 1.923882440108669, + "grad_norm": 0.45402270420659185, + "learning_rate": 4.9056430740014885e-06, + "loss": 0.5515, + "step": 4868 + }, + { + "epoch": 1.9242775994072612, + "grad_norm": 0.43115642120987685, + "learning_rate": 4.90560043223891e-06, + "loss": 0.5809, + "step": 4869 + }, + { + "epoch": 1.9246727587058534, + "grad_norm": 0.4508455280612912, + "learning_rate": 4.905557781028593e-06, + "loss": 0.5532, + "step": 4870 + }, + { + "epoch": 1.9250679180044457, + "grad_norm": 0.42373614920434, + "learning_rate": 4.905515120370706e-06, + "loss": 0.5552, + "step": 4871 + }, + { + "epoch": 1.925463077303038, + "grad_norm": 0.5279871782928506, + "learning_rate": 4.905472450265416e-06, + "loss": 0.5716, + "step": 4872 + }, + { + "epoch": 1.9258582366016301, + "grad_norm": 0.42805135252978943, + "learning_rate": 4.905429770712892e-06, + "loss": 0.5521, + "step": 4873 + }, + { + "epoch": 1.9262533959002224, + "grad_norm": 0.4368318688771723, + "learning_rate": 4.9053870817133e-06, + "loss": 0.5486, + "step": 4874 + }, + { + "epoch": 1.9266485551988146, + "grad_norm": 0.44355571797903176, + "learning_rate": 4.905344383266808e-06, + "loss": 0.5667, + "step": 4875 + }, + { + "epoch": 1.9270437144974069, + "grad_norm": 0.42560534528870747, + "learning_rate": 4.9053016753735836e-06, + "loss": 0.5561, + "step": 4876 + }, + { + "epoch": 1.927438873795999, + "grad_norm": 0.43288235759183613, + "learning_rate": 4.905258958033795e-06, + "loss": 0.5374, + "step": 4877 + }, + { + "epoch": 1.9278340330945913, + "grad_norm": 0.42388914415273304, + "learning_rate": 4.90521623124761e-06, + "loss": 0.5719, + "step": 4878 + }, + { + "epoch": 1.9282291923931836, + "grad_norm": 0.4261428683430966, + "learning_rate": 4.905173495015196e-06, + "loss": 0.5525, + "step": 4879 + }, + { + "epoch": 1.9286243516917758, + "grad_norm": 0.4238364660888561, + "learning_rate": 4.9051307493367205e-06, + "loss": 0.5456, + "step": 4880 + }, + { + "epoch": 1.929019510990368, + "grad_norm": 0.43787334764811864, + "learning_rate": 4.905087994212353e-06, + "loss": 0.5666, + "step": 4881 + }, + { + "epoch": 1.9294146702889603, + "grad_norm": 0.41851611764846186, + "learning_rate": 4.9050452296422595e-06, + "loss": 0.5355, + "step": 4882 + }, + { + "epoch": 1.9298098295875525, + "grad_norm": 0.4221693340127139, + "learning_rate": 4.905002455626609e-06, + "loss": 0.5524, + "step": 4883 + }, + { + "epoch": 1.9302049888861448, + "grad_norm": 0.4334189440111412, + "learning_rate": 4.904959672165569e-06, + "loss": 0.5448, + "step": 4884 + }, + { + "epoch": 1.930600148184737, + "grad_norm": 0.4304554597293204, + "learning_rate": 4.904916879259308e-06, + "loss": 0.5651, + "step": 4885 + }, + { + "epoch": 1.9309953074833293, + "grad_norm": 0.4325837099787629, + "learning_rate": 4.904874076907994e-06, + "loss": 0.571, + "step": 4886 + }, + { + "epoch": 1.9313904667819215, + "grad_norm": 0.43207355005966136, + "learning_rate": 4.904831265111795e-06, + "loss": 0.5691, + "step": 4887 + }, + { + "epoch": 1.9317856260805137, + "grad_norm": 0.432638152421682, + "learning_rate": 4.904788443870879e-06, + "loss": 0.5524, + "step": 4888 + }, + { + "epoch": 1.932180785379106, + "grad_norm": 0.4512618098367983, + "learning_rate": 4.904745613185415e-06, + "loss": 0.5496, + "step": 4889 + }, + { + "epoch": 1.9325759446776982, + "grad_norm": 0.4374527551224479, + "learning_rate": 4.904702773055568e-06, + "loss": 0.5732, + "step": 4890 + }, + { + "epoch": 1.9329711039762905, + "grad_norm": 0.4468449203480439, + "learning_rate": 4.9046599234815105e-06, + "loss": 0.5587, + "step": 4891 + }, + { + "epoch": 1.9333662632748827, + "grad_norm": 0.4316133274910195, + "learning_rate": 4.90461706446341e-06, + "loss": 0.5355, + "step": 4892 + }, + { + "epoch": 1.933761422573475, + "grad_norm": 0.4246061221952136, + "learning_rate": 4.904574196001432e-06, + "loss": 0.5326, + "step": 4893 + }, + { + "epoch": 1.9341565818720672, + "grad_norm": 0.45695254659882967, + "learning_rate": 4.9045313180957474e-06, + "loss": 0.5655, + "step": 4894 + }, + { + "epoch": 1.9345517411706594, + "grad_norm": 0.44841848061126316, + "learning_rate": 4.904488430746524e-06, + "loss": 0.566, + "step": 4895 + }, + { + "epoch": 1.9349469004692517, + "grad_norm": 0.43653829446312165, + "learning_rate": 4.90444553395393e-06, + "loss": 0.5223, + "step": 4896 + }, + { + "epoch": 1.935342059767844, + "grad_norm": 0.44040644249133537, + "learning_rate": 4.904402627718134e-06, + "loss": 0.5481, + "step": 4897 + }, + { + "epoch": 1.9357372190664361, + "grad_norm": 0.4403357250362934, + "learning_rate": 4.904359712039304e-06, + "loss": 0.5652, + "step": 4898 + }, + { + "epoch": 1.9361323783650284, + "grad_norm": 0.4196447153062507, + "learning_rate": 4.90431678691761e-06, + "loss": 0.5494, + "step": 4899 + }, + { + "epoch": 1.9365275376636206, + "grad_norm": 0.4507476048268143, + "learning_rate": 4.904273852353219e-06, + "loss": 0.5388, + "step": 4900 + }, + { + "epoch": 1.9369226969622129, + "grad_norm": 0.45818820763144874, + "learning_rate": 4.9042309083463e-06, + "loss": 0.5647, + "step": 4901 + }, + { + "epoch": 1.937317856260805, + "grad_norm": 0.4169132869917713, + "learning_rate": 4.904187954897023e-06, + "loss": 0.5328, + "step": 4902 + }, + { + "epoch": 1.9377130155593973, + "grad_norm": 0.42776873057488746, + "learning_rate": 4.904144992005555e-06, + "loss": 0.5401, + "step": 4903 + }, + { + "epoch": 1.9381081748579896, + "grad_norm": 0.4452737398761506, + "learning_rate": 4.904102019672066e-06, + "loss": 0.5849, + "step": 4904 + }, + { + "epoch": 1.9385033341565818, + "grad_norm": 0.4268122814283881, + "learning_rate": 4.904059037896723e-06, + "loss": 0.5577, + "step": 4905 + }, + { + "epoch": 1.938898493455174, + "grad_norm": 0.43474540031672093, + "learning_rate": 4.904016046679696e-06, + "loss": 0.5651, + "step": 4906 + }, + { + "epoch": 1.9392936527537663, + "grad_norm": 0.4439080883548797, + "learning_rate": 4.9039730460211545e-06, + "loss": 0.5525, + "step": 4907 + }, + { + "epoch": 1.9396888120523585, + "grad_norm": 0.4380749282591096, + "learning_rate": 4.9039300359212665e-06, + "loss": 0.5647, + "step": 4908 + }, + { + "epoch": 1.9400839713509508, + "grad_norm": 0.4409470401445144, + "learning_rate": 4.9038870163802e-06, + "loss": 0.5645, + "step": 4909 + }, + { + "epoch": 1.940479130649543, + "grad_norm": 0.4453293955228133, + "learning_rate": 4.903843987398127e-06, + "loss": 0.5473, + "step": 4910 + }, + { + "epoch": 1.9408742899481353, + "grad_norm": 0.4327605271211098, + "learning_rate": 4.903800948975213e-06, + "loss": 0.5391, + "step": 4911 + }, + { + "epoch": 1.9412694492467275, + "grad_norm": 0.4436439359315798, + "learning_rate": 4.903757901111629e-06, + "loss": 0.5592, + "step": 4912 + }, + { + "epoch": 1.9416646085453197, + "grad_norm": 0.4224199241527376, + "learning_rate": 4.903714843807543e-06, + "loss": 0.5408, + "step": 4913 + }, + { + "epoch": 1.942059767843912, + "grad_norm": 0.44551596874456995, + "learning_rate": 4.903671777063126e-06, + "loss": 0.5426, + "step": 4914 + }, + { + "epoch": 1.9424549271425042, + "grad_norm": 0.4296694239782312, + "learning_rate": 4.9036287008785446e-06, + "loss": 0.5388, + "step": 4915 + }, + { + "epoch": 1.9428500864410965, + "grad_norm": 0.44248224422217924, + "learning_rate": 4.903585615253969e-06, + "loss": 0.5597, + "step": 4916 + }, + { + "epoch": 1.9432452457396887, + "grad_norm": 0.4494129495563011, + "learning_rate": 4.90354252018957e-06, + "loss": 0.5744, + "step": 4917 + }, + { + "epoch": 1.943640405038281, + "grad_norm": 0.4668938690509174, + "learning_rate": 4.903499415685515e-06, + "loss": 0.5719, + "step": 4918 + }, + { + "epoch": 1.9440355643368732, + "grad_norm": 0.4397994559767837, + "learning_rate": 4.903456301741973e-06, + "loss": 0.5566, + "step": 4919 + }, + { + "epoch": 1.9444307236354654, + "grad_norm": 0.44596460788657327, + "learning_rate": 4.903413178359115e-06, + "loss": 0.5804, + "step": 4920 + }, + { + "epoch": 1.9448258829340577, + "grad_norm": 0.43722931620391176, + "learning_rate": 4.9033700455371095e-06, + "loss": 0.5356, + "step": 4921 + }, + { + "epoch": 1.94522104223265, + "grad_norm": 0.42315317154912496, + "learning_rate": 4.903326903276125e-06, + "loss": 0.5544, + "step": 4922 + }, + { + "epoch": 1.9456162015312422, + "grad_norm": 0.43461478276395443, + "learning_rate": 4.903283751576333e-06, + "loss": 0.5385, + "step": 4923 + }, + { + "epoch": 1.9460113608298344, + "grad_norm": 0.43858622832546995, + "learning_rate": 4.903240590437901e-06, + "loss": 0.5365, + "step": 4924 + }, + { + "epoch": 1.9464065201284266, + "grad_norm": 0.4381635670286083, + "learning_rate": 4.903197419861e-06, + "loss": 0.5522, + "step": 4925 + }, + { + "epoch": 1.9468016794270189, + "grad_norm": 0.4211868434644149, + "learning_rate": 4.903154239845798e-06, + "loss": 0.5383, + "step": 4926 + }, + { + "epoch": 1.9471968387256111, + "grad_norm": 0.4367363445108265, + "learning_rate": 4.903111050392465e-06, + "loss": 0.5585, + "step": 4927 + }, + { + "epoch": 1.9475919980242034, + "grad_norm": 0.43254406845062815, + "learning_rate": 4.903067851501172e-06, + "loss": 0.5458, + "step": 4928 + }, + { + "epoch": 1.9479871573227956, + "grad_norm": 0.42229361659520137, + "learning_rate": 4.9030246431720875e-06, + "loss": 0.5438, + "step": 4929 + }, + { + "epoch": 1.9483823166213878, + "grad_norm": 0.44413774805607853, + "learning_rate": 4.902981425405381e-06, + "loss": 0.5668, + "step": 4930 + }, + { + "epoch": 1.9487774759199803, + "grad_norm": 0.4380542711297867, + "learning_rate": 4.902938198201223e-06, + "loss": 0.5631, + "step": 4931 + }, + { + "epoch": 1.9491726352185725, + "grad_norm": 0.42049828655472166, + "learning_rate": 4.902894961559783e-06, + "loss": 0.5418, + "step": 4932 + }, + { + "epoch": 1.9495677945171648, + "grad_norm": 0.42075791692900755, + "learning_rate": 4.90285171548123e-06, + "loss": 0.543, + "step": 4933 + }, + { + "epoch": 1.949962953815757, + "grad_norm": 0.43396399038359434, + "learning_rate": 4.9028084599657355e-06, + "loss": 0.5334, + "step": 4934 + }, + { + "epoch": 1.9503581131143493, + "grad_norm": 0.4238423271020982, + "learning_rate": 4.902765195013468e-06, + "loss": 0.5629, + "step": 4935 + }, + { + "epoch": 1.9507532724129415, + "grad_norm": 0.4390345402600252, + "learning_rate": 4.902721920624598e-06, + "loss": 0.5535, + "step": 4936 + }, + { + "epoch": 1.9511484317115337, + "grad_norm": 0.433652416350482, + "learning_rate": 4.9026786367992955e-06, + "loss": 0.569, + "step": 4937 + }, + { + "epoch": 1.951543591010126, + "grad_norm": 0.43072162266053016, + "learning_rate": 4.90263534353773e-06, + "loss": 0.5486, + "step": 4938 + }, + { + "epoch": 1.9519387503087182, + "grad_norm": 0.4219105439697877, + "learning_rate": 4.902592040840071e-06, + "loss": 0.5656, + "step": 4939 + }, + { + "epoch": 1.9523339096073105, + "grad_norm": 0.4397667987358108, + "learning_rate": 4.9025487287064905e-06, + "loss": 0.5836, + "step": 4940 + }, + { + "epoch": 1.9527290689059027, + "grad_norm": 0.4444122529654959, + "learning_rate": 4.9025054071371565e-06, + "loss": 0.5395, + "step": 4941 + }, + { + "epoch": 1.953124228204495, + "grad_norm": 0.446787855529709, + "learning_rate": 4.9024620761322415e-06, + "loss": 0.5595, + "step": 4942 + }, + { + "epoch": 1.9535193875030872, + "grad_norm": 0.42700864336105265, + "learning_rate": 4.902418735691914e-06, + "loss": 0.5347, + "step": 4943 + }, + { + "epoch": 1.9539145468016794, + "grad_norm": 0.44112913377219515, + "learning_rate": 4.902375385816344e-06, + "loss": 0.55, + "step": 4944 + }, + { + "epoch": 1.9543097061002717, + "grad_norm": 0.452174058015207, + "learning_rate": 4.902332026505703e-06, + "loss": 0.5553, + "step": 4945 + }, + { + "epoch": 1.954704865398864, + "grad_norm": 0.5493673198052214, + "learning_rate": 4.902288657760159e-06, + "loss": 0.5628, + "step": 4946 + }, + { + "epoch": 1.9551000246974561, + "grad_norm": 0.4388936013126185, + "learning_rate": 4.902245279579886e-06, + "loss": 0.5442, + "step": 4947 + }, + { + "epoch": 1.9554951839960484, + "grad_norm": 0.4342127172483669, + "learning_rate": 4.9022018919650505e-06, + "loss": 0.5716, + "step": 4948 + }, + { + "epoch": 1.9558903432946406, + "grad_norm": 0.4409460564547779, + "learning_rate": 4.902158494915825e-06, + "loss": 0.555, + "step": 4949 + }, + { + "epoch": 1.9562855025932329, + "grad_norm": 0.43477091609114554, + "learning_rate": 4.90211508843238e-06, + "loss": 0.5425, + "step": 4950 + }, + { + "epoch": 1.956680661891825, + "grad_norm": 0.44011026483098825, + "learning_rate": 4.902071672514886e-06, + "loss": 0.5586, + "step": 4951 + }, + { + "epoch": 1.9570758211904173, + "grad_norm": 0.4279433040735619, + "learning_rate": 4.902028247163512e-06, + "loss": 0.5603, + "step": 4952 + }, + { + "epoch": 1.9574709804890096, + "grad_norm": 0.43493371661347063, + "learning_rate": 4.901984812378431e-06, + "loss": 0.5496, + "step": 4953 + }, + { + "epoch": 1.9578661397876018, + "grad_norm": 0.429552671257518, + "learning_rate": 4.901941368159812e-06, + "loss": 0.5667, + "step": 4954 + }, + { + "epoch": 1.9582612990861943, + "grad_norm": 0.45149418076182646, + "learning_rate": 4.901897914507825e-06, + "loss": 0.5721, + "step": 4955 + }, + { + "epoch": 1.9586564583847865, + "grad_norm": 0.4440050617964623, + "learning_rate": 4.901854451422642e-06, + "loss": 0.5447, + "step": 4956 + }, + { + "epoch": 1.9590516176833788, + "grad_norm": 0.4271232711419705, + "learning_rate": 4.901810978904433e-06, + "loss": 0.5303, + "step": 4957 + }, + { + "epoch": 1.959446776981971, + "grad_norm": 0.4178784895693934, + "learning_rate": 4.901767496953369e-06, + "loss": 0.5398, + "step": 4958 + }, + { + "epoch": 1.9598419362805632, + "grad_norm": 0.45219556101457836, + "learning_rate": 4.90172400556962e-06, + "loss": 0.5464, + "step": 4959 + }, + { + "epoch": 1.9602370955791555, + "grad_norm": 0.43549913764734244, + "learning_rate": 4.901680504753358e-06, + "loss": 0.5469, + "step": 4960 + }, + { + "epoch": 1.9606322548777477, + "grad_norm": 0.4288069140164951, + "learning_rate": 4.901636994504754e-06, + "loss": 0.5438, + "step": 4961 + }, + { + "epoch": 1.96102741417634, + "grad_norm": 0.558867701364055, + "learning_rate": 4.901593474823978e-06, + "loss": 0.5632, + "step": 4962 + }, + { + "epoch": 1.9614225734749322, + "grad_norm": 0.4402971314310295, + "learning_rate": 4.9015499457112e-06, + "loss": 0.5496, + "step": 4963 + }, + { + "epoch": 1.9618177327735244, + "grad_norm": 0.4280995907810528, + "learning_rate": 4.901506407166594e-06, + "loss": 0.5323, + "step": 4964 + }, + { + "epoch": 1.9622128920721167, + "grad_norm": 0.43660734305856075, + "learning_rate": 4.901462859190328e-06, + "loss": 0.5439, + "step": 4965 + }, + { + "epoch": 1.962608051370709, + "grad_norm": 0.4364007886164183, + "learning_rate": 4.9014193017825735e-06, + "loss": 0.5532, + "step": 4966 + }, + { + "epoch": 1.9630032106693012, + "grad_norm": 0.4597936604604986, + "learning_rate": 4.901375734943504e-06, + "loss": 0.5571, + "step": 4967 + }, + { + "epoch": 1.9633983699678934, + "grad_norm": 0.45327235813160105, + "learning_rate": 4.901332158673288e-06, + "loss": 0.5574, + "step": 4968 + }, + { + "epoch": 1.9637935292664856, + "grad_norm": 0.44715737005782, + "learning_rate": 4.901288572972097e-06, + "loss": 0.579, + "step": 4969 + }, + { + "epoch": 1.964188688565078, + "grad_norm": 0.42524834539399503, + "learning_rate": 4.901244977840103e-06, + "loss": 0.5186, + "step": 4970 + }, + { + "epoch": 1.9645838478636701, + "grad_norm": 0.45527798672472874, + "learning_rate": 4.9012013732774765e-06, + "loss": 0.5373, + "step": 4971 + }, + { + "epoch": 1.9649790071622624, + "grad_norm": 0.4343605181054587, + "learning_rate": 4.901157759284389e-06, + "loss": 0.5686, + "step": 4972 + }, + { + "epoch": 1.9653741664608546, + "grad_norm": 0.44258891979240134, + "learning_rate": 4.901114135861013e-06, + "loss": 0.5697, + "step": 4973 + }, + { + "epoch": 1.9657693257594469, + "grad_norm": 0.42190562697963224, + "learning_rate": 4.901070503007516e-06, + "loss": 0.5309, + "step": 4974 + }, + { + "epoch": 1.966164485058039, + "grad_norm": 0.45195265147050434, + "learning_rate": 4.901026860724075e-06, + "loss": 0.5568, + "step": 4975 + }, + { + "epoch": 1.9665596443566313, + "grad_norm": 0.4352261875981932, + "learning_rate": 4.900983209010858e-06, + "loss": 0.5507, + "step": 4976 + }, + { + "epoch": 1.9669548036552236, + "grad_norm": 0.43978785905912626, + "learning_rate": 4.9009395478680355e-06, + "loss": 0.5599, + "step": 4977 + }, + { + "epoch": 1.9673499629538158, + "grad_norm": 0.4475750883161317, + "learning_rate": 4.9008958772957815e-06, + "loss": 0.5493, + "step": 4978 + }, + { + "epoch": 1.967745122252408, + "grad_norm": 0.4328811619048258, + "learning_rate": 4.900852197294266e-06, + "loss": 0.5361, + "step": 4979 + }, + { + "epoch": 1.9681402815510003, + "grad_norm": 0.43773523890770105, + "learning_rate": 4.900808507863661e-06, + "loss": 0.5548, + "step": 4980 + }, + { + "epoch": 1.9685354408495925, + "grad_norm": 0.43452862600059244, + "learning_rate": 4.900764809004138e-06, + "loss": 0.5336, + "step": 4981 + }, + { + "epoch": 1.9689306001481848, + "grad_norm": 0.419201512372097, + "learning_rate": 4.900721100715869e-06, + "loss": 0.551, + "step": 4982 + }, + { + "epoch": 1.969325759446777, + "grad_norm": 0.45373661373694074, + "learning_rate": 4.900677382999025e-06, + "loss": 0.5657, + "step": 4983 + }, + { + "epoch": 1.9697209187453693, + "grad_norm": 0.4370236493158466, + "learning_rate": 4.900633655853778e-06, + "loss": 0.528, + "step": 4984 + }, + { + "epoch": 1.9701160780439615, + "grad_norm": 0.4528150258511628, + "learning_rate": 4.9005899192803e-06, + "loss": 0.5427, + "step": 4985 + }, + { + "epoch": 1.9705112373425537, + "grad_norm": 0.4506026367726158, + "learning_rate": 4.900546173278762e-06, + "loss": 0.5555, + "step": 4986 + }, + { + "epoch": 1.970906396641146, + "grad_norm": 0.4207709811591268, + "learning_rate": 4.900502417849337e-06, + "loss": 0.5502, + "step": 4987 + }, + { + "epoch": 1.9713015559397382, + "grad_norm": 0.4266100877818548, + "learning_rate": 4.900458652992196e-06, + "loss": 0.5469, + "step": 4988 + }, + { + "epoch": 1.9716967152383305, + "grad_norm": 0.4387635930216414, + "learning_rate": 4.900414878707511e-06, + "loss": 0.5587, + "step": 4989 + }, + { + "epoch": 1.9720918745369227, + "grad_norm": 0.42751985993998615, + "learning_rate": 4.9003710949954535e-06, + "loss": 0.5488, + "step": 4990 + }, + { + "epoch": 1.972487033835515, + "grad_norm": 0.429513289497151, + "learning_rate": 4.900327301856196e-06, + "loss": 0.5538, + "step": 4991 + }, + { + "epoch": 1.9728821931341072, + "grad_norm": 0.42988406007517393, + "learning_rate": 4.9002834992899104e-06, + "loss": 0.5279, + "step": 4992 + }, + { + "epoch": 1.9732773524326994, + "grad_norm": 0.4380519285585141, + "learning_rate": 4.90023968729677e-06, + "loss": 0.5651, + "step": 4993 + }, + { + "epoch": 1.9736725117312917, + "grad_norm": 0.4489032418464982, + "learning_rate": 4.900195865876944e-06, + "loss": 0.5577, + "step": 4994 + }, + { + "epoch": 1.974067671029884, + "grad_norm": 0.45508873672141004, + "learning_rate": 4.900152035030607e-06, + "loss": 0.5582, + "step": 4995 + }, + { + "epoch": 1.9744628303284761, + "grad_norm": 0.43159558253407226, + "learning_rate": 4.90010819475793e-06, + "loss": 0.5623, + "step": 4996 + }, + { + "epoch": 1.9748579896270684, + "grad_norm": 0.4372402377219953, + "learning_rate": 4.900064345059086e-06, + "loss": 0.5519, + "step": 4997 + }, + { + "epoch": 1.9752531489256606, + "grad_norm": 0.4627096781522281, + "learning_rate": 4.900020485934245e-06, + "loss": 0.5693, + "step": 4998 + }, + { + "epoch": 1.9756483082242529, + "grad_norm": 0.4518788906698329, + "learning_rate": 4.899976617383583e-06, + "loss": 0.5577, + "step": 4999 + }, + { + "epoch": 1.976043467522845, + "grad_norm": 0.44234025817296496, + "learning_rate": 4.899932739407268e-06, + "loss": 0.552, + "step": 5000 + }, + { + "epoch": 1.9764386268214373, + "grad_norm": 0.42902794698726177, + "learning_rate": 4.899888852005477e-06, + "loss": 0.535, + "step": 5001 + }, + { + "epoch": 1.9768337861200296, + "grad_norm": 0.4337101022811792, + "learning_rate": 4.899844955178378e-06, + "loss": 0.5477, + "step": 5002 + }, + { + "epoch": 1.9772289454186218, + "grad_norm": 0.4334561135618806, + "learning_rate": 4.899801048926146e-06, + "loss": 0.5536, + "step": 5003 + }, + { + "epoch": 1.977624104717214, + "grad_norm": 0.4539433104177194, + "learning_rate": 4.899757133248953e-06, + "loss": 0.5634, + "step": 5004 + }, + { + "epoch": 1.9780192640158063, + "grad_norm": 0.46398410451302946, + "learning_rate": 4.89971320814697e-06, + "loss": 0.5579, + "step": 5005 + }, + { + "epoch": 1.9784144233143985, + "grad_norm": 0.4224716161481286, + "learning_rate": 4.899669273620372e-06, + "loss": 0.5213, + "step": 5006 + }, + { + "epoch": 1.9788095826129908, + "grad_norm": 0.43890124660317376, + "learning_rate": 4.899625329669329e-06, + "loss": 0.5585, + "step": 5007 + }, + { + "epoch": 1.979204741911583, + "grad_norm": 0.45718913393571137, + "learning_rate": 4.899581376294016e-06, + "loss": 0.5739, + "step": 5008 + }, + { + "epoch": 1.9795999012101753, + "grad_norm": 0.4274912725996382, + "learning_rate": 4.899537413494604e-06, + "loss": 0.5284, + "step": 5009 + }, + { + "epoch": 1.9799950605087675, + "grad_norm": 0.4263688372409198, + "learning_rate": 4.899493441271266e-06, + "loss": 0.5371, + "step": 5010 + }, + { + "epoch": 1.9803902198073597, + "grad_norm": 0.42390616403436115, + "learning_rate": 4.899449459624175e-06, + "loss": 0.5525, + "step": 5011 + }, + { + "epoch": 1.980785379105952, + "grad_norm": 0.439575208819799, + "learning_rate": 4.899405468553503e-06, + "loss": 0.5596, + "step": 5012 + }, + { + "epoch": 1.9811805384045442, + "grad_norm": 0.438514692310827, + "learning_rate": 4.899361468059424e-06, + "loss": 0.5656, + "step": 5013 + }, + { + "epoch": 1.9815756977031365, + "grad_norm": 0.4286665241622738, + "learning_rate": 4.8993174581421095e-06, + "loss": 0.5462, + "step": 5014 + }, + { + "epoch": 1.9819708570017287, + "grad_norm": 0.4310129047836364, + "learning_rate": 4.899273438801734e-06, + "loss": 0.5717, + "step": 5015 + }, + { + "epoch": 1.982366016300321, + "grad_norm": 0.4410419994536329, + "learning_rate": 4.899229410038468e-06, + "loss": 0.5432, + "step": 5016 + }, + { + "epoch": 1.9827611755989132, + "grad_norm": 0.4421044202734814, + "learning_rate": 4.899185371852487e-06, + "loss": 0.552, + "step": 5017 + }, + { + "epoch": 1.9831563348975054, + "grad_norm": 0.43915212841424395, + "learning_rate": 4.899141324243962e-06, + "loss": 0.5384, + "step": 5018 + }, + { + "epoch": 1.9835514941960977, + "grad_norm": 0.42929780686144275, + "learning_rate": 4.8990972672130675e-06, + "loss": 0.5467, + "step": 5019 + }, + { + "epoch": 1.98394665349469, + "grad_norm": 0.44257765457839277, + "learning_rate": 4.899053200759975e-06, + "loss": 0.5723, + "step": 5020 + }, + { + "epoch": 1.9843418127932821, + "grad_norm": 0.4491299367521981, + "learning_rate": 4.8990091248848586e-06, + "loss": 0.56, + "step": 5021 + }, + { + "epoch": 1.9847369720918744, + "grad_norm": 0.42594398347130197, + "learning_rate": 4.898965039587891e-06, + "loss": 0.544, + "step": 5022 + }, + { + "epoch": 1.9851321313904666, + "grad_norm": 0.430300760036231, + "learning_rate": 4.898920944869245e-06, + "loss": 0.542, + "step": 5023 + }, + { + "epoch": 1.9855272906890589, + "grad_norm": 0.43273660717146456, + "learning_rate": 4.898876840729095e-06, + "loss": 0.5361, + "step": 5024 + }, + { + "epoch": 1.985922449987651, + "grad_norm": 0.44664865716579133, + "learning_rate": 4.898832727167613e-06, + "loss": 0.5703, + "step": 5025 + }, + { + "epoch": 1.9863176092862436, + "grad_norm": 0.4268197117087961, + "learning_rate": 4.898788604184973e-06, + "loss": 0.5773, + "step": 5026 + }, + { + "epoch": 1.9867127685848358, + "grad_norm": 0.4453737100715003, + "learning_rate": 4.8987444717813475e-06, + "loss": 0.5687, + "step": 5027 + }, + { + "epoch": 1.987107927883428, + "grad_norm": 0.43272023462491616, + "learning_rate": 4.898700329956911e-06, + "loss": 0.558, + "step": 5028 + }, + { + "epoch": 1.9875030871820203, + "grad_norm": 0.4264184633199343, + "learning_rate": 4.898656178711836e-06, + "loss": 0.5548, + "step": 5029 + }, + { + "epoch": 1.9878982464806125, + "grad_norm": 0.4221266341561921, + "learning_rate": 4.898612018046296e-06, + "loss": 0.5594, + "step": 5030 + }, + { + "epoch": 1.9882934057792048, + "grad_norm": 0.4347414491375779, + "learning_rate": 4.898567847960463e-06, + "loss": 0.5357, + "step": 5031 + }, + { + "epoch": 1.988688565077797, + "grad_norm": 0.43586397162379814, + "learning_rate": 4.898523668454514e-06, + "loss": 0.5446, + "step": 5032 + }, + { + "epoch": 1.9890837243763893, + "grad_norm": 0.5110052342258178, + "learning_rate": 4.89847947952862e-06, + "loss": 0.5537, + "step": 5033 + }, + { + "epoch": 1.9894788836749815, + "grad_norm": 0.4451068478930137, + "learning_rate": 4.898435281182955e-06, + "loss": 0.5476, + "step": 5034 + }, + { + "epoch": 1.9898740429735737, + "grad_norm": 0.4416455338525789, + "learning_rate": 4.898391073417692e-06, + "loss": 0.5623, + "step": 5035 + }, + { + "epoch": 1.990269202272166, + "grad_norm": 0.4258928324683576, + "learning_rate": 4.898346856233006e-06, + "loss": 0.5516, + "step": 5036 + }, + { + "epoch": 1.9906643615707582, + "grad_norm": 0.43772124082226155, + "learning_rate": 4.89830262962907e-06, + "loss": 0.5732, + "step": 5037 + }, + { + "epoch": 1.9910595208693505, + "grad_norm": 0.44967948499329063, + "learning_rate": 4.898258393606057e-06, + "loss": 0.5642, + "step": 5038 + }, + { + "epoch": 1.9914546801679427, + "grad_norm": 0.43814703911183817, + "learning_rate": 4.898214148164142e-06, + "loss": 0.5469, + "step": 5039 + }, + { + "epoch": 1.991849839466535, + "grad_norm": 0.4230077406376979, + "learning_rate": 4.898169893303497e-06, + "loss": 0.5466, + "step": 5040 + }, + { + "epoch": 1.9922449987651272, + "grad_norm": 0.43628697564315577, + "learning_rate": 4.898125629024298e-06, + "loss": 0.5445, + "step": 5041 + }, + { + "epoch": 1.9926401580637194, + "grad_norm": 0.45113306979240975, + "learning_rate": 4.898081355326717e-06, + "loss": 0.5658, + "step": 5042 + }, + { + "epoch": 1.9930353173623117, + "grad_norm": 0.4334951853126296, + "learning_rate": 4.898037072210929e-06, + "loss": 0.557, + "step": 5043 + }, + { + "epoch": 1.993430476660904, + "grad_norm": 0.42752413492781205, + "learning_rate": 4.897992779677108e-06, + "loss": 0.5546, + "step": 5044 + }, + { + "epoch": 1.9938256359594961, + "grad_norm": 0.4435208315687634, + "learning_rate": 4.8979484777254275e-06, + "loss": 0.5583, + "step": 5045 + }, + { + "epoch": 1.9942207952580884, + "grad_norm": 0.4487454343438756, + "learning_rate": 4.89790416635606e-06, + "loss": 0.55, + "step": 5046 + }, + { + "epoch": 1.9946159545566806, + "grad_norm": 0.4504942707382518, + "learning_rate": 4.8978598455691825e-06, + "loss": 0.5575, + "step": 5047 + }, + { + "epoch": 1.9950111138552729, + "grad_norm": 0.45640354880823075, + "learning_rate": 4.8978155153649674e-06, + "loss": 0.5693, + "step": 5048 + }, + { + "epoch": 1.9954062731538653, + "grad_norm": 0.442089413525396, + "learning_rate": 4.897771175743588e-06, + "loss": 0.559, + "step": 5049 + }, + { + "epoch": 1.9958014324524576, + "grad_norm": 0.45021406883084675, + "learning_rate": 4.89772682670522e-06, + "loss": 0.566, + "step": 5050 + }, + { + "epoch": 1.9961965917510498, + "grad_norm": 0.45325350234496126, + "learning_rate": 4.897682468250038e-06, + "loss": 0.5639, + "step": 5051 + }, + { + "epoch": 1.996591751049642, + "grad_norm": 0.4320207290666673, + "learning_rate": 4.897638100378214e-06, + "loss": 0.559, + "step": 5052 + }, + { + "epoch": 1.9969869103482343, + "grad_norm": 0.4471743726766185, + "learning_rate": 4.897593723089924e-06, + "loss": 0.5859, + "step": 5053 + }, + { + "epoch": 1.9973820696468265, + "grad_norm": 0.4526408463021309, + "learning_rate": 4.897549336385341e-06, + "loss": 0.562, + "step": 5054 + }, + { + "epoch": 1.9977772289454188, + "grad_norm": 0.4204803007502452, + "learning_rate": 4.897504940264641e-06, + "loss": 0.544, + "step": 5055 + }, + { + "epoch": 1.998172388244011, + "grad_norm": 0.4293273991156161, + "learning_rate": 4.897460534727997e-06, + "loss": 0.5356, + "step": 5056 + }, + { + "epoch": 1.9985675475426032, + "grad_norm": 0.43667313213086545, + "learning_rate": 4.897416119775584e-06, + "loss": 0.5415, + "step": 5057 + }, + { + "epoch": 1.9989627068411955, + "grad_norm": 0.4463243145959753, + "learning_rate": 4.897371695407576e-06, + "loss": 0.5554, + "step": 5058 + }, + { + "epoch": 1.9993578661397877, + "grad_norm": 0.4449190372166922, + "learning_rate": 4.897327261624148e-06, + "loss": 0.5541, + "step": 5059 + }, + { + "epoch": 1.99975302543838, + "grad_norm": 0.4548254873381931, + "learning_rate": 4.897282818425474e-06, + "loss": 0.5736, + "step": 5060 + }, + { + "epoch": 2.000148184736972, + "grad_norm": 0.45300066146970297, + "learning_rate": 4.89723836581173e-06, + "loss": 0.5866, + "step": 5061 + }, + { + "epoch": 2.0005433440355644, + "grad_norm": 0.44240572381302623, + "learning_rate": 4.897193903783087e-06, + "loss": 0.5543, + "step": 5062 + }, + { + "epoch": 2.0009385033341567, + "grad_norm": 0.4507961288693878, + "learning_rate": 4.8971494323397236e-06, + "loss": 0.5534, + "step": 5063 + }, + { + "epoch": 2.001333662632749, + "grad_norm": 0.42461245435649125, + "learning_rate": 4.897104951481813e-06, + "loss": 0.546, + "step": 5064 + }, + { + "epoch": 2.001728821931341, + "grad_norm": 0.43117780751367724, + "learning_rate": 4.897060461209529e-06, + "loss": 0.5413, + "step": 5065 + }, + { + "epoch": 2.0021239812299334, + "grad_norm": 0.4590693569061441, + "learning_rate": 4.8970159615230476e-06, + "loss": 0.5553, + "step": 5066 + }, + { + "epoch": 2.0025191405285256, + "grad_norm": 0.43910645784012625, + "learning_rate": 4.896971452422543e-06, + "loss": 0.5453, + "step": 5067 + }, + { + "epoch": 2.002914299827118, + "grad_norm": 0.4491473519263672, + "learning_rate": 4.89692693390819e-06, + "loss": 0.5344, + "step": 5068 + }, + { + "epoch": 2.00330945912571, + "grad_norm": 0.4558312418893952, + "learning_rate": 4.896882405980164e-06, + "loss": 0.5424, + "step": 5069 + }, + { + "epoch": 2.0037046184243024, + "grad_norm": 0.4342578385234224, + "learning_rate": 4.896837868638638e-06, + "loss": 0.5609, + "step": 5070 + }, + { + "epoch": 2.0040997777228946, + "grad_norm": 0.472485898038852, + "learning_rate": 4.896793321883789e-06, + "loss": 0.5507, + "step": 5071 + }, + { + "epoch": 2.004494937021487, + "grad_norm": 0.43961186761669674, + "learning_rate": 4.896748765715792e-06, + "loss": 0.5627, + "step": 5072 + }, + { + "epoch": 2.004890096320079, + "grad_norm": 0.5682972949137233, + "learning_rate": 4.89670420013482e-06, + "loss": 0.5572, + "step": 5073 + }, + { + "epoch": 2.0052852556186713, + "grad_norm": 0.4414444744926705, + "learning_rate": 4.89665962514105e-06, + "loss": 0.5509, + "step": 5074 + }, + { + "epoch": 2.0056804149172636, + "grad_norm": 0.41346400399643507, + "learning_rate": 4.896615040734656e-06, + "loss": 0.5201, + "step": 5075 + }, + { + "epoch": 2.006075574215856, + "grad_norm": 0.4444163402767339, + "learning_rate": 4.896570446915814e-06, + "loss": 0.5373, + "step": 5076 + }, + { + "epoch": 2.006470733514448, + "grad_norm": 0.4296320259295841, + "learning_rate": 4.896525843684698e-06, + "loss": 0.5574, + "step": 5077 + }, + { + "epoch": 2.0068658928130403, + "grad_norm": 0.4324921273129445, + "learning_rate": 4.896481231041483e-06, + "loss": 0.5597, + "step": 5078 + }, + { + "epoch": 2.0072610521116325, + "grad_norm": 0.5733673612744304, + "learning_rate": 4.896436608986347e-06, + "loss": 0.5648, + "step": 5079 + }, + { + "epoch": 2.0076562114102248, + "grad_norm": 0.42676323466611693, + "learning_rate": 4.896391977519461e-06, + "loss": 0.5537, + "step": 5080 + }, + { + "epoch": 2.008051370708817, + "grad_norm": 0.4430572425107361, + "learning_rate": 4.896347336641004e-06, + "loss": 0.5509, + "step": 5081 + }, + { + "epoch": 2.0084465300074092, + "grad_norm": 0.45815046775746454, + "learning_rate": 4.896302686351149e-06, + "loss": 0.5684, + "step": 5082 + }, + { + "epoch": 2.0088416893060015, + "grad_norm": 0.4498740575972979, + "learning_rate": 4.896258026650072e-06, + "loss": 0.5895, + "step": 5083 + }, + { + "epoch": 2.0092368486045937, + "grad_norm": 0.4208347192689959, + "learning_rate": 4.89621335753795e-06, + "loss": 0.546, + "step": 5084 + }, + { + "epoch": 2.009632007903186, + "grad_norm": 0.4859059756580774, + "learning_rate": 4.8961686790149554e-06, + "loss": 0.5585, + "step": 5085 + }, + { + "epoch": 2.010027167201778, + "grad_norm": 0.43391858881607787, + "learning_rate": 4.896123991081266e-06, + "loss": 0.563, + "step": 5086 + }, + { + "epoch": 2.0104223265003704, + "grad_norm": 0.4314428130560567, + "learning_rate": 4.8960792937370565e-06, + "loss": 0.544, + "step": 5087 + }, + { + "epoch": 2.0108174857989627, + "grad_norm": 0.42222335118568166, + "learning_rate": 4.896034586982502e-06, + "loss": 0.5453, + "step": 5088 + }, + { + "epoch": 2.011212645097555, + "grad_norm": 0.4360640333190303, + "learning_rate": 4.89598987081778e-06, + "loss": 0.5461, + "step": 5089 + }, + { + "epoch": 2.011607804396147, + "grad_norm": 0.44026271873516537, + "learning_rate": 4.8959451452430635e-06, + "loss": 0.5541, + "step": 5090 + }, + { + "epoch": 2.0120029636947394, + "grad_norm": 0.42715031631521877, + "learning_rate": 4.895900410258529e-06, + "loss": 0.537, + "step": 5091 + }, + { + "epoch": 2.0123981229933317, + "grad_norm": 0.41412343827373277, + "learning_rate": 4.8958556658643535e-06, + "loss": 0.5494, + "step": 5092 + }, + { + "epoch": 2.012793282291924, + "grad_norm": 0.4358377535786107, + "learning_rate": 4.8958109120607115e-06, + "loss": 0.5513, + "step": 5093 + }, + { + "epoch": 2.013188441590516, + "grad_norm": 0.42454225257888584, + "learning_rate": 4.895766148847779e-06, + "loss": 0.542, + "step": 5094 + }, + { + "epoch": 2.0135836008891084, + "grad_norm": 0.42998865577635037, + "learning_rate": 4.895721376225732e-06, + "loss": 0.5523, + "step": 5095 + }, + { + "epoch": 2.0139787601877006, + "grad_norm": 0.4344501069089372, + "learning_rate": 4.8956765941947456e-06, + "loss": 0.5486, + "step": 5096 + }, + { + "epoch": 2.014373919486293, + "grad_norm": 0.43353826879791363, + "learning_rate": 4.895631802754997e-06, + "loss": 0.543, + "step": 5097 + }, + { + "epoch": 2.014769078784885, + "grad_norm": 0.4575411830513003, + "learning_rate": 4.895587001906661e-06, + "loss": 0.5689, + "step": 5098 + }, + { + "epoch": 2.0151642380834773, + "grad_norm": 0.42786800682913717, + "learning_rate": 4.895542191649914e-06, + "loss": 0.5578, + "step": 5099 + }, + { + "epoch": 2.0155593973820696, + "grad_norm": 0.4964495652383933, + "learning_rate": 4.895497371984932e-06, + "loss": 0.564, + "step": 5100 + }, + { + "epoch": 2.015954556680662, + "grad_norm": 0.42115240187280495, + "learning_rate": 4.895452542911891e-06, + "loss": 0.5498, + "step": 5101 + }, + { + "epoch": 2.016349715979254, + "grad_norm": 0.42588276441255346, + "learning_rate": 4.895407704430967e-06, + "loss": 0.5556, + "step": 5102 + }, + { + "epoch": 2.0167448752778463, + "grad_norm": 0.4302156099061691, + "learning_rate": 4.895362856542336e-06, + "loss": 0.5442, + "step": 5103 + }, + { + "epoch": 2.0171400345764385, + "grad_norm": 0.43604945564161973, + "learning_rate": 4.895317999246174e-06, + "loss": 0.5656, + "step": 5104 + }, + { + "epoch": 2.0000987898246483, + "grad_norm": 0.44789874867544277, + "learning_rate": 4.895273132542658e-06, + "loss": 0.5109, + "step": 5105 + }, + { + "epoch": 2.0004939491232405, + "grad_norm": 0.7893264350562275, + "learning_rate": 4.895228256431963e-06, + "loss": 0.4954, + "step": 5106 + }, + { + "epoch": 2.0008891084218328, + "grad_norm": 0.5938590122165337, + "learning_rate": 4.895183370914267e-06, + "loss": 0.504, + "step": 5107 + }, + { + "epoch": 2.001284267720425, + "grad_norm": 0.5235383132112481, + "learning_rate": 4.8951384759897435e-06, + "loss": 0.5104, + "step": 5108 + }, + { + "epoch": 2.0016794270190172, + "grad_norm": 0.6882530599009438, + "learning_rate": 4.895093571658571e-06, + "loss": 0.4983, + "step": 5109 + }, + { + "epoch": 2.0020745863176095, + "grad_norm": 0.7510915200577702, + "learning_rate": 4.895048657920926e-06, + "loss": 0.4983, + "step": 5110 + }, + { + "epoch": 2.0024697456162017, + "grad_norm": 0.6065619194319212, + "learning_rate": 4.895003734776984e-06, + "loss": 0.49, + "step": 5111 + }, + { + "epoch": 2.002864904914794, + "grad_norm": 0.5356219470458906, + "learning_rate": 4.894958802226921e-06, + "loss": 0.4856, + "step": 5112 + }, + { + "epoch": 2.003260064213386, + "grad_norm": 0.5780607077723401, + "learning_rate": 4.894913860270915e-06, + "loss": 0.4935, + "step": 5113 + }, + { + "epoch": 2.0036552235119784, + "grad_norm": 0.5595862883969074, + "learning_rate": 4.8948689089091414e-06, + "loss": 0.4943, + "step": 5114 + }, + { + "epoch": 2.0040503828105707, + "grad_norm": 0.5217610935709599, + "learning_rate": 4.8948239481417766e-06, + "loss": 0.5112, + "step": 5115 + }, + { + "epoch": 2.004445542109163, + "grad_norm": 0.49305801115666537, + "learning_rate": 4.894778977968998e-06, + "loss": 0.4907, + "step": 5116 + }, + { + "epoch": 2.004840701407755, + "grad_norm": 0.530247044656353, + "learning_rate": 4.894733998390982e-06, + "loss": 0.4904, + "step": 5117 + }, + { + "epoch": 2.0052358607063474, + "grad_norm": 0.552426427747116, + "learning_rate": 4.894689009407903e-06, + "loss": 0.49, + "step": 5118 + }, + { + "epoch": 2.0056310200049396, + "grad_norm": 0.5685491982079328, + "learning_rate": 4.894644011019942e-06, + "loss": 0.4948, + "step": 5119 + }, + { + "epoch": 2.006026179303532, + "grad_norm": 0.502595184515819, + "learning_rate": 4.894599003227273e-06, + "loss": 0.4962, + "step": 5120 + }, + { + "epoch": 2.006421338602124, + "grad_norm": 0.5130021858315302, + "learning_rate": 4.8945539860300725e-06, + "loss": 0.5107, + "step": 5121 + }, + { + "epoch": 2.0068164979007164, + "grad_norm": 0.5350797674291279, + "learning_rate": 4.8945089594285185e-06, + "loss": 0.5076, + "step": 5122 + }, + { + "epoch": 2.0072116571993086, + "grad_norm": 0.49090717650738347, + "learning_rate": 4.894463923422787e-06, + "loss": 0.4939, + "step": 5123 + }, + { + "epoch": 2.007606816497901, + "grad_norm": 0.48813921337582705, + "learning_rate": 4.8944188780130555e-06, + "loss": 0.5004, + "step": 5124 + }, + { + "epoch": 2.008001975796493, + "grad_norm": 0.4912844935858703, + "learning_rate": 4.8943738231995005e-06, + "loss": 0.4914, + "step": 5125 + }, + { + "epoch": 2.0083971350950853, + "grad_norm": 0.5117780002863588, + "learning_rate": 4.894328758982299e-06, + "loss": 0.4892, + "step": 5126 + }, + { + "epoch": 2.0087922943936776, + "grad_norm": 0.4929822170606211, + "learning_rate": 4.894283685361628e-06, + "loss": 0.5052, + "step": 5127 + }, + { + "epoch": 2.00918745369227, + "grad_norm": 0.47940057487781756, + "learning_rate": 4.894238602337665e-06, + "loss": 0.4986, + "step": 5128 + }, + { + "epoch": 2.009582612990862, + "grad_norm": 0.4738863333886923, + "learning_rate": 4.894193509910586e-06, + "loss": 0.4841, + "step": 5129 + }, + { + "epoch": 2.0099777722894543, + "grad_norm": 0.470276754348612, + "learning_rate": 4.8941484080805695e-06, + "loss": 0.5098, + "step": 5130 + }, + { + "epoch": 2.0103729315880465, + "grad_norm": 0.4767147238302886, + "learning_rate": 4.8941032968477914e-06, + "loss": 0.4909, + "step": 5131 + }, + { + "epoch": 2.0107680908866388, + "grad_norm": 0.4721168271949362, + "learning_rate": 4.894058176212429e-06, + "loss": 0.484, + "step": 5132 + }, + { + "epoch": 2.011163250185231, + "grad_norm": 0.4694032432392925, + "learning_rate": 4.89401304617466e-06, + "loss": 0.4952, + "step": 5133 + }, + { + "epoch": 2.0115584094838233, + "grad_norm": 0.5016813592368355, + "learning_rate": 4.8939679067346625e-06, + "loss": 0.4858, + "step": 5134 + }, + { + "epoch": 2.0119535687824155, + "grad_norm": 0.4764925899996865, + "learning_rate": 4.893922757892612e-06, + "loss": 0.4997, + "step": 5135 + }, + { + "epoch": 2.0123487280810077, + "grad_norm": 0.48398112841687313, + "learning_rate": 4.893877599648686e-06, + "loss": 0.5019, + "step": 5136 + }, + { + "epoch": 2.0127438873796, + "grad_norm": 0.47136229426376053, + "learning_rate": 4.893832432003062e-06, + "loss": 0.4867, + "step": 5137 + }, + { + "epoch": 2.013139046678192, + "grad_norm": 0.48191425839984575, + "learning_rate": 4.893787254955919e-06, + "loss": 0.4825, + "step": 5138 + }, + { + "epoch": 2.0135342059767845, + "grad_norm": 0.4670311308730599, + "learning_rate": 4.893742068507434e-06, + "loss": 0.4951, + "step": 5139 + }, + { + "epoch": 2.0139293652753767, + "grad_norm": 0.4890864968827146, + "learning_rate": 4.893696872657782e-06, + "loss": 0.5113, + "step": 5140 + }, + { + "epoch": 2.014324524573969, + "grad_norm": 0.4584922074591445, + "learning_rate": 4.893651667407143e-06, + "loss": 0.4888, + "step": 5141 + }, + { + "epoch": 2.014719683872561, + "grad_norm": 0.48036994938581246, + "learning_rate": 4.893606452755693e-06, + "loss": 0.4916, + "step": 5142 + }, + { + "epoch": 2.0151148431711534, + "grad_norm": 0.6649796099167191, + "learning_rate": 4.893561228703611e-06, + "loss": 0.5061, + "step": 5143 + }, + { + "epoch": 2.0155100024697457, + "grad_norm": 0.46584443556518795, + "learning_rate": 4.8935159952510745e-06, + "loss": 0.4786, + "step": 5144 + }, + { + "epoch": 2.015905161768338, + "grad_norm": 0.46787656590183857, + "learning_rate": 4.893470752398261e-06, + "loss": 0.4822, + "step": 5145 + }, + { + "epoch": 2.01630032106693, + "grad_norm": 0.46824424287806576, + "learning_rate": 4.893425500145346e-06, + "loss": 0.5001, + "step": 5146 + }, + { + "epoch": 2.0166954803655224, + "grad_norm": 0.49194078179454165, + "learning_rate": 4.89338023849251e-06, + "loss": 0.4799, + "step": 5147 + }, + { + "epoch": 2.0170906396641146, + "grad_norm": 0.476320179653451, + "learning_rate": 4.893334967439929e-06, + "loss": 0.5194, + "step": 5148 + }, + { + "epoch": 2.017485798962707, + "grad_norm": 0.4780909181467547, + "learning_rate": 4.893289686987782e-06, + "loss": 0.5032, + "step": 5149 + }, + { + "epoch": 2.017880958261299, + "grad_norm": 0.4548175332915629, + "learning_rate": 4.893244397136247e-06, + "loss": 0.4723, + "step": 5150 + }, + { + "epoch": 2.0182761175598913, + "grad_norm": 0.44903119971257, + "learning_rate": 4.8931990978855005e-06, + "loss": 0.4873, + "step": 5151 + }, + { + "epoch": 2.0186712768584836, + "grad_norm": 0.46396916970746166, + "learning_rate": 4.893153789235722e-06, + "loss": 0.5, + "step": 5152 + }, + { + "epoch": 2.019066436157076, + "grad_norm": 0.4569485462711414, + "learning_rate": 4.893108471187088e-06, + "loss": 0.5015, + "step": 5153 + }, + { + "epoch": 2.019461595455668, + "grad_norm": 0.45922275497265813, + "learning_rate": 4.893063143739777e-06, + "loss": 0.502, + "step": 5154 + }, + { + "epoch": 2.0198567547542603, + "grad_norm": 0.4695075729482603, + "learning_rate": 4.893017806893967e-06, + "loss": 0.5134, + "step": 5155 + }, + { + "epoch": 2.0202519140528525, + "grad_norm": 0.4530917617622478, + "learning_rate": 4.892972460649836e-06, + "loss": 0.5076, + "step": 5156 + }, + { + "epoch": 2.020647073351445, + "grad_norm": 0.4504342750468262, + "learning_rate": 4.892927105007563e-06, + "loss": 0.505, + "step": 5157 + }, + { + "epoch": 2.021042232650037, + "grad_norm": 0.4667012843063457, + "learning_rate": 4.892881739967325e-06, + "loss": 0.5167, + "step": 5158 + }, + { + "epoch": 2.0214373919486293, + "grad_norm": 0.4494930932302058, + "learning_rate": 4.892836365529301e-06, + "loss": 0.4803, + "step": 5159 + }, + { + "epoch": 2.0218325512472215, + "grad_norm": 0.4588896369546433, + "learning_rate": 4.892790981693668e-06, + "loss": 0.5003, + "step": 5160 + }, + { + "epoch": 2.0222277105458137, + "grad_norm": 0.4497347107376531, + "learning_rate": 4.892745588460606e-06, + "loss": 0.4777, + "step": 5161 + }, + { + "epoch": 2.022622869844406, + "grad_norm": 0.4547275239393893, + "learning_rate": 4.892700185830291e-06, + "loss": 0.4908, + "step": 5162 + }, + { + "epoch": 2.023018029142998, + "grad_norm": 0.4776327997808457, + "learning_rate": 4.892654773802904e-06, + "loss": 0.5075, + "step": 5163 + }, + { + "epoch": 2.0234131884415905, + "grad_norm": 0.45100034823235746, + "learning_rate": 4.892609352378621e-06, + "loss": 0.4954, + "step": 5164 + }, + { + "epoch": 2.0238083477401827, + "grad_norm": 0.453949322209561, + "learning_rate": 4.8925639215576215e-06, + "loss": 0.4988, + "step": 5165 + }, + { + "epoch": 2.024203507038775, + "grad_norm": 0.4485757581479012, + "learning_rate": 4.8925184813400835e-06, + "loss": 0.4954, + "step": 5166 + }, + { + "epoch": 2.024598666337367, + "grad_norm": 0.4523702947118703, + "learning_rate": 4.892473031726187e-06, + "loss": 0.5083, + "step": 5167 + }, + { + "epoch": 2.0249938256359594, + "grad_norm": 0.4510797356282732, + "learning_rate": 4.8924275727161075e-06, + "loss": 0.4964, + "step": 5168 + }, + { + "epoch": 2.0253889849345517, + "grad_norm": 0.464165953716701, + "learning_rate": 4.892382104310026e-06, + "loss": 0.506, + "step": 5169 + }, + { + "epoch": 2.025784144233144, + "grad_norm": 0.4598347669774403, + "learning_rate": 4.892336626508121e-06, + "loss": 0.5139, + "step": 5170 + }, + { + "epoch": 2.026179303531736, + "grad_norm": 0.44063169455818113, + "learning_rate": 4.89229113931057e-06, + "loss": 0.5005, + "step": 5171 + }, + { + "epoch": 2.0265744628303284, + "grad_norm": 0.4520710240225769, + "learning_rate": 4.892245642717551e-06, + "loss": 0.4917, + "step": 5172 + }, + { + "epoch": 2.0269696221289206, + "grad_norm": 0.4558674076863556, + "learning_rate": 4.8922001367292445e-06, + "loss": 0.487, + "step": 5173 + }, + { + "epoch": 2.027364781427513, + "grad_norm": 0.4637585590513638, + "learning_rate": 4.892154621345829e-06, + "loss": 0.5094, + "step": 5174 + }, + { + "epoch": 2.027759940726105, + "grad_norm": 0.45275964455192436, + "learning_rate": 4.8921090965674825e-06, + "loss": 0.4866, + "step": 5175 + }, + { + "epoch": 2.0281551000246973, + "grad_norm": 0.4544576816718463, + "learning_rate": 4.892063562394384e-06, + "loss": 0.5135, + "step": 5176 + }, + { + "epoch": 2.0285502593232896, + "grad_norm": 0.4713873273127142, + "learning_rate": 4.892018018826712e-06, + "loss": 0.4952, + "step": 5177 + }, + { + "epoch": 2.028945418621882, + "grad_norm": 0.44993919688256234, + "learning_rate": 4.8919724658646465e-06, + "loss": 0.4896, + "step": 5178 + }, + { + "epoch": 2.029340577920474, + "grad_norm": 0.4689922683938695, + "learning_rate": 4.891926903508365e-06, + "loss": 0.4828, + "step": 5179 + }, + { + "epoch": 2.0297357372190663, + "grad_norm": 0.4838250197458008, + "learning_rate": 4.891881331758047e-06, + "loss": 0.5096, + "step": 5180 + }, + { + "epoch": 2.0301308965176585, + "grad_norm": 0.4675284522861168, + "learning_rate": 4.891835750613872e-06, + "loss": 0.4982, + "step": 5181 + }, + { + "epoch": 2.030526055816251, + "grad_norm": 0.45385770764223, + "learning_rate": 4.891790160076018e-06, + "loss": 0.5001, + "step": 5182 + }, + { + "epoch": 2.030921215114843, + "grad_norm": 0.4600669620147579, + "learning_rate": 4.8917445601446656e-06, + "loss": 0.4839, + "step": 5183 + }, + { + "epoch": 2.0313163744134353, + "grad_norm": 0.4530041700317838, + "learning_rate": 4.891698950819992e-06, + "loss": 0.501, + "step": 5184 + }, + { + "epoch": 2.0317115337120275, + "grad_norm": 0.47142109619231753, + "learning_rate": 4.891653332102177e-06, + "loss": 0.5274, + "step": 5185 + }, + { + "epoch": 2.0321066930106197, + "grad_norm": 0.46742031828739516, + "learning_rate": 4.891607703991401e-06, + "loss": 0.5046, + "step": 5186 + }, + { + "epoch": 2.032501852309212, + "grad_norm": 0.7198025805905561, + "learning_rate": 4.891562066487842e-06, + "loss": 0.4781, + "step": 5187 + }, + { + "epoch": 2.0328970116078042, + "grad_norm": 0.47277610272117676, + "learning_rate": 4.891516419591679e-06, + "loss": 0.4994, + "step": 5188 + }, + { + "epoch": 2.0332921709063965, + "grad_norm": 0.49484180211119877, + "learning_rate": 4.891470763303092e-06, + "loss": 0.4852, + "step": 5189 + }, + { + "epoch": 2.0336873302049887, + "grad_norm": 0.4859621511332183, + "learning_rate": 4.89142509762226e-06, + "loss": 0.5134, + "step": 5190 + }, + { + "epoch": 2.034082489503581, + "grad_norm": 0.4577229468236649, + "learning_rate": 4.891379422549361e-06, + "loss": 0.4986, + "step": 5191 + }, + { + "epoch": 2.034477648802173, + "grad_norm": 0.47417760848062124, + "learning_rate": 4.891333738084578e-06, + "loss": 0.511, + "step": 5192 + }, + { + "epoch": 2.0348728081007654, + "grad_norm": 0.48539047796098145, + "learning_rate": 4.891288044228088e-06, + "loss": 0.496, + "step": 5193 + }, + { + "epoch": 2.0352679673993577, + "grad_norm": 0.46849711925970705, + "learning_rate": 4.891242340980069e-06, + "loss": 0.5081, + "step": 5194 + }, + { + "epoch": 2.03566312669795, + "grad_norm": 0.4690072112312264, + "learning_rate": 4.891196628340703e-06, + "loss": 0.5097, + "step": 5195 + }, + { + "epoch": 2.036058285996542, + "grad_norm": 0.46808264537794086, + "learning_rate": 4.8911509063101685e-06, + "loss": 0.488, + "step": 5196 + }, + { + "epoch": 2.0364534452951344, + "grad_norm": 0.5519283415442977, + "learning_rate": 4.891105174888645e-06, + "loss": 0.5065, + "step": 5197 + }, + { + "epoch": 2.0368486045937266, + "grad_norm": 0.46362954842671233, + "learning_rate": 4.891059434076313e-06, + "loss": 0.4878, + "step": 5198 + }, + { + "epoch": 2.0372437638923193, + "grad_norm": 0.5173109880832085, + "learning_rate": 4.891013683873351e-06, + "loss": 0.4982, + "step": 5199 + }, + { + "epoch": 2.0376389231909116, + "grad_norm": 0.46175244060924875, + "learning_rate": 4.890967924279939e-06, + "loss": 0.4923, + "step": 5200 + }, + { + "epoch": 2.038034082489504, + "grad_norm": 0.44259574184550005, + "learning_rate": 4.8909221552962574e-06, + "loss": 0.4824, + "step": 5201 + }, + { + "epoch": 2.038429241788096, + "grad_norm": 0.4586966341918503, + "learning_rate": 4.890876376922486e-06, + "loss": 0.4779, + "step": 5202 + }, + { + "epoch": 2.0388244010866883, + "grad_norm": 0.46102142747514496, + "learning_rate": 4.890830589158802e-06, + "loss": 0.49, + "step": 5203 + }, + { + "epoch": 2.0392195603852805, + "grad_norm": 0.471145434062063, + "learning_rate": 4.8907847920053885e-06, + "loss": 0.5156, + "step": 5204 + }, + { + "epoch": 2.0396147196838728, + "grad_norm": 0.4549769045467522, + "learning_rate": 4.890738985462424e-06, + "loss": 0.4871, + "step": 5205 + }, + { + "epoch": 2.040009878982465, + "grad_norm": 0.44669007469835803, + "learning_rate": 4.890693169530088e-06, + "loss": 0.48, + "step": 5206 + }, + { + "epoch": 2.0404050382810572, + "grad_norm": 0.4617411167739797, + "learning_rate": 4.890647344208562e-06, + "loss": 0.4818, + "step": 5207 + }, + { + "epoch": 2.0408001975796495, + "grad_norm": 0.4689178727542056, + "learning_rate": 4.8906015094980246e-06, + "loss": 0.4983, + "step": 5208 + }, + { + "epoch": 2.0411953568782417, + "grad_norm": 0.45516172538535027, + "learning_rate": 4.890555665398656e-06, + "loss": 0.4964, + "step": 5209 + }, + { + "epoch": 2.041590516176834, + "grad_norm": 0.4391222692490777, + "learning_rate": 4.890509811910637e-06, + "loss": 0.472, + "step": 5210 + }, + { + "epoch": 2.041985675475426, + "grad_norm": 0.4739508445179121, + "learning_rate": 4.890463949034145e-06, + "loss": 0.5124, + "step": 5211 + }, + { + "epoch": 2.0423808347740184, + "grad_norm": 0.4597047754530406, + "learning_rate": 4.890418076769364e-06, + "loss": 0.4901, + "step": 5212 + }, + { + "epoch": 2.0427759940726107, + "grad_norm": 0.4400780410171843, + "learning_rate": 4.890372195116472e-06, + "loss": 0.4955, + "step": 5213 + }, + { + "epoch": 2.043171153371203, + "grad_norm": 0.4525419825437649, + "learning_rate": 4.890326304075649e-06, + "loss": 0.4837, + "step": 5214 + }, + { + "epoch": 2.043566312669795, + "grad_norm": 0.45871098059276433, + "learning_rate": 4.890280403647076e-06, + "loss": 0.4951, + "step": 5215 + }, + { + "epoch": 2.0439614719683874, + "grad_norm": 0.4643059677550961, + "learning_rate": 4.890234493830933e-06, + "loss": 0.503, + "step": 5216 + }, + { + "epoch": 2.0443566312669796, + "grad_norm": 0.4499775130020552, + "learning_rate": 4.8901885746274e-06, + "loss": 0.484, + "step": 5217 + }, + { + "epoch": 2.044751790565572, + "grad_norm": 0.45820087196407705, + "learning_rate": 4.890142646036659e-06, + "loss": 0.504, + "step": 5218 + }, + { + "epoch": 2.045146949864164, + "grad_norm": 0.4532918197234839, + "learning_rate": 4.890096708058888e-06, + "loss": 0.4833, + "step": 5219 + }, + { + "epoch": 2.0455421091627564, + "grad_norm": 0.4596033870554187, + "learning_rate": 4.890050760694268e-06, + "loss": 0.4818, + "step": 5220 + }, + { + "epoch": 2.0459372684613486, + "grad_norm": 0.5275495863257932, + "learning_rate": 4.890004803942982e-06, + "loss": 0.5084, + "step": 5221 + }, + { + "epoch": 2.046332427759941, + "grad_norm": 0.44983675009756247, + "learning_rate": 4.889958837805207e-06, + "loss": 0.4839, + "step": 5222 + }, + { + "epoch": 2.046727587058533, + "grad_norm": 0.46932242931178536, + "learning_rate": 4.889912862281124e-06, + "loss": 0.4891, + "step": 5223 + }, + { + "epoch": 2.0471227463571253, + "grad_norm": 0.4580616059487756, + "learning_rate": 4.889866877370915e-06, + "loss": 0.4697, + "step": 5224 + }, + { + "epoch": 2.0475179056557176, + "grad_norm": 0.4543027878441082, + "learning_rate": 4.8898208830747615e-06, + "loss": 0.4971, + "step": 5225 + }, + { + "epoch": 2.04791306495431, + "grad_norm": 0.45234992583478045, + "learning_rate": 4.889774879392841e-06, + "loss": 0.5034, + "step": 5226 + }, + { + "epoch": 2.048308224252902, + "grad_norm": 0.45898065072529526, + "learning_rate": 4.889728866325337e-06, + "loss": 0.5033, + "step": 5227 + }, + { + "epoch": 2.0487033835514943, + "grad_norm": 0.4601022979706201, + "learning_rate": 4.889682843872429e-06, + "loss": 0.4865, + "step": 5228 + }, + { + "epoch": 2.0490985428500865, + "grad_norm": 0.44681491251197947, + "learning_rate": 4.889636812034298e-06, + "loss": 0.5066, + "step": 5229 + }, + { + "epoch": 2.0494937021486788, + "grad_norm": 0.4506415649381756, + "learning_rate": 4.889590770811125e-06, + "loss": 0.4907, + "step": 5230 + }, + { + "epoch": 2.049888861447271, + "grad_norm": 0.45120214151761606, + "learning_rate": 4.88954472020309e-06, + "loss": 0.5062, + "step": 5231 + }, + { + "epoch": 2.0502840207458632, + "grad_norm": 0.45154401506062164, + "learning_rate": 4.8894986602103735e-06, + "loss": 0.4961, + "step": 5232 + }, + { + "epoch": 2.0506791800444555, + "grad_norm": 0.4511811539872512, + "learning_rate": 4.889452590833158e-06, + "loss": 0.4858, + "step": 5233 + }, + { + "epoch": 2.0510743393430477, + "grad_norm": 0.44882241957643904, + "learning_rate": 4.8894065120716235e-06, + "loss": 0.5002, + "step": 5234 + }, + { + "epoch": 2.05146949864164, + "grad_norm": 0.44681664863949433, + "learning_rate": 4.889360423925952e-06, + "loss": 0.4889, + "step": 5235 + }, + { + "epoch": 2.051864657940232, + "grad_norm": 0.45495838986619974, + "learning_rate": 4.889314326396323e-06, + "loss": 0.4913, + "step": 5236 + }, + { + "epoch": 2.0522598172388244, + "grad_norm": 0.45058039719748044, + "learning_rate": 4.889268219482918e-06, + "loss": 0.4974, + "step": 5237 + }, + { + "epoch": 2.0526549765374167, + "grad_norm": 0.4673937972973294, + "learning_rate": 4.889222103185919e-06, + "loss": 0.4963, + "step": 5238 + }, + { + "epoch": 2.053050135836009, + "grad_norm": 0.4650737282011602, + "learning_rate": 4.889175977505505e-06, + "loss": 0.4903, + "step": 5239 + }, + { + "epoch": 2.053445295134601, + "grad_norm": 0.45116470306677475, + "learning_rate": 4.88912984244186e-06, + "loss": 0.4992, + "step": 5240 + }, + { + "epoch": 2.0538404544331934, + "grad_norm": 0.5298176727549352, + "learning_rate": 4.889083697995163e-06, + "loss": 0.4919, + "step": 5241 + }, + { + "epoch": 2.0542356137317856, + "grad_norm": 0.45177097015636897, + "learning_rate": 4.889037544165596e-06, + "loss": 0.4904, + "step": 5242 + }, + { + "epoch": 2.054630773030378, + "grad_norm": 0.5119412810802527, + "learning_rate": 4.8889913809533404e-06, + "loss": 0.4968, + "step": 5243 + }, + { + "epoch": 2.05502593232897, + "grad_norm": 0.4616855361620579, + "learning_rate": 4.888945208358577e-06, + "loss": 0.5005, + "step": 5244 + }, + { + "epoch": 2.0554210916275624, + "grad_norm": 0.4672209154136625, + "learning_rate": 4.888899026381487e-06, + "loss": 0.5107, + "step": 5245 + }, + { + "epoch": 2.0558162509261546, + "grad_norm": 0.4528232853036076, + "learning_rate": 4.888852835022253e-06, + "loss": 0.4762, + "step": 5246 + }, + { + "epoch": 2.056211410224747, + "grad_norm": 0.4558490462001704, + "learning_rate": 4.8888066342810555e-06, + "loss": 0.491, + "step": 5247 + }, + { + "epoch": 2.056606569523339, + "grad_norm": 0.4527416296938898, + "learning_rate": 4.888760424158077e-06, + "loss": 0.4922, + "step": 5248 + }, + { + "epoch": 2.0570017288219313, + "grad_norm": 0.46237981652455606, + "learning_rate": 4.8887142046534975e-06, + "loss": 0.5024, + "step": 5249 + }, + { + "epoch": 2.0573968881205236, + "grad_norm": 0.4637558739077874, + "learning_rate": 4.888667975767499e-06, + "loss": 0.4894, + "step": 5250 + }, + { + "epoch": 2.057792047419116, + "grad_norm": 0.4744951570899205, + "learning_rate": 4.888621737500262e-06, + "loss": 0.4908, + "step": 5251 + }, + { + "epoch": 2.058187206717708, + "grad_norm": 0.4584214020537685, + "learning_rate": 4.888575489851971e-06, + "loss": 0.5037, + "step": 5252 + }, + { + "epoch": 2.0585823660163003, + "grad_norm": 0.4727044592007818, + "learning_rate": 4.888529232822805e-06, + "loss": 0.4776, + "step": 5253 + }, + { + "epoch": 2.0589775253148925, + "grad_norm": 0.4632461314737739, + "learning_rate": 4.888482966412947e-06, + "loss": 0.4921, + "step": 5254 + }, + { + "epoch": 2.0593726846134848, + "grad_norm": 0.46365343578610424, + "learning_rate": 4.888436690622578e-06, + "loss": 0.4982, + "step": 5255 + }, + { + "epoch": 2.059767843912077, + "grad_norm": 0.4927502347234346, + "learning_rate": 4.8883904054518805e-06, + "loss": 0.5025, + "step": 5256 + }, + { + "epoch": 2.0601630032106693, + "grad_norm": 0.45431480591589724, + "learning_rate": 4.888344110901035e-06, + "loss": 0.4862, + "step": 5257 + }, + { + "epoch": 2.0605581625092615, + "grad_norm": 0.44639395998441367, + "learning_rate": 4.888297806970225e-06, + "loss": 0.5002, + "step": 5258 + }, + { + "epoch": 2.0609533218078537, + "grad_norm": 0.4581725919289095, + "learning_rate": 4.888251493659631e-06, + "loss": 0.4763, + "step": 5259 + }, + { + "epoch": 2.061348481106446, + "grad_norm": 0.455160272080671, + "learning_rate": 4.888205170969435e-06, + "loss": 0.499, + "step": 5260 + }, + { + "epoch": 2.061743640405038, + "grad_norm": 0.4643189428275059, + "learning_rate": 4.888158838899819e-06, + "loss": 0.4932, + "step": 5261 + }, + { + "epoch": 2.0621387997036305, + "grad_norm": 0.4463638667832112, + "learning_rate": 4.888112497450966e-06, + "loss": 0.5011, + "step": 5262 + }, + { + "epoch": 2.0625339590022227, + "grad_norm": 0.44491650271349903, + "learning_rate": 4.888066146623058e-06, + "loss": 0.4968, + "step": 5263 + }, + { + "epoch": 2.062929118300815, + "grad_norm": 0.4741916119379606, + "learning_rate": 4.888019786416275e-06, + "loss": 0.5109, + "step": 5264 + }, + { + "epoch": 2.063324277599407, + "grad_norm": 0.46077105210136354, + "learning_rate": 4.887973416830801e-06, + "loss": 0.4847, + "step": 5265 + }, + { + "epoch": 2.0637194368979994, + "grad_norm": 0.5849360991852471, + "learning_rate": 4.887927037866817e-06, + "loss": 0.4926, + "step": 5266 + }, + { + "epoch": 2.0641145961965917, + "grad_norm": 0.4633200832882633, + "learning_rate": 4.8878806495245055e-06, + "loss": 0.4919, + "step": 5267 + }, + { + "epoch": 2.064509755495184, + "grad_norm": 0.46236301178780204, + "learning_rate": 4.887834251804049e-06, + "loss": 0.4954, + "step": 5268 + }, + { + "epoch": 2.064904914793776, + "grad_norm": 0.457629583658407, + "learning_rate": 4.8877878447056305e-06, + "loss": 0.4923, + "step": 5269 + }, + { + "epoch": 2.0653000740923684, + "grad_norm": 0.4673120651485547, + "learning_rate": 4.88774142822943e-06, + "loss": 0.4808, + "step": 5270 + }, + { + "epoch": 2.0656952333909606, + "grad_norm": 0.4568906136006192, + "learning_rate": 4.887695002375631e-06, + "loss": 0.485, + "step": 5271 + }, + { + "epoch": 2.066090392689553, + "grad_norm": 0.4734779549670619, + "learning_rate": 4.8876485671444175e-06, + "loss": 0.4925, + "step": 5272 + }, + { + "epoch": 2.066485551988145, + "grad_norm": 0.47832808820634337, + "learning_rate": 4.887602122535969e-06, + "loss": 0.4957, + "step": 5273 + }, + { + "epoch": 2.0668807112867373, + "grad_norm": 0.4749938197663773, + "learning_rate": 4.887555668550469e-06, + "loss": 0.5011, + "step": 5274 + }, + { + "epoch": 2.0672758705853296, + "grad_norm": 0.46181910473113197, + "learning_rate": 4.887509205188101e-06, + "loss": 0.4954, + "step": 5275 + }, + { + "epoch": 2.067671029883922, + "grad_norm": 0.4476900675547803, + "learning_rate": 4.887462732449046e-06, + "loss": 0.5033, + "step": 5276 + }, + { + "epoch": 2.068066189182514, + "grad_norm": 0.4540259110444361, + "learning_rate": 4.887416250333487e-06, + "loss": 0.4902, + "step": 5277 + }, + { + "epoch": 2.0684613484811063, + "grad_norm": 0.448149981706839, + "learning_rate": 4.8873697588416075e-06, + "loss": 0.4998, + "step": 5278 + }, + { + "epoch": 2.0688565077796985, + "grad_norm": 0.44190241176056827, + "learning_rate": 4.887323257973589e-06, + "loss": 0.4946, + "step": 5279 + }, + { + "epoch": 2.069251667078291, + "grad_norm": 0.4462613061115609, + "learning_rate": 4.887276747729614e-06, + "loss": 0.4817, + "step": 5280 + }, + { + "epoch": 2.069646826376883, + "grad_norm": 0.45013660019262763, + "learning_rate": 4.887230228109866e-06, + "loss": 0.4897, + "step": 5281 + }, + { + "epoch": 2.0700419856754753, + "grad_norm": 0.4482753274443419, + "learning_rate": 4.887183699114526e-06, + "loss": 0.5006, + "step": 5282 + }, + { + "epoch": 2.0704371449740675, + "grad_norm": 0.46170470070203723, + "learning_rate": 4.88713716074378e-06, + "loss": 0.5052, + "step": 5283 + }, + { + "epoch": 2.0708323042726597, + "grad_norm": 0.4669933900589111, + "learning_rate": 4.887090612997808e-06, + "loss": 0.4917, + "step": 5284 + }, + { + "epoch": 2.071227463571252, + "grad_norm": 0.483843226668854, + "learning_rate": 4.887044055876793e-06, + "loss": 0.5046, + "step": 5285 + }, + { + "epoch": 2.071622622869844, + "grad_norm": 0.46714368758553043, + "learning_rate": 4.886997489380919e-06, + "loss": 0.489, + "step": 5286 + }, + { + "epoch": 2.0720177821684365, + "grad_norm": 0.44469009126903564, + "learning_rate": 4.886950913510368e-06, + "loss": 0.4925, + "step": 5287 + }, + { + "epoch": 2.0724129414670287, + "grad_norm": 0.46045303664787235, + "learning_rate": 4.886904328265323e-06, + "loss": 0.4839, + "step": 5288 + }, + { + "epoch": 2.072808100765621, + "grad_norm": 0.4601616045035097, + "learning_rate": 4.886857733645968e-06, + "loss": 0.4876, + "step": 5289 + }, + { + "epoch": 2.073203260064213, + "grad_norm": 0.48030791644212956, + "learning_rate": 4.886811129652484e-06, + "loss": 0.5098, + "step": 5290 + }, + { + "epoch": 2.0735984193628054, + "grad_norm": 0.4499859205882746, + "learning_rate": 4.886764516285057e-06, + "loss": 0.4892, + "step": 5291 + }, + { + "epoch": 2.0739935786613977, + "grad_norm": 0.46426479744305693, + "learning_rate": 4.886717893543868e-06, + "loss": 0.4841, + "step": 5292 + }, + { + "epoch": 2.07438873795999, + "grad_norm": 0.4559454042901413, + "learning_rate": 4.886671261429099e-06, + "loss": 0.488, + "step": 5293 + }, + { + "epoch": 2.074783897258582, + "grad_norm": 0.4563383108320409, + "learning_rate": 4.8866246199409354e-06, + "loss": 0.4792, + "step": 5294 + }, + { + "epoch": 2.0751790565571744, + "grad_norm": 0.4641041054840762, + "learning_rate": 4.886577969079559e-06, + "loss": 0.4883, + "step": 5295 + }, + { + "epoch": 2.075574215855767, + "grad_norm": 0.47969805202822735, + "learning_rate": 4.8865313088451544e-06, + "loss": 0.5032, + "step": 5296 + }, + { + "epoch": 2.0759693751543593, + "grad_norm": 0.46863300818021786, + "learning_rate": 4.886484639237903e-06, + "loss": 0.498, + "step": 5297 + }, + { + "epoch": 2.0763645344529515, + "grad_norm": 0.4657095895273564, + "learning_rate": 4.88643796025799e-06, + "loss": 0.4925, + "step": 5298 + }, + { + "epoch": 2.076759693751544, + "grad_norm": 0.6267052883363237, + "learning_rate": 4.886391271905597e-06, + "loss": 0.5167, + "step": 5299 + }, + { + "epoch": 2.077154853050136, + "grad_norm": 0.4773221517600891, + "learning_rate": 4.886344574180909e-06, + "loss": 0.4994, + "step": 5300 + }, + { + "epoch": 2.0775500123487283, + "grad_norm": 0.48647171806501055, + "learning_rate": 4.886297867084109e-06, + "loss": 0.4895, + "step": 5301 + }, + { + "epoch": 2.0779451716473205, + "grad_norm": 0.47734592420468774, + "learning_rate": 4.886251150615379e-06, + "loss": 0.4964, + "step": 5302 + }, + { + "epoch": 2.0783403309459128, + "grad_norm": 0.4822474079732969, + "learning_rate": 4.886204424774904e-06, + "loss": 0.5145, + "step": 5303 + }, + { + "epoch": 2.078735490244505, + "grad_norm": 0.4648912162923145, + "learning_rate": 4.886157689562866e-06, + "loss": 0.5063, + "step": 5304 + }, + { + "epoch": 2.0791306495430972, + "grad_norm": 0.45932677809466166, + "learning_rate": 4.886110944979451e-06, + "loss": 0.495, + "step": 5305 + }, + { + "epoch": 2.0795258088416895, + "grad_norm": 0.47706737183878567, + "learning_rate": 4.88606419102484e-06, + "loss": 0.5103, + "step": 5306 + }, + { + "epoch": 2.0799209681402817, + "grad_norm": 0.4532623596914375, + "learning_rate": 4.886017427699218e-06, + "loss": 0.4873, + "step": 5307 + }, + { + "epoch": 2.080316127438874, + "grad_norm": 0.4483128492884054, + "learning_rate": 4.885970655002768e-06, + "loss": 0.5029, + "step": 5308 + }, + { + "epoch": 2.080711286737466, + "grad_norm": 0.46648648387318337, + "learning_rate": 4.885923872935675e-06, + "loss": 0.4932, + "step": 5309 + }, + { + "epoch": 2.0811064460360584, + "grad_norm": 0.470504146656533, + "learning_rate": 4.885877081498122e-06, + "loss": 0.4986, + "step": 5310 + }, + { + "epoch": 2.0815016053346507, + "grad_norm": 0.46815656364735725, + "learning_rate": 4.8858302806902925e-06, + "loss": 0.5053, + "step": 5311 + }, + { + "epoch": 2.081896764633243, + "grad_norm": 0.47676947419158733, + "learning_rate": 4.88578347051237e-06, + "loss": 0.5048, + "step": 5312 + }, + { + "epoch": 2.082291923931835, + "grad_norm": 0.4551948922256932, + "learning_rate": 4.885736650964539e-06, + "loss": 0.4846, + "step": 5313 + }, + { + "epoch": 2.0826870832304274, + "grad_norm": 0.4689297776862242, + "learning_rate": 4.885689822046983e-06, + "loss": 0.5138, + "step": 5314 + }, + { + "epoch": 2.0830822425290196, + "grad_norm": 0.5266371128832054, + "learning_rate": 4.885642983759885e-06, + "loss": 0.502, + "step": 5315 + }, + { + "epoch": 2.083477401827612, + "grad_norm": 0.45232336122965455, + "learning_rate": 4.885596136103432e-06, + "loss": 0.4926, + "step": 5316 + }, + { + "epoch": 2.083872561126204, + "grad_norm": 0.45815345336313995, + "learning_rate": 4.885549279077805e-06, + "loss": 0.4706, + "step": 5317 + }, + { + "epoch": 2.0842677204247964, + "grad_norm": 0.4508115268547949, + "learning_rate": 4.885502412683189e-06, + "loss": 0.5158, + "step": 5318 + }, + { + "epoch": 2.0846628797233886, + "grad_norm": 0.4506873596094119, + "learning_rate": 4.885455536919767e-06, + "loss": 0.4826, + "step": 5319 + }, + { + "epoch": 2.085058039021981, + "grad_norm": 0.47613431415557844, + "learning_rate": 4.885408651787725e-06, + "loss": 0.5037, + "step": 5320 + }, + { + "epoch": 2.085453198320573, + "grad_norm": 0.4603502263078611, + "learning_rate": 4.885361757287247e-06, + "loss": 0.4886, + "step": 5321 + }, + { + "epoch": 2.0858483576191653, + "grad_norm": 0.46036610887217155, + "learning_rate": 4.8853148534185165e-06, + "loss": 0.4953, + "step": 5322 + }, + { + "epoch": 2.0862435169177576, + "grad_norm": 0.4529241365840035, + "learning_rate": 4.885267940181717e-06, + "loss": 0.4953, + "step": 5323 + }, + { + "epoch": 2.08663867621635, + "grad_norm": 0.48675482273393644, + "learning_rate": 4.885221017577033e-06, + "loss": 0.5134, + "step": 5324 + }, + { + "epoch": 2.087033835514942, + "grad_norm": 0.45569874226039914, + "learning_rate": 4.88517408560465e-06, + "loss": 0.5097, + "step": 5325 + }, + { + "epoch": 2.0874289948135343, + "grad_norm": 0.4673253599053655, + "learning_rate": 4.885127144264752e-06, + "loss": 0.5054, + "step": 5326 + }, + { + "epoch": 2.0878241541121265, + "grad_norm": 0.4531561763675708, + "learning_rate": 4.885080193557522e-06, + "loss": 0.5016, + "step": 5327 + }, + { + "epoch": 2.0882193134107188, + "grad_norm": 0.4872090735746867, + "learning_rate": 4.885033233483146e-06, + "loss": 0.5176, + "step": 5328 + }, + { + "epoch": 2.088614472709311, + "grad_norm": 0.4513837913446904, + "learning_rate": 4.884986264041808e-06, + "loss": 0.469, + "step": 5329 + }, + { + "epoch": 2.0890096320079032, + "grad_norm": 0.4590797104168329, + "learning_rate": 4.884939285233691e-06, + "loss": 0.504, + "step": 5330 + }, + { + "epoch": 2.0894047913064955, + "grad_norm": 0.4627972424027732, + "learning_rate": 4.884892297058981e-06, + "loss": 0.4901, + "step": 5331 + }, + { + "epoch": 2.0897999506050877, + "grad_norm": 0.44688340629975803, + "learning_rate": 4.884845299517863e-06, + "loss": 0.4877, + "step": 5332 + }, + { + "epoch": 2.09019510990368, + "grad_norm": 0.4611503172302672, + "learning_rate": 4.88479829261052e-06, + "loss": 0.4921, + "step": 5333 + }, + { + "epoch": 2.090590269202272, + "grad_norm": 0.46829455437768785, + "learning_rate": 4.884751276337138e-06, + "loss": 0.5113, + "step": 5334 + }, + { + "epoch": 2.0909854285008644, + "grad_norm": 0.4473009959944776, + "learning_rate": 4.8847042506979e-06, + "loss": 0.4904, + "step": 5335 + }, + { + "epoch": 2.0913805877994567, + "grad_norm": 0.4604208483059509, + "learning_rate": 4.8846572156929936e-06, + "loss": 0.5003, + "step": 5336 + }, + { + "epoch": 2.091775747098049, + "grad_norm": 0.45767779762723915, + "learning_rate": 4.8846101713226005e-06, + "loss": 0.4935, + "step": 5337 + }, + { + "epoch": 2.092170906396641, + "grad_norm": 0.4615944313852972, + "learning_rate": 4.884563117586907e-06, + "loss": 0.4979, + "step": 5338 + }, + { + "epoch": 2.0925660656952334, + "grad_norm": 0.4810789677245922, + "learning_rate": 4.884516054486097e-06, + "loss": 0.5252, + "step": 5339 + }, + { + "epoch": 2.0929612249938256, + "grad_norm": 0.4573861349101158, + "learning_rate": 4.884468982020357e-06, + "loss": 0.5129, + "step": 5340 + }, + { + "epoch": 2.093356384292418, + "grad_norm": 0.4528184175451582, + "learning_rate": 4.88442190018987e-06, + "loss": 0.5138, + "step": 5341 + }, + { + "epoch": 2.09375154359101, + "grad_norm": 0.4566897018760855, + "learning_rate": 4.884374808994822e-06, + "loss": 0.5044, + "step": 5342 + }, + { + "epoch": 2.0941467028896024, + "grad_norm": 0.4516180831443502, + "learning_rate": 4.884327708435397e-06, + "loss": 0.4991, + "step": 5343 + }, + { + "epoch": 2.0945418621881946, + "grad_norm": 0.44852554152023116, + "learning_rate": 4.884280598511781e-06, + "loss": 0.5032, + "step": 5344 + }, + { + "epoch": 2.094937021486787, + "grad_norm": 0.45516969248231826, + "learning_rate": 4.8842334792241586e-06, + "loss": 0.4983, + "step": 5345 + }, + { + "epoch": 2.095332180785379, + "grad_norm": 0.46631444223982815, + "learning_rate": 4.884186350572715e-06, + "loss": 0.502, + "step": 5346 + }, + { + "epoch": 2.0957273400839713, + "grad_norm": 0.47398673780754186, + "learning_rate": 4.884139212557635e-06, + "loss": 0.4953, + "step": 5347 + }, + { + "epoch": 2.0961224993825636, + "grad_norm": 0.47579743471894775, + "learning_rate": 4.884092065179103e-06, + "loss": 0.5056, + "step": 5348 + }, + { + "epoch": 2.096517658681156, + "grad_norm": 0.48515241623044925, + "learning_rate": 4.884044908437306e-06, + "loss": 0.5059, + "step": 5349 + }, + { + "epoch": 2.096912817979748, + "grad_norm": 0.4652147279266291, + "learning_rate": 4.883997742332429e-06, + "loss": 0.4976, + "step": 5350 + }, + { + "epoch": 2.0973079772783403, + "grad_norm": 0.4453104642842236, + "learning_rate": 4.883950566864656e-06, + "loss": 0.5074, + "step": 5351 + }, + { + "epoch": 2.0977031365769325, + "grad_norm": 0.45143539446002273, + "learning_rate": 4.883903382034172e-06, + "loss": 0.4835, + "step": 5352 + }, + { + "epoch": 2.0980982958755248, + "grad_norm": 0.4628598192673955, + "learning_rate": 4.883856187841164e-06, + "loss": 0.5055, + "step": 5353 + }, + { + "epoch": 2.098493455174117, + "grad_norm": 0.4564188127449865, + "learning_rate": 4.883808984285816e-06, + "loss": 0.4958, + "step": 5354 + }, + { + "epoch": 2.0988886144727092, + "grad_norm": 0.45860123136973613, + "learning_rate": 4.8837617713683146e-06, + "loss": 0.4906, + "step": 5355 + }, + { + "epoch": 2.0992837737713015, + "grad_norm": 0.4732141389245651, + "learning_rate": 4.883714549088844e-06, + "loss": 0.493, + "step": 5356 + }, + { + "epoch": 2.0996789330698937, + "grad_norm": 0.4611681662392404, + "learning_rate": 4.8836673174475894e-06, + "loss": 0.5004, + "step": 5357 + }, + { + "epoch": 2.100074092368486, + "grad_norm": 0.4779009985374861, + "learning_rate": 4.883620076444738e-06, + "loss": 0.4939, + "step": 5358 + }, + { + "epoch": 2.100469251667078, + "grad_norm": 0.4644243183886433, + "learning_rate": 4.883572826080474e-06, + "loss": 0.4901, + "step": 5359 + }, + { + "epoch": 2.1008644109656704, + "grad_norm": 0.4730815340398359, + "learning_rate": 4.883525566354983e-06, + "loss": 0.4929, + "step": 5360 + }, + { + "epoch": 2.1012595702642627, + "grad_norm": 0.4645212217941286, + "learning_rate": 4.883478297268451e-06, + "loss": 0.485, + "step": 5361 + }, + { + "epoch": 2.101654729562855, + "grad_norm": 0.5883162574204609, + "learning_rate": 4.883431018821064e-06, + "loss": 0.4936, + "step": 5362 + }, + { + "epoch": 2.102049888861447, + "grad_norm": 0.44813227041664244, + "learning_rate": 4.883383731013007e-06, + "loss": 0.4999, + "step": 5363 + }, + { + "epoch": 2.1024450481600394, + "grad_norm": 0.4693538528184338, + "learning_rate": 4.883336433844465e-06, + "loss": 0.5014, + "step": 5364 + }, + { + "epoch": 2.1028402074586316, + "grad_norm": 0.4613356297326133, + "learning_rate": 4.883289127315627e-06, + "loss": 0.4986, + "step": 5365 + }, + { + "epoch": 2.103235366757224, + "grad_norm": 0.47462410453708387, + "learning_rate": 4.883241811426675e-06, + "loss": 0.4964, + "step": 5366 + }, + { + "epoch": 2.103630526055816, + "grad_norm": 0.4585830157369119, + "learning_rate": 4.883194486177796e-06, + "loss": 0.5305, + "step": 5367 + }, + { + "epoch": 2.1040256853544084, + "grad_norm": 0.5374049651711528, + "learning_rate": 4.883147151569178e-06, + "loss": 0.5013, + "step": 5368 + }, + { + "epoch": 2.1044208446530006, + "grad_norm": 0.4505696100137808, + "learning_rate": 4.883099807601003e-06, + "loss": 0.4905, + "step": 5369 + }, + { + "epoch": 2.104816003951593, + "grad_norm": 0.472748675203154, + "learning_rate": 4.88305245427346e-06, + "loss": 0.5043, + "step": 5370 + }, + { + "epoch": 2.105211163250185, + "grad_norm": 0.4703878721715555, + "learning_rate": 4.883005091586734e-06, + "loss": 0.484, + "step": 5371 + }, + { + "epoch": 2.1056063225487773, + "grad_norm": 0.4556588761245561, + "learning_rate": 4.882957719541011e-06, + "loss": 0.5031, + "step": 5372 + }, + { + "epoch": 2.1060014818473696, + "grad_norm": 0.44797746752549056, + "learning_rate": 4.882910338136478e-06, + "loss": 0.4944, + "step": 5373 + }, + { + "epoch": 2.106396641145962, + "grad_norm": 0.4886233397662977, + "learning_rate": 4.882862947373318e-06, + "loss": 0.5049, + "step": 5374 + }, + { + "epoch": 2.106791800444554, + "grad_norm": 0.4669275499791975, + "learning_rate": 4.882815547251721e-06, + "loss": 0.5095, + "step": 5375 + }, + { + "epoch": 2.1071869597431463, + "grad_norm": 0.4797695708025504, + "learning_rate": 4.8827681377718715e-06, + "loss": 0.5128, + "step": 5376 + }, + { + "epoch": 2.1075821190417385, + "grad_norm": 0.45320448299389626, + "learning_rate": 4.8827207189339545e-06, + "loss": 0.4986, + "step": 5377 + }, + { + "epoch": 2.1079772783403308, + "grad_norm": 0.44937616960762794, + "learning_rate": 4.882673290738158e-06, + "loss": 0.499, + "step": 5378 + }, + { + "epoch": 2.108372437638923, + "grad_norm": 0.4609581797871518, + "learning_rate": 4.8826258531846686e-06, + "loss": 0.5022, + "step": 5379 + }, + { + "epoch": 2.1087675969375153, + "grad_norm": 0.4582795163905733, + "learning_rate": 4.882578406273671e-06, + "loss": 0.4887, + "step": 5380 + }, + { + "epoch": 2.1091627562361075, + "grad_norm": 0.45000378134977503, + "learning_rate": 4.882530950005351e-06, + "loss": 0.5033, + "step": 5381 + }, + { + "epoch": 2.1095579155346997, + "grad_norm": 0.4535208499466116, + "learning_rate": 4.882483484379898e-06, + "loss": 0.4893, + "step": 5382 + }, + { + "epoch": 2.109953074833292, + "grad_norm": 0.4976229552655962, + "learning_rate": 4.8824360093974945e-06, + "loss": 0.4984, + "step": 5383 + }, + { + "epoch": 2.110348234131884, + "grad_norm": 0.4747654674353325, + "learning_rate": 4.88238852505833e-06, + "loss": 0.5173, + "step": 5384 + }, + { + "epoch": 2.1107433934304765, + "grad_norm": 0.45542034736390435, + "learning_rate": 4.88234103136259e-06, + "loss": 0.4833, + "step": 5385 + }, + { + "epoch": 2.1111385527290687, + "grad_norm": 0.46024000204519655, + "learning_rate": 4.882293528310462e-06, + "loss": 0.5103, + "step": 5386 + }, + { + "epoch": 2.111533712027661, + "grad_norm": 0.4874077584347664, + "learning_rate": 4.882246015902131e-06, + "loss": 0.5052, + "step": 5387 + }, + { + "epoch": 2.1119288713262536, + "grad_norm": 0.44865374080295306, + "learning_rate": 4.882198494137785e-06, + "loss": 0.4952, + "step": 5388 + }, + { + "epoch": 2.112324030624846, + "grad_norm": 0.46109394452242536, + "learning_rate": 4.882150963017609e-06, + "loss": 0.5005, + "step": 5389 + }, + { + "epoch": 2.112719189923438, + "grad_norm": 0.45930292764238206, + "learning_rate": 4.88210342254179e-06, + "loss": 0.4739, + "step": 5390 + }, + { + "epoch": 2.1131143492220303, + "grad_norm": 0.5451627115880612, + "learning_rate": 4.882055872710516e-06, + "loss": 0.5029, + "step": 5391 + }, + { + "epoch": 2.1135095085206226, + "grad_norm": 0.45461534307695234, + "learning_rate": 4.882008313523973e-06, + "loss": 0.4902, + "step": 5392 + }, + { + "epoch": 2.113904667819215, + "grad_norm": 0.4700402964260748, + "learning_rate": 4.881960744982348e-06, + "loss": 0.5006, + "step": 5393 + }, + { + "epoch": 2.114299827117807, + "grad_norm": 0.4558104260192114, + "learning_rate": 4.881913167085826e-06, + "loss": 0.4926, + "step": 5394 + }, + { + "epoch": 2.1146949864163993, + "grad_norm": 0.46103896695677804, + "learning_rate": 4.881865579834598e-06, + "loss": 0.5009, + "step": 5395 + }, + { + "epoch": 2.1150901457149915, + "grad_norm": 0.46008105522327936, + "learning_rate": 4.881817983228847e-06, + "loss": 0.5072, + "step": 5396 + }, + { + "epoch": 2.115485305013584, + "grad_norm": 0.459804457470388, + "learning_rate": 4.881770377268761e-06, + "loss": 0.4966, + "step": 5397 + }, + { + "epoch": 2.115880464312176, + "grad_norm": 0.4564873017201415, + "learning_rate": 4.8817227619545274e-06, + "loss": 0.5065, + "step": 5398 + }, + { + "epoch": 2.1162756236107683, + "grad_norm": 0.4572790123320898, + "learning_rate": 4.881675137286334e-06, + "loss": 0.4917, + "step": 5399 + }, + { + "epoch": 2.1166707829093605, + "grad_norm": 0.4583018544454832, + "learning_rate": 4.881627503264365e-06, + "loss": 0.5122, + "step": 5400 + }, + { + "epoch": 2.1170659422079527, + "grad_norm": 0.45766918038131965, + "learning_rate": 4.881579859888811e-06, + "loss": 0.5008, + "step": 5401 + }, + { + "epoch": 2.117461101506545, + "grad_norm": 0.47089609794076337, + "learning_rate": 4.881532207159857e-06, + "loss": 0.5111, + "step": 5402 + }, + { + "epoch": 2.1178562608051372, + "grad_norm": 0.48698638605698286, + "learning_rate": 4.881484545077691e-06, + "loss": 0.4955, + "step": 5403 + }, + { + "epoch": 2.1182514201037295, + "grad_norm": 0.47367199549589856, + "learning_rate": 4.881436873642499e-06, + "loss": 0.4843, + "step": 5404 + }, + { + "epoch": 2.1186465794023217, + "grad_norm": 0.4548370429625071, + "learning_rate": 4.881389192854469e-06, + "loss": 0.5109, + "step": 5405 + }, + { + "epoch": 2.119041738700914, + "grad_norm": 0.46207813913976026, + "learning_rate": 4.881341502713789e-06, + "loss": 0.5102, + "step": 5406 + }, + { + "epoch": 2.119436897999506, + "grad_norm": 0.4665523026201837, + "learning_rate": 4.881293803220646e-06, + "loss": 0.4937, + "step": 5407 + }, + { + "epoch": 2.1198320572980984, + "grad_norm": 0.44584389144997866, + "learning_rate": 4.881246094375226e-06, + "loss": 0.4929, + "step": 5408 + }, + { + "epoch": 2.1202272165966907, + "grad_norm": 0.45411441910971817, + "learning_rate": 4.881198376177717e-06, + "loss": 0.49, + "step": 5409 + }, + { + "epoch": 2.120622375895283, + "grad_norm": 0.45916208922818735, + "learning_rate": 4.8811506486283075e-06, + "loss": 0.4891, + "step": 5410 + }, + { + "epoch": 2.121017535193875, + "grad_norm": 0.45426066117787084, + "learning_rate": 4.881102911727184e-06, + "loss": 0.4934, + "step": 5411 + }, + { + "epoch": 2.1214126944924674, + "grad_norm": 0.4676617535350159, + "learning_rate": 4.881055165474535e-06, + "loss": 0.5035, + "step": 5412 + }, + { + "epoch": 2.1218078537910596, + "grad_norm": 0.47051348318804287, + "learning_rate": 4.881007409870546e-06, + "loss": 0.5064, + "step": 5413 + }, + { + "epoch": 2.122203013089652, + "grad_norm": 0.4496266026428253, + "learning_rate": 4.880959644915406e-06, + "loss": 0.5047, + "step": 5414 + }, + { + "epoch": 2.122598172388244, + "grad_norm": 0.4595779522195174, + "learning_rate": 4.880911870609302e-06, + "loss": 0.4893, + "step": 5415 + }, + { + "epoch": 2.1229933316868363, + "grad_norm": 0.4569941314020582, + "learning_rate": 4.880864086952423e-06, + "loss": 0.4922, + "step": 5416 + }, + { + "epoch": 2.1233884909854286, + "grad_norm": 0.469415079482959, + "learning_rate": 4.880816293944955e-06, + "loss": 0.4938, + "step": 5417 + }, + { + "epoch": 2.123783650284021, + "grad_norm": 0.4447409912431963, + "learning_rate": 4.880768491587085e-06, + "loss": 0.5183, + "step": 5418 + }, + { + "epoch": 2.124178809582613, + "grad_norm": 0.45255535919526285, + "learning_rate": 4.880720679879004e-06, + "loss": 0.4958, + "step": 5419 + }, + { + "epoch": 2.1245739688812053, + "grad_norm": 0.4614305972818154, + "learning_rate": 4.880672858820897e-06, + "loss": 0.4974, + "step": 5420 + }, + { + "epoch": 2.1249691281797976, + "grad_norm": 0.47210760737931706, + "learning_rate": 4.880625028412952e-06, + "loss": 0.4918, + "step": 5421 + }, + { + "epoch": 2.12536428747839, + "grad_norm": 0.46822573176234256, + "learning_rate": 4.880577188655359e-06, + "loss": 0.5063, + "step": 5422 + }, + { + "epoch": 2.125759446776982, + "grad_norm": 0.480592867802166, + "learning_rate": 4.880529339548303e-06, + "loss": 0.4923, + "step": 5423 + }, + { + "epoch": 2.1261546060755743, + "grad_norm": 0.45661426326462, + "learning_rate": 4.880481481091974e-06, + "loss": 0.5092, + "step": 5424 + }, + { + "epoch": 2.1265497653741665, + "grad_norm": 0.4704280977766564, + "learning_rate": 4.8804336132865595e-06, + "loss": 0.5105, + "step": 5425 + }, + { + "epoch": 2.1269449246727588, + "grad_norm": 0.4618387396924572, + "learning_rate": 4.880385736132246e-06, + "loss": 0.4752, + "step": 5426 + }, + { + "epoch": 2.127340083971351, + "grad_norm": 0.45751503112924735, + "learning_rate": 4.8803378496292244e-06, + "loss": 0.4965, + "step": 5427 + }, + { + "epoch": 2.1277352432699432, + "grad_norm": 0.4641374268101188, + "learning_rate": 4.88028995377768e-06, + "loss": 0.5184, + "step": 5428 + }, + { + "epoch": 2.1281304025685355, + "grad_norm": 0.46565955389095914, + "learning_rate": 4.880242048577802e-06, + "loss": 0.5042, + "step": 5429 + }, + { + "epoch": 2.1285255618671277, + "grad_norm": 0.45584303689164063, + "learning_rate": 4.8801941340297795e-06, + "loss": 0.5004, + "step": 5430 + }, + { + "epoch": 2.12892072116572, + "grad_norm": 0.4710850087326468, + "learning_rate": 4.8801462101338e-06, + "loss": 0.5192, + "step": 5431 + }, + { + "epoch": 2.129315880464312, + "grad_norm": 0.44061458101376594, + "learning_rate": 4.88009827689005e-06, + "loss": 0.4933, + "step": 5432 + }, + { + "epoch": 2.1297110397629044, + "grad_norm": 0.4519911211493473, + "learning_rate": 4.88005033429872e-06, + "loss": 0.4949, + "step": 5433 + }, + { + "epoch": 2.1301061990614967, + "grad_norm": 0.4504066589256489, + "learning_rate": 4.880002382359998e-06, + "loss": 0.4791, + "step": 5434 + }, + { + "epoch": 2.130501358360089, + "grad_norm": 0.45764033049204544, + "learning_rate": 4.879954421074071e-06, + "loss": 0.4903, + "step": 5435 + }, + { + "epoch": 2.130896517658681, + "grad_norm": 0.4706731789649233, + "learning_rate": 4.879906450441129e-06, + "loss": 0.5127, + "step": 5436 + }, + { + "epoch": 2.1312916769572734, + "grad_norm": 0.44961446232384666, + "learning_rate": 4.8798584704613585e-06, + "loss": 0.4987, + "step": 5437 + }, + { + "epoch": 2.1316868362558656, + "grad_norm": 0.47280770131697647, + "learning_rate": 4.87981048113495e-06, + "loss": 0.5161, + "step": 5438 + }, + { + "epoch": 2.132081995554458, + "grad_norm": 0.46339098157068087, + "learning_rate": 4.879762482462091e-06, + "loss": 0.4894, + "step": 5439 + }, + { + "epoch": 2.13247715485305, + "grad_norm": 0.4487372634389615, + "learning_rate": 4.87971447444297e-06, + "loss": 0.5221, + "step": 5440 + }, + { + "epoch": 2.1328723141516424, + "grad_norm": 0.4562917626396976, + "learning_rate": 4.879666457077775e-06, + "loss": 0.524, + "step": 5441 + }, + { + "epoch": 2.1332674734502346, + "grad_norm": 0.4821307353976656, + "learning_rate": 4.879618430366696e-06, + "loss": 0.5079, + "step": 5442 + }, + { + "epoch": 2.133662632748827, + "grad_norm": 0.5353325642318681, + "learning_rate": 4.879570394309921e-06, + "loss": 0.5145, + "step": 5443 + }, + { + "epoch": 2.134057792047419, + "grad_norm": 0.47763771900604635, + "learning_rate": 4.879522348907637e-06, + "loss": 0.4835, + "step": 5444 + }, + { + "epoch": 2.1344529513460113, + "grad_norm": 0.44913009790356634, + "learning_rate": 4.879474294160035e-06, + "loss": 0.4757, + "step": 5445 + }, + { + "epoch": 2.1348481106446036, + "grad_norm": 0.4597607997906684, + "learning_rate": 4.879426230067303e-06, + "loss": 0.5005, + "step": 5446 + }, + { + "epoch": 2.135243269943196, + "grad_norm": 0.4623134190589581, + "learning_rate": 4.8793781566296294e-06, + "loss": 0.5043, + "step": 5447 + }, + { + "epoch": 2.135638429241788, + "grad_norm": 0.4506998220430185, + "learning_rate": 4.8793300738472025e-06, + "loss": 0.475, + "step": 5448 + }, + { + "epoch": 2.1360335885403803, + "grad_norm": 0.45507178878371285, + "learning_rate": 4.879281981720213e-06, + "loss": 0.498, + "step": 5449 + }, + { + "epoch": 2.1364287478389725, + "grad_norm": 0.46575574764050215, + "learning_rate": 4.879233880248848e-06, + "loss": 0.5143, + "step": 5450 + }, + { + "epoch": 2.1368239071375648, + "grad_norm": 0.4468946988002143, + "learning_rate": 4.879185769433298e-06, + "loss": 0.5058, + "step": 5451 + }, + { + "epoch": 2.137219066436157, + "grad_norm": 0.45705479305728886, + "learning_rate": 4.87913764927375e-06, + "loss": 0.4844, + "step": 5452 + }, + { + "epoch": 2.1376142257347492, + "grad_norm": 0.4535460076744396, + "learning_rate": 4.8790895197703945e-06, + "loss": 0.5187, + "step": 5453 + }, + { + "epoch": 2.1380093850333415, + "grad_norm": 0.4508238974301958, + "learning_rate": 4.879041380923421e-06, + "loss": 0.498, + "step": 5454 + }, + { + "epoch": 2.1384045443319337, + "grad_norm": 0.47611124879857625, + "learning_rate": 4.878993232733016e-06, + "loss": 0.5015, + "step": 5455 + }, + { + "epoch": 2.138799703630526, + "grad_norm": 0.47623598181038684, + "learning_rate": 4.8789450751993705e-06, + "loss": 0.4918, + "step": 5456 + }, + { + "epoch": 2.139194862929118, + "grad_norm": 0.4463328063702259, + "learning_rate": 4.878896908322673e-06, + "loss": 0.4949, + "step": 5457 + }, + { + "epoch": 2.1395900222277104, + "grad_norm": 0.4688848998545471, + "learning_rate": 4.878848732103114e-06, + "loss": 0.4784, + "step": 5458 + }, + { + "epoch": 2.1399851815263027, + "grad_norm": 0.44121522041937494, + "learning_rate": 4.878800546540881e-06, + "loss": 0.4849, + "step": 5459 + }, + { + "epoch": 2.140380340824895, + "grad_norm": 0.46258456174979196, + "learning_rate": 4.878752351636164e-06, + "loss": 0.498, + "step": 5460 + }, + { + "epoch": 2.140775500123487, + "grad_norm": 0.4515779539663671, + "learning_rate": 4.878704147389153e-06, + "loss": 0.5277, + "step": 5461 + }, + { + "epoch": 2.1411706594220794, + "grad_norm": 0.44675655713159357, + "learning_rate": 4.878655933800036e-06, + "loss": 0.4781, + "step": 5462 + }, + { + "epoch": 2.1415658187206716, + "grad_norm": 0.4787595797738257, + "learning_rate": 4.878607710869002e-06, + "loss": 0.5159, + "step": 5463 + }, + { + "epoch": 2.141960978019264, + "grad_norm": 0.46495746367564106, + "learning_rate": 4.878559478596242e-06, + "loss": 0.5156, + "step": 5464 + }, + { + "epoch": 2.142356137317856, + "grad_norm": 0.4502923794920052, + "learning_rate": 4.8785112369819455e-06, + "loss": 0.4891, + "step": 5465 + }, + { + "epoch": 2.1427512966164484, + "grad_norm": 0.451977669133556, + "learning_rate": 4.8784629860263e-06, + "loss": 0.5063, + "step": 5466 + }, + { + "epoch": 2.1431464559150406, + "grad_norm": 0.45073347613474773, + "learning_rate": 4.878414725729497e-06, + "loss": 0.5037, + "step": 5467 + }, + { + "epoch": 2.143541615213633, + "grad_norm": 0.4739115032234395, + "learning_rate": 4.878366456091724e-06, + "loss": 0.4945, + "step": 5468 + }, + { + "epoch": 2.143936774512225, + "grad_norm": 0.46171831924852413, + "learning_rate": 4.8783181771131735e-06, + "loss": 0.5097, + "step": 5469 + }, + { + "epoch": 2.1443319338108173, + "grad_norm": 0.4657774448437735, + "learning_rate": 4.878269888794032e-06, + "loss": 0.4919, + "step": 5470 + }, + { + "epoch": 2.1447270931094096, + "grad_norm": 0.47080691090966, + "learning_rate": 4.878221591134491e-06, + "loss": 0.5119, + "step": 5471 + }, + { + "epoch": 2.145122252408002, + "grad_norm": 0.4513544487027582, + "learning_rate": 4.8781732841347395e-06, + "loss": 0.4851, + "step": 5472 + }, + { + "epoch": 2.145517411706594, + "grad_norm": 0.47195481464231376, + "learning_rate": 4.878124967794968e-06, + "loss": 0.5161, + "step": 5473 + }, + { + "epoch": 2.1459125710051863, + "grad_norm": 0.4609154414666813, + "learning_rate": 4.878076642115366e-06, + "loss": 0.4985, + "step": 5474 + }, + { + "epoch": 2.1463077303037785, + "grad_norm": 0.46676548499669845, + "learning_rate": 4.878028307096122e-06, + "loss": 0.4724, + "step": 5475 + }, + { + "epoch": 2.1467028896023708, + "grad_norm": 0.4611191108433873, + "learning_rate": 4.8779799627374265e-06, + "loss": 0.4943, + "step": 5476 + }, + { + "epoch": 2.147098048900963, + "grad_norm": 0.4521869519269894, + "learning_rate": 4.877931609039471e-06, + "loss": 0.4997, + "step": 5477 + }, + { + "epoch": 2.1474932081995552, + "grad_norm": 0.505573034174829, + "learning_rate": 4.877883246002444e-06, + "loss": 0.5042, + "step": 5478 + }, + { + "epoch": 2.1478883674981475, + "grad_norm": 0.47546280424045617, + "learning_rate": 4.877834873626535e-06, + "loss": 0.5167, + "step": 5479 + }, + { + "epoch": 2.1482835267967397, + "grad_norm": 0.4634538760180319, + "learning_rate": 4.877786491911935e-06, + "loss": 0.4926, + "step": 5480 + }, + { + "epoch": 2.148678686095332, + "grad_norm": 0.4677531886753567, + "learning_rate": 4.877738100858832e-06, + "loss": 0.4891, + "step": 5481 + }, + { + "epoch": 2.149073845393924, + "grad_norm": 0.4690745882384834, + "learning_rate": 4.877689700467419e-06, + "loss": 0.5207, + "step": 5482 + }, + { + "epoch": 2.1494690046925164, + "grad_norm": 0.4692422873547366, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.4979, + "step": 5483 + }, + { + "epoch": 2.1498641639911087, + "grad_norm": 0.45669923066085627, + "learning_rate": 4.877592871670419e-06, + "loss": 0.5017, + "step": 5484 + }, + { + "epoch": 2.150259323289701, + "grad_norm": 0.4647143592549349, + "learning_rate": 4.877544443265212e-06, + "loss": 0.5213, + "step": 5485 + }, + { + "epoch": 2.150654482588293, + "grad_norm": 0.4487735037385304, + "learning_rate": 4.877496005522454e-06, + "loss": 0.4956, + "step": 5486 + }, + { + "epoch": 2.151049641886886, + "grad_norm": 0.4732217043292161, + "learning_rate": 4.877447558442335e-06, + "loss": 0.5098, + "step": 5487 + }, + { + "epoch": 2.151444801185478, + "grad_norm": 0.4868586481311084, + "learning_rate": 4.877399102025046e-06, + "loss": 0.4968, + "step": 5488 + }, + { + "epoch": 2.1518399604840703, + "grad_norm": 0.4679716331191469, + "learning_rate": 4.877350636270778e-06, + "loss": 0.5025, + "step": 5489 + }, + { + "epoch": 2.1522351197826626, + "grad_norm": 0.455600959345513, + "learning_rate": 4.87730216117972e-06, + "loss": 0.5006, + "step": 5490 + }, + { + "epoch": 2.152630279081255, + "grad_norm": 0.4729311871353748, + "learning_rate": 4.877253676752062e-06, + "loss": 0.4905, + "step": 5491 + }, + { + "epoch": 2.153025438379847, + "grad_norm": 0.47191264067689354, + "learning_rate": 4.877205182987995e-06, + "loss": 0.5033, + "step": 5492 + }, + { + "epoch": 2.1534205976784393, + "grad_norm": 0.4734254855624861, + "learning_rate": 4.87715667988771e-06, + "loss": 0.5113, + "step": 5493 + }, + { + "epoch": 2.1538157569770315, + "grad_norm": 0.46092003008011734, + "learning_rate": 4.8771081674513965e-06, + "loss": 0.4878, + "step": 5494 + }, + { + "epoch": 2.1542109162756238, + "grad_norm": 0.45908545472439594, + "learning_rate": 4.877059645679246e-06, + "loss": 0.4934, + "step": 5495 + }, + { + "epoch": 2.154606075574216, + "grad_norm": 0.4468868294393995, + "learning_rate": 4.877011114571449e-06, + "loss": 0.4947, + "step": 5496 + }, + { + "epoch": 2.1550012348728083, + "grad_norm": 0.444489465663762, + "learning_rate": 4.876962574128196e-06, + "loss": 0.4967, + "step": 5497 + }, + { + "epoch": 2.1553963941714005, + "grad_norm": 0.4393378102668866, + "learning_rate": 4.876914024349676e-06, + "loss": 0.4844, + "step": 5498 + }, + { + "epoch": 2.1557915534699927, + "grad_norm": 0.4640805678204216, + "learning_rate": 4.876865465236082e-06, + "loss": 0.5285, + "step": 5499 + }, + { + "epoch": 2.156186712768585, + "grad_norm": 0.5289230739266989, + "learning_rate": 4.876816896787603e-06, + "loss": 0.4989, + "step": 5500 + }, + { + "epoch": 2.156581872067177, + "grad_norm": 0.5419539291904234, + "learning_rate": 4.876768319004431e-06, + "loss": 0.5113, + "step": 5501 + }, + { + "epoch": 2.1569770313657695, + "grad_norm": 0.4517913697063531, + "learning_rate": 4.876719731886757e-06, + "loss": 0.4963, + "step": 5502 + }, + { + "epoch": 2.1573721906643617, + "grad_norm": 0.4504437690654093, + "learning_rate": 4.87667113543477e-06, + "loss": 0.4958, + "step": 5503 + }, + { + "epoch": 2.157767349962954, + "grad_norm": 0.4619724381275875, + "learning_rate": 4.876622529648663e-06, + "loss": 0.4998, + "step": 5504 + }, + { + "epoch": 2.158162509261546, + "grad_norm": 0.46378800765710737, + "learning_rate": 4.876573914528625e-06, + "loss": 0.5158, + "step": 5505 + }, + { + "epoch": 2.1585576685601384, + "grad_norm": 0.45305071404161723, + "learning_rate": 4.876525290074848e-06, + "loss": 0.4932, + "step": 5506 + }, + { + "epoch": 2.1589528278587307, + "grad_norm": 0.46744932250978694, + "learning_rate": 4.8764766562875235e-06, + "loss": 0.5075, + "step": 5507 + }, + { + "epoch": 2.159347987157323, + "grad_norm": 0.44330667766097176, + "learning_rate": 4.87642801316684e-06, + "loss": 0.4809, + "step": 5508 + }, + { + "epoch": 2.159743146455915, + "grad_norm": 0.46354232388964045, + "learning_rate": 4.876379360712993e-06, + "loss": 0.5029, + "step": 5509 + }, + { + "epoch": 2.1601383057545074, + "grad_norm": 0.46977655177705924, + "learning_rate": 4.876330698926169e-06, + "loss": 0.5001, + "step": 5510 + }, + { + "epoch": 2.1605334650530996, + "grad_norm": 0.45685117233330014, + "learning_rate": 4.876282027806561e-06, + "loss": 0.5151, + "step": 5511 + }, + { + "epoch": 2.160928624351692, + "grad_norm": 0.47320943013293004, + "learning_rate": 4.87623334735436e-06, + "loss": 0.5153, + "step": 5512 + }, + { + "epoch": 2.161323783650284, + "grad_norm": 0.45887136394693434, + "learning_rate": 4.876184657569759e-06, + "loss": 0.4955, + "step": 5513 + }, + { + "epoch": 2.1617189429488763, + "grad_norm": 0.45614045191471664, + "learning_rate": 4.876135958452946e-06, + "loss": 0.5055, + "step": 5514 + }, + { + "epoch": 2.1621141022474686, + "grad_norm": 0.4545945601618243, + "learning_rate": 4.876087250004114e-06, + "loss": 0.5069, + "step": 5515 + }, + { + "epoch": 2.162509261546061, + "grad_norm": 0.447681420448537, + "learning_rate": 4.876038532223454e-06, + "loss": 0.4999, + "step": 5516 + }, + { + "epoch": 2.162904420844653, + "grad_norm": 0.47058053249246107, + "learning_rate": 4.875989805111158e-06, + "loss": 0.4918, + "step": 5517 + }, + { + "epoch": 2.1632995801432453, + "grad_norm": 0.7930113043114789, + "learning_rate": 4.875941068667417e-06, + "loss": 0.4778, + "step": 5518 + }, + { + "epoch": 2.1636947394418375, + "grad_norm": 0.445244286794454, + "learning_rate": 4.875892322892421e-06, + "loss": 0.4965, + "step": 5519 + }, + { + "epoch": 2.16408989874043, + "grad_norm": 0.4577281165835478, + "learning_rate": 4.875843567786364e-06, + "loss": 0.4979, + "step": 5520 + }, + { + "epoch": 2.164485058039022, + "grad_norm": 0.464223962296654, + "learning_rate": 4.8757948033494365e-06, + "loss": 0.4995, + "step": 5521 + }, + { + "epoch": 2.1648802173376143, + "grad_norm": 0.44936416883238395, + "learning_rate": 4.875746029581828e-06, + "loss": 0.5189, + "step": 5522 + }, + { + "epoch": 2.1652753766362065, + "grad_norm": 0.45656574932816285, + "learning_rate": 4.875697246483733e-06, + "loss": 0.4947, + "step": 5523 + }, + { + "epoch": 2.1656705359347987, + "grad_norm": 0.45384101923535536, + "learning_rate": 4.875648454055341e-06, + "loss": 0.5006, + "step": 5524 + }, + { + "epoch": 2.166065695233391, + "grad_norm": 0.5008396943055093, + "learning_rate": 4.875599652296845e-06, + "loss": 0.4967, + "step": 5525 + }, + { + "epoch": 2.1664608545319832, + "grad_norm": 0.45982000765099335, + "learning_rate": 4.8755508412084364e-06, + "loss": 0.4969, + "step": 5526 + }, + { + "epoch": 2.1668560138305755, + "grad_norm": 0.45510607157877675, + "learning_rate": 4.875502020790306e-06, + "loss": 0.5089, + "step": 5527 + }, + { + "epoch": 2.1672511731291677, + "grad_norm": 0.45718988116180564, + "learning_rate": 4.875453191042646e-06, + "loss": 0.5141, + "step": 5528 + }, + { + "epoch": 2.16764633242776, + "grad_norm": 0.46235453264969417, + "learning_rate": 4.875404351965648e-06, + "loss": 0.4939, + "step": 5529 + }, + { + "epoch": 2.168041491726352, + "grad_norm": 0.45617899670326717, + "learning_rate": 4.875355503559506e-06, + "loss": 0.4993, + "step": 5530 + }, + { + "epoch": 2.1684366510249444, + "grad_norm": 0.47662452130332766, + "learning_rate": 4.875306645824408e-06, + "loss": 0.5141, + "step": 5531 + }, + { + "epoch": 2.1688318103235367, + "grad_norm": 0.4622948877225635, + "learning_rate": 4.875257778760549e-06, + "loss": 0.515, + "step": 5532 + }, + { + "epoch": 2.169226969622129, + "grad_norm": 0.4842991640779124, + "learning_rate": 4.8752089023681195e-06, + "loss": 0.5125, + "step": 5533 + }, + { + "epoch": 2.169622128920721, + "grad_norm": 0.48514678815733153, + "learning_rate": 4.875160016647311e-06, + "loss": 0.5048, + "step": 5534 + }, + { + "epoch": 2.1700172882193134, + "grad_norm": 0.45733286495961495, + "learning_rate": 4.875111121598317e-06, + "loss": 0.5015, + "step": 5535 + }, + { + "epoch": 2.1704124475179056, + "grad_norm": 0.45364214188982693, + "learning_rate": 4.875062217221329e-06, + "loss": 0.5115, + "step": 5536 + }, + { + "epoch": 2.170807606816498, + "grad_norm": 0.4421312637696224, + "learning_rate": 4.875013303516538e-06, + "loss": 0.4895, + "step": 5537 + }, + { + "epoch": 2.17120276611509, + "grad_norm": 0.4908166354246267, + "learning_rate": 4.874964380484138e-06, + "loss": 0.4812, + "step": 5538 + }, + { + "epoch": 2.1715979254136824, + "grad_norm": 0.4616856713589806, + "learning_rate": 4.874915448124319e-06, + "loss": 0.4974, + "step": 5539 + }, + { + "epoch": 2.1719930847122746, + "grad_norm": 0.4646457406849082, + "learning_rate": 4.874866506437275e-06, + "loss": 0.5034, + "step": 5540 + }, + { + "epoch": 2.172388244010867, + "grad_norm": 0.45047160391577984, + "learning_rate": 4.874817555423196e-06, + "loss": 0.4877, + "step": 5541 + }, + { + "epoch": 2.172783403309459, + "grad_norm": 0.4500708056755459, + "learning_rate": 4.874768595082277e-06, + "loss": 0.4918, + "step": 5542 + }, + { + "epoch": 2.1731785626080513, + "grad_norm": 0.47456977571950365, + "learning_rate": 4.874719625414709e-06, + "loss": 0.4933, + "step": 5543 + }, + { + "epoch": 2.1735737219066436, + "grad_norm": 0.46200205295141555, + "learning_rate": 4.874670646420684e-06, + "loss": 0.5142, + "step": 5544 + }, + { + "epoch": 2.173968881205236, + "grad_norm": 0.4734243176175764, + "learning_rate": 4.874621658100395e-06, + "loss": 0.5208, + "step": 5545 + }, + { + "epoch": 2.174364040503828, + "grad_norm": 0.44815049758957626, + "learning_rate": 4.874572660454034e-06, + "loss": 0.4968, + "step": 5546 + }, + { + "epoch": 2.1747591998024203, + "grad_norm": 0.4614058105293498, + "learning_rate": 4.874523653481793e-06, + "loss": 0.5072, + "step": 5547 + }, + { + "epoch": 2.1751543591010125, + "grad_norm": 0.44451885664415813, + "learning_rate": 4.874474637183866e-06, + "loss": 0.5043, + "step": 5548 + }, + { + "epoch": 2.1755495183996048, + "grad_norm": 0.4591881542386408, + "learning_rate": 4.874425611560444e-06, + "loss": 0.4854, + "step": 5549 + }, + { + "epoch": 2.175944677698197, + "grad_norm": 0.47533595495611164, + "learning_rate": 4.874376576611719e-06, + "loss": 0.5194, + "step": 5550 + }, + { + "epoch": 2.1763398369967892, + "grad_norm": 0.46610351057678207, + "learning_rate": 4.874327532337886e-06, + "loss": 0.5002, + "step": 5551 + }, + { + "epoch": 2.1767349962953815, + "grad_norm": 0.4633556164186955, + "learning_rate": 4.8742784787391355e-06, + "loss": 0.5062, + "step": 5552 + }, + { + "epoch": 2.1771301555939737, + "grad_norm": 0.4676833644787226, + "learning_rate": 4.874229415815661e-06, + "loss": 0.5077, + "step": 5553 + }, + { + "epoch": 2.177525314892566, + "grad_norm": 0.4861231482509832, + "learning_rate": 4.874180343567655e-06, + "loss": 0.5149, + "step": 5554 + }, + { + "epoch": 2.177920474191158, + "grad_norm": 0.47393974173121844, + "learning_rate": 4.8741312619953106e-06, + "loss": 0.4931, + "step": 5555 + }, + { + "epoch": 2.1783156334897504, + "grad_norm": 0.46438049892681393, + "learning_rate": 4.87408217109882e-06, + "loss": 0.4922, + "step": 5556 + }, + { + "epoch": 2.1787107927883427, + "grad_norm": 0.480552674800388, + "learning_rate": 4.874033070878377e-06, + "loss": 0.5093, + "step": 5557 + }, + { + "epoch": 2.179105952086935, + "grad_norm": 0.49032805311209543, + "learning_rate": 4.873983961334172e-06, + "loss": 0.5129, + "step": 5558 + }, + { + "epoch": 2.179501111385527, + "grad_norm": 0.46500235243264837, + "learning_rate": 4.873934842466401e-06, + "loss": 0.4974, + "step": 5559 + }, + { + "epoch": 2.1798962706841194, + "grad_norm": 0.4601347793294418, + "learning_rate": 4.873885714275255e-06, + "loss": 0.5178, + "step": 5560 + }, + { + "epoch": 2.1802914299827116, + "grad_norm": 0.4601391528297951, + "learning_rate": 4.873836576760927e-06, + "loss": 0.501, + "step": 5561 + }, + { + "epoch": 2.180686589281304, + "grad_norm": 0.5067694421292206, + "learning_rate": 4.873787429923611e-06, + "loss": 0.484, + "step": 5562 + }, + { + "epoch": 2.181081748579896, + "grad_norm": 0.4663242902766598, + "learning_rate": 4.8737382737635e-06, + "loss": 0.5084, + "step": 5563 + }, + { + "epoch": 2.1814769078784884, + "grad_norm": 0.4553398223962099, + "learning_rate": 4.873689108280786e-06, + "loss": 0.4965, + "step": 5564 + }, + { + "epoch": 2.1818720671770806, + "grad_norm": 0.529541497892225, + "learning_rate": 4.873639933475662e-06, + "loss": 0.4955, + "step": 5565 + }, + { + "epoch": 2.182267226475673, + "grad_norm": 0.4713945957848912, + "learning_rate": 4.8735907493483216e-06, + "loss": 0.5047, + "step": 5566 + }, + { + "epoch": 2.182662385774265, + "grad_norm": 0.4572708785437191, + "learning_rate": 4.873541555898959e-06, + "loss": 0.4803, + "step": 5567 + }, + { + "epoch": 2.1830575450728573, + "grad_norm": 0.45611106467714163, + "learning_rate": 4.873492353127765e-06, + "loss": 0.4936, + "step": 5568 + }, + { + "epoch": 2.1834527043714496, + "grad_norm": 0.4595792613339848, + "learning_rate": 4.873443141034936e-06, + "loss": 0.4874, + "step": 5569 + }, + { + "epoch": 2.183847863670042, + "grad_norm": 0.4640866973139749, + "learning_rate": 4.873393919620663e-06, + "loss": 0.4924, + "step": 5570 + }, + { + "epoch": 2.184243022968634, + "grad_norm": 0.46111098813030116, + "learning_rate": 4.873344688885139e-06, + "loss": 0.5057, + "step": 5571 + }, + { + "epoch": 2.1846381822672263, + "grad_norm": 0.46999556887546373, + "learning_rate": 4.873295448828559e-06, + "loss": 0.5093, + "step": 5572 + }, + { + "epoch": 2.1850333415658185, + "grad_norm": 0.4718054070690503, + "learning_rate": 4.873246199451116e-06, + "loss": 0.4879, + "step": 5573 + }, + { + "epoch": 2.1854285008644108, + "grad_norm": 0.4710239940304868, + "learning_rate": 4.873196940753002e-06, + "loss": 0.4993, + "step": 5574 + }, + { + "epoch": 2.1858236601630034, + "grad_norm": 0.49030453082387426, + "learning_rate": 4.873147672734412e-06, + "loss": 0.5107, + "step": 5575 + }, + { + "epoch": 2.1862188194615957, + "grad_norm": 0.46406412622436066, + "learning_rate": 4.873098395395539e-06, + "loss": 0.5081, + "step": 5576 + }, + { + "epoch": 2.186613978760188, + "grad_norm": 0.4628191661905533, + "learning_rate": 4.873049108736577e-06, + "loss": 0.5031, + "step": 5577 + }, + { + "epoch": 2.18700913805878, + "grad_norm": 0.44331929710296053, + "learning_rate": 4.872999812757718e-06, + "loss": 0.5046, + "step": 5578 + }, + { + "epoch": 2.1874042973573724, + "grad_norm": 0.446907560016826, + "learning_rate": 4.872950507459158e-06, + "loss": 0.5187, + "step": 5579 + }, + { + "epoch": 2.1877994566559646, + "grad_norm": 0.45847831566326597, + "learning_rate": 4.872901192841089e-06, + "loss": 0.4828, + "step": 5580 + }, + { + "epoch": 2.188194615954557, + "grad_norm": 0.4487958830148146, + "learning_rate": 4.872851868903704e-06, + "loss": 0.5003, + "step": 5581 + }, + { + "epoch": 2.188589775253149, + "grad_norm": 0.4535135827385076, + "learning_rate": 4.872802535647199e-06, + "loss": 0.5003, + "step": 5582 + }, + { + "epoch": 2.1889849345517414, + "grad_norm": 0.4375345693693656, + "learning_rate": 4.872753193071766e-06, + "loss": 0.4881, + "step": 5583 + }, + { + "epoch": 2.1893800938503336, + "grad_norm": 0.4641343320330823, + "learning_rate": 4.872703841177599e-06, + "loss": 0.4941, + "step": 5584 + }, + { + "epoch": 2.189775253148926, + "grad_norm": 0.45644605766947677, + "learning_rate": 4.872654479964892e-06, + "loss": 0.5066, + "step": 5585 + }, + { + "epoch": 2.190170412447518, + "grad_norm": 0.46839020316567503, + "learning_rate": 4.87260510943384e-06, + "loss": 0.5116, + "step": 5586 + }, + { + "epoch": 2.1905655717461103, + "grad_norm": 0.4731473500021345, + "learning_rate": 4.872555729584635e-06, + "loss": 0.5096, + "step": 5587 + }, + { + "epoch": 2.1909607310447026, + "grad_norm": 0.4433081776671587, + "learning_rate": 4.872506340417471e-06, + "loss": 0.4933, + "step": 5588 + }, + { + "epoch": 2.191355890343295, + "grad_norm": 0.48768292944297426, + "learning_rate": 4.872456941932544e-06, + "loss": 0.507, + "step": 5589 + }, + { + "epoch": 2.191751049641887, + "grad_norm": 0.4436677280755508, + "learning_rate": 4.872407534130047e-06, + "loss": 0.5024, + "step": 5590 + }, + { + "epoch": 2.1921462089404793, + "grad_norm": 0.4536188851299345, + "learning_rate": 4.8723581170101734e-06, + "loss": 0.4982, + "step": 5591 + }, + { + "epoch": 2.1925413682390715, + "grad_norm": 0.44412822568870913, + "learning_rate": 4.872308690573118e-06, + "loss": 0.4957, + "step": 5592 + }, + { + "epoch": 2.1929365275376638, + "grad_norm": 0.6642864573262021, + "learning_rate": 4.872259254819073e-06, + "loss": 0.5231, + "step": 5593 + }, + { + "epoch": 2.193331686836256, + "grad_norm": 0.46456261247267744, + "learning_rate": 4.872209809748236e-06, + "loss": 0.5085, + "step": 5594 + }, + { + "epoch": 2.1937268461348483, + "grad_norm": 0.45989758175778567, + "learning_rate": 4.872160355360798e-06, + "loss": 0.4912, + "step": 5595 + }, + { + "epoch": 2.1941220054334405, + "grad_norm": 0.47722595433382775, + "learning_rate": 4.8721108916569555e-06, + "loss": 0.5026, + "step": 5596 + }, + { + "epoch": 2.1945171647320327, + "grad_norm": 0.4588929552955127, + "learning_rate": 4.872061418636902e-06, + "loss": 0.4941, + "step": 5597 + }, + { + "epoch": 2.194912324030625, + "grad_norm": 0.43968483295091, + "learning_rate": 4.872011936300831e-06, + "loss": 0.5046, + "step": 5598 + }, + { + "epoch": 2.195307483329217, + "grad_norm": 0.44462587632009504, + "learning_rate": 4.871962444648938e-06, + "loss": 0.4819, + "step": 5599 + }, + { + "epoch": 2.1957026426278095, + "grad_norm": 0.45683508501671893, + "learning_rate": 4.871912943681416e-06, + "loss": 0.504, + "step": 5600 + }, + { + "epoch": 2.1960978019264017, + "grad_norm": 0.4446678291894044, + "learning_rate": 4.87186343339846e-06, + "loss": 0.4973, + "step": 5601 + }, + { + "epoch": 2.196492961224994, + "grad_norm": 0.4736599203705845, + "learning_rate": 4.871813913800266e-06, + "loss": 0.5027, + "step": 5602 + }, + { + "epoch": 2.196888120523586, + "grad_norm": 0.45846116866212805, + "learning_rate": 4.8717643848870265e-06, + "loss": 0.5032, + "step": 5603 + }, + { + "epoch": 2.1972832798221784, + "grad_norm": 0.5481111316900571, + "learning_rate": 4.871714846658937e-06, + "loss": 0.5066, + "step": 5604 + }, + { + "epoch": 2.1976784391207707, + "grad_norm": 0.45324845161744187, + "learning_rate": 4.871665299116192e-06, + "loss": 0.496, + "step": 5605 + }, + { + "epoch": 2.198073598419363, + "grad_norm": 0.46469652330820316, + "learning_rate": 4.871615742258985e-06, + "loss": 0.5095, + "step": 5606 + }, + { + "epoch": 2.198468757717955, + "grad_norm": 0.5196428348937057, + "learning_rate": 4.871566176087512e-06, + "loss": 0.4955, + "step": 5607 + }, + { + "epoch": 2.1988639170165474, + "grad_norm": 0.47622508477020703, + "learning_rate": 4.871516600601968e-06, + "loss": 0.5222, + "step": 5608 + }, + { + "epoch": 2.1992590763151396, + "grad_norm": 0.4644820525166222, + "learning_rate": 4.871467015802545e-06, + "loss": 0.5046, + "step": 5609 + }, + { + "epoch": 2.199654235613732, + "grad_norm": 0.4563914565316861, + "learning_rate": 4.871417421689442e-06, + "loss": 0.502, + "step": 5610 + }, + { + "epoch": 2.200049394912324, + "grad_norm": 0.4521533603619674, + "learning_rate": 4.871367818262849e-06, + "loss": 0.4989, + "step": 5611 + }, + { + "epoch": 2.2004445542109163, + "grad_norm": 0.4679852547890778, + "learning_rate": 4.871318205522965e-06, + "loss": 0.5271, + "step": 5612 + }, + { + "epoch": 2.2008397135095086, + "grad_norm": 0.44239899933907284, + "learning_rate": 4.871268583469982e-06, + "loss": 0.4922, + "step": 5613 + }, + { + "epoch": 2.201234872808101, + "grad_norm": 0.46044033765149545, + "learning_rate": 4.8712189521040955e-06, + "loss": 0.505, + "step": 5614 + }, + { + "epoch": 2.201630032106693, + "grad_norm": 0.4509076270729074, + "learning_rate": 4.871169311425501e-06, + "loss": 0.4841, + "step": 5615 + }, + { + "epoch": 2.2020251914052853, + "grad_norm": 0.46744577639760204, + "learning_rate": 4.871119661434395e-06, + "loss": 0.4923, + "step": 5616 + }, + { + "epoch": 2.2024203507038775, + "grad_norm": 0.46269441156741004, + "learning_rate": 4.871070002130968e-06, + "loss": 0.5057, + "step": 5617 + }, + { + "epoch": 2.20281551000247, + "grad_norm": 0.4615270330282239, + "learning_rate": 4.871020333515421e-06, + "loss": 0.4931, + "step": 5618 + }, + { + "epoch": 2.203210669301062, + "grad_norm": 0.45781453501465114, + "learning_rate": 4.870970655587943e-06, + "loss": 0.4993, + "step": 5619 + }, + { + "epoch": 2.2036058285996543, + "grad_norm": 0.46448710878947935, + "learning_rate": 4.870920968348734e-06, + "loss": 0.513, + "step": 5620 + }, + { + "epoch": 2.2040009878982465, + "grad_norm": 0.45923364426870794, + "learning_rate": 4.870871271797986e-06, + "loss": 0.4827, + "step": 5621 + }, + { + "epoch": 2.2043961471968387, + "grad_norm": 0.4663678964949085, + "learning_rate": 4.870821565935896e-06, + "loss": 0.487, + "step": 5622 + }, + { + "epoch": 2.204791306495431, + "grad_norm": 0.4652080789551877, + "learning_rate": 4.870771850762658e-06, + "loss": 0.5101, + "step": 5623 + }, + { + "epoch": 2.205186465794023, + "grad_norm": 0.7697658307753772, + "learning_rate": 4.870722126278468e-06, + "loss": 0.4702, + "step": 5624 + }, + { + "epoch": 2.2055816250926155, + "grad_norm": 0.46255357306164907, + "learning_rate": 4.870672392483521e-06, + "loss": 0.5035, + "step": 5625 + }, + { + "epoch": 2.2059767843912077, + "grad_norm": 0.49091601504343524, + "learning_rate": 4.870622649378012e-06, + "loss": 0.5232, + "step": 5626 + }, + { + "epoch": 2.2063719436898, + "grad_norm": 0.46764422378827775, + "learning_rate": 4.870572896962138e-06, + "loss": 0.4822, + "step": 5627 + }, + { + "epoch": 2.206767102988392, + "grad_norm": 0.4580646300847785, + "learning_rate": 4.870523135236092e-06, + "loss": 0.516, + "step": 5628 + }, + { + "epoch": 2.2071622622869844, + "grad_norm": 0.4716946592361151, + "learning_rate": 4.8704733642000714e-06, + "loss": 0.5058, + "step": 5629 + }, + { + "epoch": 2.2075574215855767, + "grad_norm": 0.4610497435658121, + "learning_rate": 4.8704235838542705e-06, + "loss": 0.5128, + "step": 5630 + }, + { + "epoch": 2.207952580884169, + "grad_norm": 0.46580653784035464, + "learning_rate": 4.870373794198885e-06, + "loss": 0.5168, + "step": 5631 + }, + { + "epoch": 2.208347740182761, + "grad_norm": 0.45033186180479673, + "learning_rate": 4.870323995234109e-06, + "loss": 0.4908, + "step": 5632 + }, + { + "epoch": 2.2087428994813534, + "grad_norm": 0.4623037931279241, + "learning_rate": 4.870274186960142e-06, + "loss": 0.5007, + "step": 5633 + }, + { + "epoch": 2.2091380587799456, + "grad_norm": 0.4543745068546596, + "learning_rate": 4.870224369377176e-06, + "loss": 0.4904, + "step": 5634 + }, + { + "epoch": 2.209533218078538, + "grad_norm": 0.4525710119538143, + "learning_rate": 4.87017454248541e-06, + "loss": 0.4789, + "step": 5635 + }, + { + "epoch": 2.20992837737713, + "grad_norm": 0.475347791482502, + "learning_rate": 4.870124706285036e-06, + "loss": 0.5116, + "step": 5636 + }, + { + "epoch": 2.2103235366757223, + "grad_norm": 0.4668581607053355, + "learning_rate": 4.8700748607762515e-06, + "loss": 0.4948, + "step": 5637 + }, + { + "epoch": 2.2107186959743146, + "grad_norm": 0.4562173473097316, + "learning_rate": 4.870025005959252e-06, + "loss": 0.5139, + "step": 5638 + }, + { + "epoch": 2.211113855272907, + "grad_norm": 0.4638745581699392, + "learning_rate": 4.869975141834234e-06, + "loss": 0.5042, + "step": 5639 + }, + { + "epoch": 2.211509014571499, + "grad_norm": 0.45690466092589344, + "learning_rate": 4.869925268401392e-06, + "loss": 0.5122, + "step": 5640 + }, + { + "epoch": 2.2119041738700913, + "grad_norm": 0.46129161407797814, + "learning_rate": 4.869875385660923e-06, + "loss": 0.5034, + "step": 5641 + }, + { + "epoch": 2.2122993331686835, + "grad_norm": 0.45249931427852574, + "learning_rate": 4.869825493613023e-06, + "loss": 0.4944, + "step": 5642 + }, + { + "epoch": 2.212694492467276, + "grad_norm": 0.4717240333098036, + "learning_rate": 4.869775592257887e-06, + "loss": 0.5079, + "step": 5643 + }, + { + "epoch": 2.213089651765868, + "grad_norm": 0.45646269978521453, + "learning_rate": 4.869725681595712e-06, + "loss": 0.5086, + "step": 5644 + }, + { + "epoch": 2.2134848110644603, + "grad_norm": 0.4660158700586118, + "learning_rate": 4.869675761626693e-06, + "loss": 0.5024, + "step": 5645 + }, + { + "epoch": 2.2138799703630525, + "grad_norm": 0.46107902427465175, + "learning_rate": 4.869625832351026e-06, + "loss": 0.4842, + "step": 5646 + }, + { + "epoch": 2.2142751296616447, + "grad_norm": 0.4610113532446741, + "learning_rate": 4.869575893768909e-06, + "loss": 0.5025, + "step": 5647 + }, + { + "epoch": 2.214670288960237, + "grad_norm": 0.46330124836282083, + "learning_rate": 4.869525945880536e-06, + "loss": 0.5011, + "step": 5648 + }, + { + "epoch": 2.2150654482588292, + "grad_norm": 0.44825650922902094, + "learning_rate": 4.869475988686105e-06, + "loss": 0.5132, + "step": 5649 + }, + { + "epoch": 2.2154606075574215, + "grad_norm": 0.438523700300251, + "learning_rate": 4.8694260221858095e-06, + "loss": 0.4888, + "step": 5650 + }, + { + "epoch": 2.2158557668560137, + "grad_norm": 0.446477475948747, + "learning_rate": 4.869376046379848e-06, + "loss": 0.5027, + "step": 5651 + }, + { + "epoch": 2.216250926154606, + "grad_norm": 0.47359530183611437, + "learning_rate": 4.869326061268416e-06, + "loss": 0.5057, + "step": 5652 + }, + { + "epoch": 2.216646085453198, + "grad_norm": 0.45346141292171366, + "learning_rate": 4.869276066851711e-06, + "loss": 0.5238, + "step": 5653 + }, + { + "epoch": 2.2170412447517904, + "grad_norm": 0.4697574903847061, + "learning_rate": 4.869226063129926e-06, + "loss": 0.5157, + "step": 5654 + }, + { + "epoch": 2.2174364040503827, + "grad_norm": 0.4992277490563748, + "learning_rate": 4.869176050103262e-06, + "loss": 0.5312, + "step": 5655 + }, + { + "epoch": 2.217831563348975, + "grad_norm": 0.4463421488924404, + "learning_rate": 4.869126027771912e-06, + "loss": 0.4857, + "step": 5656 + }, + { + "epoch": 2.218226722647567, + "grad_norm": 0.445536527726352, + "learning_rate": 4.8690759961360736e-06, + "loss": 0.511, + "step": 5657 + }, + { + "epoch": 2.2186218819461594, + "grad_norm": 0.4608081468830687, + "learning_rate": 4.869025955195944e-06, + "loss": 0.4942, + "step": 5658 + }, + { + "epoch": 2.2190170412447516, + "grad_norm": 0.471512152814528, + "learning_rate": 4.868975904951718e-06, + "loss": 0.5061, + "step": 5659 + }, + { + "epoch": 2.219412200543344, + "grad_norm": 0.4550754501913592, + "learning_rate": 4.868925845403594e-06, + "loss": 0.4992, + "step": 5660 + }, + { + "epoch": 2.219807359841936, + "grad_norm": 0.456877328887177, + "learning_rate": 4.868875776551767e-06, + "loss": 0.5084, + "step": 5661 + }, + { + "epoch": 2.2202025191405284, + "grad_norm": 0.44849181279718897, + "learning_rate": 4.868825698396435e-06, + "loss": 0.4798, + "step": 5662 + }, + { + "epoch": 2.2205976784391206, + "grad_norm": 0.4478184080732161, + "learning_rate": 4.8687756109377935e-06, + "loss": 0.4878, + "step": 5663 + }, + { + "epoch": 2.220992837737713, + "grad_norm": 0.4558088044427646, + "learning_rate": 4.86872551417604e-06, + "loss": 0.5042, + "step": 5664 + }, + { + "epoch": 2.221387997036305, + "grad_norm": 0.45237458187762586, + "learning_rate": 4.8686754081113715e-06, + "loss": 0.4871, + "step": 5665 + }, + { + "epoch": 2.2217831563348973, + "grad_norm": 0.4344840693507297, + "learning_rate": 4.868625292743985e-06, + "loss": 0.4937, + "step": 5666 + }, + { + "epoch": 2.2221783156334896, + "grad_norm": 0.45680335357644025, + "learning_rate": 4.868575168074075e-06, + "loss": 0.5051, + "step": 5667 + }, + { + "epoch": 2.222573474932082, + "grad_norm": 0.4786605604468794, + "learning_rate": 4.8685250341018405e-06, + "loss": 0.5145, + "step": 5668 + }, + { + "epoch": 2.222968634230674, + "grad_norm": 0.45133256657617904, + "learning_rate": 4.868474890827479e-06, + "loss": 0.4852, + "step": 5669 + }, + { + "epoch": 2.2233637935292663, + "grad_norm": 0.45874816511999167, + "learning_rate": 4.8684247382511855e-06, + "loss": 0.506, + "step": 5670 + }, + { + "epoch": 2.2237589528278585, + "grad_norm": 0.47105938523376145, + "learning_rate": 4.868374576373157e-06, + "loss": 0.5193, + "step": 5671 + }, + { + "epoch": 2.2241541121264508, + "grad_norm": 0.4499761946748633, + "learning_rate": 4.868324405193593e-06, + "loss": 0.4957, + "step": 5672 + }, + { + "epoch": 2.224549271425043, + "grad_norm": 0.44396274883575054, + "learning_rate": 4.868274224712688e-06, + "loss": 0.4968, + "step": 5673 + }, + { + "epoch": 2.2249444307236352, + "grad_norm": 0.45650883917884044, + "learning_rate": 4.86822403493064e-06, + "loss": 0.4963, + "step": 5674 + }, + { + "epoch": 2.2253395900222275, + "grad_norm": 0.45717604881834667, + "learning_rate": 4.868173835847646e-06, + "loss": 0.5214, + "step": 5675 + }, + { + "epoch": 2.22573474932082, + "grad_norm": 0.4587246120001337, + "learning_rate": 4.8681236274639024e-06, + "loss": 0.4964, + "step": 5676 + }, + { + "epoch": 2.2261299086194124, + "grad_norm": 0.46577038706127155, + "learning_rate": 4.868073409779609e-06, + "loss": 0.501, + "step": 5677 + }, + { + "epoch": 2.2265250679180046, + "grad_norm": 0.45292880378261396, + "learning_rate": 4.86802318279496e-06, + "loss": 0.5076, + "step": 5678 + }, + { + "epoch": 2.226920227216597, + "grad_norm": 0.46823804273718916, + "learning_rate": 4.867972946510154e-06, + "loss": 0.4996, + "step": 5679 + }, + { + "epoch": 2.227315386515189, + "grad_norm": 0.4651990124613078, + "learning_rate": 4.867922700925388e-06, + "loss": 0.4958, + "step": 5680 + }, + { + "epoch": 2.2277105458137814, + "grad_norm": 0.48024213114162967, + "learning_rate": 4.86787244604086e-06, + "loss": 0.5189, + "step": 5681 + }, + { + "epoch": 2.2281057051123736, + "grad_norm": 0.5081055107145902, + "learning_rate": 4.867822181856766e-06, + "loss": 0.4902, + "step": 5682 + }, + { + "epoch": 2.228500864410966, + "grad_norm": 0.4430423080870507, + "learning_rate": 4.867771908373306e-06, + "loss": 0.5037, + "step": 5683 + }, + { + "epoch": 2.228896023709558, + "grad_norm": 0.46192941764536816, + "learning_rate": 4.867721625590674e-06, + "loss": 0.5109, + "step": 5684 + }, + { + "epoch": 2.2292911830081503, + "grad_norm": 0.47441393038250607, + "learning_rate": 4.8676713335090694e-06, + "loss": 0.4973, + "step": 5685 + }, + { + "epoch": 2.2296863423067426, + "grad_norm": 0.4466027577701095, + "learning_rate": 4.867621032128691e-06, + "loss": 0.4853, + "step": 5686 + }, + { + "epoch": 2.230081501605335, + "grad_norm": 0.46435968422699964, + "learning_rate": 4.867570721449734e-06, + "loss": 0.5097, + "step": 5687 + }, + { + "epoch": 2.230476660903927, + "grad_norm": 0.45475109267398633, + "learning_rate": 4.867520401472396e-06, + "loss": 0.4937, + "step": 5688 + }, + { + "epoch": 2.2308718202025193, + "grad_norm": 0.46084767398038173, + "learning_rate": 4.867470072196876e-06, + "loss": 0.4913, + "step": 5689 + }, + { + "epoch": 2.2312669795011115, + "grad_norm": 0.45201456818615127, + "learning_rate": 4.867419733623372e-06, + "loss": 0.494, + "step": 5690 + }, + { + "epoch": 2.2316621387997038, + "grad_norm": 0.4669002434804088, + "learning_rate": 4.86736938575208e-06, + "loss": 0.5061, + "step": 5691 + }, + { + "epoch": 2.232057298098296, + "grad_norm": 0.5274175443355472, + "learning_rate": 4.867319028583199e-06, + "loss": 0.4944, + "step": 5692 + }, + { + "epoch": 2.2324524573968882, + "grad_norm": 0.45829216977076004, + "learning_rate": 4.867268662116926e-06, + "loss": 0.5064, + "step": 5693 + }, + { + "epoch": 2.2328476166954805, + "grad_norm": 0.45758070298208187, + "learning_rate": 4.86721828635346e-06, + "loss": 0.49, + "step": 5694 + }, + { + "epoch": 2.2332427759940727, + "grad_norm": 0.47928993400834125, + "learning_rate": 4.867167901292997e-06, + "loss": 0.5075, + "step": 5695 + }, + { + "epoch": 2.233637935292665, + "grad_norm": 0.46840348523008235, + "learning_rate": 4.867117506935737e-06, + "loss": 0.517, + "step": 5696 + }, + { + "epoch": 2.234033094591257, + "grad_norm": 0.48509443499209903, + "learning_rate": 4.867067103281876e-06, + "loss": 0.5159, + "step": 5697 + }, + { + "epoch": 2.2344282538898494, + "grad_norm": 0.5618447420828913, + "learning_rate": 4.867016690331613e-06, + "loss": 0.498, + "step": 5698 + }, + { + "epoch": 2.2348234131884417, + "grad_norm": 0.46216383707180914, + "learning_rate": 4.866966268085146e-06, + "loss": 0.5139, + "step": 5699 + }, + { + "epoch": 2.235218572487034, + "grad_norm": 0.45344618660738734, + "learning_rate": 4.866915836542672e-06, + "loss": 0.4922, + "step": 5700 + }, + { + "epoch": 2.235613731785626, + "grad_norm": 0.4679880087576633, + "learning_rate": 4.866865395704391e-06, + "loss": 0.4877, + "step": 5701 + }, + { + "epoch": 2.2360088910842184, + "grad_norm": 0.4683019810711741, + "learning_rate": 4.8668149455705e-06, + "loss": 0.4969, + "step": 5702 + }, + { + "epoch": 2.2364040503828106, + "grad_norm": 0.44030516087498117, + "learning_rate": 4.866764486141195e-06, + "loss": 0.5104, + "step": 5703 + }, + { + "epoch": 2.236799209681403, + "grad_norm": 0.4568263053340173, + "learning_rate": 4.866714017416678e-06, + "loss": 0.4835, + "step": 5704 + }, + { + "epoch": 2.237194368979995, + "grad_norm": 0.4855558978372768, + "learning_rate": 4.866663539397145e-06, + "loss": 0.5128, + "step": 5705 + }, + { + "epoch": 2.2375895282785874, + "grad_norm": 0.4698552204107376, + "learning_rate": 4.866613052082795e-06, + "loss": 0.4989, + "step": 5706 + }, + { + "epoch": 2.2379846875771796, + "grad_norm": 0.4573675129961296, + "learning_rate": 4.866562555473826e-06, + "loss": 0.5163, + "step": 5707 + }, + { + "epoch": 2.238379846875772, + "grad_norm": 0.4951264964599398, + "learning_rate": 4.866512049570437e-06, + "loss": 0.5161, + "step": 5708 + }, + { + "epoch": 2.238775006174364, + "grad_norm": 0.4687182481039132, + "learning_rate": 4.866461534372825e-06, + "loss": 0.5193, + "step": 5709 + }, + { + "epoch": 2.2391701654729563, + "grad_norm": 0.4524062924624281, + "learning_rate": 4.866411009881189e-06, + "loss": 0.5038, + "step": 5710 + }, + { + "epoch": 2.2395653247715486, + "grad_norm": 0.44991629928323695, + "learning_rate": 4.866360476095727e-06, + "loss": 0.4756, + "step": 5711 + }, + { + "epoch": 2.239960484070141, + "grad_norm": 0.45306535987909613, + "learning_rate": 4.866309933016639e-06, + "loss": 0.5075, + "step": 5712 + }, + { + "epoch": 2.240355643368733, + "grad_norm": 0.4582923597497876, + "learning_rate": 4.866259380644122e-06, + "loss": 0.4876, + "step": 5713 + }, + { + "epoch": 2.2407508026673253, + "grad_norm": 0.4756588236793085, + "learning_rate": 4.866208818978375e-06, + "loss": 0.5107, + "step": 5714 + }, + { + "epoch": 2.2411459619659175, + "grad_norm": 0.4463905269080376, + "learning_rate": 4.866158248019597e-06, + "loss": 0.4959, + "step": 5715 + }, + { + "epoch": 2.2415411212645098, + "grad_norm": 0.46283025201138467, + "learning_rate": 4.866107667767986e-06, + "loss": 0.5118, + "step": 5716 + }, + { + "epoch": 2.241936280563102, + "grad_norm": 0.45145005862470505, + "learning_rate": 4.866057078223741e-06, + "loss": 0.5036, + "step": 5717 + }, + { + "epoch": 2.2423314398616943, + "grad_norm": 0.4496155378592595, + "learning_rate": 4.86600647938706e-06, + "loss": 0.4873, + "step": 5718 + }, + { + "epoch": 2.2427265991602865, + "grad_norm": 0.46541226899418187, + "learning_rate": 4.865955871258142e-06, + "loss": 0.4932, + "step": 5719 + }, + { + "epoch": 2.2431217584588787, + "grad_norm": 0.46041587527729605, + "learning_rate": 4.865905253837187e-06, + "loss": 0.4907, + "step": 5720 + }, + { + "epoch": 2.243516917757471, + "grad_norm": 0.45128087864299693, + "learning_rate": 4.865854627124392e-06, + "loss": 0.5083, + "step": 5721 + }, + { + "epoch": 2.243912077056063, + "grad_norm": 0.4597446068090033, + "learning_rate": 4.8658039911199575e-06, + "loss": 0.4876, + "step": 5722 + }, + { + "epoch": 2.2443072363546555, + "grad_norm": 0.4531642673031476, + "learning_rate": 4.8657533458240814e-06, + "loss": 0.5167, + "step": 5723 + }, + { + "epoch": 2.2447023956532477, + "grad_norm": 0.4436869230047204, + "learning_rate": 4.865702691236962e-06, + "loss": 0.4931, + "step": 5724 + }, + { + "epoch": 2.24509755495184, + "grad_norm": 0.4651798857195074, + "learning_rate": 4.865652027358799e-06, + "loss": 0.5097, + "step": 5725 + }, + { + "epoch": 2.245492714250432, + "grad_norm": 0.45772796873736676, + "learning_rate": 4.865601354189791e-06, + "loss": 0.4916, + "step": 5726 + }, + { + "epoch": 2.2458878735490244, + "grad_norm": 0.46051001258089186, + "learning_rate": 4.865550671730139e-06, + "loss": 0.505, + "step": 5727 + }, + { + "epoch": 2.2462830328476167, + "grad_norm": 0.46185516258558523, + "learning_rate": 4.8654999799800394e-06, + "loss": 0.489, + "step": 5728 + }, + { + "epoch": 2.246678192146209, + "grad_norm": 0.47864882828994293, + "learning_rate": 4.865449278939693e-06, + "loss": 0.528, + "step": 5729 + }, + { + "epoch": 2.247073351444801, + "grad_norm": 0.49285960618658853, + "learning_rate": 4.865398568609297e-06, + "loss": 0.5121, + "step": 5730 + }, + { + "epoch": 2.2474685107433934, + "grad_norm": 0.470443068321539, + "learning_rate": 4.865347848989052e-06, + "loss": 0.5095, + "step": 5731 + }, + { + "epoch": 2.2478636700419856, + "grad_norm": 0.4560396366626315, + "learning_rate": 4.865297120079157e-06, + "loss": 0.4911, + "step": 5732 + }, + { + "epoch": 2.248258829340578, + "grad_norm": 0.4510249709001742, + "learning_rate": 4.8652463818798115e-06, + "loss": 0.491, + "step": 5733 + }, + { + "epoch": 2.24865398863917, + "grad_norm": 0.4646681611937035, + "learning_rate": 4.8651956343912145e-06, + "loss": 0.4974, + "step": 5734 + }, + { + "epoch": 2.2490491479377623, + "grad_norm": 0.45710106589102445, + "learning_rate": 4.8651448776135655e-06, + "loss": 0.4957, + "step": 5735 + }, + { + "epoch": 2.2494443072363546, + "grad_norm": 0.4432218553818469, + "learning_rate": 4.8650941115470636e-06, + "loss": 0.5001, + "step": 5736 + }, + { + "epoch": 2.249839466534947, + "grad_norm": 0.4461130216348829, + "learning_rate": 4.865043336191908e-06, + "loss": 0.4993, + "step": 5737 + }, + { + "epoch": 2.250234625833539, + "grad_norm": 0.4765715644527521, + "learning_rate": 4.864992551548298e-06, + "loss": 0.4984, + "step": 5738 + }, + { + "epoch": 2.2506297851321313, + "grad_norm": 0.44134482235233946, + "learning_rate": 4.864941757616434e-06, + "loss": 0.4966, + "step": 5739 + }, + { + "epoch": 2.2510249444307235, + "grad_norm": 0.4565907937298236, + "learning_rate": 4.864890954396514e-06, + "loss": 0.5124, + "step": 5740 + }, + { + "epoch": 2.251420103729316, + "grad_norm": 0.4445014385916519, + "learning_rate": 4.8648401418887385e-06, + "loss": 0.5036, + "step": 5741 + }, + { + "epoch": 2.251815263027908, + "grad_norm": 0.5046116665567862, + "learning_rate": 4.864789320093307e-06, + "loss": 0.4885, + "step": 5742 + }, + { + "epoch": 2.2522104223265003, + "grad_norm": 0.45743791687221036, + "learning_rate": 4.86473848901042e-06, + "loss": 0.5157, + "step": 5743 + }, + { + "epoch": 2.2526055816250925, + "grad_norm": 0.45713971041275187, + "learning_rate": 4.864687648640275e-06, + "loss": 0.5209, + "step": 5744 + }, + { + "epoch": 2.2530007409236847, + "grad_norm": 0.45916966205747045, + "learning_rate": 4.864636798983073e-06, + "loss": 0.5218, + "step": 5745 + }, + { + "epoch": 2.253395900222277, + "grad_norm": 0.4645094613852396, + "learning_rate": 4.864585940039014e-06, + "loss": 0.4919, + "step": 5746 + }, + { + "epoch": 2.2537910595208692, + "grad_norm": 0.4611192692789078, + "learning_rate": 4.864535071808298e-06, + "loss": 0.5181, + "step": 5747 + }, + { + "epoch": 2.2541862188194615, + "grad_norm": 0.462528547985611, + "learning_rate": 4.8644841942911225e-06, + "loss": 0.5048, + "step": 5748 + }, + { + "epoch": 2.2545813781180537, + "grad_norm": 0.4622175816401159, + "learning_rate": 4.8644333074876896e-06, + "loss": 0.5138, + "step": 5749 + }, + { + "epoch": 2.254976537416646, + "grad_norm": 0.4802227886602715, + "learning_rate": 4.864382411398198e-06, + "loss": 0.4925, + "step": 5750 + }, + { + "epoch": 2.255371696715238, + "grad_norm": 0.4542300433876316, + "learning_rate": 4.864331506022848e-06, + "loss": 0.5204, + "step": 5751 + }, + { + "epoch": 2.2557668560138304, + "grad_norm": 0.46121165508412626, + "learning_rate": 4.86428059136184e-06, + "loss": 0.489, + "step": 5752 + }, + { + "epoch": 2.2561620153124227, + "grad_norm": 0.45065089171005923, + "learning_rate": 4.864229667415373e-06, + "loss": 0.5082, + "step": 5753 + }, + { + "epoch": 2.256557174611015, + "grad_norm": 0.4664108459131774, + "learning_rate": 4.864178734183649e-06, + "loss": 0.5203, + "step": 5754 + }, + { + "epoch": 2.256952333909607, + "grad_norm": 0.46599726286903415, + "learning_rate": 4.864127791666865e-06, + "loss": 0.5044, + "step": 5755 + }, + { + "epoch": 2.2573474932081994, + "grad_norm": 0.4579348177710053, + "learning_rate": 4.864076839865223e-06, + "loss": 0.4966, + "step": 5756 + }, + { + "epoch": 2.2577426525067916, + "grad_norm": 0.4710610355784568, + "learning_rate": 4.864025878778923e-06, + "loss": 0.5039, + "step": 5757 + }, + { + "epoch": 2.258137811805384, + "grad_norm": 0.5098836289434676, + "learning_rate": 4.863974908408164e-06, + "loss": 0.4864, + "step": 5758 + }, + { + "epoch": 2.258532971103976, + "grad_norm": 0.444685462826527, + "learning_rate": 4.863923928753148e-06, + "loss": 0.4929, + "step": 5759 + }, + { + "epoch": 2.2589281304025683, + "grad_norm": 0.45233517234419085, + "learning_rate": 4.8638729398140735e-06, + "loss": 0.5061, + "step": 5760 + }, + { + "epoch": 2.2593232897011606, + "grad_norm": 0.465371345730281, + "learning_rate": 4.863821941591142e-06, + "loss": 0.4998, + "step": 5761 + }, + { + "epoch": 2.2597184489997533, + "grad_norm": 0.4562128947163187, + "learning_rate": 4.863770934084553e-06, + "loss": 0.5238, + "step": 5762 + }, + { + "epoch": 2.2601136082983455, + "grad_norm": 0.45638820275354225, + "learning_rate": 4.863719917294507e-06, + "loss": 0.5035, + "step": 5763 + }, + { + "epoch": 2.2605087675969378, + "grad_norm": 0.47757782000563964, + "learning_rate": 4.863668891221206e-06, + "loss": 0.5079, + "step": 5764 + }, + { + "epoch": 2.26090392689553, + "grad_norm": 0.4662080915894657, + "learning_rate": 4.863617855864847e-06, + "loss": 0.5124, + "step": 5765 + }, + { + "epoch": 2.2612990861941222, + "grad_norm": 0.47892291257771435, + "learning_rate": 4.863566811225634e-06, + "loss": 0.5184, + "step": 5766 + }, + { + "epoch": 2.2616942454927145, + "grad_norm": 0.45977277292864505, + "learning_rate": 4.863515757303764e-06, + "loss": 0.4941, + "step": 5767 + }, + { + "epoch": 2.2620894047913067, + "grad_norm": 0.46916509655903266, + "learning_rate": 4.863464694099441e-06, + "loss": 0.5056, + "step": 5768 + }, + { + "epoch": 2.262484564089899, + "grad_norm": 0.4504962671579038, + "learning_rate": 4.863413621612862e-06, + "loss": 0.5062, + "step": 5769 + }, + { + "epoch": 2.262879723388491, + "grad_norm": 0.44867824093263253, + "learning_rate": 4.863362539844231e-06, + "loss": 0.4993, + "step": 5770 + }, + { + "epoch": 2.2632748826870834, + "grad_norm": 0.4670694822124429, + "learning_rate": 4.863311448793747e-06, + "loss": 0.5106, + "step": 5771 + }, + { + "epoch": 2.2636700419856757, + "grad_norm": 0.4632432969652908, + "learning_rate": 4.8632603484616095e-06, + "loss": 0.5138, + "step": 5772 + }, + { + "epoch": 2.264065201284268, + "grad_norm": 0.5486722092316492, + "learning_rate": 4.8632092388480216e-06, + "loss": 0.4994, + "step": 5773 + }, + { + "epoch": 2.26446036058286, + "grad_norm": 0.46134894866499143, + "learning_rate": 4.863158119953182e-06, + "loss": 0.5041, + "step": 5774 + }, + { + "epoch": 2.2648555198814524, + "grad_norm": 0.4645211586085868, + "learning_rate": 4.863106991777293e-06, + "loss": 0.5062, + "step": 5775 + }, + { + "epoch": 2.2652506791800446, + "grad_norm": 0.46537662412865477, + "learning_rate": 4.863055854320554e-06, + "loss": 0.4913, + "step": 5776 + }, + { + "epoch": 2.265645838478637, + "grad_norm": 0.4463910826863991, + "learning_rate": 4.863004707583167e-06, + "loss": 0.5051, + "step": 5777 + }, + { + "epoch": 2.266040997777229, + "grad_norm": 0.4461844055881152, + "learning_rate": 4.862953551565332e-06, + "loss": 0.4879, + "step": 5778 + }, + { + "epoch": 2.2664361570758214, + "grad_norm": 0.4671496378324098, + "learning_rate": 4.862902386267251e-06, + "loss": 0.4962, + "step": 5779 + }, + { + "epoch": 2.2668313163744136, + "grad_norm": 0.47268040765077685, + "learning_rate": 4.862851211689124e-06, + "loss": 0.5075, + "step": 5780 + }, + { + "epoch": 2.267226475673006, + "grad_norm": 0.4482777628793822, + "learning_rate": 4.8628000278311515e-06, + "loss": 0.5135, + "step": 5781 + }, + { + "epoch": 2.267621634971598, + "grad_norm": 0.4589821173101706, + "learning_rate": 4.862748834693536e-06, + "loss": 0.4909, + "step": 5782 + }, + { + "epoch": 2.2680167942701903, + "grad_norm": 0.4670920261314897, + "learning_rate": 4.862697632276477e-06, + "loss": 0.5232, + "step": 5783 + }, + { + "epoch": 2.2684119535687826, + "grad_norm": 0.4421128362683695, + "learning_rate": 4.862646420580178e-06, + "loss": 0.4887, + "step": 5784 + }, + { + "epoch": 2.268807112867375, + "grad_norm": 0.44505858305719176, + "learning_rate": 4.862595199604837e-06, + "loss": 0.5003, + "step": 5785 + }, + { + "epoch": 2.269202272165967, + "grad_norm": 0.49135116256536704, + "learning_rate": 4.862543969350657e-06, + "loss": 0.4949, + "step": 5786 + }, + { + "epoch": 2.2695974314645593, + "grad_norm": 0.46918700677450953, + "learning_rate": 4.86249272981784e-06, + "loss": 0.4885, + "step": 5787 + }, + { + "epoch": 2.2699925907631515, + "grad_norm": 0.4485662070030722, + "learning_rate": 4.862441481006586e-06, + "loss": 0.5049, + "step": 5788 + }, + { + "epoch": 2.2703877500617438, + "grad_norm": 0.43803428307843834, + "learning_rate": 4.862390222917095e-06, + "loss": 0.4808, + "step": 5789 + }, + { + "epoch": 2.270782909360336, + "grad_norm": 0.46807207385887406, + "learning_rate": 4.86233895554957e-06, + "loss": 0.5187, + "step": 5790 + }, + { + "epoch": 2.2711780686589282, + "grad_norm": 0.44793935494961473, + "learning_rate": 4.862287678904213e-06, + "loss": 0.4933, + "step": 5791 + }, + { + "epoch": 2.2715732279575205, + "grad_norm": 0.46535343446328026, + "learning_rate": 4.862236392981225e-06, + "loss": 0.5276, + "step": 5792 + }, + { + "epoch": 2.2719683872561127, + "grad_norm": 0.4698550219155634, + "learning_rate": 4.8621850977808046e-06, + "loss": 0.4872, + "step": 5793 + }, + { + "epoch": 2.272363546554705, + "grad_norm": 0.4682879416283325, + "learning_rate": 4.862133793303157e-06, + "loss": 0.4903, + "step": 5794 + }, + { + "epoch": 2.272758705853297, + "grad_norm": 0.48270114542131143, + "learning_rate": 4.862082479548482e-06, + "loss": 0.4993, + "step": 5795 + }, + { + "epoch": 2.2731538651518894, + "grad_norm": 0.5493323187094451, + "learning_rate": 4.862031156516982e-06, + "loss": 0.4965, + "step": 5796 + }, + { + "epoch": 2.2735490244504817, + "grad_norm": 0.4516215367368103, + "learning_rate": 4.861979824208857e-06, + "loss": 0.5034, + "step": 5797 + }, + { + "epoch": 2.273944183749074, + "grad_norm": 0.4538027339697062, + "learning_rate": 4.86192848262431e-06, + "loss": 0.5006, + "step": 5798 + }, + { + "epoch": 2.274339343047666, + "grad_norm": 0.4616229995385204, + "learning_rate": 4.861877131763542e-06, + "loss": 0.5013, + "step": 5799 + }, + { + "epoch": 2.2747345023462584, + "grad_norm": 0.46227058646164154, + "learning_rate": 4.861825771626755e-06, + "loss": 0.4987, + "step": 5800 + }, + { + "epoch": 2.2751296616448506, + "grad_norm": 0.45156002524949446, + "learning_rate": 4.86177440221415e-06, + "loss": 0.4905, + "step": 5801 + }, + { + "epoch": 2.275524820943443, + "grad_norm": 0.45398486859479054, + "learning_rate": 4.861723023525929e-06, + "loss": 0.5127, + "step": 5802 + }, + { + "epoch": 2.275919980242035, + "grad_norm": 0.4546600141282232, + "learning_rate": 4.861671635562295e-06, + "loss": 0.512, + "step": 5803 + }, + { + "epoch": 2.2763151395406274, + "grad_norm": 0.4420173118969808, + "learning_rate": 4.861620238323449e-06, + "loss": 0.508, + "step": 5804 + }, + { + "epoch": 2.2767102988392196, + "grad_norm": 0.4481000176106849, + "learning_rate": 4.861568831809592e-06, + "loss": 0.505, + "step": 5805 + }, + { + "epoch": 2.277105458137812, + "grad_norm": 0.48239910462261965, + "learning_rate": 4.861517416020928e-06, + "loss": 0.5103, + "step": 5806 + }, + { + "epoch": 2.277500617436404, + "grad_norm": 0.47340057062183605, + "learning_rate": 4.861465990957656e-06, + "loss": 0.4927, + "step": 5807 + }, + { + "epoch": 2.2778957767349963, + "grad_norm": 0.4552226094608049, + "learning_rate": 4.86141455661998e-06, + "loss": 0.4982, + "step": 5808 + }, + { + "epoch": 2.2782909360335886, + "grad_norm": 0.580540080884017, + "learning_rate": 4.861363113008102e-06, + "loss": 0.4945, + "step": 5809 + }, + { + "epoch": 2.278686095332181, + "grad_norm": 0.4563722531620255, + "learning_rate": 4.861311660122223e-06, + "loss": 0.491, + "step": 5810 + }, + { + "epoch": 2.279081254630773, + "grad_norm": 0.44479481415363753, + "learning_rate": 4.861260197962546e-06, + "loss": 0.4733, + "step": 5811 + }, + { + "epoch": 2.2794764139293653, + "grad_norm": 0.46120540937626275, + "learning_rate": 4.861208726529273e-06, + "loss": 0.4971, + "step": 5812 + }, + { + "epoch": 2.2798715732279575, + "grad_norm": 0.46393191062150557, + "learning_rate": 4.861157245822605e-06, + "loss": 0.5004, + "step": 5813 + }, + { + "epoch": 2.2802667325265498, + "grad_norm": 0.4515647504745636, + "learning_rate": 4.861105755842747e-06, + "loss": 0.4777, + "step": 5814 + }, + { + "epoch": 2.280661891825142, + "grad_norm": 0.438205927734934, + "learning_rate": 4.8610542565898975e-06, + "loss": 0.4916, + "step": 5815 + }, + { + "epoch": 2.2810570511237342, + "grad_norm": 0.457098274611986, + "learning_rate": 4.861002748064261e-06, + "loss": 0.5009, + "step": 5816 + }, + { + "epoch": 2.2814522104223265, + "grad_norm": 0.4615257787626016, + "learning_rate": 4.86095123026604e-06, + "loss": 0.4986, + "step": 5817 + }, + { + "epoch": 2.2818473697209187, + "grad_norm": 0.45965059976307776, + "learning_rate": 4.860899703195435e-06, + "loss": 0.4986, + "step": 5818 + }, + { + "epoch": 2.282242529019511, + "grad_norm": 0.46417580461089847, + "learning_rate": 4.860848166852651e-06, + "loss": 0.4937, + "step": 5819 + }, + { + "epoch": 2.282637688318103, + "grad_norm": 0.47161773166956317, + "learning_rate": 4.860796621237888e-06, + "loss": 0.4979, + "step": 5820 + }, + { + "epoch": 2.2830328476166954, + "grad_norm": 0.4660616983606871, + "learning_rate": 4.86074506635135e-06, + "loss": 0.509, + "step": 5821 + }, + { + "epoch": 2.2834280069152877, + "grad_norm": 0.4832205275826543, + "learning_rate": 4.860693502193239e-06, + "loss": 0.516, + "step": 5822 + }, + { + "epoch": 2.28382316621388, + "grad_norm": 0.49641441825377863, + "learning_rate": 4.860641928763757e-06, + "loss": 0.5138, + "step": 5823 + }, + { + "epoch": 2.284218325512472, + "grad_norm": 0.4591396621913851, + "learning_rate": 4.860590346063107e-06, + "loss": 0.4956, + "step": 5824 + }, + { + "epoch": 2.2846134848110644, + "grad_norm": 0.46808791796018534, + "learning_rate": 4.8605387540914915e-06, + "loss": 0.4979, + "step": 5825 + }, + { + "epoch": 2.2850086441096567, + "grad_norm": 0.44616669574023354, + "learning_rate": 4.8604871528491135e-06, + "loss": 0.4809, + "step": 5826 + }, + { + "epoch": 2.285403803408249, + "grad_norm": 0.46293514962772786, + "learning_rate": 4.860435542336175e-06, + "loss": 0.5058, + "step": 5827 + }, + { + "epoch": 2.285798962706841, + "grad_norm": 0.47952265408731376, + "learning_rate": 4.86038392255288e-06, + "loss": 0.4998, + "step": 5828 + }, + { + "epoch": 2.2861941220054334, + "grad_norm": 0.4585566892611743, + "learning_rate": 4.8603322934994284e-06, + "loss": 0.5039, + "step": 5829 + }, + { + "epoch": 2.2865892813040256, + "grad_norm": 0.45394838026461104, + "learning_rate": 4.860280655176026e-06, + "loss": 0.5084, + "step": 5830 + }, + { + "epoch": 2.286984440602618, + "grad_norm": 0.4556930344196025, + "learning_rate": 4.860229007582874e-06, + "loss": 0.512, + "step": 5831 + }, + { + "epoch": 2.28737959990121, + "grad_norm": 0.5160699721279627, + "learning_rate": 4.860177350720176e-06, + "loss": 0.4992, + "step": 5832 + }, + { + "epoch": 2.2877747591998023, + "grad_norm": 0.4683352173737334, + "learning_rate": 4.860125684588135e-06, + "loss": 0.4984, + "step": 5833 + }, + { + "epoch": 2.2881699184983946, + "grad_norm": 0.44948200378048164, + "learning_rate": 4.860074009186952e-06, + "loss": 0.5026, + "step": 5834 + }, + { + "epoch": 2.288565077796987, + "grad_norm": 0.4462063995104546, + "learning_rate": 4.8600223245168325e-06, + "loss": 0.4989, + "step": 5835 + }, + { + "epoch": 2.288960237095579, + "grad_norm": 0.4769414323903259, + "learning_rate": 4.8599706305779785e-06, + "loss": 0.5109, + "step": 5836 + }, + { + "epoch": 2.2893553963941713, + "grad_norm": 0.4838732772951883, + "learning_rate": 4.8599189273705926e-06, + "loss": 0.4982, + "step": 5837 + }, + { + "epoch": 2.2897505556927635, + "grad_norm": 0.47336977014120124, + "learning_rate": 4.859867214894878e-06, + "loss": 0.501, + "step": 5838 + }, + { + "epoch": 2.2901457149913558, + "grad_norm": 0.4708351136561112, + "learning_rate": 4.8598154931510385e-06, + "loss": 0.5142, + "step": 5839 + }, + { + "epoch": 2.290540874289948, + "grad_norm": 0.4335493511834357, + "learning_rate": 4.859763762139276e-06, + "loss": 0.4897, + "step": 5840 + }, + { + "epoch": 2.2909360335885403, + "grad_norm": 0.4551238002159216, + "learning_rate": 4.859712021859795e-06, + "loss": 0.506, + "step": 5841 + }, + { + "epoch": 2.2913311928871325, + "grad_norm": 0.46419964327160607, + "learning_rate": 4.8596602723127975e-06, + "loss": 0.4924, + "step": 5842 + }, + { + "epoch": 2.2917263521857247, + "grad_norm": 0.4504273710533772, + "learning_rate": 4.859608513498488e-06, + "loss": 0.5141, + "step": 5843 + }, + { + "epoch": 2.292121511484317, + "grad_norm": 0.4592991376285661, + "learning_rate": 4.859556745417068e-06, + "loss": 0.5224, + "step": 5844 + }, + { + "epoch": 2.292516670782909, + "grad_norm": 0.45863529913627665, + "learning_rate": 4.859504968068743e-06, + "loss": 0.5019, + "step": 5845 + }, + { + "epoch": 2.2929118300815015, + "grad_norm": 0.46302406671745044, + "learning_rate": 4.859453181453715e-06, + "loss": 0.5123, + "step": 5846 + }, + { + "epoch": 2.2933069893800937, + "grad_norm": 0.5210997355300256, + "learning_rate": 4.8594013855721875e-06, + "loss": 0.5157, + "step": 5847 + }, + { + "epoch": 2.293702148678686, + "grad_norm": 0.4698909559181017, + "learning_rate": 4.859349580424364e-06, + "loss": 0.5115, + "step": 5848 + }, + { + "epoch": 2.294097307977278, + "grad_norm": 0.4753355992191938, + "learning_rate": 4.859297766010448e-06, + "loss": 0.508, + "step": 5849 + }, + { + "epoch": 2.2944924672758704, + "grad_norm": 0.44406133299641976, + "learning_rate": 4.859245942330643e-06, + "loss": 0.5031, + "step": 5850 + }, + { + "epoch": 2.2948876265744627, + "grad_norm": 0.46793872675388737, + "learning_rate": 4.859194109385152e-06, + "loss": 0.5028, + "step": 5851 + }, + { + "epoch": 2.295282785873055, + "grad_norm": 0.456450080256325, + "learning_rate": 4.85914226717418e-06, + "loss": 0.4996, + "step": 5852 + }, + { + "epoch": 2.295677945171647, + "grad_norm": 0.4678454186698638, + "learning_rate": 4.85909041569793e-06, + "loss": 0.4944, + "step": 5853 + }, + { + "epoch": 2.2960731044702394, + "grad_norm": 0.4558794558467581, + "learning_rate": 4.8590385549566046e-06, + "loss": 0.5009, + "step": 5854 + }, + { + "epoch": 2.2964682637688316, + "grad_norm": 0.45402991768060114, + "learning_rate": 4.858986684950408e-06, + "loss": 0.5311, + "step": 5855 + }, + { + "epoch": 2.296863423067424, + "grad_norm": 0.46642376112323874, + "learning_rate": 4.858934805679545e-06, + "loss": 0.4982, + "step": 5856 + }, + { + "epoch": 2.297258582366016, + "grad_norm": 0.4582563144966381, + "learning_rate": 4.858882917144218e-06, + "loss": 0.4966, + "step": 5857 + }, + { + "epoch": 2.2976537416646083, + "grad_norm": 0.44640576325625786, + "learning_rate": 4.858831019344632e-06, + "loss": 0.5014, + "step": 5858 + }, + { + "epoch": 2.2980489009632006, + "grad_norm": 0.45330919927536356, + "learning_rate": 4.858779112280989e-06, + "loss": 0.4981, + "step": 5859 + }, + { + "epoch": 2.298444060261793, + "grad_norm": 0.4575939467608079, + "learning_rate": 4.858727195953495e-06, + "loss": 0.5004, + "step": 5860 + }, + { + "epoch": 2.298839219560385, + "grad_norm": 0.45894810163579486, + "learning_rate": 4.858675270362352e-06, + "loss": 0.5005, + "step": 5861 + }, + { + "epoch": 2.2992343788589773, + "grad_norm": 0.4667448223237067, + "learning_rate": 4.858623335507765e-06, + "loss": 0.5131, + "step": 5862 + }, + { + "epoch": 2.2996295381575695, + "grad_norm": 0.4466389707322938, + "learning_rate": 4.858571391389938e-06, + "loss": 0.5065, + "step": 5863 + }, + { + "epoch": 2.300024697456162, + "grad_norm": 0.46052845698144745, + "learning_rate": 4.858519438009075e-06, + "loss": 0.4981, + "step": 5864 + }, + { + "epoch": 2.300419856754754, + "grad_norm": 0.4521693219071108, + "learning_rate": 4.8584674753653795e-06, + "loss": 0.4932, + "step": 5865 + }, + { + "epoch": 2.3008150160533463, + "grad_norm": 0.4747140612853695, + "learning_rate": 4.858415503459056e-06, + "loss": 0.5212, + "step": 5866 + }, + { + "epoch": 2.3012101753519385, + "grad_norm": 0.45520487283506444, + "learning_rate": 4.858363522290308e-06, + "loss": 0.5063, + "step": 5867 + }, + { + "epoch": 2.301605334650531, + "grad_norm": 0.4551319537165788, + "learning_rate": 4.858311531859341e-06, + "loss": 0.488, + "step": 5868 + }, + { + "epoch": 2.3020004939491234, + "grad_norm": 0.4794988475592662, + "learning_rate": 4.858259532166358e-06, + "loss": 0.5088, + "step": 5869 + }, + { + "epoch": 2.3023956532477157, + "grad_norm": 0.4581559846294469, + "learning_rate": 4.858207523211563e-06, + "loss": 0.5049, + "step": 5870 + }, + { + "epoch": 2.302790812546308, + "grad_norm": 0.4616610380469891, + "learning_rate": 4.858155504995162e-06, + "loss": 0.4891, + "step": 5871 + }, + { + "epoch": 2.3031859718449, + "grad_norm": 0.46422859211150336, + "learning_rate": 4.8581034775173575e-06, + "loss": 0.512, + "step": 5872 + }, + { + "epoch": 2.3035811311434924, + "grad_norm": 0.4693415161744398, + "learning_rate": 4.858051440778354e-06, + "loss": 0.5236, + "step": 5873 + }, + { + "epoch": 2.3039762904420846, + "grad_norm": 0.45548329510092883, + "learning_rate": 4.857999394778357e-06, + "loss": 0.5045, + "step": 5874 + }, + { + "epoch": 2.304371449740677, + "grad_norm": 0.4568687640660631, + "learning_rate": 4.857947339517571e-06, + "loss": 0.5055, + "step": 5875 + }, + { + "epoch": 2.304766609039269, + "grad_norm": 0.46035875823989625, + "learning_rate": 4.857895274996198e-06, + "loss": 0.4936, + "step": 5876 + }, + { + "epoch": 2.3051617683378613, + "grad_norm": 0.4852045992439535, + "learning_rate": 4.857843201214445e-06, + "loss": 0.515, + "step": 5877 + }, + { + "epoch": 2.3055569276364536, + "grad_norm": 0.4595559022314449, + "learning_rate": 4.857791118172515e-06, + "loss": 0.4981, + "step": 5878 + }, + { + "epoch": 2.305952086935046, + "grad_norm": 0.4647597601816075, + "learning_rate": 4.857739025870614e-06, + "loss": 0.4914, + "step": 5879 + }, + { + "epoch": 2.306347246233638, + "grad_norm": 0.4745172978982465, + "learning_rate": 4.857686924308946e-06, + "loss": 0.4986, + "step": 5880 + }, + { + "epoch": 2.3067424055322303, + "grad_norm": 0.4738840639632119, + "learning_rate": 4.857634813487715e-06, + "loss": 0.5071, + "step": 5881 + }, + { + "epoch": 2.3071375648308226, + "grad_norm": 0.4528727368292258, + "learning_rate": 4.857582693407126e-06, + "loss": 0.4974, + "step": 5882 + }, + { + "epoch": 2.307532724129415, + "grad_norm": 0.46653799947338975, + "learning_rate": 4.857530564067383e-06, + "loss": 0.4883, + "step": 5883 + }, + { + "epoch": 2.307927883428007, + "grad_norm": 0.46309044360177304, + "learning_rate": 4.857478425468693e-06, + "loss": 0.4971, + "step": 5884 + }, + { + "epoch": 2.3083230427265993, + "grad_norm": 0.46207981907616164, + "learning_rate": 4.857426277611258e-06, + "loss": 0.512, + "step": 5885 + }, + { + "epoch": 2.3087182020251915, + "grad_norm": 0.46314752976399587, + "learning_rate": 4.857374120495285e-06, + "loss": 0.5063, + "step": 5886 + }, + { + "epoch": 2.3091133613237838, + "grad_norm": 0.4649842929739362, + "learning_rate": 4.857321954120977e-06, + "loss": 0.5114, + "step": 5887 + }, + { + "epoch": 2.309508520622376, + "grad_norm": 0.5535102504683219, + "learning_rate": 4.857269778488541e-06, + "loss": 0.5134, + "step": 5888 + }, + { + "epoch": 2.3099036799209682, + "grad_norm": 0.4423259127513932, + "learning_rate": 4.85721759359818e-06, + "loss": 0.4975, + "step": 5889 + }, + { + "epoch": 2.3102988392195605, + "grad_norm": 0.44102253287067894, + "learning_rate": 4.8571653994501e-06, + "loss": 0.487, + "step": 5890 + }, + { + "epoch": 2.3106939985181527, + "grad_norm": 0.4528723880487849, + "learning_rate": 4.857113196044505e-06, + "loss": 0.5174, + "step": 5891 + }, + { + "epoch": 2.311089157816745, + "grad_norm": 0.4594030254108174, + "learning_rate": 4.857060983381601e-06, + "loss": 0.5041, + "step": 5892 + }, + { + "epoch": 2.311484317115337, + "grad_norm": 0.4688774998319715, + "learning_rate": 4.857008761461593e-06, + "loss": 0.5058, + "step": 5893 + }, + { + "epoch": 2.3118794764139294, + "grad_norm": 0.4649598927489743, + "learning_rate": 4.856956530284686e-06, + "loss": 0.5022, + "step": 5894 + }, + { + "epoch": 2.3122746357125217, + "grad_norm": 0.4498229566988718, + "learning_rate": 4.856904289851084e-06, + "loss": 0.5122, + "step": 5895 + }, + { + "epoch": 2.312669795011114, + "grad_norm": 0.4683470822663338, + "learning_rate": 4.856852040160994e-06, + "loss": 0.5005, + "step": 5896 + }, + { + "epoch": 2.313064954309706, + "grad_norm": 0.4911558695679091, + "learning_rate": 4.856799781214621e-06, + "loss": 0.5253, + "step": 5897 + }, + { + "epoch": 2.3134601136082984, + "grad_norm": 0.464409082612125, + "learning_rate": 4.856747513012168e-06, + "loss": 0.5019, + "step": 5898 + }, + { + "epoch": 2.3138552729068906, + "grad_norm": 0.4579973840895186, + "learning_rate": 4.856695235553843e-06, + "loss": 0.4942, + "step": 5899 + }, + { + "epoch": 2.314250432205483, + "grad_norm": 0.4478579726762588, + "learning_rate": 4.85664294883985e-06, + "loss": 0.4892, + "step": 5900 + }, + { + "epoch": 2.314645591504075, + "grad_norm": 0.4693973591026146, + "learning_rate": 4.856590652870395e-06, + "loss": 0.5006, + "step": 5901 + }, + { + "epoch": 2.3150407508026674, + "grad_norm": 0.5045358380289714, + "learning_rate": 4.856538347645681e-06, + "loss": 0.5154, + "step": 5902 + }, + { + "epoch": 2.3154359101012596, + "grad_norm": 0.4649328917632037, + "learning_rate": 4.856486033165917e-06, + "loss": 0.4955, + "step": 5903 + }, + { + "epoch": 2.315831069399852, + "grad_norm": 0.46867742048507893, + "learning_rate": 4.856433709431307e-06, + "loss": 0.5016, + "step": 5904 + }, + { + "epoch": 2.316226228698444, + "grad_norm": 0.4520581706582322, + "learning_rate": 4.8563813764420555e-06, + "loss": 0.5125, + "step": 5905 + }, + { + "epoch": 2.3166213879970363, + "grad_norm": 0.45026206385228246, + "learning_rate": 4.856329034198368e-06, + "loss": 0.4939, + "step": 5906 + }, + { + "epoch": 2.3170165472956286, + "grad_norm": 0.45030736202082045, + "learning_rate": 4.8562766827004525e-06, + "loss": 0.5242, + "step": 5907 + }, + { + "epoch": 2.317411706594221, + "grad_norm": 0.4469905948107838, + "learning_rate": 4.856224321948512e-06, + "loss": 0.5073, + "step": 5908 + }, + { + "epoch": 2.317806865892813, + "grad_norm": 0.45894383299198227, + "learning_rate": 4.856171951942754e-06, + "loss": 0.5101, + "step": 5909 + }, + { + "epoch": 2.3182020251914053, + "grad_norm": 0.4395899572240971, + "learning_rate": 4.856119572683383e-06, + "loss": 0.4837, + "step": 5910 + }, + { + "epoch": 2.3185971844899975, + "grad_norm": 0.44751988426963857, + "learning_rate": 4.856067184170604e-06, + "loss": 0.5126, + "step": 5911 + }, + { + "epoch": 2.3189923437885898, + "grad_norm": 0.45656584660916594, + "learning_rate": 4.856014786404625e-06, + "loss": 0.4979, + "step": 5912 + }, + { + "epoch": 2.319387503087182, + "grad_norm": 0.4506209307224017, + "learning_rate": 4.8559623793856505e-06, + "loss": 0.4942, + "step": 5913 + }, + { + "epoch": 2.3197826623857742, + "grad_norm": 0.4461798611971729, + "learning_rate": 4.855909963113886e-06, + "loss": 0.4802, + "step": 5914 + }, + { + "epoch": 2.3201778216843665, + "grad_norm": 0.46130265251689806, + "learning_rate": 4.8558575375895375e-06, + "loss": 0.5252, + "step": 5915 + }, + { + "epoch": 2.3205729809829587, + "grad_norm": 0.45320094317982595, + "learning_rate": 4.855805102812811e-06, + "loss": 0.5089, + "step": 5916 + }, + { + "epoch": 2.320968140281551, + "grad_norm": 0.4508029872376069, + "learning_rate": 4.855752658783914e-06, + "loss": 0.5247, + "step": 5917 + }, + { + "epoch": 2.321363299580143, + "grad_norm": 0.4448389675565296, + "learning_rate": 4.85570020550305e-06, + "loss": 0.4821, + "step": 5918 + }, + { + "epoch": 2.3217584588787354, + "grad_norm": 0.46839616082852714, + "learning_rate": 4.8556477429704265e-06, + "loss": 0.4997, + "step": 5919 + }, + { + "epoch": 2.3221536181773277, + "grad_norm": 0.45777076533689726, + "learning_rate": 4.855595271186249e-06, + "loss": 0.4868, + "step": 5920 + }, + { + "epoch": 2.32254877747592, + "grad_norm": 0.4606460723597928, + "learning_rate": 4.855542790150723e-06, + "loss": 0.4996, + "step": 5921 + }, + { + "epoch": 2.322943936774512, + "grad_norm": 0.474039852550981, + "learning_rate": 4.855490299864055e-06, + "loss": 0.5024, + "step": 5922 + }, + { + "epoch": 2.3233390960731044, + "grad_norm": 0.45026962258097275, + "learning_rate": 4.8554378003264525e-06, + "loss": 0.4911, + "step": 5923 + }, + { + "epoch": 2.3237342553716966, + "grad_norm": 0.47088446275779033, + "learning_rate": 4.85538529153812e-06, + "loss": 0.5054, + "step": 5924 + }, + { + "epoch": 2.324129414670289, + "grad_norm": 0.4520409677257949, + "learning_rate": 4.855332773499265e-06, + "loss": 0.5, + "step": 5925 + }, + { + "epoch": 2.324524573968881, + "grad_norm": 0.5002895000010738, + "learning_rate": 4.855280246210093e-06, + "loss": 0.5013, + "step": 5926 + }, + { + "epoch": 2.3249197332674734, + "grad_norm": 0.43977050907935966, + "learning_rate": 4.8552277096708104e-06, + "loss": 0.4955, + "step": 5927 + }, + { + "epoch": 2.3253148925660656, + "grad_norm": 0.44665214038970635, + "learning_rate": 4.855175163881623e-06, + "loss": 0.491, + "step": 5928 + }, + { + "epoch": 2.325710051864658, + "grad_norm": 0.4543680288607131, + "learning_rate": 4.855122608842738e-06, + "loss": 0.5021, + "step": 5929 + }, + { + "epoch": 2.32610521116325, + "grad_norm": 0.47105886533075386, + "learning_rate": 4.855070044554361e-06, + "loss": 0.4931, + "step": 5930 + }, + { + "epoch": 2.3265003704618423, + "grad_norm": 0.4542183871966276, + "learning_rate": 4.8550174710167e-06, + "loss": 0.4898, + "step": 5931 + }, + { + "epoch": 2.3268955297604346, + "grad_norm": 0.44138796987873863, + "learning_rate": 4.854964888229959e-06, + "loss": 0.4778, + "step": 5932 + }, + { + "epoch": 2.327290689059027, + "grad_norm": 0.4453768384169529, + "learning_rate": 4.854912296194347e-06, + "loss": 0.4952, + "step": 5933 + }, + { + "epoch": 2.327685848357619, + "grad_norm": 0.46292907268573785, + "learning_rate": 4.854859694910069e-06, + "loss": 0.5194, + "step": 5934 + }, + { + "epoch": 2.3280810076562113, + "grad_norm": 0.5289798354022537, + "learning_rate": 4.854807084377332e-06, + "loss": 0.4982, + "step": 5935 + }, + { + "epoch": 2.3284761669548035, + "grad_norm": 0.4844679893735134, + "learning_rate": 4.854754464596344e-06, + "loss": 0.5083, + "step": 5936 + }, + { + "epoch": 2.3288713262533958, + "grad_norm": 0.46477016057992765, + "learning_rate": 4.854701835567309e-06, + "loss": 0.5174, + "step": 5937 + }, + { + "epoch": 2.329266485551988, + "grad_norm": 0.4434767608118626, + "learning_rate": 4.8546491972904354e-06, + "loss": 0.4904, + "step": 5938 + }, + { + "epoch": 2.3296616448505802, + "grad_norm": 0.4481318939794802, + "learning_rate": 4.854596549765929e-06, + "loss": 0.5045, + "step": 5939 + }, + { + "epoch": 2.3300568041491725, + "grad_norm": 0.4534393794450779, + "learning_rate": 4.8545438929939985e-06, + "loss": 0.4957, + "step": 5940 + }, + { + "epoch": 2.3304519634477647, + "grad_norm": 0.4687124392558075, + "learning_rate": 4.854491226974848e-06, + "loss": 0.5176, + "step": 5941 + }, + { + "epoch": 2.330847122746357, + "grad_norm": 0.6355213440329489, + "learning_rate": 4.854438551708686e-06, + "loss": 0.5251, + "step": 5942 + }, + { + "epoch": 2.331242282044949, + "grad_norm": 0.45957482273724554, + "learning_rate": 4.854385867195719e-06, + "loss": 0.4918, + "step": 5943 + }, + { + "epoch": 2.3316374413435415, + "grad_norm": 0.46063015196640106, + "learning_rate": 4.854333173436154e-06, + "loss": 0.5072, + "step": 5944 + }, + { + "epoch": 2.3320326006421337, + "grad_norm": 0.46027939590902706, + "learning_rate": 4.854280470430199e-06, + "loss": 0.5212, + "step": 5945 + }, + { + "epoch": 2.332427759940726, + "grad_norm": 0.4697655644185264, + "learning_rate": 4.854227758178058e-06, + "loss": 0.4959, + "step": 5946 + }, + { + "epoch": 2.332822919239318, + "grad_norm": 0.45342411422226236, + "learning_rate": 4.854175036679941e-06, + "loss": 0.5064, + "step": 5947 + }, + { + "epoch": 2.3332180785379104, + "grad_norm": 0.4534823543771811, + "learning_rate": 4.854122305936054e-06, + "loss": 0.5074, + "step": 5948 + }, + { + "epoch": 2.3336132378365027, + "grad_norm": 0.46415220608251223, + "learning_rate": 4.8540695659466045e-06, + "loss": 0.4908, + "step": 5949 + }, + { + "epoch": 2.334008397135095, + "grad_norm": 0.4555359616647736, + "learning_rate": 4.854016816711799e-06, + "loss": 0.5075, + "step": 5950 + }, + { + "epoch": 2.3344035564336876, + "grad_norm": 0.4602201174143169, + "learning_rate": 4.853964058231844e-06, + "loss": 0.5022, + "step": 5951 + }, + { + "epoch": 2.33479871573228, + "grad_norm": 0.445668129634111, + "learning_rate": 4.853911290506949e-06, + "loss": 0.4977, + "step": 5952 + }, + { + "epoch": 2.335193875030872, + "grad_norm": 0.4677802262684022, + "learning_rate": 4.853858513537319e-06, + "loss": 0.5155, + "step": 5953 + }, + { + "epoch": 2.3355890343294643, + "grad_norm": 0.46379592618135573, + "learning_rate": 4.853805727323162e-06, + "loss": 0.5065, + "step": 5954 + }, + { + "epoch": 2.3359841936280565, + "grad_norm": 0.4438035631679444, + "learning_rate": 4.853752931864685e-06, + "loss": 0.4997, + "step": 5955 + }, + { + "epoch": 2.336379352926649, + "grad_norm": 0.46352707418602923, + "learning_rate": 4.853700127162097e-06, + "loss": 0.499, + "step": 5956 + }, + { + "epoch": 2.336774512225241, + "grad_norm": 0.4428618122246438, + "learning_rate": 4.8536473132156025e-06, + "loss": 0.4866, + "step": 5957 + }, + { + "epoch": 2.3371696715238333, + "grad_norm": 0.4521576676325371, + "learning_rate": 4.8535944900254115e-06, + "loss": 0.5002, + "step": 5958 + }, + { + "epoch": 2.3375648308224255, + "grad_norm": 0.4626467646021148, + "learning_rate": 4.853541657591731e-06, + "loss": 0.5019, + "step": 5959 + }, + { + "epoch": 2.3379599901210177, + "grad_norm": 0.46120148479836487, + "learning_rate": 4.853488815914768e-06, + "loss": 0.5007, + "step": 5960 + }, + { + "epoch": 2.33835514941961, + "grad_norm": 0.4506914517703311, + "learning_rate": 4.85343596499473e-06, + "loss": 0.4884, + "step": 5961 + }, + { + "epoch": 2.338750308718202, + "grad_norm": 0.43940905649704776, + "learning_rate": 4.853383104831823e-06, + "loss": 0.4956, + "step": 5962 + }, + { + "epoch": 2.3391454680167945, + "grad_norm": 0.46981121876573256, + "learning_rate": 4.853330235426258e-06, + "loss": 0.5144, + "step": 5963 + }, + { + "epoch": 2.3395406273153867, + "grad_norm": 0.4910013553279312, + "learning_rate": 4.85327735677824e-06, + "loss": 0.5202, + "step": 5964 + }, + { + "epoch": 2.339935786613979, + "grad_norm": 0.45060719865992904, + "learning_rate": 4.853224468887978e-06, + "loss": 0.502, + "step": 5965 + }, + { + "epoch": 2.340330945912571, + "grad_norm": 0.4553193621696011, + "learning_rate": 4.853171571755679e-06, + "loss": 0.4781, + "step": 5966 + }, + { + "epoch": 2.3407261052111634, + "grad_norm": 0.4515357883492096, + "learning_rate": 4.853118665381551e-06, + "loss": 0.4948, + "step": 5967 + }, + { + "epoch": 2.3411212645097557, + "grad_norm": 0.45077739345730894, + "learning_rate": 4.853065749765802e-06, + "loss": 0.494, + "step": 5968 + }, + { + "epoch": 2.341516423808348, + "grad_norm": 0.6697241027454435, + "learning_rate": 4.853012824908639e-06, + "loss": 0.5153, + "step": 5969 + }, + { + "epoch": 2.34191158310694, + "grad_norm": 0.46052939599652937, + "learning_rate": 4.852959890810271e-06, + "loss": 0.4899, + "step": 5970 + }, + { + "epoch": 2.3423067424055324, + "grad_norm": 0.4629470441078412, + "learning_rate": 4.852906947470905e-06, + "loss": 0.5122, + "step": 5971 + }, + { + "epoch": 2.3427019017041246, + "grad_norm": 0.45554485923089544, + "learning_rate": 4.8528539948907495e-06, + "loss": 0.5118, + "step": 5972 + }, + { + "epoch": 2.343097061002717, + "grad_norm": 0.46703177457997613, + "learning_rate": 4.8528010330700125e-06, + "loss": 0.4909, + "step": 5973 + }, + { + "epoch": 2.343492220301309, + "grad_norm": 0.4592646549588022, + "learning_rate": 4.852748062008901e-06, + "loss": 0.4904, + "step": 5974 + }, + { + "epoch": 2.3438873795999013, + "grad_norm": 0.474115303348079, + "learning_rate": 4.8526950817076244e-06, + "loss": 0.5038, + "step": 5975 + }, + { + "epoch": 2.3442825388984936, + "grad_norm": 0.48149869168465126, + "learning_rate": 4.85264209216639e-06, + "loss": 0.5108, + "step": 5976 + }, + { + "epoch": 2.344677698197086, + "grad_norm": 0.45837357450934973, + "learning_rate": 4.852589093385406e-06, + "loss": 0.5158, + "step": 5977 + }, + { + "epoch": 2.345072857495678, + "grad_norm": 0.44593351395167496, + "learning_rate": 4.852536085364881e-06, + "loss": 0.4967, + "step": 5978 + }, + { + "epoch": 2.3454680167942703, + "grad_norm": 0.4531644610848413, + "learning_rate": 4.852483068105022e-06, + "loss": 0.4981, + "step": 5979 + }, + { + "epoch": 2.3458631760928625, + "grad_norm": 0.4595627732654401, + "learning_rate": 4.852430041606039e-06, + "loss": 0.4888, + "step": 5980 + }, + { + "epoch": 2.346258335391455, + "grad_norm": 0.4581903361601766, + "learning_rate": 4.852377005868138e-06, + "loss": 0.4822, + "step": 5981 + }, + { + "epoch": 2.346653494690047, + "grad_norm": 0.4483989827690783, + "learning_rate": 4.85232396089153e-06, + "loss": 0.5124, + "step": 5982 + }, + { + "epoch": 2.3470486539886393, + "grad_norm": 0.45908454030513607, + "learning_rate": 4.8522709066764204e-06, + "loss": 0.5086, + "step": 5983 + }, + { + "epoch": 2.3474438132872315, + "grad_norm": 0.4583400146391861, + "learning_rate": 4.85221784322302e-06, + "loss": 0.5112, + "step": 5984 + }, + { + "epoch": 2.3478389725858237, + "grad_norm": 0.45763389698982093, + "learning_rate": 4.852164770531536e-06, + "loss": 0.5045, + "step": 5985 + }, + { + "epoch": 2.348234131884416, + "grad_norm": 0.460537227961999, + "learning_rate": 4.852111688602177e-06, + "loss": 0.5215, + "step": 5986 + }, + { + "epoch": 2.3486292911830082, + "grad_norm": 0.46679943581021927, + "learning_rate": 4.852058597435152e-06, + "loss": 0.5116, + "step": 5987 + }, + { + "epoch": 2.3490244504816005, + "grad_norm": 0.44817207334568054, + "learning_rate": 4.852005497030669e-06, + "loss": 0.5015, + "step": 5988 + }, + { + "epoch": 2.3494196097801927, + "grad_norm": 0.4489419200289153, + "learning_rate": 4.851952387388936e-06, + "loss": 0.5115, + "step": 5989 + }, + { + "epoch": 2.349814769078785, + "grad_norm": 0.45473940760337633, + "learning_rate": 4.851899268510163e-06, + "loss": 0.4971, + "step": 5990 + }, + { + "epoch": 2.350209928377377, + "grad_norm": 0.4595345160766586, + "learning_rate": 4.851846140394557e-06, + "loss": 0.5165, + "step": 5991 + }, + { + "epoch": 2.3506050876759694, + "grad_norm": 0.45078645033514275, + "learning_rate": 4.851793003042328e-06, + "loss": 0.4801, + "step": 5992 + }, + { + "epoch": 2.3510002469745617, + "grad_norm": 0.46083912000399313, + "learning_rate": 4.851739856453685e-06, + "loss": 0.506, + "step": 5993 + }, + { + "epoch": 2.351395406273154, + "grad_norm": 0.456612483620041, + "learning_rate": 4.851686700628834e-06, + "loss": 0.5031, + "step": 5994 + }, + { + "epoch": 2.351790565571746, + "grad_norm": 0.4617876715575406, + "learning_rate": 4.851633535567987e-06, + "loss": 0.5115, + "step": 5995 + }, + { + "epoch": 2.3521857248703384, + "grad_norm": 0.5326660248295265, + "learning_rate": 4.851580361271351e-06, + "loss": 0.5117, + "step": 5996 + }, + { + "epoch": 2.3525808841689306, + "grad_norm": 0.4613788668226646, + "learning_rate": 4.851527177739135e-06, + "loss": 0.5209, + "step": 5997 + }, + { + "epoch": 2.352976043467523, + "grad_norm": 0.44443451469751716, + "learning_rate": 4.851473984971549e-06, + "loss": 0.4932, + "step": 5998 + }, + { + "epoch": 2.353371202766115, + "grad_norm": 0.45353874492848084, + "learning_rate": 4.851420782968801e-06, + "loss": 0.512, + "step": 5999 + }, + { + "epoch": 2.3537663620647074, + "grad_norm": 0.45136448249781186, + "learning_rate": 4.8513675717311e-06, + "loss": 0.4939, + "step": 6000 + }, + { + "epoch": 2.3541615213632996, + "grad_norm": 0.4516803569929796, + "learning_rate": 4.851314351258654e-06, + "loss": 0.5113, + "step": 6001 + }, + { + "epoch": 2.354556680661892, + "grad_norm": 0.4561308566297529, + "learning_rate": 4.851261121551674e-06, + "loss": 0.4967, + "step": 6002 + }, + { + "epoch": 2.354951839960484, + "grad_norm": 0.5260342783696013, + "learning_rate": 4.8512078826103675e-06, + "loss": 0.4982, + "step": 6003 + }, + { + "epoch": 2.3553469992590763, + "grad_norm": 0.47897271506812045, + "learning_rate": 4.8511546344349444e-06, + "loss": 0.5193, + "step": 6004 + }, + { + "epoch": 2.3557421585576686, + "grad_norm": 0.4638833627455354, + "learning_rate": 4.851101377025614e-06, + "loss": 0.4941, + "step": 6005 + }, + { + "epoch": 2.356137317856261, + "grad_norm": 0.4427664156009943, + "learning_rate": 4.8510481103825845e-06, + "loss": 0.4878, + "step": 6006 + }, + { + "epoch": 2.356532477154853, + "grad_norm": 0.44716700666783465, + "learning_rate": 4.850994834506065e-06, + "loss": 0.4942, + "step": 6007 + }, + { + "epoch": 2.3569276364534453, + "grad_norm": 0.45290206397069754, + "learning_rate": 4.850941549396267e-06, + "loss": 0.504, + "step": 6008 + }, + { + "epoch": 2.3573227957520375, + "grad_norm": 0.4624224635836012, + "learning_rate": 4.850888255053398e-06, + "loss": 0.5128, + "step": 6009 + }, + { + "epoch": 2.3577179550506298, + "grad_norm": 0.4685159236259062, + "learning_rate": 4.850834951477666e-06, + "loss": 0.5056, + "step": 6010 + }, + { + "epoch": 2.358113114349222, + "grad_norm": 0.4637190744227209, + "learning_rate": 4.850781638669283e-06, + "loss": 0.4789, + "step": 6011 + }, + { + "epoch": 2.3585082736478142, + "grad_norm": 0.4519731150582127, + "learning_rate": 4.850728316628457e-06, + "loss": 0.5129, + "step": 6012 + }, + { + "epoch": 2.3589034329464065, + "grad_norm": 0.4538513437911314, + "learning_rate": 4.8506749853553974e-06, + "loss": 0.4994, + "step": 6013 + }, + { + "epoch": 2.3592985922449987, + "grad_norm": 0.46979489205654373, + "learning_rate": 4.850621644850314e-06, + "loss": 0.5151, + "step": 6014 + }, + { + "epoch": 2.359693751543591, + "grad_norm": 0.45746167900887663, + "learning_rate": 4.850568295113416e-06, + "loss": 0.5168, + "step": 6015 + }, + { + "epoch": 2.360088910842183, + "grad_norm": 0.4575481082495061, + "learning_rate": 4.850514936144913e-06, + "loss": 0.5072, + "step": 6016 + }, + { + "epoch": 2.3604840701407754, + "grad_norm": 0.45975740355910427, + "learning_rate": 4.850461567945015e-06, + "loss": 0.5, + "step": 6017 + }, + { + "epoch": 2.3608792294393677, + "grad_norm": 0.4748694051073642, + "learning_rate": 4.850408190513931e-06, + "loss": 0.5029, + "step": 6018 + }, + { + "epoch": 2.36127438873796, + "grad_norm": 0.4914980276998342, + "learning_rate": 4.850354803851871e-06, + "loss": 0.5254, + "step": 6019 + }, + { + "epoch": 2.361669548036552, + "grad_norm": 0.4800284534033041, + "learning_rate": 4.850301407959045e-06, + "loss": 0.5161, + "step": 6020 + }, + { + "epoch": 2.3620647073351444, + "grad_norm": 0.4462205733041414, + "learning_rate": 4.8502480028356615e-06, + "loss": 0.5042, + "step": 6021 + }, + { + "epoch": 2.3624598666337366, + "grad_norm": 0.4709204676815143, + "learning_rate": 4.850194588481931e-06, + "loss": 0.502, + "step": 6022 + }, + { + "epoch": 2.362855025932329, + "grad_norm": 0.4568772714716113, + "learning_rate": 4.8501411648980635e-06, + "loss": 0.4653, + "step": 6023 + }, + { + "epoch": 2.363250185230921, + "grad_norm": 0.4547015608724961, + "learning_rate": 4.850087732084269e-06, + "loss": 0.5, + "step": 6024 + }, + { + "epoch": 2.3636453445295134, + "grad_norm": 0.480326169527228, + "learning_rate": 4.850034290040756e-06, + "loss": 0.5118, + "step": 6025 + }, + { + "epoch": 2.3640405038281056, + "grad_norm": 0.4598744970305476, + "learning_rate": 4.849980838767736e-06, + "loss": 0.498, + "step": 6026 + }, + { + "epoch": 2.364435663126698, + "grad_norm": 0.4600510631142906, + "learning_rate": 4.849927378265418e-06, + "loss": 0.4725, + "step": 6027 + }, + { + "epoch": 2.36483082242529, + "grad_norm": 0.4633225031808518, + "learning_rate": 4.8498739085340125e-06, + "loss": 0.5029, + "step": 6028 + }, + { + "epoch": 2.3652259817238823, + "grad_norm": 0.4497673903824722, + "learning_rate": 4.849820429573729e-06, + "loss": 0.5061, + "step": 6029 + }, + { + "epoch": 2.3656211410224746, + "grad_norm": 0.45283626682133, + "learning_rate": 4.849766941384777e-06, + "loss": 0.5035, + "step": 6030 + }, + { + "epoch": 2.366016300321067, + "grad_norm": 0.45318111987548476, + "learning_rate": 4.8497134439673685e-06, + "loss": 0.5059, + "step": 6031 + }, + { + "epoch": 2.366411459619659, + "grad_norm": 0.45785207671037326, + "learning_rate": 4.849659937321713e-06, + "loss": 0.527, + "step": 6032 + }, + { + "epoch": 2.3668066189182513, + "grad_norm": 0.45427024013798384, + "learning_rate": 4.849606421448018e-06, + "loss": 0.5253, + "step": 6033 + }, + { + "epoch": 2.3672017782168435, + "grad_norm": 0.45732742980462926, + "learning_rate": 4.849552896346497e-06, + "loss": 0.504, + "step": 6034 + }, + { + "epoch": 2.3675969375154358, + "grad_norm": 0.5128918032034719, + "learning_rate": 4.849499362017359e-06, + "loss": 0.5012, + "step": 6035 + }, + { + "epoch": 2.367992096814028, + "grad_norm": 0.46442492874120134, + "learning_rate": 4.8494458184608135e-06, + "loss": 0.4985, + "step": 6036 + }, + { + "epoch": 2.3683872561126202, + "grad_norm": 0.44382409744050355, + "learning_rate": 4.849392265677072e-06, + "loss": 0.4984, + "step": 6037 + }, + { + "epoch": 2.3687824154112125, + "grad_norm": 0.4417781251327167, + "learning_rate": 4.8493387036663445e-06, + "loss": 0.4904, + "step": 6038 + }, + { + "epoch": 2.3691775747098047, + "grad_norm": 0.46452188611272743, + "learning_rate": 4.84928513242884e-06, + "loss": 0.5057, + "step": 6039 + }, + { + "epoch": 2.369572734008397, + "grad_norm": 0.4939272964413806, + "learning_rate": 4.849231551964771e-06, + "loss": 0.5275, + "step": 6040 + }, + { + "epoch": 2.369967893306989, + "grad_norm": 0.4656217747759951, + "learning_rate": 4.849177962274348e-06, + "loss": 0.4959, + "step": 6041 + }, + { + "epoch": 2.3703630526055814, + "grad_norm": 0.45451303351162, + "learning_rate": 4.8491243633577785e-06, + "loss": 0.4827, + "step": 6042 + }, + { + "epoch": 2.3707582119041737, + "grad_norm": 0.46682609497280303, + "learning_rate": 4.849070755215276e-06, + "loss": 0.4905, + "step": 6043 + }, + { + "epoch": 2.371153371202766, + "grad_norm": 0.451485975019632, + "learning_rate": 4.849017137847049e-06, + "loss": 0.5045, + "step": 6044 + }, + { + "epoch": 2.371548530501358, + "grad_norm": 0.4631255854420321, + "learning_rate": 4.84896351125331e-06, + "loss": 0.5326, + "step": 6045 + }, + { + "epoch": 2.3719436897999504, + "grad_norm": 0.4571223855093204, + "learning_rate": 4.848909875434269e-06, + "loss": 0.4987, + "step": 6046 + }, + { + "epoch": 2.3723388490985426, + "grad_norm": 0.4680677582589837, + "learning_rate": 4.848856230390137e-06, + "loss": 0.5158, + "step": 6047 + }, + { + "epoch": 2.372734008397135, + "grad_norm": 0.4446257712595702, + "learning_rate": 4.848802576121122e-06, + "loss": 0.5033, + "step": 6048 + }, + { + "epoch": 2.373129167695727, + "grad_norm": 0.4630473866895991, + "learning_rate": 4.848748912627438e-06, + "loss": 0.5042, + "step": 6049 + }, + { + "epoch": 2.3735243269943194, + "grad_norm": 0.44778383563697305, + "learning_rate": 4.848695239909295e-06, + "loss": 0.4975, + "step": 6050 + }, + { + "epoch": 2.3739194862929116, + "grad_norm": 0.4385248274134849, + "learning_rate": 4.848641557966902e-06, + "loss": 0.4859, + "step": 6051 + }, + { + "epoch": 2.374314645591504, + "grad_norm": 0.4527513035838103, + "learning_rate": 4.848587866800472e-06, + "loss": 0.4834, + "step": 6052 + }, + { + "epoch": 2.374709804890096, + "grad_norm": 0.45987272190820844, + "learning_rate": 4.8485341664102146e-06, + "loss": 0.4934, + "step": 6053 + }, + { + "epoch": 2.3751049641886883, + "grad_norm": 0.44214190083964766, + "learning_rate": 4.84848045679634e-06, + "loss": 0.4863, + "step": 6054 + }, + { + "epoch": 2.3755001234872806, + "grad_norm": 0.47023720851459105, + "learning_rate": 4.848426737959062e-06, + "loss": 0.5147, + "step": 6055 + }, + { + "epoch": 2.375895282785873, + "grad_norm": 0.4536759734264281, + "learning_rate": 4.848373009898589e-06, + "loss": 0.5058, + "step": 6056 + }, + { + "epoch": 2.3762904420844655, + "grad_norm": 0.46339461388038883, + "learning_rate": 4.848319272615134e-06, + "loss": 0.5292, + "step": 6057 + }, + { + "epoch": 2.3766856013830577, + "grad_norm": 0.43958334885581213, + "learning_rate": 4.848265526108906e-06, + "loss": 0.485, + "step": 6058 + }, + { + "epoch": 2.37708076068165, + "grad_norm": 0.4442145240300568, + "learning_rate": 4.848211770380117e-06, + "loss": 0.4961, + "step": 6059 + }, + { + "epoch": 2.377475919980242, + "grad_norm": 0.47607365592400747, + "learning_rate": 4.848158005428978e-06, + "loss": 0.5151, + "step": 6060 + }, + { + "epoch": 2.3778710792788345, + "grad_norm": 0.4512904547194257, + "learning_rate": 4.8481042312557e-06, + "loss": 0.4923, + "step": 6061 + }, + { + "epoch": 2.3782662385774267, + "grad_norm": 0.44456571012970686, + "learning_rate": 4.8480504478604946e-06, + "loss": 0.5055, + "step": 6062 + }, + { + "epoch": 2.378661397876019, + "grad_norm": 0.4560502078542866, + "learning_rate": 4.847996655243572e-06, + "loss": 0.4988, + "step": 6063 + }, + { + "epoch": 2.379056557174611, + "grad_norm": 0.45340332096358, + "learning_rate": 4.847942853405146e-06, + "loss": 0.4981, + "step": 6064 + }, + { + "epoch": 2.3794517164732034, + "grad_norm": 0.4581291968585293, + "learning_rate": 4.847889042345425e-06, + "loss": 0.4771, + "step": 6065 + }, + { + "epoch": 2.3798468757717957, + "grad_norm": 0.459995603273202, + "learning_rate": 4.8478352220646215e-06, + "loss": 0.4929, + "step": 6066 + }, + { + "epoch": 2.380242035070388, + "grad_norm": 0.4474153131280138, + "learning_rate": 4.847781392562948e-06, + "loss": 0.5069, + "step": 6067 + }, + { + "epoch": 2.38063719436898, + "grad_norm": 0.4493358041327097, + "learning_rate": 4.847727553840615e-06, + "loss": 0.4881, + "step": 6068 + }, + { + "epoch": 2.3810323536675724, + "grad_norm": 0.45233974550022343, + "learning_rate": 4.847673705897832e-06, + "loss": 0.5081, + "step": 6069 + }, + { + "epoch": 2.3814275129661646, + "grad_norm": 0.5070469825872892, + "learning_rate": 4.847619848734814e-06, + "loss": 0.5149, + "step": 6070 + }, + { + "epoch": 2.381822672264757, + "grad_norm": 0.4565568022870259, + "learning_rate": 4.84756598235177e-06, + "loss": 0.4859, + "step": 6071 + }, + { + "epoch": 2.382217831563349, + "grad_norm": 0.4871291260740165, + "learning_rate": 4.847512106748912e-06, + "loss": 0.5239, + "step": 6072 + }, + { + "epoch": 2.3826129908619413, + "grad_norm": 0.4504152728422771, + "learning_rate": 4.847458221926453e-06, + "loss": 0.5099, + "step": 6073 + }, + { + "epoch": 2.3830081501605336, + "grad_norm": 0.4591280231930706, + "learning_rate": 4.847404327884603e-06, + "loss": 0.4842, + "step": 6074 + }, + { + "epoch": 2.383403309459126, + "grad_norm": 0.45851366936986665, + "learning_rate": 4.847350424623574e-06, + "loss": 0.5088, + "step": 6075 + }, + { + "epoch": 2.383798468757718, + "grad_norm": 0.4634869558586333, + "learning_rate": 4.847296512143577e-06, + "loss": 0.5131, + "step": 6076 + }, + { + "epoch": 2.3841936280563103, + "grad_norm": 0.45304642062746575, + "learning_rate": 4.847242590444826e-06, + "loss": 0.4977, + "step": 6077 + }, + { + "epoch": 2.3845887873549025, + "grad_norm": 0.4561346179180274, + "learning_rate": 4.847188659527532e-06, + "loss": 0.5047, + "step": 6078 + }, + { + "epoch": 2.384983946653495, + "grad_norm": 0.4526743884370499, + "learning_rate": 4.847134719391905e-06, + "loss": 0.4914, + "step": 6079 + }, + { + "epoch": 2.385379105952087, + "grad_norm": 0.46407896971341867, + "learning_rate": 4.847080770038158e-06, + "loss": 0.5089, + "step": 6080 + }, + { + "epoch": 2.3857742652506793, + "grad_norm": 0.45258104010987427, + "learning_rate": 4.847026811466504e-06, + "loss": 0.5022, + "step": 6081 + }, + { + "epoch": 2.3861694245492715, + "grad_norm": 0.461969126357832, + "learning_rate": 4.846972843677153e-06, + "loss": 0.497, + "step": 6082 + }, + { + "epoch": 2.3865645838478637, + "grad_norm": 0.4492623434194907, + "learning_rate": 4.846918866670318e-06, + "loss": 0.4979, + "step": 6083 + }, + { + "epoch": 2.386959743146456, + "grad_norm": 0.4571175682930661, + "learning_rate": 4.846864880446211e-06, + "loss": 0.5196, + "step": 6084 + }, + { + "epoch": 2.387354902445048, + "grad_norm": 0.45913437019988257, + "learning_rate": 4.8468108850050436e-06, + "loss": 0.5046, + "step": 6085 + }, + { + "epoch": 2.3877500617436405, + "grad_norm": 0.45074777593560994, + "learning_rate": 4.846756880347029e-06, + "loss": 0.4989, + "step": 6086 + }, + { + "epoch": 2.3881452210422327, + "grad_norm": 0.44092077546541725, + "learning_rate": 4.846702866472377e-06, + "loss": 0.4794, + "step": 6087 + }, + { + "epoch": 2.388540380340825, + "grad_norm": 0.46155684618109705, + "learning_rate": 4.846648843381302e-06, + "loss": 0.5089, + "step": 6088 + }, + { + "epoch": 2.388935539639417, + "grad_norm": 0.46885706820416534, + "learning_rate": 4.846594811074015e-06, + "loss": 0.5171, + "step": 6089 + }, + { + "epoch": 2.3893306989380094, + "grad_norm": 0.43931037237601184, + "learning_rate": 4.846540769550728e-06, + "loss": 0.4882, + "step": 6090 + }, + { + "epoch": 2.3897258582366017, + "grad_norm": 0.4353274011341564, + "learning_rate": 4.8464867188116545e-06, + "loss": 0.482, + "step": 6091 + }, + { + "epoch": 2.390121017535194, + "grad_norm": 0.47037433987558747, + "learning_rate": 4.846432658857006e-06, + "loss": 0.4994, + "step": 6092 + }, + { + "epoch": 2.390516176833786, + "grad_norm": 0.4593847264977308, + "learning_rate": 4.846378589686995e-06, + "loss": 0.507, + "step": 6093 + }, + { + "epoch": 2.3909113361323784, + "grad_norm": 0.4601985413381888, + "learning_rate": 4.846324511301834e-06, + "loss": 0.5023, + "step": 6094 + }, + { + "epoch": 2.3913064954309706, + "grad_norm": 0.452640709655825, + "learning_rate": 4.846270423701734e-06, + "loss": 0.4995, + "step": 6095 + }, + { + "epoch": 2.391701654729563, + "grad_norm": 0.45638264701889014, + "learning_rate": 4.846216326886909e-06, + "loss": 0.4959, + "step": 6096 + }, + { + "epoch": 2.392096814028155, + "grad_norm": 0.4826711924842887, + "learning_rate": 4.846162220857571e-06, + "loss": 0.5177, + "step": 6097 + }, + { + "epoch": 2.3924919733267473, + "grad_norm": 0.4888215683792986, + "learning_rate": 4.846108105613932e-06, + "loss": 0.5008, + "step": 6098 + }, + { + "epoch": 2.3928871326253396, + "grad_norm": 0.46320608794522933, + "learning_rate": 4.8460539811562055e-06, + "loss": 0.5089, + "step": 6099 + }, + { + "epoch": 2.393282291923932, + "grad_norm": 0.45725756454431987, + "learning_rate": 4.845999847484604e-06, + "loss": 0.5203, + "step": 6100 + }, + { + "epoch": 2.393677451222524, + "grad_norm": 0.44991333274030526, + "learning_rate": 4.84594570459934e-06, + "loss": 0.4848, + "step": 6101 + }, + { + "epoch": 2.3940726105211163, + "grad_norm": 0.4600972177592437, + "learning_rate": 4.845891552500625e-06, + "loss": 0.5093, + "step": 6102 + }, + { + "epoch": 2.3944677698197085, + "grad_norm": 0.5860812755960613, + "learning_rate": 4.8458373911886716e-06, + "loss": 0.5069, + "step": 6103 + }, + { + "epoch": 2.394862929118301, + "grad_norm": 0.4538835124457885, + "learning_rate": 4.845783220663694e-06, + "loss": 0.4874, + "step": 6104 + }, + { + "epoch": 2.395258088416893, + "grad_norm": 0.45218392628786025, + "learning_rate": 4.845729040925905e-06, + "loss": 0.496, + "step": 6105 + }, + { + "epoch": 2.3956532477154853, + "grad_norm": 0.4493156953117064, + "learning_rate": 4.845674851975516e-06, + "loss": 0.5225, + "step": 6106 + }, + { + "epoch": 2.3960484070140775, + "grad_norm": 0.4721248920273722, + "learning_rate": 4.845620653812742e-06, + "loss": 0.503, + "step": 6107 + }, + { + "epoch": 2.3964435663126697, + "grad_norm": 0.4702634244511809, + "learning_rate": 4.845566446437793e-06, + "loss": 0.509, + "step": 6108 + }, + { + "epoch": 2.396838725611262, + "grad_norm": 0.4517371761872178, + "learning_rate": 4.845512229850883e-06, + "loss": 0.5005, + "step": 6109 + }, + { + "epoch": 2.3972338849098542, + "grad_norm": 0.45784931465490497, + "learning_rate": 4.845458004052226e-06, + "loss": 0.5292, + "step": 6110 + }, + { + "epoch": 2.3976290442084465, + "grad_norm": 0.47366039215477546, + "learning_rate": 4.845403769042034e-06, + "loss": 0.4974, + "step": 6111 + }, + { + "epoch": 2.3980242035070387, + "grad_norm": 0.45191947098684676, + "learning_rate": 4.8453495248205205e-06, + "loss": 0.5141, + "step": 6112 + }, + { + "epoch": 2.398419362805631, + "grad_norm": 0.4713607350755517, + "learning_rate": 4.845295271387897e-06, + "loss": 0.4976, + "step": 6113 + }, + { + "epoch": 2.398814522104223, + "grad_norm": 0.4660999045638153, + "learning_rate": 4.84524100874438e-06, + "loss": 0.5113, + "step": 6114 + }, + { + "epoch": 2.3992096814028154, + "grad_norm": 0.47015668850310877, + "learning_rate": 4.845186736890179e-06, + "loss": 0.4907, + "step": 6115 + }, + { + "epoch": 2.3996048407014077, + "grad_norm": 0.46766792350712855, + "learning_rate": 4.845132455825508e-06, + "loss": 0.513, + "step": 6116 + }, + { + "epoch": 2.4, + "grad_norm": 1.0836917520596936, + "learning_rate": 4.8450781655505815e-06, + "loss": 0.4935, + "step": 6117 + }, + { + "epoch": 2.400395159298592, + "grad_norm": 0.46151727429931927, + "learning_rate": 4.845023866065612e-06, + "loss": 0.5011, + "step": 6118 + }, + { + "epoch": 2.4007903185971844, + "grad_norm": 0.4605620408152997, + "learning_rate": 4.844969557370813e-06, + "loss": 0.5045, + "step": 6119 + }, + { + "epoch": 2.4011854778957766, + "grad_norm": 0.45858291441150556, + "learning_rate": 4.844915239466398e-06, + "loss": 0.5035, + "step": 6120 + }, + { + "epoch": 2.401580637194369, + "grad_norm": 0.4572405133813424, + "learning_rate": 4.844860912352579e-06, + "loss": 0.4987, + "step": 6121 + }, + { + "epoch": 2.401975796492961, + "grad_norm": 0.4532962343436789, + "learning_rate": 4.844806576029571e-06, + "loss": 0.5168, + "step": 6122 + }, + { + "epoch": 2.4023709557915534, + "grad_norm": 0.44424364135152433, + "learning_rate": 4.844752230497586e-06, + "loss": 0.4874, + "step": 6123 + }, + { + "epoch": 2.4027661150901456, + "grad_norm": 0.45178951408739765, + "learning_rate": 4.844697875756837e-06, + "loss": 0.5028, + "step": 6124 + }, + { + "epoch": 2.403161274388738, + "grad_norm": 0.46131443978892384, + "learning_rate": 4.844643511807539e-06, + "loss": 0.4997, + "step": 6125 + }, + { + "epoch": 2.40355643368733, + "grad_norm": 0.44557753095639435, + "learning_rate": 4.844589138649906e-06, + "loss": 0.4961, + "step": 6126 + }, + { + "epoch": 2.4039515929859223, + "grad_norm": 0.44660632251012705, + "learning_rate": 4.84453475628415e-06, + "loss": 0.5055, + "step": 6127 + }, + { + "epoch": 2.4043467522845146, + "grad_norm": 0.46297338232365337, + "learning_rate": 4.844480364710486e-06, + "loss": 0.5106, + "step": 6128 + }, + { + "epoch": 2.404741911583107, + "grad_norm": 0.4546931439519957, + "learning_rate": 4.844425963929126e-06, + "loss": 0.507, + "step": 6129 + }, + { + "epoch": 2.405137070881699, + "grad_norm": 0.4393631016625778, + "learning_rate": 4.844371553940284e-06, + "loss": 0.5011, + "step": 6130 + }, + { + "epoch": 2.4055322301802913, + "grad_norm": 0.44198752628335675, + "learning_rate": 4.844317134744174e-06, + "loss": 0.5045, + "step": 6131 + }, + { + "epoch": 2.4059273894788835, + "grad_norm": 0.4529021301640311, + "learning_rate": 4.844262706341011e-06, + "loss": 0.5139, + "step": 6132 + }, + { + "epoch": 2.4063225487774758, + "grad_norm": 0.46321469858495373, + "learning_rate": 4.844208268731007e-06, + "loss": 0.4949, + "step": 6133 + }, + { + "epoch": 2.406717708076068, + "grad_norm": 0.5352727595143272, + "learning_rate": 4.8441538219143765e-06, + "loss": 0.5112, + "step": 6134 + }, + { + "epoch": 2.4071128673746602, + "grad_norm": 0.45404512767050736, + "learning_rate": 4.844099365891333e-06, + "loss": 0.5071, + "step": 6135 + }, + { + "epoch": 2.4075080266732525, + "grad_norm": 0.44661826142521477, + "learning_rate": 4.844044900662091e-06, + "loss": 0.4842, + "step": 6136 + }, + { + "epoch": 2.4079031859718447, + "grad_norm": 0.4866476660219383, + "learning_rate": 4.843990426226864e-06, + "loss": 0.4978, + "step": 6137 + }, + { + "epoch": 2.408298345270437, + "grad_norm": 0.48353551024417507, + "learning_rate": 4.843935942585865e-06, + "loss": 0.5364, + "step": 6138 + }, + { + "epoch": 2.408693504569029, + "grad_norm": 0.4710485363082892, + "learning_rate": 4.84388144973931e-06, + "loss": 0.503, + "step": 6139 + }, + { + "epoch": 2.409088663867622, + "grad_norm": 0.4574698914104126, + "learning_rate": 4.843826947687412e-06, + "loss": 0.5185, + "step": 6140 + }, + { + "epoch": 2.409483823166214, + "grad_norm": 0.46016949246716193, + "learning_rate": 4.843772436430384e-06, + "loss": 0.5148, + "step": 6141 + }, + { + "epoch": 2.4098789824648064, + "grad_norm": 0.4482322493892279, + "learning_rate": 4.843717915968442e-06, + "loss": 0.5046, + "step": 6142 + }, + { + "epoch": 2.4102741417633986, + "grad_norm": 0.45116521545047983, + "learning_rate": 4.843663386301799e-06, + "loss": 0.515, + "step": 6143 + }, + { + "epoch": 2.410669301061991, + "grad_norm": 0.4590353660506984, + "learning_rate": 4.843608847430669e-06, + "loss": 0.5015, + "step": 6144 + }, + { + "epoch": 2.411064460360583, + "grad_norm": 0.4509801127921544, + "learning_rate": 4.843554299355267e-06, + "loss": 0.5036, + "step": 6145 + }, + { + "epoch": 2.4114596196591753, + "grad_norm": 0.46483479028215885, + "learning_rate": 4.8434997420758065e-06, + "loss": 0.5002, + "step": 6146 + }, + { + "epoch": 2.4118547789577676, + "grad_norm": 0.4594113983493731, + "learning_rate": 4.843445175592502e-06, + "loss": 0.5072, + "step": 6147 + }, + { + "epoch": 2.41224993825636, + "grad_norm": 0.45010607482721837, + "learning_rate": 4.843390599905568e-06, + "loss": 0.509, + "step": 6148 + }, + { + "epoch": 2.412645097554952, + "grad_norm": 0.47368631625572943, + "learning_rate": 4.843336015015218e-06, + "loss": 0.5363, + "step": 6149 + }, + { + "epoch": 2.4130402568535443, + "grad_norm": 0.4583173953744174, + "learning_rate": 4.843281420921668e-06, + "loss": 0.4834, + "step": 6150 + }, + { + "epoch": 2.4134354161521365, + "grad_norm": 0.46218829266694594, + "learning_rate": 4.843226817625132e-06, + "loss": 0.5132, + "step": 6151 + }, + { + "epoch": 2.4138305754507288, + "grad_norm": 0.46198156224588854, + "learning_rate": 4.843172205125824e-06, + "loss": 0.5132, + "step": 6152 + }, + { + "epoch": 2.414225734749321, + "grad_norm": 0.4703069352755098, + "learning_rate": 4.843117583423957e-06, + "loss": 0.5222, + "step": 6153 + }, + { + "epoch": 2.4146208940479132, + "grad_norm": 0.4614134551221174, + "learning_rate": 4.843062952519748e-06, + "loss": 0.4887, + "step": 6154 + }, + { + "epoch": 2.4150160533465055, + "grad_norm": 0.4697970415651049, + "learning_rate": 4.843008312413409e-06, + "loss": 0.5093, + "step": 6155 + }, + { + "epoch": 2.4154112126450977, + "grad_norm": 0.45775748905364666, + "learning_rate": 4.842953663105158e-06, + "loss": 0.4919, + "step": 6156 + }, + { + "epoch": 2.41580637194369, + "grad_norm": 0.45387013696855416, + "learning_rate": 4.8428990045952075e-06, + "loss": 0.4933, + "step": 6157 + }, + { + "epoch": 2.416201531242282, + "grad_norm": 0.49320450897769486, + "learning_rate": 4.842844336883772e-06, + "loss": 0.5058, + "step": 6158 + }, + { + "epoch": 2.4165966905408744, + "grad_norm": 0.4594972163377635, + "learning_rate": 4.842789659971065e-06, + "loss": 0.4956, + "step": 6159 + }, + { + "epoch": 2.4169918498394667, + "grad_norm": 0.44799652445453403, + "learning_rate": 4.842734973857305e-06, + "loss": 0.5003, + "step": 6160 + }, + { + "epoch": 2.417387009138059, + "grad_norm": 0.4555779832663966, + "learning_rate": 4.842680278542704e-06, + "loss": 0.4988, + "step": 6161 + }, + { + "epoch": 2.417782168436651, + "grad_norm": 0.46273844976885875, + "learning_rate": 4.8426255740274776e-06, + "loss": 0.4881, + "step": 6162 + }, + { + "epoch": 2.4181773277352434, + "grad_norm": 0.44452340365885856, + "learning_rate": 4.84257086031184e-06, + "loss": 0.5114, + "step": 6163 + }, + { + "epoch": 2.4185724870338356, + "grad_norm": 0.45595741947905993, + "learning_rate": 4.842516137396007e-06, + "loss": 0.4976, + "step": 6164 + }, + { + "epoch": 2.418967646332428, + "grad_norm": 0.45112785524674676, + "learning_rate": 4.842461405280192e-06, + "loss": 0.5021, + "step": 6165 + }, + { + "epoch": 2.41936280563102, + "grad_norm": 0.4540156886101404, + "learning_rate": 4.842406663964612e-06, + "loss": 0.4895, + "step": 6166 + }, + { + "epoch": 2.4197579649296124, + "grad_norm": 0.4514068135997412, + "learning_rate": 4.842351913449481e-06, + "loss": 0.5139, + "step": 6167 + }, + { + "epoch": 2.4201531242282046, + "grad_norm": 0.4612849080685796, + "learning_rate": 4.842297153735014e-06, + "loss": 0.4983, + "step": 6168 + }, + { + "epoch": 2.420548283526797, + "grad_norm": 0.47339152666834156, + "learning_rate": 4.842242384821426e-06, + "loss": 0.4881, + "step": 6169 + }, + { + "epoch": 2.420943442825389, + "grad_norm": 0.46244595327643095, + "learning_rate": 4.842187606708932e-06, + "loss": 0.5034, + "step": 6170 + }, + { + "epoch": 2.4213386021239813, + "grad_norm": 0.4386337093959035, + "learning_rate": 4.8421328193977475e-06, + "loss": 0.4869, + "step": 6171 + }, + { + "epoch": 2.4217337614225736, + "grad_norm": 0.4718123255716426, + "learning_rate": 4.842078022888088e-06, + "loss": 0.4913, + "step": 6172 + }, + { + "epoch": 2.422128920721166, + "grad_norm": 0.4612919926951648, + "learning_rate": 4.8420232171801675e-06, + "loss": 0.5129, + "step": 6173 + }, + { + "epoch": 2.422524080019758, + "grad_norm": 0.4486144324992224, + "learning_rate": 4.841968402274202e-06, + "loss": 0.5147, + "step": 6174 + }, + { + "epoch": 2.4229192393183503, + "grad_norm": 0.46891228322282674, + "learning_rate": 4.841913578170407e-06, + "loss": 0.5172, + "step": 6175 + }, + { + "epoch": 2.4233143986169425, + "grad_norm": 0.44889237277545324, + "learning_rate": 4.841858744868998e-06, + "loss": 0.511, + "step": 6176 + }, + { + "epoch": 2.4237095579155348, + "grad_norm": 0.45184987151406364, + "learning_rate": 4.841803902370189e-06, + "loss": 0.4915, + "step": 6177 + }, + { + "epoch": 2.424104717214127, + "grad_norm": 0.45539717660361717, + "learning_rate": 4.841749050674196e-06, + "loss": 0.508, + "step": 6178 + }, + { + "epoch": 2.4244998765127193, + "grad_norm": 0.46183897777708616, + "learning_rate": 4.841694189781235e-06, + "loss": 0.5147, + "step": 6179 + }, + { + "epoch": 2.4248950358113115, + "grad_norm": 0.4596909143183604, + "learning_rate": 4.841639319691522e-06, + "loss": 0.506, + "step": 6180 + }, + { + "epoch": 2.4252901951099037, + "grad_norm": 0.4446113633044188, + "learning_rate": 4.841584440405271e-06, + "loss": 0.5064, + "step": 6181 + }, + { + "epoch": 2.425685354408496, + "grad_norm": 0.45743332618448895, + "learning_rate": 4.841529551922699e-06, + "loss": 0.504, + "step": 6182 + }, + { + "epoch": 2.426080513707088, + "grad_norm": 0.4816180094305801, + "learning_rate": 4.84147465424402e-06, + "loss": 0.4925, + "step": 6183 + }, + { + "epoch": 2.4264756730056805, + "grad_norm": 0.4543170154356122, + "learning_rate": 4.84141974736945e-06, + "loss": 0.5244, + "step": 6184 + }, + { + "epoch": 2.4268708323042727, + "grad_norm": 0.448689306635801, + "learning_rate": 4.841364831299206e-06, + "loss": 0.511, + "step": 6185 + }, + { + "epoch": 2.427265991602865, + "grad_norm": 0.45167259627359596, + "learning_rate": 4.8413099060335026e-06, + "loss": 0.5084, + "step": 6186 + }, + { + "epoch": 2.427661150901457, + "grad_norm": 0.4610385677246653, + "learning_rate": 4.841254971572555e-06, + "loss": 0.5097, + "step": 6187 + }, + { + "epoch": 2.4280563102000494, + "grad_norm": 0.4709029211327468, + "learning_rate": 4.84120002791658e-06, + "loss": 0.5125, + "step": 6188 + }, + { + "epoch": 2.4284514694986417, + "grad_norm": 0.4585385642897026, + "learning_rate": 4.841145075065793e-06, + "loss": 0.5255, + "step": 6189 + }, + { + "epoch": 2.428846628797234, + "grad_norm": 0.45124097243480904, + "learning_rate": 4.841090113020409e-06, + "loss": 0.4851, + "step": 6190 + }, + { + "epoch": 2.429241788095826, + "grad_norm": 0.4589307579275915, + "learning_rate": 4.841035141780645e-06, + "loss": 0.5237, + "step": 6191 + }, + { + "epoch": 2.4296369473944184, + "grad_norm": 0.4590381604206053, + "learning_rate": 4.840980161346717e-06, + "loss": 0.502, + "step": 6192 + }, + { + "epoch": 2.4300321066930106, + "grad_norm": 0.4575028892209658, + "learning_rate": 4.84092517171884e-06, + "loss": 0.5317, + "step": 6193 + }, + { + "epoch": 2.430427265991603, + "grad_norm": 0.4410480910569929, + "learning_rate": 4.840870172897231e-06, + "loss": 0.5079, + "step": 6194 + }, + { + "epoch": 2.430822425290195, + "grad_norm": 0.45485313052571497, + "learning_rate": 4.840815164882104e-06, + "loss": 0.5069, + "step": 6195 + }, + { + "epoch": 2.4312175845887873, + "grad_norm": 0.45726694565940096, + "learning_rate": 4.840760147673678e-06, + "loss": 0.4904, + "step": 6196 + }, + { + "epoch": 2.4316127438873796, + "grad_norm": 0.4591479178991767, + "learning_rate": 4.8407051212721664e-06, + "loss": 0.5311, + "step": 6197 + }, + { + "epoch": 2.432007903185972, + "grad_norm": 0.45199105003630935, + "learning_rate": 4.8406500856777875e-06, + "loss": 0.5167, + "step": 6198 + }, + { + "epoch": 2.432403062484564, + "grad_norm": 0.4629676193358175, + "learning_rate": 4.840595040890756e-06, + "loss": 0.5048, + "step": 6199 + }, + { + "epoch": 2.4327982217831563, + "grad_norm": 0.4476163591715802, + "learning_rate": 4.840539986911288e-06, + "loss": 0.4954, + "step": 6200 + }, + { + "epoch": 2.4331933810817485, + "grad_norm": 0.44798378662618715, + "learning_rate": 4.8404849237396005e-06, + "loss": 0.4933, + "step": 6201 + }, + { + "epoch": 2.433588540380341, + "grad_norm": 0.4571828487256416, + "learning_rate": 4.840429851375909e-06, + "loss": 0.5018, + "step": 6202 + }, + { + "epoch": 2.433983699678933, + "grad_norm": 0.4487082083135661, + "learning_rate": 4.840374769820432e-06, + "loss": 0.4855, + "step": 6203 + }, + { + "epoch": 2.4343788589775253, + "grad_norm": 0.5361808616989676, + "learning_rate": 4.840319679073382e-06, + "loss": 0.5127, + "step": 6204 + }, + { + "epoch": 2.4347740182761175, + "grad_norm": 0.4556095333028292, + "learning_rate": 4.840264579134978e-06, + "loss": 0.5055, + "step": 6205 + }, + { + "epoch": 2.4351691775747097, + "grad_norm": 0.4582035559570844, + "learning_rate": 4.840209470005436e-06, + "loss": 0.5032, + "step": 6206 + }, + { + "epoch": 2.435564336873302, + "grad_norm": 0.4848275765020931, + "learning_rate": 4.840154351684973e-06, + "loss": 0.515, + "step": 6207 + }, + { + "epoch": 2.4359594961718942, + "grad_norm": 0.46189716770494005, + "learning_rate": 4.840099224173803e-06, + "loss": 0.5158, + "step": 6208 + }, + { + "epoch": 2.4363546554704865, + "grad_norm": 0.4586463067687978, + "learning_rate": 4.840044087472145e-06, + "loss": 0.5169, + "step": 6209 + }, + { + "epoch": 2.4367498147690787, + "grad_norm": 0.45983904272371406, + "learning_rate": 4.839988941580216e-06, + "loss": 0.4927, + "step": 6210 + }, + { + "epoch": 2.437144974067671, + "grad_norm": 0.479677905098288, + "learning_rate": 4.83993378649823e-06, + "loss": 0.5466, + "step": 6211 + }, + { + "epoch": 2.437540133366263, + "grad_norm": 0.4605259736107779, + "learning_rate": 4.839878622226405e-06, + "loss": 0.5026, + "step": 6212 + }, + { + "epoch": 2.4379352926648554, + "grad_norm": 0.45622589883484194, + "learning_rate": 4.839823448764957e-06, + "loss": 0.5026, + "step": 6213 + }, + { + "epoch": 2.4383304519634477, + "grad_norm": 0.4780280716829919, + "learning_rate": 4.839768266114105e-06, + "loss": 0.515, + "step": 6214 + }, + { + "epoch": 2.43872561126204, + "grad_norm": 0.46441875870580496, + "learning_rate": 4.839713074274064e-06, + "loss": 0.508, + "step": 6215 + }, + { + "epoch": 2.439120770560632, + "grad_norm": 0.45047431718755854, + "learning_rate": 4.83965787324505e-06, + "loss": 0.5011, + "step": 6216 + }, + { + "epoch": 2.4395159298592244, + "grad_norm": 0.45440270281711137, + "learning_rate": 4.83960266302728e-06, + "loss": 0.5024, + "step": 6217 + }, + { + "epoch": 2.4399110891578166, + "grad_norm": 0.4638601787236139, + "learning_rate": 4.839547443620972e-06, + "loss": 0.5113, + "step": 6218 + }, + { + "epoch": 2.440306248456409, + "grad_norm": 0.4488674824729464, + "learning_rate": 4.839492215026342e-06, + "loss": 0.4858, + "step": 6219 + }, + { + "epoch": 2.440701407755001, + "grad_norm": 0.4536981413225659, + "learning_rate": 4.839436977243608e-06, + "loss": 0.4959, + "step": 6220 + }, + { + "epoch": 2.4410965670535933, + "grad_norm": 0.4614565162626784, + "learning_rate": 4.839381730272985e-06, + "loss": 0.5012, + "step": 6221 + }, + { + "epoch": 2.4414917263521856, + "grad_norm": 0.48492633372884375, + "learning_rate": 4.839326474114692e-06, + "loss": 0.5146, + "step": 6222 + }, + { + "epoch": 2.441886885650778, + "grad_norm": 0.4704218385973438, + "learning_rate": 4.839271208768945e-06, + "loss": 0.5129, + "step": 6223 + }, + { + "epoch": 2.44228204494937, + "grad_norm": 0.45965041302697135, + "learning_rate": 4.839215934235961e-06, + "loss": 0.5075, + "step": 6224 + }, + { + "epoch": 2.4426772042479623, + "grad_norm": 0.4561316353748442, + "learning_rate": 4.839160650515957e-06, + "loss": 0.4964, + "step": 6225 + }, + { + "epoch": 2.4430723635465545, + "grad_norm": 0.4590933327579659, + "learning_rate": 4.839105357609151e-06, + "loss": 0.5038, + "step": 6226 + }, + { + "epoch": 2.443467522845147, + "grad_norm": 0.47776100605193667, + "learning_rate": 4.839050055515759e-06, + "loss": 0.49, + "step": 6227 + }, + { + "epoch": 2.443862682143739, + "grad_norm": 0.43282187857296134, + "learning_rate": 4.838994744236e-06, + "loss": 0.4863, + "step": 6228 + }, + { + "epoch": 2.4442578414423313, + "grad_norm": 0.45445540357185477, + "learning_rate": 4.838939423770088e-06, + "loss": 0.5021, + "step": 6229 + }, + { + "epoch": 2.4446530007409235, + "grad_norm": 0.4562136501460368, + "learning_rate": 4.838884094118244e-06, + "loss": 0.5307, + "step": 6230 + }, + { + "epoch": 2.4450481600395157, + "grad_norm": 0.5480503669106829, + "learning_rate": 4.8388287552806825e-06, + "loss": 0.5152, + "step": 6231 + }, + { + "epoch": 2.445443319338108, + "grad_norm": 0.4651543841687929, + "learning_rate": 4.838773407257622e-06, + "loss": 0.5007, + "step": 6232 + }, + { + "epoch": 2.4458384786367002, + "grad_norm": 0.4685356376510576, + "learning_rate": 4.8387180500492795e-06, + "loss": 0.5048, + "step": 6233 + }, + { + "epoch": 2.4462336379352925, + "grad_norm": 0.4586342865694105, + "learning_rate": 4.838662683655872e-06, + "loss": 0.5004, + "step": 6234 + }, + { + "epoch": 2.4466287972338847, + "grad_norm": 0.44817924080411603, + "learning_rate": 4.83860730807762e-06, + "loss": 0.5024, + "step": 6235 + }, + { + "epoch": 2.447023956532477, + "grad_norm": 0.45665521330466075, + "learning_rate": 4.838551923314736e-06, + "loss": 0.5106, + "step": 6236 + }, + { + "epoch": 2.447419115831069, + "grad_norm": 0.46636023225981704, + "learning_rate": 4.838496529367441e-06, + "loss": 0.5154, + "step": 6237 + }, + { + "epoch": 2.4478142751296614, + "grad_norm": 0.47231613865652516, + "learning_rate": 4.8384411262359525e-06, + "loss": 0.5131, + "step": 6238 + }, + { + "epoch": 2.4482094344282537, + "grad_norm": 0.4613639527463871, + "learning_rate": 4.838385713920486e-06, + "loss": 0.5084, + "step": 6239 + }, + { + "epoch": 2.448604593726846, + "grad_norm": 0.4604238114892905, + "learning_rate": 4.838330292421262e-06, + "loss": 0.5065, + "step": 6240 + }, + { + "epoch": 2.448999753025438, + "grad_norm": 0.4633869738387279, + "learning_rate": 4.838274861738494e-06, + "loss": 0.5095, + "step": 6241 + }, + { + "epoch": 2.4493949123240304, + "grad_norm": 0.47977255402722213, + "learning_rate": 4.838219421872405e-06, + "loss": 0.5242, + "step": 6242 + }, + { + "epoch": 2.4497900716226226, + "grad_norm": 0.4649910129362038, + "learning_rate": 4.8381639728232075e-06, + "loss": 0.5007, + "step": 6243 + }, + { + "epoch": 2.450185230921215, + "grad_norm": 0.44567230911849703, + "learning_rate": 4.838108514591124e-06, + "loss": 0.4886, + "step": 6244 + }, + { + "epoch": 2.450580390219807, + "grad_norm": 0.4802947747613605, + "learning_rate": 4.838053047176368e-06, + "loss": 0.5182, + "step": 6245 + }, + { + "epoch": 2.4509755495184, + "grad_norm": 0.4529418893441322, + "learning_rate": 4.83799757057916e-06, + "loss": 0.4978, + "step": 6246 + }, + { + "epoch": 2.451370708816992, + "grad_norm": 0.4707140210320241, + "learning_rate": 4.837942084799717e-06, + "loss": 0.5148, + "step": 6247 + }, + { + "epoch": 2.4517658681155843, + "grad_norm": 0.4514623045887543, + "learning_rate": 4.837886589838259e-06, + "loss": 0.5095, + "step": 6248 + }, + { + "epoch": 2.4521610274141765, + "grad_norm": 0.4760633546771482, + "learning_rate": 4.837831085695e-06, + "loss": 0.5098, + "step": 6249 + }, + { + "epoch": 2.4525561867127688, + "grad_norm": 0.4602919310882103, + "learning_rate": 4.8377755723701614e-06, + "loss": 0.5025, + "step": 6250 + }, + { + "epoch": 2.452951346011361, + "grad_norm": 0.5205070905111004, + "learning_rate": 4.837720049863959e-06, + "loss": 0.5558, + "step": 6251 + }, + { + "epoch": 2.4533465053099532, + "grad_norm": 0.44480305897623773, + "learning_rate": 4.837664518176613e-06, + "loss": 0.4903, + "step": 6252 + }, + { + "epoch": 2.4537416646085455, + "grad_norm": 0.45504363171585244, + "learning_rate": 4.837608977308339e-06, + "loss": 0.5183, + "step": 6253 + }, + { + "epoch": 2.4541368239071377, + "grad_norm": 0.44828627366707774, + "learning_rate": 4.837553427259356e-06, + "loss": 0.5075, + "step": 6254 + }, + { + "epoch": 2.45453198320573, + "grad_norm": 0.4429586869046653, + "learning_rate": 4.837497868029884e-06, + "loss": 0.495, + "step": 6255 + }, + { + "epoch": 2.454927142504322, + "grad_norm": 0.4580942265610526, + "learning_rate": 4.837442299620139e-06, + "loss": 0.5026, + "step": 6256 + }, + { + "epoch": 2.4553223018029144, + "grad_norm": 0.46738087782305204, + "learning_rate": 4.83738672203034e-06, + "loss": 0.5316, + "step": 6257 + }, + { + "epoch": 2.4557174611015067, + "grad_norm": 0.47534561406612363, + "learning_rate": 4.837331135260705e-06, + "loss": 0.529, + "step": 6258 + }, + { + "epoch": 2.456112620400099, + "grad_norm": 0.45632596134859843, + "learning_rate": 4.837275539311454e-06, + "loss": 0.5025, + "step": 6259 + }, + { + "epoch": 2.456507779698691, + "grad_norm": 0.43916671865176493, + "learning_rate": 4.837219934182803e-06, + "loss": 0.4894, + "step": 6260 + }, + { + "epoch": 2.4569029389972834, + "grad_norm": 0.4685573051031802, + "learning_rate": 4.837164319874972e-06, + "loss": 0.5134, + "step": 6261 + }, + { + "epoch": 2.4572980982958756, + "grad_norm": 0.45202371145633197, + "learning_rate": 4.8371086963881774e-06, + "loss": 0.4923, + "step": 6262 + }, + { + "epoch": 2.457693257594468, + "grad_norm": 0.46730026620993986, + "learning_rate": 4.83705306372264e-06, + "loss": 0.5016, + "step": 6263 + }, + { + "epoch": 2.45808841689306, + "grad_norm": 0.4626170175822174, + "learning_rate": 4.836997421878577e-06, + "loss": 0.5234, + "step": 6264 + }, + { + "epoch": 2.4584835761916524, + "grad_norm": 0.44361760739699646, + "learning_rate": 4.836941770856207e-06, + "loss": 0.499, + "step": 6265 + }, + { + "epoch": 2.4588787354902446, + "grad_norm": 0.45702240750866957, + "learning_rate": 4.83688611065575e-06, + "loss": 0.5058, + "step": 6266 + }, + { + "epoch": 2.459273894788837, + "grad_norm": 0.47299501346012857, + "learning_rate": 4.836830441277422e-06, + "loss": 0.5066, + "step": 6267 + }, + { + "epoch": 2.459669054087429, + "grad_norm": 0.4727679722857182, + "learning_rate": 4.836774762721443e-06, + "loss": 0.5219, + "step": 6268 + }, + { + "epoch": 2.4600642133860213, + "grad_norm": 0.4561162349721409, + "learning_rate": 4.836719074988033e-06, + "loss": 0.5123, + "step": 6269 + }, + { + "epoch": 2.4604593726846136, + "grad_norm": 0.46185475580783847, + "learning_rate": 4.836663378077408e-06, + "loss": 0.5057, + "step": 6270 + }, + { + "epoch": 2.460854531983206, + "grad_norm": 0.4589633018472339, + "learning_rate": 4.836607671989789e-06, + "loss": 0.491, + "step": 6271 + }, + { + "epoch": 2.461249691281798, + "grad_norm": 0.4674016116064247, + "learning_rate": 4.836551956725394e-06, + "loss": 0.505, + "step": 6272 + }, + { + "epoch": 2.4616448505803903, + "grad_norm": 0.44019704083575656, + "learning_rate": 4.836496232284441e-06, + "loss": 0.4899, + "step": 6273 + }, + { + "epoch": 2.4620400098789825, + "grad_norm": 0.4560950335962957, + "learning_rate": 4.8364404986671495e-06, + "loss": 0.5086, + "step": 6274 + }, + { + "epoch": 2.4624351691775748, + "grad_norm": 0.45987663956371605, + "learning_rate": 4.83638475587374e-06, + "loss": 0.5023, + "step": 6275 + }, + { + "epoch": 2.462830328476167, + "grad_norm": 0.45218362910357657, + "learning_rate": 4.836329003904429e-06, + "loss": 0.5049, + "step": 6276 + }, + { + "epoch": 2.4632254877747592, + "grad_norm": 0.46082337545098706, + "learning_rate": 4.836273242759436e-06, + "loss": 0.4906, + "step": 6277 + }, + { + "epoch": 2.4636206470733515, + "grad_norm": 0.4484085796656676, + "learning_rate": 4.83621747243898e-06, + "loss": 0.5003, + "step": 6278 + }, + { + "epoch": 2.4640158063719437, + "grad_norm": 0.4561063177663292, + "learning_rate": 4.836161692943282e-06, + "loss": 0.4872, + "step": 6279 + }, + { + "epoch": 2.464410965670536, + "grad_norm": 0.4635792252049459, + "learning_rate": 4.836105904272558e-06, + "loss": 0.4891, + "step": 6280 + }, + { + "epoch": 2.464806124969128, + "grad_norm": 0.44874718083152354, + "learning_rate": 4.836050106427029e-06, + "loss": 0.504, + "step": 6281 + }, + { + "epoch": 2.4652012842677204, + "grad_norm": 0.4542288616082571, + "learning_rate": 4.835994299406914e-06, + "loss": 0.5117, + "step": 6282 + }, + { + "epoch": 2.4655964435663127, + "grad_norm": 0.46392560474828354, + "learning_rate": 4.835938483212431e-06, + "loss": 0.5032, + "step": 6283 + }, + { + "epoch": 2.465991602864905, + "grad_norm": 0.461032498595409, + "learning_rate": 4.835882657843801e-06, + "loss": 0.5041, + "step": 6284 + }, + { + "epoch": 2.466386762163497, + "grad_norm": 0.43775177363788437, + "learning_rate": 4.835826823301242e-06, + "loss": 0.5057, + "step": 6285 + }, + { + "epoch": 2.4667819214620894, + "grad_norm": 0.45447705767758345, + "learning_rate": 4.835770979584974e-06, + "loss": 0.5059, + "step": 6286 + }, + { + "epoch": 2.4671770807606817, + "grad_norm": 0.4820043582122839, + "learning_rate": 4.835715126695216e-06, + "loss": 0.5025, + "step": 6287 + }, + { + "epoch": 2.467572240059274, + "grad_norm": 0.45364882204768436, + "learning_rate": 4.835659264632186e-06, + "loss": 0.5121, + "step": 6288 + }, + { + "epoch": 2.467967399357866, + "grad_norm": 0.44392647087381, + "learning_rate": 4.835603393396106e-06, + "loss": 0.4973, + "step": 6289 + }, + { + "epoch": 2.4683625586564584, + "grad_norm": 0.4702335158316592, + "learning_rate": 4.835547512987194e-06, + "loss": 0.5056, + "step": 6290 + }, + { + "epoch": 2.4687577179550506, + "grad_norm": 0.46379114054816933, + "learning_rate": 4.835491623405669e-06, + "loss": 0.5062, + "step": 6291 + }, + { + "epoch": 2.469152877253643, + "grad_norm": 0.46254378434485627, + "learning_rate": 4.835435724651753e-06, + "loss": 0.5034, + "step": 6292 + }, + { + "epoch": 2.469548036552235, + "grad_norm": 0.4435217656925511, + "learning_rate": 4.8353798167256615e-06, + "loss": 0.5178, + "step": 6293 + }, + { + "epoch": 2.4699431958508273, + "grad_norm": 0.44400526861468553, + "learning_rate": 4.835323899627616e-06, + "loss": 0.4828, + "step": 6294 + }, + { + "epoch": 2.4703383551494196, + "grad_norm": 0.4661317214500771, + "learning_rate": 4.835267973357837e-06, + "loss": 0.5096, + "step": 6295 + }, + { + "epoch": 2.470733514448012, + "grad_norm": 0.4599470856465283, + "learning_rate": 4.835212037916545e-06, + "loss": 0.5178, + "step": 6296 + }, + { + "epoch": 2.471128673746604, + "grad_norm": 0.44508708208240416, + "learning_rate": 4.835156093303956e-06, + "loss": 0.5011, + "step": 6297 + }, + { + "epoch": 2.4715238330451963, + "grad_norm": 0.45991594887386883, + "learning_rate": 4.835100139520292e-06, + "loss": 0.5066, + "step": 6298 + }, + { + "epoch": 2.4719189923437885, + "grad_norm": 0.4473801495463778, + "learning_rate": 4.8350441765657736e-06, + "loss": 0.5124, + "step": 6299 + }, + { + "epoch": 2.4723141516423808, + "grad_norm": 0.45643508613471995, + "learning_rate": 4.834988204440619e-06, + "loss": 0.4926, + "step": 6300 + }, + { + "epoch": 2.472709310940973, + "grad_norm": 0.4504940699709523, + "learning_rate": 4.834932223145049e-06, + "loss": 0.5032, + "step": 6301 + }, + { + "epoch": 2.4731044702395653, + "grad_norm": 0.4426038077719436, + "learning_rate": 4.834876232679283e-06, + "loss": 0.5104, + "step": 6302 + }, + { + "epoch": 2.4734996295381575, + "grad_norm": 0.46223284198355835, + "learning_rate": 4.83482023304354e-06, + "loss": 0.5081, + "step": 6303 + }, + { + "epoch": 2.4738947888367497, + "grad_norm": 0.4770885110104904, + "learning_rate": 4.834764224238042e-06, + "loss": 0.5098, + "step": 6304 + }, + { + "epoch": 2.474289948135342, + "grad_norm": 0.4732407782266299, + "learning_rate": 4.834708206263008e-06, + "loss": 0.4808, + "step": 6305 + }, + { + "epoch": 2.474685107433934, + "grad_norm": 0.4613411578962985, + "learning_rate": 4.834652179118657e-06, + "loss": 0.5215, + "step": 6306 + }, + { + "epoch": 2.4750802667325265, + "grad_norm": 0.4626350011702521, + "learning_rate": 4.83459614280521e-06, + "loss": 0.5143, + "step": 6307 + }, + { + "epoch": 2.4754754260311187, + "grad_norm": 0.4492080049375109, + "learning_rate": 4.834540097322888e-06, + "loss": 0.5151, + "step": 6308 + }, + { + "epoch": 2.475870585329711, + "grad_norm": 0.46276098431405666, + "learning_rate": 4.834484042671909e-06, + "loss": 0.5107, + "step": 6309 + }, + { + "epoch": 2.476265744628303, + "grad_norm": 0.4600080634693739, + "learning_rate": 4.834427978852495e-06, + "loss": 0.5183, + "step": 6310 + }, + { + "epoch": 2.4766609039268954, + "grad_norm": 0.4560521377408553, + "learning_rate": 4.834371905864865e-06, + "loss": 0.515, + "step": 6311 + }, + { + "epoch": 2.4770560632254877, + "grad_norm": 0.4680722643265072, + "learning_rate": 4.83431582370924e-06, + "loss": 0.5268, + "step": 6312 + }, + { + "epoch": 2.47745122252408, + "grad_norm": 0.45788403308191006, + "learning_rate": 4.83425973238584e-06, + "loss": 0.5185, + "step": 6313 + }, + { + "epoch": 2.477846381822672, + "grad_norm": 0.4489353107357901, + "learning_rate": 4.834203631894885e-06, + "loss": 0.5245, + "step": 6314 + }, + { + "epoch": 2.4782415411212644, + "grad_norm": 0.4769728044328352, + "learning_rate": 4.834147522236595e-06, + "loss": 0.5034, + "step": 6315 + }, + { + "epoch": 2.4786367004198566, + "grad_norm": 0.4531931819815472, + "learning_rate": 4.8340914034111916e-06, + "loss": 0.4785, + "step": 6316 + }, + { + "epoch": 2.479031859718449, + "grad_norm": 0.436765563760745, + "learning_rate": 4.834035275418895e-06, + "loss": 0.5044, + "step": 6317 + }, + { + "epoch": 2.479427019017041, + "grad_norm": 0.45553402918025715, + "learning_rate": 4.833979138259923e-06, + "loss": 0.5099, + "step": 6318 + }, + { + "epoch": 2.4798221783156333, + "grad_norm": 0.45738260128460106, + "learning_rate": 4.8339229919345e-06, + "loss": 0.5028, + "step": 6319 + }, + { + "epoch": 2.4802173376142256, + "grad_norm": 0.45623700990486077, + "learning_rate": 4.833866836442844e-06, + "loss": 0.5015, + "step": 6320 + }, + { + "epoch": 2.480612496912818, + "grad_norm": 0.44512331672110883, + "learning_rate": 4.833810671785177e-06, + "loss": 0.5099, + "step": 6321 + }, + { + "epoch": 2.48100765621141, + "grad_norm": 0.4552888276049071, + "learning_rate": 4.833754497961719e-06, + "loss": 0.5012, + "step": 6322 + }, + { + "epoch": 2.4814028155100023, + "grad_norm": 0.46342152696551503, + "learning_rate": 4.83369831497269e-06, + "loss": 0.5272, + "step": 6323 + }, + { + "epoch": 2.4817979748085945, + "grad_norm": 0.44267854664405953, + "learning_rate": 4.833642122818311e-06, + "loss": 0.4853, + "step": 6324 + }, + { + "epoch": 2.482193134107187, + "grad_norm": 0.43631582113931716, + "learning_rate": 4.833585921498802e-06, + "loss": 0.4987, + "step": 6325 + }, + { + "epoch": 2.482588293405779, + "grad_norm": 0.44520693389382443, + "learning_rate": 4.8335297110143854e-06, + "loss": 0.5021, + "step": 6326 + }, + { + "epoch": 2.4829834527043713, + "grad_norm": 0.4611485355941225, + "learning_rate": 4.833473491365281e-06, + "loss": 0.5156, + "step": 6327 + }, + { + "epoch": 2.4833786120029635, + "grad_norm": 0.45289421087393966, + "learning_rate": 4.833417262551711e-06, + "loss": 0.5095, + "step": 6328 + }, + { + "epoch": 2.483773771301556, + "grad_norm": 0.4566101862712613, + "learning_rate": 4.833361024573893e-06, + "loss": 0.5237, + "step": 6329 + }, + { + "epoch": 2.4841689306001484, + "grad_norm": 0.4564858320090929, + "learning_rate": 4.833304777432051e-06, + "loss": 0.492, + "step": 6330 + }, + { + "epoch": 2.4845640898987407, + "grad_norm": 0.4555944378026437, + "learning_rate": 4.8332485211264035e-06, + "loss": 0.5221, + "step": 6331 + }, + { + "epoch": 2.484959249197333, + "grad_norm": 0.4640570525721282, + "learning_rate": 4.833192255657173e-06, + "loss": 0.5265, + "step": 6332 + }, + { + "epoch": 2.485354408495925, + "grad_norm": 0.4450100165471149, + "learning_rate": 4.833135981024581e-06, + "loss": 0.4968, + "step": 6333 + }, + { + "epoch": 2.4857495677945174, + "grad_norm": 0.44268155087230027, + "learning_rate": 4.833079697228847e-06, + "loss": 0.4893, + "step": 6334 + }, + { + "epoch": 2.4861447270931096, + "grad_norm": 0.446170036400285, + "learning_rate": 4.833023404270193e-06, + "loss": 0.4945, + "step": 6335 + }, + { + "epoch": 2.486539886391702, + "grad_norm": 0.46363170302080026, + "learning_rate": 4.8329671021488385e-06, + "loss": 0.5106, + "step": 6336 + }, + { + "epoch": 2.486935045690294, + "grad_norm": 0.46125075416833533, + "learning_rate": 4.832910790865007e-06, + "loss": 0.5099, + "step": 6337 + }, + { + "epoch": 2.4873302049888864, + "grad_norm": 0.4666268360674439, + "learning_rate": 4.832854470418918e-06, + "loss": 0.4968, + "step": 6338 + }, + { + "epoch": 2.4877253642874786, + "grad_norm": 0.45259995470398834, + "learning_rate": 4.8327981408107945e-06, + "loss": 0.5231, + "step": 6339 + }, + { + "epoch": 2.488120523586071, + "grad_norm": 0.45257495176751394, + "learning_rate": 4.832741802040856e-06, + "loss": 0.51, + "step": 6340 + }, + { + "epoch": 2.488515682884663, + "grad_norm": 0.4702986428108045, + "learning_rate": 4.8326854541093235e-06, + "loss": 0.511, + "step": 6341 + }, + { + "epoch": 2.4889108421832553, + "grad_norm": 0.45287635582005137, + "learning_rate": 4.832629097016419e-06, + "loss": 0.5194, + "step": 6342 + }, + { + "epoch": 2.4893060014818476, + "grad_norm": 0.44097483735488313, + "learning_rate": 4.832572730762364e-06, + "loss": 0.4886, + "step": 6343 + }, + { + "epoch": 2.48970116078044, + "grad_norm": 0.4963761696743815, + "learning_rate": 4.83251635534738e-06, + "loss": 0.5144, + "step": 6344 + }, + { + "epoch": 2.490096320079032, + "grad_norm": 0.4721389393948357, + "learning_rate": 4.832459970771688e-06, + "loss": 0.5196, + "step": 6345 + }, + { + "epoch": 2.4904914793776243, + "grad_norm": 0.4607493050777102, + "learning_rate": 4.83240357703551e-06, + "loss": 0.5065, + "step": 6346 + }, + { + "epoch": 2.4908866386762165, + "grad_norm": 0.4603650627094426, + "learning_rate": 4.8323471741390656e-06, + "loss": 0.4982, + "step": 6347 + }, + { + "epoch": 2.4912817979748088, + "grad_norm": 0.4620527083371952, + "learning_rate": 4.832290762082579e-06, + "loss": 0.5114, + "step": 6348 + }, + { + "epoch": 2.491676957273401, + "grad_norm": 0.46945952654863243, + "learning_rate": 4.8322343408662705e-06, + "loss": 0.5093, + "step": 6349 + }, + { + "epoch": 2.4920721165719932, + "grad_norm": 0.45597389461472576, + "learning_rate": 4.8321779104903616e-06, + "loss": 0.5103, + "step": 6350 + }, + { + "epoch": 2.4924672758705855, + "grad_norm": 0.4517530087892166, + "learning_rate": 4.832121470955074e-06, + "loss": 0.504, + "step": 6351 + }, + { + "epoch": 2.4928624351691777, + "grad_norm": 0.44430291926385046, + "learning_rate": 4.832065022260629e-06, + "loss": 0.4967, + "step": 6352 + }, + { + "epoch": 2.49325759446777, + "grad_norm": 0.46115093494647114, + "learning_rate": 4.83200856440725e-06, + "loss": 0.5082, + "step": 6353 + }, + { + "epoch": 2.493652753766362, + "grad_norm": 0.45214398265733147, + "learning_rate": 4.831952097395156e-06, + "loss": 0.503, + "step": 6354 + }, + { + "epoch": 2.4940479130649544, + "grad_norm": 0.4607263777785255, + "learning_rate": 4.831895621224571e-06, + "loss": 0.5115, + "step": 6355 + }, + { + "epoch": 2.4944430723635467, + "grad_norm": 0.45430159930327796, + "learning_rate": 4.8318391358957156e-06, + "loss": 0.4955, + "step": 6356 + }, + { + "epoch": 2.494838231662139, + "grad_norm": 0.45283670210193705, + "learning_rate": 4.831782641408812e-06, + "loss": 0.5071, + "step": 6357 + }, + { + "epoch": 2.495233390960731, + "grad_norm": 0.4598695429896044, + "learning_rate": 4.831726137764082e-06, + "loss": 0.518, + "step": 6358 + }, + { + "epoch": 2.4956285502593234, + "grad_norm": 0.4399372220031, + "learning_rate": 4.831669624961748e-06, + "loss": 0.5051, + "step": 6359 + }, + { + "epoch": 2.4960237095579156, + "grad_norm": 0.467578857452426, + "learning_rate": 4.831613103002032e-06, + "loss": 0.4944, + "step": 6360 + }, + { + "epoch": 2.496418868856508, + "grad_norm": 0.46610183550354434, + "learning_rate": 4.831556571885155e-06, + "loss": 0.5046, + "step": 6361 + }, + { + "epoch": 2.4968140281551, + "grad_norm": 0.4589963237813121, + "learning_rate": 4.831500031611339e-06, + "loss": 0.5007, + "step": 6362 + }, + { + "epoch": 2.4972091874536924, + "grad_norm": 0.45341734760729985, + "learning_rate": 4.831443482180808e-06, + "loss": 0.5097, + "step": 6363 + }, + { + "epoch": 2.4976043467522846, + "grad_norm": 0.4737045186230201, + "learning_rate": 4.831386923593781e-06, + "loss": 0.5105, + "step": 6364 + }, + { + "epoch": 2.497999506050877, + "grad_norm": 0.4641942011706654, + "learning_rate": 4.831330355850484e-06, + "loss": 0.5005, + "step": 6365 + }, + { + "epoch": 2.498394665349469, + "grad_norm": 0.4856421198337944, + "learning_rate": 4.831273778951135e-06, + "loss": 0.5245, + "step": 6366 + }, + { + "epoch": 2.4987898246480613, + "grad_norm": 0.44305020823239166, + "learning_rate": 4.831217192895959e-06, + "loss": 0.4882, + "step": 6367 + }, + { + "epoch": 2.4991849839466536, + "grad_norm": 0.4582781517686158, + "learning_rate": 4.831160597685178e-06, + "loss": 0.4903, + "step": 6368 + }, + { + "epoch": 2.499580143245246, + "grad_norm": 0.4525149145967303, + "learning_rate": 4.8311039933190136e-06, + "loss": 0.4834, + "step": 6369 + }, + { + "epoch": 2.499975302543838, + "grad_norm": 0.4575043422893562, + "learning_rate": 4.831047379797687e-06, + "loss": 0.4897, + "step": 6370 + }, + { + "epoch": 2.5003704618424303, + "grad_norm": 0.46250674104664735, + "learning_rate": 4.830990757121424e-06, + "loss": 0.506, + "step": 6371 + }, + { + "epoch": 2.5007656211410225, + "grad_norm": 0.4619201958339098, + "learning_rate": 4.830934125290443e-06, + "loss": 0.4947, + "step": 6372 + }, + { + "epoch": 2.5011607804396148, + "grad_norm": 0.4603940568427686, + "learning_rate": 4.830877484304969e-06, + "loss": 0.5052, + "step": 6373 + }, + { + "epoch": 2.501555939738207, + "grad_norm": 0.4671784400513492, + "learning_rate": 4.830820834165223e-06, + "loss": 0.4942, + "step": 6374 + }, + { + "epoch": 2.5019510990367992, + "grad_norm": 0.46267953291708924, + "learning_rate": 4.830764174871429e-06, + "loss": 0.5049, + "step": 6375 + }, + { + "epoch": 2.5023462583353915, + "grad_norm": 0.4670573153772785, + "learning_rate": 4.830707506423807e-06, + "loss": 0.5071, + "step": 6376 + }, + { + "epoch": 2.5027414176339837, + "grad_norm": 0.448679724311928, + "learning_rate": 4.830650828822583e-06, + "loss": 0.5036, + "step": 6377 + }, + { + "epoch": 2.503136576932576, + "grad_norm": 0.4638582201810059, + "learning_rate": 4.830594142067977e-06, + "loss": 0.4958, + "step": 6378 + }, + { + "epoch": 2.503531736231168, + "grad_norm": 0.46498455456100324, + "learning_rate": 4.8305374461602115e-06, + "loss": 0.4994, + "step": 6379 + }, + { + "epoch": 2.5039268955297604, + "grad_norm": 0.47076449596566666, + "learning_rate": 4.830480741099511e-06, + "loss": 0.5077, + "step": 6380 + }, + { + "epoch": 2.5043220548283527, + "grad_norm": 0.4620871607194632, + "learning_rate": 4.830424026886098e-06, + "loss": 0.5003, + "step": 6381 + }, + { + "epoch": 2.504717214126945, + "grad_norm": 0.46348419234538263, + "learning_rate": 4.8303673035201935e-06, + "loss": 0.5286, + "step": 6382 + }, + { + "epoch": 2.505112373425537, + "grad_norm": 0.45149330232898727, + "learning_rate": 4.830310571002022e-06, + "loss": 0.5099, + "step": 6383 + }, + { + "epoch": 2.5055075327241294, + "grad_norm": 0.4375735232605822, + "learning_rate": 4.830253829331805e-06, + "loss": 0.4983, + "step": 6384 + }, + { + "epoch": 2.5059026920227216, + "grad_norm": 0.4469286754515496, + "learning_rate": 4.830197078509766e-06, + "loss": 0.4992, + "step": 6385 + }, + { + "epoch": 2.506297851321314, + "grad_norm": 0.46197178445812176, + "learning_rate": 4.830140318536128e-06, + "loss": 0.497, + "step": 6386 + }, + { + "epoch": 2.506693010619906, + "grad_norm": 0.4619654942592018, + "learning_rate": 4.830083549411114e-06, + "loss": 0.4983, + "step": 6387 + }, + { + "epoch": 2.5070881699184984, + "grad_norm": 0.4522982728309289, + "learning_rate": 4.830026771134947e-06, + "loss": 0.4994, + "step": 6388 + }, + { + "epoch": 2.5074833292170906, + "grad_norm": 0.4628104450392865, + "learning_rate": 4.82996998370785e-06, + "loss": 0.5056, + "step": 6389 + }, + { + "epoch": 2.507878488515683, + "grad_norm": 0.45715011969492436, + "learning_rate": 4.829913187130044e-06, + "loss": 0.5084, + "step": 6390 + }, + { + "epoch": 2.508273647814275, + "grad_norm": 0.4620237998725422, + "learning_rate": 4.8298563814017555e-06, + "loss": 0.5114, + "step": 6391 + }, + { + "epoch": 2.5086688071128673, + "grad_norm": 0.4632610101741824, + "learning_rate": 4.829799566523205e-06, + "loss": 0.5141, + "step": 6392 + }, + { + "epoch": 2.5090639664114596, + "grad_norm": 0.4641510104801741, + "learning_rate": 4.829742742494616e-06, + "loss": 0.5042, + "step": 6393 + }, + { + "epoch": 2.509459125710052, + "grad_norm": 0.4628296152429653, + "learning_rate": 4.829685909316214e-06, + "loss": 0.4989, + "step": 6394 + }, + { + "epoch": 2.509854285008644, + "grad_norm": 0.47161142330182587, + "learning_rate": 4.829629066988219e-06, + "loss": 0.4889, + "step": 6395 + }, + { + "epoch": 2.5102494443072363, + "grad_norm": 0.4703058506566186, + "learning_rate": 4.829572215510856e-06, + "loss": 0.5248, + "step": 6396 + }, + { + "epoch": 2.5106446036058285, + "grad_norm": 0.4487390067504666, + "learning_rate": 4.829515354884348e-06, + "loss": 0.4989, + "step": 6397 + }, + { + "epoch": 2.5110397629044208, + "grad_norm": 0.46578439069427224, + "learning_rate": 4.829458485108918e-06, + "loss": 0.5157, + "step": 6398 + }, + { + "epoch": 2.511434922203013, + "grad_norm": 0.4589142880562439, + "learning_rate": 4.8294016061847895e-06, + "loss": 0.5113, + "step": 6399 + }, + { + "epoch": 2.5118300815016052, + "grad_norm": 0.43436790417000837, + "learning_rate": 4.829344718112186e-06, + "loss": 0.4928, + "step": 6400 + }, + { + "epoch": 2.5122252408001975, + "grad_norm": 0.454548256257017, + "learning_rate": 4.829287820891332e-06, + "loss": 0.4983, + "step": 6401 + }, + { + "epoch": 2.5126204000987897, + "grad_norm": 0.44998917234532554, + "learning_rate": 4.829230914522449e-06, + "loss": 0.502, + "step": 6402 + }, + { + "epoch": 2.513015559397382, + "grad_norm": 0.4419759844210292, + "learning_rate": 4.82917399900576e-06, + "loss": 0.4961, + "step": 6403 + }, + { + "epoch": 2.513410718695974, + "grad_norm": 0.4424432885292469, + "learning_rate": 4.829117074341492e-06, + "loss": 0.4962, + "step": 6404 + }, + { + "epoch": 2.5138058779945665, + "grad_norm": 0.47103994040807695, + "learning_rate": 4.829060140529866e-06, + "loss": 0.5016, + "step": 6405 + }, + { + "epoch": 2.5142010372931587, + "grad_norm": 0.4583560717436245, + "learning_rate": 4.829003197571106e-06, + "loss": 0.5003, + "step": 6406 + }, + { + "epoch": 2.514596196591751, + "grad_norm": 0.4511981257492769, + "learning_rate": 4.828946245465435e-06, + "loss": 0.5092, + "step": 6407 + }, + { + "epoch": 2.514991355890343, + "grad_norm": 0.44378556134382374, + "learning_rate": 4.828889284213078e-06, + "loss": 0.4868, + "step": 6408 + }, + { + "epoch": 2.5153865151889354, + "grad_norm": 0.4638400372635114, + "learning_rate": 4.828832313814258e-06, + "loss": 0.5157, + "step": 6409 + }, + { + "epoch": 2.5157816744875277, + "grad_norm": 0.4555708689326073, + "learning_rate": 4.828775334269198e-06, + "loss": 0.503, + "step": 6410 + }, + { + "epoch": 2.51617683378612, + "grad_norm": 0.4740137430827515, + "learning_rate": 4.828718345578124e-06, + "loss": 0.5012, + "step": 6411 + }, + { + "epoch": 2.516571993084712, + "grad_norm": 0.440175406189311, + "learning_rate": 4.828661347741258e-06, + "loss": 0.4972, + "step": 6412 + }, + { + "epoch": 2.5169671523833044, + "grad_norm": 0.44634195808516225, + "learning_rate": 4.828604340758824e-06, + "loss": 0.521, + "step": 6413 + }, + { + "epoch": 2.5173623116818966, + "grad_norm": 0.44706452475534614, + "learning_rate": 4.828547324631045e-06, + "loss": 0.5099, + "step": 6414 + }, + { + "epoch": 2.517757470980489, + "grad_norm": 0.45566928817560864, + "learning_rate": 4.828490299358148e-06, + "loss": 0.4969, + "step": 6415 + }, + { + "epoch": 2.518152630279081, + "grad_norm": 0.45962733757335644, + "learning_rate": 4.828433264940354e-06, + "loss": 0.4894, + "step": 6416 + }, + { + "epoch": 2.5185477895776733, + "grad_norm": 0.4699440047924458, + "learning_rate": 4.828376221377889e-06, + "loss": 0.5016, + "step": 6417 + }, + { + "epoch": 2.5189429488762656, + "grad_norm": 0.4398573036728772, + "learning_rate": 4.828319168670974e-06, + "loss": 0.4954, + "step": 6418 + }, + { + "epoch": 2.519338108174858, + "grad_norm": 0.4553277215876072, + "learning_rate": 4.828262106819837e-06, + "loss": 0.5103, + "step": 6419 + }, + { + "epoch": 2.51973326747345, + "grad_norm": 0.455563395903747, + "learning_rate": 4.8282050358247e-06, + "loss": 0.5047, + "step": 6420 + }, + { + "epoch": 2.5201284267720423, + "grad_norm": 0.4614817667592641, + "learning_rate": 4.828147955685787e-06, + "loss": 0.5015, + "step": 6421 + }, + { + "epoch": 2.5205235860706345, + "grad_norm": 0.4540776683434743, + "learning_rate": 4.8280908664033225e-06, + "loss": 0.5004, + "step": 6422 + }, + { + "epoch": 2.5209187453692268, + "grad_norm": 0.4427896768855242, + "learning_rate": 4.828033767977531e-06, + "loss": 0.4898, + "step": 6423 + }, + { + "epoch": 2.521313904667819, + "grad_norm": 0.4546512843332004, + "learning_rate": 4.8279766604086365e-06, + "loss": 0.5219, + "step": 6424 + }, + { + "epoch": 2.5217090639664113, + "grad_norm": 0.46014478411090454, + "learning_rate": 4.827919543696863e-06, + "loss": 0.5003, + "step": 6425 + }, + { + "epoch": 2.5221042232650035, + "grad_norm": 0.5757044021010417, + "learning_rate": 4.827862417842435e-06, + "loss": 0.489, + "step": 6426 + }, + { + "epoch": 2.5224993825635957, + "grad_norm": 0.44379865154981046, + "learning_rate": 4.827805282845577e-06, + "loss": 0.5096, + "step": 6427 + }, + { + "epoch": 2.522894541862188, + "grad_norm": 0.4678140092742878, + "learning_rate": 4.827748138706514e-06, + "loss": 0.5189, + "step": 6428 + }, + { + "epoch": 2.52328970116078, + "grad_norm": 0.45998044186365117, + "learning_rate": 4.827690985425469e-06, + "loss": 0.5091, + "step": 6429 + }, + { + "epoch": 2.5236848604593725, + "grad_norm": 0.49126830773446944, + "learning_rate": 4.827633823002669e-06, + "loss": 0.5154, + "step": 6430 + }, + { + "epoch": 2.5240800197579647, + "grad_norm": 0.5033898050900029, + "learning_rate": 4.827576651438335e-06, + "loss": 0.504, + "step": 6431 + }, + { + "epoch": 2.524475179056557, + "grad_norm": 0.4401770069375346, + "learning_rate": 4.827519470732693e-06, + "loss": 0.5029, + "step": 6432 + }, + { + "epoch": 2.524870338355149, + "grad_norm": 0.45085555469804983, + "learning_rate": 4.82746228088597e-06, + "loss": 0.4777, + "step": 6433 + }, + { + "epoch": 2.5252654976537414, + "grad_norm": 0.4711645476173809, + "learning_rate": 4.827405081898387e-06, + "loss": 0.5107, + "step": 6434 + }, + { + "epoch": 2.5256606569523337, + "grad_norm": 0.46331381932207494, + "learning_rate": 4.82734787377017e-06, + "loss": 0.5287, + "step": 6435 + }, + { + "epoch": 2.526055816250926, + "grad_norm": 0.4595073465716432, + "learning_rate": 4.827290656501544e-06, + "loss": 0.5141, + "step": 6436 + }, + { + "epoch": 2.526450975549518, + "grad_norm": 0.45432945562079763, + "learning_rate": 4.827233430092733e-06, + "loss": 0.4814, + "step": 6437 + }, + { + "epoch": 2.5268461348481104, + "grad_norm": 0.4531070026892637, + "learning_rate": 4.827176194543963e-06, + "loss": 0.5066, + "step": 6438 + }, + { + "epoch": 2.5272412941467026, + "grad_norm": 0.5299603756232713, + "learning_rate": 4.8271189498554575e-06, + "loss": 0.5128, + "step": 6439 + }, + { + "epoch": 2.527636453445295, + "grad_norm": 0.444567842697348, + "learning_rate": 4.827061696027442e-06, + "loss": 0.4827, + "step": 6440 + }, + { + "epoch": 2.528031612743887, + "grad_norm": 0.4590491574983818, + "learning_rate": 4.827004433060142e-06, + "loss": 0.4784, + "step": 6441 + }, + { + "epoch": 2.52842677204248, + "grad_norm": 0.46222176709133134, + "learning_rate": 4.826947160953781e-06, + "loss": 0.5197, + "step": 6442 + }, + { + "epoch": 2.528821931341072, + "grad_norm": 0.4553604192939767, + "learning_rate": 4.826889879708585e-06, + "loss": 0.4899, + "step": 6443 + }, + { + "epoch": 2.5292170906396643, + "grad_norm": 0.4485178175280675, + "learning_rate": 4.826832589324778e-06, + "loss": 0.5062, + "step": 6444 + }, + { + "epoch": 2.5296122499382565, + "grad_norm": 0.45899668741167826, + "learning_rate": 4.8267752898025855e-06, + "loss": 0.5224, + "step": 6445 + }, + { + "epoch": 2.5300074092368487, + "grad_norm": 0.45782021791332567, + "learning_rate": 4.826717981142233e-06, + "loss": 0.5074, + "step": 6446 + }, + { + "epoch": 2.530402568535441, + "grad_norm": 0.471834837281007, + "learning_rate": 4.8266606633439445e-06, + "loss": 0.5152, + "step": 6447 + }, + { + "epoch": 2.5307977278340332, + "grad_norm": 0.48656419425692243, + "learning_rate": 4.826603336407945e-06, + "loss": 0.5209, + "step": 6448 + }, + { + "epoch": 2.5311928871326255, + "grad_norm": 0.45199569225901554, + "learning_rate": 4.826546000334462e-06, + "loss": 0.5035, + "step": 6449 + }, + { + "epoch": 2.5315880464312177, + "grad_norm": 0.4505649136262649, + "learning_rate": 4.826488655123719e-06, + "loss": 0.5034, + "step": 6450 + }, + { + "epoch": 2.53198320572981, + "grad_norm": 0.4750069460993798, + "learning_rate": 4.826431300775941e-06, + "loss": 0.5377, + "step": 6451 + }, + { + "epoch": 2.532378365028402, + "grad_norm": 0.464527033598869, + "learning_rate": 4.826373937291353e-06, + "loss": 0.4931, + "step": 6452 + }, + { + "epoch": 2.5327735243269944, + "grad_norm": 0.4530418607627022, + "learning_rate": 4.826316564670181e-06, + "loss": 0.5072, + "step": 6453 + }, + { + "epoch": 2.5331686836255867, + "grad_norm": 0.45364038939024304, + "learning_rate": 4.82625918291265e-06, + "loss": 0.5041, + "step": 6454 + }, + { + "epoch": 2.533563842924179, + "grad_norm": 0.47698190204477, + "learning_rate": 4.8262017920189864e-06, + "loss": 0.5116, + "step": 6455 + }, + { + "epoch": 2.533959002222771, + "grad_norm": 0.4731322955615775, + "learning_rate": 4.826144391989414e-06, + "loss": 0.5182, + "step": 6456 + }, + { + "epoch": 2.5343541615213634, + "grad_norm": 0.46876448274990934, + "learning_rate": 4.8260869828241595e-06, + "loss": 0.5053, + "step": 6457 + }, + { + "epoch": 2.5347493208199556, + "grad_norm": 0.4565534077873607, + "learning_rate": 4.826029564523447e-06, + "loss": 0.4996, + "step": 6458 + }, + { + "epoch": 2.535144480118548, + "grad_norm": 0.47123001937860665, + "learning_rate": 4.825972137087504e-06, + "loss": 0.5209, + "step": 6459 + }, + { + "epoch": 2.53553963941714, + "grad_norm": 0.4620746913890331, + "learning_rate": 4.825914700516553e-06, + "loss": 0.5306, + "step": 6460 + }, + { + "epoch": 2.5359347987157324, + "grad_norm": 0.4431055424606069, + "learning_rate": 4.825857254810823e-06, + "loss": 0.4777, + "step": 6461 + }, + { + "epoch": 2.5363299580143246, + "grad_norm": 0.4670518314206856, + "learning_rate": 4.8257997999705365e-06, + "loss": 0.4896, + "step": 6462 + }, + { + "epoch": 2.536725117312917, + "grad_norm": 0.46517370738592695, + "learning_rate": 4.825742335995922e-06, + "loss": 0.514, + "step": 6463 + }, + { + "epoch": 2.537120276611509, + "grad_norm": 0.46818923159639003, + "learning_rate": 4.825684862887204e-06, + "loss": 0.5092, + "step": 6464 + }, + { + "epoch": 2.5375154359101013, + "grad_norm": 0.47402007401065194, + "learning_rate": 4.825627380644607e-06, + "loss": 0.4979, + "step": 6465 + }, + { + "epoch": 2.5379105952086936, + "grad_norm": 0.45423949379520695, + "learning_rate": 4.825569889268359e-06, + "loss": 0.4905, + "step": 6466 + }, + { + "epoch": 2.538305754507286, + "grad_norm": 0.44881199031238245, + "learning_rate": 4.825512388758684e-06, + "loss": 0.4949, + "step": 6467 + }, + { + "epoch": 2.538700913805878, + "grad_norm": 0.43610597989809763, + "learning_rate": 4.825454879115808e-06, + "loss": 0.4913, + "step": 6468 + }, + { + "epoch": 2.5390960731044703, + "grad_norm": 0.46713538360387696, + "learning_rate": 4.8253973603399585e-06, + "loss": 0.5091, + "step": 6469 + }, + { + "epoch": 2.5394912324030625, + "grad_norm": 0.457557105090376, + "learning_rate": 4.825339832431359e-06, + "loss": 0.5067, + "step": 6470 + }, + { + "epoch": 2.5398863917016548, + "grad_norm": 0.44744590808245827, + "learning_rate": 4.8252822953902374e-06, + "loss": 0.5041, + "step": 6471 + }, + { + "epoch": 2.540281551000247, + "grad_norm": 0.44535213213717323, + "learning_rate": 4.825224749216819e-06, + "loss": 0.5114, + "step": 6472 + }, + { + "epoch": 2.5406767102988392, + "grad_norm": 0.4457813945339316, + "learning_rate": 4.825167193911329e-06, + "loss": 0.5075, + "step": 6473 + }, + { + "epoch": 2.5410718695974315, + "grad_norm": 0.45529691510876213, + "learning_rate": 4.825109629473995e-06, + "loss": 0.491, + "step": 6474 + }, + { + "epoch": 2.5414670288960237, + "grad_norm": 0.4716080108006373, + "learning_rate": 4.825052055905043e-06, + "loss": 0.5155, + "step": 6475 + }, + { + "epoch": 2.541862188194616, + "grad_norm": 0.4566158583555428, + "learning_rate": 4.8249944732046975e-06, + "loss": 0.522, + "step": 6476 + }, + { + "epoch": 2.542257347493208, + "grad_norm": 0.46881103956534304, + "learning_rate": 4.8249368813731845e-06, + "loss": 0.5274, + "step": 6477 + }, + { + "epoch": 2.5426525067918004, + "grad_norm": 0.45250162234599905, + "learning_rate": 4.824879280410733e-06, + "loss": 0.5005, + "step": 6478 + }, + { + "epoch": 2.5430476660903927, + "grad_norm": 0.47056029175688424, + "learning_rate": 4.824821670317566e-06, + "loss": 0.5215, + "step": 6479 + }, + { + "epoch": 2.543442825388985, + "grad_norm": 0.4597434281516728, + "learning_rate": 4.824764051093912e-06, + "loss": 0.5006, + "step": 6480 + }, + { + "epoch": 2.543837984687577, + "grad_norm": 0.45131921755337623, + "learning_rate": 4.824706422739996e-06, + "loss": 0.5014, + "step": 6481 + }, + { + "epoch": 2.5442331439861694, + "grad_norm": 0.4435352090089486, + "learning_rate": 4.824648785256045e-06, + "loss": 0.4683, + "step": 6482 + }, + { + "epoch": 2.5446283032847616, + "grad_norm": 0.4361623565598776, + "learning_rate": 4.824591138642285e-06, + "loss": 0.4739, + "step": 6483 + }, + { + "epoch": 2.545023462583354, + "grad_norm": 0.4528329358816713, + "learning_rate": 4.824533482898943e-06, + "loss": 0.4936, + "step": 6484 + }, + { + "epoch": 2.545418621881946, + "grad_norm": 0.457804851022941, + "learning_rate": 4.824475818026244e-06, + "loss": 0.5038, + "step": 6485 + }, + { + "epoch": 2.5458137811805384, + "grad_norm": 0.47354522749284367, + "learning_rate": 4.824418144024416e-06, + "loss": 0.5182, + "step": 6486 + }, + { + "epoch": 2.5462089404791306, + "grad_norm": 0.4695229394887001, + "learning_rate": 4.824360460893686e-06, + "loss": 0.4991, + "step": 6487 + }, + { + "epoch": 2.546604099777723, + "grad_norm": 0.46327892852606095, + "learning_rate": 4.824302768634279e-06, + "loss": 0.502, + "step": 6488 + }, + { + "epoch": 2.546999259076315, + "grad_norm": 0.45297480248431404, + "learning_rate": 4.824245067246422e-06, + "loss": 0.5066, + "step": 6489 + }, + { + "epoch": 2.5473944183749073, + "grad_norm": 0.46517129424956516, + "learning_rate": 4.824187356730341e-06, + "loss": 0.4846, + "step": 6490 + }, + { + "epoch": 2.5477895776734996, + "grad_norm": 0.4668569089467578, + "learning_rate": 4.824129637086264e-06, + "loss": 0.5238, + "step": 6491 + }, + { + "epoch": 2.548184736972092, + "grad_norm": 0.47720546968786415, + "learning_rate": 4.824071908314417e-06, + "loss": 0.5293, + "step": 6492 + }, + { + "epoch": 2.548579896270684, + "grad_norm": 0.4729585091074834, + "learning_rate": 4.824014170415027e-06, + "loss": 0.5163, + "step": 6493 + }, + { + "epoch": 2.5489750555692763, + "grad_norm": 0.4559767498901322, + "learning_rate": 4.8239564233883205e-06, + "loss": 0.5028, + "step": 6494 + }, + { + "epoch": 2.5493702148678685, + "grad_norm": 0.46193288182539766, + "learning_rate": 4.823898667234525e-06, + "loss": 0.5036, + "step": 6495 + }, + { + "epoch": 2.5497653741664608, + "grad_norm": 0.50276777112018, + "learning_rate": 4.823840901953865e-06, + "loss": 0.5199, + "step": 6496 + }, + { + "epoch": 2.550160533465053, + "grad_norm": 0.44571450965752996, + "learning_rate": 4.823783127546571e-06, + "loss": 0.4981, + "step": 6497 + }, + { + "epoch": 2.5505556927636452, + "grad_norm": 0.4545230887683023, + "learning_rate": 4.823725344012866e-06, + "loss": 0.5014, + "step": 6498 + }, + { + "epoch": 2.5509508520622375, + "grad_norm": 0.45011277157561497, + "learning_rate": 4.8236675513529804e-06, + "loss": 0.5016, + "step": 6499 + }, + { + "epoch": 2.5513460113608297, + "grad_norm": 0.45347326578518127, + "learning_rate": 4.823609749567139e-06, + "loss": 0.486, + "step": 6500 + }, + { + "epoch": 2.551741170659422, + "grad_norm": 0.49671184627419557, + "learning_rate": 4.823551938655569e-06, + "loss": 0.5229, + "step": 6501 + }, + { + "epoch": 2.552136329958014, + "grad_norm": 0.4630157588304572, + "learning_rate": 4.823494118618499e-06, + "loss": 0.4994, + "step": 6502 + }, + { + "epoch": 2.5525314892566064, + "grad_norm": 0.44595435111081594, + "learning_rate": 4.8234362894561544e-06, + "loss": 0.4913, + "step": 6503 + }, + { + "epoch": 2.5529266485551987, + "grad_norm": 0.46456032038638434, + "learning_rate": 4.823378451168763e-06, + "loss": 0.4989, + "step": 6504 + }, + { + "epoch": 2.553321807853791, + "grad_norm": 0.4741181020350032, + "learning_rate": 4.8233206037565515e-06, + "loss": 0.5169, + "step": 6505 + }, + { + "epoch": 2.553716967152383, + "grad_norm": 0.4586086884230933, + "learning_rate": 4.823262747219749e-06, + "loss": 0.5044, + "step": 6506 + }, + { + "epoch": 2.5541121264509754, + "grad_norm": 0.4594391518699296, + "learning_rate": 4.823204881558579e-06, + "loss": 0.5095, + "step": 6507 + }, + { + "epoch": 2.5545072857495676, + "grad_norm": 0.4520677455107984, + "learning_rate": 4.8231470067732726e-06, + "loss": 0.5007, + "step": 6508 + }, + { + "epoch": 2.55490244504816, + "grad_norm": 0.4685641116507666, + "learning_rate": 4.823089122864055e-06, + "loss": 0.513, + "step": 6509 + }, + { + "epoch": 2.555297604346752, + "grad_norm": 0.45202109312605754, + "learning_rate": 4.8230312298311535e-06, + "loss": 0.5163, + "step": 6510 + }, + { + "epoch": 2.5556927636453444, + "grad_norm": 0.4521725537088804, + "learning_rate": 4.822973327674796e-06, + "loss": 0.4907, + "step": 6511 + }, + { + "epoch": 2.556087922943937, + "grad_norm": 0.44282430032215886, + "learning_rate": 4.82291541639521e-06, + "loss": 0.5044, + "step": 6512 + }, + { + "epoch": 2.5564830822425293, + "grad_norm": 0.4656119245187877, + "learning_rate": 4.822857495992623e-06, + "loss": 0.5138, + "step": 6513 + }, + { + "epoch": 2.5568782415411215, + "grad_norm": 0.46796730981011786, + "learning_rate": 4.8227995664672625e-06, + "loss": 0.5162, + "step": 6514 + }, + { + "epoch": 2.5572734008397138, + "grad_norm": 0.4491042247333808, + "learning_rate": 4.822741627819355e-06, + "loss": 0.5012, + "step": 6515 + }, + { + "epoch": 2.557668560138306, + "grad_norm": 0.46834449695954017, + "learning_rate": 4.82268368004913e-06, + "loss": 0.5263, + "step": 6516 + }, + { + "epoch": 2.5580637194368983, + "grad_norm": 0.44103675945686854, + "learning_rate": 4.822625723156813e-06, + "loss": 0.4862, + "step": 6517 + }, + { + "epoch": 2.5584588787354905, + "grad_norm": 0.45017086475460627, + "learning_rate": 4.822567757142634e-06, + "loss": 0.4989, + "step": 6518 + }, + { + "epoch": 2.5588540380340827, + "grad_norm": 0.44261210213602237, + "learning_rate": 4.822509782006817e-06, + "loss": 0.5209, + "step": 6519 + }, + { + "epoch": 2.559249197332675, + "grad_norm": 0.45761518270513285, + "learning_rate": 4.822451797749592e-06, + "loss": 0.5203, + "step": 6520 + }, + { + "epoch": 2.559644356631267, + "grad_norm": 0.46985808636516363, + "learning_rate": 4.822393804371188e-06, + "loss": 0.5149, + "step": 6521 + }, + { + "epoch": 2.5600395159298595, + "grad_norm": 0.4526976621493061, + "learning_rate": 4.822335801871832e-06, + "loss": 0.5091, + "step": 6522 + }, + { + "epoch": 2.5604346752284517, + "grad_norm": 0.46492146610005, + "learning_rate": 4.822277790251749e-06, + "loss": 0.5276, + "step": 6523 + }, + { + "epoch": 2.560829834527044, + "grad_norm": 0.46271285637553683, + "learning_rate": 4.82221976951117e-06, + "loss": 0.4965, + "step": 6524 + }, + { + "epoch": 2.561224993825636, + "grad_norm": 0.4503037126759286, + "learning_rate": 4.822161739650322e-06, + "loss": 0.516, + "step": 6525 + }, + { + "epoch": 2.5616201531242284, + "grad_norm": 0.443470486159481, + "learning_rate": 4.822103700669432e-06, + "loss": 0.4974, + "step": 6526 + }, + { + "epoch": 2.5620153124228207, + "grad_norm": 0.43703146172923873, + "learning_rate": 4.82204565256873e-06, + "loss": 0.4952, + "step": 6527 + }, + { + "epoch": 2.562410471721413, + "grad_norm": 0.45292345235815396, + "learning_rate": 4.821987595348442e-06, + "loss": 0.5271, + "step": 6528 + }, + { + "epoch": 2.562805631020005, + "grad_norm": 0.45759067526669045, + "learning_rate": 4.821929529008797e-06, + "loss": 0.5063, + "step": 6529 + }, + { + "epoch": 2.5632007903185974, + "grad_norm": 0.47064593711048897, + "learning_rate": 4.821871453550023e-06, + "loss": 0.5057, + "step": 6530 + }, + { + "epoch": 2.5635959496171896, + "grad_norm": 0.47813721885154226, + "learning_rate": 4.821813368972347e-06, + "loss": 0.539, + "step": 6531 + }, + { + "epoch": 2.563991108915782, + "grad_norm": 0.4573359139734747, + "learning_rate": 4.821755275275998e-06, + "loss": 0.5119, + "step": 6532 + }, + { + "epoch": 2.564386268214374, + "grad_norm": 0.5039910077391357, + "learning_rate": 4.821697172461205e-06, + "loss": 0.5096, + "step": 6533 + }, + { + "epoch": 2.5647814275129663, + "grad_norm": 0.45480012799103875, + "learning_rate": 4.821639060528194e-06, + "loss": 0.5073, + "step": 6534 + }, + { + "epoch": 2.5651765868115586, + "grad_norm": 0.43914393122344525, + "learning_rate": 4.821580939477195e-06, + "loss": 0.5173, + "step": 6535 + }, + { + "epoch": 2.565571746110151, + "grad_norm": 0.4512267600492565, + "learning_rate": 4.821522809308436e-06, + "loss": 0.5046, + "step": 6536 + }, + { + "epoch": 2.565966905408743, + "grad_norm": 0.45888241462106466, + "learning_rate": 4.821464670022146e-06, + "loss": 0.5159, + "step": 6537 + }, + { + "epoch": 2.5663620647073353, + "grad_norm": 0.4441592838891458, + "learning_rate": 4.821406521618551e-06, + "loss": 0.5057, + "step": 6538 + }, + { + "epoch": 2.5667572240059275, + "grad_norm": 0.45376803364604695, + "learning_rate": 4.821348364097882e-06, + "loss": 0.4884, + "step": 6539 + }, + { + "epoch": 2.56715238330452, + "grad_norm": 0.46633304742246534, + "learning_rate": 4.821290197460366e-06, + "loss": 0.5307, + "step": 6540 + }, + { + "epoch": 2.567547542603112, + "grad_norm": 0.43666480002999797, + "learning_rate": 4.821232021706231e-06, + "loss": 0.4926, + "step": 6541 + }, + { + "epoch": 2.5679427019017043, + "grad_norm": 0.4605237411052962, + "learning_rate": 4.8211738368357065e-06, + "loss": 0.5049, + "step": 6542 + }, + { + "epoch": 2.5683378612002965, + "grad_norm": 0.462050066279481, + "learning_rate": 4.821115642849021e-06, + "loss": 0.5165, + "step": 6543 + }, + { + "epoch": 2.5687330204988887, + "grad_norm": 0.45750052243887834, + "learning_rate": 4.821057439746402e-06, + "loss": 0.4976, + "step": 6544 + }, + { + "epoch": 2.569128179797481, + "grad_norm": 0.44555516105490556, + "learning_rate": 4.820999227528079e-06, + "loss": 0.5186, + "step": 6545 + }, + { + "epoch": 2.5695233390960732, + "grad_norm": 0.46285962401306396, + "learning_rate": 4.820941006194281e-06, + "loss": 0.4978, + "step": 6546 + }, + { + "epoch": 2.5699184983946655, + "grad_norm": 0.4469392519255687, + "learning_rate": 4.820882775745236e-06, + "loss": 0.4863, + "step": 6547 + }, + { + "epoch": 2.5703136576932577, + "grad_norm": 0.4519758312975245, + "learning_rate": 4.8208245361811724e-06, + "loss": 0.5094, + "step": 6548 + }, + { + "epoch": 2.57070881699185, + "grad_norm": 0.46503941371816515, + "learning_rate": 4.820766287502319e-06, + "loss": 0.5133, + "step": 6549 + }, + { + "epoch": 2.571103976290442, + "grad_norm": 0.4600897389443086, + "learning_rate": 4.820708029708905e-06, + "loss": 0.5137, + "step": 6550 + }, + { + "epoch": 2.5714991355890344, + "grad_norm": 0.4588463556332273, + "learning_rate": 4.820649762801159e-06, + "loss": 0.5092, + "step": 6551 + }, + { + "epoch": 2.5718942948876267, + "grad_norm": 0.46703432271255124, + "learning_rate": 4.820591486779312e-06, + "loss": 0.4949, + "step": 6552 + }, + { + "epoch": 2.572289454186219, + "grad_norm": 0.4489108181770717, + "learning_rate": 4.820533201643588e-06, + "loss": 0.4944, + "step": 6553 + }, + { + "epoch": 2.572684613484811, + "grad_norm": 0.4608905598839847, + "learning_rate": 4.82047490739422e-06, + "loss": 0.5049, + "step": 6554 + }, + { + "epoch": 2.5730797727834034, + "grad_norm": 0.46983121539687306, + "learning_rate": 4.820416604031435e-06, + "loss": 0.4909, + "step": 6555 + }, + { + "epoch": 2.5734749320819956, + "grad_norm": 0.4533141543179746, + "learning_rate": 4.820358291555462e-06, + "loss": 0.525, + "step": 6556 + }, + { + "epoch": 2.573870091380588, + "grad_norm": 0.4619074947389451, + "learning_rate": 4.820299969966532e-06, + "loss": 0.5169, + "step": 6557 + }, + { + "epoch": 2.57426525067918, + "grad_norm": 0.44500184074671084, + "learning_rate": 4.820241639264872e-06, + "loss": 0.5106, + "step": 6558 + }, + { + "epoch": 2.5746604099777723, + "grad_norm": 0.45059185521325373, + "learning_rate": 4.820183299450713e-06, + "loss": 0.5029, + "step": 6559 + }, + { + "epoch": 2.5750555692763646, + "grad_norm": 0.45616010644680055, + "learning_rate": 4.820124950524282e-06, + "loss": 0.5018, + "step": 6560 + }, + { + "epoch": 2.575450728574957, + "grad_norm": 0.4572348903384075, + "learning_rate": 4.820066592485809e-06, + "loss": 0.5139, + "step": 6561 + }, + { + "epoch": 2.575845887873549, + "grad_norm": 0.4421219919687656, + "learning_rate": 4.8200082253355226e-06, + "loss": 0.4971, + "step": 6562 + }, + { + "epoch": 2.5762410471721413, + "grad_norm": 0.4666909383851814, + "learning_rate": 4.819949849073654e-06, + "loss": 0.5117, + "step": 6563 + }, + { + "epoch": 2.5766362064707335, + "grad_norm": 0.46477953473232203, + "learning_rate": 4.8198914637004305e-06, + "loss": 0.4949, + "step": 6564 + }, + { + "epoch": 2.577031365769326, + "grad_norm": 0.46799345127111774, + "learning_rate": 4.819833069216081e-06, + "loss": 0.4962, + "step": 6565 + }, + { + "epoch": 2.577426525067918, + "grad_norm": 0.4431351713952274, + "learning_rate": 4.819774665620837e-06, + "loss": 0.4857, + "step": 6566 + }, + { + "epoch": 2.5778216843665103, + "grad_norm": 0.45369599827793194, + "learning_rate": 4.819716252914927e-06, + "loss": 0.5163, + "step": 6567 + }, + { + "epoch": 2.5782168436651025, + "grad_norm": 0.4736993413815267, + "learning_rate": 4.81965783109858e-06, + "loss": 0.5077, + "step": 6568 + }, + { + "epoch": 2.5786120029636947, + "grad_norm": 0.4767138840762411, + "learning_rate": 4.819599400172025e-06, + "loss": 0.5151, + "step": 6569 + }, + { + "epoch": 2.579007162262287, + "grad_norm": 0.46197056590790053, + "learning_rate": 4.819540960135493e-06, + "loss": 0.494, + "step": 6570 + }, + { + "epoch": 2.5794023215608792, + "grad_norm": 0.4719226833563452, + "learning_rate": 4.819482510989211e-06, + "loss": 0.5235, + "step": 6571 + }, + { + "epoch": 2.5797974808594715, + "grad_norm": 0.47446676249677355, + "learning_rate": 4.8194240527334115e-06, + "loss": 0.5077, + "step": 6572 + }, + { + "epoch": 2.5801926401580637, + "grad_norm": 0.4660645331065908, + "learning_rate": 4.819365585368322e-06, + "loss": 0.5013, + "step": 6573 + }, + { + "epoch": 2.580587799456656, + "grad_norm": 0.45366183792375775, + "learning_rate": 4.819307108894173e-06, + "loss": 0.4923, + "step": 6574 + }, + { + "epoch": 2.580982958755248, + "grad_norm": 0.43736598970253193, + "learning_rate": 4.819248623311195e-06, + "loss": 0.5013, + "step": 6575 + }, + { + "epoch": 2.5813781180538404, + "grad_norm": 0.46508435134068205, + "learning_rate": 4.819190128619617e-06, + "loss": 0.4959, + "step": 6576 + }, + { + "epoch": 2.5817732773524327, + "grad_norm": 0.46567765929034605, + "learning_rate": 4.819131624819667e-06, + "loss": 0.5076, + "step": 6577 + }, + { + "epoch": 2.582168436651025, + "grad_norm": 0.4707030207964489, + "learning_rate": 4.8190731119115766e-06, + "loss": 0.5174, + "step": 6578 + }, + { + "epoch": 2.582563595949617, + "grad_norm": 0.46834324977420716, + "learning_rate": 4.819014589895575e-06, + "loss": 0.5154, + "step": 6579 + }, + { + "epoch": 2.5829587552482094, + "grad_norm": 0.4620901719740855, + "learning_rate": 4.818956058771893e-06, + "loss": 0.4916, + "step": 6580 + }, + { + "epoch": 2.5833539145468016, + "grad_norm": 0.4619346062310383, + "learning_rate": 4.81889751854076e-06, + "loss": 0.5024, + "step": 6581 + }, + { + "epoch": 2.583749073845394, + "grad_norm": 0.4704870295417848, + "learning_rate": 4.818838969202405e-06, + "loss": 0.507, + "step": 6582 + }, + { + "epoch": 2.584144233143986, + "grad_norm": 0.48508779424763204, + "learning_rate": 4.818780410757059e-06, + "loss": 0.512, + "step": 6583 + }, + { + "epoch": 2.5845393924425784, + "grad_norm": 0.44361406120966196, + "learning_rate": 4.818721843204951e-06, + "loss": 0.4971, + "step": 6584 + }, + { + "epoch": 2.5849345517411706, + "grad_norm": 0.46045806937302997, + "learning_rate": 4.818663266546312e-06, + "loss": 0.5146, + "step": 6585 + }, + { + "epoch": 2.585329711039763, + "grad_norm": 0.503532295954558, + "learning_rate": 4.818604680781372e-06, + "loss": 0.5202, + "step": 6586 + }, + { + "epoch": 2.585724870338355, + "grad_norm": 0.4809956728838628, + "learning_rate": 4.8185460859103596e-06, + "loss": 0.5341, + "step": 6587 + }, + { + "epoch": 2.5861200296369473, + "grad_norm": 0.4452613791874474, + "learning_rate": 4.818487481933507e-06, + "loss": 0.4868, + "step": 6588 + }, + { + "epoch": 2.5865151889355396, + "grad_norm": 0.4645895638161033, + "learning_rate": 4.818428868851042e-06, + "loss": 0.5162, + "step": 6589 + }, + { + "epoch": 2.586910348234132, + "grad_norm": 0.45690373781668653, + "learning_rate": 4.818370246663199e-06, + "loss": 0.5186, + "step": 6590 + }, + { + "epoch": 2.587305507532724, + "grad_norm": 0.4624829183292078, + "learning_rate": 4.818311615370204e-06, + "loss": 0.5169, + "step": 6591 + }, + { + "epoch": 2.5877006668313163, + "grad_norm": 0.48491493887224846, + "learning_rate": 4.818252974972288e-06, + "loss": 0.5092, + "step": 6592 + }, + { + "epoch": 2.5880958261299085, + "grad_norm": 0.4642333592429095, + "learning_rate": 4.818194325469683e-06, + "loss": 0.532, + "step": 6593 + }, + { + "epoch": 2.5884909854285008, + "grad_norm": 0.48132123097086577, + "learning_rate": 4.818135666862618e-06, + "loss": 0.5152, + "step": 6594 + }, + { + "epoch": 2.588886144727093, + "grad_norm": 0.4736558582523291, + "learning_rate": 4.818076999151323e-06, + "loss": 0.5074, + "step": 6595 + }, + { + "epoch": 2.5892813040256852, + "grad_norm": 0.45982193687887485, + "learning_rate": 4.81801832233603e-06, + "loss": 0.5136, + "step": 6596 + }, + { + "epoch": 2.5896764633242775, + "grad_norm": 0.459117193820494, + "learning_rate": 4.817959636416969e-06, + "loss": 0.5178, + "step": 6597 + }, + { + "epoch": 2.5900716226228697, + "grad_norm": 0.45795808917106084, + "learning_rate": 4.817900941394369e-06, + "loss": 0.497, + "step": 6598 + }, + { + "epoch": 2.590466781921462, + "grad_norm": 0.4961771265409982, + "learning_rate": 4.817842237268463e-06, + "loss": 0.5025, + "step": 6599 + }, + { + "epoch": 2.590861941220054, + "grad_norm": 0.4517244851535084, + "learning_rate": 4.817783524039479e-06, + "loss": 0.496, + "step": 6600 + }, + { + "epoch": 2.5912571005186464, + "grad_norm": 0.4606749996141201, + "learning_rate": 4.8177248017076496e-06, + "loss": 0.4885, + "step": 6601 + }, + { + "epoch": 2.5916522598172387, + "grad_norm": 0.44595584652993003, + "learning_rate": 4.817666070273203e-06, + "loss": 0.5178, + "step": 6602 + }, + { + "epoch": 2.592047419115831, + "grad_norm": 0.4361569136817302, + "learning_rate": 4.817607329736373e-06, + "loss": 0.4927, + "step": 6603 + }, + { + "epoch": 2.592442578414423, + "grad_norm": 0.48331893774269574, + "learning_rate": 4.817548580097389e-06, + "loss": 0.4918, + "step": 6604 + }, + { + "epoch": 2.5928377377130154, + "grad_norm": 0.4480574187648233, + "learning_rate": 4.81748982135648e-06, + "loss": 0.4898, + "step": 6605 + }, + { + "epoch": 2.5932328970116076, + "grad_norm": 0.4574403926622687, + "learning_rate": 4.817431053513879e-06, + "loss": 0.5153, + "step": 6606 + }, + { + "epoch": 2.5936280563102, + "grad_norm": 0.4464682971660518, + "learning_rate": 4.8173722765698165e-06, + "loss": 0.4879, + "step": 6607 + }, + { + "epoch": 2.594023215608792, + "grad_norm": 0.4433326822459801, + "learning_rate": 4.817313490524523e-06, + "loss": 0.4725, + "step": 6608 + }, + { + "epoch": 2.5944183749073844, + "grad_norm": 0.4707423582400784, + "learning_rate": 4.817254695378228e-06, + "loss": 0.5221, + "step": 6609 + }, + { + "epoch": 2.5948135342059766, + "grad_norm": 0.47295746581167725, + "learning_rate": 4.8171958911311646e-06, + "loss": 0.5108, + "step": 6610 + }, + { + "epoch": 2.595208693504569, + "grad_norm": 0.45263375041631376, + "learning_rate": 4.817137077783563e-06, + "loss": 0.492, + "step": 6611 + }, + { + "epoch": 2.595603852803161, + "grad_norm": 0.4552715976251914, + "learning_rate": 4.817078255335653e-06, + "loss": 0.496, + "step": 6612 + }, + { + "epoch": 2.5959990121017533, + "grad_norm": 0.4595060840449498, + "learning_rate": 4.817019423787667e-06, + "loss": 0.5155, + "step": 6613 + }, + { + "epoch": 2.5963941714003456, + "grad_norm": 0.45945144727738035, + "learning_rate": 4.8169605831398355e-06, + "loss": 0.5292, + "step": 6614 + }, + { + "epoch": 2.596789330698938, + "grad_norm": 0.45474069846107534, + "learning_rate": 4.81690173339239e-06, + "loss": 0.5158, + "step": 6615 + }, + { + "epoch": 2.59718448999753, + "grad_norm": 0.44549691265764996, + "learning_rate": 4.816842874545562e-06, + "loss": 0.4949, + "step": 6616 + }, + { + "epoch": 2.5975796492961223, + "grad_norm": 0.4465165484665156, + "learning_rate": 4.816784006599582e-06, + "loss": 0.5135, + "step": 6617 + }, + { + "epoch": 2.5979748085947145, + "grad_norm": 0.46690949256258174, + "learning_rate": 4.81672512955468e-06, + "loss": 0.4823, + "step": 6618 + }, + { + "epoch": 2.5983699678933068, + "grad_norm": 0.44898509566584294, + "learning_rate": 4.81666624341109e-06, + "loss": 0.5126, + "step": 6619 + }, + { + "epoch": 2.598765127191899, + "grad_norm": 0.4486657739928971, + "learning_rate": 4.816607348169041e-06, + "loss": 0.4901, + "step": 6620 + }, + { + "epoch": 2.5991602864904912, + "grad_norm": 0.4527214681354412, + "learning_rate": 4.816548443828765e-06, + "loss": 0.5123, + "step": 6621 + }, + { + "epoch": 2.5995554457890835, + "grad_norm": 0.4529843101656628, + "learning_rate": 4.8164895303904935e-06, + "loss": 0.4951, + "step": 6622 + }, + { + "epoch": 2.5999506050876757, + "grad_norm": 0.4564936706989616, + "learning_rate": 4.816430607854458e-06, + "loss": 0.5113, + "step": 6623 + }, + { + "epoch": 2.600345764386268, + "grad_norm": 0.4940966671369367, + "learning_rate": 4.816371676220889e-06, + "loss": 0.4969, + "step": 6624 + }, + { + "epoch": 2.60074092368486, + "grad_norm": 0.45722310993861126, + "learning_rate": 4.81631273549002e-06, + "loss": 0.5132, + "step": 6625 + }, + { + "epoch": 2.6011360829834524, + "grad_norm": 0.45286575656466427, + "learning_rate": 4.816253785662079e-06, + "loss": 0.5084, + "step": 6626 + }, + { + "epoch": 2.6015312422820447, + "grad_norm": 0.445135014900183, + "learning_rate": 4.816194826737302e-06, + "loss": 0.5027, + "step": 6627 + }, + { + "epoch": 2.601926401580637, + "grad_norm": 0.46012984610648944, + "learning_rate": 4.816135858715917e-06, + "loss": 0.4891, + "step": 6628 + }, + { + "epoch": 2.602321560879229, + "grad_norm": 0.46640326101065027, + "learning_rate": 4.816076881598156e-06, + "loss": 0.5077, + "step": 6629 + }, + { + "epoch": 2.6027167201778214, + "grad_norm": 0.4395627749768447, + "learning_rate": 4.816017895384253e-06, + "loss": 0.5029, + "step": 6630 + }, + { + "epoch": 2.603111879476414, + "grad_norm": 0.4471917295160825, + "learning_rate": 4.815958900074437e-06, + "loss": 0.4987, + "step": 6631 + }, + { + "epoch": 2.6035070387750063, + "grad_norm": 0.4577848836839979, + "learning_rate": 4.815899895668941e-06, + "loss": 0.5088, + "step": 6632 + }, + { + "epoch": 2.6039021980735986, + "grad_norm": 0.44634805154755014, + "learning_rate": 4.815840882167997e-06, + "loss": 0.5121, + "step": 6633 + }, + { + "epoch": 2.604297357372191, + "grad_norm": 0.4506256280957546, + "learning_rate": 4.815781859571835e-06, + "loss": 0.5081, + "step": 6634 + }, + { + "epoch": 2.604692516670783, + "grad_norm": 0.44851796039012143, + "learning_rate": 4.815722827880689e-06, + "loss": 0.5061, + "step": 6635 + }, + { + "epoch": 2.6050876759693753, + "grad_norm": 0.4632284380664699, + "learning_rate": 4.81566378709479e-06, + "loss": 0.5179, + "step": 6636 + }, + { + "epoch": 2.6054828352679675, + "grad_norm": 0.4495346025856113, + "learning_rate": 4.8156047372143695e-06, + "loss": 0.5185, + "step": 6637 + }, + { + "epoch": 2.6058779945665598, + "grad_norm": 0.45146252001014836, + "learning_rate": 4.815545678239659e-06, + "loss": 0.4782, + "step": 6638 + }, + { + "epoch": 2.606273153865152, + "grad_norm": 0.473770212276825, + "learning_rate": 4.8154866101708925e-06, + "loss": 0.5126, + "step": 6639 + }, + { + "epoch": 2.6066683131637443, + "grad_norm": 0.4485774694573647, + "learning_rate": 4.8154275330083e-06, + "loss": 0.5119, + "step": 6640 + }, + { + "epoch": 2.6070634724623365, + "grad_norm": 0.49678656209215, + "learning_rate": 4.815368446752114e-06, + "loss": 0.5094, + "step": 6641 + }, + { + "epoch": 2.6074586317609287, + "grad_norm": 0.470453453281169, + "learning_rate": 4.815309351402568e-06, + "loss": 0.5265, + "step": 6642 + }, + { + "epoch": 2.607853791059521, + "grad_norm": 0.46330047850218925, + "learning_rate": 4.815250246959891e-06, + "loss": 0.5093, + "step": 6643 + }, + { + "epoch": 2.608248950358113, + "grad_norm": 0.4646962109211404, + "learning_rate": 4.815191133424318e-06, + "loss": 0.5302, + "step": 6644 + }, + { + "epoch": 2.6086441096567055, + "grad_norm": 0.45067265265161177, + "learning_rate": 4.815132010796079e-06, + "loss": 0.5108, + "step": 6645 + }, + { + "epoch": 2.6090392689552977, + "grad_norm": 0.4679320453519832, + "learning_rate": 4.815072879075409e-06, + "loss": 0.5069, + "step": 6646 + }, + { + "epoch": 2.60943442825389, + "grad_norm": 0.4633004240529549, + "learning_rate": 4.815013738262537e-06, + "loss": 0.5084, + "step": 6647 + }, + { + "epoch": 2.609829587552482, + "grad_norm": 0.4579293641594263, + "learning_rate": 4.8149545883576974e-06, + "loss": 0.508, + "step": 6648 + }, + { + "epoch": 2.6102247468510744, + "grad_norm": 0.44095006698161243, + "learning_rate": 4.814895429361122e-06, + "loss": 0.5141, + "step": 6649 + }, + { + "epoch": 2.6106199061496667, + "grad_norm": 0.45014307724452574, + "learning_rate": 4.814836261273043e-06, + "loss": 0.5071, + "step": 6650 + }, + { + "epoch": 2.611015065448259, + "grad_norm": 0.5653488468963971, + "learning_rate": 4.814777084093692e-06, + "loss": 0.4967, + "step": 6651 + }, + { + "epoch": 2.611410224746851, + "grad_norm": 0.4715720317621717, + "learning_rate": 4.814717897823303e-06, + "loss": 0.5069, + "step": 6652 + }, + { + "epoch": 2.6118053840454434, + "grad_norm": 0.4575070392319098, + "learning_rate": 4.8146587024621075e-06, + "loss": 0.5, + "step": 6653 + }, + { + "epoch": 2.6122005433440356, + "grad_norm": 0.449501214753915, + "learning_rate": 4.814599498010338e-06, + "loss": 0.5035, + "step": 6654 + }, + { + "epoch": 2.612595702642628, + "grad_norm": 0.4541220531434449, + "learning_rate": 4.814540284468227e-06, + "loss": 0.4795, + "step": 6655 + }, + { + "epoch": 2.61299086194122, + "grad_norm": 0.4527150728896562, + "learning_rate": 4.814481061836008e-06, + "loss": 0.4958, + "step": 6656 + }, + { + "epoch": 2.6133860212398123, + "grad_norm": 0.44537076654968044, + "learning_rate": 4.814421830113913e-06, + "loss": 0.494, + "step": 6657 + }, + { + "epoch": 2.6137811805384046, + "grad_norm": 0.4538622914542414, + "learning_rate": 4.814362589302174e-06, + "loss": 0.5144, + "step": 6658 + }, + { + "epoch": 2.614176339836997, + "grad_norm": 0.4585896393617423, + "learning_rate": 4.8143033394010245e-06, + "loss": 0.5091, + "step": 6659 + }, + { + "epoch": 2.614571499135589, + "grad_norm": 0.47488969909117135, + "learning_rate": 4.814244080410695e-06, + "loss": 0.5268, + "step": 6660 + }, + { + "epoch": 2.6149666584341813, + "grad_norm": 0.44323004486266726, + "learning_rate": 4.814184812331422e-06, + "loss": 0.4918, + "step": 6661 + }, + { + "epoch": 2.6153618177327735, + "grad_norm": 0.4707926165710535, + "learning_rate": 4.814125535163435e-06, + "loss": 0.5096, + "step": 6662 + }, + { + "epoch": 2.615756977031366, + "grad_norm": 0.446777983459746, + "learning_rate": 4.814066248906969e-06, + "loss": 0.5048, + "step": 6663 + }, + { + "epoch": 2.616152136329958, + "grad_norm": 0.4423589321854824, + "learning_rate": 4.8140069535622555e-06, + "loss": 0.4932, + "step": 6664 + }, + { + "epoch": 2.6165472956285503, + "grad_norm": 0.4578414653275977, + "learning_rate": 4.813947649129528e-06, + "loss": 0.5018, + "step": 6665 + }, + { + "epoch": 2.6169424549271425, + "grad_norm": 0.45309280010222674, + "learning_rate": 4.8138883356090196e-06, + "loss": 0.4921, + "step": 6666 + }, + { + "epoch": 2.6173376142257347, + "grad_norm": 0.45045781550198477, + "learning_rate": 4.813829013000963e-06, + "loss": 0.4947, + "step": 6667 + }, + { + "epoch": 2.617732773524327, + "grad_norm": 0.4581432466232565, + "learning_rate": 4.81376968130559e-06, + "loss": 0.5086, + "step": 6668 + }, + { + "epoch": 2.6181279328229192, + "grad_norm": 0.4746838428529649, + "learning_rate": 4.813710340523135e-06, + "loss": 0.5008, + "step": 6669 + }, + { + "epoch": 2.6185230921215115, + "grad_norm": 0.44025887937767083, + "learning_rate": 4.813650990653831e-06, + "loss": 0.4871, + "step": 6670 + }, + { + "epoch": 2.6189182514201037, + "grad_norm": 0.467331378827747, + "learning_rate": 4.813591631697912e-06, + "loss": 0.5059, + "step": 6671 + }, + { + "epoch": 2.619313410718696, + "grad_norm": 0.44926649544211894, + "learning_rate": 4.813532263655608e-06, + "loss": 0.5107, + "step": 6672 + }, + { + "epoch": 2.619708570017288, + "grad_norm": 0.4601513297437074, + "learning_rate": 4.813472886527155e-06, + "loss": 0.5235, + "step": 6673 + }, + { + "epoch": 2.6201037293158804, + "grad_norm": 0.5023320218764915, + "learning_rate": 4.813413500312785e-06, + "loss": 0.5387, + "step": 6674 + }, + { + "epoch": 2.6204988886144727, + "grad_norm": 0.45407121529937644, + "learning_rate": 4.813354105012732e-06, + "loss": 0.4986, + "step": 6675 + }, + { + "epoch": 2.620894047913065, + "grad_norm": 0.4758694797793042, + "learning_rate": 4.813294700627229e-06, + "loss": 0.5081, + "step": 6676 + }, + { + "epoch": 2.621289207211657, + "grad_norm": 0.477442584592846, + "learning_rate": 4.8132352871565085e-06, + "loss": 0.5098, + "step": 6677 + }, + { + "epoch": 2.6216843665102494, + "grad_norm": 0.4635000681832021, + "learning_rate": 4.813175864600805e-06, + "loss": 0.4921, + "step": 6678 + }, + { + "epoch": 2.6220795258088416, + "grad_norm": 0.5493110100084927, + "learning_rate": 4.813116432960351e-06, + "loss": 0.5086, + "step": 6679 + }, + { + "epoch": 2.622474685107434, + "grad_norm": 0.4593734891855389, + "learning_rate": 4.813056992235381e-06, + "loss": 0.5297, + "step": 6680 + }, + { + "epoch": 2.622869844406026, + "grad_norm": 0.4578122007316964, + "learning_rate": 4.812997542426126e-06, + "loss": 0.4811, + "step": 6681 + }, + { + "epoch": 2.6232650037046183, + "grad_norm": 0.46842927501510234, + "learning_rate": 4.812938083532822e-06, + "loss": 0.5119, + "step": 6682 + }, + { + "epoch": 2.6236601630032106, + "grad_norm": 0.46888027270874133, + "learning_rate": 4.812878615555702e-06, + "loss": 0.4904, + "step": 6683 + }, + { + "epoch": 2.624055322301803, + "grad_norm": 0.45800572704355974, + "learning_rate": 4.812819138495e-06, + "loss": 0.4936, + "step": 6684 + }, + { + "epoch": 2.624450481600395, + "grad_norm": 0.4643329241994898, + "learning_rate": 4.812759652350947e-06, + "loss": 0.5101, + "step": 6685 + }, + { + "epoch": 2.6248456408989873, + "grad_norm": 0.47372550653573653, + "learning_rate": 4.81270015712378e-06, + "loss": 0.5129, + "step": 6686 + }, + { + "epoch": 2.6252408001975795, + "grad_norm": 0.4539448907813019, + "learning_rate": 4.81264065281373e-06, + "loss": 0.5008, + "step": 6687 + }, + { + "epoch": 2.625635959496172, + "grad_norm": 0.4678129441000318, + "learning_rate": 4.812581139421033e-06, + "loss": 0.5341, + "step": 6688 + }, + { + "epoch": 2.626031118794764, + "grad_norm": 0.4425885751871649, + "learning_rate": 4.812521616945921e-06, + "loss": 0.5244, + "step": 6689 + }, + { + "epoch": 2.6264262780933563, + "grad_norm": 0.4674942573227224, + "learning_rate": 4.8124620853886285e-06, + "loss": 0.5266, + "step": 6690 + }, + { + "epoch": 2.6268214373919485, + "grad_norm": 0.4509317914278103, + "learning_rate": 4.8124025447493885e-06, + "loss": 0.5086, + "step": 6691 + }, + { + "epoch": 2.6272165966905408, + "grad_norm": 0.465509813792353, + "learning_rate": 4.8123429950284365e-06, + "loss": 0.522, + "step": 6692 + }, + { + "epoch": 2.627611755989133, + "grad_norm": 0.46175300779476025, + "learning_rate": 4.812283436226004e-06, + "loss": 0.506, + "step": 6693 + }, + { + "epoch": 2.6280069152877252, + "grad_norm": 0.455507151711426, + "learning_rate": 4.8122238683423276e-06, + "loss": 0.5059, + "step": 6694 + }, + { + "epoch": 2.6284020745863175, + "grad_norm": 0.45167680557954504, + "learning_rate": 4.812164291377639e-06, + "loss": 0.4937, + "step": 6695 + }, + { + "epoch": 2.6287972338849097, + "grad_norm": 0.462114757199228, + "learning_rate": 4.812104705332174e-06, + "loss": 0.5205, + "step": 6696 + }, + { + "epoch": 2.629192393183502, + "grad_norm": 0.4513431799853371, + "learning_rate": 4.812045110206165e-06, + "loss": 0.492, + "step": 6697 + }, + { + "epoch": 2.629587552482094, + "grad_norm": 0.44284132274625015, + "learning_rate": 4.811985505999846e-06, + "loss": 0.4751, + "step": 6698 + }, + { + "epoch": 2.6299827117806864, + "grad_norm": 0.4750386854078683, + "learning_rate": 4.811925892713452e-06, + "loss": 0.5302, + "step": 6699 + }, + { + "epoch": 2.6303778710792787, + "grad_norm": 0.4651936596747168, + "learning_rate": 4.811866270347219e-06, + "loss": 0.5187, + "step": 6700 + }, + { + "epoch": 2.630773030377871, + "grad_norm": 0.45466631519979056, + "learning_rate": 4.811806638901378e-06, + "loss": 0.4861, + "step": 6701 + }, + { + "epoch": 2.6311681896764636, + "grad_norm": 0.4480365156981654, + "learning_rate": 4.8117469983761636e-06, + "loss": 0.4928, + "step": 6702 + }, + { + "epoch": 2.631563348975056, + "grad_norm": 0.45596295905822454, + "learning_rate": 4.811687348771811e-06, + "loss": 0.5143, + "step": 6703 + }, + { + "epoch": 2.631958508273648, + "grad_norm": 0.4612335510523021, + "learning_rate": 4.811627690088555e-06, + "loss": 0.5039, + "step": 6704 + }, + { + "epoch": 2.6323536675722403, + "grad_norm": 0.4495637737856094, + "learning_rate": 4.811568022326628e-06, + "loss": 0.5014, + "step": 6705 + }, + { + "epoch": 2.6327488268708326, + "grad_norm": 0.4571383147738367, + "learning_rate": 4.811508345486267e-06, + "loss": 0.4987, + "step": 6706 + }, + { + "epoch": 2.633143986169425, + "grad_norm": 0.46620616655944114, + "learning_rate": 4.811448659567703e-06, + "loss": 0.5157, + "step": 6707 + }, + { + "epoch": 2.633539145468017, + "grad_norm": 0.692201266331602, + "learning_rate": 4.811388964571173e-06, + "loss": 0.502, + "step": 6708 + }, + { + "epoch": 2.6339343047666093, + "grad_norm": 0.452425821090267, + "learning_rate": 4.811329260496911e-06, + "loss": 0.5082, + "step": 6709 + }, + { + "epoch": 2.6343294640652015, + "grad_norm": 0.4484831548580827, + "learning_rate": 4.811269547345151e-06, + "loss": 0.4953, + "step": 6710 + }, + { + "epoch": 2.6347246233637938, + "grad_norm": 0.5286805004399712, + "learning_rate": 4.8112098251161275e-06, + "loss": 0.5059, + "step": 6711 + }, + { + "epoch": 2.635119782662386, + "grad_norm": 0.47242345842337413, + "learning_rate": 4.811150093810076e-06, + "loss": 0.515, + "step": 6712 + }, + { + "epoch": 2.6355149419609782, + "grad_norm": 0.45817282138512155, + "learning_rate": 4.81109035342723e-06, + "loss": 0.4959, + "step": 6713 + }, + { + "epoch": 2.6359101012595705, + "grad_norm": 0.47033111569825325, + "learning_rate": 4.811030603967824e-06, + "loss": 0.5047, + "step": 6714 + }, + { + "epoch": 2.6363052605581627, + "grad_norm": 0.4574013018432309, + "learning_rate": 4.810970845432094e-06, + "loss": 0.5121, + "step": 6715 + }, + { + "epoch": 2.636700419856755, + "grad_norm": 0.47733427395045364, + "learning_rate": 4.810911077820273e-06, + "loss": 0.4993, + "step": 6716 + }, + { + "epoch": 2.637095579155347, + "grad_norm": 0.45438462788409517, + "learning_rate": 4.8108513011325965e-06, + "loss": 0.496, + "step": 6717 + }, + { + "epoch": 2.6374907384539394, + "grad_norm": 0.46176365204286124, + "learning_rate": 4.8107915153693e-06, + "loss": 0.5186, + "step": 6718 + }, + { + "epoch": 2.6378858977525317, + "grad_norm": 0.46309358892460495, + "learning_rate": 4.810731720530617e-06, + "loss": 0.5083, + "step": 6719 + }, + { + "epoch": 2.638281057051124, + "grad_norm": 0.4712899021409001, + "learning_rate": 4.810671916616783e-06, + "loss": 0.5113, + "step": 6720 + }, + { + "epoch": 2.638676216349716, + "grad_norm": 0.4621511341873441, + "learning_rate": 4.810612103628033e-06, + "loss": 0.505, + "step": 6721 + }, + { + "epoch": 2.6390713756483084, + "grad_norm": 0.4484479850838346, + "learning_rate": 4.810552281564602e-06, + "loss": 0.498, + "step": 6722 + }, + { + "epoch": 2.6394665349469006, + "grad_norm": 0.4447686728319573, + "learning_rate": 4.8104924504267245e-06, + "loss": 0.4786, + "step": 6723 + }, + { + "epoch": 2.639861694245493, + "grad_norm": 0.4587588719398148, + "learning_rate": 4.810432610214636e-06, + "loss": 0.5008, + "step": 6724 + }, + { + "epoch": 2.640256853544085, + "grad_norm": 0.46171851458075386, + "learning_rate": 4.81037276092857e-06, + "loss": 0.5104, + "step": 6725 + }, + { + "epoch": 2.6406520128426774, + "grad_norm": 0.47362051128187516, + "learning_rate": 4.810312902568763e-06, + "loss": 0.5179, + "step": 6726 + }, + { + "epoch": 2.6410471721412696, + "grad_norm": 0.4707818961914056, + "learning_rate": 4.81025303513545e-06, + "loss": 0.5152, + "step": 6727 + }, + { + "epoch": 2.641442331439862, + "grad_norm": 0.4487464299804814, + "learning_rate": 4.810193158628867e-06, + "loss": 0.4998, + "step": 6728 + }, + { + "epoch": 2.641837490738454, + "grad_norm": 0.4550736327280535, + "learning_rate": 4.810133273049247e-06, + "loss": 0.5179, + "step": 6729 + }, + { + "epoch": 2.6422326500370463, + "grad_norm": 0.47050227006026263, + "learning_rate": 4.810073378396827e-06, + "loss": 0.5214, + "step": 6730 + }, + { + "epoch": 2.6426278093356386, + "grad_norm": 0.4519033384468275, + "learning_rate": 4.81001347467184e-06, + "loss": 0.4998, + "step": 6731 + }, + { + "epoch": 2.643022968634231, + "grad_norm": 0.451105123921464, + "learning_rate": 4.809953561874525e-06, + "loss": 0.5057, + "step": 6732 + }, + { + "epoch": 2.643418127932823, + "grad_norm": 0.4556875869733165, + "learning_rate": 4.8098936400051145e-06, + "loss": 0.5156, + "step": 6733 + }, + { + "epoch": 2.6438132872314153, + "grad_norm": 0.45194215128861687, + "learning_rate": 4.809833709063844e-06, + "loss": 0.5221, + "step": 6734 + }, + { + "epoch": 2.6442084465300075, + "grad_norm": 0.4695069511637479, + "learning_rate": 4.809773769050948e-06, + "loss": 0.5203, + "step": 6735 + }, + { + "epoch": 2.6446036058285998, + "grad_norm": 0.4792987985028994, + "learning_rate": 4.809713819966665e-06, + "loss": 0.5284, + "step": 6736 + }, + { + "epoch": 2.644998765127192, + "grad_norm": 0.45769602212488414, + "learning_rate": 4.809653861811228e-06, + "loss": 0.4851, + "step": 6737 + }, + { + "epoch": 2.6453939244257842, + "grad_norm": 0.4424740957506516, + "learning_rate": 4.809593894584873e-06, + "loss": 0.4982, + "step": 6738 + }, + { + "epoch": 2.6457890837243765, + "grad_norm": 0.4696493235047474, + "learning_rate": 4.809533918287836e-06, + "loss": 0.4989, + "step": 6739 + }, + { + "epoch": 2.6461842430229687, + "grad_norm": 0.46456830773516294, + "learning_rate": 4.809473932920352e-06, + "loss": 0.4941, + "step": 6740 + }, + { + "epoch": 2.646579402321561, + "grad_norm": 0.47014248694056754, + "learning_rate": 4.809413938482657e-06, + "loss": 0.5156, + "step": 6741 + }, + { + "epoch": 2.646974561620153, + "grad_norm": 0.4356406681706022, + "learning_rate": 4.809353934974987e-06, + "loss": 0.4908, + "step": 6742 + }, + { + "epoch": 2.6473697209187455, + "grad_norm": 0.4862338209729092, + "learning_rate": 4.809293922397576e-06, + "loss": 0.5048, + "step": 6743 + }, + { + "epoch": 2.6477648802173377, + "grad_norm": 0.4575448187275317, + "learning_rate": 4.80923390075066e-06, + "loss": 0.5134, + "step": 6744 + }, + { + "epoch": 2.64816003951593, + "grad_norm": 0.4564366850228013, + "learning_rate": 4.809173870034477e-06, + "loss": 0.5085, + "step": 6745 + }, + { + "epoch": 2.648555198814522, + "grad_norm": 0.48219599498932664, + "learning_rate": 4.809113830249261e-06, + "loss": 0.5221, + "step": 6746 + }, + { + "epoch": 2.6489503581131144, + "grad_norm": 0.4453567920915501, + "learning_rate": 4.809053781395248e-06, + "loss": 0.509, + "step": 6747 + }, + { + "epoch": 2.6493455174117067, + "grad_norm": 0.4724611482066963, + "learning_rate": 4.8089937234726734e-06, + "loss": 0.4851, + "step": 6748 + }, + { + "epoch": 2.649740676710299, + "grad_norm": 0.4444318838338744, + "learning_rate": 4.808933656481774e-06, + "loss": 0.5097, + "step": 6749 + }, + { + "epoch": 2.650135836008891, + "grad_norm": 0.46372690064711497, + "learning_rate": 4.808873580422785e-06, + "loss": 0.5011, + "step": 6750 + }, + { + "epoch": 2.6505309953074834, + "grad_norm": 0.45321412299435093, + "learning_rate": 4.808813495295942e-06, + "loss": 0.4916, + "step": 6751 + }, + { + "epoch": 2.6509261546060756, + "grad_norm": 0.47068167177315234, + "learning_rate": 4.808753401101483e-06, + "loss": 0.5067, + "step": 6752 + }, + { + "epoch": 2.651321313904668, + "grad_norm": 0.4635431012232405, + "learning_rate": 4.808693297839642e-06, + "loss": 0.5182, + "step": 6753 + }, + { + "epoch": 2.65171647320326, + "grad_norm": 0.4412239947020369, + "learning_rate": 4.8086331855106546e-06, + "loss": 0.5035, + "step": 6754 + }, + { + "epoch": 2.6521116325018523, + "grad_norm": 0.43995031164220066, + "learning_rate": 4.80857306411476e-06, + "loss": 0.5028, + "step": 6755 + }, + { + "epoch": 2.6525067918004446, + "grad_norm": 0.4591057467664625, + "learning_rate": 4.808512933652191e-06, + "loss": 0.5049, + "step": 6756 + }, + { + "epoch": 2.652901951099037, + "grad_norm": 0.4571034390087959, + "learning_rate": 4.808452794123184e-06, + "loss": 0.5272, + "step": 6757 + }, + { + "epoch": 2.653297110397629, + "grad_norm": 0.4518130469202257, + "learning_rate": 4.8083926455279775e-06, + "loss": 0.5165, + "step": 6758 + }, + { + "epoch": 2.6536922696962213, + "grad_norm": 0.439662265546044, + "learning_rate": 4.808332487866806e-06, + "loss": 0.4949, + "step": 6759 + }, + { + "epoch": 2.6540874289948135, + "grad_norm": 0.46867256715880273, + "learning_rate": 4.808272321139907e-06, + "loss": 0.5098, + "step": 6760 + }, + { + "epoch": 2.6544825882934058, + "grad_norm": 0.4407389861497981, + "learning_rate": 4.808212145347515e-06, + "loss": 0.4803, + "step": 6761 + }, + { + "epoch": 2.654877747591998, + "grad_norm": 0.45779304183120295, + "learning_rate": 4.808151960489867e-06, + "loss": 0.4899, + "step": 6762 + }, + { + "epoch": 2.6552729068905903, + "grad_norm": 0.4565973133474136, + "learning_rate": 4.808091766567201e-06, + "loss": 0.5085, + "step": 6763 + }, + { + "epoch": 2.6556680661891825, + "grad_norm": 0.468953636638347, + "learning_rate": 4.8080315635797515e-06, + "loss": 0.5232, + "step": 6764 + }, + { + "epoch": 2.6560632254877747, + "grad_norm": 0.44984729188558875, + "learning_rate": 4.807971351527755e-06, + "loss": 0.52, + "step": 6765 + }, + { + "epoch": 2.656458384786367, + "grad_norm": 0.46345262945947907, + "learning_rate": 4.807911130411449e-06, + "loss": 0.4932, + "step": 6766 + }, + { + "epoch": 2.656853544084959, + "grad_norm": 0.46323751843274685, + "learning_rate": 4.80785090023107e-06, + "loss": 0.5168, + "step": 6767 + }, + { + "epoch": 2.6572487033835515, + "grad_norm": 0.4626841241252618, + "learning_rate": 4.807790660986854e-06, + "loss": 0.5224, + "step": 6768 + }, + { + "epoch": 2.6576438626821437, + "grad_norm": 0.4413542814112758, + "learning_rate": 4.807730412679037e-06, + "loss": 0.5105, + "step": 6769 + }, + { + "epoch": 2.658039021980736, + "grad_norm": 0.45515913790444384, + "learning_rate": 4.807670155307857e-06, + "loss": 0.4853, + "step": 6770 + }, + { + "epoch": 2.658434181279328, + "grad_norm": 0.45080574051033717, + "learning_rate": 4.807609888873548e-06, + "loss": 0.5232, + "step": 6771 + }, + { + "epoch": 2.6588293405779204, + "grad_norm": 0.45056542118018017, + "learning_rate": 4.807549613376351e-06, + "loss": 0.503, + "step": 6772 + }, + { + "epoch": 2.6592244998765127, + "grad_norm": 0.45084980258696417, + "learning_rate": 4.8074893288164995e-06, + "loss": 0.5082, + "step": 6773 + }, + { + "epoch": 2.659619659175105, + "grad_norm": 0.46000748014035947, + "learning_rate": 4.80742903519423e-06, + "loss": 0.5126, + "step": 6774 + }, + { + "epoch": 2.660014818473697, + "grad_norm": 0.6125057638224274, + "learning_rate": 4.807368732509782e-06, + "loss": 0.516, + "step": 6775 + }, + { + "epoch": 2.6604099777722894, + "grad_norm": 0.4465782067051833, + "learning_rate": 4.8073084207633895e-06, + "loss": 0.5017, + "step": 6776 + }, + { + "epoch": 2.6608051370708816, + "grad_norm": 0.44598562336646147, + "learning_rate": 4.807248099955291e-06, + "loss": 0.4966, + "step": 6777 + }, + { + "epoch": 2.661200296369474, + "grad_norm": 0.4617546372416006, + "learning_rate": 4.807187770085724e-06, + "loss": 0.5003, + "step": 6778 + }, + { + "epoch": 2.661595455668066, + "grad_norm": 0.4676309244036775, + "learning_rate": 4.807127431154923e-06, + "loss": 0.5039, + "step": 6779 + }, + { + "epoch": 2.6619906149666583, + "grad_norm": 0.5277411293225999, + "learning_rate": 4.807067083163127e-06, + "loss": 0.518, + "step": 6780 + }, + { + "epoch": 2.6623857742652506, + "grad_norm": 0.45429895891466116, + "learning_rate": 4.8070067261105725e-06, + "loss": 0.4982, + "step": 6781 + }, + { + "epoch": 2.662780933563843, + "grad_norm": 0.4464728729832292, + "learning_rate": 4.806946359997496e-06, + "loss": 0.5189, + "step": 6782 + }, + { + "epoch": 2.663176092862435, + "grad_norm": 0.4715484427485262, + "learning_rate": 4.806885984824136e-06, + "loss": 0.5117, + "step": 6783 + }, + { + "epoch": 2.6635712521610273, + "grad_norm": 0.45900610189295865, + "learning_rate": 4.8068256005907275e-06, + "loss": 0.5023, + "step": 6784 + }, + { + "epoch": 2.6639664114596195, + "grad_norm": 0.45019303096695007, + "learning_rate": 4.80676520729751e-06, + "loss": 0.5113, + "step": 6785 + }, + { + "epoch": 2.664361570758212, + "grad_norm": 0.46748176211191267, + "learning_rate": 4.806704804944719e-06, + "loss": 0.5164, + "step": 6786 + }, + { + "epoch": 2.664756730056804, + "grad_norm": 0.46855906191928937, + "learning_rate": 4.8066443935325926e-06, + "loss": 0.5002, + "step": 6787 + }, + { + "epoch": 2.6651518893553963, + "grad_norm": 0.452910775505456, + "learning_rate": 4.806583973061367e-06, + "loss": 0.5156, + "step": 6788 + }, + { + "epoch": 2.6655470486539885, + "grad_norm": 0.47440038216416675, + "learning_rate": 4.80652354353128e-06, + "loss": 0.5303, + "step": 6789 + }, + { + "epoch": 2.6659422079525807, + "grad_norm": 0.5924097415433517, + "learning_rate": 4.806463104942569e-06, + "loss": 0.5167, + "step": 6790 + }, + { + "epoch": 2.666337367251173, + "grad_norm": 0.438886847398818, + "learning_rate": 4.806402657295472e-06, + "loss": 0.5003, + "step": 6791 + }, + { + "epoch": 2.6667325265497652, + "grad_norm": 0.46609047736390946, + "learning_rate": 4.806342200590227e-06, + "loss": 0.5128, + "step": 6792 + }, + { + "epoch": 2.6671276858483575, + "grad_norm": 0.4632346721128658, + "learning_rate": 4.8062817348270684e-06, + "loss": 0.4954, + "step": 6793 + }, + { + "epoch": 2.6675228451469497, + "grad_norm": 0.4608854558412661, + "learning_rate": 4.806221260006237e-06, + "loss": 0.505, + "step": 6794 + }, + { + "epoch": 2.667918004445542, + "grad_norm": 0.46630121452068407, + "learning_rate": 4.806160776127968e-06, + "loss": 0.5108, + "step": 6795 + }, + { + "epoch": 2.668313163744134, + "grad_norm": 0.45531938998001714, + "learning_rate": 4.806100283192501e-06, + "loss": 0.4968, + "step": 6796 + }, + { + "epoch": 2.6687083230427264, + "grad_norm": 0.4555861650279177, + "learning_rate": 4.806039781200071e-06, + "loss": 0.5088, + "step": 6797 + }, + { + "epoch": 2.6691034823413187, + "grad_norm": 0.4803868219645436, + "learning_rate": 4.805979270150918e-06, + "loss": 0.5122, + "step": 6798 + }, + { + "epoch": 2.669498641639911, + "grad_norm": 0.47581987750590815, + "learning_rate": 4.805918750045278e-06, + "loss": 0.4929, + "step": 6799 + }, + { + "epoch": 2.669893800938503, + "grad_norm": 0.449650149880212, + "learning_rate": 4.80585822088339e-06, + "loss": 0.5197, + "step": 6800 + }, + { + "epoch": 2.6702889602370954, + "grad_norm": 0.43855063136998546, + "learning_rate": 4.8057976826654906e-06, + "loss": 0.5101, + "step": 6801 + }, + { + "epoch": 2.6706841195356876, + "grad_norm": 0.4720486302585973, + "learning_rate": 4.805737135391818e-06, + "loss": 0.5262, + "step": 6802 + }, + { + "epoch": 2.67107927883428, + "grad_norm": 0.44945472494966127, + "learning_rate": 4.80567657906261e-06, + "loss": 0.5048, + "step": 6803 + }, + { + "epoch": 2.671474438132872, + "grad_norm": 0.4669554725656825, + "learning_rate": 4.8056160136781055e-06, + "loss": 0.5115, + "step": 6804 + }, + { + "epoch": 2.6718695974314643, + "grad_norm": 0.4403150506327582, + "learning_rate": 4.805555439238541e-06, + "loss": 0.5031, + "step": 6805 + }, + { + "epoch": 2.6722647567300566, + "grad_norm": 0.5000895923389528, + "learning_rate": 4.805494855744154e-06, + "loss": 0.5302, + "step": 6806 + }, + { + "epoch": 2.672659916028649, + "grad_norm": 0.47576343934993737, + "learning_rate": 4.8054342631951836e-06, + "loss": 0.4895, + "step": 6807 + }, + { + "epoch": 2.673055075327241, + "grad_norm": 0.477265617906093, + "learning_rate": 4.8053736615918675e-06, + "loss": 0.5007, + "step": 6808 + }, + { + "epoch": 2.6734502346258333, + "grad_norm": 0.4485511317732285, + "learning_rate": 4.8053130509344434e-06, + "loss": 0.5081, + "step": 6809 + }, + { + "epoch": 2.6738453939244256, + "grad_norm": 0.45923352589367195, + "learning_rate": 4.8052524312231494e-06, + "loss": 0.5056, + "step": 6810 + }, + { + "epoch": 2.674240553223018, + "grad_norm": 0.4661199256713386, + "learning_rate": 4.8051918024582235e-06, + "loss": 0.506, + "step": 6811 + }, + { + "epoch": 2.67463571252161, + "grad_norm": 0.4610083722334732, + "learning_rate": 4.8051311646399045e-06, + "loss": 0.5033, + "step": 6812 + }, + { + "epoch": 2.6750308718202023, + "grad_norm": 0.43652772614403434, + "learning_rate": 4.80507051776843e-06, + "loss": 0.5071, + "step": 6813 + }, + { + "epoch": 2.6754260311187945, + "grad_norm": 0.4691550437232749, + "learning_rate": 4.805009861844038e-06, + "loss": 0.527, + "step": 6814 + }, + { + "epoch": 2.6758211904173868, + "grad_norm": 0.4560452636416422, + "learning_rate": 4.804949196866967e-06, + "loss": 0.4908, + "step": 6815 + }, + { + "epoch": 2.676216349715979, + "grad_norm": 0.48133219998650245, + "learning_rate": 4.8048885228374556e-06, + "loss": 0.5118, + "step": 6816 + }, + { + "epoch": 2.6766115090145712, + "grad_norm": 0.452233986235823, + "learning_rate": 4.804827839755741e-06, + "loss": 0.4915, + "step": 6817 + }, + { + "epoch": 2.6770066683131635, + "grad_norm": 0.4631514836434119, + "learning_rate": 4.804767147622062e-06, + "loss": 0.529, + "step": 6818 + }, + { + "epoch": 2.6774018276117557, + "grad_norm": 0.4502944964366507, + "learning_rate": 4.804706446436658e-06, + "loss": 0.5077, + "step": 6819 + }, + { + "epoch": 2.6777969869103484, + "grad_norm": 0.46421201437573734, + "learning_rate": 4.8046457361997655e-06, + "loss": 0.5185, + "step": 6820 + }, + { + "epoch": 2.6781921462089406, + "grad_norm": 0.4613890649783742, + "learning_rate": 4.804585016911625e-06, + "loss": 0.5138, + "step": 6821 + }, + { + "epoch": 2.678587305507533, + "grad_norm": 0.4456774139985832, + "learning_rate": 4.8045242885724735e-06, + "loss": 0.5217, + "step": 6822 + }, + { + "epoch": 2.678982464806125, + "grad_norm": 0.47237905847343165, + "learning_rate": 4.80446355118255e-06, + "loss": 0.5191, + "step": 6823 + }, + { + "epoch": 2.6793776241047174, + "grad_norm": 0.4974035566555643, + "learning_rate": 4.804402804742093e-06, + "loss": 0.5512, + "step": 6824 + }, + { + "epoch": 2.6797727834033096, + "grad_norm": 0.47581595796135917, + "learning_rate": 4.804342049251341e-06, + "loss": 0.5303, + "step": 6825 + }, + { + "epoch": 2.680167942701902, + "grad_norm": 0.47309784832193486, + "learning_rate": 4.804281284710534e-06, + "loss": 0.5315, + "step": 6826 + }, + { + "epoch": 2.680563102000494, + "grad_norm": 0.44291199384154933, + "learning_rate": 4.804220511119908e-06, + "loss": 0.5095, + "step": 6827 + }, + { + "epoch": 2.6809582612990863, + "grad_norm": 0.4418663874638194, + "learning_rate": 4.804159728479703e-06, + "loss": 0.5, + "step": 6828 + }, + { + "epoch": 2.6813534205976786, + "grad_norm": 0.46327736667282093, + "learning_rate": 4.804098936790158e-06, + "loss": 0.5141, + "step": 6829 + }, + { + "epoch": 2.681748579896271, + "grad_norm": 0.4479767367906708, + "learning_rate": 4.804038136051512e-06, + "loss": 0.5011, + "step": 6830 + }, + { + "epoch": 2.682143739194863, + "grad_norm": 0.46247506860437304, + "learning_rate": 4.803977326264003e-06, + "loss": 0.5114, + "step": 6831 + }, + { + "epoch": 2.6825388984934553, + "grad_norm": 0.46141174971832466, + "learning_rate": 4.803916507427869e-06, + "loss": 0.5141, + "step": 6832 + }, + { + "epoch": 2.6829340577920475, + "grad_norm": 0.4873722164912079, + "learning_rate": 4.803855679543352e-06, + "loss": 0.4998, + "step": 6833 + }, + { + "epoch": 2.6833292170906398, + "grad_norm": 0.46403561177788066, + "learning_rate": 4.803794842610687e-06, + "loss": 0.5164, + "step": 6834 + }, + { + "epoch": 2.683724376389232, + "grad_norm": 0.4456637637514536, + "learning_rate": 4.803733996630116e-06, + "loss": 0.5018, + "step": 6835 + }, + { + "epoch": 2.6841195356878242, + "grad_norm": 0.4563297591462206, + "learning_rate": 4.803673141601876e-06, + "loss": 0.4958, + "step": 6836 + }, + { + "epoch": 2.6845146949864165, + "grad_norm": 0.46433291682819583, + "learning_rate": 4.803612277526207e-06, + "loss": 0.5041, + "step": 6837 + }, + { + "epoch": 2.6849098542850087, + "grad_norm": 0.45384038987818476, + "learning_rate": 4.803551404403348e-06, + "loss": 0.5029, + "step": 6838 + }, + { + "epoch": 2.685305013583601, + "grad_norm": 0.4396868856560902, + "learning_rate": 4.803490522233538e-06, + "loss": 0.4979, + "step": 6839 + }, + { + "epoch": 2.685700172882193, + "grad_norm": 0.45017056466155314, + "learning_rate": 4.803429631017016e-06, + "loss": 0.5048, + "step": 6840 + }, + { + "epoch": 2.6860953321807854, + "grad_norm": 0.4574075515113022, + "learning_rate": 4.8033687307540214e-06, + "loss": 0.5155, + "step": 6841 + }, + { + "epoch": 2.6864904914793777, + "grad_norm": 0.4618670159293962, + "learning_rate": 4.803307821444793e-06, + "loss": 0.516, + "step": 6842 + }, + { + "epoch": 2.68688565077797, + "grad_norm": 0.4464411508006451, + "learning_rate": 4.803246903089569e-06, + "loss": 0.5171, + "step": 6843 + }, + { + "epoch": 2.687280810076562, + "grad_norm": 0.45337545621080044, + "learning_rate": 4.80318597568859e-06, + "loss": 0.5238, + "step": 6844 + }, + { + "epoch": 2.6876759693751544, + "grad_norm": 0.4706459371294516, + "learning_rate": 4.803125039242096e-06, + "loss": 0.5214, + "step": 6845 + }, + { + "epoch": 2.6880711286737466, + "grad_norm": 0.444744828944862, + "learning_rate": 4.8030640937503245e-06, + "loss": 0.5177, + "step": 6846 + }, + { + "epoch": 2.688466287972339, + "grad_norm": 0.4650828362886886, + "learning_rate": 4.803003139213517e-06, + "loss": 0.5075, + "step": 6847 + }, + { + "epoch": 2.688861447270931, + "grad_norm": 0.45639383031758396, + "learning_rate": 4.802942175631911e-06, + "loss": 0.5011, + "step": 6848 + }, + { + "epoch": 2.6892566065695234, + "grad_norm": 0.466428988530897, + "learning_rate": 4.802881203005746e-06, + "loss": 0.5139, + "step": 6849 + }, + { + "epoch": 2.6896517658681156, + "grad_norm": 0.4525428401741202, + "learning_rate": 4.802820221335263e-06, + "loss": 0.4978, + "step": 6850 + }, + { + "epoch": 2.690046925166708, + "grad_norm": 0.45716169122612815, + "learning_rate": 4.8027592306206995e-06, + "loss": 0.501, + "step": 6851 + }, + { + "epoch": 2.6904420844653, + "grad_norm": 0.48662071727200495, + "learning_rate": 4.802698230862296e-06, + "loss": 0.5265, + "step": 6852 + }, + { + "epoch": 2.6908372437638923, + "grad_norm": 0.4623332955672627, + "learning_rate": 4.802637222060293e-06, + "loss": 0.5163, + "step": 6853 + }, + { + "epoch": 2.6912324030624846, + "grad_norm": 0.45648458131398784, + "learning_rate": 4.802576204214928e-06, + "loss": 0.515, + "step": 6854 + }, + { + "epoch": 2.691627562361077, + "grad_norm": 0.4551657895457975, + "learning_rate": 4.802515177326444e-06, + "loss": 0.5282, + "step": 6855 + }, + { + "epoch": 2.692022721659669, + "grad_norm": 0.4475910585637045, + "learning_rate": 4.802454141395076e-06, + "loss": 0.5113, + "step": 6856 + }, + { + "epoch": 2.6924178809582613, + "grad_norm": 0.4558226749142037, + "learning_rate": 4.802393096421068e-06, + "loss": 0.5123, + "step": 6857 + }, + { + "epoch": 2.6928130402568535, + "grad_norm": 0.4486376587821075, + "learning_rate": 4.802332042404657e-06, + "loss": 0.5117, + "step": 6858 + }, + { + "epoch": 2.6932081995554458, + "grad_norm": 0.44884818601499643, + "learning_rate": 4.8022709793460846e-06, + "loss": 0.5084, + "step": 6859 + }, + { + "epoch": 2.693603358854038, + "grad_norm": 0.5475296537368842, + "learning_rate": 4.8022099072455896e-06, + "loss": 0.5086, + "step": 6860 + }, + { + "epoch": 2.6939985181526303, + "grad_norm": 0.4747377379257253, + "learning_rate": 4.802148826103412e-06, + "loss": 0.5109, + "step": 6861 + }, + { + "epoch": 2.6943936774512225, + "grad_norm": 0.47187115055855244, + "learning_rate": 4.802087735919792e-06, + "loss": 0.5107, + "step": 6862 + }, + { + "epoch": 2.6947888367498147, + "grad_norm": 0.45581653066216854, + "learning_rate": 4.802026636694969e-06, + "loss": 0.5092, + "step": 6863 + }, + { + "epoch": 2.695183996048407, + "grad_norm": 0.45531582354330513, + "learning_rate": 4.8019655284291825e-06, + "loss": 0.4925, + "step": 6864 + }, + { + "epoch": 2.695579155346999, + "grad_norm": 0.5224115146598348, + "learning_rate": 4.801904411122675e-06, + "loss": 0.5203, + "step": 6865 + }, + { + "epoch": 2.6959743146455915, + "grad_norm": 0.4448869457231406, + "learning_rate": 4.8018432847756825e-06, + "loss": 0.4948, + "step": 6866 + }, + { + "epoch": 2.6963694739441837, + "grad_norm": 0.46002467750035936, + "learning_rate": 4.801782149388448e-06, + "loss": 0.5056, + "step": 6867 + }, + { + "epoch": 2.696764633242776, + "grad_norm": 0.47172532896971114, + "learning_rate": 4.801721004961213e-06, + "loss": 0.5075, + "step": 6868 + }, + { + "epoch": 2.697159792541368, + "grad_norm": 0.45878868214555063, + "learning_rate": 4.8016598514942135e-06, + "loss": 0.4925, + "step": 6869 + }, + { + "epoch": 2.6975549518399604, + "grad_norm": 0.4544323936183125, + "learning_rate": 4.801598688987692e-06, + "loss": 0.5035, + "step": 6870 + }, + { + "epoch": 2.6979501111385527, + "grad_norm": 0.4774600541049669, + "learning_rate": 4.801537517441889e-06, + "loss": 0.5106, + "step": 6871 + }, + { + "epoch": 2.698345270437145, + "grad_norm": 0.4625044065364173, + "learning_rate": 4.801476336857043e-06, + "loss": 0.4934, + "step": 6872 + }, + { + "epoch": 2.698740429735737, + "grad_norm": 0.45276021101004094, + "learning_rate": 4.801415147233397e-06, + "loss": 0.4948, + "step": 6873 + }, + { + "epoch": 2.6991355890343294, + "grad_norm": 0.46016046750830586, + "learning_rate": 4.801353948571189e-06, + "loss": 0.5105, + "step": 6874 + }, + { + "epoch": 2.6995307483329216, + "grad_norm": 0.46937028530160213, + "learning_rate": 4.801292740870661e-06, + "loss": 0.5325, + "step": 6875 + }, + { + "epoch": 2.699925907631514, + "grad_norm": 0.43934115205425334, + "learning_rate": 4.801231524132052e-06, + "loss": 0.5062, + "step": 6876 + }, + { + "epoch": 2.700321066930106, + "grad_norm": 0.4469967985779925, + "learning_rate": 4.8011702983556026e-06, + "loss": 0.4971, + "step": 6877 + }, + { + "epoch": 2.7007162262286983, + "grad_norm": 0.46826293402204716, + "learning_rate": 4.801109063541554e-06, + "loss": 0.4992, + "step": 6878 + }, + { + "epoch": 2.7011113855272906, + "grad_norm": 0.5113971704976565, + "learning_rate": 4.801047819690146e-06, + "loss": 0.5298, + "step": 6879 + }, + { + "epoch": 2.701506544825883, + "grad_norm": 0.4645313397142929, + "learning_rate": 4.80098656680162e-06, + "loss": 0.5112, + "step": 6880 + }, + { + "epoch": 2.701901704124475, + "grad_norm": 0.47144303739327625, + "learning_rate": 4.800925304876215e-06, + "loss": 0.5002, + "step": 6881 + }, + { + "epoch": 2.7022968634230673, + "grad_norm": 0.4780153361617559, + "learning_rate": 4.800864033914173e-06, + "loss": 0.5317, + "step": 6882 + }, + { + "epoch": 2.7026920227216595, + "grad_norm": 0.46311336498971456, + "learning_rate": 4.800802753915735e-06, + "loss": 0.5098, + "step": 6883 + }, + { + "epoch": 2.7030871820202518, + "grad_norm": 0.4704994698152214, + "learning_rate": 4.8007414648811405e-06, + "loss": 0.4842, + "step": 6884 + }, + { + "epoch": 2.703482341318844, + "grad_norm": 0.44615978605663914, + "learning_rate": 4.80068016681063e-06, + "loss": 0.5057, + "step": 6885 + }, + { + "epoch": 2.7038775006174363, + "grad_norm": 0.4582730966673267, + "learning_rate": 4.800618859704445e-06, + "loss": 0.4998, + "step": 6886 + }, + { + "epoch": 2.7042726599160285, + "grad_norm": 0.47695840090832414, + "learning_rate": 4.800557543562827e-06, + "loss": 0.5027, + "step": 6887 + }, + { + "epoch": 2.7046678192146207, + "grad_norm": 0.464279484547299, + "learning_rate": 4.800496218386015e-06, + "loss": 0.4821, + "step": 6888 + }, + { + "epoch": 2.705062978513213, + "grad_norm": 0.4627589954096053, + "learning_rate": 4.800434884174251e-06, + "loss": 0.5229, + "step": 6889 + }, + { + "epoch": 2.705458137811805, + "grad_norm": 0.46481102969321464, + "learning_rate": 4.800373540927776e-06, + "loss": 0.4879, + "step": 6890 + }, + { + "epoch": 2.705853297110398, + "grad_norm": 0.46484595991261723, + "learning_rate": 4.800312188646831e-06, + "loss": 0.5025, + "step": 6891 + }, + { + "epoch": 2.70624845640899, + "grad_norm": 0.4679261982088802, + "learning_rate": 4.800250827331656e-06, + "loss": 0.5031, + "step": 6892 + }, + { + "epoch": 2.7066436157075824, + "grad_norm": 0.45277952364743757, + "learning_rate": 4.800189456982492e-06, + "loss": 0.5067, + "step": 6893 + }, + { + "epoch": 2.7070387750061746, + "grad_norm": 0.4581955934170635, + "learning_rate": 4.800128077599581e-06, + "loss": 0.498, + "step": 6894 + }, + { + "epoch": 2.707433934304767, + "grad_norm": 0.4728192631389551, + "learning_rate": 4.800066689183164e-06, + "loss": 0.5116, + "step": 6895 + }, + { + "epoch": 2.707829093603359, + "grad_norm": 0.4742126911889233, + "learning_rate": 4.800005291733482e-06, + "loss": 0.5127, + "step": 6896 + }, + { + "epoch": 2.7082242529019513, + "grad_norm": 0.4515853200904919, + "learning_rate": 4.7999438852507745e-06, + "loss": 0.4993, + "step": 6897 + }, + { + "epoch": 2.7086194122005436, + "grad_norm": 0.4613616972466868, + "learning_rate": 4.799882469735285e-06, + "loss": 0.5032, + "step": 6898 + }, + { + "epoch": 2.709014571499136, + "grad_norm": 0.45816883607410985, + "learning_rate": 4.799821045187254e-06, + "loss": 0.5137, + "step": 6899 + }, + { + "epoch": 2.709409730797728, + "grad_norm": 0.44940851329013853, + "learning_rate": 4.7997596116069215e-06, + "loss": 0.5187, + "step": 6900 + }, + { + "epoch": 2.7098048900963203, + "grad_norm": 0.4502469570380973, + "learning_rate": 4.79969816899453e-06, + "loss": 0.5025, + "step": 6901 + }, + { + "epoch": 2.7102000493949125, + "grad_norm": 0.4481223650440485, + "learning_rate": 4.799636717350321e-06, + "loss": 0.4923, + "step": 6902 + }, + { + "epoch": 2.710595208693505, + "grad_norm": 0.45792623854887804, + "learning_rate": 4.7995752566745345e-06, + "loss": 0.5043, + "step": 6903 + }, + { + "epoch": 2.710990367992097, + "grad_norm": 0.4601271483289185, + "learning_rate": 4.7995137869674135e-06, + "loss": 0.509, + "step": 6904 + }, + { + "epoch": 2.7113855272906893, + "grad_norm": 0.4476972091826272, + "learning_rate": 4.799452308229199e-06, + "loss": 0.5014, + "step": 6905 + }, + { + "epoch": 2.7117806865892815, + "grad_norm": 0.4600909924968701, + "learning_rate": 4.7993908204601315e-06, + "loss": 0.5073, + "step": 6906 + }, + { + "epoch": 2.7121758458878737, + "grad_norm": 0.45934839488330736, + "learning_rate": 4.799329323660453e-06, + "loss": 0.5173, + "step": 6907 + }, + { + "epoch": 2.712571005186466, + "grad_norm": 0.45337135497372977, + "learning_rate": 4.799267817830406e-06, + "loss": 0.501, + "step": 6908 + }, + { + "epoch": 2.7129661644850582, + "grad_norm": 0.4500761726544255, + "learning_rate": 4.7992063029702304e-06, + "loss": 0.5018, + "step": 6909 + }, + { + "epoch": 2.7133613237836505, + "grad_norm": 0.45950279024379764, + "learning_rate": 4.799144779080169e-06, + "loss": 0.517, + "step": 6910 + }, + { + "epoch": 2.7137564830822427, + "grad_norm": 0.4730730331136291, + "learning_rate": 4.799083246160463e-06, + "loss": 0.4923, + "step": 6911 + }, + { + "epoch": 2.714151642380835, + "grad_norm": 0.43830507511052397, + "learning_rate": 4.799021704211354e-06, + "loss": 0.4934, + "step": 6912 + }, + { + "epoch": 2.714546801679427, + "grad_norm": 0.48563994353988965, + "learning_rate": 4.798960153233084e-06, + "loss": 0.5061, + "step": 6913 + }, + { + "epoch": 2.7149419609780194, + "grad_norm": 0.4792693935069234, + "learning_rate": 4.798898593225894e-06, + "loss": 0.5025, + "step": 6914 + }, + { + "epoch": 2.7153371202766117, + "grad_norm": 0.4535898076683153, + "learning_rate": 4.798837024190027e-06, + "loss": 0.5132, + "step": 6915 + }, + { + "epoch": 2.715732279575204, + "grad_norm": 0.47116866983420425, + "learning_rate": 4.798775446125723e-06, + "loss": 0.4975, + "step": 6916 + }, + { + "epoch": 2.716127438873796, + "grad_norm": 0.4460352697228603, + "learning_rate": 4.7987138590332264e-06, + "loss": 0.5062, + "step": 6917 + }, + { + "epoch": 2.7165225981723884, + "grad_norm": 0.4520031645447902, + "learning_rate": 4.798652262912776e-06, + "loss": 0.512, + "step": 6918 + }, + { + "epoch": 2.7169177574709806, + "grad_norm": 0.44599164428046767, + "learning_rate": 4.798590657764617e-06, + "loss": 0.4947, + "step": 6919 + }, + { + "epoch": 2.717312916769573, + "grad_norm": 0.4462324282376401, + "learning_rate": 4.798529043588989e-06, + "loss": 0.4997, + "step": 6920 + }, + { + "epoch": 2.717708076068165, + "grad_norm": 0.4517482214846663, + "learning_rate": 4.798467420386133e-06, + "loss": 0.5082, + "step": 6921 + }, + { + "epoch": 2.7181032353667574, + "grad_norm": 0.45386053523383296, + "learning_rate": 4.798405788156295e-06, + "loss": 0.4976, + "step": 6922 + }, + { + "epoch": 2.7184983946653496, + "grad_norm": 0.46008987922377587, + "learning_rate": 4.7983441468997134e-06, + "loss": 0.493, + "step": 6923 + }, + { + "epoch": 2.718893553963942, + "grad_norm": 0.4481062465022577, + "learning_rate": 4.798282496616633e-06, + "loss": 0.5143, + "step": 6924 + }, + { + "epoch": 2.719288713262534, + "grad_norm": 0.4505251384368751, + "learning_rate": 4.7982208373072936e-06, + "loss": 0.5175, + "step": 6925 + }, + { + "epoch": 2.7196838725611263, + "grad_norm": 0.47657228107417726, + "learning_rate": 4.798159168971938e-06, + "loss": 0.5243, + "step": 6926 + }, + { + "epoch": 2.7200790318597186, + "grad_norm": 0.46381523563325006, + "learning_rate": 4.798097491610809e-06, + "loss": 0.4926, + "step": 6927 + }, + { + "epoch": 2.720474191158311, + "grad_norm": 0.4925744506297022, + "learning_rate": 4.798035805224149e-06, + "loss": 0.4848, + "step": 6928 + }, + { + "epoch": 2.720869350456903, + "grad_norm": 0.44864029022686136, + "learning_rate": 4.797974109812199e-06, + "loss": 0.4967, + "step": 6929 + }, + { + "epoch": 2.7212645097554953, + "grad_norm": 0.49217622158783636, + "learning_rate": 4.797912405375203e-06, + "loss": 0.5014, + "step": 6930 + }, + { + "epoch": 2.7216596690540875, + "grad_norm": 0.4551306861648084, + "learning_rate": 4.797850691913402e-06, + "loss": 0.5291, + "step": 6931 + }, + { + "epoch": 2.7220548283526798, + "grad_norm": 0.4637807915278924, + "learning_rate": 4.797788969427039e-06, + "loss": 0.5012, + "step": 6932 + }, + { + "epoch": 2.722449987651272, + "grad_norm": 0.4512850558545194, + "learning_rate": 4.797727237916355e-06, + "loss": 0.5125, + "step": 6933 + }, + { + "epoch": 2.7228451469498642, + "grad_norm": 0.4594283870187807, + "learning_rate": 4.7976654973815955e-06, + "loss": 0.5108, + "step": 6934 + }, + { + "epoch": 2.7232403062484565, + "grad_norm": 0.5129146142831824, + "learning_rate": 4.797603747823e-06, + "loss": 0.5151, + "step": 6935 + }, + { + "epoch": 2.7236354655470487, + "grad_norm": 0.4624544070980088, + "learning_rate": 4.797541989240812e-06, + "loss": 0.5022, + "step": 6936 + }, + { + "epoch": 2.724030624845641, + "grad_norm": 0.44163885935392466, + "learning_rate": 4.797480221635276e-06, + "loss": 0.4944, + "step": 6937 + }, + { + "epoch": 2.724425784144233, + "grad_norm": 0.4428403204450448, + "learning_rate": 4.7974184450066305e-06, + "loss": 0.5108, + "step": 6938 + }, + { + "epoch": 2.7248209434428254, + "grad_norm": 0.46568478000032276, + "learning_rate": 4.7973566593551216e-06, + "loss": 0.4976, + "step": 6939 + }, + { + "epoch": 2.7252161027414177, + "grad_norm": 0.45155171800762267, + "learning_rate": 4.7972948646809906e-06, + "loss": 0.5149, + "step": 6940 + }, + { + "epoch": 2.72561126204001, + "grad_norm": 0.46921689781602866, + "learning_rate": 4.797233060984481e-06, + "loss": 0.5185, + "step": 6941 + }, + { + "epoch": 2.726006421338602, + "grad_norm": 0.4584753926896495, + "learning_rate": 4.797171248265833e-06, + "loss": 0.5043, + "step": 6942 + }, + { + "epoch": 2.7264015806371944, + "grad_norm": 0.4457236150051623, + "learning_rate": 4.797109426525293e-06, + "loss": 0.5003, + "step": 6943 + }, + { + "epoch": 2.7267967399357866, + "grad_norm": 0.4577197175651194, + "learning_rate": 4.797047595763101e-06, + "loss": 0.5268, + "step": 6944 + }, + { + "epoch": 2.727191899234379, + "grad_norm": 0.4574229252485062, + "learning_rate": 4.796985755979502e-06, + "loss": 0.498, + "step": 6945 + }, + { + "epoch": 2.727587058532971, + "grad_norm": 0.4596664767781495, + "learning_rate": 4.796923907174737e-06, + "loss": 0.5038, + "step": 6946 + }, + { + "epoch": 2.7279822178315634, + "grad_norm": 0.4458255538392768, + "learning_rate": 4.79686204934905e-06, + "loss": 0.5137, + "step": 6947 + }, + { + "epoch": 2.7283773771301556, + "grad_norm": 0.45280689074026337, + "learning_rate": 4.796800182502683e-06, + "loss": 0.5208, + "step": 6948 + }, + { + "epoch": 2.728772536428748, + "grad_norm": 0.47554043866753837, + "learning_rate": 4.7967383066358795e-06, + "loss": 0.5044, + "step": 6949 + }, + { + "epoch": 2.72916769572734, + "grad_norm": 0.4607340942101068, + "learning_rate": 4.796676421748884e-06, + "loss": 0.5169, + "step": 6950 + }, + { + "epoch": 2.7295628550259323, + "grad_norm": 0.44291773525236144, + "learning_rate": 4.796614527841937e-06, + "loss": 0.4949, + "step": 6951 + }, + { + "epoch": 2.7299580143245246, + "grad_norm": 0.46432616537422905, + "learning_rate": 4.796552624915283e-06, + "loss": 0.5058, + "step": 6952 + }, + { + "epoch": 2.730353173623117, + "grad_norm": 0.4633027740708962, + "learning_rate": 4.796490712969165e-06, + "loss": 0.5167, + "step": 6953 + }, + { + "epoch": 2.730748332921709, + "grad_norm": 0.5118956983442591, + "learning_rate": 4.796428792003826e-06, + "loss": 0.5229, + "step": 6954 + }, + { + "epoch": 2.7311434922203013, + "grad_norm": 0.4543120210417002, + "learning_rate": 4.796366862019508e-06, + "loss": 0.4928, + "step": 6955 + }, + { + "epoch": 2.7315386515188935, + "grad_norm": 0.47789407600911105, + "learning_rate": 4.796304923016456e-06, + "loss": 0.5188, + "step": 6956 + }, + { + "epoch": 2.7319338108174858, + "grad_norm": 0.46242168805945055, + "learning_rate": 4.796242974994913e-06, + "loss": 0.4952, + "step": 6957 + }, + { + "epoch": 2.732328970116078, + "grad_norm": 0.4566066993777903, + "learning_rate": 4.796181017955122e-06, + "loss": 0.5047, + "step": 6958 + }, + { + "epoch": 2.7327241294146702, + "grad_norm": 0.47657476604138915, + "learning_rate": 4.796119051897327e-06, + "loss": 0.5012, + "step": 6959 + }, + { + "epoch": 2.7331192887132625, + "grad_norm": 0.46254465859951066, + "learning_rate": 4.79605707682177e-06, + "loss": 0.4958, + "step": 6960 + }, + { + "epoch": 2.7335144480118547, + "grad_norm": 0.45509248055911644, + "learning_rate": 4.795995092728694e-06, + "loss": 0.5063, + "step": 6961 + }, + { + "epoch": 2.733909607310447, + "grad_norm": 0.4460935553959142, + "learning_rate": 4.795933099618344e-06, + "loss": 0.5105, + "step": 6962 + }, + { + "epoch": 2.734304766609039, + "grad_norm": 0.4477458914794227, + "learning_rate": 4.795871097490964e-06, + "loss": 0.5147, + "step": 6963 + }, + { + "epoch": 2.7346999259076314, + "grad_norm": 0.4423880972213776, + "learning_rate": 4.795809086346796e-06, + "loss": 0.5089, + "step": 6964 + }, + { + "epoch": 2.7350950852062237, + "grad_norm": 0.4391301789018041, + "learning_rate": 4.795747066186083e-06, + "loss": 0.5023, + "step": 6965 + }, + { + "epoch": 2.735490244504816, + "grad_norm": 0.45270875027903384, + "learning_rate": 4.79568503700907e-06, + "loss": 0.5027, + "step": 6966 + }, + { + "epoch": 2.735885403803408, + "grad_norm": 0.4548394809042717, + "learning_rate": 4.795622998816001e-06, + "loss": 0.4998, + "step": 6967 + }, + { + "epoch": 2.7362805631020004, + "grad_norm": 0.4500140263131528, + "learning_rate": 4.795560951607118e-06, + "loss": 0.4956, + "step": 6968 + }, + { + "epoch": 2.7366757224005926, + "grad_norm": 0.45444549472429036, + "learning_rate": 4.795498895382667e-06, + "loss": 0.4999, + "step": 6969 + }, + { + "epoch": 2.737070881699185, + "grad_norm": 0.45943067355398565, + "learning_rate": 4.795436830142888e-06, + "loss": 0.5079, + "step": 6970 + }, + { + "epoch": 2.737466040997777, + "grad_norm": 0.4583842097104527, + "learning_rate": 4.795374755888028e-06, + "loss": 0.5035, + "step": 6971 + }, + { + "epoch": 2.7378612002963694, + "grad_norm": 0.44524024874479756, + "learning_rate": 4.7953126726183305e-06, + "loss": 0.4885, + "step": 6972 + }, + { + "epoch": 2.7382563595949616, + "grad_norm": 0.5259627464540994, + "learning_rate": 4.795250580334038e-06, + "loss": 0.5078, + "step": 6973 + }, + { + "epoch": 2.738651518893554, + "grad_norm": 0.44629367850165574, + "learning_rate": 4.795188479035395e-06, + "loss": 0.5132, + "step": 6974 + }, + { + "epoch": 2.739046678192146, + "grad_norm": 0.4738145104107334, + "learning_rate": 4.7951263687226444e-06, + "loss": 0.5144, + "step": 6975 + }, + { + "epoch": 2.7394418374907383, + "grad_norm": 0.4536390577628937, + "learning_rate": 4.795064249396032e-06, + "loss": 0.4981, + "step": 6976 + }, + { + "epoch": 2.7398369967893306, + "grad_norm": 0.48887491920473725, + "learning_rate": 4.795002121055802e-06, + "loss": 0.529, + "step": 6977 + }, + { + "epoch": 2.740232156087923, + "grad_norm": 0.45703040398721106, + "learning_rate": 4.794939983702196e-06, + "loss": 0.4995, + "step": 6978 + }, + { + "epoch": 2.740627315386515, + "grad_norm": 0.4688231922181613, + "learning_rate": 4.7948778373354585e-06, + "loss": 0.5007, + "step": 6979 + }, + { + "epoch": 2.7410224746851073, + "grad_norm": 0.4466851138075401, + "learning_rate": 4.794815681955836e-06, + "loss": 0.496, + "step": 6980 + }, + { + "epoch": 2.7414176339836995, + "grad_norm": 0.4893848636599617, + "learning_rate": 4.79475351756357e-06, + "loss": 0.5012, + "step": 6981 + }, + { + "epoch": 2.7418127932822918, + "grad_norm": 0.4883005520320106, + "learning_rate": 4.794691344158906e-06, + "loss": 0.5085, + "step": 6982 + }, + { + "epoch": 2.742207952580884, + "grad_norm": 0.45293984493757544, + "learning_rate": 4.794629161742088e-06, + "loss": 0.5008, + "step": 6983 + }, + { + "epoch": 2.7426031118794763, + "grad_norm": 0.5501230497941952, + "learning_rate": 4.79456697031336e-06, + "loss": 0.5128, + "step": 6984 + }, + { + "epoch": 2.7429982711780685, + "grad_norm": 0.4639739628186388, + "learning_rate": 4.794504769872966e-06, + "loss": 0.5169, + "step": 6985 + }, + { + "epoch": 2.7433934304766607, + "grad_norm": 0.4576900039164546, + "learning_rate": 4.794442560421151e-06, + "loss": 0.4819, + "step": 6986 + }, + { + "epoch": 2.743788589775253, + "grad_norm": 0.4954297664525487, + "learning_rate": 4.794380341958158e-06, + "loss": 0.5166, + "step": 6987 + }, + { + "epoch": 2.744183749073845, + "grad_norm": 0.46126145279470626, + "learning_rate": 4.794318114484233e-06, + "loss": 0.5118, + "step": 6988 + }, + { + "epoch": 2.7445789083724375, + "grad_norm": 0.4584209830526114, + "learning_rate": 4.79425587799962e-06, + "loss": 0.5086, + "step": 6989 + }, + { + "epoch": 2.7449740676710297, + "grad_norm": 0.4831191153527967, + "learning_rate": 4.794193632504561e-06, + "loss": 0.5441, + "step": 6990 + }, + { + "epoch": 2.745369226969622, + "grad_norm": 0.4797130021650316, + "learning_rate": 4.794131377999305e-06, + "loss": 0.5279, + "step": 6991 + }, + { + "epoch": 2.745764386268214, + "grad_norm": 0.4699057790970996, + "learning_rate": 4.794069114484092e-06, + "loss": 0.5147, + "step": 6992 + }, + { + "epoch": 2.7461595455668064, + "grad_norm": 0.46046778025908625, + "learning_rate": 4.79400684195917e-06, + "loss": 0.5086, + "step": 6993 + }, + { + "epoch": 2.7465547048653987, + "grad_norm": 0.4383439181821961, + "learning_rate": 4.793944560424782e-06, + "loss": 0.4921, + "step": 6994 + }, + { + "epoch": 2.746949864163991, + "grad_norm": 0.4643419519593291, + "learning_rate": 4.7938822698811725e-06, + "loss": 0.5024, + "step": 6995 + }, + { + "epoch": 2.747345023462583, + "grad_norm": 0.45885771648686097, + "learning_rate": 4.793819970328586e-06, + "loss": 0.4996, + "step": 6996 + }, + { + "epoch": 2.7477401827611754, + "grad_norm": 0.47670938755540426, + "learning_rate": 4.793757661767268e-06, + "loss": 0.5024, + "step": 6997 + }, + { + "epoch": 2.7481353420597676, + "grad_norm": 0.45854522700324046, + "learning_rate": 4.7936953441974624e-06, + "loss": 0.4988, + "step": 6998 + }, + { + "epoch": 2.74853050135836, + "grad_norm": 0.4518882922405723, + "learning_rate": 4.793633017619415e-06, + "loss": 0.5026, + "step": 6999 + }, + { + "epoch": 2.748925660656952, + "grad_norm": 0.4721585044201508, + "learning_rate": 4.793570682033368e-06, + "loss": 0.5066, + "step": 7000 + }, + { + "epoch": 2.7493208199555443, + "grad_norm": 0.45566875735908363, + "learning_rate": 4.7935083374395694e-06, + "loss": 0.4957, + "step": 7001 + }, + { + "epoch": 2.7497159792541366, + "grad_norm": 0.46827293594382546, + "learning_rate": 4.793445983838263e-06, + "loss": 0.5161, + "step": 7002 + }, + { + "epoch": 2.750111138552729, + "grad_norm": 0.45535123480528633, + "learning_rate": 4.793383621229694e-06, + "loss": 0.4846, + "step": 7003 + }, + { + "epoch": 2.750506297851321, + "grad_norm": 0.4640975956499678, + "learning_rate": 4.7933212496141055e-06, + "loss": 0.5196, + "step": 7004 + }, + { + "epoch": 2.7509014571499133, + "grad_norm": 0.45391152016709185, + "learning_rate": 4.793258868991743e-06, + "loss": 0.5044, + "step": 7005 + }, + { + "epoch": 2.7512966164485055, + "grad_norm": 0.456017711396762, + "learning_rate": 4.793196479362854e-06, + "loss": 0.5073, + "step": 7006 + }, + { + "epoch": 2.751691775747098, + "grad_norm": 0.457250650843779, + "learning_rate": 4.793134080727682e-06, + "loss": 0.5072, + "step": 7007 + }, + { + "epoch": 2.75208693504569, + "grad_norm": 0.5241068945571156, + "learning_rate": 4.79307167308647e-06, + "loss": 0.5143, + "step": 7008 + }, + { + "epoch": 2.7524820943442827, + "grad_norm": 0.44978349825521247, + "learning_rate": 4.793009256439466e-06, + "loss": 0.491, + "step": 7009 + }, + { + "epoch": 2.752877253642875, + "grad_norm": 0.445219206652602, + "learning_rate": 4.792946830786914e-06, + "loss": 0.4946, + "step": 7010 + }, + { + "epoch": 2.753272412941467, + "grad_norm": 0.4728591513095926, + "learning_rate": 4.792884396129059e-06, + "loss": 0.5342, + "step": 7011 + }, + { + "epoch": 2.7536675722400594, + "grad_norm": 0.4494210336181794, + "learning_rate": 4.792821952466146e-06, + "loss": 0.4868, + "step": 7012 + }, + { + "epoch": 2.7540627315386517, + "grad_norm": 0.46547211537802097, + "learning_rate": 4.7927594997984215e-06, + "loss": 0.491, + "step": 7013 + }, + { + "epoch": 2.754457890837244, + "grad_norm": 0.47364471713178513, + "learning_rate": 4.7926970381261295e-06, + "loss": 0.506, + "step": 7014 + }, + { + "epoch": 2.754853050135836, + "grad_norm": 0.4508628675635779, + "learning_rate": 4.7926345674495155e-06, + "loss": 0.5066, + "step": 7015 + }, + { + "epoch": 2.7552482094344284, + "grad_norm": 0.47245052795730946, + "learning_rate": 4.792572087768825e-06, + "loss": 0.509, + "step": 7016 + }, + { + "epoch": 2.7556433687330206, + "grad_norm": 0.49318688651877063, + "learning_rate": 4.792509599084304e-06, + "loss": 0.4953, + "step": 7017 + }, + { + "epoch": 2.756038528031613, + "grad_norm": 0.45637468572032197, + "learning_rate": 4.792447101396197e-06, + "loss": 0.5149, + "step": 7018 + }, + { + "epoch": 2.756433687330205, + "grad_norm": 0.4682354739864325, + "learning_rate": 4.79238459470475e-06, + "loss": 0.5186, + "step": 7019 + }, + { + "epoch": 2.7568288466287973, + "grad_norm": 0.4568829125381733, + "learning_rate": 4.7923220790102084e-06, + "loss": 0.5055, + "step": 7020 + }, + { + "epoch": 2.7572240059273896, + "grad_norm": 0.45180194260707746, + "learning_rate": 4.792259554312817e-06, + "loss": 0.4921, + "step": 7021 + }, + { + "epoch": 2.757619165225982, + "grad_norm": 0.4668813422184499, + "learning_rate": 4.7921970206128235e-06, + "loss": 0.5301, + "step": 7022 + }, + { + "epoch": 2.758014324524574, + "grad_norm": 0.4487332150132883, + "learning_rate": 4.7921344779104705e-06, + "loss": 0.5031, + "step": 7023 + }, + { + "epoch": 2.7584094838231663, + "grad_norm": 0.45510934399651354, + "learning_rate": 4.7920719262060055e-06, + "loss": 0.5386, + "step": 7024 + }, + { + "epoch": 2.7588046431217585, + "grad_norm": 0.44723741022980124, + "learning_rate": 4.792009365499674e-06, + "loss": 0.5155, + "step": 7025 + }, + { + "epoch": 2.759199802420351, + "grad_norm": 0.4504980939404593, + "learning_rate": 4.791946795791721e-06, + "loss": 0.5161, + "step": 7026 + }, + { + "epoch": 2.759594961718943, + "grad_norm": 0.44795581514294985, + "learning_rate": 4.791884217082394e-06, + "loss": 0.5015, + "step": 7027 + }, + { + "epoch": 2.7599901210175353, + "grad_norm": 0.45748558259217437, + "learning_rate": 4.791821629371936e-06, + "loss": 0.5038, + "step": 7028 + }, + { + "epoch": 2.7603852803161275, + "grad_norm": 0.47531123398447556, + "learning_rate": 4.791759032660596e-06, + "loss": 0.4986, + "step": 7029 + }, + { + "epoch": 2.7607804396147198, + "grad_norm": 0.46673985312016447, + "learning_rate": 4.7916964269486165e-06, + "loss": 0.4956, + "step": 7030 + }, + { + "epoch": 2.761175598913312, + "grad_norm": 0.4629491572931826, + "learning_rate": 4.791633812236245e-06, + "loss": 0.4946, + "step": 7031 + }, + { + "epoch": 2.7615707582119042, + "grad_norm": 0.4525334948265311, + "learning_rate": 4.791571188523729e-06, + "loss": 0.5164, + "step": 7032 + }, + { + "epoch": 2.7619659175104965, + "grad_norm": 0.45400679108436304, + "learning_rate": 4.7915085558113115e-06, + "loss": 0.4999, + "step": 7033 + }, + { + "epoch": 2.7623610768090887, + "grad_norm": 0.47943008845611956, + "learning_rate": 4.791445914099241e-06, + "loss": 0.5032, + "step": 7034 + }, + { + "epoch": 2.762756236107681, + "grad_norm": 0.4452226257043686, + "learning_rate": 4.791383263387761e-06, + "loss": 0.501, + "step": 7035 + }, + { + "epoch": 2.763151395406273, + "grad_norm": 0.48784364857172546, + "learning_rate": 4.7913206036771195e-06, + "loss": 0.5172, + "step": 7036 + }, + { + "epoch": 2.7635465547048654, + "grad_norm": 0.47792955328117803, + "learning_rate": 4.791257934967563e-06, + "loss": 0.5341, + "step": 7037 + }, + { + "epoch": 2.7639417140034577, + "grad_norm": 0.4432418586956085, + "learning_rate": 4.791195257259335e-06, + "loss": 0.4891, + "step": 7038 + }, + { + "epoch": 2.76433687330205, + "grad_norm": 0.4712175445574134, + "learning_rate": 4.791132570552685e-06, + "loss": 0.4999, + "step": 7039 + }, + { + "epoch": 2.764732032600642, + "grad_norm": 0.4439252343190045, + "learning_rate": 4.791069874847857e-06, + "loss": 0.5085, + "step": 7040 + }, + { + "epoch": 2.7651271918992344, + "grad_norm": 0.5317426037392942, + "learning_rate": 4.791007170145097e-06, + "loss": 0.5129, + "step": 7041 + }, + { + "epoch": 2.7655223511978266, + "grad_norm": 0.4587796397905756, + "learning_rate": 4.790944456444653e-06, + "loss": 0.5116, + "step": 7042 + }, + { + "epoch": 2.765917510496419, + "grad_norm": 0.4797852387845019, + "learning_rate": 4.7908817337467695e-06, + "loss": 0.5254, + "step": 7043 + }, + { + "epoch": 2.766312669795011, + "grad_norm": 0.4558663094889101, + "learning_rate": 4.790819002051694e-06, + "loss": 0.5199, + "step": 7044 + }, + { + "epoch": 2.7667078290936034, + "grad_norm": 0.44832319567068757, + "learning_rate": 4.790756261359673e-06, + "loss": 0.515, + "step": 7045 + }, + { + "epoch": 2.7671029883921956, + "grad_norm": 0.4482969941450535, + "learning_rate": 4.7906935116709505e-06, + "loss": 0.5046, + "step": 7046 + }, + { + "epoch": 2.767498147690788, + "grad_norm": 0.4559545857856152, + "learning_rate": 4.790630752985776e-06, + "loss": 0.4937, + "step": 7047 + }, + { + "epoch": 2.76789330698938, + "grad_norm": 0.44267716011895897, + "learning_rate": 4.790567985304396e-06, + "loss": 0.508, + "step": 7048 + }, + { + "epoch": 2.7682884662879723, + "grad_norm": 0.464044432217731, + "learning_rate": 4.790505208627055e-06, + "loss": 0.5148, + "step": 7049 + }, + { + "epoch": 2.7686836255865646, + "grad_norm": 0.4467116038672829, + "learning_rate": 4.790442422954e-06, + "loss": 0.498, + "step": 7050 + }, + { + "epoch": 2.769078784885157, + "grad_norm": 0.4798202007038068, + "learning_rate": 4.790379628285479e-06, + "loss": 0.5087, + "step": 7051 + }, + { + "epoch": 2.769473944183749, + "grad_norm": 0.44503110439284255, + "learning_rate": 4.790316824621736e-06, + "loss": 0.5137, + "step": 7052 + }, + { + "epoch": 2.7698691034823413, + "grad_norm": 0.46457729311037305, + "learning_rate": 4.79025401196302e-06, + "loss": 0.5195, + "step": 7053 + }, + { + "epoch": 2.7702642627809335, + "grad_norm": 0.44066547002420625, + "learning_rate": 4.790191190309578e-06, + "loss": 0.4925, + "step": 7054 + }, + { + "epoch": 2.7706594220795258, + "grad_norm": 0.45430470319634064, + "learning_rate": 4.790128359661654e-06, + "loss": 0.495, + "step": 7055 + }, + { + "epoch": 2.771054581378118, + "grad_norm": 0.4414603456774517, + "learning_rate": 4.790065520019498e-06, + "loss": 0.5038, + "step": 7056 + }, + { + "epoch": 2.7714497406767102, + "grad_norm": 0.4922018456920078, + "learning_rate": 4.790002671383354e-06, + "loss": 0.5155, + "step": 7057 + }, + { + "epoch": 2.7718448999753025, + "grad_norm": 0.453999025325055, + "learning_rate": 4.789939813753471e-06, + "loss": 0.5008, + "step": 7058 + }, + { + "epoch": 2.7722400592738947, + "grad_norm": 0.4617428583463696, + "learning_rate": 4.789876947130095e-06, + "loss": 0.4974, + "step": 7059 + }, + { + "epoch": 2.772635218572487, + "grad_norm": 0.464515686848104, + "learning_rate": 4.789814071513472e-06, + "loss": 0.514, + "step": 7060 + }, + { + "epoch": 2.773030377871079, + "grad_norm": 0.44904340222359973, + "learning_rate": 4.78975118690385e-06, + "loss": 0.5063, + "step": 7061 + }, + { + "epoch": 2.7734255371696714, + "grad_norm": 0.44818575851748277, + "learning_rate": 4.789688293301477e-06, + "loss": 0.5114, + "step": 7062 + }, + { + "epoch": 2.7738206964682637, + "grad_norm": 0.4394864938402016, + "learning_rate": 4.789625390706597e-06, + "loss": 0.4861, + "step": 7063 + }, + { + "epoch": 2.774215855766856, + "grad_norm": 0.4476957402660163, + "learning_rate": 4.789562479119459e-06, + "loss": 0.5039, + "step": 7064 + }, + { + "epoch": 2.774611015065448, + "grad_norm": 0.44866741905501123, + "learning_rate": 4.789499558540311e-06, + "loss": 0.4856, + "step": 7065 + }, + { + "epoch": 2.7750061743640404, + "grad_norm": 0.4657641738772069, + "learning_rate": 4.7894366289693984e-06, + "loss": 0.4909, + "step": 7066 + }, + { + "epoch": 2.7754013336626326, + "grad_norm": 0.43895805450614483, + "learning_rate": 4.789373690406969e-06, + "loss": 0.4977, + "step": 7067 + }, + { + "epoch": 2.775796492961225, + "grad_norm": 0.45870187996681017, + "learning_rate": 4.789310742853269e-06, + "loss": 0.4998, + "step": 7068 + }, + { + "epoch": 2.776191652259817, + "grad_norm": 0.45232602161964686, + "learning_rate": 4.789247786308548e-06, + "loss": 0.5082, + "step": 7069 + }, + { + "epoch": 2.7765868115584094, + "grad_norm": 0.4533454018434586, + "learning_rate": 4.789184820773052e-06, + "loss": 0.4736, + "step": 7070 + }, + { + "epoch": 2.7769819708570016, + "grad_norm": 0.4473360662431684, + "learning_rate": 4.7891218462470264e-06, + "loss": 0.4908, + "step": 7071 + }, + { + "epoch": 2.777377130155594, + "grad_norm": 0.4682252863668288, + "learning_rate": 4.7890588627307214e-06, + "loss": 0.5016, + "step": 7072 + }, + { + "epoch": 2.777772289454186, + "grad_norm": 0.4457755613086908, + "learning_rate": 4.788995870224382e-06, + "loss": 0.5183, + "step": 7073 + }, + { + "epoch": 2.7781674487527783, + "grad_norm": 0.46757695064672683, + "learning_rate": 4.788932868728258e-06, + "loss": 0.514, + "step": 7074 + }, + { + "epoch": 2.7785626080513706, + "grad_norm": 0.4538274673145632, + "learning_rate": 4.788869858242595e-06, + "loss": 0.5092, + "step": 7075 + }, + { + "epoch": 2.778957767349963, + "grad_norm": 0.4561259698590066, + "learning_rate": 4.788806838767642e-06, + "loss": 0.5331, + "step": 7076 + }, + { + "epoch": 2.779352926648555, + "grad_norm": 0.4589851990064589, + "learning_rate": 4.788743810303644e-06, + "loss": 0.5087, + "step": 7077 + }, + { + "epoch": 2.7797480859471473, + "grad_norm": 0.4587981064535018, + "learning_rate": 4.788680772850852e-06, + "loss": 0.5064, + "step": 7078 + }, + { + "epoch": 2.7801432452457395, + "grad_norm": 0.4495946809709087, + "learning_rate": 4.78861772640951e-06, + "loss": 0.5094, + "step": 7079 + }, + { + "epoch": 2.780538404544332, + "grad_norm": 0.4687277276868725, + "learning_rate": 4.788554670979868e-06, + "loss": 0.4998, + "step": 7080 + }, + { + "epoch": 2.7809335638429244, + "grad_norm": 0.4624450226026877, + "learning_rate": 4.7884916065621735e-06, + "loss": 0.5357, + "step": 7081 + }, + { + "epoch": 2.7813287231415167, + "grad_norm": 0.44737808383227995, + "learning_rate": 4.788428533156673e-06, + "loss": 0.5085, + "step": 7082 + }, + { + "epoch": 2.781723882440109, + "grad_norm": 0.4435365459154493, + "learning_rate": 4.788365450763614e-06, + "loss": 0.4855, + "step": 7083 + }, + { + "epoch": 2.782119041738701, + "grad_norm": 0.47758039605363733, + "learning_rate": 4.788302359383247e-06, + "loss": 0.5147, + "step": 7084 + }, + { + "epoch": 2.7825142010372934, + "grad_norm": 0.47467035291327786, + "learning_rate": 4.788239259015817e-06, + "loss": 0.5085, + "step": 7085 + }, + { + "epoch": 2.7829093603358857, + "grad_norm": 0.44586423467559405, + "learning_rate": 4.788176149661572e-06, + "loss": 0.4858, + "step": 7086 + }, + { + "epoch": 2.783304519634478, + "grad_norm": 0.45852446260824725, + "learning_rate": 4.7881130313207615e-06, + "loss": 0.4946, + "step": 7087 + }, + { + "epoch": 2.78369967893307, + "grad_norm": 0.4608076484794306, + "learning_rate": 4.7880499039936315e-06, + "loss": 0.5079, + "step": 7088 + }, + { + "epoch": 2.7840948382316624, + "grad_norm": 0.4653013987128913, + "learning_rate": 4.787986767680431e-06, + "loss": 0.5409, + "step": 7089 + }, + { + "epoch": 2.7844899975302546, + "grad_norm": 0.4563060161643037, + "learning_rate": 4.787923622381409e-06, + "loss": 0.4793, + "step": 7090 + }, + { + "epoch": 2.784885156828847, + "grad_norm": 0.474596234725646, + "learning_rate": 4.787860468096811e-06, + "loss": 0.5247, + "step": 7091 + }, + { + "epoch": 2.785280316127439, + "grad_norm": 0.4340035243691619, + "learning_rate": 4.787797304826887e-06, + "loss": 0.5038, + "step": 7092 + }, + { + "epoch": 2.7856754754260313, + "grad_norm": 0.45248266773447615, + "learning_rate": 4.787734132571884e-06, + "loss": 0.5287, + "step": 7093 + }, + { + "epoch": 2.7860706347246236, + "grad_norm": 0.45357497159528226, + "learning_rate": 4.7876709513320506e-06, + "loss": 0.4866, + "step": 7094 + }, + { + "epoch": 2.786465794023216, + "grad_norm": 0.471725493815604, + "learning_rate": 4.787607761107634e-06, + "loss": 0.509, + "step": 7095 + }, + { + "epoch": 2.786860953321808, + "grad_norm": 0.4550489319224383, + "learning_rate": 4.7875445618988846e-06, + "loss": 0.5069, + "step": 7096 + }, + { + "epoch": 2.7872561126204003, + "grad_norm": 0.4480112491649345, + "learning_rate": 4.787481353706049e-06, + "loss": 0.5214, + "step": 7097 + }, + { + "epoch": 2.7876512719189925, + "grad_norm": 0.46020685458924593, + "learning_rate": 4.787418136529376e-06, + "loss": 0.4913, + "step": 7098 + }, + { + "epoch": 2.7880464312175848, + "grad_norm": 0.47097392767674306, + "learning_rate": 4.787354910369113e-06, + "loss": 0.5046, + "step": 7099 + }, + { + "epoch": 2.788441590516177, + "grad_norm": 0.6635008391885756, + "learning_rate": 4.787291675225508e-06, + "loss": 0.5148, + "step": 7100 + }, + { + "epoch": 2.7888367498147693, + "grad_norm": 0.4483283359174607, + "learning_rate": 4.7872284310988115e-06, + "loss": 0.4828, + "step": 7101 + }, + { + "epoch": 2.7892319091133615, + "grad_norm": 0.45492414146412075, + "learning_rate": 4.78716517798927e-06, + "loss": 0.493, + "step": 7102 + }, + { + "epoch": 2.7896270684119537, + "grad_norm": 0.46950627047824917, + "learning_rate": 4.787101915897133e-06, + "loss": 0.5081, + "step": 7103 + }, + { + "epoch": 2.790022227710546, + "grad_norm": 0.46505419717794555, + "learning_rate": 4.787038644822649e-06, + "loss": 0.5116, + "step": 7104 + }, + { + "epoch": 2.790417387009138, + "grad_norm": 0.4544759503964953, + "learning_rate": 4.786975364766064e-06, + "loss": 0.5137, + "step": 7105 + }, + { + "epoch": 2.7908125463077305, + "grad_norm": 0.4615992806036096, + "learning_rate": 4.786912075727631e-06, + "loss": 0.5141, + "step": 7106 + }, + { + "epoch": 2.7912077056063227, + "grad_norm": 0.44988730317410236, + "learning_rate": 4.786848777707594e-06, + "loss": 0.5142, + "step": 7107 + }, + { + "epoch": 2.791602864904915, + "grad_norm": 0.46645509824578013, + "learning_rate": 4.786785470706204e-06, + "loss": 0.5091, + "step": 7108 + }, + { + "epoch": 2.791998024203507, + "grad_norm": 0.4714405835689825, + "learning_rate": 4.78672215472371e-06, + "loss": 0.5221, + "step": 7109 + }, + { + "epoch": 2.7923931835020994, + "grad_norm": 0.4614676395031843, + "learning_rate": 4.78665882976036e-06, + "loss": 0.5036, + "step": 7110 + }, + { + "epoch": 2.7927883428006917, + "grad_norm": 0.45921742192416665, + "learning_rate": 4.786595495816402e-06, + "loss": 0.5073, + "step": 7111 + }, + { + "epoch": 2.793183502099284, + "grad_norm": 0.46441158320725995, + "learning_rate": 4.786532152892086e-06, + "loss": 0.5343, + "step": 7112 + }, + { + "epoch": 2.793578661397876, + "grad_norm": 0.4473283825529127, + "learning_rate": 4.78646880098766e-06, + "loss": 0.4916, + "step": 7113 + }, + { + "epoch": 2.7939738206964684, + "grad_norm": 0.4565617472568001, + "learning_rate": 4.786405440103372e-06, + "loss": 0.4896, + "step": 7114 + }, + { + "epoch": 2.7943689799950606, + "grad_norm": 0.4484189292638863, + "learning_rate": 4.786342070239473e-06, + "loss": 0.4876, + "step": 7115 + }, + { + "epoch": 2.794764139293653, + "grad_norm": 0.4561057568670873, + "learning_rate": 4.78627869139621e-06, + "loss": 0.5226, + "step": 7116 + }, + { + "epoch": 2.795159298592245, + "grad_norm": 0.45392026125118257, + "learning_rate": 4.786215303573834e-06, + "loss": 0.5162, + "step": 7117 + }, + { + "epoch": 2.7955544578908373, + "grad_norm": 0.44001406352495115, + "learning_rate": 4.7861519067725904e-06, + "loss": 0.5084, + "step": 7118 + }, + { + "epoch": 2.7959496171894296, + "grad_norm": 0.45328763812652456, + "learning_rate": 4.786088500992732e-06, + "loss": 0.5035, + "step": 7119 + }, + { + "epoch": 2.796344776488022, + "grad_norm": 0.47975356308620704, + "learning_rate": 4.786025086234505e-06, + "loss": 0.5178, + "step": 7120 + }, + { + "epoch": 2.796739935786614, + "grad_norm": 0.4358024295607447, + "learning_rate": 4.78596166249816e-06, + "loss": 0.508, + "step": 7121 + }, + { + "epoch": 2.7971350950852063, + "grad_norm": 0.45560496266499945, + "learning_rate": 4.785898229783946e-06, + "loss": 0.522, + "step": 7122 + }, + { + "epoch": 2.7975302543837985, + "grad_norm": 0.4493585711669279, + "learning_rate": 4.785834788092112e-06, + "loss": 0.5086, + "step": 7123 + }, + { + "epoch": 2.797925413682391, + "grad_norm": 0.4518530962962152, + "learning_rate": 4.785771337422906e-06, + "loss": 0.5163, + "step": 7124 + }, + { + "epoch": 2.798320572980983, + "grad_norm": 0.44724673545538657, + "learning_rate": 4.7857078777765796e-06, + "loss": 0.5082, + "step": 7125 + }, + { + "epoch": 2.7987157322795753, + "grad_norm": 0.46761704301589785, + "learning_rate": 4.785644409153379e-06, + "loss": 0.514, + "step": 7126 + }, + { + "epoch": 2.7991108915781675, + "grad_norm": 0.4533222753692102, + "learning_rate": 4.785580931553556e-06, + "loss": 0.5179, + "step": 7127 + }, + { + "epoch": 2.7995060508767597, + "grad_norm": 0.4559014721619301, + "learning_rate": 4.7855174449773595e-06, + "loss": 0.508, + "step": 7128 + }, + { + "epoch": 2.799901210175352, + "grad_norm": 0.4677071703771446, + "learning_rate": 4.785453949425038e-06, + "loss": 0.5001, + "step": 7129 + }, + { + "epoch": 2.8002963694739442, + "grad_norm": 0.4786331564258494, + "learning_rate": 4.785390444896841e-06, + "loss": 0.5206, + "step": 7130 + }, + { + "epoch": 2.8006915287725365, + "grad_norm": 0.4518046203867029, + "learning_rate": 4.7853269313930175e-06, + "loss": 0.5311, + "step": 7131 + }, + { + "epoch": 2.8010866880711287, + "grad_norm": 0.4493408611983403, + "learning_rate": 4.785263408913818e-06, + "loss": 0.5078, + "step": 7132 + }, + { + "epoch": 2.801481847369721, + "grad_norm": 0.4413293231411184, + "learning_rate": 4.7851998774594915e-06, + "loss": 0.5015, + "step": 7133 + }, + { + "epoch": 2.801877006668313, + "grad_norm": 0.4405329418854599, + "learning_rate": 4.7851363370302875e-06, + "loss": 0.4971, + "step": 7134 + }, + { + "epoch": 2.8022721659669054, + "grad_norm": 0.47861777339015044, + "learning_rate": 4.785072787626456e-06, + "loss": 0.505, + "step": 7135 + }, + { + "epoch": 2.8026673252654977, + "grad_norm": 0.4540541349102917, + "learning_rate": 4.785009229248246e-06, + "loss": 0.5346, + "step": 7136 + }, + { + "epoch": 2.80306248456409, + "grad_norm": 0.4660733299034907, + "learning_rate": 4.784945661895907e-06, + "loss": 0.5054, + "step": 7137 + }, + { + "epoch": 2.803457643862682, + "grad_norm": 0.454650518694851, + "learning_rate": 4.784882085569689e-06, + "loss": 0.5111, + "step": 7138 + }, + { + "epoch": 2.8038528031612744, + "grad_norm": 2.699233093548558, + "learning_rate": 4.784818500269842e-06, + "loss": 0.512, + "step": 7139 + }, + { + "epoch": 2.8042479624598666, + "grad_norm": 0.44347795457544964, + "learning_rate": 4.7847549059966144e-06, + "loss": 0.509, + "step": 7140 + }, + { + "epoch": 2.804643121758459, + "grad_norm": 0.44865460293087767, + "learning_rate": 4.784691302750257e-06, + "loss": 0.4965, + "step": 7141 + }, + { + "epoch": 2.805038281057051, + "grad_norm": 0.44186758621797884, + "learning_rate": 4.78462769053102e-06, + "loss": 0.4935, + "step": 7142 + }, + { + "epoch": 2.8054334403556433, + "grad_norm": 0.4690246679204338, + "learning_rate": 4.784564069339154e-06, + "loss": 0.4998, + "step": 7143 + }, + { + "epoch": 2.8058285996542356, + "grad_norm": 0.4413211990503885, + "learning_rate": 4.7845004391749065e-06, + "loss": 0.5136, + "step": 7144 + }, + { + "epoch": 2.806223758952828, + "grad_norm": 0.4477276180507849, + "learning_rate": 4.784436800038528e-06, + "loss": 0.4832, + "step": 7145 + }, + { + "epoch": 2.80661891825142, + "grad_norm": 0.4624922297012007, + "learning_rate": 4.784373151930269e-06, + "loss": 0.5086, + "step": 7146 + }, + { + "epoch": 2.8070140775500123, + "grad_norm": 0.452716126237242, + "learning_rate": 4.78430949485038e-06, + "loss": 0.5171, + "step": 7147 + }, + { + "epoch": 2.8074092368486046, + "grad_norm": 0.4531177548368098, + "learning_rate": 4.78424582879911e-06, + "loss": 0.5133, + "step": 7148 + }, + { + "epoch": 2.807804396147197, + "grad_norm": 0.45184974430917935, + "learning_rate": 4.7841821537767095e-06, + "loss": 0.5062, + "step": 7149 + }, + { + "epoch": 2.808199555445789, + "grad_norm": 0.45422086326819416, + "learning_rate": 4.784118469783429e-06, + "loss": 0.5039, + "step": 7150 + }, + { + "epoch": 2.8085947147443813, + "grad_norm": 0.4253947576601844, + "learning_rate": 4.784054776819517e-06, + "loss": 0.4924, + "step": 7151 + }, + { + "epoch": 2.8089898740429735, + "grad_norm": 0.4578816649664364, + "learning_rate": 4.7839910748852255e-06, + "loss": 0.5276, + "step": 7152 + }, + { + "epoch": 2.8093850333415658, + "grad_norm": 0.4483518419377659, + "learning_rate": 4.7839273639808035e-06, + "loss": 0.4853, + "step": 7153 + }, + { + "epoch": 2.809780192640158, + "grad_norm": 0.46266492861443637, + "learning_rate": 4.783863644106502e-06, + "loss": 0.5165, + "step": 7154 + }, + { + "epoch": 2.8101753519387502, + "grad_norm": 0.45201465369049515, + "learning_rate": 4.783799915262571e-06, + "loss": 0.5112, + "step": 7155 + }, + { + "epoch": 2.8105705112373425, + "grad_norm": 0.47492599053950296, + "learning_rate": 4.783736177449262e-06, + "loss": 0.5106, + "step": 7156 + }, + { + "epoch": 2.8109656705359347, + "grad_norm": 0.46952297692513184, + "learning_rate": 4.783672430666822e-06, + "loss": 0.5118, + "step": 7157 + }, + { + "epoch": 2.811360829834527, + "grad_norm": 0.45235950979008643, + "learning_rate": 4.783608674915505e-06, + "loss": 0.5173, + "step": 7158 + }, + { + "epoch": 2.811755989133119, + "grad_norm": 0.4900581695225324, + "learning_rate": 4.783544910195559e-06, + "loss": 0.506, + "step": 7159 + }, + { + "epoch": 2.8121511484317114, + "grad_norm": 0.4777901985820845, + "learning_rate": 4.783481136507236e-06, + "loss": 0.5085, + "step": 7160 + }, + { + "epoch": 2.8125463077303037, + "grad_norm": 0.5041776097941606, + "learning_rate": 4.783417353850785e-06, + "loss": 0.5097, + "step": 7161 + }, + { + "epoch": 2.812941467028896, + "grad_norm": 0.45039977578331913, + "learning_rate": 4.7833535622264565e-06, + "loss": 0.4931, + "step": 7162 + }, + { + "epoch": 2.813336626327488, + "grad_norm": 0.46033667197541245, + "learning_rate": 4.783289761634502e-06, + "loss": 0.5218, + "step": 7163 + }, + { + "epoch": 2.8137317856260804, + "grad_norm": 0.5045096295509437, + "learning_rate": 4.783225952075173e-06, + "loss": 0.506, + "step": 7164 + }, + { + "epoch": 2.8141269449246726, + "grad_norm": 0.46808375423952775, + "learning_rate": 4.783162133548718e-06, + "loss": 0.5239, + "step": 7165 + }, + { + "epoch": 2.814522104223265, + "grad_norm": 0.47727715778274427, + "learning_rate": 4.783098306055389e-06, + "loss": 0.5242, + "step": 7166 + }, + { + "epoch": 2.814917263521857, + "grad_norm": 0.49741116135472996, + "learning_rate": 4.7830344695954356e-06, + "loss": 0.5105, + "step": 7167 + }, + { + "epoch": 2.8153124228204494, + "grad_norm": 0.4765653183745411, + "learning_rate": 4.78297062416911e-06, + "loss": 0.5158, + "step": 7168 + }, + { + "epoch": 2.8157075821190416, + "grad_norm": 0.441604632522944, + "learning_rate": 4.782906769776661e-06, + "loss": 0.4992, + "step": 7169 + }, + { + "epoch": 2.816102741417634, + "grad_norm": 0.7388460812324761, + "learning_rate": 4.782842906418341e-06, + "loss": 0.5187, + "step": 7170 + }, + { + "epoch": 2.816497900716226, + "grad_norm": 0.4647214598823728, + "learning_rate": 4.7827790340944e-06, + "loss": 0.5028, + "step": 7171 + }, + { + "epoch": 2.8168930600148183, + "grad_norm": 0.46169878022399585, + "learning_rate": 4.7827151528050894e-06, + "loss": 0.5102, + "step": 7172 + }, + { + "epoch": 2.8172882193134106, + "grad_norm": 0.46029778083095246, + "learning_rate": 4.782651262550661e-06, + "loss": 0.51, + "step": 7173 + }, + { + "epoch": 2.817683378612003, + "grad_norm": 0.45078052262587254, + "learning_rate": 4.782587363331363e-06, + "loss": 0.5207, + "step": 7174 + }, + { + "epoch": 2.818078537910595, + "grad_norm": 0.46397038369117694, + "learning_rate": 4.782523455147448e-06, + "loss": 0.5054, + "step": 7175 + }, + { + "epoch": 2.8184736972091873, + "grad_norm": 0.45332697701497565, + "learning_rate": 4.782459537999168e-06, + "loss": 0.4939, + "step": 7176 + }, + { + "epoch": 2.8188688565077795, + "grad_norm": 0.45625065796075703, + "learning_rate": 4.782395611886771e-06, + "loss": 0.5069, + "step": 7177 + }, + { + "epoch": 2.8192640158063718, + "grad_norm": 0.45551406905446623, + "learning_rate": 4.7823316768105115e-06, + "loss": 0.5109, + "step": 7178 + }, + { + "epoch": 2.819659175104964, + "grad_norm": 0.46505352472091316, + "learning_rate": 4.782267732770639e-06, + "loss": 0.5063, + "step": 7179 + }, + { + "epoch": 2.8200543344035562, + "grad_norm": 0.46864702173796885, + "learning_rate": 4.782203779767404e-06, + "loss": 0.4998, + "step": 7180 + }, + { + "epoch": 2.8204494937021485, + "grad_norm": 0.44137492397186884, + "learning_rate": 4.782139817801059e-06, + "loss": 0.5154, + "step": 7181 + }, + { + "epoch": 2.8208446530007407, + "grad_norm": 0.457276576966862, + "learning_rate": 4.782075846871855e-06, + "loss": 0.506, + "step": 7182 + }, + { + "epoch": 2.821239812299333, + "grad_norm": 0.4617777248102637, + "learning_rate": 4.782011866980042e-06, + "loss": 0.5193, + "step": 7183 + }, + { + "epoch": 2.821634971597925, + "grad_norm": 0.4495925503990227, + "learning_rate": 4.781947878125872e-06, + "loss": 0.5091, + "step": 7184 + }, + { + "epoch": 2.8220301308965174, + "grad_norm": 0.4464844531883246, + "learning_rate": 4.781883880309597e-06, + "loss": 0.5096, + "step": 7185 + }, + { + "epoch": 2.8224252901951097, + "grad_norm": 0.4553422801335472, + "learning_rate": 4.781819873531467e-06, + "loss": 0.4845, + "step": 7186 + }, + { + "epoch": 2.822820449493702, + "grad_norm": 0.508345334941812, + "learning_rate": 4.781755857791734e-06, + "loss": 0.512, + "step": 7187 + }, + { + "epoch": 2.823215608792294, + "grad_norm": 0.46314240936594236, + "learning_rate": 4.78169183309065e-06, + "loss": 0.5283, + "step": 7188 + }, + { + "epoch": 2.8236107680908864, + "grad_norm": 0.45552834604576314, + "learning_rate": 4.781627799428466e-06, + "loss": 0.4988, + "step": 7189 + }, + { + "epoch": 2.8240059273894786, + "grad_norm": 0.43823171263924665, + "learning_rate": 4.781563756805434e-06, + "loss": 0.4898, + "step": 7190 + }, + { + "epoch": 2.824401086688071, + "grad_norm": 0.4425700045729354, + "learning_rate": 4.781499705221805e-06, + "loss": 0.4902, + "step": 7191 + }, + { + "epoch": 2.824796245986663, + "grad_norm": 0.46454792833992753, + "learning_rate": 4.7814356446778294e-06, + "loss": 0.5339, + "step": 7192 + }, + { + "epoch": 2.8251914052852554, + "grad_norm": 0.4766460659681996, + "learning_rate": 4.781371575173762e-06, + "loss": 0.5107, + "step": 7193 + }, + { + "epoch": 2.8255865645838476, + "grad_norm": 0.46813319069331144, + "learning_rate": 4.78130749670985e-06, + "loss": 0.5201, + "step": 7194 + }, + { + "epoch": 2.82598172388244, + "grad_norm": 0.4545253394863707, + "learning_rate": 4.781243409286349e-06, + "loss": 0.5225, + "step": 7195 + }, + { + "epoch": 2.826376883181032, + "grad_norm": 0.439864191746294, + "learning_rate": 4.781179312903509e-06, + "loss": 0.5041, + "step": 7196 + }, + { + "epoch": 2.8267720424796243, + "grad_norm": 0.45086076483594706, + "learning_rate": 4.781115207561582e-06, + "loss": 0.5045, + "step": 7197 + }, + { + "epoch": 2.827167201778217, + "grad_norm": 0.46994834299009564, + "learning_rate": 4.781051093260819e-06, + "loss": 0.4944, + "step": 7198 + }, + { + "epoch": 2.8275623610768092, + "grad_norm": 0.4455482853226493, + "learning_rate": 4.7809869700014726e-06, + "loss": 0.5059, + "step": 7199 + }, + { + "epoch": 2.8279575203754015, + "grad_norm": 0.45802471378032417, + "learning_rate": 4.7809228377837934e-06, + "loss": 0.5053, + "step": 7200 + }, + { + "epoch": 2.8283526796739937, + "grad_norm": 0.4642886398985806, + "learning_rate": 4.780858696608036e-06, + "loss": 0.5204, + "step": 7201 + }, + { + "epoch": 2.828747838972586, + "grad_norm": 0.4489929183982641, + "learning_rate": 4.78079454647445e-06, + "loss": 0.5073, + "step": 7202 + }, + { + "epoch": 2.829142998271178, + "grad_norm": 0.4626978965499321, + "learning_rate": 4.7807303873832875e-06, + "loss": 0.5068, + "step": 7203 + }, + { + "epoch": 2.8295381575697705, + "grad_norm": 0.45282255975863683, + "learning_rate": 4.780666219334802e-06, + "loss": 0.5095, + "step": 7204 + }, + { + "epoch": 2.8299333168683627, + "grad_norm": 0.45313611105031126, + "learning_rate": 4.780602042329244e-06, + "loss": 0.5198, + "step": 7205 + }, + { + "epoch": 2.830328476166955, + "grad_norm": 0.4584691584839706, + "learning_rate": 4.7805378563668655e-06, + "loss": 0.5147, + "step": 7206 + }, + { + "epoch": 2.830723635465547, + "grad_norm": 0.46044793656981825, + "learning_rate": 4.780473661447921e-06, + "loss": 0.5205, + "step": 7207 + }, + { + "epoch": 2.8311187947641394, + "grad_norm": 0.4583164398209612, + "learning_rate": 4.7804094575726585e-06, + "loss": 0.5222, + "step": 7208 + }, + { + "epoch": 2.8315139540627317, + "grad_norm": 0.4498029349438353, + "learning_rate": 4.780345244741333e-06, + "loss": 0.4927, + "step": 7209 + }, + { + "epoch": 2.831909113361324, + "grad_norm": 0.44328976919231206, + "learning_rate": 4.780281022954196e-06, + "loss": 0.5102, + "step": 7210 + }, + { + "epoch": 2.832304272659916, + "grad_norm": 0.44810747427037423, + "learning_rate": 4.7802167922115e-06, + "loss": 0.5071, + "step": 7211 + }, + { + "epoch": 2.8326994319585084, + "grad_norm": 0.44384368117197937, + "learning_rate": 4.780152552513499e-06, + "loss": 0.4983, + "step": 7212 + }, + { + "epoch": 2.8330945912571006, + "grad_norm": 0.44676501579869293, + "learning_rate": 4.7800883038604404e-06, + "loss": 0.5002, + "step": 7213 + }, + { + "epoch": 2.833489750555693, + "grad_norm": 0.44242213343088016, + "learning_rate": 4.780024046252581e-06, + "loss": 0.5114, + "step": 7214 + }, + { + "epoch": 2.833884909854285, + "grad_norm": 0.44041006597665694, + "learning_rate": 4.779959779690171e-06, + "loss": 0.5059, + "step": 7215 + }, + { + "epoch": 2.8342800691528773, + "grad_norm": 0.4440298358961055, + "learning_rate": 4.779895504173464e-06, + "loss": 0.5156, + "step": 7216 + }, + { + "epoch": 2.8346752284514696, + "grad_norm": 0.45776869409256077, + "learning_rate": 4.779831219702712e-06, + "loss": 0.5051, + "step": 7217 + }, + { + "epoch": 2.835070387750062, + "grad_norm": 0.43770623591530705, + "learning_rate": 4.7797669262781665e-06, + "loss": 0.4988, + "step": 7218 + }, + { + "epoch": 2.835465547048654, + "grad_norm": 0.4442467675600192, + "learning_rate": 4.779702623900082e-06, + "loss": 0.5167, + "step": 7219 + }, + { + "epoch": 2.8358607063472463, + "grad_norm": 0.4453421691195409, + "learning_rate": 4.779638312568708e-06, + "loss": 0.498, + "step": 7220 + }, + { + "epoch": 2.8362558656458385, + "grad_norm": 0.43148125749848565, + "learning_rate": 4.779573992284301e-06, + "loss": 0.4994, + "step": 7221 + }, + { + "epoch": 2.8366510249444308, + "grad_norm": 0.4384294284300938, + "learning_rate": 4.779509663047111e-06, + "loss": 0.4924, + "step": 7222 + }, + { + "epoch": 2.837046184243023, + "grad_norm": 0.450005448688682, + "learning_rate": 4.779445324857391e-06, + "loss": 0.5015, + "step": 7223 + }, + { + "epoch": 2.8374413435416153, + "grad_norm": 0.4383831955520918, + "learning_rate": 4.779380977715394e-06, + "loss": 0.5075, + "step": 7224 + }, + { + "epoch": 2.8378365028402075, + "grad_norm": 0.44872433012548524, + "learning_rate": 4.7793166216213725e-06, + "loss": 0.4907, + "step": 7225 + }, + { + "epoch": 2.8382316621387997, + "grad_norm": 0.4486414248911323, + "learning_rate": 4.77925225657558e-06, + "loss": 0.5103, + "step": 7226 + }, + { + "epoch": 2.838626821437392, + "grad_norm": 0.4505470486129958, + "learning_rate": 4.7791878825782675e-06, + "loss": 0.4949, + "step": 7227 + }, + { + "epoch": 2.839021980735984, + "grad_norm": 0.45187161145250815, + "learning_rate": 4.77912349962969e-06, + "loss": 0.5004, + "step": 7228 + }, + { + "epoch": 2.8394171400345765, + "grad_norm": 0.4532129642345992, + "learning_rate": 4.779059107730099e-06, + "loss": 0.4977, + "step": 7229 + }, + { + "epoch": 2.8398122993331687, + "grad_norm": 0.4463390504441417, + "learning_rate": 4.7789947068797474e-06, + "loss": 0.5049, + "step": 7230 + }, + { + "epoch": 2.840207458631761, + "grad_norm": 0.44255699499296886, + "learning_rate": 4.7789302970788895e-06, + "loss": 0.4965, + "step": 7231 + }, + { + "epoch": 2.840602617930353, + "grad_norm": 0.46126441571913374, + "learning_rate": 4.7788658783277765e-06, + "loss": 0.5196, + "step": 7232 + }, + { + "epoch": 2.8409977772289454, + "grad_norm": 0.4545844577709667, + "learning_rate": 4.778801450626662e-06, + "loss": 0.5099, + "step": 7233 + }, + { + "epoch": 2.8413929365275377, + "grad_norm": 0.45959479224584476, + "learning_rate": 4.7787370139758e-06, + "loss": 0.5083, + "step": 7234 + }, + { + "epoch": 2.84178809582613, + "grad_norm": 0.4506555503493617, + "learning_rate": 4.7786725683754415e-06, + "loss": 0.5267, + "step": 7235 + }, + { + "epoch": 2.842183255124722, + "grad_norm": 0.4569111610608343, + "learning_rate": 4.7786081138258414e-06, + "loss": 0.5184, + "step": 7236 + }, + { + "epoch": 2.8425784144233144, + "grad_norm": 0.4436242554559036, + "learning_rate": 4.778543650327252e-06, + "loss": 0.4825, + "step": 7237 + }, + { + "epoch": 2.8429735737219066, + "grad_norm": 0.4452045626348084, + "learning_rate": 4.778479177879928e-06, + "loss": 0.5106, + "step": 7238 + }, + { + "epoch": 2.843368733020499, + "grad_norm": 0.44570436675947306, + "learning_rate": 4.77841469648412e-06, + "loss": 0.5197, + "step": 7239 + }, + { + "epoch": 2.843763892319091, + "grad_norm": 0.44104364743730273, + "learning_rate": 4.778350206140083e-06, + "loss": 0.4942, + "step": 7240 + }, + { + "epoch": 2.8441590516176833, + "grad_norm": 0.470164189035705, + "learning_rate": 4.77828570684807e-06, + "loss": 0.52, + "step": 7241 + }, + { + "epoch": 2.8445542109162756, + "grad_norm": 0.45844457094618113, + "learning_rate": 4.778221198608333e-06, + "loss": 0.4999, + "step": 7242 + }, + { + "epoch": 2.844949370214868, + "grad_norm": 0.45188438120573965, + "learning_rate": 4.778156681421129e-06, + "loss": 0.5137, + "step": 7243 + }, + { + "epoch": 2.84534452951346, + "grad_norm": 0.4865736628712067, + "learning_rate": 4.778092155286707e-06, + "loss": 0.52, + "step": 7244 + }, + { + "epoch": 2.8457396888120523, + "grad_norm": 0.4618516637382695, + "learning_rate": 4.778027620205323e-06, + "loss": 0.5022, + "step": 7245 + }, + { + "epoch": 2.8461348481106445, + "grad_norm": 0.4518757121486141, + "learning_rate": 4.77796307617723e-06, + "loss": 0.513, + "step": 7246 + }, + { + "epoch": 2.846530007409237, + "grad_norm": 0.45286344906811393, + "learning_rate": 4.777898523202681e-06, + "loss": 0.4984, + "step": 7247 + }, + { + "epoch": 2.846925166707829, + "grad_norm": 0.46241029465759526, + "learning_rate": 4.777833961281929e-06, + "loss": 0.5241, + "step": 7248 + }, + { + "epoch": 2.8473203260064213, + "grad_norm": 0.4572057988626441, + "learning_rate": 4.7777693904152295e-06, + "loss": 0.4991, + "step": 7249 + }, + { + "epoch": 2.8477154853050135, + "grad_norm": 0.46688359869533963, + "learning_rate": 4.7777048106028345e-06, + "loss": 0.5258, + "step": 7250 + }, + { + "epoch": 2.8481106446036057, + "grad_norm": 0.4419843786987222, + "learning_rate": 4.777640221844998e-06, + "loss": 0.5003, + "step": 7251 + }, + { + "epoch": 2.848505803902198, + "grad_norm": 0.4509604918984796, + "learning_rate": 4.777575624141975e-06, + "loss": 0.4943, + "step": 7252 + }, + { + "epoch": 2.8489009632007902, + "grad_norm": 0.4544386957164005, + "learning_rate": 4.777511017494017e-06, + "loss": 0.5191, + "step": 7253 + }, + { + "epoch": 2.8492961224993825, + "grad_norm": 0.46217731638209764, + "learning_rate": 4.777446401901378e-06, + "loss": 0.5026, + "step": 7254 + }, + { + "epoch": 2.8496912817979747, + "grad_norm": 0.46198789235520654, + "learning_rate": 4.777381777364314e-06, + "loss": 0.5046, + "step": 7255 + }, + { + "epoch": 2.850086441096567, + "grad_norm": 0.467501208212079, + "learning_rate": 4.777317143883076e-06, + "loss": 0.5054, + "step": 7256 + }, + { + "epoch": 2.850481600395159, + "grad_norm": 0.45572805493163815, + "learning_rate": 4.77725250145792e-06, + "loss": 0.502, + "step": 7257 + }, + { + "epoch": 2.8508767596937514, + "grad_norm": 0.4585262214570415, + "learning_rate": 4.777187850089098e-06, + "loss": 0.5001, + "step": 7258 + }, + { + "epoch": 2.8512719189923437, + "grad_norm": 0.46257434246120355, + "learning_rate": 4.777123189776865e-06, + "loss": 0.4884, + "step": 7259 + }, + { + "epoch": 2.851667078290936, + "grad_norm": 0.44787734910546845, + "learning_rate": 4.777058520521476e-06, + "loss": 0.4854, + "step": 7260 + }, + { + "epoch": 2.852062237589528, + "grad_norm": 0.4679108827035021, + "learning_rate": 4.7769938423231825e-06, + "loss": 0.4984, + "step": 7261 + }, + { + "epoch": 2.8524573968881204, + "grad_norm": 0.5824913189397212, + "learning_rate": 4.776929155182241e-06, + "loss": 0.5165, + "step": 7262 + }, + { + "epoch": 2.8528525561867126, + "grad_norm": 0.47000334347932204, + "learning_rate": 4.776864459098904e-06, + "loss": 0.5063, + "step": 7263 + }, + { + "epoch": 2.853247715485305, + "grad_norm": 0.46271998546318105, + "learning_rate": 4.776799754073425e-06, + "loss": 0.4995, + "step": 7264 + }, + { + "epoch": 2.853642874783897, + "grad_norm": 0.46020642262681605, + "learning_rate": 4.776735040106061e-06, + "loss": 0.5026, + "step": 7265 + }, + { + "epoch": 2.8540380340824894, + "grad_norm": 0.4570714611160204, + "learning_rate": 4.776670317197063e-06, + "loss": 0.4976, + "step": 7266 + }, + { + "epoch": 2.8544331933810816, + "grad_norm": 0.45694522553571065, + "learning_rate": 4.776605585346687e-06, + "loss": 0.4947, + "step": 7267 + }, + { + "epoch": 2.854828352679674, + "grad_norm": 0.45434827116015725, + "learning_rate": 4.776540844555186e-06, + "loss": 0.5041, + "step": 7268 + }, + { + "epoch": 2.8552235119782665, + "grad_norm": 0.4449151544183041, + "learning_rate": 4.776476094822815e-06, + "loss": 0.4964, + "step": 7269 + }, + { + "epoch": 2.8556186712768588, + "grad_norm": 0.455998711224259, + "learning_rate": 4.7764113361498284e-06, + "loss": 0.5014, + "step": 7270 + }, + { + "epoch": 2.856013830575451, + "grad_norm": 0.4336150479227637, + "learning_rate": 4.776346568536481e-06, + "loss": 0.4975, + "step": 7271 + }, + { + "epoch": 2.8564089898740432, + "grad_norm": 0.46393244313991583, + "learning_rate": 4.776281791983026e-06, + "loss": 0.5004, + "step": 7272 + }, + { + "epoch": 2.8568041491726355, + "grad_norm": 0.4738720171848614, + "learning_rate": 4.776217006489719e-06, + "loss": 0.5105, + "step": 7273 + }, + { + "epoch": 2.8571993084712277, + "grad_norm": 0.4533385874249988, + "learning_rate": 4.776152212056813e-06, + "loss": 0.4789, + "step": 7274 + }, + { + "epoch": 2.85759446776982, + "grad_norm": 0.45076697220608347, + "learning_rate": 4.7760874086845635e-06, + "loss": 0.5222, + "step": 7275 + }, + { + "epoch": 2.857989627068412, + "grad_norm": 0.45274267999042384, + "learning_rate": 4.7760225963732255e-06, + "loss": 0.5075, + "step": 7276 + }, + { + "epoch": 2.8583847863670044, + "grad_norm": 0.4543131577956172, + "learning_rate": 4.775957775123052e-06, + "loss": 0.495, + "step": 7277 + }, + { + "epoch": 2.8587799456655967, + "grad_norm": 0.4551171658276694, + "learning_rate": 4.775892944934299e-06, + "loss": 0.5094, + "step": 7278 + }, + { + "epoch": 2.859175104964189, + "grad_norm": 0.45140156710288193, + "learning_rate": 4.77582810580722e-06, + "loss": 0.5102, + "step": 7279 + }, + { + "epoch": 2.859570264262781, + "grad_norm": 0.45003747644498915, + "learning_rate": 4.7757632577420696e-06, + "loss": 0.4983, + "step": 7280 + }, + { + "epoch": 2.8599654235613734, + "grad_norm": 0.45665797218831644, + "learning_rate": 4.775698400739104e-06, + "loss": 0.5201, + "step": 7281 + }, + { + "epoch": 2.8603605828599656, + "grad_norm": 0.4621874107147548, + "learning_rate": 4.775633534798576e-06, + "loss": 0.5044, + "step": 7282 + }, + { + "epoch": 2.860755742158558, + "grad_norm": 0.45736986361350984, + "learning_rate": 4.775568659920742e-06, + "loss": 0.4892, + "step": 7283 + }, + { + "epoch": 2.86115090145715, + "grad_norm": 0.44664628991297056, + "learning_rate": 4.775503776105857e-06, + "loss": 0.5025, + "step": 7284 + }, + { + "epoch": 2.8615460607557424, + "grad_norm": 0.44279821627241905, + "learning_rate": 4.775438883354173e-06, + "loss": 0.5267, + "step": 7285 + }, + { + "epoch": 2.8619412200543346, + "grad_norm": 0.4555964442573869, + "learning_rate": 4.775373981665949e-06, + "loss": 0.5362, + "step": 7286 + }, + { + "epoch": 2.862336379352927, + "grad_norm": 0.4616612464408454, + "learning_rate": 4.775309071041435e-06, + "loss": 0.5024, + "step": 7287 + }, + { + "epoch": 2.862731538651519, + "grad_norm": 0.44173733289424433, + "learning_rate": 4.7752441514808905e-06, + "loss": 0.4976, + "step": 7288 + }, + { + "epoch": 2.8631266979501113, + "grad_norm": 0.44729707048188977, + "learning_rate": 4.775179222984568e-06, + "loss": 0.5, + "step": 7289 + }, + { + "epoch": 2.8635218572487036, + "grad_norm": 0.45155830390167095, + "learning_rate": 4.775114285552723e-06, + "loss": 0.5023, + "step": 7290 + }, + { + "epoch": 2.863917016547296, + "grad_norm": 0.4777436940373346, + "learning_rate": 4.7750493391856116e-06, + "loss": 0.5002, + "step": 7291 + }, + { + "epoch": 2.864312175845888, + "grad_norm": 0.45674967310156706, + "learning_rate": 4.7749843838834865e-06, + "loss": 0.5122, + "step": 7292 + }, + { + "epoch": 2.8647073351444803, + "grad_norm": 0.4399469754211526, + "learning_rate": 4.774919419646605e-06, + "loss": 0.5023, + "step": 7293 + }, + { + "epoch": 2.8651024944430725, + "grad_norm": 0.4405627072122233, + "learning_rate": 4.774854446475221e-06, + "loss": 0.4848, + "step": 7294 + }, + { + "epoch": 2.8654976537416648, + "grad_norm": 0.4654307974697821, + "learning_rate": 4.7747894643695904e-06, + "loss": 0.5148, + "step": 7295 + }, + { + "epoch": 2.865892813040257, + "grad_norm": 0.4502154850100158, + "learning_rate": 4.774724473329968e-06, + "loss": 0.4916, + "step": 7296 + }, + { + "epoch": 2.8662879723388492, + "grad_norm": 0.45464786407421726, + "learning_rate": 4.7746594733566085e-06, + "loss": 0.5084, + "step": 7297 + }, + { + "epoch": 2.8666831316374415, + "grad_norm": 0.46932700358577506, + "learning_rate": 4.774594464449769e-06, + "loss": 0.521, + "step": 7298 + }, + { + "epoch": 2.8670782909360337, + "grad_norm": 0.455144441625113, + "learning_rate": 4.774529446609703e-06, + "loss": 0.5042, + "step": 7299 + }, + { + "epoch": 2.867473450234626, + "grad_norm": 0.456638540958973, + "learning_rate": 4.7744644198366665e-06, + "loss": 0.5098, + "step": 7300 + }, + { + "epoch": 2.867868609533218, + "grad_norm": 0.4731909071584723, + "learning_rate": 4.774399384130916e-06, + "loss": 0.4896, + "step": 7301 + }, + { + "epoch": 2.8682637688318104, + "grad_norm": 0.45219579881485555, + "learning_rate": 4.774334339492704e-06, + "loss": 0.4995, + "step": 7302 + }, + { + "epoch": 2.8686589281304027, + "grad_norm": 0.47541582012310035, + "learning_rate": 4.774269285922289e-06, + "loss": 0.5225, + "step": 7303 + }, + { + "epoch": 2.869054087428995, + "grad_norm": 0.4562665607670454, + "learning_rate": 4.774204223419925e-06, + "loss": 0.4862, + "step": 7304 + }, + { + "epoch": 2.869449246727587, + "grad_norm": 0.47218102712930754, + "learning_rate": 4.774139151985867e-06, + "loss": 0.5075, + "step": 7305 + }, + { + "epoch": 2.8698444060261794, + "grad_norm": 0.4403127626196469, + "learning_rate": 4.774074071620372e-06, + "loss": 0.5105, + "step": 7306 + }, + { + "epoch": 2.8702395653247716, + "grad_norm": 0.45708262550702466, + "learning_rate": 4.7740089823236955e-06, + "loss": 0.4972, + "step": 7307 + }, + { + "epoch": 2.870634724623364, + "grad_norm": 0.46018852248502795, + "learning_rate": 4.773943884096091e-06, + "loss": 0.4945, + "step": 7308 + }, + { + "epoch": 2.871029883921956, + "grad_norm": 0.4601257085296602, + "learning_rate": 4.773878776937817e-06, + "loss": 0.5051, + "step": 7309 + }, + { + "epoch": 2.8714250432205484, + "grad_norm": 0.45920489045937124, + "learning_rate": 4.7738136608491284e-06, + "loss": 0.505, + "step": 7310 + }, + { + "epoch": 2.8718202025191406, + "grad_norm": 0.45371392464155347, + "learning_rate": 4.77374853583028e-06, + "loss": 0.4837, + "step": 7311 + }, + { + "epoch": 2.872215361817733, + "grad_norm": 0.45429850072636996, + "learning_rate": 4.773683401881527e-06, + "loss": 0.4857, + "step": 7312 + }, + { + "epoch": 2.872610521116325, + "grad_norm": 0.4680648093877002, + "learning_rate": 4.773618259003127e-06, + "loss": 0.5074, + "step": 7313 + }, + { + "epoch": 2.8730056804149173, + "grad_norm": 0.461576978903435, + "learning_rate": 4.773553107195336e-06, + "loss": 0.513, + "step": 7314 + }, + { + "epoch": 2.8734008397135096, + "grad_norm": 0.5184320923986061, + "learning_rate": 4.773487946458407e-06, + "loss": 0.4973, + "step": 7315 + }, + { + "epoch": 2.873795999012102, + "grad_norm": 0.43754112690923064, + "learning_rate": 4.7734227767926e-06, + "loss": 0.4991, + "step": 7316 + }, + { + "epoch": 2.874191158310694, + "grad_norm": 0.4487036188195782, + "learning_rate": 4.773357598198167e-06, + "loss": 0.5125, + "step": 7317 + }, + { + "epoch": 2.8745863176092863, + "grad_norm": 0.4574040806296256, + "learning_rate": 4.773292410675366e-06, + "loss": 0.5261, + "step": 7318 + }, + { + "epoch": 2.8749814769078785, + "grad_norm": 0.4490619381032573, + "learning_rate": 4.773227214224454e-06, + "loss": 0.4986, + "step": 7319 + }, + { + "epoch": 2.8753766362064708, + "grad_norm": 0.47037528439860465, + "learning_rate": 4.773162008845685e-06, + "loss": 0.4982, + "step": 7320 + }, + { + "epoch": 2.875771795505063, + "grad_norm": 0.4506229178725294, + "learning_rate": 4.773096794539317e-06, + "loss": 0.4922, + "step": 7321 + }, + { + "epoch": 2.8761669548036553, + "grad_norm": 0.43652933566594937, + "learning_rate": 4.773031571305604e-06, + "loss": 0.5004, + "step": 7322 + }, + { + "epoch": 2.8765621141022475, + "grad_norm": 0.5540614478567485, + "learning_rate": 4.7729663391448035e-06, + "loss": 0.5186, + "step": 7323 + }, + { + "epoch": 2.8769572734008397, + "grad_norm": 0.46194230756962723, + "learning_rate": 4.772901098057172e-06, + "loss": 0.5071, + "step": 7324 + }, + { + "epoch": 2.877352432699432, + "grad_norm": 0.465341882765266, + "learning_rate": 4.772835848042965e-06, + "loss": 0.5202, + "step": 7325 + }, + { + "epoch": 2.877747591998024, + "grad_norm": 0.4778559900906391, + "learning_rate": 4.772770589102438e-06, + "loss": 0.4969, + "step": 7326 + }, + { + "epoch": 2.8781427512966165, + "grad_norm": 0.4405625019192915, + "learning_rate": 4.772705321235849e-06, + "loss": 0.4917, + "step": 7327 + }, + { + "epoch": 2.8785379105952087, + "grad_norm": 0.4506373137457211, + "learning_rate": 4.772640044443454e-06, + "loss": 0.507, + "step": 7328 + }, + { + "epoch": 2.878933069893801, + "grad_norm": 0.46863725137476087, + "learning_rate": 4.772574758725507e-06, + "loss": 0.4913, + "step": 7329 + }, + { + "epoch": 2.879328229192393, + "grad_norm": 0.4987365734752338, + "learning_rate": 4.772509464082269e-06, + "loss": 0.5125, + "step": 7330 + }, + { + "epoch": 2.8797233884909854, + "grad_norm": 0.45031884644226655, + "learning_rate": 4.772444160513992e-06, + "loss": 0.497, + "step": 7331 + }, + { + "epoch": 2.8801185477895777, + "grad_norm": 0.453093686573893, + "learning_rate": 4.772378848020935e-06, + "loss": 0.5252, + "step": 7332 + }, + { + "epoch": 2.88051370708817, + "grad_norm": 0.46311837414420265, + "learning_rate": 4.772313526603354e-06, + "loss": 0.5245, + "step": 7333 + }, + { + "epoch": 2.880908866386762, + "grad_norm": 0.5599787194673955, + "learning_rate": 4.772248196261504e-06, + "loss": 0.492, + "step": 7334 + }, + { + "epoch": 2.8813040256853544, + "grad_norm": 0.47600078027671755, + "learning_rate": 4.7721828569956435e-06, + "loss": 0.5181, + "step": 7335 + }, + { + "epoch": 2.8816991849839466, + "grad_norm": 0.4672363357538037, + "learning_rate": 4.772117508806029e-06, + "loss": 0.5156, + "step": 7336 + }, + { + "epoch": 2.882094344282539, + "grad_norm": 0.44982543608975994, + "learning_rate": 4.7720521516929155e-06, + "loss": 0.5017, + "step": 7337 + }, + { + "epoch": 2.882489503581131, + "grad_norm": 0.47014208289943477, + "learning_rate": 4.7719867856565615e-06, + "loss": 0.5021, + "step": 7338 + }, + { + "epoch": 2.8828846628797233, + "grad_norm": 0.4641826462044799, + "learning_rate": 4.771921410697224e-06, + "loss": 0.51, + "step": 7339 + }, + { + "epoch": 2.8832798221783156, + "grad_norm": 0.4480770317445626, + "learning_rate": 4.771856026815157e-06, + "loss": 0.4978, + "step": 7340 + }, + { + "epoch": 2.883674981476908, + "grad_norm": 0.4764205030319496, + "learning_rate": 4.77179063401062e-06, + "loss": 0.5137, + "step": 7341 + }, + { + "epoch": 2.8840701407755, + "grad_norm": 0.44175592798798174, + "learning_rate": 4.771725232283869e-06, + "loss": 0.5218, + "step": 7342 + }, + { + "epoch": 2.8844653000740923, + "grad_norm": 0.4522738770160621, + "learning_rate": 4.771659821635161e-06, + "loss": 0.5073, + "step": 7343 + }, + { + "epoch": 2.8848604593726845, + "grad_norm": 0.45063177134700066, + "learning_rate": 4.771594402064752e-06, + "loss": 0.5103, + "step": 7344 + }, + { + "epoch": 2.885255618671277, + "grad_norm": 0.4519278901335532, + "learning_rate": 4.7715289735729e-06, + "loss": 0.508, + "step": 7345 + }, + { + "epoch": 2.885650777969869, + "grad_norm": 0.45610027675210985, + "learning_rate": 4.771463536159861e-06, + "loss": 0.4991, + "step": 7346 + }, + { + "epoch": 2.8860459372684613, + "grad_norm": 0.4657657398587538, + "learning_rate": 4.771398089825893e-06, + "loss": 0.4981, + "step": 7347 + }, + { + "epoch": 2.8864410965670535, + "grad_norm": 0.4504836071125689, + "learning_rate": 4.771332634571252e-06, + "loss": 0.5194, + "step": 7348 + }, + { + "epoch": 2.8868362558656457, + "grad_norm": 0.4482132370137858, + "learning_rate": 4.771267170396197e-06, + "loss": 0.5019, + "step": 7349 + }, + { + "epoch": 2.887231415164238, + "grad_norm": 0.44368133807741067, + "learning_rate": 4.771201697300982e-06, + "loss": 0.5046, + "step": 7350 + }, + { + "epoch": 2.88762657446283, + "grad_norm": 0.4506387198321668, + "learning_rate": 4.7711362152858665e-06, + "loss": 0.5078, + "step": 7351 + }, + { + "epoch": 2.8880217337614225, + "grad_norm": 0.4570475595802833, + "learning_rate": 4.771070724351108e-06, + "loss": 0.5087, + "step": 7352 + }, + { + "epoch": 2.8884168930600147, + "grad_norm": 0.45887991966607716, + "learning_rate": 4.771005224496962e-06, + "loss": 0.5119, + "step": 7353 + }, + { + "epoch": 2.888812052358607, + "grad_norm": 0.4556415441058092, + "learning_rate": 4.770939715723686e-06, + "loss": 0.5164, + "step": 7354 + }, + { + "epoch": 2.889207211657199, + "grad_norm": 0.44832761334800575, + "learning_rate": 4.7708741980315386e-06, + "loss": 0.4931, + "step": 7355 + }, + { + "epoch": 2.8896023709557914, + "grad_norm": 0.47312914667107786, + "learning_rate": 4.770808671420775e-06, + "loss": 0.5121, + "step": 7356 + }, + { + "epoch": 2.8899975302543837, + "grad_norm": 0.6480164808981641, + "learning_rate": 4.770743135891656e-06, + "loss": 0.5064, + "step": 7357 + }, + { + "epoch": 2.890392689552976, + "grad_norm": 0.4502180251195585, + "learning_rate": 4.770677591444434e-06, + "loss": 0.5109, + "step": 7358 + }, + { + "epoch": 2.890787848851568, + "grad_norm": 0.44215136184674697, + "learning_rate": 4.770612038079372e-06, + "loss": 0.5187, + "step": 7359 + }, + { + "epoch": 2.8911830081501604, + "grad_norm": 0.46977522141656114, + "learning_rate": 4.770546475796724e-06, + "loss": 0.5058, + "step": 7360 + }, + { + "epoch": 2.8915781674487526, + "grad_norm": 0.44217645544646506, + "learning_rate": 4.770480904596747e-06, + "loss": 0.4908, + "step": 7361 + }, + { + "epoch": 2.891973326747345, + "grad_norm": 0.45598608295802123, + "learning_rate": 4.770415324479701e-06, + "loss": 0.5157, + "step": 7362 + }, + { + "epoch": 2.892368486045937, + "grad_norm": 0.4412753609610493, + "learning_rate": 4.770349735445841e-06, + "loss": 0.4907, + "step": 7363 + }, + { + "epoch": 2.8927636453445293, + "grad_norm": 0.4688090294235565, + "learning_rate": 4.770284137495428e-06, + "loss": 0.5032, + "step": 7364 + }, + { + "epoch": 2.8931588046431216, + "grad_norm": 0.46203038169470984, + "learning_rate": 4.770218530628716e-06, + "loss": 0.5207, + "step": 7365 + }, + { + "epoch": 2.893553963941714, + "grad_norm": 0.45196862440782837, + "learning_rate": 4.770152914845964e-06, + "loss": 0.498, + "step": 7366 + }, + { + "epoch": 2.893949123240306, + "grad_norm": 0.45872640459487846, + "learning_rate": 4.77008729014743e-06, + "loss": 0.5073, + "step": 7367 + }, + { + "epoch": 2.8943442825388983, + "grad_norm": 0.4534048366751859, + "learning_rate": 4.770021656533372e-06, + "loss": 0.4985, + "step": 7368 + }, + { + "epoch": 2.8947394418374905, + "grad_norm": 0.443264773459016, + "learning_rate": 4.769956014004047e-06, + "loss": 0.5142, + "step": 7369 + }, + { + "epoch": 2.895134601136083, + "grad_norm": 0.4379839420016731, + "learning_rate": 4.769890362559714e-06, + "loss": 0.4825, + "step": 7370 + }, + { + "epoch": 2.895529760434675, + "grad_norm": 0.4481743034206296, + "learning_rate": 4.769824702200629e-06, + "loss": 0.5016, + "step": 7371 + }, + { + "epoch": 2.8959249197332673, + "grad_norm": 0.4451918949234217, + "learning_rate": 4.769759032927051e-06, + "loss": 0.4956, + "step": 7372 + }, + { + "epoch": 2.8963200790318595, + "grad_norm": 0.5979190982944923, + "learning_rate": 4.7696933547392375e-06, + "loss": 0.5125, + "step": 7373 + }, + { + "epoch": 2.8967152383304517, + "grad_norm": 0.4563147906713146, + "learning_rate": 4.769627667637448e-06, + "loss": 0.5096, + "step": 7374 + }, + { + "epoch": 2.897110397629044, + "grad_norm": 0.46718974725304174, + "learning_rate": 4.7695619716219384e-06, + "loss": 0.5175, + "step": 7375 + }, + { + "epoch": 2.8975055569276362, + "grad_norm": 0.44977361032355456, + "learning_rate": 4.7694962666929674e-06, + "loss": 0.4988, + "step": 7376 + }, + { + "epoch": 2.8979007162262285, + "grad_norm": 0.4770742328128973, + "learning_rate": 4.769430552850793e-06, + "loss": 0.5208, + "step": 7377 + }, + { + "epoch": 2.8982958755248207, + "grad_norm": 0.46699375367596374, + "learning_rate": 4.769364830095674e-06, + "loss": 0.5051, + "step": 7378 + }, + { + "epoch": 2.898691034823413, + "grad_norm": 0.4793154561864407, + "learning_rate": 4.769299098427868e-06, + "loss": 0.505, + "step": 7379 + }, + { + "epoch": 2.899086194122005, + "grad_norm": 0.45166395843198986, + "learning_rate": 4.769233357847633e-06, + "loss": 0.5283, + "step": 7380 + }, + { + "epoch": 2.8994813534205974, + "grad_norm": 0.47128699616743297, + "learning_rate": 4.769167608355227e-06, + "loss": 0.516, + "step": 7381 + }, + { + "epoch": 2.8998765127191897, + "grad_norm": 0.4582075932359091, + "learning_rate": 4.769101849950909e-06, + "loss": 0.5087, + "step": 7382 + }, + { + "epoch": 2.900271672017782, + "grad_norm": 0.4557041584886058, + "learning_rate": 4.7690360826349365e-06, + "loss": 0.5215, + "step": 7383 + }, + { + "epoch": 2.900666831316374, + "grad_norm": 0.45326360410121896, + "learning_rate": 4.768970306407569e-06, + "loss": 0.5068, + "step": 7384 + }, + { + "epoch": 2.9010619906149664, + "grad_norm": 0.44111797702117767, + "learning_rate": 4.7689045212690625e-06, + "loss": 0.4926, + "step": 7385 + }, + { + "epoch": 2.9014571499135586, + "grad_norm": 0.47593611801487395, + "learning_rate": 4.7688387272196775e-06, + "loss": 0.5163, + "step": 7386 + }, + { + "epoch": 2.901852309212151, + "grad_norm": 0.4473189561304879, + "learning_rate": 4.768772924259671e-06, + "loss": 0.5233, + "step": 7387 + }, + { + "epoch": 2.9022474685107436, + "grad_norm": 0.44995418334579357, + "learning_rate": 4.768707112389303e-06, + "loss": 0.5283, + "step": 7388 + }, + { + "epoch": 2.902642627809336, + "grad_norm": 0.45948641092246273, + "learning_rate": 4.768641291608831e-06, + "loss": 0.5166, + "step": 7389 + }, + { + "epoch": 2.903037787107928, + "grad_norm": 0.4697983204082374, + "learning_rate": 4.768575461918513e-06, + "loss": 0.5185, + "step": 7390 + }, + { + "epoch": 2.9034329464065203, + "grad_norm": 0.4654187783348642, + "learning_rate": 4.768509623318609e-06, + "loss": 0.5268, + "step": 7391 + }, + { + "epoch": 2.9038281057051125, + "grad_norm": 0.4558337696980285, + "learning_rate": 4.768443775809376e-06, + "loss": 0.5024, + "step": 7392 + }, + { + "epoch": 2.9042232650037048, + "grad_norm": 0.4606281586552123, + "learning_rate": 4.768377919391074e-06, + "loss": 0.514, + "step": 7393 + }, + { + "epoch": 2.904618424302297, + "grad_norm": 0.4613832493128262, + "learning_rate": 4.768312054063961e-06, + "loss": 0.5055, + "step": 7394 + }, + { + "epoch": 2.9050135836008892, + "grad_norm": 0.455634197555287, + "learning_rate": 4.768246179828295e-06, + "loss": 0.5112, + "step": 7395 + }, + { + "epoch": 2.9054087428994815, + "grad_norm": 0.45367728348365555, + "learning_rate": 4.768180296684335e-06, + "loss": 0.5233, + "step": 7396 + }, + { + "epoch": 2.9058039021980737, + "grad_norm": 0.4571715325974132, + "learning_rate": 4.768114404632341e-06, + "loss": 0.5039, + "step": 7397 + }, + { + "epoch": 2.906199061496666, + "grad_norm": 0.45459414076829735, + "learning_rate": 4.768048503672571e-06, + "loss": 0.4977, + "step": 7398 + }, + { + "epoch": 2.906594220795258, + "grad_norm": 0.476609792116574, + "learning_rate": 4.7679825938052825e-06, + "loss": 0.4998, + "step": 7399 + }, + { + "epoch": 2.9069893800938504, + "grad_norm": 0.44935073335968, + "learning_rate": 4.7679166750307364e-06, + "loss": 0.5068, + "step": 7400 + }, + { + "epoch": 2.9073845393924427, + "grad_norm": 0.44816683458804896, + "learning_rate": 4.767850747349191e-06, + "loss": 0.5195, + "step": 7401 + }, + { + "epoch": 2.907779698691035, + "grad_norm": 0.514040565789535, + "learning_rate": 4.767784810760905e-06, + "loss": 0.4983, + "step": 7402 + }, + { + "epoch": 2.908174857989627, + "grad_norm": 0.4678938516683875, + "learning_rate": 4.767718865266136e-06, + "loss": 0.5241, + "step": 7403 + }, + { + "epoch": 2.9085700172882194, + "grad_norm": 0.44463113513631825, + "learning_rate": 4.767652910865146e-06, + "loss": 0.4928, + "step": 7404 + }, + { + "epoch": 2.9089651765868116, + "grad_norm": 0.44305031644764015, + "learning_rate": 4.767586947558191e-06, + "loss": 0.5274, + "step": 7405 + }, + { + "epoch": 2.909360335885404, + "grad_norm": 0.4608795710666698, + "learning_rate": 4.767520975345533e-06, + "loss": 0.496, + "step": 7406 + }, + { + "epoch": 2.909755495183996, + "grad_norm": 0.45454535548418995, + "learning_rate": 4.767454994227428e-06, + "loss": 0.5294, + "step": 7407 + }, + { + "epoch": 2.9101506544825884, + "grad_norm": 0.4438403467716369, + "learning_rate": 4.767389004204137e-06, + "loss": 0.5058, + "step": 7408 + }, + { + "epoch": 2.9105458137811806, + "grad_norm": 0.45975886229728474, + "learning_rate": 4.76732300527592e-06, + "loss": 0.4937, + "step": 7409 + }, + { + "epoch": 2.910940973079773, + "grad_norm": 0.4679652680374663, + "learning_rate": 4.767256997443034e-06, + "loss": 0.5338, + "step": 7410 + }, + { + "epoch": 2.911336132378365, + "grad_norm": 0.4443491416640828, + "learning_rate": 4.767190980705739e-06, + "loss": 0.5009, + "step": 7411 + }, + { + "epoch": 2.9117312916769573, + "grad_norm": 0.4466231335445614, + "learning_rate": 4.767124955064295e-06, + "loss": 0.5008, + "step": 7412 + }, + { + "epoch": 2.9121264509755496, + "grad_norm": 0.45416773779205705, + "learning_rate": 4.767058920518961e-06, + "loss": 0.5097, + "step": 7413 + }, + { + "epoch": 2.912521610274142, + "grad_norm": 0.4407003267094362, + "learning_rate": 4.766992877069996e-06, + "loss": 0.5013, + "step": 7414 + }, + { + "epoch": 2.912916769572734, + "grad_norm": 0.4526490214558496, + "learning_rate": 4.76692682471766e-06, + "loss": 0.5065, + "step": 7415 + }, + { + "epoch": 2.9133119288713263, + "grad_norm": 0.4384526778991886, + "learning_rate": 4.766860763462211e-06, + "loss": 0.5202, + "step": 7416 + }, + { + "epoch": 2.9137070881699185, + "grad_norm": 0.4554119386883933, + "learning_rate": 4.76679469330391e-06, + "loss": 0.4985, + "step": 7417 + }, + { + "epoch": 2.9141022474685108, + "grad_norm": 0.4364078258328405, + "learning_rate": 4.766728614243016e-06, + "loss": 0.4893, + "step": 7418 + }, + { + "epoch": 2.914497406767103, + "grad_norm": 0.4588538320640383, + "learning_rate": 4.766662526279788e-06, + "loss": 0.521, + "step": 7419 + }, + { + "epoch": 2.9148925660656952, + "grad_norm": 0.45670544657223855, + "learning_rate": 4.766596429414487e-06, + "loss": 0.5111, + "step": 7420 + }, + { + "epoch": 2.9152877253642875, + "grad_norm": 0.4417258587433614, + "learning_rate": 4.76653032364737e-06, + "loss": 0.5018, + "step": 7421 + }, + { + "epoch": 2.9156828846628797, + "grad_norm": 0.4365589726517331, + "learning_rate": 4.7664642089787e-06, + "loss": 0.5026, + "step": 7422 + }, + { + "epoch": 2.916078043961472, + "grad_norm": 0.4556876909416063, + "learning_rate": 4.766398085408734e-06, + "loss": 0.5208, + "step": 7423 + }, + { + "epoch": 2.916473203260064, + "grad_norm": 0.4496719110515717, + "learning_rate": 4.766331952937732e-06, + "loss": 0.5158, + "step": 7424 + }, + { + "epoch": 2.9168683625586564, + "grad_norm": 0.45674338189085323, + "learning_rate": 4.7662658115659546e-06, + "loss": 0.5155, + "step": 7425 + }, + { + "epoch": 2.9172635218572487, + "grad_norm": 0.4365519880167459, + "learning_rate": 4.766199661293662e-06, + "loss": 0.5067, + "step": 7426 + }, + { + "epoch": 2.917658681155841, + "grad_norm": 0.46044332574529667, + "learning_rate": 4.766133502121113e-06, + "loss": 0.509, + "step": 7427 + }, + { + "epoch": 2.918053840454433, + "grad_norm": 0.4523808596214587, + "learning_rate": 4.766067334048567e-06, + "loss": 0.5011, + "step": 7428 + }, + { + "epoch": 2.9184489997530254, + "grad_norm": 0.4552082949507314, + "learning_rate": 4.766001157076284e-06, + "loss": 0.515, + "step": 7429 + }, + { + "epoch": 2.9188441590516176, + "grad_norm": 0.45129094467367264, + "learning_rate": 4.765934971204526e-06, + "loss": 0.5026, + "step": 7430 + }, + { + "epoch": 2.91923931835021, + "grad_norm": 0.4615128953141365, + "learning_rate": 4.765868776433551e-06, + "loss": 0.5293, + "step": 7431 + }, + { + "epoch": 2.919634477648802, + "grad_norm": 0.44445080798757497, + "learning_rate": 4.765802572763619e-06, + "loss": 0.5084, + "step": 7432 + }, + { + "epoch": 2.9200296369473944, + "grad_norm": 0.5535941391787863, + "learning_rate": 4.76573636019499e-06, + "loss": 0.5165, + "step": 7433 + }, + { + "epoch": 2.9204247962459866, + "grad_norm": 0.4647127083547014, + "learning_rate": 4.765670138727925e-06, + "loss": 0.5072, + "step": 7434 + }, + { + "epoch": 2.920819955544579, + "grad_norm": 0.44196536115349694, + "learning_rate": 4.765603908362683e-06, + "loss": 0.5036, + "step": 7435 + }, + { + "epoch": 2.921215114843171, + "grad_norm": 0.45968440262090315, + "learning_rate": 4.765537669099525e-06, + "loss": 0.4972, + "step": 7436 + }, + { + "epoch": 2.9216102741417633, + "grad_norm": 0.444196485819374, + "learning_rate": 4.765471420938711e-06, + "loss": 0.5025, + "step": 7437 + }, + { + "epoch": 2.9220054334403556, + "grad_norm": 0.4480022299436206, + "learning_rate": 4.7654051638805e-06, + "loss": 0.4847, + "step": 7438 + }, + { + "epoch": 2.922400592738948, + "grad_norm": 0.4567439332123647, + "learning_rate": 4.765338897925154e-06, + "loss": 0.5128, + "step": 7439 + }, + { + "epoch": 2.92279575203754, + "grad_norm": 0.458511120503876, + "learning_rate": 4.765272623072932e-06, + "loss": 0.5166, + "step": 7440 + }, + { + "epoch": 2.9231909113361323, + "grad_norm": 0.44860217503885985, + "learning_rate": 4.765206339324095e-06, + "loss": 0.5045, + "step": 7441 + }, + { + "epoch": 2.9235860706347245, + "grad_norm": 0.4546548390510289, + "learning_rate": 4.765140046678903e-06, + "loss": 0.4965, + "step": 7442 + }, + { + "epoch": 2.9239812299333168, + "grad_norm": 0.4492830754296067, + "learning_rate": 4.765073745137616e-06, + "loss": 0.5065, + "step": 7443 + }, + { + "epoch": 2.924376389231909, + "grad_norm": 0.4605719668288093, + "learning_rate": 4.765007434700495e-06, + "loss": 0.5078, + "step": 7444 + }, + { + "epoch": 2.9247715485305013, + "grad_norm": 0.4640714655281598, + "learning_rate": 4.7649411153678e-06, + "loss": 0.523, + "step": 7445 + }, + { + "epoch": 2.9251667078290935, + "grad_norm": 0.45659739128055815, + "learning_rate": 4.764874787139792e-06, + "loss": 0.4993, + "step": 7446 + }, + { + "epoch": 2.9255618671276857, + "grad_norm": 0.4728553705365856, + "learning_rate": 4.764808450016731e-06, + "loss": 0.5352, + "step": 7447 + }, + { + "epoch": 2.925957026426278, + "grad_norm": 0.43793250070587963, + "learning_rate": 4.764742103998877e-06, + "loss": 0.4935, + "step": 7448 + }, + { + "epoch": 2.92635218572487, + "grad_norm": 0.4423604050440384, + "learning_rate": 4.7646757490864926e-06, + "loss": 0.5088, + "step": 7449 + }, + { + "epoch": 2.9267473450234625, + "grad_norm": 0.4501666815149439, + "learning_rate": 4.764609385279836e-06, + "loss": 0.5102, + "step": 7450 + }, + { + "epoch": 2.9271425043220547, + "grad_norm": 0.4574235898623384, + "learning_rate": 4.764543012579169e-06, + "loss": 0.5032, + "step": 7451 + }, + { + "epoch": 2.927537663620647, + "grad_norm": 0.460475598434439, + "learning_rate": 4.764476630984752e-06, + "loss": 0.4958, + "step": 7452 + }, + { + "epoch": 2.927932822919239, + "grad_norm": 0.43089611921617016, + "learning_rate": 4.764410240496846e-06, + "loss": 0.4941, + "step": 7453 + }, + { + "epoch": 2.9283279822178314, + "grad_norm": 0.45244880455514785, + "learning_rate": 4.764343841115712e-06, + "loss": 0.5068, + "step": 7454 + }, + { + "epoch": 2.9287231415164237, + "grad_norm": 0.4572924724707516, + "learning_rate": 4.76427743284161e-06, + "loss": 0.509, + "step": 7455 + }, + { + "epoch": 2.929118300815016, + "grad_norm": 0.4535852769301378, + "learning_rate": 4.764211015674801e-06, + "loss": 0.5127, + "step": 7456 + }, + { + "epoch": 2.929513460113608, + "grad_norm": 0.5430024075970933, + "learning_rate": 4.764144589615547e-06, + "loss": 0.5065, + "step": 7457 + }, + { + "epoch": 2.929908619412201, + "grad_norm": 0.44556126301612464, + "learning_rate": 4.764078154664107e-06, + "loss": 0.5309, + "step": 7458 + }, + { + "epoch": 2.930303778710793, + "grad_norm": 0.4710840587154262, + "learning_rate": 4.764011710820743e-06, + "loss": 0.5104, + "step": 7459 + }, + { + "epoch": 2.9306989380093853, + "grad_norm": 0.45571321032211237, + "learning_rate": 4.763945258085716e-06, + "loss": 0.5092, + "step": 7460 + }, + { + "epoch": 2.9310940973079775, + "grad_norm": 0.44102051744598414, + "learning_rate": 4.763878796459287e-06, + "loss": 0.5168, + "step": 7461 + }, + { + "epoch": 2.93148925660657, + "grad_norm": 0.4470640541086543, + "learning_rate": 4.7638123259417166e-06, + "loss": 0.5013, + "step": 7462 + }, + { + "epoch": 2.931884415905162, + "grad_norm": 0.43554949182172414, + "learning_rate": 4.763745846533265e-06, + "loss": 0.4982, + "step": 7463 + }, + { + "epoch": 2.9322795752037543, + "grad_norm": 0.4720396479178599, + "learning_rate": 4.763679358234196e-06, + "loss": 0.5106, + "step": 7464 + }, + { + "epoch": 2.9326747345023465, + "grad_norm": 0.4602272736857191, + "learning_rate": 4.763612861044768e-06, + "loss": 0.5108, + "step": 7465 + }, + { + "epoch": 2.9330698938009387, + "grad_norm": 0.4552466120844723, + "learning_rate": 4.763546354965244e-06, + "loss": 0.5034, + "step": 7466 + }, + { + "epoch": 2.933465053099531, + "grad_norm": 0.4575835047170144, + "learning_rate": 4.763479839995883e-06, + "loss": 0.5096, + "step": 7467 + }, + { + "epoch": 2.9338602123981232, + "grad_norm": 0.4678916204593777, + "learning_rate": 4.763413316136949e-06, + "loss": 0.5274, + "step": 7468 + }, + { + "epoch": 2.9342553716967155, + "grad_norm": 0.4452559996447758, + "learning_rate": 4.7633467833887015e-06, + "loss": 0.4957, + "step": 7469 + }, + { + "epoch": 2.9346505309953077, + "grad_norm": 0.44208744364576574, + "learning_rate": 4.763280241751402e-06, + "loss": 0.495, + "step": 7470 + }, + { + "epoch": 2.9350456902939, + "grad_norm": 0.46246549564900696, + "learning_rate": 4.763213691225313e-06, + "loss": 0.5021, + "step": 7471 + }, + { + "epoch": 2.935440849592492, + "grad_norm": 0.44656255715995435, + "learning_rate": 4.763147131810693e-06, + "loss": 0.5128, + "step": 7472 + }, + { + "epoch": 2.9358360088910844, + "grad_norm": 0.4612831877698752, + "learning_rate": 4.7630805635078065e-06, + "loss": 0.5001, + "step": 7473 + }, + { + "epoch": 2.9362311681896767, + "grad_norm": 0.4956157029905282, + "learning_rate": 4.763013986316914e-06, + "loss": 0.5342, + "step": 7474 + }, + { + "epoch": 2.936626327488269, + "grad_norm": 0.4505671941107815, + "learning_rate": 4.762947400238276e-06, + "loss": 0.4923, + "step": 7475 + }, + { + "epoch": 2.937021486786861, + "grad_norm": 0.45390395120855775, + "learning_rate": 4.762880805272155e-06, + "loss": 0.5247, + "step": 7476 + }, + { + "epoch": 2.9374166460854534, + "grad_norm": 0.44602033162244886, + "learning_rate": 4.762814201418813e-06, + "loss": 0.4961, + "step": 7477 + }, + { + "epoch": 2.9378118053840456, + "grad_norm": 0.45061560756472163, + "learning_rate": 4.76274758867851e-06, + "loss": 0.5152, + "step": 7478 + }, + { + "epoch": 2.938206964682638, + "grad_norm": 0.4472180927534863, + "learning_rate": 4.762680967051509e-06, + "loss": 0.5113, + "step": 7479 + }, + { + "epoch": 2.93860212398123, + "grad_norm": 0.4582451501185086, + "learning_rate": 4.762614336538071e-06, + "loss": 0.5157, + "step": 7480 + }, + { + "epoch": 2.9389972832798223, + "grad_norm": 0.4383818777857067, + "learning_rate": 4.762547697138458e-06, + "loss": 0.4909, + "step": 7481 + }, + { + "epoch": 2.9393924425784146, + "grad_norm": 0.46044036455758336, + "learning_rate": 4.762481048852931e-06, + "loss": 0.5176, + "step": 7482 + }, + { + "epoch": 2.939787601877007, + "grad_norm": 0.4559104153811503, + "learning_rate": 4.762414391681753e-06, + "loss": 0.5117, + "step": 7483 + }, + { + "epoch": 2.940182761175599, + "grad_norm": 0.4345018162819212, + "learning_rate": 4.762347725625185e-06, + "loss": 0.4939, + "step": 7484 + }, + { + "epoch": 2.9405779204741913, + "grad_norm": 0.44329288256437194, + "learning_rate": 4.7622810506834885e-06, + "loss": 0.4959, + "step": 7485 + }, + { + "epoch": 2.9409730797727835, + "grad_norm": 0.4719182053165318, + "learning_rate": 4.762214366856925e-06, + "loss": 0.5199, + "step": 7486 + }, + { + "epoch": 2.941368239071376, + "grad_norm": 0.500449067523404, + "learning_rate": 4.762147674145759e-06, + "loss": 0.5036, + "step": 7487 + }, + { + "epoch": 2.941763398369968, + "grad_norm": 0.44559700179627093, + "learning_rate": 4.762080972550249e-06, + "loss": 0.5127, + "step": 7488 + }, + { + "epoch": 2.9421585576685603, + "grad_norm": 0.45054565116407974, + "learning_rate": 4.762014262070659e-06, + "loss": 0.5329, + "step": 7489 + }, + { + "epoch": 2.9425537169671525, + "grad_norm": 0.4530812436143374, + "learning_rate": 4.761947542707251e-06, + "loss": 0.4968, + "step": 7490 + }, + { + "epoch": 2.9429488762657448, + "grad_norm": 0.45123498001248574, + "learning_rate": 4.761880814460286e-06, + "loss": 0.5331, + "step": 7491 + }, + { + "epoch": 2.943344035564337, + "grad_norm": 0.46119671535259266, + "learning_rate": 4.761814077330027e-06, + "loss": 0.5299, + "step": 7492 + }, + { + "epoch": 2.9437391948629292, + "grad_norm": 0.4371101220621986, + "learning_rate": 4.7617473313167365e-06, + "loss": 0.4896, + "step": 7493 + }, + { + "epoch": 2.9441343541615215, + "grad_norm": 0.45650104982890227, + "learning_rate": 4.761680576420674e-06, + "loss": 0.5266, + "step": 7494 + }, + { + "epoch": 2.9445295134601137, + "grad_norm": 0.45022327882961727, + "learning_rate": 4.761613812642105e-06, + "loss": 0.5307, + "step": 7495 + }, + { + "epoch": 2.944924672758706, + "grad_norm": 0.45404058586100055, + "learning_rate": 4.76154703998129e-06, + "loss": 0.5091, + "step": 7496 + }, + { + "epoch": 2.945319832057298, + "grad_norm": 0.45618375047378157, + "learning_rate": 4.761480258438491e-06, + "loss": 0.5167, + "step": 7497 + }, + { + "epoch": 2.9457149913558904, + "grad_norm": 0.44164866519004375, + "learning_rate": 4.761413468013972e-06, + "loss": 0.4942, + "step": 7498 + }, + { + "epoch": 2.9461101506544827, + "grad_norm": 0.43860964167840893, + "learning_rate": 4.761346668707993e-06, + "loss": 0.5087, + "step": 7499 + }, + { + "epoch": 2.946505309953075, + "grad_norm": 0.4453511072236731, + "learning_rate": 4.7612798605208175e-06, + "loss": 0.5224, + "step": 7500 + }, + { + "epoch": 2.946900469251667, + "grad_norm": 0.44709440039596826, + "learning_rate": 4.761213043452708e-06, + "loss": 0.5303, + "step": 7501 + }, + { + "epoch": 2.9472956285502594, + "grad_norm": 0.454881212603844, + "learning_rate": 4.761146217503927e-06, + "loss": 0.523, + "step": 7502 + }, + { + "epoch": 2.9476907878488516, + "grad_norm": 0.43851122242734974, + "learning_rate": 4.761079382674737e-06, + "loss": 0.5026, + "step": 7503 + }, + { + "epoch": 2.948085947147444, + "grad_norm": 0.47892329084918284, + "learning_rate": 4.761012538965399e-06, + "loss": 0.5178, + "step": 7504 + }, + { + "epoch": 2.948481106446036, + "grad_norm": 0.4698801672891979, + "learning_rate": 4.760945686376178e-06, + "loss": 0.5098, + "step": 7505 + }, + { + "epoch": 2.9488762657446284, + "grad_norm": 0.48150219567951025, + "learning_rate": 4.760878824907335e-06, + "loss": 0.4947, + "step": 7506 + }, + { + "epoch": 2.9492714250432206, + "grad_norm": 0.4417389041761658, + "learning_rate": 4.7608119545591326e-06, + "loss": 0.5044, + "step": 7507 + }, + { + "epoch": 2.949666584341813, + "grad_norm": 0.44156128051674065, + "learning_rate": 4.760745075331833e-06, + "loss": 0.4993, + "step": 7508 + }, + { + "epoch": 2.950061743640405, + "grad_norm": 0.43631000583026247, + "learning_rate": 4.7606781872257e-06, + "loss": 0.4978, + "step": 7509 + }, + { + "epoch": 2.9504569029389973, + "grad_norm": 0.4570064542158102, + "learning_rate": 4.760611290240996e-06, + "loss": 0.5007, + "step": 7510 + }, + { + "epoch": 2.9508520622375896, + "grad_norm": 0.45615671034916505, + "learning_rate": 4.760544384377984e-06, + "loss": 0.4908, + "step": 7511 + }, + { + "epoch": 2.951247221536182, + "grad_norm": 0.4343490552276528, + "learning_rate": 4.760477469636926e-06, + "loss": 0.4896, + "step": 7512 + }, + { + "epoch": 2.951642380834774, + "grad_norm": 0.4459408199880296, + "learning_rate": 4.760410546018085e-06, + "loss": 0.5091, + "step": 7513 + }, + { + "epoch": 2.9520375401333663, + "grad_norm": 0.4636503598906805, + "learning_rate": 4.760343613521724e-06, + "loss": 0.4895, + "step": 7514 + }, + { + "epoch": 2.9524326994319585, + "grad_norm": 0.4473832087140672, + "learning_rate": 4.7602766721481055e-06, + "loss": 0.5059, + "step": 7515 + }, + { + "epoch": 2.9528278587305508, + "grad_norm": 0.45903466878021015, + "learning_rate": 4.760209721897493e-06, + "loss": 0.5259, + "step": 7516 + }, + { + "epoch": 2.953223018029143, + "grad_norm": 0.4389672467155343, + "learning_rate": 4.76014276277015e-06, + "loss": 0.5145, + "step": 7517 + }, + { + "epoch": 2.9536181773277352, + "grad_norm": 0.44946325074519894, + "learning_rate": 4.760075794766338e-06, + "loss": 0.5092, + "step": 7518 + }, + { + "epoch": 2.9540133366263275, + "grad_norm": 0.44012358398055595, + "learning_rate": 4.76000881788632e-06, + "loss": 0.4943, + "step": 7519 + }, + { + "epoch": 2.9544084959249197, + "grad_norm": 0.45946199113007924, + "learning_rate": 4.75994183213036e-06, + "loss": 0.5014, + "step": 7520 + }, + { + "epoch": 2.954803655223512, + "grad_norm": 0.45502527214353133, + "learning_rate": 4.759874837498721e-06, + "loss": 0.4964, + "step": 7521 + }, + { + "epoch": 2.955198814522104, + "grad_norm": 0.465012979637925, + "learning_rate": 4.759807833991667e-06, + "loss": 0.4897, + "step": 7522 + }, + { + "epoch": 2.9555939738206964, + "grad_norm": 0.4467396954982866, + "learning_rate": 4.759740821609459e-06, + "loss": 0.5003, + "step": 7523 + }, + { + "epoch": 2.9559891331192887, + "grad_norm": 0.4425638158476036, + "learning_rate": 4.759673800352362e-06, + "loss": 0.4933, + "step": 7524 + }, + { + "epoch": 2.956384292417881, + "grad_norm": 0.4616979621992214, + "learning_rate": 4.759606770220638e-06, + "loss": 0.5142, + "step": 7525 + }, + { + "epoch": 2.956779451716473, + "grad_norm": 0.4722218812872345, + "learning_rate": 4.759539731214549e-06, + "loss": 0.5087, + "step": 7526 + }, + { + "epoch": 2.9571746110150654, + "grad_norm": 0.439973697685221, + "learning_rate": 4.759472683334362e-06, + "loss": 0.4875, + "step": 7527 + }, + { + "epoch": 2.9575697703136576, + "grad_norm": 0.4571911157675649, + "learning_rate": 4.759405626580338e-06, + "loss": 0.513, + "step": 7528 + }, + { + "epoch": 2.95796492961225, + "grad_norm": 0.4657839715911801, + "learning_rate": 4.7593385609527406e-06, + "loss": 0.5174, + "step": 7529 + }, + { + "epoch": 2.958360088910842, + "grad_norm": 0.4530863288631989, + "learning_rate": 4.759271486451833e-06, + "loss": 0.4956, + "step": 7530 + }, + { + "epoch": 2.9587552482094344, + "grad_norm": 0.4722880955783831, + "learning_rate": 4.759204403077879e-06, + "loss": 0.5187, + "step": 7531 + }, + { + "epoch": 2.9591504075080266, + "grad_norm": 0.4567472933113083, + "learning_rate": 4.7591373108311425e-06, + "loss": 0.5272, + "step": 7532 + }, + { + "epoch": 2.959545566806619, + "grad_norm": 0.46118458955762015, + "learning_rate": 4.759070209711886e-06, + "loss": 0.515, + "step": 7533 + }, + { + "epoch": 2.959940726105211, + "grad_norm": 0.4337420420723696, + "learning_rate": 4.759003099720373e-06, + "loss": 0.4868, + "step": 7534 + }, + { + "epoch": 2.9603358854038033, + "grad_norm": 0.4597390934949171, + "learning_rate": 4.758935980856868e-06, + "loss": 0.524, + "step": 7535 + }, + { + "epoch": 2.9607310447023956, + "grad_norm": 0.4520311626789291, + "learning_rate": 4.758868853121635e-06, + "loss": 0.5164, + "step": 7536 + }, + { + "epoch": 2.961126204000988, + "grad_norm": 0.4604721101236667, + "learning_rate": 4.758801716514935e-06, + "loss": 0.5022, + "step": 7537 + }, + { + "epoch": 2.96152136329958, + "grad_norm": 0.4419481364317015, + "learning_rate": 4.758734571037035e-06, + "loss": 0.4752, + "step": 7538 + }, + { + "epoch": 2.9619165225981723, + "grad_norm": 0.4449236247292902, + "learning_rate": 4.758667416688197e-06, + "loss": 0.5091, + "step": 7539 + }, + { + "epoch": 2.9623116818967645, + "grad_norm": 0.4553128748466083, + "learning_rate": 4.758600253468684e-06, + "loss": 0.5, + "step": 7540 + }, + { + "epoch": 2.9627068411953568, + "grad_norm": 0.46612998739450867, + "learning_rate": 4.758533081378762e-06, + "loss": 0.5018, + "step": 7541 + }, + { + "epoch": 2.963102000493949, + "grad_norm": 0.46551416452290106, + "learning_rate": 4.7584659004186924e-06, + "loss": 0.508, + "step": 7542 + }, + { + "epoch": 2.9634971597925412, + "grad_norm": 0.452825185453952, + "learning_rate": 4.758398710588741e-06, + "loss": 0.4992, + "step": 7543 + }, + { + "epoch": 2.9638923190911335, + "grad_norm": 0.4604901561144933, + "learning_rate": 4.758331511889171e-06, + "loss": 0.5071, + "step": 7544 + }, + { + "epoch": 2.9642874783897257, + "grad_norm": 0.4700510709171447, + "learning_rate": 4.7582643043202445e-06, + "loss": 0.5159, + "step": 7545 + }, + { + "epoch": 2.964682637688318, + "grad_norm": 0.4702968346285271, + "learning_rate": 4.758197087882228e-06, + "loss": 0.5211, + "step": 7546 + }, + { + "epoch": 2.96507779698691, + "grad_norm": 0.45518608070060007, + "learning_rate": 4.758129862575386e-06, + "loss": 0.4996, + "step": 7547 + }, + { + "epoch": 2.9654729562855024, + "grad_norm": 0.46985798766213305, + "learning_rate": 4.758062628399979e-06, + "loss": 0.5043, + "step": 7548 + }, + { + "epoch": 2.9658681155840947, + "grad_norm": 0.47941413887771617, + "learning_rate": 4.7579953853562744e-06, + "loss": 0.5225, + "step": 7549 + }, + { + "epoch": 2.966263274882687, + "grad_norm": 0.44370397690809943, + "learning_rate": 4.757928133444534e-06, + "loss": 0.508, + "step": 7550 + }, + { + "epoch": 2.966658434181279, + "grad_norm": 0.5281064344206505, + "learning_rate": 4.757860872665024e-06, + "loss": 0.5019, + "step": 7551 + }, + { + "epoch": 2.9670535934798714, + "grad_norm": 0.46395863123158015, + "learning_rate": 4.757793603018007e-06, + "loss": 0.5176, + "step": 7552 + }, + { + "epoch": 2.9674487527784636, + "grad_norm": 0.4640272614666952, + "learning_rate": 4.757726324503749e-06, + "loss": 0.5029, + "step": 7553 + }, + { + "epoch": 2.967843912077056, + "grad_norm": 0.443178090717011, + "learning_rate": 4.757659037122511e-06, + "loss": 0.5143, + "step": 7554 + }, + { + "epoch": 2.968239071375648, + "grad_norm": 0.4562032262581782, + "learning_rate": 4.75759174087456e-06, + "loss": 0.4894, + "step": 7555 + }, + { + "epoch": 2.9686342306742404, + "grad_norm": 0.4505861613127547, + "learning_rate": 4.75752443576016e-06, + "loss": 0.511, + "step": 7556 + }, + { + "epoch": 2.9690293899728326, + "grad_norm": 0.46040086407622155, + "learning_rate": 4.757457121779575e-06, + "loss": 0.5134, + "step": 7557 + }, + { + "epoch": 2.969424549271425, + "grad_norm": 0.46009994634444334, + "learning_rate": 4.757389798933069e-06, + "loss": 0.4881, + "step": 7558 + }, + { + "epoch": 2.969819708570017, + "grad_norm": 0.48161543752949787, + "learning_rate": 4.757322467220906e-06, + "loss": 0.4888, + "step": 7559 + }, + { + "epoch": 2.9702148678686093, + "grad_norm": 0.4468364867174091, + "learning_rate": 4.7572551266433506e-06, + "loss": 0.5166, + "step": 7560 + }, + { + "epoch": 2.9706100271672016, + "grad_norm": 0.451144658830222, + "learning_rate": 4.757187777200669e-06, + "loss": 0.497, + "step": 7561 + }, + { + "epoch": 2.971005186465794, + "grad_norm": 0.4435308430537957, + "learning_rate": 4.757120418893124e-06, + "loss": 0.5151, + "step": 7562 + }, + { + "epoch": 2.971400345764386, + "grad_norm": 0.4757327743330427, + "learning_rate": 4.7570530517209815e-06, + "loss": 0.5268, + "step": 7563 + }, + { + "epoch": 2.9717955050629783, + "grad_norm": 0.46010095557816083, + "learning_rate": 4.756985675684504e-06, + "loss": 0.5048, + "step": 7564 + }, + { + "epoch": 2.9721906643615705, + "grad_norm": 0.4457407014760488, + "learning_rate": 4.756918290783957e-06, + "loss": 0.5066, + "step": 7565 + }, + { + "epoch": 2.9725858236601628, + "grad_norm": 0.4797320041985171, + "learning_rate": 4.756850897019606e-06, + "loss": 0.5166, + "step": 7566 + }, + { + "epoch": 2.972980982958755, + "grad_norm": 0.44199710687789934, + "learning_rate": 4.756783494391716e-06, + "loss": 0.487, + "step": 7567 + }, + { + "epoch": 2.9733761422573473, + "grad_norm": 0.4426609628978589, + "learning_rate": 4.7567160829005496e-06, + "loss": 0.5036, + "step": 7568 + }, + { + "epoch": 2.9737713015559395, + "grad_norm": 0.4505400917951688, + "learning_rate": 4.756648662546373e-06, + "loss": 0.5125, + "step": 7569 + }, + { + "epoch": 2.9741664608545317, + "grad_norm": 0.4676898947705984, + "learning_rate": 4.756581233329451e-06, + "loss": 0.529, + "step": 7570 + }, + { + "epoch": 2.974561620153124, + "grad_norm": 0.44793604608650994, + "learning_rate": 4.756513795250048e-06, + "loss": 0.5314, + "step": 7571 + }, + { + "epoch": 2.974956779451716, + "grad_norm": 0.4257182384070774, + "learning_rate": 4.756446348308429e-06, + "loss": 0.4938, + "step": 7572 + }, + { + "epoch": 2.9753519387503085, + "grad_norm": 0.4541639462382242, + "learning_rate": 4.7563788925048596e-06, + "loss": 0.5049, + "step": 7573 + }, + { + "epoch": 2.9757470980489007, + "grad_norm": 0.45049135883048075, + "learning_rate": 4.7563114278396025e-06, + "loss": 0.4962, + "step": 7574 + }, + { + "epoch": 2.976142257347493, + "grad_norm": 0.44642306301580476, + "learning_rate": 4.756243954312926e-06, + "loss": 0.4906, + "step": 7575 + }, + { + "epoch": 2.976537416646085, + "grad_norm": 0.4551265744121246, + "learning_rate": 4.756176471925092e-06, + "loss": 0.5114, + "step": 7576 + }, + { + "epoch": 2.976932575944678, + "grad_norm": 0.44159910731274965, + "learning_rate": 4.756108980676367e-06, + "loss": 0.5182, + "step": 7577 + }, + { + "epoch": 2.97732773524327, + "grad_norm": 0.43027421694295137, + "learning_rate": 4.756041480567017e-06, + "loss": 0.4889, + "step": 7578 + }, + { + "epoch": 2.9777228945418623, + "grad_norm": 0.4337873859510924, + "learning_rate": 4.755973971597305e-06, + "loss": 0.4941, + "step": 7579 + }, + { + "epoch": 2.9781180538404546, + "grad_norm": 0.45031945697341647, + "learning_rate": 4.7559064537674975e-06, + "loss": 0.4998, + "step": 7580 + }, + { + "epoch": 2.978513213139047, + "grad_norm": 0.4712226590155607, + "learning_rate": 4.755838927077859e-06, + "loss": 0.5111, + "step": 7581 + }, + { + "epoch": 2.978908372437639, + "grad_norm": 0.4409693860661984, + "learning_rate": 4.755771391528655e-06, + "loss": 0.5069, + "step": 7582 + }, + { + "epoch": 2.9793035317362313, + "grad_norm": 0.4640142389222412, + "learning_rate": 4.755703847120152e-06, + "loss": 0.5164, + "step": 7583 + }, + { + "epoch": 2.9796986910348235, + "grad_norm": 0.45381043122965947, + "learning_rate": 4.7556362938526124e-06, + "loss": 0.5041, + "step": 7584 + }, + { + "epoch": 2.980093850333416, + "grad_norm": 0.4587590971637779, + "learning_rate": 4.755568731726304e-06, + "loss": 0.5203, + "step": 7585 + }, + { + "epoch": 2.980489009632008, + "grad_norm": 0.4644178501879429, + "learning_rate": 4.755501160741491e-06, + "loss": 0.5328, + "step": 7586 + }, + { + "epoch": 2.9808841689306003, + "grad_norm": 0.44776614224075106, + "learning_rate": 4.755433580898439e-06, + "loss": 0.5036, + "step": 7587 + }, + { + "epoch": 2.9812793282291925, + "grad_norm": 0.43844012754058254, + "learning_rate": 4.7553659921974134e-06, + "loss": 0.4832, + "step": 7588 + }, + { + "epoch": 2.9816744875277847, + "grad_norm": 0.45286284274574534, + "learning_rate": 4.75529839463868e-06, + "loss": 0.5185, + "step": 7589 + }, + { + "epoch": 2.982069646826377, + "grad_norm": 0.46049052268894997, + "learning_rate": 4.755230788222504e-06, + "loss": 0.5378, + "step": 7590 + }, + { + "epoch": 2.9824648061249692, + "grad_norm": 0.4485615523268441, + "learning_rate": 4.755163172949151e-06, + "loss": 0.5009, + "step": 7591 + }, + { + "epoch": 2.9828599654235615, + "grad_norm": 0.45034777636007545, + "learning_rate": 4.755095548818886e-06, + "loss": 0.5236, + "step": 7592 + }, + { + "epoch": 2.9832551247221537, + "grad_norm": 0.4531560958419335, + "learning_rate": 4.755027915831975e-06, + "loss": 0.5101, + "step": 7593 + }, + { + "epoch": 2.983650284020746, + "grad_norm": 0.452406575756383, + "learning_rate": 4.754960273988684e-06, + "loss": 0.51, + "step": 7594 + }, + { + "epoch": 2.984045443319338, + "grad_norm": 0.4404650508761369, + "learning_rate": 4.754892623289279e-06, + "loss": 0.498, + "step": 7595 + }, + { + "epoch": 2.9844406026179304, + "grad_norm": 0.45105701449061847, + "learning_rate": 4.754824963734024e-06, + "loss": 0.5247, + "step": 7596 + }, + { + "epoch": 2.9848357619165227, + "grad_norm": 0.4544785032506456, + "learning_rate": 4.754757295323186e-06, + "loss": 0.5006, + "step": 7597 + }, + { + "epoch": 2.985230921215115, + "grad_norm": 0.44648615459844027, + "learning_rate": 4.7546896180570305e-06, + "loss": 0.5139, + "step": 7598 + }, + { + "epoch": 2.985626080513707, + "grad_norm": 0.46035057899107057, + "learning_rate": 4.754621931935823e-06, + "loss": 0.5129, + "step": 7599 + }, + { + "epoch": 2.9860212398122994, + "grad_norm": 0.468475157789722, + "learning_rate": 4.75455423695983e-06, + "loss": 0.5284, + "step": 7600 + }, + { + "epoch": 2.9864163991108916, + "grad_norm": 0.455654479720452, + "learning_rate": 4.7544865331293175e-06, + "loss": 0.5141, + "step": 7601 + }, + { + "epoch": 2.986811558409484, + "grad_norm": 0.43877314753613916, + "learning_rate": 4.75441882044455e-06, + "loss": 0.5102, + "step": 7602 + }, + { + "epoch": 2.987206717708076, + "grad_norm": 0.44066884059445266, + "learning_rate": 4.754351098905795e-06, + "loss": 0.5096, + "step": 7603 + }, + { + "epoch": 2.9876018770066683, + "grad_norm": 0.455561747753586, + "learning_rate": 4.754283368513317e-06, + "loss": 0.5194, + "step": 7604 + }, + { + "epoch": 2.9879970363052606, + "grad_norm": 0.4532632517976643, + "learning_rate": 4.754215629267384e-06, + "loss": 0.5145, + "step": 7605 + }, + { + "epoch": 2.988392195603853, + "grad_norm": 0.4469539479625391, + "learning_rate": 4.75414788116826e-06, + "loss": 0.5018, + "step": 7606 + }, + { + "epoch": 2.988787354902445, + "grad_norm": 0.45541116390015846, + "learning_rate": 4.754080124216212e-06, + "loss": 0.4859, + "step": 7607 + }, + { + "epoch": 2.9891825142010373, + "grad_norm": 0.4496311984557622, + "learning_rate": 4.754012358411506e-06, + "loss": 0.5018, + "step": 7608 + }, + { + "epoch": 2.9895776734996296, + "grad_norm": 0.4530431443004678, + "learning_rate": 4.753944583754408e-06, + "loss": 0.489, + "step": 7609 + }, + { + "epoch": 2.989972832798222, + "grad_norm": 0.4475247994930158, + "learning_rate": 4.753876800245186e-06, + "loss": 0.4946, + "step": 7610 + }, + { + "epoch": 2.990367992096814, + "grad_norm": 0.44677019584511535, + "learning_rate": 4.753809007884103e-06, + "loss": 0.5155, + "step": 7611 + }, + { + "epoch": 2.9907631513954063, + "grad_norm": 0.4491717613748345, + "learning_rate": 4.753741206671426e-06, + "loss": 0.489, + "step": 7612 + }, + { + "epoch": 2.9911583106939985, + "grad_norm": 0.4718935616370409, + "learning_rate": 4.753673396607423e-06, + "loss": 0.5084, + "step": 7613 + }, + { + "epoch": 2.9915534699925908, + "grad_norm": 0.4526066949527833, + "learning_rate": 4.7536055776923596e-06, + "loss": 0.5151, + "step": 7614 + }, + { + "epoch": 2.991948629291183, + "grad_norm": 0.4506375450777332, + "learning_rate": 4.753537749926502e-06, + "loss": 0.5064, + "step": 7615 + }, + { + "epoch": 2.9923437885897752, + "grad_norm": 0.45034114642880246, + "learning_rate": 4.753469913310116e-06, + "loss": 0.5166, + "step": 7616 + }, + { + "epoch": 2.9927389478883675, + "grad_norm": 0.4407471329242633, + "learning_rate": 4.753402067843469e-06, + "loss": 0.5098, + "step": 7617 + }, + { + "epoch": 2.9931341071869597, + "grad_norm": 0.4761783641018543, + "learning_rate": 4.753334213526827e-06, + "loss": 0.5045, + "step": 7618 + }, + { + "epoch": 2.993529266485552, + "grad_norm": 0.4549004819674926, + "learning_rate": 4.7532663503604566e-06, + "loss": 0.5019, + "step": 7619 + }, + { + "epoch": 2.993924425784144, + "grad_norm": 0.43524384467655386, + "learning_rate": 4.753198478344624e-06, + "loss": 0.4918, + "step": 7620 + }, + { + "epoch": 2.9943195850827364, + "grad_norm": 0.45140783826694003, + "learning_rate": 4.753130597479596e-06, + "loss": 0.5067, + "step": 7621 + }, + { + "epoch": 2.9947147443813287, + "grad_norm": 0.4481553706282452, + "learning_rate": 4.753062707765639e-06, + "loss": 0.5192, + "step": 7622 + }, + { + "epoch": 2.995109903679921, + "grad_norm": 0.44777175097676636, + "learning_rate": 4.7529948092030204e-06, + "loss": 0.5129, + "step": 7623 + }, + { + "epoch": 2.995505062978513, + "grad_norm": 0.4479442802590514, + "learning_rate": 4.752926901792006e-06, + "loss": 0.4877, + "step": 7624 + }, + { + "epoch": 2.9959002222771054, + "grad_norm": 0.446793554178131, + "learning_rate": 4.752858985532862e-06, + "loss": 0.5011, + "step": 7625 + }, + { + "epoch": 2.9962953815756976, + "grad_norm": 0.45259107436589663, + "learning_rate": 4.7527910604258575e-06, + "loss": 0.4992, + "step": 7626 + }, + { + "epoch": 2.99669054087429, + "grad_norm": 0.4445697317032285, + "learning_rate": 4.752723126471257e-06, + "loss": 0.5161, + "step": 7627 + }, + { + "epoch": 2.997085700172882, + "grad_norm": 0.46322756286869876, + "learning_rate": 4.752655183669327e-06, + "loss": 0.5021, + "step": 7628 + }, + { + "epoch": 2.9974808594714744, + "grad_norm": 0.4483882532172694, + "learning_rate": 4.752587232020337e-06, + "loss": 0.4896, + "step": 7629 + }, + { + "epoch": 2.9978760187700666, + "grad_norm": 0.4474447067182486, + "learning_rate": 4.7525192715245505e-06, + "loss": 0.5, + "step": 7630 + }, + { + "epoch": 2.998271178068659, + "grad_norm": 0.46092387125951806, + "learning_rate": 4.752451302182237e-06, + "loss": 0.5081, + "step": 7631 + }, + { + "epoch": 2.998666337367251, + "grad_norm": 0.44451538632392057, + "learning_rate": 4.752383323993663e-06, + "loss": 0.506, + "step": 7632 + }, + { + "epoch": 2.9990614966658433, + "grad_norm": 0.44559816658293144, + "learning_rate": 4.752315336959094e-06, + "loss": 0.4877, + "step": 7633 + }, + { + "epoch": 2.9994566559644356, + "grad_norm": 0.4729115645935803, + "learning_rate": 4.752247341078798e-06, + "loss": 0.5108, + "step": 7634 + }, + { + "epoch": 2.999851815263028, + "grad_norm": 0.4634195899356373, + "learning_rate": 4.752179336353043e-06, + "loss": 0.507, + "step": 7635 + }, + { + "epoch": 3.0002469745616205, + "grad_norm": 0.44336385846437987, + "learning_rate": 4.752111322782095e-06, + "loss": 0.4778, + "step": 7636 + }, + { + "epoch": 3.0006421338602127, + "grad_norm": 0.4516001642956355, + "learning_rate": 4.752043300366222e-06, + "loss": 0.5019, + "step": 7637 + }, + { + "epoch": 3.001037293158805, + "grad_norm": 0.45717790797056945, + "learning_rate": 4.751975269105689e-06, + "loss": 0.5165, + "step": 7638 + }, + { + "epoch": 3.001432452457397, + "grad_norm": 0.4584460619343716, + "learning_rate": 4.751907229000765e-06, + "loss": 0.5139, + "step": 7639 + }, + { + "epoch": 3.0018276117559894, + "grad_norm": 0.4505928965434931, + "learning_rate": 4.751839180051717e-06, + "loss": 0.4879, + "step": 7640 + }, + { + "epoch": 3.0022227710545817, + "grad_norm": 0.4550585108130313, + "learning_rate": 4.751771122258812e-06, + "loss": 0.518, + "step": 7641 + }, + { + "epoch": 3.002617930353174, + "grad_norm": 0.4641841812808131, + "learning_rate": 4.751703055622317e-06, + "loss": 0.5098, + "step": 7642 + }, + { + "epoch": 3.003013089651766, + "grad_norm": 0.4619224304919582, + "learning_rate": 4.7516349801424995e-06, + "loss": 0.492, + "step": 7643 + }, + { + "epoch": 3.0034082489503584, + "grad_norm": 0.45126132444360667, + "learning_rate": 4.751566895819628e-06, + "loss": 0.5081, + "step": 7644 + }, + { + "epoch": 3.0038034082489506, + "grad_norm": 0.4397768300217093, + "learning_rate": 4.7514988026539686e-06, + "loss": 0.4917, + "step": 7645 + }, + { + "epoch": 3.004198567547543, + "grad_norm": 0.45619992289347067, + "learning_rate": 4.75143070064579e-06, + "loss": 0.4798, + "step": 7646 + }, + { + "epoch": 3.004593726846135, + "grad_norm": 0.4498728600811053, + "learning_rate": 4.751362589795358e-06, + "loss": 0.4953, + "step": 7647 + }, + { + "epoch": 3.0049888861447274, + "grad_norm": 0.451458140159821, + "learning_rate": 4.751294470102941e-06, + "loss": 0.4945, + "step": 7648 + }, + { + "epoch": 3.0053840454433196, + "grad_norm": 0.44656047500039336, + "learning_rate": 4.751226341568806e-06, + "loss": 0.4945, + "step": 7649 + }, + { + "epoch": 3.005779204741912, + "grad_norm": 0.44674554953771023, + "learning_rate": 4.75115820419322e-06, + "loss": 0.5066, + "step": 7650 + }, + { + "epoch": 3.006174364040504, + "grad_norm": 0.5042336016390171, + "learning_rate": 4.751090057976453e-06, + "loss": 0.5143, + "step": 7651 + }, + { + "epoch": 3.0065695233390963, + "grad_norm": 0.4703862669838775, + "learning_rate": 4.751021902918771e-06, + "loss": 0.5049, + "step": 7652 + }, + { + "epoch": 3.0069646826376886, + "grad_norm": 0.44280468476812773, + "learning_rate": 4.750953739020441e-06, + "loss": 0.5, + "step": 7653 + }, + { + "epoch": 3.007359841936281, + "grad_norm": 0.4423726279571279, + "learning_rate": 4.7508855662817325e-06, + "loss": 0.5138, + "step": 7654 + }, + { + "epoch": 3.007755001234873, + "grad_norm": 0.4668415881963999, + "learning_rate": 4.750817384702912e-06, + "loss": 0.5075, + "step": 7655 + }, + { + "epoch": 3.0081501605334653, + "grad_norm": 0.4395316160725124, + "learning_rate": 4.750749194284248e-06, + "loss": 0.4892, + "step": 7656 + }, + { + "epoch": 3.0085453198320575, + "grad_norm": 0.4523097575996378, + "learning_rate": 4.750680995026007e-06, + "loss": 0.5155, + "step": 7657 + }, + { + "epoch": 3.0089404791306498, + "grad_norm": 0.43939023061801286, + "learning_rate": 4.7506127869284585e-06, + "loss": 0.4957, + "step": 7658 + }, + { + "epoch": 3.009335638429242, + "grad_norm": 0.46277152916188, + "learning_rate": 4.7505445699918695e-06, + "loss": 0.4936, + "step": 7659 + }, + { + "epoch": 3.0097307977278343, + "grad_norm": 0.46615632583881483, + "learning_rate": 4.750476344216508e-06, + "loss": 0.5201, + "step": 7660 + }, + { + "epoch": 3.0101259570264265, + "grad_norm": 0.4914314812774924, + "learning_rate": 4.750408109602641e-06, + "loss": 0.5254, + "step": 7661 + }, + { + "epoch": 3.0105211163250187, + "grad_norm": 0.459305044150223, + "learning_rate": 4.7503398661505386e-06, + "loss": 0.5076, + "step": 7662 + }, + { + "epoch": 3.010916275623611, + "grad_norm": 0.46075158979928715, + "learning_rate": 4.750271613860468e-06, + "loss": 0.5264, + "step": 7663 + }, + { + "epoch": 3.011311434922203, + "grad_norm": 0.450653814827898, + "learning_rate": 4.750203352732696e-06, + "loss": 0.5151, + "step": 7664 + }, + { + "epoch": 3.0117065942207955, + "grad_norm": 0.458283864427654, + "learning_rate": 4.750135082767492e-06, + "loss": 0.5008, + "step": 7665 + }, + { + "epoch": 3.0121017535193877, + "grad_norm": 0.4685611930993023, + "learning_rate": 4.750066803965124e-06, + "loss": 0.5187, + "step": 7666 + }, + { + "epoch": 3.01249691281798, + "grad_norm": 0.47791128196843446, + "learning_rate": 4.749998516325859e-06, + "loss": 0.5019, + "step": 7667 + }, + { + "epoch": 3.012892072116572, + "grad_norm": 0.4872509174602875, + "learning_rate": 4.749930219849967e-06, + "loss": 0.5231, + "step": 7668 + }, + { + "epoch": 3.0132872314151644, + "grad_norm": 0.4684674472270326, + "learning_rate": 4.749861914537715e-06, + "loss": 0.5076, + "step": 7669 + }, + { + "epoch": 3.0136823907137567, + "grad_norm": 0.4421540054512149, + "learning_rate": 4.749793600389372e-06, + "loss": 0.528, + "step": 7670 + }, + { + "epoch": 3.014077550012349, + "grad_norm": 0.46434385398269223, + "learning_rate": 4.749725277405205e-06, + "loss": 0.5189, + "step": 7671 + }, + { + "epoch": 3.014472709310941, + "grad_norm": 0.45238786433034117, + "learning_rate": 4.749656945585484e-06, + "loss": 0.4985, + "step": 7672 + }, + { + "epoch": 3.0148678686095334, + "grad_norm": 0.4740826575920035, + "learning_rate": 4.749588604930476e-06, + "loss": 0.4961, + "step": 7673 + }, + { + "epoch": 3.0152630279081256, + "grad_norm": 0.4356051895472756, + "learning_rate": 4.749520255440451e-06, + "loss": 0.5006, + "step": 7674 + }, + { + "epoch": 3.015658187206718, + "grad_norm": 0.44354071567582887, + "learning_rate": 4.749451897115675e-06, + "loss": 0.4959, + "step": 7675 + }, + { + "epoch": 3.01605334650531, + "grad_norm": 0.4561647234991173, + "learning_rate": 4.749383529956419e-06, + "loss": 0.4928, + "step": 7676 + }, + { + "epoch": 3.0164485058039023, + "grad_norm": 0.4883457852318272, + "learning_rate": 4.74931515396295e-06, + "loss": 0.5144, + "step": 7677 + }, + { + "epoch": 3.0168436651024946, + "grad_norm": 0.44882770949870965, + "learning_rate": 4.749246769135537e-06, + "loss": 0.4993, + "step": 7678 + }, + { + "epoch": 3.000222277105458, + "grad_norm": 0.7010342589000016, + "learning_rate": 4.749178375474448e-06, + "loss": 0.473, + "step": 7679 + }, + { + "epoch": 3.0006174364040503, + "grad_norm": 0.915590098727133, + "learning_rate": 4.749109972979953e-06, + "loss": 0.45, + "step": 7680 + }, + { + "epoch": 3.0010125957026426, + "grad_norm": 0.6897379840405359, + "learning_rate": 4.74904156165232e-06, + "loss": 0.4431, + "step": 7681 + }, + { + "epoch": 3.001407755001235, + "grad_norm": 0.5249346162970593, + "learning_rate": 4.748973141491816e-06, + "loss": 0.4328, + "step": 7682 + }, + { + "epoch": 3.001802914299827, + "grad_norm": 0.9709224152272217, + "learning_rate": 4.748904712498712e-06, + "loss": 0.4507, + "step": 7683 + }, + { + "epoch": 3.0021980735984193, + "grad_norm": 0.9257083172881498, + "learning_rate": 4.748836274673275e-06, + "loss": 0.4171, + "step": 7684 + }, + { + "epoch": 3.0025932328970115, + "grad_norm": 0.6654293170865759, + "learning_rate": 4.748767828015777e-06, + "loss": 0.4453, + "step": 7685 + }, + { + "epoch": 3.0029883921956038, + "grad_norm": 0.5713696408874206, + "learning_rate": 4.7486993725264824e-06, + "loss": 0.4407, + "step": 7686 + }, + { + "epoch": 3.003383551494196, + "grad_norm": 0.6629984546742934, + "learning_rate": 4.748630908205663e-06, + "loss": 0.4266, + "step": 7687 + }, + { + "epoch": 3.0037787107927882, + "grad_norm": 0.7031964288055295, + "learning_rate": 4.748562435053587e-06, + "loss": 0.4571, + "step": 7688 + }, + { + "epoch": 3.0041738700913805, + "grad_norm": 0.8005034174097257, + "learning_rate": 4.748493953070522e-06, + "loss": 0.441, + "step": 7689 + }, + { + "epoch": 3.0045690293899727, + "grad_norm": 0.5656503152080794, + "learning_rate": 4.748425462256739e-06, + "loss": 0.4482, + "step": 7690 + }, + { + "epoch": 3.004964188688565, + "grad_norm": 0.5805979657928287, + "learning_rate": 4.748356962612506e-06, + "loss": 0.4308, + "step": 7691 + }, + { + "epoch": 3.005359347987157, + "grad_norm": 0.6569090685400837, + "learning_rate": 4.7482884541380915e-06, + "loss": 0.4519, + "step": 7692 + }, + { + "epoch": 3.0057545072857494, + "grad_norm": 0.6217734324861036, + "learning_rate": 4.748219936833766e-06, + "loss": 0.4349, + "step": 7693 + }, + { + "epoch": 3.0061496665843417, + "grad_norm": 0.5796522323129869, + "learning_rate": 4.748151410699798e-06, + "loss": 0.4479, + "step": 7694 + }, + { + "epoch": 3.006544825882934, + "grad_norm": 0.5356679945336663, + "learning_rate": 4.7480828757364555e-06, + "loss": 0.4209, + "step": 7695 + }, + { + "epoch": 3.006939985181526, + "grad_norm": 0.5544815975849889, + "learning_rate": 4.7480143319440094e-06, + "loss": 0.4321, + "step": 7696 + }, + { + "epoch": 3.0073351444801184, + "grad_norm": 0.5615701121311352, + "learning_rate": 4.747945779322727e-06, + "loss": 0.4498, + "step": 7697 + }, + { + "epoch": 3.0077303037787106, + "grad_norm": 0.5591511570043203, + "learning_rate": 4.747877217872879e-06, + "loss": 0.4434, + "step": 7698 + }, + { + "epoch": 3.008125463077303, + "grad_norm": 0.5231502732146598, + "learning_rate": 4.747808647594735e-06, + "loss": 0.4454, + "step": 7699 + }, + { + "epoch": 3.008520622375895, + "grad_norm": 0.5749999982521911, + "learning_rate": 4.747740068488563e-06, + "loss": 0.4335, + "step": 7700 + }, + { + "epoch": 3.0089157816744874, + "grad_norm": 0.5594741958286711, + "learning_rate": 4.747671480554633e-06, + "loss": 0.4479, + "step": 7701 + }, + { + "epoch": 3.0093109409730796, + "grad_norm": 0.5229530513831508, + "learning_rate": 4.747602883793215e-06, + "loss": 0.4206, + "step": 7702 + }, + { + "epoch": 3.009706100271672, + "grad_norm": 0.5269098553306883, + "learning_rate": 4.747534278204576e-06, + "loss": 0.4259, + "step": 7703 + }, + { + "epoch": 3.010101259570264, + "grad_norm": 0.5743914914071026, + "learning_rate": 4.747465663788989e-06, + "loss": 0.4304, + "step": 7704 + }, + { + "epoch": 3.0104964188688563, + "grad_norm": 0.5032330836219913, + "learning_rate": 4.747397040546721e-06, + "loss": 0.4369, + "step": 7705 + }, + { + "epoch": 3.0108915781674486, + "grad_norm": 0.5170294237714139, + "learning_rate": 4.747328408478042e-06, + "loss": 0.442, + "step": 7706 + }, + { + "epoch": 3.011286737466041, + "grad_norm": 0.5325099151723756, + "learning_rate": 4.747259767583221e-06, + "loss": 0.4423, + "step": 7707 + }, + { + "epoch": 3.011681896764633, + "grad_norm": 0.5263448352266229, + "learning_rate": 4.7471911178625285e-06, + "loss": 0.4553, + "step": 7708 + }, + { + "epoch": 3.0120770560632253, + "grad_norm": 0.5935074925411176, + "learning_rate": 4.747122459316235e-06, + "loss": 0.4524, + "step": 7709 + }, + { + "epoch": 3.0124722153618175, + "grad_norm": 0.5162191886079706, + "learning_rate": 4.747053791944607e-06, + "loss": 0.4536, + "step": 7710 + }, + { + "epoch": 3.0128673746604098, + "grad_norm": 0.48679105836545394, + "learning_rate": 4.746985115747918e-06, + "loss": 0.4128, + "step": 7711 + }, + { + "epoch": 3.013262533959002, + "grad_norm": 0.5158017050260368, + "learning_rate": 4.746916430726435e-06, + "loss": 0.4413, + "step": 7712 + }, + { + "epoch": 3.0136576932575943, + "grad_norm": 0.5255387088096938, + "learning_rate": 4.746847736880429e-06, + "loss": 0.4331, + "step": 7713 + }, + { + "epoch": 3.014052852556187, + "grad_norm": 0.49833669861127644, + "learning_rate": 4.746779034210169e-06, + "loss": 0.4546, + "step": 7714 + }, + { + "epoch": 3.014448011854779, + "grad_norm": 0.5013002795411056, + "learning_rate": 4.746710322715926e-06, + "loss": 0.4338, + "step": 7715 + }, + { + "epoch": 3.0148431711533714, + "grad_norm": 0.47380997163306915, + "learning_rate": 4.746641602397969e-06, + "loss": 0.452, + "step": 7716 + }, + { + "epoch": 3.0152383304519637, + "grad_norm": 0.48815835539987246, + "learning_rate": 4.746572873256568e-06, + "loss": 0.4325, + "step": 7717 + }, + { + "epoch": 3.015633489750556, + "grad_norm": 0.5234825947101446, + "learning_rate": 4.746504135291992e-06, + "loss": 0.4627, + "step": 7718 + }, + { + "epoch": 3.016028649049148, + "grad_norm": 0.46528150114040756, + "learning_rate": 4.746435388504513e-06, + "loss": 0.4135, + "step": 7719 + }, + { + "epoch": 3.0164238083477404, + "grad_norm": 0.5186401009350103, + "learning_rate": 4.746366632894399e-06, + "loss": 0.4434, + "step": 7720 + }, + { + "epoch": 3.0168189676463326, + "grad_norm": 0.48449546596605336, + "learning_rate": 4.746297868461922e-06, + "loss": 0.4275, + "step": 7721 + }, + { + "epoch": 3.017214126944925, + "grad_norm": 0.4972566211579247, + "learning_rate": 4.74622909520735e-06, + "loss": 0.4352, + "step": 7722 + }, + { + "epoch": 3.017609286243517, + "grad_norm": 0.49817262550108005, + "learning_rate": 4.746160313130955e-06, + "loss": 0.4427, + "step": 7723 + }, + { + "epoch": 3.0180044455421093, + "grad_norm": 0.4807900493796438, + "learning_rate": 4.746091522233006e-06, + "loss": 0.4458, + "step": 7724 + }, + { + "epoch": 3.0183996048407016, + "grad_norm": 0.486178408026208, + "learning_rate": 4.746022722513772e-06, + "loss": 0.4375, + "step": 7725 + }, + { + "epoch": 3.018794764139294, + "grad_norm": 0.47609321433727503, + "learning_rate": 4.745953913973526e-06, + "loss": 0.439, + "step": 7726 + }, + { + "epoch": 3.019189923437886, + "grad_norm": 0.48600695149007456, + "learning_rate": 4.745885096612537e-06, + "loss": 0.427, + "step": 7727 + }, + { + "epoch": 3.0195850827364783, + "grad_norm": 0.509073376294911, + "learning_rate": 4.745816270431075e-06, + "loss": 0.4388, + "step": 7728 + }, + { + "epoch": 3.0199802420350705, + "grad_norm": 0.5057289776617495, + "learning_rate": 4.74574743542941e-06, + "loss": 0.4388, + "step": 7729 + }, + { + "epoch": 3.020375401333663, + "grad_norm": 0.506257402966116, + "learning_rate": 4.745678591607813e-06, + "loss": 0.4337, + "step": 7730 + }, + { + "epoch": 3.020770560632255, + "grad_norm": 0.5051592228444768, + "learning_rate": 4.745609738966554e-06, + "loss": 0.4439, + "step": 7731 + }, + { + "epoch": 3.0211657199308473, + "grad_norm": 0.5014567224089678, + "learning_rate": 4.745540877505904e-06, + "loss": 0.4341, + "step": 7732 + }, + { + "epoch": 3.0215608792294395, + "grad_norm": 0.5016734580118337, + "learning_rate": 4.745472007226133e-06, + "loss": 0.437, + "step": 7733 + }, + { + "epoch": 3.0219560385280317, + "grad_norm": 0.5017266865893028, + "learning_rate": 4.7454031281275105e-06, + "loss": 0.4297, + "step": 7734 + }, + { + "epoch": 3.022351197826624, + "grad_norm": 0.4925195971174838, + "learning_rate": 4.745334240210309e-06, + "loss": 0.4397, + "step": 7735 + }, + { + "epoch": 3.022746357125216, + "grad_norm": 0.47846300333645714, + "learning_rate": 4.745265343474797e-06, + "loss": 0.4587, + "step": 7736 + }, + { + "epoch": 3.0231415164238085, + "grad_norm": 0.4619490312270768, + "learning_rate": 4.745196437921247e-06, + "loss": 0.4376, + "step": 7737 + }, + { + "epoch": 3.0235366757224007, + "grad_norm": 0.4863155493658263, + "learning_rate": 4.745127523549928e-06, + "loss": 0.486, + "step": 7738 + }, + { + "epoch": 3.023931835020993, + "grad_norm": 0.5009948111270373, + "learning_rate": 4.7450586003611124e-06, + "loss": 0.4433, + "step": 7739 + }, + { + "epoch": 3.024326994319585, + "grad_norm": 0.49175225322505095, + "learning_rate": 4.744989668355069e-06, + "loss": 0.4466, + "step": 7740 + }, + { + "epoch": 3.0247221536181774, + "grad_norm": 0.4997137132091148, + "learning_rate": 4.744920727532069e-06, + "loss": 0.4422, + "step": 7741 + }, + { + "epoch": 3.0251173129167697, + "grad_norm": 0.49782214688948523, + "learning_rate": 4.744851777892386e-06, + "loss": 0.4215, + "step": 7742 + }, + { + "epoch": 3.025512472215362, + "grad_norm": 0.4878799150602817, + "learning_rate": 4.744782819436287e-06, + "loss": 0.4689, + "step": 7743 + }, + { + "epoch": 3.025907631513954, + "grad_norm": 0.4801197425799182, + "learning_rate": 4.7447138521640435e-06, + "loss": 0.4133, + "step": 7744 + }, + { + "epoch": 3.0263027908125464, + "grad_norm": 0.5153722734142074, + "learning_rate": 4.744644876075926e-06, + "loss": 0.4328, + "step": 7745 + }, + { + "epoch": 3.0266979501111386, + "grad_norm": 0.48115731489986924, + "learning_rate": 4.744575891172209e-06, + "loss": 0.4477, + "step": 7746 + }, + { + "epoch": 3.027093109409731, + "grad_norm": 0.5030409292416705, + "learning_rate": 4.74450689745316e-06, + "loss": 0.458, + "step": 7747 + }, + { + "epoch": 3.027488268708323, + "grad_norm": 0.47695935750014723, + "learning_rate": 4.7444378949190505e-06, + "loss": 0.4647, + "step": 7748 + }, + { + "epoch": 3.0278834280069153, + "grad_norm": 0.4898314891709876, + "learning_rate": 4.744368883570152e-06, + "loss": 0.4581, + "step": 7749 + }, + { + "epoch": 3.0282785873055076, + "grad_norm": 0.5003360236778067, + "learning_rate": 4.7442998634067356e-06, + "loss": 0.4249, + "step": 7750 + }, + { + "epoch": 3.0286737466041, + "grad_norm": 0.5091015290593945, + "learning_rate": 4.744230834429071e-06, + "loss": 0.4614, + "step": 7751 + }, + { + "epoch": 3.029068905902692, + "grad_norm": 0.49509617467245814, + "learning_rate": 4.744161796637432e-06, + "loss": 0.4538, + "step": 7752 + }, + { + "epoch": 3.0294640652012843, + "grad_norm": 0.49300299029182093, + "learning_rate": 4.7440927500320875e-06, + "loss": 0.4303, + "step": 7753 + }, + { + "epoch": 3.0298592244998765, + "grad_norm": 0.49820644043525975, + "learning_rate": 4.74402369461331e-06, + "loss": 0.4286, + "step": 7754 + }, + { + "epoch": 3.030254383798469, + "grad_norm": 0.5079411624836448, + "learning_rate": 4.743954630381369e-06, + "loss": 0.4414, + "step": 7755 + }, + { + "epoch": 3.030649543097061, + "grad_norm": 0.4842722586826298, + "learning_rate": 4.743885557336537e-06, + "loss": 0.4296, + "step": 7756 + }, + { + "epoch": 3.0310447023956533, + "grad_norm": 0.48422624836220324, + "learning_rate": 4.743816475479086e-06, + "loss": 0.4456, + "step": 7757 + }, + { + "epoch": 3.0314398616942455, + "grad_norm": 0.5143163370660467, + "learning_rate": 4.743747384809286e-06, + "loss": 0.4369, + "step": 7758 + }, + { + "epoch": 3.0318350209928377, + "grad_norm": 0.48054033529354206, + "learning_rate": 4.743678285327409e-06, + "loss": 0.4414, + "step": 7759 + }, + { + "epoch": 3.03223018029143, + "grad_norm": 0.47545118742637443, + "learning_rate": 4.743609177033725e-06, + "loss": 0.4391, + "step": 7760 + }, + { + "epoch": 3.0326253395900222, + "grad_norm": 0.500071350352214, + "learning_rate": 4.7435400599285075e-06, + "loss": 0.4591, + "step": 7761 + }, + { + "epoch": 3.0330204988886145, + "grad_norm": 0.49736275171993133, + "learning_rate": 4.743470934012026e-06, + "loss": 0.4451, + "step": 7762 + }, + { + "epoch": 3.0334156581872067, + "grad_norm": 0.48902933971503654, + "learning_rate": 4.7434017992845536e-06, + "loss": 0.444, + "step": 7763 + }, + { + "epoch": 3.033810817485799, + "grad_norm": 0.5017317318287491, + "learning_rate": 4.743332655746362e-06, + "loss": 0.4433, + "step": 7764 + }, + { + "epoch": 3.034205976784391, + "grad_norm": 0.49852541132644085, + "learning_rate": 4.743263503397721e-06, + "loss": 0.443, + "step": 7765 + }, + { + "epoch": 3.0346011360829834, + "grad_norm": 0.49514495883125453, + "learning_rate": 4.743194342238904e-06, + "loss": 0.4434, + "step": 7766 + }, + { + "epoch": 3.0349962953815757, + "grad_norm": 0.5065607691043884, + "learning_rate": 4.743125172270181e-06, + "loss": 0.4557, + "step": 7767 + }, + { + "epoch": 3.035391454680168, + "grad_norm": 0.500892295601084, + "learning_rate": 4.743055993491824e-06, + "loss": 0.438, + "step": 7768 + }, + { + "epoch": 3.03578661397876, + "grad_norm": 0.4840852701954542, + "learning_rate": 4.7429868059041065e-06, + "loss": 0.4334, + "step": 7769 + }, + { + "epoch": 3.0361817732773524, + "grad_norm": 0.5112220036213672, + "learning_rate": 4.742917609507298e-06, + "loss": 0.4411, + "step": 7770 + }, + { + "epoch": 3.0365769325759446, + "grad_norm": 0.48860572596776, + "learning_rate": 4.742848404301671e-06, + "loss": 0.4502, + "step": 7771 + }, + { + "epoch": 3.036972091874537, + "grad_norm": 0.4819193894547075, + "learning_rate": 4.742779190287497e-06, + "loss": 0.4429, + "step": 7772 + }, + { + "epoch": 3.037367251173129, + "grad_norm": 0.4943547849084416, + "learning_rate": 4.742709967465049e-06, + "loss": 0.4421, + "step": 7773 + }, + { + "epoch": 3.0377624104717214, + "grad_norm": 0.4917250752903298, + "learning_rate": 4.742640735834599e-06, + "loss": 0.4425, + "step": 7774 + }, + { + "epoch": 3.0381575697703136, + "grad_norm": 0.46606336140539156, + "learning_rate": 4.742571495396415e-06, + "loss": 0.4378, + "step": 7775 + }, + { + "epoch": 3.038552729068906, + "grad_norm": 0.47545852725279386, + "learning_rate": 4.742502246150775e-06, + "loss": 0.4333, + "step": 7776 + }, + { + "epoch": 3.038947888367498, + "grad_norm": 0.48667928367880847, + "learning_rate": 4.742432988097946e-06, + "loss": 0.4436, + "step": 7777 + }, + { + "epoch": 3.0393430476660903, + "grad_norm": 0.4984876343406036, + "learning_rate": 4.742363721238203e-06, + "loss": 0.4413, + "step": 7778 + }, + { + "epoch": 3.0397382069646826, + "grad_norm": 0.4810472084691766, + "learning_rate": 4.742294445571817e-06, + "loss": 0.4297, + "step": 7779 + }, + { + "epoch": 3.040133366263275, + "grad_norm": 0.49611193782024826, + "learning_rate": 4.742225161099059e-06, + "loss": 0.4536, + "step": 7780 + }, + { + "epoch": 3.040528525561867, + "grad_norm": 0.5043827439085407, + "learning_rate": 4.7421558678202025e-06, + "loss": 0.4358, + "step": 7781 + }, + { + "epoch": 3.0409236848604593, + "grad_norm": 0.48913922839386753, + "learning_rate": 4.7420865657355195e-06, + "loss": 0.4348, + "step": 7782 + }, + { + "epoch": 3.0413188441590515, + "grad_norm": 0.5045463746963759, + "learning_rate": 4.742017254845282e-06, + "loss": 0.4381, + "step": 7783 + }, + { + "epoch": 3.0417140034576438, + "grad_norm": 0.4955607123203069, + "learning_rate": 4.741947935149762e-06, + "loss": 0.439, + "step": 7784 + }, + { + "epoch": 3.042109162756236, + "grad_norm": 0.49656116436176884, + "learning_rate": 4.741878606649232e-06, + "loss": 0.4373, + "step": 7785 + }, + { + "epoch": 3.0425043220548282, + "grad_norm": 0.48383805086599924, + "learning_rate": 4.741809269343964e-06, + "loss": 0.4417, + "step": 7786 + }, + { + "epoch": 3.0428994813534205, + "grad_norm": 0.48644485198914195, + "learning_rate": 4.741739923234231e-06, + "loss": 0.4348, + "step": 7787 + }, + { + "epoch": 3.0432946406520127, + "grad_norm": 0.5051982041050768, + "learning_rate": 4.741670568320304e-06, + "loss": 0.4462, + "step": 7788 + }, + { + "epoch": 3.043689799950605, + "grad_norm": 0.5048908640256319, + "learning_rate": 4.741601204602457e-06, + "loss": 0.4518, + "step": 7789 + }, + { + "epoch": 3.044084959249197, + "grad_norm": 0.4944058502455239, + "learning_rate": 4.741531832080961e-06, + "loss": 0.4481, + "step": 7790 + }, + { + "epoch": 3.0444801185477894, + "grad_norm": 0.5375482439420921, + "learning_rate": 4.741462450756089e-06, + "loss": 0.4556, + "step": 7791 + }, + { + "epoch": 3.0448752778463817, + "grad_norm": 0.483762343858191, + "learning_rate": 4.741393060628115e-06, + "loss": 0.4258, + "step": 7792 + }, + { + "epoch": 3.045270437144974, + "grad_norm": 0.47995611425569656, + "learning_rate": 4.741323661697308e-06, + "loss": 0.4364, + "step": 7793 + }, + { + "epoch": 3.045665596443566, + "grad_norm": 0.47885796380315865, + "learning_rate": 4.741254253963944e-06, + "loss": 0.4335, + "step": 7794 + }, + { + "epoch": 3.0460607557421584, + "grad_norm": 0.5094045890304211, + "learning_rate": 4.741184837428294e-06, + "loss": 0.4501, + "step": 7795 + }, + { + "epoch": 3.0464559150407506, + "grad_norm": 0.4896089467966352, + "learning_rate": 4.741115412090631e-06, + "loss": 0.448, + "step": 7796 + }, + { + "epoch": 3.046851074339343, + "grad_norm": 0.4870799703211307, + "learning_rate": 4.7410459779512276e-06, + "loss": 0.4455, + "step": 7797 + }, + { + "epoch": 3.047246233637935, + "grad_norm": 0.5102806429622584, + "learning_rate": 4.740976535010355e-06, + "loss": 0.431, + "step": 7798 + }, + { + "epoch": 3.0476413929365274, + "grad_norm": 0.4899387354627377, + "learning_rate": 4.740907083268289e-06, + "loss": 0.4552, + "step": 7799 + }, + { + "epoch": 3.0480365522351196, + "grad_norm": 0.4979089977106902, + "learning_rate": 4.740837622725301e-06, + "loss": 0.4383, + "step": 7800 + }, + { + "epoch": 3.048431711533712, + "grad_norm": 0.49833292526111833, + "learning_rate": 4.7407681533816624e-06, + "loss": 0.4373, + "step": 7801 + }, + { + "epoch": 3.048826870832304, + "grad_norm": 0.49821399658843996, + "learning_rate": 4.7406986752376475e-06, + "loss": 0.4479, + "step": 7802 + }, + { + "epoch": 3.0492220301308963, + "grad_norm": 0.4744560438026494, + "learning_rate": 4.740629188293529e-06, + "loss": 0.4276, + "step": 7803 + }, + { + "epoch": 3.0496171894294886, + "grad_norm": 0.47806331822046966, + "learning_rate": 4.740559692549579e-06, + "loss": 0.4253, + "step": 7804 + }, + { + "epoch": 3.050012348728081, + "grad_norm": 0.48603559179075806, + "learning_rate": 4.7404901880060725e-06, + "loss": 0.4541, + "step": 7805 + }, + { + "epoch": 3.050407508026673, + "grad_norm": 0.4910984242297495, + "learning_rate": 4.74042067466328e-06, + "loss": 0.4299, + "step": 7806 + }, + { + "epoch": 3.0508026673252653, + "grad_norm": 0.4851618865150965, + "learning_rate": 4.740351152521475e-06, + "loss": 0.4374, + "step": 7807 + }, + { + "epoch": 3.0511978266238575, + "grad_norm": 0.4889763206502265, + "learning_rate": 4.740281621580932e-06, + "loss": 0.436, + "step": 7808 + }, + { + "epoch": 3.0515929859224498, + "grad_norm": 0.5012899815645088, + "learning_rate": 4.740212081841924e-06, + "loss": 0.4457, + "step": 7809 + }, + { + "epoch": 3.0519881452210424, + "grad_norm": 0.5082796466392632, + "learning_rate": 4.7401425333047215e-06, + "loss": 0.4409, + "step": 7810 + }, + { + "epoch": 3.0523833045196347, + "grad_norm": 0.4982145445419337, + "learning_rate": 4.7400729759696e-06, + "loss": 0.4436, + "step": 7811 + }, + { + "epoch": 3.052778463818227, + "grad_norm": 0.4956899805603572, + "learning_rate": 4.7400034098368325e-06, + "loss": 0.4269, + "step": 7812 + }, + { + "epoch": 3.053173623116819, + "grad_norm": 0.48480193413501993, + "learning_rate": 4.739933834906692e-06, + "loss": 0.4479, + "step": 7813 + }, + { + "epoch": 3.0535687824154114, + "grad_norm": 0.4806050679449309, + "learning_rate": 4.73986425117945e-06, + "loss": 0.4367, + "step": 7814 + }, + { + "epoch": 3.0539639417140036, + "grad_norm": 0.4855626404004344, + "learning_rate": 4.739794658655383e-06, + "loss": 0.4482, + "step": 7815 + }, + { + "epoch": 3.054359101012596, + "grad_norm": 0.5103646887287417, + "learning_rate": 4.739725057334762e-06, + "loss": 0.4485, + "step": 7816 + }, + { + "epoch": 3.054754260311188, + "grad_norm": 0.48745678622976013, + "learning_rate": 4.7396554472178615e-06, + "loss": 0.4548, + "step": 7817 + }, + { + "epoch": 3.0551494196097804, + "grad_norm": 0.512946784611527, + "learning_rate": 4.739585828304953e-06, + "loss": 0.4438, + "step": 7818 + }, + { + "epoch": 3.0555445789083726, + "grad_norm": 0.49787233339609754, + "learning_rate": 4.739516200596313e-06, + "loss": 0.4412, + "step": 7819 + }, + { + "epoch": 3.055939738206965, + "grad_norm": 0.4923640029971268, + "learning_rate": 4.739446564092213e-06, + "loss": 0.4266, + "step": 7820 + }, + { + "epoch": 3.056334897505557, + "grad_norm": 0.4922770070573321, + "learning_rate": 4.739376918792926e-06, + "loss": 0.4234, + "step": 7821 + }, + { + "epoch": 3.0567300568041493, + "grad_norm": 0.48256515830579066, + "learning_rate": 4.7393072646987266e-06, + "loss": 0.4283, + "step": 7822 + }, + { + "epoch": 3.0571252161027416, + "grad_norm": 0.4871301212135088, + "learning_rate": 4.739237601809889e-06, + "loss": 0.4347, + "step": 7823 + }, + { + "epoch": 3.057520375401334, + "grad_norm": 0.48509606892345153, + "learning_rate": 4.739167930126684e-06, + "loss": 0.4345, + "step": 7824 + }, + { + "epoch": 3.057915534699926, + "grad_norm": 0.6165399120475508, + "learning_rate": 4.739098249649388e-06, + "loss": 0.4507, + "step": 7825 + }, + { + "epoch": 3.0583106939985183, + "grad_norm": 0.4921339621782017, + "learning_rate": 4.739028560378274e-06, + "loss": 0.4406, + "step": 7826 + }, + { + "epoch": 3.0587058532971105, + "grad_norm": 0.5054560093119399, + "learning_rate": 4.738958862313615e-06, + "loss": 0.4542, + "step": 7827 + }, + { + "epoch": 3.0591010125957028, + "grad_norm": 0.4886778814315178, + "learning_rate": 4.7388891554556845e-06, + "loss": 0.4393, + "step": 7828 + }, + { + "epoch": 3.059496171894295, + "grad_norm": 0.4860239144323546, + "learning_rate": 4.7388194398047585e-06, + "loss": 0.4282, + "step": 7829 + }, + { + "epoch": 3.0598913311928873, + "grad_norm": 0.4924325835486165, + "learning_rate": 4.738749715361108e-06, + "loss": 0.4533, + "step": 7830 + }, + { + "epoch": 3.0602864904914795, + "grad_norm": 0.48932648303037984, + "learning_rate": 4.738679982125008e-06, + "loss": 0.4453, + "step": 7831 + }, + { + "epoch": 3.0606816497900717, + "grad_norm": 0.48237789375682943, + "learning_rate": 4.738610240096733e-06, + "loss": 0.4447, + "step": 7832 + }, + { + "epoch": 3.061076809088664, + "grad_norm": 0.506439179071494, + "learning_rate": 4.7385404892765565e-06, + "loss": 0.4499, + "step": 7833 + }, + { + "epoch": 3.061471968387256, + "grad_norm": 0.4977813662126531, + "learning_rate": 4.738470729664753e-06, + "loss": 0.4369, + "step": 7834 + }, + { + "epoch": 3.0618671276858485, + "grad_norm": 0.5122710738027177, + "learning_rate": 4.738400961261594e-06, + "loss": 0.4395, + "step": 7835 + }, + { + "epoch": 3.0622622869844407, + "grad_norm": 0.4846403050899975, + "learning_rate": 4.7383311840673565e-06, + "loss": 0.4371, + "step": 7836 + }, + { + "epoch": 3.062657446283033, + "grad_norm": 0.5032833387552864, + "learning_rate": 4.738261398082313e-06, + "loss": 0.4389, + "step": 7837 + }, + { + "epoch": 3.063052605581625, + "grad_norm": 0.4923216050566678, + "learning_rate": 4.738191603306738e-06, + "loss": 0.4339, + "step": 7838 + }, + { + "epoch": 3.0634477648802174, + "grad_norm": 0.4909668236344491, + "learning_rate": 4.738121799740904e-06, + "loss": 0.4379, + "step": 7839 + }, + { + "epoch": 3.0638429241788097, + "grad_norm": 0.5040118071240738, + "learning_rate": 4.738051987385088e-06, + "loss": 0.44, + "step": 7840 + }, + { + "epoch": 3.064238083477402, + "grad_norm": 0.5125484179414564, + "learning_rate": 4.737982166239563e-06, + "loss": 0.4341, + "step": 7841 + }, + { + "epoch": 3.064633242775994, + "grad_norm": 0.5251112266930515, + "learning_rate": 4.737912336304602e-06, + "loss": 0.4307, + "step": 7842 + }, + { + "epoch": 3.0650284020745864, + "grad_norm": 0.481537716871614, + "learning_rate": 4.737842497580482e-06, + "loss": 0.4342, + "step": 7843 + }, + { + "epoch": 3.0654235613731786, + "grad_norm": 0.48948821736693127, + "learning_rate": 4.737772650067474e-06, + "loss": 0.439, + "step": 7844 + }, + { + "epoch": 3.065818720671771, + "grad_norm": 0.5023038703887029, + "learning_rate": 4.737702793765855e-06, + "loss": 0.4525, + "step": 7845 + }, + { + "epoch": 3.066213879970363, + "grad_norm": 0.494523096453894, + "learning_rate": 4.737632928675897e-06, + "loss": 0.4439, + "step": 7846 + }, + { + "epoch": 3.0666090392689553, + "grad_norm": 0.48065114804411224, + "learning_rate": 4.7375630547978764e-06, + "loss": 0.4238, + "step": 7847 + }, + { + "epoch": 3.0670041985675476, + "grad_norm": 0.49790523236786666, + "learning_rate": 4.737493172132067e-06, + "loss": 0.4309, + "step": 7848 + }, + { + "epoch": 3.06739935786614, + "grad_norm": 0.4989996884805779, + "learning_rate": 4.737423280678742e-06, + "loss": 0.4512, + "step": 7849 + }, + { + "epoch": 3.067794517164732, + "grad_norm": 0.5016208814260822, + "learning_rate": 4.737353380438178e-06, + "loss": 0.4539, + "step": 7850 + }, + { + "epoch": 3.0681896764633243, + "grad_norm": 0.5134079709987914, + "learning_rate": 4.737283471410649e-06, + "loss": 0.457, + "step": 7851 + }, + { + "epoch": 3.0685848357619165, + "grad_norm": 0.5067457933406915, + "learning_rate": 4.737213553596428e-06, + "loss": 0.4481, + "step": 7852 + }, + { + "epoch": 3.068979995060509, + "grad_norm": 0.4776716218044217, + "learning_rate": 4.73714362699579e-06, + "loss": 0.4383, + "step": 7853 + }, + { + "epoch": 3.069375154359101, + "grad_norm": 0.48957989485980835, + "learning_rate": 4.737073691609012e-06, + "loss": 0.4357, + "step": 7854 + }, + { + "epoch": 3.0697703136576933, + "grad_norm": 0.47676778659524477, + "learning_rate": 4.737003747436366e-06, + "loss": 0.4386, + "step": 7855 + }, + { + "epoch": 3.0701654729562855, + "grad_norm": 0.6191504502491905, + "learning_rate": 4.736933794478128e-06, + "loss": 0.4543, + "step": 7856 + }, + { + "epoch": 3.0705606322548777, + "grad_norm": 0.4942925858350992, + "learning_rate": 4.736863832734573e-06, + "loss": 0.4353, + "step": 7857 + }, + { + "epoch": 3.07095579155347, + "grad_norm": 0.5216216205160417, + "learning_rate": 4.736793862205974e-06, + "loss": 0.4269, + "step": 7858 + }, + { + "epoch": 3.0713509508520622, + "grad_norm": 0.4838627707703042, + "learning_rate": 4.7367238828926075e-06, + "loss": 0.4415, + "step": 7859 + }, + { + "epoch": 3.0717461101506545, + "grad_norm": 0.48668009920048644, + "learning_rate": 4.736653894794748e-06, + "loss": 0.4303, + "step": 7860 + }, + { + "epoch": 3.0721412694492467, + "grad_norm": 0.5167979871026906, + "learning_rate": 4.7365838979126696e-06, + "loss": 0.4665, + "step": 7861 + }, + { + "epoch": 3.072536428747839, + "grad_norm": 0.5067480201831851, + "learning_rate": 4.736513892246648e-06, + "loss": 0.4592, + "step": 7862 + }, + { + "epoch": 3.072931588046431, + "grad_norm": 0.503091144854992, + "learning_rate": 4.736443877796959e-06, + "loss": 0.4509, + "step": 7863 + }, + { + "epoch": 3.0733267473450234, + "grad_norm": 0.4966903825101766, + "learning_rate": 4.736373854563875e-06, + "loss": 0.4393, + "step": 7864 + }, + { + "epoch": 3.0737219066436157, + "grad_norm": 0.5053933985734216, + "learning_rate": 4.736303822547673e-06, + "loss": 0.4505, + "step": 7865 + }, + { + "epoch": 3.074117065942208, + "grad_norm": 0.47840381289455225, + "learning_rate": 4.736233781748627e-06, + "loss": 0.4346, + "step": 7866 + }, + { + "epoch": 3.0745122252408, + "grad_norm": 0.5006357088048857, + "learning_rate": 4.736163732167014e-06, + "loss": 0.4538, + "step": 7867 + }, + { + "epoch": 3.0749073845393924, + "grad_norm": 0.47835652192235645, + "learning_rate": 4.736093673803106e-06, + "loss": 0.4365, + "step": 7868 + }, + { + "epoch": 3.0753025438379846, + "grad_norm": 0.5171050130456775, + "learning_rate": 4.736023606657181e-06, + "loss": 0.4422, + "step": 7869 + }, + { + "epoch": 3.075697703136577, + "grad_norm": 0.4937403544438971, + "learning_rate": 4.735953530729514e-06, + "loss": 0.4316, + "step": 7870 + }, + { + "epoch": 3.076092862435169, + "grad_norm": 0.4959244380471904, + "learning_rate": 4.735883446020377e-06, + "loss": 0.4553, + "step": 7871 + }, + { + "epoch": 3.0764880217337613, + "grad_norm": 0.5016759159070576, + "learning_rate": 4.7358133525300484e-06, + "loss": 0.4445, + "step": 7872 + }, + { + "epoch": 3.0768831810323536, + "grad_norm": 0.48705287229754113, + "learning_rate": 4.735743250258803e-06, + "loss": 0.4469, + "step": 7873 + }, + { + "epoch": 3.077278340330946, + "grad_norm": 0.5092392088990833, + "learning_rate": 4.735673139206915e-06, + "loss": 0.4592, + "step": 7874 + }, + { + "epoch": 3.077673499629538, + "grad_norm": 0.47669040144749475, + "learning_rate": 4.735603019374661e-06, + "loss": 0.4275, + "step": 7875 + }, + { + "epoch": 3.0780686589281303, + "grad_norm": 0.4924439478857504, + "learning_rate": 4.735532890762316e-06, + "loss": 0.4395, + "step": 7876 + }, + { + "epoch": 3.0784638182267225, + "grad_norm": 0.5123885580239934, + "learning_rate": 4.735462753370156e-06, + "loss": 0.4501, + "step": 7877 + }, + { + "epoch": 3.078858977525315, + "grad_norm": 0.4823099934221924, + "learning_rate": 4.735392607198455e-06, + "loss": 0.4275, + "step": 7878 + }, + { + "epoch": 3.079254136823907, + "grad_norm": 0.4857701642431153, + "learning_rate": 4.735322452247489e-06, + "loss": 0.4373, + "step": 7879 + }, + { + "epoch": 3.0796492961224993, + "grad_norm": 0.49163839390605096, + "learning_rate": 4.7352522885175345e-06, + "loss": 0.4455, + "step": 7880 + }, + { + "epoch": 3.0800444554210915, + "grad_norm": 0.4884031270257798, + "learning_rate": 4.735182116008866e-06, + "loss": 0.4582, + "step": 7881 + }, + { + "epoch": 3.0804396147196837, + "grad_norm": 0.4899553724792646, + "learning_rate": 4.7351119347217585e-06, + "loss": 0.4355, + "step": 7882 + }, + { + "epoch": 3.080834774018276, + "grad_norm": 0.4949749498592674, + "learning_rate": 4.73504174465649e-06, + "loss": 0.4525, + "step": 7883 + }, + { + "epoch": 3.0812299333168682, + "grad_norm": 0.4988182734709485, + "learning_rate": 4.734971545813334e-06, + "loss": 0.4498, + "step": 7884 + }, + { + "epoch": 3.0816250926154605, + "grad_norm": 0.49792382572578914, + "learning_rate": 4.734901338192567e-06, + "loss": 0.4562, + "step": 7885 + }, + { + "epoch": 3.0820202519140527, + "grad_norm": 0.4915536561332589, + "learning_rate": 4.734831121794464e-06, + "loss": 0.4463, + "step": 7886 + }, + { + "epoch": 3.082415411212645, + "grad_norm": 0.4970561396674575, + "learning_rate": 4.734760896619302e-06, + "loss": 0.4476, + "step": 7887 + }, + { + "epoch": 3.082810570511237, + "grad_norm": 0.4997419639637845, + "learning_rate": 4.734690662667356e-06, + "loss": 0.4337, + "step": 7888 + }, + { + "epoch": 3.0832057298098294, + "grad_norm": 0.48823979322917044, + "learning_rate": 4.734620419938902e-06, + "loss": 0.4684, + "step": 7889 + }, + { + "epoch": 3.0836008891084217, + "grad_norm": 0.5036100358314696, + "learning_rate": 4.734550168434216e-06, + "loss": 0.4457, + "step": 7890 + }, + { + "epoch": 3.083996048407014, + "grad_norm": 0.4941812609213103, + "learning_rate": 4.734479908153574e-06, + "loss": 0.4448, + "step": 7891 + }, + { + "epoch": 3.084391207705606, + "grad_norm": 0.5061906057278033, + "learning_rate": 4.734409639097253e-06, + "loss": 0.4486, + "step": 7892 + }, + { + "epoch": 3.0847863670041984, + "grad_norm": 0.5100975450233716, + "learning_rate": 4.734339361265526e-06, + "loss": 0.4425, + "step": 7893 + }, + { + "epoch": 3.0851815263027906, + "grad_norm": 0.49861233440246766, + "learning_rate": 4.7342690746586714e-06, + "loss": 0.4356, + "step": 7894 + }, + { + "epoch": 3.085576685601383, + "grad_norm": 0.574904471137688, + "learning_rate": 4.734198779276964e-06, + "loss": 0.4409, + "step": 7895 + }, + { + "epoch": 3.085971844899975, + "grad_norm": 0.4873731771876203, + "learning_rate": 4.734128475120681e-06, + "loss": 0.4405, + "step": 7896 + }, + { + "epoch": 3.0863670041985674, + "grad_norm": 0.4951592433754978, + "learning_rate": 4.7340581621900985e-06, + "loss": 0.4501, + "step": 7897 + }, + { + "epoch": 3.0867621634971596, + "grad_norm": 0.4766631688452397, + "learning_rate": 4.733987840485491e-06, + "loss": 0.4333, + "step": 7898 + }, + { + "epoch": 3.087157322795752, + "grad_norm": 0.4923906837500623, + "learning_rate": 4.733917510007137e-06, + "loss": 0.4383, + "step": 7899 + }, + { + "epoch": 3.087552482094344, + "grad_norm": 0.48241702014723586, + "learning_rate": 4.73384717075531e-06, + "loss": 0.4313, + "step": 7900 + }, + { + "epoch": 3.0879476413929363, + "grad_norm": 0.48868586859443763, + "learning_rate": 4.733776822730289e-06, + "loss": 0.4409, + "step": 7901 + }, + { + "epoch": 3.088342800691529, + "grad_norm": 0.5123299558820024, + "learning_rate": 4.733706465932349e-06, + "loss": 0.4523, + "step": 7902 + }, + { + "epoch": 3.0887379599901212, + "grad_norm": 0.4933673322520625, + "learning_rate": 4.733636100361766e-06, + "loss": 0.4312, + "step": 7903 + }, + { + "epoch": 3.0891331192887135, + "grad_norm": 0.5081997308648453, + "learning_rate": 4.733565726018817e-06, + "loss": 0.4357, + "step": 7904 + }, + { + "epoch": 3.0895282785873057, + "grad_norm": 0.4975799368385473, + "learning_rate": 4.733495342903778e-06, + "loss": 0.4392, + "step": 7905 + }, + { + "epoch": 3.089923437885898, + "grad_norm": 0.5025137130985103, + "learning_rate": 4.733424951016925e-06, + "loss": 0.4734, + "step": 7906 + }, + { + "epoch": 3.09031859718449, + "grad_norm": 0.4977488175268691, + "learning_rate": 4.733354550358536e-06, + "loss": 0.4412, + "step": 7907 + }, + { + "epoch": 3.0907137564830824, + "grad_norm": 0.48503718006898866, + "learning_rate": 4.733284140928886e-06, + "loss": 0.4577, + "step": 7908 + }, + { + "epoch": 3.0911089157816747, + "grad_norm": 0.49140511027996614, + "learning_rate": 4.733213722728251e-06, + "loss": 0.4249, + "step": 7909 + }, + { + "epoch": 3.091504075080267, + "grad_norm": 0.4854895181626003, + "learning_rate": 4.73314329575691e-06, + "loss": 0.4417, + "step": 7910 + }, + { + "epoch": 3.091899234378859, + "grad_norm": 0.49472508088997874, + "learning_rate": 4.733072860015138e-06, + "loss": 0.4447, + "step": 7911 + }, + { + "epoch": 3.0922943936774514, + "grad_norm": 0.5209744346067812, + "learning_rate": 4.7330024155032115e-06, + "loss": 0.4517, + "step": 7912 + }, + { + "epoch": 3.0926895529760436, + "grad_norm": 0.5376195020380707, + "learning_rate": 4.732931962221407e-06, + "loss": 0.4473, + "step": 7913 + }, + { + "epoch": 3.093084712274636, + "grad_norm": 0.48849504260879484, + "learning_rate": 4.732861500170003e-06, + "loss": 0.4362, + "step": 7914 + }, + { + "epoch": 3.093479871573228, + "grad_norm": 0.5121173737122551, + "learning_rate": 4.732791029349274e-06, + "loss": 0.4364, + "step": 7915 + }, + { + "epoch": 3.0938750308718204, + "grad_norm": 0.5013991610345373, + "learning_rate": 4.7327205497594975e-06, + "loss": 0.4413, + "step": 7916 + }, + { + "epoch": 3.0942701901704126, + "grad_norm": 0.5132348413742815, + "learning_rate": 4.732650061400951e-06, + "loss": 0.4609, + "step": 7917 + }, + { + "epoch": 3.094665349469005, + "grad_norm": 0.5140445000952766, + "learning_rate": 4.73257956427391e-06, + "loss": 0.4384, + "step": 7918 + }, + { + "epoch": 3.095060508767597, + "grad_norm": 0.4805026819173038, + "learning_rate": 4.732509058378653e-06, + "loss": 0.4353, + "step": 7919 + }, + { + "epoch": 3.0954556680661893, + "grad_norm": 0.4998603496611781, + "learning_rate": 4.732438543715456e-06, + "loss": 0.4513, + "step": 7920 + }, + { + "epoch": 3.0958508273647816, + "grad_norm": 0.5345924785899526, + "learning_rate": 4.732368020284596e-06, + "loss": 0.4452, + "step": 7921 + }, + { + "epoch": 3.096245986663374, + "grad_norm": 0.5074168698408299, + "learning_rate": 4.732297488086349e-06, + "loss": 0.4556, + "step": 7922 + }, + { + "epoch": 3.096641145961966, + "grad_norm": 0.4959304495710035, + "learning_rate": 4.732226947120995e-06, + "loss": 0.4633, + "step": 7923 + }, + { + "epoch": 3.0970363052605583, + "grad_norm": 0.5729612706103528, + "learning_rate": 4.732156397388807e-06, + "loss": 0.4323, + "step": 7924 + }, + { + "epoch": 3.0974314645591505, + "grad_norm": 0.48429647430521955, + "learning_rate": 4.732085838890064e-06, + "loss": 0.4401, + "step": 7925 + }, + { + "epoch": 3.0978266238577428, + "grad_norm": 0.49722917342119355, + "learning_rate": 4.732015271625045e-06, + "loss": 0.4468, + "step": 7926 + }, + { + "epoch": 3.098221783156335, + "grad_norm": 0.5045870249115458, + "learning_rate": 4.731944695594024e-06, + "loss": 0.4511, + "step": 7927 + }, + { + "epoch": 3.0986169424549272, + "grad_norm": 0.5022592905161292, + "learning_rate": 4.731874110797281e-06, + "loss": 0.4506, + "step": 7928 + }, + { + "epoch": 3.0990121017535195, + "grad_norm": 0.5111603683659854, + "learning_rate": 4.73180351723509e-06, + "loss": 0.4524, + "step": 7929 + }, + { + "epoch": 3.0994072610521117, + "grad_norm": 0.5267537367598453, + "learning_rate": 4.731732914907731e-06, + "loss": 0.4576, + "step": 7930 + }, + { + "epoch": 3.099802420350704, + "grad_norm": 0.5723867018529993, + "learning_rate": 4.731662303815479e-06, + "loss": 0.4464, + "step": 7931 + }, + { + "epoch": 3.100197579649296, + "grad_norm": 0.5002231014728444, + "learning_rate": 4.7315916839586144e-06, + "loss": 0.4518, + "step": 7932 + }, + { + "epoch": 3.1005927389478884, + "grad_norm": 0.49456541023282674, + "learning_rate": 4.731521055337412e-06, + "loss": 0.4395, + "step": 7933 + }, + { + "epoch": 3.1009878982464807, + "grad_norm": 0.48028211739122306, + "learning_rate": 4.73145041795215e-06, + "loss": 0.4491, + "step": 7934 + }, + { + "epoch": 3.101383057545073, + "grad_norm": 0.49265412815653176, + "learning_rate": 4.731379771803106e-06, + "loss": 0.4479, + "step": 7935 + }, + { + "epoch": 3.101778216843665, + "grad_norm": 0.488033887759252, + "learning_rate": 4.731309116890556e-06, + "loss": 0.433, + "step": 7936 + }, + { + "epoch": 3.1021733761422574, + "grad_norm": 0.4979653413020749, + "learning_rate": 4.731238453214781e-06, + "loss": 0.4585, + "step": 7937 + }, + { + "epoch": 3.1025685354408497, + "grad_norm": 0.5024424180826177, + "learning_rate": 4.731167780776055e-06, + "loss": 0.4695, + "step": 7938 + }, + { + "epoch": 3.102963694739442, + "grad_norm": 0.49050531548527476, + "learning_rate": 4.731097099574656e-06, + "loss": 0.4434, + "step": 7939 + }, + { + "epoch": 3.103358854038034, + "grad_norm": 0.4993603539723156, + "learning_rate": 4.731026409610863e-06, + "loss": 0.4562, + "step": 7940 + }, + { + "epoch": 3.1037540133366264, + "grad_norm": 0.4852146754915784, + "learning_rate": 4.7309557108849535e-06, + "loss": 0.4528, + "step": 7941 + }, + { + "epoch": 3.1041491726352186, + "grad_norm": 0.47989325470415545, + "learning_rate": 4.730885003397204e-06, + "loss": 0.4435, + "step": 7942 + }, + { + "epoch": 3.104544331933811, + "grad_norm": 0.5012231199309242, + "learning_rate": 4.730814287147893e-06, + "loss": 0.4349, + "step": 7943 + }, + { + "epoch": 3.104939491232403, + "grad_norm": 0.5077310439859108, + "learning_rate": 4.730743562137299e-06, + "loss": 0.4444, + "step": 7944 + }, + { + "epoch": 3.1053346505309953, + "grad_norm": 0.5353226020763748, + "learning_rate": 4.7306728283656976e-06, + "loss": 0.4208, + "step": 7945 + }, + { + "epoch": 3.1057298098295876, + "grad_norm": 0.4789749092687642, + "learning_rate": 4.7306020858333685e-06, + "loss": 0.4351, + "step": 7946 + }, + { + "epoch": 3.10612496912818, + "grad_norm": 0.49123192197023, + "learning_rate": 4.730531334540589e-06, + "loss": 0.4511, + "step": 7947 + }, + { + "epoch": 3.106520128426772, + "grad_norm": 0.503885581742825, + "learning_rate": 4.730460574487636e-06, + "loss": 0.4593, + "step": 7948 + }, + { + "epoch": 3.1069152877253643, + "grad_norm": 0.4910430506949811, + "learning_rate": 4.7303898056747895e-06, + "loss": 0.4606, + "step": 7949 + }, + { + "epoch": 3.1073104470239565, + "grad_norm": 0.4997712623504369, + "learning_rate": 4.730319028102326e-06, + "loss": 0.4524, + "step": 7950 + }, + { + "epoch": 3.1077056063225488, + "grad_norm": 0.4998843657763657, + "learning_rate": 4.730248241770523e-06, + "loss": 0.4575, + "step": 7951 + }, + { + "epoch": 3.108100765621141, + "grad_norm": 0.5030067437860954, + "learning_rate": 4.730177446679659e-06, + "loss": 0.4403, + "step": 7952 + }, + { + "epoch": 3.1084959249197333, + "grad_norm": 0.5010628496154773, + "learning_rate": 4.730106642830013e-06, + "loss": 0.4491, + "step": 7953 + }, + { + "epoch": 3.1088910842183255, + "grad_norm": 0.49750569778277126, + "learning_rate": 4.730035830221862e-06, + "loss": 0.4488, + "step": 7954 + }, + { + "epoch": 3.1092862435169177, + "grad_norm": 0.5133141372969336, + "learning_rate": 4.729965008855485e-06, + "loss": 0.4502, + "step": 7955 + }, + { + "epoch": 3.10968140281551, + "grad_norm": 0.47517558280691385, + "learning_rate": 4.729894178731159e-06, + "loss": 0.4254, + "step": 7956 + }, + { + "epoch": 3.110076562114102, + "grad_norm": 0.5159606553817537, + "learning_rate": 4.7298233398491625e-06, + "loss": 0.4435, + "step": 7957 + }, + { + "epoch": 3.1104717214126945, + "grad_norm": 0.4979703809694994, + "learning_rate": 4.729752492209774e-06, + "loss": 0.459, + "step": 7958 + }, + { + "epoch": 3.1108668807112867, + "grad_norm": 0.49169143247666647, + "learning_rate": 4.729681635813272e-06, + "loss": 0.454, + "step": 7959 + }, + { + "epoch": 3.111262040009879, + "grad_norm": 0.5197648875864174, + "learning_rate": 4.729610770659934e-06, + "loss": 0.4627, + "step": 7960 + }, + { + "epoch": 3.111657199308471, + "grad_norm": 0.5146456770573503, + "learning_rate": 4.729539896750039e-06, + "loss": 0.4386, + "step": 7961 + }, + { + "epoch": 3.1120523586070634, + "grad_norm": 0.5152953570832932, + "learning_rate": 4.729469014083865e-06, + "loss": 0.4338, + "step": 7962 + }, + { + "epoch": 3.1124475179056557, + "grad_norm": 0.5029822022584626, + "learning_rate": 4.729398122661692e-06, + "loss": 0.4484, + "step": 7963 + }, + { + "epoch": 3.112842677204248, + "grad_norm": 0.48762300399924735, + "learning_rate": 4.729327222483795e-06, + "loss": 0.4572, + "step": 7964 + }, + { + "epoch": 3.11323783650284, + "grad_norm": 0.5078757083645274, + "learning_rate": 4.7292563135504545e-06, + "loss": 0.4435, + "step": 7965 + }, + { + "epoch": 3.1136329958014324, + "grad_norm": 0.4895329614863369, + "learning_rate": 4.72918539586195e-06, + "loss": 0.4424, + "step": 7966 + }, + { + "epoch": 3.1140281551000246, + "grad_norm": 0.48685840804033426, + "learning_rate": 4.729114469418559e-06, + "loss": 0.4529, + "step": 7967 + }, + { + "epoch": 3.114423314398617, + "grad_norm": 0.5011426297857062, + "learning_rate": 4.729043534220559e-06, + "loss": 0.4412, + "step": 7968 + }, + { + "epoch": 3.114818473697209, + "grad_norm": 0.5086734828450132, + "learning_rate": 4.728972590268229e-06, + "loss": 0.4444, + "step": 7969 + }, + { + "epoch": 3.1152136329958013, + "grad_norm": 0.4872906939792074, + "learning_rate": 4.728901637561849e-06, + "loss": 0.4318, + "step": 7970 + }, + { + "epoch": 3.1156087922943936, + "grad_norm": 0.4894151559799161, + "learning_rate": 4.7288306761016976e-06, + "loss": 0.4377, + "step": 7971 + }, + { + "epoch": 3.116003951592986, + "grad_norm": 0.5055954518080974, + "learning_rate": 4.7287597058880516e-06, + "loss": 0.4496, + "step": 7972 + }, + { + "epoch": 3.116399110891578, + "grad_norm": 0.5148624398253207, + "learning_rate": 4.728688726921191e-06, + "loss": 0.4341, + "step": 7973 + }, + { + "epoch": 3.1167942701901703, + "grad_norm": 0.5155539975890019, + "learning_rate": 4.728617739201396e-06, + "loss": 0.4545, + "step": 7974 + }, + { + "epoch": 3.1171894294887625, + "grad_norm": 0.4864073054526971, + "learning_rate": 4.728546742728941e-06, + "loss": 0.4356, + "step": 7975 + }, + { + "epoch": 3.117584588787355, + "grad_norm": 0.48348184096928454, + "learning_rate": 4.728475737504109e-06, + "loss": 0.4346, + "step": 7976 + }, + { + "epoch": 3.117979748085947, + "grad_norm": 0.4769632382044736, + "learning_rate": 4.728404723527178e-06, + "loss": 0.4325, + "step": 7977 + }, + { + "epoch": 3.1183749073845393, + "grad_norm": 0.4977904405800571, + "learning_rate": 4.728333700798427e-06, + "loss": 0.4431, + "step": 7978 + }, + { + "epoch": 3.1187700666831315, + "grad_norm": 0.4786988797880768, + "learning_rate": 4.728262669318133e-06, + "loss": 0.4377, + "step": 7979 + }, + { + "epoch": 3.1191652259817237, + "grad_norm": 0.49376051451383535, + "learning_rate": 4.728191629086576e-06, + "loss": 0.4405, + "step": 7980 + }, + { + "epoch": 3.119560385280316, + "grad_norm": 0.49204273770267093, + "learning_rate": 4.728120580104036e-06, + "loss": 0.4542, + "step": 7981 + }, + { + "epoch": 3.1199555445789082, + "grad_norm": 0.47099976061807647, + "learning_rate": 4.728049522370791e-06, + "loss": 0.4385, + "step": 7982 + }, + { + "epoch": 3.1203507038775005, + "grad_norm": 0.5025481626978274, + "learning_rate": 4.727978455887121e-06, + "loss": 0.462, + "step": 7983 + }, + { + "epoch": 3.1207458631760927, + "grad_norm": 0.5017733662110474, + "learning_rate": 4.727907380653305e-06, + "loss": 0.4428, + "step": 7984 + }, + { + "epoch": 3.121141022474685, + "grad_norm": 0.483833290731428, + "learning_rate": 4.7278362966696204e-06, + "loss": 0.431, + "step": 7985 + }, + { + "epoch": 3.121536181773277, + "grad_norm": 0.5083971021184884, + "learning_rate": 4.727765203936348e-06, + "loss": 0.4337, + "step": 7986 + }, + { + "epoch": 3.1219313410718694, + "grad_norm": 0.49782182938515346, + "learning_rate": 4.727694102453767e-06, + "loss": 0.4353, + "step": 7987 + }, + { + "epoch": 3.1223265003704617, + "grad_norm": 0.5092377334599689, + "learning_rate": 4.727622992222156e-06, + "loss": 0.4453, + "step": 7988 + }, + { + "epoch": 3.122721659669054, + "grad_norm": 0.4911234394591529, + "learning_rate": 4.7275518732417945e-06, + "loss": 0.4348, + "step": 7989 + }, + { + "epoch": 3.123116818967646, + "grad_norm": 0.4745444703266985, + "learning_rate": 4.727480745512962e-06, + "loss": 0.4433, + "step": 7990 + }, + { + "epoch": 3.1235119782662384, + "grad_norm": 0.4882371574102391, + "learning_rate": 4.727409609035938e-06, + "loss": 0.444, + "step": 7991 + }, + { + "epoch": 3.1239071375648306, + "grad_norm": 0.49239501433303656, + "learning_rate": 4.727338463811002e-06, + "loss": 0.4373, + "step": 7992 + }, + { + "epoch": 3.124302296863423, + "grad_norm": 0.48170385902308394, + "learning_rate": 4.727267309838432e-06, + "loss": 0.442, + "step": 7993 + }, + { + "epoch": 3.124697456162015, + "grad_norm": 0.49440788201479674, + "learning_rate": 4.727196147118509e-06, + "loss": 0.4366, + "step": 7994 + }, + { + "epoch": 3.1250926154606073, + "grad_norm": 0.48255932560969295, + "learning_rate": 4.727124975651512e-06, + "loss": 0.4361, + "step": 7995 + }, + { + "epoch": 3.1254877747591996, + "grad_norm": 0.49476276790071294, + "learning_rate": 4.727053795437721e-06, + "loss": 0.4573, + "step": 7996 + }, + { + "epoch": 3.125882934057792, + "grad_norm": 0.49896678550201234, + "learning_rate": 4.726982606477414e-06, + "loss": 0.4553, + "step": 7997 + }, + { + "epoch": 3.126278093356384, + "grad_norm": 0.47886145773353866, + "learning_rate": 4.7269114087708714e-06, + "loss": 0.4255, + "step": 7998 + }, + { + "epoch": 3.1266732526549763, + "grad_norm": 0.50038796905005, + "learning_rate": 4.7268402023183736e-06, + "loss": 0.444, + "step": 7999 + }, + { + "epoch": 3.1270684119535685, + "grad_norm": 0.49176360838395583, + "learning_rate": 4.7267689871201995e-06, + "loss": 0.4473, + "step": 8000 + }, + { + "epoch": 3.1274635712521612, + "grad_norm": 0.4995031801626146, + "learning_rate": 4.72669776317663e-06, + "loss": 0.4583, + "step": 8001 + }, + { + "epoch": 3.1278587305507535, + "grad_norm": 0.48056503087598096, + "learning_rate": 4.726626530487943e-06, + "loss": 0.4377, + "step": 8002 + }, + { + "epoch": 3.1282538898493457, + "grad_norm": 0.5014416609905591, + "learning_rate": 4.726555289054419e-06, + "loss": 0.4422, + "step": 8003 + }, + { + "epoch": 3.128649049147938, + "grad_norm": 0.49552073766113186, + "learning_rate": 4.726484038876338e-06, + "loss": 0.4479, + "step": 8004 + }, + { + "epoch": 3.12904420844653, + "grad_norm": 0.5238464019017981, + "learning_rate": 4.726412779953979e-06, + "loss": 0.4569, + "step": 8005 + }, + { + "epoch": 3.1294393677451224, + "grad_norm": 0.5077356869403138, + "learning_rate": 4.726341512287623e-06, + "loss": 0.4641, + "step": 8006 + }, + { + "epoch": 3.1298345270437147, + "grad_norm": 0.5100490606005, + "learning_rate": 4.72627023587755e-06, + "loss": 0.4478, + "step": 8007 + }, + { + "epoch": 3.130229686342307, + "grad_norm": 0.48960448306534565, + "learning_rate": 4.726198950724039e-06, + "loss": 0.437, + "step": 8008 + }, + { + "epoch": 3.130624845640899, + "grad_norm": 0.483619465818084, + "learning_rate": 4.726127656827371e-06, + "loss": 0.4476, + "step": 8009 + }, + { + "epoch": 3.1310200049394914, + "grad_norm": 0.5085324589140995, + "learning_rate": 4.726056354187825e-06, + "loss": 0.4336, + "step": 8010 + }, + { + "epoch": 3.1314151642380836, + "grad_norm": 0.5078606194501115, + "learning_rate": 4.725985042805681e-06, + "loss": 0.4465, + "step": 8011 + }, + { + "epoch": 3.131810323536676, + "grad_norm": 0.49723065197368327, + "learning_rate": 4.725913722681219e-06, + "loss": 0.4538, + "step": 8012 + }, + { + "epoch": 3.132205482835268, + "grad_norm": 0.5007007217832884, + "learning_rate": 4.72584239381472e-06, + "loss": 0.4738, + "step": 8013 + }, + { + "epoch": 3.1326006421338604, + "grad_norm": 0.4898205436347479, + "learning_rate": 4.725771056206464e-06, + "loss": 0.4344, + "step": 8014 + }, + { + "epoch": 3.1329958014324526, + "grad_norm": 0.5166831822498245, + "learning_rate": 4.725699709856731e-06, + "loss": 0.4412, + "step": 8015 + }, + { + "epoch": 3.133390960731045, + "grad_norm": 0.48591617847536656, + "learning_rate": 4.7256283547658e-06, + "loss": 0.4383, + "step": 8016 + }, + { + "epoch": 3.133786120029637, + "grad_norm": 0.49374057323016374, + "learning_rate": 4.725556990933953e-06, + "loss": 0.4509, + "step": 8017 + }, + { + "epoch": 3.1341812793282293, + "grad_norm": 0.48914627951452694, + "learning_rate": 4.72548561836147e-06, + "loss": 0.432, + "step": 8018 + }, + { + "epoch": 3.1345764386268216, + "grad_norm": 0.48980290819549344, + "learning_rate": 4.72541423704863e-06, + "loss": 0.4362, + "step": 8019 + }, + { + "epoch": 3.134971597925414, + "grad_norm": 0.49924757675061804, + "learning_rate": 4.7253428469957144e-06, + "loss": 0.4405, + "step": 8020 + }, + { + "epoch": 3.135366757224006, + "grad_norm": 0.5080074429751118, + "learning_rate": 4.725271448203003e-06, + "loss": 0.4477, + "step": 8021 + }, + { + "epoch": 3.1357619165225983, + "grad_norm": 0.5140176692672513, + "learning_rate": 4.7252000406707775e-06, + "loss": 0.4527, + "step": 8022 + }, + { + "epoch": 3.1361570758211905, + "grad_norm": 0.506032454034494, + "learning_rate": 4.725128624399318e-06, + "loss": 0.4488, + "step": 8023 + }, + { + "epoch": 3.1365522351197828, + "grad_norm": 0.49308027132547305, + "learning_rate": 4.725057199388903e-06, + "loss": 0.4351, + "step": 8024 + }, + { + "epoch": 3.136947394418375, + "grad_norm": 0.5104007123637837, + "learning_rate": 4.724985765639815e-06, + "loss": 0.4602, + "step": 8025 + }, + { + "epoch": 3.1373425537169672, + "grad_norm": 0.48895688295994194, + "learning_rate": 4.7249143231523345e-06, + "loss": 0.4409, + "step": 8026 + }, + { + "epoch": 3.1377377130155595, + "grad_norm": 0.5064246682765903, + "learning_rate": 4.724842871926741e-06, + "loss": 0.4379, + "step": 8027 + }, + { + "epoch": 3.1381328723141517, + "grad_norm": 0.5069265300363764, + "learning_rate": 4.724771411963316e-06, + "loss": 0.4647, + "step": 8028 + }, + { + "epoch": 3.138528031612744, + "grad_norm": 0.5030257723209636, + "learning_rate": 4.72469994326234e-06, + "loss": 0.4447, + "step": 8029 + }, + { + "epoch": 3.138923190911336, + "grad_norm": 0.5051475543956928, + "learning_rate": 4.724628465824093e-06, + "loss": 0.4457, + "step": 8030 + }, + { + "epoch": 3.1393183502099284, + "grad_norm": 0.47809073280195014, + "learning_rate": 4.724556979648856e-06, + "loss": 0.4483, + "step": 8031 + }, + { + "epoch": 3.1397135095085207, + "grad_norm": 0.5238988459465759, + "learning_rate": 4.724485484736911e-06, + "loss": 0.4503, + "step": 8032 + }, + { + "epoch": 3.140108668807113, + "grad_norm": 0.4964702713402975, + "learning_rate": 4.724413981088537e-06, + "loss": 0.4453, + "step": 8033 + }, + { + "epoch": 3.140503828105705, + "grad_norm": 0.49721437049653233, + "learning_rate": 4.724342468704016e-06, + "loss": 0.4476, + "step": 8034 + }, + { + "epoch": 3.1408989874042974, + "grad_norm": 0.47752057571805373, + "learning_rate": 4.724270947583628e-06, + "loss": 0.4351, + "step": 8035 + }, + { + "epoch": 3.1412941467028896, + "grad_norm": 0.4937090612310495, + "learning_rate": 4.724199417727654e-06, + "loss": 0.4579, + "step": 8036 + }, + { + "epoch": 3.141689306001482, + "grad_norm": 0.48947682743996235, + "learning_rate": 4.724127879136377e-06, + "loss": 0.4379, + "step": 8037 + }, + { + "epoch": 3.142084465300074, + "grad_norm": 0.5001718495358742, + "learning_rate": 4.7240563318100755e-06, + "loss": 0.438, + "step": 8038 + }, + { + "epoch": 3.1424796245986664, + "grad_norm": 0.5014707511961164, + "learning_rate": 4.723984775749031e-06, + "loss": 0.4457, + "step": 8039 + }, + { + "epoch": 3.1428747838972586, + "grad_norm": 0.5028261244621974, + "learning_rate": 4.7239132109535245e-06, + "loss": 0.4604, + "step": 8040 + }, + { + "epoch": 3.143269943195851, + "grad_norm": 0.4934107578955984, + "learning_rate": 4.723841637423837e-06, + "loss": 0.4377, + "step": 8041 + }, + { + "epoch": 3.143665102494443, + "grad_norm": 0.48165968429853356, + "learning_rate": 4.723770055160251e-06, + "loss": 0.4519, + "step": 8042 + }, + { + "epoch": 3.1440602617930353, + "grad_norm": 0.49320305911875595, + "learning_rate": 4.723698464163046e-06, + "loss": 0.4551, + "step": 8043 + }, + { + "epoch": 3.1444554210916276, + "grad_norm": 0.5151414275313321, + "learning_rate": 4.723626864432504e-06, + "loss": 0.4411, + "step": 8044 + }, + { + "epoch": 3.14485058039022, + "grad_norm": 0.48841016631789297, + "learning_rate": 4.723555255968906e-06, + "loss": 0.4375, + "step": 8045 + }, + { + "epoch": 3.145245739688812, + "grad_norm": 0.49755737281596757, + "learning_rate": 4.723483638772532e-06, + "loss": 0.4511, + "step": 8046 + }, + { + "epoch": 3.1456408989874043, + "grad_norm": 0.5133698387807278, + "learning_rate": 4.723412012843666e-06, + "loss": 0.4507, + "step": 8047 + }, + { + "epoch": 3.1460360582859965, + "grad_norm": 0.5113782564797575, + "learning_rate": 4.723340378182587e-06, + "loss": 0.4471, + "step": 8048 + }, + { + "epoch": 3.1464312175845888, + "grad_norm": 0.5025143915593622, + "learning_rate": 4.7232687347895775e-06, + "loss": 0.4506, + "step": 8049 + }, + { + "epoch": 3.146826376883181, + "grad_norm": 0.5090332482162311, + "learning_rate": 4.7231970826649185e-06, + "loss": 0.4601, + "step": 8050 + }, + { + "epoch": 3.1472215361817732, + "grad_norm": 0.5129767234895644, + "learning_rate": 4.7231254218088906e-06, + "loss": 0.4339, + "step": 8051 + }, + { + "epoch": 3.1476166954803655, + "grad_norm": 0.4931726199572382, + "learning_rate": 4.723053752221777e-06, + "loss": 0.4628, + "step": 8052 + }, + { + "epoch": 3.1480118547789577, + "grad_norm": 0.4761021361758315, + "learning_rate": 4.722982073903857e-06, + "loss": 0.4111, + "step": 8053 + }, + { + "epoch": 3.14840701407755, + "grad_norm": 0.49121750684116305, + "learning_rate": 4.722910386855414e-06, + "loss": 0.4332, + "step": 8054 + }, + { + "epoch": 3.148802173376142, + "grad_norm": 0.49386756521790587, + "learning_rate": 4.722838691076729e-06, + "loss": 0.432, + "step": 8055 + }, + { + "epoch": 3.1491973326747345, + "grad_norm": 0.4962985239368982, + "learning_rate": 4.722766986568083e-06, + "loss": 0.4472, + "step": 8056 + }, + { + "epoch": 3.1495924919733267, + "grad_norm": 0.49652014240893927, + "learning_rate": 4.722695273329758e-06, + "loss": 0.4585, + "step": 8057 + }, + { + "epoch": 3.149987651271919, + "grad_norm": 0.4984442502510313, + "learning_rate": 4.722623551362036e-06, + "loss": 0.4285, + "step": 8058 + }, + { + "epoch": 3.150382810570511, + "grad_norm": 0.4923794585945551, + "learning_rate": 4.7225518206651975e-06, + "loss": 0.451, + "step": 8059 + }, + { + "epoch": 3.1507779698691034, + "grad_norm": 0.492056259052196, + "learning_rate": 4.722480081239527e-06, + "loss": 0.4423, + "step": 8060 + }, + { + "epoch": 3.1511731291676957, + "grad_norm": 0.4936357538899125, + "learning_rate": 4.7224083330853025e-06, + "loss": 0.4449, + "step": 8061 + }, + { + "epoch": 3.151568288466288, + "grad_norm": 0.4891054280607878, + "learning_rate": 4.722336576202808e-06, + "loss": 0.4607, + "step": 8062 + }, + { + "epoch": 3.15196344776488, + "grad_norm": 0.5006228316341875, + "learning_rate": 4.722264810592325e-06, + "loss": 0.4578, + "step": 8063 + }, + { + "epoch": 3.1523586070634724, + "grad_norm": 0.49583466507150603, + "learning_rate": 4.722193036254135e-06, + "loss": 0.4505, + "step": 8064 + }, + { + "epoch": 3.1527537663620646, + "grad_norm": 0.5029343842600947, + "learning_rate": 4.722121253188521e-06, + "loss": 0.4449, + "step": 8065 + }, + { + "epoch": 3.153148925660657, + "grad_norm": 0.5118215295573837, + "learning_rate": 4.722049461395763e-06, + "loss": 0.434, + "step": 8066 + }, + { + "epoch": 3.153544084959249, + "grad_norm": 0.4861987832676822, + "learning_rate": 4.721977660876144e-06, + "loss": 0.4308, + "step": 8067 + }, + { + "epoch": 3.1539392442578413, + "grad_norm": 0.5021670172513321, + "learning_rate": 4.721905851629947e-06, + "loss": 0.447, + "step": 8068 + }, + { + "epoch": 3.1543344035564336, + "grad_norm": 0.4869993622406486, + "learning_rate": 4.721834033657452e-06, + "loss": 0.4596, + "step": 8069 + }, + { + "epoch": 3.154729562855026, + "grad_norm": 0.515713215408121, + "learning_rate": 4.721762206958943e-06, + "loss": 0.4579, + "step": 8070 + }, + { + "epoch": 3.155124722153618, + "grad_norm": 0.5056235944038723, + "learning_rate": 4.7216903715347005e-06, + "loss": 0.4692, + "step": 8071 + }, + { + "epoch": 3.1555198814522103, + "grad_norm": 0.506279979442091, + "learning_rate": 4.721618527385008e-06, + "loss": 0.4411, + "step": 8072 + }, + { + "epoch": 3.1559150407508025, + "grad_norm": 0.507576731306934, + "learning_rate": 4.721546674510146e-06, + "loss": 0.4476, + "step": 8073 + }, + { + "epoch": 3.1563102000493948, + "grad_norm": 0.4959496994776162, + "learning_rate": 4.721474812910398e-06, + "loss": 0.4491, + "step": 8074 + }, + { + "epoch": 3.156705359347987, + "grad_norm": 0.48307075497150315, + "learning_rate": 4.721402942586046e-06, + "loss": 0.4518, + "step": 8075 + }, + { + "epoch": 3.1571005186465793, + "grad_norm": 0.5009750695914092, + "learning_rate": 4.721331063537372e-06, + "loss": 0.4293, + "step": 8076 + }, + { + "epoch": 3.1574956779451715, + "grad_norm": 0.5279332021896078, + "learning_rate": 4.721259175764659e-06, + "loss": 0.4506, + "step": 8077 + }, + { + "epoch": 3.1578908372437637, + "grad_norm": 0.5166606359510496, + "learning_rate": 4.721187279268189e-06, + "loss": 0.4785, + "step": 8078 + }, + { + "epoch": 3.158285996542356, + "grad_norm": 0.4957762206244525, + "learning_rate": 4.721115374048243e-06, + "loss": 0.4429, + "step": 8079 + }, + { + "epoch": 3.158681155840948, + "grad_norm": 0.5130296220814176, + "learning_rate": 4.721043460105106e-06, + "loss": 0.4582, + "step": 8080 + }, + { + "epoch": 3.1590763151395405, + "grad_norm": 0.5084021122302967, + "learning_rate": 4.720971537439058e-06, + "loss": 0.4468, + "step": 8081 + }, + { + "epoch": 3.1594714744381327, + "grad_norm": 0.5024239703569898, + "learning_rate": 4.720899606050382e-06, + "loss": 0.4391, + "step": 8082 + }, + { + "epoch": 3.159866633736725, + "grad_norm": 0.49038806705570115, + "learning_rate": 4.720827665939362e-06, + "loss": 0.4273, + "step": 8083 + }, + { + "epoch": 3.160261793035317, + "grad_norm": 0.5006231223006984, + "learning_rate": 4.720755717106278e-06, + "loss": 0.4549, + "step": 8084 + }, + { + "epoch": 3.1606569523339094, + "grad_norm": 0.5146917436143081, + "learning_rate": 4.7206837595514155e-06, + "loss": 0.4446, + "step": 8085 + }, + { + "epoch": 3.1610521116325017, + "grad_norm": 0.502995584917966, + "learning_rate": 4.720611793275055e-06, + "loss": 0.4453, + "step": 8086 + }, + { + "epoch": 3.161447270931094, + "grad_norm": 0.49610937331042165, + "learning_rate": 4.7205398182774806e-06, + "loss": 0.4585, + "step": 8087 + }, + { + "epoch": 3.161842430229686, + "grad_norm": 0.4843164971279493, + "learning_rate": 4.720467834558973e-06, + "loss": 0.4481, + "step": 8088 + }, + { + "epoch": 3.1622375895282784, + "grad_norm": 0.5128044968276422, + "learning_rate": 4.720395842119817e-06, + "loss": 0.4687, + "step": 8089 + }, + { + "epoch": 3.162632748826871, + "grad_norm": 0.48798274809495723, + "learning_rate": 4.7203238409602936e-06, + "loss": 0.4476, + "step": 8090 + }, + { + "epoch": 3.1630279081254633, + "grad_norm": 0.4928931238418614, + "learning_rate": 4.720251831080687e-06, + "loss": 0.4345, + "step": 8091 + }, + { + "epoch": 3.1634230674240555, + "grad_norm": 0.5669879530273516, + "learning_rate": 4.720179812481279e-06, + "loss": 0.4412, + "step": 8092 + }, + { + "epoch": 3.163818226722648, + "grad_norm": 0.5067692284751482, + "learning_rate": 4.720107785162353e-06, + "loss": 0.4427, + "step": 8093 + }, + { + "epoch": 3.16421338602124, + "grad_norm": 0.48670841000596843, + "learning_rate": 4.7200357491241925e-06, + "loss": 0.4391, + "step": 8094 + }, + { + "epoch": 3.1646085453198323, + "grad_norm": 0.49696643464950435, + "learning_rate": 4.71996370436708e-06, + "loss": 0.4545, + "step": 8095 + }, + { + "epoch": 3.1650037046184245, + "grad_norm": 0.5074530215661209, + "learning_rate": 4.719891650891296e-06, + "loss": 0.4583, + "step": 8096 + }, + { + "epoch": 3.1653988639170167, + "grad_norm": 0.49354704231252805, + "learning_rate": 4.719819588697127e-06, + "loss": 0.4461, + "step": 8097 + }, + { + "epoch": 3.165794023215609, + "grad_norm": 0.4963673335562395, + "learning_rate": 4.719747517784854e-06, + "loss": 0.4349, + "step": 8098 + }, + { + "epoch": 3.1661891825142012, + "grad_norm": 0.4904154243050707, + "learning_rate": 4.719675438154761e-06, + "loss": 0.4493, + "step": 8099 + }, + { + "epoch": 3.1665843418127935, + "grad_norm": 0.47953064206220075, + "learning_rate": 4.719603349807132e-06, + "loss": 0.4484, + "step": 8100 + }, + { + "epoch": 3.1669795011113857, + "grad_norm": 0.47907940594638293, + "learning_rate": 4.719531252742246e-06, + "loss": 0.4405, + "step": 8101 + }, + { + "epoch": 3.167374660409978, + "grad_norm": 0.4898293576798917, + "learning_rate": 4.7194591469603915e-06, + "loss": 0.4465, + "step": 8102 + }, + { + "epoch": 3.16776981970857, + "grad_norm": 0.4894519791280581, + "learning_rate": 4.7193870324618486e-06, + "loss": 0.4411, + "step": 8103 + }, + { + "epoch": 3.1681649790071624, + "grad_norm": 0.5194593246729222, + "learning_rate": 4.7193149092469e-06, + "loss": 0.4674, + "step": 8104 + }, + { + "epoch": 3.1685601383057547, + "grad_norm": 0.4838702824968695, + "learning_rate": 4.719242777315831e-06, + "loss": 0.4505, + "step": 8105 + }, + { + "epoch": 3.168955297604347, + "grad_norm": 0.506659585187492, + "learning_rate": 4.719170636668924e-06, + "loss": 0.4573, + "step": 8106 + }, + { + "epoch": 3.169350456902939, + "grad_norm": 0.487503557571576, + "learning_rate": 4.719098487306463e-06, + "loss": 0.4403, + "step": 8107 + }, + { + "epoch": 3.1697456162015314, + "grad_norm": 0.4876179319034556, + "learning_rate": 4.7190263292287296e-06, + "loss": 0.4379, + "step": 8108 + }, + { + "epoch": 3.1701407755001236, + "grad_norm": 0.5106552416620477, + "learning_rate": 4.718954162436008e-06, + "loss": 0.4282, + "step": 8109 + }, + { + "epoch": 3.170535934798716, + "grad_norm": 0.48302060495170895, + "learning_rate": 4.718881986928583e-06, + "loss": 0.4424, + "step": 8110 + }, + { + "epoch": 3.170931094097308, + "grad_norm": 0.4958854915432569, + "learning_rate": 4.718809802706736e-06, + "loss": 0.4507, + "step": 8111 + }, + { + "epoch": 3.1713262533959004, + "grad_norm": 0.4932554091257428, + "learning_rate": 4.7187376097707515e-06, + "loss": 0.4481, + "step": 8112 + }, + { + "epoch": 3.1717214126944926, + "grad_norm": 0.48904571684029674, + "learning_rate": 4.718665408120913e-06, + "loss": 0.4415, + "step": 8113 + }, + { + "epoch": 3.172116571993085, + "grad_norm": 0.5168961069991919, + "learning_rate": 4.718593197757505e-06, + "loss": 0.4599, + "step": 8114 + }, + { + "epoch": 3.172511731291677, + "grad_norm": 0.5063108372391634, + "learning_rate": 4.718520978680809e-06, + "loss": 0.4517, + "step": 8115 + }, + { + "epoch": 3.1729068905902693, + "grad_norm": 0.5007293842416772, + "learning_rate": 4.71844875089111e-06, + "loss": 0.4267, + "step": 8116 + }, + { + "epoch": 3.1733020498888616, + "grad_norm": 0.48180519881194206, + "learning_rate": 4.718376514388691e-06, + "loss": 0.4348, + "step": 8117 + }, + { + "epoch": 3.173697209187454, + "grad_norm": 0.5089904727964285, + "learning_rate": 4.718304269173837e-06, + "loss": 0.4541, + "step": 8118 + }, + { + "epoch": 3.174092368486046, + "grad_norm": 0.48877774632457816, + "learning_rate": 4.718232015246831e-06, + "loss": 0.4376, + "step": 8119 + }, + { + "epoch": 3.1744875277846383, + "grad_norm": 0.4891031842232021, + "learning_rate": 4.718159752607955e-06, + "loss": 0.4461, + "step": 8120 + }, + { + "epoch": 3.1748826870832305, + "grad_norm": 0.5040292533478185, + "learning_rate": 4.718087481257496e-06, + "loss": 0.4475, + "step": 8121 + }, + { + "epoch": 3.1752778463818228, + "grad_norm": 0.4981159124676147, + "learning_rate": 4.718015201195736e-06, + "loss": 0.45, + "step": 8122 + }, + { + "epoch": 3.175673005680415, + "grad_norm": 0.489987198323066, + "learning_rate": 4.717942912422958e-06, + "loss": 0.4469, + "step": 8123 + }, + { + "epoch": 3.1760681649790072, + "grad_norm": 0.5132570158383588, + "learning_rate": 4.717870614939449e-06, + "loss": 0.4671, + "step": 8124 + }, + { + "epoch": 3.1764633242775995, + "grad_norm": 0.4838179306406595, + "learning_rate": 4.717798308745489e-06, + "loss": 0.4483, + "step": 8125 + }, + { + "epoch": 3.1768584835761917, + "grad_norm": 0.49436482089246664, + "learning_rate": 4.717725993841366e-06, + "loss": 0.4505, + "step": 8126 + }, + { + "epoch": 3.177253642874784, + "grad_norm": 0.4927294185258886, + "learning_rate": 4.71765367022736e-06, + "loss": 0.4607, + "step": 8127 + }, + { + "epoch": 3.177648802173376, + "grad_norm": 0.5023211446663597, + "learning_rate": 4.717581337903759e-06, + "loss": 0.4412, + "step": 8128 + }, + { + "epoch": 3.1780439614719684, + "grad_norm": 0.4962184544429156, + "learning_rate": 4.717508996870843e-06, + "loss": 0.4472, + "step": 8129 + }, + { + "epoch": 3.1784391207705607, + "grad_norm": 0.5060665727054257, + "learning_rate": 4.717436647128899e-06, + "loss": 0.4483, + "step": 8130 + }, + { + "epoch": 3.178834280069153, + "grad_norm": 0.4856256691010014, + "learning_rate": 4.717364288678211e-06, + "loss": 0.4531, + "step": 8131 + }, + { + "epoch": 3.179229439367745, + "grad_norm": 0.5031699859568227, + "learning_rate": 4.717291921519062e-06, + "loss": 0.4601, + "step": 8132 + }, + { + "epoch": 3.1796245986663374, + "grad_norm": 0.49801437420123507, + "learning_rate": 4.717219545651736e-06, + "loss": 0.4482, + "step": 8133 + }, + { + "epoch": 3.1800197579649296, + "grad_norm": 0.5050121211282538, + "learning_rate": 4.71714716107652e-06, + "loss": 0.4642, + "step": 8134 + }, + { + "epoch": 3.180414917263522, + "grad_norm": 0.5261624095797713, + "learning_rate": 4.717074767793695e-06, + "loss": 0.4363, + "step": 8135 + }, + { + "epoch": 3.180810076562114, + "grad_norm": 0.5003002025261412, + "learning_rate": 4.717002365803547e-06, + "loss": 0.443, + "step": 8136 + }, + { + "epoch": 3.1812052358607064, + "grad_norm": 0.5162727913171745, + "learning_rate": 4.716929955106359e-06, + "loss": 0.4556, + "step": 8137 + }, + { + "epoch": 3.1816003951592986, + "grad_norm": 0.50383251369054, + "learning_rate": 4.716857535702417e-06, + "loss": 0.4476, + "step": 8138 + }, + { + "epoch": 3.181995554457891, + "grad_norm": 0.4989731953152965, + "learning_rate": 4.716785107592005e-06, + "loss": 0.4505, + "step": 8139 + }, + { + "epoch": 3.182390713756483, + "grad_norm": 0.4985898123702961, + "learning_rate": 4.716712670775407e-06, + "loss": 0.4513, + "step": 8140 + }, + { + "epoch": 3.1827858730550753, + "grad_norm": 0.4949371574614923, + "learning_rate": 4.7166402252529075e-06, + "loss": 0.4486, + "step": 8141 + }, + { + "epoch": 3.1831810323536676, + "grad_norm": 0.5050331893640804, + "learning_rate": 4.716567771024792e-06, + "loss": 0.457, + "step": 8142 + }, + { + "epoch": 3.18357619165226, + "grad_norm": 0.4860144273306709, + "learning_rate": 4.716495308091343e-06, + "loss": 0.4353, + "step": 8143 + }, + { + "epoch": 3.183971350950852, + "grad_norm": 0.48399721239140636, + "learning_rate": 4.716422836452846e-06, + "loss": 0.4373, + "step": 8144 + }, + { + "epoch": 3.1843665102494443, + "grad_norm": 0.4806840238583811, + "learning_rate": 4.716350356109588e-06, + "loss": 0.4395, + "step": 8145 + }, + { + "epoch": 3.1847616695480365, + "grad_norm": 0.4905696278198881, + "learning_rate": 4.716277867061851e-06, + "loss": 0.4521, + "step": 8146 + }, + { + "epoch": 3.1851568288466288, + "grad_norm": 0.5052154287634858, + "learning_rate": 4.716205369309919e-06, + "loss": 0.4654, + "step": 8147 + }, + { + "epoch": 3.185551988145221, + "grad_norm": 0.4976670668769747, + "learning_rate": 4.716132862854079e-06, + "loss": 0.4656, + "step": 8148 + }, + { + "epoch": 3.1859471474438132, + "grad_norm": 0.4856606394398254, + "learning_rate": 4.716060347694615e-06, + "loss": 0.4312, + "step": 8149 + }, + { + "epoch": 3.1863423067424055, + "grad_norm": 0.47616575166124564, + "learning_rate": 4.7159878238318116e-06, + "loss": 0.4321, + "step": 8150 + }, + { + "epoch": 3.1867374660409977, + "grad_norm": 0.4993342307580878, + "learning_rate": 4.715915291265954e-06, + "loss": 0.4534, + "step": 8151 + }, + { + "epoch": 3.18713262533959, + "grad_norm": 0.5057600356168487, + "learning_rate": 4.715842749997325e-06, + "loss": 0.454, + "step": 8152 + }, + { + "epoch": 3.187527784638182, + "grad_norm": 0.5076197913058856, + "learning_rate": 4.715770200026213e-06, + "loss": 0.449, + "step": 8153 + }, + { + "epoch": 3.1879229439367744, + "grad_norm": 0.47904704822140237, + "learning_rate": 4.7156976413529e-06, + "loss": 0.4405, + "step": 8154 + }, + { + "epoch": 3.1883181032353667, + "grad_norm": 0.478740457872185, + "learning_rate": 4.7156250739776725e-06, + "loss": 0.4527, + "step": 8155 + }, + { + "epoch": 3.188713262533959, + "grad_norm": 0.4865375368517737, + "learning_rate": 4.715552497900815e-06, + "loss": 0.4664, + "step": 8156 + }, + { + "epoch": 3.189108421832551, + "grad_norm": 0.49057170271705297, + "learning_rate": 4.715479913122613e-06, + "loss": 0.4498, + "step": 8157 + }, + { + "epoch": 3.1895035811311434, + "grad_norm": 0.48932211501442835, + "learning_rate": 4.715407319643352e-06, + "loss": 0.438, + "step": 8158 + }, + { + "epoch": 3.1898987404297356, + "grad_norm": 0.4986907746805224, + "learning_rate": 4.715334717463314e-06, + "loss": 0.4437, + "step": 8159 + }, + { + "epoch": 3.190293899728328, + "grad_norm": 0.5107324129303034, + "learning_rate": 4.715262106582788e-06, + "loss": 0.4663, + "step": 8160 + }, + { + "epoch": 3.19068905902692, + "grad_norm": 0.4816038125829416, + "learning_rate": 4.715189487002057e-06, + "loss": 0.4459, + "step": 8161 + }, + { + "epoch": 3.1910842183255124, + "grad_norm": 0.48589022124625236, + "learning_rate": 4.715116858721408e-06, + "loss": 0.4596, + "step": 8162 + }, + { + "epoch": 3.1914793776241046, + "grad_norm": 0.5059117225587179, + "learning_rate": 4.715044221741125e-06, + "loss": 0.4424, + "step": 8163 + }, + { + "epoch": 3.191874536922697, + "grad_norm": 0.4902259551321021, + "learning_rate": 4.714971576061492e-06, + "loss": 0.4436, + "step": 8164 + }, + { + "epoch": 3.192269696221289, + "grad_norm": 0.49921816165596244, + "learning_rate": 4.714898921682797e-06, + "loss": 0.4341, + "step": 8165 + }, + { + "epoch": 3.1926648555198813, + "grad_norm": 0.4873589691710802, + "learning_rate": 4.714826258605323e-06, + "loss": 0.447, + "step": 8166 + }, + { + "epoch": 3.1930600148184736, + "grad_norm": 0.4947509154503643, + "learning_rate": 4.714753586829357e-06, + "loss": 0.4531, + "step": 8167 + }, + { + "epoch": 3.193455174117066, + "grad_norm": 0.49979557568227245, + "learning_rate": 4.714680906355184e-06, + "loss": 0.4499, + "step": 8168 + }, + { + "epoch": 3.193850333415658, + "grad_norm": 0.47965466768323384, + "learning_rate": 4.714608217183088e-06, + "loss": 0.4294, + "step": 8169 + }, + { + "epoch": 3.1942454927142503, + "grad_norm": 0.5380227850408551, + "learning_rate": 4.7145355193133566e-06, + "loss": 0.44, + "step": 8170 + }, + { + "epoch": 3.1946406520128425, + "grad_norm": 0.4778666409695035, + "learning_rate": 4.714462812746275e-06, + "loss": 0.4518, + "step": 8171 + }, + { + "epoch": 3.1950358113114348, + "grad_norm": 0.498563645161947, + "learning_rate": 4.714390097482127e-06, + "loss": 0.4512, + "step": 8172 + }, + { + "epoch": 3.195430970610027, + "grad_norm": 0.502694986736014, + "learning_rate": 4.714317373521199e-06, + "loss": 0.4703, + "step": 8173 + }, + { + "epoch": 3.1958261299086193, + "grad_norm": 0.4955421090319872, + "learning_rate": 4.714244640863778e-06, + "loss": 0.4601, + "step": 8174 + }, + { + "epoch": 3.1962212892072115, + "grad_norm": 0.47703481341356674, + "learning_rate": 4.714171899510148e-06, + "loss": 0.4363, + "step": 8175 + }, + { + "epoch": 3.1966164485058037, + "grad_norm": 0.488778683807109, + "learning_rate": 4.714099149460596e-06, + "loss": 0.4309, + "step": 8176 + }, + { + "epoch": 3.197011607804396, + "grad_norm": 0.49243205733047013, + "learning_rate": 4.714026390715407e-06, + "loss": 0.4421, + "step": 8177 + }, + { + "epoch": 3.197406767102988, + "grad_norm": 0.47901350686825167, + "learning_rate": 4.7139536232748665e-06, + "loss": 0.4475, + "step": 8178 + }, + { + "epoch": 3.1978019264015805, + "grad_norm": 0.4873600139430021, + "learning_rate": 4.71388084713926e-06, + "loss": 0.4365, + "step": 8179 + }, + { + "epoch": 3.1981970857001727, + "grad_norm": 0.4750978959852484, + "learning_rate": 4.713808062308874e-06, + "loss": 0.4329, + "step": 8180 + }, + { + "epoch": 3.198592244998765, + "grad_norm": 0.49005747817347883, + "learning_rate": 4.713735268783995e-06, + "loss": 0.44, + "step": 8181 + }, + { + "epoch": 3.198987404297357, + "grad_norm": 0.4897517319745767, + "learning_rate": 4.713662466564908e-06, + "loss": 0.4418, + "step": 8182 + }, + { + "epoch": 3.1993825635959494, + "grad_norm": 0.4937199361984085, + "learning_rate": 4.713589655651898e-06, + "loss": 0.445, + "step": 8183 + }, + { + "epoch": 3.1997777228945417, + "grad_norm": 0.5006257765582313, + "learning_rate": 4.713516836045253e-06, + "loss": 0.4578, + "step": 8184 + }, + { + "epoch": 3.200172882193134, + "grad_norm": 0.5102363063233484, + "learning_rate": 4.713444007745258e-06, + "loss": 0.4649, + "step": 8185 + }, + { + "epoch": 3.200568041491726, + "grad_norm": 0.5619926568257341, + "learning_rate": 4.713371170752198e-06, + "loss": 0.4512, + "step": 8186 + }, + { + "epoch": 3.2009632007903184, + "grad_norm": 0.49017178589540195, + "learning_rate": 4.713298325066361e-06, + "loss": 0.4482, + "step": 8187 + }, + { + "epoch": 3.2013583600889106, + "grad_norm": 0.5075841792186128, + "learning_rate": 4.713225470688032e-06, + "loss": 0.4637, + "step": 8188 + }, + { + "epoch": 3.201753519387503, + "grad_norm": 0.4851504244889839, + "learning_rate": 4.713152607617497e-06, + "loss": 0.4354, + "step": 8189 + }, + { + "epoch": 3.2021486786860955, + "grad_norm": 0.49603485658612206, + "learning_rate": 4.713079735855043e-06, + "loss": 0.4669, + "step": 8190 + }, + { + "epoch": 3.202543837984688, + "grad_norm": 0.5131941929666564, + "learning_rate": 4.713006855400955e-06, + "loss": 0.4363, + "step": 8191 + }, + { + "epoch": 3.20293899728328, + "grad_norm": 0.5327331196587508, + "learning_rate": 4.712933966255521e-06, + "loss": 0.4517, + "step": 8192 + }, + { + "epoch": 3.2033341565818723, + "grad_norm": 0.47967217544059954, + "learning_rate": 4.7128610684190255e-06, + "loss": 0.4363, + "step": 8193 + }, + { + "epoch": 3.2037293158804645, + "grad_norm": 0.5032954106990305, + "learning_rate": 4.712788161891755e-06, + "loss": 0.4555, + "step": 8194 + }, + { + "epoch": 3.2041244751790567, + "grad_norm": 0.49443372271224756, + "learning_rate": 4.712715246673997e-06, + "loss": 0.4432, + "step": 8195 + }, + { + "epoch": 3.204519634477649, + "grad_norm": 0.5048761667286308, + "learning_rate": 4.712642322766037e-06, + "loss": 0.4636, + "step": 8196 + }, + { + "epoch": 3.204914793776241, + "grad_norm": 0.5114603132884818, + "learning_rate": 4.71256939016816e-06, + "loss": 0.4678, + "step": 8197 + }, + { + "epoch": 3.2053099530748335, + "grad_norm": 0.4783233894068138, + "learning_rate": 4.712496448880656e-06, + "loss": 0.4546, + "step": 8198 + }, + { + "epoch": 3.2057051123734257, + "grad_norm": 0.489295057789308, + "learning_rate": 4.712423498903809e-06, + "loss": 0.4391, + "step": 8199 + }, + { + "epoch": 3.206100271672018, + "grad_norm": 0.4876753870656324, + "learning_rate": 4.712350540237906e-06, + "loss": 0.4552, + "step": 8200 + }, + { + "epoch": 3.20649543097061, + "grad_norm": 0.4833416370700385, + "learning_rate": 4.712277572883233e-06, + "loss": 0.4461, + "step": 8201 + }, + { + "epoch": 3.2068905902692024, + "grad_norm": 0.4966315262459833, + "learning_rate": 4.712204596840077e-06, + "loss": 0.464, + "step": 8202 + }, + { + "epoch": 3.2072857495677947, + "grad_norm": 0.477804878380041, + "learning_rate": 4.712131612108724e-06, + "loss": 0.4378, + "step": 8203 + }, + { + "epoch": 3.207680908866387, + "grad_norm": 0.4842718555370584, + "learning_rate": 4.7120586186894626e-06, + "loss": 0.4488, + "step": 8204 + }, + { + "epoch": 3.208076068164979, + "grad_norm": 0.48752539718009136, + "learning_rate": 4.711985616582578e-06, + "loss": 0.4583, + "step": 8205 + }, + { + "epoch": 3.2084712274635714, + "grad_norm": 0.493674287690774, + "learning_rate": 4.711912605788357e-06, + "loss": 0.4461, + "step": 8206 + }, + { + "epoch": 3.2088663867621636, + "grad_norm": 0.5940079173022275, + "learning_rate": 4.711839586307086e-06, + "loss": 0.4796, + "step": 8207 + }, + { + "epoch": 3.209261546060756, + "grad_norm": 0.4727016450653654, + "learning_rate": 4.711766558139053e-06, + "loss": 0.4313, + "step": 8208 + }, + { + "epoch": 3.209656705359348, + "grad_norm": 0.4985753825854796, + "learning_rate": 4.711693521284543e-06, + "loss": 0.4622, + "step": 8209 + }, + { + "epoch": 3.2100518646579403, + "grad_norm": 0.5947833225156005, + "learning_rate": 4.711620475743845e-06, + "loss": 0.4576, + "step": 8210 + }, + { + "epoch": 3.2104470239565326, + "grad_norm": 0.4991842972271697, + "learning_rate": 4.711547421517244e-06, + "loss": 0.4429, + "step": 8211 + }, + { + "epoch": 3.210842183255125, + "grad_norm": 0.49646570990011646, + "learning_rate": 4.711474358605027e-06, + "loss": 0.4589, + "step": 8212 + }, + { + "epoch": 3.211237342553717, + "grad_norm": 0.49028663463091515, + "learning_rate": 4.711401287007482e-06, + "loss": 0.4321, + "step": 8213 + }, + { + "epoch": 3.2116325018523093, + "grad_norm": 0.5228254931374301, + "learning_rate": 4.711328206724897e-06, + "loss": 0.457, + "step": 8214 + }, + { + "epoch": 3.2120276611509015, + "grad_norm": 0.48805769076484595, + "learning_rate": 4.7112551177575564e-06, + "loss": 0.4406, + "step": 8215 + }, + { + "epoch": 3.212422820449494, + "grad_norm": 0.48960247238176013, + "learning_rate": 4.711182020105748e-06, + "loss": 0.4567, + "step": 8216 + }, + { + "epoch": 3.212817979748086, + "grad_norm": 0.4988948172821916, + "learning_rate": 4.71110891376976e-06, + "loss": 0.4662, + "step": 8217 + }, + { + "epoch": 3.2132131390466783, + "grad_norm": 0.502843794988854, + "learning_rate": 4.711035798749879e-06, + "loss": 0.4696, + "step": 8218 + }, + { + "epoch": 3.2136082983452705, + "grad_norm": 0.4867119474038113, + "learning_rate": 4.710962675046392e-06, + "loss": 0.4547, + "step": 8219 + }, + { + "epoch": 3.2140034576438627, + "grad_norm": 0.496603147591944, + "learning_rate": 4.710889542659586e-06, + "loss": 0.4643, + "step": 8220 + }, + { + "epoch": 3.214398616942455, + "grad_norm": 0.5108147174349558, + "learning_rate": 4.710816401589748e-06, + "loss": 0.4659, + "step": 8221 + }, + { + "epoch": 3.2147937762410472, + "grad_norm": 0.5331702363488228, + "learning_rate": 4.710743251837166e-06, + "loss": 0.4392, + "step": 8222 + }, + { + "epoch": 3.2151889355396395, + "grad_norm": 0.48329930143617106, + "learning_rate": 4.710670093402127e-06, + "loss": 0.4466, + "step": 8223 + }, + { + "epoch": 3.2155840948382317, + "grad_norm": 0.5172967429519062, + "learning_rate": 4.7105969262849185e-06, + "loss": 0.4705, + "step": 8224 + }, + { + "epoch": 3.215979254136824, + "grad_norm": 0.47907504810620555, + "learning_rate": 4.710523750485827e-06, + "loss": 0.435, + "step": 8225 + }, + { + "epoch": 3.216374413435416, + "grad_norm": 0.4878844771396085, + "learning_rate": 4.71045056600514e-06, + "loss": 0.4418, + "step": 8226 + }, + { + "epoch": 3.2167695727340084, + "grad_norm": 0.4937392208878633, + "learning_rate": 4.710377372843147e-06, + "loss": 0.4466, + "step": 8227 + }, + { + "epoch": 3.2171647320326007, + "grad_norm": 0.5193745711511666, + "learning_rate": 4.710304171000133e-06, + "loss": 0.445, + "step": 8228 + }, + { + "epoch": 3.217559891331193, + "grad_norm": 0.5017673107772458, + "learning_rate": 4.7102309604763865e-06, + "loss": 0.4535, + "step": 8229 + }, + { + "epoch": 3.217955050629785, + "grad_norm": 0.47724701872972525, + "learning_rate": 4.710157741272195e-06, + "loss": 0.4236, + "step": 8230 + }, + { + "epoch": 3.2183502099283774, + "grad_norm": 0.5029142492827028, + "learning_rate": 4.710084513387846e-06, + "loss": 0.46, + "step": 8231 + }, + { + "epoch": 3.2187453692269696, + "grad_norm": 0.4976804393277097, + "learning_rate": 4.710011276823627e-06, + "loss": 0.4601, + "step": 8232 + }, + { + "epoch": 3.219140528525562, + "grad_norm": 0.5261258641015674, + "learning_rate": 4.709938031579825e-06, + "loss": 0.4587, + "step": 8233 + }, + { + "epoch": 3.219535687824154, + "grad_norm": 0.5200984670015669, + "learning_rate": 4.70986477765673e-06, + "loss": 0.4571, + "step": 8234 + }, + { + "epoch": 3.2199308471227464, + "grad_norm": 0.4894476159865528, + "learning_rate": 4.709791515054627e-06, + "loss": 0.4442, + "step": 8235 + }, + { + "epoch": 3.2203260064213386, + "grad_norm": 0.49521421527173004, + "learning_rate": 4.709718243773805e-06, + "loss": 0.4359, + "step": 8236 + }, + { + "epoch": 3.220721165719931, + "grad_norm": 0.4927693522476051, + "learning_rate": 4.7096449638145504e-06, + "loss": 0.445, + "step": 8237 + }, + { + "epoch": 3.221116325018523, + "grad_norm": 0.5172144682625665, + "learning_rate": 4.709571675177154e-06, + "loss": 0.4584, + "step": 8238 + }, + { + "epoch": 3.2215114843171153, + "grad_norm": 0.5154003710753569, + "learning_rate": 4.709498377861901e-06, + "loss": 0.4433, + "step": 8239 + }, + { + "epoch": 3.2219066436157076, + "grad_norm": 0.4855890206340475, + "learning_rate": 4.709425071869079e-06, + "loss": 0.4501, + "step": 8240 + }, + { + "epoch": 3.2223018029143, + "grad_norm": 0.4830024442204952, + "learning_rate": 4.709351757198979e-06, + "loss": 0.4416, + "step": 8241 + }, + { + "epoch": 3.222696962212892, + "grad_norm": 0.5008192768633752, + "learning_rate": 4.7092784338518856e-06, + "loss": 0.4403, + "step": 8242 + }, + { + "epoch": 3.2230921215114843, + "grad_norm": 0.5174554129182379, + "learning_rate": 4.7092051018280886e-06, + "loss": 0.4436, + "step": 8243 + }, + { + "epoch": 3.2234872808100765, + "grad_norm": 0.49631332057852817, + "learning_rate": 4.709131761127875e-06, + "loss": 0.466, + "step": 8244 + }, + { + "epoch": 3.2238824401086688, + "grad_norm": 0.4879530471104053, + "learning_rate": 4.709058411751533e-06, + "loss": 0.4498, + "step": 8245 + }, + { + "epoch": 3.224277599407261, + "grad_norm": 0.5093815291047132, + "learning_rate": 4.708985053699351e-06, + "loss": 0.4828, + "step": 8246 + }, + { + "epoch": 3.2246727587058532, + "grad_norm": 0.48848100859749605, + "learning_rate": 4.708911686971618e-06, + "loss": 0.4388, + "step": 8247 + }, + { + "epoch": 3.2250679180044455, + "grad_norm": 0.49097358270784774, + "learning_rate": 4.708838311568621e-06, + "loss": 0.4417, + "step": 8248 + }, + { + "epoch": 3.2254630773030377, + "grad_norm": 0.5073726289639382, + "learning_rate": 4.708764927490648e-06, + "loss": 0.4715, + "step": 8249 + }, + { + "epoch": 3.22585823660163, + "grad_norm": 0.49923080085782845, + "learning_rate": 4.708691534737987e-06, + "loss": 0.4672, + "step": 8250 + }, + { + "epoch": 3.226253395900222, + "grad_norm": 0.4983917363183614, + "learning_rate": 4.7086181333109286e-06, + "loss": 0.4526, + "step": 8251 + }, + { + "epoch": 3.2266485551988144, + "grad_norm": 0.5111375734969307, + "learning_rate": 4.708544723209758e-06, + "loss": 0.452, + "step": 8252 + }, + { + "epoch": 3.2270437144974067, + "grad_norm": 0.48716751738116665, + "learning_rate": 4.708471304434765e-06, + "loss": 0.4564, + "step": 8253 + }, + { + "epoch": 3.227438873795999, + "grad_norm": 0.5069026621643061, + "learning_rate": 4.708397876986238e-06, + "loss": 0.4528, + "step": 8254 + }, + { + "epoch": 3.227834033094591, + "grad_norm": 0.5152378887836355, + "learning_rate": 4.708324440864465e-06, + "loss": 0.4355, + "step": 8255 + }, + { + "epoch": 3.2282291923931834, + "grad_norm": 0.4962324262994311, + "learning_rate": 4.7082509960697345e-06, + "loss": 0.442, + "step": 8256 + }, + { + "epoch": 3.2286243516917756, + "grad_norm": 0.5028634303397124, + "learning_rate": 4.708177542602335e-06, + "loss": 0.4553, + "step": 8257 + }, + { + "epoch": 3.229019510990368, + "grad_norm": 0.5001078547401838, + "learning_rate": 4.708104080462555e-06, + "loss": 0.4557, + "step": 8258 + }, + { + "epoch": 3.22941467028896, + "grad_norm": 0.4960036879535328, + "learning_rate": 4.708030609650683e-06, + "loss": 0.448, + "step": 8259 + }, + { + "epoch": 3.2298098295875524, + "grad_norm": 0.5050479554627619, + "learning_rate": 4.7079571301670076e-06, + "loss": 0.4655, + "step": 8260 + }, + { + "epoch": 3.2302049888861446, + "grad_norm": 0.5006203296479564, + "learning_rate": 4.707883642011818e-06, + "loss": 0.4653, + "step": 8261 + }, + { + "epoch": 3.230600148184737, + "grad_norm": 0.4999078554437849, + "learning_rate": 4.707810145185401e-06, + "loss": 0.4574, + "step": 8262 + }, + { + "epoch": 3.230995307483329, + "grad_norm": 0.47956978838455655, + "learning_rate": 4.707736639688047e-06, + "loss": 0.4366, + "step": 8263 + }, + { + "epoch": 3.2313904667819213, + "grad_norm": 0.5070009335133143, + "learning_rate": 4.707663125520044e-06, + "loss": 0.4479, + "step": 8264 + }, + { + "epoch": 3.2317856260805136, + "grad_norm": 0.5087445068694287, + "learning_rate": 4.707589602681681e-06, + "loss": 0.4596, + "step": 8265 + }, + { + "epoch": 3.232180785379106, + "grad_norm": 0.5718442258693521, + "learning_rate": 4.707516071173246e-06, + "loss": 0.4542, + "step": 8266 + }, + { + "epoch": 3.232575944677698, + "grad_norm": 0.5011292864269645, + "learning_rate": 4.707442530995029e-06, + "loss": 0.4525, + "step": 8267 + }, + { + "epoch": 3.2329711039762903, + "grad_norm": 0.5026210431492106, + "learning_rate": 4.707368982147318e-06, + "loss": 0.4431, + "step": 8268 + }, + { + "epoch": 3.2333662632748825, + "grad_norm": 0.5127765787850526, + "learning_rate": 4.707295424630402e-06, + "loss": 0.4665, + "step": 8269 + }, + { + "epoch": 3.2337614225734748, + "grad_norm": 0.5086511264451964, + "learning_rate": 4.707221858444569e-06, + "loss": 0.45, + "step": 8270 + }, + { + "epoch": 3.234156581872067, + "grad_norm": 0.5232352547690865, + "learning_rate": 4.70714828359011e-06, + "loss": 0.4685, + "step": 8271 + }, + { + "epoch": 3.2345517411706592, + "grad_norm": 0.49163545831580857, + "learning_rate": 4.707074700067312e-06, + "loss": 0.4508, + "step": 8272 + }, + { + "epoch": 3.2349469004692515, + "grad_norm": 0.5145261335572505, + "learning_rate": 4.707001107876466e-06, + "loss": 0.4693, + "step": 8273 + }, + { + "epoch": 3.2353420597678437, + "grad_norm": 0.5026474655303093, + "learning_rate": 4.7069275070178586e-06, + "loss": 0.4629, + "step": 8274 + }, + { + "epoch": 3.235737219066436, + "grad_norm": 0.486565673024086, + "learning_rate": 4.706853897491781e-06, + "loss": 0.4526, + "step": 8275 + }, + { + "epoch": 3.236132378365028, + "grad_norm": 0.47935306570145486, + "learning_rate": 4.70678027929852e-06, + "loss": 0.4466, + "step": 8276 + }, + { + "epoch": 3.2365275376636204, + "grad_norm": 0.4991778284300973, + "learning_rate": 4.706706652438368e-06, + "loss": 0.4549, + "step": 8277 + }, + { + "epoch": 3.2369226969622127, + "grad_norm": 0.5067651241856679, + "learning_rate": 4.706633016911611e-06, + "loss": 0.4634, + "step": 8278 + }, + { + "epoch": 3.2373178562608054, + "grad_norm": 0.49959265144088416, + "learning_rate": 4.70655937271854e-06, + "loss": 0.4477, + "step": 8279 + }, + { + "epoch": 3.2377130155593976, + "grad_norm": 0.4730123377567548, + "learning_rate": 4.706485719859443e-06, + "loss": 0.4348, + "step": 8280 + }, + { + "epoch": 3.23810817485799, + "grad_norm": 0.5011163271286927, + "learning_rate": 4.706412058334611e-06, + "loss": 0.469, + "step": 8281 + }, + { + "epoch": 3.238503334156582, + "grad_norm": 0.5118563019208421, + "learning_rate": 4.706338388144331e-06, + "loss": 0.4536, + "step": 8282 + }, + { + "epoch": 3.2388984934551743, + "grad_norm": 0.49785560194791934, + "learning_rate": 4.706264709288894e-06, + "loss": 0.4398, + "step": 8283 + }, + { + "epoch": 3.2392936527537666, + "grad_norm": 0.48918931904116053, + "learning_rate": 4.7061910217685895e-06, + "loss": 0.458, + "step": 8284 + }, + { + "epoch": 3.239688812052359, + "grad_norm": 0.4991786615442405, + "learning_rate": 4.7061173255837054e-06, + "loss": 0.4522, + "step": 8285 + }, + { + "epoch": 3.240083971350951, + "grad_norm": 0.48813754093841744, + "learning_rate": 4.706043620734533e-06, + "loss": 0.454, + "step": 8286 + }, + { + "epoch": 3.2404791306495433, + "grad_norm": 0.6162776390081206, + "learning_rate": 4.705969907221361e-06, + "loss": 0.4252, + "step": 8287 + }, + { + "epoch": 3.2408742899481355, + "grad_norm": 0.4916252788914689, + "learning_rate": 4.705896185044477e-06, + "loss": 0.4541, + "step": 8288 + }, + { + "epoch": 3.2412694492467278, + "grad_norm": 0.49006414916965146, + "learning_rate": 4.705822454204173e-06, + "loss": 0.4498, + "step": 8289 + }, + { + "epoch": 3.24166460854532, + "grad_norm": 0.5058325689652357, + "learning_rate": 4.705748714700739e-06, + "loss": 0.45, + "step": 8290 + }, + { + "epoch": 3.2420597678439123, + "grad_norm": 0.4876491731015844, + "learning_rate": 4.705674966534462e-06, + "loss": 0.4163, + "step": 8291 + }, + { + "epoch": 3.2424549271425045, + "grad_norm": 0.5021867760606261, + "learning_rate": 4.705601209705635e-06, + "loss": 0.441, + "step": 8292 + }, + { + "epoch": 3.2428500864410967, + "grad_norm": 0.5038352555082803, + "learning_rate": 4.705527444214543e-06, + "loss": 0.4469, + "step": 8293 + }, + { + "epoch": 3.243245245739689, + "grad_norm": 0.5013117536018146, + "learning_rate": 4.705453670061481e-06, + "loss": 0.4492, + "step": 8294 + }, + { + "epoch": 3.243640405038281, + "grad_norm": 0.47282887919141153, + "learning_rate": 4.705379887246735e-06, + "loss": 0.4431, + "step": 8295 + }, + { + "epoch": 3.2440355643368735, + "grad_norm": 0.49337905075174315, + "learning_rate": 4.705306095770596e-06, + "loss": 0.4572, + "step": 8296 + }, + { + "epoch": 3.2444307236354657, + "grad_norm": 0.49412301657250673, + "learning_rate": 4.705232295633355e-06, + "loss": 0.4463, + "step": 8297 + }, + { + "epoch": 3.244825882934058, + "grad_norm": 0.5031343987855506, + "learning_rate": 4.7051584868353e-06, + "loss": 0.4502, + "step": 8298 + }, + { + "epoch": 3.24522104223265, + "grad_norm": 0.5181152188198329, + "learning_rate": 4.705084669376721e-06, + "loss": 0.4397, + "step": 8299 + }, + { + "epoch": 3.2456162015312424, + "grad_norm": 0.4882928530529042, + "learning_rate": 4.7050108432579075e-06, + "loss": 0.4515, + "step": 8300 + }, + { + "epoch": 3.2460113608298347, + "grad_norm": 0.6015511647794045, + "learning_rate": 4.704937008479152e-06, + "loss": 0.4449, + "step": 8301 + }, + { + "epoch": 3.246406520128427, + "grad_norm": 0.5137381986143884, + "learning_rate": 4.704863165040742e-06, + "loss": 0.4537, + "step": 8302 + }, + { + "epoch": 3.246801679427019, + "grad_norm": 0.5065385080859512, + "learning_rate": 4.704789312942969e-06, + "loss": 0.4507, + "step": 8303 + }, + { + "epoch": 3.2471968387256114, + "grad_norm": 0.5006646215975414, + "learning_rate": 4.704715452186122e-06, + "loss": 0.4413, + "step": 8304 + }, + { + "epoch": 3.2475919980242036, + "grad_norm": 0.4925281165489277, + "learning_rate": 4.704641582770492e-06, + "loss": 0.4418, + "step": 8305 + }, + { + "epoch": 3.247987157322796, + "grad_norm": 0.4935778234372788, + "learning_rate": 4.704567704696368e-06, + "loss": 0.465, + "step": 8306 + }, + { + "epoch": 3.248382316621388, + "grad_norm": 0.5077023224379424, + "learning_rate": 4.704493817964041e-06, + "loss": 0.4327, + "step": 8307 + }, + { + "epoch": 3.2487774759199803, + "grad_norm": 0.5015739358541178, + "learning_rate": 4.7044199225738e-06, + "loss": 0.4464, + "step": 8308 + }, + { + "epoch": 3.2491726352185726, + "grad_norm": 0.5022006581016725, + "learning_rate": 4.704346018525937e-06, + "loss": 0.4532, + "step": 8309 + }, + { + "epoch": 3.249567794517165, + "grad_norm": 0.5067055633348441, + "learning_rate": 4.704272105820741e-06, + "loss": 0.4597, + "step": 8310 + }, + { + "epoch": 3.249962953815757, + "grad_norm": 0.4829044817680482, + "learning_rate": 4.704198184458503e-06, + "loss": 0.4245, + "step": 8311 + }, + { + "epoch": 3.2503581131143493, + "grad_norm": 0.502784983843347, + "learning_rate": 4.704124254439512e-06, + "loss": 0.4682, + "step": 8312 + }, + { + "epoch": 3.2507532724129415, + "grad_norm": 0.5025327631494758, + "learning_rate": 4.704050315764062e-06, + "loss": 0.4532, + "step": 8313 + }, + { + "epoch": 3.251148431711534, + "grad_norm": 0.4827438502841128, + "learning_rate": 4.703976368432438e-06, + "loss": 0.4582, + "step": 8314 + }, + { + "epoch": 3.251543591010126, + "grad_norm": 0.5001402118199502, + "learning_rate": 4.703902412444935e-06, + "loss": 0.4512, + "step": 8315 + }, + { + "epoch": 3.2519387503087183, + "grad_norm": 0.5073201743090425, + "learning_rate": 4.7038284478018405e-06, + "loss": 0.458, + "step": 8316 + }, + { + "epoch": 3.2523339096073105, + "grad_norm": 0.5044450071518284, + "learning_rate": 4.703754474503446e-06, + "loss": 0.4518, + "step": 8317 + }, + { + "epoch": 3.2527290689059027, + "grad_norm": 0.5019360105195412, + "learning_rate": 4.703680492550043e-06, + "loss": 0.4544, + "step": 8318 + }, + { + "epoch": 3.253124228204495, + "grad_norm": 0.49174623590569816, + "learning_rate": 4.703606501941921e-06, + "loss": 0.4361, + "step": 8319 + }, + { + "epoch": 3.2535193875030872, + "grad_norm": 0.5000322070806993, + "learning_rate": 4.70353250267937e-06, + "loss": 0.4647, + "step": 8320 + }, + { + "epoch": 3.2539145468016795, + "grad_norm": 0.4949672092349716, + "learning_rate": 4.7034584947626815e-06, + "loss": 0.4498, + "step": 8321 + }, + { + "epoch": 3.2543097061002717, + "grad_norm": 0.48215269520295684, + "learning_rate": 4.703384478192146e-06, + "loss": 0.4461, + "step": 8322 + }, + { + "epoch": 3.254704865398864, + "grad_norm": 0.4874422999560892, + "learning_rate": 4.703310452968055e-06, + "loss": 0.4434, + "step": 8323 + }, + { + "epoch": 3.255100024697456, + "grad_norm": 0.4842366003915545, + "learning_rate": 4.703236419090699e-06, + "loss": 0.4338, + "step": 8324 + }, + { + "epoch": 3.2554951839960484, + "grad_norm": 0.47835189607454215, + "learning_rate": 4.7031623765603665e-06, + "loss": 0.4463, + "step": 8325 + }, + { + "epoch": 3.2558903432946407, + "grad_norm": 0.48308358621451114, + "learning_rate": 4.7030883253773504e-06, + "loss": 0.455, + "step": 8326 + }, + { + "epoch": 3.256285502593233, + "grad_norm": 0.5094922118559343, + "learning_rate": 4.703014265541942e-06, + "loss": 0.447, + "step": 8327 + }, + { + "epoch": 3.256680661891825, + "grad_norm": 0.49550690331076896, + "learning_rate": 4.702940197054431e-06, + "loss": 0.4543, + "step": 8328 + }, + { + "epoch": 3.2570758211904174, + "grad_norm": 0.49464702815935846, + "learning_rate": 4.702866119915108e-06, + "loss": 0.443, + "step": 8329 + }, + { + "epoch": 3.2574709804890096, + "grad_norm": 0.48344035746473757, + "learning_rate": 4.702792034124265e-06, + "loss": 0.4422, + "step": 8330 + }, + { + "epoch": 3.257866139787602, + "grad_norm": 0.49796852073676307, + "learning_rate": 4.702717939682193e-06, + "loss": 0.4518, + "step": 8331 + }, + { + "epoch": 3.258261299086194, + "grad_norm": 0.49891388022773675, + "learning_rate": 4.702643836589182e-06, + "loss": 0.4536, + "step": 8332 + }, + { + "epoch": 3.2586564583847863, + "grad_norm": 0.5017290426062102, + "learning_rate": 4.702569724845523e-06, + "loss": 0.4492, + "step": 8333 + }, + { + "epoch": 3.2590516176833786, + "grad_norm": 0.5055247142680904, + "learning_rate": 4.702495604451508e-06, + "loss": 0.4646, + "step": 8334 + }, + { + "epoch": 3.259446776981971, + "grad_norm": 0.48822789965751323, + "learning_rate": 4.702421475407428e-06, + "loss": 0.4509, + "step": 8335 + }, + { + "epoch": 3.259841936280563, + "grad_norm": 0.4932198897185812, + "learning_rate": 4.7023473377135735e-06, + "loss": 0.4464, + "step": 8336 + }, + { + "epoch": 3.2602370955791553, + "grad_norm": 0.5137369946580147, + "learning_rate": 4.702273191370236e-06, + "loss": 0.452, + "step": 8337 + }, + { + "epoch": 3.2606322548777475, + "grad_norm": 0.5049765007477618, + "learning_rate": 4.702199036377707e-06, + "loss": 0.4344, + "step": 8338 + }, + { + "epoch": 3.26102741417634, + "grad_norm": 0.48715675795622143, + "learning_rate": 4.702124872736277e-06, + "loss": 0.4688, + "step": 8339 + }, + { + "epoch": 3.261422573474932, + "grad_norm": 0.5058798989467141, + "learning_rate": 4.702050700446238e-06, + "loss": 0.461, + "step": 8340 + }, + { + "epoch": 3.2618177327735243, + "grad_norm": 0.4875539335588554, + "learning_rate": 4.701976519507881e-06, + "loss": 0.4517, + "step": 8341 + }, + { + "epoch": 3.2622128920721165, + "grad_norm": 0.49540858670627236, + "learning_rate": 4.7019023299214974e-06, + "loss": 0.4436, + "step": 8342 + }, + { + "epoch": 3.2626080513707088, + "grad_norm": 0.48452284870049883, + "learning_rate": 4.701828131687378e-06, + "loss": 0.4563, + "step": 8343 + }, + { + "epoch": 3.263003210669301, + "grad_norm": 0.5064967286836504, + "learning_rate": 4.701753924805815e-06, + "loss": 0.4517, + "step": 8344 + }, + { + "epoch": 3.2633983699678932, + "grad_norm": 0.5049107650163969, + "learning_rate": 4.7016797092771e-06, + "loss": 0.4641, + "step": 8345 + }, + { + "epoch": 3.2637935292664855, + "grad_norm": 0.5286126343914365, + "learning_rate": 4.701605485101524e-06, + "loss": 0.4564, + "step": 8346 + }, + { + "epoch": 3.2641886885650777, + "grad_norm": 0.5224526170379453, + "learning_rate": 4.701531252279379e-06, + "loss": 0.4418, + "step": 8347 + }, + { + "epoch": 3.26458384786367, + "grad_norm": 0.5085290561882606, + "learning_rate": 4.701457010810955e-06, + "loss": 0.4561, + "step": 8348 + }, + { + "epoch": 3.264979007162262, + "grad_norm": 0.4860433808053769, + "learning_rate": 4.7013827606965446e-06, + "loss": 0.4365, + "step": 8349 + }, + { + "epoch": 3.2653741664608544, + "grad_norm": 0.5362280369050675, + "learning_rate": 4.70130850193644e-06, + "loss": 0.453, + "step": 8350 + }, + { + "epoch": 3.2657693257594467, + "grad_norm": 0.5026775324033125, + "learning_rate": 4.7012342345309316e-06, + "loss": 0.4318, + "step": 8351 + }, + { + "epoch": 3.266164485058039, + "grad_norm": 0.48284490669371327, + "learning_rate": 4.701159958480312e-06, + "loss": 0.4375, + "step": 8352 + }, + { + "epoch": 3.266559644356631, + "grad_norm": 0.511556437758763, + "learning_rate": 4.701085673784874e-06, + "loss": 0.4607, + "step": 8353 + }, + { + "epoch": 3.2669548036552234, + "grad_norm": 0.4829094276892418, + "learning_rate": 4.701011380444907e-06, + "loss": 0.4292, + "step": 8354 + }, + { + "epoch": 3.2673499629538156, + "grad_norm": 0.51144228894575, + "learning_rate": 4.700937078460704e-06, + "loss": 0.4603, + "step": 8355 + }, + { + "epoch": 3.267745122252408, + "grad_norm": 0.4906533832218431, + "learning_rate": 4.700862767832557e-06, + "loss": 0.4582, + "step": 8356 + }, + { + "epoch": 3.268140281551, + "grad_norm": 0.4930805349262283, + "learning_rate": 4.7007884485607565e-06, + "loss": 0.458, + "step": 8357 + }, + { + "epoch": 3.2685354408495924, + "grad_norm": 0.5009831870584155, + "learning_rate": 4.700714120645596e-06, + "loss": 0.4687, + "step": 8358 + }, + { + "epoch": 3.2689306001481846, + "grad_norm": 0.49092555374069324, + "learning_rate": 4.700639784087366e-06, + "loss": 0.4359, + "step": 8359 + }, + { + "epoch": 3.269325759446777, + "grad_norm": 0.500292655543431, + "learning_rate": 4.700565438886361e-06, + "loss": 0.4487, + "step": 8360 + }, + { + "epoch": 3.269720918745369, + "grad_norm": 0.4855070468612074, + "learning_rate": 4.70049108504287e-06, + "loss": 0.4411, + "step": 8361 + }, + { + "epoch": 3.2701160780439613, + "grad_norm": 0.5023057055550552, + "learning_rate": 4.700416722557186e-06, + "loss": 0.4455, + "step": 8362 + }, + { + "epoch": 3.2705112373425536, + "grad_norm": 0.509131608944125, + "learning_rate": 4.700342351429601e-06, + "loss": 0.4516, + "step": 8363 + }, + { + "epoch": 3.270906396641146, + "grad_norm": 0.49337058153224017, + "learning_rate": 4.700267971660408e-06, + "loss": 0.4675, + "step": 8364 + }, + { + "epoch": 3.271301555939738, + "grad_norm": 0.4860684297948216, + "learning_rate": 4.700193583249899e-06, + "loss": 0.4467, + "step": 8365 + }, + { + "epoch": 3.2716967152383303, + "grad_norm": 0.49228337368850594, + "learning_rate": 4.700119186198365e-06, + "loss": 0.4579, + "step": 8366 + }, + { + "epoch": 3.2720918745369225, + "grad_norm": 0.4845358764762339, + "learning_rate": 4.700044780506099e-06, + "loss": 0.449, + "step": 8367 + }, + { + "epoch": 3.2724870338355148, + "grad_norm": 0.5068451001379258, + "learning_rate": 4.699970366173393e-06, + "loss": 0.4348, + "step": 8368 + }, + { + "epoch": 3.272882193134107, + "grad_norm": 0.4914022566989148, + "learning_rate": 4.699895943200539e-06, + "loss": 0.4503, + "step": 8369 + }, + { + "epoch": 3.2732773524326992, + "grad_norm": 0.4992738042869505, + "learning_rate": 4.69982151158783e-06, + "loss": 0.4432, + "step": 8370 + }, + { + "epoch": 3.2736725117312915, + "grad_norm": 0.4938604279465015, + "learning_rate": 4.6997470713355574e-06, + "loss": 0.4414, + "step": 8371 + }, + { + "epoch": 3.2740676710298837, + "grad_norm": 0.4839800775793246, + "learning_rate": 4.699672622444015e-06, + "loss": 0.4349, + "step": 8372 + }, + { + "epoch": 3.274462830328476, + "grad_norm": 0.5024998565000468, + "learning_rate": 4.699598164913493e-06, + "loss": 0.4402, + "step": 8373 + }, + { + "epoch": 3.274857989627068, + "grad_norm": 0.49637224096799104, + "learning_rate": 4.699523698744286e-06, + "loss": 0.4483, + "step": 8374 + }, + { + "epoch": 3.2752531489256604, + "grad_norm": 0.49260375994924005, + "learning_rate": 4.699449223936686e-06, + "loss": 0.4474, + "step": 8375 + }, + { + "epoch": 3.2756483082242527, + "grad_norm": 0.6533471619862731, + "learning_rate": 4.699374740490984e-06, + "loss": 0.4655, + "step": 8376 + }, + { + "epoch": 3.276043467522845, + "grad_norm": 0.5234139605987889, + "learning_rate": 4.699300248407474e-06, + "loss": 0.4672, + "step": 8377 + }, + { + "epoch": 3.276438626821437, + "grad_norm": 0.4971511724210842, + "learning_rate": 4.6992257476864475e-06, + "loss": 0.4598, + "step": 8378 + }, + { + "epoch": 3.2768337861200294, + "grad_norm": 0.49100800872845307, + "learning_rate": 4.699151238328198e-06, + "loss": 0.47, + "step": 8379 + }, + { + "epoch": 3.2772289454186216, + "grad_norm": 0.4796505235609134, + "learning_rate": 4.6990767203330185e-06, + "loss": 0.4321, + "step": 8380 + }, + { + "epoch": 3.277624104717214, + "grad_norm": 0.5035218618276991, + "learning_rate": 4.6990021937012e-06, + "loss": 0.4544, + "step": 8381 + }, + { + "epoch": 3.278019264015806, + "grad_norm": 0.4912630675638253, + "learning_rate": 4.6989276584330365e-06, + "loss": 0.4311, + "step": 8382 + }, + { + "epoch": 3.278414423314399, + "grad_norm": 0.5050629349004524, + "learning_rate": 4.6988531145288204e-06, + "loss": 0.4416, + "step": 8383 + }, + { + "epoch": 3.278809582612991, + "grad_norm": 0.5106795613363712, + "learning_rate": 4.698778561988845e-06, + "loss": 0.4728, + "step": 8384 + }, + { + "epoch": 3.2792047419115833, + "grad_norm": 0.4961344250576797, + "learning_rate": 4.698704000813403e-06, + "loss": 0.4555, + "step": 8385 + }, + { + "epoch": 3.2795999012101755, + "grad_norm": 0.5105963304245603, + "learning_rate": 4.698629431002786e-06, + "loss": 0.4758, + "step": 8386 + }, + { + "epoch": 3.2799950605087678, + "grad_norm": 0.49603685469527325, + "learning_rate": 4.698554852557288e-06, + "loss": 0.4394, + "step": 8387 + }, + { + "epoch": 3.28039021980736, + "grad_norm": 0.51768754874356, + "learning_rate": 4.698480265477201e-06, + "loss": 0.4477, + "step": 8388 + }, + { + "epoch": 3.2807853791059522, + "grad_norm": 0.4988292744521313, + "learning_rate": 4.698405669762819e-06, + "loss": 0.4575, + "step": 8389 + }, + { + "epoch": 3.2811805384045445, + "grad_norm": 0.4906957305488342, + "learning_rate": 4.6983310654144345e-06, + "loss": 0.4463, + "step": 8390 + }, + { + "epoch": 3.2815756977031367, + "grad_norm": 0.4974751379220848, + "learning_rate": 4.6982564524323396e-06, + "loss": 0.4454, + "step": 8391 + }, + { + "epoch": 3.281970857001729, + "grad_norm": 0.4872843211030378, + "learning_rate": 4.698181830816829e-06, + "loss": 0.4311, + "step": 8392 + }, + { + "epoch": 3.282366016300321, + "grad_norm": 0.49418859390881315, + "learning_rate": 4.698107200568195e-06, + "loss": 0.4415, + "step": 8393 + }, + { + "epoch": 3.2827611755989135, + "grad_norm": 0.5037917521978581, + "learning_rate": 4.698032561686731e-06, + "loss": 0.4554, + "step": 8394 + }, + { + "epoch": 3.2831563348975057, + "grad_norm": 0.643760942178025, + "learning_rate": 4.697957914172728e-06, + "loss": 0.444, + "step": 8395 + }, + { + "epoch": 3.283551494196098, + "grad_norm": 0.49670328228290767, + "learning_rate": 4.697883258026483e-06, + "loss": 0.4514, + "step": 8396 + }, + { + "epoch": 3.28394665349469, + "grad_norm": 0.49682146132488997, + "learning_rate": 4.697808593248287e-06, + "loss": 0.4398, + "step": 8397 + }, + { + "epoch": 3.2843418127932824, + "grad_norm": 0.5166333354777709, + "learning_rate": 4.697733919838433e-06, + "loss": 0.4618, + "step": 8398 + }, + { + "epoch": 3.2847369720918747, + "grad_norm": 0.4806098054963327, + "learning_rate": 4.697659237797214e-06, + "loss": 0.4501, + "step": 8399 + }, + { + "epoch": 3.285132131390467, + "grad_norm": 0.49062496009639534, + "learning_rate": 4.697584547124925e-06, + "loss": 0.4476, + "step": 8400 + }, + { + "epoch": 3.285527290689059, + "grad_norm": 0.5057710961550459, + "learning_rate": 4.697509847821858e-06, + "loss": 0.4487, + "step": 8401 + }, + { + "epoch": 3.2859224499876514, + "grad_norm": 0.5096432840136818, + "learning_rate": 4.697435139888307e-06, + "loss": 0.4468, + "step": 8402 + }, + { + "epoch": 3.2863176092862436, + "grad_norm": 0.5099785625996293, + "learning_rate": 4.697360423324564e-06, + "loss": 0.4736, + "step": 8403 + }, + { + "epoch": 3.286712768584836, + "grad_norm": 0.4840586188305653, + "learning_rate": 4.6972856981309245e-06, + "loss": 0.4569, + "step": 8404 + }, + { + "epoch": 3.287107927883428, + "grad_norm": 0.49475695398984937, + "learning_rate": 4.697210964307681e-06, + "loss": 0.465, + "step": 8405 + }, + { + "epoch": 3.2875030871820203, + "grad_norm": 0.5199796400658511, + "learning_rate": 4.697136221855127e-06, + "loss": 0.4667, + "step": 8406 + }, + { + "epoch": 3.2878982464806126, + "grad_norm": 0.5003237789978858, + "learning_rate": 4.697061470773556e-06, + "loss": 0.457, + "step": 8407 + }, + { + "epoch": 3.288293405779205, + "grad_norm": 0.49819670220659273, + "learning_rate": 4.696986711063261e-06, + "loss": 0.4494, + "step": 8408 + }, + { + "epoch": 3.288688565077797, + "grad_norm": 0.49271546941029926, + "learning_rate": 4.696911942724538e-06, + "loss": 0.458, + "step": 8409 + }, + { + "epoch": 3.2890837243763893, + "grad_norm": 0.5002339935967224, + "learning_rate": 4.6968371657576774e-06, + "loss": 0.4463, + "step": 8410 + }, + { + "epoch": 3.2894788836749815, + "grad_norm": 0.49334667002247623, + "learning_rate": 4.696762380162975e-06, + "loss": 0.4486, + "step": 8411 + }, + { + "epoch": 3.2898740429735738, + "grad_norm": 0.496514797921124, + "learning_rate": 4.696687585940724e-06, + "loss": 0.4538, + "step": 8412 + }, + { + "epoch": 3.290269202272166, + "grad_norm": 0.4964892015506934, + "learning_rate": 4.696612783091218e-06, + "loss": 0.4445, + "step": 8413 + }, + { + "epoch": 3.2906643615707583, + "grad_norm": 0.5015291860594303, + "learning_rate": 4.696537971614751e-06, + "loss": 0.4537, + "step": 8414 + }, + { + "epoch": 3.2910595208693505, + "grad_norm": 0.4896264954525798, + "learning_rate": 4.696463151511617e-06, + "loss": 0.4516, + "step": 8415 + }, + { + "epoch": 3.2914546801679427, + "grad_norm": 0.5160967672675386, + "learning_rate": 4.696388322782108e-06, + "loss": 0.472, + "step": 8416 + }, + { + "epoch": 3.291849839466535, + "grad_norm": 0.5176580403198294, + "learning_rate": 4.69631348542652e-06, + "loss": 0.4505, + "step": 8417 + }, + { + "epoch": 3.292244998765127, + "grad_norm": 0.5070289013247464, + "learning_rate": 4.696238639445147e-06, + "loss": 0.4678, + "step": 8418 + }, + { + "epoch": 3.2926401580637195, + "grad_norm": 0.5280810466683207, + "learning_rate": 4.696163784838282e-06, + "loss": 0.4643, + "step": 8419 + }, + { + "epoch": 3.2930353173623117, + "grad_norm": 0.5182007906795619, + "learning_rate": 4.696088921606219e-06, + "loss": 0.4367, + "step": 8420 + }, + { + "epoch": 3.293430476660904, + "grad_norm": 0.49458421124688884, + "learning_rate": 4.696014049749252e-06, + "loss": 0.464, + "step": 8421 + }, + { + "epoch": 3.293825635959496, + "grad_norm": 0.5055857636265902, + "learning_rate": 4.695939169267676e-06, + "loss": 0.4492, + "step": 8422 + }, + { + "epoch": 3.2942207952580884, + "grad_norm": 0.5004862392256088, + "learning_rate": 4.695864280161784e-06, + "loss": 0.4587, + "step": 8423 + }, + { + "epoch": 3.2946159545566807, + "grad_norm": 0.484703488880412, + "learning_rate": 4.695789382431871e-06, + "loss": 0.4476, + "step": 8424 + }, + { + "epoch": 3.295011113855273, + "grad_norm": 0.4945224351875782, + "learning_rate": 4.6957144760782305e-06, + "loss": 0.4487, + "step": 8425 + }, + { + "epoch": 3.295406273153865, + "grad_norm": 0.47918483340578955, + "learning_rate": 4.695639561101156e-06, + "loss": 0.4509, + "step": 8426 + }, + { + "epoch": 3.2958014324524574, + "grad_norm": 0.523558981494999, + "learning_rate": 4.695564637500944e-06, + "loss": 0.4584, + "step": 8427 + }, + { + "epoch": 3.2961965917510496, + "grad_norm": 0.5135262004505718, + "learning_rate": 4.695489705277886e-06, + "loss": 0.4713, + "step": 8428 + }, + { + "epoch": 3.296591751049642, + "grad_norm": 0.5032512376213754, + "learning_rate": 4.695414764432278e-06, + "loss": 0.4609, + "step": 8429 + }, + { + "epoch": 3.296986910348234, + "grad_norm": 0.515575257988458, + "learning_rate": 4.6953398149644135e-06, + "loss": 0.4535, + "step": 8430 + }, + { + "epoch": 3.2973820696468263, + "grad_norm": 0.4783220407113566, + "learning_rate": 4.695264856874589e-06, + "loss": 0.4284, + "step": 8431 + }, + { + "epoch": 3.2977772289454186, + "grad_norm": 0.4996331261411462, + "learning_rate": 4.695189890163095e-06, + "loss": 0.4642, + "step": 8432 + }, + { + "epoch": 3.298172388244011, + "grad_norm": 0.4859730761571242, + "learning_rate": 4.6951149148302285e-06, + "loss": 0.4564, + "step": 8433 + }, + { + "epoch": 3.298567547542603, + "grad_norm": 0.49508609585829344, + "learning_rate": 4.695039930876285e-06, + "loss": 0.4478, + "step": 8434 + }, + { + "epoch": 3.2989627068411953, + "grad_norm": 0.5073958753301407, + "learning_rate": 4.694964938301556e-06, + "loss": 0.4498, + "step": 8435 + }, + { + "epoch": 3.2993578661397875, + "grad_norm": 0.5042008611601112, + "learning_rate": 4.694889937106338e-06, + "loss": 0.4743, + "step": 8436 + }, + { + "epoch": 3.29975302543838, + "grad_norm": 0.4995949580337922, + "learning_rate": 4.694814927290926e-06, + "loss": 0.4444, + "step": 8437 + }, + { + "epoch": 3.300148184736972, + "grad_norm": 0.5010429776804977, + "learning_rate": 4.694739908855613e-06, + "loss": 0.4519, + "step": 8438 + }, + { + "epoch": 3.3005433440355643, + "grad_norm": 0.49717228341091885, + "learning_rate": 4.6946648818006944e-06, + "loss": 0.4661, + "step": 8439 + }, + { + "epoch": 3.3009385033341565, + "grad_norm": 0.4941824510019, + "learning_rate": 4.694589846126465e-06, + "loss": 0.4616, + "step": 8440 + }, + { + "epoch": 3.3013336626327487, + "grad_norm": 0.4962369271830349, + "learning_rate": 4.69451480183322e-06, + "loss": 0.4617, + "step": 8441 + }, + { + "epoch": 3.301728821931341, + "grad_norm": 0.4973622352887221, + "learning_rate": 4.694439748921253e-06, + "loss": 0.458, + "step": 8442 + }, + { + "epoch": 3.3021239812299332, + "grad_norm": 0.5023833682592578, + "learning_rate": 4.694364687390858e-06, + "loss": 0.4733, + "step": 8443 + }, + { + "epoch": 3.3025191405285255, + "grad_norm": 0.48155367010251726, + "learning_rate": 4.694289617242331e-06, + "loss": 0.4431, + "step": 8444 + }, + { + "epoch": 3.3029142998271177, + "grad_norm": 0.494415452731697, + "learning_rate": 4.694214538475969e-06, + "loss": 0.4283, + "step": 8445 + }, + { + "epoch": 3.30330945912571, + "grad_norm": 0.47365239654389407, + "learning_rate": 4.694139451092062e-06, + "loss": 0.4505, + "step": 8446 + }, + { + "epoch": 3.303704618424302, + "grad_norm": 0.485640275190784, + "learning_rate": 4.6940643550909096e-06, + "loss": 0.4496, + "step": 8447 + }, + { + "epoch": 3.3040997777228944, + "grad_norm": 0.5680281189182768, + "learning_rate": 4.693989250472804e-06, + "loss": 0.472, + "step": 8448 + }, + { + "epoch": 3.3044949370214867, + "grad_norm": 0.5020600876380309, + "learning_rate": 4.693914137238042e-06, + "loss": 0.4475, + "step": 8449 + }, + { + "epoch": 3.304890096320079, + "grad_norm": 0.4919611952382774, + "learning_rate": 4.693839015386916e-06, + "loss": 0.4579, + "step": 8450 + }, + { + "epoch": 3.305285255618671, + "grad_norm": 0.48835887965110875, + "learning_rate": 4.6937638849197225e-06, + "loss": 0.455, + "step": 8451 + }, + { + "epoch": 3.3056804149172634, + "grad_norm": 0.5031462098533465, + "learning_rate": 4.6936887458367565e-06, + "loss": 0.4576, + "step": 8452 + }, + { + "epoch": 3.3060755742158556, + "grad_norm": 0.6061653188739415, + "learning_rate": 4.693613598138314e-06, + "loss": 0.4332, + "step": 8453 + }, + { + "epoch": 3.306470733514448, + "grad_norm": 0.4888084682391102, + "learning_rate": 4.693538441824689e-06, + "loss": 0.4495, + "step": 8454 + }, + { + "epoch": 3.30686589281304, + "grad_norm": 0.4905941903714758, + "learning_rate": 4.693463276896177e-06, + "loss": 0.4529, + "step": 8455 + }, + { + "epoch": 3.3072610521116323, + "grad_norm": 0.49490350717422127, + "learning_rate": 4.693388103353073e-06, + "loss": 0.4439, + "step": 8456 + }, + { + "epoch": 3.3076562114102246, + "grad_norm": 0.5077115025169423, + "learning_rate": 4.693312921195673e-06, + "loss": 0.4602, + "step": 8457 + }, + { + "epoch": 3.308051370708817, + "grad_norm": 0.49228501239192973, + "learning_rate": 4.693237730424272e-06, + "loss": 0.4479, + "step": 8458 + }, + { + "epoch": 3.308446530007409, + "grad_norm": 0.4929223392479759, + "learning_rate": 4.693162531039163e-06, + "loss": 0.4597, + "step": 8459 + }, + { + "epoch": 3.3088416893060013, + "grad_norm": 0.48557109126722053, + "learning_rate": 4.693087323040646e-06, + "loss": 0.4667, + "step": 8460 + }, + { + "epoch": 3.3092368486045936, + "grad_norm": 0.4829968156083153, + "learning_rate": 4.693012106429012e-06, + "loss": 0.446, + "step": 8461 + }, + { + "epoch": 3.309632007903186, + "grad_norm": 0.48460924487669355, + "learning_rate": 4.692936881204558e-06, + "loss": 0.4668, + "step": 8462 + }, + { + "epoch": 3.310027167201778, + "grad_norm": 0.4841591054021867, + "learning_rate": 4.69286164736758e-06, + "loss": 0.4478, + "step": 8463 + }, + { + "epoch": 3.3104223265003703, + "grad_norm": 0.48269716253485223, + "learning_rate": 4.692786404918374e-06, + "loss": 0.437, + "step": 8464 + }, + { + "epoch": 3.310817485798963, + "grad_norm": 0.47804342056190563, + "learning_rate": 4.692711153857233e-06, + "loss": 0.4483, + "step": 8465 + }, + { + "epoch": 3.311212645097555, + "grad_norm": 0.52764555981506, + "learning_rate": 4.6926358941844556e-06, + "loss": 0.4563, + "step": 8466 + }, + { + "epoch": 3.3116078043961474, + "grad_norm": 0.5246940150795155, + "learning_rate": 4.692560625900335e-06, + "loss": 0.4497, + "step": 8467 + }, + { + "epoch": 3.3120029636947397, + "grad_norm": 0.489990665703525, + "learning_rate": 4.6924853490051666e-06, + "loss": 0.4486, + "step": 8468 + }, + { + "epoch": 3.312398122993332, + "grad_norm": 0.4932042814676327, + "learning_rate": 4.692410063499249e-06, + "loss": 0.4372, + "step": 8469 + }, + { + "epoch": 3.312793282291924, + "grad_norm": 0.5058430347314362, + "learning_rate": 4.692334769382874e-06, + "loss": 0.4471, + "step": 8470 + }, + { + "epoch": 3.3131884415905164, + "grad_norm": 0.4913325565934741, + "learning_rate": 4.6922594666563405e-06, + "loss": 0.4635, + "step": 8471 + }, + { + "epoch": 3.3135836008891086, + "grad_norm": 0.4926565499584854, + "learning_rate": 4.692184155319943e-06, + "loss": 0.4682, + "step": 8472 + }, + { + "epoch": 3.313978760187701, + "grad_norm": 0.5075289488806716, + "learning_rate": 4.692108835373977e-06, + "loss": 0.4619, + "step": 8473 + }, + { + "epoch": 3.314373919486293, + "grad_norm": 0.4971824312123781, + "learning_rate": 4.692033506818739e-06, + "loss": 0.4394, + "step": 8474 + }, + { + "epoch": 3.3147690787848854, + "grad_norm": 0.4789459129843913, + "learning_rate": 4.691958169654524e-06, + "loss": 0.4312, + "step": 8475 + }, + { + "epoch": 3.3151642380834776, + "grad_norm": 0.4948838888029013, + "learning_rate": 4.691882823881629e-06, + "loss": 0.439, + "step": 8476 + }, + { + "epoch": 3.31555939738207, + "grad_norm": 0.4995649269101974, + "learning_rate": 4.691807469500349e-06, + "loss": 0.459, + "step": 8477 + }, + { + "epoch": 3.315954556680662, + "grad_norm": 0.48840216224269173, + "learning_rate": 4.69173210651098e-06, + "loss": 0.4614, + "step": 8478 + }, + { + "epoch": 3.3163497159792543, + "grad_norm": 0.49025639250296205, + "learning_rate": 4.691656734913818e-06, + "loss": 0.4378, + "step": 8479 + }, + { + "epoch": 3.3167448752778466, + "grad_norm": 0.4776483977009846, + "learning_rate": 4.69158135470916e-06, + "loss": 0.429, + "step": 8480 + }, + { + "epoch": 3.317140034576439, + "grad_norm": 0.4836933504478722, + "learning_rate": 4.6915059658973e-06, + "loss": 0.4487, + "step": 8481 + }, + { + "epoch": 3.317535193875031, + "grad_norm": 0.47926982677697055, + "learning_rate": 4.691430568478536e-06, + "loss": 0.4369, + "step": 8482 + }, + { + "epoch": 3.3179303531736233, + "grad_norm": 0.4824835398113746, + "learning_rate": 4.691355162453164e-06, + "loss": 0.4546, + "step": 8483 + }, + { + "epoch": 3.3183255124722155, + "grad_norm": 0.4920472483420201, + "learning_rate": 4.691279747821479e-06, + "loss": 0.443, + "step": 8484 + }, + { + "epoch": 3.3187206717708078, + "grad_norm": 0.5110190442880285, + "learning_rate": 4.691204324583777e-06, + "loss": 0.4742, + "step": 8485 + }, + { + "epoch": 3.3191158310694, + "grad_norm": 0.4917474251438137, + "learning_rate": 4.691128892740356e-06, + "loss": 0.4451, + "step": 8486 + }, + { + "epoch": 3.3195109903679922, + "grad_norm": 0.5022323736398868, + "learning_rate": 4.691053452291511e-06, + "loss": 0.4384, + "step": 8487 + }, + { + "epoch": 3.3199061496665845, + "grad_norm": 0.5093861229795713, + "learning_rate": 4.690978003237538e-06, + "loss": 0.4592, + "step": 8488 + }, + { + "epoch": 3.3203013089651767, + "grad_norm": 0.49345040595298023, + "learning_rate": 4.690902545578735e-06, + "loss": 0.4566, + "step": 8489 + }, + { + "epoch": 3.320696468263769, + "grad_norm": 0.4910573607099517, + "learning_rate": 4.690827079315397e-06, + "loss": 0.4521, + "step": 8490 + }, + { + "epoch": 3.321091627562361, + "grad_norm": 0.4847495876246576, + "learning_rate": 4.690751604447819e-06, + "loss": 0.4428, + "step": 8491 + }, + { + "epoch": 3.3214867868609534, + "grad_norm": 0.48758326536965474, + "learning_rate": 4.6906761209763e-06, + "loss": 0.4521, + "step": 8492 + }, + { + "epoch": 3.3218819461595457, + "grad_norm": 0.4916431762270958, + "learning_rate": 4.690600628901136e-06, + "loss": 0.4488, + "step": 8493 + }, + { + "epoch": 3.322277105458138, + "grad_norm": 0.4858608753350482, + "learning_rate": 4.690525128222622e-06, + "loss": 0.4405, + "step": 8494 + }, + { + "epoch": 3.32267226475673, + "grad_norm": 0.4863633181170981, + "learning_rate": 4.690449618941056e-06, + "loss": 0.4491, + "step": 8495 + }, + { + "epoch": 3.3230674240553224, + "grad_norm": 0.49316937629362895, + "learning_rate": 4.690374101056734e-06, + "loss": 0.4523, + "step": 8496 + }, + { + "epoch": 3.3234625833539146, + "grad_norm": 0.5007036354318656, + "learning_rate": 4.690298574569952e-06, + "loss": 0.4493, + "step": 8497 + }, + { + "epoch": 3.323857742652507, + "grad_norm": 0.4945456330336425, + "learning_rate": 4.690223039481008e-06, + "loss": 0.4582, + "step": 8498 + }, + { + "epoch": 3.324252901951099, + "grad_norm": 0.48559472918192703, + "learning_rate": 4.690147495790197e-06, + "loss": 0.454, + "step": 8499 + }, + { + "epoch": 3.3246480612496914, + "grad_norm": 0.5121803446284094, + "learning_rate": 4.690071943497818e-06, + "loss": 0.4726, + "step": 8500 + }, + { + "epoch": 3.3250432205482836, + "grad_norm": 0.4910672144158454, + "learning_rate": 4.689996382604165e-06, + "loss": 0.4425, + "step": 8501 + }, + { + "epoch": 3.325438379846876, + "grad_norm": 0.5094226150602034, + "learning_rate": 4.689920813109537e-06, + "loss": 0.4625, + "step": 8502 + }, + { + "epoch": 3.325833539145468, + "grad_norm": 0.48636282211631227, + "learning_rate": 4.6898452350142296e-06, + "loss": 0.4548, + "step": 8503 + }, + { + "epoch": 3.3262286984440603, + "grad_norm": 0.4995052919549064, + "learning_rate": 4.68976964831854e-06, + "loss": 0.4523, + "step": 8504 + }, + { + "epoch": 3.3266238577426526, + "grad_norm": 0.501508067836005, + "learning_rate": 4.6896940530227645e-06, + "loss": 0.4596, + "step": 8505 + }, + { + "epoch": 3.327019017041245, + "grad_norm": 0.4929069745339944, + "learning_rate": 4.6896184491272e-06, + "loss": 0.4766, + "step": 8506 + }, + { + "epoch": 3.327414176339837, + "grad_norm": 0.48439758395238547, + "learning_rate": 4.689542836632144e-06, + "loss": 0.4558, + "step": 8507 + }, + { + "epoch": 3.3278093356384293, + "grad_norm": 0.5043297803420848, + "learning_rate": 4.689467215537894e-06, + "loss": 0.4545, + "step": 8508 + }, + { + "epoch": 3.3282044949370215, + "grad_norm": 0.4918720068999591, + "learning_rate": 4.689391585844745e-06, + "loss": 0.4474, + "step": 8509 + }, + { + "epoch": 3.3285996542356138, + "grad_norm": 0.49767114366249415, + "learning_rate": 4.689315947552997e-06, + "loss": 0.44, + "step": 8510 + }, + { + "epoch": 3.328994813534206, + "grad_norm": 0.47267654346989246, + "learning_rate": 4.689240300662944e-06, + "loss": 0.444, + "step": 8511 + }, + { + "epoch": 3.3293899728327983, + "grad_norm": 0.502912384208219, + "learning_rate": 4.6891646451748855e-06, + "loss": 0.4432, + "step": 8512 + }, + { + "epoch": 3.3297851321313905, + "grad_norm": 0.49847552956035984, + "learning_rate": 4.6890889810891175e-06, + "loss": 0.4619, + "step": 8513 + }, + { + "epoch": 3.3301802914299827, + "grad_norm": 0.4986927619398265, + "learning_rate": 4.689013308405936e-06, + "loss": 0.4815, + "step": 8514 + }, + { + "epoch": 3.330575450728575, + "grad_norm": 0.481380549992846, + "learning_rate": 4.688937627125641e-06, + "loss": 0.4617, + "step": 8515 + }, + { + "epoch": 3.330970610027167, + "grad_norm": 0.4999972238019487, + "learning_rate": 4.6888619372485275e-06, + "loss": 0.4521, + "step": 8516 + }, + { + "epoch": 3.3313657693257595, + "grad_norm": 0.49828427414950494, + "learning_rate": 4.688786238774893e-06, + "loss": 0.4558, + "step": 8517 + }, + { + "epoch": 3.3317609286243517, + "grad_norm": 0.49747513646012703, + "learning_rate": 4.688710531705036e-06, + "loss": 0.4397, + "step": 8518 + }, + { + "epoch": 3.332156087922944, + "grad_norm": 0.4845769352628467, + "learning_rate": 4.688634816039253e-06, + "loss": 0.4477, + "step": 8519 + }, + { + "epoch": 3.332551247221536, + "grad_norm": 0.4955335961994695, + "learning_rate": 4.688559091777841e-06, + "loss": 0.4429, + "step": 8520 + }, + { + "epoch": 3.3329464065201284, + "grad_norm": 0.5002101662039317, + "learning_rate": 4.6884833589210984e-06, + "loss": 0.4557, + "step": 8521 + }, + { + "epoch": 3.3333415658187207, + "grad_norm": 0.5012403566636711, + "learning_rate": 4.688407617469321e-06, + "loss": 0.4539, + "step": 8522 + }, + { + "epoch": 3.333736725117313, + "grad_norm": 0.4846544545345259, + "learning_rate": 4.688331867422809e-06, + "loss": 0.4452, + "step": 8523 + }, + { + "epoch": 3.334131884415905, + "grad_norm": 0.4966381233591069, + "learning_rate": 4.688256108781858e-06, + "loss": 0.4487, + "step": 8524 + }, + { + "epoch": 3.3345270437144974, + "grad_norm": 0.5829217411242708, + "learning_rate": 4.688180341546765e-06, + "loss": 0.4482, + "step": 8525 + }, + { + "epoch": 3.3349222030130896, + "grad_norm": 0.511353842249708, + "learning_rate": 4.688104565717828e-06, + "loss": 0.4382, + "step": 8526 + }, + { + "epoch": 3.335317362311682, + "grad_norm": 0.5083341149647729, + "learning_rate": 4.688028781295346e-06, + "loss": 0.4576, + "step": 8527 + }, + { + "epoch": 3.335712521610274, + "grad_norm": 0.537344546705706, + "learning_rate": 4.687952988279615e-06, + "loss": 0.4469, + "step": 8528 + }, + { + "epoch": 3.3361076809088663, + "grad_norm": 0.5068930887011414, + "learning_rate": 4.687877186670934e-06, + "loss": 0.458, + "step": 8529 + }, + { + "epoch": 3.3365028402074586, + "grad_norm": 0.5136759214514139, + "learning_rate": 4.687801376469599e-06, + "loss": 0.4687, + "step": 8530 + }, + { + "epoch": 3.336897999506051, + "grad_norm": 0.48837862440654095, + "learning_rate": 4.687725557675909e-06, + "loss": 0.4405, + "step": 8531 + }, + { + "epoch": 3.337293158804643, + "grad_norm": 0.48565670852398274, + "learning_rate": 4.687649730290162e-06, + "loss": 0.4434, + "step": 8532 + }, + { + "epoch": 3.3376883181032353, + "grad_norm": 0.5031092299127129, + "learning_rate": 4.6875738943126544e-06, + "loss": 0.4517, + "step": 8533 + }, + { + "epoch": 3.3380834774018275, + "grad_norm": 0.5215776103880958, + "learning_rate": 4.687498049743685e-06, + "loss": 0.4554, + "step": 8534 + }, + { + "epoch": 3.3384786367004198, + "grad_norm": 0.5102826341112808, + "learning_rate": 4.687422196583552e-06, + "loss": 0.4708, + "step": 8535 + }, + { + "epoch": 3.338873795999012, + "grad_norm": 0.4978853219738474, + "learning_rate": 4.6873463348325535e-06, + "loss": 0.4542, + "step": 8536 + }, + { + "epoch": 3.3392689552976043, + "grad_norm": 0.5025990366613098, + "learning_rate": 4.687270464490986e-06, + "loss": 0.4417, + "step": 8537 + }, + { + "epoch": 3.3396641145961965, + "grad_norm": 0.49855784405646736, + "learning_rate": 4.687194585559148e-06, + "loss": 0.4601, + "step": 8538 + }, + { + "epoch": 3.3400592738947887, + "grad_norm": 0.4932797881481111, + "learning_rate": 4.687118698037338e-06, + "loss": 0.4471, + "step": 8539 + }, + { + "epoch": 3.340454433193381, + "grad_norm": 0.4854332267118237, + "learning_rate": 4.687042801925854e-06, + "loss": 0.4452, + "step": 8540 + }, + { + "epoch": 3.340849592491973, + "grad_norm": 0.5037097432876378, + "learning_rate": 4.686966897224995e-06, + "loss": 0.4677, + "step": 8541 + }, + { + "epoch": 3.3412447517905655, + "grad_norm": 0.4856578335596684, + "learning_rate": 4.686890983935057e-06, + "loss": 0.4449, + "step": 8542 + }, + { + "epoch": 3.3416399110891577, + "grad_norm": 0.507337201279644, + "learning_rate": 4.686815062056338e-06, + "loss": 0.4636, + "step": 8543 + }, + { + "epoch": 3.34203507038775, + "grad_norm": 0.5033310923165123, + "learning_rate": 4.686739131589139e-06, + "loss": 0.4504, + "step": 8544 + }, + { + "epoch": 3.342430229686342, + "grad_norm": 0.5337152714631989, + "learning_rate": 4.686663192533756e-06, + "loss": 0.4927, + "step": 8545 + }, + { + "epoch": 3.3428253889849344, + "grad_norm": 0.47904040874221143, + "learning_rate": 4.686587244890488e-06, + "loss": 0.4503, + "step": 8546 + }, + { + "epoch": 3.3432205482835267, + "grad_norm": 0.49230347712013833, + "learning_rate": 4.686511288659633e-06, + "loss": 0.4568, + "step": 8547 + }, + { + "epoch": 3.343615707582119, + "grad_norm": 0.5166799488026463, + "learning_rate": 4.686435323841489e-06, + "loss": 0.4562, + "step": 8548 + }, + { + "epoch": 3.344010866880711, + "grad_norm": 0.4809789988181581, + "learning_rate": 4.686359350436355e-06, + "loss": 0.4509, + "step": 8549 + }, + { + "epoch": 3.3444060261793034, + "grad_norm": 0.4984285848482146, + "learning_rate": 4.686283368444529e-06, + "loss": 0.4519, + "step": 8550 + }, + { + "epoch": 3.3448011854778956, + "grad_norm": 0.4914707828734001, + "learning_rate": 4.68620737786631e-06, + "loss": 0.4478, + "step": 8551 + }, + { + "epoch": 3.345196344776488, + "grad_norm": 0.5038394475829153, + "learning_rate": 4.6861313787019955e-06, + "loss": 0.4666, + "step": 8552 + }, + { + "epoch": 3.34559150407508, + "grad_norm": 0.48139717719915776, + "learning_rate": 4.686055370951884e-06, + "loss": 0.4516, + "step": 8553 + }, + { + "epoch": 3.3459866633736723, + "grad_norm": 0.48724474236998466, + "learning_rate": 4.685979354616275e-06, + "loss": 0.4405, + "step": 8554 + }, + { + "epoch": 3.3463818226722646, + "grad_norm": 0.4840451154072024, + "learning_rate": 4.685903329695467e-06, + "loss": 0.4385, + "step": 8555 + }, + { + "epoch": 3.346776981970857, + "grad_norm": 0.4881270901311099, + "learning_rate": 4.685827296189757e-06, + "loss": 0.4506, + "step": 8556 + }, + { + "epoch": 3.347172141269449, + "grad_norm": 0.4989552343260758, + "learning_rate": 4.6857512540994456e-06, + "loss": 0.4698, + "step": 8557 + }, + { + "epoch": 3.3475673005680413, + "grad_norm": 0.4808629699114711, + "learning_rate": 4.68567520342483e-06, + "loss": 0.4413, + "step": 8558 + }, + { + "epoch": 3.3479624598666335, + "grad_norm": 0.5190496818983991, + "learning_rate": 4.68559914416621e-06, + "loss": 0.4611, + "step": 8559 + }, + { + "epoch": 3.348357619165226, + "grad_norm": 0.5037425270647897, + "learning_rate": 4.685523076323882e-06, + "loss": 0.4509, + "step": 8560 + }, + { + "epoch": 3.348752778463818, + "grad_norm": 0.510585938637351, + "learning_rate": 4.685446999898148e-06, + "loss": 0.4438, + "step": 8561 + }, + { + "epoch": 3.3491479377624103, + "grad_norm": 0.4974626944458801, + "learning_rate": 4.685370914889305e-06, + "loss": 0.4606, + "step": 8562 + }, + { + "epoch": 3.3495430970610025, + "grad_norm": 0.7314950180759313, + "learning_rate": 4.685294821297652e-06, + "loss": 0.4673, + "step": 8563 + }, + { + "epoch": 3.3499382563595947, + "grad_norm": 0.49161103652361915, + "learning_rate": 4.685218719123488e-06, + "loss": 0.4424, + "step": 8564 + }, + { + "epoch": 3.350333415658187, + "grad_norm": 0.5024992958323747, + "learning_rate": 4.685142608367112e-06, + "loss": 0.4678, + "step": 8565 + }, + { + "epoch": 3.3507285749567792, + "grad_norm": 0.4938192969589111, + "learning_rate": 4.685066489028823e-06, + "loss": 0.4492, + "step": 8566 + }, + { + "epoch": 3.3511237342553715, + "grad_norm": 0.4803879916882299, + "learning_rate": 4.684990361108919e-06, + "loss": 0.4561, + "step": 8567 + }, + { + "epoch": 3.3515188935539637, + "grad_norm": 0.4949401035088218, + "learning_rate": 4.6849142246077e-06, + "loss": 0.4442, + "step": 8568 + }, + { + "epoch": 3.351914052852556, + "grad_norm": 0.5976486404806604, + "learning_rate": 4.684838079525465e-06, + "loss": 0.4277, + "step": 8569 + }, + { + "epoch": 3.352309212151148, + "grad_norm": 0.49601388755837394, + "learning_rate": 4.684761925862512e-06, + "loss": 0.4531, + "step": 8570 + }, + { + "epoch": 3.3527043714497404, + "grad_norm": 0.5152413906273675, + "learning_rate": 4.684685763619141e-06, + "loss": 0.4492, + "step": 8571 + }, + { + "epoch": 3.353099530748333, + "grad_norm": 0.49341639261673015, + "learning_rate": 4.684609592795651e-06, + "loss": 0.4489, + "step": 8572 + }, + { + "epoch": 3.3534946900469254, + "grad_norm": 0.48592662223802807, + "learning_rate": 4.684533413392341e-06, + "loss": 0.455, + "step": 8573 + }, + { + "epoch": 3.3538898493455176, + "grad_norm": 0.49332844351682664, + "learning_rate": 4.684457225409511e-06, + "loss": 0.4657, + "step": 8574 + }, + { + "epoch": 3.35428500864411, + "grad_norm": 0.5037336410016675, + "learning_rate": 4.684381028847459e-06, + "loss": 0.4659, + "step": 8575 + }, + { + "epoch": 3.354680167942702, + "grad_norm": 0.506139866036012, + "learning_rate": 4.684304823706484e-06, + "loss": 0.4483, + "step": 8576 + }, + { + "epoch": 3.3550753272412943, + "grad_norm": 0.4942492969355824, + "learning_rate": 4.6842286099868864e-06, + "loss": 0.4454, + "step": 8577 + }, + { + "epoch": 3.3554704865398866, + "grad_norm": 0.49335672939762426, + "learning_rate": 4.684152387688966e-06, + "loss": 0.4433, + "step": 8578 + }, + { + "epoch": 3.355865645838479, + "grad_norm": 0.4969284399321435, + "learning_rate": 4.6840761568130204e-06, + "loss": 0.4584, + "step": 8579 + }, + { + "epoch": 3.356260805137071, + "grad_norm": 0.4873906665965171, + "learning_rate": 4.68399991735935e-06, + "loss": 0.4597, + "step": 8580 + }, + { + "epoch": 3.3566559644356633, + "grad_norm": 0.4886120652074471, + "learning_rate": 4.6839236693282544e-06, + "loss": 0.4442, + "step": 8581 + }, + { + "epoch": 3.3570511237342555, + "grad_norm": 0.4899352867723243, + "learning_rate": 4.683847412720033e-06, + "loss": 0.4521, + "step": 8582 + }, + { + "epoch": 3.3574462830328478, + "grad_norm": 0.4933462350667174, + "learning_rate": 4.683771147534985e-06, + "loss": 0.4438, + "step": 8583 + }, + { + "epoch": 3.35784144233144, + "grad_norm": 0.4898866455801216, + "learning_rate": 4.68369487377341e-06, + "loss": 0.4564, + "step": 8584 + }, + { + "epoch": 3.3582366016300322, + "grad_norm": 0.5332125037232016, + "learning_rate": 4.683618591435607e-06, + "loss": 0.4515, + "step": 8585 + }, + { + "epoch": 3.3586317609286245, + "grad_norm": 0.5359679858136057, + "learning_rate": 4.683542300521876e-06, + "loss": 0.4486, + "step": 8586 + }, + { + "epoch": 3.3590269202272167, + "grad_norm": 0.4767570634929505, + "learning_rate": 4.6834660010325175e-06, + "loss": 0.4387, + "step": 8587 + }, + { + "epoch": 3.359422079525809, + "grad_norm": 0.5056533475750209, + "learning_rate": 4.68338969296783e-06, + "loss": 0.4585, + "step": 8588 + }, + { + "epoch": 3.359817238824401, + "grad_norm": 0.48897355907247364, + "learning_rate": 4.683313376328113e-06, + "loss": 0.4395, + "step": 8589 + }, + { + "epoch": 3.3602123981229934, + "grad_norm": 0.5203028354845993, + "learning_rate": 4.683237051113669e-06, + "loss": 0.4614, + "step": 8590 + }, + { + "epoch": 3.3606075574215857, + "grad_norm": 0.4925183013708426, + "learning_rate": 4.683160717324794e-06, + "loss": 0.4436, + "step": 8591 + }, + { + "epoch": 3.361002716720178, + "grad_norm": 0.4909350641925199, + "learning_rate": 4.683084374961789e-06, + "loss": 0.4459, + "step": 8592 + }, + { + "epoch": 3.36139787601877, + "grad_norm": 0.4990261941077317, + "learning_rate": 4.683008024024955e-06, + "loss": 0.4662, + "step": 8593 + }, + { + "epoch": 3.3617930353173624, + "grad_norm": 0.5042484386336988, + "learning_rate": 4.6829316645145905e-06, + "loss": 0.4454, + "step": 8594 + }, + { + "epoch": 3.3621881946159546, + "grad_norm": 0.5110820201232344, + "learning_rate": 4.682855296430997e-06, + "loss": 0.461, + "step": 8595 + }, + { + "epoch": 3.362583353914547, + "grad_norm": 0.5119373520344455, + "learning_rate": 4.6827789197744725e-06, + "loss": 0.4351, + "step": 8596 + }, + { + "epoch": 3.362978513213139, + "grad_norm": 0.4864074295817185, + "learning_rate": 4.682702534545318e-06, + "loss": 0.4648, + "step": 8597 + }, + { + "epoch": 3.3633736725117314, + "grad_norm": 0.5170136763217118, + "learning_rate": 4.682626140743833e-06, + "loss": 0.4485, + "step": 8598 + }, + { + "epoch": 3.3637688318103236, + "grad_norm": 0.5124926082111486, + "learning_rate": 4.682549738370319e-06, + "loss": 0.4701, + "step": 8599 + }, + { + "epoch": 3.364163991108916, + "grad_norm": 0.4920001975170289, + "learning_rate": 4.682473327425074e-06, + "loss": 0.4648, + "step": 8600 + }, + { + "epoch": 3.364559150407508, + "grad_norm": 0.4937153316476773, + "learning_rate": 4.682396907908399e-06, + "loss": 0.442, + "step": 8601 + }, + { + "epoch": 3.3649543097061003, + "grad_norm": 0.48028050782341225, + "learning_rate": 4.682320479820595e-06, + "loss": 0.4459, + "step": 8602 + }, + { + "epoch": 3.3653494690046926, + "grad_norm": 0.5216497378987135, + "learning_rate": 4.68224404316196e-06, + "loss": 0.4581, + "step": 8603 + }, + { + "epoch": 3.365744628303285, + "grad_norm": 0.5014704905655514, + "learning_rate": 4.682167597932797e-06, + "loss": 0.4563, + "step": 8604 + }, + { + "epoch": 3.366139787601877, + "grad_norm": 0.478995486285583, + "learning_rate": 4.682091144133404e-06, + "loss": 0.4382, + "step": 8605 + }, + { + "epoch": 3.3665349469004693, + "grad_norm": 0.5182832142253619, + "learning_rate": 4.682014681764082e-06, + "loss": 0.459, + "step": 8606 + }, + { + "epoch": 3.3669301061990615, + "grad_norm": 0.4961453547341973, + "learning_rate": 4.6819382108251316e-06, + "loss": 0.4501, + "step": 8607 + }, + { + "epoch": 3.3673252654976538, + "grad_norm": 0.5111669516032571, + "learning_rate": 4.681861731316852e-06, + "loss": 0.4616, + "step": 8608 + }, + { + "epoch": 3.367720424796246, + "grad_norm": 0.49667736409059116, + "learning_rate": 4.681785243239546e-06, + "loss": 0.4476, + "step": 8609 + }, + { + "epoch": 3.3681155840948382, + "grad_norm": 0.509991731084816, + "learning_rate": 4.681708746593511e-06, + "loss": 0.4551, + "step": 8610 + }, + { + "epoch": 3.3685107433934305, + "grad_norm": 0.4861532008939771, + "learning_rate": 4.6816322413790495e-06, + "loss": 0.4551, + "step": 8611 + }, + { + "epoch": 3.3689059026920227, + "grad_norm": 0.47927743937701206, + "learning_rate": 4.6815557275964605e-06, + "loss": 0.4407, + "step": 8612 + }, + { + "epoch": 3.369301061990615, + "grad_norm": 0.48329305038335735, + "learning_rate": 4.681479205246047e-06, + "loss": 0.4317, + "step": 8613 + }, + { + "epoch": 3.369696221289207, + "grad_norm": 0.5557400243821855, + "learning_rate": 4.681402674328106e-06, + "loss": 0.4298, + "step": 8614 + }, + { + "epoch": 3.3700913805877994, + "grad_norm": 0.516015662888361, + "learning_rate": 4.681326134842941e-06, + "loss": 0.4399, + "step": 8615 + }, + { + "epoch": 3.3704865398863917, + "grad_norm": 0.51146416218406, + "learning_rate": 4.681249586790851e-06, + "loss": 0.4549, + "step": 8616 + }, + { + "epoch": 3.370881699184984, + "grad_norm": 0.5148904737293851, + "learning_rate": 4.681173030172138e-06, + "loss": 0.4503, + "step": 8617 + }, + { + "epoch": 3.371276858483576, + "grad_norm": 0.47920716436293814, + "learning_rate": 4.681096464987101e-06, + "loss": 0.4567, + "step": 8618 + }, + { + "epoch": 3.3716720177821684, + "grad_norm": 0.48667328112391944, + "learning_rate": 4.681019891236042e-06, + "loss": 0.4422, + "step": 8619 + }, + { + "epoch": 3.3720671770807606, + "grad_norm": 0.48135580887321766, + "learning_rate": 4.680943308919261e-06, + "loss": 0.4675, + "step": 8620 + }, + { + "epoch": 3.372462336379353, + "grad_norm": 0.5032541309680445, + "learning_rate": 4.680866718037058e-06, + "loss": 0.4648, + "step": 8621 + }, + { + "epoch": 3.372857495677945, + "grad_norm": 0.4811650608559039, + "learning_rate": 4.680790118589737e-06, + "loss": 0.4447, + "step": 8622 + }, + { + "epoch": 3.3732526549765374, + "grad_norm": 0.5466985608523096, + "learning_rate": 4.680713510577594e-06, + "loss": 0.457, + "step": 8623 + }, + { + "epoch": 3.3736478142751296, + "grad_norm": 0.5592565607971153, + "learning_rate": 4.680636894000935e-06, + "loss": 0.4449, + "step": 8624 + }, + { + "epoch": 3.374042973573722, + "grad_norm": 0.504991945313021, + "learning_rate": 4.680560268860057e-06, + "loss": 0.4542, + "step": 8625 + }, + { + "epoch": 3.374438132872314, + "grad_norm": 0.48890150098264806, + "learning_rate": 4.680483635155263e-06, + "loss": 0.4502, + "step": 8626 + }, + { + "epoch": 3.3748332921709063, + "grad_norm": 0.4943593439450679, + "learning_rate": 4.680406992886854e-06, + "loss": 0.4404, + "step": 8627 + }, + { + "epoch": 3.3752284514694986, + "grad_norm": 0.5332119219072962, + "learning_rate": 4.680330342055129e-06, + "loss": 0.4288, + "step": 8628 + }, + { + "epoch": 3.375623610768091, + "grad_norm": 0.4841185179406501, + "learning_rate": 4.68025368266039e-06, + "loss": 0.4506, + "step": 8629 + }, + { + "epoch": 3.376018770066683, + "grad_norm": 0.523621408569861, + "learning_rate": 4.68017701470294e-06, + "loss": 0.4586, + "step": 8630 + }, + { + "epoch": 3.3764139293652753, + "grad_norm": 0.46451406844891363, + "learning_rate": 4.680100338183078e-06, + "loss": 0.4332, + "step": 8631 + }, + { + "epoch": 3.3768090886638675, + "grad_norm": 0.4905181028363597, + "learning_rate": 4.6800236531011055e-06, + "loss": 0.4411, + "step": 8632 + }, + { + "epoch": 3.3772042479624598, + "grad_norm": 0.502652462267723, + "learning_rate": 4.679946959457325e-06, + "loss": 0.4693, + "step": 8633 + }, + { + "epoch": 3.377599407261052, + "grad_norm": 0.4911574856877145, + "learning_rate": 4.6798702572520345e-06, + "loss": 0.44, + "step": 8634 + }, + { + "epoch": 3.3779945665596443, + "grad_norm": 0.5052945482135779, + "learning_rate": 4.6797935464855385e-06, + "loss": 0.4649, + "step": 8635 + }, + { + "epoch": 3.3783897258582365, + "grad_norm": 0.48569037287035166, + "learning_rate": 4.679716827158137e-06, + "loss": 0.4587, + "step": 8636 + }, + { + "epoch": 3.3787848851568287, + "grad_norm": 0.5041362195084776, + "learning_rate": 4.6796400992701314e-06, + "loss": 0.4767, + "step": 8637 + }, + { + "epoch": 3.379180044455421, + "grad_norm": 0.4813960654671574, + "learning_rate": 4.679563362821823e-06, + "loss": 0.4568, + "step": 8638 + }, + { + "epoch": 3.379575203754013, + "grad_norm": 0.5088131032990694, + "learning_rate": 4.679486617813513e-06, + "loss": 0.4435, + "step": 8639 + }, + { + "epoch": 3.3799703630526055, + "grad_norm": 0.4861775857012652, + "learning_rate": 4.679409864245503e-06, + "loss": 0.4632, + "step": 8640 + }, + { + "epoch": 3.3803655223511977, + "grad_norm": 0.490586606299979, + "learning_rate": 4.679333102118095e-06, + "loss": 0.4594, + "step": 8641 + }, + { + "epoch": 3.38076068164979, + "grad_norm": 0.4932899496063389, + "learning_rate": 4.67925633143159e-06, + "loss": 0.4725, + "step": 8642 + }, + { + "epoch": 3.381155840948382, + "grad_norm": 0.5069803709886499, + "learning_rate": 4.679179552186289e-06, + "loss": 0.4608, + "step": 8643 + }, + { + "epoch": 3.3815510002469744, + "grad_norm": 0.4788990174095867, + "learning_rate": 4.679102764382494e-06, + "loss": 0.4577, + "step": 8644 + }, + { + "epoch": 3.3819461595455667, + "grad_norm": 0.49268285139092627, + "learning_rate": 4.6790259680205064e-06, + "loss": 0.463, + "step": 8645 + }, + { + "epoch": 3.382341318844159, + "grad_norm": 0.48644575261707157, + "learning_rate": 4.678949163100629e-06, + "loss": 0.4608, + "step": 8646 + }, + { + "epoch": 3.382736478142751, + "grad_norm": 0.4884652879626933, + "learning_rate": 4.678872349623161e-06, + "loss": 0.4498, + "step": 8647 + }, + { + "epoch": 3.3831316374413434, + "grad_norm": 0.4981392757751588, + "learning_rate": 4.678795527588407e-06, + "loss": 0.4554, + "step": 8648 + }, + { + "epoch": 3.3835267967399356, + "grad_norm": 0.4825160629921625, + "learning_rate": 4.678718696996666e-06, + "loss": 0.447, + "step": 8649 + }, + { + "epoch": 3.383921956038528, + "grad_norm": 0.496410984109344, + "learning_rate": 4.678641857848241e-06, + "loss": 0.4894, + "step": 8650 + }, + { + "epoch": 3.38431711533712, + "grad_norm": 0.48757622681326085, + "learning_rate": 4.6785650101434335e-06, + "loss": 0.4523, + "step": 8651 + }, + { + "epoch": 3.3847122746357123, + "grad_norm": 0.49958339915552713, + "learning_rate": 4.678488153882546e-06, + "loss": 0.4837, + "step": 8652 + }, + { + "epoch": 3.3851074339343046, + "grad_norm": 0.48280083745483665, + "learning_rate": 4.67841128906588e-06, + "loss": 0.4733, + "step": 8653 + }, + { + "epoch": 3.385502593232897, + "grad_norm": 0.4827487271721312, + "learning_rate": 4.6783344156937375e-06, + "loss": 0.4551, + "step": 8654 + }, + { + "epoch": 3.3858977525314895, + "grad_norm": 0.5135331354845594, + "learning_rate": 4.67825753376642e-06, + "loss": 0.4505, + "step": 8655 + }, + { + "epoch": 3.3862929118300817, + "grad_norm": 0.4950284195537903, + "learning_rate": 4.678180643284229e-06, + "loss": 0.4456, + "step": 8656 + }, + { + "epoch": 3.386688071128674, + "grad_norm": 0.49134952539015186, + "learning_rate": 4.678103744247468e-06, + "loss": 0.4536, + "step": 8657 + }, + { + "epoch": 3.3870832304272662, + "grad_norm": 0.5177519290341787, + "learning_rate": 4.678026836656437e-06, + "loss": 0.442, + "step": 8658 + }, + { + "epoch": 3.3874783897258585, + "grad_norm": 0.5228271177583046, + "learning_rate": 4.6779499205114406e-06, + "loss": 0.4477, + "step": 8659 + }, + { + "epoch": 3.3878735490244507, + "grad_norm": 0.48410785050252503, + "learning_rate": 4.677872995812778e-06, + "loss": 0.4512, + "step": 8660 + }, + { + "epoch": 3.388268708323043, + "grad_norm": 0.4710781351956366, + "learning_rate": 4.677796062560753e-06, + "loss": 0.4506, + "step": 8661 + }, + { + "epoch": 3.388663867621635, + "grad_norm": 0.48111306966615963, + "learning_rate": 4.677719120755669e-06, + "loss": 0.4598, + "step": 8662 + }, + { + "epoch": 3.3890590269202274, + "grad_norm": 0.4720947283760529, + "learning_rate": 4.6776421703978245e-06, + "loss": 0.4468, + "step": 8663 + }, + { + "epoch": 3.3894541862188197, + "grad_norm": 0.49393506201536114, + "learning_rate": 4.677565211487526e-06, + "loss": 0.4379, + "step": 8664 + }, + { + "epoch": 3.389849345517412, + "grad_norm": 0.48345686255173453, + "learning_rate": 4.677488244025072e-06, + "loss": 0.458, + "step": 8665 + }, + { + "epoch": 3.390244504816004, + "grad_norm": 0.490727505465697, + "learning_rate": 4.677411268010768e-06, + "loss": 0.4772, + "step": 8666 + }, + { + "epoch": 3.3906396641145964, + "grad_norm": 0.5034772743200914, + "learning_rate": 4.677334283444913e-06, + "loss": 0.4652, + "step": 8667 + }, + { + "epoch": 3.3910348234131886, + "grad_norm": 0.49435834675571205, + "learning_rate": 4.677257290327812e-06, + "loss": 0.4475, + "step": 8668 + }, + { + "epoch": 3.391429982711781, + "grad_norm": 0.5301178874307636, + "learning_rate": 4.677180288659766e-06, + "loss": 0.4683, + "step": 8669 + }, + { + "epoch": 3.391825142010373, + "grad_norm": 0.5010243323234773, + "learning_rate": 4.677103278441079e-06, + "loss": 0.4642, + "step": 8670 + }, + { + "epoch": 3.3922203013089653, + "grad_norm": 0.4885105136781966, + "learning_rate": 4.67702625967205e-06, + "loss": 0.4411, + "step": 8671 + }, + { + "epoch": 3.3926154606075576, + "grad_norm": 0.5011611192314379, + "learning_rate": 4.6769492323529865e-06, + "loss": 0.4517, + "step": 8672 + }, + { + "epoch": 3.39301061990615, + "grad_norm": 0.4939700998530884, + "learning_rate": 4.6768721964841865e-06, + "loss": 0.4498, + "step": 8673 + }, + { + "epoch": 3.393405779204742, + "grad_norm": 0.4954624935938974, + "learning_rate": 4.676795152065955e-06, + "loss": 0.4407, + "step": 8674 + }, + { + "epoch": 3.3938009385033343, + "grad_norm": 0.5119224790406571, + "learning_rate": 4.676718099098594e-06, + "loss": 0.4485, + "step": 8675 + }, + { + "epoch": 3.3941960978019265, + "grad_norm": 0.49705423434443835, + "learning_rate": 4.676641037582407e-06, + "loss": 0.4382, + "step": 8676 + }, + { + "epoch": 3.394591257100519, + "grad_norm": 0.4752730371851217, + "learning_rate": 4.676563967517694e-06, + "loss": 0.4412, + "step": 8677 + }, + { + "epoch": 3.394986416399111, + "grad_norm": 0.49012430064946205, + "learning_rate": 4.67648688890476e-06, + "loss": 0.4519, + "step": 8678 + }, + { + "epoch": 3.3953815756977033, + "grad_norm": 0.5050443098707427, + "learning_rate": 4.676409801743908e-06, + "loss": 0.4619, + "step": 8679 + }, + { + "epoch": 3.3957767349962955, + "grad_norm": 0.5049807653407213, + "learning_rate": 4.676332706035439e-06, + "loss": 0.4584, + "step": 8680 + }, + { + "epoch": 3.3961718942948877, + "grad_norm": 0.4838150340724149, + "learning_rate": 4.676255601779656e-06, + "loss": 0.4453, + "step": 8681 + }, + { + "epoch": 3.39656705359348, + "grad_norm": 0.4860542133541303, + "learning_rate": 4.676178488976864e-06, + "loss": 0.4621, + "step": 8682 + }, + { + "epoch": 3.3969622128920722, + "grad_norm": 0.4982361920417968, + "learning_rate": 4.676101367627364e-06, + "loss": 0.4705, + "step": 8683 + }, + { + "epoch": 3.3973573721906645, + "grad_norm": 0.4923037972317949, + "learning_rate": 4.676024237731459e-06, + "loss": 0.4545, + "step": 8684 + }, + { + "epoch": 3.3977525314892567, + "grad_norm": 0.5050487317688797, + "learning_rate": 4.675947099289452e-06, + "loss": 0.4528, + "step": 8685 + }, + { + "epoch": 3.398147690787849, + "grad_norm": 0.506471973724114, + "learning_rate": 4.675869952301646e-06, + "loss": 0.4753, + "step": 8686 + }, + { + "epoch": 3.398542850086441, + "grad_norm": 0.4709433859416677, + "learning_rate": 4.675792796768344e-06, + "loss": 0.445, + "step": 8687 + }, + { + "epoch": 3.3989380093850334, + "grad_norm": 0.49103621457028523, + "learning_rate": 4.67571563268985e-06, + "loss": 0.4651, + "step": 8688 + }, + { + "epoch": 3.3993331686836257, + "grad_norm": 0.490208526893036, + "learning_rate": 4.675638460066465e-06, + "loss": 0.4492, + "step": 8689 + }, + { + "epoch": 3.399728327982218, + "grad_norm": 0.48711510354854665, + "learning_rate": 4.675561278898494e-06, + "loss": 0.4417, + "step": 8690 + }, + { + "epoch": 3.40012348728081, + "grad_norm": 0.48830265701534586, + "learning_rate": 4.675484089186239e-06, + "loss": 0.449, + "step": 8691 + }, + { + "epoch": 3.4005186465794024, + "grad_norm": 0.4779197088460178, + "learning_rate": 4.6754068909300044e-06, + "loss": 0.4609, + "step": 8692 + }, + { + "epoch": 3.4009138058779946, + "grad_norm": 0.5000217624862712, + "learning_rate": 4.675329684130091e-06, + "loss": 0.4776, + "step": 8693 + }, + { + "epoch": 3.401308965176587, + "grad_norm": 0.515046165717711, + "learning_rate": 4.675252468786805e-06, + "loss": 0.4721, + "step": 8694 + }, + { + "epoch": 3.401704124475179, + "grad_norm": 0.4887957168364599, + "learning_rate": 4.675175244900447e-06, + "loss": 0.4457, + "step": 8695 + }, + { + "epoch": 3.4020992837737714, + "grad_norm": 0.48483411104988006, + "learning_rate": 4.675098012471322e-06, + "loss": 0.459, + "step": 8696 + }, + { + "epoch": 3.4024944430723636, + "grad_norm": 0.49370445185103784, + "learning_rate": 4.675020771499733e-06, + "loss": 0.4517, + "step": 8697 + }, + { + "epoch": 3.402889602370956, + "grad_norm": 0.5232425366645843, + "learning_rate": 4.6749435219859825e-06, + "loss": 0.4522, + "step": 8698 + }, + { + "epoch": 3.403284761669548, + "grad_norm": 0.47812677963839323, + "learning_rate": 4.674866263930375e-06, + "loss": 0.4373, + "step": 8699 + }, + { + "epoch": 3.4036799209681403, + "grad_norm": 0.50114037166455, + "learning_rate": 4.674788997333214e-06, + "loss": 0.4761, + "step": 8700 + }, + { + "epoch": 3.4040750802667326, + "grad_norm": 0.4990732251105035, + "learning_rate": 4.674711722194802e-06, + "loss": 0.4305, + "step": 8701 + }, + { + "epoch": 3.404470239565325, + "grad_norm": 0.4919053285526375, + "learning_rate": 4.674634438515443e-06, + "loss": 0.4596, + "step": 8702 + }, + { + "epoch": 3.404865398863917, + "grad_norm": 0.500445117775505, + "learning_rate": 4.6745571462954395e-06, + "loss": 0.4375, + "step": 8703 + }, + { + "epoch": 3.4052605581625093, + "grad_norm": 0.48686796875270305, + "learning_rate": 4.674479845535097e-06, + "loss": 0.4523, + "step": 8704 + }, + { + "epoch": 3.4056557174611015, + "grad_norm": 0.5030201027639913, + "learning_rate": 4.6744025362347175e-06, + "loss": 0.4715, + "step": 8705 + }, + { + "epoch": 3.4060508767596938, + "grad_norm": 0.4858539869831347, + "learning_rate": 4.674325218394606e-06, + "loss": 0.4542, + "step": 8706 + }, + { + "epoch": 3.406446036058286, + "grad_norm": 0.48744591028154344, + "learning_rate": 4.674247892015065e-06, + "loss": 0.4632, + "step": 8707 + }, + { + "epoch": 3.4068411953568782, + "grad_norm": 0.5305591582260777, + "learning_rate": 4.674170557096398e-06, + "loss": 0.4471, + "step": 8708 + }, + { + "epoch": 3.4072363546554705, + "grad_norm": 0.4896593869094789, + "learning_rate": 4.674093213638909e-06, + "loss": 0.4546, + "step": 8709 + }, + { + "epoch": 3.4076315139540627, + "grad_norm": 0.5193419476995951, + "learning_rate": 4.674015861642903e-06, + "loss": 0.4572, + "step": 8710 + }, + { + "epoch": 3.408026673252655, + "grad_norm": 0.5029397565244852, + "learning_rate": 4.673938501108684e-06, + "loss": 0.4718, + "step": 8711 + }, + { + "epoch": 3.408421832551247, + "grad_norm": 0.4966168365599225, + "learning_rate": 4.673861132036552e-06, + "loss": 0.4393, + "step": 8712 + }, + { + "epoch": 3.4088169918498394, + "grad_norm": 0.48279757458338685, + "learning_rate": 4.673783754426815e-06, + "loss": 0.4432, + "step": 8713 + }, + { + "epoch": 3.4092121511484317, + "grad_norm": 0.47848792729340384, + "learning_rate": 4.673706368279775e-06, + "loss": 0.4654, + "step": 8714 + }, + { + "epoch": 3.409607310447024, + "grad_norm": 0.48502060720503026, + "learning_rate": 4.6736289735957365e-06, + "loss": 0.4351, + "step": 8715 + }, + { + "epoch": 3.410002469745616, + "grad_norm": 0.517902551374955, + "learning_rate": 4.673551570375003e-06, + "loss": 0.4608, + "step": 8716 + }, + { + "epoch": 3.4103976290442084, + "grad_norm": 0.4959576763379469, + "learning_rate": 4.673474158617879e-06, + "loss": 0.4575, + "step": 8717 + }, + { + "epoch": 3.4107927883428006, + "grad_norm": 0.487828895869595, + "learning_rate": 4.673396738324668e-06, + "loss": 0.4381, + "step": 8718 + }, + { + "epoch": 3.411187947641393, + "grad_norm": 0.4921131542323153, + "learning_rate": 4.6733193094956755e-06, + "loss": 0.4618, + "step": 8719 + }, + { + "epoch": 3.411583106939985, + "grad_norm": 0.489989732392488, + "learning_rate": 4.6732418721312036e-06, + "loss": 0.4712, + "step": 8720 + }, + { + "epoch": 3.4119782662385774, + "grad_norm": 0.48021030964681694, + "learning_rate": 4.6731644262315575e-06, + "loss": 0.4398, + "step": 8721 + }, + { + "epoch": 3.4123734255371696, + "grad_norm": 0.5196748841661029, + "learning_rate": 4.673086971797041e-06, + "loss": 0.4569, + "step": 8722 + }, + { + "epoch": 3.412768584835762, + "grad_norm": 0.6391029638584853, + "learning_rate": 4.673009508827958e-06, + "loss": 0.4584, + "step": 8723 + }, + { + "epoch": 3.413163744134354, + "grad_norm": 0.49223316026383734, + "learning_rate": 4.672932037324614e-06, + "loss": 0.4523, + "step": 8724 + }, + { + "epoch": 3.4135589034329463, + "grad_norm": 0.4865710928676197, + "learning_rate": 4.672854557287312e-06, + "loss": 0.4411, + "step": 8725 + }, + { + "epoch": 3.4139540627315386, + "grad_norm": 0.5065983068219876, + "learning_rate": 4.672777068716357e-06, + "loss": 0.4726, + "step": 8726 + }, + { + "epoch": 3.414349222030131, + "grad_norm": 0.49208422147691183, + "learning_rate": 4.672699571612053e-06, + "loss": 0.4772, + "step": 8727 + }, + { + "epoch": 3.414744381328723, + "grad_norm": 0.4883146901724943, + "learning_rate": 4.6726220659747035e-06, + "loss": 0.4645, + "step": 8728 + }, + { + "epoch": 3.4151395406273153, + "grad_norm": 0.4873411772334921, + "learning_rate": 4.672544551804615e-06, + "loss": 0.453, + "step": 8729 + }, + { + "epoch": 3.4155346999259075, + "grad_norm": 0.4812888051291896, + "learning_rate": 4.67246702910209e-06, + "loss": 0.4401, + "step": 8730 + }, + { + "epoch": 3.4159298592244998, + "grad_norm": 0.48596379658909483, + "learning_rate": 4.6723894978674344e-06, + "loss": 0.4595, + "step": 8731 + }, + { + "epoch": 3.416325018523092, + "grad_norm": 0.48806164094782617, + "learning_rate": 4.672311958100952e-06, + "loss": 0.4559, + "step": 8732 + }, + { + "epoch": 3.4167201778216842, + "grad_norm": 0.5278943342209049, + "learning_rate": 4.672234409802946e-06, + "loss": 0.4598, + "step": 8733 + }, + { + "epoch": 3.4171153371202765, + "grad_norm": 0.48766632378299773, + "learning_rate": 4.672156852973724e-06, + "loss": 0.447, + "step": 8734 + }, + { + "epoch": 3.4175104964188687, + "grad_norm": 0.4998558934415389, + "learning_rate": 4.672079287613588e-06, + "loss": 0.466, + "step": 8735 + }, + { + "epoch": 3.417905655717461, + "grad_norm": 0.5161927705054742, + "learning_rate": 4.672001713722844e-06, + "loss": 0.4512, + "step": 8736 + }, + { + "epoch": 3.418300815016053, + "grad_norm": 0.4993835199850496, + "learning_rate": 4.671924131301795e-06, + "loss": 0.4667, + "step": 8737 + }, + { + "epoch": 3.4186959743146454, + "grad_norm": 0.5111345200683314, + "learning_rate": 4.671846540350748e-06, + "loss": 0.4669, + "step": 8738 + }, + { + "epoch": 3.4190911336132377, + "grad_norm": 0.4968151170656032, + "learning_rate": 4.6717689408700065e-06, + "loss": 0.4415, + "step": 8739 + }, + { + "epoch": 3.41948629291183, + "grad_norm": 0.48927357809806327, + "learning_rate": 4.671691332859875e-06, + "loss": 0.4454, + "step": 8740 + }, + { + "epoch": 3.419881452210422, + "grad_norm": 0.4986943442895406, + "learning_rate": 4.671613716320658e-06, + "loss": 0.4578, + "step": 8741 + }, + { + "epoch": 3.4202766115090144, + "grad_norm": 0.507640735251368, + "learning_rate": 4.671536091252662e-06, + "loss": 0.4708, + "step": 8742 + }, + { + "epoch": 3.4206717708076066, + "grad_norm": 0.4864374605398443, + "learning_rate": 4.671458457656191e-06, + "loss": 0.4459, + "step": 8743 + }, + { + "epoch": 3.421066930106199, + "grad_norm": 0.48587924372525426, + "learning_rate": 4.671380815531549e-06, + "loss": 0.4556, + "step": 8744 + }, + { + "epoch": 3.421462089404791, + "grad_norm": 0.49368596535285864, + "learning_rate": 4.671303164879043e-06, + "loss": 0.4518, + "step": 8745 + }, + { + "epoch": 3.4218572487033834, + "grad_norm": 0.49585373553596823, + "learning_rate": 4.671225505698975e-06, + "loss": 0.4557, + "step": 8746 + }, + { + "epoch": 3.4222524080019756, + "grad_norm": 0.48924156726844276, + "learning_rate": 4.671147837991653e-06, + "loss": 0.4598, + "step": 8747 + }, + { + "epoch": 3.422647567300568, + "grad_norm": 0.5066355289955341, + "learning_rate": 4.67107016175738e-06, + "loss": 0.4619, + "step": 8748 + }, + { + "epoch": 3.42304272659916, + "grad_norm": 0.4998574490378027, + "learning_rate": 4.670992476996462e-06, + "loss": 0.4628, + "step": 8749 + }, + { + "epoch": 3.4234378858977523, + "grad_norm": 0.4926776349012955, + "learning_rate": 4.670914783709203e-06, + "loss": 0.4533, + "step": 8750 + }, + { + "epoch": 3.4238330451963446, + "grad_norm": 0.48942538427023524, + "learning_rate": 4.6708370818959105e-06, + "loss": 0.4497, + "step": 8751 + }, + { + "epoch": 3.424228204494937, + "grad_norm": 0.48790555324765683, + "learning_rate": 4.670759371556886e-06, + "loss": 0.4497, + "step": 8752 + }, + { + "epoch": 3.424623363793529, + "grad_norm": 0.5083233533489905, + "learning_rate": 4.670681652692439e-06, + "loss": 0.4488, + "step": 8753 + }, + { + "epoch": 3.4250185230921213, + "grad_norm": 0.5216951615222701, + "learning_rate": 4.670603925302871e-06, + "loss": 0.4747, + "step": 8754 + }, + { + "epoch": 3.4254136823907135, + "grad_norm": 0.49070908525677087, + "learning_rate": 4.6705261893884904e-06, + "loss": 0.4622, + "step": 8755 + }, + { + "epoch": 3.4258088416893058, + "grad_norm": 0.4914126193159798, + "learning_rate": 4.6704484449496e-06, + "loss": 0.4628, + "step": 8756 + }, + { + "epoch": 3.426204000987898, + "grad_norm": 0.49544497930427484, + "learning_rate": 4.670370691986507e-06, + "loss": 0.4443, + "step": 8757 + }, + { + "epoch": 3.4265991602864903, + "grad_norm": 0.509745151929745, + "learning_rate": 4.670292930499515e-06, + "loss": 0.4639, + "step": 8758 + }, + { + "epoch": 3.4269943195850825, + "grad_norm": 0.4992599417209327, + "learning_rate": 4.67021516048893e-06, + "loss": 0.4588, + "step": 8759 + }, + { + "epoch": 3.4273894788836747, + "grad_norm": 0.48349844003574116, + "learning_rate": 4.670137381955058e-06, + "loss": 0.4485, + "step": 8760 + }, + { + "epoch": 3.4277846381822674, + "grad_norm": 0.492075989647956, + "learning_rate": 4.670059594898204e-06, + "loss": 0.4503, + "step": 8761 + }, + { + "epoch": 3.4281797974808597, + "grad_norm": 0.48934117903372404, + "learning_rate": 4.669981799318673e-06, + "loss": 0.4554, + "step": 8762 + }, + { + "epoch": 3.428574956779452, + "grad_norm": 0.4878416543021339, + "learning_rate": 4.669903995216772e-06, + "loss": 0.4563, + "step": 8763 + }, + { + "epoch": 3.428970116078044, + "grad_norm": 0.4967785153968765, + "learning_rate": 4.669826182592806e-06, + "loss": 0.4613, + "step": 8764 + }, + { + "epoch": 3.4293652753766364, + "grad_norm": 0.4892337802144703, + "learning_rate": 4.669748361447081e-06, + "loss": 0.4604, + "step": 8765 + }, + { + "epoch": 3.4297604346752286, + "grad_norm": 0.5013718521706811, + "learning_rate": 4.6696705317799e-06, + "loss": 0.4558, + "step": 8766 + }, + { + "epoch": 3.430155593973821, + "grad_norm": 0.5039804223697372, + "learning_rate": 4.669592693591571e-06, + "loss": 0.4654, + "step": 8767 + }, + { + "epoch": 3.430550753272413, + "grad_norm": 0.5028272637505543, + "learning_rate": 4.669514846882401e-06, + "loss": 0.4487, + "step": 8768 + }, + { + "epoch": 3.4309459125710053, + "grad_norm": 0.47464547478018637, + "learning_rate": 4.669436991652693e-06, + "loss": 0.4574, + "step": 8769 + }, + { + "epoch": 3.4313410718695976, + "grad_norm": 0.4998838948907234, + "learning_rate": 4.6693591279027535e-06, + "loss": 0.461, + "step": 8770 + }, + { + "epoch": 3.43173623116819, + "grad_norm": 0.4955573106154294, + "learning_rate": 4.669281255632889e-06, + "loss": 0.4541, + "step": 8771 + }, + { + "epoch": 3.432131390466782, + "grad_norm": 0.5161307923252046, + "learning_rate": 4.669203374843405e-06, + "loss": 0.4616, + "step": 8772 + }, + { + "epoch": 3.4325265497653743, + "grad_norm": 0.4873616543396202, + "learning_rate": 4.669125485534608e-06, + "loss": 0.4531, + "step": 8773 + }, + { + "epoch": 3.4329217090639665, + "grad_norm": 0.4914250914065892, + "learning_rate": 4.669047587706803e-06, + "loss": 0.4371, + "step": 8774 + }, + { + "epoch": 3.433316868362559, + "grad_norm": 0.49013758880173275, + "learning_rate": 4.668969681360295e-06, + "loss": 0.455, + "step": 8775 + }, + { + "epoch": 3.433712027661151, + "grad_norm": 0.4935136679299746, + "learning_rate": 4.668891766495393e-06, + "loss": 0.4581, + "step": 8776 + }, + { + "epoch": 3.4341071869597433, + "grad_norm": 0.5068462598993336, + "learning_rate": 4.6688138431124e-06, + "loss": 0.4592, + "step": 8777 + }, + { + "epoch": 3.4345023462583355, + "grad_norm": 0.5140101571522524, + "learning_rate": 4.668735911211623e-06, + "loss": 0.44, + "step": 8778 + }, + { + "epoch": 3.4348975055569277, + "grad_norm": 0.48453766176716734, + "learning_rate": 4.668657970793369e-06, + "loss": 0.4403, + "step": 8779 + }, + { + "epoch": 3.43529266485552, + "grad_norm": 0.5039231232251424, + "learning_rate": 4.668580021857943e-06, + "loss": 0.4624, + "step": 8780 + }, + { + "epoch": 3.4356878241541122, + "grad_norm": 0.5007826918099934, + "learning_rate": 4.668502064405651e-06, + "loss": 0.4575, + "step": 8781 + }, + { + "epoch": 3.4360829834527045, + "grad_norm": 0.4847249820915745, + "learning_rate": 4.6684240984368005e-06, + "loss": 0.4504, + "step": 8782 + }, + { + "epoch": 3.4364781427512967, + "grad_norm": 0.5305724135544032, + "learning_rate": 4.668346123951696e-06, + "loss": 0.4301, + "step": 8783 + }, + { + "epoch": 3.436873302049889, + "grad_norm": 0.4873670692094404, + "learning_rate": 4.668268140950646e-06, + "loss": 0.4645, + "step": 8784 + }, + { + "epoch": 3.437268461348481, + "grad_norm": 0.4961011367709567, + "learning_rate": 4.668190149433955e-06, + "loss": 0.4574, + "step": 8785 + }, + { + "epoch": 3.4376636206470734, + "grad_norm": 0.519896601726298, + "learning_rate": 4.668112149401928e-06, + "loss": 0.4673, + "step": 8786 + }, + { + "epoch": 3.4380587799456657, + "grad_norm": 0.5092646275519088, + "learning_rate": 4.6680341408548746e-06, + "loss": 0.4497, + "step": 8787 + }, + { + "epoch": 3.438453939244258, + "grad_norm": 0.5057955855549762, + "learning_rate": 4.667956123793099e-06, + "loss": 0.4783, + "step": 8788 + }, + { + "epoch": 3.43884909854285, + "grad_norm": 0.4985817701072453, + "learning_rate": 4.667878098216909e-06, + "loss": 0.4517, + "step": 8789 + }, + { + "epoch": 3.4392442578414424, + "grad_norm": 0.5134738759251121, + "learning_rate": 4.66780006412661e-06, + "loss": 0.4479, + "step": 8790 + }, + { + "epoch": 3.4396394171400346, + "grad_norm": 0.5255474891390775, + "learning_rate": 4.667722021522508e-06, + "loss": 0.4562, + "step": 8791 + }, + { + "epoch": 3.440034576438627, + "grad_norm": 0.49695920779912806, + "learning_rate": 4.667643970404911e-06, + "loss": 0.4648, + "step": 8792 + }, + { + "epoch": 3.440429735737219, + "grad_norm": 0.49907335777305206, + "learning_rate": 4.667565910774124e-06, + "loss": 0.4706, + "step": 8793 + }, + { + "epoch": 3.4408248950358113, + "grad_norm": 0.4884559789494324, + "learning_rate": 4.667487842630455e-06, + "loss": 0.4514, + "step": 8794 + }, + { + "epoch": 3.4412200543344036, + "grad_norm": 0.5000777793061358, + "learning_rate": 4.667409765974209e-06, + "loss": 0.4539, + "step": 8795 + }, + { + "epoch": 3.441615213632996, + "grad_norm": 0.5069843087753504, + "learning_rate": 4.667331680805694e-06, + "loss": 0.4659, + "step": 8796 + }, + { + "epoch": 3.442010372931588, + "grad_norm": 0.5197003132246248, + "learning_rate": 4.6672535871252165e-06, + "loss": 0.4652, + "step": 8797 + }, + { + "epoch": 3.4424055322301803, + "grad_norm": 0.4871319544582793, + "learning_rate": 4.6671754849330834e-06, + "loss": 0.4448, + "step": 8798 + }, + { + "epoch": 3.4428006915287725, + "grad_norm": 0.5028109247529198, + "learning_rate": 4.6670973742296e-06, + "loss": 0.4618, + "step": 8799 + }, + { + "epoch": 3.443195850827365, + "grad_norm": 0.49091531311391173, + "learning_rate": 4.667019255015075e-06, + "loss": 0.4409, + "step": 8800 + }, + { + "epoch": 3.443591010125957, + "grad_norm": 0.5051259627632506, + "learning_rate": 4.666941127289813e-06, + "loss": 0.4567, + "step": 8801 + }, + { + "epoch": 3.4439861694245493, + "grad_norm": 0.49791345108367346, + "learning_rate": 4.666862991054123e-06, + "loss": 0.4553, + "step": 8802 + }, + { + "epoch": 3.4443813287231415, + "grad_norm": 0.4943684112194807, + "learning_rate": 4.666784846308311e-06, + "loss": 0.4501, + "step": 8803 + }, + { + "epoch": 3.4447764880217338, + "grad_norm": 0.598269397017305, + "learning_rate": 4.666706693052684e-06, + "loss": 0.4722, + "step": 8804 + }, + { + "epoch": 3.445171647320326, + "grad_norm": 0.5033122711335563, + "learning_rate": 4.666628531287548e-06, + "loss": 0.4559, + "step": 8805 + }, + { + "epoch": 3.4455668066189182, + "grad_norm": 0.47928044079152565, + "learning_rate": 4.666550361013211e-06, + "loss": 0.4556, + "step": 8806 + }, + { + "epoch": 3.4459619659175105, + "grad_norm": 0.49275067830564856, + "learning_rate": 4.66647218222998e-06, + "loss": 0.4271, + "step": 8807 + }, + { + "epoch": 3.4463571252161027, + "grad_norm": 0.49320681766088637, + "learning_rate": 4.666393994938162e-06, + "loss": 0.4565, + "step": 8808 + }, + { + "epoch": 3.446752284514695, + "grad_norm": 0.48982527817865373, + "learning_rate": 4.666315799138064e-06, + "loss": 0.459, + "step": 8809 + }, + { + "epoch": 3.447147443813287, + "grad_norm": 0.48836849709394153, + "learning_rate": 4.6662375948299924e-06, + "loss": 0.4543, + "step": 8810 + }, + { + "epoch": 3.4475426031118794, + "grad_norm": 0.49635265949726676, + "learning_rate": 4.666159382014255e-06, + "loss": 0.4618, + "step": 8811 + }, + { + "epoch": 3.4479377624104717, + "grad_norm": 0.4953454582630569, + "learning_rate": 4.66608116069116e-06, + "loss": 0.4572, + "step": 8812 + }, + { + "epoch": 3.448332921709064, + "grad_norm": 0.5006722224939579, + "learning_rate": 4.6660029308610125e-06, + "loss": 0.4508, + "step": 8813 + }, + { + "epoch": 3.448728081007656, + "grad_norm": 0.5043235685637225, + "learning_rate": 4.665924692524121e-06, + "loss": 0.462, + "step": 8814 + }, + { + "epoch": 3.4491232403062484, + "grad_norm": 0.5053267442560526, + "learning_rate": 4.665846445680792e-06, + "loss": 0.4496, + "step": 8815 + }, + { + "epoch": 3.4495183996048406, + "grad_norm": 0.4917987457397994, + "learning_rate": 4.665768190331334e-06, + "loss": 0.4662, + "step": 8816 + }, + { + "epoch": 3.449913558903433, + "grad_norm": 0.48570702363194657, + "learning_rate": 4.665689926476054e-06, + "loss": 0.4525, + "step": 8817 + }, + { + "epoch": 3.450308718202025, + "grad_norm": 0.49691878545305157, + "learning_rate": 4.665611654115258e-06, + "loss": 0.4573, + "step": 8818 + }, + { + "epoch": 3.4507038775006174, + "grad_norm": 0.4895661716841793, + "learning_rate": 4.665533373249255e-06, + "loss": 0.4411, + "step": 8819 + }, + { + "epoch": 3.4510990367992096, + "grad_norm": 0.49186441261086594, + "learning_rate": 4.665455083878352e-06, + "loss": 0.4692, + "step": 8820 + }, + { + "epoch": 3.451494196097802, + "grad_norm": 0.5099516410323954, + "learning_rate": 4.665376786002856e-06, + "loss": 0.4656, + "step": 8821 + }, + { + "epoch": 3.451889355396394, + "grad_norm": 0.5169744344300137, + "learning_rate": 4.665298479623075e-06, + "loss": 0.4556, + "step": 8822 + }, + { + "epoch": 3.4522845146949863, + "grad_norm": 0.5171843190465332, + "learning_rate": 4.665220164739316e-06, + "loss": 0.4769, + "step": 8823 + }, + { + "epoch": 3.4526796739935786, + "grad_norm": 0.4974279869698287, + "learning_rate": 4.665141841351888e-06, + "loss": 0.439, + "step": 8824 + }, + { + "epoch": 3.453074833292171, + "grad_norm": 0.50434560452558, + "learning_rate": 4.665063509461098e-06, + "loss": 0.4594, + "step": 8825 + }, + { + "epoch": 3.453469992590763, + "grad_norm": 0.4955408379140396, + "learning_rate": 4.664985169067251e-06, + "loss": 0.4758, + "step": 8826 + }, + { + "epoch": 3.4538651518893553, + "grad_norm": 0.48902071003257797, + "learning_rate": 4.664906820170658e-06, + "loss": 0.4516, + "step": 8827 + }, + { + "epoch": 3.4542603111879475, + "grad_norm": 0.4937855562200361, + "learning_rate": 4.664828462771627e-06, + "loss": 0.4646, + "step": 8828 + }, + { + "epoch": 3.4546554704865398, + "grad_norm": 0.5082998875173607, + "learning_rate": 4.664750096870463e-06, + "loss": 0.4603, + "step": 8829 + }, + { + "epoch": 3.455050629785132, + "grad_norm": 0.47237718060013906, + "learning_rate": 4.664671722467475e-06, + "loss": 0.4576, + "step": 8830 + }, + { + "epoch": 3.4554457890837242, + "grad_norm": 0.5038153640837024, + "learning_rate": 4.664593339562971e-06, + "loss": 0.4622, + "step": 8831 + }, + { + "epoch": 3.4558409483823165, + "grad_norm": 0.49668508531522104, + "learning_rate": 4.66451494815726e-06, + "loss": 0.4477, + "step": 8832 + }, + { + "epoch": 3.4562361076809087, + "grad_norm": 0.5133159759954414, + "learning_rate": 4.664436548250646e-06, + "loss": 0.4596, + "step": 8833 + }, + { + "epoch": 3.456631266979501, + "grad_norm": 0.49905932346964094, + "learning_rate": 4.664358139843442e-06, + "loss": 0.4759, + "step": 8834 + }, + { + "epoch": 3.457026426278093, + "grad_norm": 0.4849036414485639, + "learning_rate": 4.664279722935953e-06, + "loss": 0.4609, + "step": 8835 + }, + { + "epoch": 3.4574215855766854, + "grad_norm": 0.5058385856975989, + "learning_rate": 4.664201297528487e-06, + "loss": 0.4706, + "step": 8836 + }, + { + "epoch": 3.4578167448752777, + "grad_norm": 0.4967750894306117, + "learning_rate": 4.6641228636213524e-06, + "loss": 0.4501, + "step": 8837 + }, + { + "epoch": 3.45821190417387, + "grad_norm": 0.5136748603140383, + "learning_rate": 4.664044421214857e-06, + "loss": 0.4634, + "step": 8838 + }, + { + "epoch": 3.458607063472462, + "grad_norm": 0.4914768586557601, + "learning_rate": 4.663965970309311e-06, + "loss": 0.4609, + "step": 8839 + }, + { + "epoch": 3.4590022227710544, + "grad_norm": 0.5032307100674864, + "learning_rate": 4.6638875109050184e-06, + "loss": 0.454, + "step": 8840 + }, + { + "epoch": 3.4593973820696466, + "grad_norm": 0.5373774375004284, + "learning_rate": 4.663809043002291e-06, + "loss": 0.4558, + "step": 8841 + }, + { + "epoch": 3.459792541368239, + "grad_norm": 0.5050016062307993, + "learning_rate": 4.663730566601436e-06, + "loss": 0.4626, + "step": 8842 + }, + { + "epoch": 3.460187700666831, + "grad_norm": 0.4891364435706425, + "learning_rate": 4.6636520817027596e-06, + "loss": 0.4506, + "step": 8843 + }, + { + "epoch": 3.460582859965424, + "grad_norm": 0.48998437910287085, + "learning_rate": 4.6635735883065734e-06, + "loss": 0.4529, + "step": 8844 + }, + { + "epoch": 3.460978019264016, + "grad_norm": 0.5234452315475704, + "learning_rate": 4.663495086413183e-06, + "loss": 0.4699, + "step": 8845 + }, + { + "epoch": 3.4613731785626083, + "grad_norm": 0.49371396515639815, + "learning_rate": 4.663416576022898e-06, + "loss": 0.4706, + "step": 8846 + }, + { + "epoch": 3.4617683378612005, + "grad_norm": 0.49709209591769404, + "learning_rate": 4.663338057136025e-06, + "loss": 0.476, + "step": 8847 + }, + { + "epoch": 3.4621634971597928, + "grad_norm": 0.49578446171291213, + "learning_rate": 4.663259529752876e-06, + "loss": 0.4444, + "step": 8848 + }, + { + "epoch": 3.462558656458385, + "grad_norm": 0.49126172062385615, + "learning_rate": 4.663180993873756e-06, + "loss": 0.4489, + "step": 8849 + }, + { + "epoch": 3.4629538157569772, + "grad_norm": 0.49180903216506305, + "learning_rate": 4.663102449498974e-06, + "loss": 0.4546, + "step": 8850 + }, + { + "epoch": 3.4633489750555695, + "grad_norm": 0.5005694477594985, + "learning_rate": 4.66302389662884e-06, + "loss": 0.4512, + "step": 8851 + }, + { + "epoch": 3.4637441343541617, + "grad_norm": 0.49774390912434563, + "learning_rate": 4.6629453352636615e-06, + "loss": 0.4651, + "step": 8852 + }, + { + "epoch": 3.464139293652754, + "grad_norm": 0.5149141552536891, + "learning_rate": 4.662866765403747e-06, + "loss": 0.4473, + "step": 8853 + }, + { + "epoch": 3.464534452951346, + "grad_norm": 0.5072099362598289, + "learning_rate": 4.662788187049405e-06, + "loss": 0.4772, + "step": 8854 + }, + { + "epoch": 3.4649296122499385, + "grad_norm": 0.49560828556589115, + "learning_rate": 4.6627096002009445e-06, + "loss": 0.4537, + "step": 8855 + }, + { + "epoch": 3.4653247715485307, + "grad_norm": 0.4920647341464689, + "learning_rate": 4.662631004858674e-06, + "loss": 0.4422, + "step": 8856 + }, + { + "epoch": 3.465719930847123, + "grad_norm": 0.4992878751198371, + "learning_rate": 4.6625524010229026e-06, + "loss": 0.4608, + "step": 8857 + }, + { + "epoch": 3.466115090145715, + "grad_norm": 0.49054926255487874, + "learning_rate": 4.662473788693938e-06, + "loss": 0.4539, + "step": 8858 + }, + { + "epoch": 3.4665102494443074, + "grad_norm": 0.511990187730294, + "learning_rate": 4.6623951678720894e-06, + "loss": 0.4829, + "step": 8859 + }, + { + "epoch": 3.4669054087428997, + "grad_norm": 0.5042603888878492, + "learning_rate": 4.662316538557666e-06, + "loss": 0.4613, + "step": 8860 + }, + { + "epoch": 3.467300568041492, + "grad_norm": 0.4969317296679051, + "learning_rate": 4.662237900750976e-06, + "loss": 0.445, + "step": 8861 + }, + { + "epoch": 3.467695727340084, + "grad_norm": 0.503339914538068, + "learning_rate": 4.662159254452329e-06, + "loss": 0.4607, + "step": 8862 + }, + { + "epoch": 3.4680908866386764, + "grad_norm": 0.47526527789470935, + "learning_rate": 4.662080599662032e-06, + "loss": 0.4332, + "step": 8863 + }, + { + "epoch": 3.4684860459372686, + "grad_norm": 0.48910025218721065, + "learning_rate": 4.662001936380397e-06, + "loss": 0.4532, + "step": 8864 + }, + { + "epoch": 3.468881205235861, + "grad_norm": 0.5023800565124126, + "learning_rate": 4.661923264607731e-06, + "loss": 0.4635, + "step": 8865 + }, + { + "epoch": 3.469276364534453, + "grad_norm": 0.49650951915124897, + "learning_rate": 4.661844584344342e-06, + "loss": 0.4726, + "step": 8866 + }, + { + "epoch": 3.4696715238330453, + "grad_norm": 0.4968947894789248, + "learning_rate": 4.661765895590541e-06, + "loss": 0.4736, + "step": 8867 + }, + { + "epoch": 3.4700666831316376, + "grad_norm": 0.5009727916810776, + "learning_rate": 4.661687198346636e-06, + "loss": 0.459, + "step": 8868 + }, + { + "epoch": 3.47046184243023, + "grad_norm": 0.48976101872113026, + "learning_rate": 4.661608492612937e-06, + "loss": 0.4467, + "step": 8869 + }, + { + "epoch": 3.470857001728822, + "grad_norm": 0.5016589813587168, + "learning_rate": 4.661529778389752e-06, + "loss": 0.4542, + "step": 8870 + }, + { + "epoch": 3.4712521610274143, + "grad_norm": 0.4861886122067568, + "learning_rate": 4.66145105567739e-06, + "loss": 0.461, + "step": 8871 + }, + { + "epoch": 3.4716473203260065, + "grad_norm": 0.641821102486397, + "learning_rate": 4.661372324476161e-06, + "loss": 0.4541, + "step": 8872 + }, + { + "epoch": 3.4720424796245988, + "grad_norm": 0.4940555527439461, + "learning_rate": 4.6612935847863746e-06, + "loss": 0.4645, + "step": 8873 + }, + { + "epoch": 3.472437638923191, + "grad_norm": 0.5017974488912654, + "learning_rate": 4.661214836608339e-06, + "loss": 0.4536, + "step": 8874 + }, + { + "epoch": 3.4728327982217833, + "grad_norm": 0.4969241194719381, + "learning_rate": 4.661136079942364e-06, + "loss": 0.4661, + "step": 8875 + }, + { + "epoch": 3.4732279575203755, + "grad_norm": 0.497649831354656, + "learning_rate": 4.661057314788758e-06, + "loss": 0.4413, + "step": 8876 + }, + { + "epoch": 3.4736231168189677, + "grad_norm": 0.485793011113323, + "learning_rate": 4.6609785411478326e-06, + "loss": 0.4653, + "step": 8877 + }, + { + "epoch": 3.47401827611756, + "grad_norm": 0.49424185636318685, + "learning_rate": 4.6608997590198945e-06, + "loss": 0.4475, + "step": 8878 + }, + { + "epoch": 3.474413435416152, + "grad_norm": 0.48959550866721235, + "learning_rate": 4.6608209684052555e-06, + "loss": 0.455, + "step": 8879 + }, + { + "epoch": 3.4748085947147445, + "grad_norm": 0.4878411516995577, + "learning_rate": 4.660742169304223e-06, + "loss": 0.4612, + "step": 8880 + }, + { + "epoch": 3.4752037540133367, + "grad_norm": 0.5094157531998392, + "learning_rate": 4.660663361717107e-06, + "loss": 0.4718, + "step": 8881 + }, + { + "epoch": 3.475598913311929, + "grad_norm": 0.4989284958044305, + "learning_rate": 4.660584545644218e-06, + "loss": 0.4602, + "step": 8882 + }, + { + "epoch": 3.475994072610521, + "grad_norm": 0.48462148426022633, + "learning_rate": 4.660505721085865e-06, + "loss": 0.4506, + "step": 8883 + }, + { + "epoch": 3.4763892319091134, + "grad_norm": 0.4881796143310651, + "learning_rate": 4.660426888042356e-06, + "loss": 0.4601, + "step": 8884 + }, + { + "epoch": 3.4767843912077057, + "grad_norm": 0.49485495431393517, + "learning_rate": 4.660348046514004e-06, + "loss": 0.4462, + "step": 8885 + }, + { + "epoch": 3.477179550506298, + "grad_norm": 0.48394677880720927, + "learning_rate": 4.660269196501116e-06, + "loss": 0.4463, + "step": 8886 + }, + { + "epoch": 3.47757470980489, + "grad_norm": 0.4855215947244085, + "learning_rate": 4.660190338004002e-06, + "loss": 0.4465, + "step": 8887 + }, + { + "epoch": 3.4779698691034824, + "grad_norm": 0.5086993405331509, + "learning_rate": 4.660111471022973e-06, + "loss": 0.456, + "step": 8888 + }, + { + "epoch": 3.4783650284020746, + "grad_norm": 0.49724371282452545, + "learning_rate": 4.660032595558337e-06, + "loss": 0.4611, + "step": 8889 + }, + { + "epoch": 3.478760187700667, + "grad_norm": 0.5029381576180928, + "learning_rate": 4.659953711610405e-06, + "loss": 0.4807, + "step": 8890 + }, + { + "epoch": 3.479155346999259, + "grad_norm": 0.48536678216800494, + "learning_rate": 4.659874819179486e-06, + "loss": 0.4673, + "step": 8891 + }, + { + "epoch": 3.4795505062978513, + "grad_norm": 0.48336927893573745, + "learning_rate": 4.65979591826589e-06, + "loss": 0.4539, + "step": 8892 + }, + { + "epoch": 3.4799456655964436, + "grad_norm": 0.4937274142048356, + "learning_rate": 4.659717008869928e-06, + "loss": 0.4763, + "step": 8893 + }, + { + "epoch": 3.480340824895036, + "grad_norm": 0.4928360042158491, + "learning_rate": 4.659638090991909e-06, + "loss": 0.4613, + "step": 8894 + }, + { + "epoch": 3.480735984193628, + "grad_norm": 0.530269437282756, + "learning_rate": 4.659559164632142e-06, + "loss": 0.4487, + "step": 8895 + }, + { + "epoch": 3.4811311434922203, + "grad_norm": 0.4910271446685397, + "learning_rate": 4.659480229790938e-06, + "loss": 0.4719, + "step": 8896 + }, + { + "epoch": 3.4815263027908125, + "grad_norm": 0.4777584233118303, + "learning_rate": 4.659401286468609e-06, + "loss": 0.4439, + "step": 8897 + }, + { + "epoch": 3.481921462089405, + "grad_norm": 0.48331532023965773, + "learning_rate": 4.659322334665461e-06, + "loss": 0.454, + "step": 8898 + }, + { + "epoch": 3.482316621387997, + "grad_norm": 0.49680547638456035, + "learning_rate": 4.659243374381806e-06, + "loss": 0.4589, + "step": 8899 + }, + { + "epoch": 3.4827117806865893, + "grad_norm": 0.4759886694048339, + "learning_rate": 4.659164405617955e-06, + "loss": 0.453, + "step": 8900 + }, + { + "epoch": 3.4831069399851815, + "grad_norm": 0.48347058134167675, + "learning_rate": 4.6590854283742175e-06, + "loss": 0.4649, + "step": 8901 + }, + { + "epoch": 3.4835020992837737, + "grad_norm": 0.5021839134977939, + "learning_rate": 4.659006442650903e-06, + "loss": 0.4534, + "step": 8902 + }, + { + "epoch": 3.483897258582366, + "grad_norm": 0.48132025196008, + "learning_rate": 4.658927448448323e-06, + "loss": 0.4478, + "step": 8903 + }, + { + "epoch": 3.4842924178809582, + "grad_norm": 0.49135070095173283, + "learning_rate": 4.658848445766786e-06, + "loss": 0.4506, + "step": 8904 + }, + { + "epoch": 3.4846875771795505, + "grad_norm": 0.48125094395751417, + "learning_rate": 4.6587694346066035e-06, + "loss": 0.4377, + "step": 8905 + }, + { + "epoch": 3.4850827364781427, + "grad_norm": 0.49750742233825657, + "learning_rate": 4.658690414968086e-06, + "loss": 0.4655, + "step": 8906 + }, + { + "epoch": 3.485477895776735, + "grad_norm": 0.48458376051136504, + "learning_rate": 4.658611386851543e-06, + "loss": 0.4559, + "step": 8907 + }, + { + "epoch": 3.485873055075327, + "grad_norm": 0.5115369273188421, + "learning_rate": 4.658532350257285e-06, + "loss": 0.4611, + "step": 8908 + }, + { + "epoch": 3.4862682143739194, + "grad_norm": 0.48861679265318614, + "learning_rate": 4.658453305185623e-06, + "loss": 0.4476, + "step": 8909 + }, + { + "epoch": 3.4866633736725117, + "grad_norm": 0.4877426200944746, + "learning_rate": 4.658374251636867e-06, + "loss": 0.4544, + "step": 8910 + }, + { + "epoch": 3.487058532971104, + "grad_norm": 0.4828165510082438, + "learning_rate": 4.658295189611327e-06, + "loss": 0.4535, + "step": 8911 + }, + { + "epoch": 3.487453692269696, + "grad_norm": 0.6245926944397081, + "learning_rate": 4.658216119109314e-06, + "loss": 0.4151, + "step": 8912 + }, + { + "epoch": 3.4878488515682884, + "grad_norm": 0.4874104433869976, + "learning_rate": 4.65813704013114e-06, + "loss": 0.4456, + "step": 8913 + }, + { + "epoch": 3.4882440108668806, + "grad_norm": 0.5145352743350671, + "learning_rate": 4.658057952677113e-06, + "loss": 0.4768, + "step": 8914 + }, + { + "epoch": 3.488639170165473, + "grad_norm": 0.49016021595770465, + "learning_rate": 4.657978856747546e-06, + "loss": 0.4567, + "step": 8915 + }, + { + "epoch": 3.489034329464065, + "grad_norm": 0.48831218665648374, + "learning_rate": 4.6578997523427475e-06, + "loss": 0.4356, + "step": 8916 + }, + { + "epoch": 3.4894294887626573, + "grad_norm": 0.5010113609737595, + "learning_rate": 4.657820639463029e-06, + "loss": 0.4799, + "step": 8917 + }, + { + "epoch": 3.4898246480612496, + "grad_norm": 0.4907345989010224, + "learning_rate": 4.657741518108702e-06, + "loss": 0.4628, + "step": 8918 + }, + { + "epoch": 3.490219807359842, + "grad_norm": 0.49499089035592075, + "learning_rate": 4.657662388280076e-06, + "loss": 0.4615, + "step": 8919 + }, + { + "epoch": 3.490614966658434, + "grad_norm": 0.5066619543747422, + "learning_rate": 4.657583249977463e-06, + "loss": 0.4592, + "step": 8920 + }, + { + "epoch": 3.4910101259570263, + "grad_norm": 0.4958302058204061, + "learning_rate": 4.657504103201173e-06, + "loss": 0.4485, + "step": 8921 + }, + { + "epoch": 3.4914052852556186, + "grad_norm": 0.4944424623211386, + "learning_rate": 4.657424947951517e-06, + "loss": 0.4522, + "step": 8922 + }, + { + "epoch": 3.491800444554211, + "grad_norm": 0.6267993198451537, + "learning_rate": 4.657345784228805e-06, + "loss": 0.462, + "step": 8923 + }, + { + "epoch": 3.492195603852803, + "grad_norm": 0.548874751841137, + "learning_rate": 4.657266612033348e-06, + "loss": 0.4649, + "step": 8924 + }, + { + "epoch": 3.4925907631513953, + "grad_norm": 0.5118844974346741, + "learning_rate": 4.65718743136546e-06, + "loss": 0.4482, + "step": 8925 + }, + { + "epoch": 3.4929859224499875, + "grad_norm": 0.5033343474006436, + "learning_rate": 4.657108242225449e-06, + "loss": 0.4731, + "step": 8926 + }, + { + "epoch": 3.4933810817485798, + "grad_norm": 0.4861803347304094, + "learning_rate": 4.657029044613627e-06, + "loss": 0.4503, + "step": 8927 + }, + { + "epoch": 3.493776241047172, + "grad_norm": 0.4791250625600111, + "learning_rate": 4.656949838530304e-06, + "loss": 0.4428, + "step": 8928 + }, + { + "epoch": 3.4941714003457642, + "grad_norm": 0.4863617818870344, + "learning_rate": 4.656870623975791e-06, + "loss": 0.4659, + "step": 8929 + }, + { + "epoch": 3.4945665596443565, + "grad_norm": 0.5127343716951415, + "learning_rate": 4.656791400950401e-06, + "loss": 0.461, + "step": 8930 + }, + { + "epoch": 3.4949617189429487, + "grad_norm": 0.49777034576549223, + "learning_rate": 4.656712169454444e-06, + "loss": 0.4585, + "step": 8931 + }, + { + "epoch": 3.495356878241541, + "grad_norm": 0.48877150097142436, + "learning_rate": 4.656632929488231e-06, + "loss": 0.4547, + "step": 8932 + }, + { + "epoch": 3.495752037540133, + "grad_norm": 0.4927474974697246, + "learning_rate": 4.656553681052074e-06, + "loss": 0.4581, + "step": 8933 + }, + { + "epoch": 3.4961471968387254, + "grad_norm": 0.4849494940227447, + "learning_rate": 4.656474424146283e-06, + "loss": 0.4459, + "step": 8934 + }, + { + "epoch": 3.4965423561373177, + "grad_norm": 0.48094079718336596, + "learning_rate": 4.6563951587711706e-06, + "loss": 0.4617, + "step": 8935 + }, + { + "epoch": 3.49693751543591, + "grad_norm": 0.5234897546830422, + "learning_rate": 4.6563158849270465e-06, + "loss": 0.4462, + "step": 8936 + }, + { + "epoch": 3.497332674734502, + "grad_norm": 0.49375252338582026, + "learning_rate": 4.656236602614223e-06, + "loss": 0.4497, + "step": 8937 + }, + { + "epoch": 3.4977278340330944, + "grad_norm": 0.5267848201795143, + "learning_rate": 4.656157311833013e-06, + "loss": 0.442, + "step": 8938 + }, + { + "epoch": 3.4981229933316866, + "grad_norm": 0.5060587932376777, + "learning_rate": 4.656078012583725e-06, + "loss": 0.4711, + "step": 8939 + }, + { + "epoch": 3.498518152630279, + "grad_norm": 0.5025260956706785, + "learning_rate": 4.655998704866672e-06, + "loss": 0.4468, + "step": 8940 + }, + { + "epoch": 3.498913311928871, + "grad_norm": 0.687091654136015, + "learning_rate": 4.655919388682166e-06, + "loss": 0.4844, + "step": 8941 + }, + { + "epoch": 3.4993084712274634, + "grad_norm": 0.48026680469874616, + "learning_rate": 4.655840064030517e-06, + "loss": 0.4412, + "step": 8942 + }, + { + "epoch": 3.4997036305260556, + "grad_norm": 0.4942720809704896, + "learning_rate": 4.655760730912038e-06, + "loss": 0.4645, + "step": 8943 + }, + { + "epoch": 3.500098789824648, + "grad_norm": 0.509126848920385, + "learning_rate": 4.655681389327039e-06, + "loss": 0.4497, + "step": 8944 + }, + { + "epoch": 3.50049394912324, + "grad_norm": 0.7110801116137855, + "learning_rate": 4.655602039275833e-06, + "loss": 0.4598, + "step": 8945 + }, + { + "epoch": 3.5008891084218323, + "grad_norm": 0.5019582976854259, + "learning_rate": 4.6555226807587304e-06, + "loss": 0.4657, + "step": 8946 + }, + { + "epoch": 3.5012842677204246, + "grad_norm": 0.47619258987485347, + "learning_rate": 4.655443313776045e-06, + "loss": 0.4483, + "step": 8947 + }, + { + "epoch": 3.501679427019017, + "grad_norm": 0.5029273839023065, + "learning_rate": 4.655363938328086e-06, + "loss": 0.4669, + "step": 8948 + }, + { + "epoch": 3.502074586317609, + "grad_norm": 0.5028619187317396, + "learning_rate": 4.655284554415167e-06, + "loss": 0.4582, + "step": 8949 + }, + { + "epoch": 3.5024697456162013, + "grad_norm": 0.49264262307149015, + "learning_rate": 4.655205162037598e-06, + "loss": 0.4606, + "step": 8950 + }, + { + "epoch": 3.5028649049147935, + "grad_norm": 0.4708730042896239, + "learning_rate": 4.655125761195694e-06, + "loss": 0.4526, + "step": 8951 + }, + { + "epoch": 3.5032600642133858, + "grad_norm": 0.4861263707563721, + "learning_rate": 4.655046351889763e-06, + "loss": 0.4414, + "step": 8952 + }, + { + "epoch": 3.503655223511978, + "grad_norm": 0.4893316048388238, + "learning_rate": 4.654966934120118e-06, + "loss": 0.453, + "step": 8953 + }, + { + "epoch": 3.5040503828105702, + "grad_norm": 0.5139962221803164, + "learning_rate": 4.654887507887073e-06, + "loss": 0.465, + "step": 8954 + }, + { + "epoch": 3.5044455421091625, + "grad_norm": 0.47724658926285185, + "learning_rate": 4.654808073190937e-06, + "loss": 0.4418, + "step": 8955 + }, + { + "epoch": 3.504840701407755, + "grad_norm": 0.48987603430328075, + "learning_rate": 4.654728630032024e-06, + "loss": 0.4561, + "step": 8956 + }, + { + "epoch": 3.5052358607063474, + "grad_norm": 0.49005477886288673, + "learning_rate": 4.654649178410645e-06, + "loss": 0.4554, + "step": 8957 + }, + { + "epoch": 3.5056310200049396, + "grad_norm": 0.49167524318701145, + "learning_rate": 4.654569718327113e-06, + "loss": 0.4432, + "step": 8958 + }, + { + "epoch": 3.506026179303532, + "grad_norm": 0.48832196263631317, + "learning_rate": 4.654490249781739e-06, + "loss": 0.4567, + "step": 8959 + }, + { + "epoch": 3.506421338602124, + "grad_norm": 0.5195513981086755, + "learning_rate": 4.654410772774836e-06, + "loss": 0.4682, + "step": 8960 + }, + { + "epoch": 3.5068164979007164, + "grad_norm": 0.4883940235257672, + "learning_rate": 4.654331287306715e-06, + "loss": 0.4648, + "step": 8961 + }, + { + "epoch": 3.5072116571993086, + "grad_norm": 0.4939222640936546, + "learning_rate": 4.65425179337769e-06, + "loss": 0.4555, + "step": 8962 + }, + { + "epoch": 3.507606816497901, + "grad_norm": 0.507042086996444, + "learning_rate": 4.654172290988071e-06, + "loss": 0.4688, + "step": 8963 + }, + { + "epoch": 3.508001975796493, + "grad_norm": 0.4851747453407202, + "learning_rate": 4.654092780138172e-06, + "loss": 0.4667, + "step": 8964 + }, + { + "epoch": 3.5083971350950853, + "grad_norm": 0.49132032301674466, + "learning_rate": 4.654013260828304e-06, + "loss": 0.4558, + "step": 8965 + }, + { + "epoch": 3.5087922943936776, + "grad_norm": 0.5020222923780524, + "learning_rate": 4.653933733058781e-06, + "loss": 0.4471, + "step": 8966 + }, + { + "epoch": 3.50918745369227, + "grad_norm": 0.4894974516850361, + "learning_rate": 4.653854196829913e-06, + "loss": 0.442, + "step": 8967 + }, + { + "epoch": 3.509582612990862, + "grad_norm": 0.4868608182493534, + "learning_rate": 4.653774652142014e-06, + "loss": 0.4481, + "step": 8968 + }, + { + "epoch": 3.5099777722894543, + "grad_norm": 0.7314629885353082, + "learning_rate": 4.6536950989953965e-06, + "loss": 0.4573, + "step": 8969 + }, + { + "epoch": 3.5103729315880465, + "grad_norm": 0.48728983834155626, + "learning_rate": 4.653615537390371e-06, + "loss": 0.4548, + "step": 8970 + }, + { + "epoch": 3.5107680908866388, + "grad_norm": 0.49949151444479584, + "learning_rate": 4.653535967327253e-06, + "loss": 0.463, + "step": 8971 + }, + { + "epoch": 3.511163250185231, + "grad_norm": 0.4859042126598325, + "learning_rate": 4.6534563888063535e-06, + "loss": 0.4636, + "step": 8972 + }, + { + "epoch": 3.5115584094838233, + "grad_norm": 0.49909226950739194, + "learning_rate": 4.653376801827983e-06, + "loss": 0.4647, + "step": 8973 + }, + { + "epoch": 3.5119535687824155, + "grad_norm": 0.4841418346190016, + "learning_rate": 4.653297206392458e-06, + "loss": 0.4487, + "step": 8974 + }, + { + "epoch": 3.5123487280810077, + "grad_norm": 0.5017750095021966, + "learning_rate": 4.653217602500089e-06, + "loss": 0.4772, + "step": 8975 + }, + { + "epoch": 3.5127438873796, + "grad_norm": 0.5054355471499746, + "learning_rate": 4.653137990151188e-06, + "loss": 0.442, + "step": 8976 + }, + { + "epoch": 3.513139046678192, + "grad_norm": 0.5124016068780876, + "learning_rate": 4.6530583693460685e-06, + "loss": 0.4693, + "step": 8977 + }, + { + "epoch": 3.5135342059767845, + "grad_norm": 0.4884638972810405, + "learning_rate": 4.6529787400850435e-06, + "loss": 0.4549, + "step": 8978 + }, + { + "epoch": 3.5139293652753767, + "grad_norm": 0.484980903508112, + "learning_rate": 4.652899102368425e-06, + "loss": 0.4508, + "step": 8979 + }, + { + "epoch": 3.514324524573969, + "grad_norm": 0.4750814543537716, + "learning_rate": 4.652819456196527e-06, + "loss": 0.4455, + "step": 8980 + }, + { + "epoch": 3.514719683872561, + "grad_norm": 0.4787417445872702, + "learning_rate": 4.65273980156966e-06, + "loss": 0.4497, + "step": 8981 + }, + { + "epoch": 3.5151148431711534, + "grad_norm": 0.49080099109165837, + "learning_rate": 4.6526601384881396e-06, + "loss": 0.4487, + "step": 8982 + }, + { + "epoch": 3.5155100024697457, + "grad_norm": 0.47651127208635596, + "learning_rate": 4.652580466952277e-06, + "loss": 0.4488, + "step": 8983 + }, + { + "epoch": 3.515905161768338, + "grad_norm": 0.47966488268224355, + "learning_rate": 4.6525007869623845e-06, + "loss": 0.4513, + "step": 8984 + }, + { + "epoch": 3.51630032106693, + "grad_norm": 0.4878014502335229, + "learning_rate": 4.652421098518777e-06, + "loss": 0.4429, + "step": 8985 + }, + { + "epoch": 3.5166954803655224, + "grad_norm": 0.48298319426845326, + "learning_rate": 4.652341401621766e-06, + "loss": 0.448, + "step": 8986 + }, + { + "epoch": 3.5170906396641146, + "grad_norm": 0.47671515243568685, + "learning_rate": 4.652261696271665e-06, + "loss": 0.4482, + "step": 8987 + }, + { + "epoch": 3.517485798962707, + "grad_norm": 0.5012073706299464, + "learning_rate": 4.652181982468787e-06, + "loss": 0.4517, + "step": 8988 + }, + { + "epoch": 3.517880958261299, + "grad_norm": 0.48952067375088115, + "learning_rate": 4.652102260213445e-06, + "loss": 0.4697, + "step": 8989 + }, + { + "epoch": 3.5182761175598913, + "grad_norm": 0.5014312771832417, + "learning_rate": 4.652022529505953e-06, + "loss": 0.4617, + "step": 8990 + }, + { + "epoch": 3.5186712768584836, + "grad_norm": 0.48760811032686774, + "learning_rate": 4.651942790346622e-06, + "loss": 0.4775, + "step": 8991 + }, + { + "epoch": 3.519066436157076, + "grad_norm": 0.4804722298357196, + "learning_rate": 4.6518630427357666e-06, + "loss": 0.4565, + "step": 8992 + }, + { + "epoch": 3.519461595455668, + "grad_norm": 0.49171418019719676, + "learning_rate": 4.6517832866737e-06, + "loss": 0.4605, + "step": 8993 + }, + { + "epoch": 3.5198567547542603, + "grad_norm": 0.49918333071216625, + "learning_rate": 4.651703522160736e-06, + "loss": 0.463, + "step": 8994 + }, + { + "epoch": 3.5202519140528525, + "grad_norm": 0.49170780602129877, + "learning_rate": 4.651623749197187e-06, + "loss": 0.4683, + "step": 8995 + }, + { + "epoch": 3.520647073351445, + "grad_norm": 0.5033774543953301, + "learning_rate": 4.651543967783366e-06, + "loss": 0.4509, + "step": 8996 + }, + { + "epoch": 3.521042232650037, + "grad_norm": 0.6184118730606496, + "learning_rate": 4.651464177919587e-06, + "loss": 0.4398, + "step": 8997 + }, + { + "epoch": 3.5214373919486293, + "grad_norm": 0.5583292211707592, + "learning_rate": 4.6513843796061624e-06, + "loss": 0.441, + "step": 8998 + }, + { + "epoch": 3.5218325512472215, + "grad_norm": 0.5111210467079835, + "learning_rate": 4.651304572843407e-06, + "loss": 0.4463, + "step": 8999 + }, + { + "epoch": 3.5222277105458137, + "grad_norm": 0.4925902941362768, + "learning_rate": 4.651224757631634e-06, + "loss": 0.4565, + "step": 9000 + }, + { + "epoch": 3.522622869844406, + "grad_norm": 0.5053536793843054, + "learning_rate": 4.651144933971156e-06, + "loss": 0.454, + "step": 9001 + }, + { + "epoch": 3.523018029142998, + "grad_norm": 0.4939136118476549, + "learning_rate": 4.651065101862287e-06, + "loss": 0.4509, + "step": 9002 + }, + { + "epoch": 3.5234131884415905, + "grad_norm": 0.5038896891132291, + "learning_rate": 4.65098526130534e-06, + "loss": 0.438, + "step": 9003 + }, + { + "epoch": 3.5238083477401827, + "grad_norm": 0.5009202891855576, + "learning_rate": 4.65090541230063e-06, + "loss": 0.4411, + "step": 9004 + }, + { + "epoch": 3.524203507038775, + "grad_norm": 0.498130501386656, + "learning_rate": 4.650825554848468e-06, + "loss": 0.4614, + "step": 9005 + }, + { + "epoch": 3.524598666337367, + "grad_norm": 0.5107785107225102, + "learning_rate": 4.650745688949171e-06, + "loss": 0.4641, + "step": 9006 + }, + { + "epoch": 3.5249938256359594, + "grad_norm": 0.4973684335934365, + "learning_rate": 4.65066581460305e-06, + "loss": 0.4511, + "step": 9007 + }, + { + "epoch": 3.5253889849345517, + "grad_norm": 0.5120248873540496, + "learning_rate": 4.650585931810421e-06, + "loss": 0.4767, + "step": 9008 + }, + { + "epoch": 3.525784144233144, + "grad_norm": 0.5032821292378913, + "learning_rate": 4.6505060405715944e-06, + "loss": 0.4502, + "step": 9009 + }, + { + "epoch": 3.526179303531736, + "grad_norm": 0.4956958147919807, + "learning_rate": 4.6504261408868875e-06, + "loss": 0.4561, + "step": 9010 + }, + { + "epoch": 3.5265744628303284, + "grad_norm": 0.4957187392523741, + "learning_rate": 4.6503462327566115e-06, + "loss": 0.4681, + "step": 9011 + }, + { + "epoch": 3.5269696221289206, + "grad_norm": 0.4952159771925259, + "learning_rate": 4.650266316181082e-06, + "loss": 0.4461, + "step": 9012 + }, + { + "epoch": 3.527364781427513, + "grad_norm": 0.5672869678695379, + "learning_rate": 4.650186391160611e-06, + "loss": 0.4461, + "step": 9013 + }, + { + "epoch": 3.527759940726105, + "grad_norm": 0.5080704469053954, + "learning_rate": 4.650106457695515e-06, + "loss": 0.4708, + "step": 9014 + }, + { + "epoch": 3.5281551000246973, + "grad_norm": 0.4936767560240317, + "learning_rate": 4.650026515786106e-06, + "loss": 0.4585, + "step": 9015 + }, + { + "epoch": 3.5285502593232896, + "grad_norm": 0.5008042637689482, + "learning_rate": 4.649946565432698e-06, + "loss": 0.4692, + "step": 9016 + }, + { + "epoch": 3.528945418621882, + "grad_norm": 0.4962751637351177, + "learning_rate": 4.649866606635605e-06, + "loss": 0.4354, + "step": 9017 + }, + { + "epoch": 3.529340577920474, + "grad_norm": 0.5008934519010351, + "learning_rate": 4.649786639395142e-06, + "loss": 0.4699, + "step": 9018 + }, + { + "epoch": 3.5297357372190663, + "grad_norm": 0.48596615093437223, + "learning_rate": 4.649706663711623e-06, + "loss": 0.461, + "step": 9019 + }, + { + "epoch": 3.5301308965176585, + "grad_norm": 0.49089635795809994, + "learning_rate": 4.649626679585361e-06, + "loss": 0.4614, + "step": 9020 + }, + { + "epoch": 3.530526055816251, + "grad_norm": 0.4946185325035558, + "learning_rate": 4.64954668701667e-06, + "loss": 0.4487, + "step": 9021 + }, + { + "epoch": 3.530921215114843, + "grad_norm": 0.5126278761898772, + "learning_rate": 4.6494666860058655e-06, + "loss": 0.4568, + "step": 9022 + }, + { + "epoch": 3.5313163744134353, + "grad_norm": 0.5285391391199131, + "learning_rate": 4.649386676553262e-06, + "loss": 0.4557, + "step": 9023 + }, + { + "epoch": 3.5317115337120275, + "grad_norm": 0.5038626087100095, + "learning_rate": 4.649306658659172e-06, + "loss": 0.4707, + "step": 9024 + }, + { + "epoch": 3.5321066930106197, + "grad_norm": 0.4859360226761564, + "learning_rate": 4.649226632323911e-06, + "loss": 0.4837, + "step": 9025 + }, + { + "epoch": 3.532501852309212, + "grad_norm": 0.4966668712438498, + "learning_rate": 4.649146597547792e-06, + "loss": 0.4648, + "step": 9026 + }, + { + "epoch": 3.5328970116078047, + "grad_norm": 0.48166044464094737, + "learning_rate": 4.649066554331131e-06, + "loss": 0.4455, + "step": 9027 + }, + { + "epoch": 3.533292170906397, + "grad_norm": 0.49847140402641615, + "learning_rate": 4.648986502674241e-06, + "loss": 0.4745, + "step": 9028 + }, + { + "epoch": 3.533687330204989, + "grad_norm": 0.48375147299765764, + "learning_rate": 4.6489064425774375e-06, + "loss": 0.448, + "step": 9029 + }, + { + "epoch": 3.5340824895035814, + "grad_norm": 0.501937133054762, + "learning_rate": 4.648826374041034e-06, + "loss": 0.4857, + "step": 9030 + }, + { + "epoch": 3.5344776488021736, + "grad_norm": 0.5027543628434751, + "learning_rate": 4.648746297065345e-06, + "loss": 0.4561, + "step": 9031 + }, + { + "epoch": 3.534872808100766, + "grad_norm": 0.48644400230014095, + "learning_rate": 4.648666211650686e-06, + "loss": 0.4511, + "step": 9032 + }, + { + "epoch": 3.535267967399358, + "grad_norm": 0.492175793746889, + "learning_rate": 4.648586117797371e-06, + "loss": 0.4451, + "step": 9033 + }, + { + "epoch": 3.5356631266979504, + "grad_norm": 0.4854671322850227, + "learning_rate": 4.648506015505714e-06, + "loss": 0.4627, + "step": 9034 + }, + { + "epoch": 3.5360582859965426, + "grad_norm": 0.4972183962198117, + "learning_rate": 4.64842590477603e-06, + "loss": 0.4699, + "step": 9035 + }, + { + "epoch": 3.536453445295135, + "grad_norm": 0.5200452269905167, + "learning_rate": 4.648345785608633e-06, + "loss": 0.4788, + "step": 9036 + }, + { + "epoch": 3.536848604593727, + "grad_norm": 0.530977112773409, + "learning_rate": 4.64826565800384e-06, + "loss": 0.4598, + "step": 9037 + }, + { + "epoch": 3.5372437638923193, + "grad_norm": 0.48919921086325696, + "learning_rate": 4.648185521961963e-06, + "loss": 0.4605, + "step": 9038 + }, + { + "epoch": 3.5376389231909116, + "grad_norm": 0.48321014692774794, + "learning_rate": 4.648105377483318e-06, + "loss": 0.4562, + "step": 9039 + }, + { + "epoch": 3.538034082489504, + "grad_norm": 0.5096165386730921, + "learning_rate": 4.648025224568219e-06, + "loss": 0.4709, + "step": 9040 + }, + { + "epoch": 3.538429241788096, + "grad_norm": 0.4992741223806897, + "learning_rate": 4.647945063216981e-06, + "loss": 0.4531, + "step": 9041 + }, + { + "epoch": 3.5388244010866883, + "grad_norm": 0.5075195911086815, + "learning_rate": 4.64786489342992e-06, + "loss": 0.4482, + "step": 9042 + }, + { + "epoch": 3.5392195603852805, + "grad_norm": 0.4817099394481762, + "learning_rate": 4.64778471520735e-06, + "loss": 0.4566, + "step": 9043 + }, + { + "epoch": 3.5396147196838728, + "grad_norm": 0.4936601585082596, + "learning_rate": 4.6477045285495845e-06, + "loss": 0.4618, + "step": 9044 + }, + { + "epoch": 3.540009878982465, + "grad_norm": 0.4892077110552015, + "learning_rate": 4.647624333456941e-06, + "loss": 0.4588, + "step": 9045 + }, + { + "epoch": 3.5404050382810572, + "grad_norm": 0.4969970473771617, + "learning_rate": 4.647544129929733e-06, + "loss": 0.4653, + "step": 9046 + }, + { + "epoch": 3.5408001975796495, + "grad_norm": 0.4819114190057846, + "learning_rate": 4.647463917968275e-06, + "loss": 0.4526, + "step": 9047 + }, + { + "epoch": 3.5411953568782417, + "grad_norm": 0.48764776197722826, + "learning_rate": 4.647383697572883e-06, + "loss": 0.4338, + "step": 9048 + }, + { + "epoch": 3.541590516176834, + "grad_norm": 0.4818100110717255, + "learning_rate": 4.647303468743873e-06, + "loss": 0.4755, + "step": 9049 + }, + { + "epoch": 3.541985675475426, + "grad_norm": 0.5005030935665212, + "learning_rate": 4.647223231481557e-06, + "loss": 0.4614, + "step": 9050 + }, + { + "epoch": 3.5423808347740184, + "grad_norm": 0.5150064517997461, + "learning_rate": 4.647142985786254e-06, + "loss": 0.4648, + "step": 9051 + }, + { + "epoch": 3.5427759940726107, + "grad_norm": 0.48527068890730546, + "learning_rate": 4.647062731658276e-06, + "loss": 0.4558, + "step": 9052 + }, + { + "epoch": 3.543171153371203, + "grad_norm": 0.5093185573266966, + "learning_rate": 4.6469824690979394e-06, + "loss": 0.4848, + "step": 9053 + }, + { + "epoch": 3.543566312669795, + "grad_norm": 0.5371264924132514, + "learning_rate": 4.64690219810556e-06, + "loss": 0.4721, + "step": 9054 + }, + { + "epoch": 3.5439614719683874, + "grad_norm": 0.49196835315592014, + "learning_rate": 4.646821918681451e-06, + "loss": 0.4584, + "step": 9055 + }, + { + "epoch": 3.5443566312669796, + "grad_norm": 0.5095176634298171, + "learning_rate": 4.64674163082593e-06, + "loss": 0.454, + "step": 9056 + }, + { + "epoch": 3.544751790565572, + "grad_norm": 0.48759629545526767, + "learning_rate": 4.646661334539312e-06, + "loss": 0.4679, + "step": 9057 + }, + { + "epoch": 3.545146949864164, + "grad_norm": 0.4870613246032581, + "learning_rate": 4.646581029821912e-06, + "loss": 0.4512, + "step": 9058 + }, + { + "epoch": 3.5455421091627564, + "grad_norm": 0.4823673270563382, + "learning_rate": 4.646500716674044e-06, + "loss": 0.4649, + "step": 9059 + }, + { + "epoch": 3.5459372684613486, + "grad_norm": 0.4977917296918362, + "learning_rate": 4.646420395096025e-06, + "loss": 0.4615, + "step": 9060 + }, + { + "epoch": 3.546332427759941, + "grad_norm": 0.4981541055070267, + "learning_rate": 4.646340065088169e-06, + "loss": 0.4505, + "step": 9061 + }, + { + "epoch": 3.546727587058533, + "grad_norm": 0.4768432018824217, + "learning_rate": 4.646259726650795e-06, + "loss": 0.4401, + "step": 9062 + }, + { + "epoch": 3.5471227463571253, + "grad_norm": 0.49995907837696146, + "learning_rate": 4.646179379784215e-06, + "loss": 0.4566, + "step": 9063 + }, + { + "epoch": 3.5475179056557176, + "grad_norm": 0.529211500161168, + "learning_rate": 4.646099024488745e-06, + "loss": 0.4578, + "step": 9064 + }, + { + "epoch": 3.54791306495431, + "grad_norm": 0.49321795179014727, + "learning_rate": 4.646018660764702e-06, + "loss": 0.4441, + "step": 9065 + }, + { + "epoch": 3.548308224252902, + "grad_norm": 0.49786103565113565, + "learning_rate": 4.6459382886123996e-06, + "loss": 0.4731, + "step": 9066 + }, + { + "epoch": 3.5487033835514943, + "grad_norm": 0.48219336482722647, + "learning_rate": 4.645857908032156e-06, + "loss": 0.4381, + "step": 9067 + }, + { + "epoch": 3.5490985428500865, + "grad_norm": 0.4923319509953273, + "learning_rate": 4.645777519024285e-06, + "loss": 0.4582, + "step": 9068 + }, + { + "epoch": 3.5494937021486788, + "grad_norm": 0.4865553175937021, + "learning_rate": 4.645697121589103e-06, + "loss": 0.4423, + "step": 9069 + }, + { + "epoch": 3.549888861447271, + "grad_norm": 0.48095897113056707, + "learning_rate": 4.645616715726926e-06, + "loss": 0.448, + "step": 9070 + }, + { + "epoch": 3.5502840207458632, + "grad_norm": 0.5001311843314722, + "learning_rate": 4.645536301438069e-06, + "loss": 0.4466, + "step": 9071 + }, + { + "epoch": 3.5506791800444555, + "grad_norm": 0.477997428256851, + "learning_rate": 4.645455878722848e-06, + "loss": 0.4532, + "step": 9072 + }, + { + "epoch": 3.5510743393430477, + "grad_norm": 0.49193722985280725, + "learning_rate": 4.64537544758158e-06, + "loss": 0.4599, + "step": 9073 + }, + { + "epoch": 3.55146949864164, + "grad_norm": 0.4920447859862548, + "learning_rate": 4.645295008014579e-06, + "loss": 0.4532, + "step": 9074 + }, + { + "epoch": 3.551864657940232, + "grad_norm": 0.49558747516046264, + "learning_rate": 4.645214560022162e-06, + "loss": 0.4736, + "step": 9075 + }, + { + "epoch": 3.5522598172388244, + "grad_norm": 0.4762337243641712, + "learning_rate": 4.6451341036046455e-06, + "loss": 0.4517, + "step": 9076 + }, + { + "epoch": 3.5526549765374167, + "grad_norm": 0.5013245757181765, + "learning_rate": 4.6450536387623444e-06, + "loss": 0.4648, + "step": 9077 + }, + { + "epoch": 3.553050135836009, + "grad_norm": 0.5403068478658516, + "learning_rate": 4.644973165495576e-06, + "loss": 0.4449, + "step": 9078 + }, + { + "epoch": 3.553445295134601, + "grad_norm": 0.4931560024394474, + "learning_rate": 4.644892683804653e-06, + "loss": 0.4646, + "step": 9079 + }, + { + "epoch": 3.5538404544331934, + "grad_norm": 0.5078764038379829, + "learning_rate": 4.644812193689897e-06, + "loss": 0.4648, + "step": 9080 + }, + { + "epoch": 3.5542356137317856, + "grad_norm": 0.506957616825252, + "learning_rate": 4.64473169515162e-06, + "loss": 0.4636, + "step": 9081 + }, + { + "epoch": 3.554630773030378, + "grad_norm": 0.507432762723005, + "learning_rate": 4.644651188190139e-06, + "loss": 0.4732, + "step": 9082 + }, + { + "epoch": 3.55502593232897, + "grad_norm": 0.48687923691371193, + "learning_rate": 4.6445706728057705e-06, + "loss": 0.4512, + "step": 9083 + }, + { + "epoch": 3.5554210916275624, + "grad_norm": 0.6304721111076129, + "learning_rate": 4.64449014899883e-06, + "loss": 0.4679, + "step": 9084 + }, + { + "epoch": 3.5558162509261546, + "grad_norm": 0.4888618812137909, + "learning_rate": 4.644409616769635e-06, + "loss": 0.458, + "step": 9085 + }, + { + "epoch": 3.556211410224747, + "grad_norm": 0.4859315158657899, + "learning_rate": 4.644329076118502e-06, + "loss": 0.4544, + "step": 9086 + }, + { + "epoch": 3.556606569523339, + "grad_norm": 0.4927974306644837, + "learning_rate": 4.6442485270457464e-06, + "loss": 0.4546, + "step": 9087 + }, + { + "epoch": 3.5570017288219313, + "grad_norm": 0.46978089261360473, + "learning_rate": 4.644167969551683e-06, + "loss": 0.4435, + "step": 9088 + }, + { + "epoch": 3.5573968881205236, + "grad_norm": 0.5002112740039599, + "learning_rate": 4.644087403636631e-06, + "loss": 0.4575, + "step": 9089 + }, + { + "epoch": 3.557792047419116, + "grad_norm": 0.4816381481041411, + "learning_rate": 4.644006829300906e-06, + "loss": 0.4441, + "step": 9090 + }, + { + "epoch": 3.558187206717708, + "grad_norm": 0.5106665981705985, + "learning_rate": 4.643926246544823e-06, + "loss": 0.4611, + "step": 9091 + }, + { + "epoch": 3.5585823660163003, + "grad_norm": 0.49510570450790387, + "learning_rate": 4.6438456553687e-06, + "loss": 0.4687, + "step": 9092 + }, + { + "epoch": 3.5589775253148925, + "grad_norm": 0.507508378297751, + "learning_rate": 4.6437650557728535e-06, + "loss": 0.4652, + "step": 9093 + }, + { + "epoch": 3.5593726846134848, + "grad_norm": 0.49897649507220837, + "learning_rate": 4.643684447757599e-06, + "loss": 0.45, + "step": 9094 + }, + { + "epoch": 3.559767843912077, + "grad_norm": 0.5042685832105795, + "learning_rate": 4.643603831323255e-06, + "loss": 0.447, + "step": 9095 + }, + { + "epoch": 3.5601630032106693, + "grad_norm": 0.5209604652999842, + "learning_rate": 4.643523206470135e-06, + "loss": 0.4588, + "step": 9096 + }, + { + "epoch": 3.5605581625092615, + "grad_norm": 0.49949487365809175, + "learning_rate": 4.6434425731985585e-06, + "loss": 0.4741, + "step": 9097 + }, + { + "epoch": 3.5609533218078537, + "grad_norm": 0.5157946856320857, + "learning_rate": 4.643361931508841e-06, + "loss": 0.4709, + "step": 9098 + }, + { + "epoch": 3.561348481106446, + "grad_norm": 0.5076379876922849, + "learning_rate": 4.6432812814013e-06, + "loss": 0.4518, + "step": 9099 + }, + { + "epoch": 3.561743640405038, + "grad_norm": 0.48357121942351644, + "learning_rate": 4.64320062287625e-06, + "loss": 0.4689, + "step": 9100 + }, + { + "epoch": 3.5621387997036305, + "grad_norm": 0.4851016675612043, + "learning_rate": 4.64311995593401e-06, + "loss": 0.4623, + "step": 9101 + }, + { + "epoch": 3.5625339590022227, + "grad_norm": 0.5032142946703332, + "learning_rate": 4.643039280574897e-06, + "loss": 0.4569, + "step": 9102 + }, + { + "epoch": 3.562929118300815, + "grad_norm": 0.4981056358231815, + "learning_rate": 4.642958596799226e-06, + "loss": 0.4652, + "step": 9103 + }, + { + "epoch": 3.563324277599407, + "grad_norm": 0.5045031493173247, + "learning_rate": 4.642877904607316e-06, + "loss": 0.4757, + "step": 9104 + }, + { + "epoch": 3.5637194368979994, + "grad_norm": 0.49929741491236107, + "learning_rate": 4.642797203999482e-06, + "loss": 0.4553, + "step": 9105 + }, + { + "epoch": 3.5641145961965917, + "grad_norm": 0.48493063212795195, + "learning_rate": 4.6427164949760415e-06, + "loss": 0.4519, + "step": 9106 + }, + { + "epoch": 3.564509755495184, + "grad_norm": 0.4825224454556553, + "learning_rate": 4.642635777537312e-06, + "loss": 0.4657, + "step": 9107 + }, + { + "epoch": 3.564904914793776, + "grad_norm": 0.4923598417193523, + "learning_rate": 4.6425550516836106e-06, + "loss": 0.4535, + "step": 9108 + }, + { + "epoch": 3.5653000740923684, + "grad_norm": 0.5002848790092057, + "learning_rate": 4.6424743174152544e-06, + "loss": 0.4617, + "step": 9109 + }, + { + "epoch": 3.5656952333909606, + "grad_norm": 0.5246927195871764, + "learning_rate": 4.642393574732559e-06, + "loss": 0.4581, + "step": 9110 + }, + { + "epoch": 3.566090392689553, + "grad_norm": 0.5054517893159672, + "learning_rate": 4.642312823635843e-06, + "loss": 0.4571, + "step": 9111 + }, + { + "epoch": 3.566485551988145, + "grad_norm": 0.483406457593407, + "learning_rate": 4.642232064125424e-06, + "loss": 0.4415, + "step": 9112 + }, + { + "epoch": 3.5668807112867373, + "grad_norm": 0.49885049133184317, + "learning_rate": 4.642151296201617e-06, + "loss": 0.45, + "step": 9113 + }, + { + "epoch": 3.5672758705853296, + "grad_norm": 0.6059844092001336, + "learning_rate": 4.6420705198647405e-06, + "loss": 0.4584, + "step": 9114 + }, + { + "epoch": 3.567671029883922, + "grad_norm": 0.49181406902845737, + "learning_rate": 4.641989735115112e-06, + "loss": 0.4593, + "step": 9115 + }, + { + "epoch": 3.568066189182514, + "grad_norm": 0.4942661823361688, + "learning_rate": 4.64190894195305e-06, + "loss": 0.4683, + "step": 9116 + }, + { + "epoch": 3.5684613484811063, + "grad_norm": 0.4954476984028424, + "learning_rate": 4.641828140378868e-06, + "loss": 0.4657, + "step": 9117 + }, + { + "epoch": 3.5688565077796985, + "grad_norm": 0.4937799659377194, + "learning_rate": 4.641747330392886e-06, + "loss": 0.4503, + "step": 9118 + }, + { + "epoch": 3.569251667078291, + "grad_norm": 0.4902163898739017, + "learning_rate": 4.641666511995422e-06, + "loss": 0.463, + "step": 9119 + }, + { + "epoch": 3.569646826376883, + "grad_norm": 0.5043714006810839, + "learning_rate": 4.641585685186792e-06, + "loss": 0.4624, + "step": 9120 + }, + { + "epoch": 3.5700419856754753, + "grad_norm": 0.5004840014877152, + "learning_rate": 4.641504849967315e-06, + "loss": 0.4879, + "step": 9121 + }, + { + "epoch": 3.5704371449740675, + "grad_norm": 0.5077783702630878, + "learning_rate": 4.6414240063373065e-06, + "loss": 0.4456, + "step": 9122 + }, + { + "epoch": 3.5708323042726597, + "grad_norm": 0.4824382259117086, + "learning_rate": 4.6413431542970845e-06, + "loss": 0.4638, + "step": 9123 + }, + { + "epoch": 3.571227463571252, + "grad_norm": 0.4871993531897932, + "learning_rate": 4.641262293846966e-06, + "loss": 0.4599, + "step": 9124 + }, + { + "epoch": 3.571622622869844, + "grad_norm": 0.49457463356784, + "learning_rate": 4.641181424987271e-06, + "loss": 0.4611, + "step": 9125 + }, + { + "epoch": 3.5720177821684365, + "grad_norm": 0.48893854230152123, + "learning_rate": 4.641100547718314e-06, + "loss": 0.4459, + "step": 9126 + }, + { + "epoch": 3.5724129414670287, + "grad_norm": 0.4814786016167628, + "learning_rate": 4.641019662040417e-06, + "loss": 0.4518, + "step": 9127 + }, + { + "epoch": 3.572808100765621, + "grad_norm": 0.4990303217153392, + "learning_rate": 4.6409387679538925e-06, + "loss": 0.4668, + "step": 9128 + }, + { + "epoch": 3.573203260064213, + "grad_norm": 0.5001213765361451, + "learning_rate": 4.640857865459061e-06, + "loss": 0.4632, + "step": 9129 + }, + { + "epoch": 3.5735984193628054, + "grad_norm": 0.47937361186876326, + "learning_rate": 4.6407769545562395e-06, + "loss": 0.4403, + "step": 9130 + }, + { + "epoch": 3.5739935786613977, + "grad_norm": 0.4886845046562659, + "learning_rate": 4.6406960352457476e-06, + "loss": 0.4671, + "step": 9131 + }, + { + "epoch": 3.57438873795999, + "grad_norm": 0.6230155087433286, + "learning_rate": 4.6406151075279e-06, + "loss": 0.4663, + "step": 9132 + }, + { + "epoch": 3.574783897258582, + "grad_norm": 0.49211316752499784, + "learning_rate": 4.640534171403017e-06, + "loss": 0.4662, + "step": 9133 + }, + { + "epoch": 3.5751790565571744, + "grad_norm": 0.47931129273412515, + "learning_rate": 4.640453226871415e-06, + "loss": 0.462, + "step": 9134 + }, + { + "epoch": 3.5755742158557666, + "grad_norm": 0.4801125074561214, + "learning_rate": 4.640372273933412e-06, + "loss": 0.4514, + "step": 9135 + }, + { + "epoch": 3.575969375154359, + "grad_norm": 0.48855690347192954, + "learning_rate": 4.6402913125893275e-06, + "loss": 0.4655, + "step": 9136 + }, + { + "epoch": 3.576364534452951, + "grad_norm": 0.4902447553553269, + "learning_rate": 4.640210342839479e-06, + "loss": 0.4594, + "step": 9137 + }, + { + "epoch": 3.5767596937515433, + "grad_norm": 0.48368454552996915, + "learning_rate": 4.640129364684182e-06, + "loss": 0.4566, + "step": 9138 + }, + { + "epoch": 3.5771548530501356, + "grad_norm": 0.514151993001096, + "learning_rate": 4.640048378123757e-06, + "loss": 0.4547, + "step": 9139 + }, + { + "epoch": 3.577550012348728, + "grad_norm": 0.4776789489243305, + "learning_rate": 4.639967383158523e-06, + "loss": 0.4598, + "step": 9140 + }, + { + "epoch": 3.57794517164732, + "grad_norm": 0.48525569302537913, + "learning_rate": 4.639886379788794e-06, + "loss": 0.4418, + "step": 9141 + }, + { + "epoch": 3.5783403309459123, + "grad_norm": 0.478721665258136, + "learning_rate": 4.6398053680148926e-06, + "loss": 0.4651, + "step": 9142 + }, + { + "epoch": 3.5787354902445045, + "grad_norm": 0.47444493198435933, + "learning_rate": 4.639724347837135e-06, + "loss": 0.4424, + "step": 9143 + }, + { + "epoch": 3.579130649543097, + "grad_norm": 0.5070133096361706, + "learning_rate": 4.639643319255838e-06, + "loss": 0.466, + "step": 9144 + }, + { + "epoch": 3.5795258088416895, + "grad_norm": 0.4866231994143781, + "learning_rate": 4.639562282271323e-06, + "loss": 0.4792, + "step": 9145 + }, + { + "epoch": 3.5799209681402817, + "grad_norm": 0.513869457185633, + "learning_rate": 4.6394812368839055e-06, + "loss": 0.4557, + "step": 9146 + }, + { + "epoch": 3.580316127438874, + "grad_norm": 0.48227115983762886, + "learning_rate": 4.639400183093905e-06, + "loss": 0.4516, + "step": 9147 + }, + { + "epoch": 3.580711286737466, + "grad_norm": 0.5869304828960821, + "learning_rate": 4.63931912090164e-06, + "loss": 0.4585, + "step": 9148 + }, + { + "epoch": 3.5811064460360584, + "grad_norm": 0.4882734722747367, + "learning_rate": 4.639238050307428e-06, + "loss": 0.4565, + "step": 9149 + }, + { + "epoch": 3.5815016053346507, + "grad_norm": 0.5044115242028842, + "learning_rate": 4.639156971311589e-06, + "loss": 0.4693, + "step": 9150 + }, + { + "epoch": 3.581896764633243, + "grad_norm": 0.5019659383311873, + "learning_rate": 4.63907588391444e-06, + "loss": 0.4629, + "step": 9151 + }, + { + "epoch": 3.582291923931835, + "grad_norm": 0.5141519049977628, + "learning_rate": 4.638994788116299e-06, + "loss": 0.4677, + "step": 9152 + }, + { + "epoch": 3.5826870832304274, + "grad_norm": 0.4785979998489708, + "learning_rate": 4.638913683917486e-06, + "loss": 0.4595, + "step": 9153 + }, + { + "epoch": 3.5830822425290196, + "grad_norm": 0.4922496588594111, + "learning_rate": 4.638832571318319e-06, + "loss": 0.4422, + "step": 9154 + }, + { + "epoch": 3.583477401827612, + "grad_norm": 0.49580896733041097, + "learning_rate": 4.6387514503191165e-06, + "loss": 0.448, + "step": 9155 + }, + { + "epoch": 3.583872561126204, + "grad_norm": 0.48090076896396333, + "learning_rate": 4.638670320920196e-06, + "loss": 0.4618, + "step": 9156 + }, + { + "epoch": 3.5842677204247964, + "grad_norm": 0.4790421513834484, + "learning_rate": 4.638589183121879e-06, + "loss": 0.467, + "step": 9157 + }, + { + "epoch": 3.5846628797233886, + "grad_norm": 0.4878225988687124, + "learning_rate": 4.63850803692448e-06, + "loss": 0.4849, + "step": 9158 + }, + { + "epoch": 3.585058039021981, + "grad_norm": 0.4870640632594355, + "learning_rate": 4.638426882328322e-06, + "loss": 0.4645, + "step": 9159 + }, + { + "epoch": 3.585453198320573, + "grad_norm": 0.5050638549536798, + "learning_rate": 4.638345719333721e-06, + "loss": 0.4572, + "step": 9160 + }, + { + "epoch": 3.5858483576191653, + "grad_norm": 0.5069748386116536, + "learning_rate": 4.638264547940996e-06, + "loss": 0.4553, + "step": 9161 + }, + { + "epoch": 3.5862435169177576, + "grad_norm": 0.5578612016671592, + "learning_rate": 4.6381833681504675e-06, + "loss": 0.4625, + "step": 9162 + }, + { + "epoch": 3.58663867621635, + "grad_norm": 0.4656285059162261, + "learning_rate": 4.638102179962452e-06, + "loss": 0.4616, + "step": 9163 + }, + { + "epoch": 3.587033835514942, + "grad_norm": 0.4795082800303555, + "learning_rate": 4.63802098337727e-06, + "loss": 0.4658, + "step": 9164 + }, + { + "epoch": 3.5874289948135343, + "grad_norm": 0.4989309789649853, + "learning_rate": 4.637939778395239e-06, + "loss": 0.4533, + "step": 9165 + }, + { + "epoch": 3.5878241541121265, + "grad_norm": 0.4790162592288028, + "learning_rate": 4.637858565016679e-06, + "loss": 0.4608, + "step": 9166 + }, + { + "epoch": 3.5882193134107188, + "grad_norm": 0.4881752317773561, + "learning_rate": 4.6377773432419105e-06, + "loss": 0.4633, + "step": 9167 + }, + { + "epoch": 3.588614472709311, + "grad_norm": 0.4892512885912421, + "learning_rate": 4.637696113071249e-06, + "loss": 0.4697, + "step": 9168 + }, + { + "epoch": 3.5890096320079032, + "grad_norm": 0.4723594520462734, + "learning_rate": 4.637614874505016e-06, + "loss": 0.4436, + "step": 9169 + }, + { + "epoch": 3.5894047913064955, + "grad_norm": 0.4893238305591922, + "learning_rate": 4.637533627543529e-06, + "loss": 0.4557, + "step": 9170 + }, + { + "epoch": 3.5897999506050877, + "grad_norm": 0.48322974662494034, + "learning_rate": 4.637452372187109e-06, + "loss": 0.4547, + "step": 9171 + }, + { + "epoch": 3.59019510990368, + "grad_norm": 0.5042287973047576, + "learning_rate": 4.6373711084360725e-06, + "loss": 0.4595, + "step": 9172 + }, + { + "epoch": 3.590590269202272, + "grad_norm": 0.48463044178358355, + "learning_rate": 4.637289836290741e-06, + "loss": 0.4553, + "step": 9173 + }, + { + "epoch": 3.5909854285008644, + "grad_norm": 0.48284154312198374, + "learning_rate": 4.6372085557514335e-06, + "loss": 0.4398, + "step": 9174 + }, + { + "epoch": 3.5913805877994567, + "grad_norm": 0.48052961472036454, + "learning_rate": 4.637127266818467e-06, + "loss": 0.4667, + "step": 9175 + }, + { + "epoch": 3.591775747098049, + "grad_norm": 0.5272462072802832, + "learning_rate": 4.637045969492164e-06, + "loss": 0.4617, + "step": 9176 + }, + { + "epoch": 3.592170906396641, + "grad_norm": 0.47896935915209116, + "learning_rate": 4.636964663772841e-06, + "loss": 0.4543, + "step": 9177 + }, + { + "epoch": 3.5925660656952334, + "grad_norm": 0.5590814683075651, + "learning_rate": 4.636883349660819e-06, + "loss": 0.4606, + "step": 9178 + }, + { + "epoch": 3.5929612249938256, + "grad_norm": 0.5042016745245929, + "learning_rate": 4.6368020271564166e-06, + "loss": 0.4789, + "step": 9179 + }, + { + "epoch": 3.593356384292418, + "grad_norm": 0.47545093889683404, + "learning_rate": 4.636720696259954e-06, + "loss": 0.4471, + "step": 9180 + }, + { + "epoch": 3.59375154359101, + "grad_norm": 0.4886174481015884, + "learning_rate": 4.636639356971749e-06, + "loss": 0.4501, + "step": 9181 + }, + { + "epoch": 3.5941467028896024, + "grad_norm": 0.4909615597451317, + "learning_rate": 4.6365580092921224e-06, + "loss": 0.4536, + "step": 9182 + }, + { + "epoch": 3.5945418621881946, + "grad_norm": 0.5011647702759715, + "learning_rate": 4.6364766532213936e-06, + "loss": 0.4594, + "step": 9183 + }, + { + "epoch": 3.594937021486787, + "grad_norm": 0.4891039483973173, + "learning_rate": 4.636395288759881e-06, + "loss": 0.4428, + "step": 9184 + }, + { + "epoch": 3.595332180785379, + "grad_norm": 0.49238304772818725, + "learning_rate": 4.6363139159079056e-06, + "loss": 0.4598, + "step": 9185 + }, + { + "epoch": 3.5957273400839713, + "grad_norm": 0.499060773425193, + "learning_rate": 4.636232534665787e-06, + "loss": 0.4816, + "step": 9186 + }, + { + "epoch": 3.5961224993825636, + "grad_norm": 0.4876846908708361, + "learning_rate": 4.636151145033844e-06, + "loss": 0.4695, + "step": 9187 + }, + { + "epoch": 3.596517658681156, + "grad_norm": 0.5010421129141813, + "learning_rate": 4.636069747012395e-06, + "loss": 0.4674, + "step": 9188 + }, + { + "epoch": 3.596912817979748, + "grad_norm": 0.4742596611960192, + "learning_rate": 4.6359883406017625e-06, + "loss": 0.4597, + "step": 9189 + }, + { + "epoch": 3.5973079772783403, + "grad_norm": 0.5045304178288424, + "learning_rate": 4.635906925802264e-06, + "loss": 0.4566, + "step": 9190 + }, + { + "epoch": 3.5977031365769325, + "grad_norm": 0.49285646311806497, + "learning_rate": 4.635825502614221e-06, + "loss": 0.4506, + "step": 9191 + }, + { + "epoch": 3.5980982958755248, + "grad_norm": 1.3189611198454247, + "learning_rate": 4.635744071037952e-06, + "loss": 0.4826, + "step": 9192 + }, + { + "epoch": 3.598493455174117, + "grad_norm": 0.4978191275680937, + "learning_rate": 4.6356626310737774e-06, + "loss": 0.4644, + "step": 9193 + }, + { + "epoch": 3.5988886144727092, + "grad_norm": 0.48939510446702533, + "learning_rate": 4.635581182722017e-06, + "loss": 0.4695, + "step": 9194 + }, + { + "epoch": 3.5992837737713015, + "grad_norm": 0.5054530113910645, + "learning_rate": 4.635499725982989e-06, + "loss": 0.4633, + "step": 9195 + }, + { + "epoch": 3.5996789330698937, + "grad_norm": 0.48017657962523336, + "learning_rate": 4.6354182608570155e-06, + "loss": 0.4469, + "step": 9196 + }, + { + "epoch": 3.600074092368486, + "grad_norm": 0.5007634832293895, + "learning_rate": 4.635336787344416e-06, + "loss": 0.4594, + "step": 9197 + }, + { + "epoch": 3.600469251667078, + "grad_norm": 0.5032668243356092, + "learning_rate": 4.635255305445511e-06, + "loss": 0.4653, + "step": 9198 + }, + { + "epoch": 3.6008644109656704, + "grad_norm": 0.49875765737609773, + "learning_rate": 4.635173815160619e-06, + "loss": 0.4605, + "step": 9199 + }, + { + "epoch": 3.6012595702642627, + "grad_norm": 0.5062930640900483, + "learning_rate": 4.635092316490061e-06, + "loss": 0.4514, + "step": 9200 + }, + { + "epoch": 3.601654729562855, + "grad_norm": 0.4974728918315835, + "learning_rate": 4.635010809434157e-06, + "loss": 0.4514, + "step": 9201 + }, + { + "epoch": 3.602049888861447, + "grad_norm": 0.49648655888984, + "learning_rate": 4.634929293993226e-06, + "loss": 0.4744, + "step": 9202 + }, + { + "epoch": 3.6024450481600394, + "grad_norm": 0.4824566489860067, + "learning_rate": 4.634847770167591e-06, + "loss": 0.4635, + "step": 9203 + }, + { + "epoch": 3.6028402074586316, + "grad_norm": 0.4979705347753646, + "learning_rate": 4.6347662379575685e-06, + "loss": 0.451, + "step": 9204 + }, + { + "epoch": 3.603235366757224, + "grad_norm": 0.5062537574749927, + "learning_rate": 4.634684697363482e-06, + "loss": 0.4744, + "step": 9205 + }, + { + "epoch": 3.603630526055816, + "grad_norm": 0.5738607455547821, + "learning_rate": 4.634603148385649e-06, + "loss": 0.4592, + "step": 9206 + }, + { + "epoch": 3.6040256853544084, + "grad_norm": 1.189807804448986, + "learning_rate": 4.6345215910243915e-06, + "loss": 0.4479, + "step": 9207 + }, + { + "epoch": 3.6044208446530006, + "grad_norm": 0.4879581966562651, + "learning_rate": 4.634440025280029e-06, + "loss": 0.4646, + "step": 9208 + }, + { + "epoch": 3.604816003951593, + "grad_norm": 0.5497629591149095, + "learning_rate": 4.634358451152883e-06, + "loss": 0.4546, + "step": 9209 + }, + { + "epoch": 3.605211163250185, + "grad_norm": 0.5194558268040678, + "learning_rate": 4.634276868643273e-06, + "loss": 0.4611, + "step": 9210 + }, + { + "epoch": 3.6056063225487773, + "grad_norm": 0.4845956782876475, + "learning_rate": 4.634195277751518e-06, + "loss": 0.4445, + "step": 9211 + }, + { + "epoch": 3.6060014818473696, + "grad_norm": 0.4940635660411414, + "learning_rate": 4.634113678477942e-06, + "loss": 0.4524, + "step": 9212 + }, + { + "epoch": 3.606396641145962, + "grad_norm": 0.5070909576956948, + "learning_rate": 4.634032070822862e-06, + "loss": 0.4833, + "step": 9213 + }, + { + "epoch": 3.606791800444554, + "grad_norm": 0.490670333205641, + "learning_rate": 4.633950454786601e-06, + "loss": 0.4624, + "step": 9214 + }, + { + "epoch": 3.6071869597431463, + "grad_norm": 0.4835269632726879, + "learning_rate": 4.633868830369477e-06, + "loss": 0.4555, + "step": 9215 + }, + { + "epoch": 3.607582119041739, + "grad_norm": 0.4992739285137002, + "learning_rate": 4.633787197571813e-06, + "loss": 0.4769, + "step": 9216 + }, + { + "epoch": 3.607977278340331, + "grad_norm": 0.5402074831328878, + "learning_rate": 4.633705556393928e-06, + "loss": 0.4751, + "step": 9217 + }, + { + "epoch": 3.6083724376389235, + "grad_norm": 0.48961767171363246, + "learning_rate": 4.633623906836144e-06, + "loss": 0.4565, + "step": 9218 + }, + { + "epoch": 3.6087675969375157, + "grad_norm": 0.4838103684730782, + "learning_rate": 4.63354224889878e-06, + "loss": 0.4506, + "step": 9219 + }, + { + "epoch": 3.609162756236108, + "grad_norm": 0.4925277287138113, + "learning_rate": 4.633460582582157e-06, + "loss": 0.4488, + "step": 9220 + }, + { + "epoch": 3.6095579155347, + "grad_norm": 0.4825571381803209, + "learning_rate": 4.633378907886597e-06, + "loss": 0.4607, + "step": 9221 + }, + { + "epoch": 3.6099530748332924, + "grad_norm": 0.48936877599995243, + "learning_rate": 4.633297224812422e-06, + "loss": 0.4578, + "step": 9222 + }, + { + "epoch": 3.6103482341318847, + "grad_norm": 0.4905493219692705, + "learning_rate": 4.633215533359949e-06, + "loss": 0.4848, + "step": 9223 + }, + { + "epoch": 3.610743393430477, + "grad_norm": 0.5041306176026158, + "learning_rate": 4.633133833529501e-06, + "loss": 0.4625, + "step": 9224 + }, + { + "epoch": 3.611138552729069, + "grad_norm": 0.47807489803501046, + "learning_rate": 4.633052125321399e-06, + "loss": 0.4579, + "step": 9225 + }, + { + "epoch": 3.6115337120276614, + "grad_norm": 0.4905504604932367, + "learning_rate": 4.632970408735963e-06, + "loss": 0.4677, + "step": 9226 + }, + { + "epoch": 3.6119288713262536, + "grad_norm": 0.48751702506148886, + "learning_rate": 4.632888683773515e-06, + "loss": 0.4787, + "step": 9227 + }, + { + "epoch": 3.612324030624846, + "grad_norm": 0.497021089178736, + "learning_rate": 4.6328069504343745e-06, + "loss": 0.4421, + "step": 9228 + }, + { + "epoch": 3.612719189923438, + "grad_norm": 0.5243484285892979, + "learning_rate": 4.632725208718864e-06, + "loss": 0.451, + "step": 9229 + }, + { + "epoch": 3.6131143492220303, + "grad_norm": 0.49379034366259994, + "learning_rate": 4.6326434586273035e-06, + "loss": 0.4446, + "step": 9230 + }, + { + "epoch": 3.6135095085206226, + "grad_norm": 0.5142924559989173, + "learning_rate": 4.632561700160015e-06, + "loss": 0.4488, + "step": 9231 + }, + { + "epoch": 3.613904667819215, + "grad_norm": 0.504763239789171, + "learning_rate": 4.632479933317319e-06, + "loss": 0.4603, + "step": 9232 + }, + { + "epoch": 3.614299827117807, + "grad_norm": 0.503230561587256, + "learning_rate": 4.632398158099537e-06, + "loss": 0.4544, + "step": 9233 + }, + { + "epoch": 3.6146949864163993, + "grad_norm": 0.490744168584867, + "learning_rate": 4.63231637450699e-06, + "loss": 0.4409, + "step": 9234 + }, + { + "epoch": 3.6150901457149915, + "grad_norm": 0.5095413791531018, + "learning_rate": 4.6322345825399985e-06, + "loss": 0.4655, + "step": 9235 + }, + { + "epoch": 3.615485305013584, + "grad_norm": 0.4937333674079909, + "learning_rate": 4.6321527821988845e-06, + "loss": 0.4571, + "step": 9236 + }, + { + "epoch": 3.615880464312176, + "grad_norm": 0.4898084499608632, + "learning_rate": 4.632070973483969e-06, + "loss": 0.4839, + "step": 9237 + }, + { + "epoch": 3.6162756236107683, + "grad_norm": 0.5007373601317516, + "learning_rate": 4.631989156395574e-06, + "loss": 0.4541, + "step": 9238 + }, + { + "epoch": 3.6166707829093605, + "grad_norm": 0.48723321374477363, + "learning_rate": 4.631907330934019e-06, + "loss": 0.48, + "step": 9239 + }, + { + "epoch": 3.6170659422079527, + "grad_norm": 0.48448261567750023, + "learning_rate": 4.631825497099627e-06, + "loss": 0.4543, + "step": 9240 + }, + { + "epoch": 3.617461101506545, + "grad_norm": 0.5316625962607656, + "learning_rate": 4.63174365489272e-06, + "loss": 0.4764, + "step": 9241 + }, + { + "epoch": 3.6178562608051372, + "grad_norm": 0.49953626646231936, + "learning_rate": 4.6316618043136165e-06, + "loss": 0.4518, + "step": 9242 + }, + { + "epoch": 3.6182514201037295, + "grad_norm": 0.5003965600017454, + "learning_rate": 4.631579945362641e-06, + "loss": 0.4579, + "step": 9243 + }, + { + "epoch": 3.6186465794023217, + "grad_norm": 0.48092823514138977, + "learning_rate": 4.631498078040114e-06, + "loss": 0.448, + "step": 9244 + }, + { + "epoch": 3.619041738700914, + "grad_norm": 0.49710416001614555, + "learning_rate": 4.631416202346357e-06, + "loss": 0.4595, + "step": 9245 + }, + { + "epoch": 3.619436897999506, + "grad_norm": 0.49446651343934783, + "learning_rate": 4.631334318281691e-06, + "loss": 0.4602, + "step": 9246 + }, + { + "epoch": 3.6198320572980984, + "grad_norm": 0.5009277044883268, + "learning_rate": 4.631252425846439e-06, + "loss": 0.4408, + "step": 9247 + }, + { + "epoch": 3.6202272165966907, + "grad_norm": 0.4948355571473325, + "learning_rate": 4.63117052504092e-06, + "loss": 0.4506, + "step": 9248 + }, + { + "epoch": 3.620622375895283, + "grad_norm": 0.48896770764585235, + "learning_rate": 4.631088615865458e-06, + "loss": 0.455, + "step": 9249 + }, + { + "epoch": 3.621017535193875, + "grad_norm": 0.48712330415560645, + "learning_rate": 4.631006698320374e-06, + "loss": 0.4579, + "step": 9250 + }, + { + "epoch": 3.6214126944924674, + "grad_norm": 0.5013178593300339, + "learning_rate": 4.630924772405989e-06, + "loss": 0.4721, + "step": 9251 + }, + { + "epoch": 3.6218078537910596, + "grad_norm": 0.691080349270438, + "learning_rate": 4.630842838122627e-06, + "loss": 0.4728, + "step": 9252 + }, + { + "epoch": 3.622203013089652, + "grad_norm": 0.4974090024974102, + "learning_rate": 4.630760895470607e-06, + "loss": 0.4682, + "step": 9253 + }, + { + "epoch": 3.622598172388244, + "grad_norm": 0.4738706863887212, + "learning_rate": 4.630678944450253e-06, + "loss": 0.43, + "step": 9254 + }, + { + "epoch": 3.6229933316868363, + "grad_norm": 0.4996067080433458, + "learning_rate": 4.630596985061886e-06, + "loss": 0.4625, + "step": 9255 + }, + { + "epoch": 3.6233884909854286, + "grad_norm": 0.49310669745464825, + "learning_rate": 4.630515017305827e-06, + "loss": 0.4723, + "step": 9256 + }, + { + "epoch": 3.623783650284021, + "grad_norm": 0.5093600935457048, + "learning_rate": 4.630433041182398e-06, + "loss": 0.4711, + "step": 9257 + }, + { + "epoch": 3.624178809582613, + "grad_norm": 0.48115236735295236, + "learning_rate": 4.630351056691923e-06, + "loss": 0.4537, + "step": 9258 + }, + { + "epoch": 3.6245739688812053, + "grad_norm": 0.5011382943630356, + "learning_rate": 4.630269063834723e-06, + "loss": 0.458, + "step": 9259 + }, + { + "epoch": 3.6249691281797976, + "grad_norm": 0.5079718914818618, + "learning_rate": 4.630187062611119e-06, + "loss": 0.4779, + "step": 9260 + }, + { + "epoch": 3.62536428747839, + "grad_norm": 0.4980405547820634, + "learning_rate": 4.630105053021433e-06, + "loss": 0.4665, + "step": 9261 + }, + { + "epoch": 3.625759446776982, + "grad_norm": 0.4893026369877525, + "learning_rate": 4.6300230350659885e-06, + "loss": 0.4648, + "step": 9262 + }, + { + "epoch": 3.6261546060755743, + "grad_norm": 0.5555460867365863, + "learning_rate": 4.629941008745108e-06, + "loss": 0.4647, + "step": 9263 + }, + { + "epoch": 3.6265497653741665, + "grad_norm": 0.5759949696380431, + "learning_rate": 4.629858974059111e-06, + "loss": 0.4673, + "step": 9264 + }, + { + "epoch": 3.6269449246727588, + "grad_norm": 0.48052405561183786, + "learning_rate": 4.629776931008322e-06, + "loss": 0.458, + "step": 9265 + }, + { + "epoch": 3.627340083971351, + "grad_norm": 0.5010331429705553, + "learning_rate": 4.629694879593062e-06, + "loss": 0.4754, + "step": 9266 + }, + { + "epoch": 3.6277352432699432, + "grad_norm": 0.5007942666179961, + "learning_rate": 4.6296128198136545e-06, + "loss": 0.4696, + "step": 9267 + }, + { + "epoch": 3.6281304025685355, + "grad_norm": 0.5016428896270185, + "learning_rate": 4.62953075167042e-06, + "loss": 0.4672, + "step": 9268 + }, + { + "epoch": 3.6285255618671277, + "grad_norm": 0.49272882028841225, + "learning_rate": 4.629448675163682e-06, + "loss": 0.4639, + "step": 9269 + }, + { + "epoch": 3.62892072116572, + "grad_norm": 0.48986056236330106, + "learning_rate": 4.629366590293763e-06, + "loss": 0.4695, + "step": 9270 + }, + { + "epoch": 3.629315880464312, + "grad_norm": 0.4962133184327803, + "learning_rate": 4.629284497060985e-06, + "loss": 0.4502, + "step": 9271 + }, + { + "epoch": 3.6297110397629044, + "grad_norm": 0.4898941204572704, + "learning_rate": 4.629202395465672e-06, + "loss": 0.4609, + "step": 9272 + }, + { + "epoch": 3.6301061990614967, + "grad_norm": 0.49506032850463966, + "learning_rate": 4.629120285508143e-06, + "loss": 0.4482, + "step": 9273 + }, + { + "epoch": 3.630501358360089, + "grad_norm": 0.5171758595413785, + "learning_rate": 4.629038167188723e-06, + "loss": 0.4501, + "step": 9274 + }, + { + "epoch": 3.630896517658681, + "grad_norm": 0.4967738395377754, + "learning_rate": 4.628956040507734e-06, + "loss": 0.4649, + "step": 9275 + }, + { + "epoch": 3.6312916769572734, + "grad_norm": 0.5063747871234539, + "learning_rate": 4.628873905465498e-06, + "loss": 0.4629, + "step": 9276 + }, + { + "epoch": 3.6316868362558656, + "grad_norm": 0.47510127574337424, + "learning_rate": 4.628791762062338e-06, + "loss": 0.4449, + "step": 9277 + }, + { + "epoch": 3.632081995554458, + "grad_norm": 0.49070050113003616, + "learning_rate": 4.628709610298578e-06, + "loss": 0.4599, + "step": 9278 + }, + { + "epoch": 3.63247715485305, + "grad_norm": 0.513370408704164, + "learning_rate": 4.628627450174537e-06, + "loss": 0.463, + "step": 9279 + }, + { + "epoch": 3.6328723141516424, + "grad_norm": 0.5091150770823012, + "learning_rate": 4.628545281690541e-06, + "loss": 0.4746, + "step": 9280 + }, + { + "epoch": 3.6332674734502346, + "grad_norm": 0.5151701240747677, + "learning_rate": 4.628463104846912e-06, + "loss": 0.4705, + "step": 9281 + }, + { + "epoch": 3.633662632748827, + "grad_norm": 0.49056073566435143, + "learning_rate": 4.628380919643972e-06, + "loss": 0.4638, + "step": 9282 + }, + { + "epoch": 3.634057792047419, + "grad_norm": 0.5144421824248759, + "learning_rate": 4.6282987260820445e-06, + "loss": 0.4778, + "step": 9283 + }, + { + "epoch": 3.6344529513460113, + "grad_norm": 0.49744475854641185, + "learning_rate": 4.6282165241614515e-06, + "loss": 0.458, + "step": 9284 + }, + { + "epoch": 3.6348481106446036, + "grad_norm": 0.5035190140052788, + "learning_rate": 4.628134313882518e-06, + "loss": 0.4693, + "step": 9285 + }, + { + "epoch": 3.635243269943196, + "grad_norm": 0.5615476058939156, + "learning_rate": 4.6280520952455635e-06, + "loss": 0.4932, + "step": 9286 + }, + { + "epoch": 3.635638429241788, + "grad_norm": 0.49189317167932184, + "learning_rate": 4.627969868250912e-06, + "loss": 0.4635, + "step": 9287 + }, + { + "epoch": 3.6360335885403803, + "grad_norm": 0.4912590147231549, + "learning_rate": 4.6278876328988885e-06, + "loss": 0.4639, + "step": 9288 + }, + { + "epoch": 3.6364287478389725, + "grad_norm": 2.521359709853842, + "learning_rate": 4.627805389189814e-06, + "loss": 0.4611, + "step": 9289 + }, + { + "epoch": 3.6368239071375648, + "grad_norm": 0.486064042323978, + "learning_rate": 4.627723137124012e-06, + "loss": 0.4412, + "step": 9290 + }, + { + "epoch": 3.637219066436157, + "grad_norm": 0.5048179415790461, + "learning_rate": 4.627640876701806e-06, + "loss": 0.4571, + "step": 9291 + }, + { + "epoch": 3.6376142257347492, + "grad_norm": 0.4959460801045448, + "learning_rate": 4.627558607923517e-06, + "loss": 0.4677, + "step": 9292 + }, + { + "epoch": 3.6380093850333415, + "grad_norm": 0.48066916313414726, + "learning_rate": 4.627476330789471e-06, + "loss": 0.4447, + "step": 9293 + }, + { + "epoch": 3.6384045443319337, + "grad_norm": 0.4973239424582944, + "learning_rate": 4.62739404529999e-06, + "loss": 0.4672, + "step": 9294 + }, + { + "epoch": 3.638799703630526, + "grad_norm": 0.48199497014952, + "learning_rate": 4.627311751455397e-06, + "loss": 0.4517, + "step": 9295 + }, + { + "epoch": 3.639194862929118, + "grad_norm": 0.49279718443129383, + "learning_rate": 4.627229449256014e-06, + "loss": 0.4693, + "step": 9296 + }, + { + "epoch": 3.6395900222277104, + "grad_norm": 0.5010681919023393, + "learning_rate": 4.627147138702166e-06, + "loss": 0.4743, + "step": 9297 + }, + { + "epoch": 3.6399851815263027, + "grad_norm": 0.47695376136618906, + "learning_rate": 4.627064819794177e-06, + "loss": 0.454, + "step": 9298 + }, + { + "epoch": 3.640380340824895, + "grad_norm": 0.4888943085386212, + "learning_rate": 4.626982492532368e-06, + "loss": 0.4456, + "step": 9299 + }, + { + "epoch": 3.640775500123487, + "grad_norm": 0.4771083873842363, + "learning_rate": 4.626900156917064e-06, + "loss": 0.468, + "step": 9300 + }, + { + "epoch": 3.6411706594220794, + "grad_norm": 0.48136951242486825, + "learning_rate": 4.626817812948586e-06, + "loss": 0.4516, + "step": 9301 + }, + { + "epoch": 3.6415658187206716, + "grad_norm": 0.4979788453537473, + "learning_rate": 4.6267354606272605e-06, + "loss": 0.4723, + "step": 9302 + }, + { + "epoch": 3.641960978019264, + "grad_norm": 0.5102309614105011, + "learning_rate": 4.62665309995341e-06, + "loss": 0.4598, + "step": 9303 + }, + { + "epoch": 3.642356137317856, + "grad_norm": 0.49649828248075617, + "learning_rate": 4.6265707309273565e-06, + "loss": 0.4686, + "step": 9304 + }, + { + "epoch": 3.6427512966164484, + "grad_norm": 0.48125108712989917, + "learning_rate": 4.626488353549425e-06, + "loss": 0.4482, + "step": 9305 + }, + { + "epoch": 3.6431464559150406, + "grad_norm": 0.5210918594356808, + "learning_rate": 4.626405967819938e-06, + "loss": 0.459, + "step": 9306 + }, + { + "epoch": 3.643541615213633, + "grad_norm": 0.47329837989409934, + "learning_rate": 4.626323573739219e-06, + "loss": 0.4465, + "step": 9307 + }, + { + "epoch": 3.643936774512225, + "grad_norm": 0.5042256332277069, + "learning_rate": 4.626241171307593e-06, + "loss": 0.4573, + "step": 9308 + }, + { + "epoch": 3.6443319338108173, + "grad_norm": 1.0400744964458153, + "learning_rate": 4.626158760525383e-06, + "loss": 0.4835, + "step": 9309 + }, + { + "epoch": 3.6447270931094096, + "grad_norm": 0.5001511613882816, + "learning_rate": 4.6260763413929124e-06, + "loss": 0.4533, + "step": 9310 + }, + { + "epoch": 3.645122252408002, + "grad_norm": 0.4940591531345029, + "learning_rate": 4.625993913910505e-06, + "loss": 0.4646, + "step": 9311 + }, + { + "epoch": 3.645517411706594, + "grad_norm": 0.4942019788158831, + "learning_rate": 4.625911478078484e-06, + "loss": 0.4551, + "step": 9312 + }, + { + "epoch": 3.6459125710051863, + "grad_norm": 0.4884265738310717, + "learning_rate": 4.6258290338971735e-06, + "loss": 0.4517, + "step": 9313 + }, + { + "epoch": 3.6463077303037785, + "grad_norm": 0.4908742478353482, + "learning_rate": 4.625746581366898e-06, + "loss": 0.4711, + "step": 9314 + }, + { + "epoch": 3.6467028896023708, + "grad_norm": 0.5066441924253818, + "learning_rate": 4.625664120487981e-06, + "loss": 0.4672, + "step": 9315 + }, + { + "epoch": 3.647098048900963, + "grad_norm": 0.4975632434645502, + "learning_rate": 4.625581651260745e-06, + "loss": 0.4463, + "step": 9316 + }, + { + "epoch": 3.6474932081995552, + "grad_norm": 0.5054480921919917, + "learning_rate": 4.625499173685516e-06, + "loss": 0.4676, + "step": 9317 + }, + { + "epoch": 3.6478883674981475, + "grad_norm": 0.4881775251448275, + "learning_rate": 4.6254166877626175e-06, + "loss": 0.4441, + "step": 9318 + }, + { + "epoch": 3.6482835267967397, + "grad_norm": 0.48222427669341467, + "learning_rate": 4.625334193492371e-06, + "loss": 0.4665, + "step": 9319 + }, + { + "epoch": 3.648678686095332, + "grad_norm": 0.4936417971367663, + "learning_rate": 4.625251690875104e-06, + "loss": 0.4888, + "step": 9320 + }, + { + "epoch": 3.649073845393924, + "grad_norm": 0.49953652158686007, + "learning_rate": 4.6251691799111376e-06, + "loss": 0.4675, + "step": 9321 + }, + { + "epoch": 3.6494690046925164, + "grad_norm": 0.49771002562217137, + "learning_rate": 4.625086660600798e-06, + "loss": 0.468, + "step": 9322 + }, + { + "epoch": 3.6498641639911087, + "grad_norm": 0.49436016460202736, + "learning_rate": 4.625004132944409e-06, + "loss": 0.4492, + "step": 9323 + }, + { + "epoch": 3.650259323289701, + "grad_norm": 0.46676714681432757, + "learning_rate": 4.624921596942292e-06, + "loss": 0.4241, + "step": 9324 + }, + { + "epoch": 3.650654482588293, + "grad_norm": 0.49467615491708383, + "learning_rate": 4.6248390525947755e-06, + "loss": 0.4691, + "step": 9325 + }, + { + "epoch": 3.6510496418868854, + "grad_norm": 0.4938231345250084, + "learning_rate": 4.624756499902181e-06, + "loss": 0.46, + "step": 9326 + }, + { + "epoch": 3.6514448011854777, + "grad_norm": 0.4838739325764053, + "learning_rate": 4.624673938864832e-06, + "loss": 0.4513, + "step": 9327 + }, + { + "epoch": 3.65183996048407, + "grad_norm": 0.49112971366065933, + "learning_rate": 4.6245913694830545e-06, + "loss": 0.4524, + "step": 9328 + }, + { + "epoch": 3.652235119782662, + "grad_norm": 0.5011715788855052, + "learning_rate": 4.624508791757173e-06, + "loss": 0.4856, + "step": 9329 + }, + { + "epoch": 3.6526302790812544, + "grad_norm": 0.4993605096743152, + "learning_rate": 4.62442620568751e-06, + "loss": 0.471, + "step": 9330 + }, + { + "epoch": 3.6530254383798466, + "grad_norm": 0.48611034727334546, + "learning_rate": 4.624343611274391e-06, + "loss": 0.4552, + "step": 9331 + }, + { + "epoch": 3.653420597678439, + "grad_norm": 0.5253566397582367, + "learning_rate": 4.624261008518141e-06, + "loss": 0.4566, + "step": 9332 + }, + { + "epoch": 3.653815756977031, + "grad_norm": 0.5175702806785849, + "learning_rate": 4.624178397419083e-06, + "loss": 0.4533, + "step": 9333 + }, + { + "epoch": 3.6542109162756238, + "grad_norm": 0.4994237852186553, + "learning_rate": 4.624095777977543e-06, + "loss": 0.447, + "step": 9334 + }, + { + "epoch": 3.654606075574216, + "grad_norm": 0.4940996164170102, + "learning_rate": 4.624013150193844e-06, + "loss": 0.4544, + "step": 9335 + }, + { + "epoch": 3.6550012348728083, + "grad_norm": 0.4931404991433969, + "learning_rate": 4.623930514068311e-06, + "loss": 0.4364, + "step": 9336 + }, + { + "epoch": 3.6553963941714005, + "grad_norm": 0.5060095886918009, + "learning_rate": 4.623847869601269e-06, + "loss": 0.4622, + "step": 9337 + }, + { + "epoch": 3.6557915534699927, + "grad_norm": 0.4864732607657538, + "learning_rate": 4.623765216793042e-06, + "loss": 0.4508, + "step": 9338 + }, + { + "epoch": 3.656186712768585, + "grad_norm": 0.515417673885724, + "learning_rate": 4.623682555643955e-06, + "loss": 0.4746, + "step": 9339 + }, + { + "epoch": 3.656581872067177, + "grad_norm": 0.5093440038900928, + "learning_rate": 4.623599886154333e-06, + "loss": 0.4586, + "step": 9340 + }, + { + "epoch": 3.6569770313657695, + "grad_norm": 0.4884238293387281, + "learning_rate": 4.623517208324499e-06, + "loss": 0.4611, + "step": 9341 + }, + { + "epoch": 3.6573721906643617, + "grad_norm": 0.4949252091120524, + "learning_rate": 4.623434522154779e-06, + "loss": 0.453, + "step": 9342 + }, + { + "epoch": 3.657767349962954, + "grad_norm": 0.4996973803416822, + "learning_rate": 4.623351827645498e-06, + "loss": 0.4615, + "step": 9343 + }, + { + "epoch": 3.658162509261546, + "grad_norm": 0.504144710893261, + "learning_rate": 4.623269124796981e-06, + "loss": 0.4673, + "step": 9344 + }, + { + "epoch": 3.6585576685601384, + "grad_norm": 0.4969157578415948, + "learning_rate": 4.623186413609552e-06, + "loss": 0.4487, + "step": 9345 + }, + { + "epoch": 3.6589528278587307, + "grad_norm": 0.4932886314806929, + "learning_rate": 4.623103694083535e-06, + "loss": 0.456, + "step": 9346 + }, + { + "epoch": 3.659347987157323, + "grad_norm": 0.5148738766834107, + "learning_rate": 4.623020966219257e-06, + "loss": 0.4471, + "step": 9347 + }, + { + "epoch": 3.659743146455915, + "grad_norm": 0.5830202209682454, + "learning_rate": 4.622938230017041e-06, + "loss": 0.4643, + "step": 9348 + }, + { + "epoch": 3.6601383057545074, + "grad_norm": 0.48992602819472203, + "learning_rate": 4.622855485477214e-06, + "loss": 0.4474, + "step": 9349 + }, + { + "epoch": 3.6605334650530996, + "grad_norm": 0.48284568742478173, + "learning_rate": 4.622772732600098e-06, + "loss": 0.4623, + "step": 9350 + }, + { + "epoch": 3.660928624351692, + "grad_norm": 0.48801605211010135, + "learning_rate": 4.622689971386021e-06, + "loss": 0.4426, + "step": 9351 + }, + { + "epoch": 3.661323783650284, + "grad_norm": 0.49203131886308554, + "learning_rate": 4.6226072018353055e-06, + "loss": 0.4389, + "step": 9352 + }, + { + "epoch": 3.6617189429488763, + "grad_norm": 0.48655905238073444, + "learning_rate": 4.622524423948279e-06, + "loss": 0.4602, + "step": 9353 + }, + { + "epoch": 3.6621141022474686, + "grad_norm": 0.4934300579478431, + "learning_rate": 4.6224416377252645e-06, + "loss": 0.4439, + "step": 9354 + }, + { + "epoch": 3.662509261546061, + "grad_norm": 0.4858789552341792, + "learning_rate": 4.622358843166589e-06, + "loss": 0.4555, + "step": 9355 + }, + { + "epoch": 3.662904420844653, + "grad_norm": 0.4899213923525825, + "learning_rate": 4.622276040272576e-06, + "loss": 0.4529, + "step": 9356 + }, + { + "epoch": 3.6632995801432453, + "grad_norm": 0.494097810082973, + "learning_rate": 4.622193229043552e-06, + "loss": 0.4511, + "step": 9357 + }, + { + "epoch": 3.6636947394418375, + "grad_norm": 0.4844491051320266, + "learning_rate": 4.622110409479842e-06, + "loss": 0.4587, + "step": 9358 + }, + { + "epoch": 3.66408989874043, + "grad_norm": 0.4914228114049162, + "learning_rate": 4.622027581581771e-06, + "loss": 0.4487, + "step": 9359 + }, + { + "epoch": 3.664485058039022, + "grad_norm": 0.49201477673440297, + "learning_rate": 4.6219447453496626e-06, + "loss": 0.4654, + "step": 9360 + }, + { + "epoch": 3.6648802173376143, + "grad_norm": 0.5030420734743306, + "learning_rate": 4.621861900783845e-06, + "loss": 0.4556, + "step": 9361 + }, + { + "epoch": 3.6652753766362065, + "grad_norm": 0.49226581282451, + "learning_rate": 4.621779047884642e-06, + "loss": 0.4662, + "step": 9362 + }, + { + "epoch": 3.6656705359347987, + "grad_norm": 0.485604702744728, + "learning_rate": 4.621696186652379e-06, + "loss": 0.4651, + "step": 9363 + }, + { + "epoch": 3.666065695233391, + "grad_norm": 0.4947225613132439, + "learning_rate": 4.621613317087382e-06, + "loss": 0.45, + "step": 9364 + }, + { + "epoch": 3.6664608545319832, + "grad_norm": 0.49964170799077484, + "learning_rate": 4.6215304391899765e-06, + "loss": 0.4368, + "step": 9365 + }, + { + "epoch": 3.6668560138305755, + "grad_norm": 0.4640684797492653, + "learning_rate": 4.621447552960488e-06, + "loss": 0.4426, + "step": 9366 + }, + { + "epoch": 3.6672511731291677, + "grad_norm": 0.493726848514641, + "learning_rate": 4.621364658399241e-06, + "loss": 0.4724, + "step": 9367 + }, + { + "epoch": 3.66764633242776, + "grad_norm": 0.48809324691574657, + "learning_rate": 4.621281755506562e-06, + "loss": 0.4362, + "step": 9368 + }, + { + "epoch": 3.668041491726352, + "grad_norm": 0.4939624266923956, + "learning_rate": 4.621198844282777e-06, + "loss": 0.4726, + "step": 9369 + }, + { + "epoch": 3.6684366510249444, + "grad_norm": 0.49641926622094995, + "learning_rate": 4.62111592472821e-06, + "loss": 0.4717, + "step": 9370 + }, + { + "epoch": 3.6688318103235367, + "grad_norm": 0.5050735506985684, + "learning_rate": 4.6210329968431876e-06, + "loss": 0.4674, + "step": 9371 + }, + { + "epoch": 3.669226969622129, + "grad_norm": 0.4909447418334573, + "learning_rate": 4.620950060628037e-06, + "loss": 0.451, + "step": 9372 + }, + { + "epoch": 3.669622128920721, + "grad_norm": 0.48904389952363275, + "learning_rate": 4.620867116083081e-06, + "loss": 0.4497, + "step": 9373 + }, + { + "epoch": 3.6700172882193134, + "grad_norm": 0.5054031870535084, + "learning_rate": 4.620784163208647e-06, + "loss": 0.4945, + "step": 9374 + }, + { + "epoch": 3.6704124475179056, + "grad_norm": 0.5433428790690614, + "learning_rate": 4.6207012020050614e-06, + "loss": 0.4533, + "step": 9375 + }, + { + "epoch": 3.670807606816498, + "grad_norm": 0.5002618872303594, + "learning_rate": 4.620618232472649e-06, + "loss": 0.4489, + "step": 9376 + }, + { + "epoch": 3.67120276611509, + "grad_norm": 0.48690513054549206, + "learning_rate": 4.620535254611735e-06, + "loss": 0.4678, + "step": 9377 + }, + { + "epoch": 3.6715979254136824, + "grad_norm": 0.48402593380651104, + "learning_rate": 4.6204522684226475e-06, + "loss": 0.4506, + "step": 9378 + }, + { + "epoch": 3.6719930847122746, + "grad_norm": 0.5103203925733566, + "learning_rate": 4.620369273905711e-06, + "loss": 0.4528, + "step": 9379 + }, + { + "epoch": 3.672388244010867, + "grad_norm": 0.48564066203388767, + "learning_rate": 4.620286271061251e-06, + "loss": 0.4622, + "step": 9380 + }, + { + "epoch": 3.672783403309459, + "grad_norm": 0.47915158304914873, + "learning_rate": 4.620203259889593e-06, + "loss": 0.4528, + "step": 9381 + }, + { + "epoch": 3.6731785626080513, + "grad_norm": 0.4852717018202506, + "learning_rate": 4.620120240391065e-06, + "loss": 0.4716, + "step": 9382 + }, + { + "epoch": 3.6735737219066436, + "grad_norm": 0.496307917359212, + "learning_rate": 4.620037212565992e-06, + "loss": 0.4636, + "step": 9383 + }, + { + "epoch": 3.673968881205236, + "grad_norm": 0.49948922206920776, + "learning_rate": 4.6199541764147e-06, + "loss": 0.45, + "step": 9384 + }, + { + "epoch": 3.674364040503828, + "grad_norm": 0.4906585809016417, + "learning_rate": 4.619871131937516e-06, + "loss": 0.4709, + "step": 9385 + }, + { + "epoch": 3.6747591998024203, + "grad_norm": 0.49580349029874954, + "learning_rate": 4.619788079134766e-06, + "loss": 0.4575, + "step": 9386 + }, + { + "epoch": 3.6751543591010125, + "grad_norm": 0.4877988171055439, + "learning_rate": 4.619705018006775e-06, + "loss": 0.4606, + "step": 9387 + }, + { + "epoch": 3.6755495183996048, + "grad_norm": 0.47635905394834055, + "learning_rate": 4.619621948553869e-06, + "loss": 0.4598, + "step": 9388 + }, + { + "epoch": 3.675944677698197, + "grad_norm": 0.4848704937642032, + "learning_rate": 4.619538870776375e-06, + "loss": 0.4531, + "step": 9389 + }, + { + "epoch": 3.6763398369967892, + "grad_norm": 0.5132723431379773, + "learning_rate": 4.61945578467462e-06, + "loss": 0.4813, + "step": 9390 + }, + { + "epoch": 3.6767349962953815, + "grad_norm": 0.49446717879197793, + "learning_rate": 4.61937269024893e-06, + "loss": 0.4732, + "step": 9391 + }, + { + "epoch": 3.6771301555939737, + "grad_norm": 0.48783487200107034, + "learning_rate": 4.619289587499631e-06, + "loss": 0.4642, + "step": 9392 + }, + { + "epoch": 3.677525314892566, + "grad_norm": 0.5095115851096903, + "learning_rate": 4.619206476427049e-06, + "loss": 0.456, + "step": 9393 + }, + { + "epoch": 3.677920474191158, + "grad_norm": 0.4947934593811385, + "learning_rate": 4.619123357031511e-06, + "loss": 0.4721, + "step": 9394 + }, + { + "epoch": 3.6783156334897504, + "grad_norm": 0.5075806473304344, + "learning_rate": 4.619040229313343e-06, + "loss": 0.4786, + "step": 9395 + }, + { + "epoch": 3.6787107927883427, + "grad_norm": 0.5801842763429373, + "learning_rate": 4.618957093272872e-06, + "loss": 0.4517, + "step": 9396 + }, + { + "epoch": 3.679105952086935, + "grad_norm": 0.5005276061566315, + "learning_rate": 4.618873948910425e-06, + "loss": 0.454, + "step": 9397 + }, + { + "epoch": 3.679501111385527, + "grad_norm": 0.5031083933218373, + "learning_rate": 4.618790796226327e-06, + "loss": 0.4526, + "step": 9398 + }, + { + "epoch": 3.6798962706841194, + "grad_norm": 0.4919801264259388, + "learning_rate": 4.618707635220905e-06, + "loss": 0.4524, + "step": 9399 + }, + { + "epoch": 3.6802914299827116, + "grad_norm": 0.4978593733121662, + "learning_rate": 4.6186244658944865e-06, + "loss": 0.4633, + "step": 9400 + }, + { + "epoch": 3.680686589281304, + "grad_norm": 0.4998560933361007, + "learning_rate": 4.618541288247397e-06, + "loss": 0.4571, + "step": 9401 + }, + { + "epoch": 3.681081748579896, + "grad_norm": 0.4916373727411663, + "learning_rate": 4.618458102279964e-06, + "loss": 0.4585, + "step": 9402 + }, + { + "epoch": 3.6814769078784884, + "grad_norm": 0.4888930129846756, + "learning_rate": 4.6183749079925145e-06, + "loss": 0.4678, + "step": 9403 + }, + { + "epoch": 3.6818720671770806, + "grad_norm": 0.4956175383165202, + "learning_rate": 4.618291705385374e-06, + "loss": 0.4691, + "step": 9404 + }, + { + "epoch": 3.6822672264756733, + "grad_norm": 0.5119254964059565, + "learning_rate": 4.61820849445887e-06, + "loss": 0.4503, + "step": 9405 + }, + { + "epoch": 3.6826623857742655, + "grad_norm": 0.49166332352889675, + "learning_rate": 4.61812527521333e-06, + "loss": 0.4644, + "step": 9406 + }, + { + "epoch": 3.6830575450728578, + "grad_norm": 0.4813292349115858, + "learning_rate": 4.61804204764908e-06, + "loss": 0.448, + "step": 9407 + }, + { + "epoch": 3.68345270437145, + "grad_norm": 0.49544114801840133, + "learning_rate": 4.6179588117664465e-06, + "loss": 0.4531, + "step": 9408 + }, + { + "epoch": 3.6838478636700422, + "grad_norm": 0.5062682853242432, + "learning_rate": 4.6178755675657565e-06, + "loss": 0.4658, + "step": 9409 + }, + { + "epoch": 3.6842430229686345, + "grad_norm": 0.4885565367565571, + "learning_rate": 4.617792315047338e-06, + "loss": 0.4576, + "step": 9410 + }, + { + "epoch": 3.6846381822672267, + "grad_norm": 0.4927003220978129, + "learning_rate": 4.6177090542115176e-06, + "loss": 0.4545, + "step": 9411 + }, + { + "epoch": 3.685033341565819, + "grad_norm": 0.5342756099499717, + "learning_rate": 4.617625785058622e-06, + "loss": 0.4444, + "step": 9412 + }, + { + "epoch": 3.685428500864411, + "grad_norm": 0.5191362354914825, + "learning_rate": 4.617542507588977e-06, + "loss": 0.4632, + "step": 9413 + }, + { + "epoch": 3.6858236601630034, + "grad_norm": 0.5005797890017317, + "learning_rate": 4.6174592218029115e-06, + "loss": 0.4565, + "step": 9414 + }, + { + "epoch": 3.6862188194615957, + "grad_norm": 0.5115586690359692, + "learning_rate": 4.617375927700752e-06, + "loss": 0.4716, + "step": 9415 + }, + { + "epoch": 3.686613978760188, + "grad_norm": 0.5168058976627257, + "learning_rate": 4.617292625282826e-06, + "loss": 0.4794, + "step": 9416 + }, + { + "epoch": 3.68700913805878, + "grad_norm": 0.5038401934419339, + "learning_rate": 4.617209314549459e-06, + "loss": 0.4795, + "step": 9417 + }, + { + "epoch": 3.6874042973573724, + "grad_norm": 0.49671493204697803, + "learning_rate": 4.617125995500981e-06, + "loss": 0.4742, + "step": 9418 + }, + { + "epoch": 3.6877994566559646, + "grad_norm": 0.5245802827596923, + "learning_rate": 4.617042668137717e-06, + "loss": 0.4655, + "step": 9419 + }, + { + "epoch": 3.688194615954557, + "grad_norm": 0.49027669847543004, + "learning_rate": 4.616959332459995e-06, + "loss": 0.4685, + "step": 9420 + }, + { + "epoch": 3.688589775253149, + "grad_norm": 0.5318803917463871, + "learning_rate": 4.616875988468142e-06, + "loss": 0.4566, + "step": 9421 + }, + { + "epoch": 3.6889849345517414, + "grad_norm": 0.47512179395486337, + "learning_rate": 4.616792636162486e-06, + "loss": 0.4508, + "step": 9422 + }, + { + "epoch": 3.6893800938503336, + "grad_norm": 0.4995389465446858, + "learning_rate": 4.616709275543353e-06, + "loss": 0.4573, + "step": 9423 + }, + { + "epoch": 3.689775253148926, + "grad_norm": 0.4952456922866283, + "learning_rate": 4.616625906611072e-06, + "loss": 0.4449, + "step": 9424 + }, + { + "epoch": 3.690170412447518, + "grad_norm": 0.4681009974875698, + "learning_rate": 4.61654252936597e-06, + "loss": 0.4443, + "step": 9425 + }, + { + "epoch": 3.6905655717461103, + "grad_norm": 0.49698293961782763, + "learning_rate": 4.616459143808374e-06, + "loss": 0.4504, + "step": 9426 + }, + { + "epoch": 3.6909607310447026, + "grad_norm": 0.4799614987205371, + "learning_rate": 4.616375749938612e-06, + "loss": 0.4827, + "step": 9427 + }, + { + "epoch": 3.691355890343295, + "grad_norm": 0.4935543789709199, + "learning_rate": 4.61629234775701e-06, + "loss": 0.454, + "step": 9428 + }, + { + "epoch": 3.691751049641887, + "grad_norm": 0.48312786289813203, + "learning_rate": 4.616208937263897e-06, + "loss": 0.4618, + "step": 9429 + }, + { + "epoch": 3.6921462089404793, + "grad_norm": 0.48779193972998197, + "learning_rate": 4.616125518459601e-06, + "loss": 0.4664, + "step": 9430 + }, + { + "epoch": 3.6925413682390715, + "grad_norm": 0.5102169263992223, + "learning_rate": 4.616042091344449e-06, + "loss": 0.4528, + "step": 9431 + }, + { + "epoch": 3.6929365275376638, + "grad_norm": 0.5028415523931336, + "learning_rate": 4.615958655918768e-06, + "loss": 0.4761, + "step": 9432 + }, + { + "epoch": 3.693331686836256, + "grad_norm": 0.49065489093573394, + "learning_rate": 4.615875212182887e-06, + "loss": 0.4392, + "step": 9433 + }, + { + "epoch": 3.6937268461348483, + "grad_norm": 0.49732637769698324, + "learning_rate": 4.615791760137133e-06, + "loss": 0.4633, + "step": 9434 + }, + { + "epoch": 3.6941220054334405, + "grad_norm": 0.49238892823032243, + "learning_rate": 4.615708299781833e-06, + "loss": 0.4622, + "step": 9435 + }, + { + "epoch": 3.6945171647320327, + "grad_norm": 0.49343740327676344, + "learning_rate": 4.615624831117316e-06, + "loss": 0.4568, + "step": 9436 + }, + { + "epoch": 3.694912324030625, + "grad_norm": 0.4861385224097656, + "learning_rate": 4.615541354143908e-06, + "loss": 0.4503, + "step": 9437 + }, + { + "epoch": 3.695307483329217, + "grad_norm": 0.497943873992621, + "learning_rate": 4.61545786886194e-06, + "loss": 0.4495, + "step": 9438 + }, + { + "epoch": 3.6957026426278095, + "grad_norm": 0.48616932102253096, + "learning_rate": 4.615374375271738e-06, + "loss": 0.476, + "step": 9439 + }, + { + "epoch": 3.6960978019264017, + "grad_norm": 0.49036991811170894, + "learning_rate": 4.615290873373629e-06, + "loss": 0.4574, + "step": 9440 + }, + { + "epoch": 3.696492961224994, + "grad_norm": 0.4874642622968561, + "learning_rate": 4.615207363167943e-06, + "loss": 0.4534, + "step": 9441 + }, + { + "epoch": 3.696888120523586, + "grad_norm": 0.4874523989618741, + "learning_rate": 4.615123844655006e-06, + "loss": 0.4669, + "step": 9442 + }, + { + "epoch": 3.6972832798221784, + "grad_norm": 0.4893447874095786, + "learning_rate": 4.615040317835147e-06, + "loss": 0.4441, + "step": 9443 + }, + { + "epoch": 3.6976784391207707, + "grad_norm": 0.5035498076080915, + "learning_rate": 4.614956782708694e-06, + "loss": 0.4679, + "step": 9444 + }, + { + "epoch": 3.698073598419363, + "grad_norm": 0.4973304541825886, + "learning_rate": 4.614873239275976e-06, + "loss": 0.4513, + "step": 9445 + }, + { + "epoch": 3.698468757717955, + "grad_norm": 0.5653732922507139, + "learning_rate": 4.6147896875373185e-06, + "loss": 0.4583, + "step": 9446 + }, + { + "epoch": 3.6988639170165474, + "grad_norm": 0.47730565163897437, + "learning_rate": 4.614706127493052e-06, + "loss": 0.4706, + "step": 9447 + }, + { + "epoch": 3.6992590763151396, + "grad_norm": 0.4900913046606925, + "learning_rate": 4.614622559143504e-06, + "loss": 0.4713, + "step": 9448 + }, + { + "epoch": 3.699654235613732, + "grad_norm": 0.4807624939670291, + "learning_rate": 4.614538982489003e-06, + "loss": 0.462, + "step": 9449 + }, + { + "epoch": 3.700049394912324, + "grad_norm": 0.5255103013851058, + "learning_rate": 4.614455397529876e-06, + "loss": 0.4596, + "step": 9450 + }, + { + "epoch": 3.7004445542109163, + "grad_norm": 0.49339176253468786, + "learning_rate": 4.614371804266453e-06, + "loss": 0.4446, + "step": 9451 + }, + { + "epoch": 3.7008397135095086, + "grad_norm": 0.479984900218154, + "learning_rate": 4.614288202699061e-06, + "loss": 0.4539, + "step": 9452 + }, + { + "epoch": 3.701234872808101, + "grad_norm": 0.4906333419555256, + "learning_rate": 4.6142045928280284e-06, + "loss": 0.4602, + "step": 9453 + }, + { + "epoch": 3.701630032106693, + "grad_norm": 0.47786003426235846, + "learning_rate": 4.6141209746536855e-06, + "loss": 0.451, + "step": 9454 + }, + { + "epoch": 3.7020251914052853, + "grad_norm": 0.505494984357728, + "learning_rate": 4.614037348176358e-06, + "loss": 0.4777, + "step": 9455 + }, + { + "epoch": 3.7024203507038775, + "grad_norm": 0.5053597345252311, + "learning_rate": 4.613953713396376e-06, + "loss": 0.4582, + "step": 9456 + }, + { + "epoch": 3.70281551000247, + "grad_norm": 0.5125562819742167, + "learning_rate": 4.613870070314067e-06, + "loss": 0.4626, + "step": 9457 + }, + { + "epoch": 3.703210669301062, + "grad_norm": 0.5017156061662159, + "learning_rate": 4.6137864189297595e-06, + "loss": 0.451, + "step": 9458 + }, + { + "epoch": 3.7036058285996543, + "grad_norm": 0.47667609480628925, + "learning_rate": 4.613702759243784e-06, + "loss": 0.4517, + "step": 9459 + }, + { + "epoch": 3.7040009878982465, + "grad_norm": 0.4825374845237388, + "learning_rate": 4.613619091256466e-06, + "loss": 0.4474, + "step": 9460 + }, + { + "epoch": 3.7043961471968387, + "grad_norm": 0.47835338941796834, + "learning_rate": 4.6135354149681365e-06, + "loss": 0.4469, + "step": 9461 + }, + { + "epoch": 3.704791306495431, + "grad_norm": 0.9294106269951347, + "learning_rate": 4.6134517303791235e-06, + "loss": 0.4607, + "step": 9462 + }, + { + "epoch": 3.705186465794023, + "grad_norm": 0.4930233385165018, + "learning_rate": 4.613368037489756e-06, + "loss": 0.4506, + "step": 9463 + }, + { + "epoch": 3.7055816250926155, + "grad_norm": 0.49884851479906, + "learning_rate": 4.613284336300361e-06, + "loss": 0.4536, + "step": 9464 + }, + { + "epoch": 3.7059767843912077, + "grad_norm": 0.47765620112543045, + "learning_rate": 4.613200626811268e-06, + "loss": 0.4554, + "step": 9465 + }, + { + "epoch": 3.7063719436898, + "grad_norm": 0.49830740900398934, + "learning_rate": 4.613116909022807e-06, + "loss": 0.4485, + "step": 9466 + }, + { + "epoch": 3.706767102988392, + "grad_norm": 0.5144490254440631, + "learning_rate": 4.613033182935306e-06, + "loss": 0.4664, + "step": 9467 + }, + { + "epoch": 3.7071622622869844, + "grad_norm": 0.5067754911121317, + "learning_rate": 4.6129494485490935e-06, + "loss": 0.4505, + "step": 9468 + }, + { + "epoch": 3.7075574215855767, + "grad_norm": 0.49318464717590493, + "learning_rate": 4.612865705864499e-06, + "loss": 0.4718, + "step": 9469 + }, + { + "epoch": 3.707952580884169, + "grad_norm": 0.5308810953951024, + "learning_rate": 4.612781954881851e-06, + "loss": 0.4685, + "step": 9470 + }, + { + "epoch": 3.708347740182761, + "grad_norm": 0.47912651709226944, + "learning_rate": 4.6126981956014775e-06, + "loss": 0.4418, + "step": 9471 + }, + { + "epoch": 3.7087428994813534, + "grad_norm": 0.48759890811022727, + "learning_rate": 4.612614428023709e-06, + "loss": 0.451, + "step": 9472 + }, + { + "epoch": 3.7091380587799456, + "grad_norm": 0.5022546921872667, + "learning_rate": 4.612530652148875e-06, + "loss": 0.4441, + "step": 9473 + }, + { + "epoch": 3.709533218078538, + "grad_norm": 0.48883903463677697, + "learning_rate": 4.6124468679773015e-06, + "loss": 0.44, + "step": 9474 + }, + { + "epoch": 3.70992837737713, + "grad_norm": 0.4925590816151284, + "learning_rate": 4.61236307550932e-06, + "loss": 0.4625, + "step": 9475 + }, + { + "epoch": 3.7103235366757223, + "grad_norm": 0.5102359588145222, + "learning_rate": 4.612279274745259e-06, + "loss": 0.4532, + "step": 9476 + }, + { + "epoch": 3.7107186959743146, + "grad_norm": 0.5016564703231445, + "learning_rate": 4.612195465685448e-06, + "loss": 0.4611, + "step": 9477 + }, + { + "epoch": 3.711113855272907, + "grad_norm": 0.4985293696763766, + "learning_rate": 4.612111648330216e-06, + "loss": 0.4617, + "step": 9478 + }, + { + "epoch": 3.711509014571499, + "grad_norm": 0.49461850359770454, + "learning_rate": 4.612027822679892e-06, + "loss": 0.4549, + "step": 9479 + }, + { + "epoch": 3.7119041738700913, + "grad_norm": 0.4879647278277727, + "learning_rate": 4.611943988734805e-06, + "loss": 0.4686, + "step": 9480 + }, + { + "epoch": 3.7122993331686835, + "grad_norm": 0.502142235265202, + "learning_rate": 4.611860146495284e-06, + "loss": 0.468, + "step": 9481 + }, + { + "epoch": 3.712694492467276, + "grad_norm": 0.5078297162937819, + "learning_rate": 4.611776295961659e-06, + "loss": 0.4844, + "step": 9482 + }, + { + "epoch": 3.713089651765868, + "grad_norm": 0.6651109373307685, + "learning_rate": 4.611692437134259e-06, + "loss": 0.4576, + "step": 9483 + }, + { + "epoch": 3.7134848110644603, + "grad_norm": 0.48181360567693204, + "learning_rate": 4.611608570013414e-06, + "loss": 0.4633, + "step": 9484 + }, + { + "epoch": 3.7138799703630525, + "grad_norm": 0.48089336396003285, + "learning_rate": 4.611524694599452e-06, + "loss": 0.4548, + "step": 9485 + }, + { + "epoch": 3.7142751296616447, + "grad_norm": 0.5565739216162738, + "learning_rate": 4.611440810892703e-06, + "loss": 0.4592, + "step": 9486 + }, + { + "epoch": 3.714670288960237, + "grad_norm": 0.5089387807662913, + "learning_rate": 4.611356918893497e-06, + "loss": 0.4673, + "step": 9487 + }, + { + "epoch": 3.7150654482588292, + "grad_norm": 0.4849507219283539, + "learning_rate": 4.611273018602164e-06, + "loss": 0.4579, + "step": 9488 + }, + { + "epoch": 3.7154606075574215, + "grad_norm": 0.4957394966619531, + "learning_rate": 4.611189110019032e-06, + "loss": 0.4722, + "step": 9489 + }, + { + "epoch": 3.7158557668560137, + "grad_norm": 0.4907758996231714, + "learning_rate": 4.6111051931444304e-06, + "loss": 0.4597, + "step": 9490 + }, + { + "epoch": 3.716250926154606, + "grad_norm": 0.5101532752645882, + "learning_rate": 4.61102126797869e-06, + "loss": 0.4828, + "step": 9491 + }, + { + "epoch": 3.716646085453198, + "grad_norm": 0.5008474308167519, + "learning_rate": 4.610937334522141e-06, + "loss": 0.4722, + "step": 9492 + }, + { + "epoch": 3.7170412447517904, + "grad_norm": 0.49062418438118194, + "learning_rate": 4.610853392775111e-06, + "loss": 0.4542, + "step": 9493 + }, + { + "epoch": 3.7174364040503827, + "grad_norm": 0.5014221426388613, + "learning_rate": 4.61076944273793e-06, + "loss": 0.4843, + "step": 9494 + }, + { + "epoch": 3.717831563348975, + "grad_norm": 0.4915499915118497, + "learning_rate": 4.61068548441093e-06, + "loss": 0.4587, + "step": 9495 + }, + { + "epoch": 3.718226722647567, + "grad_norm": 0.5132337480297305, + "learning_rate": 4.610601517794437e-06, + "loss": 0.4743, + "step": 9496 + }, + { + "epoch": 3.7186218819461594, + "grad_norm": 0.5016517224129324, + "learning_rate": 4.610517542888785e-06, + "loss": 0.462, + "step": 9497 + }, + { + "epoch": 3.7190170412447516, + "grad_norm": 0.5060950647207751, + "learning_rate": 4.6104335596943004e-06, + "loss": 0.4641, + "step": 9498 + }, + { + "epoch": 3.719412200543344, + "grad_norm": 0.5062740065120886, + "learning_rate": 4.610349568211314e-06, + "loss": 0.4592, + "step": 9499 + }, + { + "epoch": 3.719807359841936, + "grad_norm": 0.4930087960545402, + "learning_rate": 4.6102655684401575e-06, + "loss": 0.47, + "step": 9500 + }, + { + "epoch": 3.7202025191405284, + "grad_norm": 0.4898865945969133, + "learning_rate": 4.6101815603811576e-06, + "loss": 0.4437, + "step": 9501 + }, + { + "epoch": 3.7205976784391206, + "grad_norm": 0.4795656726959586, + "learning_rate": 4.610097544034647e-06, + "loss": 0.4453, + "step": 9502 + }, + { + "epoch": 3.720992837737713, + "grad_norm": 0.49426700494295145, + "learning_rate": 4.610013519400954e-06, + "loss": 0.4864, + "step": 9503 + }, + { + "epoch": 3.721387997036305, + "grad_norm": 0.4887121669736692, + "learning_rate": 4.609929486480409e-06, + "loss": 0.4573, + "step": 9504 + }, + { + "epoch": 3.7217831563348973, + "grad_norm": 0.4905352215891702, + "learning_rate": 4.609845445273343e-06, + "loss": 0.4582, + "step": 9505 + }, + { + "epoch": 3.7221783156334896, + "grad_norm": 0.5105211554553892, + "learning_rate": 4.6097613957800845e-06, + "loss": 0.4693, + "step": 9506 + }, + { + "epoch": 3.722573474932082, + "grad_norm": 0.4987264748579873, + "learning_rate": 4.6096773380009655e-06, + "loss": 0.4679, + "step": 9507 + }, + { + "epoch": 3.722968634230674, + "grad_norm": 0.49566139627836764, + "learning_rate": 4.609593271936313e-06, + "loss": 0.4483, + "step": 9508 + }, + { + "epoch": 3.7233637935292663, + "grad_norm": 0.49187996336706025, + "learning_rate": 4.609509197586461e-06, + "loss": 0.467, + "step": 9509 + }, + { + "epoch": 3.7237589528278585, + "grad_norm": 0.4820091496779555, + "learning_rate": 4.609425114951737e-06, + "loss": 0.4536, + "step": 9510 + }, + { + "epoch": 3.7241541121264508, + "grad_norm": 0.5021340173430089, + "learning_rate": 4.609341024032472e-06, + "loss": 0.4824, + "step": 9511 + }, + { + "epoch": 3.724549271425043, + "grad_norm": 0.49803869540293166, + "learning_rate": 4.609256924828997e-06, + "loss": 0.4547, + "step": 9512 + }, + { + "epoch": 3.7249444307236352, + "grad_norm": 0.5128440482147195, + "learning_rate": 4.60917281734164e-06, + "loss": 0.4608, + "step": 9513 + }, + { + "epoch": 3.7253395900222275, + "grad_norm": 0.4913277569926873, + "learning_rate": 4.609088701570735e-06, + "loss": 0.4537, + "step": 9514 + }, + { + "epoch": 3.7257347493208197, + "grad_norm": 0.495974924098836, + "learning_rate": 4.609004577516609e-06, + "loss": 0.4734, + "step": 9515 + }, + { + "epoch": 3.726129908619412, + "grad_norm": 0.49140903339070413, + "learning_rate": 4.608920445179594e-06, + "loss": 0.4442, + "step": 9516 + }, + { + "epoch": 3.726525067918004, + "grad_norm": 0.5217558182771603, + "learning_rate": 4.60883630456002e-06, + "loss": 0.4896, + "step": 9517 + }, + { + "epoch": 3.7269202272165964, + "grad_norm": 0.49384622356957825, + "learning_rate": 4.608752155658218e-06, + "loss": 0.4412, + "step": 9518 + }, + { + "epoch": 3.7273153865151887, + "grad_norm": 0.4857726615455569, + "learning_rate": 4.608667998474518e-06, + "loss": 0.4497, + "step": 9519 + }, + { + "epoch": 3.727710545813781, + "grad_norm": 0.4963906295933952, + "learning_rate": 4.60858383300925e-06, + "loss": 0.4613, + "step": 9520 + }, + { + "epoch": 3.728105705112373, + "grad_norm": 0.4838697905450302, + "learning_rate": 4.608499659262745e-06, + "loss": 0.4584, + "step": 9521 + }, + { + "epoch": 3.7285008644109654, + "grad_norm": 0.48800127051436243, + "learning_rate": 4.608415477235334e-06, + "loss": 0.4589, + "step": 9522 + }, + { + "epoch": 3.7288960237095576, + "grad_norm": 0.48246388572291404, + "learning_rate": 4.6083312869273475e-06, + "loss": 0.451, + "step": 9523 + }, + { + "epoch": 3.7292911830081503, + "grad_norm": 0.4820982897305016, + "learning_rate": 4.608247088339116e-06, + "loss": 0.4481, + "step": 9524 + }, + { + "epoch": 3.7296863423067426, + "grad_norm": 0.4782123289577348, + "learning_rate": 4.60816288147097e-06, + "loss": 0.4613, + "step": 9525 + }, + { + "epoch": 3.730081501605335, + "grad_norm": 0.4893516657625509, + "learning_rate": 4.60807866632324e-06, + "loss": 0.4555, + "step": 9526 + }, + { + "epoch": 3.730476660903927, + "grad_norm": 0.489071023151812, + "learning_rate": 4.607994442896257e-06, + "loss": 0.4623, + "step": 9527 + }, + { + "epoch": 3.7308718202025193, + "grad_norm": 0.49687366002746247, + "learning_rate": 4.607910211190353e-06, + "loss": 0.4398, + "step": 9528 + }, + { + "epoch": 3.7312669795011115, + "grad_norm": 0.47881395393697335, + "learning_rate": 4.607825971205857e-06, + "loss": 0.4469, + "step": 9529 + }, + { + "epoch": 3.7316621387997038, + "grad_norm": 0.5130814385073522, + "learning_rate": 4.6077417229430995e-06, + "loss": 0.4628, + "step": 9530 + }, + { + "epoch": 3.732057298098296, + "grad_norm": 0.48593565257627325, + "learning_rate": 4.607657466402414e-06, + "loss": 0.4533, + "step": 9531 + }, + { + "epoch": 3.7324524573968882, + "grad_norm": 0.4875631077933674, + "learning_rate": 4.607573201584129e-06, + "loss": 0.4579, + "step": 9532 + }, + { + "epoch": 3.7328476166954805, + "grad_norm": 0.4957260704045972, + "learning_rate": 4.607488928488576e-06, + "loss": 0.4563, + "step": 9533 + }, + { + "epoch": 3.7332427759940727, + "grad_norm": 0.4860278696039461, + "learning_rate": 4.607404647116087e-06, + "loss": 0.4664, + "step": 9534 + }, + { + "epoch": 3.733637935292665, + "grad_norm": 0.4845068061589555, + "learning_rate": 4.607320357466992e-06, + "loss": 0.4614, + "step": 9535 + }, + { + "epoch": 3.734033094591257, + "grad_norm": 0.4786694984990013, + "learning_rate": 4.607236059541622e-06, + "loss": 0.4601, + "step": 9536 + }, + { + "epoch": 3.7344282538898494, + "grad_norm": 0.5088132150023401, + "learning_rate": 4.6071517533403085e-06, + "loss": 0.4538, + "step": 9537 + }, + { + "epoch": 3.7348234131884417, + "grad_norm": 0.4908292637789205, + "learning_rate": 4.6070674388633825e-06, + "loss": 0.4686, + "step": 9538 + }, + { + "epoch": 3.735218572487034, + "grad_norm": 0.5142726426805816, + "learning_rate": 4.606983116111175e-06, + "loss": 0.4719, + "step": 9539 + }, + { + "epoch": 3.735613731785626, + "grad_norm": 0.49016313325540406, + "learning_rate": 4.606898785084017e-06, + "loss": 0.4535, + "step": 9540 + }, + { + "epoch": 3.7360088910842184, + "grad_norm": 0.49285245557129015, + "learning_rate": 4.60681444578224e-06, + "loss": 0.4645, + "step": 9541 + }, + { + "epoch": 3.7364040503828106, + "grad_norm": 0.4839808550700243, + "learning_rate": 4.6067300982061754e-06, + "loss": 0.4491, + "step": 9542 + }, + { + "epoch": 3.736799209681403, + "grad_norm": 0.484570336078592, + "learning_rate": 4.606645742356155e-06, + "loss": 0.4594, + "step": 9543 + }, + { + "epoch": 3.737194368979995, + "grad_norm": 0.48518657604346466, + "learning_rate": 4.606561378232508e-06, + "loss": 0.4701, + "step": 9544 + }, + { + "epoch": 3.7375895282785874, + "grad_norm": 0.48628553075992975, + "learning_rate": 4.606477005835568e-06, + "loss": 0.4533, + "step": 9545 + }, + { + "epoch": 3.7379846875771796, + "grad_norm": 0.5123338585282978, + "learning_rate": 4.6063926251656656e-06, + "loss": 0.5095, + "step": 9546 + }, + { + "epoch": 3.738379846875772, + "grad_norm": 0.5102529842049373, + "learning_rate": 4.6063082362231306e-06, + "loss": 0.4866, + "step": 9547 + }, + { + "epoch": 3.738775006174364, + "grad_norm": 0.4795300343781412, + "learning_rate": 4.606223839008297e-06, + "loss": 0.4565, + "step": 9548 + }, + { + "epoch": 3.7391701654729563, + "grad_norm": 0.500458305229408, + "learning_rate": 4.6061394335214945e-06, + "loss": 0.4546, + "step": 9549 + }, + { + "epoch": 3.7395653247715486, + "grad_norm": 0.49447185079146605, + "learning_rate": 4.606055019763056e-06, + "loss": 0.4711, + "step": 9550 + }, + { + "epoch": 3.739960484070141, + "grad_norm": 0.4837507662358641, + "learning_rate": 4.6059705977333116e-06, + "loss": 0.4492, + "step": 9551 + }, + { + "epoch": 3.740355643368733, + "grad_norm": 0.6482460092048694, + "learning_rate": 4.605886167432595e-06, + "loss": 0.4591, + "step": 9552 + }, + { + "epoch": 3.7407508026673253, + "grad_norm": 0.48576022911474664, + "learning_rate": 4.605801728861235e-06, + "loss": 0.4561, + "step": 9553 + }, + { + "epoch": 3.7411459619659175, + "grad_norm": 0.4901974973756972, + "learning_rate": 4.6057172820195635e-06, + "loss": 0.4713, + "step": 9554 + }, + { + "epoch": 3.7415411212645098, + "grad_norm": 0.49639360986444836, + "learning_rate": 4.605632826907915e-06, + "loss": 0.4759, + "step": 9555 + }, + { + "epoch": 3.741936280563102, + "grad_norm": 0.4931648478189971, + "learning_rate": 4.605548363526619e-06, + "loss": 0.4587, + "step": 9556 + }, + { + "epoch": 3.7423314398616943, + "grad_norm": 0.4688955095749135, + "learning_rate": 4.605463891876006e-06, + "loss": 0.4648, + "step": 9557 + }, + { + "epoch": 3.7427265991602865, + "grad_norm": 0.49745821587087424, + "learning_rate": 4.605379411956411e-06, + "loss": 0.4603, + "step": 9558 + }, + { + "epoch": 3.7431217584588787, + "grad_norm": 0.46605724180238206, + "learning_rate": 4.605294923768164e-06, + "loss": 0.4536, + "step": 9559 + }, + { + "epoch": 3.743516917757471, + "grad_norm": 0.48666332590044686, + "learning_rate": 4.605210427311596e-06, + "loss": 0.4794, + "step": 9560 + }, + { + "epoch": 3.743912077056063, + "grad_norm": 0.49885552329714344, + "learning_rate": 4.605125922587041e-06, + "loss": 0.451, + "step": 9561 + }, + { + "epoch": 3.7443072363546555, + "grad_norm": 0.5000142082417693, + "learning_rate": 4.6050414095948294e-06, + "loss": 0.4529, + "step": 9562 + }, + { + "epoch": 3.7447023956532477, + "grad_norm": 0.4862982293683471, + "learning_rate": 4.604956888335292e-06, + "loss": 0.4614, + "step": 9563 + }, + { + "epoch": 3.74509755495184, + "grad_norm": 0.48972690615641834, + "learning_rate": 4.604872358808764e-06, + "loss": 0.4696, + "step": 9564 + }, + { + "epoch": 3.745492714250432, + "grad_norm": 0.48344315601917986, + "learning_rate": 4.604787821015575e-06, + "loss": 0.4618, + "step": 9565 + }, + { + "epoch": 3.7458878735490244, + "grad_norm": 0.501362692935816, + "learning_rate": 4.604703274956057e-06, + "loss": 0.4465, + "step": 9566 + }, + { + "epoch": 3.7462830328476167, + "grad_norm": 0.4869221765616658, + "learning_rate": 4.604618720630542e-06, + "loss": 0.4519, + "step": 9567 + }, + { + "epoch": 3.746678192146209, + "grad_norm": 0.49044552780260375, + "learning_rate": 4.604534158039364e-06, + "loss": 0.4575, + "step": 9568 + }, + { + "epoch": 3.747073351444801, + "grad_norm": 0.5337351622614847, + "learning_rate": 4.604449587182854e-06, + "loss": 0.4757, + "step": 9569 + }, + { + "epoch": 3.7474685107433934, + "grad_norm": 0.5142889870437418, + "learning_rate": 4.6043650080613434e-06, + "loss": 0.4368, + "step": 9570 + }, + { + "epoch": 3.7478636700419856, + "grad_norm": 0.5002069423060246, + "learning_rate": 4.604280420675165e-06, + "loss": 0.4662, + "step": 9571 + }, + { + "epoch": 3.748258829340578, + "grad_norm": 0.5012425468189804, + "learning_rate": 4.60419582502465e-06, + "loss": 0.4689, + "step": 9572 + }, + { + "epoch": 3.74865398863917, + "grad_norm": 0.48744682141535584, + "learning_rate": 4.6041112211101325e-06, + "loss": 0.4783, + "step": 9573 + }, + { + "epoch": 3.7490491479377623, + "grad_norm": 0.49788598446759974, + "learning_rate": 4.604026608931943e-06, + "loss": 0.4668, + "step": 9574 + }, + { + "epoch": 3.7494443072363546, + "grad_norm": 0.4964539038735511, + "learning_rate": 4.6039419884904155e-06, + "loss": 0.4768, + "step": 9575 + }, + { + "epoch": 3.749839466534947, + "grad_norm": 0.5034149134145032, + "learning_rate": 4.603857359785881e-06, + "loss": 0.4654, + "step": 9576 + }, + { + "epoch": 3.750234625833539, + "grad_norm": 0.5102791225039909, + "learning_rate": 4.6037727228186715e-06, + "loss": 0.4767, + "step": 9577 + }, + { + "epoch": 3.7506297851321313, + "grad_norm": 0.49641266816152013, + "learning_rate": 4.603688077589121e-06, + "loss": 0.4692, + "step": 9578 + }, + { + "epoch": 3.7510249444307235, + "grad_norm": 0.5170425495731209, + "learning_rate": 4.603603424097561e-06, + "loss": 0.4437, + "step": 9579 + }, + { + "epoch": 3.751420103729316, + "grad_norm": 0.4903857984331723, + "learning_rate": 4.603518762344325e-06, + "loss": 0.4665, + "step": 9580 + }, + { + "epoch": 3.751815263027908, + "grad_norm": 0.49410476736023734, + "learning_rate": 4.603434092329743e-06, + "loss": 0.4648, + "step": 9581 + }, + { + "epoch": 3.7522104223265003, + "grad_norm": 0.47620798317913865, + "learning_rate": 4.603349414054151e-06, + "loss": 0.4558, + "step": 9582 + }, + { + "epoch": 3.7526055816250925, + "grad_norm": 0.4909803148027834, + "learning_rate": 4.603264727517879e-06, + "loss": 0.4728, + "step": 9583 + }, + { + "epoch": 3.7530007409236847, + "grad_norm": 0.4841546764667915, + "learning_rate": 4.60318003272126e-06, + "loss": 0.4384, + "step": 9584 + }, + { + "epoch": 3.753395900222277, + "grad_norm": 0.49138074599478704, + "learning_rate": 4.603095329664627e-06, + "loss": 0.4509, + "step": 9585 + }, + { + "epoch": 3.7537910595208692, + "grad_norm": 0.4916572867617533, + "learning_rate": 4.6030106183483135e-06, + "loss": 0.4493, + "step": 9586 + }, + { + "epoch": 3.7541862188194615, + "grad_norm": 0.4867907743227322, + "learning_rate": 4.60292589877265e-06, + "loss": 0.4561, + "step": 9587 + }, + { + "epoch": 3.7545813781180537, + "grad_norm": 0.5044721101451728, + "learning_rate": 4.6028411709379715e-06, + "loss": 0.4525, + "step": 9588 + }, + { + "epoch": 3.754976537416646, + "grad_norm": 0.47514228642887263, + "learning_rate": 4.602756434844609e-06, + "loss": 0.4616, + "step": 9589 + }, + { + "epoch": 3.755371696715238, + "grad_norm": 0.776026961750764, + "learning_rate": 4.6026716904928965e-06, + "loss": 0.4883, + "step": 9590 + }, + { + "epoch": 3.7557668560138304, + "grad_norm": 0.4937700305585744, + "learning_rate": 4.602586937883167e-06, + "loss": 0.4647, + "step": 9591 + }, + { + "epoch": 3.7561620153124227, + "grad_norm": 0.47957905951413793, + "learning_rate": 4.6025021770157516e-06, + "loss": 0.4446, + "step": 9592 + }, + { + "epoch": 3.756557174611015, + "grad_norm": 0.5017133949581569, + "learning_rate": 4.602417407890984e-06, + "loss": 0.4884, + "step": 9593 + }, + { + "epoch": 3.756952333909607, + "grad_norm": 0.4870056949571294, + "learning_rate": 4.602332630509199e-06, + "loss": 0.4602, + "step": 9594 + }, + { + "epoch": 3.7573474932082, + "grad_norm": 0.5202279557981537, + "learning_rate": 4.602247844870728e-06, + "loss": 0.4776, + "step": 9595 + }, + { + "epoch": 3.757742652506792, + "grad_norm": 0.4973526737448953, + "learning_rate": 4.602163050975903e-06, + "loss": 0.4423, + "step": 9596 + }, + { + "epoch": 3.7581378118053843, + "grad_norm": 0.4983329516809982, + "learning_rate": 4.602078248825058e-06, + "loss": 0.4657, + "step": 9597 + }, + { + "epoch": 3.7585329711039765, + "grad_norm": 0.49221414055030543, + "learning_rate": 4.601993438418527e-06, + "loss": 0.4544, + "step": 9598 + }, + { + "epoch": 3.758928130402569, + "grad_norm": 0.4865035442026118, + "learning_rate": 4.601908619756642e-06, + "loss": 0.4511, + "step": 9599 + }, + { + "epoch": 3.759323289701161, + "grad_norm": 0.48356478090463534, + "learning_rate": 4.601823792839735e-06, + "loss": 0.4738, + "step": 9600 + }, + { + "epoch": 3.7597184489997533, + "grad_norm": 0.49314242592096336, + "learning_rate": 4.601738957668142e-06, + "loss": 0.463, + "step": 9601 + }, + { + "epoch": 3.7601136082983455, + "grad_norm": 0.5078494446402765, + "learning_rate": 4.601654114242194e-06, + "loss": 0.4618, + "step": 9602 + }, + { + "epoch": 3.7605087675969378, + "grad_norm": 0.5028843214675345, + "learning_rate": 4.6015692625622255e-06, + "loss": 0.4608, + "step": 9603 + }, + { + "epoch": 3.76090392689553, + "grad_norm": 0.5008323117874007, + "learning_rate": 4.601484402628569e-06, + "loss": 0.4549, + "step": 9604 + }, + { + "epoch": 3.7612990861941222, + "grad_norm": 0.4850454741183229, + "learning_rate": 4.6013995344415565e-06, + "loss": 0.4627, + "step": 9605 + }, + { + "epoch": 3.7616942454927145, + "grad_norm": 0.48808279245655617, + "learning_rate": 4.601314658001524e-06, + "loss": 0.4535, + "step": 9606 + }, + { + "epoch": 3.7620894047913067, + "grad_norm": 0.5066989763656677, + "learning_rate": 4.601229773308802e-06, + "loss": 0.4751, + "step": 9607 + }, + { + "epoch": 3.762484564089899, + "grad_norm": 0.4805137326607607, + "learning_rate": 4.601144880363726e-06, + "loss": 0.4638, + "step": 9608 + }, + { + "epoch": 3.762879723388491, + "grad_norm": 0.4922622961214075, + "learning_rate": 4.60105997916663e-06, + "loss": 0.4686, + "step": 9609 + }, + { + "epoch": 3.7632748826870834, + "grad_norm": 0.4810705349543995, + "learning_rate": 4.600975069717846e-06, + "loss": 0.4461, + "step": 9610 + }, + { + "epoch": 3.7636700419856757, + "grad_norm": 0.4773354945166677, + "learning_rate": 4.6008901520177065e-06, + "loss": 0.4516, + "step": 9611 + }, + { + "epoch": 3.764065201284268, + "grad_norm": 0.49206349143578293, + "learning_rate": 4.600805226066547e-06, + "loss": 0.4485, + "step": 9612 + }, + { + "epoch": 3.76446036058286, + "grad_norm": 0.4814701688954984, + "learning_rate": 4.6007202918647e-06, + "loss": 0.4489, + "step": 9613 + }, + { + "epoch": 3.7648555198814524, + "grad_norm": 0.5022048385321567, + "learning_rate": 4.600635349412499e-06, + "loss": 0.4721, + "step": 9614 + }, + { + "epoch": 3.7652506791800446, + "grad_norm": 0.48243788870808746, + "learning_rate": 4.600550398710278e-06, + "loss": 0.4464, + "step": 9615 + }, + { + "epoch": 3.765645838478637, + "grad_norm": 0.48757340703408836, + "learning_rate": 4.600465439758371e-06, + "loss": 0.4703, + "step": 9616 + }, + { + "epoch": 3.766040997777229, + "grad_norm": 0.48716159886857296, + "learning_rate": 4.600380472557112e-06, + "loss": 0.4519, + "step": 9617 + }, + { + "epoch": 3.7664361570758214, + "grad_norm": 0.5069818045941086, + "learning_rate": 4.600295497106833e-06, + "loss": 0.4943, + "step": 9618 + }, + { + "epoch": 3.7668313163744136, + "grad_norm": 0.5022666320922664, + "learning_rate": 4.6002105134078675e-06, + "loss": 0.4673, + "step": 9619 + }, + { + "epoch": 3.767226475673006, + "grad_norm": 0.4807723287562549, + "learning_rate": 4.600125521460552e-06, + "loss": 0.4646, + "step": 9620 + }, + { + "epoch": 3.767621634971598, + "grad_norm": 0.4912477141689686, + "learning_rate": 4.600040521265219e-06, + "loss": 0.4641, + "step": 9621 + }, + { + "epoch": 3.7680167942701903, + "grad_norm": 0.49929865985825106, + "learning_rate": 4.599955512822201e-06, + "loss": 0.4823, + "step": 9622 + }, + { + "epoch": 3.7684119535687826, + "grad_norm": 0.49230115169608873, + "learning_rate": 4.599870496131833e-06, + "loss": 0.4593, + "step": 9623 + }, + { + "epoch": 3.768807112867375, + "grad_norm": 0.48105300330907896, + "learning_rate": 4.59978547119445e-06, + "loss": 0.444, + "step": 9624 + }, + { + "epoch": 3.769202272165967, + "grad_norm": 0.4818330655568901, + "learning_rate": 4.599700438010382e-06, + "loss": 0.4473, + "step": 9625 + }, + { + "epoch": 3.7695974314645593, + "grad_norm": 0.4963785455532329, + "learning_rate": 4.599615396579968e-06, + "loss": 0.4342, + "step": 9626 + }, + { + "epoch": 3.7699925907631515, + "grad_norm": 0.4830399833200486, + "learning_rate": 4.5995303469035406e-06, + "loss": 0.44, + "step": 9627 + }, + { + "epoch": 3.7703877500617438, + "grad_norm": 0.47990178930501065, + "learning_rate": 4.59944528898143e-06, + "loss": 0.4504, + "step": 9628 + }, + { + "epoch": 3.770782909360336, + "grad_norm": 0.4786217792852282, + "learning_rate": 4.599360222813975e-06, + "loss": 0.4661, + "step": 9629 + }, + { + "epoch": 3.7711780686589282, + "grad_norm": 0.4805124331616531, + "learning_rate": 4.599275148401507e-06, + "loss": 0.4583, + "step": 9630 + }, + { + "epoch": 3.7715732279575205, + "grad_norm": 0.4879125432986269, + "learning_rate": 4.599190065744362e-06, + "loss": 0.4528, + "step": 9631 + }, + { + "epoch": 3.7719683872561127, + "grad_norm": 0.48314579798707913, + "learning_rate": 4.5991049748428725e-06, + "loss": 0.4537, + "step": 9632 + }, + { + "epoch": 3.772363546554705, + "grad_norm": 0.4903701935863649, + "learning_rate": 4.599019875697374e-06, + "loss": 0.4512, + "step": 9633 + }, + { + "epoch": 3.772758705853297, + "grad_norm": 0.5023398611383797, + "learning_rate": 4.598934768308199e-06, + "loss": 0.4546, + "step": 9634 + }, + { + "epoch": 3.7731538651518894, + "grad_norm": 0.6910082967623805, + "learning_rate": 4.598849652675683e-06, + "loss": 0.454, + "step": 9635 + }, + { + "epoch": 3.7735490244504817, + "grad_norm": 0.48924919480622175, + "learning_rate": 4.598764528800161e-06, + "loss": 0.4573, + "step": 9636 + }, + { + "epoch": 3.773944183749074, + "grad_norm": 0.5032570900259695, + "learning_rate": 4.598679396681964e-06, + "loss": 0.4735, + "step": 9637 + }, + { + "epoch": 3.774339343047666, + "grad_norm": 0.4884216635553054, + "learning_rate": 4.598594256321431e-06, + "loss": 0.4593, + "step": 9638 + }, + { + "epoch": 3.7747345023462584, + "grad_norm": 0.4839339436338744, + "learning_rate": 4.598509107718894e-06, + "loss": 0.4734, + "step": 9639 + }, + { + "epoch": 3.7751296616448506, + "grad_norm": 0.48156978064749584, + "learning_rate": 4.598423950874687e-06, + "loss": 0.4514, + "step": 9640 + }, + { + "epoch": 3.775524820943443, + "grad_norm": 0.4866270246823189, + "learning_rate": 4.598338785789144e-06, + "loss": 0.4615, + "step": 9641 + }, + { + "epoch": 3.775919980242035, + "grad_norm": 0.47743928665454505, + "learning_rate": 4.598253612462601e-06, + "loss": 0.4578, + "step": 9642 + }, + { + "epoch": 3.7763151395406274, + "grad_norm": 0.47599434611222147, + "learning_rate": 4.598168430895392e-06, + "loss": 0.4495, + "step": 9643 + }, + { + "epoch": 3.7767102988392196, + "grad_norm": 0.49548131853505706, + "learning_rate": 4.598083241087852e-06, + "loss": 0.4481, + "step": 9644 + }, + { + "epoch": 3.777105458137812, + "grad_norm": 0.536021204426832, + "learning_rate": 4.597998043040315e-06, + "loss": 0.4548, + "step": 9645 + }, + { + "epoch": 3.777500617436404, + "grad_norm": 0.49859452013864736, + "learning_rate": 4.597912836753116e-06, + "loss": 0.4472, + "step": 9646 + }, + { + "epoch": 3.7778957767349963, + "grad_norm": 0.4904930266960114, + "learning_rate": 4.597827622226588e-06, + "loss": 0.4407, + "step": 9647 + }, + { + "epoch": 3.7782909360335886, + "grad_norm": 0.5632944952584257, + "learning_rate": 4.597742399461067e-06, + "loss": 0.4677, + "step": 9648 + }, + { + "epoch": 3.778686095332181, + "grad_norm": 0.4825786026169142, + "learning_rate": 4.597657168456889e-06, + "loss": 0.4711, + "step": 9649 + }, + { + "epoch": 3.779081254630773, + "grad_norm": 0.48663207299892375, + "learning_rate": 4.5975719292143865e-06, + "loss": 0.4562, + "step": 9650 + }, + { + "epoch": 3.7794764139293653, + "grad_norm": 0.48151353926840385, + "learning_rate": 4.5974866817338955e-06, + "loss": 0.4491, + "step": 9651 + }, + { + "epoch": 3.7798715732279575, + "grad_norm": 0.490395399162099, + "learning_rate": 4.597401426015751e-06, + "loss": 0.4708, + "step": 9652 + }, + { + "epoch": 3.7802667325265498, + "grad_norm": 0.48180099915518, + "learning_rate": 4.597316162060287e-06, + "loss": 0.4571, + "step": 9653 + }, + { + "epoch": 3.780661891825142, + "grad_norm": 0.5093098515857098, + "learning_rate": 4.597230889867837e-06, + "loss": 0.4601, + "step": 9654 + }, + { + "epoch": 3.7810570511237342, + "grad_norm": 0.5002845474985712, + "learning_rate": 4.5971456094387395e-06, + "loss": 0.4705, + "step": 9655 + }, + { + "epoch": 3.7814522104223265, + "grad_norm": 0.49887974718155487, + "learning_rate": 4.597060320773327e-06, + "loss": 0.4552, + "step": 9656 + }, + { + "epoch": 3.7818473697209187, + "grad_norm": 0.4844913686111668, + "learning_rate": 4.596975023871935e-06, + "loss": 0.4494, + "step": 9657 + }, + { + "epoch": 3.782242529019511, + "grad_norm": 0.49008324798281255, + "learning_rate": 4.596889718734898e-06, + "loss": 0.4534, + "step": 9658 + }, + { + "epoch": 3.782637688318103, + "grad_norm": 0.48905798875896944, + "learning_rate": 4.5968044053625525e-06, + "loss": 0.4652, + "step": 9659 + }, + { + "epoch": 3.7830328476166954, + "grad_norm": 0.48024108850578784, + "learning_rate": 4.596719083755231e-06, + "loss": 0.4592, + "step": 9660 + }, + { + "epoch": 3.7834280069152877, + "grad_norm": 0.4900301758693202, + "learning_rate": 4.596633753913272e-06, + "loss": 0.4565, + "step": 9661 + }, + { + "epoch": 3.78382316621388, + "grad_norm": 0.4802953667319636, + "learning_rate": 4.596548415837007e-06, + "loss": 0.4543, + "step": 9662 + }, + { + "epoch": 3.784218325512472, + "grad_norm": 0.4837608289095752, + "learning_rate": 4.596463069526775e-06, + "loss": 0.4546, + "step": 9663 + }, + { + "epoch": 3.7846134848110644, + "grad_norm": 0.5182415469829378, + "learning_rate": 4.596377714982907e-06, + "loss": 0.4844, + "step": 9664 + }, + { + "epoch": 3.7850086441096567, + "grad_norm": 0.4966888453091403, + "learning_rate": 4.5962923522057415e-06, + "loss": 0.4576, + "step": 9665 + }, + { + "epoch": 3.785403803408249, + "grad_norm": 0.49753630366467044, + "learning_rate": 4.596206981195611e-06, + "loss": 0.468, + "step": 9666 + }, + { + "epoch": 3.785798962706841, + "grad_norm": 0.48874836080231554, + "learning_rate": 4.596121601952854e-06, + "loss": 0.4405, + "step": 9667 + }, + { + "epoch": 3.7861941220054334, + "grad_norm": 0.5076579580062619, + "learning_rate": 4.596036214477804e-06, + "loss": 0.467, + "step": 9668 + }, + { + "epoch": 3.7865892813040256, + "grad_norm": 0.49135910265658483, + "learning_rate": 4.595950818770796e-06, + "loss": 0.4639, + "step": 9669 + }, + { + "epoch": 3.786984440602618, + "grad_norm": 0.4840783957984401, + "learning_rate": 4.595865414832166e-06, + "loss": 0.4864, + "step": 9670 + }, + { + "epoch": 3.78737959990121, + "grad_norm": 0.5059648003209108, + "learning_rate": 4.59578000266225e-06, + "loss": 0.4623, + "step": 9671 + }, + { + "epoch": 3.7877747591998023, + "grad_norm": 0.5155562419723234, + "learning_rate": 4.595694582261382e-06, + "loss": 0.4552, + "step": 9672 + }, + { + "epoch": 3.7881699184983946, + "grad_norm": 0.49286246868120626, + "learning_rate": 4.595609153629899e-06, + "loss": 0.4531, + "step": 9673 + }, + { + "epoch": 3.788565077796987, + "grad_norm": 0.5030907783859865, + "learning_rate": 4.5955237167681356e-06, + "loss": 0.4809, + "step": 9674 + }, + { + "epoch": 3.788960237095579, + "grad_norm": 0.48208063419912994, + "learning_rate": 4.595438271676427e-06, + "loss": 0.4621, + "step": 9675 + }, + { + "epoch": 3.7893553963941713, + "grad_norm": 0.48623500793987223, + "learning_rate": 4.595352818355109e-06, + "loss": 0.4628, + "step": 9676 + }, + { + "epoch": 3.7897505556927635, + "grad_norm": 0.5114092445947878, + "learning_rate": 4.595267356804518e-06, + "loss": 0.4678, + "step": 9677 + }, + { + "epoch": 3.7901457149913558, + "grad_norm": 0.4776370080310583, + "learning_rate": 4.595181887024989e-06, + "loss": 0.4466, + "step": 9678 + }, + { + "epoch": 3.790540874289948, + "grad_norm": 0.5818700802644523, + "learning_rate": 4.595096409016858e-06, + "loss": 0.4536, + "step": 9679 + }, + { + "epoch": 3.7909360335885403, + "grad_norm": 0.4740846191844771, + "learning_rate": 4.595010922780461e-06, + "loss": 0.4419, + "step": 9680 + }, + { + "epoch": 3.7913311928871325, + "grad_norm": 0.49737461925124105, + "learning_rate": 4.594925428316132e-06, + "loss": 0.458, + "step": 9681 + }, + { + "epoch": 3.7917263521857247, + "grad_norm": 0.5051660541685692, + "learning_rate": 4.594839925624209e-06, + "loss": 0.4409, + "step": 9682 + }, + { + "epoch": 3.792121511484317, + "grad_norm": 0.4984405654390054, + "learning_rate": 4.594754414705027e-06, + "loss": 0.4665, + "step": 9683 + }, + { + "epoch": 3.792516670782909, + "grad_norm": 0.48241664062480294, + "learning_rate": 4.594668895558921e-06, + "loss": 0.4474, + "step": 9684 + }, + { + "epoch": 3.7929118300815015, + "grad_norm": 0.5191185844568438, + "learning_rate": 4.594583368186228e-06, + "loss": 0.4513, + "step": 9685 + }, + { + "epoch": 3.7933069893800937, + "grad_norm": 0.48750409539612494, + "learning_rate": 4.594497832587283e-06, + "loss": 0.4593, + "step": 9686 + }, + { + "epoch": 3.793702148678686, + "grad_norm": 0.5133154479299648, + "learning_rate": 4.594412288762423e-06, + "loss": 0.4735, + "step": 9687 + }, + { + "epoch": 3.794097307977278, + "grad_norm": 0.47361385821885527, + "learning_rate": 4.594326736711983e-06, + "loss": 0.4412, + "step": 9688 + }, + { + "epoch": 3.7944924672758704, + "grad_norm": 0.48210172755908104, + "learning_rate": 4.5942411764362985e-06, + "loss": 0.4547, + "step": 9689 + }, + { + "epoch": 3.7948876265744627, + "grad_norm": 0.4960098779386097, + "learning_rate": 4.5941556079357076e-06, + "loss": 0.4625, + "step": 9690 + }, + { + "epoch": 3.795282785873055, + "grad_norm": 0.4903029773758132, + "learning_rate": 4.5940700312105444e-06, + "loss": 0.4712, + "step": 9691 + }, + { + "epoch": 3.795677945171647, + "grad_norm": 0.5136839009508477, + "learning_rate": 4.593984446261146e-06, + "loss": 0.473, + "step": 9692 + }, + { + "epoch": 3.7960731044702394, + "grad_norm": 0.4747156730400875, + "learning_rate": 4.5938988530878485e-06, + "loss": 0.4611, + "step": 9693 + }, + { + "epoch": 3.7964682637688316, + "grad_norm": 0.4870902649544296, + "learning_rate": 4.5938132516909865e-06, + "loss": 0.4524, + "step": 9694 + }, + { + "epoch": 3.796863423067424, + "grad_norm": 0.5010946794796335, + "learning_rate": 4.593727642070899e-06, + "loss": 0.4849, + "step": 9695 + }, + { + "epoch": 3.797258582366016, + "grad_norm": 0.49536194899113356, + "learning_rate": 4.593642024227919e-06, + "loss": 0.4797, + "step": 9696 + }, + { + "epoch": 3.7976537416646083, + "grad_norm": 0.4930240021909693, + "learning_rate": 4.593556398162386e-06, + "loss": 0.4596, + "step": 9697 + }, + { + "epoch": 3.7980489009632006, + "grad_norm": 0.4921928001724572, + "learning_rate": 4.5934707638746344e-06, + "loss": 0.4532, + "step": 9698 + }, + { + "epoch": 3.798444060261793, + "grad_norm": 0.4973760960380266, + "learning_rate": 4.5933851213650005e-06, + "loss": 0.462, + "step": 9699 + }, + { + "epoch": 3.798839219560385, + "grad_norm": 0.47361402891352083, + "learning_rate": 4.593299470633821e-06, + "loss": 0.4658, + "step": 9700 + }, + { + "epoch": 3.7992343788589773, + "grad_norm": 0.48887488140182345, + "learning_rate": 4.593213811681433e-06, + "loss": 0.455, + "step": 9701 + }, + { + "epoch": 3.7996295381575695, + "grad_norm": 0.4867609427093806, + "learning_rate": 4.593128144508171e-06, + "loss": 0.4464, + "step": 9702 + }, + { + "epoch": 3.800024697456162, + "grad_norm": 0.5026991707874338, + "learning_rate": 4.593042469114374e-06, + "loss": 0.4899, + "step": 9703 + }, + { + "epoch": 3.800419856754754, + "grad_norm": 0.47389953125387, + "learning_rate": 4.592956785500376e-06, + "loss": 0.4522, + "step": 9704 + }, + { + "epoch": 3.8008150160533463, + "grad_norm": 0.49268631953734526, + "learning_rate": 4.5928710936665156e-06, + "loss": 0.4552, + "step": 9705 + }, + { + "epoch": 3.8012101753519385, + "grad_norm": 0.4743740110089471, + "learning_rate": 4.592785393613128e-06, + "loss": 0.4472, + "step": 9706 + }, + { + "epoch": 3.8016053346505307, + "grad_norm": 0.4888493962366472, + "learning_rate": 4.59269968534055e-06, + "loss": 0.4569, + "step": 9707 + }, + { + "epoch": 3.802000493949123, + "grad_norm": 0.49012796067105, + "learning_rate": 4.592613968849119e-06, + "loss": 0.4644, + "step": 9708 + }, + { + "epoch": 3.8023956532477152, + "grad_norm": 0.48032600347511467, + "learning_rate": 4.592528244139171e-06, + "loss": 0.4587, + "step": 9709 + }, + { + "epoch": 3.8027908125463075, + "grad_norm": 0.49803327331966196, + "learning_rate": 4.5924425112110425e-06, + "loss": 0.4694, + "step": 9710 + }, + { + "epoch": 3.8031859718448997, + "grad_norm": 0.4876381618700325, + "learning_rate": 4.59235677006507e-06, + "loss": 0.4651, + "step": 9711 + }, + { + "epoch": 3.803581131143492, + "grad_norm": 0.4865814270109722, + "learning_rate": 4.592271020701591e-06, + "loss": 0.4516, + "step": 9712 + }, + { + "epoch": 3.8039762904420846, + "grad_norm": 0.47309099910390273, + "learning_rate": 4.592185263120942e-06, + "loss": 0.4569, + "step": 9713 + }, + { + "epoch": 3.804371449740677, + "grad_norm": 0.4828676962421864, + "learning_rate": 4.592099497323459e-06, + "loss": 0.4503, + "step": 9714 + }, + { + "epoch": 3.804766609039269, + "grad_norm": 0.5044657636177922, + "learning_rate": 4.592013723309481e-06, + "loss": 0.4534, + "step": 9715 + }, + { + "epoch": 3.8051617683378613, + "grad_norm": 0.4741720246957415, + "learning_rate": 4.591927941079341e-06, + "loss": 0.4707, + "step": 9716 + }, + { + "epoch": 3.8055569276364536, + "grad_norm": 0.49614095314653145, + "learning_rate": 4.591842150633381e-06, + "loss": 0.4493, + "step": 9717 + }, + { + "epoch": 3.805952086935046, + "grad_norm": 0.4882532495153286, + "learning_rate": 4.5917563519719334e-06, + "loss": 0.4653, + "step": 9718 + }, + { + "epoch": 3.806347246233638, + "grad_norm": 0.4949527043726383, + "learning_rate": 4.5916705450953384e-06, + "loss": 0.4679, + "step": 9719 + }, + { + "epoch": 3.8067424055322303, + "grad_norm": 0.5602238728690052, + "learning_rate": 4.591584730003931e-06, + "loss": 0.4557, + "step": 9720 + }, + { + "epoch": 3.8071375648308226, + "grad_norm": 0.4912568517475018, + "learning_rate": 4.591498906698048e-06, + "loss": 0.4709, + "step": 9721 + }, + { + "epoch": 3.807532724129415, + "grad_norm": 0.4813539895906584, + "learning_rate": 4.591413075178029e-06, + "loss": 0.4531, + "step": 9722 + }, + { + "epoch": 3.807927883428007, + "grad_norm": 0.49909964708037335, + "learning_rate": 4.591327235444209e-06, + "loss": 0.4624, + "step": 9723 + }, + { + "epoch": 3.8083230427265993, + "grad_norm": 0.4814723511010151, + "learning_rate": 4.591241387496925e-06, + "loss": 0.4705, + "step": 9724 + }, + { + "epoch": 3.8087182020251915, + "grad_norm": 0.4827612230715431, + "learning_rate": 4.591155531336514e-06, + "loss": 0.4532, + "step": 9725 + }, + { + "epoch": 3.8091133613237838, + "grad_norm": 0.4938055389724764, + "learning_rate": 4.591069666963315e-06, + "loss": 0.4554, + "step": 9726 + }, + { + "epoch": 3.809508520622376, + "grad_norm": 0.4852708740829828, + "learning_rate": 4.590983794377664e-06, + "loss": 0.4645, + "step": 9727 + }, + { + "epoch": 3.8099036799209682, + "grad_norm": 0.48556679669930314, + "learning_rate": 4.590897913579898e-06, + "loss": 0.4488, + "step": 9728 + }, + { + "epoch": 3.8102988392195605, + "grad_norm": 0.48745710424116256, + "learning_rate": 4.590812024570355e-06, + "loss": 0.4391, + "step": 9729 + }, + { + "epoch": 3.8106939985181527, + "grad_norm": 0.4941001569984601, + "learning_rate": 4.590726127349372e-06, + "loss": 0.4688, + "step": 9730 + }, + { + "epoch": 3.811089157816745, + "grad_norm": 0.5066586936353458, + "learning_rate": 4.5906402219172865e-06, + "loss": 0.4672, + "step": 9731 + }, + { + "epoch": 3.811484317115337, + "grad_norm": 0.49499770510893454, + "learning_rate": 4.590554308274435e-06, + "loss": 0.4665, + "step": 9732 + }, + { + "epoch": 3.8118794764139294, + "grad_norm": 0.48385700939951193, + "learning_rate": 4.5904683864211564e-06, + "loss": 0.4287, + "step": 9733 + }, + { + "epoch": 3.8122746357125217, + "grad_norm": 0.5006235625670952, + "learning_rate": 4.590382456357787e-06, + "loss": 0.4763, + "step": 9734 + }, + { + "epoch": 3.812669795011114, + "grad_norm": 0.5063019446978001, + "learning_rate": 4.5902965180846645e-06, + "loss": 0.4849, + "step": 9735 + }, + { + "epoch": 3.813064954309706, + "grad_norm": 0.47648185972596263, + "learning_rate": 4.590210571602127e-06, + "loss": 0.4518, + "step": 9736 + }, + { + "epoch": 3.8134601136082984, + "grad_norm": 0.48236587628734506, + "learning_rate": 4.590124616910511e-06, + "loss": 0.4613, + "step": 9737 + }, + { + "epoch": 3.8138552729068906, + "grad_norm": 0.4854824711766063, + "learning_rate": 4.590038654010155e-06, + "loss": 0.4611, + "step": 9738 + }, + { + "epoch": 3.814250432205483, + "grad_norm": 0.47570054733499323, + "learning_rate": 4.5899526829013966e-06, + "loss": 0.4569, + "step": 9739 + }, + { + "epoch": 3.814645591504075, + "grad_norm": 0.47994844957076466, + "learning_rate": 4.589866703584573e-06, + "loss": 0.465, + "step": 9740 + }, + { + "epoch": 3.8150407508026674, + "grad_norm": 0.49181023857768696, + "learning_rate": 4.589780716060022e-06, + "loss": 0.4519, + "step": 9741 + }, + { + "epoch": 3.8154359101012596, + "grad_norm": 0.4784710573418661, + "learning_rate": 4.589694720328081e-06, + "loss": 0.4663, + "step": 9742 + }, + { + "epoch": 3.815831069399852, + "grad_norm": 0.4852518817476975, + "learning_rate": 4.589608716389088e-06, + "loss": 0.4718, + "step": 9743 + }, + { + "epoch": 3.816226228698444, + "grad_norm": 0.4869284641618619, + "learning_rate": 4.589522704243381e-06, + "loss": 0.4675, + "step": 9744 + }, + { + "epoch": 3.8166213879970363, + "grad_norm": 0.4895242207948993, + "learning_rate": 4.589436683891299e-06, + "loss": 0.4726, + "step": 9745 + }, + { + "epoch": 3.8170165472956286, + "grad_norm": 0.49471582563789906, + "learning_rate": 4.589350655333177e-06, + "loss": 0.4548, + "step": 9746 + }, + { + "epoch": 3.817411706594221, + "grad_norm": 0.48302731444823493, + "learning_rate": 4.589264618569355e-06, + "loss": 0.453, + "step": 9747 + }, + { + "epoch": 3.817806865892813, + "grad_norm": 0.4991821360059649, + "learning_rate": 4.5891785736001696e-06, + "loss": 0.4618, + "step": 9748 + }, + { + "epoch": 3.8182020251914053, + "grad_norm": 0.5035434112433709, + "learning_rate": 4.58909252042596e-06, + "loss": 0.4794, + "step": 9749 + }, + { + "epoch": 3.8185971844899975, + "grad_norm": 0.5381210527580598, + "learning_rate": 4.589006459047063e-06, + "loss": 0.457, + "step": 9750 + }, + { + "epoch": 3.8189923437885898, + "grad_norm": 0.5042220819531829, + "learning_rate": 4.588920389463817e-06, + "loss": 0.4713, + "step": 9751 + }, + { + "epoch": 3.819387503087182, + "grad_norm": 0.49824542001689043, + "learning_rate": 4.5888343116765604e-06, + "loss": 0.459, + "step": 9752 + }, + { + "epoch": 3.8197826623857742, + "grad_norm": 0.4812925277928597, + "learning_rate": 4.5887482256856305e-06, + "loss": 0.4468, + "step": 9753 + }, + { + "epoch": 3.8201778216843665, + "grad_norm": 0.4867523713344396, + "learning_rate": 4.588662131491367e-06, + "loss": 0.4725, + "step": 9754 + }, + { + "epoch": 3.8205729809829587, + "grad_norm": 0.5045119465234934, + "learning_rate": 4.588576029094107e-06, + "loss": 0.4651, + "step": 9755 + }, + { + "epoch": 3.820968140281551, + "grad_norm": 0.49383831121424904, + "learning_rate": 4.588489918494188e-06, + "loss": 0.4575, + "step": 9756 + }, + { + "epoch": 3.821363299580143, + "grad_norm": 0.49047441119572965, + "learning_rate": 4.588403799691949e-06, + "loss": 0.4587, + "step": 9757 + }, + { + "epoch": 3.8217584588787354, + "grad_norm": 0.513029731134795, + "learning_rate": 4.5883176726877276e-06, + "loss": 0.4473, + "step": 9758 + }, + { + "epoch": 3.8221536181773277, + "grad_norm": 0.5013246021960757, + "learning_rate": 4.588231537481863e-06, + "loss": 0.4535, + "step": 9759 + }, + { + "epoch": 3.82254877747592, + "grad_norm": 0.4916166616553396, + "learning_rate": 4.5881453940746925e-06, + "loss": 0.4626, + "step": 9760 + }, + { + "epoch": 3.822943936774512, + "grad_norm": 0.49314443616382275, + "learning_rate": 4.588059242466555e-06, + "loss": 0.4711, + "step": 9761 + }, + { + "epoch": 3.8233390960731044, + "grad_norm": 0.5008098059952933, + "learning_rate": 4.587973082657789e-06, + "loss": 0.445, + "step": 9762 + }, + { + "epoch": 3.8237342553716966, + "grad_norm": 0.4838807035601777, + "learning_rate": 4.587886914648733e-06, + "loss": 0.456, + "step": 9763 + }, + { + "epoch": 3.824129414670289, + "grad_norm": 0.48847805670933026, + "learning_rate": 4.587800738439725e-06, + "loss": 0.461, + "step": 9764 + }, + { + "epoch": 3.824524573968881, + "grad_norm": 0.4836684897311038, + "learning_rate": 4.587714554031103e-06, + "loss": 0.4676, + "step": 9765 + }, + { + "epoch": 3.8249197332674734, + "grad_norm": 0.4744461410812829, + "learning_rate": 4.587628361423205e-06, + "loss": 0.4467, + "step": 9766 + }, + { + "epoch": 3.8253148925660656, + "grad_norm": 0.5046875959722604, + "learning_rate": 4.587542160616372e-06, + "loss": 0.4594, + "step": 9767 + }, + { + "epoch": 3.825710051864658, + "grad_norm": 0.4894718925080993, + "learning_rate": 4.587455951610941e-06, + "loss": 0.4596, + "step": 9768 + }, + { + "epoch": 3.82610521116325, + "grad_norm": 0.6451510629529777, + "learning_rate": 4.58736973440725e-06, + "loss": 0.456, + "step": 9769 + }, + { + "epoch": 3.8265003704618423, + "grad_norm": 0.49698088184421574, + "learning_rate": 4.587283509005638e-06, + "loss": 0.4729, + "step": 9770 + }, + { + "epoch": 3.8268955297604346, + "grad_norm": 0.4744119187512547, + "learning_rate": 4.587197275406444e-06, + "loss": 0.4611, + "step": 9771 + }, + { + "epoch": 3.827290689059027, + "grad_norm": 0.48047611658306816, + "learning_rate": 4.587111033610007e-06, + "loss": 0.4404, + "step": 9772 + }, + { + "epoch": 3.827685848357619, + "grad_norm": 0.5074066632267403, + "learning_rate": 4.587024783616665e-06, + "loss": 0.4595, + "step": 9773 + }, + { + "epoch": 3.8280810076562113, + "grad_norm": 0.4886772671378936, + "learning_rate": 4.586938525426756e-06, + "loss": 0.4584, + "step": 9774 + }, + { + "epoch": 3.8284761669548035, + "grad_norm": 0.4855093381504905, + "learning_rate": 4.586852259040621e-06, + "loss": 0.4727, + "step": 9775 + }, + { + "epoch": 3.8288713262533958, + "grad_norm": 0.47748604681856477, + "learning_rate": 4.586765984458597e-06, + "loss": 0.441, + "step": 9776 + }, + { + "epoch": 3.829266485551988, + "grad_norm": 0.48420777791114294, + "learning_rate": 4.586679701681024e-06, + "loss": 0.4587, + "step": 9777 + }, + { + "epoch": 3.8296616448505802, + "grad_norm": 0.4833361501646878, + "learning_rate": 4.586593410708239e-06, + "loss": 0.4474, + "step": 9778 + }, + { + "epoch": 3.8300568041491725, + "grad_norm": 0.49869671048495123, + "learning_rate": 4.586507111540583e-06, + "loss": 0.4707, + "step": 9779 + }, + { + "epoch": 3.8304519634477647, + "grad_norm": 0.4879810325262599, + "learning_rate": 4.586420804178394e-06, + "loss": 0.4575, + "step": 9780 + }, + { + "epoch": 3.830847122746357, + "grad_norm": 0.48913117133273337, + "learning_rate": 4.58633448862201e-06, + "loss": 0.4654, + "step": 9781 + }, + { + "epoch": 3.831242282044949, + "grad_norm": 0.5998061332057096, + "learning_rate": 4.586248164871773e-06, + "loss": 0.4651, + "step": 9782 + }, + { + "epoch": 3.8316374413435415, + "grad_norm": 0.5325641228028988, + "learning_rate": 4.5861618329280185e-06, + "loss": 0.4674, + "step": 9783 + }, + { + "epoch": 3.832032600642134, + "grad_norm": 0.48101331017897647, + "learning_rate": 4.586075492791088e-06, + "loss": 0.4709, + "step": 9784 + }, + { + "epoch": 3.8324277599407264, + "grad_norm": 0.48810663701291834, + "learning_rate": 4.585989144461319e-06, + "loss": 0.4682, + "step": 9785 + }, + { + "epoch": 3.8328229192393186, + "grad_norm": 0.49088685212482186, + "learning_rate": 4.585902787939052e-06, + "loss": 0.4667, + "step": 9786 + }, + { + "epoch": 3.833218078537911, + "grad_norm": 0.4816964576299058, + "learning_rate": 4.585816423224625e-06, + "loss": 0.4619, + "step": 9787 + }, + { + "epoch": 3.833613237836503, + "grad_norm": 0.49202140202010347, + "learning_rate": 4.585730050318378e-06, + "loss": 0.4692, + "step": 9788 + }, + { + "epoch": 3.8340083971350953, + "grad_norm": 0.49988725949905755, + "learning_rate": 4.585643669220648e-06, + "loss": 0.4513, + "step": 9789 + }, + { + "epoch": 3.8344035564336876, + "grad_norm": 0.4826706645801356, + "learning_rate": 4.585557279931779e-06, + "loss": 0.4474, + "step": 9790 + }, + { + "epoch": 3.83479871573228, + "grad_norm": 0.49913183679571577, + "learning_rate": 4.585470882452106e-06, + "loss": 0.4592, + "step": 9791 + }, + { + "epoch": 3.835193875030872, + "grad_norm": 0.5102617938895149, + "learning_rate": 4.58538447678197e-06, + "loss": 0.4622, + "step": 9792 + }, + { + "epoch": 3.8355890343294643, + "grad_norm": 0.5093288577737429, + "learning_rate": 4.58529806292171e-06, + "loss": 0.4876, + "step": 9793 + }, + { + "epoch": 3.8359841936280565, + "grad_norm": 0.4847749359173575, + "learning_rate": 4.585211640871665e-06, + "loss": 0.4527, + "step": 9794 + }, + { + "epoch": 3.836379352926649, + "grad_norm": 0.4967128561121068, + "learning_rate": 4.5851252106321755e-06, + "loss": 0.4512, + "step": 9795 + }, + { + "epoch": 3.836774512225241, + "grad_norm": 0.5122771054364694, + "learning_rate": 4.585038772203581e-06, + "loss": 0.4581, + "step": 9796 + }, + { + "epoch": 3.8371696715238333, + "grad_norm": 0.4778663343535554, + "learning_rate": 4.584952325586219e-06, + "loss": 0.455, + "step": 9797 + }, + { + "epoch": 3.8375648308224255, + "grad_norm": 0.4730286915929293, + "learning_rate": 4.584865870780431e-06, + "loss": 0.4438, + "step": 9798 + }, + { + "epoch": 3.8379599901210177, + "grad_norm": 0.49474641210541287, + "learning_rate": 4.584779407786556e-06, + "loss": 0.4647, + "step": 9799 + }, + { + "epoch": 3.83835514941961, + "grad_norm": 0.4979187357021784, + "learning_rate": 4.5846929366049316e-06, + "loss": 0.469, + "step": 9800 + }, + { + "epoch": 3.838750308718202, + "grad_norm": 0.4969648880958739, + "learning_rate": 4.584606457235901e-06, + "loss": 0.472, + "step": 9801 + }, + { + "epoch": 3.8391454680167945, + "grad_norm": 0.48419610177762074, + "learning_rate": 4.584519969679803e-06, + "loss": 0.4609, + "step": 9802 + }, + { + "epoch": 3.8395406273153867, + "grad_norm": 0.47519923964504873, + "learning_rate": 4.584433473936975e-06, + "loss": 0.4407, + "step": 9803 + }, + { + "epoch": 3.839935786613979, + "grad_norm": 0.5036626686511386, + "learning_rate": 4.584346970007758e-06, + "loss": 0.4666, + "step": 9804 + }, + { + "epoch": 3.840330945912571, + "grad_norm": 0.49885414130181815, + "learning_rate": 4.584260457892492e-06, + "loss": 0.4486, + "step": 9805 + }, + { + "epoch": 3.8407261052111634, + "grad_norm": 0.4973984116213614, + "learning_rate": 4.584173937591516e-06, + "loss": 0.4716, + "step": 9806 + }, + { + "epoch": 3.8411212645097557, + "grad_norm": 0.47438026051623494, + "learning_rate": 4.584087409105171e-06, + "loss": 0.4508, + "step": 9807 + }, + { + "epoch": 3.841516423808348, + "grad_norm": 0.48007747354455976, + "learning_rate": 4.5840008724337955e-06, + "loss": 0.4631, + "step": 9808 + }, + { + "epoch": 3.84191158310694, + "grad_norm": 0.587833445348625, + "learning_rate": 4.583914327577731e-06, + "loss": 0.4741, + "step": 9809 + }, + { + "epoch": 3.8423067424055324, + "grad_norm": 0.499951286009737, + "learning_rate": 4.583827774537316e-06, + "loss": 0.456, + "step": 9810 + }, + { + "epoch": 3.8427019017041246, + "grad_norm": 0.4848679924648173, + "learning_rate": 4.583741213312891e-06, + "loss": 0.4729, + "step": 9811 + }, + { + "epoch": 3.843097061002717, + "grad_norm": 0.48840107615604933, + "learning_rate": 4.583654643904796e-06, + "loss": 0.4642, + "step": 9812 + }, + { + "epoch": 3.843492220301309, + "grad_norm": 0.4968970099252351, + "learning_rate": 4.5835680663133705e-06, + "loss": 0.4621, + "step": 9813 + }, + { + "epoch": 3.8438873795999013, + "grad_norm": 0.47240176053138544, + "learning_rate": 4.583481480538955e-06, + "loss": 0.458, + "step": 9814 + }, + { + "epoch": 3.8442825388984936, + "grad_norm": 0.4873267364456109, + "learning_rate": 4.583394886581889e-06, + "loss": 0.4614, + "step": 9815 + }, + { + "epoch": 3.844677698197086, + "grad_norm": 0.4813398493399383, + "learning_rate": 4.5833082844425135e-06, + "loss": 0.4543, + "step": 9816 + }, + { + "epoch": 3.845072857495678, + "grad_norm": 0.49742091745038736, + "learning_rate": 4.583221674121167e-06, + "loss": 0.4437, + "step": 9817 + }, + { + "epoch": 3.8454680167942703, + "grad_norm": 0.4937773614181849, + "learning_rate": 4.5831350556181934e-06, + "loss": 0.4668, + "step": 9818 + }, + { + "epoch": 3.8458631760928625, + "grad_norm": 0.4807185417064237, + "learning_rate": 4.583048428933928e-06, + "loss": 0.4303, + "step": 9819 + }, + { + "epoch": 3.846258335391455, + "grad_norm": 0.49996366474147874, + "learning_rate": 4.582961794068714e-06, + "loss": 0.463, + "step": 9820 + }, + { + "epoch": 3.846653494690047, + "grad_norm": 0.4767782183375261, + "learning_rate": 4.582875151022891e-06, + "loss": 0.4725, + "step": 9821 + }, + { + "epoch": 3.8470486539886393, + "grad_norm": 0.48139899842359823, + "learning_rate": 4.582788499796798e-06, + "loss": 0.4533, + "step": 9822 + }, + { + "epoch": 3.8474438132872315, + "grad_norm": 0.4963731934676299, + "learning_rate": 4.582701840390778e-06, + "loss": 0.4667, + "step": 9823 + }, + { + "epoch": 3.8478389725858237, + "grad_norm": 0.48437203938631246, + "learning_rate": 4.5826151728051696e-06, + "loss": 0.4576, + "step": 9824 + }, + { + "epoch": 3.848234131884416, + "grad_norm": 0.4788898724749113, + "learning_rate": 4.582528497040313e-06, + "loss": 0.4588, + "step": 9825 + }, + { + "epoch": 3.8486292911830082, + "grad_norm": 0.4864655138587333, + "learning_rate": 4.5824418130965485e-06, + "loss": 0.4664, + "step": 9826 + }, + { + "epoch": 3.8490244504816005, + "grad_norm": 0.491011327584925, + "learning_rate": 4.582355120974218e-06, + "loss": 0.443, + "step": 9827 + }, + { + "epoch": 3.8494196097801927, + "grad_norm": 0.5037804637537733, + "learning_rate": 4.582268420673661e-06, + "loss": 0.496, + "step": 9828 + }, + { + "epoch": 3.849814769078785, + "grad_norm": 0.4911686450431375, + "learning_rate": 4.582181712195218e-06, + "loss": 0.4648, + "step": 9829 + }, + { + "epoch": 3.850209928377377, + "grad_norm": 0.4885405950646796, + "learning_rate": 4.582094995539229e-06, + "loss": 0.4663, + "step": 9830 + }, + { + "epoch": 3.8506050876759694, + "grad_norm": 0.46999551043751275, + "learning_rate": 4.582008270706035e-06, + "loss": 0.4411, + "step": 9831 + }, + { + "epoch": 3.8510002469745617, + "grad_norm": 0.4882884684859506, + "learning_rate": 4.581921537695978e-06, + "loss": 0.4626, + "step": 9832 + }, + { + "epoch": 3.851395406273154, + "grad_norm": 0.4913585267933321, + "learning_rate": 4.581834796509397e-06, + "loss": 0.457, + "step": 9833 + }, + { + "epoch": 3.851790565571746, + "grad_norm": 0.4659923322842953, + "learning_rate": 4.581748047146633e-06, + "loss": 0.4434, + "step": 9834 + }, + { + "epoch": 3.8521857248703384, + "grad_norm": 0.49559627762410036, + "learning_rate": 4.581661289608027e-06, + "loss": 0.4724, + "step": 9835 + }, + { + "epoch": 3.8525808841689306, + "grad_norm": 0.4881260286548596, + "learning_rate": 4.581574523893919e-06, + "loss": 0.4519, + "step": 9836 + }, + { + "epoch": 3.852976043467523, + "grad_norm": 0.488542461742271, + "learning_rate": 4.581487750004651e-06, + "loss": 0.4687, + "step": 9837 + }, + { + "epoch": 3.853371202766115, + "grad_norm": 0.47946619013094255, + "learning_rate": 4.581400967940562e-06, + "loss": 0.4507, + "step": 9838 + }, + { + "epoch": 3.8537663620647074, + "grad_norm": 0.49113873280049447, + "learning_rate": 4.581314177701994e-06, + "loss": 0.4652, + "step": 9839 + }, + { + "epoch": 3.8541615213632996, + "grad_norm": 0.506053760875361, + "learning_rate": 4.581227379289288e-06, + "loss": 0.4565, + "step": 9840 + }, + { + "epoch": 3.854556680661892, + "grad_norm": 0.498570278337444, + "learning_rate": 4.581140572702785e-06, + "loss": 0.4572, + "step": 9841 + }, + { + "epoch": 3.854951839960484, + "grad_norm": 0.4750541834581741, + "learning_rate": 4.5810537579428255e-06, + "loss": 0.4602, + "step": 9842 + }, + { + "epoch": 3.8553469992590763, + "grad_norm": 0.48656050386521776, + "learning_rate": 4.580966935009751e-06, + "loss": 0.4811, + "step": 9843 + }, + { + "epoch": 3.8557421585576686, + "grad_norm": 0.48249156384469394, + "learning_rate": 4.580880103903901e-06, + "loss": 0.4764, + "step": 9844 + }, + { + "epoch": 3.856137317856261, + "grad_norm": 0.4880019438321168, + "learning_rate": 4.580793264625618e-06, + "loss": 0.4573, + "step": 9845 + }, + { + "epoch": 3.856532477154853, + "grad_norm": 0.5055590149018785, + "learning_rate": 4.5807064171752426e-06, + "loss": 0.4484, + "step": 9846 + }, + { + "epoch": 3.8569276364534453, + "grad_norm": 0.48601910111621366, + "learning_rate": 4.580619561553116e-06, + "loss": 0.4391, + "step": 9847 + }, + { + "epoch": 3.8573227957520375, + "grad_norm": 0.5571216265744174, + "learning_rate": 4.5805326977595784e-06, + "loss": 0.4402, + "step": 9848 + }, + { + "epoch": 3.8577179550506298, + "grad_norm": 0.49789507388969795, + "learning_rate": 4.5804458257949725e-06, + "loss": 0.4753, + "step": 9849 + }, + { + "epoch": 3.858113114349222, + "grad_norm": 0.493498154898902, + "learning_rate": 4.580358945659639e-06, + "loss": 0.4788, + "step": 9850 + }, + { + "epoch": 3.8585082736478142, + "grad_norm": 0.48765613657828505, + "learning_rate": 4.580272057353918e-06, + "loss": 0.4624, + "step": 9851 + }, + { + "epoch": 3.8589034329464065, + "grad_norm": 0.4844714607389518, + "learning_rate": 4.580185160878151e-06, + "loss": 0.459, + "step": 9852 + }, + { + "epoch": 3.8592985922449987, + "grad_norm": 0.497325111713686, + "learning_rate": 4.580098256232681e-06, + "loss": 0.4842, + "step": 9853 + }, + { + "epoch": 3.859693751543591, + "grad_norm": 0.48442884762655214, + "learning_rate": 4.5800113434178485e-06, + "loss": 0.4475, + "step": 9854 + }, + { + "epoch": 3.860088910842183, + "grad_norm": 0.4942069483791756, + "learning_rate": 4.579924422433993e-06, + "loss": 0.458, + "step": 9855 + }, + { + "epoch": 3.8604840701407754, + "grad_norm": 0.48273095849038933, + "learning_rate": 4.579837493281459e-06, + "loss": 0.459, + "step": 9856 + }, + { + "epoch": 3.8608792294393677, + "grad_norm": 0.48744132992622485, + "learning_rate": 4.579750555960585e-06, + "loss": 0.466, + "step": 9857 + }, + { + "epoch": 3.86127438873796, + "grad_norm": 0.4947598696132769, + "learning_rate": 4.5796636104717155e-06, + "loss": 0.4701, + "step": 9858 + }, + { + "epoch": 3.861669548036552, + "grad_norm": 0.4856546204635866, + "learning_rate": 4.579576656815188e-06, + "loss": 0.4595, + "step": 9859 + }, + { + "epoch": 3.8620647073351444, + "grad_norm": 0.484668155652294, + "learning_rate": 4.579489694991347e-06, + "loss": 0.452, + "step": 9860 + }, + { + "epoch": 3.8624598666337366, + "grad_norm": 0.49823648475531956, + "learning_rate": 4.579402725000534e-06, + "loss": 0.4525, + "step": 9861 + }, + { + "epoch": 3.862855025932329, + "grad_norm": 0.4879242829207528, + "learning_rate": 4.579315746843088e-06, + "loss": 0.4682, + "step": 9862 + }, + { + "epoch": 3.863250185230921, + "grad_norm": 0.5043668954023681, + "learning_rate": 4.579228760519354e-06, + "loss": 0.4696, + "step": 9863 + }, + { + "epoch": 3.8636453445295134, + "grad_norm": 0.47800525043818226, + "learning_rate": 4.579141766029672e-06, + "loss": 0.4392, + "step": 9864 + }, + { + "epoch": 3.8640405038281056, + "grad_norm": 0.4946490914928116, + "learning_rate": 4.579054763374383e-06, + "loss": 0.4515, + "step": 9865 + }, + { + "epoch": 3.864435663126698, + "grad_norm": 0.48212116695919593, + "learning_rate": 4.578967752553829e-06, + "loss": 0.4771, + "step": 9866 + }, + { + "epoch": 3.86483082242529, + "grad_norm": 0.49582742338022634, + "learning_rate": 4.578880733568353e-06, + "loss": 0.4477, + "step": 9867 + }, + { + "epoch": 3.8652259817238823, + "grad_norm": 0.49339493973167836, + "learning_rate": 4.578793706418295e-06, + "loss": 0.4487, + "step": 9868 + }, + { + "epoch": 3.8656211410224746, + "grad_norm": 0.4929894857650072, + "learning_rate": 4.578706671103998e-06, + "loss": 0.4811, + "step": 9869 + }, + { + "epoch": 3.866016300321067, + "grad_norm": 0.5010690182198145, + "learning_rate": 4.578619627625803e-06, + "loss": 0.4644, + "step": 9870 + }, + { + "epoch": 3.866411459619659, + "grad_norm": 0.48769438400413784, + "learning_rate": 4.578532575984053e-06, + "loss": 0.4517, + "step": 9871 + }, + { + "epoch": 3.8668066189182513, + "grad_norm": 0.49383384006167214, + "learning_rate": 4.5784455161790895e-06, + "loss": 0.4656, + "step": 9872 + }, + { + "epoch": 3.8672017782168435, + "grad_norm": 0.4810665135549552, + "learning_rate": 4.578358448211253e-06, + "loss": 0.4454, + "step": 9873 + }, + { + "epoch": 3.8675969375154358, + "grad_norm": 0.48861310498403004, + "learning_rate": 4.578271372080888e-06, + "loss": 0.4605, + "step": 9874 + }, + { + "epoch": 3.867992096814028, + "grad_norm": 0.49304147135775594, + "learning_rate": 4.5781842877883335e-06, + "loss": 0.4559, + "step": 9875 + }, + { + "epoch": 3.8683872561126202, + "grad_norm": 0.4901516373069663, + "learning_rate": 4.578097195333935e-06, + "loss": 0.4488, + "step": 9876 + }, + { + "epoch": 3.8687824154112125, + "grad_norm": 0.47112670377763216, + "learning_rate": 4.57801009471803e-06, + "loss": 0.4447, + "step": 9877 + }, + { + "epoch": 3.8691775747098047, + "grad_norm": 0.48837740071566554, + "learning_rate": 4.577922985940965e-06, + "loss": 0.457, + "step": 9878 + }, + { + "epoch": 3.869572734008397, + "grad_norm": 0.4965735724057039, + "learning_rate": 4.57783586900308e-06, + "loss": 0.4508, + "step": 9879 + }, + { + "epoch": 3.869967893306989, + "grad_norm": 0.4896825295302401, + "learning_rate": 4.577748743904717e-06, + "loss": 0.4706, + "step": 9880 + }, + { + "epoch": 3.8703630526055814, + "grad_norm": 0.48477203855856776, + "learning_rate": 4.577661610646219e-06, + "loss": 0.4486, + "step": 9881 + }, + { + "epoch": 3.8707582119041737, + "grad_norm": 0.4772573464731468, + "learning_rate": 4.577574469227928e-06, + "loss": 0.4639, + "step": 9882 + }, + { + "epoch": 3.871153371202766, + "grad_norm": 0.47562723439852267, + "learning_rate": 4.577487319650186e-06, + "loss": 0.4738, + "step": 9883 + }, + { + "epoch": 3.871548530501358, + "grad_norm": 0.48073672972235093, + "learning_rate": 4.577400161913335e-06, + "loss": 0.4711, + "step": 9884 + }, + { + "epoch": 3.8719436897999504, + "grad_norm": 0.4937434966089234, + "learning_rate": 4.577312996017718e-06, + "loss": 0.4678, + "step": 9885 + }, + { + "epoch": 3.8723388490985426, + "grad_norm": 0.5076968652598326, + "learning_rate": 4.5772258219636765e-06, + "loss": 0.4633, + "step": 9886 + }, + { + "epoch": 3.872734008397135, + "grad_norm": 0.4945776677023478, + "learning_rate": 4.5771386397515535e-06, + "loss": 0.4664, + "step": 9887 + }, + { + "epoch": 3.873129167695727, + "grad_norm": 0.49792744846750037, + "learning_rate": 4.577051449381691e-06, + "loss": 0.4664, + "step": 9888 + }, + { + "epoch": 3.8735243269943194, + "grad_norm": 0.48812305721614563, + "learning_rate": 4.576964250854432e-06, + "loss": 0.4683, + "step": 9889 + }, + { + "epoch": 3.8739194862929116, + "grad_norm": 0.48682855568601446, + "learning_rate": 4.5768770441701184e-06, + "loss": 0.4594, + "step": 9890 + }, + { + "epoch": 3.874314645591504, + "grad_norm": 0.48935463641111665, + "learning_rate": 4.576789829329093e-06, + "loss": 0.4752, + "step": 9891 + }, + { + "epoch": 3.874709804890096, + "grad_norm": 0.5341391098277775, + "learning_rate": 4.576702606331698e-06, + "loss": 0.4527, + "step": 9892 + }, + { + "epoch": 3.8751049641886883, + "grad_norm": 0.49067023477198496, + "learning_rate": 4.5766153751782775e-06, + "loss": 0.4546, + "step": 9893 + }, + { + "epoch": 3.8755001234872806, + "grad_norm": 0.4953575355699284, + "learning_rate": 4.576528135869171e-06, + "loss": 0.4609, + "step": 9894 + }, + { + "epoch": 3.875895282785873, + "grad_norm": 0.5064570635778317, + "learning_rate": 4.576440888404724e-06, + "loss": 0.4615, + "step": 9895 + }, + { + "epoch": 3.876290442084465, + "grad_norm": 0.4852237088041872, + "learning_rate": 4.576353632785278e-06, + "loss": 0.4504, + "step": 9896 + }, + { + "epoch": 3.8766856013830573, + "grad_norm": 0.4896826011480754, + "learning_rate": 4.576266369011175e-06, + "loss": 0.46, + "step": 9897 + }, + { + "epoch": 3.8770807606816495, + "grad_norm": 0.48478132345598957, + "learning_rate": 4.576179097082759e-06, + "loss": 0.4634, + "step": 9898 + }, + { + "epoch": 3.8774759199802418, + "grad_norm": 0.49074759563963166, + "learning_rate": 4.576091817000372e-06, + "loss": 0.4631, + "step": 9899 + }, + { + "epoch": 3.877871079278834, + "grad_norm": 0.5444614925031207, + "learning_rate": 4.576004528764358e-06, + "loss": 0.4646, + "step": 9900 + }, + { + "epoch": 3.8782662385774263, + "grad_norm": 0.4886325205119538, + "learning_rate": 4.575917232375058e-06, + "loss": 0.4634, + "step": 9901 + }, + { + "epoch": 3.878661397876019, + "grad_norm": 0.49095261197771417, + "learning_rate": 4.575829927832816e-06, + "loss": 0.465, + "step": 9902 + }, + { + "epoch": 3.879056557174611, + "grad_norm": 0.49252455851339666, + "learning_rate": 4.575742615137973e-06, + "loss": 0.4713, + "step": 9903 + }, + { + "epoch": 3.8794517164732034, + "grad_norm": 0.5467856159576922, + "learning_rate": 4.575655294290875e-06, + "loss": 0.479, + "step": 9904 + }, + { + "epoch": 3.8798468757717957, + "grad_norm": 0.49019837168996383, + "learning_rate": 4.575567965291864e-06, + "loss": 0.4722, + "step": 9905 + }, + { + "epoch": 3.880242035070388, + "grad_norm": 0.5341818933608717, + "learning_rate": 4.575480628141281e-06, + "loss": 0.4527, + "step": 9906 + }, + { + "epoch": 3.88063719436898, + "grad_norm": 0.4913343796849772, + "learning_rate": 4.575393282839471e-06, + "loss": 0.473, + "step": 9907 + }, + { + "epoch": 3.8810323536675724, + "grad_norm": 0.48134695756624346, + "learning_rate": 4.575305929386776e-06, + "loss": 0.4438, + "step": 9908 + }, + { + "epoch": 3.8814275129661646, + "grad_norm": 0.5029217370787735, + "learning_rate": 4.57521856778354e-06, + "loss": 0.4536, + "step": 9909 + }, + { + "epoch": 3.881822672264757, + "grad_norm": 0.4807948637994951, + "learning_rate": 4.5751311980301064e-06, + "loss": 0.4405, + "step": 9910 + }, + { + "epoch": 3.882217831563349, + "grad_norm": 0.47962412886103184, + "learning_rate": 4.5750438201268165e-06, + "loss": 0.4619, + "step": 9911 + }, + { + "epoch": 3.8826129908619413, + "grad_norm": 0.49393796080145874, + "learning_rate": 4.574956434074014e-06, + "loss": 0.4589, + "step": 9912 + }, + { + "epoch": 3.8830081501605336, + "grad_norm": 0.4946297200387813, + "learning_rate": 4.574869039872044e-06, + "loss": 0.4811, + "step": 9913 + }, + { + "epoch": 3.883403309459126, + "grad_norm": 0.5028163739738554, + "learning_rate": 4.574781637521247e-06, + "loss": 0.4621, + "step": 9914 + }, + { + "epoch": 3.883798468757718, + "grad_norm": 0.4846491793605942, + "learning_rate": 4.5746942270219686e-06, + "loss": 0.461, + "step": 9915 + }, + { + "epoch": 3.8841936280563103, + "grad_norm": 0.5158987625564292, + "learning_rate": 4.574606808374551e-06, + "loss": 0.4566, + "step": 9916 + }, + { + "epoch": 3.8845887873549025, + "grad_norm": 0.4833525715221507, + "learning_rate": 4.574519381579337e-06, + "loss": 0.4601, + "step": 9917 + }, + { + "epoch": 3.884983946653495, + "grad_norm": 0.5076912429801129, + "learning_rate": 4.574431946636671e-06, + "loss": 0.4689, + "step": 9918 + }, + { + "epoch": 3.885379105952087, + "grad_norm": 0.4863106371729655, + "learning_rate": 4.574344503546896e-06, + "loss": 0.4532, + "step": 9919 + }, + { + "epoch": 3.8857742652506793, + "grad_norm": 0.49043260914664416, + "learning_rate": 4.5742570523103555e-06, + "loss": 0.4664, + "step": 9920 + }, + { + "epoch": 3.8861694245492715, + "grad_norm": 0.5123972555223707, + "learning_rate": 4.574169592927392e-06, + "loss": 0.4707, + "step": 9921 + }, + { + "epoch": 3.8865645838478637, + "grad_norm": 0.47548790562191307, + "learning_rate": 4.5740821253983505e-06, + "loss": 0.4617, + "step": 9922 + }, + { + "epoch": 3.886959743146456, + "grad_norm": 0.489095797302651, + "learning_rate": 4.573994649723575e-06, + "loss": 0.4711, + "step": 9923 + }, + { + "epoch": 3.887354902445048, + "grad_norm": 0.48921262328932935, + "learning_rate": 4.573907165903406e-06, + "loss": 0.4555, + "step": 9924 + }, + { + "epoch": 3.8877500617436405, + "grad_norm": 0.49537271238528213, + "learning_rate": 4.57381967393819e-06, + "loss": 0.4711, + "step": 9925 + }, + { + "epoch": 3.8881452210422327, + "grad_norm": 0.5259888620610288, + "learning_rate": 4.573732173828269e-06, + "loss": 0.4623, + "step": 9926 + }, + { + "epoch": 3.888540380340825, + "grad_norm": 0.48638729424662486, + "learning_rate": 4.573644665573987e-06, + "loss": 0.4726, + "step": 9927 + }, + { + "epoch": 3.888935539639417, + "grad_norm": 0.503911496590745, + "learning_rate": 4.573557149175689e-06, + "loss": 0.4711, + "step": 9928 + }, + { + "epoch": 3.8893306989380094, + "grad_norm": 0.47957742258960145, + "learning_rate": 4.573469624633717e-06, + "loss": 0.4574, + "step": 9929 + }, + { + "epoch": 3.8897258582366017, + "grad_norm": 0.49719338473032687, + "learning_rate": 4.5733820919484165e-06, + "loss": 0.4637, + "step": 9930 + }, + { + "epoch": 3.890121017535194, + "grad_norm": 0.49630282899421835, + "learning_rate": 4.573294551120129e-06, + "loss": 0.4621, + "step": 9931 + }, + { + "epoch": 3.890516176833786, + "grad_norm": 0.48727643540534094, + "learning_rate": 4.573207002149199e-06, + "loss": 0.4585, + "step": 9932 + }, + { + "epoch": 3.8909113361323784, + "grad_norm": 0.4809569000720417, + "learning_rate": 4.573119445035972e-06, + "loss": 0.4586, + "step": 9933 + }, + { + "epoch": 3.8913064954309706, + "grad_norm": 0.47571185854215003, + "learning_rate": 4.57303187978079e-06, + "loss": 0.4683, + "step": 9934 + }, + { + "epoch": 3.891701654729563, + "grad_norm": 0.48390173237622336, + "learning_rate": 4.5729443063839984e-06, + "loss": 0.4748, + "step": 9935 + }, + { + "epoch": 3.892096814028155, + "grad_norm": 0.49110153521074534, + "learning_rate": 4.57285672484594e-06, + "loss": 0.4597, + "step": 9936 + }, + { + "epoch": 3.8924919733267473, + "grad_norm": 0.4952646122846938, + "learning_rate": 4.572769135166959e-06, + "loss": 0.4827, + "step": 9937 + }, + { + "epoch": 3.8928871326253396, + "grad_norm": 0.4823294860273062, + "learning_rate": 4.572681537347398e-06, + "loss": 0.47, + "step": 9938 + }, + { + "epoch": 3.893282291923932, + "grad_norm": 0.48698397822277945, + "learning_rate": 4.572593931387604e-06, + "loss": 0.4602, + "step": 9939 + }, + { + "epoch": 3.893677451222524, + "grad_norm": 0.48400546395610927, + "learning_rate": 4.572506317287921e-06, + "loss": 0.4481, + "step": 9940 + }, + { + "epoch": 3.8940726105211163, + "grad_norm": 0.48847518203566603, + "learning_rate": 4.572418695048689e-06, + "loss": 0.4573, + "step": 9941 + }, + { + "epoch": 3.8944677698197085, + "grad_norm": 0.4854571876960505, + "learning_rate": 4.572331064670257e-06, + "loss": 0.4626, + "step": 9942 + }, + { + "epoch": 3.894862929118301, + "grad_norm": 0.47898682929482006, + "learning_rate": 4.572243426152965e-06, + "loss": 0.44, + "step": 9943 + }, + { + "epoch": 3.895258088416893, + "grad_norm": 0.491532931524844, + "learning_rate": 4.57215577949716e-06, + "loss": 0.4807, + "step": 9944 + }, + { + "epoch": 3.8956532477154853, + "grad_norm": 0.47872724576808984, + "learning_rate": 4.572068124703185e-06, + "loss": 0.4603, + "step": 9945 + }, + { + "epoch": 3.8960484070140775, + "grad_norm": 0.45451419510929936, + "learning_rate": 4.571980461771386e-06, + "loss": 0.458, + "step": 9946 + }, + { + "epoch": 3.8964435663126697, + "grad_norm": 0.4762087573565171, + "learning_rate": 4.571892790702105e-06, + "loss": 0.4539, + "step": 9947 + }, + { + "epoch": 3.896838725611262, + "grad_norm": 0.501694628462472, + "learning_rate": 4.571805111495687e-06, + "loss": 0.4593, + "step": 9948 + }, + { + "epoch": 3.8972338849098542, + "grad_norm": 0.49270030012761595, + "learning_rate": 4.571717424152476e-06, + "loss": 0.4679, + "step": 9949 + }, + { + "epoch": 3.8976290442084465, + "grad_norm": 0.4925249641495412, + "learning_rate": 4.5716297286728184e-06, + "loss": 0.4769, + "step": 9950 + }, + { + "epoch": 3.8980242035070387, + "grad_norm": 0.4858992891832617, + "learning_rate": 4.571542025057057e-06, + "loss": 0.4795, + "step": 9951 + }, + { + "epoch": 3.898419362805631, + "grad_norm": 0.4786925181433357, + "learning_rate": 4.571454313305536e-06, + "loss": 0.4415, + "step": 9952 + }, + { + "epoch": 3.898814522104223, + "grad_norm": 0.48023530140553916, + "learning_rate": 4.5713665934186005e-06, + "loss": 0.4498, + "step": 9953 + }, + { + "epoch": 3.8992096814028154, + "grad_norm": 0.48453549831266507, + "learning_rate": 4.571278865396594e-06, + "loss": 0.4644, + "step": 9954 + }, + { + "epoch": 3.8996048407014077, + "grad_norm": 0.4900914360178932, + "learning_rate": 4.571191129239863e-06, + "loss": 0.474, + "step": 9955 + }, + { + "epoch": 3.9, + "grad_norm": 0.5041682526200453, + "learning_rate": 4.57110338494875e-06, + "loss": 0.4554, + "step": 9956 + }, + { + "epoch": 3.900395159298592, + "grad_norm": 0.5410219421410024, + "learning_rate": 4.571015632523601e-06, + "loss": 0.4793, + "step": 9957 + }, + { + "epoch": 3.9007903185971844, + "grad_norm": 0.4825869143751156, + "learning_rate": 4.57092787196476e-06, + "loss": 0.482, + "step": 9958 + }, + { + "epoch": 3.9011854778957766, + "grad_norm": 0.48251364045941925, + "learning_rate": 4.5708401032725725e-06, + "loss": 0.4482, + "step": 9959 + }, + { + "epoch": 3.901580637194369, + "grad_norm": 0.47369158864484645, + "learning_rate": 4.570752326447382e-06, + "loss": 0.4565, + "step": 9960 + }, + { + "epoch": 3.901975796492961, + "grad_norm": 0.4777236188587447, + "learning_rate": 4.5706645414895335e-06, + "loss": 0.4592, + "step": 9961 + }, + { + "epoch": 3.9023709557915534, + "grad_norm": 0.4818832878308696, + "learning_rate": 4.5705767483993725e-06, + "loss": 0.4561, + "step": 9962 + }, + { + "epoch": 3.9027661150901456, + "grad_norm": 0.4728202329441403, + "learning_rate": 4.570488947177243e-06, + "loss": 0.4611, + "step": 9963 + }, + { + "epoch": 3.903161274388738, + "grad_norm": 0.4774824700271554, + "learning_rate": 4.570401137823491e-06, + "loss": 0.4667, + "step": 9964 + }, + { + "epoch": 3.90355643368733, + "grad_norm": 0.5201325910426959, + "learning_rate": 4.57031332033846e-06, + "loss": 0.4556, + "step": 9965 + }, + { + "epoch": 3.9039515929859223, + "grad_norm": 0.49100204946745024, + "learning_rate": 4.570225494722495e-06, + "loss": 0.4605, + "step": 9966 + }, + { + "epoch": 3.9043467522845146, + "grad_norm": 0.4933350216968649, + "learning_rate": 4.5701376609759415e-06, + "loss": 0.465, + "step": 9967 + }, + { + "epoch": 3.904741911583107, + "grad_norm": 0.49608467793393873, + "learning_rate": 4.570049819099145e-06, + "loss": 0.4673, + "step": 9968 + }, + { + "epoch": 3.905137070881699, + "grad_norm": 0.49885302185060626, + "learning_rate": 4.569961969092449e-06, + "loss": 0.4731, + "step": 9969 + }, + { + "epoch": 3.9055322301802913, + "grad_norm": 0.4765724417728314, + "learning_rate": 4.569874110956201e-06, + "loss": 0.4664, + "step": 9970 + }, + { + "epoch": 3.9059273894788835, + "grad_norm": 0.48345495768456714, + "learning_rate": 4.569786244690743e-06, + "loss": 0.4747, + "step": 9971 + }, + { + "epoch": 3.9063225487774758, + "grad_norm": 0.48003753403167, + "learning_rate": 4.569698370296421e-06, + "loss": 0.445, + "step": 9972 + }, + { + "epoch": 3.9067177080760684, + "grad_norm": 0.4923494525260739, + "learning_rate": 4.5696104877735815e-06, + "loss": 0.4575, + "step": 9973 + }, + { + "epoch": 3.9071128673746607, + "grad_norm": 0.5211429128554534, + "learning_rate": 4.569522597122569e-06, + "loss": 0.4637, + "step": 9974 + }, + { + "epoch": 3.907508026673253, + "grad_norm": 0.4818937227907757, + "learning_rate": 4.569434698343727e-06, + "loss": 0.4429, + "step": 9975 + }, + { + "epoch": 3.907903185971845, + "grad_norm": 0.5046848124987253, + "learning_rate": 4.569346791437403e-06, + "loss": 0.4938, + "step": 9976 + }, + { + "epoch": 3.9082983452704374, + "grad_norm": 0.5027445720769897, + "learning_rate": 4.5692588764039415e-06, + "loss": 0.4727, + "step": 9977 + }, + { + "epoch": 3.9086935045690296, + "grad_norm": 0.5039278354823773, + "learning_rate": 4.569170953243688e-06, + "loss": 0.4559, + "step": 9978 + }, + { + "epoch": 3.909088663867622, + "grad_norm": 0.4843847065341945, + "learning_rate": 4.569083021956987e-06, + "loss": 0.4564, + "step": 9979 + }, + { + "epoch": 3.909483823166214, + "grad_norm": 0.49170737209975196, + "learning_rate": 4.568995082544184e-06, + "loss": 0.4752, + "step": 9980 + }, + { + "epoch": 3.9098789824648064, + "grad_norm": 0.4784679535834229, + "learning_rate": 4.568907135005625e-06, + "loss": 0.4719, + "step": 9981 + }, + { + "epoch": 3.9102741417633986, + "grad_norm": 0.48953143430090623, + "learning_rate": 4.5688191793416545e-06, + "loss": 0.4604, + "step": 9982 + }, + { + "epoch": 3.910669301061991, + "grad_norm": 0.5026359012231226, + "learning_rate": 4.56873121555262e-06, + "loss": 0.4494, + "step": 9983 + }, + { + "epoch": 3.911064460360583, + "grad_norm": 0.48889537939386374, + "learning_rate": 4.568643243638864e-06, + "loss": 0.4452, + "step": 9984 + }, + { + "epoch": 3.9114596196591753, + "grad_norm": 0.4968415694067955, + "learning_rate": 4.568555263600735e-06, + "loss": 0.4645, + "step": 9985 + }, + { + "epoch": 3.9118547789577676, + "grad_norm": 0.4877243861936003, + "learning_rate": 4.568467275438575e-06, + "loss": 0.4587, + "step": 9986 + }, + { + "epoch": 3.91224993825636, + "grad_norm": 0.49223731516425107, + "learning_rate": 4.568379279152733e-06, + "loss": 0.459, + "step": 9987 + }, + { + "epoch": 3.912645097554952, + "grad_norm": 0.49250738243308856, + "learning_rate": 4.568291274743553e-06, + "loss": 0.4778, + "step": 9988 + }, + { + "epoch": 3.9130402568535443, + "grad_norm": 0.5009850640273527, + "learning_rate": 4.56820326221138e-06, + "loss": 0.4748, + "step": 9989 + }, + { + "epoch": 3.9134354161521365, + "grad_norm": 0.4961840767254751, + "learning_rate": 4.568115241556562e-06, + "loss": 0.4531, + "step": 9990 + }, + { + "epoch": 3.9138305754507288, + "grad_norm": 0.48894424493050087, + "learning_rate": 4.5680272127794424e-06, + "loss": 0.4451, + "step": 9991 + }, + { + "epoch": 3.914225734749321, + "grad_norm": 0.4811481982683269, + "learning_rate": 4.567939175880367e-06, + "loss": 0.4579, + "step": 9992 + }, + { + "epoch": 3.9146208940479132, + "grad_norm": 0.4924442883893573, + "learning_rate": 4.567851130859683e-06, + "loss": 0.4714, + "step": 9993 + }, + { + "epoch": 3.9150160533465055, + "grad_norm": 0.5035466446279564, + "learning_rate": 4.567763077717735e-06, + "loss": 0.4947, + "step": 9994 + }, + { + "epoch": 3.9154112126450977, + "grad_norm": 0.4995580198600491, + "learning_rate": 4.56767501645487e-06, + "loss": 0.4711, + "step": 9995 + }, + { + "epoch": 3.91580637194369, + "grad_norm": 0.495893119902799, + "learning_rate": 4.5675869470714314e-06, + "loss": 0.4749, + "step": 9996 + }, + { + "epoch": 3.916201531242282, + "grad_norm": 0.4723772655931205, + "learning_rate": 4.567498869567769e-06, + "loss": 0.4362, + "step": 9997 + }, + { + "epoch": 3.9165966905408744, + "grad_norm": 0.4718587728404546, + "learning_rate": 4.567410783944225e-06, + "loss": 0.4638, + "step": 9998 + }, + { + "epoch": 3.9169918498394667, + "grad_norm": 0.4928852611306575, + "learning_rate": 4.567322690201147e-06, + "loss": 0.4487, + "step": 9999 + }, + { + "epoch": 3.917387009138059, + "grad_norm": 0.47713196411455694, + "learning_rate": 4.5672345883388816e-06, + "loss": 0.455, + "step": 10000 + }, + { + "epoch": 3.917782168436651, + "grad_norm": 0.49819690808918476, + "learning_rate": 4.567146478357773e-06, + "loss": 0.4426, + "step": 10001 + }, + { + "epoch": 3.9181773277352434, + "grad_norm": 0.5062977312521194, + "learning_rate": 4.5670583602581685e-06, + "loss": 0.4653, + "step": 10002 + }, + { + "epoch": 3.9185724870338356, + "grad_norm": 0.5003782785606375, + "learning_rate": 4.566970234040415e-06, + "loss": 0.4615, + "step": 10003 + }, + { + "epoch": 3.918967646332428, + "grad_norm": 0.5189545106568986, + "learning_rate": 4.566882099704857e-06, + "loss": 0.4731, + "step": 10004 + }, + { + "epoch": 3.91936280563102, + "grad_norm": 0.5035785229251792, + "learning_rate": 4.566793957251841e-06, + "loss": 0.4779, + "step": 10005 + }, + { + "epoch": 3.9197579649296124, + "grad_norm": 0.4800415368289926, + "learning_rate": 4.566705806681712e-06, + "loss": 0.4539, + "step": 10006 + }, + { + "epoch": 3.9201531242282046, + "grad_norm": 0.4882424905193737, + "learning_rate": 4.56661764799482e-06, + "loss": 0.4584, + "step": 10007 + }, + { + "epoch": 3.920548283526797, + "grad_norm": 0.47308810535562973, + "learning_rate": 4.566529481191507e-06, + "loss": 0.4741, + "step": 10008 + }, + { + "epoch": 3.920943442825389, + "grad_norm": 0.4867276304401251, + "learning_rate": 4.566441306272123e-06, + "loss": 0.4541, + "step": 10009 + }, + { + "epoch": 3.9213386021239813, + "grad_norm": 0.6343188880548906, + "learning_rate": 4.56635312323701e-06, + "loss": 0.4842, + "step": 10010 + }, + { + "epoch": 3.9217337614225736, + "grad_norm": 0.5071802879941949, + "learning_rate": 4.5662649320865186e-06, + "loss": 0.489, + "step": 10011 + }, + { + "epoch": 3.922128920721166, + "grad_norm": 0.5255039030846776, + "learning_rate": 4.566176732820991e-06, + "loss": 0.4627, + "step": 10012 + }, + { + "epoch": 3.922524080019758, + "grad_norm": 0.4971011698886485, + "learning_rate": 4.566088525440778e-06, + "loss": 0.4599, + "step": 10013 + }, + { + "epoch": 3.9229192393183503, + "grad_norm": 0.4852264543790252, + "learning_rate": 4.566000309946223e-06, + "loss": 0.4602, + "step": 10014 + }, + { + "epoch": 3.9233143986169425, + "grad_norm": 0.49830525957051336, + "learning_rate": 4.565912086337674e-06, + "loss": 0.4736, + "step": 10015 + }, + { + "epoch": 3.9237095579155348, + "grad_norm": 0.4773373018311242, + "learning_rate": 4.565823854615477e-06, + "loss": 0.466, + "step": 10016 + }, + { + "epoch": 3.924104717214127, + "grad_norm": 0.4831436103078164, + "learning_rate": 4.565735614779977e-06, + "loss": 0.4643, + "step": 10017 + }, + { + "epoch": 3.9244998765127193, + "grad_norm": 0.4802473860556205, + "learning_rate": 4.565647366831522e-06, + "loss": 0.4721, + "step": 10018 + }, + { + "epoch": 3.9248950358113115, + "grad_norm": 0.49874045500383063, + "learning_rate": 4.5655591107704595e-06, + "loss": 0.4702, + "step": 10019 + }, + { + "epoch": 3.9252901951099037, + "grad_norm": 0.47831567102450434, + "learning_rate": 4.565470846597135e-06, + "loss": 0.474, + "step": 10020 + }, + { + "epoch": 3.925685354408496, + "grad_norm": 0.49992203811316205, + "learning_rate": 4.565382574311894e-06, + "loss": 0.4655, + "step": 10021 + }, + { + "epoch": 3.926080513707088, + "grad_norm": 0.4926517107007104, + "learning_rate": 4.565294293915086e-06, + "loss": 0.4675, + "step": 10022 + }, + { + "epoch": 3.9264756730056805, + "grad_norm": 0.5084730405938702, + "learning_rate": 4.565206005407055e-06, + "loss": 0.4822, + "step": 10023 + }, + { + "epoch": 3.9268708323042727, + "grad_norm": 0.4931236099060291, + "learning_rate": 4.565117708788149e-06, + "loss": 0.4437, + "step": 10024 + }, + { + "epoch": 3.927265991602865, + "grad_norm": 0.483301233803827, + "learning_rate": 4.565029404058715e-06, + "loss": 0.4709, + "step": 10025 + }, + { + "epoch": 3.927661150901457, + "grad_norm": 0.47819265448253306, + "learning_rate": 4.5649410912191e-06, + "loss": 0.4658, + "step": 10026 + }, + { + "epoch": 3.9280563102000494, + "grad_norm": 0.4822872193597864, + "learning_rate": 4.564852770269648e-06, + "loss": 0.4385, + "step": 10027 + }, + { + "epoch": 3.9284514694986417, + "grad_norm": 0.48138130491052755, + "learning_rate": 4.5647644412107104e-06, + "loss": 0.4603, + "step": 10028 + }, + { + "epoch": 3.928846628797234, + "grad_norm": 0.4945851911204728, + "learning_rate": 4.564676104042631e-06, + "loss": 0.4505, + "step": 10029 + }, + { + "epoch": 3.929241788095826, + "grad_norm": 0.48293991390518787, + "learning_rate": 4.564587758765759e-06, + "loss": 0.45, + "step": 10030 + }, + { + "epoch": 3.9296369473944184, + "grad_norm": 0.5024502851465005, + "learning_rate": 4.5644994053804384e-06, + "loss": 0.4603, + "step": 10031 + }, + { + "epoch": 3.9300321066930106, + "grad_norm": 0.47694264905270695, + "learning_rate": 4.564411043887018e-06, + "loss": 0.4547, + "step": 10032 + }, + { + "epoch": 3.930427265991603, + "grad_norm": 0.5039734950596827, + "learning_rate": 4.564322674285845e-06, + "loss": 0.4551, + "step": 10033 + }, + { + "epoch": 3.930822425290195, + "grad_norm": 0.48332955949025447, + "learning_rate": 4.564234296577266e-06, + "loss": 0.4636, + "step": 10034 + }, + { + "epoch": 3.9312175845887873, + "grad_norm": 0.487981969360117, + "learning_rate": 4.564145910761627e-06, + "loss": 0.4659, + "step": 10035 + }, + { + "epoch": 3.9316127438873796, + "grad_norm": 0.4884183172019331, + "learning_rate": 4.564057516839277e-06, + "loss": 0.4776, + "step": 10036 + }, + { + "epoch": 3.932007903185972, + "grad_norm": 0.484407990579762, + "learning_rate": 4.563969114810563e-06, + "loss": 0.4549, + "step": 10037 + }, + { + "epoch": 3.932403062484564, + "grad_norm": 0.49302390973656574, + "learning_rate": 4.563880704675831e-06, + "loss": 0.461, + "step": 10038 + }, + { + "epoch": 3.9327982217831563, + "grad_norm": 0.4824245546460462, + "learning_rate": 4.56379228643543e-06, + "loss": 0.4456, + "step": 10039 + }, + { + "epoch": 3.9331933810817485, + "grad_norm": 0.524281421582032, + "learning_rate": 4.563703860089705e-06, + "loss": 0.4631, + "step": 10040 + }, + { + "epoch": 3.933588540380341, + "grad_norm": 0.48699863426336437, + "learning_rate": 4.563615425639005e-06, + "loss": 0.4554, + "step": 10041 + }, + { + "epoch": 3.933983699678933, + "grad_norm": 0.4956289418570272, + "learning_rate": 4.5635269830836764e-06, + "loss": 0.4691, + "step": 10042 + }, + { + "epoch": 3.9343788589775253, + "grad_norm": 0.48487716834135475, + "learning_rate": 4.563438532424067e-06, + "loss": 0.4785, + "step": 10043 + }, + { + "epoch": 3.9347740182761175, + "grad_norm": 0.47820983244486903, + "learning_rate": 4.563350073660524e-06, + "loss": 0.4445, + "step": 10044 + }, + { + "epoch": 3.9351691775747097, + "grad_norm": 0.5073931859582581, + "learning_rate": 4.5632616067933944e-06, + "loss": 0.4777, + "step": 10045 + }, + { + "epoch": 3.935564336873302, + "grad_norm": 0.473757508440239, + "learning_rate": 4.563173131823026e-06, + "loss": 0.4494, + "step": 10046 + }, + { + "epoch": 3.9359594961718942, + "grad_norm": 0.4807056489036854, + "learning_rate": 4.563084648749767e-06, + "loss": 0.4452, + "step": 10047 + }, + { + "epoch": 3.9363546554704865, + "grad_norm": 0.4878554376029113, + "learning_rate": 4.562996157573964e-06, + "loss": 0.4536, + "step": 10048 + }, + { + "epoch": 3.9367498147690787, + "grad_norm": 0.5087371100450915, + "learning_rate": 4.562907658295966e-06, + "loss": 0.488, + "step": 10049 + }, + { + "epoch": 3.937144974067671, + "grad_norm": 0.48438840021575363, + "learning_rate": 4.562819150916118e-06, + "loss": 0.4857, + "step": 10050 + }, + { + "epoch": 3.937540133366263, + "grad_norm": 0.49057136917157657, + "learning_rate": 4.562730635434768e-06, + "loss": 0.4643, + "step": 10051 + }, + { + "epoch": 3.9379352926648554, + "grad_norm": 0.47526482048284263, + "learning_rate": 4.562642111852266e-06, + "loss": 0.4482, + "step": 10052 + }, + { + "epoch": 3.9383304519634477, + "grad_norm": 0.5014131338073492, + "learning_rate": 4.562553580168958e-06, + "loss": 0.4565, + "step": 10053 + }, + { + "epoch": 3.93872561126204, + "grad_norm": 0.5031712717781439, + "learning_rate": 4.562465040385193e-06, + "loss": 0.4495, + "step": 10054 + }, + { + "epoch": 3.939120770560632, + "grad_norm": 1.0950220813960667, + "learning_rate": 4.562376492501316e-06, + "loss": 0.4635, + "step": 10055 + }, + { + "epoch": 3.9395159298592244, + "grad_norm": 0.4839972284867313, + "learning_rate": 4.5622879365176775e-06, + "loss": 0.4449, + "step": 10056 + }, + { + "epoch": 3.9399110891578166, + "grad_norm": 0.4914974711515712, + "learning_rate": 4.562199372434624e-06, + "loss": 0.4587, + "step": 10057 + }, + { + "epoch": 3.940306248456409, + "grad_norm": 0.4876711211009558, + "learning_rate": 4.562110800252504e-06, + "loss": 0.4591, + "step": 10058 + }, + { + "epoch": 3.940701407755001, + "grad_norm": 0.47546627363978317, + "learning_rate": 4.5620222199716645e-06, + "loss": 0.4416, + "step": 10059 + }, + { + "epoch": 3.9410965670535933, + "grad_norm": 0.48586322704048324, + "learning_rate": 4.561933631592453e-06, + "loss": 0.4549, + "step": 10060 + }, + { + "epoch": 3.9414917263521856, + "grad_norm": 0.4926171275617641, + "learning_rate": 4.56184503511522e-06, + "loss": 0.4598, + "step": 10061 + }, + { + "epoch": 3.941886885650778, + "grad_norm": 0.5123664239699524, + "learning_rate": 4.561756430540311e-06, + "loss": 0.4851, + "step": 10062 + }, + { + "epoch": 3.94228204494937, + "grad_norm": 0.5199067760226548, + "learning_rate": 4.5616678178680744e-06, + "loss": 0.4807, + "step": 10063 + }, + { + "epoch": 3.9426772042479623, + "grad_norm": 0.49024172756251366, + "learning_rate": 4.561579197098858e-06, + "loss": 0.4632, + "step": 10064 + }, + { + "epoch": 3.9430723635465545, + "grad_norm": 0.4814153973868991, + "learning_rate": 4.561490568233013e-06, + "loss": 0.4495, + "step": 10065 + }, + { + "epoch": 3.943467522845147, + "grad_norm": 0.47441139134775145, + "learning_rate": 4.561401931270882e-06, + "loss": 0.4604, + "step": 10066 + }, + { + "epoch": 3.943862682143739, + "grad_norm": 0.4924678869233399, + "learning_rate": 4.561313286212817e-06, + "loss": 0.4699, + "step": 10067 + }, + { + "epoch": 3.9442578414423313, + "grad_norm": 0.48654953928643213, + "learning_rate": 4.561224633059166e-06, + "loss": 0.4489, + "step": 10068 + }, + { + "epoch": 3.9446530007409235, + "grad_norm": 0.4963720302187141, + "learning_rate": 4.561135971810275e-06, + "loss": 0.4721, + "step": 10069 + }, + { + "epoch": 3.9450481600395157, + "grad_norm": 0.5091750814580799, + "learning_rate": 4.561047302466494e-06, + "loss": 0.4736, + "step": 10070 + }, + { + "epoch": 3.945443319338108, + "grad_norm": 0.49308737751715986, + "learning_rate": 4.56095862502817e-06, + "loss": 0.4517, + "step": 10071 + }, + { + "epoch": 3.9458384786367002, + "grad_norm": 0.4895075330718553, + "learning_rate": 4.5608699394956525e-06, + "loss": 0.4472, + "step": 10072 + }, + { + "epoch": 3.9462336379352925, + "grad_norm": 0.5031481365671037, + "learning_rate": 4.56078124586929e-06, + "loss": 0.4716, + "step": 10073 + }, + { + "epoch": 3.9466287972338847, + "grad_norm": 0.4990715567557606, + "learning_rate": 4.560692544149429e-06, + "loss": 0.4452, + "step": 10074 + }, + { + "epoch": 3.947023956532477, + "grad_norm": 0.4795562000811165, + "learning_rate": 4.56060383433642e-06, + "loss": 0.4639, + "step": 10075 + }, + { + "epoch": 3.947419115831069, + "grad_norm": 0.4809738041038454, + "learning_rate": 4.5605151164306095e-06, + "loss": 0.4589, + "step": 10076 + }, + { + "epoch": 3.9478142751296614, + "grad_norm": 0.5003235218268587, + "learning_rate": 4.5604263904323474e-06, + "loss": 0.4782, + "step": 10077 + }, + { + "epoch": 3.9482094344282537, + "grad_norm": 0.49029440459937, + "learning_rate": 4.560337656341981e-06, + "loss": 0.45, + "step": 10078 + }, + { + "epoch": 3.948604593726846, + "grad_norm": 0.48335540139031397, + "learning_rate": 4.560248914159861e-06, + "loss": 0.4601, + "step": 10079 + }, + { + "epoch": 3.948999753025438, + "grad_norm": 0.4893171254147366, + "learning_rate": 4.560160163886332e-06, + "loss": 0.4486, + "step": 10080 + }, + { + "epoch": 3.9493949123240304, + "grad_norm": 0.48236278214675543, + "learning_rate": 4.560071405521746e-06, + "loss": 0.4755, + "step": 10081 + }, + { + "epoch": 3.9497900716226226, + "grad_norm": 0.49023533737652236, + "learning_rate": 4.559982639066451e-06, + "loss": 0.457, + "step": 10082 + }, + { + "epoch": 3.950185230921215, + "grad_norm": 0.4812818608327351, + "learning_rate": 4.559893864520795e-06, + "loss": 0.4616, + "step": 10083 + }, + { + "epoch": 3.950580390219807, + "grad_norm": 0.4809646923115125, + "learning_rate": 4.559805081885126e-06, + "loss": 0.4535, + "step": 10084 + }, + { + "epoch": 3.9509755495183994, + "grad_norm": 0.5242179398310719, + "learning_rate": 4.559716291159793e-06, + "loss": 0.4589, + "step": 10085 + }, + { + "epoch": 3.9513707088169916, + "grad_norm": 0.48289980979582037, + "learning_rate": 4.559627492345147e-06, + "loss": 0.4572, + "step": 10086 + }, + { + "epoch": 3.951765868115584, + "grad_norm": 0.4805787354967715, + "learning_rate": 4.5595386854415335e-06, + "loss": 0.4507, + "step": 10087 + }, + { + "epoch": 3.952161027414176, + "grad_norm": 0.4853162285383262, + "learning_rate": 4.5594498704493025e-06, + "loss": 0.473, + "step": 10088 + }, + { + "epoch": 3.9525561867127683, + "grad_norm": 0.5148358522338808, + "learning_rate": 4.559361047368803e-06, + "loss": 0.4558, + "step": 10089 + }, + { + "epoch": 3.9529513460113606, + "grad_norm": 0.49389057275002457, + "learning_rate": 4.559272216200385e-06, + "loss": 0.4529, + "step": 10090 + }, + { + "epoch": 3.9533465053099532, + "grad_norm": 0.4800782868223698, + "learning_rate": 4.559183376944395e-06, + "loss": 0.4654, + "step": 10091 + }, + { + "epoch": 3.9537416646085455, + "grad_norm": 0.49900896659440425, + "learning_rate": 4.559094529601183e-06, + "loss": 0.4511, + "step": 10092 + }, + { + "epoch": 3.9541368239071377, + "grad_norm": 0.5014788964906836, + "learning_rate": 4.5590056741711e-06, + "loss": 0.4565, + "step": 10093 + }, + { + "epoch": 3.95453198320573, + "grad_norm": 0.49370738821746696, + "learning_rate": 4.558916810654491e-06, + "loss": 0.442, + "step": 10094 + }, + { + "epoch": 3.954927142504322, + "grad_norm": 0.49314453829677907, + "learning_rate": 4.558827939051707e-06, + "loss": 0.4664, + "step": 10095 + }, + { + "epoch": 3.9553223018029144, + "grad_norm": 0.4850261607507114, + "learning_rate": 4.558739059363098e-06, + "loss": 0.4599, + "step": 10096 + }, + { + "epoch": 3.9557174611015067, + "grad_norm": 0.5150383652631196, + "learning_rate": 4.558650171589012e-06, + "loss": 0.479, + "step": 10097 + }, + { + "epoch": 3.956112620400099, + "grad_norm": 0.49222732950123294, + "learning_rate": 4.558561275729798e-06, + "loss": 0.46, + "step": 10098 + }, + { + "epoch": 3.956507779698691, + "grad_norm": 0.5036146912323076, + "learning_rate": 4.558472371785805e-06, + "loss": 0.476, + "step": 10099 + }, + { + "epoch": 3.9569029389972834, + "grad_norm": 0.49324709847980996, + "learning_rate": 4.558383459757383e-06, + "loss": 0.4721, + "step": 10100 + }, + { + "epoch": 3.9572980982958756, + "grad_norm": 0.4688106876677952, + "learning_rate": 4.55829453964488e-06, + "loss": 0.4553, + "step": 10101 + }, + { + "epoch": 3.957693257594468, + "grad_norm": 0.4864105983815226, + "learning_rate": 4.558205611448646e-06, + "loss": 0.4507, + "step": 10102 + }, + { + "epoch": 3.95808841689306, + "grad_norm": 0.5127931976516101, + "learning_rate": 4.5581166751690306e-06, + "loss": 0.4646, + "step": 10103 + }, + { + "epoch": 3.9584835761916524, + "grad_norm": 0.493192790264334, + "learning_rate": 4.558027730806383e-06, + "loss": 0.4634, + "step": 10104 + }, + { + "epoch": 3.9588787354902446, + "grad_norm": 0.4995732048308892, + "learning_rate": 4.557938778361052e-06, + "loss": 0.4685, + "step": 10105 + }, + { + "epoch": 3.959273894788837, + "grad_norm": 0.4833683966329439, + "learning_rate": 4.557849817833386e-06, + "loss": 0.4548, + "step": 10106 + }, + { + "epoch": 3.959669054087429, + "grad_norm": 0.5172522955731031, + "learning_rate": 4.5577608492237365e-06, + "loss": 0.4625, + "step": 10107 + }, + { + "epoch": 3.9600642133860213, + "grad_norm": 0.48501688273496174, + "learning_rate": 4.557671872532452e-06, + "loss": 0.4537, + "step": 10108 + }, + { + "epoch": 3.9604593726846136, + "grad_norm": 0.5012142376692591, + "learning_rate": 4.557582887759881e-06, + "loss": 0.464, + "step": 10109 + }, + { + "epoch": 3.960854531983206, + "grad_norm": 0.49905859888997944, + "learning_rate": 4.557493894906375e-06, + "loss": 0.4688, + "step": 10110 + }, + { + "epoch": 3.961249691281798, + "grad_norm": 0.48977789559433366, + "learning_rate": 4.5574048939722825e-06, + "loss": 0.4545, + "step": 10111 + }, + { + "epoch": 3.9616448505803903, + "grad_norm": 0.47820284946504593, + "learning_rate": 4.557315884957952e-06, + "loss": 0.4509, + "step": 10112 + }, + { + "epoch": 3.9620400098789825, + "grad_norm": 0.5033813687269575, + "learning_rate": 4.557226867863734e-06, + "loss": 0.4541, + "step": 10113 + }, + { + "epoch": 3.9624351691775748, + "grad_norm": 0.48559157024559013, + "learning_rate": 4.5571378426899784e-06, + "loss": 0.4678, + "step": 10114 + }, + { + "epoch": 3.962830328476167, + "grad_norm": 0.48951951068737737, + "learning_rate": 4.557048809437034e-06, + "loss": 0.4826, + "step": 10115 + }, + { + "epoch": 3.9632254877747592, + "grad_norm": 0.5107355330569413, + "learning_rate": 4.556959768105253e-06, + "loss": 0.4643, + "step": 10116 + }, + { + "epoch": 3.9636206470733515, + "grad_norm": 0.4827625154057386, + "learning_rate": 4.556870718694981e-06, + "loss": 0.4592, + "step": 10117 + }, + { + "epoch": 3.9640158063719437, + "grad_norm": 0.5016649652795582, + "learning_rate": 4.55678166120657e-06, + "loss": 0.4668, + "step": 10118 + }, + { + "epoch": 3.964410965670536, + "grad_norm": 0.4945290431200768, + "learning_rate": 4.55669259564037e-06, + "loss": 0.4726, + "step": 10119 + }, + { + "epoch": 3.964806124969128, + "grad_norm": 0.48945605735322806, + "learning_rate": 4.55660352199673e-06, + "loss": 0.4581, + "step": 10120 + } + ], + "logging_steps": 1, + "max_steps": 50600, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 2530, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.540274358583296e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}