diff --git "a/data/mos-mamba-6x130m-trainer-sft/checkpoint-116500/trainer_state.json" "b/data/mos-mamba-6x130m-trainer-sft/checkpoint-116500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/data/mos-mamba-6x130m-trainer-sft/checkpoint-116500/trainer_state.json" @@ -0,0 +1,54395 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.22686643895637543, + "eval_steps": 500, + "global_step": 116500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 2.9210271110262932e-05, + "grad_norm": 5.397985935211182, + "learning_rate": 2.9999999936841446e-05, + "loss": 2.1156, + "step": 15 + }, + { + "epoch": 5.8420542220525864e-05, + "grad_norm": 3.999190092086792, + "learning_rate": 2.999999974736578e-05, + "loss": 2.5268, + "step": 30 + }, + { + "epoch": 8.76308133307888e-05, + "grad_norm": 5.580333709716797, + "learning_rate": 2.999999943157301e-05, + "loss": 2.3955, + "step": 45 + }, + { + "epoch": 0.00011684108444105173, + "grad_norm": 3.6752560138702393, + "learning_rate": 2.9999998989463132e-05, + "loss": 2.171, + "step": 60 + }, + { + "epoch": 0.00014605135555131467, + "grad_norm": 2.655461072921753, + "learning_rate": 2.9999998421036153e-05, + "loss": 2.188, + "step": 75 + }, + { + "epoch": 0.0001752616266615776, + "grad_norm": 2.572371006011963, + "learning_rate": 2.9999997726292083e-05, + "loss": 2.1977, + "step": 90 + }, + { + "epoch": 0.00020447189777184053, + "grad_norm": 8.544811248779297, + "learning_rate": 2.999999690523092e-05, + "loss": 2.1871, + "step": 105 + }, + { + "epoch": 0.00023368216888210345, + "grad_norm": 3.530529737472534, + "learning_rate": 2.999999595785267e-05, + "loss": 2.2238, + "step": 120 + }, + { + "epoch": 0.00026289243999236636, + "grad_norm": 3.671182155609131, + "learning_rate": 2.9999994884157345e-05, + "loss": 2.1338, + "step": 135 + }, + { + "epoch": 0.00029210271110262934, + "grad_norm": 3.9643137454986572, + "learning_rate": 2.9999993684144956e-05, + "loss": 2.0788, + "step": 150 + }, + { + "epoch": 0.00032131298221289227, + "grad_norm": 2.62164044380188, + "learning_rate": 2.9999992357815508e-05, + "loss": 2.117, + "step": 165 + }, + { + "epoch": 0.0003505232533231552, + "grad_norm": 4.189517021179199, + "learning_rate": 2.999999090516902e-05, + "loss": 2.262, + "step": 180 + }, + { + "epoch": 0.0003797335244334181, + "grad_norm": 2.880622625350952, + "learning_rate": 2.9999989326205494e-05, + "loss": 2.1102, + "step": 195 + }, + { + "epoch": 0.00040894379554368105, + "grad_norm": 5.852839469909668, + "learning_rate": 2.9999987620924948e-05, + "loss": 2.0684, + "step": 210 + }, + { + "epoch": 0.000438154066653944, + "grad_norm": 3.6339924335479736, + "learning_rate": 2.9999985789327394e-05, + "loss": 2.2518, + "step": 225 + }, + { + "epoch": 0.0004673643377642069, + "grad_norm": 2.743267297744751, + "learning_rate": 2.9999983831412858e-05, + "loss": 2.1236, + "step": 240 + }, + { + "epoch": 0.0004965746088744698, + "grad_norm": 3.9490363597869873, + "learning_rate": 2.9999981747181345e-05, + "loss": 2.0878, + "step": 255 + }, + { + "epoch": 0.0005257848799847327, + "grad_norm": 3.0342047214508057, + "learning_rate": 2.9999979536632872e-05, + "loss": 2.1415, + "step": 270 + }, + { + "epoch": 0.0005549951510949957, + "grad_norm": 4.045855522155762, + "learning_rate": 2.9999977199767467e-05, + "loss": 2.0558, + "step": 285 + }, + { + "epoch": 0.0005842054222052587, + "grad_norm": 4.325068950653076, + "learning_rate": 2.999997473658514e-05, + "loss": 2.0617, + "step": 300 + }, + { + "epoch": 0.0006134156933155216, + "grad_norm": 4.770871162414551, + "learning_rate": 2.999997214708592e-05, + "loss": 2.0922, + "step": 315 + }, + { + "epoch": 0.0006426259644257845, + "grad_norm": 4.1802754402160645, + "learning_rate": 2.999996943126982e-05, + "loss": 2.217, + "step": 330 + }, + { + "epoch": 0.0006718362355360475, + "grad_norm": 3.044260263442993, + "learning_rate": 2.999996658913687e-05, + "loss": 2.0508, + "step": 345 + }, + { + "epoch": 0.0007010465066463104, + "grad_norm": 4.1525187492370605, + "learning_rate": 2.9999963620687095e-05, + "loss": 2.1678, + "step": 360 + }, + { + "epoch": 0.0007302567777565733, + "grad_norm": 2.5676231384277344, + "learning_rate": 2.9999960525920515e-05, + "loss": 2.1693, + "step": 375 + }, + { + "epoch": 0.0007594670488668362, + "grad_norm": 3.880307674407959, + "learning_rate": 2.9999957304837156e-05, + "loss": 1.9008, + "step": 390 + }, + { + "epoch": 0.0007886773199770992, + "grad_norm": 3.3433754444122314, + "learning_rate": 2.999995395743705e-05, + "loss": 2.0734, + "step": 405 + }, + { + "epoch": 0.0008178875910873621, + "grad_norm": 3.451378583908081, + "learning_rate": 2.999995048372022e-05, + "loss": 1.9901, + "step": 420 + }, + { + "epoch": 0.000847097862197625, + "grad_norm": 3.4411580562591553, + "learning_rate": 2.9999946883686695e-05, + "loss": 1.9734, + "step": 435 + }, + { + "epoch": 0.000876308133307888, + "grad_norm": 3.514651298522949, + "learning_rate": 2.999994315733651e-05, + "loss": 2.1632, + "step": 450 + }, + { + "epoch": 0.0009055184044181509, + "grad_norm": 5.380824565887451, + "learning_rate": 2.999993930466969e-05, + "loss": 2.1534, + "step": 465 + }, + { + "epoch": 0.0009347286755284138, + "grad_norm": 3.768618106842041, + "learning_rate": 2.999993532568628e-05, + "loss": 2.2182, + "step": 480 + }, + { + "epoch": 0.0009639389466386767, + "grad_norm": 2.9490509033203125, + "learning_rate": 2.99999312203863e-05, + "loss": 1.9677, + "step": 495 + }, + { + "epoch": 0.0009931492177489396, + "grad_norm": 3.4881184101104736, + "learning_rate": 2.9999926988769788e-05, + "loss": 2.0786, + "step": 510 + }, + { + "epoch": 0.0010223594888592025, + "grad_norm": 3.788675546646118, + "learning_rate": 2.9999922630836784e-05, + "loss": 1.9696, + "step": 525 + }, + { + "epoch": 0.0010515697599694654, + "grad_norm": 3.070878744125366, + "learning_rate": 2.999991814658732e-05, + "loss": 1.9053, + "step": 540 + }, + { + "epoch": 0.0010807800310797286, + "grad_norm": 2.456150531768799, + "learning_rate": 2.9999913536021436e-05, + "loss": 1.9813, + "step": 555 + }, + { + "epoch": 0.0011099903021899915, + "grad_norm": 4.81670618057251, + "learning_rate": 2.999990879913917e-05, + "loss": 1.9317, + "step": 570 + }, + { + "epoch": 0.0011392005733002544, + "grad_norm": 4.014492988586426, + "learning_rate": 2.9999903935940567e-05, + "loss": 1.9709, + "step": 585 + }, + { + "epoch": 0.0011684108444105174, + "grad_norm": 4.765859603881836, + "learning_rate": 2.999989894642566e-05, + "loss": 1.9252, + "step": 600 + }, + { + "epoch": 0.0011976211155207803, + "grad_norm": 3.5599727630615234, + "learning_rate": 2.9999893830594492e-05, + "loss": 2.0175, + "step": 615 + }, + { + "epoch": 0.0012268313866310432, + "grad_norm": 4.441170692443848, + "learning_rate": 2.999988858844711e-05, + "loss": 2.1166, + "step": 630 + }, + { + "epoch": 0.0012560416577413061, + "grad_norm": 2.691765308380127, + "learning_rate": 2.9999883219983558e-05, + "loss": 1.991, + "step": 645 + }, + { + "epoch": 0.001285251928851569, + "grad_norm": 4.039743423461914, + "learning_rate": 2.9999877725203878e-05, + "loss": 1.9917, + "step": 660 + }, + { + "epoch": 0.001314462199961832, + "grad_norm": 3.0612881183624268, + "learning_rate": 2.9999872104108115e-05, + "loss": 2.0436, + "step": 675 + }, + { + "epoch": 0.001343672471072095, + "grad_norm": 3.3579518795013428, + "learning_rate": 2.9999866356696326e-05, + "loss": 1.9659, + "step": 690 + }, + { + "epoch": 0.0013728827421823579, + "grad_norm": 4.145946502685547, + "learning_rate": 2.9999860482968552e-05, + "loss": 2.0267, + "step": 705 + }, + { + "epoch": 0.0014020930132926208, + "grad_norm": 5.213028430938721, + "learning_rate": 2.999985448292484e-05, + "loss": 2.0953, + "step": 720 + }, + { + "epoch": 0.0014313032844028837, + "grad_norm": 2.440793752670288, + "learning_rate": 2.9999848356565246e-05, + "loss": 2.0218, + "step": 735 + }, + { + "epoch": 0.0014605135555131466, + "grad_norm": 2.6167280673980713, + "learning_rate": 2.9999842103889813e-05, + "loss": 2.0638, + "step": 750 + }, + { + "epoch": 0.0014897238266234096, + "grad_norm": 2.2680811882019043, + "learning_rate": 2.999983572489861e-05, + "loss": 1.973, + "step": 765 + }, + { + "epoch": 0.0015189340977336725, + "grad_norm": 2.905947208404541, + "learning_rate": 2.999982921959167e-05, + "loss": 1.9346, + "step": 780 + }, + { + "epoch": 0.0015481443688439354, + "grad_norm": 4.673079967498779, + "learning_rate": 2.999982258796907e-05, + "loss": 2.1666, + "step": 795 + }, + { + "epoch": 0.0015773546399541984, + "grad_norm": 2.5963408946990967, + "learning_rate": 2.9999815830030846e-05, + "loss": 2.1063, + "step": 810 + }, + { + "epoch": 0.0016065649110644613, + "grad_norm": 3.656632661819458, + "learning_rate": 2.9999808945777066e-05, + "loss": 2.1328, + "step": 825 + }, + { + "epoch": 0.0016357751821747242, + "grad_norm": 4.4109368324279785, + "learning_rate": 2.9999801935207786e-05, + "loss": 1.9602, + "step": 840 + }, + { + "epoch": 0.0016649854532849871, + "grad_norm": 2.10768985748291, + "learning_rate": 2.9999794798323065e-05, + "loss": 1.9766, + "step": 855 + }, + { + "epoch": 0.00169419572439525, + "grad_norm": 4.301672458648682, + "learning_rate": 2.999978753512296e-05, + "loss": 1.9386, + "step": 870 + }, + { + "epoch": 0.001723405995505513, + "grad_norm": 8.171302795410156, + "learning_rate": 2.9999780145607538e-05, + "loss": 2.1288, + "step": 885 + }, + { + "epoch": 0.001752616266615776, + "grad_norm": 3.4784069061279297, + "learning_rate": 2.999977262977685e-05, + "loss": 2.1346, + "step": 900 + }, + { + "epoch": 0.0017818265377260389, + "grad_norm": 2.9945054054260254, + "learning_rate": 2.9999764987630976e-05, + "loss": 2.0432, + "step": 915 + }, + { + "epoch": 0.0018110368088363018, + "grad_norm": 3.632039785385132, + "learning_rate": 2.9999757219169964e-05, + "loss": 2.0642, + "step": 930 + }, + { + "epoch": 0.0018402470799465647, + "grad_norm": 3.9847047328948975, + "learning_rate": 2.999974932439389e-05, + "loss": 2.016, + "step": 945 + }, + { + "epoch": 0.0018694573510568276, + "grad_norm": 1.89451003074646, + "learning_rate": 2.9999741303302816e-05, + "loss": 2.0355, + "step": 960 + }, + { + "epoch": 0.0018986676221670906, + "grad_norm": 6.945189952850342, + "learning_rate": 2.9999733155896814e-05, + "loss": 1.97, + "step": 975 + }, + { + "epoch": 0.0019278778932773535, + "grad_norm": 3.347123384475708, + "learning_rate": 2.9999724882175947e-05, + "loss": 2.0421, + "step": 990 + }, + { + "epoch": 0.0019570881643876164, + "grad_norm": 4.331577777862549, + "learning_rate": 2.9999716482140292e-05, + "loss": 2.133, + "step": 1005 + }, + { + "epoch": 0.001986298435497879, + "grad_norm": 4.221049785614014, + "learning_rate": 2.999970795578991e-05, + "loss": 1.9272, + "step": 1020 + }, + { + "epoch": 0.0020155087066081423, + "grad_norm": 2.820549488067627, + "learning_rate": 2.9999699303124876e-05, + "loss": 2.0116, + "step": 1035 + }, + { + "epoch": 0.002044718977718405, + "grad_norm": 2.2391343116760254, + "learning_rate": 2.9999690524145268e-05, + "loss": 1.9337, + "step": 1050 + }, + { + "epoch": 0.002073929248828668, + "grad_norm": 2.285027503967285, + "learning_rate": 2.9999681618851156e-05, + "loss": 1.9773, + "step": 1065 + }, + { + "epoch": 0.002103139519938931, + "grad_norm": 3.266909599304199, + "learning_rate": 2.9999672587242616e-05, + "loss": 1.9471, + "step": 1080 + }, + { + "epoch": 0.002132349791049194, + "grad_norm": 5.03222131729126, + "learning_rate": 2.9999663429319722e-05, + "loss": 2.1337, + "step": 1095 + }, + { + "epoch": 0.002161560062159457, + "grad_norm": 2.044246196746826, + "learning_rate": 2.9999654145082552e-05, + "loss": 2.06, + "step": 1110 + }, + { + "epoch": 0.00219077033326972, + "grad_norm": 3.1399905681610107, + "learning_rate": 2.9999644734531183e-05, + "loss": 1.9174, + "step": 1125 + }, + { + "epoch": 0.002219980604379983, + "grad_norm": 3.2445454597473145, + "learning_rate": 2.99996351976657e-05, + "loss": 1.9728, + "step": 1140 + }, + { + "epoch": 0.0022491908754902457, + "grad_norm": 2.3682138919830322, + "learning_rate": 2.999962553448618e-05, + "loss": 1.9779, + "step": 1155 + }, + { + "epoch": 0.002278401146600509, + "grad_norm": 2.283808708190918, + "learning_rate": 2.99996157449927e-05, + "loss": 1.964, + "step": 1170 + }, + { + "epoch": 0.0023076114177107716, + "grad_norm": 3.048382043838501, + "learning_rate": 2.9999605829185353e-05, + "loss": 2.0908, + "step": 1185 + }, + { + "epoch": 0.0023368216888210347, + "grad_norm": 6.139120101928711, + "learning_rate": 2.999959578706421e-05, + "loss": 1.9535, + "step": 1200 + }, + { + "epoch": 0.0023660319599312974, + "grad_norm": 2.104069232940674, + "learning_rate": 2.9999585618629363e-05, + "loss": 1.8775, + "step": 1215 + }, + { + "epoch": 0.0023952422310415606, + "grad_norm": 2.531449317932129, + "learning_rate": 2.9999575323880894e-05, + "loss": 2.0309, + "step": 1230 + }, + { + "epoch": 0.0024244525021518233, + "grad_norm": 2.841078996658325, + "learning_rate": 2.9999564902818894e-05, + "loss": 2.0546, + "step": 1245 + }, + { + "epoch": 0.0024536627732620864, + "grad_norm": 2.0627243518829346, + "learning_rate": 2.9999554355443445e-05, + "loss": 1.8833, + "step": 1260 + }, + { + "epoch": 0.002482873044372349, + "grad_norm": 3.4014151096343994, + "learning_rate": 2.9999543681754645e-05, + "loss": 1.8935, + "step": 1275 + }, + { + "epoch": 0.0025120833154826123, + "grad_norm": 3.507380962371826, + "learning_rate": 2.999953288175257e-05, + "loss": 1.9512, + "step": 1290 + }, + { + "epoch": 0.002541293586592875, + "grad_norm": 5.4273362159729, + "learning_rate": 2.9999521955437325e-05, + "loss": 1.9794, + "step": 1305 + }, + { + "epoch": 0.002570503857703138, + "grad_norm": 4.466185092926025, + "learning_rate": 2.9999510902808993e-05, + "loss": 1.8266, + "step": 1320 + }, + { + "epoch": 0.002599714128813401, + "grad_norm": 2.4276440143585205, + "learning_rate": 2.9999499723867672e-05, + "loss": 2.0149, + "step": 1335 + }, + { + "epoch": 0.002628924399923664, + "grad_norm": 3.589876413345337, + "learning_rate": 2.9999488418613454e-05, + "loss": 1.8865, + "step": 1350 + }, + { + "epoch": 0.0026581346710339267, + "grad_norm": 3.750687599182129, + "learning_rate": 2.9999476987046434e-05, + "loss": 2.13, + "step": 1365 + }, + { + "epoch": 0.00268734494214419, + "grad_norm": 3.638850212097168, + "learning_rate": 2.999946542916671e-05, + "loss": 1.981, + "step": 1380 + }, + { + "epoch": 0.0027165552132544526, + "grad_norm": 4.610542297363281, + "learning_rate": 2.9999453744974375e-05, + "loss": 1.8887, + "step": 1395 + }, + { + "epoch": 0.0027457654843647157, + "grad_norm": 2.3758933544158936, + "learning_rate": 2.9999441934469534e-05, + "loss": 1.7867, + "step": 1410 + }, + { + "epoch": 0.0027749757554749784, + "grad_norm": 1.9892805814743042, + "learning_rate": 2.999942999765228e-05, + "loss": 1.8632, + "step": 1425 + }, + { + "epoch": 0.0028041860265852416, + "grad_norm": 2.5921459197998047, + "learning_rate": 2.999941793452272e-05, + "loss": 1.9155, + "step": 1440 + }, + { + "epoch": 0.0028333962976955043, + "grad_norm": 6.098895072937012, + "learning_rate": 2.9999405745080948e-05, + "loss": 2.0112, + "step": 1455 + }, + { + "epoch": 0.0028626065688057674, + "grad_norm": 2.7144930362701416, + "learning_rate": 2.9999393429327073e-05, + "loss": 2.0019, + "step": 1470 + }, + { + "epoch": 0.00289181683991603, + "grad_norm": 2.3964602947235107, + "learning_rate": 2.9999380987261195e-05, + "loss": 1.9524, + "step": 1485 + }, + { + "epoch": 0.0029210271110262933, + "grad_norm": 3.2644741535186768, + "learning_rate": 2.9999368418883422e-05, + "loss": 1.881, + "step": 1500 + }, + { + "epoch": 0.002950237382136556, + "grad_norm": 3.0930943489074707, + "learning_rate": 2.9999355724193854e-05, + "loss": 2.0001, + "step": 1515 + }, + { + "epoch": 0.002979447653246819, + "grad_norm": 2.6339657306671143, + "learning_rate": 2.9999342903192608e-05, + "loss": 1.9344, + "step": 1530 + }, + { + "epoch": 0.003008657924357082, + "grad_norm": 3.7254459857940674, + "learning_rate": 2.999932995587978e-05, + "loss": 2.0673, + "step": 1545 + }, + { + "epoch": 0.003037868195467345, + "grad_norm": 2.7078473567962646, + "learning_rate": 2.999931688225549e-05, + "loss": 2.0223, + "step": 1560 + }, + { + "epoch": 0.0030670784665776077, + "grad_norm": 2.7792718410491943, + "learning_rate": 2.999930368231984e-05, + "loss": 1.902, + "step": 1575 + }, + { + "epoch": 0.003096288737687871, + "grad_norm": 4.607854843139648, + "learning_rate": 2.999929035607294e-05, + "loss": 1.9536, + "step": 1590 + }, + { + "epoch": 0.0031254990087981336, + "grad_norm": 2.4656715393066406, + "learning_rate": 2.999927690351491e-05, + "loss": 2.1792, + "step": 1605 + }, + { + "epoch": 0.0031547092799083967, + "grad_norm": 2.8231770992279053, + "learning_rate": 2.9999263324645863e-05, + "loss": 1.8908, + "step": 1620 + }, + { + "epoch": 0.0031839195510186594, + "grad_norm": 3.5741395950317383, + "learning_rate": 2.9999249619465904e-05, + "loss": 1.9505, + "step": 1635 + }, + { + "epoch": 0.0032131298221289226, + "grad_norm": 2.0066606998443604, + "learning_rate": 2.999923578797516e-05, + "loss": 2.0346, + "step": 1650 + }, + { + "epoch": 0.0032423400932391853, + "grad_norm": 2.5830793380737305, + "learning_rate": 2.999922183017374e-05, + "loss": 1.9546, + "step": 1665 + }, + { + "epoch": 0.0032715503643494484, + "grad_norm": 2.8811633586883545, + "learning_rate": 2.999920774606176e-05, + "loss": 1.8517, + "step": 1680 + }, + { + "epoch": 0.003300760635459711, + "grad_norm": 3.0571601390838623, + "learning_rate": 2.999919353563935e-05, + "loss": 1.8746, + "step": 1695 + }, + { + "epoch": 0.0033299709065699743, + "grad_norm": 2.927371025085449, + "learning_rate": 2.9999179198906614e-05, + "loss": 2.0249, + "step": 1710 + }, + { + "epoch": 0.003359181177680237, + "grad_norm": 2.425579071044922, + "learning_rate": 2.9999164735863685e-05, + "loss": 1.9149, + "step": 1725 + }, + { + "epoch": 0.0033883914487905, + "grad_norm": 3.0711071491241455, + "learning_rate": 2.9999150146510678e-05, + "loss": 1.8335, + "step": 1740 + }, + { + "epoch": 0.003417601719900763, + "grad_norm": 4.509474277496338, + "learning_rate": 2.9999135430847718e-05, + "loss": 2.0002, + "step": 1755 + }, + { + "epoch": 0.003446811991011026, + "grad_norm": 2.784403085708618, + "learning_rate": 2.9999120588874927e-05, + "loss": 1.985, + "step": 1770 + }, + { + "epoch": 0.0034760222621212887, + "grad_norm": 4.556781768798828, + "learning_rate": 2.9999105620592434e-05, + "loss": 1.8832, + "step": 1785 + }, + { + "epoch": 0.003505232533231552, + "grad_norm": 3.1926143169403076, + "learning_rate": 2.9999090526000364e-05, + "loss": 2.0136, + "step": 1800 + }, + { + "epoch": 0.0035344428043418146, + "grad_norm": 3.627634286880493, + "learning_rate": 2.9999075305098846e-05, + "loss": 2.0413, + "step": 1815 + }, + { + "epoch": 0.0035636530754520777, + "grad_norm": 4.250141620635986, + "learning_rate": 2.9999059957887998e-05, + "loss": 1.9076, + "step": 1830 + }, + { + "epoch": 0.0035928633465623404, + "grad_norm": 4.263555526733398, + "learning_rate": 2.999904448436796e-05, + "loss": 1.9782, + "step": 1845 + }, + { + "epoch": 0.0036220736176726036, + "grad_norm": 2.5871047973632812, + "learning_rate": 2.999902888453886e-05, + "loss": 1.9703, + "step": 1860 + }, + { + "epoch": 0.0036512838887828663, + "grad_norm": 2.2850048542022705, + "learning_rate": 2.9999013158400827e-05, + "loss": 1.9571, + "step": 1875 + }, + { + "epoch": 0.0036804941598931294, + "grad_norm": 2.419032096862793, + "learning_rate": 2.9998997305953993e-05, + "loss": 2.1583, + "step": 1890 + }, + { + "epoch": 0.003709704431003392, + "grad_norm": 3.190084934234619, + "learning_rate": 2.9998981327198497e-05, + "loss": 1.9732, + "step": 1905 + }, + { + "epoch": 0.0037389147021136553, + "grad_norm": 3.31540846824646, + "learning_rate": 2.9998965222134468e-05, + "loss": 1.8646, + "step": 1920 + }, + { + "epoch": 0.003768124973223918, + "grad_norm": 2.423227310180664, + "learning_rate": 2.9998948990762044e-05, + "loss": 1.9282, + "step": 1935 + }, + { + "epoch": 0.003797335244334181, + "grad_norm": 3.4635472297668457, + "learning_rate": 2.999893263308136e-05, + "loss": 1.9859, + "step": 1950 + }, + { + "epoch": 0.003826545515444444, + "grad_norm": 2.474353790283203, + "learning_rate": 2.9998916149092556e-05, + "loss": 2.0055, + "step": 1965 + }, + { + "epoch": 0.003855755786554707, + "grad_norm": 3.264435052871704, + "learning_rate": 2.9998899538795766e-05, + "loss": 1.9755, + "step": 1980 + }, + { + "epoch": 0.0038849660576649697, + "grad_norm": 2.028916358947754, + "learning_rate": 2.9998882802191137e-05, + "loss": 1.9914, + "step": 1995 + }, + { + "epoch": 0.003914176328775233, + "grad_norm": 1.9189203977584839, + "learning_rate": 2.9998865939278805e-05, + "loss": 1.8109, + "step": 2010 + }, + { + "epoch": 0.0039433865998854956, + "grad_norm": 3.143857955932617, + "learning_rate": 2.9998848950058913e-05, + "loss": 1.8233, + "step": 2025 + }, + { + "epoch": 0.003972596870995758, + "grad_norm": 2.662853479385376, + "learning_rate": 2.9998831834531608e-05, + "loss": 2.0015, + "step": 2040 + }, + { + "epoch": 0.004001807142106022, + "grad_norm": 2.0305709838867188, + "learning_rate": 2.9998814592697027e-05, + "loss": 1.8817, + "step": 2055 + }, + { + "epoch": 0.0040310174132162846, + "grad_norm": 4.386982440948486, + "learning_rate": 2.999879722455532e-05, + "loss": 1.8511, + "step": 2070 + }, + { + "epoch": 0.004060227684326547, + "grad_norm": 3.3638317584991455, + "learning_rate": 2.9998779730106633e-05, + "loss": 1.9738, + "step": 2085 + }, + { + "epoch": 0.00408943795543681, + "grad_norm": 3.6872494220733643, + "learning_rate": 2.9998762109351107e-05, + "loss": 1.8496, + "step": 2100 + }, + { + "epoch": 0.0041186482265470736, + "grad_norm": 3.2654178142547607, + "learning_rate": 2.9998744362288902e-05, + "loss": 1.9003, + "step": 2115 + }, + { + "epoch": 0.004147858497657336, + "grad_norm": 3.4839179515838623, + "learning_rate": 2.9998726488920162e-05, + "loss": 2.016, + "step": 2130 + }, + { + "epoch": 0.004177068768767599, + "grad_norm": 2.537320613861084, + "learning_rate": 2.9998708489245034e-05, + "loss": 2.0073, + "step": 2145 + }, + { + "epoch": 0.004206279039877862, + "grad_norm": 3.2071378231048584, + "learning_rate": 2.999869036326367e-05, + "loss": 1.9835, + "step": 2160 + }, + { + "epoch": 0.004235489310988125, + "grad_norm": 2.7338645458221436, + "learning_rate": 2.999867211097623e-05, + "loss": 1.8995, + "step": 2175 + }, + { + "epoch": 0.004264699582098388, + "grad_norm": 4.866488933563232, + "learning_rate": 2.999865373238286e-05, + "loss": 1.9656, + "step": 2190 + }, + { + "epoch": 0.004293909853208651, + "grad_norm": 2.67726731300354, + "learning_rate": 2.9998635227483715e-05, + "loss": 2.0401, + "step": 2205 + }, + { + "epoch": 0.004323120124318914, + "grad_norm": 3.1271047592163086, + "learning_rate": 2.9998616596278955e-05, + "loss": 2.0406, + "step": 2220 + }, + { + "epoch": 0.004352330395429177, + "grad_norm": 5.673630237579346, + "learning_rate": 2.9998597838768738e-05, + "loss": 1.9733, + "step": 2235 + }, + { + "epoch": 0.00438154066653944, + "grad_norm": 2.696476697921753, + "learning_rate": 2.9998578954953216e-05, + "loss": 2.16, + "step": 2250 + }, + { + "epoch": 0.004410750937649702, + "grad_norm": 5.409951686859131, + "learning_rate": 2.9998559944832553e-05, + "loss": 1.9401, + "step": 2265 + }, + { + "epoch": 0.004439961208759966, + "grad_norm": 2.183530330657959, + "learning_rate": 2.9998540808406903e-05, + "loss": 1.9405, + "step": 2280 + }, + { + "epoch": 0.004469171479870229, + "grad_norm": 3.3076109886169434, + "learning_rate": 2.9998521545676438e-05, + "loss": 1.9617, + "step": 2295 + }, + { + "epoch": 0.004498381750980491, + "grad_norm": 2.793837547302246, + "learning_rate": 2.999850215664131e-05, + "loss": 1.9463, + "step": 2310 + }, + { + "epoch": 0.004527592022090754, + "grad_norm": 2.6732029914855957, + "learning_rate": 2.9998482641301687e-05, + "loss": 1.9114, + "step": 2325 + }, + { + "epoch": 0.004556802293201018, + "grad_norm": 2.1354994773864746, + "learning_rate": 2.999846299965773e-05, + "loss": 2.0321, + "step": 2340 + }, + { + "epoch": 0.00458601256431128, + "grad_norm": 3.889298439025879, + "learning_rate": 2.9998443231709608e-05, + "loss": 2.0722, + "step": 2355 + }, + { + "epoch": 0.004615222835421543, + "grad_norm": 3.6255850791931152, + "learning_rate": 2.9998423337457486e-05, + "loss": 2.0355, + "step": 2370 + }, + { + "epoch": 0.004644433106531806, + "grad_norm": 2.4627747535705566, + "learning_rate": 2.9998403316901533e-05, + "loss": 2.0076, + "step": 2385 + }, + { + "epoch": 0.004673643377642069, + "grad_norm": 3.3934834003448486, + "learning_rate": 2.9998383170041916e-05, + "loss": 1.9877, + "step": 2400 + }, + { + "epoch": 0.004702853648752332, + "grad_norm": 2.2475860118865967, + "learning_rate": 2.99983628968788e-05, + "loss": 1.8463, + "step": 2415 + }, + { + "epoch": 0.004732063919862595, + "grad_norm": 2.8804898262023926, + "learning_rate": 2.9998342497412365e-05, + "loss": 2.0943, + "step": 2430 + }, + { + "epoch": 0.0047612741909728576, + "grad_norm": 2.2566545009613037, + "learning_rate": 2.999832197164278e-05, + "loss": 1.8834, + "step": 2445 + }, + { + "epoch": 0.004790484462083121, + "grad_norm": 2.948420524597168, + "learning_rate": 2.9998301319570216e-05, + "loss": 2.002, + "step": 2460 + }, + { + "epoch": 0.004819694733193384, + "grad_norm": 3.6838879585266113, + "learning_rate": 2.999828054119484e-05, + "loss": 1.9071, + "step": 2475 + }, + { + "epoch": 0.0048489050043036466, + "grad_norm": 2.2608137130737305, + "learning_rate": 2.9998259636516845e-05, + "loss": 1.9241, + "step": 2490 + }, + { + "epoch": 0.004878115275413909, + "grad_norm": 2.5766665935516357, + "learning_rate": 2.999823860553639e-05, + "loss": 1.9269, + "step": 2505 + }, + { + "epoch": 0.004907325546524173, + "grad_norm": 3.382236957550049, + "learning_rate": 2.9998217448253658e-05, + "loss": 2.0165, + "step": 2520 + }, + { + "epoch": 0.0049365358176344356, + "grad_norm": 2.600278854370117, + "learning_rate": 2.999819616466883e-05, + "loss": 1.8852, + "step": 2535 + }, + { + "epoch": 0.004965746088744698, + "grad_norm": 2.8558318614959717, + "learning_rate": 2.999817475478208e-05, + "loss": 1.9202, + "step": 2550 + }, + { + "epoch": 0.004994956359854961, + "grad_norm": 2.5992493629455566, + "learning_rate": 2.9998153218593594e-05, + "loss": 2.0815, + "step": 2565 + }, + { + "epoch": 0.0050241666309652246, + "grad_norm": 3.0813794136047363, + "learning_rate": 2.9998131556103545e-05, + "loss": 2.0847, + "step": 2580 + }, + { + "epoch": 0.005053376902075487, + "grad_norm": 3.12187123298645, + "learning_rate": 2.999810976731213e-05, + "loss": 1.8592, + "step": 2595 + }, + { + "epoch": 0.00508258717318575, + "grad_norm": 3.4060111045837402, + "learning_rate": 2.9998087852219514e-05, + "loss": 1.8904, + "step": 2610 + }, + { + "epoch": 0.005111797444296013, + "grad_norm": 1.8444907665252686, + "learning_rate": 2.9998065810825895e-05, + "loss": 2.0493, + "step": 2625 + }, + { + "epoch": 0.005141007715406276, + "grad_norm": 2.9746735095977783, + "learning_rate": 2.999804364313145e-05, + "loss": 2.0021, + "step": 2640 + }, + { + "epoch": 0.005170217986516539, + "grad_norm": 2.150517463684082, + "learning_rate": 2.9998021349136373e-05, + "loss": 1.8938, + "step": 2655 + }, + { + "epoch": 0.005199428257626802, + "grad_norm": 2.4695417881011963, + "learning_rate": 2.9997998928840854e-05, + "loss": 1.956, + "step": 2670 + }, + { + "epoch": 0.005228638528737064, + "grad_norm": 3.365466356277466, + "learning_rate": 2.999797638224507e-05, + "loss": 1.8762, + "step": 2685 + }, + { + "epoch": 0.005257848799847328, + "grad_norm": 2.206486940383911, + "learning_rate": 2.999795370934922e-05, + "loss": 2.0777, + "step": 2700 + }, + { + "epoch": 0.005287059070957591, + "grad_norm": 2.5338962078094482, + "learning_rate": 2.9997930910153492e-05, + "loss": 1.9524, + "step": 2715 + }, + { + "epoch": 0.005316269342067853, + "grad_norm": 2.7835092544555664, + "learning_rate": 2.999790798465808e-05, + "loss": 1.9849, + "step": 2730 + }, + { + "epoch": 0.005345479613178116, + "grad_norm": 3.5604777336120605, + "learning_rate": 2.999788493286317e-05, + "loss": 1.9226, + "step": 2745 + }, + { + "epoch": 0.00537468988428838, + "grad_norm": 2.7719836235046387, + "learning_rate": 2.9997861754768965e-05, + "loss": 2.0174, + "step": 2760 + }, + { + "epoch": 0.005403900155398642, + "grad_norm": 2.0660643577575684, + "learning_rate": 2.999783845037566e-05, + "loss": 2.0959, + "step": 2775 + }, + { + "epoch": 0.005433110426508905, + "grad_norm": 2.501246690750122, + "learning_rate": 2.9997815019683443e-05, + "loss": 1.9407, + "step": 2790 + }, + { + "epoch": 0.005462320697619168, + "grad_norm": 3.6545281410217285, + "learning_rate": 2.9997791462692518e-05, + "loss": 2.0155, + "step": 2805 + }, + { + "epoch": 0.005491530968729431, + "grad_norm": 2.2655766010284424, + "learning_rate": 2.9997767779403085e-05, + "loss": 1.7096, + "step": 2820 + }, + { + "epoch": 0.005520741239839694, + "grad_norm": 1.9551692008972168, + "learning_rate": 2.9997743969815337e-05, + "loss": 2.0597, + "step": 2835 + }, + { + "epoch": 0.005549951510949957, + "grad_norm": 4.887934684753418, + "learning_rate": 2.999772003392948e-05, + "loss": 2.0615, + "step": 2850 + }, + { + "epoch": 0.0055791617820602196, + "grad_norm": 3.462581157684326, + "learning_rate": 2.999769597174571e-05, + "loss": 1.9574, + "step": 2865 + }, + { + "epoch": 0.005608372053170483, + "grad_norm": 4.445065498352051, + "learning_rate": 2.9997671783264234e-05, + "loss": 1.9514, + "step": 2880 + }, + { + "epoch": 0.005637582324280746, + "grad_norm": 2.81374454498291, + "learning_rate": 2.9997647468485254e-05, + "loss": 1.9163, + "step": 2895 + }, + { + "epoch": 0.0056667925953910086, + "grad_norm": 3.4871456623077393, + "learning_rate": 2.999762302740898e-05, + "loss": 1.966, + "step": 2910 + }, + { + "epoch": 0.005696002866501271, + "grad_norm": 2.006155490875244, + "learning_rate": 2.9997598460035608e-05, + "loss": 2.009, + "step": 2925 + }, + { + "epoch": 0.005725213137611535, + "grad_norm": 2.104846477508545, + "learning_rate": 2.9997573766365353e-05, + "loss": 1.9507, + "step": 2940 + }, + { + "epoch": 0.0057544234087217976, + "grad_norm": 3.3199880123138428, + "learning_rate": 2.999754894639842e-05, + "loss": 1.9102, + "step": 2955 + }, + { + "epoch": 0.00578363367983206, + "grad_norm": 3.4421074390411377, + "learning_rate": 2.9997524000135015e-05, + "loss": 1.9148, + "step": 2970 + }, + { + "epoch": 0.005812843950942323, + "grad_norm": 3.9583024978637695, + "learning_rate": 2.9997498927575352e-05, + "loss": 1.9872, + "step": 2985 + }, + { + "epoch": 0.0058420542220525866, + "grad_norm": 2.90704607963562, + "learning_rate": 2.999747372871964e-05, + "loss": 1.8581, + "step": 3000 + }, + { + "epoch": 0.005871264493162849, + "grad_norm": 2.327897548675537, + "learning_rate": 2.99974484035681e-05, + "loss": 2.155, + "step": 3015 + }, + { + "epoch": 0.005900474764273112, + "grad_norm": 2.7539587020874023, + "learning_rate": 2.999742295212093e-05, + "loss": 2.0412, + "step": 3030 + }, + { + "epoch": 0.005929685035383375, + "grad_norm": 3.2530441284179688, + "learning_rate": 2.999739737437835e-05, + "loss": 1.8964, + "step": 3045 + }, + { + "epoch": 0.005958895306493638, + "grad_norm": 4.038791179656982, + "learning_rate": 2.9997371670340583e-05, + "loss": 1.8992, + "step": 3060 + }, + { + "epoch": 0.005988105577603901, + "grad_norm": 3.810509443283081, + "learning_rate": 2.999734584000784e-05, + "loss": 2.0157, + "step": 3075 + }, + { + "epoch": 0.006017315848714164, + "grad_norm": 2.6701083183288574, + "learning_rate": 2.9997319883380334e-05, + "loss": 1.9206, + "step": 3090 + }, + { + "epoch": 0.006046526119824426, + "grad_norm": 2.0933597087860107, + "learning_rate": 2.999729380045829e-05, + "loss": 1.8986, + "step": 3105 + }, + { + "epoch": 0.00607573639093469, + "grad_norm": 7.184755802154541, + "learning_rate": 2.9997267591241924e-05, + "loss": 2.0928, + "step": 3120 + }, + { + "epoch": 0.006104946662044953, + "grad_norm": 4.04668664932251, + "learning_rate": 2.9997241255731465e-05, + "loss": 2.1005, + "step": 3135 + }, + { + "epoch": 0.006134156933155215, + "grad_norm": 2.4879837036132812, + "learning_rate": 2.9997214793927122e-05, + "loss": 2.0177, + "step": 3150 + }, + { + "epoch": 0.006163367204265478, + "grad_norm": 2.1891894340515137, + "learning_rate": 2.9997188205829127e-05, + "loss": 1.827, + "step": 3165 + }, + { + "epoch": 0.006192577475375742, + "grad_norm": 2.6386845111846924, + "learning_rate": 2.9997161491437696e-05, + "loss": 2.0118, + "step": 3180 + }, + { + "epoch": 0.006221787746486004, + "grad_norm": 2.792214870452881, + "learning_rate": 2.9997134650753066e-05, + "loss": 1.7433, + "step": 3195 + }, + { + "epoch": 0.006250998017596267, + "grad_norm": 3.470515727996826, + "learning_rate": 2.999710768377545e-05, + "loss": 2.0895, + "step": 3210 + }, + { + "epoch": 0.00628020828870653, + "grad_norm": 3.3043479919433594, + "learning_rate": 2.9997080590505085e-05, + "loss": 2.0023, + "step": 3225 + }, + { + "epoch": 0.006309418559816793, + "grad_norm": 2.1124866008758545, + "learning_rate": 2.9997053370942195e-05, + "loss": 2.0457, + "step": 3240 + }, + { + "epoch": 0.006338628830927056, + "grad_norm": 3.1867144107818604, + "learning_rate": 2.9997026025087004e-05, + "loss": 1.9503, + "step": 3255 + }, + { + "epoch": 0.006367839102037319, + "grad_norm": 2.3057665824890137, + "learning_rate": 2.999699855293975e-05, + "loss": 1.8968, + "step": 3270 + }, + { + "epoch": 0.0063970493731475815, + "grad_norm": 2.416084051132202, + "learning_rate": 2.999697095450066e-05, + "loss": 1.8304, + "step": 3285 + }, + { + "epoch": 0.006426259644257845, + "grad_norm": 2.481602191925049, + "learning_rate": 2.9996943229769977e-05, + "loss": 2.1386, + "step": 3300 + }, + { + "epoch": 0.006455469915368108, + "grad_norm": 2.4862520694732666, + "learning_rate": 2.9996915378747918e-05, + "loss": 1.9672, + "step": 3315 + }, + { + "epoch": 0.0064846801864783706, + "grad_norm": 3.704164743423462, + "learning_rate": 2.999688740143473e-05, + "loss": 1.9671, + "step": 3330 + }, + { + "epoch": 0.006513890457588634, + "grad_norm": 4.340814113616943, + "learning_rate": 2.999685929783064e-05, + "loss": 1.889, + "step": 3345 + }, + { + "epoch": 0.006543100728698897, + "grad_norm": 2.8202598094940186, + "learning_rate": 2.999683106793589e-05, + "loss": 1.883, + "step": 3360 + }, + { + "epoch": 0.0065723109998091596, + "grad_norm": 2.3717639446258545, + "learning_rate": 2.9996802711750716e-05, + "loss": 1.8932, + "step": 3375 + }, + { + "epoch": 0.006601521270919422, + "grad_norm": 2.1720776557922363, + "learning_rate": 2.999677422927536e-05, + "loss": 1.8262, + "step": 3390 + }, + { + "epoch": 0.006630731542029686, + "grad_norm": 6.15664005279541, + "learning_rate": 2.9996745620510055e-05, + "loss": 1.9139, + "step": 3405 + }, + { + "epoch": 0.0066599418131399486, + "grad_norm": 3.080832004547119, + "learning_rate": 2.9996716885455047e-05, + "loss": 1.9838, + "step": 3420 + }, + { + "epoch": 0.006689152084250211, + "grad_norm": 2.1853530406951904, + "learning_rate": 2.9996688024110577e-05, + "loss": 1.9353, + "step": 3435 + }, + { + "epoch": 0.006718362355360474, + "grad_norm": 2.609475612640381, + "learning_rate": 2.9996659036476886e-05, + "loss": 1.9734, + "step": 3450 + }, + { + "epoch": 0.0067475726264707376, + "grad_norm": 3.808016061782837, + "learning_rate": 2.9996629922554225e-05, + "loss": 1.9872, + "step": 3465 + }, + { + "epoch": 0.006776782897581, + "grad_norm": 4.373661994934082, + "learning_rate": 2.999660068234283e-05, + "loss": 1.8084, + "step": 3480 + }, + { + "epoch": 0.006805993168691263, + "grad_norm": 4.299337387084961, + "learning_rate": 2.9996571315842954e-05, + "loss": 2.0801, + "step": 3495 + }, + { + "epoch": 0.006835203439801526, + "grad_norm": 4.49350118637085, + "learning_rate": 2.999654182305484e-05, + "loss": 1.8882, + "step": 3510 + }, + { + "epoch": 0.006864413710911789, + "grad_norm": 2.319342613220215, + "learning_rate": 2.999651220397874e-05, + "loss": 1.9627, + "step": 3525 + }, + { + "epoch": 0.006893623982022052, + "grad_norm": 3.2572271823883057, + "learning_rate": 2.99964824586149e-05, + "loss": 1.9482, + "step": 3540 + }, + { + "epoch": 0.006922834253132315, + "grad_norm": 1.7764968872070312, + "learning_rate": 2.9996452586963575e-05, + "loss": 1.8938, + "step": 3555 + }, + { + "epoch": 0.006952044524242577, + "grad_norm": 2.2628538608551025, + "learning_rate": 2.9996422589025007e-05, + "loss": 1.9644, + "step": 3570 + }, + { + "epoch": 0.006981254795352841, + "grad_norm": 3.8122570514678955, + "learning_rate": 2.999639246479946e-05, + "loss": 1.7568, + "step": 3585 + }, + { + "epoch": 0.007010465066463104, + "grad_norm": 3.868978977203369, + "learning_rate": 2.999636221428718e-05, + "loss": 2.077, + "step": 3600 + }, + { + "epoch": 0.007039675337573366, + "grad_norm": 4.426783561706543, + "learning_rate": 2.999633183748843e-05, + "loss": 1.8161, + "step": 3615 + }, + { + "epoch": 0.007068885608683629, + "grad_norm": 2.3112306594848633, + "learning_rate": 2.9996301334403456e-05, + "loss": 1.8738, + "step": 3630 + }, + { + "epoch": 0.007098095879793893, + "grad_norm": 1.7659751176834106, + "learning_rate": 2.9996270705032523e-05, + "loss": 1.9714, + "step": 3645 + }, + { + "epoch": 0.007127306150904155, + "grad_norm": 5.0493364334106445, + "learning_rate": 2.9996239949375882e-05, + "loss": 1.9451, + "step": 3660 + }, + { + "epoch": 0.007156516422014418, + "grad_norm": 1.6928982734680176, + "learning_rate": 2.9996209067433794e-05, + "loss": 1.7872, + "step": 3675 + }, + { + "epoch": 0.007185726693124681, + "grad_norm": 2.2954142093658447, + "learning_rate": 2.9996178059206525e-05, + "loss": 2.0124, + "step": 3690 + }, + { + "epoch": 0.007214936964234944, + "grad_norm": 4.649162292480469, + "learning_rate": 2.9996146924694327e-05, + "loss": 1.823, + "step": 3705 + }, + { + "epoch": 0.007244147235345207, + "grad_norm": 2.248623847961426, + "learning_rate": 2.9996115663897468e-05, + "loss": 1.9382, + "step": 3720 + }, + { + "epoch": 0.00727335750645547, + "grad_norm": 3.033177375793457, + "learning_rate": 2.999608427681621e-05, + "loss": 2.0324, + "step": 3735 + }, + { + "epoch": 0.0073025677775657325, + "grad_norm": 3.4453887939453125, + "learning_rate": 2.9996052763450817e-05, + "loss": 1.9293, + "step": 3750 + }, + { + "epoch": 0.007331778048675996, + "grad_norm": 3.873504877090454, + "learning_rate": 2.9996021123801556e-05, + "loss": 2.0132, + "step": 3765 + }, + { + "epoch": 0.007360988319786259, + "grad_norm": 3.7370665073394775, + "learning_rate": 2.999598935786869e-05, + "loss": 1.8681, + "step": 3780 + }, + { + "epoch": 0.0073901985908965215, + "grad_norm": 5.5719523429870605, + "learning_rate": 2.999595746565249e-05, + "loss": 1.8799, + "step": 3795 + }, + { + "epoch": 0.007419408862006784, + "grad_norm": 3.7253758907318115, + "learning_rate": 2.9995925447153226e-05, + "loss": 1.8633, + "step": 3810 + }, + { + "epoch": 0.007448619133117048, + "grad_norm": 2.792862892150879, + "learning_rate": 2.9995893302371158e-05, + "loss": 1.9642, + "step": 3825 + }, + { + "epoch": 0.0074778294042273106, + "grad_norm": 2.693080425262451, + "learning_rate": 2.999586103130657e-05, + "loss": 1.8963, + "step": 3840 + }, + { + "epoch": 0.007507039675337573, + "grad_norm": 2.7681262493133545, + "learning_rate": 2.9995828633959724e-05, + "loss": 2.0581, + "step": 3855 + }, + { + "epoch": 0.007536249946447836, + "grad_norm": 3.635828733444214, + "learning_rate": 2.9995796110330894e-05, + "loss": 1.7659, + "step": 3870 + }, + { + "epoch": 0.0075654602175580996, + "grad_norm": 2.46408748626709, + "learning_rate": 2.9995763460420358e-05, + "loss": 1.7966, + "step": 3885 + }, + { + "epoch": 0.007594670488668362, + "grad_norm": 2.2896721363067627, + "learning_rate": 2.999573068422839e-05, + "loss": 1.9918, + "step": 3900 + }, + { + "epoch": 0.007623880759778625, + "grad_norm": 2.3533968925476074, + "learning_rate": 2.9995697781755262e-05, + "loss": 1.8725, + "step": 3915 + }, + { + "epoch": 0.007653091030888888, + "grad_norm": 2.5543251037597656, + "learning_rate": 2.999566475300125e-05, + "loss": 1.8896, + "step": 3930 + }, + { + "epoch": 0.007682301301999151, + "grad_norm": 3.2201671600341797, + "learning_rate": 2.999563159796665e-05, + "loss": 1.8444, + "step": 3945 + }, + { + "epoch": 0.007711511573109414, + "grad_norm": 2.2435178756713867, + "learning_rate": 2.9995598316651713e-05, + "loss": 1.976, + "step": 3960 + }, + { + "epoch": 0.007740721844219677, + "grad_norm": 3.219825506210327, + "learning_rate": 2.999556490905674e-05, + "loss": 1.8561, + "step": 3975 + }, + { + "epoch": 0.007769932115329939, + "grad_norm": 2.4510769844055176, + "learning_rate": 2.9995531375182008e-05, + "loss": 2.068, + "step": 3990 + }, + { + "epoch": 0.007799142386440203, + "grad_norm": 4.242166519165039, + "learning_rate": 2.999549771502779e-05, + "loss": 1.8497, + "step": 4005 + }, + { + "epoch": 0.007828352657550466, + "grad_norm": 2.9145870208740234, + "learning_rate": 2.9995463928594383e-05, + "loss": 1.8896, + "step": 4020 + }, + { + "epoch": 0.007857562928660728, + "grad_norm": 2.33396315574646, + "learning_rate": 2.9995430015882064e-05, + "loss": 1.9502, + "step": 4035 + }, + { + "epoch": 0.007886773199770991, + "grad_norm": 3.1073617935180664, + "learning_rate": 2.9995395976891118e-05, + "loss": 1.9178, + "step": 4050 + }, + { + "epoch": 0.007915983470881254, + "grad_norm": 3.444310188293457, + "learning_rate": 2.9995361811621838e-05, + "loss": 1.7477, + "step": 4065 + }, + { + "epoch": 0.007945193741991517, + "grad_norm": 3.131169080734253, + "learning_rate": 2.9995327520074504e-05, + "loss": 1.8799, + "step": 4080 + }, + { + "epoch": 0.007974404013101781, + "grad_norm": 2.9107017517089844, + "learning_rate": 2.9995293102249408e-05, + "loss": 1.8785, + "step": 4095 + }, + { + "epoch": 0.008003614284212044, + "grad_norm": 2.795280694961548, + "learning_rate": 2.9995258558146834e-05, + "loss": 2.0386, + "step": 4110 + }, + { + "epoch": 0.008032824555322306, + "grad_norm": 5.588748455047607, + "learning_rate": 2.9995223887767087e-05, + "loss": 1.9852, + "step": 4125 + }, + { + "epoch": 0.008062034826432569, + "grad_norm": 2.659045934677124, + "learning_rate": 2.999518909111045e-05, + "loss": 1.8707, + "step": 4140 + }, + { + "epoch": 0.008091245097542832, + "grad_norm": 2.9934346675872803, + "learning_rate": 2.9995154168177214e-05, + "loss": 1.9004, + "step": 4155 + }, + { + "epoch": 0.008120455368653095, + "grad_norm": 2.6598739624023438, + "learning_rate": 2.9995119118967674e-05, + "loss": 1.8897, + "step": 4170 + }, + { + "epoch": 0.008149665639763357, + "grad_norm": 3.6846165657043457, + "learning_rate": 2.9995083943482126e-05, + "loss": 1.9932, + "step": 4185 + }, + { + "epoch": 0.00817887591087362, + "grad_norm": 3.6916184425354004, + "learning_rate": 2.9995048641720873e-05, + "loss": 1.7672, + "step": 4200 + }, + { + "epoch": 0.008208086181983884, + "grad_norm": 3.650599956512451, + "learning_rate": 2.9995013213684202e-05, + "loss": 1.9271, + "step": 4215 + }, + { + "epoch": 0.008237296453094147, + "grad_norm": 4.196152687072754, + "learning_rate": 2.999497765937242e-05, + "loss": 1.8244, + "step": 4230 + }, + { + "epoch": 0.00826650672420441, + "grad_norm": 3.1078038215637207, + "learning_rate": 2.9994941978785817e-05, + "loss": 1.893, + "step": 4245 + }, + { + "epoch": 0.008295716995314673, + "grad_norm": 4.401791095733643, + "learning_rate": 2.9994906171924703e-05, + "loss": 1.8844, + "step": 4260 + }, + { + "epoch": 0.008324927266424935, + "grad_norm": 2.6136245727539062, + "learning_rate": 2.999487023878937e-05, + "loss": 1.8981, + "step": 4275 + }, + { + "epoch": 0.008354137537535198, + "grad_norm": 3.334519624710083, + "learning_rate": 2.9994834179380134e-05, + "loss": 1.9983, + "step": 4290 + }, + { + "epoch": 0.00838334780864546, + "grad_norm": 1.8010269403457642, + "learning_rate": 2.9994797993697283e-05, + "loss": 1.8192, + "step": 4305 + }, + { + "epoch": 0.008412558079755723, + "grad_norm": 3.2746548652648926, + "learning_rate": 2.9994761681741135e-05, + "loss": 1.9696, + "step": 4320 + }, + { + "epoch": 0.008441768350865988, + "grad_norm": 2.172431468963623, + "learning_rate": 2.9994725243511982e-05, + "loss": 1.9068, + "step": 4335 + }, + { + "epoch": 0.00847097862197625, + "grad_norm": 3.5535871982574463, + "learning_rate": 2.999468867901015e-05, + "loss": 1.985, + "step": 4350 + }, + { + "epoch": 0.008500188893086513, + "grad_norm": 3.866422176361084, + "learning_rate": 2.9994651988235923e-05, + "loss": 1.8057, + "step": 4365 + }, + { + "epoch": 0.008529399164196776, + "grad_norm": 4.770716190338135, + "learning_rate": 2.999461517118963e-05, + "loss": 1.9949, + "step": 4380 + }, + { + "epoch": 0.008558609435307039, + "grad_norm": 2.9273180961608887, + "learning_rate": 2.999457822787157e-05, + "loss": 1.8686, + "step": 4395 + }, + { + "epoch": 0.008587819706417301, + "grad_norm": 3.192166328430176, + "learning_rate": 2.9994541158282063e-05, + "loss": 1.9539, + "step": 4410 + }, + { + "epoch": 0.008617029977527564, + "grad_norm": 3.507930040359497, + "learning_rate": 2.9994503962421417e-05, + "loss": 1.7732, + "step": 4425 + }, + { + "epoch": 0.008646240248637829, + "grad_norm": 2.666705369949341, + "learning_rate": 2.9994466640289938e-05, + "loss": 1.9615, + "step": 4440 + }, + { + "epoch": 0.008675450519748091, + "grad_norm": 2.640362501144409, + "learning_rate": 2.999442919188795e-05, + "loss": 1.89, + "step": 4455 + }, + { + "epoch": 0.008704660790858354, + "grad_norm": 2.524216890335083, + "learning_rate": 2.9994391617215765e-05, + "loss": 2.0473, + "step": 4470 + }, + { + "epoch": 0.008733871061968617, + "grad_norm": 2.6444146633148193, + "learning_rate": 2.9994353916273696e-05, + "loss": 1.8889, + "step": 4485 + }, + { + "epoch": 0.00876308133307888, + "grad_norm": 2.4632787704467773, + "learning_rate": 2.9994316089062068e-05, + "loss": 1.9616, + "step": 4500 + }, + { + "epoch": 0.008792291604189142, + "grad_norm": 5.182243347167969, + "learning_rate": 2.999427813558119e-05, + "loss": 1.8216, + "step": 4515 + }, + { + "epoch": 0.008821501875299405, + "grad_norm": 3.6622776985168457, + "learning_rate": 2.9994240055831395e-05, + "loss": 1.8692, + "step": 4530 + }, + { + "epoch": 0.008850712146409668, + "grad_norm": 2.899912118911743, + "learning_rate": 2.9994201849812988e-05, + "loss": 1.9363, + "step": 4545 + }, + { + "epoch": 0.008879922417519932, + "grad_norm": 1.8940974473953247, + "learning_rate": 2.99941635175263e-05, + "loss": 1.8404, + "step": 4560 + }, + { + "epoch": 0.008909132688630195, + "grad_norm": 3.581655263900757, + "learning_rate": 2.9994125058971657e-05, + "loss": 1.9623, + "step": 4575 + }, + { + "epoch": 0.008938342959740457, + "grad_norm": 2.6804749965667725, + "learning_rate": 2.9994086474149375e-05, + "loss": 2.0692, + "step": 4590 + }, + { + "epoch": 0.00896755323085072, + "grad_norm": 4.302793502807617, + "learning_rate": 2.999404776305978e-05, + "loss": 1.9728, + "step": 4605 + }, + { + "epoch": 0.008996763501960983, + "grad_norm": 2.143483877182007, + "learning_rate": 2.9994008925703202e-05, + "loss": 1.941, + "step": 4620 + }, + { + "epoch": 0.009025973773071246, + "grad_norm": 2.2785732746124268, + "learning_rate": 2.9993969962079964e-05, + "loss": 1.9828, + "step": 4635 + }, + { + "epoch": 0.009055184044181508, + "grad_norm": 3.918194055557251, + "learning_rate": 2.9993930872190398e-05, + "loss": 1.9211, + "step": 4650 + }, + { + "epoch": 0.009084394315291771, + "grad_norm": 2.7863261699676514, + "learning_rate": 2.999389165603483e-05, + "loss": 1.8564, + "step": 4665 + }, + { + "epoch": 0.009113604586402035, + "grad_norm": 2.655966281890869, + "learning_rate": 2.9993852313613596e-05, + "loss": 1.9327, + "step": 4680 + }, + { + "epoch": 0.009142814857512298, + "grad_norm": 2.987030506134033, + "learning_rate": 2.999381284492702e-05, + "loss": 1.8325, + "step": 4695 + }, + { + "epoch": 0.00917202512862256, + "grad_norm": 3.127544641494751, + "learning_rate": 2.9993773249975435e-05, + "loss": 1.7246, + "step": 4710 + }, + { + "epoch": 0.009201235399732824, + "grad_norm": 2.395202398300171, + "learning_rate": 2.999373352875918e-05, + "loss": 1.9827, + "step": 4725 + }, + { + "epoch": 0.009230445670843086, + "grad_norm": 4.163525104522705, + "learning_rate": 2.9993693681278582e-05, + "loss": 1.9441, + "step": 4740 + }, + { + "epoch": 0.009259655941953349, + "grad_norm": 2.84067964553833, + "learning_rate": 2.9993653707533985e-05, + "loss": 2.0252, + "step": 4755 + }, + { + "epoch": 0.009288866213063612, + "grad_norm": 1.949599266052246, + "learning_rate": 2.9993613607525717e-05, + "loss": 1.9497, + "step": 4770 + }, + { + "epoch": 0.009318076484173874, + "grad_norm": 4.164729595184326, + "learning_rate": 2.9993573381254124e-05, + "loss": 2.1013, + "step": 4785 + }, + { + "epoch": 0.009347286755284139, + "grad_norm": 1.7438952922821045, + "learning_rate": 2.9993533028719537e-05, + "loss": 1.9461, + "step": 4800 + }, + { + "epoch": 0.009376497026394402, + "grad_norm": 1.9006491899490356, + "learning_rate": 2.9993492549922302e-05, + "loss": 1.9273, + "step": 4815 + }, + { + "epoch": 0.009405707297504664, + "grad_norm": 3.693070411682129, + "learning_rate": 2.9993451944862762e-05, + "loss": 1.9791, + "step": 4830 + }, + { + "epoch": 0.009434917568614927, + "grad_norm": 2.9572765827178955, + "learning_rate": 2.9993411213541248e-05, + "loss": 1.7725, + "step": 4845 + }, + { + "epoch": 0.00946412783972519, + "grad_norm": 2.882349967956543, + "learning_rate": 2.999337035595811e-05, + "loss": 1.8094, + "step": 4860 + }, + { + "epoch": 0.009493338110835452, + "grad_norm": 3.9489054679870605, + "learning_rate": 2.9993329372113695e-05, + "loss": 1.8678, + "step": 4875 + }, + { + "epoch": 0.009522548381945715, + "grad_norm": 2.7020881175994873, + "learning_rate": 2.999328826200834e-05, + "loss": 2.0528, + "step": 4890 + }, + { + "epoch": 0.009551758653055978, + "grad_norm": 3.121814489364624, + "learning_rate": 2.99932470256424e-05, + "loss": 1.9087, + "step": 4905 + }, + { + "epoch": 0.009580968924166242, + "grad_norm": 2.713003158569336, + "learning_rate": 2.9993205663016218e-05, + "loss": 1.8721, + "step": 4920 + }, + { + "epoch": 0.009610179195276505, + "grad_norm": 3.5016112327575684, + "learning_rate": 2.9993164174130137e-05, + "loss": 1.8573, + "step": 4935 + }, + { + "epoch": 0.009639389466386768, + "grad_norm": 4.257192611694336, + "learning_rate": 2.9993122558984516e-05, + "loss": 1.9638, + "step": 4950 + }, + { + "epoch": 0.00966859973749703, + "grad_norm": 3.4689440727233887, + "learning_rate": 2.9993080817579702e-05, + "loss": 1.801, + "step": 4965 + }, + { + "epoch": 0.009697810008607293, + "grad_norm": 2.3255503177642822, + "learning_rate": 2.999303894991605e-05, + "loss": 1.6792, + "step": 4980 + }, + { + "epoch": 0.009727020279717556, + "grad_norm": 2.5599734783172607, + "learning_rate": 2.9992996955993898e-05, + "loss": 2.0037, + "step": 4995 + }, + { + "epoch": 0.009756230550827819, + "grad_norm": 2.528571605682373, + "learning_rate": 2.9992954835813616e-05, + "loss": 1.9778, + "step": 5010 + }, + { + "epoch": 0.009785440821938081, + "grad_norm": 3.466859817504883, + "learning_rate": 2.999291258937555e-05, + "loss": 1.9507, + "step": 5025 + }, + { + "epoch": 0.009814651093048346, + "grad_norm": 2.9515936374664307, + "learning_rate": 2.999287021668006e-05, + "loss": 1.9506, + "step": 5040 + }, + { + "epoch": 0.009843861364158608, + "grad_norm": 2.116895914077759, + "learning_rate": 2.99928277177275e-05, + "loss": 2.0403, + "step": 5055 + }, + { + "epoch": 0.009873071635268871, + "grad_norm": 2.0982749462127686, + "learning_rate": 2.999278509251823e-05, + "loss": 1.981, + "step": 5070 + }, + { + "epoch": 0.009902281906379134, + "grad_norm": 2.1464314460754395, + "learning_rate": 2.9992742341052612e-05, + "loss": 2.0126, + "step": 5085 + }, + { + "epoch": 0.009931492177489397, + "grad_norm": 5.098091125488281, + "learning_rate": 2.9992699463330995e-05, + "loss": 1.8511, + "step": 5100 + }, + { + "epoch": 0.00996070244859966, + "grad_norm": 3.8748703002929688, + "learning_rate": 2.999265645935375e-05, + "loss": 1.9058, + "step": 5115 + }, + { + "epoch": 0.009989912719709922, + "grad_norm": 3.9060122966766357, + "learning_rate": 2.999261332912124e-05, + "loss": 1.9298, + "step": 5130 + }, + { + "epoch": 0.010019122990820185, + "grad_norm": 3.6169393062591553, + "learning_rate": 2.999257007263382e-05, + "loss": 1.6655, + "step": 5145 + }, + { + "epoch": 0.010048333261930449, + "grad_norm": 2.0531461238861084, + "learning_rate": 2.999252668989186e-05, + "loss": 2.0188, + "step": 5160 + }, + { + "epoch": 0.010077543533040712, + "grad_norm": 2.554202079772949, + "learning_rate": 2.9992483180895725e-05, + "loss": 1.8039, + "step": 5175 + }, + { + "epoch": 0.010106753804150975, + "grad_norm": 3.2061095237731934, + "learning_rate": 2.9992439545645778e-05, + "loss": 1.8889, + "step": 5190 + }, + { + "epoch": 0.010135964075261237, + "grad_norm": 2.8294739723205566, + "learning_rate": 2.9992395784142395e-05, + "loss": 1.9291, + "step": 5205 + }, + { + "epoch": 0.0101651743463715, + "grad_norm": 2.8096394538879395, + "learning_rate": 2.9992351896385932e-05, + "loss": 1.924, + "step": 5220 + }, + { + "epoch": 0.010194384617481763, + "grad_norm": 3.7793962955474854, + "learning_rate": 2.999230788237677e-05, + "loss": 1.933, + "step": 5235 + }, + { + "epoch": 0.010223594888592025, + "grad_norm": 5.307252407073975, + "learning_rate": 2.999226374211527e-05, + "loss": 1.9296, + "step": 5250 + }, + { + "epoch": 0.010252805159702288, + "grad_norm": 3.207782506942749, + "learning_rate": 2.9992219475601806e-05, + "loss": 2.0676, + "step": 5265 + }, + { + "epoch": 0.010282015430812553, + "grad_norm": 2.7217934131622314, + "learning_rate": 2.9992175082836765e-05, + "loss": 1.8675, + "step": 5280 + }, + { + "epoch": 0.010311225701922815, + "grad_norm": 3.455260753631592, + "learning_rate": 2.9992130563820497e-05, + "loss": 1.924, + "step": 5295 + }, + { + "epoch": 0.010340435973033078, + "grad_norm": 2.638262987136841, + "learning_rate": 2.9992085918553393e-05, + "loss": 1.9263, + "step": 5310 + }, + { + "epoch": 0.01036964624414334, + "grad_norm": 3.1709418296813965, + "learning_rate": 2.9992041147035828e-05, + "loss": 1.8848, + "step": 5325 + }, + { + "epoch": 0.010398856515253603, + "grad_norm": 2.1629397869110107, + "learning_rate": 2.9991996249268175e-05, + "loss": 1.9271, + "step": 5340 + }, + { + "epoch": 0.010428066786363866, + "grad_norm": 4.2046284675598145, + "learning_rate": 2.999195122525081e-05, + "loss": 1.9208, + "step": 5355 + }, + { + "epoch": 0.010457277057474129, + "grad_norm": 4.966402053833008, + "learning_rate": 2.9991906074984116e-05, + "loss": 1.8059, + "step": 5370 + }, + { + "epoch": 0.010486487328584392, + "grad_norm": 3.0322844982147217, + "learning_rate": 2.9991860798468473e-05, + "loss": 1.9608, + "step": 5385 + }, + { + "epoch": 0.010515697599694656, + "grad_norm": 2.3690319061279297, + "learning_rate": 2.9991815395704266e-05, + "loss": 1.9424, + "step": 5400 + }, + { + "epoch": 0.010544907870804919, + "grad_norm": 2.81915545463562, + "learning_rate": 2.9991769866691865e-05, + "loss": 1.8246, + "step": 5415 + }, + { + "epoch": 0.010574118141915181, + "grad_norm": 3.064317464828491, + "learning_rate": 2.9991724211431667e-05, + "loss": 1.8012, + "step": 5430 + }, + { + "epoch": 0.010603328413025444, + "grad_norm": 4.302711009979248, + "learning_rate": 2.999167842992405e-05, + "loss": 1.9098, + "step": 5445 + }, + { + "epoch": 0.010632538684135707, + "grad_norm": 3.3137192726135254, + "learning_rate": 2.9991632522169398e-05, + "loss": 1.8922, + "step": 5460 + }, + { + "epoch": 0.01066174895524597, + "grad_norm": 3.0742363929748535, + "learning_rate": 2.9991586488168104e-05, + "loss": 2.0403, + "step": 5475 + }, + { + "epoch": 0.010690959226356232, + "grad_norm": 3.034343957901001, + "learning_rate": 2.9991540327920547e-05, + "loss": 1.8944, + "step": 5490 + }, + { + "epoch": 0.010720169497466495, + "grad_norm": 4.131673812866211, + "learning_rate": 2.9991494041427124e-05, + "loss": 1.8897, + "step": 5505 + }, + { + "epoch": 0.01074937976857676, + "grad_norm": 3.8295650482177734, + "learning_rate": 2.999144762868822e-05, + "loss": 1.8522, + "step": 5520 + }, + { + "epoch": 0.010778590039687022, + "grad_norm": 4.337125301361084, + "learning_rate": 2.999140108970423e-05, + "loss": 1.8039, + "step": 5535 + }, + { + "epoch": 0.010807800310797285, + "grad_norm": 4.047338008880615, + "learning_rate": 2.999135442447554e-05, + "loss": 1.964, + "step": 5550 + }, + { + "epoch": 0.010837010581907548, + "grad_norm": 3.5203754901885986, + "learning_rate": 2.9991307633002546e-05, + "loss": 1.9198, + "step": 5565 + }, + { + "epoch": 0.01086622085301781, + "grad_norm": 3.9507977962493896, + "learning_rate": 2.9991260715285642e-05, + "loss": 1.9079, + "step": 5580 + }, + { + "epoch": 0.010895431124128073, + "grad_norm": 4.072928428649902, + "learning_rate": 2.9991213671325223e-05, + "loss": 2.032, + "step": 5595 + }, + { + "epoch": 0.010924641395238336, + "grad_norm": 2.8438544273376465, + "learning_rate": 2.9991166501121685e-05, + "loss": 1.9995, + "step": 5610 + }, + { + "epoch": 0.0109538516663486, + "grad_norm": 3.2885472774505615, + "learning_rate": 2.9991119204675425e-05, + "loss": 1.8653, + "step": 5625 + }, + { + "epoch": 0.010983061937458863, + "grad_norm": 3.6044580936431885, + "learning_rate": 2.9991071781986843e-05, + "loss": 1.9103, + "step": 5640 + }, + { + "epoch": 0.011012272208569126, + "grad_norm": 4.155179977416992, + "learning_rate": 2.9991024233056335e-05, + "loss": 1.8887, + "step": 5655 + }, + { + "epoch": 0.011041482479679388, + "grad_norm": 4.087657451629639, + "learning_rate": 2.9990976557884308e-05, + "loss": 1.967, + "step": 5670 + }, + { + "epoch": 0.011070692750789651, + "grad_norm": 4.724533557891846, + "learning_rate": 2.999092875647116e-05, + "loss": 1.8383, + "step": 5685 + }, + { + "epoch": 0.011099903021899914, + "grad_norm": 7.630526065826416, + "learning_rate": 2.9990880828817287e-05, + "loss": 1.9142, + "step": 5700 + }, + { + "epoch": 0.011129113293010176, + "grad_norm": 4.646370887756348, + "learning_rate": 2.99908327749231e-05, + "loss": 1.8862, + "step": 5715 + }, + { + "epoch": 0.011158323564120439, + "grad_norm": 1.9703487157821655, + "learning_rate": 2.9990784594789e-05, + "loss": 1.8747, + "step": 5730 + }, + { + "epoch": 0.011187533835230704, + "grad_norm": 2.7092244625091553, + "learning_rate": 2.99907362884154e-05, + "loss": 1.9593, + "step": 5745 + }, + { + "epoch": 0.011216744106340966, + "grad_norm": 2.3020477294921875, + "learning_rate": 2.9990687855802695e-05, + "loss": 1.9351, + "step": 5760 + }, + { + "epoch": 0.011245954377451229, + "grad_norm": 3.7924842834472656, + "learning_rate": 2.9990639296951303e-05, + "loss": 1.9888, + "step": 5775 + }, + { + "epoch": 0.011275164648561492, + "grad_norm": 2.8485357761383057, + "learning_rate": 2.9990590611861625e-05, + "loss": 1.7921, + "step": 5790 + }, + { + "epoch": 0.011304374919671754, + "grad_norm": 2.7510149478912354, + "learning_rate": 2.999054180053408e-05, + "loss": 1.9655, + "step": 5805 + }, + { + "epoch": 0.011333585190782017, + "grad_norm": 4.522819519042969, + "learning_rate": 2.999049286296907e-05, + "loss": 1.9384, + "step": 5820 + }, + { + "epoch": 0.01136279546189228, + "grad_norm": 2.325582265853882, + "learning_rate": 2.9990443799167018e-05, + "loss": 1.8342, + "step": 5835 + }, + { + "epoch": 0.011392005733002543, + "grad_norm": 3.583799123764038, + "learning_rate": 2.999039460912832e-05, + "loss": 1.9671, + "step": 5850 + }, + { + "epoch": 0.011421216004112807, + "grad_norm": 2.342571973800659, + "learning_rate": 2.999034529285341e-05, + "loss": 1.8169, + "step": 5865 + }, + { + "epoch": 0.01145042627522307, + "grad_norm": 3.772407054901123, + "learning_rate": 2.9990295850342694e-05, + "loss": 1.8066, + "step": 5880 + }, + { + "epoch": 0.011479636546333332, + "grad_norm": 2.550743341445923, + "learning_rate": 2.9990246281596583e-05, + "loss": 1.9611, + "step": 5895 + }, + { + "epoch": 0.011508846817443595, + "grad_norm": 1.9916815757751465, + "learning_rate": 2.9990196586615502e-05, + "loss": 1.8308, + "step": 5910 + }, + { + "epoch": 0.011538057088553858, + "grad_norm": 3.8781256675720215, + "learning_rate": 2.9990146765399868e-05, + "loss": 1.804, + "step": 5925 + }, + { + "epoch": 0.01156726735966412, + "grad_norm": 1.9027587175369263, + "learning_rate": 2.99900968179501e-05, + "loss": 1.8449, + "step": 5940 + }, + { + "epoch": 0.011596477630774383, + "grad_norm": 2.9147605895996094, + "learning_rate": 2.9990046744266612e-05, + "loss": 1.9283, + "step": 5955 + }, + { + "epoch": 0.011625687901884646, + "grad_norm": 2.0023555755615234, + "learning_rate": 2.9989996544349842e-05, + "loss": 1.9464, + "step": 5970 + }, + { + "epoch": 0.01165489817299491, + "grad_norm": 2.819077253341675, + "learning_rate": 2.9989946218200195e-05, + "loss": 1.7682, + "step": 5985 + }, + { + "epoch": 0.011684108444105173, + "grad_norm": 4.221833229064941, + "learning_rate": 2.998989576581811e-05, + "loss": 1.6921, + "step": 6000 + }, + { + "epoch": 0.011713318715215436, + "grad_norm": 2.9984993934631348, + "learning_rate": 2.9989845187204e-05, + "loss": 1.8921, + "step": 6015 + }, + { + "epoch": 0.011742528986325699, + "grad_norm": 2.866452932357788, + "learning_rate": 2.9989794482358293e-05, + "loss": 1.9759, + "step": 6030 + }, + { + "epoch": 0.011771739257435961, + "grad_norm": 2.398813486099243, + "learning_rate": 2.9989743651281424e-05, + "loss": 1.8948, + "step": 6045 + }, + { + "epoch": 0.011800949528546224, + "grad_norm": 2.6533145904541016, + "learning_rate": 2.998969269397381e-05, + "loss": 1.8532, + "step": 6060 + }, + { + "epoch": 0.011830159799656487, + "grad_norm": 3.8135955333709717, + "learning_rate": 2.998964161043589e-05, + "loss": 1.9564, + "step": 6075 + }, + { + "epoch": 0.01185937007076675, + "grad_norm": 2.5285484790802, + "learning_rate": 2.9989590400668086e-05, + "loss": 1.8229, + "step": 6090 + }, + { + "epoch": 0.011888580341877014, + "grad_norm": 2.0157272815704346, + "learning_rate": 2.9989539064670838e-05, + "loss": 1.9952, + "step": 6105 + }, + { + "epoch": 0.011917790612987277, + "grad_norm": 1.8600102663040161, + "learning_rate": 2.998948760244457e-05, + "loss": 2.0176, + "step": 6120 + }, + { + "epoch": 0.01194700088409754, + "grad_norm": 2.4452428817749023, + "learning_rate": 2.9989436013989718e-05, + "loss": 1.8965, + "step": 6135 + }, + { + "epoch": 0.011976211155207802, + "grad_norm": 3.593918561935425, + "learning_rate": 2.998938429930672e-05, + "loss": 1.9629, + "step": 6150 + }, + { + "epoch": 0.012005421426318065, + "grad_norm": 2.161616086959839, + "learning_rate": 2.9989332458396005e-05, + "loss": 2.0643, + "step": 6165 + }, + { + "epoch": 0.012034631697428327, + "grad_norm": 4.1473493576049805, + "learning_rate": 2.9989280491258015e-05, + "loss": 1.8227, + "step": 6180 + }, + { + "epoch": 0.01206384196853859, + "grad_norm": 3.4432153701782227, + "learning_rate": 2.9989228397893186e-05, + "loss": 1.9787, + "step": 6195 + }, + { + "epoch": 0.012093052239648853, + "grad_norm": 3.591747283935547, + "learning_rate": 2.9989176178301955e-05, + "loss": 1.8248, + "step": 6210 + }, + { + "epoch": 0.012122262510759117, + "grad_norm": 2.385715961456299, + "learning_rate": 2.9989123832484767e-05, + "loss": 2.0869, + "step": 6225 + }, + { + "epoch": 0.01215147278186938, + "grad_norm": 3.128617763519287, + "learning_rate": 2.9989071360442058e-05, + "loss": 1.8622, + "step": 6240 + }, + { + "epoch": 0.012180683052979643, + "grad_norm": 5.122420787811279, + "learning_rate": 2.998901876217427e-05, + "loss": 1.828, + "step": 6255 + }, + { + "epoch": 0.012209893324089905, + "grad_norm": 2.451355218887329, + "learning_rate": 2.9988966037681844e-05, + "loss": 1.8766, + "step": 6270 + }, + { + "epoch": 0.012239103595200168, + "grad_norm": 2.8492448329925537, + "learning_rate": 2.9988913186965232e-05, + "loss": 1.9675, + "step": 6285 + }, + { + "epoch": 0.01226831386631043, + "grad_norm": 4.332848072052002, + "learning_rate": 2.998886021002487e-05, + "loss": 2.0243, + "step": 6300 + }, + { + "epoch": 0.012297524137420694, + "grad_norm": 2.7769408226013184, + "learning_rate": 2.9988807106861208e-05, + "loss": 1.9015, + "step": 6315 + }, + { + "epoch": 0.012326734408530956, + "grad_norm": 2.2205896377563477, + "learning_rate": 2.9988753877474696e-05, + "loss": 1.8927, + "step": 6330 + }, + { + "epoch": 0.01235594467964122, + "grad_norm": 4.445504665374756, + "learning_rate": 2.9988700521865777e-05, + "loss": 1.9646, + "step": 6345 + }, + { + "epoch": 0.012385154950751483, + "grad_norm": 2.951155662536621, + "learning_rate": 2.9988647040034905e-05, + "loss": 1.8487, + "step": 6360 + }, + { + "epoch": 0.012414365221861746, + "grad_norm": 4.908761501312256, + "learning_rate": 2.998859343198253e-05, + "loss": 1.8704, + "step": 6375 + }, + { + "epoch": 0.012443575492972009, + "grad_norm": 3.0668880939483643, + "learning_rate": 2.9988539697709098e-05, + "loss": 1.8632, + "step": 6390 + }, + { + "epoch": 0.012472785764082272, + "grad_norm": 1.8523426055908203, + "learning_rate": 2.9988485837215068e-05, + "loss": 1.8434, + "step": 6405 + }, + { + "epoch": 0.012501996035192534, + "grad_norm": 4.285421848297119, + "learning_rate": 2.9988431850500887e-05, + "loss": 1.8438, + "step": 6420 + }, + { + "epoch": 0.012531206306302797, + "grad_norm": 1.9119669198989868, + "learning_rate": 2.9988377737567013e-05, + "loss": 1.8469, + "step": 6435 + }, + { + "epoch": 0.01256041657741306, + "grad_norm": 2.430860757827759, + "learning_rate": 2.9988323498413907e-05, + "loss": 2.1108, + "step": 6450 + }, + { + "epoch": 0.012589626848523324, + "grad_norm": 2.2205920219421387, + "learning_rate": 2.9988269133042016e-05, + "loss": 1.8481, + "step": 6465 + }, + { + "epoch": 0.012618837119633587, + "grad_norm": 2.0588815212249756, + "learning_rate": 2.9988214641451804e-05, + "loss": 1.8007, + "step": 6480 + }, + { + "epoch": 0.01264804739074385, + "grad_norm": 2.527836322784424, + "learning_rate": 2.998816002364373e-05, + "loss": 1.9974, + "step": 6495 + }, + { + "epoch": 0.012677257661854112, + "grad_norm": 3.0486202239990234, + "learning_rate": 2.9988105279618253e-05, + "loss": 1.8459, + "step": 6510 + }, + { + "epoch": 0.012706467932964375, + "grad_norm": 2.6629745960235596, + "learning_rate": 2.998805040937583e-05, + "loss": 1.9373, + "step": 6525 + }, + { + "epoch": 0.012735678204074638, + "grad_norm": 3.2922544479370117, + "learning_rate": 2.9987995412916928e-05, + "loss": 1.8839, + "step": 6540 + }, + { + "epoch": 0.0127648884751849, + "grad_norm": 4.026648044586182, + "learning_rate": 2.998794029024201e-05, + "loss": 2.1104, + "step": 6555 + }, + { + "epoch": 0.012794098746295163, + "grad_norm": 2.473214864730835, + "learning_rate": 2.998788504135154e-05, + "loss": 1.664, + "step": 6570 + }, + { + "epoch": 0.012823309017405428, + "grad_norm": 2.8834683895111084, + "learning_rate": 2.998782966624598e-05, + "loss": 1.9737, + "step": 6585 + }, + { + "epoch": 0.01285251928851569, + "grad_norm": 3.123595952987671, + "learning_rate": 2.99877741649258e-05, + "loss": 2.1071, + "step": 6600 + }, + { + "epoch": 0.012881729559625953, + "grad_norm": 2.3172380924224854, + "learning_rate": 2.998771853739146e-05, + "loss": 1.8585, + "step": 6615 + }, + { + "epoch": 0.012910939830736216, + "grad_norm": 3.4954440593719482, + "learning_rate": 2.998766278364344e-05, + "loss": 1.8458, + "step": 6630 + }, + { + "epoch": 0.012940150101846478, + "grad_norm": 2.48248553276062, + "learning_rate": 2.9987606903682203e-05, + "loss": 1.9398, + "step": 6645 + }, + { + "epoch": 0.012969360372956741, + "grad_norm": 2.7575950622558594, + "learning_rate": 2.998755089750822e-05, + "loss": 1.8692, + "step": 6660 + }, + { + "epoch": 0.012998570644067004, + "grad_norm": 2.821286678314209, + "learning_rate": 2.9987494765121962e-05, + "loss": 2.1104, + "step": 6675 + }, + { + "epoch": 0.013027780915177268, + "grad_norm": 3.881669521331787, + "learning_rate": 2.99874385065239e-05, + "loss": 1.8453, + "step": 6690 + }, + { + "epoch": 0.013056991186287531, + "grad_norm": 2.573246955871582, + "learning_rate": 2.9987382121714516e-05, + "loss": 1.9538, + "step": 6705 + }, + { + "epoch": 0.013086201457397794, + "grad_norm": 4.5084052085876465, + "learning_rate": 2.9987325610694277e-05, + "loss": 1.9216, + "step": 6720 + }, + { + "epoch": 0.013115411728508056, + "grad_norm": 3.882384777069092, + "learning_rate": 2.9987268973463662e-05, + "loss": 1.7694, + "step": 6735 + }, + { + "epoch": 0.013144621999618319, + "grad_norm": 2.769113302230835, + "learning_rate": 2.9987212210023147e-05, + "loss": 1.9591, + "step": 6750 + }, + { + "epoch": 0.013173832270728582, + "grad_norm": 3.9864275455474854, + "learning_rate": 2.9987155320373207e-05, + "loss": 1.9079, + "step": 6765 + }, + { + "epoch": 0.013203042541838845, + "grad_norm": 1.9832093715667725, + "learning_rate": 2.998709830451433e-05, + "loss": 1.8263, + "step": 6780 + }, + { + "epoch": 0.013232252812949107, + "grad_norm": 5.0461106300354, + "learning_rate": 2.9987041162446985e-05, + "loss": 1.8265, + "step": 6795 + }, + { + "epoch": 0.013261463084059372, + "grad_norm": 2.5139777660369873, + "learning_rate": 2.998698389417166e-05, + "loss": 1.8644, + "step": 6810 + }, + { + "epoch": 0.013290673355169634, + "grad_norm": 2.5377390384674072, + "learning_rate": 2.998692649968884e-05, + "loss": 1.9956, + "step": 6825 + }, + { + "epoch": 0.013319883626279897, + "grad_norm": 2.68589186668396, + "learning_rate": 2.9986868978998998e-05, + "loss": 1.9298, + "step": 6840 + }, + { + "epoch": 0.01334909389739016, + "grad_norm": 5.362221717834473, + "learning_rate": 2.9986811332102624e-05, + "loss": 1.8319, + "step": 6855 + }, + { + "epoch": 0.013378304168500423, + "grad_norm": 3.440687417984009, + "learning_rate": 2.9986753559000207e-05, + "loss": 1.8854, + "step": 6870 + }, + { + "epoch": 0.013407514439610685, + "grad_norm": 2.2173657417297363, + "learning_rate": 2.9986695659692233e-05, + "loss": 1.8672, + "step": 6885 + }, + { + "epoch": 0.013436724710720948, + "grad_norm": 2.1564152240753174, + "learning_rate": 2.998663763417918e-05, + "loss": 1.9548, + "step": 6900 + }, + { + "epoch": 0.01346593498183121, + "grad_norm": 2.339550018310547, + "learning_rate": 2.9986579482461552e-05, + "loss": 1.7549, + "step": 6915 + }, + { + "epoch": 0.013495145252941475, + "grad_norm": 8.277188301086426, + "learning_rate": 2.9986521204539824e-05, + "loss": 1.8958, + "step": 6930 + }, + { + "epoch": 0.013524355524051738, + "grad_norm": 3.044663190841675, + "learning_rate": 2.9986462800414498e-05, + "loss": 1.9019, + "step": 6945 + }, + { + "epoch": 0.013553565795162, + "grad_norm": 2.622844696044922, + "learning_rate": 2.9986404270086056e-05, + "loss": 1.8349, + "step": 6960 + }, + { + "epoch": 0.013582776066272263, + "grad_norm": 2.686244010925293, + "learning_rate": 2.9986345613554998e-05, + "loss": 1.9178, + "step": 6975 + }, + { + "epoch": 0.013611986337382526, + "grad_norm": 3.487210512161255, + "learning_rate": 2.9986286830821817e-05, + "loss": 2.0353, + "step": 6990 + }, + { + "epoch": 0.013641196608492789, + "grad_norm": 3.297060966491699, + "learning_rate": 2.9986227921887005e-05, + "loss": 2.0344, + "step": 7005 + }, + { + "epoch": 0.013670406879603051, + "grad_norm": 6.4708571434021, + "learning_rate": 2.9986168886751064e-05, + "loss": 1.946, + "step": 7020 + }, + { + "epoch": 0.013699617150713314, + "grad_norm": 3.04331636428833, + "learning_rate": 2.9986109725414485e-05, + "loss": 1.9229, + "step": 7035 + }, + { + "epoch": 0.013728827421823579, + "grad_norm": 3.3614730834960938, + "learning_rate": 2.9986050437877762e-05, + "loss": 1.8943, + "step": 7050 + }, + { + "epoch": 0.013758037692933841, + "grad_norm": 2.632763624191284, + "learning_rate": 2.998599102414141e-05, + "loss": 1.907, + "step": 7065 + }, + { + "epoch": 0.013787247964044104, + "grad_norm": 2.441969871520996, + "learning_rate": 2.998593148420592e-05, + "loss": 1.9382, + "step": 7080 + }, + { + "epoch": 0.013816458235154367, + "grad_norm": 3.3013856410980225, + "learning_rate": 2.9985871818071784e-05, + "loss": 1.7866, + "step": 7095 + }, + { + "epoch": 0.01384566850626463, + "grad_norm": 3.12922739982605, + "learning_rate": 2.9985812025739518e-05, + "loss": 1.9155, + "step": 7110 + }, + { + "epoch": 0.013874878777374892, + "grad_norm": 3.3649489879608154, + "learning_rate": 2.998575210720962e-05, + "loss": 1.7256, + "step": 7125 + }, + { + "epoch": 0.013904089048485155, + "grad_norm": 2.381049871444702, + "learning_rate": 2.9985692062482603e-05, + "loss": 1.9324, + "step": 7140 + }, + { + "epoch": 0.013933299319595418, + "grad_norm": 2.5328924655914307, + "learning_rate": 2.998563189155896e-05, + "loss": 2.0635, + "step": 7155 + }, + { + "epoch": 0.013962509590705682, + "grad_norm": 3.5615475177764893, + "learning_rate": 2.99855715944392e-05, + "loss": 1.8377, + "step": 7170 + }, + { + "epoch": 0.013991719861815945, + "grad_norm": 3.7230849266052246, + "learning_rate": 2.998551117112384e-05, + "loss": 1.839, + "step": 7185 + }, + { + "epoch": 0.014020930132926207, + "grad_norm": 2.7863929271698, + "learning_rate": 2.998545062161338e-05, + "loss": 1.7265, + "step": 7200 + }, + { + "epoch": 0.01405014040403647, + "grad_norm": 2.238398790359497, + "learning_rate": 2.9985389945908332e-05, + "loss": 1.9747, + "step": 7215 + }, + { + "epoch": 0.014079350675146733, + "grad_norm": 2.3368685245513916, + "learning_rate": 2.998532914400921e-05, + "loss": 2.0277, + "step": 7230 + }, + { + "epoch": 0.014108560946256996, + "grad_norm": 3.667415142059326, + "learning_rate": 2.9985268215916523e-05, + "loss": 1.8947, + "step": 7245 + }, + { + "epoch": 0.014137771217367258, + "grad_norm": 5.902375221252441, + "learning_rate": 2.9985207161630784e-05, + "loss": 2.023, + "step": 7260 + }, + { + "epoch": 0.014166981488477521, + "grad_norm": 3.5179100036621094, + "learning_rate": 2.998514598115251e-05, + "loss": 1.9654, + "step": 7275 + }, + { + "epoch": 0.014196191759587785, + "grad_norm": 4.09523344039917, + "learning_rate": 2.9985084674482207e-05, + "loss": 1.9822, + "step": 7290 + }, + { + "epoch": 0.014225402030698048, + "grad_norm": 3.9758706092834473, + "learning_rate": 2.9985023241620405e-05, + "loss": 1.7241, + "step": 7305 + }, + { + "epoch": 0.01425461230180831, + "grad_norm": 4.463382720947266, + "learning_rate": 2.9984961682567614e-05, + "loss": 1.9208, + "step": 7320 + }, + { + "epoch": 0.014283822572918574, + "grad_norm": 2.2591640949249268, + "learning_rate": 2.9984899997324357e-05, + "loss": 1.9372, + "step": 7335 + }, + { + "epoch": 0.014313032844028836, + "grad_norm": 2.958768129348755, + "learning_rate": 2.998483818589114e-05, + "loss": 1.9092, + "step": 7350 + }, + { + "epoch": 0.014342243115139099, + "grad_norm": 2.4651241302490234, + "learning_rate": 2.99847762482685e-05, + "loss": 1.7909, + "step": 7365 + }, + { + "epoch": 0.014371453386249362, + "grad_norm": 4.785153388977051, + "learning_rate": 2.9984714184456948e-05, + "loss": 2.0846, + "step": 7380 + }, + { + "epoch": 0.014400663657359624, + "grad_norm": 5.212136745452881, + "learning_rate": 2.9984651994457013e-05, + "loss": 1.7623, + "step": 7395 + }, + { + "epoch": 0.014429873928469889, + "grad_norm": 3.5508365631103516, + "learning_rate": 2.9984589678269216e-05, + "loss": 1.8922, + "step": 7410 + }, + { + "epoch": 0.014459084199580152, + "grad_norm": 4.388883113861084, + "learning_rate": 2.998452723589408e-05, + "loss": 1.9701, + "step": 7425 + }, + { + "epoch": 0.014488294470690414, + "grad_norm": 2.4577128887176514, + "learning_rate": 2.9984464667332135e-05, + "loss": 1.8807, + "step": 7440 + }, + { + "epoch": 0.014517504741800677, + "grad_norm": 3.046642541885376, + "learning_rate": 2.99844019725839e-05, + "loss": 2.0671, + "step": 7455 + }, + { + "epoch": 0.01454671501291094, + "grad_norm": 2.639798879623413, + "learning_rate": 2.9984339151649913e-05, + "loss": 1.7981, + "step": 7470 + }, + { + "epoch": 0.014575925284021202, + "grad_norm": 4.231945991516113, + "learning_rate": 2.9984276204530702e-05, + "loss": 1.822, + "step": 7485 + }, + { + "epoch": 0.014605135555131465, + "grad_norm": 2.3419246673583984, + "learning_rate": 2.9984213131226788e-05, + "loss": 1.8931, + "step": 7500 + }, + { + "epoch": 0.014634345826241728, + "grad_norm": 3.034775733947754, + "learning_rate": 2.998414993173871e-05, + "loss": 1.8925, + "step": 7515 + }, + { + "epoch": 0.014663556097351992, + "grad_norm": 2.818657875061035, + "learning_rate": 2.9984086606066997e-05, + "loss": 1.8714, + "step": 7530 + }, + { + "epoch": 0.014692766368462255, + "grad_norm": 2.0940043926239014, + "learning_rate": 2.9984023154212183e-05, + "loss": 1.9476, + "step": 7545 + }, + { + "epoch": 0.014721976639572518, + "grad_norm": 1.9247641563415527, + "learning_rate": 2.9983959576174807e-05, + "loss": 1.8433, + "step": 7560 + }, + { + "epoch": 0.01475118691068278, + "grad_norm": 4.496128559112549, + "learning_rate": 2.9983895871955397e-05, + "loss": 1.8137, + "step": 7575 + }, + { + "epoch": 0.014780397181793043, + "grad_norm": 1.846083402633667, + "learning_rate": 2.998383204155449e-05, + "loss": 1.9474, + "step": 7590 + }, + { + "epoch": 0.014809607452903306, + "grad_norm": 2.063615083694458, + "learning_rate": 2.9983768084972626e-05, + "loss": 1.9017, + "step": 7605 + }, + { + "epoch": 0.014838817724013569, + "grad_norm": 3.480355978012085, + "learning_rate": 2.9983704002210346e-05, + "loss": 1.8554, + "step": 7620 + }, + { + "epoch": 0.014868027995123831, + "grad_norm": 3.3823037147521973, + "learning_rate": 2.9983639793268187e-05, + "loss": 2.067, + "step": 7635 + }, + { + "epoch": 0.014897238266234096, + "grad_norm": 4.111497402191162, + "learning_rate": 2.998357545814669e-05, + "loss": 1.8451, + "step": 7650 + }, + { + "epoch": 0.014926448537344358, + "grad_norm": 2.51603102684021, + "learning_rate": 2.9983510996846397e-05, + "loss": 1.835, + "step": 7665 + }, + { + "epoch": 0.014955658808454621, + "grad_norm": 2.576188087463379, + "learning_rate": 2.9983446409367846e-05, + "loss": 2.0444, + "step": 7680 + }, + { + "epoch": 0.014984869079564884, + "grad_norm": 2.695582151412964, + "learning_rate": 2.9983381695711595e-05, + "loss": 1.9372, + "step": 7695 + }, + { + "epoch": 0.015014079350675147, + "grad_norm": 2.580967664718628, + "learning_rate": 2.9983316855878172e-05, + "loss": 1.7984, + "step": 7710 + }, + { + "epoch": 0.01504328962178541, + "grad_norm": 2.779932737350464, + "learning_rate": 2.9983251889868133e-05, + "loss": 2.0286, + "step": 7725 + }, + { + "epoch": 0.015072499892895672, + "grad_norm": 4.4765849113464355, + "learning_rate": 2.998318679768202e-05, + "loss": 1.873, + "step": 7740 + }, + { + "epoch": 0.015101710164005935, + "grad_norm": 5.089080810546875, + "learning_rate": 2.9983121579320387e-05, + "loss": 1.9452, + "step": 7755 + }, + { + "epoch": 0.015130920435116199, + "grad_norm": 3.860607862472534, + "learning_rate": 2.9983056234783774e-05, + "loss": 1.9469, + "step": 7770 + }, + { + "epoch": 0.015160130706226462, + "grad_norm": 3.014214277267456, + "learning_rate": 2.998299076407274e-05, + "loss": 2.0488, + "step": 7785 + }, + { + "epoch": 0.015189340977336725, + "grad_norm": 3.1203925609588623, + "learning_rate": 2.998292516718784e-05, + "loss": 1.8334, + "step": 7800 + }, + { + "epoch": 0.015218551248446987, + "grad_norm": 1.6761817932128906, + "learning_rate": 2.998285944412961e-05, + "loss": 2.0263, + "step": 7815 + }, + { + "epoch": 0.01524776151955725, + "grad_norm": 2.3997247219085693, + "learning_rate": 2.9982793594898623e-05, + "loss": 1.9195, + "step": 7830 + }, + { + "epoch": 0.015276971790667513, + "grad_norm": 2.0805671215057373, + "learning_rate": 2.998272761949542e-05, + "loss": 1.8461, + "step": 7845 + }, + { + "epoch": 0.015306182061777775, + "grad_norm": 3.0058650970458984, + "learning_rate": 2.998266151792056e-05, + "loss": 2.0221, + "step": 7860 + }, + { + "epoch": 0.01533539233288804, + "grad_norm": 4.376441955566406, + "learning_rate": 2.99825952901746e-05, + "loss": 1.8942, + "step": 7875 + }, + { + "epoch": 0.015364602603998303, + "grad_norm": 2.509711503982544, + "learning_rate": 2.9982528936258096e-05, + "loss": 1.7979, + "step": 7890 + }, + { + "epoch": 0.015393812875108565, + "grad_norm": 2.763103723526001, + "learning_rate": 2.9982462456171605e-05, + "loss": 1.9112, + "step": 7905 + }, + { + "epoch": 0.015423023146218828, + "grad_norm": 2.519554615020752, + "learning_rate": 2.9982395849915698e-05, + "loss": 1.9375, + "step": 7920 + }, + { + "epoch": 0.01545223341732909, + "grad_norm": 2.679543972015381, + "learning_rate": 2.9982329117490926e-05, + "loss": 1.8226, + "step": 7935 + }, + { + "epoch": 0.015481443688439353, + "grad_norm": 3.619253158569336, + "learning_rate": 2.9982262258897855e-05, + "loss": 1.8639, + "step": 7950 + }, + { + "epoch": 0.015510653959549616, + "grad_norm": 1.7691304683685303, + "learning_rate": 2.9982195274137042e-05, + "loss": 1.7112, + "step": 7965 + }, + { + "epoch": 0.015539864230659879, + "grad_norm": 3.720994234085083, + "learning_rate": 2.9982128163209058e-05, + "loss": 1.8901, + "step": 7980 + }, + { + "epoch": 0.015569074501770143, + "grad_norm": 3.499058723449707, + "learning_rate": 2.9982060926114467e-05, + "loss": 1.9527, + "step": 7995 + }, + { + "epoch": 0.015598284772880406, + "grad_norm": 2.401658773422241, + "learning_rate": 2.9981993562853833e-05, + "loss": 1.9037, + "step": 8010 + }, + { + "epoch": 0.015627495043990667, + "grad_norm": 3.7763659954071045, + "learning_rate": 2.9981926073427724e-05, + "loss": 2.0314, + "step": 8025 + }, + { + "epoch": 0.01565670531510093, + "grad_norm": 3.2521162033081055, + "learning_rate": 2.9981858457836707e-05, + "loss": 1.7982, + "step": 8040 + }, + { + "epoch": 0.015685915586211196, + "grad_norm": 3.2938101291656494, + "learning_rate": 2.9981790716081353e-05, + "loss": 1.9225, + "step": 8055 + }, + { + "epoch": 0.015715125857321457, + "grad_norm": 2.9045004844665527, + "learning_rate": 2.9981722848162233e-05, + "loss": 2.04, + "step": 8070 + }, + { + "epoch": 0.01574433612843172, + "grad_norm": 2.2528388500213623, + "learning_rate": 2.9981654854079918e-05, + "loss": 1.792, + "step": 8085 + }, + { + "epoch": 0.015773546399541982, + "grad_norm": 2.9156363010406494, + "learning_rate": 2.998158673383498e-05, + "loss": 1.9291, + "step": 8100 + }, + { + "epoch": 0.015802756670652247, + "grad_norm": 2.375291347503662, + "learning_rate": 2.9981518487427996e-05, + "loss": 1.8216, + "step": 8115 + }, + { + "epoch": 0.015831966941762508, + "grad_norm": 4.628626823425293, + "learning_rate": 2.9981450114859532e-05, + "loss": 1.7895, + "step": 8130 + }, + { + "epoch": 0.015861177212872772, + "grad_norm": 2.200885534286499, + "learning_rate": 2.9981381616130172e-05, + "loss": 1.9302, + "step": 8145 + }, + { + "epoch": 0.015890387483983033, + "grad_norm": 3.7578535079956055, + "learning_rate": 2.998131299124049e-05, + "loss": 1.9202, + "step": 8160 + }, + { + "epoch": 0.015919597755093298, + "grad_norm": 2.235351800918579, + "learning_rate": 2.9981244240191063e-05, + "loss": 1.9151, + "step": 8175 + }, + { + "epoch": 0.015948808026203562, + "grad_norm": 1.905808925628662, + "learning_rate": 2.9981175362982473e-05, + "loss": 2.0942, + "step": 8190 + }, + { + "epoch": 0.015978018297313823, + "grad_norm": 2.797210454940796, + "learning_rate": 2.99811063596153e-05, + "loss": 2.0168, + "step": 8205 + }, + { + "epoch": 0.016007228568424087, + "grad_norm": 2.7250447273254395, + "learning_rate": 2.9981037230090125e-05, + "loss": 1.8033, + "step": 8220 + }, + { + "epoch": 0.01603643883953435, + "grad_norm": 2.6646409034729004, + "learning_rate": 2.9980967974407525e-05, + "loss": 1.8914, + "step": 8235 + }, + { + "epoch": 0.016065649110644613, + "grad_norm": 4.621725082397461, + "learning_rate": 2.9980898592568086e-05, + "loss": 1.9805, + "step": 8250 + }, + { + "epoch": 0.016094859381754874, + "grad_norm": 7.544195175170898, + "learning_rate": 2.9980829084572393e-05, + "loss": 1.8876, + "step": 8265 + }, + { + "epoch": 0.016124069652865138, + "grad_norm": 3.1491341590881348, + "learning_rate": 2.9980759450421032e-05, + "loss": 1.8096, + "step": 8280 + }, + { + "epoch": 0.016153279923975403, + "grad_norm": 3.049743175506592, + "learning_rate": 2.998068969011459e-05, + "loss": 1.9014, + "step": 8295 + }, + { + "epoch": 0.016182490195085664, + "grad_norm": 3.4776172637939453, + "learning_rate": 2.998061980365365e-05, + "loss": 1.9011, + "step": 8310 + }, + { + "epoch": 0.016211700466195928, + "grad_norm": 3.4769961833953857, + "learning_rate": 2.9980549791038804e-05, + "loss": 2.2367, + "step": 8325 + }, + { + "epoch": 0.01624091073730619, + "grad_norm": 2.252976894378662, + "learning_rate": 2.9980479652270645e-05, + "loss": 1.9017, + "step": 8340 + }, + { + "epoch": 0.016270121008416454, + "grad_norm": 2.3866465091705322, + "learning_rate": 2.998040938734976e-05, + "loss": 1.8849, + "step": 8355 + }, + { + "epoch": 0.016299331279526715, + "grad_norm": 4.388668060302734, + "learning_rate": 2.998033899627674e-05, + "loss": 1.8601, + "step": 8370 + }, + { + "epoch": 0.01632854155063698, + "grad_norm": 3.2678322792053223, + "learning_rate": 2.9980268479052173e-05, + "loss": 1.8799, + "step": 8385 + }, + { + "epoch": 0.01635775182174724, + "grad_norm": 3.3072402477264404, + "learning_rate": 2.9980197835676665e-05, + "loss": 1.8995, + "step": 8400 + }, + { + "epoch": 0.016386962092857504, + "grad_norm": 2.1432266235351562, + "learning_rate": 2.99801270661508e-05, + "loss": 1.9894, + "step": 8415 + }, + { + "epoch": 0.01641617236396777, + "grad_norm": 4.359271049499512, + "learning_rate": 2.998005617047518e-05, + "loss": 1.8252, + "step": 8430 + }, + { + "epoch": 0.01644538263507803, + "grad_norm": 4.683668613433838, + "learning_rate": 2.99799851486504e-05, + "loss": 1.8865, + "step": 8445 + }, + { + "epoch": 0.016474592906188294, + "grad_norm": 2.4480679035186768, + "learning_rate": 2.997991400067706e-05, + "loss": 1.8999, + "step": 8460 + }, + { + "epoch": 0.016503803177298555, + "grad_norm": 4.169287204742432, + "learning_rate": 2.9979842726555753e-05, + "loss": 1.8878, + "step": 8475 + }, + { + "epoch": 0.01653301344840882, + "grad_norm": 2.4626035690307617, + "learning_rate": 2.9979771326287084e-05, + "loss": 2.0727, + "step": 8490 + }, + { + "epoch": 0.01656222371951908, + "grad_norm": 3.7514021396636963, + "learning_rate": 2.9979699799871658e-05, + "loss": 1.9559, + "step": 8505 + }, + { + "epoch": 0.016591433990629345, + "grad_norm": 3.0994958877563477, + "learning_rate": 2.9979628147310068e-05, + "loss": 1.8535, + "step": 8520 + }, + { + "epoch": 0.01662064426173961, + "grad_norm": 5.931899070739746, + "learning_rate": 2.9979556368602924e-05, + "loss": 1.9244, + "step": 8535 + }, + { + "epoch": 0.01664985453284987, + "grad_norm": 2.016737699508667, + "learning_rate": 2.9979484463750833e-05, + "loss": 1.9107, + "step": 8550 + }, + { + "epoch": 0.016679064803960135, + "grad_norm": 2.3796162605285645, + "learning_rate": 2.9979412432754394e-05, + "loss": 2.0904, + "step": 8565 + }, + { + "epoch": 0.016708275075070396, + "grad_norm": 1.870884656906128, + "learning_rate": 2.9979340275614217e-05, + "loss": 1.926, + "step": 8580 + }, + { + "epoch": 0.01673748534618066, + "grad_norm": 2.833564281463623, + "learning_rate": 2.997926799233091e-05, + "loss": 1.871, + "step": 8595 + }, + { + "epoch": 0.01676669561729092, + "grad_norm": 3.7916762828826904, + "learning_rate": 2.9979195582905075e-05, + "loss": 1.8166, + "step": 8610 + }, + { + "epoch": 0.016795905888401186, + "grad_norm": 2.3426475524902344, + "learning_rate": 2.997912304733733e-05, + "loss": 1.8514, + "step": 8625 + }, + { + "epoch": 0.016825116159511447, + "grad_norm": 4.586437225341797, + "learning_rate": 2.9979050385628286e-05, + "loss": 1.8544, + "step": 8640 + }, + { + "epoch": 0.01685432643062171, + "grad_norm": 4.281703472137451, + "learning_rate": 2.997897759777855e-05, + "loss": 1.8978, + "step": 8655 + }, + { + "epoch": 0.016883536701731976, + "grad_norm": 2.1048526763916016, + "learning_rate": 2.9978904683788735e-05, + "loss": 1.9104, + "step": 8670 + }, + { + "epoch": 0.016912746972842237, + "grad_norm": 4.526645660400391, + "learning_rate": 2.9978831643659462e-05, + "loss": 1.9814, + "step": 8685 + }, + { + "epoch": 0.0169419572439525, + "grad_norm": 1.9656120538711548, + "learning_rate": 2.9978758477391334e-05, + "loss": 1.878, + "step": 8700 + }, + { + "epoch": 0.016971167515062762, + "grad_norm": 2.5091657638549805, + "learning_rate": 2.997868518498498e-05, + "loss": 1.8586, + "step": 8715 + }, + { + "epoch": 0.017000377786173027, + "grad_norm": 3.0821568965911865, + "learning_rate": 2.997861176644101e-05, + "loss": 2.1301, + "step": 8730 + }, + { + "epoch": 0.017029588057283288, + "grad_norm": 2.465061902999878, + "learning_rate": 2.997853822176004e-05, + "loss": 1.9435, + "step": 8745 + }, + { + "epoch": 0.017058798328393552, + "grad_norm": 4.7108306884765625, + "learning_rate": 2.9978464550942697e-05, + "loss": 1.8857, + "step": 8760 + }, + { + "epoch": 0.017088008599503816, + "grad_norm": 2.838949203491211, + "learning_rate": 2.9978390753989597e-05, + "loss": 1.9367, + "step": 8775 + }, + { + "epoch": 0.017117218870614077, + "grad_norm": 6.954312324523926, + "learning_rate": 2.9978316830901358e-05, + "loss": 1.6954, + "step": 8790 + }, + { + "epoch": 0.017146429141724342, + "grad_norm": 3.075137138366699, + "learning_rate": 2.997824278167861e-05, + "loss": 1.8787, + "step": 8805 + }, + { + "epoch": 0.017175639412834603, + "grad_norm": 2.7041006088256836, + "learning_rate": 2.9978168606321975e-05, + "loss": 1.8062, + "step": 8820 + }, + { + "epoch": 0.017204849683944867, + "grad_norm": 2.9021966457366943, + "learning_rate": 2.997809430483207e-05, + "loss": 1.9083, + "step": 8835 + }, + { + "epoch": 0.017234059955055128, + "grad_norm": 3.350419521331787, + "learning_rate": 2.9978019877209528e-05, + "loss": 2.0403, + "step": 8850 + }, + { + "epoch": 0.017263270226165393, + "grad_norm": 4.803377628326416, + "learning_rate": 2.9977945323454977e-05, + "loss": 1.8116, + "step": 8865 + }, + { + "epoch": 0.017292480497275657, + "grad_norm": 4.362671375274658, + "learning_rate": 2.997787064356904e-05, + "loss": 1.8089, + "step": 8880 + }, + { + "epoch": 0.017321690768385918, + "grad_norm": 3.1585206985473633, + "learning_rate": 2.9977795837552347e-05, + "loss": 2.1531, + "step": 8895 + }, + { + "epoch": 0.017350901039496183, + "grad_norm": 2.9386544227600098, + "learning_rate": 2.997772090540553e-05, + "loss": 1.8995, + "step": 8910 + }, + { + "epoch": 0.017380111310606444, + "grad_norm": 4.238663673400879, + "learning_rate": 2.9977645847129216e-05, + "loss": 1.9374, + "step": 8925 + }, + { + "epoch": 0.017409321581716708, + "grad_norm": 3.94399356842041, + "learning_rate": 2.9977570662724047e-05, + "loss": 1.9782, + "step": 8940 + }, + { + "epoch": 0.01743853185282697, + "grad_norm": 3.2048075199127197, + "learning_rate": 2.9977495352190643e-05, + "loss": 1.8984, + "step": 8955 + }, + { + "epoch": 0.017467742123937233, + "grad_norm": 2.4807546138763428, + "learning_rate": 2.9977419915529646e-05, + "loss": 1.8613, + "step": 8970 + }, + { + "epoch": 0.017496952395047494, + "grad_norm": 2.518021583557129, + "learning_rate": 2.9977344352741686e-05, + "loss": 2.0321, + "step": 8985 + }, + { + "epoch": 0.01752616266615776, + "grad_norm": 4.450172424316406, + "learning_rate": 2.9977268663827403e-05, + "loss": 1.9419, + "step": 9000 + }, + { + "epoch": 0.017555372937268023, + "grad_norm": 3.8519856929779053, + "learning_rate": 2.9977192848787437e-05, + "loss": 1.9083, + "step": 9015 + }, + { + "epoch": 0.017584583208378284, + "grad_norm": 5.37404727935791, + "learning_rate": 2.9977116907622422e-05, + "loss": 1.9904, + "step": 9030 + }, + { + "epoch": 0.01761379347948855, + "grad_norm": 2.9285478591918945, + "learning_rate": 2.9977040840333e-05, + "loss": 1.7262, + "step": 9045 + }, + { + "epoch": 0.01764300375059881, + "grad_norm": 3.8375890254974365, + "learning_rate": 2.9976964646919814e-05, + "loss": 1.945, + "step": 9060 + }, + { + "epoch": 0.017672214021709074, + "grad_norm": 2.987417459487915, + "learning_rate": 2.9976888327383497e-05, + "loss": 2.1196, + "step": 9075 + }, + { + "epoch": 0.017701424292819335, + "grad_norm": 3.6940131187438965, + "learning_rate": 2.99768118817247e-05, + "loss": 2.002, + "step": 9090 + }, + { + "epoch": 0.0177306345639296, + "grad_norm": 1.8791909217834473, + "learning_rate": 2.997673530994406e-05, + "loss": 1.9082, + "step": 9105 + }, + { + "epoch": 0.017759844835039864, + "grad_norm": 5.242600440979004, + "learning_rate": 2.997665861204223e-05, + "loss": 1.8396, + "step": 9120 + }, + { + "epoch": 0.017789055106150125, + "grad_norm": 3.7789740562438965, + "learning_rate": 2.997658178801985e-05, + "loss": 1.9959, + "step": 9135 + }, + { + "epoch": 0.01781826537726039, + "grad_norm": 2.6715869903564453, + "learning_rate": 2.9976504837877566e-05, + "loss": 1.8346, + "step": 9150 + }, + { + "epoch": 0.01784747564837065, + "grad_norm": 3.229962110519409, + "learning_rate": 2.997642776161603e-05, + "loss": 2.0002, + "step": 9165 + }, + { + "epoch": 0.017876685919480915, + "grad_norm": 2.602320671081543, + "learning_rate": 2.997635055923589e-05, + "loss": 2.0765, + "step": 9180 + }, + { + "epoch": 0.017905896190591176, + "grad_norm": 3.277393102645874, + "learning_rate": 2.9976273230737795e-05, + "loss": 1.8345, + "step": 9195 + }, + { + "epoch": 0.01793510646170144, + "grad_norm": 2.1936373710632324, + "learning_rate": 2.9976195776122397e-05, + "loss": 1.9265, + "step": 9210 + }, + { + "epoch": 0.0179643167328117, + "grad_norm": 4.018658638000488, + "learning_rate": 2.997611819539035e-05, + "loss": 1.9237, + "step": 9225 + }, + { + "epoch": 0.017993527003921966, + "grad_norm": 3.5876455307006836, + "learning_rate": 2.9976040488542304e-05, + "loss": 1.8761, + "step": 9240 + }, + { + "epoch": 0.01802273727503223, + "grad_norm": 2.9479000568389893, + "learning_rate": 2.9975962655578915e-05, + "loss": 1.9062, + "step": 9255 + }, + { + "epoch": 0.01805194754614249, + "grad_norm": 3.185248613357544, + "learning_rate": 2.9975884696500835e-05, + "loss": 1.9958, + "step": 9270 + }, + { + "epoch": 0.018081157817252756, + "grad_norm": 2.5612637996673584, + "learning_rate": 2.9975806611308725e-05, + "loss": 2.0438, + "step": 9285 + }, + { + "epoch": 0.018110368088363017, + "grad_norm": 3.9929094314575195, + "learning_rate": 2.9975728400003244e-05, + "loss": 1.8404, + "step": 9300 + }, + { + "epoch": 0.01813957835947328, + "grad_norm": 2.15783953666687, + "learning_rate": 2.9975650062585043e-05, + "loss": 1.8677, + "step": 9315 + }, + { + "epoch": 0.018168788630583542, + "grad_norm": 3.7899887561798096, + "learning_rate": 2.997557159905479e-05, + "loss": 1.9269, + "step": 9330 + }, + { + "epoch": 0.018197998901693806, + "grad_norm": 4.0455145835876465, + "learning_rate": 2.9975493009413144e-05, + "loss": 1.8552, + "step": 9345 + }, + { + "epoch": 0.01822720917280407, + "grad_norm": 2.837963581085205, + "learning_rate": 2.9975414293660766e-05, + "loss": 2.1462, + "step": 9360 + }, + { + "epoch": 0.018256419443914332, + "grad_norm": 3.8562536239624023, + "learning_rate": 2.9975335451798317e-05, + "loss": 1.8625, + "step": 9375 + }, + { + "epoch": 0.018285629715024596, + "grad_norm": 2.4235100746154785, + "learning_rate": 2.9975256483826453e-05, + "loss": 1.8512, + "step": 9390 + }, + { + "epoch": 0.018314839986134857, + "grad_norm": 5.756357192993164, + "learning_rate": 2.997517738974586e-05, + "loss": 1.84, + "step": 9405 + }, + { + "epoch": 0.01834405025724512, + "grad_norm": 2.3832759857177734, + "learning_rate": 2.9975098169557187e-05, + "loss": 1.8721, + "step": 9420 + }, + { + "epoch": 0.018373260528355383, + "grad_norm": 2.0849883556365967, + "learning_rate": 2.9975018823261106e-05, + "loss": 1.914, + "step": 9435 + }, + { + "epoch": 0.018402470799465647, + "grad_norm": 2.3563778400421143, + "learning_rate": 2.997493935085829e-05, + "loss": 1.7283, + "step": 9450 + }, + { + "epoch": 0.018431681070575908, + "grad_norm": 2.018721580505371, + "learning_rate": 2.9974859752349396e-05, + "loss": 1.7748, + "step": 9465 + }, + { + "epoch": 0.018460891341686173, + "grad_norm": 2.725719451904297, + "learning_rate": 2.9974780027735103e-05, + "loss": 1.9287, + "step": 9480 + }, + { + "epoch": 0.018490101612796437, + "grad_norm": 3.2846550941467285, + "learning_rate": 2.9974700177016082e-05, + "loss": 1.8214, + "step": 9495 + }, + { + "epoch": 0.018519311883906698, + "grad_norm": 4.153242111206055, + "learning_rate": 2.997462020019301e-05, + "loss": 1.9377, + "step": 9510 + }, + { + "epoch": 0.018548522155016962, + "grad_norm": 2.386509656906128, + "learning_rate": 2.997454009726655e-05, + "loss": 1.7513, + "step": 9525 + }, + { + "epoch": 0.018577732426127223, + "grad_norm": 3.9288957118988037, + "learning_rate": 2.9974459868237384e-05, + "loss": 1.8623, + "step": 9540 + }, + { + "epoch": 0.018606942697237488, + "grad_norm": 2.2002370357513428, + "learning_rate": 2.9974379513106184e-05, + "loss": 2.0153, + "step": 9555 + }, + { + "epoch": 0.01863615296834775, + "grad_norm": 2.4271867275238037, + "learning_rate": 2.9974299031873625e-05, + "loss": 1.8772, + "step": 9570 + }, + { + "epoch": 0.018665363239458013, + "grad_norm": 2.355729341506958, + "learning_rate": 2.9974218424540395e-05, + "loss": 1.9947, + "step": 9585 + }, + { + "epoch": 0.018694573510568278, + "grad_norm": 4.210724830627441, + "learning_rate": 2.9974137691107164e-05, + "loss": 1.8113, + "step": 9600 + }, + { + "epoch": 0.01872378378167854, + "grad_norm": 3.092832088470459, + "learning_rate": 2.997405683157461e-05, + "loss": 1.8457, + "step": 9615 + }, + { + "epoch": 0.018752994052788803, + "grad_norm": 3.154505491256714, + "learning_rate": 2.997397584594342e-05, + "loss": 1.9223, + "step": 9630 + }, + { + "epoch": 0.018782204323899064, + "grad_norm": 2.661449909210205, + "learning_rate": 2.997389473421427e-05, + "loss": 1.8362, + "step": 9645 + }, + { + "epoch": 0.01881141459500933, + "grad_norm": 4.499241352081299, + "learning_rate": 2.997381349638785e-05, + "loss": 1.974, + "step": 9660 + }, + { + "epoch": 0.01884062486611959, + "grad_norm": 8.712298393249512, + "learning_rate": 2.9973732132464838e-05, + "loss": 1.793, + "step": 9675 + }, + { + "epoch": 0.018869835137229854, + "grad_norm": 3.771261215209961, + "learning_rate": 2.9973650642445926e-05, + "loss": 1.8158, + "step": 9690 + }, + { + "epoch": 0.018899045408340115, + "grad_norm": 2.3334624767303467, + "learning_rate": 2.997356902633179e-05, + "loss": 1.9791, + "step": 9705 + }, + { + "epoch": 0.01892825567945038, + "grad_norm": 3.4070332050323486, + "learning_rate": 2.997348728412313e-05, + "loss": 1.8621, + "step": 9720 + }, + { + "epoch": 0.018957465950560644, + "grad_norm": 2.4055604934692383, + "learning_rate": 2.997340541582062e-05, + "loss": 1.8424, + "step": 9735 + }, + { + "epoch": 0.018986676221670905, + "grad_norm": 3.209122896194458, + "learning_rate": 2.9973323421424962e-05, + "loss": 1.8791, + "step": 9750 + }, + { + "epoch": 0.01901588649278117, + "grad_norm": 2.720518112182617, + "learning_rate": 2.9973241300936842e-05, + "loss": 1.9752, + "step": 9765 + }, + { + "epoch": 0.01904509676389143, + "grad_norm": 1.9057116508483887, + "learning_rate": 2.9973159054356948e-05, + "loss": 1.9658, + "step": 9780 + }, + { + "epoch": 0.019074307035001695, + "grad_norm": 3.4243197441101074, + "learning_rate": 2.9973076681685977e-05, + "loss": 1.9394, + "step": 9795 + }, + { + "epoch": 0.019103517306111956, + "grad_norm": 3.557957410812378, + "learning_rate": 2.997299418292462e-05, + "loss": 1.8854, + "step": 9810 + }, + { + "epoch": 0.01913272757722222, + "grad_norm": 2.576314926147461, + "learning_rate": 2.9972911558073575e-05, + "loss": 1.8975, + "step": 9825 + }, + { + "epoch": 0.019161937848332485, + "grad_norm": 2.4958183765411377, + "learning_rate": 2.9972828807133537e-05, + "loss": 1.765, + "step": 9840 + }, + { + "epoch": 0.019191148119442746, + "grad_norm": 3.1889865398406982, + "learning_rate": 2.99727459301052e-05, + "loss": 1.9785, + "step": 9855 + }, + { + "epoch": 0.01922035839055301, + "grad_norm": 4.607937335968018, + "learning_rate": 2.9972662926989267e-05, + "loss": 1.7931, + "step": 9870 + }, + { + "epoch": 0.01924956866166327, + "grad_norm": 3.380537271499634, + "learning_rate": 2.997257979778643e-05, + "loss": 1.8366, + "step": 9885 + }, + { + "epoch": 0.019278778932773535, + "grad_norm": 1.7773466110229492, + "learning_rate": 2.9972496542497393e-05, + "loss": 1.841, + "step": 9900 + }, + { + "epoch": 0.019307989203883796, + "grad_norm": 3.43685245513916, + "learning_rate": 2.9972413161122858e-05, + "loss": 1.8255, + "step": 9915 + }, + { + "epoch": 0.01933719947499406, + "grad_norm": 4.238219261169434, + "learning_rate": 2.9972329653663525e-05, + "loss": 1.8403, + "step": 9930 + }, + { + "epoch": 0.019366409746104322, + "grad_norm": 3.8438355922698975, + "learning_rate": 2.99722460201201e-05, + "loss": 1.8005, + "step": 9945 + }, + { + "epoch": 0.019395620017214586, + "grad_norm": 4.286600589752197, + "learning_rate": 2.997216226049328e-05, + "loss": 1.8684, + "step": 9960 + }, + { + "epoch": 0.01942483028832485, + "grad_norm": 1.7342430353164673, + "learning_rate": 2.997207837478378e-05, + "loss": 1.9657, + "step": 9975 + }, + { + "epoch": 0.01945404055943511, + "grad_norm": 3.3315911293029785, + "learning_rate": 2.9971994362992304e-05, + "loss": 1.9156, + "step": 9990 + }, + { + "epoch": 0.019483250830545376, + "grad_norm": 4.346848011016846, + "learning_rate": 2.9971910225119556e-05, + "loss": 1.9114, + "step": 10005 + }, + { + "epoch": 0.019512461101655637, + "grad_norm": 2.8670308589935303, + "learning_rate": 2.9971825961166248e-05, + "loss": 1.8471, + "step": 10020 + }, + { + "epoch": 0.0195416713727659, + "grad_norm": 2.1465935707092285, + "learning_rate": 2.9971741571133085e-05, + "loss": 1.8608, + "step": 10035 + }, + { + "epoch": 0.019570881643876162, + "grad_norm": 3.3292319774627686, + "learning_rate": 2.9971657055020782e-05, + "loss": 1.877, + "step": 10050 + }, + { + "epoch": 0.019600091914986427, + "grad_norm": 2.3058934211730957, + "learning_rate": 2.9971572412830045e-05, + "loss": 1.9125, + "step": 10065 + }, + { + "epoch": 0.01962930218609669, + "grad_norm": 4.684175968170166, + "learning_rate": 2.9971487644561597e-05, + "loss": 1.9237, + "step": 10080 + }, + { + "epoch": 0.019658512457206952, + "grad_norm": 3.6633639335632324, + "learning_rate": 2.9971402750216144e-05, + "loss": 1.856, + "step": 10095 + }, + { + "epoch": 0.019687722728317217, + "grad_norm": 3.837944269180298, + "learning_rate": 2.9971317729794404e-05, + "loss": 2.0146, + "step": 10110 + }, + { + "epoch": 0.019716932999427478, + "grad_norm": 4.053643226623535, + "learning_rate": 2.997123258329709e-05, + "loss": 1.9125, + "step": 10125 + }, + { + "epoch": 0.019746143270537742, + "grad_norm": 1.9680250883102417, + "learning_rate": 2.9971147310724923e-05, + "loss": 1.9431, + "step": 10140 + }, + { + "epoch": 0.019775353541648003, + "grad_norm": 1.941953420639038, + "learning_rate": 2.9971061912078615e-05, + "loss": 1.8638, + "step": 10155 + }, + { + "epoch": 0.019804563812758268, + "grad_norm": 3.1855714321136475, + "learning_rate": 2.997097638735889e-05, + "loss": 1.6231, + "step": 10170 + }, + { + "epoch": 0.019833774083868532, + "grad_norm": 4.022531509399414, + "learning_rate": 2.997089073656647e-05, + "loss": 1.8637, + "step": 10185 + }, + { + "epoch": 0.019862984354978793, + "grad_norm": 4.320540904998779, + "learning_rate": 2.997080495970207e-05, + "loss": 1.8383, + "step": 10200 + }, + { + "epoch": 0.019892194626089058, + "grad_norm": 3.0604958534240723, + "learning_rate": 2.997071905676642e-05, + "loss": 1.8843, + "step": 10215 + }, + { + "epoch": 0.01992140489719932, + "grad_norm": 3.2216265201568604, + "learning_rate": 2.9970633027760235e-05, + "loss": 2.004, + "step": 10230 + }, + { + "epoch": 0.019950615168309583, + "grad_norm": 2.4354753494262695, + "learning_rate": 2.997054687268425e-05, + "loss": 1.9255, + "step": 10245 + }, + { + "epoch": 0.019979825439419844, + "grad_norm": 2.3556060791015625, + "learning_rate": 2.9970460591539175e-05, + "loss": 2.0436, + "step": 10260 + }, + { + "epoch": 0.02000903571053011, + "grad_norm": 3.6977896690368652, + "learning_rate": 2.9970374184325753e-05, + "loss": 1.9181, + "step": 10275 + }, + { + "epoch": 0.02003824598164037, + "grad_norm": 1.909177541732788, + "learning_rate": 2.99702876510447e-05, + "loss": 1.7653, + "step": 10290 + }, + { + "epoch": 0.020067456252750634, + "grad_norm": 2.486943006515503, + "learning_rate": 2.997020099169675e-05, + "loss": 1.9137, + "step": 10305 + }, + { + "epoch": 0.020096666523860898, + "grad_norm": 3.0627150535583496, + "learning_rate": 2.9970114206282634e-05, + "loss": 1.8442, + "step": 10320 + }, + { + "epoch": 0.02012587679497116, + "grad_norm": 2.7819271087646484, + "learning_rate": 2.997002729480308e-05, + "loss": 2.1325, + "step": 10335 + }, + { + "epoch": 0.020155087066081424, + "grad_norm": 2.097712755203247, + "learning_rate": 2.9969940257258823e-05, + "loss": 1.8952, + "step": 10350 + }, + { + "epoch": 0.020184297337191685, + "grad_norm": 2.317915439605713, + "learning_rate": 2.9969853093650592e-05, + "loss": 1.8896, + "step": 10365 + }, + { + "epoch": 0.02021350760830195, + "grad_norm": 3.3799221515655518, + "learning_rate": 2.996976580397912e-05, + "loss": 1.9032, + "step": 10380 + }, + { + "epoch": 0.02024271787941221, + "grad_norm": 4.226128578186035, + "learning_rate": 2.996967838824515e-05, + "loss": 1.9812, + "step": 10395 + }, + { + "epoch": 0.020271928150522475, + "grad_norm": 2.872182607650757, + "learning_rate": 2.996959084644941e-05, + "loss": 1.7335, + "step": 10410 + }, + { + "epoch": 0.02030113842163274, + "grad_norm": 1.931477427482605, + "learning_rate": 2.9969503178592638e-05, + "loss": 1.8178, + "step": 10425 + }, + { + "epoch": 0.020330348692743, + "grad_norm": 1.9570348262786865, + "learning_rate": 2.9969415384675577e-05, + "loss": 1.9652, + "step": 10440 + }, + { + "epoch": 0.020359558963853264, + "grad_norm": 2.1548566818237305, + "learning_rate": 2.996932746469896e-05, + "loss": 1.9105, + "step": 10455 + }, + { + "epoch": 0.020388769234963525, + "grad_norm": 2.138561248779297, + "learning_rate": 2.9969239418663538e-05, + "loss": 2.2595, + "step": 10470 + }, + { + "epoch": 0.02041797950607379, + "grad_norm": 3.682020425796509, + "learning_rate": 2.9969151246570038e-05, + "loss": 1.8828, + "step": 10485 + }, + { + "epoch": 0.02044718977718405, + "grad_norm": 1.9086365699768066, + "learning_rate": 2.9969062948419213e-05, + "loss": 1.8974, + "step": 10500 + }, + { + "epoch": 0.020476400048294315, + "grad_norm": 3.4526236057281494, + "learning_rate": 2.9968974524211807e-05, + "loss": 1.8972, + "step": 10515 + }, + { + "epoch": 0.020505610319404576, + "grad_norm": 5.137340068817139, + "learning_rate": 2.996888597394856e-05, + "loss": 1.7885, + "step": 10530 + }, + { + "epoch": 0.02053482059051484, + "grad_norm": 2.410789966583252, + "learning_rate": 2.9968797297630215e-05, + "loss": 1.9339, + "step": 10545 + }, + { + "epoch": 0.020564030861625105, + "grad_norm": 2.756986618041992, + "learning_rate": 2.9968708495257527e-05, + "loss": 1.9682, + "step": 10560 + }, + { + "epoch": 0.020593241132735366, + "grad_norm": 4.116410255432129, + "learning_rate": 2.9968619566831238e-05, + "loss": 1.8814, + "step": 10575 + }, + { + "epoch": 0.02062245140384563, + "grad_norm": 3.892730712890625, + "learning_rate": 2.9968530512352098e-05, + "loss": 1.9744, + "step": 10590 + }, + { + "epoch": 0.02065166167495589, + "grad_norm": 3.5503933429718018, + "learning_rate": 2.9968441331820856e-05, + "loss": 2.0472, + "step": 10605 + }, + { + "epoch": 0.020680871946066156, + "grad_norm": 4.085498809814453, + "learning_rate": 2.9968352025238263e-05, + "loss": 1.9428, + "step": 10620 + }, + { + "epoch": 0.020710082217176417, + "grad_norm": 3.806868553161621, + "learning_rate": 2.996826259260508e-05, + "loss": 1.8212, + "step": 10635 + }, + { + "epoch": 0.02073929248828668, + "grad_norm": 2.24172306060791, + "learning_rate": 2.9968173033922045e-05, + "loss": 1.7338, + "step": 10650 + }, + { + "epoch": 0.020768502759396946, + "grad_norm": 2.516962766647339, + "learning_rate": 2.996808334918992e-05, + "loss": 1.9185, + "step": 10665 + }, + { + "epoch": 0.020797713030507207, + "grad_norm": 3.8772926330566406, + "learning_rate": 2.9967993538409465e-05, + "loss": 1.8053, + "step": 10680 + }, + { + "epoch": 0.02082692330161747, + "grad_norm": 5.146918296813965, + "learning_rate": 2.9967903601581427e-05, + "loss": 1.8488, + "step": 10695 + }, + { + "epoch": 0.020856133572727732, + "grad_norm": 6.216543197631836, + "learning_rate": 2.9967813538706568e-05, + "loss": 1.9446, + "step": 10710 + }, + { + "epoch": 0.020885343843837997, + "grad_norm": 3.0393502712249756, + "learning_rate": 2.9967723349785648e-05, + "loss": 1.8881, + "step": 10725 + }, + { + "epoch": 0.020914554114948258, + "grad_norm": 2.7038638591766357, + "learning_rate": 2.996763303481942e-05, + "loss": 1.9454, + "step": 10740 + }, + { + "epoch": 0.020943764386058522, + "grad_norm": 2.4057178497314453, + "learning_rate": 2.9967542593808655e-05, + "loss": 2.0256, + "step": 10755 + }, + { + "epoch": 0.020972974657168783, + "grad_norm": 2.2479588985443115, + "learning_rate": 2.9967452026754104e-05, + "loss": 1.8835, + "step": 10770 + }, + { + "epoch": 0.021002184928279048, + "grad_norm": 2.2106525897979736, + "learning_rate": 2.996736133365654e-05, + "loss": 1.8594, + "step": 10785 + }, + { + "epoch": 0.021031395199389312, + "grad_norm": 2.273165225982666, + "learning_rate": 2.9967270514516718e-05, + "loss": 1.772, + "step": 10800 + }, + { + "epoch": 0.021060605470499573, + "grad_norm": 4.13364315032959, + "learning_rate": 2.9967179569335407e-05, + "loss": 1.8364, + "step": 10815 + }, + { + "epoch": 0.021089815741609837, + "grad_norm": 4.664285182952881, + "learning_rate": 2.9967088498113368e-05, + "loss": 1.7126, + "step": 10830 + }, + { + "epoch": 0.0211190260127201, + "grad_norm": 3.9353389739990234, + "learning_rate": 2.9966997300851376e-05, + "loss": 1.8366, + "step": 10845 + }, + { + "epoch": 0.021148236283830363, + "grad_norm": 3.433561086654663, + "learning_rate": 2.996690597755019e-05, + "loss": 1.7921, + "step": 10860 + }, + { + "epoch": 0.021177446554940624, + "grad_norm": 2.0325567722320557, + "learning_rate": 2.996681452821059e-05, + "loss": 1.88, + "step": 10875 + }, + { + "epoch": 0.021206656826050888, + "grad_norm": 3.6523592472076416, + "learning_rate": 2.9966722952833335e-05, + "loss": 1.8464, + "step": 10890 + }, + { + "epoch": 0.021235867097161153, + "grad_norm": 4.08983039855957, + "learning_rate": 2.99666312514192e-05, + "loss": 2.0643, + "step": 10905 + }, + { + "epoch": 0.021265077368271414, + "grad_norm": 4.714212894439697, + "learning_rate": 2.9966539423968964e-05, + "loss": 1.9458, + "step": 10920 + }, + { + "epoch": 0.021294287639381678, + "grad_norm": 3.2425897121429443, + "learning_rate": 2.996644747048339e-05, + "loss": 1.9802, + "step": 10935 + }, + { + "epoch": 0.02132349791049194, + "grad_norm": 5.025219440460205, + "learning_rate": 2.9966355390963258e-05, + "loss": 1.8679, + "step": 10950 + }, + { + "epoch": 0.021352708181602204, + "grad_norm": 3.669241428375244, + "learning_rate": 2.9966263185409343e-05, + "loss": 1.9634, + "step": 10965 + }, + { + "epoch": 0.021381918452712464, + "grad_norm": 3.5736284255981445, + "learning_rate": 2.996617085382242e-05, + "loss": 1.9348, + "step": 10980 + }, + { + "epoch": 0.02141112872382273, + "grad_norm": 2.8263094425201416, + "learning_rate": 2.996607839620327e-05, + "loss": 1.7897, + "step": 10995 + }, + { + "epoch": 0.02144033899493299, + "grad_norm": 4.033946990966797, + "learning_rate": 2.996598581255267e-05, + "loss": 1.9587, + "step": 11010 + }, + { + "epoch": 0.021469549266043254, + "grad_norm": 3.7127420902252197, + "learning_rate": 2.996589310287139e-05, + "loss": 1.874, + "step": 11025 + }, + { + "epoch": 0.02149875953715352, + "grad_norm": 4.083348751068115, + "learning_rate": 2.9965800267160223e-05, + "loss": 1.7831, + "step": 11040 + }, + { + "epoch": 0.02152796980826378, + "grad_norm": 4.666345596313477, + "learning_rate": 2.996570730541995e-05, + "loss": 1.8516, + "step": 11055 + }, + { + "epoch": 0.021557180079374044, + "grad_norm": 2.3497631549835205, + "learning_rate": 2.996561421765135e-05, + "loss": 1.9213, + "step": 11070 + }, + { + "epoch": 0.021586390350484305, + "grad_norm": 4.355146408081055, + "learning_rate": 2.996552100385521e-05, + "loss": 1.8072, + "step": 11085 + }, + { + "epoch": 0.02161560062159457, + "grad_norm": 3.6673879623413086, + "learning_rate": 2.996542766403231e-05, + "loss": 1.8541, + "step": 11100 + }, + { + "epoch": 0.02164481089270483, + "grad_norm": 2.2446086406707764, + "learning_rate": 2.996533419818344e-05, + "loss": 1.9764, + "step": 11115 + }, + { + "epoch": 0.021674021163815095, + "grad_norm": 3.8175010681152344, + "learning_rate": 2.996524060630938e-05, + "loss": 1.9664, + "step": 11130 + }, + { + "epoch": 0.02170323143492536, + "grad_norm": 4.360842704772949, + "learning_rate": 2.996514688841093e-05, + "loss": 1.9145, + "step": 11145 + }, + { + "epoch": 0.02173244170603562, + "grad_norm": 3.110860586166382, + "learning_rate": 2.996505304448887e-05, + "loss": 1.8939, + "step": 11160 + }, + { + "epoch": 0.021761651977145885, + "grad_norm": 2.5407495498657227, + "learning_rate": 2.9964959074544e-05, + "loss": 1.8649, + "step": 11175 + }, + { + "epoch": 0.021790862248256146, + "grad_norm": 2.933225393295288, + "learning_rate": 2.9964864978577103e-05, + "loss": 1.8987, + "step": 11190 + }, + { + "epoch": 0.02182007251936641, + "grad_norm": 3.003664493560791, + "learning_rate": 2.996477075658897e-05, + "loss": 1.936, + "step": 11205 + }, + { + "epoch": 0.02184928279047667, + "grad_norm": 3.119703531265259, + "learning_rate": 2.99646764085804e-05, + "loss": 1.7183, + "step": 11220 + }, + { + "epoch": 0.021878493061586936, + "grad_norm": 2.6627697944641113, + "learning_rate": 2.9964581934552182e-05, + "loss": 1.9218, + "step": 11235 + }, + { + "epoch": 0.0219077033326972, + "grad_norm": 4.310539245605469, + "learning_rate": 2.9964487334505114e-05, + "loss": 2.0666, + "step": 11250 + }, + { + "epoch": 0.02193691360380746, + "grad_norm": 2.611443519592285, + "learning_rate": 2.9964392608439997e-05, + "loss": 1.8154, + "step": 11265 + }, + { + "epoch": 0.021966123874917726, + "grad_norm": 3.391406774520874, + "learning_rate": 2.996429775635763e-05, + "loss": 1.9308, + "step": 11280 + }, + { + "epoch": 0.021995334146027987, + "grad_norm": 3.1492297649383545, + "learning_rate": 2.9964202778258797e-05, + "loss": 1.939, + "step": 11295 + }, + { + "epoch": 0.02202454441713825, + "grad_norm": 3.680859088897705, + "learning_rate": 2.9964107674144313e-05, + "loss": 1.9048, + "step": 11310 + }, + { + "epoch": 0.022053754688248512, + "grad_norm": 2.0522656440734863, + "learning_rate": 2.9964012444014972e-05, + "loss": 1.9477, + "step": 11325 + }, + { + "epoch": 0.022082964959358777, + "grad_norm": 3.265316963195801, + "learning_rate": 2.996391708787158e-05, + "loss": 1.8116, + "step": 11340 + }, + { + "epoch": 0.022112175230469037, + "grad_norm": 2.0570802688598633, + "learning_rate": 2.9963821605714934e-05, + "loss": 1.8493, + "step": 11355 + }, + { + "epoch": 0.022141385501579302, + "grad_norm": 4.1805877685546875, + "learning_rate": 2.9963725997545844e-05, + "loss": 1.8909, + "step": 11370 + }, + { + "epoch": 0.022170595772689566, + "grad_norm": 2.6846070289611816, + "learning_rate": 2.9963630263365116e-05, + "loss": 1.8424, + "step": 11385 + }, + { + "epoch": 0.022199806043799827, + "grad_norm": 4.480174541473389, + "learning_rate": 2.996353440317355e-05, + "loss": 1.919, + "step": 11400 + }, + { + "epoch": 0.022229016314910092, + "grad_norm": 2.179137706756592, + "learning_rate": 2.996343841697195e-05, + "loss": 1.8023, + "step": 11415 + }, + { + "epoch": 0.022258226586020353, + "grad_norm": 4.341340065002441, + "learning_rate": 2.996334230476114e-05, + "loss": 1.8263, + "step": 11430 + }, + { + "epoch": 0.022287436857130617, + "grad_norm": 1.6666501760482788, + "learning_rate": 2.9963246066541913e-05, + "loss": 1.8854, + "step": 11445 + }, + { + "epoch": 0.022316647128240878, + "grad_norm": 2.9793460369110107, + "learning_rate": 2.9963149702315093e-05, + "loss": 1.9214, + "step": 11460 + }, + { + "epoch": 0.022345857399351143, + "grad_norm": 4.338296413421631, + "learning_rate": 2.996305321208148e-05, + "loss": 1.9517, + "step": 11475 + }, + { + "epoch": 0.022375067670461407, + "grad_norm": 4.449549674987793, + "learning_rate": 2.99629565958419e-05, + "loss": 2.0761, + "step": 11490 + }, + { + "epoch": 0.022404277941571668, + "grad_norm": 2.6661598682403564, + "learning_rate": 2.9962859853597146e-05, + "loss": 1.9029, + "step": 11505 + }, + { + "epoch": 0.022433488212681933, + "grad_norm": 2.5066778659820557, + "learning_rate": 2.996276298534805e-05, + "loss": 1.7934, + "step": 11520 + }, + { + "epoch": 0.022462698483792193, + "grad_norm": 2.2089765071868896, + "learning_rate": 2.9962665991095424e-05, + "loss": 2.0755, + "step": 11535 + }, + { + "epoch": 0.022491908754902458, + "grad_norm": 3.543388605117798, + "learning_rate": 2.9962568870840078e-05, + "loss": 1.8682, + "step": 11550 + }, + { + "epoch": 0.02252111902601272, + "grad_norm": 3.1596784591674805, + "learning_rate": 2.9962471624582838e-05, + "loss": 2.0225, + "step": 11565 + }, + { + "epoch": 0.022550329297122983, + "grad_norm": 3.338447332382202, + "learning_rate": 2.9962374252324524e-05, + "loss": 1.7248, + "step": 11580 + }, + { + "epoch": 0.022579539568233244, + "grad_norm": 2.6090617179870605, + "learning_rate": 2.996227675406595e-05, + "loss": 2.1125, + "step": 11595 + }, + { + "epoch": 0.02260874983934351, + "grad_norm": 3.618283271789551, + "learning_rate": 2.9962179129807936e-05, + "loss": 2.0012, + "step": 11610 + }, + { + "epoch": 0.022637960110453773, + "grad_norm": 2.161893129348755, + "learning_rate": 2.996208137955131e-05, + "loss": 1.8541, + "step": 11625 + }, + { + "epoch": 0.022667170381564034, + "grad_norm": 4.789167881011963, + "learning_rate": 2.996198350329689e-05, + "loss": 1.7935, + "step": 11640 + }, + { + "epoch": 0.0226963806526743, + "grad_norm": 2.7407031059265137, + "learning_rate": 2.9961885501045505e-05, + "loss": 1.8944, + "step": 11655 + }, + { + "epoch": 0.02272559092378456, + "grad_norm": 2.0403008460998535, + "learning_rate": 2.9961787372797977e-05, + "loss": 1.9245, + "step": 11670 + }, + { + "epoch": 0.022754801194894824, + "grad_norm": 2.2842493057250977, + "learning_rate": 2.996168911855513e-05, + "loss": 1.8867, + "step": 11685 + }, + { + "epoch": 0.022784011466005085, + "grad_norm": 3.4035165309906006, + "learning_rate": 2.99615907383178e-05, + "loss": 1.8029, + "step": 11700 + }, + { + "epoch": 0.02281322173711535, + "grad_norm": 4.04712438583374, + "learning_rate": 2.996149223208681e-05, + "loss": 1.8933, + "step": 11715 + }, + { + "epoch": 0.022842432008225614, + "grad_norm": 3.2550394535064697, + "learning_rate": 2.996139359986299e-05, + "loss": 1.8503, + "step": 11730 + }, + { + "epoch": 0.022871642279335875, + "grad_norm": 2.3266172409057617, + "learning_rate": 2.9961294841647164e-05, + "loss": 1.939, + "step": 11745 + }, + { + "epoch": 0.02290085255044614, + "grad_norm": 1.7769925594329834, + "learning_rate": 2.9961195957440172e-05, + "loss": 2.0939, + "step": 11760 + }, + { + "epoch": 0.0229300628215564, + "grad_norm": 3.4839985370635986, + "learning_rate": 2.9961096947242846e-05, + "loss": 1.8933, + "step": 11775 + }, + { + "epoch": 0.022959273092666665, + "grad_norm": 4.051612377166748, + "learning_rate": 2.9960997811056017e-05, + "loss": 1.8464, + "step": 11790 + }, + { + "epoch": 0.022988483363776926, + "grad_norm": 5.2628703117370605, + "learning_rate": 2.9960898548880525e-05, + "loss": 1.9296, + "step": 11805 + }, + { + "epoch": 0.02301769363488719, + "grad_norm": 4.6693434715271, + "learning_rate": 2.99607991607172e-05, + "loss": 2.0126, + "step": 11820 + }, + { + "epoch": 0.02304690390599745, + "grad_norm": 2.2805821895599365, + "learning_rate": 2.996069964656688e-05, + "loss": 1.7084, + "step": 11835 + }, + { + "epoch": 0.023076114177107716, + "grad_norm": 5.130448818206787, + "learning_rate": 2.996060000643041e-05, + "loss": 1.8437, + "step": 11850 + }, + { + "epoch": 0.02310532444821798, + "grad_norm": 3.6057186126708984, + "learning_rate": 2.9960500240308616e-05, + "loss": 2.0847, + "step": 11865 + }, + { + "epoch": 0.02313453471932824, + "grad_norm": 3.6711442470550537, + "learning_rate": 2.9960400348202348e-05, + "loss": 1.9817, + "step": 11880 + }, + { + "epoch": 0.023163744990438506, + "grad_norm": 2.514784097671509, + "learning_rate": 2.9960300330112445e-05, + "loss": 1.7633, + "step": 11895 + }, + { + "epoch": 0.023192955261548766, + "grad_norm": 2.1372082233428955, + "learning_rate": 2.996020018603975e-05, + "loss": 1.7941, + "step": 11910 + }, + { + "epoch": 0.02322216553265903, + "grad_norm": 2.8542985916137695, + "learning_rate": 2.9960099915985104e-05, + "loss": 1.7778, + "step": 11925 + }, + { + "epoch": 0.023251375803769292, + "grad_norm": 2.1213855743408203, + "learning_rate": 2.9959999519949354e-05, + "loss": 1.8966, + "step": 11940 + }, + { + "epoch": 0.023280586074879556, + "grad_norm": 2.751647710800171, + "learning_rate": 2.995989899793334e-05, + "loss": 1.7919, + "step": 11955 + }, + { + "epoch": 0.02330979634598982, + "grad_norm": 2.2502388954162598, + "learning_rate": 2.9959798349937915e-05, + "loss": 1.8062, + "step": 11970 + }, + { + "epoch": 0.023339006617100082, + "grad_norm": 2.5316834449768066, + "learning_rate": 2.995969757596392e-05, + "loss": 1.9685, + "step": 11985 + }, + { + "epoch": 0.023368216888210346, + "grad_norm": 2.8897788524627686, + "learning_rate": 2.995959667601221e-05, + "loss": 1.9659, + "step": 12000 + }, + { + "epoch": 0.023397427159320607, + "grad_norm": 3.8147826194763184, + "learning_rate": 2.9959495650083634e-05, + "loss": 1.7452, + "step": 12015 + }, + { + "epoch": 0.02342663743043087, + "grad_norm": 1.9649070501327515, + "learning_rate": 2.9959394498179043e-05, + "loss": 1.7987, + "step": 12030 + }, + { + "epoch": 0.023455847701541133, + "grad_norm": 2.2376818656921387, + "learning_rate": 2.9959293220299287e-05, + "loss": 1.87, + "step": 12045 + }, + { + "epoch": 0.023485057972651397, + "grad_norm": 2.3533129692077637, + "learning_rate": 2.9959191816445217e-05, + "loss": 2.006, + "step": 12060 + }, + { + "epoch": 0.023514268243761658, + "grad_norm": 2.5106008052825928, + "learning_rate": 2.9959090286617686e-05, + "loss": 1.8053, + "step": 12075 + }, + { + "epoch": 0.023543478514871923, + "grad_norm": 2.843824863433838, + "learning_rate": 2.9958988630817555e-05, + "loss": 2.0062, + "step": 12090 + }, + { + "epoch": 0.023572688785982187, + "grad_norm": 3.826493501663208, + "learning_rate": 2.9958886849045678e-05, + "loss": 1.8213, + "step": 12105 + }, + { + "epoch": 0.023601899057092448, + "grad_norm": 2.669509172439575, + "learning_rate": 2.9958784941302908e-05, + "loss": 2.0031, + "step": 12120 + }, + { + "epoch": 0.023631109328202712, + "grad_norm": 5.227283477783203, + "learning_rate": 2.995868290759011e-05, + "loss": 1.9724, + "step": 12135 + }, + { + "epoch": 0.023660319599312973, + "grad_norm": 2.67545485496521, + "learning_rate": 2.9958580747908134e-05, + "loss": 1.7393, + "step": 12150 + }, + { + "epoch": 0.023689529870423238, + "grad_norm": 4.294065952301025, + "learning_rate": 2.9958478462257847e-05, + "loss": 2.0262, + "step": 12165 + }, + { + "epoch": 0.0237187401415335, + "grad_norm": 4.024529933929443, + "learning_rate": 2.9958376050640114e-05, + "loss": 1.8878, + "step": 12180 + }, + { + "epoch": 0.023747950412643763, + "grad_norm": 3.88948130607605, + "learning_rate": 2.9958273513055785e-05, + "loss": 1.9365, + "step": 12195 + }, + { + "epoch": 0.023777160683754028, + "grad_norm": 2.3247134685516357, + "learning_rate": 2.9958170849505736e-05, + "loss": 1.9275, + "step": 12210 + }, + { + "epoch": 0.02380637095486429, + "grad_norm": 2.8689277172088623, + "learning_rate": 2.9958068059990827e-05, + "loss": 1.9015, + "step": 12225 + }, + { + "epoch": 0.023835581225974553, + "grad_norm": 2.650768756866455, + "learning_rate": 2.995796514451192e-05, + "loss": 2.0453, + "step": 12240 + }, + { + "epoch": 0.023864791497084814, + "grad_norm": 3.2005455493927, + "learning_rate": 2.9957862103069886e-05, + "loss": 1.8641, + "step": 12255 + }, + { + "epoch": 0.02389400176819508, + "grad_norm": 3.2274022102355957, + "learning_rate": 2.9957758935665592e-05, + "loss": 2.0379, + "step": 12270 + }, + { + "epoch": 0.02392321203930534, + "grad_norm": 2.933011054992676, + "learning_rate": 2.9957655642299903e-05, + "loss": 2.0304, + "step": 12285 + }, + { + "epoch": 0.023952422310415604, + "grad_norm": 2.051677942276001, + "learning_rate": 2.9957552222973696e-05, + "loss": 1.9656, + "step": 12300 + }, + { + "epoch": 0.02398163258152587, + "grad_norm": 2.488805055618286, + "learning_rate": 2.995744867768784e-05, + "loss": 1.8997, + "step": 12315 + }, + { + "epoch": 0.02401084285263613, + "grad_norm": 2.1613032817840576, + "learning_rate": 2.99573450064432e-05, + "loss": 1.9733, + "step": 12330 + }, + { + "epoch": 0.024040053123746394, + "grad_norm": 3.817368745803833, + "learning_rate": 2.9957241209240656e-05, + "loss": 1.74, + "step": 12345 + }, + { + "epoch": 0.024069263394856655, + "grad_norm": 2.048835515975952, + "learning_rate": 2.995713728608108e-05, + "loss": 2.0201, + "step": 12360 + }, + { + "epoch": 0.02409847366596692, + "grad_norm": 3.2480578422546387, + "learning_rate": 2.995703323696535e-05, + "loss": 1.8652, + "step": 12375 + }, + { + "epoch": 0.02412768393707718, + "grad_norm": 2.2918214797973633, + "learning_rate": 2.9956929061894334e-05, + "loss": 1.9089, + "step": 12390 + }, + { + "epoch": 0.024156894208187445, + "grad_norm": 2.4332637786865234, + "learning_rate": 2.995682476086892e-05, + "loss": 1.8168, + "step": 12405 + }, + { + "epoch": 0.024186104479297706, + "grad_norm": 3.9398860931396484, + "learning_rate": 2.9956720333889978e-05, + "loss": 1.8114, + "step": 12420 + }, + { + "epoch": 0.02421531475040797, + "grad_norm": 1.3877410888671875, + "learning_rate": 2.995661578095839e-05, + "loss": 1.7609, + "step": 12435 + }, + { + "epoch": 0.024244525021518235, + "grad_norm": 3.1245439052581787, + "learning_rate": 2.9956511102075043e-05, + "loss": 1.9191, + "step": 12450 + }, + { + "epoch": 0.024273735292628495, + "grad_norm": 4.245307445526123, + "learning_rate": 2.9956406297240805e-05, + "loss": 1.9551, + "step": 12465 + }, + { + "epoch": 0.02430294556373876, + "grad_norm": 3.033841371536255, + "learning_rate": 2.995630136645657e-05, + "loss": 2.063, + "step": 12480 + }, + { + "epoch": 0.02433215583484902, + "grad_norm": 2.9336037635803223, + "learning_rate": 2.9956196309723217e-05, + "loss": 1.8383, + "step": 12495 + }, + { + "epoch": 0.024361366105959285, + "grad_norm": 2.946824312210083, + "learning_rate": 2.9956091127041628e-05, + "loss": 2.012, + "step": 12510 + }, + { + "epoch": 0.024390576377069546, + "grad_norm": 1.9165891408920288, + "learning_rate": 2.9955985818412695e-05, + "loss": 1.8422, + "step": 12525 + }, + { + "epoch": 0.02441978664817981, + "grad_norm": 2.4142305850982666, + "learning_rate": 2.9955880383837304e-05, + "loss": 2.027, + "step": 12540 + }, + { + "epoch": 0.024448996919290075, + "grad_norm": 3.1490895748138428, + "learning_rate": 2.9955774823316337e-05, + "loss": 1.8485, + "step": 12555 + }, + { + "epoch": 0.024478207190400336, + "grad_norm": 3.187546968460083, + "learning_rate": 2.995566913685069e-05, + "loss": 1.8955, + "step": 12570 + }, + { + "epoch": 0.0245074174615106, + "grad_norm": 4.743402481079102, + "learning_rate": 2.9955563324441246e-05, + "loss": 1.8438, + "step": 12585 + }, + { + "epoch": 0.02453662773262086, + "grad_norm": 4.372682094573975, + "learning_rate": 2.9955457386088904e-05, + "loss": 1.6814, + "step": 12600 + }, + { + "epoch": 0.024565838003731126, + "grad_norm": 4.228031158447266, + "learning_rate": 2.995535132179455e-05, + "loss": 1.8358, + "step": 12615 + }, + { + "epoch": 0.024595048274841387, + "grad_norm": 1.7856305837631226, + "learning_rate": 2.9955245131559078e-05, + "loss": 1.9314, + "step": 12630 + }, + { + "epoch": 0.02462425854595165, + "grad_norm": 2.232226610183716, + "learning_rate": 2.9955138815383383e-05, + "loss": 1.7662, + "step": 12645 + }, + { + "epoch": 0.024653468817061912, + "grad_norm": 2.812988758087158, + "learning_rate": 2.9955032373268366e-05, + "loss": 1.8475, + "step": 12660 + }, + { + "epoch": 0.024682679088172177, + "grad_norm": 4.58977746963501, + "learning_rate": 2.995492580521491e-05, + "loss": 1.8952, + "step": 12675 + }, + { + "epoch": 0.02471188935928244, + "grad_norm": 2.619033098220825, + "learning_rate": 2.995481911122393e-05, + "loss": 1.7343, + "step": 12690 + }, + { + "epoch": 0.024741099630392702, + "grad_norm": 2.20595645904541, + "learning_rate": 2.9954712291296303e-05, + "loss": 1.8573, + "step": 12705 + }, + { + "epoch": 0.024770309901502967, + "grad_norm": 4.261920928955078, + "learning_rate": 2.9954605345432948e-05, + "loss": 1.9771, + "step": 12720 + }, + { + "epoch": 0.024799520172613228, + "grad_norm": 4.171009063720703, + "learning_rate": 2.995449827363476e-05, + "loss": 1.8328, + "step": 12735 + }, + { + "epoch": 0.024828730443723492, + "grad_norm": 1.9221203327178955, + "learning_rate": 2.9954391075902634e-05, + "loss": 2.0122, + "step": 12750 + }, + { + "epoch": 0.024857940714833753, + "grad_norm": 4.801123142242432, + "learning_rate": 2.9954283752237478e-05, + "loss": 1.9288, + "step": 12765 + }, + { + "epoch": 0.024887150985944018, + "grad_norm": 5.146918296813965, + "learning_rate": 2.99541763026402e-05, + "loss": 1.8986, + "step": 12780 + }, + { + "epoch": 0.024916361257054282, + "grad_norm": 3.8306210041046143, + "learning_rate": 2.9954068727111694e-05, + "loss": 1.9432, + "step": 12795 + }, + { + "epoch": 0.024945571528164543, + "grad_norm": 3.569969892501831, + "learning_rate": 2.9953961025652875e-05, + "loss": 1.802, + "step": 12810 + }, + { + "epoch": 0.024974781799274808, + "grad_norm": 2.11238956451416, + "learning_rate": 2.995385319826465e-05, + "loss": 1.7489, + "step": 12825 + }, + { + "epoch": 0.02500399207038507, + "grad_norm": 2.0586655139923096, + "learning_rate": 2.995374524494792e-05, + "loss": 1.8308, + "step": 12840 + }, + { + "epoch": 0.025033202341495333, + "grad_norm": 2.133302688598633, + "learning_rate": 2.9953637165703597e-05, + "loss": 1.8642, + "step": 12855 + }, + { + "epoch": 0.025062412612605594, + "grad_norm": 6.014092922210693, + "learning_rate": 2.9953528960532594e-05, + "loss": 1.838, + "step": 12870 + }, + { + "epoch": 0.02509162288371586, + "grad_norm": 4.038995742797852, + "learning_rate": 2.9953420629435823e-05, + "loss": 1.9485, + "step": 12885 + }, + { + "epoch": 0.02512083315482612, + "grad_norm": 2.016037940979004, + "learning_rate": 2.995331217241419e-05, + "loss": 1.8532, + "step": 12900 + }, + { + "epoch": 0.025150043425936384, + "grad_norm": 2.268634796142578, + "learning_rate": 2.9953203589468617e-05, + "loss": 1.7899, + "step": 12915 + }, + { + "epoch": 0.025179253697046648, + "grad_norm": 4.676908016204834, + "learning_rate": 2.995309488060001e-05, + "loss": 1.875, + "step": 12930 + }, + { + "epoch": 0.02520846396815691, + "grad_norm": 2.0081729888916016, + "learning_rate": 2.9952986045809284e-05, + "loss": 1.9863, + "step": 12945 + }, + { + "epoch": 0.025237674239267174, + "grad_norm": 2.9928226470947266, + "learning_rate": 2.9952877085097364e-05, + "loss": 1.8073, + "step": 12960 + }, + { + "epoch": 0.025266884510377435, + "grad_norm": 3.514173746109009, + "learning_rate": 2.9952767998465164e-05, + "loss": 1.849, + "step": 12975 + }, + { + "epoch": 0.0252960947814877, + "grad_norm": 1.9361449480056763, + "learning_rate": 2.99526587859136e-05, + "loss": 1.8925, + "step": 12990 + }, + { + "epoch": 0.02532530505259796, + "grad_norm": 4.396752834320068, + "learning_rate": 2.9952549447443595e-05, + "loss": 1.8844, + "step": 13005 + }, + { + "epoch": 0.025354515323708225, + "grad_norm": 2.9122262001037598, + "learning_rate": 2.9952439983056066e-05, + "loss": 1.9564, + "step": 13020 + }, + { + "epoch": 0.02538372559481849, + "grad_norm": 2.6632208824157715, + "learning_rate": 2.9952330392751935e-05, + "loss": 1.8514, + "step": 13035 + }, + { + "epoch": 0.02541293586592875, + "grad_norm": 2.678126573562622, + "learning_rate": 2.995222067653213e-05, + "loss": 1.8146, + "step": 13050 + }, + { + "epoch": 0.025442146137039014, + "grad_norm": 3.901380777359009, + "learning_rate": 2.9952110834397572e-05, + "loss": 2.0268, + "step": 13065 + }, + { + "epoch": 0.025471356408149275, + "grad_norm": 2.7066125869750977, + "learning_rate": 2.9952000866349185e-05, + "loss": 1.8731, + "step": 13080 + }, + { + "epoch": 0.02550056667925954, + "grad_norm": 4.8974456787109375, + "learning_rate": 2.9951890772387897e-05, + "loss": 1.908, + "step": 13095 + }, + { + "epoch": 0.0255297769503698, + "grad_norm": 3.9215996265411377, + "learning_rate": 2.995178055251463e-05, + "loss": 2.0428, + "step": 13110 + }, + { + "epoch": 0.025558987221480065, + "grad_norm": 2.519742250442505, + "learning_rate": 2.9951670206730318e-05, + "loss": 1.7794, + "step": 13125 + }, + { + "epoch": 0.025588197492590326, + "grad_norm": 2.1564877033233643, + "learning_rate": 2.995155973503589e-05, + "loss": 1.6714, + "step": 13140 + }, + { + "epoch": 0.02561740776370059, + "grad_norm": 4.537586688995361, + "learning_rate": 2.9951449137432275e-05, + "loss": 2.0812, + "step": 13155 + }, + { + "epoch": 0.025646618034810855, + "grad_norm": 3.659740686416626, + "learning_rate": 2.9951338413920403e-05, + "loss": 1.995, + "step": 13170 + }, + { + "epoch": 0.025675828305921116, + "grad_norm": 2.057332992553711, + "learning_rate": 2.9951227564501207e-05, + "loss": 1.8206, + "step": 13185 + }, + { + "epoch": 0.02570503857703138, + "grad_norm": 3.4197821617126465, + "learning_rate": 2.995111658917562e-05, + "loss": 1.9727, + "step": 13200 + }, + { + "epoch": 0.02573424884814164, + "grad_norm": 2.212815761566162, + "learning_rate": 2.995100548794458e-05, + "loss": 1.959, + "step": 13215 + }, + { + "epoch": 0.025763459119251906, + "grad_norm": 2.1396286487579346, + "learning_rate": 2.9950894260809015e-05, + "loss": 1.9048, + "step": 13230 + }, + { + "epoch": 0.025792669390362167, + "grad_norm": 2.823305606842041, + "learning_rate": 2.995078290776987e-05, + "loss": 1.9381, + "step": 13245 + }, + { + "epoch": 0.02582187966147243, + "grad_norm": 3.0127432346343994, + "learning_rate": 2.9950671428828083e-05, + "loss": 1.959, + "step": 13260 + }, + { + "epoch": 0.025851089932582696, + "grad_norm": 2.1536638736724854, + "learning_rate": 2.9950559823984583e-05, + "loss": 1.7713, + "step": 13275 + }, + { + "epoch": 0.025880300203692957, + "grad_norm": 2.784940242767334, + "learning_rate": 2.9950448093240318e-05, + "loss": 1.8586, + "step": 13290 + }, + { + "epoch": 0.02590951047480322, + "grad_norm": 2.103855848312378, + "learning_rate": 2.9950336236596226e-05, + "loss": 1.8167, + "step": 13305 + }, + { + "epoch": 0.025938720745913482, + "grad_norm": 2.465585947036743, + "learning_rate": 2.9950224254053254e-05, + "loss": 1.9863, + "step": 13320 + }, + { + "epoch": 0.025967931017023747, + "grad_norm": 3.2060279846191406, + "learning_rate": 2.9950112145612335e-05, + "loss": 1.838, + "step": 13335 + }, + { + "epoch": 0.025997141288134008, + "grad_norm": 3.0637335777282715, + "learning_rate": 2.9949999911274427e-05, + "loss": 1.9325, + "step": 13350 + }, + { + "epoch": 0.026026351559244272, + "grad_norm": 2.8039963245391846, + "learning_rate": 2.994988755104046e-05, + "loss": 1.8671, + "step": 13365 + }, + { + "epoch": 0.026055561830354537, + "grad_norm": 4.239367485046387, + "learning_rate": 2.9949775064911388e-05, + "loss": 1.9305, + "step": 13380 + }, + { + "epoch": 0.026084772101464797, + "grad_norm": 4.794821262359619, + "learning_rate": 2.994966245288816e-05, + "loss": 1.8946, + "step": 13395 + }, + { + "epoch": 0.026113982372575062, + "grad_norm": 3.5099828243255615, + "learning_rate": 2.994954971497172e-05, + "loss": 1.919, + "step": 13410 + }, + { + "epoch": 0.026143192643685323, + "grad_norm": 4.3362603187561035, + "learning_rate": 2.994943685116302e-05, + "loss": 1.9266, + "step": 13425 + }, + { + "epoch": 0.026172402914795587, + "grad_norm": 4.010773658752441, + "learning_rate": 2.994932386146301e-05, + "loss": 1.8712, + "step": 13440 + }, + { + "epoch": 0.02620161318590585, + "grad_norm": 2.9033546447753906, + "learning_rate": 2.9949210745872638e-05, + "loss": 1.8179, + "step": 13455 + }, + { + "epoch": 0.026230823457016113, + "grad_norm": 2.215955972671509, + "learning_rate": 2.9949097504392866e-05, + "loss": 1.8573, + "step": 13470 + }, + { + "epoch": 0.026260033728126374, + "grad_norm": 4.64263391494751, + "learning_rate": 2.994898413702464e-05, + "loss": 1.8504, + "step": 13485 + }, + { + "epoch": 0.026289243999236638, + "grad_norm": 2.8851144313812256, + "learning_rate": 2.9948870643768915e-05, + "loss": 1.7891, + "step": 13500 + }, + { + "epoch": 0.026318454270346903, + "grad_norm": 5.705179214477539, + "learning_rate": 2.9948757024626645e-05, + "loss": 1.8502, + "step": 13515 + }, + { + "epoch": 0.026347664541457164, + "grad_norm": 4.427610397338867, + "learning_rate": 2.994864327959879e-05, + "loss": 1.746, + "step": 13530 + }, + { + "epoch": 0.026376874812567428, + "grad_norm": 2.9682793617248535, + "learning_rate": 2.994852940868631e-05, + "loss": 1.8766, + "step": 13545 + }, + { + "epoch": 0.02640608508367769, + "grad_norm": 2.8406543731689453, + "learning_rate": 2.9948415411890164e-05, + "loss": 1.8637, + "step": 13560 + }, + { + "epoch": 0.026435295354787954, + "grad_norm": 2.9661149978637695, + "learning_rate": 2.9948301289211308e-05, + "loss": 1.7703, + "step": 13575 + }, + { + "epoch": 0.026464505625898214, + "grad_norm": 2.961155652999878, + "learning_rate": 2.99481870406507e-05, + "loss": 1.8123, + "step": 13590 + }, + { + "epoch": 0.02649371589700848, + "grad_norm": 3.7241668701171875, + "learning_rate": 2.9948072666209308e-05, + "loss": 2.018, + "step": 13605 + }, + { + "epoch": 0.026522926168118743, + "grad_norm": 2.8102498054504395, + "learning_rate": 2.9947958165888096e-05, + "loss": 1.8577, + "step": 13620 + }, + { + "epoch": 0.026552136439229004, + "grad_norm": 2.061007022857666, + "learning_rate": 2.9947843539688027e-05, + "loss": 1.9684, + "step": 13635 + }, + { + "epoch": 0.02658134671033927, + "grad_norm": 4.699859619140625, + "learning_rate": 2.994772878761006e-05, + "loss": 1.9438, + "step": 13650 + }, + { + "epoch": 0.02661055698144953, + "grad_norm": 5.8805952072143555, + "learning_rate": 2.994761390965517e-05, + "loss": 1.8862, + "step": 13665 + }, + { + "epoch": 0.026639767252559794, + "grad_norm": 3.6178531646728516, + "learning_rate": 2.994749890582432e-05, + "loss": 1.9754, + "step": 13680 + }, + { + "epoch": 0.026668977523670055, + "grad_norm": 2.891448497772217, + "learning_rate": 2.9947383776118482e-05, + "loss": 1.8838, + "step": 13695 + }, + { + "epoch": 0.02669818779478032, + "grad_norm": 2.5380797386169434, + "learning_rate": 2.994726852053862e-05, + "loss": 2.0006, + "step": 13710 + }, + { + "epoch": 0.02672739806589058, + "grad_norm": 3.083801031112671, + "learning_rate": 2.994715313908571e-05, + "loss": 1.9287, + "step": 13725 + }, + { + "epoch": 0.026756608337000845, + "grad_norm": 3.9220306873321533, + "learning_rate": 2.9947037631760717e-05, + "loss": 2.0063, + "step": 13740 + }, + { + "epoch": 0.02678581860811111, + "grad_norm": 2.41329288482666, + "learning_rate": 2.994692199856462e-05, + "loss": 1.7779, + "step": 13755 + }, + { + "epoch": 0.02681502887922137, + "grad_norm": 3.137281656265259, + "learning_rate": 2.9946806239498392e-05, + "loss": 1.7686, + "step": 13770 + }, + { + "epoch": 0.026844239150331635, + "grad_norm": 3.8897507190704346, + "learning_rate": 2.994669035456301e-05, + "loss": 1.9879, + "step": 13785 + }, + { + "epoch": 0.026873449421441896, + "grad_norm": 2.888145685195923, + "learning_rate": 2.994657434375944e-05, + "loss": 2.0012, + "step": 13800 + }, + { + "epoch": 0.02690265969255216, + "grad_norm": 2.683145523071289, + "learning_rate": 2.9946458207088667e-05, + "loss": 1.8579, + "step": 13815 + }, + { + "epoch": 0.02693186996366242, + "grad_norm": 2.5023186206817627, + "learning_rate": 2.9946341944551668e-05, + "loss": 1.8899, + "step": 13830 + }, + { + "epoch": 0.026961080234772686, + "grad_norm": 4.522122383117676, + "learning_rate": 2.994622555614942e-05, + "loss": 1.8373, + "step": 13845 + }, + { + "epoch": 0.02699029050588295, + "grad_norm": 1.9197810888290405, + "learning_rate": 2.9946109041882902e-05, + "loss": 1.874, + "step": 13860 + }, + { + "epoch": 0.02701950077699321, + "grad_norm": 2.0907135009765625, + "learning_rate": 2.9945992401753103e-05, + "loss": 1.9878, + "step": 13875 + }, + { + "epoch": 0.027048711048103476, + "grad_norm": 3.0691592693328857, + "learning_rate": 2.9945875635761e-05, + "loss": 1.8859, + "step": 13890 + }, + { + "epoch": 0.027077921319213737, + "grad_norm": 2.0707552433013916, + "learning_rate": 2.9945758743907573e-05, + "loss": 1.7612, + "step": 13905 + }, + { + "epoch": 0.027107131590324, + "grad_norm": 2.2770462036132812, + "learning_rate": 2.994564172619381e-05, + "loss": 1.8028, + "step": 13920 + }, + { + "epoch": 0.027136341861434262, + "grad_norm": 2.681814193725586, + "learning_rate": 2.9945524582620695e-05, + "loss": 1.7967, + "step": 13935 + }, + { + "epoch": 0.027165552132544526, + "grad_norm": 3.0529186725616455, + "learning_rate": 2.994540731318922e-05, + "loss": 1.7972, + "step": 13950 + }, + { + "epoch": 0.027194762403654787, + "grad_norm": 3.369091033935547, + "learning_rate": 2.9945289917900368e-05, + "loss": 1.8092, + "step": 13965 + }, + { + "epoch": 0.027223972674765052, + "grad_norm": 2.190134048461914, + "learning_rate": 2.9945172396755124e-05, + "loss": 2.0228, + "step": 13980 + }, + { + "epoch": 0.027253182945875316, + "grad_norm": 2.805100202560425, + "learning_rate": 2.9945054749754483e-05, + "loss": 1.9312, + "step": 13995 + }, + { + "epoch": 0.027282393216985577, + "grad_norm": 2.195697546005249, + "learning_rate": 2.9944936976899433e-05, + "loss": 1.9791, + "step": 14010 + }, + { + "epoch": 0.027311603488095842, + "grad_norm": 1.723713755607605, + "learning_rate": 2.9944819078190967e-05, + "loss": 1.8542, + "step": 14025 + }, + { + "epoch": 0.027340813759206103, + "grad_norm": 2.633101463317871, + "learning_rate": 2.9944701053630075e-05, + "loss": 1.8127, + "step": 14040 + }, + { + "epoch": 0.027370024030316367, + "grad_norm": 1.9390171766281128, + "learning_rate": 2.9944582903217756e-05, + "loss": 1.9183, + "step": 14055 + }, + { + "epoch": 0.027399234301426628, + "grad_norm": 3.9491968154907227, + "learning_rate": 2.9944464626955003e-05, + "loss": 2.0849, + "step": 14070 + }, + { + "epoch": 0.027428444572536893, + "grad_norm": 2.4679179191589355, + "learning_rate": 2.9944346224842812e-05, + "loss": 1.9285, + "step": 14085 + }, + { + "epoch": 0.027457654843647157, + "grad_norm": 2.999509334564209, + "learning_rate": 2.994422769688218e-05, + "loss": 1.9523, + "step": 14100 + }, + { + "epoch": 0.027486865114757418, + "grad_norm": 3.8798091411590576, + "learning_rate": 2.9944109043074104e-05, + "loss": 1.9014, + "step": 14115 + }, + { + "epoch": 0.027516075385867683, + "grad_norm": 2.5288240909576416, + "learning_rate": 2.9943990263419582e-05, + "loss": 2.1135, + "step": 14130 + }, + { + "epoch": 0.027545285656977943, + "grad_norm": 2.2120304107666016, + "learning_rate": 2.994387135791962e-05, + "loss": 1.7418, + "step": 14145 + }, + { + "epoch": 0.027574495928088208, + "grad_norm": 2.805328607559204, + "learning_rate": 2.994375232657521e-05, + "loss": 1.8776, + "step": 14160 + }, + { + "epoch": 0.02760370619919847, + "grad_norm": 2.8841097354888916, + "learning_rate": 2.9943633169387365e-05, + "loss": 1.9106, + "step": 14175 + }, + { + "epoch": 0.027632916470308733, + "grad_norm": 1.8887025117874146, + "learning_rate": 2.994351388635708e-05, + "loss": 1.8916, + "step": 14190 + }, + { + "epoch": 0.027662126741418994, + "grad_norm": 2.8623757362365723, + "learning_rate": 2.9943394477485363e-05, + "loss": 1.8735, + "step": 14205 + }, + { + "epoch": 0.02769133701252926, + "grad_norm": 3.1046249866485596, + "learning_rate": 2.994327494277322e-05, + "loss": 1.9124, + "step": 14220 + }, + { + "epoch": 0.027720547283639523, + "grad_norm": 2.653933525085449, + "learning_rate": 2.9943155282221663e-05, + "loss": 1.9387, + "step": 14235 + }, + { + "epoch": 0.027749757554749784, + "grad_norm": 2.975820779800415, + "learning_rate": 2.9943035495831688e-05, + "loss": 1.8232, + "step": 14250 + }, + { + "epoch": 0.02777896782586005, + "grad_norm": 5.906015396118164, + "learning_rate": 2.9942915583604307e-05, + "loss": 1.9167, + "step": 14265 + }, + { + "epoch": 0.02780817809697031, + "grad_norm": 2.592456102371216, + "learning_rate": 2.994279554554054e-05, + "loss": 1.7433, + "step": 14280 + }, + { + "epoch": 0.027837388368080574, + "grad_norm": 5.042680263519287, + "learning_rate": 2.994267538164138e-05, + "loss": 1.7878, + "step": 14295 + }, + { + "epoch": 0.027866598639190835, + "grad_norm": 4.092184066772461, + "learning_rate": 2.9942555091907853e-05, + "loss": 1.6955, + "step": 14310 + }, + { + "epoch": 0.0278958089103011, + "grad_norm": 4.623755931854248, + "learning_rate": 2.994243467634097e-05, + "loss": 1.7866, + "step": 14325 + }, + { + "epoch": 0.027925019181411364, + "grad_norm": 3.042306661605835, + "learning_rate": 2.994231413494174e-05, + "loss": 1.8891, + "step": 14340 + }, + { + "epoch": 0.027954229452521625, + "grad_norm": 2.784275531768799, + "learning_rate": 2.9942193467711184e-05, + "loss": 2.0112, + "step": 14355 + }, + { + "epoch": 0.02798343972363189, + "grad_norm": 1.9308695793151855, + "learning_rate": 2.9942072674650317e-05, + "loss": 1.7964, + "step": 14370 + }, + { + "epoch": 0.02801264999474215, + "grad_norm": 3.7377004623413086, + "learning_rate": 2.994195175576015e-05, + "loss": 1.8661, + "step": 14385 + }, + { + "epoch": 0.028041860265852415, + "grad_norm": 2.484870195388794, + "learning_rate": 2.994183071104171e-05, + "loss": 1.7358, + "step": 14400 + }, + { + "epoch": 0.028071070536962676, + "grad_norm": 2.6344974040985107, + "learning_rate": 2.9941709540496013e-05, + "loss": 1.9183, + "step": 14415 + }, + { + "epoch": 0.02810028080807294, + "grad_norm": 2.168701410293579, + "learning_rate": 2.9941588244124072e-05, + "loss": 1.999, + "step": 14430 + }, + { + "epoch": 0.028129491079183205, + "grad_norm": 2.986727476119995, + "learning_rate": 2.994146682192692e-05, + "loss": 1.8344, + "step": 14445 + }, + { + "epoch": 0.028158701350293466, + "grad_norm": 3.3715713024139404, + "learning_rate": 2.9941345273905573e-05, + "loss": 2.0468, + "step": 14460 + }, + { + "epoch": 0.02818791162140373, + "grad_norm": 2.2077038288116455, + "learning_rate": 2.9941223600061054e-05, + "loss": 2.0255, + "step": 14475 + }, + { + "epoch": 0.02821712189251399, + "grad_norm": 3.4651224613189697, + "learning_rate": 2.994110180039439e-05, + "loss": 1.8604, + "step": 14490 + }, + { + "epoch": 0.028246332163624256, + "grad_norm": 2.0584287643432617, + "learning_rate": 2.994097987490661e-05, + "loss": 2.0636, + "step": 14505 + }, + { + "epoch": 0.028275542434734516, + "grad_norm": 3.1285014152526855, + "learning_rate": 2.9940857823598736e-05, + "loss": 1.7656, + "step": 14520 + }, + { + "epoch": 0.02830475270584478, + "grad_norm": 2.541280746459961, + "learning_rate": 2.9940735646471793e-05, + "loss": 1.8682, + "step": 14535 + }, + { + "epoch": 0.028333962976955042, + "grad_norm": 2.4748847484588623, + "learning_rate": 2.9940613343526817e-05, + "loss": 2.0047, + "step": 14550 + }, + { + "epoch": 0.028363173248065306, + "grad_norm": 3.2105560302734375, + "learning_rate": 2.9940490914764834e-05, + "loss": 1.7709, + "step": 14565 + }, + { + "epoch": 0.02839238351917557, + "grad_norm": 3.491591215133667, + "learning_rate": 2.9940368360186878e-05, + "loss": 1.786, + "step": 14580 + }, + { + "epoch": 0.028421593790285832, + "grad_norm": 3.315342903137207, + "learning_rate": 2.9940245679793978e-05, + "loss": 1.917, + "step": 14595 + }, + { + "epoch": 0.028450804061396096, + "grad_norm": 1.7594997882843018, + "learning_rate": 2.9940122873587164e-05, + "loss": 1.877, + "step": 14610 + }, + { + "epoch": 0.028480014332506357, + "grad_norm": 2.442725896835327, + "learning_rate": 2.9939999941567474e-05, + "loss": 1.9577, + "step": 14625 + }, + { + "epoch": 0.02850922460361662, + "grad_norm": 3.146977663040161, + "learning_rate": 2.993987688373595e-05, + "loss": 1.8722, + "step": 14640 + }, + { + "epoch": 0.028538434874726883, + "grad_norm": 2.6678929328918457, + "learning_rate": 2.9939753700093618e-05, + "loss": 1.659, + "step": 14655 + }, + { + "epoch": 0.028567645145837147, + "grad_norm": 2.4769906997680664, + "learning_rate": 2.9939630390641518e-05, + "loss": 1.8257, + "step": 14670 + }, + { + "epoch": 0.02859685541694741, + "grad_norm": 2.9314770698547363, + "learning_rate": 2.993950695538069e-05, + "loss": 1.8004, + "step": 14685 + }, + { + "epoch": 0.028626065688057672, + "grad_norm": 3.2279980182647705, + "learning_rate": 2.993938339431217e-05, + "loss": 1.9438, + "step": 14700 + }, + { + "epoch": 0.028655275959167937, + "grad_norm": 2.8929495811462402, + "learning_rate": 2.9939259707437002e-05, + "loss": 1.7995, + "step": 14715 + }, + { + "epoch": 0.028684486230278198, + "grad_norm": 4.861998558044434, + "learning_rate": 2.9939135894756232e-05, + "loss": 1.8188, + "step": 14730 + }, + { + "epoch": 0.028713696501388462, + "grad_norm": 5.37394905090332, + "learning_rate": 2.9939011956270893e-05, + "loss": 2.061, + "step": 14745 + }, + { + "epoch": 0.028742906772498723, + "grad_norm": 2.2253520488739014, + "learning_rate": 2.9938887891982035e-05, + "loss": 1.963, + "step": 14760 + }, + { + "epoch": 0.028772117043608988, + "grad_norm": 3.424954414367676, + "learning_rate": 2.99387637018907e-05, + "loss": 2.0077, + "step": 14775 + }, + { + "epoch": 0.02880132731471925, + "grad_norm": 2.8398706912994385, + "learning_rate": 2.9938639385997934e-05, + "loss": 2.0516, + "step": 14790 + }, + { + "epoch": 0.028830537585829513, + "grad_norm": 2.371492385864258, + "learning_rate": 2.9938514944304788e-05, + "loss": 1.8057, + "step": 14805 + }, + { + "epoch": 0.028859747856939778, + "grad_norm": 1.870301365852356, + "learning_rate": 2.9938390376812304e-05, + "loss": 1.8335, + "step": 14820 + }, + { + "epoch": 0.02888895812805004, + "grad_norm": 3.1508800983428955, + "learning_rate": 2.9938265683521533e-05, + "loss": 2.0272, + "step": 14835 + }, + { + "epoch": 0.028918168399160303, + "grad_norm": 2.8456640243530273, + "learning_rate": 2.9938140864433528e-05, + "loss": 1.889, + "step": 14850 + }, + { + "epoch": 0.028947378670270564, + "grad_norm": 2.3040804862976074, + "learning_rate": 2.9938015919549337e-05, + "loss": 1.9274, + "step": 14865 + }, + { + "epoch": 0.02897658894138083, + "grad_norm": 3.075559139251709, + "learning_rate": 2.9937890848870012e-05, + "loss": 1.9239, + "step": 14880 + }, + { + "epoch": 0.02900579921249109, + "grad_norm": 2.4840190410614014, + "learning_rate": 2.9937765652396608e-05, + "loss": 1.9836, + "step": 14895 + }, + { + "epoch": 0.029035009483601354, + "grad_norm": 2.915515422821045, + "learning_rate": 2.9937640330130182e-05, + "loss": 2.0196, + "step": 14910 + }, + { + "epoch": 0.02906421975471162, + "grad_norm": 2.684401035308838, + "learning_rate": 2.993751488207178e-05, + "loss": 1.9699, + "step": 14925 + }, + { + "epoch": 0.02909343002582188, + "grad_norm": 1.7906841039657593, + "learning_rate": 2.9937389308222468e-05, + "loss": 1.9435, + "step": 14940 + }, + { + "epoch": 0.029122640296932144, + "grad_norm": 2.7629384994506836, + "learning_rate": 2.9937263608583297e-05, + "loss": 1.9266, + "step": 14955 + }, + { + "epoch": 0.029151850568042405, + "grad_norm": 3.65447735786438, + "learning_rate": 2.9937137783155326e-05, + "loss": 1.8818, + "step": 14970 + }, + { + "epoch": 0.02918106083915267, + "grad_norm": 2.684885025024414, + "learning_rate": 2.993701183193962e-05, + "loss": 1.744, + "step": 14985 + }, + { + "epoch": 0.02921027111026293, + "grad_norm": 2.995678424835205, + "learning_rate": 2.9936885754937237e-05, + "loss": 1.868, + "step": 15000 + }, + { + "epoch": 0.029239481381373195, + "grad_norm": 1.9314979314804077, + "learning_rate": 2.993675955214924e-05, + "loss": 1.8982, + "step": 15015 + }, + { + "epoch": 0.029268691652483456, + "grad_norm": 3.8655660152435303, + "learning_rate": 2.993663322357669e-05, + "loss": 2.103, + "step": 15030 + }, + { + "epoch": 0.02929790192359372, + "grad_norm": 3.2133545875549316, + "learning_rate": 2.993650676922065e-05, + "loss": 1.867, + "step": 15045 + }, + { + "epoch": 0.029327112194703985, + "grad_norm": 1.7590382099151611, + "learning_rate": 2.9936380189082184e-05, + "loss": 1.9596, + "step": 15060 + }, + { + "epoch": 0.029356322465814245, + "grad_norm": 3.5091662406921387, + "learning_rate": 2.993625348316236e-05, + "loss": 2.0665, + "step": 15075 + }, + { + "epoch": 0.02938553273692451, + "grad_norm": 2.493689775466919, + "learning_rate": 2.9936126651462246e-05, + "loss": 1.8168, + "step": 15090 + }, + { + "epoch": 0.02941474300803477, + "grad_norm": 2.158205509185791, + "learning_rate": 2.993599969398291e-05, + "loss": 1.9542, + "step": 15105 + }, + { + "epoch": 0.029443953279145035, + "grad_norm": 1.6943566799163818, + "learning_rate": 2.9935872610725415e-05, + "loss": 1.9762, + "step": 15120 + }, + { + "epoch": 0.029473163550255296, + "grad_norm": 4.148080825805664, + "learning_rate": 2.993574540169084e-05, + "loss": 1.8338, + "step": 15135 + }, + { + "epoch": 0.02950237382136556, + "grad_norm": 3.9066367149353027, + "learning_rate": 2.993561806688025e-05, + "loss": 1.875, + "step": 15150 + }, + { + "epoch": 0.029531584092475825, + "grad_norm": 4.32364559173584, + "learning_rate": 2.9935490606294726e-05, + "loss": 1.7329, + "step": 15165 + }, + { + "epoch": 0.029560794363586086, + "grad_norm": 3.791557788848877, + "learning_rate": 2.9935363019935327e-05, + "loss": 1.8515, + "step": 15180 + }, + { + "epoch": 0.02959000463469635, + "grad_norm": 2.274827241897583, + "learning_rate": 2.9935235307803137e-05, + "loss": 1.7744, + "step": 15195 + }, + { + "epoch": 0.02961921490580661, + "grad_norm": 3.765348196029663, + "learning_rate": 2.9935107469899235e-05, + "loss": 1.7427, + "step": 15210 + }, + { + "epoch": 0.029648425176916876, + "grad_norm": 1.8818484544754028, + "learning_rate": 2.9934979506224687e-05, + "loss": 1.912, + "step": 15225 + }, + { + "epoch": 0.029677635448027137, + "grad_norm": 3.2676870822906494, + "learning_rate": 2.993485141678058e-05, + "loss": 2.022, + "step": 15240 + }, + { + "epoch": 0.0297068457191374, + "grad_norm": 3.329846143722534, + "learning_rate": 2.9934723201567986e-05, + "loss": 1.9317, + "step": 15255 + }, + { + "epoch": 0.029736055990247662, + "grad_norm": 4.478529453277588, + "learning_rate": 2.993459486058799e-05, + "loss": 1.8172, + "step": 15270 + }, + { + "epoch": 0.029765266261357927, + "grad_norm": 5.912947654724121, + "learning_rate": 2.9934466393841667e-05, + "loss": 1.8828, + "step": 15285 + }, + { + "epoch": 0.02979447653246819, + "grad_norm": 3.079585552215576, + "learning_rate": 2.9934337801330102e-05, + "loss": 1.946, + "step": 15300 + }, + { + "epoch": 0.029823686803578452, + "grad_norm": 2.770911455154419, + "learning_rate": 2.993420908305438e-05, + "loss": 1.884, + "step": 15315 + }, + { + "epoch": 0.029852897074688717, + "grad_norm": 2.8034188747406006, + "learning_rate": 2.993408023901558e-05, + "loss": 1.7856, + "step": 15330 + }, + { + "epoch": 0.029882107345798978, + "grad_norm": 4.039456367492676, + "learning_rate": 2.9933951269214793e-05, + "loss": 1.9302, + "step": 15345 + }, + { + "epoch": 0.029911317616909242, + "grad_norm": 3.329083204269409, + "learning_rate": 2.99338221736531e-05, + "loss": 1.9541, + "step": 15360 + }, + { + "epoch": 0.029940527888019503, + "grad_norm": 2.9838755130767822, + "learning_rate": 2.9933692952331593e-05, + "loss": 1.8633, + "step": 15375 + }, + { + "epoch": 0.029969738159129768, + "grad_norm": 3.2724435329437256, + "learning_rate": 2.9933563605251356e-05, + "loss": 1.9572, + "step": 15390 + }, + { + "epoch": 0.029998948430240032, + "grad_norm": 1.9584968090057373, + "learning_rate": 2.993343413241348e-05, + "loss": 1.7946, + "step": 15405 + }, + { + "epoch": 0.030028158701350293, + "grad_norm": 3.7978711128234863, + "learning_rate": 2.9933304533819053e-05, + "loss": 1.7179, + "step": 15420 + }, + { + "epoch": 0.030057368972460557, + "grad_norm": 3.708948850631714, + "learning_rate": 2.993317480946917e-05, + "loss": 1.7777, + "step": 15435 + }, + { + "epoch": 0.03008657924357082, + "grad_norm": 2.204556941986084, + "learning_rate": 2.993304495936492e-05, + "loss": 2.0804, + "step": 15450 + }, + { + "epoch": 0.030115789514681083, + "grad_norm": 4.719995021820068, + "learning_rate": 2.9932914983507398e-05, + "loss": 1.8505, + "step": 15465 + }, + { + "epoch": 0.030144999785791344, + "grad_norm": 2.958409547805786, + "learning_rate": 2.9932784881897703e-05, + "loss": 1.8833, + "step": 15480 + }, + { + "epoch": 0.03017421005690161, + "grad_norm": 2.398984909057617, + "learning_rate": 2.993265465453692e-05, + "loss": 1.6463, + "step": 15495 + }, + { + "epoch": 0.03020342032801187, + "grad_norm": 2.104973077774048, + "learning_rate": 2.9932524301426155e-05, + "loss": 1.7249, + "step": 15510 + }, + { + "epoch": 0.030232630599122134, + "grad_norm": 5.241917610168457, + "learning_rate": 2.99323938225665e-05, + "loss": 1.7852, + "step": 15525 + }, + { + "epoch": 0.030261840870232398, + "grad_norm": 2.9521865844726562, + "learning_rate": 2.9932263217959064e-05, + "loss": 1.8941, + "step": 15540 + }, + { + "epoch": 0.03029105114134266, + "grad_norm": 3.14367413520813, + "learning_rate": 2.9932132487604936e-05, + "loss": 1.9962, + "step": 15555 + }, + { + "epoch": 0.030320261412452924, + "grad_norm": 1.763987421989441, + "learning_rate": 2.9932001631505217e-05, + "loss": 1.9057, + "step": 15570 + }, + { + "epoch": 0.030349471683563185, + "grad_norm": 4.6951446533203125, + "learning_rate": 2.993187064966101e-05, + "loss": 1.8781, + "step": 15585 + }, + { + "epoch": 0.03037868195467345, + "grad_norm": 2.424650192260742, + "learning_rate": 2.993173954207343e-05, + "loss": 1.8808, + "step": 15600 + }, + { + "epoch": 0.03040789222578371, + "grad_norm": 2.743579387664795, + "learning_rate": 2.9931608308743562e-05, + "loss": 1.899, + "step": 15615 + }, + { + "epoch": 0.030437102496893974, + "grad_norm": 4.443767070770264, + "learning_rate": 2.9931476949672524e-05, + "loss": 1.8727, + "step": 15630 + }, + { + "epoch": 0.03046631276800424, + "grad_norm": 2.6599271297454834, + "learning_rate": 2.9931345464861418e-05, + "loss": 1.8466, + "step": 15645 + }, + { + "epoch": 0.0304955230391145, + "grad_norm": 2.598816394805908, + "learning_rate": 2.993121385431135e-05, + "loss": 2.0863, + "step": 15660 + }, + { + "epoch": 0.030524733310224764, + "grad_norm": 3.126720666885376, + "learning_rate": 2.9931082118023432e-05, + "loss": 1.7731, + "step": 15675 + }, + { + "epoch": 0.030553943581335025, + "grad_norm": 3.7353076934814453, + "learning_rate": 2.9930950255998773e-05, + "loss": 2.1104, + "step": 15690 + }, + { + "epoch": 0.03058315385244529, + "grad_norm": 4.932044982910156, + "learning_rate": 2.9930818268238483e-05, + "loss": 1.8693, + "step": 15705 + }, + { + "epoch": 0.03061236412355555, + "grad_norm": 2.6772072315216064, + "learning_rate": 2.9930686154743666e-05, + "loss": 1.8159, + "step": 15720 + }, + { + "epoch": 0.030641574394665815, + "grad_norm": 3.73004412651062, + "learning_rate": 2.9930553915515445e-05, + "loss": 1.6705, + "step": 15735 + }, + { + "epoch": 0.03067078466577608, + "grad_norm": 2.2986795902252197, + "learning_rate": 2.993042155055493e-05, + "loss": 2.05, + "step": 15750 + }, + { + "epoch": 0.03069999493688634, + "grad_norm": 2.9536030292510986, + "learning_rate": 2.9930289059863234e-05, + "loss": 1.9297, + "step": 15765 + }, + { + "epoch": 0.030729205207996605, + "grad_norm": 1.8823219537734985, + "learning_rate": 2.9930156443441477e-05, + "loss": 1.9812, + "step": 15780 + }, + { + "epoch": 0.030758415479106866, + "grad_norm": 2.553921699523926, + "learning_rate": 2.993002370129077e-05, + "loss": 2.0723, + "step": 15795 + }, + { + "epoch": 0.03078762575021713, + "grad_norm": 2.391080617904663, + "learning_rate": 2.9929890833412233e-05, + "loss": 1.8342, + "step": 15810 + }, + { + "epoch": 0.03081683602132739, + "grad_norm": 1.898431420326233, + "learning_rate": 2.9929757839806985e-05, + "loss": 1.8282, + "step": 15825 + }, + { + "epoch": 0.030846046292437656, + "grad_norm": 3.6443545818328857, + "learning_rate": 2.9929624720476153e-05, + "loss": 1.7823, + "step": 15840 + }, + { + "epoch": 0.030875256563547917, + "grad_norm": 2.1495354175567627, + "learning_rate": 2.9929491475420844e-05, + "loss": 1.9565, + "step": 15855 + }, + { + "epoch": 0.03090446683465818, + "grad_norm": 4.317218780517578, + "learning_rate": 2.992935810464219e-05, + "loss": 1.9516, + "step": 15870 + }, + { + "epoch": 0.030933677105768446, + "grad_norm": 2.2863664627075195, + "learning_rate": 2.992922460814131e-05, + "loss": 1.9399, + "step": 15885 + }, + { + "epoch": 0.030962887376878707, + "grad_norm": 3.2142059803009033, + "learning_rate": 2.9929090985919334e-05, + "loss": 2.0122, + "step": 15900 + }, + { + "epoch": 0.03099209764798897, + "grad_norm": 3.0333306789398193, + "learning_rate": 2.992895723797738e-05, + "loss": 1.7733, + "step": 15915 + }, + { + "epoch": 0.031021307919099232, + "grad_norm": 2.6775388717651367, + "learning_rate": 2.9928823364316575e-05, + "loss": 1.8205, + "step": 15930 + }, + { + "epoch": 0.031050518190209497, + "grad_norm": 2.1693944931030273, + "learning_rate": 2.9928689364938057e-05, + "loss": 1.895, + "step": 15945 + }, + { + "epoch": 0.031079728461319758, + "grad_norm": 2.754948377609253, + "learning_rate": 2.992855523984294e-05, + "loss": 1.8623, + "step": 15960 + }, + { + "epoch": 0.031108938732430022, + "grad_norm": 2.659349203109741, + "learning_rate": 2.9928420989032357e-05, + "loss": 1.9873, + "step": 15975 + }, + { + "epoch": 0.031138149003540287, + "grad_norm": 3.5560247898101807, + "learning_rate": 2.9928286612507445e-05, + "loss": 1.9364, + "step": 15990 + }, + { + "epoch": 0.031167359274650547, + "grad_norm": 4.824398994445801, + "learning_rate": 2.9928152110269335e-05, + "loss": 1.8708, + "step": 16005 + }, + { + "epoch": 0.031196569545760812, + "grad_norm": 2.5657832622528076, + "learning_rate": 2.992801748231915e-05, + "loss": 2.0086, + "step": 16020 + }, + { + "epoch": 0.031225779816871073, + "grad_norm": 5.117823123931885, + "learning_rate": 2.9927882728658036e-05, + "loss": 1.7608, + "step": 16035 + }, + { + "epoch": 0.031254990087981334, + "grad_norm": 1.9517539739608765, + "learning_rate": 2.992774784928712e-05, + "loss": 1.9367, + "step": 16050 + }, + { + "epoch": 0.0312842003590916, + "grad_norm": 3.202143669128418, + "learning_rate": 2.9927612844207537e-05, + "loss": 1.8725, + "step": 16065 + }, + { + "epoch": 0.03131341063020186, + "grad_norm": 2.5217974185943604, + "learning_rate": 2.992747771342043e-05, + "loss": 1.9029, + "step": 16080 + }, + { + "epoch": 0.031342620901312124, + "grad_norm": 1.6550703048706055, + "learning_rate": 2.992734245692693e-05, + "loss": 1.9016, + "step": 16095 + }, + { + "epoch": 0.03137183117242239, + "grad_norm": 2.7231898307800293, + "learning_rate": 2.9927207074728187e-05, + "loss": 1.9735, + "step": 16110 + }, + { + "epoch": 0.03140104144353265, + "grad_norm": 3.855612277984619, + "learning_rate": 2.9927071566825328e-05, + "loss": 1.7641, + "step": 16125 + }, + { + "epoch": 0.031430251714642914, + "grad_norm": 3.016885757446289, + "learning_rate": 2.99269359332195e-05, + "loss": 1.9789, + "step": 16140 + }, + { + "epoch": 0.031459461985753175, + "grad_norm": 2.0709967613220215, + "learning_rate": 2.9926800173911845e-05, + "loss": 1.922, + "step": 16155 + }, + { + "epoch": 0.03148867225686344, + "grad_norm": 5.23029088973999, + "learning_rate": 2.992666428890351e-05, + "loss": 1.9681, + "step": 16170 + }, + { + "epoch": 0.031517882527973703, + "grad_norm": 2.7961294651031494, + "learning_rate": 2.9926528278195634e-05, + "loss": 1.7172, + "step": 16185 + }, + { + "epoch": 0.031547092799083964, + "grad_norm": 2.5741660594940186, + "learning_rate": 2.992639214178936e-05, + "loss": 1.7638, + "step": 16200 + }, + { + "epoch": 0.031576303070194225, + "grad_norm": 1.9811028242111206, + "learning_rate": 2.9926255879685846e-05, + "loss": 2.0028, + "step": 16215 + }, + { + "epoch": 0.03160551334130449, + "grad_norm": 4.054990768432617, + "learning_rate": 2.992611949188623e-05, + "loss": 1.9305, + "step": 16230 + }, + { + "epoch": 0.031634723612414754, + "grad_norm": 2.189107894897461, + "learning_rate": 2.992598297839166e-05, + "loss": 1.7656, + "step": 16245 + }, + { + "epoch": 0.031663933883525015, + "grad_norm": 2.2795944213867188, + "learning_rate": 2.9925846339203285e-05, + "loss": 1.8474, + "step": 16260 + }, + { + "epoch": 0.03169314415463528, + "grad_norm": 5.294840335845947, + "learning_rate": 2.9925709574322262e-05, + "loss": 1.8983, + "step": 16275 + }, + { + "epoch": 0.031722354425745544, + "grad_norm": 3.4388887882232666, + "learning_rate": 2.9925572683749742e-05, + "loss": 1.7128, + "step": 16290 + }, + { + "epoch": 0.031751564696855805, + "grad_norm": 4.72010612487793, + "learning_rate": 2.992543566748687e-05, + "loss": 1.9095, + "step": 16305 + }, + { + "epoch": 0.031780774967966066, + "grad_norm": 3.0153849124908447, + "learning_rate": 2.9925298525534807e-05, + "loss": 1.8908, + "step": 16320 + }, + { + "epoch": 0.031809985239076334, + "grad_norm": 2.4210598468780518, + "learning_rate": 2.992516125789471e-05, + "loss": 2.0611, + "step": 16335 + }, + { + "epoch": 0.031839195510186595, + "grad_norm": 2.9562766551971436, + "learning_rate": 2.9925023864567727e-05, + "loss": 1.7784, + "step": 16350 + }, + { + "epoch": 0.031868405781296856, + "grad_norm": 2.944288969039917, + "learning_rate": 2.992488634555502e-05, + "loss": 1.8543, + "step": 16365 + }, + { + "epoch": 0.031897616052407124, + "grad_norm": 3.5835225582122803, + "learning_rate": 2.9924748700857747e-05, + "loss": 1.9654, + "step": 16380 + }, + { + "epoch": 0.031926826323517385, + "grad_norm": 3.5023677349090576, + "learning_rate": 2.9924610930477062e-05, + "loss": 1.7496, + "step": 16395 + }, + { + "epoch": 0.031956036594627646, + "grad_norm": 2.08420467376709, + "learning_rate": 2.9924473034414136e-05, + "loss": 1.8234, + "step": 16410 + }, + { + "epoch": 0.03198524686573791, + "grad_norm": 4.309360504150391, + "learning_rate": 2.992433501267012e-05, + "loss": 1.9149, + "step": 16425 + }, + { + "epoch": 0.032014457136848175, + "grad_norm": 3.0357537269592285, + "learning_rate": 2.9924196865246175e-05, + "loss": 1.9551, + "step": 16440 + }, + { + "epoch": 0.032043667407958436, + "grad_norm": 4.165322780609131, + "learning_rate": 2.9924058592143473e-05, + "loss": 1.7488, + "step": 16455 + }, + { + "epoch": 0.0320728776790687, + "grad_norm": 3.4814274311065674, + "learning_rate": 2.9923920193363176e-05, + "loss": 1.9659, + "step": 16470 + }, + { + "epoch": 0.032102087950178965, + "grad_norm": 2.5652506351470947, + "learning_rate": 2.992378166890645e-05, + "loss": 1.7986, + "step": 16485 + }, + { + "epoch": 0.032131298221289226, + "grad_norm": 1.9026885032653809, + "learning_rate": 2.9923643018774455e-05, + "loss": 1.8294, + "step": 16500 + }, + { + "epoch": 0.03216050849239949, + "grad_norm": 3.861070156097412, + "learning_rate": 2.9923504242968365e-05, + "loss": 1.7451, + "step": 16515 + }, + { + "epoch": 0.03218971876350975, + "grad_norm": 1.9591658115386963, + "learning_rate": 2.992336534148935e-05, + "loss": 1.805, + "step": 16530 + }, + { + "epoch": 0.032218929034620016, + "grad_norm": 4.189550399780273, + "learning_rate": 2.992322631433857e-05, + "loss": 1.8049, + "step": 16545 + }, + { + "epoch": 0.032248139305730276, + "grad_norm": 3.1660592555999756, + "learning_rate": 2.9923087161517205e-05, + "loss": 1.9415, + "step": 16560 + }, + { + "epoch": 0.03227734957684054, + "grad_norm": 2.801609754562378, + "learning_rate": 2.9922947883026426e-05, + "loss": 1.8096, + "step": 16575 + }, + { + "epoch": 0.032306559847950805, + "grad_norm": 2.699336290359497, + "learning_rate": 2.9922808478867403e-05, + "loss": 1.723, + "step": 16590 + }, + { + "epoch": 0.032335770119061066, + "grad_norm": 2.2953224182128906, + "learning_rate": 2.992266894904131e-05, + "loss": 2.0549, + "step": 16605 + }, + { + "epoch": 0.03236498039017133, + "grad_norm": 3.8196887969970703, + "learning_rate": 2.9922529293549327e-05, + "loss": 1.8312, + "step": 16620 + }, + { + "epoch": 0.03239419066128159, + "grad_norm": 2.974578380584717, + "learning_rate": 2.9922389512392622e-05, + "loss": 1.9242, + "step": 16635 + }, + { + "epoch": 0.032423400932391856, + "grad_norm": 4.62038516998291, + "learning_rate": 2.9922249605572376e-05, + "loss": 1.9786, + "step": 16650 + }, + { + "epoch": 0.03245261120350212, + "grad_norm": 1.5149112939834595, + "learning_rate": 2.992210957308977e-05, + "loss": 1.8984, + "step": 16665 + }, + { + "epoch": 0.03248182147461238, + "grad_norm": 2.2014825344085693, + "learning_rate": 2.992196941494598e-05, + "loss": 1.7994, + "step": 16680 + }, + { + "epoch": 0.032511031745722646, + "grad_norm": 4.065785884857178, + "learning_rate": 2.9921829131142186e-05, + "loss": 1.8157, + "step": 16695 + }, + { + "epoch": 0.03254024201683291, + "grad_norm": 4.191883087158203, + "learning_rate": 2.992168872167957e-05, + "loss": 1.8684, + "step": 16710 + }, + { + "epoch": 0.03256945228794317, + "grad_norm": 3.4441282749176025, + "learning_rate": 2.9921548186559314e-05, + "loss": 1.8299, + "step": 16725 + }, + { + "epoch": 0.03259866255905343, + "grad_norm": 4.0237345695495605, + "learning_rate": 2.9921407525782604e-05, + "loss": 1.9262, + "step": 16740 + }, + { + "epoch": 0.0326278728301637, + "grad_norm": 2.171717405319214, + "learning_rate": 2.992126673935062e-05, + "loss": 1.7679, + "step": 16755 + }, + { + "epoch": 0.03265708310127396, + "grad_norm": 4.097175121307373, + "learning_rate": 2.992112582726455e-05, + "loss": 1.9499, + "step": 16770 + }, + { + "epoch": 0.03268629337238422, + "grad_norm": 4.390431880950928, + "learning_rate": 2.9920984789525583e-05, + "loss": 1.8797, + "step": 16785 + }, + { + "epoch": 0.03271550364349448, + "grad_norm": 3.068178176879883, + "learning_rate": 2.9920843626134907e-05, + "loss": 2.0427, + "step": 16800 + }, + { + "epoch": 0.03274471391460475, + "grad_norm": 3.378275156021118, + "learning_rate": 2.9920702337093707e-05, + "loss": 1.8853, + "step": 16815 + }, + { + "epoch": 0.03277392418571501, + "grad_norm": 2.2076075077056885, + "learning_rate": 2.992056092240317e-05, + "loss": 1.9406, + "step": 16830 + }, + { + "epoch": 0.03280313445682527, + "grad_norm": 3.0358242988586426, + "learning_rate": 2.992041938206449e-05, + "loss": 2.0143, + "step": 16845 + }, + { + "epoch": 0.03283234472793554, + "grad_norm": 4.094939231872559, + "learning_rate": 2.9920277716078868e-05, + "loss": 1.9757, + "step": 16860 + }, + { + "epoch": 0.0328615549990458, + "grad_norm": 1.8015364408493042, + "learning_rate": 2.9920135924447484e-05, + "loss": 2.0698, + "step": 16875 + }, + { + "epoch": 0.03289076527015606, + "grad_norm": 3.4187324047088623, + "learning_rate": 2.9919994007171535e-05, + "loss": 1.9332, + "step": 16890 + }, + { + "epoch": 0.03291997554126632, + "grad_norm": 3.91485595703125, + "learning_rate": 2.991985196425222e-05, + "loss": 1.9433, + "step": 16905 + }, + { + "epoch": 0.03294918581237659, + "grad_norm": 3.0797996520996094, + "learning_rate": 2.9919709795690732e-05, + "loss": 1.7478, + "step": 16920 + }, + { + "epoch": 0.03297839608348685, + "grad_norm": 4.396271705627441, + "learning_rate": 2.9919567501488273e-05, + "loss": 1.9191, + "step": 16935 + }, + { + "epoch": 0.03300760635459711, + "grad_norm": 2.7897489070892334, + "learning_rate": 2.9919425081646036e-05, + "loss": 1.9495, + "step": 16950 + }, + { + "epoch": 0.03303681662570738, + "grad_norm": 3.063068151473999, + "learning_rate": 2.991928253616522e-05, + "loss": 2.1288, + "step": 16965 + }, + { + "epoch": 0.03306602689681764, + "grad_norm": 5.236184120178223, + "learning_rate": 2.991913986504703e-05, + "loss": 1.8698, + "step": 16980 + }, + { + "epoch": 0.0330952371679279, + "grad_norm": 2.758821725845337, + "learning_rate": 2.9918997068292666e-05, + "loss": 1.8664, + "step": 16995 + }, + { + "epoch": 0.03312444743903816, + "grad_norm": 3.3068835735321045, + "learning_rate": 2.9918854145903326e-05, + "loss": 2.0247, + "step": 17010 + }, + { + "epoch": 0.03315365771014843, + "grad_norm": 5.055883884429932, + "learning_rate": 2.991871109788022e-05, + "loss": 1.7333, + "step": 17025 + }, + { + "epoch": 0.03318286798125869, + "grad_norm": 2.9909791946411133, + "learning_rate": 2.9918567924224545e-05, + "loss": 1.7868, + "step": 17040 + }, + { + "epoch": 0.03321207825236895, + "grad_norm": 4.3141913414001465, + "learning_rate": 2.9918424624937514e-05, + "loss": 1.8976, + "step": 17055 + }, + { + "epoch": 0.03324128852347922, + "grad_norm": 3.3365187644958496, + "learning_rate": 2.991828120002033e-05, + "loss": 1.8246, + "step": 17070 + }, + { + "epoch": 0.03327049879458948, + "grad_norm": 1.7029449939727783, + "learning_rate": 2.991813764947421e-05, + "loss": 1.7464, + "step": 17085 + }, + { + "epoch": 0.03329970906569974, + "grad_norm": 2.34187650680542, + "learning_rate": 2.9917993973300343e-05, + "loss": 2.0614, + "step": 17100 + }, + { + "epoch": 0.03332891933681, + "grad_norm": 2.3299100399017334, + "learning_rate": 2.9917850171499957e-05, + "loss": 1.9139, + "step": 17115 + }, + { + "epoch": 0.03335812960792027, + "grad_norm": 2.58627986907959, + "learning_rate": 2.9917706244074254e-05, + "loss": 2.0092, + "step": 17130 + }, + { + "epoch": 0.03338733987903053, + "grad_norm": 3.580124855041504, + "learning_rate": 2.991756219102445e-05, + "loss": 1.8548, + "step": 17145 + }, + { + "epoch": 0.03341655015014079, + "grad_norm": 1.6242703199386597, + "learning_rate": 2.9917418012351755e-05, + "loss": 1.8455, + "step": 17160 + }, + { + "epoch": 0.03344576042125106, + "grad_norm": 3.0739760398864746, + "learning_rate": 2.991727370805739e-05, + "loss": 1.846, + "step": 17175 + }, + { + "epoch": 0.03347497069236132, + "grad_norm": 2.6895573139190674, + "learning_rate": 2.991712927814256e-05, + "loss": 1.8201, + "step": 17190 + }, + { + "epoch": 0.03350418096347158, + "grad_norm": 3.353482961654663, + "learning_rate": 2.9916984722608488e-05, + "loss": 1.9598, + "step": 17205 + }, + { + "epoch": 0.03353339123458184, + "grad_norm": 3.334764242172241, + "learning_rate": 2.991684004145639e-05, + "loss": 1.8388, + "step": 17220 + }, + { + "epoch": 0.03356260150569211, + "grad_norm": 2.8904106616973877, + "learning_rate": 2.9916695234687484e-05, + "loss": 1.9402, + "step": 17235 + }, + { + "epoch": 0.03359181177680237, + "grad_norm": 2.01975154876709, + "learning_rate": 2.991655030230299e-05, + "loss": 1.9462, + "step": 17250 + }, + { + "epoch": 0.03362102204791263, + "grad_norm": 3.490748882293701, + "learning_rate": 2.9916405244304123e-05, + "loss": 1.9171, + "step": 17265 + }, + { + "epoch": 0.033650232319022894, + "grad_norm": 3.2388625144958496, + "learning_rate": 2.9916260060692114e-05, + "loss": 1.8305, + "step": 17280 + }, + { + "epoch": 0.03367944259013316, + "grad_norm": 3.952791690826416, + "learning_rate": 2.991611475146818e-05, + "loss": 1.9577, + "step": 17295 + }, + { + "epoch": 0.03370865286124342, + "grad_norm": 2.0647149085998535, + "learning_rate": 2.9915969316633548e-05, + "loss": 1.7789, + "step": 17310 + }, + { + "epoch": 0.03373786313235368, + "grad_norm": 2.9091007709503174, + "learning_rate": 2.9915823756189438e-05, + "loss": 2.0568, + "step": 17325 + }, + { + "epoch": 0.03376707340346395, + "grad_norm": 3.5968873500823975, + "learning_rate": 2.9915678070137078e-05, + "loss": 1.9264, + "step": 17340 + }, + { + "epoch": 0.03379628367457421, + "grad_norm": 2.7411301136016846, + "learning_rate": 2.9915532258477697e-05, + "loss": 1.832, + "step": 17355 + }, + { + "epoch": 0.03382549394568447, + "grad_norm": 3.8037030696868896, + "learning_rate": 2.991538632121252e-05, + "loss": 1.9751, + "step": 17370 + }, + { + "epoch": 0.033854704216794734, + "grad_norm": 2.2467167377471924, + "learning_rate": 2.9915240258342776e-05, + "loss": 1.8501, + "step": 17385 + }, + { + "epoch": 0.033883914487905, + "grad_norm": 2.01926589012146, + "learning_rate": 2.9915094069869696e-05, + "loss": 1.8398, + "step": 17400 + }, + { + "epoch": 0.03391312475901526, + "grad_norm": 3.427839994430542, + "learning_rate": 2.9914947755794515e-05, + "loss": 2.0322, + "step": 17415 + }, + { + "epoch": 0.033942335030125524, + "grad_norm": 2.9915285110473633, + "learning_rate": 2.991480131611846e-05, + "loss": 1.7704, + "step": 17430 + }, + { + "epoch": 0.03397154530123579, + "grad_norm": 2.0821945667266846, + "learning_rate": 2.9914654750842765e-05, + "loss": 1.9759, + "step": 17445 + }, + { + "epoch": 0.03400075557234605, + "grad_norm": 3.167320966720581, + "learning_rate": 2.9914508059968664e-05, + "loss": 1.8321, + "step": 17460 + }, + { + "epoch": 0.034029965843456314, + "grad_norm": 3.4027440547943115, + "learning_rate": 2.991436124349739e-05, + "loss": 1.8643, + "step": 17475 + }, + { + "epoch": 0.034059176114566575, + "grad_norm": 1.9895273447036743, + "learning_rate": 2.9914214301430183e-05, + "loss": 2.018, + "step": 17490 + }, + { + "epoch": 0.03408838638567684, + "grad_norm": 3.4797520637512207, + "learning_rate": 2.9914067233768285e-05, + "loss": 1.8666, + "step": 17505 + }, + { + "epoch": 0.034117596656787104, + "grad_norm": 2.578434944152832, + "learning_rate": 2.9913920040512925e-05, + "loss": 1.8808, + "step": 17520 + }, + { + "epoch": 0.034146806927897365, + "grad_norm": 3.499577522277832, + "learning_rate": 2.991377272166535e-05, + "loss": 2.112, + "step": 17535 + }, + { + "epoch": 0.03417601719900763, + "grad_norm": 2.6072142124176025, + "learning_rate": 2.9913625277226795e-05, + "loss": 1.7954, + "step": 17550 + }, + { + "epoch": 0.034205227470117894, + "grad_norm": 4.382633686065674, + "learning_rate": 2.9913477707198505e-05, + "loss": 1.8943, + "step": 17565 + }, + { + "epoch": 0.034234437741228155, + "grad_norm": 2.883723497390747, + "learning_rate": 2.9913330011581718e-05, + "loss": 1.8615, + "step": 17580 + }, + { + "epoch": 0.034263648012338416, + "grad_norm": 2.309401273727417, + "learning_rate": 2.991318219037769e-05, + "loss": 1.9813, + "step": 17595 + }, + { + "epoch": 0.034292858283448684, + "grad_norm": 3.3916573524475098, + "learning_rate": 2.991303424358765e-05, + "loss": 2.0303, + "step": 17610 + }, + { + "epoch": 0.034322068554558945, + "grad_norm": 2.0413196086883545, + "learning_rate": 2.9912886171212855e-05, + "loss": 1.7976, + "step": 17625 + }, + { + "epoch": 0.034351278825669206, + "grad_norm": 2.3423221111297607, + "learning_rate": 2.991273797325454e-05, + "loss": 2.0449, + "step": 17640 + }, + { + "epoch": 0.034380489096779474, + "grad_norm": 2.9180712699890137, + "learning_rate": 2.991258964971397e-05, + "loss": 1.937, + "step": 17655 + }, + { + "epoch": 0.034409699367889734, + "grad_norm": 2.8222568035125732, + "learning_rate": 2.9912441200592385e-05, + "loss": 1.8245, + "step": 17670 + }, + { + "epoch": 0.034438909638999995, + "grad_norm": 2.49934720993042, + "learning_rate": 2.991229262589103e-05, + "loss": 1.7475, + "step": 17685 + }, + { + "epoch": 0.034468119910110256, + "grad_norm": 2.517383098602295, + "learning_rate": 2.9912143925611166e-05, + "loss": 2.0156, + "step": 17700 + }, + { + "epoch": 0.034497330181220524, + "grad_norm": 1.9787654876708984, + "learning_rate": 2.9911995099754037e-05, + "loss": 1.7771, + "step": 17715 + }, + { + "epoch": 0.034526540452330785, + "grad_norm": 3.3665497303009033, + "learning_rate": 2.9911846148320903e-05, + "loss": 1.7394, + "step": 17730 + }, + { + "epoch": 0.034555750723441046, + "grad_norm": 2.7270398139953613, + "learning_rate": 2.9911697071313017e-05, + "loss": 1.8666, + "step": 17745 + }, + { + "epoch": 0.034584960994551314, + "grad_norm": 2.7560067176818848, + "learning_rate": 2.9911547868731626e-05, + "loss": 1.7811, + "step": 17760 + }, + { + "epoch": 0.034614171265661575, + "grad_norm": 3.9651834964752197, + "learning_rate": 2.9911398540577996e-05, + "loss": 1.8758, + "step": 17775 + }, + { + "epoch": 0.034643381536771836, + "grad_norm": 2.7909586429595947, + "learning_rate": 2.9911249086853386e-05, + "loss": 2.0029, + "step": 17790 + }, + { + "epoch": 0.0346725918078821, + "grad_norm": 3.109741449356079, + "learning_rate": 2.9911099507559045e-05, + "loss": 1.9692, + "step": 17805 + }, + { + "epoch": 0.034701802078992365, + "grad_norm": 1.7844855785369873, + "learning_rate": 2.9910949802696244e-05, + "loss": 1.7244, + "step": 17820 + }, + { + "epoch": 0.034731012350102626, + "grad_norm": 1.8406388759613037, + "learning_rate": 2.9910799972266232e-05, + "loss": 1.8689, + "step": 17835 + }, + { + "epoch": 0.03476022262121289, + "grad_norm": 2.9145405292510986, + "learning_rate": 2.9910650016270278e-05, + "loss": 1.8982, + "step": 17850 + }, + { + "epoch": 0.03478943289232315, + "grad_norm": 1.9398448467254639, + "learning_rate": 2.991049993470964e-05, + "loss": 1.5994, + "step": 17865 + }, + { + "epoch": 0.034818643163433416, + "grad_norm": 2.3878304958343506, + "learning_rate": 2.991034972758559e-05, + "loss": 1.8854, + "step": 17880 + }, + { + "epoch": 0.03484785343454368, + "grad_norm": 1.8284587860107422, + "learning_rate": 2.9910199394899385e-05, + "loss": 1.8261, + "step": 17895 + }, + { + "epoch": 0.03487706370565394, + "grad_norm": 3.306748151779175, + "learning_rate": 2.9910048936652294e-05, + "loss": 1.8727, + "step": 17910 + }, + { + "epoch": 0.034906273976764206, + "grad_norm": 3.8391776084899902, + "learning_rate": 2.9909898352845585e-05, + "loss": 1.8347, + "step": 17925 + }, + { + "epoch": 0.03493548424787447, + "grad_norm": 2.275381565093994, + "learning_rate": 2.9909747643480526e-05, + "loss": 1.8268, + "step": 17940 + }, + { + "epoch": 0.03496469451898473, + "grad_norm": 2.871506929397583, + "learning_rate": 2.9909596808558385e-05, + "loss": 1.9375, + "step": 17955 + }, + { + "epoch": 0.03499390479009499, + "grad_norm": 3.8660874366760254, + "learning_rate": 2.990944584808043e-05, + "loss": 1.678, + "step": 17970 + }, + { + "epoch": 0.03502311506120526, + "grad_norm": 4.991106033325195, + "learning_rate": 2.9909294762047935e-05, + "loss": 1.9136, + "step": 17985 + }, + { + "epoch": 0.03505232533231552, + "grad_norm": 3.8985953330993652, + "learning_rate": 2.990914355046217e-05, + "loss": 1.7425, + "step": 18000 + }, + { + "epoch": 0.03508153560342578, + "grad_norm": 2.2547903060913086, + "learning_rate": 2.9908992213324413e-05, + "loss": 1.9613, + "step": 18015 + }, + { + "epoch": 0.035110745874536047, + "grad_norm": 4.33575963973999, + "learning_rate": 2.9908840750635936e-05, + "loss": 2.0328, + "step": 18030 + }, + { + "epoch": 0.03513995614564631, + "grad_norm": 3.8632030487060547, + "learning_rate": 2.9908689162398012e-05, + "loss": 1.7863, + "step": 18045 + }, + { + "epoch": 0.03516916641675657, + "grad_norm": 4.932112216949463, + "learning_rate": 2.9908537448611927e-05, + "loss": 1.7159, + "step": 18060 + }, + { + "epoch": 0.03519837668786683, + "grad_norm": 2.3018031120300293, + "learning_rate": 2.9908385609278943e-05, + "loss": 1.9104, + "step": 18075 + }, + { + "epoch": 0.0352275869589771, + "grad_norm": 2.7413060665130615, + "learning_rate": 2.990823364440035e-05, + "loss": 1.6733, + "step": 18090 + }, + { + "epoch": 0.03525679723008736, + "grad_norm": 3.928980588912964, + "learning_rate": 2.9908081553977424e-05, + "loss": 1.9619, + "step": 18105 + }, + { + "epoch": 0.03528600750119762, + "grad_norm": 1.6751407384872437, + "learning_rate": 2.9907929338011447e-05, + "loss": 1.8283, + "step": 18120 + }, + { + "epoch": 0.03531521777230789, + "grad_norm": 4.515170574188232, + "learning_rate": 2.99077769965037e-05, + "loss": 1.7808, + "step": 18135 + }, + { + "epoch": 0.03534442804341815, + "grad_norm": 3.367410182952881, + "learning_rate": 2.9907624529455468e-05, + "loss": 1.9427, + "step": 18150 + }, + { + "epoch": 0.03537363831452841, + "grad_norm": 2.9150290489196777, + "learning_rate": 2.990747193686803e-05, + "loss": 2.021, + "step": 18165 + }, + { + "epoch": 0.03540284858563867, + "grad_norm": 3.456367015838623, + "learning_rate": 2.9907319218742677e-05, + "loss": 1.8711, + "step": 18180 + }, + { + "epoch": 0.03543205885674894, + "grad_norm": 2.504117727279663, + "learning_rate": 2.990716637508069e-05, + "loss": 1.9605, + "step": 18195 + }, + { + "epoch": 0.0354612691278592, + "grad_norm": 3.5525166988372803, + "learning_rate": 2.990701340588336e-05, + "loss": 1.6192, + "step": 18210 + }, + { + "epoch": 0.03549047939896946, + "grad_norm": 4.048015594482422, + "learning_rate": 2.9906860311151973e-05, + "loss": 1.9809, + "step": 18225 + }, + { + "epoch": 0.03551968967007973, + "grad_norm": 3.0211918354034424, + "learning_rate": 2.990670709088782e-05, + "loss": 1.8898, + "step": 18240 + }, + { + "epoch": 0.03554889994118999, + "grad_norm": 2.4371862411499023, + "learning_rate": 2.9906553745092184e-05, + "loss": 1.8836, + "step": 18255 + }, + { + "epoch": 0.03557811021230025, + "grad_norm": 3.461329460144043, + "learning_rate": 2.990640027376637e-05, + "loss": 1.7893, + "step": 18270 + }, + { + "epoch": 0.03560732048341051, + "grad_norm": 2.6082050800323486, + "learning_rate": 2.990624667691166e-05, + "loss": 1.94, + "step": 18285 + }, + { + "epoch": 0.03563653075452078, + "grad_norm": 2.8006784915924072, + "learning_rate": 2.990609295452935e-05, + "loss": 1.7607, + "step": 18300 + }, + { + "epoch": 0.03566574102563104, + "grad_norm": 4.304664611816406, + "learning_rate": 2.990593910662073e-05, + "loss": 1.8973, + "step": 18315 + }, + { + "epoch": 0.0356949512967413, + "grad_norm": 2.063380479812622, + "learning_rate": 2.9905785133187108e-05, + "loss": 1.7456, + "step": 18330 + }, + { + "epoch": 0.03572416156785156, + "grad_norm": 4.626766681671143, + "learning_rate": 2.9905631034229772e-05, + "loss": 1.8402, + "step": 18345 + }, + { + "epoch": 0.03575337183896183, + "grad_norm": 4.0876288414001465, + "learning_rate": 2.9905476809750017e-05, + "loss": 1.8239, + "step": 18360 + }, + { + "epoch": 0.03578258211007209, + "grad_norm": 2.5533339977264404, + "learning_rate": 2.9905322459749148e-05, + "loss": 1.9771, + "step": 18375 + }, + { + "epoch": 0.03581179238118235, + "grad_norm": 2.268286943435669, + "learning_rate": 2.990516798422846e-05, + "loss": 1.8886, + "step": 18390 + }, + { + "epoch": 0.03584100265229262, + "grad_norm": 2.343240261077881, + "learning_rate": 2.9905013383189257e-05, + "loss": 1.8676, + "step": 18405 + }, + { + "epoch": 0.03587021292340288, + "grad_norm": 3.544220209121704, + "learning_rate": 2.990485865663284e-05, + "loss": 1.7922, + "step": 18420 + }, + { + "epoch": 0.03589942319451314, + "grad_norm": 3.608947277069092, + "learning_rate": 2.9904703804560516e-05, + "loss": 1.7506, + "step": 18435 + }, + { + "epoch": 0.0359286334656234, + "grad_norm": 2.520986557006836, + "learning_rate": 2.990454882697358e-05, + "loss": 1.9715, + "step": 18450 + }, + { + "epoch": 0.03595784373673367, + "grad_norm": 2.598273277282715, + "learning_rate": 2.9904393723873342e-05, + "loss": 2.0018, + "step": 18465 + }, + { + "epoch": 0.03598705400784393, + "grad_norm": 3.4648282527923584, + "learning_rate": 2.990423849526111e-05, + "loss": 1.8609, + "step": 18480 + }, + { + "epoch": 0.03601626427895419, + "grad_norm": 4.387876510620117, + "learning_rate": 2.9904083141138194e-05, + "loss": 1.9949, + "step": 18495 + }, + { + "epoch": 0.03604547455006446, + "grad_norm": 2.121781349182129, + "learning_rate": 2.9903927661505888e-05, + "loss": 1.91, + "step": 18510 + }, + { + "epoch": 0.03607468482117472, + "grad_norm": 2.5395514965057373, + "learning_rate": 2.9903772056365516e-05, + "loss": 1.7663, + "step": 18525 + }, + { + "epoch": 0.03610389509228498, + "grad_norm": 4.20078706741333, + "learning_rate": 2.9903616325718385e-05, + "loss": 1.8051, + "step": 18540 + }, + { + "epoch": 0.03613310536339524, + "grad_norm": 3.0787668228149414, + "learning_rate": 2.9903460469565802e-05, + "loss": 1.7457, + "step": 18555 + }, + { + "epoch": 0.03616231563450551, + "grad_norm": 2.493914842605591, + "learning_rate": 2.9903304487909084e-05, + "loss": 1.841, + "step": 18570 + }, + { + "epoch": 0.03619152590561577, + "grad_norm": 4.367134094238281, + "learning_rate": 2.9903148380749543e-05, + "loss": 1.8202, + "step": 18585 + }, + { + "epoch": 0.03622073617672603, + "grad_norm": 4.713270664215088, + "learning_rate": 2.990299214808849e-05, + "loss": 1.7427, + "step": 18600 + }, + { + "epoch": 0.0362499464478363, + "grad_norm": 2.3630452156066895, + "learning_rate": 2.9902835789927246e-05, + "loss": 1.8379, + "step": 18615 + }, + { + "epoch": 0.03627915671894656, + "grad_norm": 4.240983009338379, + "learning_rate": 2.9902679306267127e-05, + "loss": 1.8876, + "step": 18630 + }, + { + "epoch": 0.03630836699005682, + "grad_norm": 2.1189780235290527, + "learning_rate": 2.990252269710945e-05, + "loss": 1.7273, + "step": 18645 + }, + { + "epoch": 0.036337577261167084, + "grad_norm": 4.207765579223633, + "learning_rate": 2.9902365962455533e-05, + "loss": 1.7961, + "step": 18660 + }, + { + "epoch": 0.03636678753227735, + "grad_norm": 4.526651859283447, + "learning_rate": 2.9902209102306694e-05, + "loss": 1.8292, + "step": 18675 + }, + { + "epoch": 0.03639599780338761, + "grad_norm": 4.371520042419434, + "learning_rate": 2.990205211666426e-05, + "loss": 1.9646, + "step": 18690 + }, + { + "epoch": 0.036425208074497874, + "grad_norm": 3.766552209854126, + "learning_rate": 2.990189500552955e-05, + "loss": 1.6872, + "step": 18705 + }, + { + "epoch": 0.03645441834560814, + "grad_norm": 3.610987901687622, + "learning_rate": 2.9901737768903882e-05, + "loss": 1.8095, + "step": 18720 + }, + { + "epoch": 0.0364836286167184, + "grad_norm": 2.77374529838562, + "learning_rate": 2.9901580406788583e-05, + "loss": 1.7853, + "step": 18735 + }, + { + "epoch": 0.036512838887828664, + "grad_norm": 1.9032909870147705, + "learning_rate": 2.9901422919184984e-05, + "loss": 2.066, + "step": 18750 + }, + { + "epoch": 0.036542049158938925, + "grad_norm": 6.839906215667725, + "learning_rate": 2.9901265306094406e-05, + "loss": 2.0159, + "step": 18765 + }, + { + "epoch": 0.03657125943004919, + "grad_norm": 3.8178064823150635, + "learning_rate": 2.9901107567518177e-05, + "loss": 1.9257, + "step": 18780 + }, + { + "epoch": 0.036600469701159453, + "grad_norm": 2.0359580516815186, + "learning_rate": 2.990094970345762e-05, + "loss": 1.8261, + "step": 18795 + }, + { + "epoch": 0.036629679972269714, + "grad_norm": 3.5759871006011963, + "learning_rate": 2.9900791713914078e-05, + "loss": 1.9477, + "step": 18810 + }, + { + "epoch": 0.036658890243379975, + "grad_norm": 2.471161127090454, + "learning_rate": 2.990063359888887e-05, + "loss": 1.9519, + "step": 18825 + }, + { + "epoch": 0.03668810051449024, + "grad_norm": 4.123219966888428, + "learning_rate": 2.9900475358383327e-05, + "loss": 1.9215, + "step": 18840 + }, + { + "epoch": 0.036717310785600504, + "grad_norm": 2.517751693725586, + "learning_rate": 2.9900316992398793e-05, + "loss": 1.9601, + "step": 18855 + }, + { + "epoch": 0.036746521056710765, + "grad_norm": 3.0052502155303955, + "learning_rate": 2.9900158500936587e-05, + "loss": 1.8937, + "step": 18870 + }, + { + "epoch": 0.03677573132782103, + "grad_norm": 3.820679187774658, + "learning_rate": 2.9899999883998057e-05, + "loss": 1.7623, + "step": 18885 + }, + { + "epoch": 0.036804941598931294, + "grad_norm": 4.087255954742432, + "learning_rate": 2.989984114158453e-05, + "loss": 1.7651, + "step": 18900 + }, + { + "epoch": 0.036834151870041555, + "grad_norm": 2.3399763107299805, + "learning_rate": 2.989968227369734e-05, + "loss": 1.8389, + "step": 18915 + }, + { + "epoch": 0.036863362141151816, + "grad_norm": 4.5868072509765625, + "learning_rate": 2.9899523280337836e-05, + "loss": 1.9238, + "step": 18930 + }, + { + "epoch": 0.036892572412262084, + "grad_norm": 2.5856025218963623, + "learning_rate": 2.989936416150735e-05, + "loss": 1.8044, + "step": 18945 + }, + { + "epoch": 0.036921782683372345, + "grad_norm": 2.635301351547241, + "learning_rate": 2.9899204917207222e-05, + "loss": 1.9433, + "step": 18960 + }, + { + "epoch": 0.036950992954482606, + "grad_norm": 2.7841100692749023, + "learning_rate": 2.9899045547438792e-05, + "loss": 1.8401, + "step": 18975 + }, + { + "epoch": 0.036980203225592874, + "grad_norm": 3.0509424209594727, + "learning_rate": 2.9898886052203407e-05, + "loss": 1.8828, + "step": 18990 + }, + { + "epoch": 0.037009413496703135, + "grad_norm": 2.99225115776062, + "learning_rate": 2.9898726431502402e-05, + "loss": 1.8158, + "step": 19005 + }, + { + "epoch": 0.037038623767813396, + "grad_norm": 2.5677740573883057, + "learning_rate": 2.9898566685337135e-05, + "loss": 1.8706, + "step": 19020 + }, + { + "epoch": 0.03706783403892366, + "grad_norm": 3.502387762069702, + "learning_rate": 2.9898406813708934e-05, + "loss": 1.9128, + "step": 19035 + }, + { + "epoch": 0.037097044310033925, + "grad_norm": 3.5594568252563477, + "learning_rate": 2.9898246816619162e-05, + "loss": 1.732, + "step": 19050 + }, + { + "epoch": 0.037126254581144186, + "grad_norm": 2.2263100147247314, + "learning_rate": 2.989808669406915e-05, + "loss": 1.9151, + "step": 19065 + }, + { + "epoch": 0.03715546485225445, + "grad_norm": 3.859412670135498, + "learning_rate": 2.989792644606026e-05, + "loss": 1.9925, + "step": 19080 + }, + { + "epoch": 0.037184675123364715, + "grad_norm": 2.2860965728759766, + "learning_rate": 2.9897766072593834e-05, + "loss": 1.9347, + "step": 19095 + }, + { + "epoch": 0.037213885394474976, + "grad_norm": 3.5833935737609863, + "learning_rate": 2.9897605573671224e-05, + "loss": 1.7977, + "step": 19110 + }, + { + "epoch": 0.03724309566558524, + "grad_norm": 3.612175226211548, + "learning_rate": 2.989744494929378e-05, + "loss": 1.9048, + "step": 19125 + }, + { + "epoch": 0.0372723059366955, + "grad_norm": 2.6882388591766357, + "learning_rate": 2.989728419946286e-05, + "loss": 1.814, + "step": 19140 + }, + { + "epoch": 0.037301516207805765, + "grad_norm": 2.753767728805542, + "learning_rate": 2.989712332417982e-05, + "loss": 1.8261, + "step": 19155 + }, + { + "epoch": 0.037330726478916026, + "grad_norm": 3.5201592445373535, + "learning_rate": 2.9896962323446004e-05, + "loss": 1.9855, + "step": 19170 + }, + { + "epoch": 0.03735993675002629, + "grad_norm": 2.892357587814331, + "learning_rate": 2.9896801197262773e-05, + "loss": 1.8325, + "step": 19185 + }, + { + "epoch": 0.037389147021136555, + "grad_norm": 3.37709903717041, + "learning_rate": 2.9896639945631483e-05, + "loss": 1.9563, + "step": 19200 + }, + { + "epoch": 0.037418357292246816, + "grad_norm": 1.9762299060821533, + "learning_rate": 2.9896478568553492e-05, + "loss": 1.8228, + "step": 19215 + }, + { + "epoch": 0.03744756756335708, + "grad_norm": 5.543561935424805, + "learning_rate": 2.9896317066030162e-05, + "loss": 1.7656, + "step": 19230 + }, + { + "epoch": 0.03747677783446734, + "grad_norm": 1.8155004978179932, + "learning_rate": 2.9896155438062852e-05, + "loss": 1.8033, + "step": 19245 + }, + { + "epoch": 0.037505988105577606, + "grad_norm": 3.613931655883789, + "learning_rate": 2.989599368465292e-05, + "loss": 1.9275, + "step": 19260 + }, + { + "epoch": 0.03753519837668787, + "grad_norm": 2.2457938194274902, + "learning_rate": 2.989583180580173e-05, + "loss": 1.9459, + "step": 19275 + }, + { + "epoch": 0.03756440864779813, + "grad_norm": 2.4014956951141357, + "learning_rate": 2.9895669801510646e-05, + "loss": 1.7973, + "step": 19290 + }, + { + "epoch": 0.037593618918908396, + "grad_norm": 2.1578032970428467, + "learning_rate": 2.9895507671781032e-05, + "loss": 1.8551, + "step": 19305 + }, + { + "epoch": 0.03762282919001866, + "grad_norm": 2.952676296234131, + "learning_rate": 2.9895345416614254e-05, + "loss": 1.7808, + "step": 19320 + }, + { + "epoch": 0.03765203946112892, + "grad_norm": 3.352534770965576, + "learning_rate": 2.989518303601167e-05, + "loss": 1.8889, + "step": 19335 + }, + { + "epoch": 0.03768124973223918, + "grad_norm": 5.104668617248535, + "learning_rate": 2.9895020529974667e-05, + "loss": 1.9522, + "step": 19350 + }, + { + "epoch": 0.03771046000334945, + "grad_norm": 3.0823004245758057, + "learning_rate": 2.9894857898504595e-05, + "loss": 1.916, + "step": 19365 + }, + { + "epoch": 0.03773967027445971, + "grad_norm": 2.544391393661499, + "learning_rate": 2.9894695141602824e-05, + "loss": 1.9019, + "step": 19380 + }, + { + "epoch": 0.03776888054556997, + "grad_norm": 2.4823343753814697, + "learning_rate": 2.989453225927074e-05, + "loss": 1.7927, + "step": 19395 + }, + { + "epoch": 0.03779809081668023, + "grad_norm": 1.8794801235198975, + "learning_rate": 2.98943692515097e-05, + "loss": 2.0112, + "step": 19410 + }, + { + "epoch": 0.0378273010877905, + "grad_norm": 2.5527946949005127, + "learning_rate": 2.9894206118321083e-05, + "loss": 1.8095, + "step": 19425 + }, + { + "epoch": 0.03785651135890076, + "grad_norm": 3.4643588066101074, + "learning_rate": 2.9894042859706265e-05, + "loss": 1.7435, + "step": 19440 + }, + { + "epoch": 0.03788572163001102, + "grad_norm": 3.913656234741211, + "learning_rate": 2.9893879475666613e-05, + "loss": 2.0963, + "step": 19455 + }, + { + "epoch": 0.03791493190112129, + "grad_norm": 5.583499431610107, + "learning_rate": 2.9893715966203502e-05, + "loss": 1.8735, + "step": 19470 + }, + { + "epoch": 0.03794414217223155, + "grad_norm": 1.9320791959762573, + "learning_rate": 2.989355233131832e-05, + "loss": 1.9904, + "step": 19485 + }, + { + "epoch": 0.03797335244334181, + "grad_norm": 3.6746532917022705, + "learning_rate": 2.9893388571012443e-05, + "loss": 1.8231, + "step": 19500 + }, + { + "epoch": 0.03800256271445207, + "grad_norm": 2.002924680709839, + "learning_rate": 2.989322468528724e-05, + "loss": 1.989, + "step": 19515 + }, + { + "epoch": 0.03803177298556234, + "grad_norm": 3.4807581901550293, + "learning_rate": 2.98930606741441e-05, + "loss": 1.849, + "step": 19530 + }, + { + "epoch": 0.0380609832566726, + "grad_norm": 4.430756568908691, + "learning_rate": 2.98928965375844e-05, + "loss": 1.9161, + "step": 19545 + }, + { + "epoch": 0.03809019352778286, + "grad_norm": 3.5762476921081543, + "learning_rate": 2.9892732275609525e-05, + "loss": 1.8861, + "step": 19560 + }, + { + "epoch": 0.03811940379889313, + "grad_norm": 2.6605224609375, + "learning_rate": 2.9892567888220855e-05, + "loss": 1.9023, + "step": 19575 + }, + { + "epoch": 0.03814861407000339, + "grad_norm": 2.929426431655884, + "learning_rate": 2.9892403375419778e-05, + "loss": 1.7741, + "step": 19590 + }, + { + "epoch": 0.03817782434111365, + "grad_norm": 3.075709342956543, + "learning_rate": 2.9892238737207677e-05, + "loss": 1.9575, + "step": 19605 + }, + { + "epoch": 0.03820703461222391, + "grad_norm": 3.5462255477905273, + "learning_rate": 2.9892073973585942e-05, + "loss": 1.8336, + "step": 19620 + }, + { + "epoch": 0.03823624488333418, + "grad_norm": 2.098604202270508, + "learning_rate": 2.9891909084555954e-05, + "loss": 1.8161, + "step": 19635 + }, + { + "epoch": 0.03826545515444444, + "grad_norm": 3.255275249481201, + "learning_rate": 2.9891744070119106e-05, + "loss": 1.9291, + "step": 19650 + }, + { + "epoch": 0.0382946654255547, + "grad_norm": 3.4731807708740234, + "learning_rate": 2.9891578930276787e-05, + "loss": 1.9403, + "step": 19665 + }, + { + "epoch": 0.03832387569666497, + "grad_norm": 2.0823373794555664, + "learning_rate": 2.9891413665030387e-05, + "loss": 1.8713, + "step": 19680 + }, + { + "epoch": 0.03835308596777523, + "grad_norm": 2.6273398399353027, + "learning_rate": 2.98912482743813e-05, + "loss": 1.8764, + "step": 19695 + }, + { + "epoch": 0.03838229623888549, + "grad_norm": 3.4412078857421875, + "learning_rate": 2.9891082758330915e-05, + "loss": 1.8471, + "step": 19710 + }, + { + "epoch": 0.03841150650999575, + "grad_norm": 2.652684211730957, + "learning_rate": 2.9890917116880625e-05, + "loss": 2.0607, + "step": 19725 + }, + { + "epoch": 0.03844071678110602, + "grad_norm": 2.8258442878723145, + "learning_rate": 2.989075135003183e-05, + "loss": 1.8497, + "step": 19740 + }, + { + "epoch": 0.03846992705221628, + "grad_norm": 3.239922285079956, + "learning_rate": 2.9890585457785923e-05, + "loss": 1.7763, + "step": 19755 + }, + { + "epoch": 0.03849913732332654, + "grad_norm": 2.4102368354797363, + "learning_rate": 2.9890419440144303e-05, + "loss": 1.9937, + "step": 19770 + }, + { + "epoch": 0.03852834759443681, + "grad_norm": 3.4451348781585693, + "learning_rate": 2.989025329710837e-05, + "loss": 1.8446, + "step": 19785 + }, + { + "epoch": 0.03855755786554707, + "grad_norm": 2.9538424015045166, + "learning_rate": 2.9890087028679517e-05, + "loss": 1.8478, + "step": 19800 + }, + { + "epoch": 0.03858676813665733, + "grad_norm": 2.6566977500915527, + "learning_rate": 2.9889920634859144e-05, + "loss": 1.7005, + "step": 19815 + }, + { + "epoch": 0.03861597840776759, + "grad_norm": 1.9971015453338623, + "learning_rate": 2.988975411564866e-05, + "loss": 1.8428, + "step": 19830 + }, + { + "epoch": 0.03864518867887786, + "grad_norm": 2.648861885070801, + "learning_rate": 2.9889587471049456e-05, + "loss": 2.0656, + "step": 19845 + }, + { + "epoch": 0.03867439894998812, + "grad_norm": 4.70402193069458, + "learning_rate": 2.9889420701062947e-05, + "loss": 1.8098, + "step": 19860 + }, + { + "epoch": 0.03870360922109838, + "grad_norm": 4.031128883361816, + "learning_rate": 2.988925380569053e-05, + "loss": 1.9847, + "step": 19875 + }, + { + "epoch": 0.038732819492208644, + "grad_norm": 3.7528605461120605, + "learning_rate": 2.988908678493361e-05, + "loss": 1.8671, + "step": 19890 + }, + { + "epoch": 0.03876202976331891, + "grad_norm": 2.24607253074646, + "learning_rate": 2.9888919638793604e-05, + "loss": 1.8053, + "step": 19905 + }, + { + "epoch": 0.03879124003442917, + "grad_norm": 2.6132078170776367, + "learning_rate": 2.9888752367271903e-05, + "loss": 2.0429, + "step": 19920 + }, + { + "epoch": 0.03882045030553943, + "grad_norm": 2.334711790084839, + "learning_rate": 2.988858497036993e-05, + "loss": 1.7526, + "step": 19935 + }, + { + "epoch": 0.0388496605766497, + "grad_norm": 3.065218210220337, + "learning_rate": 2.988841744808909e-05, + "loss": 1.9287, + "step": 19950 + }, + { + "epoch": 0.03887887084775996, + "grad_norm": 4.216036796569824, + "learning_rate": 2.9888249800430787e-05, + "loss": 1.7889, + "step": 19965 + }, + { + "epoch": 0.03890808111887022, + "grad_norm": 4.185488224029541, + "learning_rate": 2.988808202739644e-05, + "loss": 1.865, + "step": 19980 + }, + { + "epoch": 0.038937291389980484, + "grad_norm": 2.3708608150482178, + "learning_rate": 2.9887914128987465e-05, + "loss": 1.852, + "step": 19995 + }, + { + "epoch": 0.03896650166109075, + "grad_norm": 4.136464595794678, + "learning_rate": 2.9887746105205264e-05, + "loss": 1.8378, + "step": 20010 + }, + { + "epoch": 0.03899571193220101, + "grad_norm": 3.3061976432800293, + "learning_rate": 2.9887577956051263e-05, + "loss": 1.8702, + "step": 20025 + }, + { + "epoch": 0.039024922203311274, + "grad_norm": 3.5822839736938477, + "learning_rate": 2.9887409681526876e-05, + "loss": 1.87, + "step": 20040 + }, + { + "epoch": 0.03905413247442154, + "grad_norm": 4.132532119750977, + "learning_rate": 2.9887241281633518e-05, + "loss": 2.051, + "step": 20055 + }, + { + "epoch": 0.0390833427455318, + "grad_norm": 3.9095962047576904, + "learning_rate": 2.9887072756372606e-05, + "loss": 1.8271, + "step": 20070 + }, + { + "epoch": 0.039112553016642064, + "grad_norm": 3.028393507003784, + "learning_rate": 2.988690410574556e-05, + "loss": 1.825, + "step": 20085 + }, + { + "epoch": 0.039141763287752325, + "grad_norm": 1.7903603315353394, + "learning_rate": 2.98867353297538e-05, + "loss": 1.9565, + "step": 20100 + }, + { + "epoch": 0.03917097355886259, + "grad_norm": 2.168895959854126, + "learning_rate": 2.988656642839875e-05, + "loss": 1.8483, + "step": 20115 + }, + { + "epoch": 0.039200183829972854, + "grad_norm": 7.789179801940918, + "learning_rate": 2.988639740168183e-05, + "loss": 2.0627, + "step": 20130 + }, + { + "epoch": 0.039229394101083115, + "grad_norm": 2.033658981323242, + "learning_rate": 2.9886228249604464e-05, + "loss": 1.8905, + "step": 20145 + }, + { + "epoch": 0.03925860437219338, + "grad_norm": 3.6176931858062744, + "learning_rate": 2.9886058972168076e-05, + "loss": 1.8981, + "step": 20160 + }, + { + "epoch": 0.039287814643303644, + "grad_norm": 3.776540517807007, + "learning_rate": 2.9885889569374088e-05, + "loss": 1.7417, + "step": 20175 + }, + { + "epoch": 0.039317024914413905, + "grad_norm": 2.138796329498291, + "learning_rate": 2.9885720041223934e-05, + "loss": 1.8845, + "step": 20190 + }, + { + "epoch": 0.039346235185524166, + "grad_norm": 5.0401482582092285, + "learning_rate": 2.988555038771904e-05, + "loss": 1.8966, + "step": 20205 + }, + { + "epoch": 0.039375445456634434, + "grad_norm": 2.2886552810668945, + "learning_rate": 2.9885380608860827e-05, + "loss": 1.8898, + "step": 20220 + }, + { + "epoch": 0.039404655727744695, + "grad_norm": 2.286959648132324, + "learning_rate": 2.9885210704650734e-05, + "loss": 1.9597, + "step": 20235 + }, + { + "epoch": 0.039433865998854956, + "grad_norm": 2.538752794265747, + "learning_rate": 2.988504067509019e-05, + "loss": 1.8451, + "step": 20250 + }, + { + "epoch": 0.039463076269965223, + "grad_norm": 3.4922502040863037, + "learning_rate": 2.988487052018062e-05, + "loss": 1.8717, + "step": 20265 + }, + { + "epoch": 0.039492286541075484, + "grad_norm": 3.5919086933135986, + "learning_rate": 2.9884700239923467e-05, + "loss": 1.8182, + "step": 20280 + }, + { + "epoch": 0.039521496812185745, + "grad_norm": 1.9253063201904297, + "learning_rate": 2.988452983432016e-05, + "loss": 1.8067, + "step": 20295 + }, + { + "epoch": 0.039550707083296006, + "grad_norm": 3.168278217315674, + "learning_rate": 2.9884359303372127e-05, + "loss": 1.9619, + "step": 20310 + }, + { + "epoch": 0.039579917354406274, + "grad_norm": 2.574296236038208, + "learning_rate": 2.9884188647080816e-05, + "loss": 1.9037, + "step": 20325 + }, + { + "epoch": 0.039609127625516535, + "grad_norm": 3.87908673286438, + "learning_rate": 2.9884017865447657e-05, + "loss": 1.8106, + "step": 20340 + }, + { + "epoch": 0.039638337896626796, + "grad_norm": 3.060088872909546, + "learning_rate": 2.9883846958474093e-05, + "loss": 1.8878, + "step": 20355 + }, + { + "epoch": 0.039667548167737064, + "grad_norm": 2.6092071533203125, + "learning_rate": 2.988367592616156e-05, + "loss": 2.0189, + "step": 20370 + }, + { + "epoch": 0.039696758438847325, + "grad_norm": 2.688831329345703, + "learning_rate": 2.9883504768511496e-05, + "loss": 1.9439, + "step": 20385 + }, + { + "epoch": 0.039725968709957586, + "grad_norm": 4.126415252685547, + "learning_rate": 2.988333348552535e-05, + "loss": 1.7334, + "step": 20400 + }, + { + "epoch": 0.03975517898106785, + "grad_norm": 2.5536105632781982, + "learning_rate": 2.988316207720455e-05, + "loss": 1.8741, + "step": 20415 + }, + { + "epoch": 0.039784389252178115, + "grad_norm": 4.146584510803223, + "learning_rate": 2.9882990543550557e-05, + "loss": 1.839, + "step": 20430 + }, + { + "epoch": 0.039813599523288376, + "grad_norm": 4.531203746795654, + "learning_rate": 2.9882818884564805e-05, + "loss": 1.801, + "step": 20445 + }, + { + "epoch": 0.03984280979439864, + "grad_norm": 4.3206562995910645, + "learning_rate": 2.988264710024874e-05, + "loss": 1.9127, + "step": 20460 + }, + { + "epoch": 0.0398720200655089, + "grad_norm": 2.258627414703369, + "learning_rate": 2.9882475190603815e-05, + "loss": 1.9329, + "step": 20475 + }, + { + "epoch": 0.039901230336619166, + "grad_norm": 2.874171018600464, + "learning_rate": 2.988230315563147e-05, + "loss": 1.8632, + "step": 20490 + }, + { + "epoch": 0.03993044060772943, + "grad_norm": 2.2248058319091797, + "learning_rate": 2.988213099533316e-05, + "loss": 1.7653, + "step": 20505 + }, + { + "epoch": 0.03995965087883969, + "grad_norm": 2.2211756706237793, + "learning_rate": 2.988195870971033e-05, + "loss": 2.0817, + "step": 20520 + }, + { + "epoch": 0.039988861149949956, + "grad_norm": 2.266099214553833, + "learning_rate": 2.9881786298764432e-05, + "loss": 2.0993, + "step": 20535 + }, + { + "epoch": 0.04001807142106022, + "grad_norm": 3.2420692443847656, + "learning_rate": 2.988161376249692e-05, + "loss": 1.8448, + "step": 20550 + }, + { + "epoch": 0.04004728169217048, + "grad_norm": 2.1219029426574707, + "learning_rate": 2.9881441100909244e-05, + "loss": 1.9547, + "step": 20565 + }, + { + "epoch": 0.04007649196328074, + "grad_norm": 4.585850238800049, + "learning_rate": 2.988126831400286e-05, + "loss": 1.9619, + "step": 20580 + }, + { + "epoch": 0.04010570223439101, + "grad_norm": 3.4670188426971436, + "learning_rate": 2.9881095401779224e-05, + "loss": 2.0, + "step": 20595 + }, + { + "epoch": 0.04013491250550127, + "grad_norm": 2.0285186767578125, + "learning_rate": 2.9880922364239787e-05, + "loss": 1.9434, + "step": 20610 + }, + { + "epoch": 0.04016412277661153, + "grad_norm": 1.873279094696045, + "learning_rate": 2.9880749201386014e-05, + "loss": 1.8546, + "step": 20625 + }, + { + "epoch": 0.040193333047721796, + "grad_norm": 2.1878137588500977, + "learning_rate": 2.9880575913219354e-05, + "loss": 1.8624, + "step": 20640 + }, + { + "epoch": 0.04022254331883206, + "grad_norm": 3.9366180896759033, + "learning_rate": 2.988040249974128e-05, + "loss": 1.9143, + "step": 20655 + }, + { + "epoch": 0.04025175358994232, + "grad_norm": 2.3610141277313232, + "learning_rate": 2.9880228960953236e-05, + "loss": 1.9616, + "step": 20670 + }, + { + "epoch": 0.04028096386105258, + "grad_norm": 2.8658604621887207, + "learning_rate": 2.9880055296856695e-05, + "loss": 1.7601, + "step": 20685 + }, + { + "epoch": 0.04031017413216285, + "grad_norm": 4.454057216644287, + "learning_rate": 2.9879881507453112e-05, + "loss": 1.7578, + "step": 20700 + }, + { + "epoch": 0.04033938440327311, + "grad_norm": 2.244455575942993, + "learning_rate": 2.9879707592743957e-05, + "loss": 1.8334, + "step": 20715 + }, + { + "epoch": 0.04036859467438337, + "grad_norm": 1.61302649974823, + "learning_rate": 2.987953355273069e-05, + "loss": 1.9746, + "step": 20730 + }, + { + "epoch": 0.04039780494549364, + "grad_norm": 2.8323678970336914, + "learning_rate": 2.987935938741478e-05, + "loss": 2.1251, + "step": 20745 + }, + { + "epoch": 0.0404270152166039, + "grad_norm": 2.418241262435913, + "learning_rate": 2.987918509679769e-05, + "loss": 2.0798, + "step": 20760 + }, + { + "epoch": 0.04045622548771416, + "grad_norm": 3.465550661087036, + "learning_rate": 2.987901068088089e-05, + "loss": 1.8706, + "step": 20775 + }, + { + "epoch": 0.04048543575882442, + "grad_norm": 4.148263931274414, + "learning_rate": 2.987883613966585e-05, + "loss": 1.7925, + "step": 20790 + }, + { + "epoch": 0.04051464602993469, + "grad_norm": 2.4039666652679443, + "learning_rate": 2.9878661473154037e-05, + "loss": 1.8672, + "step": 20805 + }, + { + "epoch": 0.04054385630104495, + "grad_norm": 2.12880277633667, + "learning_rate": 2.9878486681346923e-05, + "loss": 1.8978, + "step": 20820 + }, + { + "epoch": 0.04057306657215521, + "grad_norm": 1.9612979888916016, + "learning_rate": 2.987831176424598e-05, + "loss": 1.9175, + "step": 20835 + }, + { + "epoch": 0.04060227684326548, + "grad_norm": 2.161982297897339, + "learning_rate": 2.9878136721852682e-05, + "loss": 1.7752, + "step": 20850 + }, + { + "epoch": 0.04063148711437574, + "grad_norm": 4.4879961013793945, + "learning_rate": 2.9877961554168498e-05, + "loss": 2.0857, + "step": 20865 + }, + { + "epoch": 0.040660697385486, + "grad_norm": 4.1571364402771, + "learning_rate": 2.9877786261194914e-05, + "loss": 1.949, + "step": 20880 + }, + { + "epoch": 0.04068990765659626, + "grad_norm": 3.3120033740997314, + "learning_rate": 2.9877610842933397e-05, + "loss": 1.8585, + "step": 20895 + }, + { + "epoch": 0.04071911792770653, + "grad_norm": 3.193117618560791, + "learning_rate": 2.9877435299385424e-05, + "loss": 1.731, + "step": 20910 + }, + { + "epoch": 0.04074832819881679, + "grad_norm": 2.375343084335327, + "learning_rate": 2.987725963055248e-05, + "loss": 1.8269, + "step": 20925 + }, + { + "epoch": 0.04077753846992705, + "grad_norm": 2.3607242107391357, + "learning_rate": 2.9877083836436036e-05, + "loss": 1.8305, + "step": 20940 + }, + { + "epoch": 0.04080674874103731, + "grad_norm": 3.0205342769622803, + "learning_rate": 2.987690791703758e-05, + "loss": 1.8631, + "step": 20955 + }, + { + "epoch": 0.04083595901214758, + "grad_norm": 3.530947685241699, + "learning_rate": 2.9876731872358585e-05, + "loss": 1.8431, + "step": 20970 + }, + { + "epoch": 0.04086516928325784, + "grad_norm": 2.8419220447540283, + "learning_rate": 2.987655570240054e-05, + "loss": 1.8519, + "step": 20985 + }, + { + "epoch": 0.0408943795543681, + "grad_norm": 2.253532886505127, + "learning_rate": 2.9876379407164933e-05, + "loss": 1.8688, + "step": 21000 + }, + { + "epoch": 0.04092358982547837, + "grad_norm": 1.9279251098632812, + "learning_rate": 2.987620298665324e-05, + "loss": 1.9111, + "step": 21015 + }, + { + "epoch": 0.04095280009658863, + "grad_norm": 3.0645790100097656, + "learning_rate": 2.987602644086695e-05, + "loss": 1.8359, + "step": 21030 + }, + { + "epoch": 0.04098201036769889, + "grad_norm": 2.9384896755218506, + "learning_rate": 2.9875849769807544e-05, + "loss": 1.747, + "step": 21045 + }, + { + "epoch": 0.04101122063880915, + "grad_norm": 3.7138500213623047, + "learning_rate": 2.987567297347652e-05, + "loss": 1.8528, + "step": 21060 + }, + { + "epoch": 0.04104043090991942, + "grad_norm": 4.440821170806885, + "learning_rate": 2.987549605187536e-05, + "loss": 1.9611, + "step": 21075 + }, + { + "epoch": 0.04106964118102968, + "grad_norm": 3.0378854274749756, + "learning_rate": 2.9875319005005552e-05, + "loss": 2.012, + "step": 21090 + }, + { + "epoch": 0.04109885145213994, + "grad_norm": 4.378201961517334, + "learning_rate": 2.9875141832868598e-05, + "loss": 1.7366, + "step": 21105 + }, + { + "epoch": 0.04112806172325021, + "grad_norm": 4.794327735900879, + "learning_rate": 2.9874964535465978e-05, + "loss": 1.8728, + "step": 21120 + }, + { + "epoch": 0.04115727199436047, + "grad_norm": 2.5137903690338135, + "learning_rate": 2.987478711279919e-05, + "loss": 1.9813, + "step": 21135 + }, + { + "epoch": 0.04118648226547073, + "grad_norm": 2.2024412155151367, + "learning_rate": 2.987460956486973e-05, + "loss": 1.7721, + "step": 21150 + }, + { + "epoch": 0.04121569253658099, + "grad_norm": 3.0043609142303467, + "learning_rate": 2.987443189167909e-05, + "loss": 1.9015, + "step": 21165 + }, + { + "epoch": 0.04124490280769126, + "grad_norm": 4.627270698547363, + "learning_rate": 2.9874254093228763e-05, + "loss": 1.9487, + "step": 21180 + }, + { + "epoch": 0.04127411307880152, + "grad_norm": 3.208395481109619, + "learning_rate": 2.987407616952025e-05, + "loss": 1.6989, + "step": 21195 + }, + { + "epoch": 0.04130332334991178, + "grad_norm": 4.102930545806885, + "learning_rate": 2.9873898120555055e-05, + "loss": 2.0639, + "step": 21210 + }, + { + "epoch": 0.04133253362102205, + "grad_norm": 3.846593141555786, + "learning_rate": 2.987371994633467e-05, + "loss": 1.7067, + "step": 21225 + }, + { + "epoch": 0.04136174389213231, + "grad_norm": 3.6651105880737305, + "learning_rate": 2.9873541646860597e-05, + "loss": 1.8983, + "step": 21240 + }, + { + "epoch": 0.04139095416324257, + "grad_norm": 3.715604543685913, + "learning_rate": 2.987336322213434e-05, + "loss": 1.6676, + "step": 21255 + }, + { + "epoch": 0.041420164434352834, + "grad_norm": 3.0780601501464844, + "learning_rate": 2.9873184672157395e-05, + "loss": 1.9342, + "step": 21270 + }, + { + "epoch": 0.0414493747054631, + "grad_norm": 3.385103225708008, + "learning_rate": 2.9873005996931274e-05, + "loss": 1.9494, + "step": 21285 + }, + { + "epoch": 0.04147858497657336, + "grad_norm": 2.9652836322784424, + "learning_rate": 2.9872827196457475e-05, + "loss": 1.8491, + "step": 21300 + }, + { + "epoch": 0.041507795247683624, + "grad_norm": 2.346210479736328, + "learning_rate": 2.9872648270737507e-05, + "loss": 1.7948, + "step": 21315 + }, + { + "epoch": 0.04153700551879389, + "grad_norm": 2.0421793460845947, + "learning_rate": 2.9872469219772877e-05, + "loss": 1.7642, + "step": 21330 + }, + { + "epoch": 0.04156621578990415, + "grad_norm": 3.2347426414489746, + "learning_rate": 2.9872290043565094e-05, + "loss": 1.9741, + "step": 21345 + }, + { + "epoch": 0.041595426061014414, + "grad_norm": 2.8749160766601562, + "learning_rate": 2.987211074211566e-05, + "loss": 1.8364, + "step": 21360 + }, + { + "epoch": 0.041624636332124675, + "grad_norm": 3.484539031982422, + "learning_rate": 2.9871931315426094e-05, + "loss": 1.7097, + "step": 21375 + }, + { + "epoch": 0.04165384660323494, + "grad_norm": 3.105286121368408, + "learning_rate": 2.98717517634979e-05, + "loss": 1.8239, + "step": 21390 + }, + { + "epoch": 0.0416830568743452, + "grad_norm": 3.804901361465454, + "learning_rate": 2.9871572086332594e-05, + "loss": 1.7356, + "step": 21405 + }, + { + "epoch": 0.041712267145455464, + "grad_norm": 2.4536283016204834, + "learning_rate": 2.9871392283931686e-05, + "loss": 1.8601, + "step": 21420 + }, + { + "epoch": 0.04174147741656573, + "grad_norm": 2.8864688873291016, + "learning_rate": 2.9871212356296697e-05, + "loss": 1.802, + "step": 21435 + }, + { + "epoch": 0.04177068768767599, + "grad_norm": 3.4735238552093506, + "learning_rate": 2.9871032303429133e-05, + "loss": 1.9557, + "step": 21450 + }, + { + "epoch": 0.041799897958786254, + "grad_norm": 4.21823263168335, + "learning_rate": 2.9870852125330513e-05, + "loss": 1.8513, + "step": 21465 + }, + { + "epoch": 0.041829108229896515, + "grad_norm": 1.6568303108215332, + "learning_rate": 2.9870671822002357e-05, + "loss": 1.8443, + "step": 21480 + }, + { + "epoch": 0.04185831850100678, + "grad_norm": 3.8471972942352295, + "learning_rate": 2.9870491393446184e-05, + "loss": 1.8831, + "step": 21495 + }, + { + "epoch": 0.041887528772117044, + "grad_norm": 4.5181803703308105, + "learning_rate": 2.987031083966351e-05, + "loss": 2.0743, + "step": 21510 + }, + { + "epoch": 0.041916739043227305, + "grad_norm": 2.7604262828826904, + "learning_rate": 2.987013016065586e-05, + "loss": 1.7647, + "step": 21525 + }, + { + "epoch": 0.041945949314337566, + "grad_norm": 1.6995900869369507, + "learning_rate": 2.986994935642475e-05, + "loss": 1.9221, + "step": 21540 + }, + { + "epoch": 0.041975159585447834, + "grad_norm": 3.400073766708374, + "learning_rate": 2.9869768426971706e-05, + "loss": 1.7217, + "step": 21555 + }, + { + "epoch": 0.042004369856558095, + "grad_norm": 2.7783281803131104, + "learning_rate": 2.986958737229825e-05, + "loss": 1.8149, + "step": 21570 + }, + { + "epoch": 0.042033580127668356, + "grad_norm": 1.9334073066711426, + "learning_rate": 2.9869406192405904e-05, + "loss": 1.7657, + "step": 21585 + }, + { + "epoch": 0.042062790398778624, + "grad_norm": 3.244271993637085, + "learning_rate": 2.9869224887296205e-05, + "loss": 1.8122, + "step": 21600 + }, + { + "epoch": 0.042092000669888885, + "grad_norm": 3.9582252502441406, + "learning_rate": 2.9869043456970662e-05, + "loss": 1.8296, + "step": 21615 + }, + { + "epoch": 0.042121210940999146, + "grad_norm": 3.9118635654449463, + "learning_rate": 2.986886190143082e-05, + "loss": 1.9162, + "step": 21630 + }, + { + "epoch": 0.04215042121210941, + "grad_norm": 3.3247733116149902, + "learning_rate": 2.9868680220678198e-05, + "loss": 1.9345, + "step": 21645 + }, + { + "epoch": 0.042179631483219675, + "grad_norm": 4.189952373504639, + "learning_rate": 2.9868498414714332e-05, + "loss": 1.7394, + "step": 21660 + }, + { + "epoch": 0.042208841754329936, + "grad_norm": 4.380448818206787, + "learning_rate": 2.986831648354075e-05, + "loss": 1.8411, + "step": 21675 + }, + { + "epoch": 0.0422380520254402, + "grad_norm": 3.3280911445617676, + "learning_rate": 2.986813442715898e-05, + "loss": 1.7338, + "step": 21690 + }, + { + "epoch": 0.042267262296550465, + "grad_norm": 2.147976875305176, + "learning_rate": 2.9867952245570557e-05, + "loss": 1.7418, + "step": 21705 + }, + { + "epoch": 0.042296472567660726, + "grad_norm": 4.927828788757324, + "learning_rate": 2.9867769938777025e-05, + "loss": 1.9729, + "step": 21720 + }, + { + "epoch": 0.04232568283877099, + "grad_norm": 2.1555793285369873, + "learning_rate": 2.9867587506779903e-05, + "loss": 1.9733, + "step": 21735 + }, + { + "epoch": 0.04235489310988125, + "grad_norm": 4.826751232147217, + "learning_rate": 2.986740494958074e-05, + "loss": 1.9141, + "step": 21750 + }, + { + "epoch": 0.042384103380991515, + "grad_norm": 2.62713360786438, + "learning_rate": 2.986722226718107e-05, + "loss": 1.7623, + "step": 21765 + }, + { + "epoch": 0.042413313652101776, + "grad_norm": 2.9087536334991455, + "learning_rate": 2.9867039459582422e-05, + "loss": 2.0298, + "step": 21780 + }, + { + "epoch": 0.04244252392321204, + "grad_norm": 3.2233824729919434, + "learning_rate": 2.986685652678635e-05, + "loss": 1.8063, + "step": 21795 + }, + { + "epoch": 0.042471734194322305, + "grad_norm": 3.8226282596588135, + "learning_rate": 2.9866673468794392e-05, + "loss": 1.8173, + "step": 21810 + }, + { + "epoch": 0.042500944465432566, + "grad_norm": 2.362210273742676, + "learning_rate": 2.986649028560808e-05, + "loss": 2.0107, + "step": 21825 + }, + { + "epoch": 0.04253015473654283, + "grad_norm": 1.4686235189437866, + "learning_rate": 2.9866306977228964e-05, + "loss": 1.8339, + "step": 21840 + }, + { + "epoch": 0.04255936500765309, + "grad_norm": 3.2174501419067383, + "learning_rate": 2.9866123543658585e-05, + "loss": 1.9372, + "step": 21855 + }, + { + "epoch": 0.042588575278763356, + "grad_norm": 4.215010643005371, + "learning_rate": 2.9865939984898494e-05, + "loss": 1.7492, + "step": 21870 + }, + { + "epoch": 0.04261778554987362, + "grad_norm": 6.015155792236328, + "learning_rate": 2.9865756300950224e-05, + "loss": 1.8146, + "step": 21885 + }, + { + "epoch": 0.04264699582098388, + "grad_norm": 3.102923631668091, + "learning_rate": 2.9865572491815336e-05, + "loss": 1.9206, + "step": 21900 + }, + { + "epoch": 0.042676206092094146, + "grad_norm": 3.5606796741485596, + "learning_rate": 2.986538855749537e-05, + "loss": 1.7442, + "step": 21915 + }, + { + "epoch": 0.04270541636320441, + "grad_norm": 3.877696990966797, + "learning_rate": 2.9865204497991874e-05, + "loss": 1.8356, + "step": 21930 + }, + { + "epoch": 0.04273462663431467, + "grad_norm": 2.7707462310791016, + "learning_rate": 2.98650203133064e-05, + "loss": 2.1114, + "step": 21945 + }, + { + "epoch": 0.04276383690542493, + "grad_norm": 3.8471269607543945, + "learning_rate": 2.9864836003440496e-05, + "loss": 1.974, + "step": 21960 + }, + { + "epoch": 0.0427930471765352, + "grad_norm": 4.265331268310547, + "learning_rate": 2.9864651568395728e-05, + "loss": 1.7851, + "step": 21975 + }, + { + "epoch": 0.04282225744764546, + "grad_norm": 3.578641414642334, + "learning_rate": 2.986446700817363e-05, + "loss": 1.7996, + "step": 21990 + }, + { + "epoch": 0.04285146771875572, + "grad_norm": 3.289726495742798, + "learning_rate": 2.9864282322775768e-05, + "loss": 1.7165, + "step": 22005 + }, + { + "epoch": 0.04288067798986598, + "grad_norm": 1.883353590965271, + "learning_rate": 2.986409751220369e-05, + "loss": 1.8633, + "step": 22020 + }, + { + "epoch": 0.04290988826097625, + "grad_norm": 1.7199418544769287, + "learning_rate": 2.9863912576458955e-05, + "loss": 1.7696, + "step": 22035 + }, + { + "epoch": 0.04293909853208651, + "grad_norm": 3.445349931716919, + "learning_rate": 2.986372751554313e-05, + "loss": 1.8349, + "step": 22050 + }, + { + "epoch": 0.04296830880319677, + "grad_norm": 2.827427387237549, + "learning_rate": 2.986354232945776e-05, + "loss": 1.8647, + "step": 22065 + }, + { + "epoch": 0.04299751907430704, + "grad_norm": 2.4225332736968994, + "learning_rate": 2.986335701820441e-05, + "loss": 2.0951, + "step": 22080 + }, + { + "epoch": 0.0430267293454173, + "grad_norm": 2.9156410694122314, + "learning_rate": 2.986317158178464e-05, + "loss": 1.7989, + "step": 22095 + }, + { + "epoch": 0.04305593961652756, + "grad_norm": 2.3038766384124756, + "learning_rate": 2.986298602020001e-05, + "loss": 1.8867, + "step": 22110 + }, + { + "epoch": 0.04308514988763782, + "grad_norm": 3.085129976272583, + "learning_rate": 2.986280033345209e-05, + "loss": 1.6969, + "step": 22125 + }, + { + "epoch": 0.04311436015874809, + "grad_norm": 4.071490287780762, + "learning_rate": 2.986261452154243e-05, + "loss": 1.7865, + "step": 22140 + }, + { + "epoch": 0.04314357042985835, + "grad_norm": 2.2542295455932617, + "learning_rate": 2.986242858447261e-05, + "loss": 1.8687, + "step": 22155 + }, + { + "epoch": 0.04317278070096861, + "grad_norm": 3.3869330883026123, + "learning_rate": 2.9862242522244183e-05, + "loss": 1.8037, + "step": 22170 + }, + { + "epoch": 0.04320199097207888, + "grad_norm": 3.7993576526641846, + "learning_rate": 2.9862056334858727e-05, + "loss": 1.8819, + "step": 22185 + }, + { + "epoch": 0.04323120124318914, + "grad_norm": 3.019289255142212, + "learning_rate": 2.9861870022317798e-05, + "loss": 1.8932, + "step": 22200 + }, + { + "epoch": 0.0432604115142994, + "grad_norm": 2.831664562225342, + "learning_rate": 2.9861683584622976e-05, + "loss": 1.8813, + "step": 22215 + }, + { + "epoch": 0.04328962178540966, + "grad_norm": 4.9506754875183105, + "learning_rate": 2.9861497021775825e-05, + "loss": 1.7917, + "step": 22230 + }, + { + "epoch": 0.04331883205651993, + "grad_norm": 2.384033203125, + "learning_rate": 2.986131033377792e-05, + "loss": 2.0226, + "step": 22245 + }, + { + "epoch": 0.04334804232763019, + "grad_norm": 1.8661621809005737, + "learning_rate": 2.9861123520630828e-05, + "loss": 1.817, + "step": 22260 + }, + { + "epoch": 0.04337725259874045, + "grad_norm": 3.5283803939819336, + "learning_rate": 2.9860936582336123e-05, + "loss": 1.806, + "step": 22275 + }, + { + "epoch": 0.04340646286985072, + "grad_norm": 4.240048408508301, + "learning_rate": 2.9860749518895386e-05, + "loss": 1.7773, + "step": 22290 + }, + { + "epoch": 0.04343567314096098, + "grad_norm": 3.9326212406158447, + "learning_rate": 2.986056233031018e-05, + "loss": 1.8494, + "step": 22305 + }, + { + "epoch": 0.04346488341207124, + "grad_norm": 4.390437602996826, + "learning_rate": 2.986037501658209e-05, + "loss": 1.7608, + "step": 22320 + }, + { + "epoch": 0.0434940936831815, + "grad_norm": 2.5305089950561523, + "learning_rate": 2.98601875777127e-05, + "loss": 1.7295, + "step": 22335 + }, + { + "epoch": 0.04352330395429177, + "grad_norm": 2.4276509284973145, + "learning_rate": 2.9860000013703576e-05, + "loss": 2.0082, + "step": 22350 + }, + { + "epoch": 0.04355251422540203, + "grad_norm": 3.4239320755004883, + "learning_rate": 2.9859812324556298e-05, + "loss": 1.6979, + "step": 22365 + }, + { + "epoch": 0.04358172449651229, + "grad_norm": 2.544837474822998, + "learning_rate": 2.985962451027245e-05, + "loss": 1.7844, + "step": 22380 + }, + { + "epoch": 0.04361093476762256, + "grad_norm": 5.094317436218262, + "learning_rate": 2.985943657085362e-05, + "loss": 1.8792, + "step": 22395 + }, + { + "epoch": 0.04364014503873282, + "grad_norm": 3.235121250152588, + "learning_rate": 2.985924850630138e-05, + "loss": 1.7795, + "step": 22410 + }, + { + "epoch": 0.04366935530984308, + "grad_norm": 4.88664436340332, + "learning_rate": 2.9859060316617325e-05, + "loss": 1.9035, + "step": 22425 + }, + { + "epoch": 0.04369856558095334, + "grad_norm": 2.401301622390747, + "learning_rate": 2.9858872001803025e-05, + "loss": 1.9182, + "step": 22440 + }, + { + "epoch": 0.04372777585206361, + "grad_norm": 2.24088978767395, + "learning_rate": 2.9858683561860077e-05, + "loss": 1.688, + "step": 22455 + }, + { + "epoch": 0.04375698612317387, + "grad_norm": 2.521176338195801, + "learning_rate": 2.9858494996790065e-05, + "loss": 1.962, + "step": 22470 + }, + { + "epoch": 0.04378619639428413, + "grad_norm": 2.897905111312866, + "learning_rate": 2.9858306306594578e-05, + "loss": 1.7388, + "step": 22485 + }, + { + "epoch": 0.0438154066653944, + "grad_norm": 2.3301055431365967, + "learning_rate": 2.9858117491275204e-05, + "loss": 1.8834, + "step": 22500 + }, + { + "epoch": 0.04384461693650466, + "grad_norm": 2.078205108642578, + "learning_rate": 2.9857928550833533e-05, + "loss": 1.8959, + "step": 22515 + }, + { + "epoch": 0.04387382720761492, + "grad_norm": 2.8938355445861816, + "learning_rate": 2.9857739485271153e-05, + "loss": 1.8529, + "step": 22530 + }, + { + "epoch": 0.04390303747872518, + "grad_norm": 2.0363752841949463, + "learning_rate": 2.9857550294589663e-05, + "loss": 1.9974, + "step": 22545 + }, + { + "epoch": 0.04393224774983545, + "grad_norm": 3.58372163772583, + "learning_rate": 2.9857360978790647e-05, + "loss": 1.7921, + "step": 22560 + }, + { + "epoch": 0.04396145802094571, + "grad_norm": 3.4086432456970215, + "learning_rate": 2.985717153787571e-05, + "loss": 1.8401, + "step": 22575 + }, + { + "epoch": 0.04399066829205597, + "grad_norm": 2.3264150619506836, + "learning_rate": 2.985698197184644e-05, + "loss": 1.8789, + "step": 22590 + }, + { + "epoch": 0.044019878563166234, + "grad_norm": 2.727571964263916, + "learning_rate": 2.9856792280704435e-05, + "loss": 1.7859, + "step": 22605 + }, + { + "epoch": 0.0440490888342765, + "grad_norm": 3.833146095275879, + "learning_rate": 2.9856602464451293e-05, + "loss": 1.9976, + "step": 22620 + }, + { + "epoch": 0.04407829910538676, + "grad_norm": 1.8543720245361328, + "learning_rate": 2.9856412523088612e-05, + "loss": 1.8875, + "step": 22635 + }, + { + "epoch": 0.044107509376497024, + "grad_norm": 2.6003103256225586, + "learning_rate": 2.9856222456617993e-05, + "loss": 2.0006, + "step": 22650 + }, + { + "epoch": 0.04413671964760729, + "grad_norm": 3.0136611461639404, + "learning_rate": 2.9856032265041035e-05, + "loss": 1.8309, + "step": 22665 + }, + { + "epoch": 0.04416592991871755, + "grad_norm": 1.948042631149292, + "learning_rate": 2.9855841948359337e-05, + "loss": 1.8721, + "step": 22680 + }, + { + "epoch": 0.044195140189827814, + "grad_norm": 3.1908979415893555, + "learning_rate": 2.9855651506574507e-05, + "loss": 1.8373, + "step": 22695 + }, + { + "epoch": 0.044224350460938075, + "grad_norm": 3.519826889038086, + "learning_rate": 2.985546093968815e-05, + "loss": 1.8992, + "step": 22710 + }, + { + "epoch": 0.04425356073204834, + "grad_norm": 4.259973526000977, + "learning_rate": 2.985527024770186e-05, + "loss": 1.9466, + "step": 22725 + }, + { + "epoch": 0.044282771003158604, + "grad_norm": 2.69942569732666, + "learning_rate": 2.9855079430617253e-05, + "loss": 1.6805, + "step": 22740 + }, + { + "epoch": 0.044311981274268865, + "grad_norm": 3.0210318565368652, + "learning_rate": 2.9854888488435933e-05, + "loss": 1.8744, + "step": 22755 + }, + { + "epoch": 0.04434119154537913, + "grad_norm": 2.448091745376587, + "learning_rate": 2.9854697421159505e-05, + "loss": 1.8449, + "step": 22770 + }, + { + "epoch": 0.044370401816489394, + "grad_norm": 3.8297231197357178, + "learning_rate": 2.9854506228789586e-05, + "loss": 1.8071, + "step": 22785 + }, + { + "epoch": 0.044399612087599655, + "grad_norm": 4.065975189208984, + "learning_rate": 2.9854314911327777e-05, + "loss": 1.9335, + "step": 22800 + }, + { + "epoch": 0.044428822358709916, + "grad_norm": 2.0296108722686768, + "learning_rate": 2.9854123468775693e-05, + "loss": 1.748, + "step": 22815 + }, + { + "epoch": 0.044458032629820184, + "grad_norm": 1.7504347562789917, + "learning_rate": 2.985393190113495e-05, + "loss": 1.9084, + "step": 22830 + }, + { + "epoch": 0.044487242900930445, + "grad_norm": 2.8802502155303955, + "learning_rate": 2.9853740208407152e-05, + "loss": 1.8649, + "step": 22845 + }, + { + "epoch": 0.044516453172040706, + "grad_norm": 3.5376250743865967, + "learning_rate": 2.985354839059392e-05, + "loss": 1.7617, + "step": 22860 + }, + { + "epoch": 0.044545663443150973, + "grad_norm": 4.846216201782227, + "learning_rate": 2.985335644769687e-05, + "loss": 1.7531, + "step": 22875 + }, + { + "epoch": 0.044574873714261234, + "grad_norm": 1.9456549882888794, + "learning_rate": 2.9853164379717615e-05, + "loss": 1.9782, + "step": 22890 + }, + { + "epoch": 0.044604083985371495, + "grad_norm": 3.7463254928588867, + "learning_rate": 2.9852972186657774e-05, + "loss": 1.9347, + "step": 22905 + }, + { + "epoch": 0.044633294256481756, + "grad_norm": 1.9641201496124268, + "learning_rate": 2.9852779868518967e-05, + "loss": 1.8366, + "step": 22920 + }, + { + "epoch": 0.044662504527592024, + "grad_norm": 3.9136605262756348, + "learning_rate": 2.9852587425302812e-05, + "loss": 2.0271, + "step": 22935 + }, + { + "epoch": 0.044691714798702285, + "grad_norm": 2.161766767501831, + "learning_rate": 2.9852394857010923e-05, + "loss": 1.8681, + "step": 22950 + }, + { + "epoch": 0.044720925069812546, + "grad_norm": 3.1569862365722656, + "learning_rate": 2.9852202163644937e-05, + "loss": 1.7996, + "step": 22965 + }, + { + "epoch": 0.044750135340922814, + "grad_norm": 2.0586421489715576, + "learning_rate": 2.9852009345206458e-05, + "loss": 1.7727, + "step": 22980 + }, + { + "epoch": 0.044779345612033075, + "grad_norm": 5.425686359405518, + "learning_rate": 2.9851816401697127e-05, + "loss": 1.9209, + "step": 22995 + }, + { + "epoch": 0.044808555883143336, + "grad_norm": 5.385043621063232, + "learning_rate": 2.985162333311856e-05, + "loss": 1.8473, + "step": 23010 + }, + { + "epoch": 0.0448377661542536, + "grad_norm": 3.4577736854553223, + "learning_rate": 2.985143013947238e-05, + "loss": 1.848, + "step": 23025 + }, + { + "epoch": 0.044866976425363865, + "grad_norm": 3.0630152225494385, + "learning_rate": 2.985123682076022e-05, + "loss": 1.7272, + "step": 23040 + }, + { + "epoch": 0.044896186696474126, + "grad_norm": 3.8338427543640137, + "learning_rate": 2.985104337698371e-05, + "loss": 1.9042, + "step": 23055 + }, + { + "epoch": 0.04492539696758439, + "grad_norm": 3.1702969074249268, + "learning_rate": 2.985084980814447e-05, + "loss": 1.9389, + "step": 23070 + }, + { + "epoch": 0.04495460723869465, + "grad_norm": 2.758162021636963, + "learning_rate": 2.985065611424414e-05, + "loss": 1.8825, + "step": 23085 + }, + { + "epoch": 0.044983817509804916, + "grad_norm": 2.1373486518859863, + "learning_rate": 2.985046229528434e-05, + "loss": 1.9451, + "step": 23100 + }, + { + "epoch": 0.04501302778091518, + "grad_norm": 2.1302478313446045, + "learning_rate": 2.985026835126671e-05, + "loss": 2.0208, + "step": 23115 + }, + { + "epoch": 0.04504223805202544, + "grad_norm": 3.8777012825012207, + "learning_rate": 2.985007428219289e-05, + "loss": 2.0138, + "step": 23130 + }, + { + "epoch": 0.045071448323135706, + "grad_norm": 3.428769111633301, + "learning_rate": 2.9849880088064497e-05, + "loss": 1.9868, + "step": 23145 + }, + { + "epoch": 0.04510065859424597, + "grad_norm": 5.036014556884766, + "learning_rate": 2.9849685768883172e-05, + "loss": 1.7348, + "step": 23160 + }, + { + "epoch": 0.04512986886535623, + "grad_norm": 3.206932306289673, + "learning_rate": 2.9849491324650563e-05, + "loss": 1.9991, + "step": 23175 + }, + { + "epoch": 0.04515907913646649, + "grad_norm": 2.5311954021453857, + "learning_rate": 2.9849296755368297e-05, + "loss": 1.7501, + "step": 23190 + }, + { + "epoch": 0.04518828940757676, + "grad_norm": 2.1587252616882324, + "learning_rate": 2.984910206103801e-05, + "loss": 1.7981, + "step": 23205 + }, + { + "epoch": 0.04521749967868702, + "grad_norm": 2.198859453201294, + "learning_rate": 2.984890724166135e-05, + "loss": 1.9349, + "step": 23220 + }, + { + "epoch": 0.04524670994979728, + "grad_norm": 3.513099193572998, + "learning_rate": 2.9848712297239955e-05, + "loss": 1.7639, + "step": 23235 + }, + { + "epoch": 0.045275920220907546, + "grad_norm": 4.66519021987915, + "learning_rate": 2.984851722777546e-05, + "loss": 1.6865, + "step": 23250 + }, + { + "epoch": 0.04530513049201781, + "grad_norm": 2.0970771312713623, + "learning_rate": 2.9848322033269523e-05, + "loss": 1.8861, + "step": 23265 + }, + { + "epoch": 0.04533434076312807, + "grad_norm": 4.110560417175293, + "learning_rate": 2.984812671372377e-05, + "loss": 1.8482, + "step": 23280 + }, + { + "epoch": 0.04536355103423833, + "grad_norm": 3.2896182537078857, + "learning_rate": 2.9847931269139854e-05, + "loss": 1.9021, + "step": 23295 + }, + { + "epoch": 0.0453927613053486, + "grad_norm": 1.929777979850769, + "learning_rate": 2.9847735699519423e-05, + "loss": 1.9236, + "step": 23310 + }, + { + "epoch": 0.04542197157645886, + "grad_norm": 2.989177703857422, + "learning_rate": 2.9847540004864115e-05, + "loss": 1.8256, + "step": 23325 + }, + { + "epoch": 0.04545118184756912, + "grad_norm": 2.735318899154663, + "learning_rate": 2.984734418517559e-05, + "loss": 1.8279, + "step": 23340 + }, + { + "epoch": 0.04548039211867939, + "grad_norm": 2.597491502761841, + "learning_rate": 2.9847148240455495e-05, + "loss": 1.8952, + "step": 23355 + }, + { + "epoch": 0.04550960238978965, + "grad_norm": 3.5635156631469727, + "learning_rate": 2.9846952170705473e-05, + "loss": 1.7302, + "step": 23370 + }, + { + "epoch": 0.04553881266089991, + "grad_norm": 6.853522777557373, + "learning_rate": 2.9846755975927174e-05, + "loss": 2.0572, + "step": 23385 + }, + { + "epoch": 0.04556802293201017, + "grad_norm": 2.006582498550415, + "learning_rate": 2.984655965612226e-05, + "loss": 1.939, + "step": 23400 + }, + { + "epoch": 0.04559723320312044, + "grad_norm": 3.6737654209136963, + "learning_rate": 2.984636321129238e-05, + "loss": 1.8714, + "step": 23415 + }, + { + "epoch": 0.0456264434742307, + "grad_norm": 2.5802881717681885, + "learning_rate": 2.9846166641439185e-05, + "loss": 1.8687, + "step": 23430 + }, + { + "epoch": 0.04565565374534096, + "grad_norm": 3.748178005218506, + "learning_rate": 2.984596994656433e-05, + "loss": 1.9146, + "step": 23445 + }, + { + "epoch": 0.04568486401645123, + "grad_norm": 2.5193021297454834, + "learning_rate": 2.9845773126669475e-05, + "loss": 1.8845, + "step": 23460 + }, + { + "epoch": 0.04571407428756149, + "grad_norm": 2.3217403888702393, + "learning_rate": 2.984557618175628e-05, + "loss": 1.9149, + "step": 23475 + }, + { + "epoch": 0.04574328455867175, + "grad_norm": 3.9094045162200928, + "learning_rate": 2.98453791118264e-05, + "loss": 1.8615, + "step": 23490 + }, + { + "epoch": 0.04577249482978201, + "grad_norm": 2.190436840057373, + "learning_rate": 2.9845181916881495e-05, + "loss": 1.8181, + "step": 23505 + }, + { + "epoch": 0.04580170510089228, + "grad_norm": 3.33785343170166, + "learning_rate": 2.984498459692322e-05, + "loss": 1.9345, + "step": 23520 + }, + { + "epoch": 0.04583091537200254, + "grad_norm": 4.065570831298828, + "learning_rate": 2.9844787151953242e-05, + "loss": 1.7801, + "step": 23535 + }, + { + "epoch": 0.0458601256431128, + "grad_norm": 3.4585118293762207, + "learning_rate": 2.984458958197323e-05, + "loss": 1.8181, + "step": 23550 + }, + { + "epoch": 0.04588933591422307, + "grad_norm": 2.1615355014801025, + "learning_rate": 2.984439188698484e-05, + "loss": 1.8265, + "step": 23565 + }, + { + "epoch": 0.04591854618533333, + "grad_norm": 3.9168038368225098, + "learning_rate": 2.9844194066989737e-05, + "loss": 1.8508, + "step": 23580 + }, + { + "epoch": 0.04594775645644359, + "grad_norm": 2.436638593673706, + "learning_rate": 2.9843996121989587e-05, + "loss": 1.9247, + "step": 23595 + }, + { + "epoch": 0.04597696672755385, + "grad_norm": 3.637157678604126, + "learning_rate": 2.9843798051986053e-05, + "loss": 1.9791, + "step": 23610 + }, + { + "epoch": 0.04600617699866412, + "grad_norm": 2.9777755737304688, + "learning_rate": 2.9843599856980815e-05, + "loss": 1.8227, + "step": 23625 + }, + { + "epoch": 0.04603538726977438, + "grad_norm": 5.862057209014893, + "learning_rate": 2.9843401536975533e-05, + "loss": 1.9745, + "step": 23640 + }, + { + "epoch": 0.04606459754088464, + "grad_norm": 2.937347173690796, + "learning_rate": 2.9843203091971878e-05, + "loss": 1.7575, + "step": 23655 + }, + { + "epoch": 0.0460938078119949, + "grad_norm": 3.7695438861846924, + "learning_rate": 2.984300452197152e-05, + "loss": 1.9655, + "step": 23670 + }, + { + "epoch": 0.04612301808310517, + "grad_norm": 3.1122677326202393, + "learning_rate": 2.9842805826976137e-05, + "loss": 2.0459, + "step": 23685 + }, + { + "epoch": 0.04615222835421543, + "grad_norm": 4.503921031951904, + "learning_rate": 2.98426070069874e-05, + "loss": 1.8319, + "step": 23700 + }, + { + "epoch": 0.04618143862532569, + "grad_norm": 2.247570514678955, + "learning_rate": 2.9842408062006982e-05, + "loss": 1.765, + "step": 23715 + }, + { + "epoch": 0.04621064889643596, + "grad_norm": 3.512427806854248, + "learning_rate": 2.9842208992036554e-05, + "loss": 1.8118, + "step": 23730 + }, + { + "epoch": 0.04623985916754622, + "grad_norm": 3.174893379211426, + "learning_rate": 2.98420097970778e-05, + "loss": 1.8791, + "step": 23745 + }, + { + "epoch": 0.04626906943865648, + "grad_norm": 5.151320934295654, + "learning_rate": 2.9841810477132392e-05, + "loss": 1.817, + "step": 23760 + }, + { + "epoch": 0.04629827970976674, + "grad_norm": 2.6584534645080566, + "learning_rate": 2.984161103220201e-05, + "loss": 1.8441, + "step": 23775 + }, + { + "epoch": 0.04632748998087701, + "grad_norm": 1.967596173286438, + "learning_rate": 2.9841411462288335e-05, + "loss": 2.0349, + "step": 23790 + }, + { + "epoch": 0.04635670025198727, + "grad_norm": 3.7256250381469727, + "learning_rate": 2.9841211767393048e-05, + "loss": 1.8131, + "step": 23805 + }, + { + "epoch": 0.04638591052309753, + "grad_norm": 4.081748962402344, + "learning_rate": 2.9841011947517826e-05, + "loss": 1.8664, + "step": 23820 + }, + { + "epoch": 0.0464151207942078, + "grad_norm": 2.527451276779175, + "learning_rate": 2.984081200266436e-05, + "loss": 1.8916, + "step": 23835 + }, + { + "epoch": 0.04644433106531806, + "grad_norm": 2.159346103668213, + "learning_rate": 2.9840611932834326e-05, + "loss": 1.9254, + "step": 23850 + }, + { + "epoch": 0.04647354133642832, + "grad_norm": 2.636519432067871, + "learning_rate": 2.9840411738029412e-05, + "loss": 1.8062, + "step": 23865 + }, + { + "epoch": 0.046502751607538584, + "grad_norm": 2.3750388622283936, + "learning_rate": 2.9840211418251303e-05, + "loss": 1.7347, + "step": 23880 + }, + { + "epoch": 0.04653196187864885, + "grad_norm": 2.181845188140869, + "learning_rate": 2.9840010973501685e-05, + "loss": 1.9207, + "step": 23895 + }, + { + "epoch": 0.04656117214975911, + "grad_norm": 3.098555564880371, + "learning_rate": 2.9839810403782252e-05, + "loss": 1.9566, + "step": 23910 + }, + { + "epoch": 0.046590382420869374, + "grad_norm": 5.142632484436035, + "learning_rate": 2.9839609709094685e-05, + "loss": 1.8871, + "step": 23925 + }, + { + "epoch": 0.04661959269197964, + "grad_norm": 1.7162665128707886, + "learning_rate": 2.9839408889440678e-05, + "loss": 1.8784, + "step": 23940 + }, + { + "epoch": 0.0466488029630899, + "grad_norm": 3.743401050567627, + "learning_rate": 2.9839207944821925e-05, + "loss": 1.9179, + "step": 23955 + }, + { + "epoch": 0.046678013234200164, + "grad_norm": 2.952817440032959, + "learning_rate": 2.983900687524011e-05, + "loss": 1.9182, + "step": 23970 + }, + { + "epoch": 0.046707223505310425, + "grad_norm": 2.3484249114990234, + "learning_rate": 2.983880568069693e-05, + "loss": 1.8336, + "step": 23985 + }, + { + "epoch": 0.04673643377642069, + "grad_norm": 1.931921362876892, + "learning_rate": 2.9838604361194087e-05, + "loss": 1.771, + "step": 24000 + }, + { + "epoch": 0.04676564404753095, + "grad_norm": 5.197054386138916, + "learning_rate": 2.9838402916733263e-05, + "loss": 1.8558, + "step": 24015 + }, + { + "epoch": 0.046794854318641214, + "grad_norm": 2.862427234649658, + "learning_rate": 2.9838201347316164e-05, + "loss": 1.6956, + "step": 24030 + }, + { + "epoch": 0.04682406458975148, + "grad_norm": 2.6003293991088867, + "learning_rate": 2.9837999652944487e-05, + "loss": 2.0473, + "step": 24045 + }, + { + "epoch": 0.04685327486086174, + "grad_norm": 2.068455934524536, + "learning_rate": 2.9837797833619926e-05, + "loss": 1.9233, + "step": 24060 + }, + { + "epoch": 0.046882485131972004, + "grad_norm": 3.4722557067871094, + "learning_rate": 2.983759588934418e-05, + "loss": 2.0052, + "step": 24075 + }, + { + "epoch": 0.046911695403082265, + "grad_norm": 3.198732852935791, + "learning_rate": 2.9837393820118954e-05, + "loss": 1.906, + "step": 24090 + }, + { + "epoch": 0.04694090567419253, + "grad_norm": 2.5050504207611084, + "learning_rate": 2.983719162594595e-05, + "loss": 1.833, + "step": 24105 + }, + { + "epoch": 0.046970115945302794, + "grad_norm": 2.1438305377960205, + "learning_rate": 2.9836989306826866e-05, + "loss": 1.7439, + "step": 24120 + }, + { + "epoch": 0.046999326216413055, + "grad_norm": 1.5980552434921265, + "learning_rate": 2.983678686276341e-05, + "loss": 1.8018, + "step": 24135 + }, + { + "epoch": 0.047028536487523316, + "grad_norm": 2.566882610321045, + "learning_rate": 2.9836584293757282e-05, + "loss": 1.8807, + "step": 24150 + }, + { + "epoch": 0.047057746758633584, + "grad_norm": 4.125818252563477, + "learning_rate": 2.9836381599810196e-05, + "loss": 1.8198, + "step": 24165 + }, + { + "epoch": 0.047086957029743845, + "grad_norm": 3.1217780113220215, + "learning_rate": 2.983617878092385e-05, + "loss": 2.0202, + "step": 24180 + }, + { + "epoch": 0.047116167300854106, + "grad_norm": 3.001835823059082, + "learning_rate": 2.9835975837099956e-05, + "loss": 1.851, + "step": 24195 + }, + { + "epoch": 0.047145377571964374, + "grad_norm": 4.060051918029785, + "learning_rate": 2.9835772768340225e-05, + "loss": 1.8918, + "step": 24210 + }, + { + "epoch": 0.047174587843074635, + "grad_norm": 4.199372291564941, + "learning_rate": 2.9835569574646363e-05, + "loss": 1.848, + "step": 24225 + }, + { + "epoch": 0.047203798114184896, + "grad_norm": 3.1068427562713623, + "learning_rate": 2.9835366256020085e-05, + "loss": 1.793, + "step": 24240 + }, + { + "epoch": 0.04723300838529516, + "grad_norm": 3.30556321144104, + "learning_rate": 2.9835162812463098e-05, + "loss": 1.8914, + "step": 24255 + }, + { + "epoch": 0.047262218656405425, + "grad_norm": 3.898911476135254, + "learning_rate": 2.9834959243977123e-05, + "loss": 1.929, + "step": 24270 + }, + { + "epoch": 0.047291428927515686, + "grad_norm": 2.1169817447662354, + "learning_rate": 2.9834755550563865e-05, + "loss": 1.9513, + "step": 24285 + }, + { + "epoch": 0.04732063919862595, + "grad_norm": 3.0540499687194824, + "learning_rate": 2.9834551732225044e-05, + "loss": 1.8283, + "step": 24300 + }, + { + "epoch": 0.047349849469736215, + "grad_norm": 2.495234727859497, + "learning_rate": 2.9834347788962383e-05, + "loss": 1.8268, + "step": 24315 + }, + { + "epoch": 0.047379059740846476, + "grad_norm": 4.254831790924072, + "learning_rate": 2.9834143720777588e-05, + "loss": 1.8933, + "step": 24330 + }, + { + "epoch": 0.04740827001195674, + "grad_norm": 5.430483818054199, + "learning_rate": 2.9833939527672384e-05, + "loss": 1.8128, + "step": 24345 + }, + { + "epoch": 0.047437480283067, + "grad_norm": 3.332515239715576, + "learning_rate": 2.983373520964849e-05, + "loss": 1.9089, + "step": 24360 + }, + { + "epoch": 0.047466690554177265, + "grad_norm": 2.1588125228881836, + "learning_rate": 2.983353076670762e-05, + "loss": 1.7578, + "step": 24375 + }, + { + "epoch": 0.047495900825287526, + "grad_norm": 2.6773619651794434, + "learning_rate": 2.9833326198851503e-05, + "loss": 1.9486, + "step": 24390 + }, + { + "epoch": 0.04752511109639779, + "grad_norm": 2.855078935623169, + "learning_rate": 2.9833121506081862e-05, + "loss": 1.9339, + "step": 24405 + }, + { + "epoch": 0.047554321367508055, + "grad_norm": 2.539210557937622, + "learning_rate": 2.983291668840042e-05, + "loss": 1.8479, + "step": 24420 + }, + { + "epoch": 0.047583531638618316, + "grad_norm": 3.0605132579803467, + "learning_rate": 2.9832711745808895e-05, + "loss": 1.8924, + "step": 24435 + }, + { + "epoch": 0.04761274190972858, + "grad_norm": 2.142998218536377, + "learning_rate": 2.9832506678309025e-05, + "loss": 2.0331, + "step": 24450 + }, + { + "epoch": 0.04764195218083884, + "grad_norm": 3.082610845565796, + "learning_rate": 2.983230148590253e-05, + "loss": 1.8299, + "step": 24465 + }, + { + "epoch": 0.047671162451949106, + "grad_norm": 4.011190414428711, + "learning_rate": 2.9832096168591128e-05, + "loss": 1.8591, + "step": 24480 + }, + { + "epoch": 0.04770037272305937, + "grad_norm": 2.326756238937378, + "learning_rate": 2.983189072637657e-05, + "loss": 1.8687, + "step": 24495 + }, + { + "epoch": 0.04772958299416963, + "grad_norm": 2.1641993522644043, + "learning_rate": 2.9831685159260568e-05, + "loss": 1.9402, + "step": 24510 + }, + { + "epoch": 0.047758793265279896, + "grad_norm": 2.614429473876953, + "learning_rate": 2.983147946724486e-05, + "loss": 1.8911, + "step": 24525 + }, + { + "epoch": 0.04778800353639016, + "grad_norm": 2.163760185241699, + "learning_rate": 2.983127365033118e-05, + "loss": 1.7623, + "step": 24540 + }, + { + "epoch": 0.04781721380750042, + "grad_norm": 1.837815761566162, + "learning_rate": 2.9831067708521257e-05, + "loss": 2.0103, + "step": 24555 + }, + { + "epoch": 0.04784642407861068, + "grad_norm": 4.629458904266357, + "learning_rate": 2.9830861641816826e-05, + "loss": 1.8473, + "step": 24570 + }, + { + "epoch": 0.04787563434972095, + "grad_norm": 2.741942882537842, + "learning_rate": 2.9830655450219623e-05, + "loss": 1.8219, + "step": 24585 + }, + { + "epoch": 0.04790484462083121, + "grad_norm": 2.6273906230926514, + "learning_rate": 2.9830449133731387e-05, + "loss": 1.7688, + "step": 24600 + }, + { + "epoch": 0.04793405489194147, + "grad_norm": 2.2749216556549072, + "learning_rate": 2.983024269235385e-05, + "loss": 1.9809, + "step": 24615 + }, + { + "epoch": 0.04796326516305174, + "grad_norm": 4.1970367431640625, + "learning_rate": 2.9830036126088754e-05, + "loss": 1.7998, + "step": 24630 + }, + { + "epoch": 0.047992475434162, + "grad_norm": 2.87906813621521, + "learning_rate": 2.982982943493784e-05, + "loss": 1.8496, + "step": 24645 + }, + { + "epoch": 0.04802168570527226, + "grad_norm": 1.9954204559326172, + "learning_rate": 2.9829622618902848e-05, + "loss": 1.8782, + "step": 24660 + }, + { + "epoch": 0.04805089597638252, + "grad_norm": 3.3936777114868164, + "learning_rate": 2.982941567798551e-05, + "loss": 1.7, + "step": 24675 + }, + { + "epoch": 0.04808010624749279, + "grad_norm": 5.30580997467041, + "learning_rate": 2.9829208612187585e-05, + "loss": 1.8712, + "step": 24690 + }, + { + "epoch": 0.04810931651860305, + "grad_norm": 1.797080636024475, + "learning_rate": 2.982900142151081e-05, + "loss": 1.8711, + "step": 24705 + }, + { + "epoch": 0.04813852678971331, + "grad_norm": 1.7338413000106812, + "learning_rate": 2.9828794105956922e-05, + "loss": 1.9125, + "step": 24720 + }, + { + "epoch": 0.04816773706082357, + "grad_norm": 6.833633899688721, + "learning_rate": 2.9828586665527677e-05, + "loss": 1.8622, + "step": 24735 + }, + { + "epoch": 0.04819694733193384, + "grad_norm": 2.8855645656585693, + "learning_rate": 2.9828379100224814e-05, + "loss": 1.7596, + "step": 24750 + }, + { + "epoch": 0.0482261576030441, + "grad_norm": 3.7425286769866943, + "learning_rate": 2.982817141005009e-05, + "loss": 1.8452, + "step": 24765 + }, + { + "epoch": 0.04825536787415436, + "grad_norm": 4.187405109405518, + "learning_rate": 2.9827963595005248e-05, + "loss": 1.8669, + "step": 24780 + }, + { + "epoch": 0.04828457814526463, + "grad_norm": 4.476016998291016, + "learning_rate": 2.982775565509204e-05, + "loss": 1.9221, + "step": 24795 + }, + { + "epoch": 0.04831378841637489, + "grad_norm": 3.6991500854492188, + "learning_rate": 2.9827547590312213e-05, + "loss": 1.5434, + "step": 24810 + }, + { + "epoch": 0.04834299868748515, + "grad_norm": 3.0707576274871826, + "learning_rate": 2.9827339400667524e-05, + "loss": 1.88, + "step": 24825 + }, + { + "epoch": 0.04837220895859541, + "grad_norm": 2.604163646697998, + "learning_rate": 2.9827131086159723e-05, + "loss": 1.9946, + "step": 24840 + }, + { + "epoch": 0.04840141922970568, + "grad_norm": 4.7181549072265625, + "learning_rate": 2.9826922646790568e-05, + "loss": 1.8807, + "step": 24855 + }, + { + "epoch": 0.04843062950081594, + "grad_norm": 3.983146905899048, + "learning_rate": 2.9826714082561808e-05, + "loss": 1.8593, + "step": 24870 + }, + { + "epoch": 0.0484598397719262, + "grad_norm": 4.0342912673950195, + "learning_rate": 2.982650539347521e-05, + "loss": 1.9208, + "step": 24885 + }, + { + "epoch": 0.04848905004303647, + "grad_norm": 3.092644691467285, + "learning_rate": 2.982629657953252e-05, + "loss": 1.9611, + "step": 24900 + }, + { + "epoch": 0.04851826031414673, + "grad_norm": 2.244311809539795, + "learning_rate": 2.98260876407355e-05, + "loss": 1.7965, + "step": 24915 + }, + { + "epoch": 0.04854747058525699, + "grad_norm": 4.740140914916992, + "learning_rate": 2.9825878577085917e-05, + "loss": 2.0418, + "step": 24930 + }, + { + "epoch": 0.04857668085636725, + "grad_norm": 1.6131473779678345, + "learning_rate": 2.9825669388585523e-05, + "loss": 1.7781, + "step": 24945 + }, + { + "epoch": 0.04860589112747752, + "grad_norm": 3.3024797439575195, + "learning_rate": 2.9825460075236077e-05, + "loss": 1.9176, + "step": 24960 + }, + { + "epoch": 0.04863510139858778, + "grad_norm": 3.6958138942718506, + "learning_rate": 2.9825250637039348e-05, + "loss": 1.8613, + "step": 24975 + }, + { + "epoch": 0.04866431166969804, + "grad_norm": 2.277597427368164, + "learning_rate": 2.9825041073997102e-05, + "loss": 1.9942, + "step": 24990 + }, + { + "epoch": 0.04869352194080831, + "grad_norm": 3.3070085048675537, + "learning_rate": 2.9824831386111103e-05, + "loss": 1.5052, + "step": 25005 + }, + { + "epoch": 0.04872273221191857, + "grad_norm": 2.769148826599121, + "learning_rate": 2.9824621573383107e-05, + "loss": 1.8361, + "step": 25020 + }, + { + "epoch": 0.04875194248302883, + "grad_norm": 2.3901126384735107, + "learning_rate": 2.982441163581489e-05, + "loss": 1.8346, + "step": 25035 + }, + { + "epoch": 0.04878115275413909, + "grad_norm": 3.8897323608398438, + "learning_rate": 2.9824201573408218e-05, + "loss": 1.8186, + "step": 25050 + }, + { + "epoch": 0.04881036302524936, + "grad_norm": 1.947713017463684, + "learning_rate": 2.982399138616486e-05, + "loss": 1.8812, + "step": 25065 + }, + { + "epoch": 0.04883957329635962, + "grad_norm": 1.6136304140090942, + "learning_rate": 2.9823781074086582e-05, + "loss": 2.0161, + "step": 25080 + }, + { + "epoch": 0.04886878356746988, + "grad_norm": 2.6880128383636475, + "learning_rate": 2.9823570637175166e-05, + "loss": 1.8861, + "step": 25095 + }, + { + "epoch": 0.04889799383858015, + "grad_norm": 4.511691570281982, + "learning_rate": 2.982336007543237e-05, + "loss": 1.8334, + "step": 25110 + }, + { + "epoch": 0.04892720410969041, + "grad_norm": 2.540619134902954, + "learning_rate": 2.9823149388859975e-05, + "loss": 1.8554, + "step": 25125 + }, + { + "epoch": 0.04895641438080067, + "grad_norm": 2.650416135787964, + "learning_rate": 2.982293857745976e-05, + "loss": 2.0556, + "step": 25140 + }, + { + "epoch": 0.04898562465191093, + "grad_norm": 4.075965881347656, + "learning_rate": 2.9822727641233488e-05, + "loss": 1.7369, + "step": 25155 + }, + { + "epoch": 0.0490148349230212, + "grad_norm": 4.21481466293335, + "learning_rate": 2.9822516580182944e-05, + "loss": 1.9624, + "step": 25170 + }, + { + "epoch": 0.04904404519413146, + "grad_norm": 4.617081642150879, + "learning_rate": 2.98223053943099e-05, + "loss": 1.7276, + "step": 25185 + }, + { + "epoch": 0.04907325546524172, + "grad_norm": 3.5099408626556396, + "learning_rate": 2.9822094083616145e-05, + "loss": 1.8783, + "step": 25200 + }, + { + "epoch": 0.049102465736351984, + "grad_norm": 4.118253231048584, + "learning_rate": 2.9821882648103445e-05, + "loss": 1.7848, + "step": 25215 + }, + { + "epoch": 0.04913167600746225, + "grad_norm": 3.617659091949463, + "learning_rate": 2.982167108777359e-05, + "loss": 1.8307, + "step": 25230 + }, + { + "epoch": 0.04916088627857251, + "grad_norm": 3.3717000484466553, + "learning_rate": 2.9821459402628357e-05, + "loss": 1.6414, + "step": 25245 + }, + { + "epoch": 0.049190096549682774, + "grad_norm": 3.3341469764709473, + "learning_rate": 2.9821247592669526e-05, + "loss": 1.9641, + "step": 25260 + }, + { + "epoch": 0.04921930682079304, + "grad_norm": 3.8818039894104004, + "learning_rate": 2.9821035657898886e-05, + "loss": 1.8227, + "step": 25275 + }, + { + "epoch": 0.0492485170919033, + "grad_norm": 2.4647955894470215, + "learning_rate": 2.9820823598318226e-05, + "loss": 1.9441, + "step": 25290 + }, + { + "epoch": 0.049277727363013564, + "grad_norm": 5.145657062530518, + "learning_rate": 2.9820611413929318e-05, + "loss": 1.7515, + "step": 25305 + }, + { + "epoch": 0.049306937634123825, + "grad_norm": 2.595554828643799, + "learning_rate": 2.9820399104733964e-05, + "loss": 1.8185, + "step": 25320 + }, + { + "epoch": 0.04933614790523409, + "grad_norm": 3.0854387283325195, + "learning_rate": 2.9820186670733944e-05, + "loss": 2.0462, + "step": 25335 + }, + { + "epoch": 0.049365358176344354, + "grad_norm": 4.086148262023926, + "learning_rate": 2.9819974111931045e-05, + "loss": 1.923, + "step": 25350 + }, + { + "epoch": 0.049394568447454615, + "grad_norm": 2.3999664783477783, + "learning_rate": 2.9819761428327057e-05, + "loss": 1.9243, + "step": 25365 + }, + { + "epoch": 0.04942377871856488, + "grad_norm": 1.8367825746536255, + "learning_rate": 2.981954861992378e-05, + "loss": 1.9104, + "step": 25380 + }, + { + "epoch": 0.049452988989675144, + "grad_norm": 2.3233165740966797, + "learning_rate": 2.9819335686722997e-05, + "loss": 1.7446, + "step": 25395 + }, + { + "epoch": 0.049482199260785405, + "grad_norm": 3.7498881816864014, + "learning_rate": 2.98191226287265e-05, + "loss": 2.0172, + "step": 25410 + }, + { + "epoch": 0.049511409531895666, + "grad_norm": 2.636087656021118, + "learning_rate": 2.9818909445936092e-05, + "loss": 1.9958, + "step": 25425 + }, + { + "epoch": 0.049540619803005934, + "grad_norm": 2.6167049407958984, + "learning_rate": 2.9818696138353564e-05, + "loss": 1.9746, + "step": 25440 + }, + { + "epoch": 0.049569830074116195, + "grad_norm": 2.699380874633789, + "learning_rate": 2.9818482705980708e-05, + "loss": 2.0158, + "step": 25455 + }, + { + "epoch": 0.049599040345226456, + "grad_norm": 2.8639230728149414, + "learning_rate": 2.9818269148819326e-05, + "loss": 1.8795, + "step": 25470 + }, + { + "epoch": 0.04962825061633672, + "grad_norm": 3.6716597080230713, + "learning_rate": 2.9818055466871217e-05, + "loss": 2.0066, + "step": 25485 + }, + { + "epoch": 0.049657460887446984, + "grad_norm": 3.0852763652801514, + "learning_rate": 2.981784166013818e-05, + "loss": 1.8392, + "step": 25500 + }, + { + "epoch": 0.049686671158557245, + "grad_norm": 2.0497000217437744, + "learning_rate": 2.981762772862201e-05, + "loss": 2.0587, + "step": 25515 + }, + { + "epoch": 0.049715881429667506, + "grad_norm": 4.302377700805664, + "learning_rate": 2.9817413672324517e-05, + "loss": 1.9415, + "step": 25530 + }, + { + "epoch": 0.049745091700777774, + "grad_norm": 2.478428840637207, + "learning_rate": 2.9817199491247495e-05, + "loss": 2.0082, + "step": 25545 + }, + { + "epoch": 0.049774301971888035, + "grad_norm": 3.375516891479492, + "learning_rate": 2.9816985185392752e-05, + "loss": 2.061, + "step": 25560 + }, + { + "epoch": 0.049803512242998296, + "grad_norm": 2.3733456134796143, + "learning_rate": 2.9816770754762094e-05, + "loss": 1.8752, + "step": 25575 + }, + { + "epoch": 0.049832722514108564, + "grad_norm": 2.8646862506866455, + "learning_rate": 2.9816556199357334e-05, + "loss": 1.9048, + "step": 25590 + }, + { + "epoch": 0.049861932785218825, + "grad_norm": 3.711494207382202, + "learning_rate": 2.981634151918026e-05, + "loss": 1.8555, + "step": 25605 + }, + { + "epoch": 0.049891143056329086, + "grad_norm": 4.652657985687256, + "learning_rate": 2.9816126714232694e-05, + "loss": 1.9591, + "step": 25620 + }, + { + "epoch": 0.04992035332743935, + "grad_norm": 3.5854547023773193, + "learning_rate": 2.981591178451644e-05, + "loss": 1.8043, + "step": 25635 + }, + { + "epoch": 0.049949563598549615, + "grad_norm": 2.532128095626831, + "learning_rate": 2.981569673003331e-05, + "loss": 1.7456, + "step": 25650 + }, + { + "epoch": 0.049978773869659876, + "grad_norm": 1.8968348503112793, + "learning_rate": 2.9815481550785116e-05, + "loss": 1.8971, + "step": 25665 + }, + { + "epoch": 0.05000798414077014, + "grad_norm": 4.662414073944092, + "learning_rate": 2.9815266246773663e-05, + "loss": 2.1576, + "step": 25680 + }, + { + "epoch": 0.050037194411880405, + "grad_norm": 2.1178083419799805, + "learning_rate": 2.9815050818000773e-05, + "loss": 1.9055, + "step": 25695 + }, + { + "epoch": 0.050066404682990666, + "grad_norm": 3.8748810291290283, + "learning_rate": 2.9814835264468254e-05, + "loss": 1.8177, + "step": 25710 + }, + { + "epoch": 0.05009561495410093, + "grad_norm": 3.424405097961426, + "learning_rate": 2.9814619586177926e-05, + "loss": 1.8804, + "step": 25725 + }, + { + "epoch": 0.05012482522521119, + "grad_norm": 4.104612827301025, + "learning_rate": 2.98144037831316e-05, + "loss": 1.973, + "step": 25740 + }, + { + "epoch": 0.050154035496321456, + "grad_norm": 2.1848320960998535, + "learning_rate": 2.98141878553311e-05, + "loss": 1.8529, + "step": 25755 + }, + { + "epoch": 0.05018324576743172, + "grad_norm": 4.191700458526611, + "learning_rate": 2.981397180277824e-05, + "loss": 1.8094, + "step": 25770 + }, + { + "epoch": 0.05021245603854198, + "grad_norm": 3.3454208374023438, + "learning_rate": 2.981375562547484e-05, + "loss": 1.9105, + "step": 25785 + }, + { + "epoch": 0.05024166630965224, + "grad_norm": 2.848618507385254, + "learning_rate": 2.9813539323422717e-05, + "loss": 1.9054, + "step": 25800 + }, + { + "epoch": 0.05027087658076251, + "grad_norm": 1.7352811098098755, + "learning_rate": 2.98133228966237e-05, + "loss": 1.91, + "step": 25815 + }, + { + "epoch": 0.05030008685187277, + "grad_norm": 2.177297592163086, + "learning_rate": 2.9813106345079604e-05, + "loss": 1.8276, + "step": 25830 + }, + { + "epoch": 0.05032929712298303, + "grad_norm": 2.1822316646575928, + "learning_rate": 2.981288966879226e-05, + "loss": 1.7961, + "step": 25845 + }, + { + "epoch": 0.050358507394093296, + "grad_norm": 2.70729398727417, + "learning_rate": 2.9812672867763482e-05, + "loss": 1.7497, + "step": 25860 + }, + { + "epoch": 0.05038771766520356, + "grad_norm": 2.869450807571411, + "learning_rate": 2.981245594199511e-05, + "loss": 1.8309, + "step": 25875 + }, + { + "epoch": 0.05041692793631382, + "grad_norm": 2.233219623565674, + "learning_rate": 2.981223889148896e-05, + "loss": 1.8577, + "step": 25890 + }, + { + "epoch": 0.05044613820742408, + "grad_norm": 5.402493000030518, + "learning_rate": 2.981202171624686e-05, + "loss": 1.9151, + "step": 25905 + }, + { + "epoch": 0.05047534847853435, + "grad_norm": 4.127412796020508, + "learning_rate": 2.9811804416270648e-05, + "loss": 1.8644, + "step": 25920 + }, + { + "epoch": 0.05050455874964461, + "grad_norm": 2.9654555320739746, + "learning_rate": 2.9811586991562145e-05, + "loss": 1.99, + "step": 25935 + }, + { + "epoch": 0.05053376902075487, + "grad_norm": 2.6316134929656982, + "learning_rate": 2.981136944212318e-05, + "loss": 1.7434, + "step": 25950 + }, + { + "epoch": 0.05056297929186514, + "grad_norm": 2.6688010692596436, + "learning_rate": 2.9811151767955597e-05, + "loss": 1.8075, + "step": 25965 + }, + { + "epoch": 0.0505921895629754, + "grad_norm": 1.7729136943817139, + "learning_rate": 2.981093396906122e-05, + "loss": 1.7828, + "step": 25980 + }, + { + "epoch": 0.05062139983408566, + "grad_norm": 3.017512321472168, + "learning_rate": 2.9810716045441884e-05, + "loss": 1.9602, + "step": 25995 + }, + { + "epoch": 0.05065061010519592, + "grad_norm": 3.954099416732788, + "learning_rate": 2.9810497997099427e-05, + "loss": 1.9158, + "step": 26010 + }, + { + "epoch": 0.05067982037630619, + "grad_norm": 4.384187698364258, + "learning_rate": 2.981027982403568e-05, + "loss": 2.0172, + "step": 26025 + }, + { + "epoch": 0.05070903064741645, + "grad_norm": 2.7591800689697266, + "learning_rate": 2.9810061526252488e-05, + "loss": 1.8372, + "step": 26040 + }, + { + "epoch": 0.05073824091852671, + "grad_norm": 2.6615748405456543, + "learning_rate": 2.980984310375168e-05, + "loss": 1.8127, + "step": 26055 + }, + { + "epoch": 0.05076745118963698, + "grad_norm": 3.1193888187408447, + "learning_rate": 2.9809624556535106e-05, + "loss": 1.8826, + "step": 26070 + }, + { + "epoch": 0.05079666146074724, + "grad_norm": 2.338146448135376, + "learning_rate": 2.9809405884604594e-05, + "loss": 1.9003, + "step": 26085 + }, + { + "epoch": 0.0508258717318575, + "grad_norm": 2.339162588119507, + "learning_rate": 2.9809187087961993e-05, + "loss": 1.8444, + "step": 26100 + }, + { + "epoch": 0.05085508200296776, + "grad_norm": 6.189070224761963, + "learning_rate": 2.980896816660915e-05, + "loss": 1.8682, + "step": 26115 + }, + { + "epoch": 0.05088429227407803, + "grad_norm": 3.7556257247924805, + "learning_rate": 2.9808749120547898e-05, + "loss": 1.8692, + "step": 26130 + }, + { + "epoch": 0.05091350254518829, + "grad_norm": 2.4181950092315674, + "learning_rate": 2.980852994978009e-05, + "loss": 1.8497, + "step": 26145 + }, + { + "epoch": 0.05094271281629855, + "grad_norm": 2.1751251220703125, + "learning_rate": 2.9808310654307566e-05, + "loss": 1.7236, + "step": 26160 + }, + { + "epoch": 0.05097192308740882, + "grad_norm": 2.304203748703003, + "learning_rate": 2.9808091234132177e-05, + "loss": 1.8394, + "step": 26175 + }, + { + "epoch": 0.05100113335851908, + "grad_norm": 2.03320050239563, + "learning_rate": 2.980787168925577e-05, + "loss": 2.0082, + "step": 26190 + }, + { + "epoch": 0.05103034362962934, + "grad_norm": 3.1527299880981445, + "learning_rate": 2.9807652019680195e-05, + "loss": 1.8783, + "step": 26205 + }, + { + "epoch": 0.0510595539007396, + "grad_norm": 3.638120412826538, + "learning_rate": 2.9807432225407295e-05, + "loss": 1.8912, + "step": 26220 + }, + { + "epoch": 0.05108876417184987, + "grad_norm": 1.9177019596099854, + "learning_rate": 2.9807212306438927e-05, + "loss": 1.7949, + "step": 26235 + }, + { + "epoch": 0.05111797444296013, + "grad_norm": 4.763120174407959, + "learning_rate": 2.9806992262776945e-05, + "loss": 1.9464, + "step": 26250 + }, + { + "epoch": 0.05114718471407039, + "grad_norm": 4.4592132568359375, + "learning_rate": 2.980677209442319e-05, + "loss": 1.834, + "step": 26265 + }, + { + "epoch": 0.05117639498518065, + "grad_norm": 2.6289279460906982, + "learning_rate": 2.980655180137953e-05, + "loss": 1.7644, + "step": 26280 + }, + { + "epoch": 0.05120560525629092, + "grad_norm": 2.2794735431671143, + "learning_rate": 2.9806331383647816e-05, + "loss": 1.9125, + "step": 26295 + }, + { + "epoch": 0.05123481552740118, + "grad_norm": 3.5884125232696533, + "learning_rate": 2.9806110841229904e-05, + "loss": 1.925, + "step": 26310 + }, + { + "epoch": 0.05126402579851144, + "grad_norm": 2.8750176429748535, + "learning_rate": 2.9805890174127648e-05, + "loss": 1.6463, + "step": 26325 + }, + { + "epoch": 0.05129323606962171, + "grad_norm": 3.1167027950286865, + "learning_rate": 2.980566938234291e-05, + "loss": 1.8897, + "step": 26340 + }, + { + "epoch": 0.05132244634073197, + "grad_norm": 2.8106181621551514, + "learning_rate": 2.9805448465877546e-05, + "loss": 1.855, + "step": 26355 + }, + { + "epoch": 0.05135165661184223, + "grad_norm": 3.8905303478240967, + "learning_rate": 2.980522742473342e-05, + "loss": 1.9127, + "step": 26370 + }, + { + "epoch": 0.05138086688295249, + "grad_norm": 3.031163215637207, + "learning_rate": 2.980500625891239e-05, + "loss": 1.9577, + "step": 26385 + }, + { + "epoch": 0.05141007715406276, + "grad_norm": 1.991543173789978, + "learning_rate": 2.980478496841632e-05, + "loss": 1.6416, + "step": 26400 + }, + { + "epoch": 0.05143928742517302, + "grad_norm": 3.90432071685791, + "learning_rate": 2.9804563553247076e-05, + "loss": 1.7318, + "step": 26415 + }, + { + "epoch": 0.05146849769628328, + "grad_norm": 3.582280158996582, + "learning_rate": 2.980434201340652e-05, + "loss": 1.7349, + "step": 26430 + }, + { + "epoch": 0.05149770796739355, + "grad_norm": 3.1633496284484863, + "learning_rate": 2.980412034889651e-05, + "loss": 1.8023, + "step": 26445 + }, + { + "epoch": 0.05152691823850381, + "grad_norm": 3.7939155101776123, + "learning_rate": 2.9803898559718927e-05, + "loss": 1.6626, + "step": 26460 + }, + { + "epoch": 0.05155612850961407, + "grad_norm": 3.093492031097412, + "learning_rate": 2.9803676645875634e-05, + "loss": 1.7914, + "step": 26475 + }, + { + "epoch": 0.051585338780724334, + "grad_norm": 2.4469592571258545, + "learning_rate": 2.9803454607368493e-05, + "loss": 1.7484, + "step": 26490 + }, + { + "epoch": 0.0516145490518346, + "grad_norm": 2.613276243209839, + "learning_rate": 2.9803232444199382e-05, + "loss": 1.7984, + "step": 26505 + }, + { + "epoch": 0.05164375932294486, + "grad_norm": 3.1793646812438965, + "learning_rate": 2.9803010156370166e-05, + "loss": 1.7114, + "step": 26520 + }, + { + "epoch": 0.051672969594055124, + "grad_norm": 1.9021662473678589, + "learning_rate": 2.980278774388272e-05, + "loss": 1.6837, + "step": 26535 + }, + { + "epoch": 0.05170217986516539, + "grad_norm": 2.415710687637329, + "learning_rate": 2.9802565206738922e-05, + "loss": 1.9818, + "step": 26550 + }, + { + "epoch": 0.05173139013627565, + "grad_norm": 3.8227622509002686, + "learning_rate": 2.9802342544940635e-05, + "loss": 1.9691, + "step": 26565 + }, + { + "epoch": 0.051760600407385914, + "grad_norm": 1.6945210695266724, + "learning_rate": 2.980211975848974e-05, + "loss": 1.9788, + "step": 26580 + }, + { + "epoch": 0.051789810678496175, + "grad_norm": 2.1171348094940186, + "learning_rate": 2.980189684738811e-05, + "loss": 1.8498, + "step": 26595 + }, + { + "epoch": 0.05181902094960644, + "grad_norm": 3.8129332065582275, + "learning_rate": 2.9801673811637628e-05, + "loss": 1.8109, + "step": 26610 + }, + { + "epoch": 0.0518482312207167, + "grad_norm": 3.1358275413513184, + "learning_rate": 2.9801450651240173e-05, + "loss": 1.7809, + "step": 26625 + }, + { + "epoch": 0.051877441491826964, + "grad_norm": 3.9287755489349365, + "learning_rate": 2.9801227366197614e-05, + "loss": 1.7791, + "step": 26640 + }, + { + "epoch": 0.05190665176293723, + "grad_norm": 3.269742488861084, + "learning_rate": 2.980100395651184e-05, + "loss": 2.0053, + "step": 26655 + }, + { + "epoch": 0.05193586203404749, + "grad_norm": 3.23358154296875, + "learning_rate": 2.980078042218473e-05, + "loss": 1.7863, + "step": 26670 + }, + { + "epoch": 0.051965072305157754, + "grad_norm": 3.6313607692718506, + "learning_rate": 2.980055676321817e-05, + "loss": 1.8125, + "step": 26685 + }, + { + "epoch": 0.051994282576268015, + "grad_norm": 2.303229331970215, + "learning_rate": 2.9800332979614035e-05, + "loss": 1.8336, + "step": 26700 + }, + { + "epoch": 0.05202349284737828, + "grad_norm": 4.923130035400391, + "learning_rate": 2.9800109071374216e-05, + "loss": 1.9392, + "step": 26715 + }, + { + "epoch": 0.052052703118488544, + "grad_norm": 6.160820484161377, + "learning_rate": 2.9799885038500597e-05, + "loss": 1.8991, + "step": 26730 + }, + { + "epoch": 0.052081913389598805, + "grad_norm": 1.9204509258270264, + "learning_rate": 2.9799660880995065e-05, + "loss": 1.8511, + "step": 26745 + }, + { + "epoch": 0.05211112366070907, + "grad_norm": 4.1540656089782715, + "learning_rate": 2.9799436598859507e-05, + "loss": 1.7353, + "step": 26760 + }, + { + "epoch": 0.052140333931819334, + "grad_norm": 2.5721335411071777, + "learning_rate": 2.979921219209581e-05, + "loss": 1.6949, + "step": 26775 + }, + { + "epoch": 0.052169544202929595, + "grad_norm": 2.5524466037750244, + "learning_rate": 2.9798987660705867e-05, + "loss": 1.9033, + "step": 26790 + }, + { + "epoch": 0.052198754474039856, + "grad_norm": 2.8870813846588135, + "learning_rate": 2.979876300469157e-05, + "loss": 1.6899, + "step": 26805 + }, + { + "epoch": 0.052227964745150124, + "grad_norm": 2.5837631225585938, + "learning_rate": 2.9798538224054804e-05, + "loss": 1.7715, + "step": 26820 + }, + { + "epoch": 0.052257175016260385, + "grad_norm": 2.5015158653259277, + "learning_rate": 2.979831331879747e-05, + "loss": 1.9121, + "step": 26835 + }, + { + "epoch": 0.052286385287370646, + "grad_norm": 3.3133704662323, + "learning_rate": 2.9798088288921457e-05, + "loss": 1.87, + "step": 26850 + }, + { + "epoch": 0.05231559555848091, + "grad_norm": 2.159090042114258, + "learning_rate": 2.979786313442866e-05, + "loss": 1.9665, + "step": 26865 + }, + { + "epoch": 0.052344805829591175, + "grad_norm": 3.6745707988739014, + "learning_rate": 2.9797637855320977e-05, + "loss": 1.8956, + "step": 26880 + }, + { + "epoch": 0.052374016100701436, + "grad_norm": 2.593538761138916, + "learning_rate": 2.9797412451600305e-05, + "loss": 1.9206, + "step": 26895 + }, + { + "epoch": 0.0524032263718117, + "grad_norm": 2.1524672508239746, + "learning_rate": 2.979718692326854e-05, + "loss": 2.0598, + "step": 26910 + }, + { + "epoch": 0.052432436642921965, + "grad_norm": 5.403210639953613, + "learning_rate": 2.9796961270327583e-05, + "loss": 1.9446, + "step": 26925 + }, + { + "epoch": 0.052461646914032226, + "grad_norm": 2.859320878982544, + "learning_rate": 2.9796735492779338e-05, + "loss": 1.6969, + "step": 26940 + }, + { + "epoch": 0.05249085718514249, + "grad_norm": 2.0825371742248535, + "learning_rate": 2.9796509590625696e-05, + "loss": 1.8951, + "step": 26955 + }, + { + "epoch": 0.05252006745625275, + "grad_norm": 2.3604981899261475, + "learning_rate": 2.979628356386857e-05, + "loss": 1.8861, + "step": 26970 + }, + { + "epoch": 0.052549277727363015, + "grad_norm": 3.510629415512085, + "learning_rate": 2.9796057412509856e-05, + "loss": 1.9885, + "step": 26985 + }, + { + "epoch": 0.052578487998473276, + "grad_norm": 1.72383713722229, + "learning_rate": 2.9795831136551467e-05, + "loss": 1.832, + "step": 27000 + }, + { + "epoch": 0.05260769826958354, + "grad_norm": 2.767523765563965, + "learning_rate": 2.9795604735995297e-05, + "loss": 1.8956, + "step": 27015 + }, + { + "epoch": 0.052636908540693805, + "grad_norm": 2.731154203414917, + "learning_rate": 2.979537821084326e-05, + "loss": 1.7157, + "step": 27030 + }, + { + "epoch": 0.052666118811804066, + "grad_norm": 3.17053484916687, + "learning_rate": 2.9795151561097265e-05, + "loss": 1.9691, + "step": 27045 + }, + { + "epoch": 0.05269532908291433, + "grad_norm": 3.9647130966186523, + "learning_rate": 2.979492478675922e-05, + "loss": 1.9736, + "step": 27060 + }, + { + "epoch": 0.05272453935402459, + "grad_norm": 2.4074623584747314, + "learning_rate": 2.9794697887831027e-05, + "loss": 1.8325, + "step": 27075 + }, + { + "epoch": 0.052753749625134856, + "grad_norm": 4.785901069641113, + "learning_rate": 2.9794470864314603e-05, + "loss": 1.9717, + "step": 27090 + }, + { + "epoch": 0.05278295989624512, + "grad_norm": 5.4298577308654785, + "learning_rate": 2.979424371621186e-05, + "loss": 1.8316, + "step": 27105 + }, + { + "epoch": 0.05281217016735538, + "grad_norm": 2.509413003921509, + "learning_rate": 2.9794016443524713e-05, + "loss": 1.8792, + "step": 27120 + }, + { + "epoch": 0.052841380438465646, + "grad_norm": 1.903182029724121, + "learning_rate": 2.979378904625507e-05, + "loss": 1.8049, + "step": 27135 + }, + { + "epoch": 0.05287059070957591, + "grad_norm": 3.3434927463531494, + "learning_rate": 2.9793561524404846e-05, + "loss": 1.7794, + "step": 27150 + }, + { + "epoch": 0.05289980098068617, + "grad_norm": 5.064967632293701, + "learning_rate": 2.9793333877975964e-05, + "loss": 1.8726, + "step": 27165 + }, + { + "epoch": 0.05292901125179643, + "grad_norm": 1.8450191020965576, + "learning_rate": 2.9793106106970335e-05, + "loss": 1.8586, + "step": 27180 + }, + { + "epoch": 0.0529582215229067, + "grad_norm": 2.540570020675659, + "learning_rate": 2.979287821138988e-05, + "loss": 1.8988, + "step": 27195 + }, + { + "epoch": 0.05298743179401696, + "grad_norm": 2.0893425941467285, + "learning_rate": 2.9792650191236516e-05, + "loss": 1.7794, + "step": 27210 + }, + { + "epoch": 0.05301664206512722, + "grad_norm": 2.7562851905822754, + "learning_rate": 2.979242204651216e-05, + "loss": 1.7025, + "step": 27225 + }, + { + "epoch": 0.05304585233623749, + "grad_norm": 4.634995937347412, + "learning_rate": 2.9792193777218743e-05, + "loss": 1.8236, + "step": 27240 + }, + { + "epoch": 0.05307506260734775, + "grad_norm": 3.1855075359344482, + "learning_rate": 2.9791965383358184e-05, + "loss": 1.8439, + "step": 27255 + }, + { + "epoch": 0.05310427287845801, + "grad_norm": 3.3286306858062744, + "learning_rate": 2.9791736864932403e-05, + "loss": 1.9314, + "step": 27270 + }, + { + "epoch": 0.05313348314956827, + "grad_norm": 3.1028332710266113, + "learning_rate": 2.979150822194332e-05, + "loss": 2.0257, + "step": 27285 + }, + { + "epoch": 0.05316269342067854, + "grad_norm": 4.1933393478393555, + "learning_rate": 2.979127945439287e-05, + "loss": 1.9908, + "step": 27300 + }, + { + "epoch": 0.0531919036917888, + "grad_norm": 4.206679344177246, + "learning_rate": 2.9791050562282974e-05, + "loss": 1.8144, + "step": 27315 + }, + { + "epoch": 0.05322111396289906, + "grad_norm": 2.231621265411377, + "learning_rate": 2.9790821545615562e-05, + "loss": 1.824, + "step": 27330 + }, + { + "epoch": 0.05325032423400932, + "grad_norm": 3.625483512878418, + "learning_rate": 2.9790592404392557e-05, + "loss": 2.0087, + "step": 27345 + }, + { + "epoch": 0.05327953450511959, + "grad_norm": 4.063029766082764, + "learning_rate": 2.9790363138615902e-05, + "loss": 1.8927, + "step": 27360 + }, + { + "epoch": 0.05330874477622985, + "grad_norm": 4.166107654571533, + "learning_rate": 2.979013374828751e-05, + "loss": 1.9004, + "step": 27375 + }, + { + "epoch": 0.05333795504734011, + "grad_norm": 2.737416982650757, + "learning_rate": 2.9789904233409326e-05, + "loss": 1.9678, + "step": 27390 + }, + { + "epoch": 0.05336716531845038, + "grad_norm": 2.131272315979004, + "learning_rate": 2.9789674593983277e-05, + "loss": 1.895, + "step": 27405 + }, + { + "epoch": 0.05339637558956064, + "grad_norm": 2.983872890472412, + "learning_rate": 2.9789444830011302e-05, + "loss": 1.7807, + "step": 27420 + }, + { + "epoch": 0.0534255858606709, + "grad_norm": 5.785390377044678, + "learning_rate": 2.978921494149533e-05, + "loss": 1.9559, + "step": 27435 + }, + { + "epoch": 0.05345479613178116, + "grad_norm": 2.7100813388824463, + "learning_rate": 2.9788984928437298e-05, + "loss": 1.9147, + "step": 27450 + }, + { + "epoch": 0.05348400640289143, + "grad_norm": 2.0759334564208984, + "learning_rate": 2.978875479083914e-05, + "loss": 1.7345, + "step": 27465 + }, + { + "epoch": 0.05351321667400169, + "grad_norm": 4.645893573760986, + "learning_rate": 2.9788524528702804e-05, + "loss": 1.7246, + "step": 27480 + }, + { + "epoch": 0.05354242694511195, + "grad_norm": 3.1474320888519287, + "learning_rate": 2.9788294142030225e-05, + "loss": 1.8679, + "step": 27495 + }, + { + "epoch": 0.05357163721622222, + "grad_norm": 2.8445558547973633, + "learning_rate": 2.9788063630823335e-05, + "loss": 1.748, + "step": 27510 + }, + { + "epoch": 0.05360084748733248, + "grad_norm": 3.176330089569092, + "learning_rate": 2.978783299508408e-05, + "loss": 1.9698, + "step": 27525 + }, + { + "epoch": 0.05363005775844274, + "grad_norm": 2.875300884246826, + "learning_rate": 2.9787602234814407e-05, + "loss": 1.7617, + "step": 27540 + }, + { + "epoch": 0.053659268029553, + "grad_norm": 4.004800796508789, + "learning_rate": 2.978737135001626e-05, + "loss": 1.8204, + "step": 27555 + }, + { + "epoch": 0.05368847830066327, + "grad_norm": 2.3588531017303467, + "learning_rate": 2.9787140340691574e-05, + "loss": 1.8028, + "step": 27570 + }, + { + "epoch": 0.05371768857177353, + "grad_norm": 2.293210744857788, + "learning_rate": 2.9786909206842297e-05, + "loss": 1.9525, + "step": 27585 + }, + { + "epoch": 0.05374689884288379, + "grad_norm": 3.4703333377838135, + "learning_rate": 2.9786677948470382e-05, + "loss": 1.8144, + "step": 27600 + }, + { + "epoch": 0.05377610911399406, + "grad_norm": 2.1387853622436523, + "learning_rate": 2.9786446565577772e-05, + "loss": 1.7853, + "step": 27615 + }, + { + "epoch": 0.05380531938510432, + "grad_norm": 1.9415435791015625, + "learning_rate": 2.9786215058166417e-05, + "loss": 1.8912, + "step": 27630 + }, + { + "epoch": 0.05383452965621458, + "grad_norm": 3.315534830093384, + "learning_rate": 2.978598342623826e-05, + "loss": 1.7991, + "step": 27645 + }, + { + "epoch": 0.05386373992732484, + "grad_norm": 3.2139084339141846, + "learning_rate": 2.9785751669795265e-05, + "loss": 1.8284, + "step": 27660 + }, + { + "epoch": 0.05389295019843511, + "grad_norm": 2.0407660007476807, + "learning_rate": 2.9785519788839368e-05, + "loss": 1.8263, + "step": 27675 + }, + { + "epoch": 0.05392216046954537, + "grad_norm": 3.6719486713409424, + "learning_rate": 2.9785287783372538e-05, + "loss": 1.7552, + "step": 27690 + }, + { + "epoch": 0.05395137074065563, + "grad_norm": 2.3214800357818604, + "learning_rate": 2.978505565339671e-05, + "loss": 1.7752, + "step": 27705 + }, + { + "epoch": 0.0539805810117659, + "grad_norm": 3.5290887355804443, + "learning_rate": 2.9784823398913856e-05, + "loss": 1.6684, + "step": 27720 + }, + { + "epoch": 0.05400979128287616, + "grad_norm": 2.344564437866211, + "learning_rate": 2.978459101992592e-05, + "loss": 1.7649, + "step": 27735 + }, + { + "epoch": 0.05403900155398642, + "grad_norm": 3.5880258083343506, + "learning_rate": 2.9784358516434867e-05, + "loss": 1.8194, + "step": 27750 + }, + { + "epoch": 0.05406821182509668, + "grad_norm": 4.46945858001709, + "learning_rate": 2.978412588844265e-05, + "loss": 1.9552, + "step": 27765 + }, + { + "epoch": 0.05409742209620695, + "grad_norm": 2.9063570499420166, + "learning_rate": 2.978389313595123e-05, + "loss": 1.9078, + "step": 27780 + }, + { + "epoch": 0.05412663236731721, + "grad_norm": 4.740022659301758, + "learning_rate": 2.9783660258962568e-05, + "loss": 1.8773, + "step": 27795 + }, + { + "epoch": 0.05415584263842747, + "grad_norm": 5.9590654373168945, + "learning_rate": 2.9783427257478623e-05, + "loss": 2.0891, + "step": 27810 + }, + { + "epoch": 0.05418505290953774, + "grad_norm": 3.4905142784118652, + "learning_rate": 2.978319413150136e-05, + "loss": 1.912, + "step": 27825 + }, + { + "epoch": 0.054214263180648, + "grad_norm": 4.312283039093018, + "learning_rate": 2.978296088103273e-05, + "loss": 1.7969, + "step": 27840 + }, + { + "epoch": 0.05424347345175826, + "grad_norm": 3.419679880142212, + "learning_rate": 2.978272750607472e-05, + "loss": 2.0149, + "step": 27855 + }, + { + "epoch": 0.054272683722868524, + "grad_norm": 1.4746593236923218, + "learning_rate": 2.9782494006629275e-05, + "loss": 1.7991, + "step": 27870 + }, + { + "epoch": 0.05430189399397879, + "grad_norm": 2.6810593605041504, + "learning_rate": 2.9782260382698374e-05, + "loss": 1.8557, + "step": 27885 + }, + { + "epoch": 0.05433110426508905, + "grad_norm": 4.144235134124756, + "learning_rate": 2.9782026634283975e-05, + "loss": 2.0213, + "step": 27900 + }, + { + "epoch": 0.054360314536199314, + "grad_norm": 3.5831942558288574, + "learning_rate": 2.9781792761388055e-05, + "loss": 1.7606, + "step": 27915 + }, + { + "epoch": 0.054389524807309575, + "grad_norm": 3.201408863067627, + "learning_rate": 2.9781558764012573e-05, + "loss": 1.7619, + "step": 27930 + }, + { + "epoch": 0.05441873507841984, + "grad_norm": 2.8114073276519775, + "learning_rate": 2.978132464215951e-05, + "loss": 1.9014, + "step": 27945 + }, + { + "epoch": 0.054447945349530104, + "grad_norm": 2.400465726852417, + "learning_rate": 2.9781090395830834e-05, + "loss": 1.9041, + "step": 27960 + }, + { + "epoch": 0.054477155620640365, + "grad_norm": 2.1496269702911377, + "learning_rate": 2.9780856025028513e-05, + "loss": 1.7252, + "step": 27975 + }, + { + "epoch": 0.05450636589175063, + "grad_norm": 3.477867603302002, + "learning_rate": 2.978062152975453e-05, + "loss": 1.7466, + "step": 27990 + }, + { + "epoch": 0.054535576162860894, + "grad_norm": 2.1709723472595215, + "learning_rate": 2.978038691001085e-05, + "loss": 1.6808, + "step": 28005 + }, + { + "epoch": 0.054564786433971155, + "grad_norm": 2.13655686378479, + "learning_rate": 2.978015216579945e-05, + "loss": 1.8089, + "step": 28020 + }, + { + "epoch": 0.054593996705081416, + "grad_norm": 3.563598394393921, + "learning_rate": 2.9779917297122318e-05, + "loss": 1.8368, + "step": 28035 + }, + { + "epoch": 0.054623206976191684, + "grad_norm": 3.1836514472961426, + "learning_rate": 2.977968230398142e-05, + "loss": 1.8042, + "step": 28050 + }, + { + "epoch": 0.054652417247301945, + "grad_norm": 2.4931259155273438, + "learning_rate": 2.9779447186378738e-05, + "loss": 1.8061, + "step": 28065 + }, + { + "epoch": 0.054681627518412206, + "grad_norm": 3.2399775981903076, + "learning_rate": 2.977921194431625e-05, + "loss": 1.8723, + "step": 28080 + }, + { + "epoch": 0.05471083778952247, + "grad_norm": 3.281590461730957, + "learning_rate": 2.977897657779594e-05, + "loss": 1.8882, + "step": 28095 + }, + { + "epoch": 0.054740048060632734, + "grad_norm": 2.607039451599121, + "learning_rate": 2.9778741086819795e-05, + "loss": 1.8796, + "step": 28110 + }, + { + "epoch": 0.054769258331742995, + "grad_norm": 3.4543254375457764, + "learning_rate": 2.977850547138979e-05, + "loss": 1.8826, + "step": 28125 + }, + { + "epoch": 0.054798468602853256, + "grad_norm": 3.8168885707855225, + "learning_rate": 2.9778269731507914e-05, + "loss": 1.9827, + "step": 28140 + }, + { + "epoch": 0.054827678873963524, + "grad_norm": 2.7308406829833984, + "learning_rate": 2.977803386717615e-05, + "loss": 1.768, + "step": 28155 + }, + { + "epoch": 0.054856889145073785, + "grad_norm": 4.276648998260498, + "learning_rate": 2.9777797878396477e-05, + "loss": 1.8696, + "step": 28170 + }, + { + "epoch": 0.054886099416184046, + "grad_norm": 3.1874961853027344, + "learning_rate": 2.97775617651709e-05, + "loss": 1.881, + "step": 28185 + }, + { + "epoch": 0.054915309687294314, + "grad_norm": 2.4808239936828613, + "learning_rate": 2.977732552750139e-05, + "loss": 1.7825, + "step": 28200 + }, + { + "epoch": 0.054944519958404575, + "grad_norm": 3.0970373153686523, + "learning_rate": 2.9777089165389942e-05, + "loss": 1.7825, + "step": 28215 + }, + { + "epoch": 0.054973730229514836, + "grad_norm": 1.757534384727478, + "learning_rate": 2.9776852678838555e-05, + "loss": 2.0145, + "step": 28230 + }, + { + "epoch": 0.0550029405006251, + "grad_norm": 2.351555347442627, + "learning_rate": 2.977661606784921e-05, + "loss": 2.0268, + "step": 28245 + }, + { + "epoch": 0.055032150771735365, + "grad_norm": 2.4561007022857666, + "learning_rate": 2.9776379332423902e-05, + "loss": 1.7745, + "step": 28260 + }, + { + "epoch": 0.055061361042845626, + "grad_norm": 3.8144009113311768, + "learning_rate": 2.9776142472564624e-05, + "loss": 1.8408, + "step": 28275 + }, + { + "epoch": 0.05509057131395589, + "grad_norm": 4.969006061553955, + "learning_rate": 2.9775905488273373e-05, + "loss": 1.9223, + "step": 28290 + }, + { + "epoch": 0.055119781585066155, + "grad_norm": 5.71866512298584, + "learning_rate": 2.9775668379552146e-05, + "loss": 1.842, + "step": 28305 + }, + { + "epoch": 0.055148991856176416, + "grad_norm": 1.9934380054473877, + "learning_rate": 2.9775431146402937e-05, + "loss": 1.976, + "step": 28320 + }, + { + "epoch": 0.05517820212728668, + "grad_norm": 1.7895939350128174, + "learning_rate": 2.9775193788827743e-05, + "loss": 2.0921, + "step": 28335 + }, + { + "epoch": 0.05520741239839694, + "grad_norm": 4.20900821685791, + "learning_rate": 2.9774956306828566e-05, + "loss": 1.9333, + "step": 28350 + }, + { + "epoch": 0.055236622669507206, + "grad_norm": 1.7822163105010986, + "learning_rate": 2.97747187004074e-05, + "loss": 2.0872, + "step": 28365 + }, + { + "epoch": 0.05526583294061747, + "grad_norm": 1.7469080686569214, + "learning_rate": 2.9774480969566254e-05, + "loss": 1.8781, + "step": 28380 + }, + { + "epoch": 0.05529504321172773, + "grad_norm": 3.0533454418182373, + "learning_rate": 2.977424311430712e-05, + "loss": 1.7184, + "step": 28395 + }, + { + "epoch": 0.05532425348283799, + "grad_norm": 2.729780912399292, + "learning_rate": 2.977400513463201e-05, + "loss": 1.7794, + "step": 28410 + }, + { + "epoch": 0.05535346375394826, + "grad_norm": 3.2360620498657227, + "learning_rate": 2.9773767030542926e-05, + "loss": 1.7711, + "step": 28425 + }, + { + "epoch": 0.05538267402505852, + "grad_norm": 2.3599355220794678, + "learning_rate": 2.9773528802041873e-05, + "loss": 1.8758, + "step": 28440 + }, + { + "epoch": 0.05541188429616878, + "grad_norm": 3.3827106952667236, + "learning_rate": 2.9773290449130856e-05, + "loss": 1.901, + "step": 28455 + }, + { + "epoch": 0.055441094567279046, + "grad_norm": 2.9506921768188477, + "learning_rate": 2.977305197181188e-05, + "loss": 1.9286, + "step": 28470 + }, + { + "epoch": 0.05547030483838931, + "grad_norm": 3.381622791290283, + "learning_rate": 2.9772813370086956e-05, + "loss": 2.0169, + "step": 28485 + }, + { + "epoch": 0.05549951510949957, + "grad_norm": 3.6618142127990723, + "learning_rate": 2.9772574643958095e-05, + "loss": 1.8318, + "step": 28500 + }, + { + "epoch": 0.05552872538060983, + "grad_norm": 2.906064033508301, + "learning_rate": 2.9772335793427304e-05, + "loss": 1.8778, + "step": 28515 + }, + { + "epoch": 0.0555579356517201, + "grad_norm": 1.9280356168746948, + "learning_rate": 2.9772096818496592e-05, + "loss": 1.9095, + "step": 28530 + }, + { + "epoch": 0.05558714592283036, + "grad_norm": 2.451441764831543, + "learning_rate": 2.977185771916798e-05, + "loss": 1.7975, + "step": 28545 + }, + { + "epoch": 0.05561635619394062, + "grad_norm": 3.4363293647766113, + "learning_rate": 2.9771618495443473e-05, + "loss": 1.9995, + "step": 28560 + }, + { + "epoch": 0.05564556646505089, + "grad_norm": 3.402430772781372, + "learning_rate": 2.9771379147325095e-05, + "loss": 1.879, + "step": 28575 + }, + { + "epoch": 0.05567477673616115, + "grad_norm": 3.7891762256622314, + "learning_rate": 2.977113967481485e-05, + "loss": 1.8275, + "step": 28590 + }, + { + "epoch": 0.05570398700727141, + "grad_norm": 2.45809268951416, + "learning_rate": 2.977090007791476e-05, + "loss": 1.8131, + "step": 28605 + }, + { + "epoch": 0.05573319727838167, + "grad_norm": 3.5447278022766113, + "learning_rate": 2.9770660356626848e-05, + "loss": 1.8373, + "step": 28620 + }, + { + "epoch": 0.05576240754949194, + "grad_norm": 3.397735595703125, + "learning_rate": 2.9770420510953124e-05, + "loss": 1.7907, + "step": 28635 + }, + { + "epoch": 0.0557916178206022, + "grad_norm": 2.185011386871338, + "learning_rate": 2.9770180540895613e-05, + "loss": 1.8909, + "step": 28650 + }, + { + "epoch": 0.05582082809171246, + "grad_norm": 3.66780161857605, + "learning_rate": 2.9769940446456332e-05, + "loss": 1.8898, + "step": 28665 + }, + { + "epoch": 0.05585003836282273, + "grad_norm": 3.8541507720947266, + "learning_rate": 2.9769700227637307e-05, + "loss": 1.8156, + "step": 28680 + }, + { + "epoch": 0.05587924863393299, + "grad_norm": 2.504997491836548, + "learning_rate": 2.9769459884440563e-05, + "loss": 1.8584, + "step": 28695 + }, + { + "epoch": 0.05590845890504325, + "grad_norm": 2.811286687850952, + "learning_rate": 2.9769219416868114e-05, + "loss": 1.8762, + "step": 28710 + }, + { + "epoch": 0.05593766917615351, + "grad_norm": 3.2774808406829834, + "learning_rate": 2.976897882492199e-05, + "loss": 1.8637, + "step": 28725 + }, + { + "epoch": 0.05596687944726378, + "grad_norm": 3.6415061950683594, + "learning_rate": 2.9768738108604222e-05, + "loss": 2.033, + "step": 28740 + }, + { + "epoch": 0.05599608971837404, + "grad_norm": 2.5112550258636475, + "learning_rate": 2.9768497267916833e-05, + "loss": 1.9643, + "step": 28755 + }, + { + "epoch": 0.0560252999894843, + "grad_norm": 3.3538918495178223, + "learning_rate": 2.9768256302861852e-05, + "loss": 1.7964, + "step": 28770 + }, + { + "epoch": 0.05605451026059457, + "grad_norm": 1.7511709928512573, + "learning_rate": 2.9768015213441306e-05, + "loss": 1.9891, + "step": 28785 + }, + { + "epoch": 0.05608372053170483, + "grad_norm": 4.007176399230957, + "learning_rate": 2.9767773999657225e-05, + "loss": 2.0263, + "step": 28800 + }, + { + "epoch": 0.05611293080281509, + "grad_norm": 2.968200206756592, + "learning_rate": 2.9767532661511644e-05, + "loss": 1.7171, + "step": 28815 + }, + { + "epoch": 0.05614214107392535, + "grad_norm": 3.2194066047668457, + "learning_rate": 2.9767291199006594e-05, + "loss": 1.8825, + "step": 28830 + }, + { + "epoch": 0.05617135134503562, + "grad_norm": 3.994147539138794, + "learning_rate": 2.976704961214411e-05, + "loss": 1.6859, + "step": 28845 + }, + { + "epoch": 0.05620056161614588, + "grad_norm": 2.1359071731567383, + "learning_rate": 2.976680790092622e-05, + "loss": 1.7796, + "step": 28860 + }, + { + "epoch": 0.05622977188725614, + "grad_norm": 3.074885606765747, + "learning_rate": 2.976656606535497e-05, + "loss": 1.9507, + "step": 28875 + }, + { + "epoch": 0.05625898215836641, + "grad_norm": 3.184913396835327, + "learning_rate": 2.9766324105432385e-05, + "loss": 1.9536, + "step": 28890 + }, + { + "epoch": 0.05628819242947667, + "grad_norm": 3.911243438720703, + "learning_rate": 2.976608202116051e-05, + "loss": 1.8609, + "step": 28905 + }, + { + "epoch": 0.05631740270058693, + "grad_norm": 4.364305019378662, + "learning_rate": 2.9765839812541378e-05, + "loss": 1.8573, + "step": 28920 + }, + { + "epoch": 0.05634661297169719, + "grad_norm": 3.0088484287261963, + "learning_rate": 2.9765597479577034e-05, + "loss": 2.0879, + "step": 28935 + }, + { + "epoch": 0.05637582324280746, + "grad_norm": 1.9867300987243652, + "learning_rate": 2.9765355022269518e-05, + "loss": 1.7799, + "step": 28950 + }, + { + "epoch": 0.05640503351391772, + "grad_norm": 2.1443519592285156, + "learning_rate": 2.9765112440620874e-05, + "loss": 1.7201, + "step": 28965 + }, + { + "epoch": 0.05643424378502798, + "grad_norm": 2.226407051086426, + "learning_rate": 2.9764869734633134e-05, + "loss": 1.7974, + "step": 28980 + }, + { + "epoch": 0.05646345405613824, + "grad_norm": 4.08579158782959, + "learning_rate": 2.9764626904308354e-05, + "loss": 1.8633, + "step": 28995 + }, + { + "epoch": 0.05649266432724851, + "grad_norm": 4.314965724945068, + "learning_rate": 2.9764383949648576e-05, + "loss": 1.9242, + "step": 29010 + }, + { + "epoch": 0.05652187459835877, + "grad_norm": 4.151242733001709, + "learning_rate": 2.976414087065584e-05, + "loss": 1.942, + "step": 29025 + }, + { + "epoch": 0.05655108486946903, + "grad_norm": 1.9677321910858154, + "learning_rate": 2.97638976673322e-05, + "loss": 1.8371, + "step": 29040 + }, + { + "epoch": 0.0565802951405793, + "grad_norm": 2.814545154571533, + "learning_rate": 2.97636543396797e-05, + "loss": 1.9929, + "step": 29055 + }, + { + "epoch": 0.05660950541168956, + "grad_norm": 3.4638845920562744, + "learning_rate": 2.976341088770039e-05, + "loss": 1.8635, + "step": 29070 + }, + { + "epoch": 0.05663871568279982, + "grad_norm": 2.0748291015625, + "learning_rate": 2.976316731139632e-05, + "loss": 1.6827, + "step": 29085 + }, + { + "epoch": 0.056667925953910084, + "grad_norm": 2.4861667156219482, + "learning_rate": 2.9762923610769545e-05, + "loss": 1.8022, + "step": 29100 + }, + { + "epoch": 0.05669713622502035, + "grad_norm": 2.12156343460083, + "learning_rate": 2.9762679785822113e-05, + "loss": 1.8912, + "step": 29115 + }, + { + "epoch": 0.05672634649613061, + "grad_norm": 2.3051788806915283, + "learning_rate": 2.9762435836556075e-05, + "loss": 1.786, + "step": 29130 + }, + { + "epoch": 0.056755556767240874, + "grad_norm": 3.0045323371887207, + "learning_rate": 2.9762191762973492e-05, + "loss": 1.9105, + "step": 29145 + }, + { + "epoch": 0.05678476703835114, + "grad_norm": 2.9411308765411377, + "learning_rate": 2.9761947565076413e-05, + "loss": 1.7949, + "step": 29160 + }, + { + "epoch": 0.0568139773094614, + "grad_norm": 3.0440449714660645, + "learning_rate": 2.97617032428669e-05, + "loss": 2.0024, + "step": 29175 + }, + { + "epoch": 0.056843187580571664, + "grad_norm": 3.836582660675049, + "learning_rate": 2.976145879634701e-05, + "loss": 1.9192, + "step": 29190 + }, + { + "epoch": 0.056872397851681925, + "grad_norm": 3.63226580619812, + "learning_rate": 2.9761214225518792e-05, + "loss": 1.6728, + "step": 29205 + }, + { + "epoch": 0.05690160812279219, + "grad_norm": 2.588270902633667, + "learning_rate": 2.9760969530384317e-05, + "loss": 1.8828, + "step": 29220 + }, + { + "epoch": 0.05693081839390245, + "grad_norm": 3.4929933547973633, + "learning_rate": 2.9760724710945642e-05, + "loss": 2.0755, + "step": 29235 + }, + { + "epoch": 0.056960028665012714, + "grad_norm": 2.916142463684082, + "learning_rate": 2.976047976720483e-05, + "loss": 1.8955, + "step": 29250 + }, + { + "epoch": 0.05698923893612298, + "grad_norm": 3.606926918029785, + "learning_rate": 2.976023469916394e-05, + "loss": 1.9196, + "step": 29265 + }, + { + "epoch": 0.05701844920723324, + "grad_norm": 4.033385276794434, + "learning_rate": 2.9759989506825033e-05, + "loss": 1.9555, + "step": 29280 + }, + { + "epoch": 0.057047659478343504, + "grad_norm": 2.1779022216796875, + "learning_rate": 2.9759744190190185e-05, + "loss": 1.7735, + "step": 29295 + }, + { + "epoch": 0.057076869749453765, + "grad_norm": 4.592677116394043, + "learning_rate": 2.9759498749261452e-05, + "loss": 1.8971, + "step": 29310 + }, + { + "epoch": 0.05710608002056403, + "grad_norm": 4.797886371612549, + "learning_rate": 2.9759253184040906e-05, + "loss": 1.8494, + "step": 29325 + }, + { + "epoch": 0.057135290291674294, + "grad_norm": 2.316049337387085, + "learning_rate": 2.9759007494530615e-05, + "loss": 1.7288, + "step": 29340 + }, + { + "epoch": 0.057164500562784555, + "grad_norm": 2.9396653175354004, + "learning_rate": 2.975876168073264e-05, + "loss": 1.8751, + "step": 29355 + }, + { + "epoch": 0.05719371083389482, + "grad_norm": 4.224997043609619, + "learning_rate": 2.9758515742649063e-05, + "loss": 1.78, + "step": 29370 + }, + { + "epoch": 0.057222921105005084, + "grad_norm": 2.5549795627593994, + "learning_rate": 2.9758269680281946e-05, + "loss": 1.7375, + "step": 29385 + }, + { + "epoch": 0.057252131376115345, + "grad_norm": 3.9945852756500244, + "learning_rate": 2.9758023493633365e-05, + "loss": 1.7078, + "step": 29400 + }, + { + "epoch": 0.057281341647225606, + "grad_norm": 3.5278730392456055, + "learning_rate": 2.975777718270539e-05, + "loss": 1.7572, + "step": 29415 + }, + { + "epoch": 0.057310551918335874, + "grad_norm": 3.9674766063690186, + "learning_rate": 2.9757530747500104e-05, + "loss": 1.9125, + "step": 29430 + }, + { + "epoch": 0.057339762189446135, + "grad_norm": 3.498309373855591, + "learning_rate": 2.9757284188019573e-05, + "loss": 1.8242, + "step": 29445 + }, + { + "epoch": 0.057368972460556396, + "grad_norm": 3.2913856506347656, + "learning_rate": 2.9757037504265874e-05, + "loss": 1.8559, + "step": 29460 + }, + { + "epoch": 0.05739818273166666, + "grad_norm": 2.72259521484375, + "learning_rate": 2.9756790696241088e-05, + "loss": 1.7855, + "step": 29475 + }, + { + "epoch": 0.057427393002776925, + "grad_norm": 7.3963704109191895, + "learning_rate": 2.9756543763947292e-05, + "loss": 1.9272, + "step": 29490 + }, + { + "epoch": 0.057456603273887186, + "grad_norm": 2.901502847671509, + "learning_rate": 2.9756296707386566e-05, + "loss": 1.7294, + "step": 29505 + }, + { + "epoch": 0.05748581354499745, + "grad_norm": 1.9630255699157715, + "learning_rate": 2.9756049526560995e-05, + "loss": 1.8477, + "step": 29520 + }, + { + "epoch": 0.057515023816107715, + "grad_norm": 2.449836492538452, + "learning_rate": 2.975580222147265e-05, + "loss": 1.9282, + "step": 29535 + }, + { + "epoch": 0.057544234087217976, + "grad_norm": 2.7461347579956055, + "learning_rate": 2.9755554792123617e-05, + "loss": 1.8883, + "step": 29550 + }, + { + "epoch": 0.05757344435832824, + "grad_norm": 3.3567450046539307, + "learning_rate": 2.9755307238515986e-05, + "loss": 1.9717, + "step": 29565 + }, + { + "epoch": 0.0576026546294385, + "grad_norm": 3.6341872215270996, + "learning_rate": 2.975505956065184e-05, + "loss": 1.9378, + "step": 29580 + }, + { + "epoch": 0.057631864900548765, + "grad_norm": 4.170873165130615, + "learning_rate": 2.9754811758533253e-05, + "loss": 1.9077, + "step": 29595 + }, + { + "epoch": 0.057661075171659026, + "grad_norm": 1.9185329675674438, + "learning_rate": 2.975456383216233e-05, + "loss": 1.7987, + "step": 29610 + }, + { + "epoch": 0.05769028544276929, + "grad_norm": 2.3139781951904297, + "learning_rate": 2.9754315781541144e-05, + "loss": 1.9518, + "step": 29625 + }, + { + "epoch": 0.057719495713879555, + "grad_norm": 3.581907033920288, + "learning_rate": 2.9754067606671794e-05, + "loss": 1.8729, + "step": 29640 + }, + { + "epoch": 0.057748705984989816, + "grad_norm": 2.490278720855713, + "learning_rate": 2.975381930755636e-05, + "loss": 1.7576, + "step": 29655 + }, + { + "epoch": 0.05777791625610008, + "grad_norm": 3.5740153789520264, + "learning_rate": 2.9753570884196942e-05, + "loss": 1.9749, + "step": 29670 + }, + { + "epoch": 0.05780712652721034, + "grad_norm": 3.012131929397583, + "learning_rate": 2.975332233659563e-05, + "loss": 1.8704, + "step": 29685 + }, + { + "epoch": 0.057836336798320606, + "grad_norm": 4.225703239440918, + "learning_rate": 2.9753073664754514e-05, + "loss": 1.6134, + "step": 29700 + }, + { + "epoch": 0.05786554706943087, + "grad_norm": 3.840487480163574, + "learning_rate": 2.9752824868675693e-05, + "loss": 1.822, + "step": 29715 + }, + { + "epoch": 0.05789475734054113, + "grad_norm": 2.8694260120391846, + "learning_rate": 2.975257594836125e-05, + "loss": 1.939, + "step": 29730 + }, + { + "epoch": 0.057923967611651396, + "grad_norm": 3.4660465717315674, + "learning_rate": 2.97523269038133e-05, + "loss": 1.9517, + "step": 29745 + }, + { + "epoch": 0.05795317788276166, + "grad_norm": 3.125666379928589, + "learning_rate": 2.9752077735033924e-05, + "loss": 1.7034, + "step": 29760 + }, + { + "epoch": 0.05798238815387192, + "grad_norm": 4.895527362823486, + "learning_rate": 2.975182844202523e-05, + "loss": 1.8021, + "step": 29775 + }, + { + "epoch": 0.05801159842498218, + "grad_norm": 3.4364778995513916, + "learning_rate": 2.9751579024789314e-05, + "loss": 1.9357, + "step": 29790 + }, + { + "epoch": 0.05804080869609245, + "grad_norm": 3.209791421890259, + "learning_rate": 2.9751329483328276e-05, + "loss": 1.8969, + "step": 29805 + }, + { + "epoch": 0.05807001896720271, + "grad_norm": 2.851810932159424, + "learning_rate": 2.9751079817644217e-05, + "loss": 1.8274, + "step": 29820 + }, + { + "epoch": 0.05809922923831297, + "grad_norm": 1.9028266668319702, + "learning_rate": 2.975083002773924e-05, + "loss": 1.7498, + "step": 29835 + }, + { + "epoch": 0.05812843950942324, + "grad_norm": 4.442659378051758, + "learning_rate": 2.9750580113615448e-05, + "loss": 1.906, + "step": 29850 + }, + { + "epoch": 0.0581576497805335, + "grad_norm": 4.513755798339844, + "learning_rate": 2.9750330075274948e-05, + "loss": 1.7855, + "step": 29865 + }, + { + "epoch": 0.05818686005164376, + "grad_norm": 2.4421234130859375, + "learning_rate": 2.975007991271984e-05, + "loss": 1.8355, + "step": 29880 + }, + { + "epoch": 0.05821607032275402, + "grad_norm": 2.0383033752441406, + "learning_rate": 2.974982962595224e-05, + "loss": 1.8456, + "step": 29895 + }, + { + "epoch": 0.05824528059386429, + "grad_norm": 2.9908933639526367, + "learning_rate": 2.9749579214974245e-05, + "loss": 1.7223, + "step": 29910 + }, + { + "epoch": 0.05827449086497455, + "grad_norm": 3.1746137142181396, + "learning_rate": 2.9749328679787976e-05, + "loss": 1.8949, + "step": 29925 + }, + { + "epoch": 0.05830370113608481, + "grad_norm": 3.351712942123413, + "learning_rate": 2.9749078020395526e-05, + "loss": 1.8423, + "step": 29940 + }, + { + "epoch": 0.05833291140719508, + "grad_norm": 4.363383769989014, + "learning_rate": 2.9748827236799024e-05, + "loss": 2.0718, + "step": 29955 + }, + { + "epoch": 0.05836212167830534, + "grad_norm": 3.5311388969421387, + "learning_rate": 2.974857632900057e-05, + "loss": 1.8393, + "step": 29970 + }, + { + "epoch": 0.0583913319494156, + "grad_norm": 2.739178419113159, + "learning_rate": 2.974832529700228e-05, + "loss": 1.901, + "step": 29985 + }, + { + "epoch": 0.05842054222052586, + "grad_norm": 5.376321315765381, + "learning_rate": 2.974807414080627e-05, + "loss": 1.9742, + "step": 30000 + }, + { + "epoch": 0.05844975249163613, + "grad_norm": 4.261213302612305, + "learning_rate": 2.9747822860414652e-05, + "loss": 1.747, + "step": 30015 + }, + { + "epoch": 0.05847896276274639, + "grad_norm": 2.775156259536743, + "learning_rate": 2.9747571455829544e-05, + "loss": 1.8235, + "step": 30030 + }, + { + "epoch": 0.05850817303385665, + "grad_norm": 4.271034240722656, + "learning_rate": 2.9747319927053066e-05, + "loss": 1.834, + "step": 30045 + }, + { + "epoch": 0.05853738330496691, + "grad_norm": 3.628793239593506, + "learning_rate": 2.9747068274087327e-05, + "loss": 2.0451, + "step": 30060 + }, + { + "epoch": 0.05856659357607718, + "grad_norm": 2.388171434402466, + "learning_rate": 2.9746816496934452e-05, + "loss": 1.9012, + "step": 30075 + }, + { + "epoch": 0.05859580384718744, + "grad_norm": 5.260224342346191, + "learning_rate": 2.9746564595596566e-05, + "loss": 1.8394, + "step": 30090 + }, + { + "epoch": 0.0586250141182977, + "grad_norm": 3.47929310798645, + "learning_rate": 2.9746312570075786e-05, + "loss": 1.7858, + "step": 30105 + }, + { + "epoch": 0.05865422438940797, + "grad_norm": 2.082864761352539, + "learning_rate": 2.974606042037423e-05, + "loss": 1.8976, + "step": 30120 + }, + { + "epoch": 0.05868343466051823, + "grad_norm": 2.003638982772827, + "learning_rate": 2.9745808146494027e-05, + "loss": 1.7789, + "step": 30135 + }, + { + "epoch": 0.05871264493162849, + "grad_norm": 3.8583061695098877, + "learning_rate": 2.9745555748437298e-05, + "loss": 1.9207, + "step": 30150 + }, + { + "epoch": 0.05874185520273875, + "grad_norm": 4.0564703941345215, + "learning_rate": 2.9745303226206172e-05, + "loss": 1.9422, + "step": 30165 + }, + { + "epoch": 0.05877106547384902, + "grad_norm": 1.928350806236267, + "learning_rate": 2.974505057980277e-05, + "loss": 1.8624, + "step": 30180 + }, + { + "epoch": 0.05880027574495928, + "grad_norm": 2.1937801837921143, + "learning_rate": 2.974479780922923e-05, + "loss": 1.9617, + "step": 30195 + }, + { + "epoch": 0.05882948601606954, + "grad_norm": 4.145959854125977, + "learning_rate": 2.974454491448767e-05, + "loss": 1.8212, + "step": 30210 + }, + { + "epoch": 0.05885869628717981, + "grad_norm": 4.780306816101074, + "learning_rate": 2.9744291895580222e-05, + "loss": 1.7573, + "step": 30225 + }, + { + "epoch": 0.05888790655829007, + "grad_norm": 3.6125333309173584, + "learning_rate": 2.9744038752509016e-05, + "loss": 1.9406, + "step": 30240 + }, + { + "epoch": 0.05891711682940033, + "grad_norm": 2.673916816711426, + "learning_rate": 2.974378548527619e-05, + "loss": 1.8403, + "step": 30255 + }, + { + "epoch": 0.05894632710051059, + "grad_norm": 2.3269338607788086, + "learning_rate": 2.9743532093883877e-05, + "loss": 2.1496, + "step": 30270 + }, + { + "epoch": 0.05897553737162086, + "grad_norm": 4.606401443481445, + "learning_rate": 2.9743278578334205e-05, + "loss": 1.7679, + "step": 30285 + }, + { + "epoch": 0.05900474764273112, + "grad_norm": 4.217224597930908, + "learning_rate": 2.9743024938629303e-05, + "loss": 1.8091, + "step": 30300 + }, + { + "epoch": 0.05903395791384138, + "grad_norm": 2.7839462757110596, + "learning_rate": 2.974277117477132e-05, + "loss": 1.9216, + "step": 30315 + }, + { + "epoch": 0.05906316818495165, + "grad_norm": 2.616410970687866, + "learning_rate": 2.9742517286762386e-05, + "loss": 1.7535, + "step": 30330 + }, + { + "epoch": 0.05909237845606191, + "grad_norm": 2.646209239959717, + "learning_rate": 2.974226327460464e-05, + "loss": 1.7763, + "step": 30345 + }, + { + "epoch": 0.05912158872717217, + "grad_norm": 4.532891750335693, + "learning_rate": 2.9742009138300225e-05, + "loss": 1.9677, + "step": 30360 + }, + { + "epoch": 0.05915079899828243, + "grad_norm": 3.891831874847412, + "learning_rate": 2.9741754877851277e-05, + "loss": 1.8411, + "step": 30375 + }, + { + "epoch": 0.0591800092693927, + "grad_norm": 2.234100580215454, + "learning_rate": 2.9741500493259933e-05, + "loss": 1.77, + "step": 30390 + }, + { + "epoch": 0.05920921954050296, + "grad_norm": 3.4290971755981445, + "learning_rate": 2.9741245984528342e-05, + "loss": 1.9234, + "step": 30405 + }, + { + "epoch": 0.05923842981161322, + "grad_norm": 2.175567150115967, + "learning_rate": 2.974099135165865e-05, + "loss": 2.1054, + "step": 30420 + }, + { + "epoch": 0.05926764008272349, + "grad_norm": 2.412052631378174, + "learning_rate": 2.9740736594652992e-05, + "loss": 1.8607, + "step": 30435 + }, + { + "epoch": 0.05929685035383375, + "grad_norm": 2.27789044380188, + "learning_rate": 2.974048171351352e-05, + "loss": 1.9491, + "step": 30450 + }, + { + "epoch": 0.05932606062494401, + "grad_norm": 3.9195199012756348, + "learning_rate": 2.9740226708242375e-05, + "loss": 1.8516, + "step": 30465 + }, + { + "epoch": 0.059355270896054274, + "grad_norm": 2.3743503093719482, + "learning_rate": 2.973997157884171e-05, + "loss": 2.1744, + "step": 30480 + }, + { + "epoch": 0.05938448116716454, + "grad_norm": 4.110390663146973, + "learning_rate": 2.9739716325313676e-05, + "loss": 1.8803, + "step": 30495 + }, + { + "epoch": 0.0594136914382748, + "grad_norm": 3.738240957260132, + "learning_rate": 2.973946094766041e-05, + "loss": 1.7978, + "step": 30510 + }, + { + "epoch": 0.059442901709385064, + "grad_norm": 2.840871572494507, + "learning_rate": 2.9739205445884075e-05, + "loss": 1.8413, + "step": 30525 + }, + { + "epoch": 0.059472111980495325, + "grad_norm": 2.931602954864502, + "learning_rate": 2.9738949819986814e-05, + "loss": 1.9035, + "step": 30540 + }, + { + "epoch": 0.05950132225160559, + "grad_norm": 2.814918041229248, + "learning_rate": 2.9738694069970787e-05, + "loss": 1.9169, + "step": 30555 + }, + { + "epoch": 0.059530532522715854, + "grad_norm": 4.824680805206299, + "learning_rate": 2.9738438195838145e-05, + "loss": 1.8696, + "step": 30570 + }, + { + "epoch": 0.059559742793826115, + "grad_norm": 2.1868393421173096, + "learning_rate": 2.9738182197591038e-05, + "loss": 1.8737, + "step": 30585 + }, + { + "epoch": 0.05958895306493638, + "grad_norm": 4.172234535217285, + "learning_rate": 2.973792607523163e-05, + "loss": 1.8148, + "step": 30600 + }, + { + "epoch": 0.059618163336046644, + "grad_norm": 4.052221775054932, + "learning_rate": 2.9737669828762074e-05, + "loss": 1.8824, + "step": 30615 + }, + { + "epoch": 0.059647373607156905, + "grad_norm": 3.6041007041931152, + "learning_rate": 2.9737413458184522e-05, + "loss": 1.8255, + "step": 30630 + }, + { + "epoch": 0.059676583878267166, + "grad_norm": 4.06122350692749, + "learning_rate": 2.9737156963501143e-05, + "loss": 1.951, + "step": 30645 + }, + { + "epoch": 0.059705794149377434, + "grad_norm": 2.3008453845977783, + "learning_rate": 2.973690034471409e-05, + "loss": 1.758, + "step": 30660 + }, + { + "epoch": 0.059735004420487695, + "grad_norm": 3.7912003993988037, + "learning_rate": 2.9736643601825532e-05, + "loss": 1.8688, + "step": 30675 + }, + { + "epoch": 0.059764214691597956, + "grad_norm": 3.7779648303985596, + "learning_rate": 2.9736386734837616e-05, + "loss": 1.7613, + "step": 30690 + }, + { + "epoch": 0.05979342496270822, + "grad_norm": 4.632726669311523, + "learning_rate": 2.9736129743752523e-05, + "loss": 1.6212, + "step": 30705 + }, + { + "epoch": 0.059822635233818484, + "grad_norm": 4.628555774688721, + "learning_rate": 2.9735872628572404e-05, + "loss": 1.8643, + "step": 30720 + }, + { + "epoch": 0.059851845504928745, + "grad_norm": 3.8072681427001953, + "learning_rate": 2.9735615389299434e-05, + "loss": 1.7805, + "step": 30735 + }, + { + "epoch": 0.059881055776039006, + "grad_norm": 3.9826717376708984, + "learning_rate": 2.973535802593577e-05, + "loss": 1.8594, + "step": 30750 + }, + { + "epoch": 0.059910266047149274, + "grad_norm": 3.04962420463562, + "learning_rate": 2.9735100538483587e-05, + "loss": 1.9104, + "step": 30765 + }, + { + "epoch": 0.059939476318259535, + "grad_norm": 2.8500261306762695, + "learning_rate": 2.9734842926945048e-05, + "loss": 1.717, + "step": 30780 + }, + { + "epoch": 0.059968686589369796, + "grad_norm": 1.5368775129318237, + "learning_rate": 2.9734585191322324e-05, + "loss": 1.8905, + "step": 30795 + }, + { + "epoch": 0.059997896860480064, + "grad_norm": 3.886584997177124, + "learning_rate": 2.9734327331617588e-05, + "loss": 1.804, + "step": 30810 + }, + { + "epoch": 0.060027107131590325, + "grad_norm": 3.792663812637329, + "learning_rate": 2.9734069347833005e-05, + "loss": 1.9667, + "step": 30825 + }, + { + "epoch": 0.060056317402700586, + "grad_norm": 3.7504067420959473, + "learning_rate": 2.9733811239970756e-05, + "loss": 1.9053, + "step": 30840 + }, + { + "epoch": 0.06008552767381085, + "grad_norm": 2.617370367050171, + "learning_rate": 2.973355300803301e-05, + "loss": 1.8853, + "step": 30855 + }, + { + "epoch": 0.060114737944921115, + "grad_norm": 3.2594456672668457, + "learning_rate": 2.973329465202194e-05, + "loss": 1.7045, + "step": 30870 + }, + { + "epoch": 0.060143948216031376, + "grad_norm": 1.8166978359222412, + "learning_rate": 2.9733036171939723e-05, + "loss": 1.8431, + "step": 30885 + }, + { + "epoch": 0.06017315848714164, + "grad_norm": 2.9280359745025635, + "learning_rate": 2.973277756778854e-05, + "loss": 1.9088, + "step": 30900 + }, + { + "epoch": 0.060202368758251905, + "grad_norm": 4.446992874145508, + "learning_rate": 2.973251883957056e-05, + "loss": 1.7744, + "step": 30915 + }, + { + "epoch": 0.060231579029362166, + "grad_norm": 4.279955863952637, + "learning_rate": 2.9732259987287967e-05, + "loss": 1.8309, + "step": 30930 + }, + { + "epoch": 0.06026078930047243, + "grad_norm": 2.3727381229400635, + "learning_rate": 2.9732001010942946e-05, + "loss": 1.9247, + "step": 30945 + }, + { + "epoch": 0.06028999957158269, + "grad_norm": 2.6014645099639893, + "learning_rate": 2.9731741910537668e-05, + "loss": 1.7653, + "step": 30960 + }, + { + "epoch": 0.060319209842692956, + "grad_norm": 2.387563467025757, + "learning_rate": 2.973148268607432e-05, + "loss": 1.7814, + "step": 30975 + }, + { + "epoch": 0.06034842011380322, + "grad_norm": 3.4683475494384766, + "learning_rate": 2.9731223337555088e-05, + "loss": 1.8522, + "step": 30990 + }, + { + "epoch": 0.06037763038491348, + "grad_norm": 2.352069854736328, + "learning_rate": 2.973096386498215e-05, + "loss": 2.0015, + "step": 31005 + }, + { + "epoch": 0.06040684065602374, + "grad_norm": 3.2257015705108643, + "learning_rate": 2.9730704268357694e-05, + "loss": 1.8176, + "step": 31020 + }, + { + "epoch": 0.06043605092713401, + "grad_norm": 2.4939286708831787, + "learning_rate": 2.97304445476839e-05, + "loss": 1.7388, + "step": 31035 + }, + { + "epoch": 0.06046526119824427, + "grad_norm": 3.216533660888672, + "learning_rate": 2.9730184702962968e-05, + "loss": 1.859, + "step": 31050 + }, + { + "epoch": 0.06049447146935453, + "grad_norm": 3.4212570190429688, + "learning_rate": 2.9729924734197077e-05, + "loss": 1.7748, + "step": 31065 + }, + { + "epoch": 0.060523681740464796, + "grad_norm": 3.6867172718048096, + "learning_rate": 2.9729664641388417e-05, + "loss": 2.0345, + "step": 31080 + }, + { + "epoch": 0.06055289201157506, + "grad_norm": 3.243898630142212, + "learning_rate": 2.9729404424539183e-05, + "loss": 1.9114, + "step": 31095 + }, + { + "epoch": 0.06058210228268532, + "grad_norm": 2.8866801261901855, + "learning_rate": 2.9729144083651554e-05, + "loss": 1.9833, + "step": 31110 + }, + { + "epoch": 0.06061131255379558, + "grad_norm": 3.0051145553588867, + "learning_rate": 2.9728883618727738e-05, + "loss": 2.026, + "step": 31125 + }, + { + "epoch": 0.06064052282490585, + "grad_norm": 3.0416595935821533, + "learning_rate": 2.972862302976992e-05, + "loss": 1.8541, + "step": 31140 + }, + { + "epoch": 0.06066973309601611, + "grad_norm": 2.835458517074585, + "learning_rate": 2.9728362316780294e-05, + "loss": 1.8636, + "step": 31155 + }, + { + "epoch": 0.06069894336712637, + "grad_norm": 2.40006947517395, + "learning_rate": 2.972810147976106e-05, + "loss": 1.9535, + "step": 31170 + }, + { + "epoch": 0.06072815363823664, + "grad_norm": 2.7326056957244873, + "learning_rate": 2.9727840518714415e-05, + "loss": 1.7814, + "step": 31185 + }, + { + "epoch": 0.0607573639093469, + "grad_norm": 4.403411388397217, + "learning_rate": 2.972757943364255e-05, + "loss": 1.7441, + "step": 31200 + }, + { + "epoch": 0.06078657418045716, + "grad_norm": 2.259716272354126, + "learning_rate": 2.9727318224547667e-05, + "loss": 1.8447, + "step": 31215 + }, + { + "epoch": 0.06081578445156742, + "grad_norm": 2.397702217102051, + "learning_rate": 2.9727056891431966e-05, + "loss": 1.7677, + "step": 31230 + }, + { + "epoch": 0.06084499472267769, + "grad_norm": 3.840811014175415, + "learning_rate": 2.9726795434297646e-05, + "loss": 1.7271, + "step": 31245 + }, + { + "epoch": 0.06087420499378795, + "grad_norm": 2.695345163345337, + "learning_rate": 2.9726533853146913e-05, + "loss": 1.9575, + "step": 31260 + }, + { + "epoch": 0.06090341526489821, + "grad_norm": 2.330676555633545, + "learning_rate": 2.972627214798197e-05, + "loss": 1.9488, + "step": 31275 + }, + { + "epoch": 0.06093262553600848, + "grad_norm": 3.987534999847412, + "learning_rate": 2.9726010318805014e-05, + "loss": 1.8628, + "step": 31290 + }, + { + "epoch": 0.06096183580711874, + "grad_norm": 3.307753086090088, + "learning_rate": 2.9725748365618252e-05, + "loss": 1.9931, + "step": 31305 + }, + { + "epoch": 0.060991046078229, + "grad_norm": 4.703588485717773, + "learning_rate": 2.9725486288423894e-05, + "loss": 1.8416, + "step": 31320 + }, + { + "epoch": 0.06102025634933926, + "grad_norm": 2.01497220993042, + "learning_rate": 2.9725224087224146e-05, + "loss": 1.897, + "step": 31335 + }, + { + "epoch": 0.06104946662044953, + "grad_norm": 2.9679431915283203, + "learning_rate": 2.9724961762021215e-05, + "loss": 1.951, + "step": 31350 + }, + { + "epoch": 0.06107867689155979, + "grad_norm": 2.5453622341156006, + "learning_rate": 2.972469931281731e-05, + "loss": 1.8823, + "step": 31365 + }, + { + "epoch": 0.06110788716267005, + "grad_norm": 4.135504245758057, + "learning_rate": 2.9724436739614643e-05, + "loss": 1.9061, + "step": 31380 + }, + { + "epoch": 0.06113709743378032, + "grad_norm": 1.99344003200531, + "learning_rate": 2.9724174042415417e-05, + "loss": 1.986, + "step": 31395 + }, + { + "epoch": 0.06116630770489058, + "grad_norm": 4.097934722900391, + "learning_rate": 2.9723911221221857e-05, + "loss": 1.8222, + "step": 31410 + }, + { + "epoch": 0.06119551797600084, + "grad_norm": 4.836818695068359, + "learning_rate": 2.9723648276036165e-05, + "loss": 1.9038, + "step": 31425 + }, + { + "epoch": 0.0612247282471111, + "grad_norm": 2.4198572635650635, + "learning_rate": 2.972338520686056e-05, + "loss": 1.9228, + "step": 31440 + }, + { + "epoch": 0.06125393851822137, + "grad_norm": 4.546353816986084, + "learning_rate": 2.9723122013697265e-05, + "loss": 1.6837, + "step": 31455 + }, + { + "epoch": 0.06128314878933163, + "grad_norm": 3.9125819206237793, + "learning_rate": 2.972285869654848e-05, + "loss": 1.9057, + "step": 31470 + }, + { + "epoch": 0.06131235906044189, + "grad_norm": 2.7409276962280273, + "learning_rate": 2.9722595255416435e-05, + "loss": 1.6805, + "step": 31485 + }, + { + "epoch": 0.06134156933155216, + "grad_norm": 3.4497761726379395, + "learning_rate": 2.9722331690303344e-05, + "loss": 1.7006, + "step": 31500 + }, + { + "epoch": 0.06137077960266242, + "grad_norm": 3.8101327419281006, + "learning_rate": 2.9722068001211427e-05, + "loss": 1.8445, + "step": 31515 + }, + { + "epoch": 0.06139998987377268, + "grad_norm": 4.296467304229736, + "learning_rate": 2.9721804188142906e-05, + "loss": 1.9453, + "step": 31530 + }, + { + "epoch": 0.06142920014488294, + "grad_norm": 4.05743408203125, + "learning_rate": 2.9721540251100003e-05, + "loss": 1.6627, + "step": 31545 + }, + { + "epoch": 0.06145841041599321, + "grad_norm": 1.896415114402771, + "learning_rate": 2.9721276190084934e-05, + "loss": 2.046, + "step": 31560 + }, + { + "epoch": 0.06148762068710347, + "grad_norm": 3.3542966842651367, + "learning_rate": 2.9721012005099933e-05, + "loss": 1.7947, + "step": 31575 + }, + { + "epoch": 0.06151683095821373, + "grad_norm": 2.018230676651001, + "learning_rate": 2.9720747696147214e-05, + "loss": 1.8102, + "step": 31590 + }, + { + "epoch": 0.06154604122932399, + "grad_norm": 2.8734261989593506, + "learning_rate": 2.9720483263229012e-05, + "loss": 1.9102, + "step": 31605 + }, + { + "epoch": 0.06157525150043426, + "grad_norm": 2.8697266578674316, + "learning_rate": 2.972021870634755e-05, + "loss": 1.9179, + "step": 31620 + }, + { + "epoch": 0.06160446177154452, + "grad_norm": 2.5872962474823, + "learning_rate": 2.9719954025505054e-05, + "loss": 1.8555, + "step": 31635 + }, + { + "epoch": 0.06163367204265478, + "grad_norm": 3.8212106227874756, + "learning_rate": 2.9719689220703758e-05, + "loss": 1.7516, + "step": 31650 + }, + { + "epoch": 0.06166288231376505, + "grad_norm": 2.7884206771850586, + "learning_rate": 2.9719424291945885e-05, + "loss": 1.8213, + "step": 31665 + }, + { + "epoch": 0.06169209258487531, + "grad_norm": 2.5221076011657715, + "learning_rate": 2.971915923923367e-05, + "loss": 1.6586, + "step": 31680 + }, + { + "epoch": 0.06172130285598557, + "grad_norm": 2.4005963802337646, + "learning_rate": 2.971889406256935e-05, + "loss": 2.0582, + "step": 31695 + }, + { + "epoch": 0.061750513127095834, + "grad_norm": 2.02565860748291, + "learning_rate": 2.9718628761955146e-05, + "loss": 1.8941, + "step": 31710 + }, + { + "epoch": 0.0617797233982061, + "grad_norm": 3.2867822647094727, + "learning_rate": 2.9718363337393303e-05, + "loss": 1.7149, + "step": 31725 + }, + { + "epoch": 0.06180893366931636, + "grad_norm": 2.4237873554229736, + "learning_rate": 2.9718097788886054e-05, + "loss": 1.8685, + "step": 31740 + }, + { + "epoch": 0.061838143940426624, + "grad_norm": 2.000887155532837, + "learning_rate": 2.9717832116435632e-05, + "loss": 1.7489, + "step": 31755 + }, + { + "epoch": 0.06186735421153689, + "grad_norm": 2.2382023334503174, + "learning_rate": 2.9717566320044276e-05, + "loss": 1.8845, + "step": 31770 + }, + { + "epoch": 0.06189656448264715, + "grad_norm": 3.9158990383148193, + "learning_rate": 2.9717300399714222e-05, + "loss": 1.689, + "step": 31785 + }, + { + "epoch": 0.061925774753757414, + "grad_norm": 4.004560470581055, + "learning_rate": 2.9717034355447713e-05, + "loss": 1.9279, + "step": 31800 + }, + { + "epoch": 0.061954985024867675, + "grad_norm": 2.9310123920440674, + "learning_rate": 2.9716768187246986e-05, + "loss": 1.771, + "step": 31815 + }, + { + "epoch": 0.06198419529597794, + "grad_norm": 2.5769150257110596, + "learning_rate": 2.9716501895114287e-05, + "loss": 1.9365, + "step": 31830 + }, + { + "epoch": 0.0620134055670882, + "grad_norm": 3.767500162124634, + "learning_rate": 2.9716235479051858e-05, + "loss": 1.8358, + "step": 31845 + }, + { + "epoch": 0.062042615838198464, + "grad_norm": 4.410051345825195, + "learning_rate": 2.9715968939061932e-05, + "loss": 1.8382, + "step": 31860 + }, + { + "epoch": 0.06207182610930873, + "grad_norm": 2.5900702476501465, + "learning_rate": 2.9715702275146767e-05, + "loss": 1.8717, + "step": 31875 + }, + { + "epoch": 0.06210103638041899, + "grad_norm": 3.7013213634490967, + "learning_rate": 2.9715435487308608e-05, + "loss": 1.8794, + "step": 31890 + }, + { + "epoch": 0.062130246651529254, + "grad_norm": 2.48606276512146, + "learning_rate": 2.9715168575549688e-05, + "loss": 1.9964, + "step": 31905 + }, + { + "epoch": 0.062159456922639515, + "grad_norm": 2.358415365219116, + "learning_rate": 2.9714901539872267e-05, + "loss": 1.902, + "step": 31920 + }, + { + "epoch": 0.06218866719374978, + "grad_norm": 3.140083074569702, + "learning_rate": 2.9714634380278594e-05, + "loss": 1.8696, + "step": 31935 + }, + { + "epoch": 0.062217877464860044, + "grad_norm": 3.2135777473449707, + "learning_rate": 2.9714367096770913e-05, + "loss": 2.0002, + "step": 31950 + }, + { + "epoch": 0.062247087735970305, + "grad_norm": 3.2304131984710693, + "learning_rate": 2.9714099689351477e-05, + "loss": 1.8721, + "step": 31965 + }, + { + "epoch": 0.06227629800708057, + "grad_norm": 2.206209659576416, + "learning_rate": 2.9713832158022535e-05, + "loss": 1.9788, + "step": 31980 + }, + { + "epoch": 0.062305508278190834, + "grad_norm": 3.055828332901001, + "learning_rate": 2.9713564502786348e-05, + "loss": 1.7976, + "step": 31995 + }, + { + "epoch": 0.062334718549301095, + "grad_norm": 3.891845226287842, + "learning_rate": 2.9713296723645165e-05, + "loss": 1.7772, + "step": 32010 + }, + { + "epoch": 0.062363928820411356, + "grad_norm": 1.8532415628433228, + "learning_rate": 2.9713028820601238e-05, + "loss": 1.9817, + "step": 32025 + }, + { + "epoch": 0.062393139091521624, + "grad_norm": 1.650211215019226, + "learning_rate": 2.9712760793656826e-05, + "loss": 2.1243, + "step": 32040 + }, + { + "epoch": 0.062422349362631885, + "grad_norm": 2.663621187210083, + "learning_rate": 2.971249264281419e-05, + "loss": 2.0554, + "step": 32055 + }, + { + "epoch": 0.062451559633742146, + "grad_norm": 3.2437026500701904, + "learning_rate": 2.9712224368075578e-05, + "loss": 1.765, + "step": 32070 + }, + { + "epoch": 0.06248076990485241, + "grad_norm": 4.013802528381348, + "learning_rate": 2.971195596944326e-05, + "loss": 1.8195, + "step": 32085 + }, + { + "epoch": 0.06250998017596267, + "grad_norm": 4.0295610427856445, + "learning_rate": 2.971168744691949e-05, + "loss": 2.036, + "step": 32100 + }, + { + "epoch": 0.06253919044707294, + "grad_norm": 2.830064296722412, + "learning_rate": 2.9711418800506533e-05, + "loss": 1.8499, + "step": 32115 + }, + { + "epoch": 0.0625684007181832, + "grad_norm": 3.896538019180298, + "learning_rate": 2.9711150030206643e-05, + "loss": 2.0089, + "step": 32130 + }, + { + "epoch": 0.06259761098929346, + "grad_norm": 3.6802754402160645, + "learning_rate": 2.97108811360221e-05, + "loss": 1.9039, + "step": 32145 + }, + { + "epoch": 0.06262682126040373, + "grad_norm": 2.8547956943511963, + "learning_rate": 2.9710612117955148e-05, + "loss": 1.9065, + "step": 32160 + }, + { + "epoch": 0.06265603153151399, + "grad_norm": 2.0819807052612305, + "learning_rate": 2.9710342976008066e-05, + "loss": 1.7786, + "step": 32175 + }, + { + "epoch": 0.06268524180262425, + "grad_norm": 2.173609972000122, + "learning_rate": 2.9710073710183118e-05, + "loss": 1.8011, + "step": 32190 + }, + { + "epoch": 0.06271445207373451, + "grad_norm": 2.2233803272247314, + "learning_rate": 2.970980432048257e-05, + "loss": 1.8308, + "step": 32205 + }, + { + "epoch": 0.06274366234484478, + "grad_norm": 2.227229356765747, + "learning_rate": 2.9709534806908692e-05, + "loss": 1.9227, + "step": 32220 + }, + { + "epoch": 0.06277287261595504, + "grad_norm": 3.8464114665985107, + "learning_rate": 2.970926516946375e-05, + "loss": 1.9808, + "step": 32235 + }, + { + "epoch": 0.0628020828870653, + "grad_norm": 4.511869430541992, + "learning_rate": 2.9708995408150018e-05, + "loss": 1.9856, + "step": 32250 + }, + { + "epoch": 0.06283129315817557, + "grad_norm": 2.0209896564483643, + "learning_rate": 2.9708725522969767e-05, + "loss": 1.8766, + "step": 32265 + }, + { + "epoch": 0.06286050342928583, + "grad_norm": 4.369567394256592, + "learning_rate": 2.9708455513925273e-05, + "loss": 1.8103, + "step": 32280 + }, + { + "epoch": 0.06288971370039609, + "grad_norm": 4.502676486968994, + "learning_rate": 2.97081853810188e-05, + "loss": 1.8489, + "step": 32295 + }, + { + "epoch": 0.06291892397150635, + "grad_norm": 2.288998603820801, + "learning_rate": 2.9707915124252637e-05, + "loss": 1.9542, + "step": 32310 + }, + { + "epoch": 0.06294813424261662, + "grad_norm": 2.6359355449676514, + "learning_rate": 2.9707644743629046e-05, + "loss": 1.7883, + "step": 32325 + }, + { + "epoch": 0.06297734451372689, + "grad_norm": 4.356678485870361, + "learning_rate": 2.970737423915031e-05, + "loss": 1.8937, + "step": 32340 + }, + { + "epoch": 0.06300655478483715, + "grad_norm": 3.2190353870391846, + "learning_rate": 2.970710361081871e-05, + "loss": 1.9758, + "step": 32355 + }, + { + "epoch": 0.06303576505594741, + "grad_norm": 2.522303819656372, + "learning_rate": 2.970683285863652e-05, + "loss": 1.7315, + "step": 32370 + }, + { + "epoch": 0.06306497532705767, + "grad_norm": 2.500927448272705, + "learning_rate": 2.9706561982606023e-05, + "loss": 1.9004, + "step": 32385 + }, + { + "epoch": 0.06309418559816793, + "grad_norm": 3.656970262527466, + "learning_rate": 2.97062909827295e-05, + "loss": 1.9246, + "step": 32400 + }, + { + "epoch": 0.06312339586927819, + "grad_norm": 4.1837158203125, + "learning_rate": 2.970601985900923e-05, + "loss": 1.6453, + "step": 32415 + }, + { + "epoch": 0.06315260614038845, + "grad_norm": 5.141902446746826, + "learning_rate": 2.9705748611447498e-05, + "loss": 1.7215, + "step": 32430 + }, + { + "epoch": 0.06318181641149873, + "grad_norm": 5.384919166564941, + "learning_rate": 2.9705477240046595e-05, + "loss": 2.0638, + "step": 32445 + }, + { + "epoch": 0.06321102668260899, + "grad_norm": 2.812014579772949, + "learning_rate": 2.9705205744808795e-05, + "loss": 1.8133, + "step": 32460 + }, + { + "epoch": 0.06324023695371925, + "grad_norm": 4.618744850158691, + "learning_rate": 2.970493412573639e-05, + "loss": 1.7221, + "step": 32475 + }, + { + "epoch": 0.06326944722482951, + "grad_norm": 2.0948140621185303, + "learning_rate": 2.9704662382831665e-05, + "loss": 1.7292, + "step": 32490 + }, + { + "epoch": 0.06329865749593977, + "grad_norm": 2.2021284103393555, + "learning_rate": 2.970439051609691e-05, + "loss": 1.6685, + "step": 32505 + }, + { + "epoch": 0.06332786776705003, + "grad_norm": 4.773664474487305, + "learning_rate": 2.9704118525534414e-05, + "loss": 1.8922, + "step": 32520 + }, + { + "epoch": 0.06335707803816029, + "grad_norm": 2.716444253921509, + "learning_rate": 2.970384641114647e-05, + "loss": 1.816, + "step": 32535 + }, + { + "epoch": 0.06338628830927057, + "grad_norm": 4.086103916168213, + "learning_rate": 2.9703574172935366e-05, + "loss": 1.8084, + "step": 32550 + }, + { + "epoch": 0.06341549858038083, + "grad_norm": 2.4854936599731445, + "learning_rate": 2.9703301810903397e-05, + "loss": 1.7871, + "step": 32565 + }, + { + "epoch": 0.06344470885149109, + "grad_norm": 1.8130111694335938, + "learning_rate": 2.9703029325052857e-05, + "loss": 1.8795, + "step": 32580 + }, + { + "epoch": 0.06347391912260135, + "grad_norm": 3.8686211109161377, + "learning_rate": 2.970275671538604e-05, + "loss": 1.9867, + "step": 32595 + }, + { + "epoch": 0.06350312939371161, + "grad_norm": 2.5253164768218994, + "learning_rate": 2.9702483981905233e-05, + "loss": 1.875, + "step": 32610 + }, + { + "epoch": 0.06353233966482187, + "grad_norm": 3.0097036361694336, + "learning_rate": 2.9702211124612745e-05, + "loss": 1.9521, + "step": 32625 + }, + { + "epoch": 0.06356154993593213, + "grad_norm": 3.3433878421783447, + "learning_rate": 2.9701938143510873e-05, + "loss": 1.806, + "step": 32640 + }, + { + "epoch": 0.06359076020704241, + "grad_norm": 1.4688777923583984, + "learning_rate": 2.970166503860191e-05, + "loss": 1.8248, + "step": 32655 + }, + { + "epoch": 0.06361997047815267, + "grad_norm": 2.365694522857666, + "learning_rate": 2.9701391809888156e-05, + "loss": 1.7152, + "step": 32670 + }, + { + "epoch": 0.06364918074926293, + "grad_norm": 2.6821203231811523, + "learning_rate": 2.9701118457371915e-05, + "loss": 1.858, + "step": 32685 + }, + { + "epoch": 0.06367839102037319, + "grad_norm": 1.842124342918396, + "learning_rate": 2.9700844981055486e-05, + "loss": 1.7689, + "step": 32700 + }, + { + "epoch": 0.06370760129148345, + "grad_norm": 2.2430474758148193, + "learning_rate": 2.9700571380941178e-05, + "loss": 2.0056, + "step": 32715 + }, + { + "epoch": 0.06373681156259371, + "grad_norm": 3.1308460235595703, + "learning_rate": 2.970029765703129e-05, + "loss": 1.9163, + "step": 32730 + }, + { + "epoch": 0.06376602183370397, + "grad_norm": 2.6456706523895264, + "learning_rate": 2.9700023809328122e-05, + "loss": 1.8002, + "step": 32745 + }, + { + "epoch": 0.06379523210481425, + "grad_norm": 3.659024953842163, + "learning_rate": 2.969974983783399e-05, + "loss": 1.9222, + "step": 32760 + }, + { + "epoch": 0.06382444237592451, + "grad_norm": 2.3271381855010986, + "learning_rate": 2.96994757425512e-05, + "loss": 1.7778, + "step": 32775 + }, + { + "epoch": 0.06385365264703477, + "grad_norm": 3.411533832550049, + "learning_rate": 2.9699201523482057e-05, + "loss": 1.778, + "step": 32790 + }, + { + "epoch": 0.06388286291814503, + "grad_norm": 2.649052143096924, + "learning_rate": 2.9698927180628866e-05, + "loss": 1.8199, + "step": 32805 + }, + { + "epoch": 0.06391207318925529, + "grad_norm": 2.8163363933563232, + "learning_rate": 2.9698652713993947e-05, + "loss": 1.8579, + "step": 32820 + }, + { + "epoch": 0.06394128346036555, + "grad_norm": 3.319955348968506, + "learning_rate": 2.9698378123579603e-05, + "loss": 1.9385, + "step": 32835 + }, + { + "epoch": 0.06397049373147581, + "grad_norm": 2.217761754989624, + "learning_rate": 2.969810340938815e-05, + "loss": 2.0007, + "step": 32850 + }, + { + "epoch": 0.06399970400258609, + "grad_norm": 4.801873683929443, + "learning_rate": 2.96978285714219e-05, + "loss": 1.9876, + "step": 32865 + }, + { + "epoch": 0.06402891427369635, + "grad_norm": 3.5146069526672363, + "learning_rate": 2.969755360968317e-05, + "loss": 1.8267, + "step": 32880 + }, + { + "epoch": 0.06405812454480661, + "grad_norm": 3.407545328140259, + "learning_rate": 2.9697278524174275e-05, + "loss": 1.743, + "step": 32895 + }, + { + "epoch": 0.06408733481591687, + "grad_norm": 1.410946011543274, + "learning_rate": 2.969700331489753e-05, + "loss": 1.9281, + "step": 32910 + }, + { + "epoch": 0.06411654508702713, + "grad_norm": 2.602207899093628, + "learning_rate": 2.9696727981855253e-05, + "loss": 1.8532, + "step": 32925 + }, + { + "epoch": 0.0641457553581374, + "grad_norm": 4.358025550842285, + "learning_rate": 2.969645252504976e-05, + "loss": 1.7457, + "step": 32940 + }, + { + "epoch": 0.06417496562924765, + "grad_norm": 3.8209457397460938, + "learning_rate": 2.9696176944483373e-05, + "loss": 2.0736, + "step": 32955 + }, + { + "epoch": 0.06420417590035793, + "grad_norm": 3.317567825317383, + "learning_rate": 2.9695901240158415e-05, + "loss": 1.9322, + "step": 32970 + }, + { + "epoch": 0.06423338617146819, + "grad_norm": 4.668582439422607, + "learning_rate": 2.9695625412077208e-05, + "loss": 2.0152, + "step": 32985 + }, + { + "epoch": 0.06426259644257845, + "grad_norm": 2.6789300441741943, + "learning_rate": 2.969534946024207e-05, + "loss": 1.7129, + "step": 33000 + }, + { + "epoch": 0.06429180671368871, + "grad_norm": 2.073744297027588, + "learning_rate": 2.9695073384655326e-05, + "loss": 1.6952, + "step": 33015 + }, + { + "epoch": 0.06432101698479897, + "grad_norm": 3.3017473220825195, + "learning_rate": 2.9694797185319304e-05, + "loss": 1.8158, + "step": 33030 + }, + { + "epoch": 0.06435022725590923, + "grad_norm": 4.441155433654785, + "learning_rate": 2.969452086223633e-05, + "loss": 1.9204, + "step": 33045 + }, + { + "epoch": 0.0643794375270195, + "grad_norm": 2.9764716625213623, + "learning_rate": 2.9694244415408727e-05, + "loss": 1.8952, + "step": 33060 + }, + { + "epoch": 0.06440864779812977, + "grad_norm": 4.1646575927734375, + "learning_rate": 2.9693967844838827e-05, + "loss": 1.7692, + "step": 33075 + }, + { + "epoch": 0.06443785806924003, + "grad_norm": 4.056210994720459, + "learning_rate": 2.9693691150528955e-05, + "loss": 1.855, + "step": 33090 + }, + { + "epoch": 0.06446706834035029, + "grad_norm": 2.6098761558532715, + "learning_rate": 2.9693414332481443e-05, + "loss": 1.8284, + "step": 33105 + }, + { + "epoch": 0.06449627861146055, + "grad_norm": 4.3388166427612305, + "learning_rate": 2.9693137390698622e-05, + "loss": 1.8155, + "step": 33120 + }, + { + "epoch": 0.06452548888257081, + "grad_norm": 3.3735921382904053, + "learning_rate": 2.9692860325182825e-05, + "loss": 1.7765, + "step": 33135 + }, + { + "epoch": 0.06455469915368107, + "grad_norm": 3.9341137409210205, + "learning_rate": 2.9692583135936385e-05, + "loss": 1.7622, + "step": 33150 + }, + { + "epoch": 0.06458390942479134, + "grad_norm": 5.620511531829834, + "learning_rate": 2.9692305822961637e-05, + "loss": 1.7926, + "step": 33165 + }, + { + "epoch": 0.06461311969590161, + "grad_norm": 1.8949609994888306, + "learning_rate": 2.9692028386260915e-05, + "loss": 1.9158, + "step": 33180 + }, + { + "epoch": 0.06464232996701187, + "grad_norm": 2.183438539505005, + "learning_rate": 2.9691750825836557e-05, + "loss": 1.9586, + "step": 33195 + }, + { + "epoch": 0.06467154023812213, + "grad_norm": 2.05364727973938, + "learning_rate": 2.9691473141690894e-05, + "loss": 1.8952, + "step": 33210 + }, + { + "epoch": 0.0647007505092324, + "grad_norm": 2.479680299758911, + "learning_rate": 2.9691195333826277e-05, + "loss": 1.7908, + "step": 33225 + }, + { + "epoch": 0.06472996078034265, + "grad_norm": 2.0384159088134766, + "learning_rate": 2.9690917402245034e-05, + "loss": 1.8231, + "step": 33240 + }, + { + "epoch": 0.06475917105145292, + "grad_norm": 2.3003928661346436, + "learning_rate": 2.9690639346949503e-05, + "loss": 1.6675, + "step": 33255 + }, + { + "epoch": 0.06478838132256318, + "grad_norm": 2.9308221340179443, + "learning_rate": 2.9690361167942042e-05, + "loss": 1.9136, + "step": 33270 + }, + { + "epoch": 0.06481759159367345, + "grad_norm": 2.3866639137268066, + "learning_rate": 2.9690082865224975e-05, + "loss": 1.8992, + "step": 33285 + }, + { + "epoch": 0.06484680186478371, + "grad_norm": 4.399453163146973, + "learning_rate": 2.968980443880066e-05, + "loss": 1.9369, + "step": 33300 + }, + { + "epoch": 0.06487601213589397, + "grad_norm": 3.06842041015625, + "learning_rate": 2.9689525888671436e-05, + "loss": 1.9624, + "step": 33315 + }, + { + "epoch": 0.06490522240700423, + "grad_norm": 1.9636744260787964, + "learning_rate": 2.9689247214839647e-05, + "loss": 1.7785, + "step": 33330 + }, + { + "epoch": 0.0649344326781145, + "grad_norm": 3.882155418395996, + "learning_rate": 2.968896841730764e-05, + "loss": 1.8328, + "step": 33345 + }, + { + "epoch": 0.06496364294922476, + "grad_norm": 3.773200511932373, + "learning_rate": 2.9688689496077764e-05, + "loss": 1.9322, + "step": 33360 + }, + { + "epoch": 0.06499285322033502, + "grad_norm": 2.331101417541504, + "learning_rate": 2.968841045115237e-05, + "loss": 1.7726, + "step": 33375 + }, + { + "epoch": 0.06502206349144529, + "grad_norm": 3.3869986534118652, + "learning_rate": 2.9688131282533802e-05, + "loss": 1.9489, + "step": 33390 + }, + { + "epoch": 0.06505127376255555, + "grad_norm": 2.627638578414917, + "learning_rate": 2.968785199022442e-05, + "loss": 1.7626, + "step": 33405 + }, + { + "epoch": 0.06508048403366581, + "grad_norm": 2.031043291091919, + "learning_rate": 2.9687572574226566e-05, + "loss": 1.8948, + "step": 33420 + }, + { + "epoch": 0.06510969430477608, + "grad_norm": 1.6995463371276855, + "learning_rate": 2.96872930345426e-05, + "loss": 1.8642, + "step": 33435 + }, + { + "epoch": 0.06513890457588634, + "grad_norm": 3.292022943496704, + "learning_rate": 2.9687013371174873e-05, + "loss": 1.8344, + "step": 33450 + }, + { + "epoch": 0.0651681148469966, + "grad_norm": 2.9283857345581055, + "learning_rate": 2.968673358412574e-05, + "loss": 2.0218, + "step": 33465 + }, + { + "epoch": 0.06519732511810686, + "grad_norm": 3.2172300815582275, + "learning_rate": 2.968645367339756e-05, + "loss": 1.8049, + "step": 33480 + }, + { + "epoch": 0.06522653538921712, + "grad_norm": 2.1529486179351807, + "learning_rate": 2.9686173638992687e-05, + "loss": 1.8755, + "step": 33495 + }, + { + "epoch": 0.0652557456603274, + "grad_norm": 2.5349905490875244, + "learning_rate": 2.9685893480913477e-05, + "loss": 2.0173, + "step": 33510 + }, + { + "epoch": 0.06528495593143765, + "grad_norm": 3.646724224090576, + "learning_rate": 2.9685613199162296e-05, + "loss": 1.9269, + "step": 33525 + }, + { + "epoch": 0.06531416620254792, + "grad_norm": 3.3442795276641846, + "learning_rate": 2.96853327937415e-05, + "loss": 1.7363, + "step": 33540 + }, + { + "epoch": 0.06534337647365818, + "grad_norm": 3.301544189453125, + "learning_rate": 2.9685052264653452e-05, + "loss": 1.9225, + "step": 33555 + }, + { + "epoch": 0.06537258674476844, + "grad_norm": 1.919858455657959, + "learning_rate": 2.9684771611900512e-05, + "loss": 1.7969, + "step": 33570 + }, + { + "epoch": 0.0654017970158787, + "grad_norm": 3.6086807250976562, + "learning_rate": 2.9684490835485045e-05, + "loss": 1.8075, + "step": 33585 + }, + { + "epoch": 0.06543100728698896, + "grad_norm": 1.659013032913208, + "learning_rate": 2.9684209935409418e-05, + "loss": 1.7631, + "step": 33600 + }, + { + "epoch": 0.06546021755809923, + "grad_norm": 4.616360187530518, + "learning_rate": 2.968392891167599e-05, + "loss": 1.817, + "step": 33615 + }, + { + "epoch": 0.0654894278292095, + "grad_norm": 2.684124708175659, + "learning_rate": 2.9683647764287136e-05, + "loss": 1.8655, + "step": 33630 + }, + { + "epoch": 0.06551863810031976, + "grad_norm": 2.5763068199157715, + "learning_rate": 2.9683366493245213e-05, + "loss": 1.851, + "step": 33645 + }, + { + "epoch": 0.06554784837143002, + "grad_norm": 4.1813859939575195, + "learning_rate": 2.96830850985526e-05, + "loss": 1.8571, + "step": 33660 + }, + { + "epoch": 0.06557705864254028, + "grad_norm": 3.5396945476531982, + "learning_rate": 2.968280358021166e-05, + "loss": 1.8414, + "step": 33675 + }, + { + "epoch": 0.06560626891365054, + "grad_norm": 3.497781276702881, + "learning_rate": 2.968252193822477e-05, + "loss": 2.0295, + "step": 33690 + }, + { + "epoch": 0.0656354791847608, + "grad_norm": 2.3261985778808594, + "learning_rate": 2.9682240172594294e-05, + "loss": 1.8092, + "step": 33705 + }, + { + "epoch": 0.06566468945587108, + "grad_norm": 2.558701753616333, + "learning_rate": 2.968195828332261e-05, + "loss": 1.9174, + "step": 33720 + }, + { + "epoch": 0.06569389972698134, + "grad_norm": 3.210392713546753, + "learning_rate": 2.9681676270412092e-05, + "loss": 1.8506, + "step": 33735 + }, + { + "epoch": 0.0657231099980916, + "grad_norm": 2.7672812938690186, + "learning_rate": 2.968139413386511e-05, + "loss": 1.9361, + "step": 33750 + }, + { + "epoch": 0.06575232026920186, + "grad_norm": 4.212212562561035, + "learning_rate": 2.9681111873684046e-05, + "loss": 1.7308, + "step": 33765 + }, + { + "epoch": 0.06578153054031212, + "grad_norm": 1.9889777898788452, + "learning_rate": 2.9680829489871274e-05, + "loss": 1.8549, + "step": 33780 + }, + { + "epoch": 0.06581074081142238, + "grad_norm": 1.7036499977111816, + "learning_rate": 2.9680546982429166e-05, + "loss": 1.877, + "step": 33795 + }, + { + "epoch": 0.06583995108253264, + "grad_norm": 2.7053897380828857, + "learning_rate": 2.9680264351360115e-05, + "loss": 1.8077, + "step": 33810 + }, + { + "epoch": 0.06586916135364292, + "grad_norm": 2.8420169353485107, + "learning_rate": 2.967998159666649e-05, + "loss": 1.8568, + "step": 33825 + }, + { + "epoch": 0.06589837162475318, + "grad_norm": 2.4401204586029053, + "learning_rate": 2.9679698718350673e-05, + "loss": 1.8119, + "step": 33840 + }, + { + "epoch": 0.06592758189586344, + "grad_norm": 3.5529043674468994, + "learning_rate": 2.9679415716415053e-05, + "loss": 1.755, + "step": 33855 + }, + { + "epoch": 0.0659567921669737, + "grad_norm": 2.672156572341919, + "learning_rate": 2.9679132590862004e-05, + "loss": 1.9222, + "step": 33870 + }, + { + "epoch": 0.06598600243808396, + "grad_norm": 1.9536501169204712, + "learning_rate": 2.967884934169392e-05, + "loss": 1.9098, + "step": 33885 + }, + { + "epoch": 0.06601521270919422, + "grad_norm": 2.567133903503418, + "learning_rate": 2.9678565968913177e-05, + "loss": 1.924, + "step": 33900 + }, + { + "epoch": 0.06604442298030448, + "grad_norm": 2.739780902862549, + "learning_rate": 2.967828247252217e-05, + "loss": 1.9591, + "step": 33915 + }, + { + "epoch": 0.06607363325141476, + "grad_norm": 3.8824591636657715, + "learning_rate": 2.9677998852523277e-05, + "loss": 1.8305, + "step": 33930 + }, + { + "epoch": 0.06610284352252502, + "grad_norm": 3.137244939804077, + "learning_rate": 2.967771510891889e-05, + "loss": 1.9673, + "step": 33945 + }, + { + "epoch": 0.06613205379363528, + "grad_norm": 2.5533084869384766, + "learning_rate": 2.9677431241711405e-05, + "loss": 1.7838, + "step": 33960 + }, + { + "epoch": 0.06616126406474554, + "grad_norm": 3.2840499877929688, + "learning_rate": 2.9677147250903203e-05, + "loss": 1.7223, + "step": 33975 + }, + { + "epoch": 0.0661904743358558, + "grad_norm": 3.5756568908691406, + "learning_rate": 2.9676863136496685e-05, + "loss": 2.0505, + "step": 33990 + }, + { + "epoch": 0.06621968460696606, + "grad_norm": 2.9339518547058105, + "learning_rate": 2.967657889849423e-05, + "loss": 1.8248, + "step": 34005 + }, + { + "epoch": 0.06624889487807632, + "grad_norm": 2.587409496307373, + "learning_rate": 2.9676294536898247e-05, + "loss": 2.1007, + "step": 34020 + }, + { + "epoch": 0.0662781051491866, + "grad_norm": 2.164442777633667, + "learning_rate": 2.9676010051711123e-05, + "loss": 1.6494, + "step": 34035 + }, + { + "epoch": 0.06630731542029686, + "grad_norm": 2.022341012954712, + "learning_rate": 2.9675725442935252e-05, + "loss": 2.01, + "step": 34050 + }, + { + "epoch": 0.06633652569140712, + "grad_norm": 2.1963977813720703, + "learning_rate": 2.9675440710573036e-05, + "loss": 1.7275, + "step": 34065 + }, + { + "epoch": 0.06636573596251738, + "grad_norm": 2.7345070838928223, + "learning_rate": 2.967515585462687e-05, + "loss": 1.6542, + "step": 34080 + }, + { + "epoch": 0.06639494623362764, + "grad_norm": 2.9531948566436768, + "learning_rate": 2.9674870875099144e-05, + "loss": 1.9653, + "step": 34095 + }, + { + "epoch": 0.0664241565047379, + "grad_norm": 2.839843988418579, + "learning_rate": 2.9674585771992277e-05, + "loss": 1.8574, + "step": 34110 + }, + { + "epoch": 0.06645336677584816, + "grad_norm": 2.976874351501465, + "learning_rate": 2.967430054530865e-05, + "loss": 2.0622, + "step": 34125 + }, + { + "epoch": 0.06648257704695844, + "grad_norm": 2.1137330532073975, + "learning_rate": 2.967401519505068e-05, + "loss": 1.8188, + "step": 34140 + }, + { + "epoch": 0.0665117873180687, + "grad_norm": 2.7060563564300537, + "learning_rate": 2.9673729721220765e-05, + "loss": 1.9227, + "step": 34155 + }, + { + "epoch": 0.06654099758917896, + "grad_norm": 2.3645975589752197, + "learning_rate": 2.9673444123821306e-05, + "loss": 1.8725, + "step": 34170 + }, + { + "epoch": 0.06657020786028922, + "grad_norm": 2.382066011428833, + "learning_rate": 2.967315840285471e-05, + "loss": 1.7719, + "step": 34185 + }, + { + "epoch": 0.06659941813139948, + "grad_norm": 1.7729606628417969, + "learning_rate": 2.9672872558323385e-05, + "loss": 1.8138, + "step": 34200 + }, + { + "epoch": 0.06662862840250974, + "grad_norm": 2.7769312858581543, + "learning_rate": 2.9672586590229735e-05, + "loss": 1.8957, + "step": 34215 + }, + { + "epoch": 0.06665783867362, + "grad_norm": 4.402248382568359, + "learning_rate": 2.9672300498576173e-05, + "loss": 1.9404, + "step": 34230 + }, + { + "epoch": 0.06668704894473028, + "grad_norm": 4.229050159454346, + "learning_rate": 2.96720142833651e-05, + "loss": 1.8112, + "step": 34245 + }, + { + "epoch": 0.06671625921584054, + "grad_norm": 4.353719711303711, + "learning_rate": 2.9671727944598935e-05, + "loss": 1.8216, + "step": 34260 + }, + { + "epoch": 0.0667454694869508, + "grad_norm": 3.652221918106079, + "learning_rate": 2.9671441482280083e-05, + "loss": 1.9569, + "step": 34275 + }, + { + "epoch": 0.06677467975806106, + "grad_norm": 2.7157819271087646, + "learning_rate": 2.9671154896410962e-05, + "loss": 1.7512, + "step": 34290 + }, + { + "epoch": 0.06680389002917132, + "grad_norm": 2.719609498977661, + "learning_rate": 2.9670868186993982e-05, + "loss": 1.9252, + "step": 34305 + }, + { + "epoch": 0.06683310030028158, + "grad_norm": 2.834977865219116, + "learning_rate": 2.967058135403155e-05, + "loss": 1.7615, + "step": 34320 + }, + { + "epoch": 0.06686231057139184, + "grad_norm": 2.27976393699646, + "learning_rate": 2.9670294397526097e-05, + "loss": 1.7578, + "step": 34335 + }, + { + "epoch": 0.06689152084250212, + "grad_norm": 1.7607027292251587, + "learning_rate": 2.967000731748003e-05, + "loss": 1.8367, + "step": 34350 + }, + { + "epoch": 0.06692073111361238, + "grad_norm": 3.453352451324463, + "learning_rate": 2.9669720113895763e-05, + "loss": 1.8645, + "step": 34365 + }, + { + "epoch": 0.06694994138472264, + "grad_norm": 2.4427194595336914, + "learning_rate": 2.9669432786775727e-05, + "loss": 1.8592, + "step": 34380 + }, + { + "epoch": 0.0669791516558329, + "grad_norm": 3.492604970932007, + "learning_rate": 2.9669145336122335e-05, + "loss": 1.8394, + "step": 34395 + }, + { + "epoch": 0.06700836192694316, + "grad_norm": 2.3918869495391846, + "learning_rate": 2.9668857761938e-05, + "loss": 1.7539, + "step": 34410 + }, + { + "epoch": 0.06703757219805342, + "grad_norm": 1.826912760734558, + "learning_rate": 2.9668570064225156e-05, + "loss": 1.8747, + "step": 34425 + }, + { + "epoch": 0.06706678246916369, + "grad_norm": 4.5764241218566895, + "learning_rate": 2.966828224298622e-05, + "loss": 1.8474, + "step": 34440 + }, + { + "epoch": 0.06709599274027396, + "grad_norm": 3.9237124919891357, + "learning_rate": 2.9667994298223612e-05, + "loss": 1.8965, + "step": 34455 + }, + { + "epoch": 0.06712520301138422, + "grad_norm": 3.064443826675415, + "learning_rate": 2.9667706229939765e-05, + "loss": 1.8549, + "step": 34470 + }, + { + "epoch": 0.06715441328249448, + "grad_norm": 2.0611674785614014, + "learning_rate": 2.96674180381371e-05, + "loss": 1.8735, + "step": 34485 + }, + { + "epoch": 0.06718362355360474, + "grad_norm": 3.887948513031006, + "learning_rate": 2.9667129722818044e-05, + "loss": 1.973, + "step": 34500 + }, + { + "epoch": 0.067212833824715, + "grad_norm": 2.5983119010925293, + "learning_rate": 2.966684128398503e-05, + "loss": 1.8455, + "step": 34515 + }, + { + "epoch": 0.06724204409582527, + "grad_norm": 3.6295909881591797, + "learning_rate": 2.9666552721640474e-05, + "loss": 1.738, + "step": 34530 + }, + { + "epoch": 0.06727125436693553, + "grad_norm": 2.456125020980835, + "learning_rate": 2.966626403578682e-05, + "loss": 1.8651, + "step": 34545 + }, + { + "epoch": 0.06730046463804579, + "grad_norm": 1.6152453422546387, + "learning_rate": 2.966597522642649e-05, + "loss": 1.8728, + "step": 34560 + }, + { + "epoch": 0.06732967490915606, + "grad_norm": 2.789376974105835, + "learning_rate": 2.966568629356193e-05, + "loss": 1.8996, + "step": 34575 + }, + { + "epoch": 0.06735888518026632, + "grad_norm": 3.9699041843414307, + "learning_rate": 2.9665397237195555e-05, + "loss": 1.7645, + "step": 34590 + }, + { + "epoch": 0.06738809545137658, + "grad_norm": 2.7433955669403076, + "learning_rate": 2.966510805732981e-05, + "loss": 1.885, + "step": 34605 + }, + { + "epoch": 0.06741730572248684, + "grad_norm": 3.254692316055298, + "learning_rate": 2.9664818753967123e-05, + "loss": 1.7664, + "step": 34620 + }, + { + "epoch": 0.0674465159935971, + "grad_norm": 2.647752285003662, + "learning_rate": 2.966452932710994e-05, + "loss": 1.8037, + "step": 34635 + }, + { + "epoch": 0.06747572626470737, + "grad_norm": 3.770737648010254, + "learning_rate": 2.966423977676069e-05, + "loss": 1.7053, + "step": 34650 + }, + { + "epoch": 0.06750493653581763, + "grad_norm": 2.101461410522461, + "learning_rate": 2.9663950102921814e-05, + "loss": 1.8039, + "step": 34665 + }, + { + "epoch": 0.0675341468069279, + "grad_norm": 4.246973514556885, + "learning_rate": 2.9663660305595754e-05, + "loss": 1.776, + "step": 34680 + }, + { + "epoch": 0.06756335707803816, + "grad_norm": 6.2981157302856445, + "learning_rate": 2.9663370384784946e-05, + "loss": 1.7988, + "step": 34695 + }, + { + "epoch": 0.06759256734914842, + "grad_norm": 2.8231287002563477, + "learning_rate": 2.9663080340491838e-05, + "loss": 1.9157, + "step": 34710 + }, + { + "epoch": 0.06762177762025869, + "grad_norm": 2.1489720344543457, + "learning_rate": 2.9662790172718867e-05, + "loss": 1.7745, + "step": 34725 + }, + { + "epoch": 0.06765098789136895, + "grad_norm": 2.714576482772827, + "learning_rate": 2.9662499881468475e-05, + "loss": 1.8596, + "step": 34740 + }, + { + "epoch": 0.06768019816247921, + "grad_norm": 2.1026928424835205, + "learning_rate": 2.966220946674311e-05, + "loss": 1.9096, + "step": 34755 + }, + { + "epoch": 0.06770940843358947, + "grad_norm": 4.316662311553955, + "learning_rate": 2.9661918928545215e-05, + "loss": 1.8735, + "step": 34770 + }, + { + "epoch": 0.06773861870469974, + "grad_norm": 3.814514636993408, + "learning_rate": 2.966162826687724e-05, + "loss": 1.9701, + "step": 34785 + }, + { + "epoch": 0.06776782897581, + "grad_norm": 4.129803657531738, + "learning_rate": 2.966133748174164e-05, + "loss": 1.9273, + "step": 34800 + }, + { + "epoch": 0.06779703924692027, + "grad_norm": 1.9344481229782104, + "learning_rate": 2.966104657314084e-05, + "loss": 1.8738, + "step": 34815 + }, + { + "epoch": 0.06782624951803053, + "grad_norm": 5.156272888183594, + "learning_rate": 2.9660755541077314e-05, + "loss": 1.9722, + "step": 34830 + }, + { + "epoch": 0.06785545978914079, + "grad_norm": 2.4508838653564453, + "learning_rate": 2.9660464385553504e-05, + "loss": 1.8354, + "step": 34845 + }, + { + "epoch": 0.06788467006025105, + "grad_norm": 4.646978855133057, + "learning_rate": 2.9660173106571856e-05, + "loss": 1.6924, + "step": 34860 + }, + { + "epoch": 0.06791388033136131, + "grad_norm": 4.116873741149902, + "learning_rate": 2.9659881704134835e-05, + "loss": 1.9238, + "step": 34875 + }, + { + "epoch": 0.06794309060247158, + "grad_norm": 2.2530601024627686, + "learning_rate": 2.965959017824488e-05, + "loss": 1.744, + "step": 34890 + }, + { + "epoch": 0.06797230087358185, + "grad_norm": 3.1118083000183105, + "learning_rate": 2.9659298528904462e-05, + "loss": 1.7291, + "step": 34905 + }, + { + "epoch": 0.0680015111446921, + "grad_norm": 4.10203218460083, + "learning_rate": 2.9659006756116024e-05, + "loss": 1.9676, + "step": 34920 + }, + { + "epoch": 0.06803072141580237, + "grad_norm": 2.487515926361084, + "learning_rate": 2.9658714859882033e-05, + "loss": 1.755, + "step": 34935 + }, + { + "epoch": 0.06805993168691263, + "grad_norm": 4.722667694091797, + "learning_rate": 2.965842284020494e-05, + "loss": 1.6132, + "step": 34950 + }, + { + "epoch": 0.06808914195802289, + "grad_norm": 3.8074474334716797, + "learning_rate": 2.9658130697087206e-05, + "loss": 1.8949, + "step": 34965 + }, + { + "epoch": 0.06811835222913315, + "grad_norm": 3.2412679195404053, + "learning_rate": 2.9657838430531295e-05, + "loss": 1.8062, + "step": 34980 + }, + { + "epoch": 0.06814756250024342, + "grad_norm": 3.2179923057556152, + "learning_rate": 2.9657546040539662e-05, + "loss": 1.7753, + "step": 34995 + }, + { + "epoch": 0.06817677277135369, + "grad_norm": 3.395028591156006, + "learning_rate": 2.9657253527114772e-05, + "loss": 1.7966, + "step": 35010 + }, + { + "epoch": 0.06820598304246395, + "grad_norm": 2.3021628856658936, + "learning_rate": 2.9656960890259093e-05, + "loss": 1.9369, + "step": 35025 + }, + { + "epoch": 0.06823519331357421, + "grad_norm": 2.2561123371124268, + "learning_rate": 2.9656668129975077e-05, + "loss": 1.874, + "step": 35040 + }, + { + "epoch": 0.06826440358468447, + "grad_norm": 3.339489459991455, + "learning_rate": 2.9656375246265205e-05, + "loss": 1.9124, + "step": 35055 + }, + { + "epoch": 0.06829361385579473, + "grad_norm": 3.698767900466919, + "learning_rate": 2.965608223913193e-05, + "loss": 1.9178, + "step": 35070 + }, + { + "epoch": 0.06832282412690499, + "grad_norm": 4.208549499511719, + "learning_rate": 2.965578910857773e-05, + "loss": 2.035, + "step": 35085 + }, + { + "epoch": 0.06835203439801527, + "grad_norm": 3.827305555343628, + "learning_rate": 2.9655495854605067e-05, + "loss": 1.8416, + "step": 35100 + }, + { + "epoch": 0.06838124466912553, + "grad_norm": 3.3755781650543213, + "learning_rate": 2.9655202477216413e-05, + "loss": 1.8389, + "step": 35115 + }, + { + "epoch": 0.06841045494023579, + "grad_norm": 3.1055173873901367, + "learning_rate": 2.9654908976414233e-05, + "loss": 1.7763, + "step": 35130 + }, + { + "epoch": 0.06843966521134605, + "grad_norm": 3.2858526706695557, + "learning_rate": 2.9654615352201006e-05, + "loss": 1.9048, + "step": 35145 + }, + { + "epoch": 0.06846887548245631, + "grad_norm": 2.145545721054077, + "learning_rate": 2.9654321604579208e-05, + "loss": 1.813, + "step": 35160 + }, + { + "epoch": 0.06849808575356657, + "grad_norm": 2.673372268676758, + "learning_rate": 2.9654027733551296e-05, + "loss": 1.8894, + "step": 35175 + }, + { + "epoch": 0.06852729602467683, + "grad_norm": 3.6063785552978516, + "learning_rate": 2.965373373911976e-05, + "loss": 1.7333, + "step": 35190 + }, + { + "epoch": 0.0685565062957871, + "grad_norm": 3.307016611099243, + "learning_rate": 2.9653439621287072e-05, + "loss": 1.9798, + "step": 35205 + }, + { + "epoch": 0.06858571656689737, + "grad_norm": 1.9707258939743042, + "learning_rate": 2.9653145380055713e-05, + "loss": 1.7571, + "step": 35220 + }, + { + "epoch": 0.06861492683800763, + "grad_norm": 3.5209414958953857, + "learning_rate": 2.965285101542815e-05, + "loss": 1.7073, + "step": 35235 + }, + { + "epoch": 0.06864413710911789, + "grad_norm": 3.5485973358154297, + "learning_rate": 2.9652556527406868e-05, + "loss": 1.7296, + "step": 35250 + }, + { + "epoch": 0.06867334738022815, + "grad_norm": 4.829267978668213, + "learning_rate": 2.9652261915994345e-05, + "loss": 1.7223, + "step": 35265 + }, + { + "epoch": 0.06870255765133841, + "grad_norm": 4.529970169067383, + "learning_rate": 2.9651967181193066e-05, + "loss": 1.8972, + "step": 35280 + }, + { + "epoch": 0.06873176792244867, + "grad_norm": 2.448244571685791, + "learning_rate": 2.9651672323005514e-05, + "loss": 1.6871, + "step": 35295 + }, + { + "epoch": 0.06876097819355895, + "grad_norm": 2.37665057182312, + "learning_rate": 2.9651377341434166e-05, + "loss": 1.8861, + "step": 35310 + }, + { + "epoch": 0.06879018846466921, + "grad_norm": 2.69604229927063, + "learning_rate": 2.9651082236481508e-05, + "loss": 1.7115, + "step": 35325 + }, + { + "epoch": 0.06881939873577947, + "grad_norm": 2.311614513397217, + "learning_rate": 2.9650787008150024e-05, + "loss": 1.8526, + "step": 35340 + }, + { + "epoch": 0.06884860900688973, + "grad_norm": 4.728626251220703, + "learning_rate": 2.9650491656442205e-05, + "loss": 1.9195, + "step": 35355 + }, + { + "epoch": 0.06887781927799999, + "grad_norm": 1.917162537574768, + "learning_rate": 2.9650196181360537e-05, + "loss": 1.9401, + "step": 35370 + }, + { + "epoch": 0.06890702954911025, + "grad_norm": 2.5558958053588867, + "learning_rate": 2.96499005829075e-05, + "loss": 1.8347, + "step": 35385 + }, + { + "epoch": 0.06893623982022051, + "grad_norm": 5.177891254425049, + "learning_rate": 2.96496048610856e-05, + "loss": 1.9398, + "step": 35400 + }, + { + "epoch": 0.06896545009133079, + "grad_norm": 4.181756019592285, + "learning_rate": 2.9649309015897306e-05, + "loss": 1.8944, + "step": 35415 + }, + { + "epoch": 0.06899466036244105, + "grad_norm": 2.468771457672119, + "learning_rate": 2.9649013047345123e-05, + "loss": 1.8136, + "step": 35430 + }, + { + "epoch": 0.06902387063355131, + "grad_norm": 4.117799758911133, + "learning_rate": 2.9648716955431545e-05, + "loss": 1.882, + "step": 35445 + }, + { + "epoch": 0.06905308090466157, + "grad_norm": 1.6718223094940186, + "learning_rate": 2.964842074015906e-05, + "loss": 1.908, + "step": 35460 + }, + { + "epoch": 0.06908229117577183, + "grad_norm": 2.493669033050537, + "learning_rate": 2.9648124401530162e-05, + "loss": 1.8696, + "step": 35475 + }, + { + "epoch": 0.06911150144688209, + "grad_norm": 2.0989701747894287, + "learning_rate": 2.964782793954735e-05, + "loss": 1.9898, + "step": 35490 + }, + { + "epoch": 0.06914071171799235, + "grad_norm": 3.0798256397247314, + "learning_rate": 2.9647531354213117e-05, + "loss": 1.7473, + "step": 35505 + }, + { + "epoch": 0.06916992198910263, + "grad_norm": 2.311537265777588, + "learning_rate": 2.9647234645529965e-05, + "loss": 1.9823, + "step": 35520 + }, + { + "epoch": 0.06919913226021289, + "grad_norm": 2.0401742458343506, + "learning_rate": 2.964693781350039e-05, + "loss": 1.8064, + "step": 35535 + }, + { + "epoch": 0.06922834253132315, + "grad_norm": 3.3467860221862793, + "learning_rate": 2.9646640858126888e-05, + "loss": 1.9125, + "step": 35550 + }, + { + "epoch": 0.06925755280243341, + "grad_norm": 2.1629061698913574, + "learning_rate": 2.9646343779411965e-05, + "loss": 1.9639, + "step": 35565 + }, + { + "epoch": 0.06928676307354367, + "grad_norm": 2.8887295722961426, + "learning_rate": 2.9646046577358124e-05, + "loss": 1.839, + "step": 35580 + }, + { + "epoch": 0.06931597334465393, + "grad_norm": 2.5036697387695312, + "learning_rate": 2.964574925196786e-05, + "loss": 1.8771, + "step": 35595 + }, + { + "epoch": 0.0693451836157642, + "grad_norm": 3.0015201568603516, + "learning_rate": 2.9645451803243684e-05, + "loss": 1.7982, + "step": 35610 + }, + { + "epoch": 0.06937439388687446, + "grad_norm": 1.9739888906478882, + "learning_rate": 2.96451542311881e-05, + "loss": 1.8182, + "step": 35625 + }, + { + "epoch": 0.06940360415798473, + "grad_norm": 3.399923801422119, + "learning_rate": 2.964485653580361e-05, + "loss": 1.9865, + "step": 35640 + }, + { + "epoch": 0.06943281442909499, + "grad_norm": 2.0819296836853027, + "learning_rate": 2.9644558717092726e-05, + "loss": 1.7867, + "step": 35655 + }, + { + "epoch": 0.06946202470020525, + "grad_norm": 4.023995876312256, + "learning_rate": 2.964426077505795e-05, + "loss": 1.888, + "step": 35670 + }, + { + "epoch": 0.06949123497131551, + "grad_norm": 3.412802219390869, + "learning_rate": 2.9643962709701797e-05, + "loss": 1.7254, + "step": 35685 + }, + { + "epoch": 0.06952044524242577, + "grad_norm": 5.086963653564453, + "learning_rate": 2.964366452102677e-05, + "loss": 1.8761, + "step": 35700 + }, + { + "epoch": 0.06954965551353604, + "grad_norm": 3.394970417022705, + "learning_rate": 2.964336620903539e-05, + "loss": 1.7277, + "step": 35715 + }, + { + "epoch": 0.0695788657846463, + "grad_norm": 3.2881181240081787, + "learning_rate": 2.964306777373016e-05, + "loss": 1.6754, + "step": 35730 + }, + { + "epoch": 0.06960807605575657, + "grad_norm": 2.9079060554504395, + "learning_rate": 2.96427692151136e-05, + "loss": 1.7099, + "step": 35745 + }, + { + "epoch": 0.06963728632686683, + "grad_norm": 2.8861443996429443, + "learning_rate": 2.964247053318822e-05, + "loss": 2.0041, + "step": 35760 + }, + { + "epoch": 0.06966649659797709, + "grad_norm": 1.9946422576904297, + "learning_rate": 2.9642171727956537e-05, + "loss": 1.6527, + "step": 35775 + }, + { + "epoch": 0.06969570686908735, + "grad_norm": 4.953859806060791, + "learning_rate": 2.9641872799421064e-05, + "loss": 1.8033, + "step": 35790 + }, + { + "epoch": 0.06972491714019761, + "grad_norm": 3.2027766704559326, + "learning_rate": 2.9641573747584324e-05, + "loss": 2.0676, + "step": 35805 + }, + { + "epoch": 0.06975412741130788, + "grad_norm": 3.439577102661133, + "learning_rate": 2.9641274572448833e-05, + "loss": 1.8123, + "step": 35820 + }, + { + "epoch": 0.06978333768241814, + "grad_norm": 3.8689444065093994, + "learning_rate": 2.9640975274017104e-05, + "loss": 1.888, + "step": 35835 + }, + { + "epoch": 0.06981254795352841, + "grad_norm": 2.1769585609436035, + "learning_rate": 2.9640675852291664e-05, + "loss": 1.8283, + "step": 35850 + }, + { + "epoch": 0.06984175822463867, + "grad_norm": 4.714607238769531, + "learning_rate": 2.964037630727504e-05, + "loss": 1.8005, + "step": 35865 + }, + { + "epoch": 0.06987096849574893, + "grad_norm": 3.4551823139190674, + "learning_rate": 2.9640076638969745e-05, + "loss": 1.9842, + "step": 35880 + }, + { + "epoch": 0.0699001787668592, + "grad_norm": 2.2861387729644775, + "learning_rate": 2.96397768473783e-05, + "loss": 1.8149, + "step": 35895 + }, + { + "epoch": 0.06992938903796946, + "grad_norm": 2.4420621395111084, + "learning_rate": 2.963947693250324e-05, + "loss": 2.0058, + "step": 35910 + }, + { + "epoch": 0.06995859930907972, + "grad_norm": 2.770094871520996, + "learning_rate": 2.9639176894347083e-05, + "loss": 2.0128, + "step": 35925 + }, + { + "epoch": 0.06998780958018998, + "grad_norm": 3.4321415424346924, + "learning_rate": 2.9638876732912364e-05, + "loss": 1.7902, + "step": 35940 + }, + { + "epoch": 0.07001701985130025, + "grad_norm": 3.649888753890991, + "learning_rate": 2.96385764482016e-05, + "loss": 1.7774, + "step": 35955 + }, + { + "epoch": 0.07004623012241051, + "grad_norm": 3.11183762550354, + "learning_rate": 2.963827604021733e-05, + "loss": 1.8819, + "step": 35970 + }, + { + "epoch": 0.07007544039352077, + "grad_norm": 3.1074795722961426, + "learning_rate": 2.9637975508962076e-05, + "loss": 1.7814, + "step": 35985 + }, + { + "epoch": 0.07010465066463104, + "grad_norm": 2.314819097518921, + "learning_rate": 2.9637674854438368e-05, + "loss": 2.0389, + "step": 36000 + }, + { + "epoch": 0.0701338609357413, + "grad_norm": 3.201878309249878, + "learning_rate": 2.9637374076648744e-05, + "loss": 1.8632, + "step": 36015 + }, + { + "epoch": 0.07016307120685156, + "grad_norm": 3.392383337020874, + "learning_rate": 2.9637073175595738e-05, + "loss": 1.7767, + "step": 36030 + }, + { + "epoch": 0.07019228147796182, + "grad_norm": 3.2556650638580322, + "learning_rate": 2.9636772151281874e-05, + "loss": 1.748, + "step": 36045 + }, + { + "epoch": 0.07022149174907209, + "grad_norm": 4.08565092086792, + "learning_rate": 2.9636471003709697e-05, + "loss": 1.9722, + "step": 36060 + }, + { + "epoch": 0.07025070202018235, + "grad_norm": 3.140951156616211, + "learning_rate": 2.9636169732881737e-05, + "loss": 1.9794, + "step": 36075 + }, + { + "epoch": 0.07027991229129261, + "grad_norm": 3.052402973175049, + "learning_rate": 2.9635868338800532e-05, + "loss": 1.7071, + "step": 36090 + }, + { + "epoch": 0.07030912256240288, + "grad_norm": 3.39620304107666, + "learning_rate": 2.9635566821468624e-05, + "loss": 2.0134, + "step": 36105 + }, + { + "epoch": 0.07033833283351314, + "grad_norm": 3.394157648086548, + "learning_rate": 2.963526518088855e-05, + "loss": 1.9427, + "step": 36120 + }, + { + "epoch": 0.0703675431046234, + "grad_norm": 2.371034622192383, + "learning_rate": 2.9634963417062847e-05, + "loss": 1.8057, + "step": 36135 + }, + { + "epoch": 0.07039675337573366, + "grad_norm": 2.449495792388916, + "learning_rate": 2.963466152999406e-05, + "loss": 2.0568, + "step": 36150 + }, + { + "epoch": 0.07042596364684393, + "grad_norm": 2.6000113487243652, + "learning_rate": 2.9634359519684732e-05, + "loss": 1.6321, + "step": 36165 + }, + { + "epoch": 0.0704551739179542, + "grad_norm": 4.883264064788818, + "learning_rate": 2.9634057386137405e-05, + "loss": 1.8741, + "step": 36180 + }, + { + "epoch": 0.07048438418906446, + "grad_norm": 3.775376319885254, + "learning_rate": 2.9633755129354614e-05, + "loss": 1.8035, + "step": 36195 + }, + { + "epoch": 0.07051359446017472, + "grad_norm": 2.50616455078125, + "learning_rate": 2.963345274933892e-05, + "loss": 1.7234, + "step": 36210 + }, + { + "epoch": 0.07054280473128498, + "grad_norm": 2.0914623737335205, + "learning_rate": 2.9633150246092862e-05, + "loss": 1.8678, + "step": 36225 + }, + { + "epoch": 0.07057201500239524, + "grad_norm": 2.3975253105163574, + "learning_rate": 2.9632847619618987e-05, + "loss": 2.0389, + "step": 36240 + }, + { + "epoch": 0.0706012252735055, + "grad_norm": 4.599456310272217, + "learning_rate": 2.9632544869919844e-05, + "loss": 1.8546, + "step": 36255 + }, + { + "epoch": 0.07063043554461577, + "grad_norm": 2.8365259170532227, + "learning_rate": 2.9632241996997982e-05, + "loss": 1.8831, + "step": 36270 + }, + { + "epoch": 0.07065964581572604, + "grad_norm": 5.333098888397217, + "learning_rate": 2.963193900085595e-05, + "loss": 1.9734, + "step": 36285 + }, + { + "epoch": 0.0706888560868363, + "grad_norm": 3.1167783737182617, + "learning_rate": 2.9631635881496307e-05, + "loss": 1.8843, + "step": 36300 + }, + { + "epoch": 0.07071806635794656, + "grad_norm": 2.7622556686401367, + "learning_rate": 2.9631332638921597e-05, + "loss": 1.7392, + "step": 36315 + }, + { + "epoch": 0.07074727662905682, + "grad_norm": 2.8044185638427734, + "learning_rate": 2.9631029273134377e-05, + "loss": 1.8026, + "step": 36330 + }, + { + "epoch": 0.07077648690016708, + "grad_norm": 3.532073974609375, + "learning_rate": 2.9630725784137206e-05, + "loss": 1.9251, + "step": 36345 + }, + { + "epoch": 0.07080569717127734, + "grad_norm": 2.9674692153930664, + "learning_rate": 2.963042217193263e-05, + "loss": 1.7559, + "step": 36360 + }, + { + "epoch": 0.07083490744238762, + "grad_norm": 3.634361982345581, + "learning_rate": 2.963011843652321e-05, + "loss": 1.8846, + "step": 36375 + }, + { + "epoch": 0.07086411771349788, + "grad_norm": 3.8142337799072266, + "learning_rate": 2.9629814577911512e-05, + "loss": 1.9241, + "step": 36390 + }, + { + "epoch": 0.07089332798460814, + "grad_norm": 3.4953882694244385, + "learning_rate": 2.9629510596100083e-05, + "loss": 1.6802, + "step": 36405 + }, + { + "epoch": 0.0709225382557184, + "grad_norm": 3.115720272064209, + "learning_rate": 2.962920649109149e-05, + "loss": 1.8446, + "step": 36420 + }, + { + "epoch": 0.07095174852682866, + "grad_norm": 2.615313768386841, + "learning_rate": 2.962890226288829e-05, + "loss": 1.8841, + "step": 36435 + }, + { + "epoch": 0.07098095879793892, + "grad_norm": 2.0485146045684814, + "learning_rate": 2.9628597911493044e-05, + "loss": 1.8237, + "step": 36450 + }, + { + "epoch": 0.07101016906904918, + "grad_norm": 2.433716297149658, + "learning_rate": 2.9628293436908322e-05, + "loss": 1.9638, + "step": 36465 + }, + { + "epoch": 0.07103937934015946, + "grad_norm": 2.5944697856903076, + "learning_rate": 2.962798883913668e-05, + "loss": 1.7643, + "step": 36480 + }, + { + "epoch": 0.07106858961126972, + "grad_norm": 2.829939603805542, + "learning_rate": 2.962768411818069e-05, + "loss": 1.7773, + "step": 36495 + }, + { + "epoch": 0.07109779988237998, + "grad_norm": 4.522650718688965, + "learning_rate": 2.9627379274042914e-05, + "loss": 1.8784, + "step": 36510 + }, + { + "epoch": 0.07112701015349024, + "grad_norm": 2.9910271167755127, + "learning_rate": 2.962707430672592e-05, + "loss": 1.7878, + "step": 36525 + }, + { + "epoch": 0.0711562204246005, + "grad_norm": 3.928048849105835, + "learning_rate": 2.9626769216232272e-05, + "loss": 1.9815, + "step": 36540 + }, + { + "epoch": 0.07118543069571076, + "grad_norm": 2.1179707050323486, + "learning_rate": 2.9626464002564545e-05, + "loss": 1.823, + "step": 36555 + }, + { + "epoch": 0.07121464096682102, + "grad_norm": 3.4463741779327393, + "learning_rate": 2.9626158665725314e-05, + "loss": 2.0457, + "step": 36570 + }, + { + "epoch": 0.07124385123793128, + "grad_norm": 4.101587295532227, + "learning_rate": 2.962585320571714e-05, + "loss": 1.8559, + "step": 36585 + }, + { + "epoch": 0.07127306150904156, + "grad_norm": 3.634690046310425, + "learning_rate": 2.9625547622542594e-05, + "loss": 1.8351, + "step": 36600 + }, + { + "epoch": 0.07130227178015182, + "grad_norm": 2.704759359359741, + "learning_rate": 2.9625241916204257e-05, + "loss": 1.7875, + "step": 36615 + }, + { + "epoch": 0.07133148205126208, + "grad_norm": 2.140449285507202, + "learning_rate": 2.9624936086704705e-05, + "loss": 1.8588, + "step": 36630 + }, + { + "epoch": 0.07136069232237234, + "grad_norm": 4.091612339019775, + "learning_rate": 2.9624630134046504e-05, + "loss": 1.7989, + "step": 36645 + }, + { + "epoch": 0.0713899025934826, + "grad_norm": 2.931297779083252, + "learning_rate": 2.962432405823224e-05, + "loss": 1.8749, + "step": 36660 + }, + { + "epoch": 0.07141911286459286, + "grad_norm": 2.6341919898986816, + "learning_rate": 2.9624017859264485e-05, + "loss": 1.8023, + "step": 36675 + }, + { + "epoch": 0.07144832313570312, + "grad_norm": 4.892661094665527, + "learning_rate": 2.962371153714582e-05, + "loss": 1.7806, + "step": 36690 + }, + { + "epoch": 0.0714775334068134, + "grad_norm": 3.581634998321533, + "learning_rate": 2.962340509187882e-05, + "loss": 1.9455, + "step": 36705 + }, + { + "epoch": 0.07150674367792366, + "grad_norm": 3.0839149951934814, + "learning_rate": 2.9623098523466072e-05, + "loss": 1.8815, + "step": 36720 + }, + { + "epoch": 0.07153595394903392, + "grad_norm": 3.6118085384368896, + "learning_rate": 2.9622791831910152e-05, + "loss": 1.7464, + "step": 36735 + }, + { + "epoch": 0.07156516422014418, + "grad_norm": 2.926898956298828, + "learning_rate": 2.9622485017213646e-05, + "loss": 1.794, + "step": 36750 + }, + { + "epoch": 0.07159437449125444, + "grad_norm": 2.8011724948883057, + "learning_rate": 2.962217807937914e-05, + "loss": 1.9756, + "step": 36765 + }, + { + "epoch": 0.0716235847623647, + "grad_norm": 2.957306146621704, + "learning_rate": 2.9621871018409213e-05, + "loss": 1.799, + "step": 36780 + }, + { + "epoch": 0.07165279503347496, + "grad_norm": 2.094928503036499, + "learning_rate": 2.9621563834306453e-05, + "loss": 1.9149, + "step": 36795 + }, + { + "epoch": 0.07168200530458524, + "grad_norm": 3.5579111576080322, + "learning_rate": 2.962125652707345e-05, + "loss": 1.8073, + "step": 36810 + }, + { + "epoch": 0.0717112155756955, + "grad_norm": 2.3005943298339844, + "learning_rate": 2.962094909671279e-05, + "loss": 1.894, + "step": 36825 + }, + { + "epoch": 0.07174042584680576, + "grad_norm": 1.7549917697906494, + "learning_rate": 2.962064154322706e-05, + "loss": 1.9434, + "step": 36840 + }, + { + "epoch": 0.07176963611791602, + "grad_norm": 2.696244955062866, + "learning_rate": 2.962033386661885e-05, + "loss": 1.8287, + "step": 36855 + }, + { + "epoch": 0.07179884638902628, + "grad_norm": 4.194484233856201, + "learning_rate": 2.9620026066890756e-05, + "loss": 1.8381, + "step": 36870 + }, + { + "epoch": 0.07182805666013654, + "grad_norm": 4.518919944763184, + "learning_rate": 2.961971814404536e-05, + "loss": 1.9311, + "step": 36885 + }, + { + "epoch": 0.0718572669312468, + "grad_norm": 2.8230600357055664, + "learning_rate": 2.9619410098085267e-05, + "loss": 1.9481, + "step": 36900 + }, + { + "epoch": 0.07188647720235708, + "grad_norm": 2.9653265476226807, + "learning_rate": 2.961910192901306e-05, + "loss": 1.9819, + "step": 36915 + }, + { + "epoch": 0.07191568747346734, + "grad_norm": 4.166685104370117, + "learning_rate": 2.9618793636831343e-05, + "loss": 1.902, + "step": 36930 + }, + { + "epoch": 0.0719448977445776, + "grad_norm": 3.7438416481018066, + "learning_rate": 2.9618485221542707e-05, + "loss": 1.9503, + "step": 36945 + }, + { + "epoch": 0.07197410801568786, + "grad_norm": 3.582324266433716, + "learning_rate": 2.9618176683149753e-05, + "loss": 1.9552, + "step": 36960 + }, + { + "epoch": 0.07200331828679812, + "grad_norm": 3.1160881519317627, + "learning_rate": 2.9617868021655077e-05, + "loss": 2.0002, + "step": 36975 + }, + { + "epoch": 0.07203252855790838, + "grad_norm": 4.370865345001221, + "learning_rate": 2.961755923706128e-05, + "loss": 1.8732, + "step": 36990 + }, + { + "epoch": 0.07206173882901865, + "grad_norm": 3.8529598712921143, + "learning_rate": 2.961725032937096e-05, + "loss": 1.7087, + "step": 37005 + }, + { + "epoch": 0.07209094910012892, + "grad_norm": 3.0339152812957764, + "learning_rate": 2.9616941298586717e-05, + "loss": 1.8224, + "step": 37020 + }, + { + "epoch": 0.07212015937123918, + "grad_norm": 2.1726677417755127, + "learning_rate": 2.9616632144711155e-05, + "loss": 1.8681, + "step": 37035 + }, + { + "epoch": 0.07214936964234944, + "grad_norm": 2.9488229751586914, + "learning_rate": 2.961632286774688e-05, + "loss": 1.827, + "step": 37050 + }, + { + "epoch": 0.0721785799134597, + "grad_norm": 2.599917411804199, + "learning_rate": 2.9616013467696494e-05, + "loss": 2.0164, + "step": 37065 + }, + { + "epoch": 0.07220779018456996, + "grad_norm": 2.8067574501037598, + "learning_rate": 2.9615703944562604e-05, + "loss": 1.8203, + "step": 37080 + }, + { + "epoch": 0.07223700045568023, + "grad_norm": 3.9336917400360107, + "learning_rate": 2.9615394298347814e-05, + "loss": 1.7452, + "step": 37095 + }, + { + "epoch": 0.07226621072679049, + "grad_norm": 3.1062943935394287, + "learning_rate": 2.9615084529054732e-05, + "loss": 1.9874, + "step": 37110 + }, + { + "epoch": 0.07229542099790076, + "grad_norm": 2.639976739883423, + "learning_rate": 2.961477463668597e-05, + "loss": 1.8517, + "step": 37125 + }, + { + "epoch": 0.07232463126901102, + "grad_norm": 4.7184529304504395, + "learning_rate": 2.9614464621244135e-05, + "loss": 1.7864, + "step": 37140 + }, + { + "epoch": 0.07235384154012128, + "grad_norm": 3.362306594848633, + "learning_rate": 2.9614154482731838e-05, + "loss": 1.7878, + "step": 37155 + }, + { + "epoch": 0.07238305181123154, + "grad_norm": 3.6693410873413086, + "learning_rate": 2.9613844221151688e-05, + "loss": 1.8032, + "step": 37170 + }, + { + "epoch": 0.0724122620823418, + "grad_norm": 3.523045539855957, + "learning_rate": 2.9613533836506304e-05, + "loss": 1.7877, + "step": 37185 + }, + { + "epoch": 0.07244147235345207, + "grad_norm": 3.0142440795898438, + "learning_rate": 2.961322332879829e-05, + "loss": 1.8242, + "step": 37200 + }, + { + "epoch": 0.07247068262456233, + "grad_norm": 2.044818878173828, + "learning_rate": 2.9612912698030275e-05, + "loss": 1.7869, + "step": 37215 + }, + { + "epoch": 0.0724998928956726, + "grad_norm": 2.8730952739715576, + "learning_rate": 2.9612601944204866e-05, + "loss": 2.0202, + "step": 37230 + }, + { + "epoch": 0.07252910316678286, + "grad_norm": 2.93044114112854, + "learning_rate": 2.9612291067324675e-05, + "loss": 1.9302, + "step": 37245 + }, + { + "epoch": 0.07255831343789312, + "grad_norm": 3.5285401344299316, + "learning_rate": 2.961198006739233e-05, + "loss": 2.0005, + "step": 37260 + }, + { + "epoch": 0.07258752370900338, + "grad_norm": 3.873157262802124, + "learning_rate": 2.9611668944410446e-05, + "loss": 1.869, + "step": 37275 + }, + { + "epoch": 0.07261673398011365, + "grad_norm": 4.3793840408325195, + "learning_rate": 2.961135769838164e-05, + "loss": 1.8881, + "step": 37290 + }, + { + "epoch": 0.0726459442512239, + "grad_norm": 1.939467191696167, + "learning_rate": 2.9611046329308538e-05, + "loss": 1.8512, + "step": 37305 + }, + { + "epoch": 0.07267515452233417, + "grad_norm": 2.9882404804229736, + "learning_rate": 2.961073483719376e-05, + "loss": 1.672, + "step": 37320 + }, + { + "epoch": 0.07270436479344444, + "grad_norm": 1.6897094249725342, + "learning_rate": 2.961042322203993e-05, + "loss": 1.9174, + "step": 37335 + }, + { + "epoch": 0.0727335750645547, + "grad_norm": 4.178640842437744, + "learning_rate": 2.9610111483849668e-05, + "loss": 1.9587, + "step": 37350 + }, + { + "epoch": 0.07276278533566496, + "grad_norm": 4.1040825843811035, + "learning_rate": 2.9609799622625604e-05, + "loss": 1.7908, + "step": 37365 + }, + { + "epoch": 0.07279199560677523, + "grad_norm": 2.250300168991089, + "learning_rate": 2.960948763837036e-05, + "loss": 1.8558, + "step": 37380 + }, + { + "epoch": 0.07282120587788549, + "grad_norm": 3.2331349849700928, + "learning_rate": 2.960917553108657e-05, + "loss": 1.8444, + "step": 37395 + }, + { + "epoch": 0.07285041614899575, + "grad_norm": 3.2942092418670654, + "learning_rate": 2.9608863300776855e-05, + "loss": 1.8456, + "step": 37410 + }, + { + "epoch": 0.07287962642010601, + "grad_norm": 2.708117961883545, + "learning_rate": 2.960855094744385e-05, + "loss": 1.8107, + "step": 37425 + }, + { + "epoch": 0.07290883669121628, + "grad_norm": 5.859858512878418, + "learning_rate": 2.9608238471090187e-05, + "loss": 1.8856, + "step": 37440 + }, + { + "epoch": 0.07293804696232654, + "grad_norm": 2.5912370681762695, + "learning_rate": 2.9607925871718486e-05, + "loss": 1.802, + "step": 37455 + }, + { + "epoch": 0.0729672572334368, + "grad_norm": 2.456089973449707, + "learning_rate": 2.960761314933139e-05, + "loss": 1.9207, + "step": 37470 + }, + { + "epoch": 0.07299646750454707, + "grad_norm": 3.9020590782165527, + "learning_rate": 2.9607300303931526e-05, + "loss": 1.762, + "step": 37485 + }, + { + "epoch": 0.07302567777565733, + "grad_norm": 2.20559024810791, + "learning_rate": 2.960698733552153e-05, + "loss": 1.9261, + "step": 37500 + }, + { + "epoch": 0.07305488804676759, + "grad_norm": 2.871424913406372, + "learning_rate": 2.9606674244104048e-05, + "loss": 1.8707, + "step": 37515 + }, + { + "epoch": 0.07308409831787785, + "grad_norm": 2.1989009380340576, + "learning_rate": 2.96063610296817e-05, + "loss": 2.0706, + "step": 37530 + }, + { + "epoch": 0.07311330858898812, + "grad_norm": 2.3892788887023926, + "learning_rate": 2.9606047692257134e-05, + "loss": 1.8737, + "step": 37545 + }, + { + "epoch": 0.07314251886009838, + "grad_norm": 1.9396522045135498, + "learning_rate": 2.9605734231832992e-05, + "loss": 1.8575, + "step": 37560 + }, + { + "epoch": 0.07317172913120865, + "grad_norm": 4.139869689941406, + "learning_rate": 2.96054206484119e-05, + "loss": 1.7875, + "step": 37575 + }, + { + "epoch": 0.07320093940231891, + "grad_norm": 4.462748050689697, + "learning_rate": 2.960510694199651e-05, + "loss": 1.9463, + "step": 37590 + }, + { + "epoch": 0.07323014967342917, + "grad_norm": 2.580022096633911, + "learning_rate": 2.9604793112589458e-05, + "loss": 1.9219, + "step": 37605 + }, + { + "epoch": 0.07325935994453943, + "grad_norm": 2.900674343109131, + "learning_rate": 2.960447916019339e-05, + "loss": 1.8848, + "step": 37620 + }, + { + "epoch": 0.07328857021564969, + "grad_norm": 4.659104347229004, + "learning_rate": 2.960416508481095e-05, + "loss": 1.7897, + "step": 37635 + }, + { + "epoch": 0.07331778048675995, + "grad_norm": 3.549299955368042, + "learning_rate": 2.960385088644478e-05, + "loss": 1.9, + "step": 37650 + }, + { + "epoch": 0.07334699075787023, + "grad_norm": 2.934943675994873, + "learning_rate": 2.9603536565097526e-05, + "loss": 1.8177, + "step": 37665 + }, + { + "epoch": 0.07337620102898049, + "grad_norm": 2.9049174785614014, + "learning_rate": 2.960322212077184e-05, + "loss": 1.7554, + "step": 37680 + }, + { + "epoch": 0.07340541130009075, + "grad_norm": 2.8519704341888428, + "learning_rate": 2.9602907553470366e-05, + "loss": 1.8584, + "step": 37695 + }, + { + "epoch": 0.07343462157120101, + "grad_norm": 4.067868709564209, + "learning_rate": 2.9602592863195754e-05, + "loss": 1.8013, + "step": 37710 + }, + { + "epoch": 0.07346383184231127, + "grad_norm": 2.65765118598938, + "learning_rate": 2.9602278049950654e-05, + "loss": 1.9897, + "step": 37725 + }, + { + "epoch": 0.07349304211342153, + "grad_norm": 2.998044967651367, + "learning_rate": 2.9601963113737718e-05, + "loss": 1.8279, + "step": 37740 + }, + { + "epoch": 0.07352225238453179, + "grad_norm": 2.2030725479125977, + "learning_rate": 2.9601648054559594e-05, + "loss": 1.939, + "step": 37755 + }, + { + "epoch": 0.07355146265564207, + "grad_norm": 5.181911945343018, + "learning_rate": 2.960133287241894e-05, + "loss": 1.7382, + "step": 37770 + }, + { + "epoch": 0.07358067292675233, + "grad_norm": 3.682950496673584, + "learning_rate": 2.960101756731841e-05, + "loss": 1.8002, + "step": 37785 + }, + { + "epoch": 0.07360988319786259, + "grad_norm": 3.4098355770111084, + "learning_rate": 2.960070213926065e-05, + "loss": 1.8048, + "step": 37800 + }, + { + "epoch": 0.07363909346897285, + "grad_norm": 2.7876944541931152, + "learning_rate": 2.9600386588248333e-05, + "loss": 1.7966, + "step": 37815 + }, + { + "epoch": 0.07366830374008311, + "grad_norm": 3.52449107170105, + "learning_rate": 2.9600070914284103e-05, + "loss": 1.861, + "step": 37830 + }, + { + "epoch": 0.07369751401119337, + "grad_norm": 3.2730326652526855, + "learning_rate": 2.959975511737062e-05, + "loss": 1.9195, + "step": 37845 + }, + { + "epoch": 0.07372672428230363, + "grad_norm": 3.7927465438842773, + "learning_rate": 2.9599439197510553e-05, + "loss": 1.9699, + "step": 37860 + }, + { + "epoch": 0.07375593455341391, + "grad_norm": 2.2197682857513428, + "learning_rate": 2.9599123154706545e-05, + "loss": 1.9252, + "step": 37875 + }, + { + "epoch": 0.07378514482452417, + "grad_norm": 2.5548315048217773, + "learning_rate": 2.9598806988961275e-05, + "loss": 1.933, + "step": 37890 + }, + { + "epoch": 0.07381435509563443, + "grad_norm": 2.335618734359741, + "learning_rate": 2.9598490700277395e-05, + "loss": 1.8436, + "step": 37905 + }, + { + "epoch": 0.07384356536674469, + "grad_norm": 3.7140378952026367, + "learning_rate": 2.9598174288657573e-05, + "loss": 1.7947, + "step": 37920 + }, + { + "epoch": 0.07387277563785495, + "grad_norm": 3.628742218017578, + "learning_rate": 2.9597857754104474e-05, + "loss": 1.8114, + "step": 37935 + }, + { + "epoch": 0.07390198590896521, + "grad_norm": 2.147961139678955, + "learning_rate": 2.9597541096620758e-05, + "loss": 1.799, + "step": 37950 + }, + { + "epoch": 0.07393119618007547, + "grad_norm": 3.9110617637634277, + "learning_rate": 2.9597224316209096e-05, + "loss": 1.9354, + "step": 37965 + }, + { + "epoch": 0.07396040645118575, + "grad_norm": 3.213088274002075, + "learning_rate": 2.9596907412872156e-05, + "loss": 1.9373, + "step": 37980 + }, + { + "epoch": 0.07398961672229601, + "grad_norm": 2.0284922122955322, + "learning_rate": 2.9596590386612607e-05, + "loss": 1.8561, + "step": 37995 + }, + { + "epoch": 0.07401882699340627, + "grad_norm": 1.9195611476898193, + "learning_rate": 2.9596273237433116e-05, + "loss": 1.7875, + "step": 38010 + }, + { + "epoch": 0.07404803726451653, + "grad_norm": 1.8990719318389893, + "learning_rate": 2.9595955965336358e-05, + "loss": 1.886, + "step": 38025 + }, + { + "epoch": 0.07407724753562679, + "grad_norm": 2.7792749404907227, + "learning_rate": 2.9595638570324995e-05, + "loss": 1.9444, + "step": 38040 + }, + { + "epoch": 0.07410645780673705, + "grad_norm": 3.936563730239868, + "learning_rate": 2.9595321052401712e-05, + "loss": 1.8684, + "step": 38055 + }, + { + "epoch": 0.07413566807784731, + "grad_norm": 2.845834255218506, + "learning_rate": 2.9595003411569178e-05, + "loss": 1.8743, + "step": 38070 + }, + { + "epoch": 0.07416487834895759, + "grad_norm": 1.712980031967163, + "learning_rate": 2.9594685647830065e-05, + "loss": 1.8693, + "step": 38085 + }, + { + "epoch": 0.07419408862006785, + "grad_norm": 4.104766845703125, + "learning_rate": 2.959436776118705e-05, + "loss": 1.9156, + "step": 38100 + }, + { + "epoch": 0.07422329889117811, + "grad_norm": 4.8171281814575195, + "learning_rate": 2.9594049751642816e-05, + "loss": 1.8934, + "step": 38115 + }, + { + "epoch": 0.07425250916228837, + "grad_norm": 3.2426438331604004, + "learning_rate": 2.9593731619200037e-05, + "loss": 1.8768, + "step": 38130 + }, + { + "epoch": 0.07428171943339863, + "grad_norm": 4.016167640686035, + "learning_rate": 2.9593413363861386e-05, + "loss": 1.8933, + "step": 38145 + }, + { + "epoch": 0.0743109297045089, + "grad_norm": 1.8209270238876343, + "learning_rate": 2.9593094985629553e-05, + "loss": 1.704, + "step": 38160 + }, + { + "epoch": 0.07434013997561915, + "grad_norm": 2.429961681365967, + "learning_rate": 2.9592776484507213e-05, + "loss": 1.9408, + "step": 38175 + }, + { + "epoch": 0.07436935024672943, + "grad_norm": 4.900204181671143, + "learning_rate": 2.9592457860497048e-05, + "loss": 1.8678, + "step": 38190 + }, + { + "epoch": 0.07439856051783969, + "grad_norm": 2.6684536933898926, + "learning_rate": 2.959213911360175e-05, + "loss": 1.8831, + "step": 38205 + }, + { + "epoch": 0.07442777078894995, + "grad_norm": 3.2485227584838867, + "learning_rate": 2.959182024382399e-05, + "loss": 1.9345, + "step": 38220 + }, + { + "epoch": 0.07445698106006021, + "grad_norm": 2.1203527450561523, + "learning_rate": 2.959150125116646e-05, + "loss": 1.7432, + "step": 38235 + }, + { + "epoch": 0.07448619133117047, + "grad_norm": 4.162999629974365, + "learning_rate": 2.9591182135631848e-05, + "loss": 1.8684, + "step": 38250 + }, + { + "epoch": 0.07451540160228073, + "grad_norm": 2.8070929050445557, + "learning_rate": 2.959086289722284e-05, + "loss": 1.7381, + "step": 38265 + }, + { + "epoch": 0.074544611873391, + "grad_norm": 2.4631731510162354, + "learning_rate": 2.959054353594212e-05, + "loss": 1.5987, + "step": 38280 + }, + { + "epoch": 0.07457382214450127, + "grad_norm": 3.986906051635742, + "learning_rate": 2.9590224051792377e-05, + "loss": 1.9416, + "step": 38295 + }, + { + "epoch": 0.07460303241561153, + "grad_norm": 2.761138439178467, + "learning_rate": 2.9589904444776313e-05, + "loss": 1.819, + "step": 38310 + }, + { + "epoch": 0.07463224268672179, + "grad_norm": 5.958553791046143, + "learning_rate": 2.958958471489661e-05, + "loss": 2.0435, + "step": 38325 + }, + { + "epoch": 0.07466145295783205, + "grad_norm": 4.8247199058532715, + "learning_rate": 2.958926486215596e-05, + "loss": 2.0231, + "step": 38340 + }, + { + "epoch": 0.07469066322894231, + "grad_norm": 2.023098945617676, + "learning_rate": 2.958894488655706e-05, + "loss": 1.6504, + "step": 38355 + }, + { + "epoch": 0.07471987350005257, + "grad_norm": 2.826117992401123, + "learning_rate": 2.9588624788102604e-05, + "loss": 1.6953, + "step": 38370 + }, + { + "epoch": 0.07474908377116284, + "grad_norm": 2.362399101257324, + "learning_rate": 2.9588304566795282e-05, + "loss": 1.8232, + "step": 38385 + }, + { + "epoch": 0.07477829404227311, + "grad_norm": 2.825855255126953, + "learning_rate": 2.9587984222637804e-05, + "loss": 1.8065, + "step": 38400 + }, + { + "epoch": 0.07480750431338337, + "grad_norm": 2.7027783393859863, + "learning_rate": 2.9587663755632854e-05, + "loss": 1.849, + "step": 38415 + }, + { + "epoch": 0.07483671458449363, + "grad_norm": 3.6305863857269287, + "learning_rate": 2.9587343165783136e-05, + "loss": 1.8779, + "step": 38430 + }, + { + "epoch": 0.0748659248556039, + "grad_norm": 2.5080883502960205, + "learning_rate": 2.958702245309135e-05, + "loss": 2.1624, + "step": 38445 + }, + { + "epoch": 0.07489513512671415, + "grad_norm": 2.4447906017303467, + "learning_rate": 2.9586701617560197e-05, + "loss": 1.837, + "step": 38460 + }, + { + "epoch": 0.07492434539782442, + "grad_norm": 2.823728084564209, + "learning_rate": 2.9586380659192373e-05, + "loss": 1.8959, + "step": 38475 + }, + { + "epoch": 0.07495355566893468, + "grad_norm": 3.1042325496673584, + "learning_rate": 2.9586059577990593e-05, + "loss": 1.8348, + "step": 38490 + }, + { + "epoch": 0.07498276594004495, + "grad_norm": 3.0661540031433105, + "learning_rate": 2.958573837395755e-05, + "loss": 2.1726, + "step": 38505 + }, + { + "epoch": 0.07501197621115521, + "grad_norm": 2.4136931896209717, + "learning_rate": 2.9585417047095956e-05, + "loss": 1.8825, + "step": 38520 + }, + { + "epoch": 0.07504118648226547, + "grad_norm": 3.421844244003296, + "learning_rate": 2.9585095597408508e-05, + "loss": 1.85, + "step": 38535 + }, + { + "epoch": 0.07507039675337573, + "grad_norm": 3.8951923847198486, + "learning_rate": 2.9584774024897924e-05, + "loss": 1.9928, + "step": 38550 + }, + { + "epoch": 0.075099607024486, + "grad_norm": 4.008039951324463, + "learning_rate": 2.9584452329566906e-05, + "loss": 1.8238, + "step": 38565 + }, + { + "epoch": 0.07512881729559626, + "grad_norm": 2.5100114345550537, + "learning_rate": 2.9584130511418164e-05, + "loss": 1.854, + "step": 38580 + }, + { + "epoch": 0.07515802756670652, + "grad_norm": 3.6023712158203125, + "learning_rate": 2.9583808570454404e-05, + "loss": 2.0781, + "step": 38595 + }, + { + "epoch": 0.07518723783781679, + "grad_norm": 2.3269500732421875, + "learning_rate": 2.9583486506678342e-05, + "loss": 1.822, + "step": 38610 + }, + { + "epoch": 0.07521644810892705, + "grad_norm": 4.363159656524658, + "learning_rate": 2.9583164320092693e-05, + "loss": 2.0055, + "step": 38625 + }, + { + "epoch": 0.07524565838003731, + "grad_norm": 4.24274206161499, + "learning_rate": 2.9582842010700163e-05, + "loss": 1.9181, + "step": 38640 + }, + { + "epoch": 0.07527486865114758, + "grad_norm": 2.5762596130371094, + "learning_rate": 2.9582519578503468e-05, + "loss": 1.8822, + "step": 38655 + }, + { + "epoch": 0.07530407892225784, + "grad_norm": 2.1492116451263428, + "learning_rate": 2.958219702350533e-05, + "loss": 1.9506, + "step": 38670 + }, + { + "epoch": 0.0753332891933681, + "grad_norm": 3.317279815673828, + "learning_rate": 2.9581874345708455e-05, + "loss": 1.8698, + "step": 38685 + }, + { + "epoch": 0.07536249946447836, + "grad_norm": 2.714073419570923, + "learning_rate": 2.9581551545115565e-05, + "loss": 2.0611, + "step": 38700 + }, + { + "epoch": 0.07539170973558862, + "grad_norm": 2.718071699142456, + "learning_rate": 2.958122862172938e-05, + "loss": 1.9292, + "step": 38715 + }, + { + "epoch": 0.0754209200066989, + "grad_norm": 2.4822440147399902, + "learning_rate": 2.958090557555262e-05, + "loss": 1.9971, + "step": 38730 + }, + { + "epoch": 0.07545013027780915, + "grad_norm": 3.417344808578491, + "learning_rate": 2.9580582406588005e-05, + "loss": 1.8292, + "step": 38745 + }, + { + "epoch": 0.07547934054891942, + "grad_norm": 2.5632808208465576, + "learning_rate": 2.958025911483825e-05, + "loss": 1.816, + "step": 38760 + }, + { + "epoch": 0.07550855082002968, + "grad_norm": 2.580145835876465, + "learning_rate": 2.9579935700306085e-05, + "loss": 1.9879, + "step": 38775 + }, + { + "epoch": 0.07553776109113994, + "grad_norm": 3.973757266998291, + "learning_rate": 2.9579612162994228e-05, + "loss": 1.9594, + "step": 38790 + }, + { + "epoch": 0.0755669713622502, + "grad_norm": 3.92153000831604, + "learning_rate": 2.957928850290541e-05, + "loss": 1.773, + "step": 38805 + }, + { + "epoch": 0.07559618163336046, + "grad_norm": 2.885376214981079, + "learning_rate": 2.9578964720042353e-05, + "loss": 2.2071, + "step": 38820 + }, + { + "epoch": 0.07562539190447073, + "grad_norm": 2.7979674339294434, + "learning_rate": 2.9578640814407777e-05, + "loss": 1.9249, + "step": 38835 + }, + { + "epoch": 0.075654602175581, + "grad_norm": 2.843914270401001, + "learning_rate": 2.9578316786004425e-05, + "loss": 1.9859, + "step": 38850 + }, + { + "epoch": 0.07568381244669126, + "grad_norm": 4.267323017120361, + "learning_rate": 2.9577992634835014e-05, + "loss": 2.2496, + "step": 38865 + }, + { + "epoch": 0.07571302271780152, + "grad_norm": 1.9718761444091797, + "learning_rate": 2.9577668360902276e-05, + "loss": 1.8447, + "step": 38880 + }, + { + "epoch": 0.07574223298891178, + "grad_norm": 4.806777000427246, + "learning_rate": 2.9577343964208944e-05, + "loss": 1.7591, + "step": 38895 + }, + { + "epoch": 0.07577144326002204, + "grad_norm": 2.606508255004883, + "learning_rate": 2.9577019444757746e-05, + "loss": 1.7231, + "step": 38910 + }, + { + "epoch": 0.0758006535311323, + "grad_norm": 3.9986696243286133, + "learning_rate": 2.9576694802551416e-05, + "loss": 1.8346, + "step": 38925 + }, + { + "epoch": 0.07582986380224258, + "grad_norm": 2.828104019165039, + "learning_rate": 2.9576370037592693e-05, + "loss": 1.9536, + "step": 38940 + }, + { + "epoch": 0.07585907407335284, + "grad_norm": 4.390168190002441, + "learning_rate": 2.9576045149884305e-05, + "loss": 1.8409, + "step": 38955 + }, + { + "epoch": 0.0758882843444631, + "grad_norm": 2.9613535404205322, + "learning_rate": 2.957572013942899e-05, + "loss": 1.8244, + "step": 38970 + }, + { + "epoch": 0.07591749461557336, + "grad_norm": 2.586947441101074, + "learning_rate": 2.9575395006229488e-05, + "loss": 1.775, + "step": 38985 + }, + { + "epoch": 0.07594670488668362, + "grad_norm": 1.6005862951278687, + "learning_rate": 2.9575069750288533e-05, + "loss": 1.7216, + "step": 39000 + }, + { + "epoch": 0.07597591515779388, + "grad_norm": 3.558748960494995, + "learning_rate": 2.9574744371608865e-05, + "loss": 1.8911, + "step": 39015 + }, + { + "epoch": 0.07600512542890414, + "grad_norm": 3.7911698818206787, + "learning_rate": 2.957441887019323e-05, + "loss": 1.935, + "step": 39030 + }, + { + "epoch": 0.07603433570001442, + "grad_norm": 3.4796767234802246, + "learning_rate": 2.957409324604436e-05, + "loss": 1.7889, + "step": 39045 + }, + { + "epoch": 0.07606354597112468, + "grad_norm": 4.014665126800537, + "learning_rate": 2.9573767499165e-05, + "loss": 1.8464, + "step": 39060 + }, + { + "epoch": 0.07609275624223494, + "grad_norm": 3.5164144039154053, + "learning_rate": 2.9573441629557895e-05, + "loss": 1.8483, + "step": 39075 + }, + { + "epoch": 0.0761219665133452, + "grad_norm": 2.4140758514404297, + "learning_rate": 2.957311563722579e-05, + "loss": 1.7548, + "step": 39090 + }, + { + "epoch": 0.07615117678445546, + "grad_norm": 2.7079226970672607, + "learning_rate": 2.957278952217143e-05, + "loss": 1.9792, + "step": 39105 + }, + { + "epoch": 0.07618038705556572, + "grad_norm": 3.3955190181732178, + "learning_rate": 2.9572463284397554e-05, + "loss": 1.8887, + "step": 39120 + }, + { + "epoch": 0.07620959732667598, + "grad_norm": 2.3643722534179688, + "learning_rate": 2.9572136923906916e-05, + "loss": 1.8438, + "step": 39135 + }, + { + "epoch": 0.07623880759778626, + "grad_norm": 3.363495111465454, + "learning_rate": 2.957181044070227e-05, + "loss": 1.9439, + "step": 39150 + }, + { + "epoch": 0.07626801786889652, + "grad_norm": 2.454704523086548, + "learning_rate": 2.957148383478635e-05, + "loss": 1.8084, + "step": 39165 + }, + { + "epoch": 0.07629722814000678, + "grad_norm": 3.183285713195801, + "learning_rate": 2.957115710616192e-05, + "loss": 2.0987, + "step": 39180 + }, + { + "epoch": 0.07632643841111704, + "grad_norm": 4.382565975189209, + "learning_rate": 2.9570830254831726e-05, + "loss": 1.7397, + "step": 39195 + }, + { + "epoch": 0.0763556486822273, + "grad_norm": 4.5598320960998535, + "learning_rate": 2.9570503280798523e-05, + "loss": 1.8252, + "step": 39210 + }, + { + "epoch": 0.07638485895333756, + "grad_norm": 4.986062049865723, + "learning_rate": 2.957017618406506e-05, + "loss": 1.7812, + "step": 39225 + }, + { + "epoch": 0.07641406922444782, + "grad_norm": 3.9942119121551514, + "learning_rate": 2.9569848964634098e-05, + "loss": 1.8359, + "step": 39240 + }, + { + "epoch": 0.0764432794955581, + "grad_norm": 3.4912500381469727, + "learning_rate": 2.9569521622508385e-05, + "loss": 1.8588, + "step": 39255 + }, + { + "epoch": 0.07647248976666836, + "grad_norm": 2.060900926589966, + "learning_rate": 2.9569194157690682e-05, + "loss": 1.9333, + "step": 39270 + }, + { + "epoch": 0.07650170003777862, + "grad_norm": 3.9621095657348633, + "learning_rate": 2.9568866570183746e-05, + "loss": 1.9239, + "step": 39285 + }, + { + "epoch": 0.07653091030888888, + "grad_norm": 3.5381596088409424, + "learning_rate": 2.9568538859990336e-05, + "loss": 1.9451, + "step": 39300 + }, + { + "epoch": 0.07656012057999914, + "grad_norm": 3.013927936553955, + "learning_rate": 2.9568211027113212e-05, + "loss": 1.8415, + "step": 39315 + }, + { + "epoch": 0.0765893308511094, + "grad_norm": 3.284257650375366, + "learning_rate": 2.9567883071555136e-05, + "loss": 1.9289, + "step": 39330 + }, + { + "epoch": 0.07661854112221966, + "grad_norm": 2.15523099899292, + "learning_rate": 2.9567554993318863e-05, + "loss": 1.9155, + "step": 39345 + }, + { + "epoch": 0.07664775139332994, + "grad_norm": 3.1030516624450684, + "learning_rate": 2.9567226792407164e-05, + "loss": 1.9226, + "step": 39360 + }, + { + "epoch": 0.0766769616644402, + "grad_norm": 2.698444128036499, + "learning_rate": 2.9566898468822797e-05, + "loss": 1.8306, + "step": 39375 + }, + { + "epoch": 0.07670617193555046, + "grad_norm": 2.724808931350708, + "learning_rate": 2.956657002256853e-05, + "loss": 1.8251, + "step": 39390 + }, + { + "epoch": 0.07673538220666072, + "grad_norm": 2.9143025875091553, + "learning_rate": 2.9566241453647123e-05, + "loss": 1.755, + "step": 39405 + }, + { + "epoch": 0.07676459247777098, + "grad_norm": 3.780363082885742, + "learning_rate": 2.9565912762061357e-05, + "loss": 1.8865, + "step": 39420 + }, + { + "epoch": 0.07679380274888124, + "grad_norm": 2.4257946014404297, + "learning_rate": 2.9565583947813985e-05, + "loss": 1.8871, + "step": 39435 + }, + { + "epoch": 0.0768230130199915, + "grad_norm": 2.0739023685455322, + "learning_rate": 2.9565255010907785e-05, + "loss": 1.9244, + "step": 39450 + }, + { + "epoch": 0.07685222329110178, + "grad_norm": 4.436620712280273, + "learning_rate": 2.9564925951345526e-05, + "loss": 1.9081, + "step": 39465 + }, + { + "epoch": 0.07688143356221204, + "grad_norm": 2.5859375, + "learning_rate": 2.9564596769129976e-05, + "loss": 1.8228, + "step": 39480 + }, + { + "epoch": 0.0769106438333223, + "grad_norm": 2.353349208831787, + "learning_rate": 2.9564267464263906e-05, + "loss": 1.7626, + "step": 39495 + }, + { + "epoch": 0.07693985410443256, + "grad_norm": 2.6317977905273438, + "learning_rate": 2.956393803675009e-05, + "loss": 1.8967, + "step": 39510 + }, + { + "epoch": 0.07696906437554282, + "grad_norm": 2.7638471126556396, + "learning_rate": 2.9563608486591305e-05, + "loss": 2.1839, + "step": 39525 + }, + { + "epoch": 0.07699827464665308, + "grad_norm": 3.3399016857147217, + "learning_rate": 2.9563278813790325e-05, + "loss": 1.6782, + "step": 39540 + }, + { + "epoch": 0.07702748491776334, + "grad_norm": 4.528406620025635, + "learning_rate": 2.9562949018349927e-05, + "loss": 1.8966, + "step": 39555 + }, + { + "epoch": 0.07705669518887362, + "grad_norm": 3.12764573097229, + "learning_rate": 2.9562619100272884e-05, + "loss": 1.7098, + "step": 39570 + }, + { + "epoch": 0.07708590545998388, + "grad_norm": 3.5218427181243896, + "learning_rate": 2.956228905956198e-05, + "loss": 1.8546, + "step": 39585 + }, + { + "epoch": 0.07711511573109414, + "grad_norm": 3.075268268585205, + "learning_rate": 2.956195889621999e-05, + "loss": 1.5858, + "step": 39600 + }, + { + "epoch": 0.0771443260022044, + "grad_norm": 2.4391558170318604, + "learning_rate": 2.9561628610249696e-05, + "loss": 1.7186, + "step": 39615 + }, + { + "epoch": 0.07717353627331466, + "grad_norm": 2.846036195755005, + "learning_rate": 2.956129820165388e-05, + "loss": 1.768, + "step": 39630 + }, + { + "epoch": 0.07720274654442492, + "grad_norm": 4.640497207641602, + "learning_rate": 2.9560967670435326e-05, + "loss": 1.8488, + "step": 39645 + }, + { + "epoch": 0.07723195681553519, + "grad_norm": 4.146282196044922, + "learning_rate": 2.9560637016596815e-05, + "loss": 2.0399, + "step": 39660 + }, + { + "epoch": 0.07726116708664546, + "grad_norm": 2.2617909908294678, + "learning_rate": 2.9560306240141127e-05, + "loss": 1.7802, + "step": 39675 + }, + { + "epoch": 0.07729037735775572, + "grad_norm": 2.9520163536071777, + "learning_rate": 2.9559975341071057e-05, + "loss": 1.7356, + "step": 39690 + }, + { + "epoch": 0.07731958762886598, + "grad_norm": 2.114037036895752, + "learning_rate": 2.955964431938939e-05, + "loss": 1.8138, + "step": 39705 + }, + { + "epoch": 0.07734879789997624, + "grad_norm": 3.0344507694244385, + "learning_rate": 2.9559313175098904e-05, + "loss": 1.9566, + "step": 39720 + }, + { + "epoch": 0.0773780081710865, + "grad_norm": 2.6870083808898926, + "learning_rate": 2.9558981908202395e-05, + "loss": 1.8728, + "step": 39735 + }, + { + "epoch": 0.07740721844219677, + "grad_norm": 4.387387752532959, + "learning_rate": 2.9558650518702654e-05, + "loss": 1.8694, + "step": 39750 + }, + { + "epoch": 0.07743642871330703, + "grad_norm": 3.3471992015838623, + "learning_rate": 2.955831900660247e-05, + "loss": 1.8195, + "step": 39765 + }, + { + "epoch": 0.07746563898441729, + "grad_norm": 3.68962025642395, + "learning_rate": 2.9557987371904634e-05, + "loss": 1.8546, + "step": 39780 + }, + { + "epoch": 0.07749484925552756, + "grad_norm": 4.748502254486084, + "learning_rate": 2.9557655614611935e-05, + "loss": 2.0023, + "step": 39795 + }, + { + "epoch": 0.07752405952663782, + "grad_norm": 3.4492971897125244, + "learning_rate": 2.955732373472717e-05, + "loss": 1.8199, + "step": 39810 + }, + { + "epoch": 0.07755326979774808, + "grad_norm": 2.27078914642334, + "learning_rate": 2.955699173225314e-05, + "loss": 1.8082, + "step": 39825 + }, + { + "epoch": 0.07758248006885834, + "grad_norm": 3.9737517833709717, + "learning_rate": 2.9556659607192633e-05, + "loss": 1.7823, + "step": 39840 + }, + { + "epoch": 0.0776116903399686, + "grad_norm": 2.3084990978240967, + "learning_rate": 2.9556327359548445e-05, + "loss": 1.865, + "step": 39855 + }, + { + "epoch": 0.07764090061107887, + "grad_norm": 3.0408246517181396, + "learning_rate": 2.9555994989323383e-05, + "loss": 1.8234, + "step": 39870 + }, + { + "epoch": 0.07767011088218913, + "grad_norm": 3.016524314880371, + "learning_rate": 2.9555662496520236e-05, + "loss": 1.8594, + "step": 39885 + }, + { + "epoch": 0.0776993211532994, + "grad_norm": 2.70461106300354, + "learning_rate": 2.9555329881141806e-05, + "loss": 1.9029, + "step": 39900 + }, + { + "epoch": 0.07772853142440966, + "grad_norm": 2.4621024131774902, + "learning_rate": 2.9554997143190902e-05, + "loss": 1.8281, + "step": 39915 + }, + { + "epoch": 0.07775774169551992, + "grad_norm": 1.4638293981552124, + "learning_rate": 2.9554664282670316e-05, + "loss": 1.9086, + "step": 39930 + }, + { + "epoch": 0.07778695196663019, + "grad_norm": 1.8144150972366333, + "learning_rate": 2.955433129958286e-05, + "loss": 1.8863, + "step": 39945 + }, + { + "epoch": 0.07781616223774045, + "grad_norm": 3.1351184844970703, + "learning_rate": 2.955399819393132e-05, + "loss": 2.0427, + "step": 39960 + }, + { + "epoch": 0.07784537250885071, + "grad_norm": 2.8836092948913574, + "learning_rate": 2.9553664965718526e-05, + "loss": 1.8601, + "step": 39975 + }, + { + "epoch": 0.07787458277996097, + "grad_norm": 3.3076083660125732, + "learning_rate": 2.9553331614947274e-05, + "loss": 1.7664, + "step": 39990 + }, + { + "epoch": 0.07790379305107124, + "grad_norm": 1.984757900238037, + "learning_rate": 2.955299814162036e-05, + "loss": 2.0087, + "step": 40005 + }, + { + "epoch": 0.0779330033221815, + "grad_norm": 3.184436082839966, + "learning_rate": 2.9552664545740608e-05, + "loss": 1.8379, + "step": 40020 + }, + { + "epoch": 0.07796221359329177, + "grad_norm": 1.9847835302352905, + "learning_rate": 2.9552330827310822e-05, + "loss": 1.8154, + "step": 40035 + }, + { + "epoch": 0.07799142386440203, + "grad_norm": 3.718916893005371, + "learning_rate": 2.9551996986333807e-05, + "loss": 2.0364, + "step": 40050 + }, + { + "epoch": 0.07802063413551229, + "grad_norm": 6.052098274230957, + "learning_rate": 2.9551663022812382e-05, + "loss": 1.7957, + "step": 40065 + }, + { + "epoch": 0.07804984440662255, + "grad_norm": 2.4147698879241943, + "learning_rate": 2.9551328936749355e-05, + "loss": 1.979, + "step": 40080 + }, + { + "epoch": 0.07807905467773281, + "grad_norm": 2.1994168758392334, + "learning_rate": 2.9550994728147542e-05, + "loss": 1.8055, + "step": 40095 + }, + { + "epoch": 0.07810826494884308, + "grad_norm": 2.1785078048706055, + "learning_rate": 2.9550660397009755e-05, + "loss": 1.9657, + "step": 40110 + }, + { + "epoch": 0.07813747521995335, + "grad_norm": 4.680526256561279, + "learning_rate": 2.955032594333881e-05, + "loss": 1.8007, + "step": 40125 + }, + { + "epoch": 0.0781666854910636, + "grad_norm": 2.373208999633789, + "learning_rate": 2.9549991367137522e-05, + "loss": 1.8067, + "step": 40140 + }, + { + "epoch": 0.07819589576217387, + "grad_norm": 2.384096145629883, + "learning_rate": 2.9549656668408714e-05, + "loss": 1.9754, + "step": 40155 + }, + { + "epoch": 0.07822510603328413, + "grad_norm": 3.645989179611206, + "learning_rate": 2.9549321847155197e-05, + "loss": 1.9702, + "step": 40170 + }, + { + "epoch": 0.07825431630439439, + "grad_norm": 2.4934167861938477, + "learning_rate": 2.9548986903379794e-05, + "loss": 1.7859, + "step": 40185 + }, + { + "epoch": 0.07828352657550465, + "grad_norm": 3.5801844596862793, + "learning_rate": 2.954865183708533e-05, + "loss": 2.0304, + "step": 40200 + }, + { + "epoch": 0.07831273684661492, + "grad_norm": 1.9151461124420166, + "learning_rate": 2.954831664827462e-05, + "loss": 1.8535, + "step": 40215 + }, + { + "epoch": 0.07834194711772519, + "grad_norm": 2.5540692806243896, + "learning_rate": 2.9547981336950493e-05, + "loss": 1.797, + "step": 40230 + }, + { + "epoch": 0.07837115738883545, + "grad_norm": 5.4373955726623535, + "learning_rate": 2.9547645903115762e-05, + "loss": 1.9314, + "step": 40245 + }, + { + "epoch": 0.07840036765994571, + "grad_norm": 2.6249446868896484, + "learning_rate": 2.9547310346773267e-05, + "loss": 1.9209, + "step": 40260 + }, + { + "epoch": 0.07842957793105597, + "grad_norm": 2.437994956970215, + "learning_rate": 2.9546974667925824e-05, + "loss": 1.8113, + "step": 40275 + }, + { + "epoch": 0.07845878820216623, + "grad_norm": 2.461296558380127, + "learning_rate": 2.9546638866576257e-05, + "loss": 1.8157, + "step": 40290 + }, + { + "epoch": 0.07848799847327649, + "grad_norm": 3.721189022064209, + "learning_rate": 2.95463029427274e-05, + "loss": 1.7759, + "step": 40305 + }, + { + "epoch": 0.07851720874438677, + "grad_norm": 3.0771353244781494, + "learning_rate": 2.954596689638208e-05, + "loss": 1.9319, + "step": 40320 + }, + { + "epoch": 0.07854641901549703, + "grad_norm": 3.901102066040039, + "learning_rate": 2.9545630727543127e-05, + "loss": 1.8157, + "step": 40335 + }, + { + "epoch": 0.07857562928660729, + "grad_norm": 3.15122127532959, + "learning_rate": 2.9545294436213376e-05, + "loss": 1.7357, + "step": 40350 + }, + { + "epoch": 0.07860483955771755, + "grad_norm": 2.7092130184173584, + "learning_rate": 2.9544958022395647e-05, + "loss": 1.7302, + "step": 40365 + }, + { + "epoch": 0.07863404982882781, + "grad_norm": 2.950127363204956, + "learning_rate": 2.9544621486092787e-05, + "loss": 1.9859, + "step": 40380 + }, + { + "epoch": 0.07866326009993807, + "grad_norm": 3.0624284744262695, + "learning_rate": 2.954428482730762e-05, + "loss": 1.8898, + "step": 40395 + }, + { + "epoch": 0.07869247037104833, + "grad_norm": 2.806520700454712, + "learning_rate": 2.9543948046042988e-05, + "loss": 1.7211, + "step": 40410 + }, + { + "epoch": 0.0787216806421586, + "grad_norm": 2.7486681938171387, + "learning_rate": 2.9543611142301724e-05, + "loss": 1.7469, + "step": 40425 + }, + { + "epoch": 0.07875089091326887, + "grad_norm": 3.4893131256103516, + "learning_rate": 2.954327411608666e-05, + "loss": 1.7237, + "step": 40440 + }, + { + "epoch": 0.07878010118437913, + "grad_norm": 2.313070774078369, + "learning_rate": 2.9542936967400645e-05, + "loss": 1.8785, + "step": 40455 + }, + { + "epoch": 0.07880931145548939, + "grad_norm": 3.2802863121032715, + "learning_rate": 2.954259969624651e-05, + "loss": 1.9601, + "step": 40470 + }, + { + "epoch": 0.07883852172659965, + "grad_norm": 2.6983420848846436, + "learning_rate": 2.9542262302627097e-05, + "loss": 1.8998, + "step": 40485 + }, + { + "epoch": 0.07886773199770991, + "grad_norm": 1.7535884380340576, + "learning_rate": 2.954192478654525e-05, + "loss": 1.8958, + "step": 40500 + }, + { + "epoch": 0.07889694226882017, + "grad_norm": 2.297440528869629, + "learning_rate": 2.9541587148003808e-05, + "loss": 1.8084, + "step": 40515 + }, + { + "epoch": 0.07892615253993045, + "grad_norm": 3.2494289875030518, + "learning_rate": 2.9541249387005618e-05, + "loss": 1.7015, + "step": 40530 + }, + { + "epoch": 0.07895536281104071, + "grad_norm": 3.295280933380127, + "learning_rate": 2.954091150355352e-05, + "loss": 1.8645, + "step": 40545 + }, + { + "epoch": 0.07898457308215097, + "grad_norm": 2.3075287342071533, + "learning_rate": 2.9540573497650358e-05, + "loss": 1.8123, + "step": 40560 + }, + { + "epoch": 0.07901378335326123, + "grad_norm": 2.7343220710754395, + "learning_rate": 2.9540235369298984e-05, + "loss": 2.0116, + "step": 40575 + }, + { + "epoch": 0.07904299362437149, + "grad_norm": 3.2301554679870605, + "learning_rate": 2.9539897118502244e-05, + "loss": 1.777, + "step": 40590 + }, + { + "epoch": 0.07907220389548175, + "grad_norm": 2.2123513221740723, + "learning_rate": 2.953955874526298e-05, + "loss": 1.9181, + "step": 40605 + }, + { + "epoch": 0.07910141416659201, + "grad_norm": 5.533998966217041, + "learning_rate": 2.9539220249584056e-05, + "loss": 1.722, + "step": 40620 + }, + { + "epoch": 0.07913062443770229, + "grad_norm": 4.452574729919434, + "learning_rate": 2.9538881631468304e-05, + "loss": 1.7739, + "step": 40635 + }, + { + "epoch": 0.07915983470881255, + "grad_norm": 3.7982211112976074, + "learning_rate": 2.953854289091859e-05, + "loss": 1.8986, + "step": 40650 + }, + { + "epoch": 0.07918904497992281, + "grad_norm": 2.014842987060547, + "learning_rate": 2.9538204027937764e-05, + "loss": 1.9852, + "step": 40665 + }, + { + "epoch": 0.07921825525103307, + "grad_norm": 2.1987805366516113, + "learning_rate": 2.9537865042528672e-05, + "loss": 1.9075, + "step": 40680 + }, + { + "epoch": 0.07924746552214333, + "grad_norm": 4.476477146148682, + "learning_rate": 2.9537525934694177e-05, + "loss": 1.9831, + "step": 40695 + }, + { + "epoch": 0.07927667579325359, + "grad_norm": 2.2541050910949707, + "learning_rate": 2.953718670443713e-05, + "loss": 1.9745, + "step": 40710 + }, + { + "epoch": 0.07930588606436385, + "grad_norm": 3.9762489795684814, + "learning_rate": 2.953684735176039e-05, + "loss": 1.7637, + "step": 40725 + }, + { + "epoch": 0.07933509633547413, + "grad_norm": 3.4468305110931396, + "learning_rate": 2.9536507876666815e-05, + "loss": 1.721, + "step": 40740 + }, + { + "epoch": 0.07936430660658439, + "grad_norm": 4.6930623054504395, + "learning_rate": 2.953616827915926e-05, + "loss": 1.8222, + "step": 40755 + }, + { + "epoch": 0.07939351687769465, + "grad_norm": 2.1669819355010986, + "learning_rate": 2.953582855924059e-05, + "loss": 1.9004, + "step": 40770 + }, + { + "epoch": 0.07942272714880491, + "grad_norm": 2.0153393745422363, + "learning_rate": 2.953548871691366e-05, + "loss": 1.7301, + "step": 40785 + }, + { + "epoch": 0.07945193741991517, + "grad_norm": 3.3808844089508057, + "learning_rate": 2.9535148752181336e-05, + "loss": 1.8399, + "step": 40800 + }, + { + "epoch": 0.07948114769102543, + "grad_norm": 2.579909563064575, + "learning_rate": 2.9534808665046484e-05, + "loss": 1.838, + "step": 40815 + }, + { + "epoch": 0.0795103579621357, + "grad_norm": 4.192935943603516, + "learning_rate": 2.953446845551196e-05, + "loss": 1.9925, + "step": 40830 + }, + { + "epoch": 0.07953956823324596, + "grad_norm": 3.621619701385498, + "learning_rate": 2.9534128123580632e-05, + "loss": 1.8791, + "step": 40845 + }, + { + "epoch": 0.07956877850435623, + "grad_norm": 2.2754971981048584, + "learning_rate": 2.953378766925537e-05, + "loss": 2.041, + "step": 40860 + }, + { + "epoch": 0.07959798877546649, + "grad_norm": 2.975817918777466, + "learning_rate": 2.9533447092539037e-05, + "loss": 1.6106, + "step": 40875 + }, + { + "epoch": 0.07962719904657675, + "grad_norm": 3.3760697841644287, + "learning_rate": 2.95331063934345e-05, + "loss": 1.9524, + "step": 40890 + }, + { + "epoch": 0.07965640931768701, + "grad_norm": 5.000794887542725, + "learning_rate": 2.953276557194464e-05, + "loss": 1.9093, + "step": 40905 + }, + { + "epoch": 0.07968561958879727, + "grad_norm": 1.6077380180358887, + "learning_rate": 2.9532424628072302e-05, + "loss": 1.9355, + "step": 40920 + }, + { + "epoch": 0.07971482985990754, + "grad_norm": 1.9960527420043945, + "learning_rate": 2.953208356182038e-05, + "loss": 1.8322, + "step": 40935 + }, + { + "epoch": 0.0797440401310178, + "grad_norm": 1.9623360633850098, + "learning_rate": 2.9531742373191738e-05, + "loss": 1.7081, + "step": 40950 + }, + { + "epoch": 0.07977325040212807, + "grad_norm": 3.023564338684082, + "learning_rate": 2.9531401062189253e-05, + "loss": 1.8457, + "step": 40965 + }, + { + "epoch": 0.07980246067323833, + "grad_norm": 2.2501392364501953, + "learning_rate": 2.953105962881579e-05, + "loss": 1.9475, + "step": 40980 + }, + { + "epoch": 0.07983167094434859, + "grad_norm": 2.9284751415252686, + "learning_rate": 2.9530718073074235e-05, + "loss": 1.9101, + "step": 40995 + }, + { + "epoch": 0.07986088121545885, + "grad_norm": 5.609287738800049, + "learning_rate": 2.9530376394967457e-05, + "loss": 1.9273, + "step": 41010 + }, + { + "epoch": 0.07989009148656911, + "grad_norm": 3.8887929916381836, + "learning_rate": 2.9530034594498334e-05, + "loss": 1.8836, + "step": 41025 + }, + { + "epoch": 0.07991930175767938, + "grad_norm": 3.847130537033081, + "learning_rate": 2.952969267166975e-05, + "loss": 2.0747, + "step": 41040 + }, + { + "epoch": 0.07994851202878964, + "grad_norm": 1.7652339935302734, + "learning_rate": 2.952935062648458e-05, + "loss": 1.7563, + "step": 41055 + }, + { + "epoch": 0.07997772229989991, + "grad_norm": 4.982182025909424, + "learning_rate": 2.9529008458945703e-05, + "loss": 1.8016, + "step": 41070 + }, + { + "epoch": 0.08000693257101017, + "grad_norm": 4.783311367034912, + "learning_rate": 2.9528666169056e-05, + "loss": 1.9055, + "step": 41085 + }, + { + "epoch": 0.08003614284212043, + "grad_norm": 2.197756767272949, + "learning_rate": 2.952832375681836e-05, + "loss": 1.7831, + "step": 41100 + }, + { + "epoch": 0.0800653531132307, + "grad_norm": 2.6935906410217285, + "learning_rate": 2.9527981222235662e-05, + "loss": 1.8409, + "step": 41115 + }, + { + "epoch": 0.08009456338434096, + "grad_norm": 2.441845178604126, + "learning_rate": 2.9527638565310787e-05, + "loss": 1.7458, + "step": 41130 + }, + { + "epoch": 0.08012377365545122, + "grad_norm": 2.773716688156128, + "learning_rate": 2.952729578604663e-05, + "loss": 1.8769, + "step": 41145 + }, + { + "epoch": 0.08015298392656148, + "grad_norm": 2.964289665222168, + "learning_rate": 2.9526952884446066e-05, + "loss": 1.7942, + "step": 41160 + }, + { + "epoch": 0.08018219419767175, + "grad_norm": 3.813652992248535, + "learning_rate": 2.9526609860511993e-05, + "loss": 1.8336, + "step": 41175 + }, + { + "epoch": 0.08021140446878201, + "grad_norm": 2.442913293838501, + "learning_rate": 2.9526266714247295e-05, + "loss": 1.9877, + "step": 41190 + }, + { + "epoch": 0.08024061473989227, + "grad_norm": 2.4335250854492188, + "learning_rate": 2.9525923445654863e-05, + "loss": 1.7597, + "step": 41205 + }, + { + "epoch": 0.08026982501100254, + "grad_norm": 3.397404670715332, + "learning_rate": 2.9525580054737582e-05, + "loss": 1.9251, + "step": 41220 + }, + { + "epoch": 0.0802990352821128, + "grad_norm": 2.41749906539917, + "learning_rate": 2.9525236541498353e-05, + "loss": 2.0381, + "step": 41235 + }, + { + "epoch": 0.08032824555322306, + "grad_norm": 4.3670654296875, + "learning_rate": 2.952489290594006e-05, + "loss": 1.796, + "step": 41250 + }, + { + "epoch": 0.08035745582433332, + "grad_norm": 4.269131183624268, + "learning_rate": 2.9524549148065607e-05, + "loss": 1.8409, + "step": 41265 + }, + { + "epoch": 0.08038666609544359, + "grad_norm": 3.3020429611206055, + "learning_rate": 2.9524205267877874e-05, + "loss": 1.956, + "step": 41280 + }, + { + "epoch": 0.08041587636655385, + "grad_norm": 2.023874282836914, + "learning_rate": 2.9523861265379773e-05, + "loss": 1.8352, + "step": 41295 + }, + { + "epoch": 0.08044508663766411, + "grad_norm": 1.9308613538742065, + "learning_rate": 2.952351714057419e-05, + "loss": 1.888, + "step": 41310 + }, + { + "epoch": 0.08047429690877438, + "grad_norm": 5.0439558029174805, + "learning_rate": 2.9523172893464033e-05, + "loss": 1.7923, + "step": 41325 + }, + { + "epoch": 0.08050350717988464, + "grad_norm": 2.352971076965332, + "learning_rate": 2.9522828524052184e-05, + "loss": 1.8482, + "step": 41340 + }, + { + "epoch": 0.0805327174509949, + "grad_norm": 3.005178213119507, + "learning_rate": 2.952248403234156e-05, + "loss": 1.9192, + "step": 41355 + }, + { + "epoch": 0.08056192772210516, + "grad_norm": 2.740795612335205, + "learning_rate": 2.9522139418335058e-05, + "loss": 1.9969, + "step": 41370 + }, + { + "epoch": 0.08059113799321543, + "grad_norm": 2.5653722286224365, + "learning_rate": 2.952179468203557e-05, + "loss": 1.9732, + "step": 41385 + }, + { + "epoch": 0.0806203482643257, + "grad_norm": 2.194342613220215, + "learning_rate": 2.9521449823446013e-05, + "loss": 1.929, + "step": 41400 + }, + { + "epoch": 0.08064955853543596, + "grad_norm": 2.7607100009918213, + "learning_rate": 2.952110484256928e-05, + "loss": 1.8187, + "step": 41415 + }, + { + "epoch": 0.08067876880654622, + "grad_norm": 3.4654762744903564, + "learning_rate": 2.9520759739408285e-05, + "loss": 1.8563, + "step": 41430 + }, + { + "epoch": 0.08070797907765648, + "grad_norm": 2.130572557449341, + "learning_rate": 2.9520414513965926e-05, + "loss": 1.8695, + "step": 41445 + }, + { + "epoch": 0.08073718934876674, + "grad_norm": 3.304382085800171, + "learning_rate": 2.9520069166245114e-05, + "loss": 1.8876, + "step": 41460 + }, + { + "epoch": 0.080766399619877, + "grad_norm": 3.911742925643921, + "learning_rate": 2.951972369624876e-05, + "loss": 1.7005, + "step": 41475 + }, + { + "epoch": 0.08079560989098727, + "grad_norm": 4.597911357879639, + "learning_rate": 2.9519378103979773e-05, + "loss": 2.0433, + "step": 41490 + }, + { + "epoch": 0.08082482016209754, + "grad_norm": 5.262660980224609, + "learning_rate": 2.9519032389441057e-05, + "loss": 1.8291, + "step": 41505 + }, + { + "epoch": 0.0808540304332078, + "grad_norm": 2.321345090866089, + "learning_rate": 2.9518686552635527e-05, + "loss": 1.9505, + "step": 41520 + }, + { + "epoch": 0.08088324070431806, + "grad_norm": 4.096843242645264, + "learning_rate": 2.9518340593566098e-05, + "loss": 1.7984, + "step": 41535 + }, + { + "epoch": 0.08091245097542832, + "grad_norm": 4.057320594787598, + "learning_rate": 2.9517994512235678e-05, + "loss": 1.6034, + "step": 41550 + }, + { + "epoch": 0.08094166124653858, + "grad_norm": 2.995281219482422, + "learning_rate": 2.9517648308647186e-05, + "loss": 1.7187, + "step": 41565 + }, + { + "epoch": 0.08097087151764884, + "grad_norm": 2.0676705837249756, + "learning_rate": 2.951730198280354e-05, + "loss": 1.8877, + "step": 41580 + }, + { + "epoch": 0.08100008178875912, + "grad_norm": 3.508944272994995, + "learning_rate": 2.9516955534707643e-05, + "loss": 1.932, + "step": 41595 + }, + { + "epoch": 0.08102929205986938, + "grad_norm": 5.34934139251709, + "learning_rate": 2.9516608964362426e-05, + "loss": 1.8071, + "step": 41610 + }, + { + "epoch": 0.08105850233097964, + "grad_norm": 2.5603349208831787, + "learning_rate": 2.9516262271770802e-05, + "loss": 1.8635, + "step": 41625 + }, + { + "epoch": 0.0810877126020899, + "grad_norm": 3.6904408931732178, + "learning_rate": 2.95159154569357e-05, + "loss": 1.9512, + "step": 41640 + }, + { + "epoch": 0.08111692287320016, + "grad_norm": 3.564453125, + "learning_rate": 2.9515568519860022e-05, + "loss": 1.6514, + "step": 41655 + }, + { + "epoch": 0.08114613314431042, + "grad_norm": 2.5232253074645996, + "learning_rate": 2.9515221460546703e-05, + "loss": 1.7861, + "step": 41670 + }, + { + "epoch": 0.08117534341542068, + "grad_norm": 1.795364499092102, + "learning_rate": 2.9514874278998666e-05, + "loss": 1.8497, + "step": 41685 + }, + { + "epoch": 0.08120455368653096, + "grad_norm": 3.8312599658966064, + "learning_rate": 2.9514526975218825e-05, + "loss": 1.7274, + "step": 41700 + }, + { + "epoch": 0.08123376395764122, + "grad_norm": 4.170705795288086, + "learning_rate": 2.9514179549210114e-05, + "loss": 1.8489, + "step": 41715 + }, + { + "epoch": 0.08126297422875148, + "grad_norm": 2.639073371887207, + "learning_rate": 2.951383200097546e-05, + "loss": 1.8641, + "step": 41730 + }, + { + "epoch": 0.08129218449986174, + "grad_norm": 2.9022207260131836, + "learning_rate": 2.951348433051778e-05, + "loss": 1.6874, + "step": 41745 + }, + { + "epoch": 0.081321394770972, + "grad_norm": 4.153652191162109, + "learning_rate": 2.9513136537840008e-05, + "loss": 1.8162, + "step": 41760 + }, + { + "epoch": 0.08135060504208226, + "grad_norm": 1.798093318939209, + "learning_rate": 2.9512788622945075e-05, + "loss": 1.8429, + "step": 41775 + }, + { + "epoch": 0.08137981531319252, + "grad_norm": 3.2090342044830322, + "learning_rate": 2.9512440585835906e-05, + "loss": 1.714, + "step": 41790 + }, + { + "epoch": 0.0814090255843028, + "grad_norm": 2.163233518600464, + "learning_rate": 2.951209242651543e-05, + "loss": 1.809, + "step": 41805 + }, + { + "epoch": 0.08143823585541306, + "grad_norm": 4.145123481750488, + "learning_rate": 2.951174414498659e-05, + "loss": 1.7756, + "step": 41820 + }, + { + "epoch": 0.08146744612652332, + "grad_norm": 4.807456016540527, + "learning_rate": 2.9511395741252308e-05, + "loss": 2.0115, + "step": 41835 + }, + { + "epoch": 0.08149665639763358, + "grad_norm": 2.5362160205841064, + "learning_rate": 2.951104721531552e-05, + "loss": 1.8761, + "step": 41850 + }, + { + "epoch": 0.08152586666874384, + "grad_norm": 3.089874505996704, + "learning_rate": 2.9510698567179163e-05, + "loss": 1.9963, + "step": 41865 + }, + { + "epoch": 0.0815550769398541, + "grad_norm": 2.398677349090576, + "learning_rate": 2.9510349796846176e-05, + "loss": 1.9228, + "step": 41880 + }, + { + "epoch": 0.08158428721096436, + "grad_norm": 3.9991390705108643, + "learning_rate": 2.9510000904319487e-05, + "loss": 1.9077, + "step": 41895 + }, + { + "epoch": 0.08161349748207462, + "grad_norm": 5.66585111618042, + "learning_rate": 2.9509651889602044e-05, + "loss": 1.9959, + "step": 41910 + }, + { + "epoch": 0.0816427077531849, + "grad_norm": 4.229529857635498, + "learning_rate": 2.950930275269678e-05, + "loss": 1.8397, + "step": 41925 + }, + { + "epoch": 0.08167191802429516, + "grad_norm": 3.0767483711242676, + "learning_rate": 2.9508953493606637e-05, + "loss": 1.7965, + "step": 41940 + }, + { + "epoch": 0.08170112829540542, + "grad_norm": 8.055323600769043, + "learning_rate": 2.9508604112334558e-05, + "loss": 1.9566, + "step": 41955 + }, + { + "epoch": 0.08173033856651568, + "grad_norm": 2.262690544128418, + "learning_rate": 2.950825460888348e-05, + "loss": 1.7102, + "step": 41970 + }, + { + "epoch": 0.08175954883762594, + "grad_norm": 3.084411859512329, + "learning_rate": 2.9507904983256347e-05, + "loss": 1.8877, + "step": 41985 + }, + { + "epoch": 0.0817887591087362, + "grad_norm": 3.5004804134368896, + "learning_rate": 2.9507555235456113e-05, + "loss": 1.8649, + "step": 42000 + }, + { + "epoch": 0.08181796937984646, + "grad_norm": 3.7755775451660156, + "learning_rate": 2.9507205365485712e-05, + "loss": 1.7779, + "step": 42015 + }, + { + "epoch": 0.08184717965095674, + "grad_norm": 2.6499814987182617, + "learning_rate": 2.9506855373348095e-05, + "loss": 1.8773, + "step": 42030 + }, + { + "epoch": 0.081876389922067, + "grad_norm": 3.725449562072754, + "learning_rate": 2.950650525904621e-05, + "loss": 1.6785, + "step": 42045 + }, + { + "epoch": 0.08190560019317726, + "grad_norm": 2.3505465984344482, + "learning_rate": 2.9506155022583e-05, + "loss": 1.9197, + "step": 42060 + }, + { + "epoch": 0.08193481046428752, + "grad_norm": 3.0493154525756836, + "learning_rate": 2.950580466396142e-05, + "loss": 1.828, + "step": 42075 + }, + { + "epoch": 0.08196402073539778, + "grad_norm": 1.90041184425354, + "learning_rate": 2.9505454183184422e-05, + "loss": 1.8616, + "step": 42090 + }, + { + "epoch": 0.08199323100650804, + "grad_norm": 3.4464054107666016, + "learning_rate": 2.9505103580254948e-05, + "loss": 1.8181, + "step": 42105 + }, + { + "epoch": 0.0820224412776183, + "grad_norm": 2.4218642711639404, + "learning_rate": 2.9504752855175964e-05, + "loss": 1.7809, + "step": 42120 + }, + { + "epoch": 0.08205165154872858, + "grad_norm": 3.8021080493927, + "learning_rate": 2.950440200795041e-05, + "loss": 1.8681, + "step": 42135 + }, + { + "epoch": 0.08208086181983884, + "grad_norm": 4.094058036804199, + "learning_rate": 2.950405103858125e-05, + "loss": 1.8401, + "step": 42150 + }, + { + "epoch": 0.0821100720909491, + "grad_norm": 3.918071985244751, + "learning_rate": 2.9503699947071435e-05, + "loss": 1.8717, + "step": 42165 + }, + { + "epoch": 0.08213928236205936, + "grad_norm": 2.4167909622192383, + "learning_rate": 2.9503348733423925e-05, + "loss": 1.8206, + "step": 42180 + }, + { + "epoch": 0.08216849263316962, + "grad_norm": 4.92215633392334, + "learning_rate": 2.9502997397641674e-05, + "loss": 1.9056, + "step": 42195 + }, + { + "epoch": 0.08219770290427988, + "grad_norm": 4.555650234222412, + "learning_rate": 2.9502645939727643e-05, + "loss": 1.6728, + "step": 42210 + }, + { + "epoch": 0.08222691317539015, + "grad_norm": 3.3694517612457275, + "learning_rate": 2.9502294359684786e-05, + "loss": 1.7773, + "step": 42225 + }, + { + "epoch": 0.08225612344650042, + "grad_norm": 2.3496294021606445, + "learning_rate": 2.9501942657516076e-05, + "loss": 1.9216, + "step": 42240 + }, + { + "epoch": 0.08228533371761068, + "grad_norm": 2.9559128284454346, + "learning_rate": 2.9501590833224462e-05, + "loss": 1.9598, + "step": 42255 + }, + { + "epoch": 0.08231454398872094, + "grad_norm": 2.4877378940582275, + "learning_rate": 2.9501238886812913e-05, + "loss": 1.9056, + "step": 42270 + }, + { + "epoch": 0.0823437542598312, + "grad_norm": 3.095447063446045, + "learning_rate": 2.950088681828439e-05, + "loss": 1.8015, + "step": 42285 + }, + { + "epoch": 0.08237296453094146, + "grad_norm": 3.4131603240966797, + "learning_rate": 2.9500534627641862e-05, + "loss": 1.9394, + "step": 42300 + }, + { + "epoch": 0.08240217480205173, + "grad_norm": 2.7747130393981934, + "learning_rate": 2.9500182314888296e-05, + "loss": 2.0229, + "step": 42315 + }, + { + "epoch": 0.08243138507316199, + "grad_norm": 3.355043888092041, + "learning_rate": 2.9499829880026647e-05, + "loss": 1.8285, + "step": 42330 + }, + { + "epoch": 0.08246059534427226, + "grad_norm": 3.1953892707824707, + "learning_rate": 2.94994773230599e-05, + "loss": 1.8574, + "step": 42345 + }, + { + "epoch": 0.08248980561538252, + "grad_norm": 2.5266549587249756, + "learning_rate": 2.949912464399101e-05, + "loss": 1.8056, + "step": 42360 + }, + { + "epoch": 0.08251901588649278, + "grad_norm": 1.8549035787582397, + "learning_rate": 2.949877184282295e-05, + "loss": 1.9169, + "step": 42375 + }, + { + "epoch": 0.08254822615760304, + "grad_norm": 4.0360612869262695, + "learning_rate": 2.9498418919558698e-05, + "loss": 1.8708, + "step": 42390 + }, + { + "epoch": 0.0825774364287133, + "grad_norm": 2.234935998916626, + "learning_rate": 2.9498065874201223e-05, + "loss": 1.6797, + "step": 42405 + }, + { + "epoch": 0.08260664669982357, + "grad_norm": 3.7219271659851074, + "learning_rate": 2.949771270675349e-05, + "loss": 1.8735, + "step": 42420 + }, + { + "epoch": 0.08263585697093383, + "grad_norm": 3.9249229431152344, + "learning_rate": 2.9497359417218483e-05, + "loss": 1.9485, + "step": 42435 + }, + { + "epoch": 0.0826650672420441, + "grad_norm": 2.7230405807495117, + "learning_rate": 2.9497006005599174e-05, + "loss": 1.8244, + "step": 42450 + }, + { + "epoch": 0.08269427751315436, + "grad_norm": 3.115924119949341, + "learning_rate": 2.9496652471898535e-05, + "loss": 1.858, + "step": 42465 + }, + { + "epoch": 0.08272348778426462, + "grad_norm": 2.483771324157715, + "learning_rate": 2.949629881611955e-05, + "loss": 1.8566, + "step": 42480 + }, + { + "epoch": 0.08275269805537488, + "grad_norm": 3.040868043899536, + "learning_rate": 2.9495945038265194e-05, + "loss": 1.6987, + "step": 42495 + }, + { + "epoch": 0.08278190832648515, + "grad_norm": 2.728682279586792, + "learning_rate": 2.9495591138338442e-05, + "loss": 1.8991, + "step": 42510 + }, + { + "epoch": 0.0828111185975954, + "grad_norm": 2.2937636375427246, + "learning_rate": 2.9495237116342282e-05, + "loss": 1.801, + "step": 42525 + }, + { + "epoch": 0.08284032886870567, + "grad_norm": 2.9612514972686768, + "learning_rate": 2.949488297227969e-05, + "loss": 1.9554, + "step": 42540 + }, + { + "epoch": 0.08286953913981594, + "grad_norm": 2.841144323348999, + "learning_rate": 2.9494528706153652e-05, + "loss": 1.6266, + "step": 42555 + }, + { + "epoch": 0.0828987494109262, + "grad_norm": 2.340595245361328, + "learning_rate": 2.9494174317967144e-05, + "loss": 1.6548, + "step": 42570 + }, + { + "epoch": 0.08292795968203646, + "grad_norm": 3.0034701824188232, + "learning_rate": 2.949381980772316e-05, + "loss": 1.7248, + "step": 42585 + }, + { + "epoch": 0.08295716995314673, + "grad_norm": 3.8806467056274414, + "learning_rate": 2.949346517542468e-05, + "loss": 1.8022, + "step": 42600 + }, + { + "epoch": 0.08298638022425699, + "grad_norm": 2.9137628078460693, + "learning_rate": 2.9493110421074692e-05, + "loss": 1.8976, + "step": 42615 + }, + { + "epoch": 0.08301559049536725, + "grad_norm": 2.0429863929748535, + "learning_rate": 2.949275554467618e-05, + "loss": 1.6848, + "step": 42630 + }, + { + "epoch": 0.08304480076647751, + "grad_norm": 7.566515922546387, + "learning_rate": 2.949240054623214e-05, + "loss": 1.902, + "step": 42645 + }, + { + "epoch": 0.08307401103758778, + "grad_norm": 2.313962697982788, + "learning_rate": 2.9492045425745554e-05, + "loss": 1.8932, + "step": 42660 + }, + { + "epoch": 0.08310322130869804, + "grad_norm": 3.407179117202759, + "learning_rate": 2.9491690183219413e-05, + "loss": 1.7459, + "step": 42675 + }, + { + "epoch": 0.0831324315798083, + "grad_norm": 3.9030697345733643, + "learning_rate": 2.9491334818656716e-05, + "loss": 1.9114, + "step": 42690 + }, + { + "epoch": 0.08316164185091857, + "grad_norm": 2.450484275817871, + "learning_rate": 2.9490979332060444e-05, + "loss": 1.9702, + "step": 42705 + }, + { + "epoch": 0.08319085212202883, + "grad_norm": 4.048968315124512, + "learning_rate": 2.9490623723433603e-05, + "loss": 1.9229, + "step": 42720 + }, + { + "epoch": 0.08322006239313909, + "grad_norm": 4.658755302429199, + "learning_rate": 2.949026799277918e-05, + "loss": 1.824, + "step": 42735 + }, + { + "epoch": 0.08324927266424935, + "grad_norm": 3.0579545497894287, + "learning_rate": 2.9489912140100167e-05, + "loss": 1.7174, + "step": 42750 + }, + { + "epoch": 0.08327848293535962, + "grad_norm": 1.9854892492294312, + "learning_rate": 2.948955616539957e-05, + "loss": 1.7176, + "step": 42765 + }, + { + "epoch": 0.08330769320646988, + "grad_norm": 4.234609127044678, + "learning_rate": 2.9489200068680388e-05, + "loss": 1.9884, + "step": 42780 + }, + { + "epoch": 0.08333690347758015, + "grad_norm": 3.243767023086548, + "learning_rate": 2.9488843849945608e-05, + "loss": 1.8489, + "step": 42795 + }, + { + "epoch": 0.0833661137486904, + "grad_norm": 2.477585792541504, + "learning_rate": 2.9488487509198235e-05, + "loss": 1.8609, + "step": 42810 + }, + { + "epoch": 0.08339532401980067, + "grad_norm": 3.6641616821289062, + "learning_rate": 2.9488131046441276e-05, + "loss": 1.9305, + "step": 42825 + }, + { + "epoch": 0.08342453429091093, + "grad_norm": 2.413062334060669, + "learning_rate": 2.9487774461677725e-05, + "loss": 1.833, + "step": 42840 + }, + { + "epoch": 0.08345374456202119, + "grad_norm": 4.575281620025635, + "learning_rate": 2.9487417754910587e-05, + "loss": 1.8558, + "step": 42855 + }, + { + "epoch": 0.08348295483313146, + "grad_norm": 2.5189337730407715, + "learning_rate": 2.9487060926142868e-05, + "loss": 1.8171, + "step": 42870 + }, + { + "epoch": 0.08351216510424173, + "grad_norm": 2.8013970851898193, + "learning_rate": 2.948670397537757e-05, + "loss": 2.0007, + "step": 42885 + }, + { + "epoch": 0.08354137537535199, + "grad_norm": 4.385171890258789, + "learning_rate": 2.94863469026177e-05, + "loss": 1.791, + "step": 42900 + }, + { + "epoch": 0.08357058564646225, + "grad_norm": 3.104492425918579, + "learning_rate": 2.9485989707866266e-05, + "loss": 1.8535, + "step": 42915 + }, + { + "epoch": 0.08359979591757251, + "grad_norm": 1.912163496017456, + "learning_rate": 2.9485632391126274e-05, + "loss": 1.7938, + "step": 42930 + }, + { + "epoch": 0.08362900618868277, + "grad_norm": 3.2798397541046143, + "learning_rate": 2.9485274952400738e-05, + "loss": 1.8626, + "step": 42945 + }, + { + "epoch": 0.08365821645979303, + "grad_norm": 3.1956722736358643, + "learning_rate": 2.9484917391692663e-05, + "loss": 1.9577, + "step": 42960 + }, + { + "epoch": 0.08368742673090329, + "grad_norm": 4.458518028259277, + "learning_rate": 2.9484559709005063e-05, + "loss": 1.9054, + "step": 42975 + }, + { + "epoch": 0.08371663700201357, + "grad_norm": 2.3765296936035156, + "learning_rate": 2.9484201904340944e-05, + "loss": 2.0728, + "step": 42990 + }, + { + "epoch": 0.08374584727312383, + "grad_norm": 2.685070514678955, + "learning_rate": 2.9483843977703322e-05, + "loss": 2.0102, + "step": 43005 + }, + { + "epoch": 0.08377505754423409, + "grad_norm": 2.6258368492126465, + "learning_rate": 2.948348592909522e-05, + "loss": 1.8041, + "step": 43020 + }, + { + "epoch": 0.08380426781534435, + "grad_norm": 2.5244250297546387, + "learning_rate": 2.948312775851964e-05, + "loss": 1.8507, + "step": 43035 + }, + { + "epoch": 0.08383347808645461, + "grad_norm": 2.486104965209961, + "learning_rate": 2.948276946597961e-05, + "loss": 1.8822, + "step": 43050 + }, + { + "epoch": 0.08386268835756487, + "grad_norm": 3.720233917236328, + "learning_rate": 2.9482411051478137e-05, + "loss": 1.6757, + "step": 43065 + }, + { + "epoch": 0.08389189862867513, + "grad_norm": 2.1805083751678467, + "learning_rate": 2.9482052515018247e-05, + "loss": 1.9184, + "step": 43080 + }, + { + "epoch": 0.08392110889978541, + "grad_norm": 6.519444465637207, + "learning_rate": 2.9481693856602953e-05, + "loss": 1.8735, + "step": 43095 + }, + { + "epoch": 0.08395031917089567, + "grad_norm": 3.030322790145874, + "learning_rate": 2.9481335076235277e-05, + "loss": 2.0388, + "step": 43110 + }, + { + "epoch": 0.08397952944200593, + "grad_norm": 3.333339214324951, + "learning_rate": 2.9480976173918245e-05, + "loss": 1.7558, + "step": 43125 + }, + { + "epoch": 0.08400873971311619, + "grad_norm": 2.7111990451812744, + "learning_rate": 2.9480617149654872e-05, + "loss": 1.7693, + "step": 43140 + }, + { + "epoch": 0.08403794998422645, + "grad_norm": 3.3287692070007324, + "learning_rate": 2.948025800344819e-05, + "loss": 1.8074, + "step": 43155 + }, + { + "epoch": 0.08406716025533671, + "grad_norm": 2.8257551193237305, + "learning_rate": 2.9479898735301217e-05, + "loss": 2.0247, + "step": 43170 + }, + { + "epoch": 0.08409637052644697, + "grad_norm": 2.685481071472168, + "learning_rate": 2.947953934521698e-05, + "loss": 1.8955, + "step": 43185 + }, + { + "epoch": 0.08412558079755725, + "grad_norm": 2.8256003856658936, + "learning_rate": 2.9479179833198504e-05, + "loss": 1.6823, + "step": 43200 + }, + { + "epoch": 0.08415479106866751, + "grad_norm": 2.5234475135803223, + "learning_rate": 2.947882019924882e-05, + "loss": 1.8303, + "step": 43215 + }, + { + "epoch": 0.08418400133977777, + "grad_norm": 2.968050003051758, + "learning_rate": 2.9478460443370956e-05, + "loss": 1.8881, + "step": 43230 + }, + { + "epoch": 0.08421321161088803, + "grad_norm": 2.8748507499694824, + "learning_rate": 2.947810056556794e-05, + "loss": 1.918, + "step": 43245 + }, + { + "epoch": 0.08424242188199829, + "grad_norm": 3.759028434753418, + "learning_rate": 2.94777405658428e-05, + "loss": 1.7358, + "step": 43260 + }, + { + "epoch": 0.08427163215310855, + "grad_norm": 2.4832756519317627, + "learning_rate": 2.947738044419857e-05, + "loss": 1.8534, + "step": 43275 + }, + { + "epoch": 0.08430084242421881, + "grad_norm": 2.4439778327941895, + "learning_rate": 2.9477020200638287e-05, + "loss": 1.6968, + "step": 43290 + }, + { + "epoch": 0.08433005269532909, + "grad_norm": 3.8299081325531006, + "learning_rate": 2.947665983516498e-05, + "loss": 1.9434, + "step": 43305 + }, + { + "epoch": 0.08435926296643935, + "grad_norm": 2.444437026977539, + "learning_rate": 2.947629934778168e-05, + "loss": 1.7078, + "step": 43320 + }, + { + "epoch": 0.08438847323754961, + "grad_norm": 4.316204071044922, + "learning_rate": 2.9475938738491432e-05, + "loss": 2.0225, + "step": 43335 + }, + { + "epoch": 0.08441768350865987, + "grad_norm": 2.716829538345337, + "learning_rate": 2.947557800729727e-05, + "loss": 1.8284, + "step": 43350 + }, + { + "epoch": 0.08444689377977013, + "grad_norm": 1.8645912408828735, + "learning_rate": 2.9475217154202225e-05, + "loss": 1.8862, + "step": 43365 + }, + { + "epoch": 0.0844761040508804, + "grad_norm": 2.9769399166107178, + "learning_rate": 2.9474856179209343e-05, + "loss": 1.9656, + "step": 43380 + }, + { + "epoch": 0.08450531432199065, + "grad_norm": 2.7728824615478516, + "learning_rate": 2.947449508232166e-05, + "loss": 1.8666, + "step": 43395 + }, + { + "epoch": 0.08453452459310093, + "grad_norm": 2.3458614349365234, + "learning_rate": 2.9474133863542216e-05, + "loss": 1.9181, + "step": 43410 + }, + { + "epoch": 0.08456373486421119, + "grad_norm": 2.8927040100097656, + "learning_rate": 2.9473772522874057e-05, + "loss": 1.8441, + "step": 43425 + }, + { + "epoch": 0.08459294513532145, + "grad_norm": 3.4922873973846436, + "learning_rate": 2.9473411060320226e-05, + "loss": 1.9061, + "step": 43440 + }, + { + "epoch": 0.08462215540643171, + "grad_norm": 2.4876444339752197, + "learning_rate": 2.9473049475883762e-05, + "loss": 1.8024, + "step": 43455 + }, + { + "epoch": 0.08465136567754197, + "grad_norm": 2.6545233726501465, + "learning_rate": 2.9472687769567716e-05, + "loss": 1.764, + "step": 43470 + }, + { + "epoch": 0.08468057594865223, + "grad_norm": 3.1122703552246094, + "learning_rate": 2.947232594137513e-05, + "loss": 1.921, + "step": 43485 + }, + { + "epoch": 0.0847097862197625, + "grad_norm": 3.626193046569824, + "learning_rate": 2.947196399130905e-05, + "loss": 1.7188, + "step": 43500 + }, + { + "epoch": 0.08473899649087277, + "grad_norm": 3.0047879219055176, + "learning_rate": 2.9471601919372522e-05, + "loss": 1.9129, + "step": 43515 + }, + { + "epoch": 0.08476820676198303, + "grad_norm": 4.2390456199646, + "learning_rate": 2.9471239725568606e-05, + "loss": 1.7775, + "step": 43530 + }, + { + "epoch": 0.08479741703309329, + "grad_norm": 2.4242374897003174, + "learning_rate": 2.947087740990034e-05, + "loss": 1.9588, + "step": 43545 + }, + { + "epoch": 0.08482662730420355, + "grad_norm": 3.0172243118286133, + "learning_rate": 2.9470514972370784e-05, + "loss": 1.7774, + "step": 43560 + }, + { + "epoch": 0.08485583757531381, + "grad_norm": 2.9357850551605225, + "learning_rate": 2.9470152412982985e-05, + "loss": 1.8818, + "step": 43575 + }, + { + "epoch": 0.08488504784642407, + "grad_norm": 3.8699593544006348, + "learning_rate": 2.9469789731739996e-05, + "loss": 1.8835, + "step": 43590 + }, + { + "epoch": 0.08491425811753434, + "grad_norm": 6.237668037414551, + "learning_rate": 2.9469426928644877e-05, + "loss": 1.8208, + "step": 43605 + }, + { + "epoch": 0.08494346838864461, + "grad_norm": 3.9943366050720215, + "learning_rate": 2.946906400370067e-05, + "loss": 1.7787, + "step": 43620 + }, + { + "epoch": 0.08497267865975487, + "grad_norm": 3.0211308002471924, + "learning_rate": 2.946870095691045e-05, + "loss": 1.9552, + "step": 43635 + }, + { + "epoch": 0.08500188893086513, + "grad_norm": 3.7002623081207275, + "learning_rate": 2.9468337788277255e-05, + "loss": 1.7297, + "step": 43650 + }, + { + "epoch": 0.0850310992019754, + "grad_norm": 3.6715025901794434, + "learning_rate": 2.946797449780416e-05, + "loss": 1.9472, + "step": 43665 + }, + { + "epoch": 0.08506030947308565, + "grad_norm": 3.456092596054077, + "learning_rate": 2.9467611085494212e-05, + "loss": 1.8751, + "step": 43680 + }, + { + "epoch": 0.08508951974419592, + "grad_norm": 5.426120758056641, + "learning_rate": 2.946724755135048e-05, + "loss": 1.8391, + "step": 43695 + }, + { + "epoch": 0.08511873001530618, + "grad_norm": 2.2696099281311035, + "learning_rate": 2.946688389537602e-05, + "loss": 1.7744, + "step": 43710 + }, + { + "epoch": 0.08514794028641645, + "grad_norm": 4.38014030456543, + "learning_rate": 2.9466520117573897e-05, + "loss": 2.0601, + "step": 43725 + }, + { + "epoch": 0.08517715055752671, + "grad_norm": 2.5647072792053223, + "learning_rate": 2.946615621794717e-05, + "loss": 1.8444, + "step": 43740 + }, + { + "epoch": 0.08520636082863697, + "grad_norm": 2.123666286468506, + "learning_rate": 2.946579219649891e-05, + "loss": 1.923, + "step": 43755 + }, + { + "epoch": 0.08523557109974723, + "grad_norm": 2.335630178451538, + "learning_rate": 2.946542805323218e-05, + "loss": 1.8607, + "step": 43770 + }, + { + "epoch": 0.0852647813708575, + "grad_norm": 2.300090789794922, + "learning_rate": 2.946506378815004e-05, + "loss": 1.8034, + "step": 43785 + }, + { + "epoch": 0.08529399164196776, + "grad_norm": 2.441096067428589, + "learning_rate": 2.946469940125557e-05, + "loss": 1.899, + "step": 43800 + }, + { + "epoch": 0.08532320191307802, + "grad_norm": 3.2876861095428467, + "learning_rate": 2.946433489255183e-05, + "loss": 1.8932, + "step": 43815 + }, + { + "epoch": 0.08535241218418829, + "grad_norm": 3.2221438884735107, + "learning_rate": 2.946397026204189e-05, + "loss": 1.8407, + "step": 43830 + }, + { + "epoch": 0.08538162245529855, + "grad_norm": 1.7918709516525269, + "learning_rate": 2.9463605509728823e-05, + "loss": 1.8096, + "step": 43845 + }, + { + "epoch": 0.08541083272640881, + "grad_norm": 3.6342146396636963, + "learning_rate": 2.9463240635615704e-05, + "loss": 1.8982, + "step": 43860 + }, + { + "epoch": 0.08544004299751908, + "grad_norm": 3.7016751766204834, + "learning_rate": 2.9462875639705598e-05, + "loss": 1.7444, + "step": 43875 + }, + { + "epoch": 0.08546925326862934, + "grad_norm": 4.3460259437561035, + "learning_rate": 2.946251052200158e-05, + "loss": 1.8327, + "step": 43890 + }, + { + "epoch": 0.0854984635397396, + "grad_norm": 2.2607274055480957, + "learning_rate": 2.946214528250673e-05, + "loss": 1.9111, + "step": 43905 + }, + { + "epoch": 0.08552767381084986, + "grad_norm": 2.9234113693237305, + "learning_rate": 2.9461779921224118e-05, + "loss": 1.8753, + "step": 43920 + }, + { + "epoch": 0.08555688408196013, + "grad_norm": 4.067141532897949, + "learning_rate": 2.9461414438156825e-05, + "loss": 1.6669, + "step": 43935 + }, + { + "epoch": 0.0855860943530704, + "grad_norm": 2.427942991256714, + "learning_rate": 2.9461048833307926e-05, + "loss": 1.7331, + "step": 43950 + }, + { + "epoch": 0.08561530462418065, + "grad_norm": 2.4281039237976074, + "learning_rate": 2.94606831066805e-05, + "loss": 1.9844, + "step": 43965 + }, + { + "epoch": 0.08564451489529092, + "grad_norm": 4.457733154296875, + "learning_rate": 2.946031725827763e-05, + "loss": 2.0372, + "step": 43980 + }, + { + "epoch": 0.08567372516640118, + "grad_norm": 4.074513912200928, + "learning_rate": 2.94599512881024e-05, + "loss": 1.8984, + "step": 43995 + }, + { + "epoch": 0.08570293543751144, + "grad_norm": 4.201878547668457, + "learning_rate": 2.9459585196157876e-05, + "loss": 1.9715, + "step": 44010 + }, + { + "epoch": 0.0857321457086217, + "grad_norm": 4.125231742858887, + "learning_rate": 2.9459218982447156e-05, + "loss": 1.709, + "step": 44025 + }, + { + "epoch": 0.08576135597973196, + "grad_norm": 4.726757526397705, + "learning_rate": 2.945885264697332e-05, + "loss": 1.784, + "step": 44040 + }, + { + "epoch": 0.08579056625084223, + "grad_norm": 2.9422481060028076, + "learning_rate": 2.945848618973945e-05, + "loss": 1.9584, + "step": 44055 + }, + { + "epoch": 0.0858197765219525, + "grad_norm": 3.583988666534424, + "learning_rate": 2.9458119610748635e-05, + "loss": 1.9566, + "step": 44070 + }, + { + "epoch": 0.08584898679306276, + "grad_norm": 3.0787510871887207, + "learning_rate": 2.9457752910003965e-05, + "loss": 1.8864, + "step": 44085 + }, + { + "epoch": 0.08587819706417302, + "grad_norm": 2.8716557025909424, + "learning_rate": 2.9457386087508517e-05, + "loss": 1.8846, + "step": 44100 + }, + { + "epoch": 0.08590740733528328, + "grad_norm": 2.74885630607605, + "learning_rate": 2.9457019143265392e-05, + "loss": 1.8493, + "step": 44115 + }, + { + "epoch": 0.08593661760639354, + "grad_norm": 2.7693309783935547, + "learning_rate": 2.945665207727768e-05, + "loss": 1.9983, + "step": 44130 + }, + { + "epoch": 0.0859658278775038, + "grad_norm": 4.332008361816406, + "learning_rate": 2.945628488954846e-05, + "loss": 1.6724, + "step": 44145 + }, + { + "epoch": 0.08599503814861408, + "grad_norm": 3.0237069129943848, + "learning_rate": 2.9455917580080834e-05, + "loss": 1.9695, + "step": 44160 + }, + { + "epoch": 0.08602424841972434, + "grad_norm": 3.4789276123046875, + "learning_rate": 2.9455550148877895e-05, + "loss": 1.9119, + "step": 44175 + }, + { + "epoch": 0.0860534586908346, + "grad_norm": 4.246069431304932, + "learning_rate": 2.9455182595942735e-05, + "loss": 1.8591, + "step": 44190 + }, + { + "epoch": 0.08608266896194486, + "grad_norm": 3.916719436645508, + "learning_rate": 2.9454814921278444e-05, + "loss": 1.8525, + "step": 44205 + }, + { + "epoch": 0.08611187923305512, + "grad_norm": 3.07059383392334, + "learning_rate": 2.945444712488813e-05, + "loss": 1.814, + "step": 44220 + }, + { + "epoch": 0.08614108950416538, + "grad_norm": 1.955853819847107, + "learning_rate": 2.945407920677488e-05, + "loss": 1.8526, + "step": 44235 + }, + { + "epoch": 0.08617029977527564, + "grad_norm": 3.5619003772735596, + "learning_rate": 2.9453711166941797e-05, + "loss": 1.9248, + "step": 44250 + }, + { + "epoch": 0.08619951004638592, + "grad_norm": 3.539492607116699, + "learning_rate": 2.9453343005391984e-05, + "loss": 1.9684, + "step": 44265 + }, + { + "epoch": 0.08622872031749618, + "grad_norm": 3.498070240020752, + "learning_rate": 2.9452974722128533e-05, + "loss": 1.892, + "step": 44280 + }, + { + "epoch": 0.08625793058860644, + "grad_norm": 1.820780634880066, + "learning_rate": 2.9452606317154546e-05, + "loss": 1.8162, + "step": 44295 + }, + { + "epoch": 0.0862871408597167, + "grad_norm": 4.180066108703613, + "learning_rate": 2.9452237790473132e-05, + "loss": 1.8408, + "step": 44310 + }, + { + "epoch": 0.08631635113082696, + "grad_norm": 3.168524980545044, + "learning_rate": 2.945186914208739e-05, + "loss": 1.8724, + "step": 44325 + }, + { + "epoch": 0.08634556140193722, + "grad_norm": 3.31315541267395, + "learning_rate": 2.9451500372000425e-05, + "loss": 1.9657, + "step": 44340 + }, + { + "epoch": 0.08637477167304748, + "grad_norm": 3.3687546253204346, + "learning_rate": 2.9451131480215343e-05, + "loss": 2.011, + "step": 44355 + }, + { + "epoch": 0.08640398194415776, + "grad_norm": 2.4594132900238037, + "learning_rate": 2.945076246673525e-05, + "loss": 1.8296, + "step": 44370 + }, + { + "epoch": 0.08643319221526802, + "grad_norm": 2.2705142498016357, + "learning_rate": 2.9450393331563254e-05, + "loss": 1.7826, + "step": 44385 + }, + { + "epoch": 0.08646240248637828, + "grad_norm": 4.375278949737549, + "learning_rate": 2.9450024074702465e-05, + "loss": 1.7716, + "step": 44400 + }, + { + "epoch": 0.08649161275748854, + "grad_norm": 2.814610481262207, + "learning_rate": 2.944965469615599e-05, + "loss": 1.7126, + "step": 44415 + }, + { + "epoch": 0.0865208230285988, + "grad_norm": 2.329692840576172, + "learning_rate": 2.9449285195926942e-05, + "loss": 1.6665, + "step": 44430 + }, + { + "epoch": 0.08655003329970906, + "grad_norm": 2.890446901321411, + "learning_rate": 2.944891557401843e-05, + "loss": 1.6763, + "step": 44445 + }, + { + "epoch": 0.08657924357081932, + "grad_norm": 1.720476746559143, + "learning_rate": 2.9448545830433565e-05, + "loss": 1.8273, + "step": 44460 + }, + { + "epoch": 0.0866084538419296, + "grad_norm": 2.7548534870147705, + "learning_rate": 2.9448175965175465e-05, + "loss": 1.7968, + "step": 44475 + }, + { + "epoch": 0.08663766411303986, + "grad_norm": 2.2770633697509766, + "learning_rate": 2.9447805978247244e-05, + "loss": 1.8821, + "step": 44490 + }, + { + "epoch": 0.08666687438415012, + "grad_norm": 1.8714686632156372, + "learning_rate": 2.944743586965202e-05, + "loss": 1.8877, + "step": 44505 + }, + { + "epoch": 0.08669608465526038, + "grad_norm": 3.634554624557495, + "learning_rate": 2.9447065639392902e-05, + "loss": 1.8628, + "step": 44520 + }, + { + "epoch": 0.08672529492637064, + "grad_norm": 3.813159465789795, + "learning_rate": 2.9446695287473015e-05, + "loss": 1.9144, + "step": 44535 + }, + { + "epoch": 0.0867545051974809, + "grad_norm": 3.10882568359375, + "learning_rate": 2.9446324813895475e-05, + "loss": 1.8602, + "step": 44550 + }, + { + "epoch": 0.08678371546859116, + "grad_norm": 2.8320236206054688, + "learning_rate": 2.94459542186634e-05, + "loss": 1.8681, + "step": 44565 + }, + { + "epoch": 0.08681292573970144, + "grad_norm": 2.379601001739502, + "learning_rate": 2.9445583501779914e-05, + "loss": 2.0004, + "step": 44580 + }, + { + "epoch": 0.0868421360108117, + "grad_norm": 4.256722927093506, + "learning_rate": 2.9445212663248136e-05, + "loss": 1.8432, + "step": 44595 + }, + { + "epoch": 0.08687134628192196, + "grad_norm": 3.5287508964538574, + "learning_rate": 2.944484170307119e-05, + "loss": 2.0324, + "step": 44610 + }, + { + "epoch": 0.08690055655303222, + "grad_norm": 5.067081451416016, + "learning_rate": 2.9444470621252206e-05, + "loss": 1.7134, + "step": 44625 + }, + { + "epoch": 0.08692976682414248, + "grad_norm": 2.34912109375, + "learning_rate": 2.94440994177943e-05, + "loss": 1.9109, + "step": 44640 + }, + { + "epoch": 0.08695897709525274, + "grad_norm": 2.16899037361145, + "learning_rate": 2.9443728092700598e-05, + "loss": 1.939, + "step": 44655 + }, + { + "epoch": 0.086988187366363, + "grad_norm": 2.583625555038452, + "learning_rate": 2.9443356645974234e-05, + "loss": 1.9217, + "step": 44670 + }, + { + "epoch": 0.08701739763747328, + "grad_norm": 3.3169472217559814, + "learning_rate": 2.9442985077618333e-05, + "loss": 1.9824, + "step": 44685 + }, + { + "epoch": 0.08704660790858354, + "grad_norm": 3.411571979522705, + "learning_rate": 2.944261338763602e-05, + "loss": 1.865, + "step": 44700 + }, + { + "epoch": 0.0870758181796938, + "grad_norm": 2.273318290710449, + "learning_rate": 2.9442241576030427e-05, + "loss": 1.7771, + "step": 44715 + }, + { + "epoch": 0.08710502845080406, + "grad_norm": 1.9476948976516724, + "learning_rate": 2.944186964280469e-05, + "loss": 1.8675, + "step": 44730 + }, + { + "epoch": 0.08713423872191432, + "grad_norm": 2.1292243003845215, + "learning_rate": 2.944149758796194e-05, + "loss": 1.7585, + "step": 44745 + }, + { + "epoch": 0.08716344899302458, + "grad_norm": 3.4270260334014893, + "learning_rate": 2.9441125411505303e-05, + "loss": 1.7854, + "step": 44760 + }, + { + "epoch": 0.08719265926413484, + "grad_norm": 2.7164952754974365, + "learning_rate": 2.944075311343792e-05, + "loss": 1.8049, + "step": 44775 + }, + { + "epoch": 0.08722186953524512, + "grad_norm": 4.517385005950928, + "learning_rate": 2.9440380693762927e-05, + "loss": 1.7206, + "step": 44790 + }, + { + "epoch": 0.08725107980635538, + "grad_norm": 4.334033489227295, + "learning_rate": 2.9440008152483452e-05, + "loss": 1.7694, + "step": 44805 + }, + { + "epoch": 0.08728029007746564, + "grad_norm": 3.0128273963928223, + "learning_rate": 2.943963548960264e-05, + "loss": 1.8354, + "step": 44820 + }, + { + "epoch": 0.0873095003485759, + "grad_norm": 6.147031784057617, + "learning_rate": 2.9439262705123626e-05, + "loss": 1.7589, + "step": 44835 + }, + { + "epoch": 0.08733871061968616, + "grad_norm": 3.1377012729644775, + "learning_rate": 2.943888979904955e-05, + "loss": 1.8435, + "step": 44850 + }, + { + "epoch": 0.08736792089079642, + "grad_norm": 3.672729969024658, + "learning_rate": 2.9438516771383554e-05, + "loss": 1.765, + "step": 44865 + }, + { + "epoch": 0.08739713116190669, + "grad_norm": 2.4337987899780273, + "learning_rate": 2.9438143622128772e-05, + "loss": 1.8002, + "step": 44880 + }, + { + "epoch": 0.08742634143301696, + "grad_norm": 2.0574724674224854, + "learning_rate": 2.9437770351288357e-05, + "loss": 1.6424, + "step": 44895 + }, + { + "epoch": 0.08745555170412722, + "grad_norm": 2.1482346057891846, + "learning_rate": 2.9437396958865446e-05, + "loss": 1.92, + "step": 44910 + }, + { + "epoch": 0.08748476197523748, + "grad_norm": 2.3287675380706787, + "learning_rate": 2.9437023444863187e-05, + "loss": 1.9602, + "step": 44925 + }, + { + "epoch": 0.08751397224634774, + "grad_norm": 3.350952625274658, + "learning_rate": 2.943664980928472e-05, + "loss": 1.8386, + "step": 44940 + }, + { + "epoch": 0.087543182517458, + "grad_norm": 3.5424532890319824, + "learning_rate": 2.9436276052133196e-05, + "loss": 1.878, + "step": 44955 + }, + { + "epoch": 0.08757239278856827, + "grad_norm": 2.377460241317749, + "learning_rate": 2.943590217341176e-05, + "loss": 1.9263, + "step": 44970 + }, + { + "epoch": 0.08760160305967853, + "grad_norm": 3.12561297416687, + "learning_rate": 2.943552817312356e-05, + "loss": 1.9618, + "step": 44985 + }, + { + "epoch": 0.0876308133307888, + "grad_norm": 2.070358991622925, + "learning_rate": 2.9435154051271748e-05, + "loss": 1.7616, + "step": 45000 + }, + { + "epoch": 0.08766002360189906, + "grad_norm": 1.9374375343322754, + "learning_rate": 2.9434779807859477e-05, + "loss": 1.9208, + "step": 45015 + }, + { + "epoch": 0.08768923387300932, + "grad_norm": 5.294185161590576, + "learning_rate": 2.943440544288989e-05, + "loss": 1.8057, + "step": 45030 + }, + { + "epoch": 0.08771844414411958, + "grad_norm": 3.713681697845459, + "learning_rate": 2.943403095636615e-05, + "loss": 1.8673, + "step": 45045 + }, + { + "epoch": 0.08774765441522984, + "grad_norm": 2.3561294078826904, + "learning_rate": 2.94336563482914e-05, + "loss": 1.8507, + "step": 45060 + }, + { + "epoch": 0.0877768646863401, + "grad_norm": 2.0783450603485107, + "learning_rate": 2.9433281618668805e-05, + "loss": 1.8737, + "step": 45075 + }, + { + "epoch": 0.08780607495745037, + "grad_norm": 3.277926445007324, + "learning_rate": 2.943290676750151e-05, + "loss": 1.9591, + "step": 45090 + }, + { + "epoch": 0.08783528522856063, + "grad_norm": 3.0757181644439697, + "learning_rate": 2.9432531794792683e-05, + "loss": 1.9447, + "step": 45105 + }, + { + "epoch": 0.0878644954996709, + "grad_norm": 1.8041589260101318, + "learning_rate": 2.943215670054547e-05, + "loss": 1.8033, + "step": 45120 + }, + { + "epoch": 0.08789370577078116, + "grad_norm": 2.865537405014038, + "learning_rate": 2.9431781484763037e-05, + "loss": 1.9961, + "step": 45135 + }, + { + "epoch": 0.08792291604189142, + "grad_norm": 2.375731945037842, + "learning_rate": 2.9431406147448545e-05, + "loss": 1.8439, + "step": 45150 + }, + { + "epoch": 0.08795212631300169, + "grad_norm": 3.0966978073120117, + "learning_rate": 2.9431030688605154e-05, + "loss": 2.1031, + "step": 45165 + }, + { + "epoch": 0.08798133658411195, + "grad_norm": 2.7011566162109375, + "learning_rate": 2.9430655108236018e-05, + "loss": 1.7007, + "step": 45180 + }, + { + "epoch": 0.08801054685522221, + "grad_norm": 2.7523937225341797, + "learning_rate": 2.943027940634431e-05, + "loss": 1.8057, + "step": 45195 + }, + { + "epoch": 0.08803975712633247, + "grad_norm": 3.8090713024139404, + "learning_rate": 2.9429903582933186e-05, + "loss": 2.0009, + "step": 45210 + }, + { + "epoch": 0.08806896739744274, + "grad_norm": 2.986457586288452, + "learning_rate": 2.9429527638005816e-05, + "loss": 1.7333, + "step": 45225 + }, + { + "epoch": 0.088098177668553, + "grad_norm": 2.3911290168762207, + "learning_rate": 2.942915157156537e-05, + "loss": 1.72, + "step": 45240 + }, + { + "epoch": 0.08812738793966327, + "grad_norm": 3.72906756401062, + "learning_rate": 2.9428775383614998e-05, + "loss": 2.0707, + "step": 45255 + }, + { + "epoch": 0.08815659821077353, + "grad_norm": 3.646193504333496, + "learning_rate": 2.942839907415789e-05, + "loss": 1.808, + "step": 45270 + }, + { + "epoch": 0.08818580848188379, + "grad_norm": 2.3747620582580566, + "learning_rate": 2.9428022643197196e-05, + "loss": 1.8695, + "step": 45285 + }, + { + "epoch": 0.08821501875299405, + "grad_norm": 4.011888027191162, + "learning_rate": 2.94276460907361e-05, + "loss": 1.7226, + "step": 45300 + }, + { + "epoch": 0.08824422902410431, + "grad_norm": 5.516124248504639, + "learning_rate": 2.9427269416777765e-05, + "loss": 1.8723, + "step": 45315 + }, + { + "epoch": 0.08827343929521458, + "grad_norm": 3.8080711364746094, + "learning_rate": 2.9426892621325364e-05, + "loss": 2.1226, + "step": 45330 + }, + { + "epoch": 0.08830264956632485, + "grad_norm": 3.3641180992126465, + "learning_rate": 2.9426515704382073e-05, + "loss": 1.9913, + "step": 45345 + }, + { + "epoch": 0.0883318598374351, + "grad_norm": 2.9782888889312744, + "learning_rate": 2.9426138665951062e-05, + "loss": 1.9224, + "step": 45360 + }, + { + "epoch": 0.08836107010854537, + "grad_norm": 3.1421496868133545, + "learning_rate": 2.9425761506035506e-05, + "loss": 1.7136, + "step": 45375 + }, + { + "epoch": 0.08839028037965563, + "grad_norm": 3.040264844894409, + "learning_rate": 2.942538422463859e-05, + "loss": 1.7175, + "step": 45390 + }, + { + "epoch": 0.08841949065076589, + "grad_norm": 2.179347038269043, + "learning_rate": 2.942500682176348e-05, + "loss": 1.7239, + "step": 45405 + }, + { + "epoch": 0.08844870092187615, + "grad_norm": 2.941560983657837, + "learning_rate": 2.9424629297413365e-05, + "loss": 1.7729, + "step": 45420 + }, + { + "epoch": 0.08847791119298642, + "grad_norm": 3.767063617706299, + "learning_rate": 2.942425165159141e-05, + "loss": 1.8561, + "step": 45435 + }, + { + "epoch": 0.08850712146409669, + "grad_norm": 2.905930757522583, + "learning_rate": 2.942387388430081e-05, + "loss": 1.9278, + "step": 45450 + }, + { + "epoch": 0.08853633173520695, + "grad_norm": 4.0752458572387695, + "learning_rate": 2.9423495995544735e-05, + "loss": 1.9129, + "step": 45465 + }, + { + "epoch": 0.08856554200631721, + "grad_norm": 3.448967218399048, + "learning_rate": 2.942311798532637e-05, + "loss": 2.0275, + "step": 45480 + }, + { + "epoch": 0.08859475227742747, + "grad_norm": 3.87823748588562, + "learning_rate": 2.9422739853648903e-05, + "loss": 1.81, + "step": 45495 + }, + { + "epoch": 0.08862396254853773, + "grad_norm": 2.705256938934326, + "learning_rate": 2.9422361600515515e-05, + "loss": 1.8793, + "step": 45510 + }, + { + "epoch": 0.08865317281964799, + "grad_norm": 3.3810877799987793, + "learning_rate": 2.942198322592939e-05, + "loss": 1.9076, + "step": 45525 + }, + { + "epoch": 0.08868238309075827, + "grad_norm": 1.9039440155029297, + "learning_rate": 2.9421604729893716e-05, + "loss": 2.1173, + "step": 45540 + }, + { + "epoch": 0.08871159336186853, + "grad_norm": 2.4718310832977295, + "learning_rate": 2.942122611241168e-05, + "loss": 1.8903, + "step": 45555 + }, + { + "epoch": 0.08874080363297879, + "grad_norm": 1.8678919076919556, + "learning_rate": 2.942084737348647e-05, + "loss": 1.9504, + "step": 45570 + }, + { + "epoch": 0.08877001390408905, + "grad_norm": 4.566815376281738, + "learning_rate": 2.942046851312128e-05, + "loss": 1.8323, + "step": 45585 + }, + { + "epoch": 0.08879922417519931, + "grad_norm": 2.7292160987854004, + "learning_rate": 2.9420089531319295e-05, + "loss": 1.896, + "step": 45600 + }, + { + "epoch": 0.08882843444630957, + "grad_norm": 4.215520858764648, + "learning_rate": 2.9419710428083705e-05, + "loss": 1.9013, + "step": 45615 + }, + { + "epoch": 0.08885764471741983, + "grad_norm": 4.495998859405518, + "learning_rate": 2.941933120341771e-05, + "loss": 2.0898, + "step": 45630 + }, + { + "epoch": 0.0888868549885301, + "grad_norm": 4.300086975097656, + "learning_rate": 2.94189518573245e-05, + "loss": 2.0319, + "step": 45645 + }, + { + "epoch": 0.08891606525964037, + "grad_norm": 2.787299156188965, + "learning_rate": 2.9418572389807263e-05, + "loss": 1.9491, + "step": 45660 + }, + { + "epoch": 0.08894527553075063, + "grad_norm": 3.497532844543457, + "learning_rate": 2.94181928008692e-05, + "loss": 1.852, + "step": 45675 + }, + { + "epoch": 0.08897448580186089, + "grad_norm": 2.902589797973633, + "learning_rate": 2.941781309051351e-05, + "loss": 1.8578, + "step": 45690 + }, + { + "epoch": 0.08900369607297115, + "grad_norm": 4.419853210449219, + "learning_rate": 2.9417433258743386e-05, + "loss": 1.8007, + "step": 45705 + }, + { + "epoch": 0.08903290634408141, + "grad_norm": 1.7236632108688354, + "learning_rate": 2.941705330556203e-05, + "loss": 2.0065, + "step": 45720 + }, + { + "epoch": 0.08906211661519167, + "grad_norm": 3.044177532196045, + "learning_rate": 2.9416673230972642e-05, + "loss": 1.7847, + "step": 45735 + }, + { + "epoch": 0.08909132688630195, + "grad_norm": 4.658002853393555, + "learning_rate": 2.941629303497842e-05, + "loss": 1.6592, + "step": 45750 + }, + { + "epoch": 0.08912053715741221, + "grad_norm": 5.669498443603516, + "learning_rate": 2.9415912717582572e-05, + "loss": 1.7555, + "step": 45765 + }, + { + "epoch": 0.08914974742852247, + "grad_norm": 3.3243215084075928, + "learning_rate": 2.9415532278788288e-05, + "loss": 1.854, + "step": 45780 + }, + { + "epoch": 0.08917895769963273, + "grad_norm": 2.8647665977478027, + "learning_rate": 2.9415151718598783e-05, + "loss": 1.9471, + "step": 45795 + }, + { + "epoch": 0.08920816797074299, + "grad_norm": 2.4509527683258057, + "learning_rate": 2.9414771037017256e-05, + "loss": 1.6894, + "step": 45810 + }, + { + "epoch": 0.08923737824185325, + "grad_norm": 2.953076124191284, + "learning_rate": 2.9414390234046916e-05, + "loss": 1.5692, + "step": 45825 + }, + { + "epoch": 0.08926658851296351, + "grad_norm": 3.253838300704956, + "learning_rate": 2.9414009309690968e-05, + "loss": 1.85, + "step": 45840 + }, + { + "epoch": 0.08929579878407379, + "grad_norm": 2.290224075317383, + "learning_rate": 2.9413628263952616e-05, + "loss": 2.0317, + "step": 45855 + }, + { + "epoch": 0.08932500905518405, + "grad_norm": 4.230685234069824, + "learning_rate": 2.941324709683508e-05, + "loss": 1.7835, + "step": 45870 + }, + { + "epoch": 0.08935421932629431, + "grad_norm": 2.5834014415740967, + "learning_rate": 2.941286580834156e-05, + "loss": 1.8562, + "step": 45885 + }, + { + "epoch": 0.08938342959740457, + "grad_norm": 3.2415804862976074, + "learning_rate": 2.941248439847527e-05, + "loss": 1.7909, + "step": 45900 + }, + { + "epoch": 0.08941263986851483, + "grad_norm": 4.316317081451416, + "learning_rate": 2.941210286723942e-05, + "loss": 1.7632, + "step": 45915 + }, + { + "epoch": 0.08944185013962509, + "grad_norm": 2.109940528869629, + "learning_rate": 2.941172121463723e-05, + "loss": 1.6742, + "step": 45930 + }, + { + "epoch": 0.08947106041073535, + "grad_norm": 3.1508097648620605, + "learning_rate": 2.9411339440671907e-05, + "loss": 2.0397, + "step": 45945 + }, + { + "epoch": 0.08950027068184563, + "grad_norm": 3.212796211242676, + "learning_rate": 2.9410957545346665e-05, + "loss": 1.8796, + "step": 45960 + }, + { + "epoch": 0.08952948095295589, + "grad_norm": 3.6629080772399902, + "learning_rate": 2.9410575528664725e-05, + "loss": 1.9981, + "step": 45975 + }, + { + "epoch": 0.08955869122406615, + "grad_norm": 3.9494054317474365, + "learning_rate": 2.94101933906293e-05, + "loss": 1.8718, + "step": 45990 + }, + { + "epoch": 0.08958790149517641, + "grad_norm": 4.630847930908203, + "learning_rate": 2.9409811131243612e-05, + "loss": 1.9543, + "step": 46005 + }, + { + "epoch": 0.08961711176628667, + "grad_norm": 3.8639118671417236, + "learning_rate": 2.9409428750510873e-05, + "loss": 2.0264, + "step": 46020 + }, + { + "epoch": 0.08964632203739693, + "grad_norm": 4.643235683441162, + "learning_rate": 2.9409046248434314e-05, + "loss": 1.9919, + "step": 46035 + }, + { + "epoch": 0.0896755323085072, + "grad_norm": 2.777958393096924, + "learning_rate": 2.9408663625017145e-05, + "loss": 1.9507, + "step": 46050 + }, + { + "epoch": 0.08970474257961747, + "grad_norm": 3.0993664264678955, + "learning_rate": 2.9408280880262595e-05, + "loss": 1.6426, + "step": 46065 + }, + { + "epoch": 0.08973395285072773, + "grad_norm": 3.035386323928833, + "learning_rate": 2.9407898014173888e-05, + "loss": 2.0447, + "step": 46080 + }, + { + "epoch": 0.08976316312183799, + "grad_norm": 4.014139175415039, + "learning_rate": 2.9407515026754244e-05, + "loss": 1.6994, + "step": 46095 + }, + { + "epoch": 0.08979237339294825, + "grad_norm": 2.161834716796875, + "learning_rate": 2.9407131918006888e-05, + "loss": 1.9796, + "step": 46110 + }, + { + "epoch": 0.08982158366405851, + "grad_norm": 2.8618903160095215, + "learning_rate": 2.940674868793505e-05, + "loss": 1.714, + "step": 46125 + }, + { + "epoch": 0.08985079393516877, + "grad_norm": 3.7389118671417236, + "learning_rate": 2.9406365336541953e-05, + "loss": 1.7062, + "step": 46140 + }, + { + "epoch": 0.08988000420627903, + "grad_norm": 2.665452241897583, + "learning_rate": 2.9405981863830828e-05, + "loss": 1.8467, + "step": 46155 + }, + { + "epoch": 0.0899092144773893, + "grad_norm": 3.7498791217803955, + "learning_rate": 2.9405598269804903e-05, + "loss": 1.7833, + "step": 46170 + }, + { + "epoch": 0.08993842474849957, + "grad_norm": 4.2607035636901855, + "learning_rate": 2.9405214554467404e-05, + "loss": 1.9136, + "step": 46185 + }, + { + "epoch": 0.08996763501960983, + "grad_norm": 3.001507043838501, + "learning_rate": 2.9404830717821577e-05, + "loss": 1.917, + "step": 46200 + }, + { + "epoch": 0.08999684529072009, + "grad_norm": 2.4021408557891846, + "learning_rate": 2.9404446759870636e-05, + "loss": 1.9636, + "step": 46215 + }, + { + "epoch": 0.09002605556183035, + "grad_norm": 3.0223772525787354, + "learning_rate": 2.940406268061783e-05, + "loss": 2.0063, + "step": 46230 + }, + { + "epoch": 0.09005526583294061, + "grad_norm": 3.279000759124756, + "learning_rate": 2.9403678480066383e-05, + "loss": 1.7461, + "step": 46245 + }, + { + "epoch": 0.09008447610405088, + "grad_norm": 4.059992790222168, + "learning_rate": 2.9403294158219534e-05, + "loss": 1.8883, + "step": 46260 + }, + { + "epoch": 0.09011368637516114, + "grad_norm": 1.6985243558883667, + "learning_rate": 2.940290971508052e-05, + "loss": 1.8436, + "step": 46275 + }, + { + "epoch": 0.09014289664627141, + "grad_norm": 4.409860610961914, + "learning_rate": 2.940252515065258e-05, + "loss": 1.6792, + "step": 46290 + }, + { + "epoch": 0.09017210691738167, + "grad_norm": 3.3904550075531006, + "learning_rate": 2.9402140464938943e-05, + "loss": 1.9071, + "step": 46305 + }, + { + "epoch": 0.09020131718849193, + "grad_norm": 2.881351947784424, + "learning_rate": 2.9401755657942858e-05, + "loss": 1.808, + "step": 46320 + }, + { + "epoch": 0.0902305274596022, + "grad_norm": 3.654620409011841, + "learning_rate": 2.9401370729667565e-05, + "loss": 1.9737, + "step": 46335 + }, + { + "epoch": 0.09025973773071246, + "grad_norm": 4.317416667938232, + "learning_rate": 2.9400985680116304e-05, + "loss": 1.696, + "step": 46350 + }, + { + "epoch": 0.09028894800182272, + "grad_norm": 2.8537566661834717, + "learning_rate": 2.940060050929232e-05, + "loss": 1.8523, + "step": 46365 + }, + { + "epoch": 0.09031815827293298, + "grad_norm": 3.021620273590088, + "learning_rate": 2.940021521719885e-05, + "loss": 1.8763, + "step": 46380 + }, + { + "epoch": 0.09034736854404325, + "grad_norm": 2.3264734745025635, + "learning_rate": 2.939982980383914e-05, + "loss": 2.0591, + "step": 46395 + }, + { + "epoch": 0.09037657881515351, + "grad_norm": 3.1548166275024414, + "learning_rate": 2.939944426921644e-05, + "loss": 1.9082, + "step": 46410 + }, + { + "epoch": 0.09040578908626377, + "grad_norm": 2.1883738040924072, + "learning_rate": 2.9399058613333992e-05, + "loss": 1.8559, + "step": 46425 + }, + { + "epoch": 0.09043499935737404, + "grad_norm": 3.480323076248169, + "learning_rate": 2.9398672836195053e-05, + "loss": 1.6578, + "step": 46440 + }, + { + "epoch": 0.0904642096284843, + "grad_norm": 4.378427028656006, + "learning_rate": 2.939828693780286e-05, + "loss": 1.7655, + "step": 46455 + }, + { + "epoch": 0.09049341989959456, + "grad_norm": 2.4116623401641846, + "learning_rate": 2.9397900918160667e-05, + "loss": 1.7197, + "step": 46470 + }, + { + "epoch": 0.09052263017070482, + "grad_norm": 2.80350923538208, + "learning_rate": 2.939751477727173e-05, + "loss": 1.9612, + "step": 46485 + }, + { + "epoch": 0.09055184044181509, + "grad_norm": 2.7664198875427246, + "learning_rate": 2.9397128515139292e-05, + "loss": 1.7675, + "step": 46500 + }, + { + "epoch": 0.09058105071292535, + "grad_norm": 2.6225290298461914, + "learning_rate": 2.9396742131766615e-05, + "loss": 1.65, + "step": 46515 + }, + { + "epoch": 0.09061026098403561, + "grad_norm": 4.222823143005371, + "learning_rate": 2.9396355627156942e-05, + "loss": 1.6833, + "step": 46530 + }, + { + "epoch": 0.09063947125514588, + "grad_norm": 4.5323967933654785, + "learning_rate": 2.9395969001313538e-05, + "loss": 2.0888, + "step": 46545 + }, + { + "epoch": 0.09066868152625614, + "grad_norm": 5.949542045593262, + "learning_rate": 2.939558225423965e-05, + "loss": 2.0995, + "step": 46560 + }, + { + "epoch": 0.0906978917973664, + "grad_norm": 1.957553505897522, + "learning_rate": 2.939519538593854e-05, + "loss": 1.7008, + "step": 46575 + }, + { + "epoch": 0.09072710206847666, + "grad_norm": 3.7566747665405273, + "learning_rate": 2.9394808396413463e-05, + "loss": 1.9004, + "step": 46590 + }, + { + "epoch": 0.09075631233958693, + "grad_norm": 4.103549957275391, + "learning_rate": 2.9394421285667684e-05, + "loss": 1.7224, + "step": 46605 + }, + { + "epoch": 0.0907855226106972, + "grad_norm": 4.131819725036621, + "learning_rate": 2.9394034053704457e-05, + "loss": 2.0474, + "step": 46620 + }, + { + "epoch": 0.09081473288180746, + "grad_norm": 4.420338153839111, + "learning_rate": 2.9393646700527047e-05, + "loss": 1.9015, + "step": 46635 + }, + { + "epoch": 0.09084394315291772, + "grad_norm": 4.487269401550293, + "learning_rate": 2.939325922613871e-05, + "loss": 2.0197, + "step": 46650 + }, + { + "epoch": 0.09087315342402798, + "grad_norm": 5.222062587738037, + "learning_rate": 2.939287163054272e-05, + "loss": 1.9145, + "step": 46665 + }, + { + "epoch": 0.09090236369513824, + "grad_norm": 5.2663655281066895, + "learning_rate": 2.9392483913742325e-05, + "loss": 1.7939, + "step": 46680 + }, + { + "epoch": 0.0909315739662485, + "grad_norm": 6.097779273986816, + "learning_rate": 2.9392096075740804e-05, + "loss": 1.975, + "step": 46695 + }, + { + "epoch": 0.09096078423735877, + "grad_norm": 2.3450088500976562, + "learning_rate": 2.9391708116541417e-05, + "loss": 1.9715, + "step": 46710 + }, + { + "epoch": 0.09098999450846904, + "grad_norm": 3.173430919647217, + "learning_rate": 2.939132003614743e-05, + "loss": 1.8709, + "step": 46725 + }, + { + "epoch": 0.0910192047795793, + "grad_norm": 3.0776472091674805, + "learning_rate": 2.9390931834562113e-05, + "loss": 1.7442, + "step": 46740 + }, + { + "epoch": 0.09104841505068956, + "grad_norm": 2.8173866271972656, + "learning_rate": 2.9390543511788735e-05, + "loss": 1.9549, + "step": 46755 + }, + { + "epoch": 0.09107762532179982, + "grad_norm": 4.186956405639648, + "learning_rate": 2.939015506783057e-05, + "loss": 1.7209, + "step": 46770 + }, + { + "epoch": 0.09110683559291008, + "grad_norm": 1.6793681383132935, + "learning_rate": 2.938976650269088e-05, + "loss": 1.6313, + "step": 46785 + }, + { + "epoch": 0.09113604586402034, + "grad_norm": 2.971320152282715, + "learning_rate": 2.9389377816372947e-05, + "loss": 1.8833, + "step": 46800 + }, + { + "epoch": 0.09116525613513062, + "grad_norm": 2.4654321670532227, + "learning_rate": 2.938898900888004e-05, + "loss": 1.8134, + "step": 46815 + }, + { + "epoch": 0.09119446640624088, + "grad_norm": 2.388918399810791, + "learning_rate": 2.9388600080215428e-05, + "loss": 1.9447, + "step": 46830 + }, + { + "epoch": 0.09122367667735114, + "grad_norm": 3.474928617477417, + "learning_rate": 2.9388211030382395e-05, + "loss": 1.7331, + "step": 46845 + }, + { + "epoch": 0.0912528869484614, + "grad_norm": 2.335491180419922, + "learning_rate": 2.938782185938421e-05, + "loss": 1.8248, + "step": 46860 + }, + { + "epoch": 0.09128209721957166, + "grad_norm": 2.1897199153900146, + "learning_rate": 2.9387432567224155e-05, + "loss": 1.752, + "step": 46875 + }, + { + "epoch": 0.09131130749068192, + "grad_norm": 2.5771682262420654, + "learning_rate": 2.9387043153905506e-05, + "loss": 1.964, + "step": 46890 + }, + { + "epoch": 0.09134051776179218, + "grad_norm": 2.8829903602600098, + "learning_rate": 2.9386653619431548e-05, + "loss": 2.0032, + "step": 46905 + }, + { + "epoch": 0.09136972803290246, + "grad_norm": 2.8059322834014893, + "learning_rate": 2.938626396380555e-05, + "loss": 1.8491, + "step": 46920 + }, + { + "epoch": 0.09139893830401272, + "grad_norm": 3.0749945640563965, + "learning_rate": 2.9385874187030802e-05, + "loss": 1.7575, + "step": 46935 + }, + { + "epoch": 0.09142814857512298, + "grad_norm": 2.8172197341918945, + "learning_rate": 2.938548428911058e-05, + "loss": 1.7225, + "step": 46950 + }, + { + "epoch": 0.09145735884623324, + "grad_norm": 5.23187780380249, + "learning_rate": 2.9385094270048182e-05, + "loss": 1.7931, + "step": 46965 + }, + { + "epoch": 0.0914865691173435, + "grad_norm": 2.241788387298584, + "learning_rate": 2.9384704129846876e-05, + "loss": 1.9205, + "step": 46980 + }, + { + "epoch": 0.09151577938845376, + "grad_norm": 3.4979000091552734, + "learning_rate": 2.938431386850995e-05, + "loss": 1.8775, + "step": 46995 + }, + { + "epoch": 0.09154498965956402, + "grad_norm": 3.5703864097595215, + "learning_rate": 2.93839234860407e-05, + "loss": 1.7483, + "step": 47010 + }, + { + "epoch": 0.0915741999306743, + "grad_norm": 3.68125319480896, + "learning_rate": 2.9383532982442404e-05, + "loss": 1.9416, + "step": 47025 + }, + { + "epoch": 0.09160341020178456, + "grad_norm": 2.311479091644287, + "learning_rate": 2.9383142357718356e-05, + "loss": 1.7815, + "step": 47040 + }, + { + "epoch": 0.09163262047289482, + "grad_norm": 2.455263137817383, + "learning_rate": 2.9382751611871843e-05, + "loss": 1.8873, + "step": 47055 + }, + { + "epoch": 0.09166183074400508, + "grad_norm": 3.0800442695617676, + "learning_rate": 2.9382360744906153e-05, + "loss": 2.0687, + "step": 47070 + }, + { + "epoch": 0.09169104101511534, + "grad_norm": 1.9920344352722168, + "learning_rate": 2.938196975682458e-05, + "loss": 1.8431, + "step": 47085 + }, + { + "epoch": 0.0917202512862256, + "grad_norm": 2.7366907596588135, + "learning_rate": 2.938157864763042e-05, + "loss": 1.7235, + "step": 47100 + }, + { + "epoch": 0.09174946155733586, + "grad_norm": 2.735663414001465, + "learning_rate": 2.9381187417326964e-05, + "loss": 1.874, + "step": 47115 + }, + { + "epoch": 0.09177867182844614, + "grad_norm": 3.411982774734497, + "learning_rate": 2.93807960659175e-05, + "loss": 1.7868, + "step": 47130 + }, + { + "epoch": 0.0918078820995564, + "grad_norm": 4.17244291305542, + "learning_rate": 2.9380404593405338e-05, + "loss": 1.9122, + "step": 47145 + }, + { + "epoch": 0.09183709237066666, + "grad_norm": 2.1301894187927246, + "learning_rate": 2.9380012999793763e-05, + "loss": 1.9863, + "step": 47160 + }, + { + "epoch": 0.09186630264177692, + "grad_norm": 2.0585176944732666, + "learning_rate": 2.9379621285086072e-05, + "loss": 1.9591, + "step": 47175 + }, + { + "epoch": 0.09189551291288718, + "grad_norm": 3.732590913772583, + "learning_rate": 2.9379229449285575e-05, + "loss": 1.7843, + "step": 47190 + }, + { + "epoch": 0.09192472318399744, + "grad_norm": 3.9266607761383057, + "learning_rate": 2.937883749239556e-05, + "loss": 1.685, + "step": 47205 + }, + { + "epoch": 0.0919539334551077, + "grad_norm": 2.804568290710449, + "learning_rate": 2.9378445414419333e-05, + "loss": 1.9214, + "step": 47220 + }, + { + "epoch": 0.09198314372621796, + "grad_norm": 3.9198246002197266, + "learning_rate": 2.9378053215360194e-05, + "loss": 1.8938, + "step": 47235 + }, + { + "epoch": 0.09201235399732824, + "grad_norm": 3.0435566902160645, + "learning_rate": 2.937766089522145e-05, + "loss": 1.8995, + "step": 47250 + }, + { + "epoch": 0.0920415642684385, + "grad_norm": 4.04058313369751, + "learning_rate": 2.9377268454006397e-05, + "loss": 1.7821, + "step": 47265 + }, + { + "epoch": 0.09207077453954876, + "grad_norm": 3.9782166481018066, + "learning_rate": 2.9376875891718348e-05, + "loss": 1.7486, + "step": 47280 + }, + { + "epoch": 0.09209998481065902, + "grad_norm": 3.0122504234313965, + "learning_rate": 2.93764832083606e-05, + "loss": 1.8464, + "step": 47295 + }, + { + "epoch": 0.09212919508176928, + "grad_norm": 2.877807140350342, + "learning_rate": 2.9376090403936474e-05, + "loss": 1.7075, + "step": 47310 + }, + { + "epoch": 0.09215840535287954, + "grad_norm": 3.1419224739074707, + "learning_rate": 2.937569747844926e-05, + "loss": 1.9716, + "step": 47325 + }, + { + "epoch": 0.0921876156239898, + "grad_norm": 2.612062931060791, + "learning_rate": 2.9375304431902277e-05, + "loss": 1.8906, + "step": 47340 + }, + { + "epoch": 0.09221682589510008, + "grad_norm": 2.703277587890625, + "learning_rate": 2.9374911264298836e-05, + "loss": 1.9261, + "step": 47355 + }, + { + "epoch": 0.09224603616621034, + "grad_norm": 2.6118664741516113, + "learning_rate": 2.9374517975642247e-05, + "loss": 1.8008, + "step": 47370 + }, + { + "epoch": 0.0922752464373206, + "grad_norm": 4.062849521636963, + "learning_rate": 2.9374124565935817e-05, + "loss": 1.8587, + "step": 47385 + }, + { + "epoch": 0.09230445670843086, + "grad_norm": 2.748095989227295, + "learning_rate": 2.9373731035182863e-05, + "loss": 1.9159, + "step": 47400 + }, + { + "epoch": 0.09233366697954112, + "grad_norm": 2.4311137199401855, + "learning_rate": 2.93733373833867e-05, + "loss": 1.8601, + "step": 47415 + }, + { + "epoch": 0.09236287725065138, + "grad_norm": 2.542649269104004, + "learning_rate": 2.937294361055064e-05, + "loss": 1.9995, + "step": 47430 + }, + { + "epoch": 0.09239208752176165, + "grad_norm": 2.3988935947418213, + "learning_rate": 2.9372549716678e-05, + "loss": 1.946, + "step": 47445 + }, + { + "epoch": 0.09242129779287192, + "grad_norm": 3.805635690689087, + "learning_rate": 2.93721557017721e-05, + "loss": 1.8609, + "step": 47460 + }, + { + "epoch": 0.09245050806398218, + "grad_norm": 3.613706111907959, + "learning_rate": 2.9371761565836253e-05, + "loss": 1.7101, + "step": 47475 + }, + { + "epoch": 0.09247971833509244, + "grad_norm": 2.3323137760162354, + "learning_rate": 2.937136730887378e-05, + "loss": 1.8609, + "step": 47490 + }, + { + "epoch": 0.0925089286062027, + "grad_norm": 3.184736967086792, + "learning_rate": 2.9370972930888003e-05, + "loss": 1.8663, + "step": 47505 + }, + { + "epoch": 0.09253813887731296, + "grad_norm": 2.841794967651367, + "learning_rate": 2.937057843188224e-05, + "loss": 1.8648, + "step": 47520 + }, + { + "epoch": 0.09256734914842323, + "grad_norm": 3.5229077339172363, + "learning_rate": 2.937018381185982e-05, + "loss": 1.9722, + "step": 47535 + }, + { + "epoch": 0.09259655941953349, + "grad_norm": 2.2388434410095215, + "learning_rate": 2.9369789070824058e-05, + "loss": 1.8857, + "step": 47550 + }, + { + "epoch": 0.09262576969064376, + "grad_norm": 2.3352901935577393, + "learning_rate": 2.9369394208778277e-05, + "loss": 1.8446, + "step": 47565 + }, + { + "epoch": 0.09265497996175402, + "grad_norm": 2.5115973949432373, + "learning_rate": 2.9368999225725814e-05, + "loss": 1.8825, + "step": 47580 + }, + { + "epoch": 0.09268419023286428, + "grad_norm": 5.338953495025635, + "learning_rate": 2.9368604121669984e-05, + "loss": 1.8563, + "step": 47595 + }, + { + "epoch": 0.09271340050397454, + "grad_norm": 3.1112778186798096, + "learning_rate": 2.936820889661412e-05, + "loss": 1.893, + "step": 47610 + }, + { + "epoch": 0.0927426107750848, + "grad_norm": 3.126549243927002, + "learning_rate": 2.9367813550561542e-05, + "loss": 1.897, + "step": 47625 + }, + { + "epoch": 0.09277182104619507, + "grad_norm": 1.950778603553772, + "learning_rate": 2.9367418083515587e-05, + "loss": 1.8451, + "step": 47640 + }, + { + "epoch": 0.09280103131730533, + "grad_norm": 2.479407548904419, + "learning_rate": 2.936702249547959e-05, + "loss": 2.0399, + "step": 47655 + }, + { + "epoch": 0.0928302415884156, + "grad_norm": 6.256388187408447, + "learning_rate": 2.9366626786456866e-05, + "loss": 1.8861, + "step": 47670 + }, + { + "epoch": 0.09285945185952586, + "grad_norm": 2.9249043464660645, + "learning_rate": 2.9366230956450765e-05, + "loss": 1.9422, + "step": 47685 + }, + { + "epoch": 0.09288866213063612, + "grad_norm": 3.4006667137145996, + "learning_rate": 2.936583500546461e-05, + "loss": 1.6933, + "step": 47700 + }, + { + "epoch": 0.09291787240174638, + "grad_norm": 2.45302677154541, + "learning_rate": 2.936543893350174e-05, + "loss": 1.8965, + "step": 47715 + }, + { + "epoch": 0.09294708267285665, + "grad_norm": 2.511388063430786, + "learning_rate": 2.9365042740565486e-05, + "loss": 1.991, + "step": 47730 + }, + { + "epoch": 0.0929762929439669, + "grad_norm": 2.6051487922668457, + "learning_rate": 2.9364646426659185e-05, + "loss": 2.043, + "step": 47745 + }, + { + "epoch": 0.09300550321507717, + "grad_norm": 2.067706346511841, + "learning_rate": 2.936424999178618e-05, + "loss": 1.7748, + "step": 47760 + }, + { + "epoch": 0.09303471348618744, + "grad_norm": 3.630814552307129, + "learning_rate": 2.9363853435949806e-05, + "loss": 1.873, + "step": 47775 + }, + { + "epoch": 0.0930639237572977, + "grad_norm": 4.217411994934082, + "learning_rate": 2.9363456759153398e-05, + "loss": 1.8956, + "step": 47790 + }, + { + "epoch": 0.09309313402840796, + "grad_norm": 1.8592040538787842, + "learning_rate": 2.9363059961400303e-05, + "loss": 1.917, + "step": 47805 + }, + { + "epoch": 0.09312234429951823, + "grad_norm": 2.441096305847168, + "learning_rate": 2.936266304269386e-05, + "loss": 1.7576, + "step": 47820 + }, + { + "epoch": 0.09315155457062849, + "grad_norm": 3.1484501361846924, + "learning_rate": 2.9362266003037408e-05, + "loss": 1.8034, + "step": 47835 + }, + { + "epoch": 0.09318076484173875, + "grad_norm": 3.104492425918579, + "learning_rate": 2.93618688424343e-05, + "loss": 1.9117, + "step": 47850 + }, + { + "epoch": 0.09320997511284901, + "grad_norm": 4.626640796661377, + "learning_rate": 2.9361471560887868e-05, + "loss": 1.8304, + "step": 47865 + }, + { + "epoch": 0.09323918538395928, + "grad_norm": 3.3960869312286377, + "learning_rate": 2.936107415840147e-05, + "loss": 1.9412, + "step": 47880 + }, + { + "epoch": 0.09326839565506954, + "grad_norm": 2.6509194374084473, + "learning_rate": 2.9360676634978445e-05, + "loss": 1.8504, + "step": 47895 + }, + { + "epoch": 0.0932976059261798, + "grad_norm": 4.512158393859863, + "learning_rate": 2.9360278990622142e-05, + "loss": 1.9797, + "step": 47910 + }, + { + "epoch": 0.09332681619729007, + "grad_norm": 3.2379143238067627, + "learning_rate": 2.935988122533591e-05, + "loss": 1.8505, + "step": 47925 + }, + { + "epoch": 0.09335602646840033, + "grad_norm": 2.5431196689605713, + "learning_rate": 2.9359483339123097e-05, + "loss": 2.036, + "step": 47940 + }, + { + "epoch": 0.09338523673951059, + "grad_norm": 5.5352396965026855, + "learning_rate": 2.9359085331987056e-05, + "loss": 1.9166, + "step": 47955 + }, + { + "epoch": 0.09341444701062085, + "grad_norm": 3.4632835388183594, + "learning_rate": 2.935868720393114e-05, + "loss": 1.8466, + "step": 47970 + }, + { + "epoch": 0.09344365728173112, + "grad_norm": 5.365049362182617, + "learning_rate": 2.9358288954958697e-05, + "loss": 2.0837, + "step": 47985 + }, + { + "epoch": 0.09347286755284138, + "grad_norm": 4.2348761558532715, + "learning_rate": 2.9357890585073082e-05, + "loss": 1.7106, + "step": 48000 + }, + { + "epoch": 0.09350207782395165, + "grad_norm": 2.3832342624664307, + "learning_rate": 2.9357492094277653e-05, + "loss": 1.8389, + "step": 48015 + }, + { + "epoch": 0.0935312880950619, + "grad_norm": 3.986356735229492, + "learning_rate": 2.9357093482575766e-05, + "loss": 1.8841, + "step": 48030 + }, + { + "epoch": 0.09356049836617217, + "grad_norm": 2.661665678024292, + "learning_rate": 2.935669474997077e-05, + "loss": 1.8138, + "step": 48045 + }, + { + "epoch": 0.09358970863728243, + "grad_norm": 3.051889419555664, + "learning_rate": 2.9356295896466036e-05, + "loss": 1.8381, + "step": 48060 + }, + { + "epoch": 0.09361891890839269, + "grad_norm": 2.316612482070923, + "learning_rate": 2.935589692206491e-05, + "loss": 1.8114, + "step": 48075 + }, + { + "epoch": 0.09364812917950296, + "grad_norm": 2.154189109802246, + "learning_rate": 2.935549782677076e-05, + "loss": 1.9186, + "step": 48090 + }, + { + "epoch": 0.09367733945061323, + "grad_norm": 2.2305965423583984, + "learning_rate": 2.9355098610586936e-05, + "loss": 1.8252, + "step": 48105 + }, + { + "epoch": 0.09370654972172349, + "grad_norm": 2.1520867347717285, + "learning_rate": 2.9354699273516818e-05, + "loss": 1.9562, + "step": 48120 + }, + { + "epoch": 0.09373575999283375, + "grad_norm": 6.05635929107666, + "learning_rate": 2.935429981556375e-05, + "loss": 1.8604, + "step": 48135 + }, + { + "epoch": 0.09376497026394401, + "grad_norm": 2.818211793899536, + "learning_rate": 2.935390023673111e-05, + "loss": 1.8975, + "step": 48150 + }, + { + "epoch": 0.09379418053505427, + "grad_norm": 3.0314574241638184, + "learning_rate": 2.9353500537022256e-05, + "loss": 1.7525, + "step": 48165 + }, + { + "epoch": 0.09382339080616453, + "grad_norm": 3.3957324028015137, + "learning_rate": 2.9353100716440554e-05, + "loss": 1.8434, + "step": 48180 + }, + { + "epoch": 0.0938526010772748, + "grad_norm": 3.6811084747314453, + "learning_rate": 2.9352700774989374e-05, + "loss": 2.0105, + "step": 48195 + }, + { + "epoch": 0.09388181134838507, + "grad_norm": 3.504967451095581, + "learning_rate": 2.935230071267208e-05, + "loss": 1.9383, + "step": 48210 + }, + { + "epoch": 0.09391102161949533, + "grad_norm": 2.807276964187622, + "learning_rate": 2.935190052949204e-05, + "loss": 1.7538, + "step": 48225 + }, + { + "epoch": 0.09394023189060559, + "grad_norm": 3.5521178245544434, + "learning_rate": 2.9351500225452637e-05, + "loss": 1.8844, + "step": 48240 + }, + { + "epoch": 0.09396944216171585, + "grad_norm": 2.884357213973999, + "learning_rate": 2.9351099800557223e-05, + "loss": 1.7718, + "step": 48255 + }, + { + "epoch": 0.09399865243282611, + "grad_norm": 5.653458595275879, + "learning_rate": 2.9350699254809185e-05, + "loss": 1.8595, + "step": 48270 + }, + { + "epoch": 0.09402786270393637, + "grad_norm": 2.2456629276275635, + "learning_rate": 2.9350298588211887e-05, + "loss": 1.6842, + "step": 48285 + }, + { + "epoch": 0.09405707297504663, + "grad_norm": 3.6718297004699707, + "learning_rate": 2.934989780076871e-05, + "loss": 1.8907, + "step": 48300 + }, + { + "epoch": 0.09408628324615691, + "grad_norm": 3.220428705215454, + "learning_rate": 2.9349496892483017e-05, + "loss": 1.8176, + "step": 48315 + }, + { + "epoch": 0.09411549351726717, + "grad_norm": 2.8433873653411865, + "learning_rate": 2.93490958633582e-05, + "loss": 1.8805, + "step": 48330 + }, + { + "epoch": 0.09414470378837743, + "grad_norm": 3.2564454078674316, + "learning_rate": 2.934869471339763e-05, + "loss": 1.8994, + "step": 48345 + }, + { + "epoch": 0.09417391405948769, + "grad_norm": 3.8655519485473633, + "learning_rate": 2.9348293442604675e-05, + "loss": 1.8639, + "step": 48360 + }, + { + "epoch": 0.09420312433059795, + "grad_norm": 3.8386640548706055, + "learning_rate": 2.9347892050982732e-05, + "loss": 1.7577, + "step": 48375 + }, + { + "epoch": 0.09423233460170821, + "grad_norm": 3.6252169609069824, + "learning_rate": 2.9347490538535165e-05, + "loss": 1.8352, + "step": 48390 + }, + { + "epoch": 0.09426154487281847, + "grad_norm": 3.1127262115478516, + "learning_rate": 2.9347088905265367e-05, + "loss": 1.9405, + "step": 48405 + }, + { + "epoch": 0.09429075514392875, + "grad_norm": 2.3375282287597656, + "learning_rate": 2.934668715117671e-05, + "loss": 1.8079, + "step": 48420 + }, + { + "epoch": 0.09431996541503901, + "grad_norm": 2.0516679286956787, + "learning_rate": 2.9346285276272592e-05, + "loss": 1.9539, + "step": 48435 + }, + { + "epoch": 0.09434917568614927, + "grad_norm": 3.3882431983947754, + "learning_rate": 2.9345883280556384e-05, + "loss": 1.819, + "step": 48450 + }, + { + "epoch": 0.09437838595725953, + "grad_norm": 2.644202709197998, + "learning_rate": 2.934548116403147e-05, + "loss": 1.9176, + "step": 48465 + }, + { + "epoch": 0.09440759622836979, + "grad_norm": 3.2326552867889404, + "learning_rate": 2.934507892670125e-05, + "loss": 1.8493, + "step": 48480 + }, + { + "epoch": 0.09443680649948005, + "grad_norm": 2.4110164642333984, + "learning_rate": 2.9344676568569095e-05, + "loss": 1.7326, + "step": 48495 + }, + { + "epoch": 0.09446601677059031, + "grad_norm": 3.2485642433166504, + "learning_rate": 2.9344274089638405e-05, + "loss": 1.7972, + "step": 48510 + }, + { + "epoch": 0.09449522704170059, + "grad_norm": 2.770838499069214, + "learning_rate": 2.9343871489912563e-05, + "loss": 1.9041, + "step": 48525 + }, + { + "epoch": 0.09452443731281085, + "grad_norm": 2.4787659645080566, + "learning_rate": 2.9343468769394965e-05, + "loss": 1.9034, + "step": 48540 + }, + { + "epoch": 0.09455364758392111, + "grad_norm": 4.980709075927734, + "learning_rate": 2.9343065928088997e-05, + "loss": 1.7995, + "step": 48555 + }, + { + "epoch": 0.09458285785503137, + "grad_norm": 3.064199686050415, + "learning_rate": 2.9342662965998053e-05, + "loss": 1.7465, + "step": 48570 + }, + { + "epoch": 0.09461206812614163, + "grad_norm": 4.2010955810546875, + "learning_rate": 2.9342259883125533e-05, + "loss": 1.8556, + "step": 48585 + }, + { + "epoch": 0.0946412783972519, + "grad_norm": 3.320122718811035, + "learning_rate": 2.9341856679474818e-05, + "loss": 1.8096, + "step": 48600 + }, + { + "epoch": 0.09467048866836215, + "grad_norm": 2.7381668090820312, + "learning_rate": 2.9341453355049315e-05, + "loss": 1.8962, + "step": 48615 + }, + { + "epoch": 0.09469969893947243, + "grad_norm": 3.2660324573516846, + "learning_rate": 2.934104990985241e-05, + "loss": 2.032, + "step": 48630 + }, + { + "epoch": 0.09472890921058269, + "grad_norm": 3.2627756595611572, + "learning_rate": 2.9340646343887514e-05, + "loss": 1.7129, + "step": 48645 + }, + { + "epoch": 0.09475811948169295, + "grad_norm": 2.6300570964813232, + "learning_rate": 2.9340242657158016e-05, + "loss": 1.7445, + "step": 48660 + }, + { + "epoch": 0.09478732975280321, + "grad_norm": 2.7047741413116455, + "learning_rate": 2.9339838849667315e-05, + "loss": 1.7302, + "step": 48675 + }, + { + "epoch": 0.09481654002391347, + "grad_norm": 2.4090497493743896, + "learning_rate": 2.933943492141882e-05, + "loss": 1.9483, + "step": 48690 + }, + { + "epoch": 0.09484575029502373, + "grad_norm": 2.5661919116973877, + "learning_rate": 2.9339030872415923e-05, + "loss": 1.8992, + "step": 48705 + }, + { + "epoch": 0.094874960566134, + "grad_norm": 2.628004789352417, + "learning_rate": 2.9338626702662028e-05, + "loss": 1.9041, + "step": 48720 + }, + { + "epoch": 0.09490417083724427, + "grad_norm": 4.652515888214111, + "learning_rate": 2.9338222412160547e-05, + "loss": 1.9847, + "step": 48735 + }, + { + "epoch": 0.09493338110835453, + "grad_norm": 3.3412258625030518, + "learning_rate": 2.933781800091487e-05, + "loss": 1.8348, + "step": 48750 + }, + { + "epoch": 0.09496259137946479, + "grad_norm": 1.8659213781356812, + "learning_rate": 2.9337413468928417e-05, + "loss": 1.663, + "step": 48765 + }, + { + "epoch": 0.09499180165057505, + "grad_norm": 3.2646689414978027, + "learning_rate": 2.933700881620459e-05, + "loss": 1.842, + "step": 48780 + }, + { + "epoch": 0.09502101192168531, + "grad_norm": 4.736114978790283, + "learning_rate": 2.9336604042746796e-05, + "loss": 2.0885, + "step": 48795 + }, + { + "epoch": 0.09505022219279557, + "grad_norm": 4.1501078605651855, + "learning_rate": 2.9336199148558438e-05, + "loss": 1.8239, + "step": 48810 + }, + { + "epoch": 0.09507943246390584, + "grad_norm": 4.221755027770996, + "learning_rate": 2.9335794133642932e-05, + "loss": 2.0309, + "step": 48825 + }, + { + "epoch": 0.09510864273501611, + "grad_norm": 2.461284875869751, + "learning_rate": 2.933538899800369e-05, + "loss": 1.962, + "step": 48840 + }, + { + "epoch": 0.09513785300612637, + "grad_norm": 4.0798187255859375, + "learning_rate": 2.9334983741644118e-05, + "loss": 1.9001, + "step": 48855 + }, + { + "epoch": 0.09516706327723663, + "grad_norm": 3.028874158859253, + "learning_rate": 2.933457836456763e-05, + "loss": 1.7871, + "step": 48870 + }, + { + "epoch": 0.0951962735483469, + "grad_norm": 2.044832229614258, + "learning_rate": 2.9334172866777646e-05, + "loss": 1.6503, + "step": 48885 + }, + { + "epoch": 0.09522548381945715, + "grad_norm": 3.0452120304107666, + "learning_rate": 2.9333767248277574e-05, + "loss": 1.8384, + "step": 48900 + }, + { + "epoch": 0.09525469409056742, + "grad_norm": 5.384150981903076, + "learning_rate": 2.9333361509070833e-05, + "loss": 1.8761, + "step": 48915 + }, + { + "epoch": 0.09528390436167768, + "grad_norm": 2.2106215953826904, + "learning_rate": 2.933295564916084e-05, + "loss": 1.8296, + "step": 48930 + }, + { + "epoch": 0.09531311463278795, + "grad_norm": 3.9018208980560303, + "learning_rate": 2.933254966855101e-05, + "loss": 1.8843, + "step": 48945 + }, + { + "epoch": 0.09534232490389821, + "grad_norm": 2.1903138160705566, + "learning_rate": 2.933214356724476e-05, + "loss": 1.9112, + "step": 48960 + }, + { + "epoch": 0.09537153517500847, + "grad_norm": 2.6728515625, + "learning_rate": 2.933173734524552e-05, + "loss": 1.9539, + "step": 48975 + }, + { + "epoch": 0.09540074544611873, + "grad_norm": 4.163172721862793, + "learning_rate": 2.9331331002556697e-05, + "loss": 1.9025, + "step": 48990 + }, + { + "epoch": 0.095429955717229, + "grad_norm": 3.705101490020752, + "learning_rate": 2.9330924539181726e-05, + "loss": 1.8348, + "step": 49005 + }, + { + "epoch": 0.09545916598833926, + "grad_norm": 4.595695495605469, + "learning_rate": 2.9330517955124024e-05, + "loss": 1.9086, + "step": 49020 + }, + { + "epoch": 0.09548837625944952, + "grad_norm": 2.3054087162017822, + "learning_rate": 2.933011125038701e-05, + "loss": 1.9129, + "step": 49035 + }, + { + "epoch": 0.09551758653055979, + "grad_norm": 2.9813685417175293, + "learning_rate": 2.9329704424974118e-05, + "loss": 1.7802, + "step": 49050 + }, + { + "epoch": 0.09554679680167005, + "grad_norm": 5.957952499389648, + "learning_rate": 2.932929747888877e-05, + "loss": 1.789, + "step": 49065 + }, + { + "epoch": 0.09557600707278031, + "grad_norm": 3.371269702911377, + "learning_rate": 2.932889041213439e-05, + "loss": 1.772, + "step": 49080 + }, + { + "epoch": 0.09560521734389058, + "grad_norm": 5.457712173461914, + "learning_rate": 2.9328483224714408e-05, + "loss": 1.8218, + "step": 49095 + }, + { + "epoch": 0.09563442761500084, + "grad_norm": 5.165330410003662, + "learning_rate": 2.9328075916632255e-05, + "loss": 1.8362, + "step": 49110 + }, + { + "epoch": 0.0956636378861111, + "grad_norm": 1.8119124174118042, + "learning_rate": 2.9327668487891357e-05, + "loss": 1.8338, + "step": 49125 + }, + { + "epoch": 0.09569284815722136, + "grad_norm": 4.478886127471924, + "learning_rate": 2.9327260938495152e-05, + "loss": 1.7219, + "step": 49140 + }, + { + "epoch": 0.09572205842833163, + "grad_norm": 3.2933688163757324, + "learning_rate": 2.9326853268447066e-05, + "loss": 1.8621, + "step": 49155 + }, + { + "epoch": 0.0957512686994419, + "grad_norm": 4.14644718170166, + "learning_rate": 2.932644547775053e-05, + "loss": 2.0011, + "step": 49170 + }, + { + "epoch": 0.09578047897055215, + "grad_norm": 2.830348014831543, + "learning_rate": 2.9326037566408985e-05, + "loss": 1.9063, + "step": 49185 + }, + { + "epoch": 0.09580968924166242, + "grad_norm": 3.782336473464966, + "learning_rate": 2.9325629534425865e-05, + "loss": 1.7197, + "step": 49200 + }, + { + "epoch": 0.09583889951277268, + "grad_norm": 3.7808749675750732, + "learning_rate": 2.9325221381804603e-05, + "loss": 1.7621, + "step": 49215 + }, + { + "epoch": 0.09586810978388294, + "grad_norm": 2.5332424640655518, + "learning_rate": 2.9324813108548634e-05, + "loss": 1.9891, + "step": 49230 + }, + { + "epoch": 0.0958973200549932, + "grad_norm": 3.981290102005005, + "learning_rate": 2.93244047146614e-05, + "loss": 1.9297, + "step": 49245 + }, + { + "epoch": 0.09592653032610347, + "grad_norm": 5.40943717956543, + "learning_rate": 2.9323996200146338e-05, + "loss": 1.7935, + "step": 49260 + }, + { + "epoch": 0.09595574059721373, + "grad_norm": 2.819638967514038, + "learning_rate": 2.932358756500689e-05, + "loss": 1.7276, + "step": 49275 + }, + { + "epoch": 0.095984950868324, + "grad_norm": 2.1289079189300537, + "learning_rate": 2.93231788092465e-05, + "loss": 1.9824, + "step": 49290 + }, + { + "epoch": 0.09601416113943426, + "grad_norm": 1.9851287603378296, + "learning_rate": 2.93227699328686e-05, + "loss": 1.8725, + "step": 49305 + }, + { + "epoch": 0.09604337141054452, + "grad_norm": 4.321681499481201, + "learning_rate": 2.9322360935876647e-05, + "loss": 1.8184, + "step": 49320 + }, + { + "epoch": 0.09607258168165478, + "grad_norm": 3.4782915115356445, + "learning_rate": 2.932195181827408e-05, + "loss": 2.057, + "step": 49335 + }, + { + "epoch": 0.09610179195276504, + "grad_norm": 3.1813442707061768, + "learning_rate": 2.9321542580064332e-05, + "loss": 1.8996, + "step": 49350 + }, + { + "epoch": 0.0961310022238753, + "grad_norm": 3.8128514289855957, + "learning_rate": 2.932113322125087e-05, + "loss": 1.8302, + "step": 49365 + }, + { + "epoch": 0.09616021249498558, + "grad_norm": 4.271757125854492, + "learning_rate": 2.9320723741837122e-05, + "loss": 1.778, + "step": 49380 + }, + { + "epoch": 0.09618942276609584, + "grad_norm": 1.6343133449554443, + "learning_rate": 2.9320314141826556e-05, + "loss": 1.7966, + "step": 49395 + }, + { + "epoch": 0.0962186330372061, + "grad_norm": 2.3712074756622314, + "learning_rate": 2.9319904421222605e-05, + "loss": 1.6004, + "step": 49410 + }, + { + "epoch": 0.09624784330831636, + "grad_norm": 2.9529919624328613, + "learning_rate": 2.931949458002872e-05, + "loss": 1.918, + "step": 49425 + }, + { + "epoch": 0.09627705357942662, + "grad_norm": 3.2947447299957275, + "learning_rate": 2.9319084618248364e-05, + "loss": 1.8999, + "step": 49440 + }, + { + "epoch": 0.09630626385053688, + "grad_norm": 4.216789245605469, + "learning_rate": 2.9318674535884976e-05, + "loss": 1.867, + "step": 49455 + }, + { + "epoch": 0.09633547412164714, + "grad_norm": 3.108816385269165, + "learning_rate": 2.9318264332942023e-05, + "loss": 1.9415, + "step": 49470 + }, + { + "epoch": 0.09636468439275742, + "grad_norm": 2.3301572799682617, + "learning_rate": 2.9317854009422947e-05, + "loss": 1.8643, + "step": 49485 + }, + { + "epoch": 0.09639389466386768, + "grad_norm": 2.4246294498443604, + "learning_rate": 2.9317443565331215e-05, + "loss": 2.1021, + "step": 49500 + }, + { + "epoch": 0.09642310493497794, + "grad_norm": 2.9938971996307373, + "learning_rate": 2.931703300067027e-05, + "loss": 1.7982, + "step": 49515 + }, + { + "epoch": 0.0964523152060882, + "grad_norm": 4.092379093170166, + "learning_rate": 2.931662231544358e-05, + "loss": 1.8046, + "step": 49530 + }, + { + "epoch": 0.09648152547719846, + "grad_norm": 3.6921536922454834, + "learning_rate": 2.93162115096546e-05, + "loss": 1.8048, + "step": 49545 + }, + { + "epoch": 0.09651073574830872, + "grad_norm": 2.1144022941589355, + "learning_rate": 2.9315800583306788e-05, + "loss": 1.7977, + "step": 49560 + }, + { + "epoch": 0.09653994601941898, + "grad_norm": 3.873920202255249, + "learning_rate": 2.9315389536403608e-05, + "loss": 1.8269, + "step": 49575 + }, + { + "epoch": 0.09656915629052926, + "grad_norm": 2.7853615283966064, + "learning_rate": 2.9314978368948517e-05, + "loss": 1.8349, + "step": 49590 + }, + { + "epoch": 0.09659836656163952, + "grad_norm": 2.341984272003174, + "learning_rate": 2.9314567080944978e-05, + "loss": 1.9739, + "step": 49605 + }, + { + "epoch": 0.09662757683274978, + "grad_norm": 5.124273300170898, + "learning_rate": 2.9314155672396463e-05, + "loss": 1.894, + "step": 49620 + }, + { + "epoch": 0.09665678710386004, + "grad_norm": 2.3911478519439697, + "learning_rate": 2.9313744143306425e-05, + "loss": 1.763, + "step": 49635 + }, + { + "epoch": 0.0966859973749703, + "grad_norm": 3.236575126647949, + "learning_rate": 2.9313332493678334e-05, + "loss": 1.7819, + "step": 49650 + }, + { + "epoch": 0.09671520764608056, + "grad_norm": 3.3634097576141357, + "learning_rate": 2.931292072351566e-05, + "loss": 1.8155, + "step": 49665 + }, + { + "epoch": 0.09674441791719082, + "grad_norm": 2.849099636077881, + "learning_rate": 2.9312508832821865e-05, + "loss": 1.8128, + "step": 49680 + }, + { + "epoch": 0.0967736281883011, + "grad_norm": 2.615405321121216, + "learning_rate": 2.931209682160042e-05, + "loss": 1.7717, + "step": 49695 + }, + { + "epoch": 0.09680283845941136, + "grad_norm": 2.432985305786133, + "learning_rate": 2.9311684689854795e-05, + "loss": 1.9783, + "step": 49710 + }, + { + "epoch": 0.09683204873052162, + "grad_norm": 3.7597007751464844, + "learning_rate": 2.931127243758846e-05, + "loss": 1.7927, + "step": 49725 + }, + { + "epoch": 0.09686125900163188, + "grad_norm": 3.092031240463257, + "learning_rate": 2.9310860064804893e-05, + "loss": 1.8321, + "step": 49740 + }, + { + "epoch": 0.09689046927274214, + "grad_norm": 3.606152296066284, + "learning_rate": 2.931044757150755e-05, + "loss": 1.6903, + "step": 49755 + }, + { + "epoch": 0.0969196795438524, + "grad_norm": 3.0055344104766846, + "learning_rate": 2.9310034957699925e-05, + "loss": 1.9332, + "step": 49770 + }, + { + "epoch": 0.09694888981496266, + "grad_norm": 3.8841702938079834, + "learning_rate": 2.930962222338548e-05, + "loss": 2.0835, + "step": 49785 + }, + { + "epoch": 0.09697810008607294, + "grad_norm": 5.191905975341797, + "learning_rate": 2.930920936856769e-05, + "loss": 1.9355, + "step": 49800 + }, + { + "epoch": 0.0970073103571832, + "grad_norm": 3.7676405906677246, + "learning_rate": 2.9308796393250037e-05, + "loss": 1.889, + "step": 49815 + }, + { + "epoch": 0.09703652062829346, + "grad_norm": 2.2673709392547607, + "learning_rate": 2.9308383297436e-05, + "loss": 1.8505, + "step": 49830 + }, + { + "epoch": 0.09706573089940372, + "grad_norm": 2.153475522994995, + "learning_rate": 2.9307970081129052e-05, + "loss": 1.9271, + "step": 49845 + }, + { + "epoch": 0.09709494117051398, + "grad_norm": 3.575103998184204, + "learning_rate": 2.930755674433268e-05, + "loss": 1.7848, + "step": 49860 + }, + { + "epoch": 0.09712415144162424, + "grad_norm": 3.662231206893921, + "learning_rate": 2.9307143287050355e-05, + "loss": 1.9642, + "step": 49875 + }, + { + "epoch": 0.0971533617127345, + "grad_norm": 3.604665517807007, + "learning_rate": 2.9306729709285567e-05, + "loss": 1.9541, + "step": 49890 + }, + { + "epoch": 0.09718257198384478, + "grad_norm": 5.528378009796143, + "learning_rate": 2.930631601104179e-05, + "loss": 1.8596, + "step": 49905 + }, + { + "epoch": 0.09721178225495504, + "grad_norm": 4.253903865814209, + "learning_rate": 2.9305902192322523e-05, + "loss": 1.9061, + "step": 49920 + }, + { + "epoch": 0.0972409925260653, + "grad_norm": 3.9055936336517334, + "learning_rate": 2.9305488253131236e-05, + "loss": 1.5918, + "step": 49935 + }, + { + "epoch": 0.09727020279717556, + "grad_norm": 3.4722824096679688, + "learning_rate": 2.9305074193471423e-05, + "loss": 1.766, + "step": 49950 + }, + { + "epoch": 0.09729941306828582, + "grad_norm": 3.583777904510498, + "learning_rate": 2.9304660013346567e-05, + "loss": 1.9215, + "step": 49965 + }, + { + "epoch": 0.09732862333939608, + "grad_norm": 2.2608377933502197, + "learning_rate": 2.9304245712760156e-05, + "loss": 1.8978, + "step": 49980 + }, + { + "epoch": 0.09735783361050634, + "grad_norm": 3.6289658546447754, + "learning_rate": 2.930383129171568e-05, + "loss": 1.8614, + "step": 49995 + }, + { + "epoch": 0.09738704388161662, + "grad_norm": 2.0757150650024414, + "learning_rate": 2.9303416750216633e-05, + "loss": 1.6696, + "step": 50010 + }, + { + "epoch": 0.09741625415272688, + "grad_norm": 2.5153489112854004, + "learning_rate": 2.93030020882665e-05, + "loss": 1.9128, + "step": 50025 + }, + { + "epoch": 0.09744546442383714, + "grad_norm": 2.9471731185913086, + "learning_rate": 2.9302587305868775e-05, + "loss": 1.9189, + "step": 50040 + }, + { + "epoch": 0.0974746746949474, + "grad_norm": 1.6924009323120117, + "learning_rate": 2.930217240302695e-05, + "loss": 1.7534, + "step": 50055 + }, + { + "epoch": 0.09750388496605766, + "grad_norm": 4.795204162597656, + "learning_rate": 2.9301757379744518e-05, + "loss": 1.8246, + "step": 50070 + }, + { + "epoch": 0.09753309523716792, + "grad_norm": 4.580473899841309, + "learning_rate": 2.9301342236024977e-05, + "loss": 1.9236, + "step": 50085 + }, + { + "epoch": 0.09756230550827819, + "grad_norm": 2.0105273723602295, + "learning_rate": 2.930092697187182e-05, + "loss": 1.7976, + "step": 50100 + }, + { + "epoch": 0.09759151577938846, + "grad_norm": 2.0396523475646973, + "learning_rate": 2.930051158728855e-05, + "loss": 1.768, + "step": 50115 + }, + { + "epoch": 0.09762072605049872, + "grad_norm": 1.9610315561294556, + "learning_rate": 2.930009608227866e-05, + "loss": 1.7016, + "step": 50130 + }, + { + "epoch": 0.09764993632160898, + "grad_norm": 5.699824333190918, + "learning_rate": 2.9299680456845643e-05, + "loss": 1.8118, + "step": 50145 + }, + { + "epoch": 0.09767914659271924, + "grad_norm": 4.904826641082764, + "learning_rate": 2.929926471099301e-05, + "loss": 1.8966, + "step": 50160 + }, + { + "epoch": 0.0977083568638295, + "grad_norm": 4.166045188903809, + "learning_rate": 2.9298848844724263e-05, + "loss": 1.8054, + "step": 50175 + }, + { + "epoch": 0.09773756713493977, + "grad_norm": 3.8375942707061768, + "learning_rate": 2.929843285804289e-05, + "loss": 1.9218, + "step": 50190 + }, + { + "epoch": 0.09776677740605003, + "grad_norm": 2.3785958290100098, + "learning_rate": 2.9298016750952408e-05, + "loss": 1.8998, + "step": 50205 + }, + { + "epoch": 0.0977959876771603, + "grad_norm": 2.632284164428711, + "learning_rate": 2.9297600523456314e-05, + "loss": 1.6993, + "step": 50220 + }, + { + "epoch": 0.09782519794827056, + "grad_norm": 3.581476926803589, + "learning_rate": 2.9297184175558116e-05, + "loss": 1.6674, + "step": 50235 + }, + { + "epoch": 0.09785440821938082, + "grad_norm": 2.3087403774261475, + "learning_rate": 2.929676770726132e-05, + "loss": 1.8005, + "step": 50250 + }, + { + "epoch": 0.09788361849049108, + "grad_norm": 2.212217330932617, + "learning_rate": 2.9296351118569433e-05, + "loss": 1.8502, + "step": 50265 + }, + { + "epoch": 0.09791282876160134, + "grad_norm": 2.2795591354370117, + "learning_rate": 2.9295934409485957e-05, + "loss": 1.8022, + "step": 50280 + }, + { + "epoch": 0.0979420390327116, + "grad_norm": 1.9416465759277344, + "learning_rate": 2.9295517580014414e-05, + "loss": 2.1082, + "step": 50295 + }, + { + "epoch": 0.09797124930382187, + "grad_norm": 3.3864188194274902, + "learning_rate": 2.9295100630158303e-05, + "loss": 1.8254, + "step": 50310 + }, + { + "epoch": 0.09800045957493214, + "grad_norm": 2.7044012546539307, + "learning_rate": 2.9294683559921138e-05, + "loss": 1.8115, + "step": 50325 + }, + { + "epoch": 0.0980296698460424, + "grad_norm": 3.7825982570648193, + "learning_rate": 2.9294266369306435e-05, + "loss": 1.8268, + "step": 50340 + }, + { + "epoch": 0.09805888011715266, + "grad_norm": 2.6438560485839844, + "learning_rate": 2.92938490583177e-05, + "loss": 1.9648, + "step": 50355 + }, + { + "epoch": 0.09808809038826292, + "grad_norm": 2.6116576194763184, + "learning_rate": 2.9293431626958458e-05, + "loss": 1.9675, + "step": 50370 + }, + { + "epoch": 0.09811730065937319, + "grad_norm": 2.0167150497436523, + "learning_rate": 2.9293014075232217e-05, + "loss": 1.8153, + "step": 50385 + }, + { + "epoch": 0.09814651093048345, + "grad_norm": 3.124234437942505, + "learning_rate": 2.9292596403142492e-05, + "loss": 1.9441, + "step": 50400 + }, + { + "epoch": 0.09817572120159371, + "grad_norm": 4.874459743499756, + "learning_rate": 2.92921786106928e-05, + "loss": 1.8517, + "step": 50415 + }, + { + "epoch": 0.09820493147270397, + "grad_norm": 3.4582738876342773, + "learning_rate": 2.9291760697886668e-05, + "loss": 1.9337, + "step": 50430 + }, + { + "epoch": 0.09823414174381424, + "grad_norm": 3.4939401149749756, + "learning_rate": 2.9291342664727605e-05, + "loss": 1.9163, + "step": 50445 + }, + { + "epoch": 0.0982633520149245, + "grad_norm": 4.0186381340026855, + "learning_rate": 2.9290924511219132e-05, + "loss": 2.0225, + "step": 50460 + }, + { + "epoch": 0.09829256228603477, + "grad_norm": 2.9652369022369385, + "learning_rate": 2.929050623736478e-05, + "loss": 1.9003, + "step": 50475 + }, + { + "epoch": 0.09832177255714503, + "grad_norm": 3.2894749641418457, + "learning_rate": 2.9290087843168065e-05, + "loss": 1.9578, + "step": 50490 + }, + { + "epoch": 0.09835098282825529, + "grad_norm": 4.110862731933594, + "learning_rate": 2.9289669328632504e-05, + "loss": 1.7971, + "step": 50505 + }, + { + "epoch": 0.09838019309936555, + "grad_norm": 3.563746690750122, + "learning_rate": 2.9289250693761635e-05, + "loss": 1.8875, + "step": 50520 + }, + { + "epoch": 0.09840940337047581, + "grad_norm": 2.2554609775543213, + "learning_rate": 2.9288831938558975e-05, + "loss": 1.828, + "step": 50535 + }, + { + "epoch": 0.09843861364158608, + "grad_norm": 4.824254035949707, + "learning_rate": 2.9288413063028048e-05, + "loss": 1.9239, + "step": 50550 + }, + { + "epoch": 0.09846782391269635, + "grad_norm": 1.684004783630371, + "learning_rate": 2.9287994067172388e-05, + "loss": 1.8381, + "step": 50565 + }, + { + "epoch": 0.0984970341838066, + "grad_norm": 5.158596038818359, + "learning_rate": 2.928757495099552e-05, + "loss": 1.8882, + "step": 50580 + }, + { + "epoch": 0.09852624445491687, + "grad_norm": 2.6243045330047607, + "learning_rate": 2.928715571450097e-05, + "loss": 1.9741, + "step": 50595 + }, + { + "epoch": 0.09855545472602713, + "grad_norm": 2.581686019897461, + "learning_rate": 2.9286736357692277e-05, + "loss": 1.7908, + "step": 50610 + }, + { + "epoch": 0.09858466499713739, + "grad_norm": 4.69744348526001, + "learning_rate": 2.9286316880572967e-05, + "loss": 1.9603, + "step": 50625 + }, + { + "epoch": 0.09861387526824765, + "grad_norm": 4.348971366882324, + "learning_rate": 2.9285897283146572e-05, + "loss": 1.8657, + "step": 50640 + }, + { + "epoch": 0.09864308553935792, + "grad_norm": 2.9074223041534424, + "learning_rate": 2.928547756541663e-05, + "loss": 1.9288, + "step": 50655 + }, + { + "epoch": 0.09867229581046819, + "grad_norm": 3.6681530475616455, + "learning_rate": 2.9285057727386668e-05, + "loss": 1.8544, + "step": 50670 + }, + { + "epoch": 0.09870150608157845, + "grad_norm": 2.248783826828003, + "learning_rate": 2.928463776906023e-05, + "loss": 1.9141, + "step": 50685 + }, + { + "epoch": 0.09873071635268871, + "grad_norm": 3.9001669883728027, + "learning_rate": 2.9284217690440845e-05, + "loss": 1.8755, + "step": 50700 + }, + { + "epoch": 0.09875992662379897, + "grad_norm": 3.1399364471435547, + "learning_rate": 2.928379749153205e-05, + "loss": 1.7657, + "step": 50715 + }, + { + "epoch": 0.09878913689490923, + "grad_norm": 3.4785237312316895, + "learning_rate": 2.9283377172337397e-05, + "loss": 1.7498, + "step": 50730 + }, + { + "epoch": 0.09881834716601949, + "grad_norm": 5.273774147033691, + "learning_rate": 2.9282956732860408e-05, + "loss": 1.8231, + "step": 50745 + }, + { + "epoch": 0.09884755743712977, + "grad_norm": 4.573613166809082, + "learning_rate": 2.9282536173104635e-05, + "loss": 1.8316, + "step": 50760 + }, + { + "epoch": 0.09887676770824003, + "grad_norm": 2.501286745071411, + "learning_rate": 2.9282115493073616e-05, + "loss": 1.9897, + "step": 50775 + }, + { + "epoch": 0.09890597797935029, + "grad_norm": 2.594648599624634, + "learning_rate": 2.928169469277089e-05, + "loss": 1.8894, + "step": 50790 + }, + { + "epoch": 0.09893518825046055, + "grad_norm": 3.582399606704712, + "learning_rate": 2.928127377220001e-05, + "loss": 1.8343, + "step": 50805 + }, + { + "epoch": 0.09896439852157081, + "grad_norm": 2.9742801189422607, + "learning_rate": 2.9280852731364515e-05, + "loss": 1.8572, + "step": 50820 + }, + { + "epoch": 0.09899360879268107, + "grad_norm": 2.9038615226745605, + "learning_rate": 2.9280431570267947e-05, + "loss": 1.72, + "step": 50835 + }, + { + "epoch": 0.09902281906379133, + "grad_norm": 1.726894736289978, + "learning_rate": 2.9280010288913853e-05, + "loss": 1.9789, + "step": 50850 + }, + { + "epoch": 0.0990520293349016, + "grad_norm": 2.48158860206604, + "learning_rate": 2.9279588887305793e-05, + "loss": 1.752, + "step": 50865 + }, + { + "epoch": 0.09908123960601187, + "grad_norm": 2.734722852706909, + "learning_rate": 2.9279167365447302e-05, + "loss": 2.0492, + "step": 50880 + }, + { + "epoch": 0.09911044987712213, + "grad_norm": 3.3718225955963135, + "learning_rate": 2.927874572334193e-05, + "loss": 1.7142, + "step": 50895 + }, + { + "epoch": 0.09913966014823239, + "grad_norm": 3.647507667541504, + "learning_rate": 2.9278323960993238e-05, + "loss": 1.8559, + "step": 50910 + }, + { + "epoch": 0.09916887041934265, + "grad_norm": 2.6375949382781982, + "learning_rate": 2.927790207840477e-05, + "loss": 1.7471, + "step": 50925 + }, + { + "epoch": 0.09919808069045291, + "grad_norm": 2.8976383209228516, + "learning_rate": 2.9277480075580083e-05, + "loss": 1.8442, + "step": 50940 + }, + { + "epoch": 0.09922729096156317, + "grad_norm": 4.061347484588623, + "learning_rate": 2.927705795252273e-05, + "loss": 1.717, + "step": 50955 + }, + { + "epoch": 0.09925650123267345, + "grad_norm": 2.345612049102783, + "learning_rate": 2.9276635709236254e-05, + "loss": 1.7043, + "step": 50970 + }, + { + "epoch": 0.09928571150378371, + "grad_norm": 3.33445143699646, + "learning_rate": 2.9276213345724227e-05, + "loss": 1.7703, + "step": 50985 + }, + { + "epoch": 0.09931492177489397, + "grad_norm": 3.6107683181762695, + "learning_rate": 2.9275790861990202e-05, + "loss": 1.8683, + "step": 51000 + }, + { + "epoch": 0.09934413204600423, + "grad_norm": 3.7334389686584473, + "learning_rate": 2.9275368258037732e-05, + "loss": 1.8768, + "step": 51015 + }, + { + "epoch": 0.09937334231711449, + "grad_norm": 2.4035027027130127, + "learning_rate": 2.9274945533870375e-05, + "loss": 1.9563, + "step": 51030 + }, + { + "epoch": 0.09940255258822475, + "grad_norm": 5.561787128448486, + "learning_rate": 2.9274522689491696e-05, + "loss": 1.9885, + "step": 51045 + }, + { + "epoch": 0.09943176285933501, + "grad_norm": 4.420948505401611, + "learning_rate": 2.9274099724905256e-05, + "loss": 2.0977, + "step": 51060 + }, + { + "epoch": 0.09946097313044529, + "grad_norm": 2.3784122467041016, + "learning_rate": 2.9273676640114614e-05, + "loss": 1.8659, + "step": 51075 + }, + { + "epoch": 0.09949018340155555, + "grad_norm": 4.577541351318359, + "learning_rate": 2.927325343512333e-05, + "loss": 1.8968, + "step": 51090 + }, + { + "epoch": 0.09951939367266581, + "grad_norm": 3.8092751502990723, + "learning_rate": 2.9272830109934974e-05, + "loss": 1.8309, + "step": 51105 + }, + { + "epoch": 0.09954860394377607, + "grad_norm": 2.1280651092529297, + "learning_rate": 2.927240666455311e-05, + "loss": 1.8919, + "step": 51120 + }, + { + "epoch": 0.09957781421488633, + "grad_norm": 2.514244794845581, + "learning_rate": 2.92719830989813e-05, + "loss": 1.8846, + "step": 51135 + }, + { + "epoch": 0.09960702448599659, + "grad_norm": 4.485027313232422, + "learning_rate": 2.9271559413223113e-05, + "loss": 1.859, + "step": 51150 + }, + { + "epoch": 0.09963623475710685, + "grad_norm": 2.2585437297821045, + "learning_rate": 2.927113560728212e-05, + "loss": 2.0863, + "step": 51165 + }, + { + "epoch": 0.09966544502821713, + "grad_norm": 3.707125425338745, + "learning_rate": 2.9270711681161884e-05, + "loss": 1.8858, + "step": 51180 + }, + { + "epoch": 0.09969465529932739, + "grad_norm": 3.3982746601104736, + "learning_rate": 2.9270287634865977e-05, + "loss": 1.603, + "step": 51195 + }, + { + "epoch": 0.09972386557043765, + "grad_norm": 4.028176784515381, + "learning_rate": 2.926986346839797e-05, + "loss": 1.8977, + "step": 51210 + }, + { + "epoch": 0.09975307584154791, + "grad_norm": 4.433598041534424, + "learning_rate": 2.926943918176144e-05, + "loss": 1.835, + "step": 51225 + }, + { + "epoch": 0.09978228611265817, + "grad_norm": 2.535775661468506, + "learning_rate": 2.926901477495995e-05, + "loss": 1.7966, + "step": 51240 + }, + { + "epoch": 0.09981149638376843, + "grad_norm": 2.6152336597442627, + "learning_rate": 2.9268590247997087e-05, + "loss": 1.7176, + "step": 51255 + }, + { + "epoch": 0.0998407066548787, + "grad_norm": 3.727165699005127, + "learning_rate": 2.9268165600876414e-05, + "loss": 1.7679, + "step": 51270 + }, + { + "epoch": 0.09986991692598897, + "grad_norm": 3.6433348655700684, + "learning_rate": 2.9267740833601512e-05, + "loss": 1.7259, + "step": 51285 + }, + { + "epoch": 0.09989912719709923, + "grad_norm": 2.462620496749878, + "learning_rate": 2.9267315946175956e-05, + "loss": 1.8875, + "step": 51300 + }, + { + "epoch": 0.09992833746820949, + "grad_norm": 3.0979127883911133, + "learning_rate": 2.9266890938603326e-05, + "loss": 1.8023, + "step": 51315 + }, + { + "epoch": 0.09995754773931975, + "grad_norm": 2.2129147052764893, + "learning_rate": 2.9266465810887205e-05, + "loss": 1.8864, + "step": 51330 + }, + { + "epoch": 0.09998675801043001, + "grad_norm": 3.626903772354126, + "learning_rate": 2.926604056303117e-05, + "loss": 1.7717, + "step": 51345 + }, + { + "epoch": 0.10001596828154027, + "grad_norm": 3.7731549739837646, + "learning_rate": 2.9265615195038797e-05, + "loss": 1.8323, + "step": 51360 + }, + { + "epoch": 0.10004517855265053, + "grad_norm": 2.074625015258789, + "learning_rate": 2.9265189706913673e-05, + "loss": 1.8121, + "step": 51375 + }, + { + "epoch": 0.10007438882376081, + "grad_norm": 2.714456796646118, + "learning_rate": 2.926476409865938e-05, + "loss": 1.8879, + "step": 51390 + }, + { + "epoch": 0.10010359909487107, + "grad_norm": 4.711696147918701, + "learning_rate": 2.9264338370279502e-05, + "loss": 1.9525, + "step": 51405 + }, + { + "epoch": 0.10013280936598133, + "grad_norm": 2.2080190181732178, + "learning_rate": 2.9263912521777624e-05, + "loss": 1.7958, + "step": 51420 + }, + { + "epoch": 0.10016201963709159, + "grad_norm": 2.6654586791992188, + "learning_rate": 2.9263486553157333e-05, + "loss": 2.0238, + "step": 51435 + }, + { + "epoch": 0.10019122990820185, + "grad_norm": 2.1102378368377686, + "learning_rate": 2.9263060464422217e-05, + "loss": 1.8453, + "step": 51450 + }, + { + "epoch": 0.10022044017931211, + "grad_norm": 2.4648003578186035, + "learning_rate": 2.926263425557586e-05, + "loss": 1.8386, + "step": 51465 + }, + { + "epoch": 0.10024965045042238, + "grad_norm": 4.718409538269043, + "learning_rate": 2.9262207926621855e-05, + "loss": 1.9976, + "step": 51480 + }, + { + "epoch": 0.10027886072153264, + "grad_norm": 2.6654043197631836, + "learning_rate": 2.9261781477563792e-05, + "loss": 1.818, + "step": 51495 + }, + { + "epoch": 0.10030807099264291, + "grad_norm": 2.2968661785125732, + "learning_rate": 2.926135490840526e-05, + "loss": 1.9363, + "step": 51510 + }, + { + "epoch": 0.10033728126375317, + "grad_norm": 5.633299350738525, + "learning_rate": 2.9260928219149856e-05, + "loss": 1.7987, + "step": 51525 + }, + { + "epoch": 0.10036649153486343, + "grad_norm": 3.4443323612213135, + "learning_rate": 2.926050140980117e-05, + "loss": 1.9957, + "step": 51540 + }, + { + "epoch": 0.1003957018059737, + "grad_norm": 2.6234657764434814, + "learning_rate": 2.926007448036279e-05, + "loss": 1.8814, + "step": 51555 + }, + { + "epoch": 0.10042491207708396, + "grad_norm": 3.8197031021118164, + "learning_rate": 2.925964743083832e-05, + "loss": 1.8781, + "step": 51570 + }, + { + "epoch": 0.10045412234819422, + "grad_norm": 4.93427848815918, + "learning_rate": 2.9259220261231355e-05, + "loss": 1.9082, + "step": 51585 + }, + { + "epoch": 0.10048333261930448, + "grad_norm": 2.1275949478149414, + "learning_rate": 2.925879297154549e-05, + "loss": 1.8595, + "step": 51600 + }, + { + "epoch": 0.10051254289041475, + "grad_norm": 3.616004467010498, + "learning_rate": 2.925836556178432e-05, + "loss": 1.8563, + "step": 51615 + }, + { + "epoch": 0.10054175316152501, + "grad_norm": 3.3207528591156006, + "learning_rate": 2.9257938031951455e-05, + "loss": 1.9716, + "step": 51630 + }, + { + "epoch": 0.10057096343263527, + "grad_norm": 2.2283315658569336, + "learning_rate": 2.9257510382050484e-05, + "loss": 1.8513, + "step": 51645 + }, + { + "epoch": 0.10060017370374554, + "grad_norm": 2.2449660301208496, + "learning_rate": 2.9257082612085014e-05, + "loss": 1.9802, + "step": 51660 + }, + { + "epoch": 0.1006293839748558, + "grad_norm": 2.182696580886841, + "learning_rate": 2.9256654722058648e-05, + "loss": 1.7613, + "step": 51675 + }, + { + "epoch": 0.10065859424596606, + "grad_norm": 2.3022265434265137, + "learning_rate": 2.9256226711974984e-05, + "loss": 2.0325, + "step": 51690 + }, + { + "epoch": 0.10068780451707632, + "grad_norm": 2.959179162979126, + "learning_rate": 2.9255798581837634e-05, + "loss": 1.8477, + "step": 51705 + }, + { + "epoch": 0.10071701478818659, + "grad_norm": 2.473968505859375, + "learning_rate": 2.9255370331650196e-05, + "loss": 1.7675, + "step": 51720 + }, + { + "epoch": 0.10074622505929685, + "grad_norm": 2.5270917415618896, + "learning_rate": 2.9254941961416282e-05, + "loss": 1.7278, + "step": 51735 + }, + { + "epoch": 0.10077543533040711, + "grad_norm": 3.788010358810425, + "learning_rate": 2.9254513471139493e-05, + "loss": 1.9634, + "step": 51750 + }, + { + "epoch": 0.10080464560151738, + "grad_norm": 2.746396780014038, + "learning_rate": 2.9254084860823444e-05, + "loss": 1.7762, + "step": 51765 + }, + { + "epoch": 0.10083385587262764, + "grad_norm": 3.6009750366210938, + "learning_rate": 2.925365613047174e-05, + "loss": 1.876, + "step": 51780 + }, + { + "epoch": 0.1008630661437379, + "grad_norm": 4.298994541168213, + "learning_rate": 2.9253227280087992e-05, + "loss": 1.8371, + "step": 51795 + }, + { + "epoch": 0.10089227641484816, + "grad_norm": 3.8493106365203857, + "learning_rate": 2.9252798309675818e-05, + "loss": 1.7269, + "step": 51810 + }, + { + "epoch": 0.10092148668595843, + "grad_norm": 2.5960609912872314, + "learning_rate": 2.9252369219238818e-05, + "loss": 1.7847, + "step": 51825 + }, + { + "epoch": 0.1009506969570687, + "grad_norm": 4.169469356536865, + "learning_rate": 2.9251940008780618e-05, + "loss": 1.8292, + "step": 51840 + }, + { + "epoch": 0.10097990722817896, + "grad_norm": 2.5621516704559326, + "learning_rate": 2.9251510678304824e-05, + "loss": 1.947, + "step": 51855 + }, + { + "epoch": 0.10100911749928922, + "grad_norm": 3.860788583755493, + "learning_rate": 2.9251081227815054e-05, + "loss": 1.7945, + "step": 51870 + }, + { + "epoch": 0.10103832777039948, + "grad_norm": 2.9201455116271973, + "learning_rate": 2.9250651657314925e-05, + "loss": 1.821, + "step": 51885 + }, + { + "epoch": 0.10106753804150974, + "grad_norm": 3.1246635913848877, + "learning_rate": 2.925022196680806e-05, + "loss": 1.7569, + "step": 51900 + }, + { + "epoch": 0.10109674831262, + "grad_norm": 3.1227333545684814, + "learning_rate": 2.924979215629806e-05, + "loss": 2.0489, + "step": 51915 + }, + { + "epoch": 0.10112595858373027, + "grad_norm": 2.620610237121582, + "learning_rate": 2.924936222578856e-05, + "loss": 1.6126, + "step": 51930 + }, + { + "epoch": 0.10115516885484054, + "grad_norm": 3.166445255279541, + "learning_rate": 2.924893217528318e-05, + "loss": 1.6744, + "step": 51945 + }, + { + "epoch": 0.1011843791259508, + "grad_norm": 3.7178285121917725, + "learning_rate": 2.9248502004785538e-05, + "loss": 1.9204, + "step": 51960 + }, + { + "epoch": 0.10121358939706106, + "grad_norm": 2.7688398361206055, + "learning_rate": 2.9248071714299255e-05, + "loss": 1.9956, + "step": 51975 + }, + { + "epoch": 0.10124279966817132, + "grad_norm": 3.1003968715667725, + "learning_rate": 2.924764130382796e-05, + "loss": 2.0411, + "step": 51990 + }, + { + "epoch": 0.10127200993928158, + "grad_norm": 2.189988136291504, + "learning_rate": 2.9247210773375268e-05, + "loss": 1.8253, + "step": 52005 + }, + { + "epoch": 0.10130122021039184, + "grad_norm": 1.7499350309371948, + "learning_rate": 2.924678012294481e-05, + "loss": 1.8106, + "step": 52020 + }, + { + "epoch": 0.10133043048150212, + "grad_norm": 3.2424755096435547, + "learning_rate": 2.9246349352540217e-05, + "loss": 1.7018, + "step": 52035 + }, + { + "epoch": 0.10135964075261238, + "grad_norm": 2.846792697906494, + "learning_rate": 2.924591846216511e-05, + "loss": 1.8952, + "step": 52050 + }, + { + "epoch": 0.10138885102372264, + "grad_norm": 2.178586006164551, + "learning_rate": 2.9245487451823123e-05, + "loss": 1.6687, + "step": 52065 + }, + { + "epoch": 0.1014180612948329, + "grad_norm": 4.501686096191406, + "learning_rate": 2.924505632151788e-05, + "loss": 1.8369, + "step": 52080 + }, + { + "epoch": 0.10144727156594316, + "grad_norm": 2.375622272491455, + "learning_rate": 2.9244625071253014e-05, + "loss": 1.7362, + "step": 52095 + }, + { + "epoch": 0.10147648183705342, + "grad_norm": 2.0563292503356934, + "learning_rate": 2.924419370103216e-05, + "loss": 1.8708, + "step": 52110 + }, + { + "epoch": 0.10150569210816368, + "grad_norm": 2.451967477798462, + "learning_rate": 2.924376221085894e-05, + "loss": 1.8092, + "step": 52125 + }, + { + "epoch": 0.10153490237927396, + "grad_norm": 3.121628999710083, + "learning_rate": 2.9243330600737003e-05, + "loss": 1.843, + "step": 52140 + }, + { + "epoch": 0.10156411265038422, + "grad_norm": 2.7133121490478516, + "learning_rate": 2.924289887066997e-05, + "loss": 2.1271, + "step": 52155 + }, + { + "epoch": 0.10159332292149448, + "grad_norm": 2.930771589279175, + "learning_rate": 2.9242467020661488e-05, + "loss": 1.8519, + "step": 52170 + }, + { + "epoch": 0.10162253319260474, + "grad_norm": 2.4638404846191406, + "learning_rate": 2.9242035050715184e-05, + "loss": 1.8286, + "step": 52185 + }, + { + "epoch": 0.101651743463715, + "grad_norm": 2.1258838176727295, + "learning_rate": 2.92416029608347e-05, + "loss": 1.7845, + "step": 52200 + }, + { + "epoch": 0.10168095373482526, + "grad_norm": 2.9611170291900635, + "learning_rate": 2.924117075102368e-05, + "loss": 1.9441, + "step": 52215 + }, + { + "epoch": 0.10171016400593552, + "grad_norm": 3.221245288848877, + "learning_rate": 2.9240738421285748e-05, + "loss": 2.0082, + "step": 52230 + }, + { + "epoch": 0.1017393742770458, + "grad_norm": 2.655762195587158, + "learning_rate": 2.924030597162456e-05, + "loss": 2.1104, + "step": 52245 + }, + { + "epoch": 0.10176858454815606, + "grad_norm": 2.8793299198150635, + "learning_rate": 2.9239873402043755e-05, + "loss": 1.728, + "step": 52260 + }, + { + "epoch": 0.10179779481926632, + "grad_norm": 5.0151214599609375, + "learning_rate": 2.9239440712546974e-05, + "loss": 1.9032, + "step": 52275 + }, + { + "epoch": 0.10182700509037658, + "grad_norm": 3.8040266036987305, + "learning_rate": 2.9239007903137857e-05, + "loss": 1.8819, + "step": 52290 + }, + { + "epoch": 0.10185621536148684, + "grad_norm": 2.1644256114959717, + "learning_rate": 2.9238574973820055e-05, + "loss": 1.8921, + "step": 52305 + }, + { + "epoch": 0.1018854256325971, + "grad_norm": 2.887611150741577, + "learning_rate": 2.9238141924597207e-05, + "loss": 1.7561, + "step": 52320 + }, + { + "epoch": 0.10191463590370736, + "grad_norm": 3.016183853149414, + "learning_rate": 2.9237708755472963e-05, + "loss": 2.0133, + "step": 52335 + }, + { + "epoch": 0.10194384617481764, + "grad_norm": 2.413327693939209, + "learning_rate": 2.923727546645097e-05, + "loss": 1.8857, + "step": 52350 + }, + { + "epoch": 0.1019730564459279, + "grad_norm": 2.3333163261413574, + "learning_rate": 2.923684205753488e-05, + "loss": 1.8865, + "step": 52365 + }, + { + "epoch": 0.10200226671703816, + "grad_norm": 2.833000898361206, + "learning_rate": 2.9236408528728346e-05, + "loss": 1.7953, + "step": 52380 + }, + { + "epoch": 0.10203147698814842, + "grad_norm": 4.4297776222229, + "learning_rate": 2.9235974880035002e-05, + "loss": 1.7317, + "step": 52395 + }, + { + "epoch": 0.10206068725925868, + "grad_norm": 5.2759175300598145, + "learning_rate": 2.9235541111458518e-05, + "loss": 1.9133, + "step": 52410 + }, + { + "epoch": 0.10208989753036894, + "grad_norm": 2.9812684059143066, + "learning_rate": 2.923510722300254e-05, + "loss": 1.8737, + "step": 52425 + }, + { + "epoch": 0.1021191078014792, + "grad_norm": 2.062561511993408, + "learning_rate": 2.9234673214670725e-05, + "loss": 1.9559, + "step": 52440 + }, + { + "epoch": 0.10214831807258948, + "grad_norm": 3.152745008468628, + "learning_rate": 2.9234239086466725e-05, + "loss": 1.9828, + "step": 52455 + }, + { + "epoch": 0.10217752834369974, + "grad_norm": 2.9057366847991943, + "learning_rate": 2.923380483839419e-05, + "loss": 1.7809, + "step": 52470 + }, + { + "epoch": 0.10220673861481, + "grad_norm": 2.111750602722168, + "learning_rate": 2.9233370470456785e-05, + "loss": 1.7501, + "step": 52485 + }, + { + "epoch": 0.10223594888592026, + "grad_norm": 2.3556952476501465, + "learning_rate": 2.9232935982658164e-05, + "loss": 1.8154, + "step": 52500 + }, + { + "epoch": 0.10226515915703052, + "grad_norm": 3.024723768234253, + "learning_rate": 2.923250137500199e-05, + "loss": 1.8, + "step": 52515 + }, + { + "epoch": 0.10229436942814078, + "grad_norm": 2.7702646255493164, + "learning_rate": 2.923206664749192e-05, + "loss": 1.8854, + "step": 52530 + }, + { + "epoch": 0.10232357969925104, + "grad_norm": 3.769913911819458, + "learning_rate": 2.923163180013161e-05, + "loss": 1.6611, + "step": 52545 + }, + { + "epoch": 0.1023527899703613, + "grad_norm": 4.062292575836182, + "learning_rate": 2.923119683292473e-05, + "loss": 1.7646, + "step": 52560 + }, + { + "epoch": 0.10238200024147158, + "grad_norm": 2.6236956119537354, + "learning_rate": 2.9230761745874945e-05, + "loss": 1.74, + "step": 52575 + }, + { + "epoch": 0.10241121051258184, + "grad_norm": 3.377202033996582, + "learning_rate": 2.923032653898591e-05, + "loss": 1.7617, + "step": 52590 + }, + { + "epoch": 0.1024404207836921, + "grad_norm": 2.714411735534668, + "learning_rate": 2.922989121226129e-05, + "loss": 1.9378, + "step": 52605 + }, + { + "epoch": 0.10246963105480236, + "grad_norm": 2.279560089111328, + "learning_rate": 2.9229455765704763e-05, + "loss": 1.8215, + "step": 52620 + }, + { + "epoch": 0.10249884132591262, + "grad_norm": 2.452533721923828, + "learning_rate": 2.9229020199319984e-05, + "loss": 1.8956, + "step": 52635 + }, + { + "epoch": 0.10252805159702288, + "grad_norm": 2.515977382659912, + "learning_rate": 2.9228584513110617e-05, + "loss": 1.838, + "step": 52650 + }, + { + "epoch": 0.10255726186813315, + "grad_norm": 2.400944232940674, + "learning_rate": 2.922814870708035e-05, + "loss": 1.8145, + "step": 52665 + }, + { + "epoch": 0.10258647213924342, + "grad_norm": 4.391109943389893, + "learning_rate": 2.9227712781232837e-05, + "loss": 1.981, + "step": 52680 + }, + { + "epoch": 0.10261568241035368, + "grad_norm": 3.477064847946167, + "learning_rate": 2.922727673557175e-05, + "loss": 1.9165, + "step": 52695 + }, + { + "epoch": 0.10264489268146394, + "grad_norm": 2.854994058609009, + "learning_rate": 2.922684057010077e-05, + "loss": 2.0644, + "step": 52710 + }, + { + "epoch": 0.1026741029525742, + "grad_norm": 2.474790573120117, + "learning_rate": 2.9226404284823566e-05, + "loss": 2.003, + "step": 52725 + }, + { + "epoch": 0.10270331322368446, + "grad_norm": 1.8789151906967163, + "learning_rate": 2.9225967879743805e-05, + "loss": 1.9084, + "step": 52740 + }, + { + "epoch": 0.10273252349479473, + "grad_norm": 3.587696075439453, + "learning_rate": 2.922553135486517e-05, + "loss": 1.7805, + "step": 52755 + }, + { + "epoch": 0.10276173376590499, + "grad_norm": 3.5017142295837402, + "learning_rate": 2.9225094710191335e-05, + "loss": 1.8052, + "step": 52770 + }, + { + "epoch": 0.10279094403701526, + "grad_norm": 3.3561439514160156, + "learning_rate": 2.9224657945725974e-05, + "loss": 1.5558, + "step": 52785 + }, + { + "epoch": 0.10282015430812552, + "grad_norm": 2.1130895614624023, + "learning_rate": 2.9224221061472774e-05, + "loss": 2.0765, + "step": 52800 + }, + { + "epoch": 0.10284936457923578, + "grad_norm": 3.7945330142974854, + "learning_rate": 2.9223784057435403e-05, + "loss": 1.8974, + "step": 52815 + }, + { + "epoch": 0.10287857485034604, + "grad_norm": 2.3494579792022705, + "learning_rate": 2.9223346933617547e-05, + "loss": 1.8936, + "step": 52830 + }, + { + "epoch": 0.1029077851214563, + "grad_norm": 3.628025770187378, + "learning_rate": 2.9222909690022887e-05, + "loss": 1.8944, + "step": 52845 + }, + { + "epoch": 0.10293699539256657, + "grad_norm": 2.961174488067627, + "learning_rate": 2.9222472326655102e-05, + "loss": 1.7924, + "step": 52860 + }, + { + "epoch": 0.10296620566367683, + "grad_norm": 2.788062810897827, + "learning_rate": 2.9222034843517878e-05, + "loss": 1.8915, + "step": 52875 + }, + { + "epoch": 0.1029954159347871, + "grad_norm": 2.6212539672851562, + "learning_rate": 2.92215972406149e-05, + "loss": 1.8854, + "step": 52890 + }, + { + "epoch": 0.10302462620589736, + "grad_norm": 3.915565013885498, + "learning_rate": 2.9221159517949848e-05, + "loss": 1.8489, + "step": 52905 + }, + { + "epoch": 0.10305383647700762, + "grad_norm": 2.287111759185791, + "learning_rate": 2.9220721675526418e-05, + "loss": 1.8905, + "step": 52920 + }, + { + "epoch": 0.10308304674811788, + "grad_norm": 1.9591848850250244, + "learning_rate": 2.9220283713348287e-05, + "loss": 1.8771, + "step": 52935 + }, + { + "epoch": 0.10311225701922815, + "grad_norm": 2.181339979171753, + "learning_rate": 2.9219845631419146e-05, + "loss": 1.8998, + "step": 52950 + }, + { + "epoch": 0.1031414672903384, + "grad_norm": 4.41172981262207, + "learning_rate": 2.9219407429742683e-05, + "loss": 1.7746, + "step": 52965 + }, + { + "epoch": 0.10317067756144867, + "grad_norm": 3.7191624641418457, + "learning_rate": 2.9218969108322593e-05, + "loss": 1.7355, + "step": 52980 + }, + { + "epoch": 0.10319988783255894, + "grad_norm": 2.4057443141937256, + "learning_rate": 2.9218530667162567e-05, + "loss": 1.8293, + "step": 52995 + }, + { + "epoch": 0.1032290981036692, + "grad_norm": 3.978999137878418, + "learning_rate": 2.921809210626629e-05, + "loss": 1.9741, + "step": 53010 + }, + { + "epoch": 0.10325830837477946, + "grad_norm": 1.9080361127853394, + "learning_rate": 2.9217653425637462e-05, + "loss": 1.7423, + "step": 53025 + }, + { + "epoch": 0.10328751864588973, + "grad_norm": 2.817746877670288, + "learning_rate": 2.9217214625279776e-05, + "loss": 1.701, + "step": 53040 + }, + { + "epoch": 0.10331672891699999, + "grad_norm": 2.3751299381256104, + "learning_rate": 2.9216775705196924e-05, + "loss": 1.8182, + "step": 53055 + }, + { + "epoch": 0.10334593918811025, + "grad_norm": 2.12813401222229, + "learning_rate": 2.9216336665392602e-05, + "loss": 1.7919, + "step": 53070 + }, + { + "epoch": 0.10337514945922051, + "grad_norm": 2.3823862075805664, + "learning_rate": 2.921589750587051e-05, + "loss": 1.8145, + "step": 53085 + }, + { + "epoch": 0.10340435973033078, + "grad_norm": 3.9402434825897217, + "learning_rate": 2.921545822663435e-05, + "loss": 1.8797, + "step": 53100 + }, + { + "epoch": 0.10343357000144104, + "grad_norm": 2.0574233531951904, + "learning_rate": 2.9215018827687816e-05, + "loss": 1.8935, + "step": 53115 + }, + { + "epoch": 0.1034627802725513, + "grad_norm": 2.729658842086792, + "learning_rate": 2.9214579309034603e-05, + "loss": 2.0222, + "step": 53130 + }, + { + "epoch": 0.10349199054366157, + "grad_norm": 2.9515881538391113, + "learning_rate": 2.9214139670678423e-05, + "loss": 1.9446, + "step": 53145 + }, + { + "epoch": 0.10352120081477183, + "grad_norm": 3.3878188133239746, + "learning_rate": 2.9213699912622976e-05, + "loss": 1.7399, + "step": 53160 + }, + { + "epoch": 0.10355041108588209, + "grad_norm": 3.7901418209075928, + "learning_rate": 2.921326003487196e-05, + "loss": 1.8246, + "step": 53175 + }, + { + "epoch": 0.10357962135699235, + "grad_norm": 3.6533632278442383, + "learning_rate": 2.9212820037429082e-05, + "loss": 1.835, + "step": 53190 + }, + { + "epoch": 0.10360883162810262, + "grad_norm": 3.312243938446045, + "learning_rate": 2.921237992029805e-05, + "loss": 1.9275, + "step": 53205 + }, + { + "epoch": 0.10363804189921288, + "grad_norm": 3.8666319847106934, + "learning_rate": 2.9211939683482568e-05, + "loss": 1.9619, + "step": 53220 + }, + { + "epoch": 0.10366725217032315, + "grad_norm": 3.0532631874084473, + "learning_rate": 2.921149932698634e-05, + "loss": 1.6835, + "step": 53235 + }, + { + "epoch": 0.1036964624414334, + "grad_norm": 3.0404837131500244, + "learning_rate": 2.9211058850813076e-05, + "loss": 2.1241, + "step": 53250 + }, + { + "epoch": 0.10372567271254367, + "grad_norm": 3.4185240268707275, + "learning_rate": 2.9210618254966492e-05, + "loss": 2.0408, + "step": 53265 + }, + { + "epoch": 0.10375488298365393, + "grad_norm": 3.211634874343872, + "learning_rate": 2.9210177539450294e-05, + "loss": 2.0227, + "step": 53280 + }, + { + "epoch": 0.10378409325476419, + "grad_norm": 2.959613561630249, + "learning_rate": 2.9209736704268188e-05, + "loss": 1.7733, + "step": 53295 + }, + { + "epoch": 0.10381330352587446, + "grad_norm": 2.7243733406066895, + "learning_rate": 2.920929574942389e-05, + "loss": 1.8423, + "step": 53310 + }, + { + "epoch": 0.10384251379698473, + "grad_norm": 1.9866245985031128, + "learning_rate": 2.9208854674921116e-05, + "loss": 1.6459, + "step": 53325 + }, + { + "epoch": 0.10387172406809499, + "grad_norm": 3.77958345413208, + "learning_rate": 2.9208413480763577e-05, + "loss": 1.9329, + "step": 53340 + }, + { + "epoch": 0.10390093433920525, + "grad_norm": 2.4954938888549805, + "learning_rate": 2.9207972166954994e-05, + "loss": 1.6866, + "step": 53355 + }, + { + "epoch": 0.10393014461031551, + "grad_norm": 2.948174476623535, + "learning_rate": 2.9207530733499073e-05, + "loss": 1.676, + "step": 53370 + }, + { + "epoch": 0.10395935488142577, + "grad_norm": 1.7526262998580933, + "learning_rate": 2.920708918039954e-05, + "loss": 1.6741, + "step": 53385 + }, + { + "epoch": 0.10398856515253603, + "grad_norm": 1.9519858360290527, + "learning_rate": 2.920664750766011e-05, + "loss": 1.9324, + "step": 53400 + }, + { + "epoch": 0.1040177754236463, + "grad_norm": 3.7122256755828857, + "learning_rate": 2.920620571528451e-05, + "loss": 2.0231, + "step": 53415 + }, + { + "epoch": 0.10404698569475657, + "grad_norm": 2.3948066234588623, + "learning_rate": 2.920576380327644e-05, + "loss": 1.8915, + "step": 53430 + }, + { + "epoch": 0.10407619596586683, + "grad_norm": 2.3851559162139893, + "learning_rate": 2.920532177163965e-05, + "loss": 1.941, + "step": 53445 + }, + { + "epoch": 0.10410540623697709, + "grad_norm": 2.0305466651916504, + "learning_rate": 2.920487962037784e-05, + "loss": 1.8089, + "step": 53460 + }, + { + "epoch": 0.10413461650808735, + "grad_norm": 3.6030194759368896, + "learning_rate": 2.920443734949474e-05, + "loss": 1.9752, + "step": 53475 + }, + { + "epoch": 0.10416382677919761, + "grad_norm": 4.3383636474609375, + "learning_rate": 2.9203994958994074e-05, + "loss": 1.6396, + "step": 53490 + }, + { + "epoch": 0.10419303705030787, + "grad_norm": 4.8360915184021, + "learning_rate": 2.9203552448879576e-05, + "loss": 1.6672, + "step": 53505 + }, + { + "epoch": 0.10422224732141815, + "grad_norm": 3.100064754486084, + "learning_rate": 2.9203109819154958e-05, + "loss": 1.9135, + "step": 53520 + }, + { + "epoch": 0.10425145759252841, + "grad_norm": 2.019653081893921, + "learning_rate": 2.920266706982396e-05, + "loss": 1.9831, + "step": 53535 + }, + { + "epoch": 0.10428066786363867, + "grad_norm": 2.8222882747650146, + "learning_rate": 2.92022242008903e-05, + "loss": 1.8984, + "step": 53550 + }, + { + "epoch": 0.10430987813474893, + "grad_norm": 1.9325177669525146, + "learning_rate": 2.9201781212357716e-05, + "loss": 1.8352, + "step": 53565 + }, + { + "epoch": 0.10433908840585919, + "grad_norm": 4.413449764251709, + "learning_rate": 2.9201338104229937e-05, + "loss": 1.8048, + "step": 53580 + }, + { + "epoch": 0.10436829867696945, + "grad_norm": 3.0887482166290283, + "learning_rate": 2.9200894876510687e-05, + "loss": 1.8392, + "step": 53595 + }, + { + "epoch": 0.10439750894807971, + "grad_norm": 1.9377580881118774, + "learning_rate": 2.9200451529203707e-05, + "loss": 1.8473, + "step": 53610 + }, + { + "epoch": 0.10442671921918997, + "grad_norm": 2.265601396560669, + "learning_rate": 2.9200008062312728e-05, + "loss": 1.8195, + "step": 53625 + }, + { + "epoch": 0.10445592949030025, + "grad_norm": 3.035935401916504, + "learning_rate": 2.919956447584148e-05, + "loss": 1.8027, + "step": 53640 + }, + { + "epoch": 0.10448513976141051, + "grad_norm": 2.5451693534851074, + "learning_rate": 2.9199120769793705e-05, + "loss": 1.6678, + "step": 53655 + }, + { + "epoch": 0.10451435003252077, + "grad_norm": 3.594522476196289, + "learning_rate": 2.9198676944173135e-05, + "loss": 1.701, + "step": 53670 + }, + { + "epoch": 0.10454356030363103, + "grad_norm": 2.1554315090179443, + "learning_rate": 2.9198232998983514e-05, + "loss": 1.8878, + "step": 53685 + }, + { + "epoch": 0.10457277057474129, + "grad_norm": 3.5895397663116455, + "learning_rate": 2.9197788934228576e-05, + "loss": 1.7508, + "step": 53700 + }, + { + "epoch": 0.10460198084585155, + "grad_norm": 7.4402337074279785, + "learning_rate": 2.9197344749912054e-05, + "loss": 1.6983, + "step": 53715 + }, + { + "epoch": 0.10463119111696181, + "grad_norm": 2.151456117630005, + "learning_rate": 2.9196900446037702e-05, + "loss": 1.7552, + "step": 53730 + }, + { + "epoch": 0.10466040138807209, + "grad_norm": 4.100891590118408, + "learning_rate": 2.919645602260925e-05, + "loss": 1.7669, + "step": 53745 + }, + { + "epoch": 0.10468961165918235, + "grad_norm": 3.0575950145721436, + "learning_rate": 2.9196011479630448e-05, + "loss": 2.0725, + "step": 53760 + }, + { + "epoch": 0.10471882193029261, + "grad_norm": 2.82108211517334, + "learning_rate": 2.9195566817105036e-05, + "loss": 1.9274, + "step": 53775 + }, + { + "epoch": 0.10474803220140287, + "grad_norm": 2.9936230182647705, + "learning_rate": 2.919512203503676e-05, + "loss": 1.75, + "step": 53790 + }, + { + "epoch": 0.10477724247251313, + "grad_norm": 3.8622894287109375, + "learning_rate": 2.9194677133429364e-05, + "loss": 1.962, + "step": 53805 + }, + { + "epoch": 0.1048064527436234, + "grad_norm": 2.2222511768341064, + "learning_rate": 2.9194232112286594e-05, + "loss": 1.806, + "step": 53820 + }, + { + "epoch": 0.10483566301473365, + "grad_norm": 2.2523059844970703, + "learning_rate": 2.91937869716122e-05, + "loss": 1.7863, + "step": 53835 + }, + { + "epoch": 0.10486487328584393, + "grad_norm": 2.7013745307922363, + "learning_rate": 2.919334171140993e-05, + "loss": 1.9081, + "step": 53850 + }, + { + "epoch": 0.10489408355695419, + "grad_norm": 2.1757166385650635, + "learning_rate": 2.9192896331683532e-05, + "loss": 1.7756, + "step": 53865 + }, + { + "epoch": 0.10492329382806445, + "grad_norm": 3.7949211597442627, + "learning_rate": 2.9192450832436762e-05, + "loss": 1.6572, + "step": 53880 + }, + { + "epoch": 0.10495250409917471, + "grad_norm": 4.856202125549316, + "learning_rate": 2.9192005213673363e-05, + "loss": 1.7781, + "step": 53895 + }, + { + "epoch": 0.10498171437028497, + "grad_norm": 3.3601744174957275, + "learning_rate": 2.9191559475397094e-05, + "loss": 1.8988, + "step": 53910 + }, + { + "epoch": 0.10501092464139523, + "grad_norm": 2.9697728157043457, + "learning_rate": 2.919111361761171e-05, + "loss": 1.8731, + "step": 53925 + }, + { + "epoch": 0.1050401349125055, + "grad_norm": 4.680106163024902, + "learning_rate": 2.9190667640320958e-05, + "loss": 1.8678, + "step": 53940 + }, + { + "epoch": 0.10506934518361577, + "grad_norm": 3.960859537124634, + "learning_rate": 2.91902215435286e-05, + "loss": 1.7715, + "step": 53955 + }, + { + "epoch": 0.10509855545472603, + "grad_norm": 2.9306797981262207, + "learning_rate": 2.9189775327238387e-05, + "loss": 1.9109, + "step": 53970 + }, + { + "epoch": 0.10512776572583629, + "grad_norm": 3.277047872543335, + "learning_rate": 2.9189328991454083e-05, + "loss": 1.7692, + "step": 53985 + }, + { + "epoch": 0.10515697599694655, + "grad_norm": 2.453782081604004, + "learning_rate": 2.9188882536179443e-05, + "loss": 1.8473, + "step": 54000 + }, + { + "epoch": 0.10518618626805681, + "grad_norm": 4.33857536315918, + "learning_rate": 2.918843596141823e-05, + "loss": 1.8246, + "step": 54015 + }, + { + "epoch": 0.10521539653916707, + "grad_norm": 2.3152732849121094, + "learning_rate": 2.9187989267174194e-05, + "loss": 1.7076, + "step": 54030 + }, + { + "epoch": 0.10524460681027734, + "grad_norm": 2.5881829261779785, + "learning_rate": 2.9187542453451116e-05, + "loss": 1.8278, + "step": 54045 + }, + { + "epoch": 0.10527381708138761, + "grad_norm": 3.1161749362945557, + "learning_rate": 2.9187095520252742e-05, + "loss": 1.8375, + "step": 54060 + }, + { + "epoch": 0.10530302735249787, + "grad_norm": 2.995056390762329, + "learning_rate": 2.918664846758284e-05, + "loss": 1.974, + "step": 54075 + }, + { + "epoch": 0.10533223762360813, + "grad_norm": 3.396555185317993, + "learning_rate": 2.9186201295445178e-05, + "loss": 1.9339, + "step": 54090 + }, + { + "epoch": 0.1053614478947184, + "grad_norm": 3.7119739055633545, + "learning_rate": 2.9185754003843522e-05, + "loss": 1.9313, + "step": 54105 + }, + { + "epoch": 0.10539065816582865, + "grad_norm": 2.912541627883911, + "learning_rate": 2.9185306592781632e-05, + "loss": 1.8674, + "step": 54120 + }, + { + "epoch": 0.10541986843693892, + "grad_norm": 3.3226304054260254, + "learning_rate": 2.918485906226328e-05, + "loss": 1.9631, + "step": 54135 + }, + { + "epoch": 0.10544907870804918, + "grad_norm": 2.9504806995391846, + "learning_rate": 2.918441141229224e-05, + "loss": 1.8417, + "step": 54150 + }, + { + "epoch": 0.10547828897915945, + "grad_norm": 4.178096771240234, + "learning_rate": 2.9183963642872277e-05, + "loss": 1.8775, + "step": 54165 + }, + { + "epoch": 0.10550749925026971, + "grad_norm": 2.280705690383911, + "learning_rate": 2.9183515754007154e-05, + "loss": 1.794, + "step": 54180 + }, + { + "epoch": 0.10553670952137997, + "grad_norm": 3.4642956256866455, + "learning_rate": 2.9183067745700652e-05, + "loss": 1.699, + "step": 54195 + }, + { + "epoch": 0.10556591979249023, + "grad_norm": 1.8567029237747192, + "learning_rate": 2.918261961795655e-05, + "loss": 1.9783, + "step": 54210 + }, + { + "epoch": 0.1055951300636005, + "grad_norm": 2.8850319385528564, + "learning_rate": 2.9182171370778604e-05, + "loss": 1.8808, + "step": 54225 + }, + { + "epoch": 0.10562434033471076, + "grad_norm": 3.791398525238037, + "learning_rate": 2.9181723004170602e-05, + "loss": 1.9922, + "step": 54240 + }, + { + "epoch": 0.10565355060582102, + "grad_norm": 3.199683666229248, + "learning_rate": 2.9181274518136315e-05, + "loss": 1.8032, + "step": 54255 + }, + { + "epoch": 0.10568276087693129, + "grad_norm": 3.3029825687408447, + "learning_rate": 2.9180825912679517e-05, + "loss": 1.88, + "step": 54270 + }, + { + "epoch": 0.10571197114804155, + "grad_norm": 2.8857414722442627, + "learning_rate": 2.9180377187803996e-05, + "loss": 1.8091, + "step": 54285 + }, + { + "epoch": 0.10574118141915181, + "grad_norm": 2.650928258895874, + "learning_rate": 2.9179928343513522e-05, + "loss": 1.7526, + "step": 54300 + }, + { + "epoch": 0.10577039169026207, + "grad_norm": 2.761585235595703, + "learning_rate": 2.9179479379811877e-05, + "loss": 1.9965, + "step": 54315 + }, + { + "epoch": 0.10579960196137234, + "grad_norm": 2.232044219970703, + "learning_rate": 2.9179030296702843e-05, + "loss": 2.0417, + "step": 54330 + }, + { + "epoch": 0.1058288122324826, + "grad_norm": 2.156282901763916, + "learning_rate": 2.9178581094190196e-05, + "loss": 1.9386, + "step": 54345 + }, + { + "epoch": 0.10585802250359286, + "grad_norm": 3.6601815223693848, + "learning_rate": 2.9178131772277732e-05, + "loss": 2.0005, + "step": 54360 + }, + { + "epoch": 0.10588723277470313, + "grad_norm": 2.8894190788269043, + "learning_rate": 2.9177682330969218e-05, + "loss": 1.9366, + "step": 54375 + }, + { + "epoch": 0.1059164430458134, + "grad_norm": 5.554039478302002, + "learning_rate": 2.9177232770268452e-05, + "loss": 1.9076, + "step": 54390 + }, + { + "epoch": 0.10594565331692365, + "grad_norm": 4.123503684997559, + "learning_rate": 2.9176783090179212e-05, + "loss": 1.8028, + "step": 54405 + }, + { + "epoch": 0.10597486358803392, + "grad_norm": 2.6164982318878174, + "learning_rate": 2.917633329070529e-05, + "loss": 1.7757, + "step": 54420 + }, + { + "epoch": 0.10600407385914418, + "grad_norm": 2.3009068965911865, + "learning_rate": 2.917588337185047e-05, + "loss": 2.0487, + "step": 54435 + }, + { + "epoch": 0.10603328413025444, + "grad_norm": 3.877465009689331, + "learning_rate": 2.917543333361854e-05, + "loss": 1.8231, + "step": 54450 + }, + { + "epoch": 0.1060624944013647, + "grad_norm": 4.830938339233398, + "learning_rate": 2.91749831760133e-05, + "loss": 1.845, + "step": 54465 + }, + { + "epoch": 0.10609170467247497, + "grad_norm": 2.1726012229919434, + "learning_rate": 2.9174532899038528e-05, + "loss": 1.7704, + "step": 54480 + }, + { + "epoch": 0.10612091494358523, + "grad_norm": 3.9829185009002686, + "learning_rate": 2.9174082502698018e-05, + "loss": 1.9241, + "step": 54495 + }, + { + "epoch": 0.1061501252146955, + "grad_norm": 2.6498842239379883, + "learning_rate": 2.917363198699557e-05, + "loss": 1.9205, + "step": 54510 + }, + { + "epoch": 0.10617933548580576, + "grad_norm": 4.133223533630371, + "learning_rate": 2.9173181351934975e-05, + "loss": 1.8846, + "step": 54525 + }, + { + "epoch": 0.10620854575691602, + "grad_norm": 2.6936542987823486, + "learning_rate": 2.917273059752002e-05, + "loss": 1.7683, + "step": 54540 + }, + { + "epoch": 0.10623775602802628, + "grad_norm": 2.0498580932617188, + "learning_rate": 2.9172279723754513e-05, + "loss": 1.7016, + "step": 54555 + }, + { + "epoch": 0.10626696629913654, + "grad_norm": 2.8533356189727783, + "learning_rate": 2.9171828730642242e-05, + "loss": 1.6845, + "step": 54570 + }, + { + "epoch": 0.10629617657024681, + "grad_norm": 2.056248426437378, + "learning_rate": 2.9171377618187007e-05, + "loss": 2.0156, + "step": 54585 + }, + { + "epoch": 0.10632538684135708, + "grad_norm": 2.296671152114868, + "learning_rate": 2.9170926386392612e-05, + "loss": 1.849, + "step": 54600 + }, + { + "epoch": 0.10635459711246734, + "grad_norm": 2.182014226913452, + "learning_rate": 2.9170475035262852e-05, + "loss": 1.8554, + "step": 54615 + }, + { + "epoch": 0.1063838073835776, + "grad_norm": 3.1786794662475586, + "learning_rate": 2.917002356480153e-05, + "loss": 1.7434, + "step": 54630 + }, + { + "epoch": 0.10641301765468786, + "grad_norm": 2.4866461753845215, + "learning_rate": 2.9169571975012443e-05, + "loss": 1.6749, + "step": 54645 + }, + { + "epoch": 0.10644222792579812, + "grad_norm": 2.232187032699585, + "learning_rate": 2.9169120265899397e-05, + "loss": 1.7389, + "step": 54660 + }, + { + "epoch": 0.10647143819690838, + "grad_norm": 2.5730066299438477, + "learning_rate": 2.9168668437466198e-05, + "loss": 1.8662, + "step": 54675 + }, + { + "epoch": 0.10650064846801864, + "grad_norm": 4.279869556427002, + "learning_rate": 2.9168216489716652e-05, + "loss": 1.7921, + "step": 54690 + }, + { + "epoch": 0.10652985873912892, + "grad_norm": 3.771742105484009, + "learning_rate": 2.916776442265456e-05, + "loss": 1.5656, + "step": 54705 + }, + { + "epoch": 0.10655906901023918, + "grad_norm": 3.6204771995544434, + "learning_rate": 2.916731223628373e-05, + "loss": 1.8978, + "step": 54720 + }, + { + "epoch": 0.10658827928134944, + "grad_norm": 2.9383411407470703, + "learning_rate": 2.9166859930607975e-05, + "loss": 1.7613, + "step": 54735 + }, + { + "epoch": 0.1066174895524597, + "grad_norm": 3.8747472763061523, + "learning_rate": 2.9166407505631095e-05, + "loss": 1.932, + "step": 54750 + }, + { + "epoch": 0.10664669982356996, + "grad_norm": 3.5089399814605713, + "learning_rate": 2.916595496135691e-05, + "loss": 1.7863, + "step": 54765 + }, + { + "epoch": 0.10667591009468022, + "grad_norm": 3.0446619987487793, + "learning_rate": 2.916550229778922e-05, + "loss": 1.7766, + "step": 54780 + }, + { + "epoch": 0.10670512036579048, + "grad_norm": 2.6004891395568848, + "learning_rate": 2.9165049514931847e-05, + "loss": 1.8076, + "step": 54795 + }, + { + "epoch": 0.10673433063690076, + "grad_norm": 3.3393611907958984, + "learning_rate": 2.91645966127886e-05, + "loss": 1.7971, + "step": 54810 + }, + { + "epoch": 0.10676354090801102, + "grad_norm": 3.000096559524536, + "learning_rate": 2.9164143591363288e-05, + "loss": 2.0109, + "step": 54825 + }, + { + "epoch": 0.10679275117912128, + "grad_norm": 2.582181930541992, + "learning_rate": 2.9163690450659735e-05, + "loss": 1.8032, + "step": 54840 + }, + { + "epoch": 0.10682196145023154, + "grad_norm": 3.554900884628296, + "learning_rate": 2.9163237190681757e-05, + "loss": 1.7566, + "step": 54855 + }, + { + "epoch": 0.1068511717213418, + "grad_norm": 4.5273895263671875, + "learning_rate": 2.9162783811433157e-05, + "loss": 2.1533, + "step": 54870 + }, + { + "epoch": 0.10688038199245206, + "grad_norm": 3.3506033420562744, + "learning_rate": 2.9162330312917768e-05, + "loss": 1.9713, + "step": 54885 + }, + { + "epoch": 0.10690959226356232, + "grad_norm": 2.58782958984375, + "learning_rate": 2.9161876695139405e-05, + "loss": 1.9986, + "step": 54900 + }, + { + "epoch": 0.1069388025346726, + "grad_norm": 3.1117055416107178, + "learning_rate": 2.9161422958101882e-05, + "loss": 1.7364, + "step": 54915 + }, + { + "epoch": 0.10696801280578286, + "grad_norm": 4.8319091796875, + "learning_rate": 2.9160969101809026e-05, + "loss": 1.8781, + "step": 54930 + }, + { + "epoch": 0.10699722307689312, + "grad_norm": 2.5479674339294434, + "learning_rate": 2.9160515126264658e-05, + "loss": 1.9511, + "step": 54945 + }, + { + "epoch": 0.10702643334800338, + "grad_norm": 3.125947952270508, + "learning_rate": 2.9160061031472602e-05, + "loss": 1.8599, + "step": 54960 + }, + { + "epoch": 0.10705564361911364, + "grad_norm": 2.1764276027679443, + "learning_rate": 2.915960681743668e-05, + "loss": 1.9193, + "step": 54975 + }, + { + "epoch": 0.1070848538902239, + "grad_norm": 2.6139886379241943, + "learning_rate": 2.9159152484160718e-05, + "loss": 1.8184, + "step": 54990 + }, + { + "epoch": 0.10711406416133416, + "grad_norm": 3.7947821617126465, + "learning_rate": 2.9158698031648536e-05, + "loss": 1.9546, + "step": 55005 + }, + { + "epoch": 0.10714327443244444, + "grad_norm": 3.525132894515991, + "learning_rate": 2.9158243459903975e-05, + "loss": 1.8876, + "step": 55020 + }, + { + "epoch": 0.1071724847035547, + "grad_norm": 3.559105157852173, + "learning_rate": 2.9157788768930852e-05, + "loss": 1.928, + "step": 55035 + }, + { + "epoch": 0.10720169497466496, + "grad_norm": 5.050568580627441, + "learning_rate": 2.9157333958732994e-05, + "loss": 1.6953, + "step": 55050 + }, + { + "epoch": 0.10723090524577522, + "grad_norm": 3.4258158206939697, + "learning_rate": 2.915687902931424e-05, + "loss": 1.9079, + "step": 55065 + }, + { + "epoch": 0.10726011551688548, + "grad_norm": 3.1921095848083496, + "learning_rate": 2.9156423980678413e-05, + "loss": 1.9112, + "step": 55080 + }, + { + "epoch": 0.10728932578799574, + "grad_norm": 2.5903563499450684, + "learning_rate": 2.9155968812829353e-05, + "loss": 1.8856, + "step": 55095 + }, + { + "epoch": 0.107318536059106, + "grad_norm": 2.8400394916534424, + "learning_rate": 2.915551352577089e-05, + "loss": 1.72, + "step": 55110 + }, + { + "epoch": 0.10734774633021628, + "grad_norm": 3.399661064147949, + "learning_rate": 2.915505811950685e-05, + "loss": 1.7045, + "step": 55125 + }, + { + "epoch": 0.10737695660132654, + "grad_norm": 3.3100569248199463, + "learning_rate": 2.9154602594041078e-05, + "loss": 1.7718, + "step": 55140 + }, + { + "epoch": 0.1074061668724368, + "grad_norm": 5.031435489654541, + "learning_rate": 2.9154146949377406e-05, + "loss": 1.9372, + "step": 55155 + }, + { + "epoch": 0.10743537714354706, + "grad_norm": 2.311403274536133, + "learning_rate": 2.9153691185519674e-05, + "loss": 1.9594, + "step": 55170 + }, + { + "epoch": 0.10746458741465732, + "grad_norm": 2.3605008125305176, + "learning_rate": 2.9153235302471715e-05, + "loss": 1.845, + "step": 55185 + }, + { + "epoch": 0.10749379768576758, + "grad_norm": 3.509260416030884, + "learning_rate": 2.9152779300237373e-05, + "loss": 1.9057, + "step": 55200 + }, + { + "epoch": 0.10752300795687784, + "grad_norm": 2.9556076526641846, + "learning_rate": 2.9152323178820485e-05, + "loss": 1.93, + "step": 55215 + }, + { + "epoch": 0.10755221822798812, + "grad_norm": 2.795161485671997, + "learning_rate": 2.9151866938224892e-05, + "loss": 1.9254, + "step": 55230 + }, + { + "epoch": 0.10758142849909838, + "grad_norm": 2.399402618408203, + "learning_rate": 2.915141057845444e-05, + "loss": 1.9186, + "step": 55245 + }, + { + "epoch": 0.10761063877020864, + "grad_norm": 2.6638965606689453, + "learning_rate": 2.915095409951297e-05, + "loss": 1.6305, + "step": 55260 + }, + { + "epoch": 0.1076398490413189, + "grad_norm": 3.214134931564331, + "learning_rate": 2.915049750140432e-05, + "loss": 1.961, + "step": 55275 + }, + { + "epoch": 0.10766905931242916, + "grad_norm": 1.9011048078536987, + "learning_rate": 2.9150040784132345e-05, + "loss": 1.8158, + "step": 55290 + }, + { + "epoch": 0.10769826958353942, + "grad_norm": 3.0230658054351807, + "learning_rate": 2.914958394770088e-05, + "loss": 1.8849, + "step": 55305 + }, + { + "epoch": 0.10772747985464969, + "grad_norm": 3.9477922916412354, + "learning_rate": 2.9149126992113787e-05, + "loss": 2.0242, + "step": 55320 + }, + { + "epoch": 0.10775669012575996, + "grad_norm": 2.6040146350860596, + "learning_rate": 2.9148669917374903e-05, + "loss": 1.7643, + "step": 55335 + }, + { + "epoch": 0.10778590039687022, + "grad_norm": 2.8662683963775635, + "learning_rate": 2.914821272348808e-05, + "loss": 1.8317, + "step": 55350 + }, + { + "epoch": 0.10781511066798048, + "grad_norm": 3.39095401763916, + "learning_rate": 2.9147755410457166e-05, + "loss": 1.8256, + "step": 55365 + }, + { + "epoch": 0.10784432093909074, + "grad_norm": 4.699791431427002, + "learning_rate": 2.9147297978286015e-05, + "loss": 1.8673, + "step": 55380 + }, + { + "epoch": 0.107873531210201, + "grad_norm": 2.186253309249878, + "learning_rate": 2.9146840426978476e-05, + "loss": 1.8513, + "step": 55395 + }, + { + "epoch": 0.10790274148131127, + "grad_norm": 3.770679235458374, + "learning_rate": 2.9146382756538407e-05, + "loss": 1.766, + "step": 55410 + }, + { + "epoch": 0.10793195175242153, + "grad_norm": 5.2049880027771, + "learning_rate": 2.9145924966969655e-05, + "loss": 1.7377, + "step": 55425 + }, + { + "epoch": 0.1079611620235318, + "grad_norm": 5.562504291534424, + "learning_rate": 2.9145467058276084e-05, + "loss": 1.8408, + "step": 55440 + }, + { + "epoch": 0.10799037229464206, + "grad_norm": 3.4524123668670654, + "learning_rate": 2.9145009030461542e-05, + "loss": 1.8034, + "step": 55455 + }, + { + "epoch": 0.10801958256575232, + "grad_norm": 4.022284984588623, + "learning_rate": 2.914455088352989e-05, + "loss": 1.8381, + "step": 55470 + }, + { + "epoch": 0.10804879283686258, + "grad_norm": 2.176159143447876, + "learning_rate": 2.9144092617484988e-05, + "loss": 1.8912, + "step": 55485 + }, + { + "epoch": 0.10807800310797284, + "grad_norm": 3.8053183555603027, + "learning_rate": 2.9143634232330688e-05, + "loss": 1.9716, + "step": 55500 + }, + { + "epoch": 0.1081072133790831, + "grad_norm": 4.059062957763672, + "learning_rate": 2.9143175728070864e-05, + "loss": 1.9624, + "step": 55515 + }, + { + "epoch": 0.10813642365019337, + "grad_norm": 3.2755417823791504, + "learning_rate": 2.914271710470936e-05, + "loss": 1.8949, + "step": 55530 + }, + { + "epoch": 0.10816563392130364, + "grad_norm": 2.0111021995544434, + "learning_rate": 2.914225836225005e-05, + "loss": 1.8102, + "step": 55545 + }, + { + "epoch": 0.1081948441924139, + "grad_norm": 3.8048787117004395, + "learning_rate": 2.9141799500696788e-05, + "loss": 1.813, + "step": 55560 + }, + { + "epoch": 0.10822405446352416, + "grad_norm": 4.269690036773682, + "learning_rate": 2.914134052005345e-05, + "loss": 1.7523, + "step": 55575 + }, + { + "epoch": 0.10825326473463442, + "grad_norm": 1.9855411052703857, + "learning_rate": 2.914088142032389e-05, + "loss": 1.86, + "step": 55590 + }, + { + "epoch": 0.10828247500574469, + "grad_norm": 1.9637675285339355, + "learning_rate": 2.914042220151198e-05, + "loss": 1.8974, + "step": 55605 + }, + { + "epoch": 0.10831168527685495, + "grad_norm": 2.4512064456939697, + "learning_rate": 2.9139962863621588e-05, + "loss": 1.7094, + "step": 55620 + }, + { + "epoch": 0.10834089554796521, + "grad_norm": 4.742093086242676, + "learning_rate": 2.9139503406656577e-05, + "loss": 1.9408, + "step": 55635 + }, + { + "epoch": 0.10837010581907548, + "grad_norm": 1.9494361877441406, + "learning_rate": 2.913904383062082e-05, + "loss": 1.9554, + "step": 55650 + }, + { + "epoch": 0.10839931609018574, + "grad_norm": 3.1741740703582764, + "learning_rate": 2.913858413551819e-05, + "loss": 1.9205, + "step": 55665 + }, + { + "epoch": 0.108428526361296, + "grad_norm": 3.1975393295288086, + "learning_rate": 2.9138124321352547e-05, + "loss": 1.8362, + "step": 55680 + }, + { + "epoch": 0.10845773663240627, + "grad_norm": 2.23559308052063, + "learning_rate": 2.9137664388127775e-05, + "loss": 1.7611, + "step": 55695 + }, + { + "epoch": 0.10848694690351653, + "grad_norm": 1.9877007007598877, + "learning_rate": 2.9137204335847744e-05, + "loss": 1.6875, + "step": 55710 + }, + { + "epoch": 0.10851615717462679, + "grad_norm": 5.272434711456299, + "learning_rate": 2.9136744164516325e-05, + "loss": 1.8568, + "step": 55725 + }, + { + "epoch": 0.10854536744573705, + "grad_norm": 2.5122151374816895, + "learning_rate": 2.91362838741374e-05, + "loss": 1.9436, + "step": 55740 + }, + { + "epoch": 0.10857457771684731, + "grad_norm": 2.095167636871338, + "learning_rate": 2.9135823464714834e-05, + "loss": 1.8388, + "step": 55755 + }, + { + "epoch": 0.10860378798795758, + "grad_norm": 3.2172155380249023, + "learning_rate": 2.913536293625251e-05, + "loss": 1.9698, + "step": 55770 + }, + { + "epoch": 0.10863299825906785, + "grad_norm": 3.2175745964050293, + "learning_rate": 2.913490228875431e-05, + "loss": 1.7451, + "step": 55785 + }, + { + "epoch": 0.1086622085301781, + "grad_norm": 3.002246379852295, + "learning_rate": 2.9134441522224112e-05, + "loss": 1.792, + "step": 55800 + }, + { + "epoch": 0.10869141880128837, + "grad_norm": 3.7021007537841797, + "learning_rate": 2.913398063666579e-05, + "loss": 1.8996, + "step": 55815 + }, + { + "epoch": 0.10872062907239863, + "grad_norm": 2.817061424255371, + "learning_rate": 2.9133519632083233e-05, + "loss": 1.8276, + "step": 55830 + }, + { + "epoch": 0.10874983934350889, + "grad_norm": 2.511824369430542, + "learning_rate": 2.9133058508480315e-05, + "loss": 1.9009, + "step": 55845 + }, + { + "epoch": 0.10877904961461915, + "grad_norm": 2.074856758117676, + "learning_rate": 2.9132597265860926e-05, + "loss": 1.6661, + "step": 55860 + }, + { + "epoch": 0.10880825988572942, + "grad_norm": 3.9622280597686768, + "learning_rate": 2.9132135904228943e-05, + "loss": 1.8877, + "step": 55875 + }, + { + "epoch": 0.10883747015683969, + "grad_norm": 3.386929988861084, + "learning_rate": 2.9131674423588258e-05, + "loss": 1.9582, + "step": 55890 + }, + { + "epoch": 0.10886668042794995, + "grad_norm": 2.608276605606079, + "learning_rate": 2.9131212823942757e-05, + "loss": 1.9018, + "step": 55905 + }, + { + "epoch": 0.10889589069906021, + "grad_norm": 2.528632879257202, + "learning_rate": 2.913075110529633e-05, + "loss": 1.7423, + "step": 55920 + }, + { + "epoch": 0.10892510097017047, + "grad_norm": 2.5413215160369873, + "learning_rate": 2.9130289267652855e-05, + "loss": 2.0343, + "step": 55935 + }, + { + "epoch": 0.10895431124128073, + "grad_norm": 2.150153398513794, + "learning_rate": 2.9129827311016228e-05, + "loss": 1.8196, + "step": 55950 + }, + { + "epoch": 0.10898352151239099, + "grad_norm": 3.6752045154571533, + "learning_rate": 2.9129365235390334e-05, + "loss": 1.975, + "step": 55965 + }, + { + "epoch": 0.10901273178350127, + "grad_norm": 2.141899585723877, + "learning_rate": 2.9128903040779076e-05, + "loss": 1.9028, + "step": 55980 + }, + { + "epoch": 0.10904194205461153, + "grad_norm": 3.3839213848114014, + "learning_rate": 2.9128440727186333e-05, + "loss": 1.992, + "step": 55995 + }, + { + "epoch": 0.10907115232572179, + "grad_norm": 4.140224456787109, + "learning_rate": 2.9127978294616002e-05, + "loss": 2.0497, + "step": 56010 + }, + { + "epoch": 0.10910036259683205, + "grad_norm": 3.807996988296509, + "learning_rate": 2.9127515743071982e-05, + "loss": 1.7567, + "step": 56025 + }, + { + "epoch": 0.10912957286794231, + "grad_norm": 5.618107795715332, + "learning_rate": 2.9127053072558166e-05, + "loss": 1.8929, + "step": 56040 + }, + { + "epoch": 0.10915878313905257, + "grad_norm": 4.217125415802002, + "learning_rate": 2.9126590283078447e-05, + "loss": 1.7255, + "step": 56055 + }, + { + "epoch": 0.10918799341016283, + "grad_norm": 3.6640851497650146, + "learning_rate": 2.9126127374636726e-05, + "loss": 1.8538, + "step": 56070 + }, + { + "epoch": 0.1092172036812731, + "grad_norm": 2.6153030395507812, + "learning_rate": 2.9125664347236897e-05, + "loss": 1.8462, + "step": 56085 + }, + { + "epoch": 0.10924641395238337, + "grad_norm": 4.285603046417236, + "learning_rate": 2.9125201200882865e-05, + "loss": 1.7956, + "step": 56100 + }, + { + "epoch": 0.10927562422349363, + "grad_norm": 2.26975679397583, + "learning_rate": 2.9124737935578523e-05, + "loss": 1.7453, + "step": 56115 + }, + { + "epoch": 0.10930483449460389, + "grad_norm": 3.9275543689727783, + "learning_rate": 2.912427455132778e-05, + "loss": 1.974, + "step": 56130 + }, + { + "epoch": 0.10933404476571415, + "grad_norm": 3.3309249877929688, + "learning_rate": 2.9123811048134532e-05, + "loss": 1.7976, + "step": 56145 + }, + { + "epoch": 0.10936325503682441, + "grad_norm": 2.514448642730713, + "learning_rate": 2.9123347426002683e-05, + "loss": 1.9165, + "step": 56160 + }, + { + "epoch": 0.10939246530793467, + "grad_norm": 2.5141212940216064, + "learning_rate": 2.9122883684936142e-05, + "loss": 1.8314, + "step": 56175 + }, + { + "epoch": 0.10942167557904495, + "grad_norm": 4.926792144775391, + "learning_rate": 2.9122419824938814e-05, + "loss": 1.7608, + "step": 56190 + }, + { + "epoch": 0.10945088585015521, + "grad_norm": 4.101518630981445, + "learning_rate": 2.91219558460146e-05, + "loss": 1.781, + "step": 56205 + }, + { + "epoch": 0.10948009612126547, + "grad_norm": 4.165318012237549, + "learning_rate": 2.9121491748167412e-05, + "loss": 1.708, + "step": 56220 + }, + { + "epoch": 0.10950930639237573, + "grad_norm": 3.009540557861328, + "learning_rate": 2.912102753140115e-05, + "loss": 1.7865, + "step": 56235 + }, + { + "epoch": 0.10953851666348599, + "grad_norm": 2.2899017333984375, + "learning_rate": 2.912056319571973e-05, + "loss": 1.8388, + "step": 56250 + }, + { + "epoch": 0.10956772693459625, + "grad_norm": 3.5658440589904785, + "learning_rate": 2.9120098741127066e-05, + "loss": 1.9774, + "step": 56265 + }, + { + "epoch": 0.10959693720570651, + "grad_norm": 6.968889236450195, + "learning_rate": 2.9119634167627063e-05, + "loss": 1.9675, + "step": 56280 + }, + { + "epoch": 0.10962614747681679, + "grad_norm": 2.2708065509796143, + "learning_rate": 2.9119169475223633e-05, + "loss": 1.7519, + "step": 56295 + }, + { + "epoch": 0.10965535774792705, + "grad_norm": 3.349945068359375, + "learning_rate": 2.911870466392069e-05, + "loss": 1.8687, + "step": 56310 + }, + { + "epoch": 0.10968456801903731, + "grad_norm": 2.982822895050049, + "learning_rate": 2.9118239733722153e-05, + "loss": 1.7451, + "step": 56325 + }, + { + "epoch": 0.10971377829014757, + "grad_norm": 3.3108813762664795, + "learning_rate": 2.911777468463193e-05, + "loss": 1.7586, + "step": 56340 + }, + { + "epoch": 0.10974298856125783, + "grad_norm": 3.7344858646392822, + "learning_rate": 2.9117309516653944e-05, + "loss": 1.7684, + "step": 56355 + }, + { + "epoch": 0.10977219883236809, + "grad_norm": 1.7953383922576904, + "learning_rate": 2.9116844229792106e-05, + "loss": 1.9876, + "step": 56370 + }, + { + "epoch": 0.10980140910347835, + "grad_norm": 4.465378761291504, + "learning_rate": 2.911637882405034e-05, + "loss": 2.0802, + "step": 56385 + }, + { + "epoch": 0.10983061937458863, + "grad_norm": 6.089666366577148, + "learning_rate": 2.911591329943256e-05, + "loss": 1.8308, + "step": 56400 + }, + { + "epoch": 0.10985982964569889, + "grad_norm": 2.3315675258636475, + "learning_rate": 2.911544765594269e-05, + "loss": 1.9543, + "step": 56415 + }, + { + "epoch": 0.10988903991680915, + "grad_norm": 2.5730764865875244, + "learning_rate": 2.911498189358465e-05, + "loss": 1.7965, + "step": 56430 + }, + { + "epoch": 0.10991825018791941, + "grad_norm": 2.5840201377868652, + "learning_rate": 2.9114516012362366e-05, + "loss": 1.9262, + "step": 56445 + }, + { + "epoch": 0.10994746045902967, + "grad_norm": 2.0305511951446533, + "learning_rate": 2.911405001227975e-05, + "loss": 1.8799, + "step": 56460 + }, + { + "epoch": 0.10997667073013993, + "grad_norm": 4.52584981918335, + "learning_rate": 2.911358389334074e-05, + "loss": 1.7814, + "step": 56475 + }, + { + "epoch": 0.1100058810012502, + "grad_norm": 4.243358612060547, + "learning_rate": 2.9113117655549254e-05, + "loss": 1.856, + "step": 56490 + }, + { + "epoch": 0.11003509127236047, + "grad_norm": 1.7922805547714233, + "learning_rate": 2.9112651298909215e-05, + "loss": 1.8259, + "step": 56505 + }, + { + "epoch": 0.11006430154347073, + "grad_norm": 3.4469642639160156, + "learning_rate": 2.911218482342456e-05, + "loss": 1.9275, + "step": 56520 + }, + { + "epoch": 0.11009351181458099, + "grad_norm": 4.809131622314453, + "learning_rate": 2.9111718229099207e-05, + "loss": 1.8597, + "step": 56535 + }, + { + "epoch": 0.11012272208569125, + "grad_norm": 4.120931148529053, + "learning_rate": 2.9111251515937095e-05, + "loss": 1.9064, + "step": 56550 + }, + { + "epoch": 0.11015193235680151, + "grad_norm": 3.3360371589660645, + "learning_rate": 2.9110784683942146e-05, + "loss": 1.848, + "step": 56565 + }, + { + "epoch": 0.11018114262791177, + "grad_norm": 4.624992370605469, + "learning_rate": 2.9110317733118296e-05, + "loss": 1.7387, + "step": 56580 + }, + { + "epoch": 0.11021035289902203, + "grad_norm": 2.1515185832977295, + "learning_rate": 2.9109850663469473e-05, + "loss": 1.7019, + "step": 56595 + }, + { + "epoch": 0.11023956317013231, + "grad_norm": 4.24725341796875, + "learning_rate": 2.9109383474999617e-05, + "loss": 1.7749, + "step": 56610 + }, + { + "epoch": 0.11026877344124257, + "grad_norm": 2.9339256286621094, + "learning_rate": 2.9108916167712658e-05, + "loss": 1.8693, + "step": 56625 + }, + { + "epoch": 0.11029798371235283, + "grad_norm": 2.120043992996216, + "learning_rate": 2.9108448741612526e-05, + "loss": 1.7355, + "step": 56640 + }, + { + "epoch": 0.11032719398346309, + "grad_norm": 3.4366424083709717, + "learning_rate": 2.910798119670317e-05, + "loss": 1.8094, + "step": 56655 + }, + { + "epoch": 0.11035640425457335, + "grad_norm": 6.467207908630371, + "learning_rate": 2.910751353298852e-05, + "loss": 1.9038, + "step": 56670 + }, + { + "epoch": 0.11038561452568361, + "grad_norm": 3.02050518989563, + "learning_rate": 2.910704575047251e-05, + "loss": 1.915, + "step": 56685 + }, + { + "epoch": 0.11041482479679388, + "grad_norm": 4.001872539520264, + "learning_rate": 2.9106577849159082e-05, + "loss": 1.7442, + "step": 56700 + }, + { + "epoch": 0.11044403506790415, + "grad_norm": 4.787405967712402, + "learning_rate": 2.9106109829052183e-05, + "loss": 1.9145, + "step": 56715 + }, + { + "epoch": 0.11047324533901441, + "grad_norm": 2.8339648246765137, + "learning_rate": 2.9105641690155747e-05, + "loss": 1.7538, + "step": 56730 + }, + { + "epoch": 0.11050245561012467, + "grad_norm": 3.380337953567505, + "learning_rate": 2.9105173432473718e-05, + "loss": 1.767, + "step": 56745 + }, + { + "epoch": 0.11053166588123493, + "grad_norm": 4.857455253601074, + "learning_rate": 2.910470505601004e-05, + "loss": 1.9686, + "step": 56760 + }, + { + "epoch": 0.1105608761523452, + "grad_norm": 2.3722784519195557, + "learning_rate": 2.910423656076866e-05, + "loss": 2.1694, + "step": 56775 + }, + { + "epoch": 0.11059008642345546, + "grad_norm": 4.866562366485596, + "learning_rate": 2.9103767946753517e-05, + "loss": 1.7788, + "step": 56790 + }, + { + "epoch": 0.11061929669456572, + "grad_norm": 3.093379497528076, + "learning_rate": 2.9103299213968563e-05, + "loss": 1.7891, + "step": 56805 + }, + { + "epoch": 0.11064850696567598, + "grad_norm": 3.1137278079986572, + "learning_rate": 2.9102830362417738e-05, + "loss": 1.8598, + "step": 56820 + }, + { + "epoch": 0.11067771723678625, + "grad_norm": 2.441159248352051, + "learning_rate": 2.9102361392105e-05, + "loss": 2.0121, + "step": 56835 + }, + { + "epoch": 0.11070692750789651, + "grad_norm": 3.7240729331970215, + "learning_rate": 2.9101892303034292e-05, + "loss": 2.0731, + "step": 56850 + }, + { + "epoch": 0.11073613777900677, + "grad_norm": 4.356746196746826, + "learning_rate": 2.9101423095209565e-05, + "loss": 1.9011, + "step": 56865 + }, + { + "epoch": 0.11076534805011704, + "grad_norm": 2.5039849281311035, + "learning_rate": 2.910095376863477e-05, + "loss": 1.906, + "step": 56880 + }, + { + "epoch": 0.1107945583212273, + "grad_norm": 1.9095027446746826, + "learning_rate": 2.910048432331386e-05, + "loss": 1.8508, + "step": 56895 + }, + { + "epoch": 0.11082376859233756, + "grad_norm": 2.6708927154541016, + "learning_rate": 2.9100014759250787e-05, + "loss": 1.7069, + "step": 56910 + }, + { + "epoch": 0.11085297886344782, + "grad_norm": 3.1949994564056396, + "learning_rate": 2.909954507644951e-05, + "loss": 1.9893, + "step": 56925 + }, + { + "epoch": 0.11088218913455809, + "grad_norm": 5.220842361450195, + "learning_rate": 2.909907527491398e-05, + "loss": 1.9406, + "step": 56940 + }, + { + "epoch": 0.11091139940566835, + "grad_norm": 4.030275344848633, + "learning_rate": 2.9098605354648155e-05, + "loss": 1.9269, + "step": 56955 + }, + { + "epoch": 0.11094060967677861, + "grad_norm": 2.8321971893310547, + "learning_rate": 2.9098135315655987e-05, + "loss": 1.7322, + "step": 56970 + }, + { + "epoch": 0.11096981994788888, + "grad_norm": 2.3430140018463135, + "learning_rate": 2.9097665157941448e-05, + "loss": 1.9648, + "step": 56985 + }, + { + "epoch": 0.11099903021899914, + "grad_norm": 2.418548583984375, + "learning_rate": 2.909719488150848e-05, + "loss": 1.9991, + "step": 57000 + }, + { + "epoch": 0.1110282404901094, + "grad_norm": 2.9540579319000244, + "learning_rate": 2.909672448636105e-05, + "loss": 1.7834, + "step": 57015 + }, + { + "epoch": 0.11105745076121966, + "grad_norm": 3.8784842491149902, + "learning_rate": 2.9096253972503124e-05, + "loss": 1.8357, + "step": 57030 + }, + { + "epoch": 0.11108666103232993, + "grad_norm": 2.2207846641540527, + "learning_rate": 2.9095783339938662e-05, + "loss": 1.8796, + "step": 57045 + }, + { + "epoch": 0.1111158713034402, + "grad_norm": 2.3347365856170654, + "learning_rate": 2.909531258867162e-05, + "loss": 1.9379, + "step": 57060 + }, + { + "epoch": 0.11114508157455046, + "grad_norm": 4.679647445678711, + "learning_rate": 2.9094841718705975e-05, + "loss": 1.6484, + "step": 57075 + }, + { + "epoch": 0.11117429184566072, + "grad_norm": 5.4116291999816895, + "learning_rate": 2.909437073004568e-05, + "loss": 1.9041, + "step": 57090 + }, + { + "epoch": 0.11120350211677098, + "grad_norm": 2.1998581886291504, + "learning_rate": 2.9093899622694713e-05, + "loss": 2.039, + "step": 57105 + }, + { + "epoch": 0.11123271238788124, + "grad_norm": 2.2417984008789062, + "learning_rate": 2.9093428396657033e-05, + "loss": 1.7752, + "step": 57120 + }, + { + "epoch": 0.1112619226589915, + "grad_norm": 3.5987281799316406, + "learning_rate": 2.9092957051936608e-05, + "loss": 1.8104, + "step": 57135 + }, + { + "epoch": 0.11129113293010177, + "grad_norm": 3.567805767059326, + "learning_rate": 2.909248558853741e-05, + "loss": 2.0338, + "step": 57150 + }, + { + "epoch": 0.11132034320121204, + "grad_norm": 3.0445523262023926, + "learning_rate": 2.909201400646341e-05, + "loss": 1.9669, + "step": 57165 + }, + { + "epoch": 0.1113495534723223, + "grad_norm": 4.407696723937988, + "learning_rate": 2.9091542305718578e-05, + "loss": 1.9644, + "step": 57180 + }, + { + "epoch": 0.11137876374343256, + "grad_norm": 1.9042673110961914, + "learning_rate": 2.9091070486306885e-05, + "loss": 2.0319, + "step": 57195 + }, + { + "epoch": 0.11140797401454282, + "grad_norm": 2.6861960887908936, + "learning_rate": 2.9090598548232307e-05, + "loss": 1.8436, + "step": 57210 + }, + { + "epoch": 0.11143718428565308, + "grad_norm": 2.5840277671813965, + "learning_rate": 2.9090126491498815e-05, + "loss": 1.6031, + "step": 57225 + }, + { + "epoch": 0.11146639455676334, + "grad_norm": 2.9862539768218994, + "learning_rate": 2.908965431611039e-05, + "loss": 1.8139, + "step": 57240 + }, + { + "epoch": 0.11149560482787362, + "grad_norm": 3.446305751800537, + "learning_rate": 2.9089182022071002e-05, + "loss": 1.9302, + "step": 57255 + }, + { + "epoch": 0.11152481509898388, + "grad_norm": 3.4366722106933594, + "learning_rate": 2.908870960938463e-05, + "loss": 1.7694, + "step": 57270 + }, + { + "epoch": 0.11155402537009414, + "grad_norm": 3.63612961769104, + "learning_rate": 2.9088237078055257e-05, + "loss": 1.767, + "step": 57285 + }, + { + "epoch": 0.1115832356412044, + "grad_norm": 1.9201642274856567, + "learning_rate": 2.908776442808685e-05, + "loss": 1.9064, + "step": 57300 + }, + { + "epoch": 0.11161244591231466, + "grad_norm": 3.53420352935791, + "learning_rate": 2.9087291659483407e-05, + "loss": 1.8105, + "step": 57315 + }, + { + "epoch": 0.11164165618342492, + "grad_norm": 4.431094169616699, + "learning_rate": 2.9086818772248893e-05, + "loss": 1.8004, + "step": 57330 + }, + { + "epoch": 0.11167086645453518, + "grad_norm": 2.1770708560943604, + "learning_rate": 2.9086345766387296e-05, + "loss": 1.8876, + "step": 57345 + }, + { + "epoch": 0.11170007672564546, + "grad_norm": 3.411011219024658, + "learning_rate": 2.9085872641902603e-05, + "loss": 1.7623, + "step": 57360 + }, + { + "epoch": 0.11172928699675572, + "grad_norm": 3.5376415252685547, + "learning_rate": 2.9085399398798797e-05, + "loss": 1.9412, + "step": 57375 + }, + { + "epoch": 0.11175849726786598, + "grad_norm": 4.079948902130127, + "learning_rate": 2.908492603707986e-05, + "loss": 1.7013, + "step": 57390 + }, + { + "epoch": 0.11178770753897624, + "grad_norm": 3.2958173751831055, + "learning_rate": 2.9084452556749777e-05, + "loss": 1.9054, + "step": 57405 + }, + { + "epoch": 0.1118169178100865, + "grad_norm": 3.8516297340393066, + "learning_rate": 2.9083978957812542e-05, + "loss": 1.993, + "step": 57420 + }, + { + "epoch": 0.11184612808119676, + "grad_norm": 2.327601432800293, + "learning_rate": 2.9083505240272135e-05, + "loss": 1.7818, + "step": 57435 + }, + { + "epoch": 0.11187533835230702, + "grad_norm": 3.739375591278076, + "learning_rate": 2.9083031404132556e-05, + "loss": 1.688, + "step": 57450 + }, + { + "epoch": 0.1119045486234173, + "grad_norm": 3.953880548477173, + "learning_rate": 2.9082557449397783e-05, + "loss": 1.8101, + "step": 57465 + }, + { + "epoch": 0.11193375889452756, + "grad_norm": 4.213339805603027, + "learning_rate": 2.9082083376071814e-05, + "loss": 1.8769, + "step": 57480 + }, + { + "epoch": 0.11196296916563782, + "grad_norm": 2.88124680519104, + "learning_rate": 2.9081609184158645e-05, + "loss": 1.8676, + "step": 57495 + }, + { + "epoch": 0.11199217943674808, + "grad_norm": 4.264623641967773, + "learning_rate": 2.9081134873662262e-05, + "loss": 1.8847, + "step": 57510 + }, + { + "epoch": 0.11202138970785834, + "grad_norm": 1.8398874998092651, + "learning_rate": 2.908066044458666e-05, + "loss": 1.7708, + "step": 57525 + }, + { + "epoch": 0.1120505999789686, + "grad_norm": 5.864681720733643, + "learning_rate": 2.9080185896935838e-05, + "loss": 1.7751, + "step": 57540 + }, + { + "epoch": 0.11207981025007886, + "grad_norm": 4.092381954193115, + "learning_rate": 2.907971123071379e-05, + "loss": 1.9256, + "step": 57555 + }, + { + "epoch": 0.11210902052118914, + "grad_norm": 4.937702655792236, + "learning_rate": 2.9079236445924513e-05, + "loss": 1.8853, + "step": 57570 + }, + { + "epoch": 0.1121382307922994, + "grad_norm": 2.045652389526367, + "learning_rate": 2.9078761542572006e-05, + "loss": 1.9029, + "step": 57585 + }, + { + "epoch": 0.11216744106340966, + "grad_norm": 2.207526922225952, + "learning_rate": 2.9078286520660267e-05, + "loss": 1.8693, + "step": 57600 + }, + { + "epoch": 0.11219665133451992, + "grad_norm": 3.1475579738616943, + "learning_rate": 2.9077811380193294e-05, + "loss": 1.8612, + "step": 57615 + }, + { + "epoch": 0.11222586160563018, + "grad_norm": 2.9990086555480957, + "learning_rate": 2.9077336121175096e-05, + "loss": 1.8435, + "step": 57630 + }, + { + "epoch": 0.11225507187674044, + "grad_norm": 2.4511334896087646, + "learning_rate": 2.907686074360967e-05, + "loss": 2.0716, + "step": 57645 + }, + { + "epoch": 0.1122842821478507, + "grad_norm": 4.475519180297852, + "learning_rate": 2.9076385247501022e-05, + "loss": 1.8026, + "step": 57660 + }, + { + "epoch": 0.11231349241896098, + "grad_norm": 2.7447431087493896, + "learning_rate": 2.907590963285315e-05, + "loss": 1.9004, + "step": 57675 + }, + { + "epoch": 0.11234270269007124, + "grad_norm": 1.6893303394317627, + "learning_rate": 2.9075433899670065e-05, + "loss": 1.8963, + "step": 57690 + }, + { + "epoch": 0.1123719129611815, + "grad_norm": 4.325908184051514, + "learning_rate": 2.907495804795577e-05, + "loss": 1.8422, + "step": 57705 + }, + { + "epoch": 0.11240112323229176, + "grad_norm": 2.064561605453491, + "learning_rate": 2.907448207771428e-05, + "loss": 1.936, + "step": 57720 + }, + { + "epoch": 0.11243033350340202, + "grad_norm": 3.901196241378784, + "learning_rate": 2.9074005988949587e-05, + "loss": 1.8779, + "step": 57735 + }, + { + "epoch": 0.11245954377451228, + "grad_norm": 2.794074535369873, + "learning_rate": 2.907352978166572e-05, + "loss": 2.0255, + "step": 57750 + }, + { + "epoch": 0.11248875404562254, + "grad_norm": 3.014103412628174, + "learning_rate": 2.9073053455866674e-05, + "loss": 1.8519, + "step": 57765 + }, + { + "epoch": 0.11251796431673282, + "grad_norm": 3.315856695175171, + "learning_rate": 2.9072577011556464e-05, + "loss": 2.0035, + "step": 57780 + }, + { + "epoch": 0.11254717458784308, + "grad_norm": 4.873906135559082, + "learning_rate": 2.9072100448739105e-05, + "loss": 1.9487, + "step": 57795 + }, + { + "epoch": 0.11257638485895334, + "grad_norm": 2.6035919189453125, + "learning_rate": 2.907162376741861e-05, + "loss": 2.0242, + "step": 57810 + }, + { + "epoch": 0.1126055951300636, + "grad_norm": 4.894930362701416, + "learning_rate": 2.9071146967598993e-05, + "loss": 1.8916, + "step": 57825 + }, + { + "epoch": 0.11263480540117386, + "grad_norm": 1.891161561012268, + "learning_rate": 2.9070670049284267e-05, + "loss": 1.7881, + "step": 57840 + }, + { + "epoch": 0.11266401567228412, + "grad_norm": 4.074905872344971, + "learning_rate": 2.907019301247845e-05, + "loss": 2.0821, + "step": 57855 + }, + { + "epoch": 0.11269322594339438, + "grad_norm": 2.700012683868408, + "learning_rate": 2.906971585718556e-05, + "loss": 1.9861, + "step": 57870 + }, + { + "epoch": 0.11272243621450465, + "grad_norm": 2.463411569595337, + "learning_rate": 2.9069238583409613e-05, + "loss": 1.8816, + "step": 57885 + }, + { + "epoch": 0.11275164648561492, + "grad_norm": 2.3251094818115234, + "learning_rate": 2.9068761191154623e-05, + "loss": 1.922, + "step": 57900 + }, + { + "epoch": 0.11278085675672518, + "grad_norm": 4.436325550079346, + "learning_rate": 2.9068283680424627e-05, + "loss": 1.8609, + "step": 57915 + }, + { + "epoch": 0.11281006702783544, + "grad_norm": 3.2900233268737793, + "learning_rate": 2.9067806051223633e-05, + "loss": 1.8091, + "step": 57930 + }, + { + "epoch": 0.1128392772989457, + "grad_norm": 2.6285436153411865, + "learning_rate": 2.9067328303555658e-05, + "loss": 1.9672, + "step": 57945 + }, + { + "epoch": 0.11286848757005596, + "grad_norm": 2.758803129196167, + "learning_rate": 2.906685043742474e-05, + "loss": 1.7282, + "step": 57960 + }, + { + "epoch": 0.11289769784116623, + "grad_norm": 2.464400291442871, + "learning_rate": 2.9066372452834893e-05, + "loss": 1.8452, + "step": 57975 + }, + { + "epoch": 0.11292690811227649, + "grad_norm": 3.232440233230591, + "learning_rate": 2.9065894349790146e-05, + "loss": 1.8058, + "step": 57990 + }, + { + "epoch": 0.11295611838338676, + "grad_norm": 3.6143486499786377, + "learning_rate": 2.9065416128294526e-05, + "loss": 1.8982, + "step": 58005 + }, + { + "epoch": 0.11298532865449702, + "grad_norm": 2.3097357749938965, + "learning_rate": 2.9064937788352058e-05, + "loss": 1.9881, + "step": 58020 + }, + { + "epoch": 0.11301453892560728, + "grad_norm": 2.809354782104492, + "learning_rate": 2.906445932996677e-05, + "loss": 1.6821, + "step": 58035 + }, + { + "epoch": 0.11304374919671754, + "grad_norm": 3.846630573272705, + "learning_rate": 2.906398075314269e-05, + "loss": 1.9812, + "step": 58050 + }, + { + "epoch": 0.1130729594678278, + "grad_norm": 6.049818992614746, + "learning_rate": 2.906350205788385e-05, + "loss": 1.9593, + "step": 58065 + }, + { + "epoch": 0.11310216973893807, + "grad_norm": 2.0046703815460205, + "learning_rate": 2.9063023244194285e-05, + "loss": 1.8798, + "step": 58080 + }, + { + "epoch": 0.11313138001004833, + "grad_norm": 5.410991191864014, + "learning_rate": 2.9062544312078023e-05, + "loss": 1.814, + "step": 58095 + }, + { + "epoch": 0.1131605902811586, + "grad_norm": 2.880796432495117, + "learning_rate": 2.9062065261539097e-05, + "loss": 1.782, + "step": 58110 + }, + { + "epoch": 0.11318980055226886, + "grad_norm": 2.447312831878662, + "learning_rate": 2.906158609258154e-05, + "loss": 1.9572, + "step": 58125 + }, + { + "epoch": 0.11321901082337912, + "grad_norm": 3.7232038974761963, + "learning_rate": 2.9061106805209386e-05, + "loss": 1.7558, + "step": 58140 + }, + { + "epoch": 0.11324822109448938, + "grad_norm": 4.419861793518066, + "learning_rate": 2.906062739942668e-05, + "loss": 1.8198, + "step": 58155 + }, + { + "epoch": 0.11327743136559965, + "grad_norm": 4.67490816116333, + "learning_rate": 2.9060147875237452e-05, + "loss": 1.7685, + "step": 58170 + }, + { + "epoch": 0.1133066416367099, + "grad_norm": 3.348634958267212, + "learning_rate": 2.905966823264574e-05, + "loss": 2.0068, + "step": 58185 + }, + { + "epoch": 0.11333585190782017, + "grad_norm": 2.4067349433898926, + "learning_rate": 2.9059188471655584e-05, + "loss": 1.8203, + "step": 58200 + }, + { + "epoch": 0.11336506217893044, + "grad_norm": 4.917893886566162, + "learning_rate": 2.9058708592271023e-05, + "loss": 2.045, + "step": 58215 + }, + { + "epoch": 0.1133942724500407, + "grad_norm": 3.0453333854675293, + "learning_rate": 2.9058228594496104e-05, + "loss": 1.7709, + "step": 58230 + }, + { + "epoch": 0.11342348272115096, + "grad_norm": 3.0751965045928955, + "learning_rate": 2.9057748478334863e-05, + "loss": 1.9131, + "step": 58245 + }, + { + "epoch": 0.11345269299226123, + "grad_norm": 3.404200792312622, + "learning_rate": 2.9057268243791345e-05, + "loss": 1.7912, + "step": 58260 + }, + { + "epoch": 0.11348190326337149, + "grad_norm": 3.3802454471588135, + "learning_rate": 2.9056787890869593e-05, + "loss": 1.9546, + "step": 58275 + }, + { + "epoch": 0.11351111353448175, + "grad_norm": 3.0359671115875244, + "learning_rate": 2.9056307419573648e-05, + "loss": 1.8399, + "step": 58290 + }, + { + "epoch": 0.11354032380559201, + "grad_norm": 2.505068302154541, + "learning_rate": 2.905582682990757e-05, + "loss": 1.6998, + "step": 58305 + }, + { + "epoch": 0.11356953407670228, + "grad_norm": 3.7588610649108887, + "learning_rate": 2.9055346121875392e-05, + "loss": 1.8245, + "step": 58320 + }, + { + "epoch": 0.11359874434781254, + "grad_norm": 4.781723976135254, + "learning_rate": 2.9054865295481172e-05, + "loss": 1.7244, + "step": 58335 + }, + { + "epoch": 0.1136279546189228, + "grad_norm": 3.6425397396087646, + "learning_rate": 2.9054384350728947e-05, + "loss": 1.8284, + "step": 58350 + }, + { + "epoch": 0.11365716489003307, + "grad_norm": 4.163845062255859, + "learning_rate": 2.9053903287622785e-05, + "loss": 1.8257, + "step": 58365 + }, + { + "epoch": 0.11368637516114333, + "grad_norm": 3.410801410675049, + "learning_rate": 2.905342210616672e-05, + "loss": 1.8442, + "step": 58380 + }, + { + "epoch": 0.11371558543225359, + "grad_norm": 4.645106792449951, + "learning_rate": 2.9052940806364812e-05, + "loss": 1.8625, + "step": 58395 + }, + { + "epoch": 0.11374479570336385, + "grad_norm": 2.529066562652588, + "learning_rate": 2.9052459388221117e-05, + "loss": 1.8134, + "step": 58410 + }, + { + "epoch": 0.11377400597447412, + "grad_norm": 2.393514394760132, + "learning_rate": 2.905197785173968e-05, + "loss": 2.0513, + "step": 58425 + }, + { + "epoch": 0.11380321624558438, + "grad_norm": 3.6366848945617676, + "learning_rate": 2.905149619692456e-05, + "loss": 1.8285, + "step": 58440 + }, + { + "epoch": 0.11383242651669465, + "grad_norm": 2.627612590789795, + "learning_rate": 2.9051014423779818e-05, + "loss": 1.874, + "step": 58455 + }, + { + "epoch": 0.1138616367878049, + "grad_norm": 4.193912982940674, + "learning_rate": 2.9050532532309507e-05, + "loss": 1.81, + "step": 58470 + }, + { + "epoch": 0.11389084705891517, + "grad_norm": 4.442510604858398, + "learning_rate": 2.9050050522517686e-05, + "loss": 1.7839, + "step": 58485 + }, + { + "epoch": 0.11392005733002543, + "grad_norm": 2.27286696434021, + "learning_rate": 2.9049568394408413e-05, + "loss": 1.8812, + "step": 58500 + }, + { + "epoch": 0.11394926760113569, + "grad_norm": 3.6388399600982666, + "learning_rate": 2.9049086147985744e-05, + "loss": 1.7723, + "step": 58515 + }, + { + "epoch": 0.11397847787224596, + "grad_norm": 2.9918782711029053, + "learning_rate": 2.904860378325375e-05, + "loss": 1.669, + "step": 58530 + }, + { + "epoch": 0.11400768814335623, + "grad_norm": 2.9191107749938965, + "learning_rate": 2.9048121300216488e-05, + "loss": 1.8694, + "step": 58545 + }, + { + "epoch": 0.11403689841446649, + "grad_norm": 3.3483500480651855, + "learning_rate": 2.9047638698878018e-05, + "loss": 1.8167, + "step": 58560 + }, + { + "epoch": 0.11406610868557675, + "grad_norm": 3.788705587387085, + "learning_rate": 2.904715597924241e-05, + "loss": 1.8849, + "step": 58575 + }, + { + "epoch": 0.11409531895668701, + "grad_norm": 2.420478582382202, + "learning_rate": 2.9046673141313722e-05, + "loss": 1.7667, + "step": 58590 + }, + { + "epoch": 0.11412452922779727, + "grad_norm": 2.7111704349517822, + "learning_rate": 2.9046190185096024e-05, + "loss": 1.9454, + "step": 58605 + }, + { + "epoch": 0.11415373949890753, + "grad_norm": 3.8691928386688232, + "learning_rate": 2.9045707110593383e-05, + "loss": 1.9337, + "step": 58620 + }, + { + "epoch": 0.1141829497700178, + "grad_norm": 4.566144943237305, + "learning_rate": 2.904522391780987e-05, + "loss": 1.9072, + "step": 58635 + }, + { + "epoch": 0.11421216004112807, + "grad_norm": 1.9235687255859375, + "learning_rate": 2.9044740606749547e-05, + "loss": 1.8019, + "step": 58650 + }, + { + "epoch": 0.11424137031223833, + "grad_norm": 3.899355173110962, + "learning_rate": 2.9044257177416494e-05, + "loss": 1.8139, + "step": 58665 + }, + { + "epoch": 0.11427058058334859, + "grad_norm": 2.701396942138672, + "learning_rate": 2.904377362981477e-05, + "loss": 2.0869, + "step": 58680 + }, + { + "epoch": 0.11429979085445885, + "grad_norm": 2.4429750442504883, + "learning_rate": 2.9043289963948458e-05, + "loss": 1.7522, + "step": 58695 + }, + { + "epoch": 0.11432900112556911, + "grad_norm": 4.272144794464111, + "learning_rate": 2.9042806179821624e-05, + "loss": 1.868, + "step": 58710 + }, + { + "epoch": 0.11435821139667937, + "grad_norm": 3.5274155139923096, + "learning_rate": 2.9042322277438342e-05, + "loss": 1.7718, + "step": 58725 + }, + { + "epoch": 0.11438742166778965, + "grad_norm": 2.799060106277466, + "learning_rate": 2.904183825680269e-05, + "loss": 2.102, + "step": 58740 + }, + { + "epoch": 0.11441663193889991, + "grad_norm": 1.9795939922332764, + "learning_rate": 2.9041354117918743e-05, + "loss": 1.9716, + "step": 58755 + }, + { + "epoch": 0.11444584221001017, + "grad_norm": 3.1530728340148926, + "learning_rate": 2.9040869860790582e-05, + "loss": 1.8275, + "step": 58770 + }, + { + "epoch": 0.11447505248112043, + "grad_norm": 2.149738073348999, + "learning_rate": 2.9040385485422276e-05, + "loss": 1.7713, + "step": 58785 + }, + { + "epoch": 0.11450426275223069, + "grad_norm": 2.867846965789795, + "learning_rate": 2.9039900991817915e-05, + "loss": 1.7291, + "step": 58800 + }, + { + "epoch": 0.11453347302334095, + "grad_norm": 3.290409564971924, + "learning_rate": 2.9039416379981568e-05, + "loss": 1.8513, + "step": 58815 + }, + { + "epoch": 0.11456268329445121, + "grad_norm": 4.67829704284668, + "learning_rate": 2.903893164991732e-05, + "loss": 1.8685, + "step": 58830 + }, + { + "epoch": 0.11459189356556149, + "grad_norm": 2.1198270320892334, + "learning_rate": 2.903844680162926e-05, + "loss": 2.0454, + "step": 58845 + }, + { + "epoch": 0.11462110383667175, + "grad_norm": 3.5437893867492676, + "learning_rate": 2.903796183512146e-05, + "loss": 1.8919, + "step": 58860 + }, + { + "epoch": 0.11465031410778201, + "grad_norm": 2.6683316230773926, + "learning_rate": 2.903747675039801e-05, + "loss": 1.9095, + "step": 58875 + }, + { + "epoch": 0.11467952437889227, + "grad_norm": 3.6446452140808105, + "learning_rate": 2.9036991547462997e-05, + "loss": 2.0593, + "step": 58890 + }, + { + "epoch": 0.11470873465000253, + "grad_norm": 2.6878817081451416, + "learning_rate": 2.90365062263205e-05, + "loss": 1.7111, + "step": 58905 + }, + { + "epoch": 0.11473794492111279, + "grad_norm": 2.3888416290283203, + "learning_rate": 2.903602078697461e-05, + "loss": 1.9849, + "step": 58920 + }, + { + "epoch": 0.11476715519222305, + "grad_norm": 3.8664708137512207, + "learning_rate": 2.9035535229429417e-05, + "loss": 1.9623, + "step": 58935 + }, + { + "epoch": 0.11479636546333331, + "grad_norm": 2.987964153289795, + "learning_rate": 2.9035049553689005e-05, + "loss": 1.8802, + "step": 58950 + }, + { + "epoch": 0.11482557573444359, + "grad_norm": 2.203887462615967, + "learning_rate": 2.903456375975747e-05, + "loss": 1.803, + "step": 58965 + }, + { + "epoch": 0.11485478600555385, + "grad_norm": 2.8072426319122314, + "learning_rate": 2.9034077847638902e-05, + "loss": 1.8227, + "step": 58980 + }, + { + "epoch": 0.11488399627666411, + "grad_norm": 3.3437204360961914, + "learning_rate": 2.9033591817337383e-05, + "loss": 1.8565, + "step": 58995 + }, + { + "epoch": 0.11491320654777437, + "grad_norm": 2.2321078777313232, + "learning_rate": 2.9033105668857017e-05, + "loss": 1.6556, + "step": 59010 + }, + { + "epoch": 0.11494241681888463, + "grad_norm": 2.2427878379821777, + "learning_rate": 2.9032619402201898e-05, + "loss": 1.8785, + "step": 59025 + }, + { + "epoch": 0.1149716270899949, + "grad_norm": 2.603421688079834, + "learning_rate": 2.9032133017376115e-05, + "loss": 1.6905, + "step": 59040 + }, + { + "epoch": 0.11500083736110515, + "grad_norm": 3.813875436782837, + "learning_rate": 2.9031646514383767e-05, + "loss": 1.9301, + "step": 59055 + }, + { + "epoch": 0.11503004763221543, + "grad_norm": 2.933596134185791, + "learning_rate": 2.903115989322895e-05, + "loss": 1.8446, + "step": 59070 + }, + { + "epoch": 0.11505925790332569, + "grad_norm": 2.255638599395752, + "learning_rate": 2.903067315391576e-05, + "loss": 1.8869, + "step": 59085 + }, + { + "epoch": 0.11508846817443595, + "grad_norm": 4.4029741287231445, + "learning_rate": 2.9030186296448298e-05, + "loss": 1.8302, + "step": 59100 + }, + { + "epoch": 0.11511767844554621, + "grad_norm": 3.5828723907470703, + "learning_rate": 2.9029699320830664e-05, + "loss": 2.0204, + "step": 59115 + }, + { + "epoch": 0.11514688871665647, + "grad_norm": 2.589620590209961, + "learning_rate": 2.902921222706696e-05, + "loss": 1.9468, + "step": 59130 + }, + { + "epoch": 0.11517609898776673, + "grad_norm": 3.320376396179199, + "learning_rate": 2.9028725015161287e-05, + "loss": 1.8529, + "step": 59145 + }, + { + "epoch": 0.115205309258877, + "grad_norm": 5.217315196990967, + "learning_rate": 2.902823768511775e-05, + "loss": 1.6548, + "step": 59160 + }, + { + "epoch": 0.11523451952998727, + "grad_norm": 3.831280469894409, + "learning_rate": 2.902775023694045e-05, + "loss": 1.8414, + "step": 59175 + }, + { + "epoch": 0.11526372980109753, + "grad_norm": 2.6600632667541504, + "learning_rate": 2.902726267063349e-05, + "loss": 1.8874, + "step": 59190 + }, + { + "epoch": 0.11529294007220779, + "grad_norm": 4.642845153808594, + "learning_rate": 2.902677498620098e-05, + "loss": 1.5058, + "step": 59205 + }, + { + "epoch": 0.11532215034331805, + "grad_norm": 2.195073127746582, + "learning_rate": 2.902628718364702e-05, + "loss": 1.9702, + "step": 59220 + }, + { + "epoch": 0.11535136061442831, + "grad_norm": 3.779008150100708, + "learning_rate": 2.9025799262975724e-05, + "loss": 1.7452, + "step": 59235 + }, + { + "epoch": 0.11538057088553857, + "grad_norm": 2.2840652465820312, + "learning_rate": 2.9025311224191208e-05, + "loss": 1.7755, + "step": 59250 + }, + { + "epoch": 0.11540978115664884, + "grad_norm": 2.814328193664551, + "learning_rate": 2.9024823067297567e-05, + "loss": 1.7206, + "step": 59265 + }, + { + "epoch": 0.11543899142775911, + "grad_norm": 4.861764907836914, + "learning_rate": 2.9024334792298917e-05, + "loss": 1.7443, + "step": 59280 + }, + { + "epoch": 0.11546820169886937, + "grad_norm": 4.674343109130859, + "learning_rate": 2.902384639919937e-05, + "loss": 1.9097, + "step": 59295 + }, + { + "epoch": 0.11549741196997963, + "grad_norm": 5.118795394897461, + "learning_rate": 2.902335788800305e-05, + "loss": 1.6996, + "step": 59310 + }, + { + "epoch": 0.1155266222410899, + "grad_norm": 2.835145950317383, + "learning_rate": 2.9022869258714052e-05, + "loss": 1.9264, + "step": 59325 + }, + { + "epoch": 0.11555583251220015, + "grad_norm": 2.7551937103271484, + "learning_rate": 2.90223805113365e-05, + "loss": 1.7045, + "step": 59340 + }, + { + "epoch": 0.11558504278331042, + "grad_norm": 2.7504236698150635, + "learning_rate": 2.9021891645874514e-05, + "loss": 1.9738, + "step": 59355 + }, + { + "epoch": 0.11561425305442068, + "grad_norm": 2.239056348800659, + "learning_rate": 2.9021402662332207e-05, + "loss": 2.1918, + "step": 59370 + }, + { + "epoch": 0.11564346332553095, + "grad_norm": 2.8731796741485596, + "learning_rate": 2.902091356071369e-05, + "loss": 1.8339, + "step": 59385 + }, + { + "epoch": 0.11567267359664121, + "grad_norm": 3.2189009189605713, + "learning_rate": 2.9020424341023088e-05, + "loss": 1.7994, + "step": 59400 + }, + { + "epoch": 0.11570188386775147, + "grad_norm": 3.429677724838257, + "learning_rate": 2.9019935003264528e-05, + "loss": 1.8869, + "step": 59415 + }, + { + "epoch": 0.11573109413886173, + "grad_norm": 3.4154837131500244, + "learning_rate": 2.9019445547442115e-05, + "loss": 1.823, + "step": 59430 + }, + { + "epoch": 0.115760304409972, + "grad_norm": 3.540036678314209, + "learning_rate": 2.901895597355999e-05, + "loss": 1.9045, + "step": 59445 + }, + { + "epoch": 0.11578951468108226, + "grad_norm": 1.9097825288772583, + "learning_rate": 2.9018466281622257e-05, + "loss": 1.8243, + "step": 59460 + }, + { + "epoch": 0.11581872495219252, + "grad_norm": 3.786574125289917, + "learning_rate": 2.9017976471633048e-05, + "loss": 1.9691, + "step": 59475 + }, + { + "epoch": 0.11584793522330279, + "grad_norm": 3.3069751262664795, + "learning_rate": 2.901748654359649e-05, + "loss": 1.7842, + "step": 59490 + }, + { + "epoch": 0.11587714549441305, + "grad_norm": 3.880121946334839, + "learning_rate": 2.9016996497516707e-05, + "loss": 1.7651, + "step": 59505 + }, + { + "epoch": 0.11590635576552331, + "grad_norm": 4.4247636795043945, + "learning_rate": 2.901650633339782e-05, + "loss": 1.9395, + "step": 59520 + }, + { + "epoch": 0.11593556603663357, + "grad_norm": 3.0642709732055664, + "learning_rate": 2.9016016051243967e-05, + "loss": 1.8559, + "step": 59535 + }, + { + "epoch": 0.11596477630774384, + "grad_norm": 2.271341562271118, + "learning_rate": 2.901552565105927e-05, + "loss": 1.9048, + "step": 59550 + }, + { + "epoch": 0.1159939865788541, + "grad_norm": 4.734255313873291, + "learning_rate": 2.901503513284786e-05, + "loss": 1.7353, + "step": 59565 + }, + { + "epoch": 0.11602319684996436, + "grad_norm": 1.9119137525558472, + "learning_rate": 2.9014544496613867e-05, + "loss": 1.6796, + "step": 59580 + }, + { + "epoch": 0.11605240712107463, + "grad_norm": 2.6759443283081055, + "learning_rate": 2.9014053742361425e-05, + "loss": 1.9295, + "step": 59595 + }, + { + "epoch": 0.1160816173921849, + "grad_norm": 3.736943483352661, + "learning_rate": 2.9013562870094665e-05, + "loss": 1.878, + "step": 59610 + }, + { + "epoch": 0.11611082766329515, + "grad_norm": 4.310480117797852, + "learning_rate": 2.9013071879817722e-05, + "loss": 1.8014, + "step": 59625 + }, + { + "epoch": 0.11614003793440542, + "grad_norm": 2.7881555557250977, + "learning_rate": 2.901258077153473e-05, + "loss": 1.9338, + "step": 59640 + }, + { + "epoch": 0.11616924820551568, + "grad_norm": 3.397674798965454, + "learning_rate": 2.9012089545249826e-05, + "loss": 1.9629, + "step": 59655 + }, + { + "epoch": 0.11619845847662594, + "grad_norm": 2.116713762283325, + "learning_rate": 2.9011598200967142e-05, + "loss": 1.8669, + "step": 59670 + }, + { + "epoch": 0.1162276687477362, + "grad_norm": 2.7831530570983887, + "learning_rate": 2.901110673869082e-05, + "loss": 2.0152, + "step": 59685 + }, + { + "epoch": 0.11625687901884647, + "grad_norm": 1.9171403646469116, + "learning_rate": 2.9010615158424994e-05, + "loss": 2.022, + "step": 59700 + }, + { + "epoch": 0.11628608928995673, + "grad_norm": 4.327155113220215, + "learning_rate": 2.9010123460173815e-05, + "loss": 1.8563, + "step": 59715 + }, + { + "epoch": 0.116315299561067, + "grad_norm": 2.6845343112945557, + "learning_rate": 2.9009631643941408e-05, + "loss": 2.0901, + "step": 59730 + }, + { + "epoch": 0.11634450983217726, + "grad_norm": 2.704784870147705, + "learning_rate": 2.9009139709731925e-05, + "loss": 1.6987, + "step": 59745 + }, + { + "epoch": 0.11637372010328752, + "grad_norm": 2.8482110500335693, + "learning_rate": 2.900864765754951e-05, + "loss": 2.0693, + "step": 59760 + }, + { + "epoch": 0.11640293037439778, + "grad_norm": 3.2404778003692627, + "learning_rate": 2.90081554873983e-05, + "loss": 1.9537, + "step": 59775 + }, + { + "epoch": 0.11643214064550804, + "grad_norm": 2.391136884689331, + "learning_rate": 2.900766319928244e-05, + "loss": 1.8881, + "step": 59790 + }, + { + "epoch": 0.11646135091661831, + "grad_norm": 3.924664258956909, + "learning_rate": 2.9007170793206084e-05, + "loss": 1.839, + "step": 59805 + }, + { + "epoch": 0.11649056118772858, + "grad_norm": 3.433196783065796, + "learning_rate": 2.900667826917337e-05, + "loss": 1.9914, + "step": 59820 + }, + { + "epoch": 0.11651977145883884, + "grad_norm": 3.6754157543182373, + "learning_rate": 2.900618562718845e-05, + "loss": 1.9486, + "step": 59835 + }, + { + "epoch": 0.1165489817299491, + "grad_norm": 3.043656826019287, + "learning_rate": 2.9005692867255466e-05, + "loss": 1.9882, + "step": 59850 + }, + { + "epoch": 0.11657819200105936, + "grad_norm": 2.173506498336792, + "learning_rate": 2.9005199989378574e-05, + "loss": 1.8911, + "step": 59865 + }, + { + "epoch": 0.11660740227216962, + "grad_norm": 3.5416691303253174, + "learning_rate": 2.9004706993561926e-05, + "loss": 1.7663, + "step": 59880 + }, + { + "epoch": 0.11663661254327988, + "grad_norm": 5.051126480102539, + "learning_rate": 2.900421387980967e-05, + "loss": 1.9546, + "step": 59895 + }, + { + "epoch": 0.11666582281439015, + "grad_norm": 2.687592029571533, + "learning_rate": 2.9003720648125958e-05, + "loss": 1.8362, + "step": 59910 + }, + { + "epoch": 0.11669503308550042, + "grad_norm": 3.3521056175231934, + "learning_rate": 2.9003227298514947e-05, + "loss": 1.9259, + "step": 59925 + }, + { + "epoch": 0.11672424335661068, + "grad_norm": 3.802572250366211, + "learning_rate": 2.9002733830980785e-05, + "loss": 1.7983, + "step": 59940 + }, + { + "epoch": 0.11675345362772094, + "grad_norm": 4.504456996917725, + "learning_rate": 2.9002240245527636e-05, + "loss": 1.9183, + "step": 59955 + }, + { + "epoch": 0.1167826638988312, + "grad_norm": 2.925114154815674, + "learning_rate": 2.9001746542159653e-05, + "loss": 1.811, + "step": 59970 + }, + { + "epoch": 0.11681187416994146, + "grad_norm": 2.3652501106262207, + "learning_rate": 2.9001252720880988e-05, + "loss": 1.8902, + "step": 59985 + }, + { + "epoch": 0.11684108444105172, + "grad_norm": 4.274910926818848, + "learning_rate": 2.900075878169581e-05, + "loss": 1.9512, + "step": 60000 + }, + { + "epoch": 0.11687029471216198, + "grad_norm": 2.3416433334350586, + "learning_rate": 2.9000264724608272e-05, + "loss": 2.0828, + "step": 60015 + }, + { + "epoch": 0.11689950498327226, + "grad_norm": 4.133050918579102, + "learning_rate": 2.8999770549622533e-05, + "loss": 1.6662, + "step": 60030 + }, + { + "epoch": 0.11692871525438252, + "grad_norm": 3.332608461380005, + "learning_rate": 2.899927625674276e-05, + "loss": 1.9566, + "step": 60045 + }, + { + "epoch": 0.11695792552549278, + "grad_norm": 2.4462785720825195, + "learning_rate": 2.899878184597311e-05, + "loss": 1.8761, + "step": 60060 + }, + { + "epoch": 0.11698713579660304, + "grad_norm": 3.7322981357574463, + "learning_rate": 2.8998287317317754e-05, + "loss": 1.882, + "step": 60075 + }, + { + "epoch": 0.1170163460677133, + "grad_norm": 3.123087167739868, + "learning_rate": 2.8997792670780848e-05, + "loss": 2.1865, + "step": 60090 + }, + { + "epoch": 0.11704555633882356, + "grad_norm": 2.441005229949951, + "learning_rate": 2.899729790636656e-05, + "loss": 1.8138, + "step": 60105 + }, + { + "epoch": 0.11707476660993382, + "grad_norm": 4.429668426513672, + "learning_rate": 2.8996803024079064e-05, + "loss": 1.8553, + "step": 60120 + }, + { + "epoch": 0.1171039768810441, + "grad_norm": 4.3770222663879395, + "learning_rate": 2.8996308023922512e-05, + "loss": 1.6685, + "step": 60135 + }, + { + "epoch": 0.11713318715215436, + "grad_norm": 2.4245259761810303, + "learning_rate": 2.8995812905901087e-05, + "loss": 1.7252, + "step": 60150 + }, + { + "epoch": 0.11716239742326462, + "grad_norm": 4.569708347320557, + "learning_rate": 2.8995317670018955e-05, + "loss": 1.9122, + "step": 60165 + }, + { + "epoch": 0.11719160769437488, + "grad_norm": 2.6065075397491455, + "learning_rate": 2.8994822316280282e-05, + "loss": 2.0053, + "step": 60180 + }, + { + "epoch": 0.11722081796548514, + "grad_norm": 4.467757225036621, + "learning_rate": 2.8994326844689243e-05, + "loss": 1.8504, + "step": 60195 + }, + { + "epoch": 0.1172500282365954, + "grad_norm": 3.050584316253662, + "learning_rate": 2.8993831255250006e-05, + "loss": 1.8604, + "step": 60210 + }, + { + "epoch": 0.11727923850770566, + "grad_norm": 2.53847336769104, + "learning_rate": 2.899333554796675e-05, + "loss": 2.0399, + "step": 60225 + }, + { + "epoch": 0.11730844877881594, + "grad_norm": 2.6371254920959473, + "learning_rate": 2.8992839722843648e-05, + "loss": 1.7966, + "step": 60240 + }, + { + "epoch": 0.1173376590499262, + "grad_norm": 2.7178635597229004, + "learning_rate": 2.8992343779884876e-05, + "loss": 1.8618, + "step": 60255 + }, + { + "epoch": 0.11736686932103646, + "grad_norm": 3.479475498199463, + "learning_rate": 2.8991847719094605e-05, + "loss": 1.9183, + "step": 60270 + }, + { + "epoch": 0.11739607959214672, + "grad_norm": 2.2454817295074463, + "learning_rate": 2.899135154047702e-05, + "loss": 1.7282, + "step": 60285 + }, + { + "epoch": 0.11742528986325698, + "grad_norm": 2.7979140281677246, + "learning_rate": 2.8990855244036295e-05, + "loss": 1.9539, + "step": 60300 + }, + { + "epoch": 0.11745450013436724, + "grad_norm": 2.986590623855591, + "learning_rate": 2.899035882977661e-05, + "loss": 1.9453, + "step": 60315 + }, + { + "epoch": 0.1174837104054775, + "grad_norm": 2.223979949951172, + "learning_rate": 2.8989862297702148e-05, + "loss": 1.9266, + "step": 60330 + }, + { + "epoch": 0.11751292067658778, + "grad_norm": 3.5470621585845947, + "learning_rate": 2.8989365647817085e-05, + "loss": 1.7306, + "step": 60345 + }, + { + "epoch": 0.11754213094769804, + "grad_norm": 2.2605364322662354, + "learning_rate": 2.8988868880125605e-05, + "loss": 1.8383, + "step": 60360 + }, + { + "epoch": 0.1175713412188083, + "grad_norm": 2.4165682792663574, + "learning_rate": 2.8988371994631893e-05, + "loss": 1.8836, + "step": 60375 + }, + { + "epoch": 0.11760055148991856, + "grad_norm": 2.142549991607666, + "learning_rate": 2.8987874991340134e-05, + "loss": 1.6788, + "step": 60390 + }, + { + "epoch": 0.11762976176102882, + "grad_norm": 2.30549955368042, + "learning_rate": 2.8987377870254515e-05, + "loss": 1.8903, + "step": 60405 + }, + { + "epoch": 0.11765897203213908, + "grad_norm": 1.5044097900390625, + "learning_rate": 2.8986880631379217e-05, + "loss": 1.9231, + "step": 60420 + }, + { + "epoch": 0.11768818230324934, + "grad_norm": 3.0893962383270264, + "learning_rate": 2.898638327471843e-05, + "loss": 2.0674, + "step": 60435 + }, + { + "epoch": 0.11771739257435962, + "grad_norm": 2.925095796585083, + "learning_rate": 2.8985885800276344e-05, + "loss": 1.852, + "step": 60450 + }, + { + "epoch": 0.11774660284546988, + "grad_norm": 2.171668291091919, + "learning_rate": 2.8985388208057146e-05, + "loss": 2.1446, + "step": 60465 + }, + { + "epoch": 0.11777581311658014, + "grad_norm": 4.817943096160889, + "learning_rate": 2.8984890498065026e-05, + "loss": 1.687, + "step": 60480 + }, + { + "epoch": 0.1178050233876904, + "grad_norm": 3.2660818099975586, + "learning_rate": 2.898439267030418e-05, + "loss": 1.8976, + "step": 60495 + }, + { + "epoch": 0.11783423365880066, + "grad_norm": 5.511620044708252, + "learning_rate": 2.8983894724778794e-05, + "loss": 1.9338, + "step": 60510 + }, + { + "epoch": 0.11786344392991092, + "grad_norm": 1.8060615062713623, + "learning_rate": 2.8983396661493057e-05, + "loss": 1.7289, + "step": 60525 + }, + { + "epoch": 0.11789265420102119, + "grad_norm": 2.530665874481201, + "learning_rate": 2.898289848045118e-05, + "loss": 1.9435, + "step": 60540 + }, + { + "epoch": 0.11792186447213146, + "grad_norm": 2.7004477977752686, + "learning_rate": 2.8982400181657344e-05, + "loss": 1.8559, + "step": 60555 + }, + { + "epoch": 0.11795107474324172, + "grad_norm": 2.7683591842651367, + "learning_rate": 2.8981901765115748e-05, + "loss": 1.8328, + "step": 60570 + }, + { + "epoch": 0.11798028501435198, + "grad_norm": 2.9221267700195312, + "learning_rate": 2.898140323083059e-05, + "loss": 1.9367, + "step": 60585 + }, + { + "epoch": 0.11800949528546224, + "grad_norm": 2.6875112056732178, + "learning_rate": 2.8980904578806073e-05, + "loss": 1.7684, + "step": 60600 + }, + { + "epoch": 0.1180387055565725, + "grad_norm": 3.996904134750366, + "learning_rate": 2.8980405809046394e-05, + "loss": 1.8427, + "step": 60615 + }, + { + "epoch": 0.11806791582768277, + "grad_norm": 2.540865898132324, + "learning_rate": 2.8979906921555746e-05, + "loss": 1.9931, + "step": 60630 + }, + { + "epoch": 0.11809712609879303, + "grad_norm": 3.322305202484131, + "learning_rate": 2.897940791633834e-05, + "loss": 1.7296, + "step": 60645 + }, + { + "epoch": 0.1181263363699033, + "grad_norm": 5.390257835388184, + "learning_rate": 2.8978908793398374e-05, + "loss": 1.8628, + "step": 60660 + }, + { + "epoch": 0.11815554664101356, + "grad_norm": 3.835862398147583, + "learning_rate": 2.897840955274005e-05, + "loss": 1.958, + "step": 60675 + }, + { + "epoch": 0.11818475691212382, + "grad_norm": 4.43618106842041, + "learning_rate": 2.897791019436757e-05, + "loss": 1.9294, + "step": 60690 + }, + { + "epoch": 0.11821396718323408, + "grad_norm": 2.681001901626587, + "learning_rate": 2.897741071828514e-05, + "loss": 1.8041, + "step": 60705 + }, + { + "epoch": 0.11824317745434434, + "grad_norm": 2.577934741973877, + "learning_rate": 2.8976911124496978e-05, + "loss": 2.1211, + "step": 60720 + }, + { + "epoch": 0.1182723877254546, + "grad_norm": 2.2596302032470703, + "learning_rate": 2.8976411413007274e-05, + "loss": 1.8579, + "step": 60735 + }, + { + "epoch": 0.11830159799656487, + "grad_norm": 1.9503886699676514, + "learning_rate": 2.8975911583820246e-05, + "loss": 1.7347, + "step": 60750 + }, + { + "epoch": 0.11833080826767514, + "grad_norm": 2.2655904293060303, + "learning_rate": 2.89754116369401e-05, + "loss": 1.6944, + "step": 60765 + }, + { + "epoch": 0.1183600185387854, + "grad_norm": 2.3302783966064453, + "learning_rate": 2.8974911572371048e-05, + "loss": 1.9971, + "step": 60780 + }, + { + "epoch": 0.11838922880989566, + "grad_norm": 2.7237141132354736, + "learning_rate": 2.89744113901173e-05, + "loss": 1.9298, + "step": 60795 + }, + { + "epoch": 0.11841843908100592, + "grad_norm": 4.494090557098389, + "learning_rate": 2.8973911090183073e-05, + "loss": 1.9438, + "step": 60810 + }, + { + "epoch": 0.11844764935211619, + "grad_norm": 3.908583879470825, + "learning_rate": 2.897341067257257e-05, + "loss": 1.755, + "step": 60825 + }, + { + "epoch": 0.11847685962322645, + "grad_norm": 2.5664403438568115, + "learning_rate": 2.8972910137290006e-05, + "loss": 1.9111, + "step": 60840 + }, + { + "epoch": 0.11850606989433671, + "grad_norm": 2.166508197784424, + "learning_rate": 2.897240948433961e-05, + "loss": 1.9073, + "step": 60855 + }, + { + "epoch": 0.11853528016544698, + "grad_norm": 4.2281928062438965, + "learning_rate": 2.8971908713725582e-05, + "loss": 1.7498, + "step": 60870 + }, + { + "epoch": 0.11856449043655724, + "grad_norm": 3.2585582733154297, + "learning_rate": 2.897140782545215e-05, + "loss": 1.7064, + "step": 60885 + }, + { + "epoch": 0.1185937007076675, + "grad_norm": 3.947725296020508, + "learning_rate": 2.8970906819523527e-05, + "loss": 1.6341, + "step": 60900 + }, + { + "epoch": 0.11862291097877777, + "grad_norm": 4.203964710235596, + "learning_rate": 2.8970405695943926e-05, + "loss": 1.8891, + "step": 60915 + }, + { + "epoch": 0.11865212124988803, + "grad_norm": 4.477170467376709, + "learning_rate": 2.8969904454717582e-05, + "loss": 1.7875, + "step": 60930 + }, + { + "epoch": 0.11868133152099829, + "grad_norm": 3.4557018280029297, + "learning_rate": 2.896940309584871e-05, + "loss": 1.8095, + "step": 60945 + }, + { + "epoch": 0.11871054179210855, + "grad_norm": 4.404900550842285, + "learning_rate": 2.8968901619341522e-05, + "loss": 1.8171, + "step": 60960 + }, + { + "epoch": 0.11873975206321881, + "grad_norm": 3.3139572143554688, + "learning_rate": 2.8968400025200252e-05, + "loss": 2.0987, + "step": 60975 + }, + { + "epoch": 0.11876896233432908, + "grad_norm": 3.3506247997283936, + "learning_rate": 2.8967898313429118e-05, + "loss": 1.679, + "step": 60990 + }, + { + "epoch": 0.11879817260543934, + "grad_norm": 2.8667874336242676, + "learning_rate": 2.8967396484032352e-05, + "loss": 1.8963, + "step": 61005 + }, + { + "epoch": 0.1188273828765496, + "grad_norm": 3.6289522647857666, + "learning_rate": 2.896689453701417e-05, + "loss": 1.8406, + "step": 61020 + }, + { + "epoch": 0.11885659314765987, + "grad_norm": 2.5075268745422363, + "learning_rate": 2.8966392472378812e-05, + "loss": 1.8282, + "step": 61035 + }, + { + "epoch": 0.11888580341877013, + "grad_norm": 1.956333875656128, + "learning_rate": 2.8965890290130495e-05, + "loss": 1.8805, + "step": 61050 + }, + { + "epoch": 0.11891501368988039, + "grad_norm": 1.7926033735275269, + "learning_rate": 2.8965387990273452e-05, + "loss": 1.7483, + "step": 61065 + }, + { + "epoch": 0.11894422396099065, + "grad_norm": 3.883046865463257, + "learning_rate": 2.8964885572811912e-05, + "loss": 1.7479, + "step": 61080 + }, + { + "epoch": 0.11897343423210092, + "grad_norm": 3.3824377059936523, + "learning_rate": 2.8964383037750107e-05, + "loss": 1.6685, + "step": 61095 + }, + { + "epoch": 0.11900264450321119, + "grad_norm": 3.1565215587615967, + "learning_rate": 2.8963880385092268e-05, + "loss": 1.8292, + "step": 61110 + }, + { + "epoch": 0.11903185477432145, + "grad_norm": 3.7736783027648926, + "learning_rate": 2.896337761484263e-05, + "loss": 1.8199, + "step": 61125 + }, + { + "epoch": 0.11906106504543171, + "grad_norm": 5.266022205352783, + "learning_rate": 2.8962874727005423e-05, + "loss": 1.8482, + "step": 61140 + }, + { + "epoch": 0.11909027531654197, + "grad_norm": 2.3416106700897217, + "learning_rate": 2.8962371721584887e-05, + "loss": 2.0724, + "step": 61155 + }, + { + "epoch": 0.11911948558765223, + "grad_norm": 3.0689048767089844, + "learning_rate": 2.8961868598585252e-05, + "loss": 1.9616, + "step": 61170 + }, + { + "epoch": 0.11914869585876249, + "grad_norm": 2.1503915786743164, + "learning_rate": 2.8961365358010762e-05, + "loss": 1.814, + "step": 61185 + }, + { + "epoch": 0.11917790612987277, + "grad_norm": 1.941544771194458, + "learning_rate": 2.8960861999865646e-05, + "loss": 1.8754, + "step": 61200 + }, + { + "epoch": 0.11920711640098303, + "grad_norm": 3.8613669872283936, + "learning_rate": 2.896035852415415e-05, + "loss": 1.8104, + "step": 61215 + }, + { + "epoch": 0.11923632667209329, + "grad_norm": 2.211416244506836, + "learning_rate": 2.8959854930880513e-05, + "loss": 1.8031, + "step": 61230 + }, + { + "epoch": 0.11926553694320355, + "grad_norm": 3.4285852909088135, + "learning_rate": 2.895935122004897e-05, + "loss": 2.0594, + "step": 61245 + }, + { + "epoch": 0.11929474721431381, + "grad_norm": 3.2467575073242188, + "learning_rate": 2.895884739166377e-05, + "loss": 1.8254, + "step": 61260 + }, + { + "epoch": 0.11932395748542407, + "grad_norm": 4.514699459075928, + "learning_rate": 2.895834344572915e-05, + "loss": 1.9133, + "step": 61275 + }, + { + "epoch": 0.11935316775653433, + "grad_norm": 2.221604585647583, + "learning_rate": 2.895783938224936e-05, + "loss": 1.8552, + "step": 61290 + }, + { + "epoch": 0.1193823780276446, + "grad_norm": 2.2877743244171143, + "learning_rate": 2.895733520122864e-05, + "loss": 1.8547, + "step": 61305 + }, + { + "epoch": 0.11941158829875487, + "grad_norm": 4.658250331878662, + "learning_rate": 2.8956830902671237e-05, + "loss": 1.9996, + "step": 61320 + }, + { + "epoch": 0.11944079856986513, + "grad_norm": 3.654372215270996, + "learning_rate": 2.89563264865814e-05, + "loss": 1.8965, + "step": 61335 + }, + { + "epoch": 0.11947000884097539, + "grad_norm": 2.5835392475128174, + "learning_rate": 2.895582195296337e-05, + "loss": 1.9539, + "step": 61350 + }, + { + "epoch": 0.11949921911208565, + "grad_norm": 3.009052038192749, + "learning_rate": 2.8955317301821407e-05, + "loss": 1.8193, + "step": 61365 + }, + { + "epoch": 0.11952842938319591, + "grad_norm": 3.8412842750549316, + "learning_rate": 2.895481253315975e-05, + "loss": 1.9583, + "step": 61380 + }, + { + "epoch": 0.11955763965430617, + "grad_norm": 2.392502784729004, + "learning_rate": 2.8954307646982655e-05, + "loss": 1.9173, + "step": 61395 + }, + { + "epoch": 0.11958684992541645, + "grad_norm": 2.9303061962127686, + "learning_rate": 2.895380264329437e-05, + "loss": 1.9729, + "step": 61410 + }, + { + "epoch": 0.11961606019652671, + "grad_norm": 2.413651704788208, + "learning_rate": 2.8953297522099153e-05, + "loss": 1.8252, + "step": 61425 + }, + { + "epoch": 0.11964527046763697, + "grad_norm": 2.00931978225708, + "learning_rate": 2.895279228340126e-05, + "loss": 1.75, + "step": 61440 + }, + { + "epoch": 0.11967448073874723, + "grad_norm": 2.3669769763946533, + "learning_rate": 2.8952286927204933e-05, + "loss": 1.8266, + "step": 61455 + }, + { + "epoch": 0.11970369100985749, + "grad_norm": 4.001590728759766, + "learning_rate": 2.895178145351444e-05, + "loss": 1.8002, + "step": 61470 + }, + { + "epoch": 0.11973290128096775, + "grad_norm": 3.8861963748931885, + "learning_rate": 2.895127586233403e-05, + "loss": 1.7609, + "step": 61485 + }, + { + "epoch": 0.11976211155207801, + "grad_norm": 4.440252304077148, + "learning_rate": 2.8950770153667967e-05, + "loss": 1.7088, + "step": 61500 + }, + { + "epoch": 0.11979132182318829, + "grad_norm": 3.9889941215515137, + "learning_rate": 2.8950264327520506e-05, + "loss": 1.8522, + "step": 61515 + }, + { + "epoch": 0.11982053209429855, + "grad_norm": 1.9728597402572632, + "learning_rate": 2.8949758383895905e-05, + "loss": 1.9687, + "step": 61530 + }, + { + "epoch": 0.11984974236540881, + "grad_norm": 3.095778226852417, + "learning_rate": 2.8949252322798425e-05, + "loss": 1.971, + "step": 61545 + }, + { + "epoch": 0.11987895263651907, + "grad_norm": 3.4150264263153076, + "learning_rate": 2.8948746144232332e-05, + "loss": 1.6503, + "step": 61560 + }, + { + "epoch": 0.11990816290762933, + "grad_norm": 2.9555540084838867, + "learning_rate": 2.894823984820189e-05, + "loss": 1.8696, + "step": 61575 + }, + { + "epoch": 0.11993737317873959, + "grad_norm": 4.342530727386475, + "learning_rate": 2.8947733434711353e-05, + "loss": 1.8185, + "step": 61590 + }, + { + "epoch": 0.11996658344984985, + "grad_norm": 4.22271203994751, + "learning_rate": 2.8947226903764992e-05, + "loss": 1.8989, + "step": 61605 + }, + { + "epoch": 0.11999579372096013, + "grad_norm": 2.5919647216796875, + "learning_rate": 2.894672025536707e-05, + "loss": 1.9781, + "step": 61620 + }, + { + "epoch": 0.12002500399207039, + "grad_norm": 3.6082923412323, + "learning_rate": 2.8946213489521858e-05, + "loss": 1.7489, + "step": 61635 + }, + { + "epoch": 0.12005421426318065, + "grad_norm": 2.9248900413513184, + "learning_rate": 2.8945706606233623e-05, + "loss": 1.9294, + "step": 61650 + }, + { + "epoch": 0.12008342453429091, + "grad_norm": 2.6090869903564453, + "learning_rate": 2.8945199605506628e-05, + "loss": 1.7851, + "step": 61665 + }, + { + "epoch": 0.12011263480540117, + "grad_norm": 4.4757513999938965, + "learning_rate": 2.8944692487345145e-05, + "loss": 1.8639, + "step": 61680 + }, + { + "epoch": 0.12014184507651143, + "grad_norm": 2.640516757965088, + "learning_rate": 2.8944185251753444e-05, + "loss": 1.7432, + "step": 61695 + }, + { + "epoch": 0.1201710553476217, + "grad_norm": 2.7510807514190674, + "learning_rate": 2.8943677898735804e-05, + "loss": 1.9618, + "step": 61710 + }, + { + "epoch": 0.12020026561873197, + "grad_norm": 5.443726062774658, + "learning_rate": 2.8943170428296486e-05, + "loss": 1.8162, + "step": 61725 + }, + { + "epoch": 0.12022947588984223, + "grad_norm": 3.45536208152771, + "learning_rate": 2.894266284043977e-05, + "loss": 1.9092, + "step": 61740 + }, + { + "epoch": 0.12025868616095249, + "grad_norm": 2.9676899909973145, + "learning_rate": 2.8942155135169927e-05, + "loss": 1.8096, + "step": 61755 + }, + { + "epoch": 0.12028789643206275, + "grad_norm": 2.7942233085632324, + "learning_rate": 2.894164731249124e-05, + "loss": 1.7788, + "step": 61770 + }, + { + "epoch": 0.12031710670317301, + "grad_norm": 2.2713112831115723, + "learning_rate": 2.894113937240798e-05, + "loss": 1.7322, + "step": 61785 + }, + { + "epoch": 0.12034631697428327, + "grad_norm": 2.374073028564453, + "learning_rate": 2.8940631314924417e-05, + "loss": 1.9569, + "step": 61800 + }, + { + "epoch": 0.12037552724539353, + "grad_norm": 4.36929988861084, + "learning_rate": 2.8940123140044847e-05, + "loss": 1.8433, + "step": 61815 + }, + { + "epoch": 0.12040473751650381, + "grad_norm": 3.401564359664917, + "learning_rate": 2.8939614847773533e-05, + "loss": 2.0956, + "step": 61830 + }, + { + "epoch": 0.12043394778761407, + "grad_norm": 2.654738664627075, + "learning_rate": 2.893910643811476e-05, + "loss": 1.9126, + "step": 61845 + }, + { + "epoch": 0.12046315805872433, + "grad_norm": 2.636667490005493, + "learning_rate": 2.893859791107282e-05, + "loss": 1.9999, + "step": 61860 + }, + { + "epoch": 0.12049236832983459, + "grad_norm": 3.8728511333465576, + "learning_rate": 2.8938089266651975e-05, + "loss": 1.7145, + "step": 61875 + }, + { + "epoch": 0.12052157860094485, + "grad_norm": 3.9990079402923584, + "learning_rate": 2.893758050485653e-05, + "loss": 1.9485, + "step": 61890 + }, + { + "epoch": 0.12055078887205511, + "grad_norm": 5.385690689086914, + "learning_rate": 2.8937071625690755e-05, + "loss": 1.9099, + "step": 61905 + }, + { + "epoch": 0.12057999914316538, + "grad_norm": 2.5041937828063965, + "learning_rate": 2.893656262915894e-05, + "loss": 2.0346, + "step": 61920 + }, + { + "epoch": 0.12060920941427565, + "grad_norm": 3.264338731765747, + "learning_rate": 2.8936053515265372e-05, + "loss": 2.0222, + "step": 61935 + }, + { + "epoch": 0.12063841968538591, + "grad_norm": 3.1201021671295166, + "learning_rate": 2.8935544284014333e-05, + "loss": 1.75, + "step": 61950 + }, + { + "epoch": 0.12066762995649617, + "grad_norm": 4.42014741897583, + "learning_rate": 2.8935034935410118e-05, + "loss": 1.8835, + "step": 61965 + }, + { + "epoch": 0.12069684022760643, + "grad_norm": 2.497922420501709, + "learning_rate": 2.8934525469457017e-05, + "loss": 1.851, + "step": 61980 + }, + { + "epoch": 0.1207260504987167, + "grad_norm": 5.07431173324585, + "learning_rate": 2.8934015886159316e-05, + "loss": 1.7288, + "step": 61995 + }, + { + "epoch": 0.12075526076982696, + "grad_norm": 4.267670154571533, + "learning_rate": 2.8933506185521306e-05, + "loss": 1.9057, + "step": 62010 + }, + { + "epoch": 0.12078447104093722, + "grad_norm": 2.9418106079101562, + "learning_rate": 2.893299636754728e-05, + "loss": 1.8963, + "step": 62025 + }, + { + "epoch": 0.12081368131204748, + "grad_norm": 3.0219571590423584, + "learning_rate": 2.8932486432241536e-05, + "loss": 1.839, + "step": 62040 + }, + { + "epoch": 0.12084289158315775, + "grad_norm": 3.831568956375122, + "learning_rate": 2.8931976379608364e-05, + "loss": 2.0567, + "step": 62055 + }, + { + "epoch": 0.12087210185426801, + "grad_norm": 2.701246976852417, + "learning_rate": 2.8931466209652055e-05, + "loss": 1.9754, + "step": 62070 + }, + { + "epoch": 0.12090131212537827, + "grad_norm": 4.091612815856934, + "learning_rate": 2.893095592237691e-05, + "loss": 1.8665, + "step": 62085 + }, + { + "epoch": 0.12093052239648854, + "grad_norm": 2.7955589294433594, + "learning_rate": 2.893044551778723e-05, + "loss": 1.7736, + "step": 62100 + }, + { + "epoch": 0.1209597326675988, + "grad_norm": 2.7950093746185303, + "learning_rate": 2.8929934995887307e-05, + "loss": 1.7859, + "step": 62115 + }, + { + "epoch": 0.12098894293870906, + "grad_norm": 2.281625986099243, + "learning_rate": 2.892942435668144e-05, + "loss": 1.7336, + "step": 62130 + }, + { + "epoch": 0.12101815320981932, + "grad_norm": 3.1729629039764404, + "learning_rate": 2.8928913600173932e-05, + "loss": 1.9003, + "step": 62145 + }, + { + "epoch": 0.12104736348092959, + "grad_norm": 4.190723419189453, + "learning_rate": 2.8928402726369087e-05, + "loss": 1.928, + "step": 62160 + }, + { + "epoch": 0.12107657375203985, + "grad_norm": 2.0911364555358887, + "learning_rate": 2.89278917352712e-05, + "loss": 1.8792, + "step": 62175 + }, + { + "epoch": 0.12110578402315011, + "grad_norm": 4.6181559562683105, + "learning_rate": 2.892738062688458e-05, + "loss": 1.8116, + "step": 62190 + }, + { + "epoch": 0.12113499429426038, + "grad_norm": 2.0024867057800293, + "learning_rate": 2.8926869401213528e-05, + "loss": 1.7935, + "step": 62205 + }, + { + "epoch": 0.12116420456537064, + "grad_norm": 2.940186023712158, + "learning_rate": 2.8926358058262348e-05, + "loss": 1.7802, + "step": 62220 + }, + { + "epoch": 0.1211934148364809, + "grad_norm": 3.0467312335968018, + "learning_rate": 2.892584659803535e-05, + "loss": 1.9604, + "step": 62235 + }, + { + "epoch": 0.12122262510759116, + "grad_norm": 2.3714380264282227, + "learning_rate": 2.892533502053684e-05, + "loss": 1.9397, + "step": 62250 + }, + { + "epoch": 0.12125183537870143, + "grad_norm": 4.036520004272461, + "learning_rate": 2.8924823325771125e-05, + "loss": 1.9025, + "step": 62265 + }, + { + "epoch": 0.1212810456498117, + "grad_norm": 3.191206693649292, + "learning_rate": 2.8924311513742515e-05, + "loss": 1.7981, + "step": 62280 + }, + { + "epoch": 0.12131025592092196, + "grad_norm": 2.2503061294555664, + "learning_rate": 2.892379958445532e-05, + "loss": 2.0135, + "step": 62295 + }, + { + "epoch": 0.12133946619203222, + "grad_norm": 2.219902515411377, + "learning_rate": 2.892328753791385e-05, + "loss": 1.8386, + "step": 62310 + }, + { + "epoch": 0.12136867646314248, + "grad_norm": 3.0987050533294678, + "learning_rate": 2.8922775374122413e-05, + "loss": 1.845, + "step": 62325 + }, + { + "epoch": 0.12139788673425274, + "grad_norm": 3.779945135116577, + "learning_rate": 2.892226309308533e-05, + "loss": 1.8727, + "step": 62340 + }, + { + "epoch": 0.121427097005363, + "grad_norm": 2.546738624572754, + "learning_rate": 2.892175069480691e-05, + "loss": 1.6608, + "step": 62355 + }, + { + "epoch": 0.12145630727647327, + "grad_norm": 4.801875591278076, + "learning_rate": 2.892123817929147e-05, + "loss": 1.9176, + "step": 62370 + }, + { + "epoch": 0.12148551754758354, + "grad_norm": 2.352383613586426, + "learning_rate": 2.8920725546543328e-05, + "loss": 2.1223, + "step": 62385 + }, + { + "epoch": 0.1215147278186938, + "grad_norm": 4.339234828948975, + "learning_rate": 2.8920212796566796e-05, + "loss": 1.9376, + "step": 62400 + }, + { + "epoch": 0.12154393808980406, + "grad_norm": 3.643491268157959, + "learning_rate": 2.8919699929366197e-05, + "loss": 1.9471, + "step": 62415 + }, + { + "epoch": 0.12157314836091432, + "grad_norm": 3.393761396408081, + "learning_rate": 2.8919186944945844e-05, + "loss": 2.0331, + "step": 62430 + }, + { + "epoch": 0.12160235863202458, + "grad_norm": 3.7035586833953857, + "learning_rate": 2.891867384331006e-05, + "loss": 1.8379, + "step": 62445 + }, + { + "epoch": 0.12163156890313484, + "grad_norm": 2.2590112686157227, + "learning_rate": 2.8918160624463166e-05, + "loss": 1.881, + "step": 62460 + }, + { + "epoch": 0.12166077917424512, + "grad_norm": 4.280823707580566, + "learning_rate": 2.8917647288409485e-05, + "loss": 1.8831, + "step": 62475 + }, + { + "epoch": 0.12168998944535538, + "grad_norm": 2.542282819747925, + "learning_rate": 2.891713383515334e-05, + "loss": 1.9306, + "step": 62490 + }, + { + "epoch": 0.12171919971646564, + "grad_norm": 2.3564460277557373, + "learning_rate": 2.891662026469905e-05, + "loss": 1.9267, + "step": 62505 + }, + { + "epoch": 0.1217484099875759, + "grad_norm": 2.0922868251800537, + "learning_rate": 2.8916106577050945e-05, + "loss": 1.9449, + "step": 62520 + }, + { + "epoch": 0.12177762025868616, + "grad_norm": 3.6258349418640137, + "learning_rate": 2.8915592772213347e-05, + "loss": 1.9711, + "step": 62535 + }, + { + "epoch": 0.12180683052979642, + "grad_norm": 2.3582475185394287, + "learning_rate": 2.891507885019059e-05, + "loss": 1.8104, + "step": 62550 + }, + { + "epoch": 0.12183604080090668, + "grad_norm": 4.437615871429443, + "learning_rate": 2.8914564810986993e-05, + "loss": 1.8647, + "step": 62565 + }, + { + "epoch": 0.12186525107201696, + "grad_norm": 3.7570269107818604, + "learning_rate": 2.891405065460689e-05, + "loss": 2.0324, + "step": 62580 + }, + { + "epoch": 0.12189446134312722, + "grad_norm": 2.267084836959839, + "learning_rate": 2.8913536381054612e-05, + "loss": 1.6888, + "step": 62595 + }, + { + "epoch": 0.12192367161423748, + "grad_norm": 2.2528107166290283, + "learning_rate": 2.8913021990334486e-05, + "loss": 1.9691, + "step": 62610 + }, + { + "epoch": 0.12195288188534774, + "grad_norm": 2.6176981925964355, + "learning_rate": 2.891250748245084e-05, + "loss": 1.8833, + "step": 62625 + }, + { + "epoch": 0.121982092156458, + "grad_norm": 1.6563457250595093, + "learning_rate": 2.891199285740802e-05, + "loss": 1.957, + "step": 62640 + }, + { + "epoch": 0.12201130242756826, + "grad_norm": 5.018991470336914, + "learning_rate": 2.8911478115210348e-05, + "loss": 1.9125, + "step": 62655 + }, + { + "epoch": 0.12204051269867852, + "grad_norm": 4.535865306854248, + "learning_rate": 2.8910963255862162e-05, + "loss": 1.882, + "step": 62670 + }, + { + "epoch": 0.1220697229697888, + "grad_norm": 5.537111759185791, + "learning_rate": 2.8910448279367795e-05, + "loss": 1.647, + "step": 62685 + }, + { + "epoch": 0.12209893324089906, + "grad_norm": 2.0825977325439453, + "learning_rate": 2.890993318573159e-05, + "loss": 1.7819, + "step": 62700 + }, + { + "epoch": 0.12212814351200932, + "grad_norm": 2.717433452606201, + "learning_rate": 2.8909417974957878e-05, + "loss": 1.9498, + "step": 62715 + }, + { + "epoch": 0.12215735378311958, + "grad_norm": 4.576835632324219, + "learning_rate": 2.8908902647051005e-05, + "loss": 1.7522, + "step": 62730 + }, + { + "epoch": 0.12218656405422984, + "grad_norm": 2.449582099914551, + "learning_rate": 2.8908387202015305e-05, + "loss": 1.7626, + "step": 62745 + }, + { + "epoch": 0.1222157743253401, + "grad_norm": 3.3162100315093994, + "learning_rate": 2.890787163985512e-05, + "loss": 1.7367, + "step": 62760 + }, + { + "epoch": 0.12224498459645036, + "grad_norm": 3.9670050144195557, + "learning_rate": 2.890735596057479e-05, + "loss": 1.8548, + "step": 62775 + }, + { + "epoch": 0.12227419486756064, + "grad_norm": 2.2449352741241455, + "learning_rate": 2.8906840164178663e-05, + "loss": 1.811, + "step": 62790 + }, + { + "epoch": 0.1223034051386709, + "grad_norm": 4.013428211212158, + "learning_rate": 2.8906324250671076e-05, + "loss": 1.9154, + "step": 62805 + }, + { + "epoch": 0.12233261540978116, + "grad_norm": 3.7071478366851807, + "learning_rate": 2.8905808220056384e-05, + "loss": 1.7244, + "step": 62820 + }, + { + "epoch": 0.12236182568089142, + "grad_norm": 4.227043151855469, + "learning_rate": 2.8905292072338915e-05, + "loss": 2.0685, + "step": 62835 + }, + { + "epoch": 0.12239103595200168, + "grad_norm": 3.63092303276062, + "learning_rate": 2.890477580752303e-05, + "loss": 1.7712, + "step": 62850 + }, + { + "epoch": 0.12242024622311194, + "grad_norm": 3.316952705383301, + "learning_rate": 2.8904259425613075e-05, + "loss": 1.8854, + "step": 62865 + }, + { + "epoch": 0.1224494564942222, + "grad_norm": 3.0845460891723633, + "learning_rate": 2.890374292661339e-05, + "loss": 1.7987, + "step": 62880 + }, + { + "epoch": 0.12247866676533248, + "grad_norm": 2.1164207458496094, + "learning_rate": 2.8903226310528336e-05, + "loss": 1.72, + "step": 62895 + }, + { + "epoch": 0.12250787703644274, + "grad_norm": 2.226426601409912, + "learning_rate": 2.8902709577362256e-05, + "loss": 1.8239, + "step": 62910 + }, + { + "epoch": 0.122537087307553, + "grad_norm": 4.813535690307617, + "learning_rate": 2.8902192727119503e-05, + "loss": 2.0548, + "step": 62925 + }, + { + "epoch": 0.12256629757866326, + "grad_norm": 2.932439088821411, + "learning_rate": 2.890167575980443e-05, + "loss": 1.8578, + "step": 62940 + }, + { + "epoch": 0.12259550784977352, + "grad_norm": 2.315194845199585, + "learning_rate": 2.8901158675421388e-05, + "loss": 1.9089, + "step": 62955 + }, + { + "epoch": 0.12262471812088378, + "grad_norm": 2.54640531539917, + "learning_rate": 2.8900641473974732e-05, + "loss": 1.7662, + "step": 62970 + }, + { + "epoch": 0.12265392839199404, + "grad_norm": 2.3514091968536377, + "learning_rate": 2.890012415546883e-05, + "loss": 1.8181, + "step": 62985 + }, + { + "epoch": 0.12268313866310432, + "grad_norm": 3.919431686401367, + "learning_rate": 2.8899606719908017e-05, + "loss": 1.888, + "step": 63000 + }, + { + "epoch": 0.12271234893421458, + "grad_norm": 3.8556947708129883, + "learning_rate": 2.8899089167296663e-05, + "loss": 1.943, + "step": 63015 + }, + { + "epoch": 0.12274155920532484, + "grad_norm": 4.162572383880615, + "learning_rate": 2.8898571497639126e-05, + "loss": 1.8776, + "step": 63030 + }, + { + "epoch": 0.1227707694764351, + "grad_norm": 5.002816200256348, + "learning_rate": 2.8898053710939762e-05, + "loss": 1.9297, + "step": 63045 + }, + { + "epoch": 0.12279997974754536, + "grad_norm": 3.133664608001709, + "learning_rate": 2.8897535807202935e-05, + "loss": 2.0413, + "step": 63060 + }, + { + "epoch": 0.12282919001865562, + "grad_norm": 2.037231206893921, + "learning_rate": 2.8897017786433003e-05, + "loss": 1.7563, + "step": 63075 + }, + { + "epoch": 0.12285840028976588, + "grad_norm": 5.102201461791992, + "learning_rate": 2.889649964863433e-05, + "loss": 1.8389, + "step": 63090 + }, + { + "epoch": 0.12288761056087615, + "grad_norm": 2.2610063552856445, + "learning_rate": 2.889598139381128e-05, + "loss": 1.7999, + "step": 63105 + }, + { + "epoch": 0.12291682083198642, + "grad_norm": 4.367095470428467, + "learning_rate": 2.8895463021968213e-05, + "loss": 2.0304, + "step": 63120 + }, + { + "epoch": 0.12294603110309668, + "grad_norm": 3.7887561321258545, + "learning_rate": 2.88949445331095e-05, + "loss": 2.0152, + "step": 63135 + }, + { + "epoch": 0.12297524137420694, + "grad_norm": 3.0901668071746826, + "learning_rate": 2.8894425927239504e-05, + "loss": 2.0434, + "step": 63150 + }, + { + "epoch": 0.1230044516453172, + "grad_norm": 2.291771173477173, + "learning_rate": 2.8893907204362596e-05, + "loss": 1.5804, + "step": 63165 + }, + { + "epoch": 0.12303366191642746, + "grad_norm": 2.1776444911956787, + "learning_rate": 2.8893388364483137e-05, + "loss": 1.8585, + "step": 63180 + }, + { + "epoch": 0.12306287218753773, + "grad_norm": 1.9936423301696777, + "learning_rate": 2.8892869407605503e-05, + "loss": 1.8461, + "step": 63195 + }, + { + "epoch": 0.12309208245864799, + "grad_norm": 1.9299992322921753, + "learning_rate": 2.889235033373406e-05, + "loss": 1.9391, + "step": 63210 + }, + { + "epoch": 0.12312129272975826, + "grad_norm": 3.0500028133392334, + "learning_rate": 2.8891831142873182e-05, + "loss": 1.7387, + "step": 63225 + }, + { + "epoch": 0.12315050300086852, + "grad_norm": 4.977494716644287, + "learning_rate": 2.8891311835027244e-05, + "loss": 2.067, + "step": 63240 + }, + { + "epoch": 0.12317971327197878, + "grad_norm": 2.7151644229888916, + "learning_rate": 2.889079241020061e-05, + "loss": 1.9431, + "step": 63255 + }, + { + "epoch": 0.12320892354308904, + "grad_norm": 3.5354936122894287, + "learning_rate": 2.889027286839766e-05, + "loss": 1.8259, + "step": 63270 + }, + { + "epoch": 0.1232381338141993, + "grad_norm": 3.0156402587890625, + "learning_rate": 2.888975320962277e-05, + "loss": 1.8055, + "step": 63285 + }, + { + "epoch": 0.12326734408530957, + "grad_norm": 5.685827732086182, + "learning_rate": 2.8889233433880316e-05, + "loss": 1.9695, + "step": 63300 + }, + { + "epoch": 0.12329655435641983, + "grad_norm": 2.9744393825531006, + "learning_rate": 2.888871354117467e-05, + "loss": 1.8921, + "step": 63315 + }, + { + "epoch": 0.1233257646275301, + "grad_norm": 1.8981090784072876, + "learning_rate": 2.8888193531510213e-05, + "loss": 1.8503, + "step": 63330 + }, + { + "epoch": 0.12335497489864036, + "grad_norm": 3.6258583068847656, + "learning_rate": 2.888767340489133e-05, + "loss": 1.9838, + "step": 63345 + }, + { + "epoch": 0.12338418516975062, + "grad_norm": 4.110543727874756, + "learning_rate": 2.8887153161322393e-05, + "loss": 1.9897, + "step": 63360 + }, + { + "epoch": 0.12341339544086088, + "grad_norm": 2.7951531410217285, + "learning_rate": 2.888663280080779e-05, + "loss": 1.8382, + "step": 63375 + }, + { + "epoch": 0.12344260571197115, + "grad_norm": 3.7334868907928467, + "learning_rate": 2.8886112323351898e-05, + "loss": 1.924, + "step": 63390 + }, + { + "epoch": 0.1234718159830814, + "grad_norm": 2.9434235095977783, + "learning_rate": 2.8885591728959094e-05, + "loss": 1.8666, + "step": 63405 + }, + { + "epoch": 0.12350102625419167, + "grad_norm": 2.318481206893921, + "learning_rate": 2.888507101763378e-05, + "loss": 1.6018, + "step": 63420 + }, + { + "epoch": 0.12353023652530194, + "grad_norm": 3.797417402267456, + "learning_rate": 2.8884550189380322e-05, + "loss": 1.6832, + "step": 63435 + }, + { + "epoch": 0.1235594467964122, + "grad_norm": 4.110567092895508, + "learning_rate": 2.8884029244203115e-05, + "loss": 1.8271, + "step": 63450 + }, + { + "epoch": 0.12358865706752246, + "grad_norm": 3.406374216079712, + "learning_rate": 2.8883508182106548e-05, + "loss": 1.758, + "step": 63465 + }, + { + "epoch": 0.12361786733863273, + "grad_norm": 2.331756114959717, + "learning_rate": 2.8882987003095006e-05, + "loss": 1.7892, + "step": 63480 + }, + { + "epoch": 0.12364707760974299, + "grad_norm": 3.049309253692627, + "learning_rate": 2.8882465707172877e-05, + "loss": 1.7127, + "step": 63495 + }, + { + "epoch": 0.12367628788085325, + "grad_norm": 2.8949882984161377, + "learning_rate": 2.8881944294344548e-05, + "loss": 1.8263, + "step": 63510 + }, + { + "epoch": 0.12370549815196351, + "grad_norm": 2.5011870861053467, + "learning_rate": 2.888142276461442e-05, + "loss": 1.9601, + "step": 63525 + }, + { + "epoch": 0.12373470842307378, + "grad_norm": 3.7954225540161133, + "learning_rate": 2.8880901117986874e-05, + "loss": 1.8716, + "step": 63540 + }, + { + "epoch": 0.12376391869418404, + "grad_norm": 2.6932499408721924, + "learning_rate": 2.8880379354466312e-05, + "loss": 1.8845, + "step": 63555 + }, + { + "epoch": 0.1237931289652943, + "grad_norm": 1.597825288772583, + "learning_rate": 2.8879857474057117e-05, + "loss": 2.0897, + "step": 63570 + }, + { + "epoch": 0.12382233923640457, + "grad_norm": 3.0030219554901123, + "learning_rate": 2.8879335476763694e-05, + "loss": 1.8082, + "step": 63585 + }, + { + "epoch": 0.12385154950751483, + "grad_norm": 3.359510660171509, + "learning_rate": 2.8878813362590438e-05, + "loss": 1.7425, + "step": 63600 + }, + { + "epoch": 0.12388075977862509, + "grad_norm": 2.3775649070739746, + "learning_rate": 2.887829113154174e-05, + "loss": 1.8354, + "step": 63615 + }, + { + "epoch": 0.12390997004973535, + "grad_norm": 2.412797212600708, + "learning_rate": 2.8877768783621993e-05, + "loss": 1.8869, + "step": 63630 + }, + { + "epoch": 0.12393918032084562, + "grad_norm": 3.483506202697754, + "learning_rate": 2.8877246318835613e-05, + "loss": 1.7311, + "step": 63645 + }, + { + "epoch": 0.12396839059195588, + "grad_norm": 3.427119255065918, + "learning_rate": 2.8876723737186984e-05, + "loss": 2.0598, + "step": 63660 + }, + { + "epoch": 0.12399760086306615, + "grad_norm": 3.329678535461426, + "learning_rate": 2.8876201038680518e-05, + "loss": 1.9445, + "step": 63675 + }, + { + "epoch": 0.1240268111341764, + "grad_norm": 2.487665891647339, + "learning_rate": 2.887567822332061e-05, + "loss": 1.8674, + "step": 63690 + }, + { + "epoch": 0.12405602140528667, + "grad_norm": 2.2204031944274902, + "learning_rate": 2.8875155291111662e-05, + "loss": 1.914, + "step": 63705 + }, + { + "epoch": 0.12408523167639693, + "grad_norm": 2.558992624282837, + "learning_rate": 2.887463224205808e-05, + "loss": 1.7646, + "step": 63720 + }, + { + "epoch": 0.12411444194750719, + "grad_norm": 3.4131546020507812, + "learning_rate": 2.8874109076164267e-05, + "loss": 1.918, + "step": 63735 + }, + { + "epoch": 0.12414365221861746, + "grad_norm": 1.932386875152588, + "learning_rate": 2.8873585793434633e-05, + "loss": 1.9782, + "step": 63750 + }, + { + "epoch": 0.12417286248972773, + "grad_norm": 2.2213900089263916, + "learning_rate": 2.8873062393873582e-05, + "loss": 1.8535, + "step": 63765 + }, + { + "epoch": 0.12420207276083799, + "grad_norm": 1.906951665878296, + "learning_rate": 2.887253887748552e-05, + "loss": 1.8015, + "step": 63780 + }, + { + "epoch": 0.12423128303194825, + "grad_norm": 2.378446340560913, + "learning_rate": 2.887201524427486e-05, + "loss": 1.8594, + "step": 63795 + }, + { + "epoch": 0.12426049330305851, + "grad_norm": 4.512030601501465, + "learning_rate": 2.8871491494246007e-05, + "loss": 1.8476, + "step": 63810 + }, + { + "epoch": 0.12428970357416877, + "grad_norm": 6.289272785186768, + "learning_rate": 2.8870967627403367e-05, + "loss": 1.9805, + "step": 63825 + }, + { + "epoch": 0.12431891384527903, + "grad_norm": 2.5792481899261475, + "learning_rate": 2.8870443643751367e-05, + "loss": 1.8174, + "step": 63840 + }, + { + "epoch": 0.1243481241163893, + "grad_norm": 2.570173501968384, + "learning_rate": 2.8869919543294408e-05, + "loss": 1.8518, + "step": 63855 + }, + { + "epoch": 0.12437733438749957, + "grad_norm": 2.7765541076660156, + "learning_rate": 2.88693953260369e-05, + "loss": 1.6097, + "step": 63870 + }, + { + "epoch": 0.12440654465860983, + "grad_norm": 1.9239813089370728, + "learning_rate": 2.8868870991983268e-05, + "loss": 1.9059, + "step": 63885 + }, + { + "epoch": 0.12443575492972009, + "grad_norm": 4.170844078063965, + "learning_rate": 2.8868346541137924e-05, + "loss": 1.9428, + "step": 63900 + }, + { + "epoch": 0.12446496520083035, + "grad_norm": 4.433461666107178, + "learning_rate": 2.886782197350528e-05, + "loss": 1.8889, + "step": 63915 + }, + { + "epoch": 0.12449417547194061, + "grad_norm": 2.7384488582611084, + "learning_rate": 2.886729728908976e-05, + "loss": 1.7698, + "step": 63930 + }, + { + "epoch": 0.12452338574305087, + "grad_norm": 4.619007110595703, + "learning_rate": 2.8866772487895775e-05, + "loss": 1.8817, + "step": 63945 + }, + { + "epoch": 0.12455259601416115, + "grad_norm": 5.189652442932129, + "learning_rate": 2.8866247569927752e-05, + "loss": 1.7155, + "step": 63960 + }, + { + "epoch": 0.12458180628527141, + "grad_norm": 2.6596601009368896, + "learning_rate": 2.8865722535190108e-05, + "loss": 1.8141, + "step": 63975 + }, + { + "epoch": 0.12461101655638167, + "grad_norm": 4.129812240600586, + "learning_rate": 2.8865197383687264e-05, + "loss": 1.7896, + "step": 63990 + }, + { + "epoch": 0.12464022682749193, + "grad_norm": 2.553821325302124, + "learning_rate": 2.8864672115423643e-05, + "loss": 1.9619, + "step": 64005 + }, + { + "epoch": 0.12466943709860219, + "grad_norm": 3.977208137512207, + "learning_rate": 2.8864146730403666e-05, + "loss": 1.7583, + "step": 64020 + }, + { + "epoch": 0.12469864736971245, + "grad_norm": 2.185173988342285, + "learning_rate": 2.8863621228631763e-05, + "loss": 1.8492, + "step": 64035 + }, + { + "epoch": 0.12472785764082271, + "grad_norm": 2.377901077270508, + "learning_rate": 2.8863095610112354e-05, + "loss": 1.9537, + "step": 64050 + }, + { + "epoch": 0.12475706791193299, + "grad_norm": 4.199695110321045, + "learning_rate": 2.8862569874849864e-05, + "loss": 1.9515, + "step": 64065 + }, + { + "epoch": 0.12478627818304325, + "grad_norm": 2.4705047607421875, + "learning_rate": 2.8862044022848727e-05, + "loss": 1.7964, + "step": 64080 + }, + { + "epoch": 0.12481548845415351, + "grad_norm": 2.4468441009521484, + "learning_rate": 2.8861518054113368e-05, + "loss": 1.8532, + "step": 64095 + }, + { + "epoch": 0.12484469872526377, + "grad_norm": 5.260087013244629, + "learning_rate": 2.8860991968648212e-05, + "loss": 1.5326, + "step": 64110 + }, + { + "epoch": 0.12487390899637403, + "grad_norm": 2.134791135787964, + "learning_rate": 2.8860465766457695e-05, + "loss": 1.9259, + "step": 64125 + }, + { + "epoch": 0.12490311926748429, + "grad_norm": 2.4688735008239746, + "learning_rate": 2.8859939447546247e-05, + "loss": 1.7874, + "step": 64140 + }, + { + "epoch": 0.12493232953859455, + "grad_norm": 2.7646422386169434, + "learning_rate": 2.88594130119183e-05, + "loss": 1.7505, + "step": 64155 + }, + { + "epoch": 0.12496153980970481, + "grad_norm": 3.3007280826568604, + "learning_rate": 2.885888645957829e-05, + "loss": 1.9265, + "step": 64170 + }, + { + "epoch": 0.12499075008081509, + "grad_norm": 2.437790632247925, + "learning_rate": 2.885835979053064e-05, + "loss": 1.8096, + "step": 64185 + }, + { + "epoch": 0.12501996035192534, + "grad_norm": 2.7036232948303223, + "learning_rate": 2.88578330047798e-05, + "loss": 1.9318, + "step": 64200 + }, + { + "epoch": 0.1250491706230356, + "grad_norm": 3.2564563751220703, + "learning_rate": 2.8857306102330198e-05, + "loss": 1.7712, + "step": 64215 + }, + { + "epoch": 0.12507838089414589, + "grad_norm": 2.2260680198669434, + "learning_rate": 2.885677908318627e-05, + "loss": 1.9602, + "step": 64230 + }, + { + "epoch": 0.12510759116525613, + "grad_norm": 3.7234678268432617, + "learning_rate": 2.885625194735246e-05, + "loss": 1.9589, + "step": 64245 + }, + { + "epoch": 0.1251368014363664, + "grad_norm": 2.796656608581543, + "learning_rate": 2.8855724694833196e-05, + "loss": 1.9083, + "step": 64260 + }, + { + "epoch": 0.12516601170747665, + "grad_norm": 3.207852840423584, + "learning_rate": 2.8855197325632932e-05, + "loss": 1.8024, + "step": 64275 + }, + { + "epoch": 0.12519522197858693, + "grad_norm": 3.01822829246521, + "learning_rate": 2.8854669839756103e-05, + "loss": 1.7025, + "step": 64290 + }, + { + "epoch": 0.12522443224969718, + "grad_norm": 3.295149087905884, + "learning_rate": 2.885414223720715e-05, + "loss": 1.7098, + "step": 64305 + }, + { + "epoch": 0.12525364252080745, + "grad_norm": 2.0517284870147705, + "learning_rate": 2.8853614517990513e-05, + "loss": 1.897, + "step": 64320 + }, + { + "epoch": 0.12528285279191773, + "grad_norm": 4.530146598815918, + "learning_rate": 2.8853086682110643e-05, + "loss": 1.7723, + "step": 64335 + }, + { + "epoch": 0.12531206306302797, + "grad_norm": 2.5274271965026855, + "learning_rate": 2.8852558729571982e-05, + "loss": 1.8609, + "step": 64350 + }, + { + "epoch": 0.12534127333413825, + "grad_norm": 2.1734817028045654, + "learning_rate": 2.8852030660378978e-05, + "loss": 1.8455, + "step": 64365 + }, + { + "epoch": 0.1253704836052485, + "grad_norm": 2.3831608295440674, + "learning_rate": 2.885150247453607e-05, + "loss": 1.8966, + "step": 64380 + }, + { + "epoch": 0.12539969387635877, + "grad_norm": 3.3310117721557617, + "learning_rate": 2.8850974172047714e-05, + "loss": 2.039, + "step": 64395 + }, + { + "epoch": 0.12542890414746902, + "grad_norm": 4.581573963165283, + "learning_rate": 2.8850445752918358e-05, + "loss": 1.9365, + "step": 64410 + }, + { + "epoch": 0.1254581144185793, + "grad_norm": 2.3201091289520264, + "learning_rate": 2.8849917217152447e-05, + "loss": 1.9613, + "step": 64425 + }, + { + "epoch": 0.12548732468968957, + "grad_norm": 2.9959585666656494, + "learning_rate": 2.8849388564754436e-05, + "loss": 1.9158, + "step": 64440 + }, + { + "epoch": 0.1255165349607998, + "grad_norm": 3.9779443740844727, + "learning_rate": 2.8848859795728777e-05, + "loss": 1.7678, + "step": 64455 + }, + { + "epoch": 0.1255457452319101, + "grad_norm": 3.4818570613861084, + "learning_rate": 2.8848330910079922e-05, + "loss": 1.9997, + "step": 64470 + }, + { + "epoch": 0.12557495550302034, + "grad_norm": 2.5183308124542236, + "learning_rate": 2.8847801907812326e-05, + "loss": 1.7559, + "step": 64485 + }, + { + "epoch": 0.1256041657741306, + "grad_norm": 2.891653060913086, + "learning_rate": 2.884727278893044e-05, + "loss": 1.7905, + "step": 64500 + }, + { + "epoch": 0.12563337604524086, + "grad_norm": 4.580650806427002, + "learning_rate": 2.8846743553438724e-05, + "loss": 1.9012, + "step": 64515 + }, + { + "epoch": 0.12566258631635113, + "grad_norm": 3.7239174842834473, + "learning_rate": 2.8846214201341632e-05, + "loss": 1.8629, + "step": 64530 + }, + { + "epoch": 0.1256917965874614, + "grad_norm": 2.6561672687530518, + "learning_rate": 2.8845684732643625e-05, + "loss": 1.886, + "step": 64545 + }, + { + "epoch": 0.12572100685857165, + "grad_norm": 5.098997116088867, + "learning_rate": 2.8845155147349157e-05, + "loss": 1.9726, + "step": 64560 + }, + { + "epoch": 0.12575021712968193, + "grad_norm": 3.375009536743164, + "learning_rate": 2.8844625445462687e-05, + "loss": 1.8562, + "step": 64575 + }, + { + "epoch": 0.12577942740079218, + "grad_norm": 3.1518938541412354, + "learning_rate": 2.884409562698869e-05, + "loss": 1.8511, + "step": 64590 + }, + { + "epoch": 0.12580863767190245, + "grad_norm": 3.3796141147613525, + "learning_rate": 2.8843565691931606e-05, + "loss": 1.8368, + "step": 64605 + }, + { + "epoch": 0.1258378479430127, + "grad_norm": 2.1037158966064453, + "learning_rate": 2.8843035640295912e-05, + "loss": 1.9198, + "step": 64620 + }, + { + "epoch": 0.12586705821412297, + "grad_norm": 2.289296865463257, + "learning_rate": 2.884250547208607e-05, + "loss": 1.9617, + "step": 64635 + }, + { + "epoch": 0.12589626848523325, + "grad_norm": 3.0644466876983643, + "learning_rate": 2.8841975187306538e-05, + "loss": 1.8683, + "step": 64650 + }, + { + "epoch": 0.1259254787563435, + "grad_norm": 2.926682949066162, + "learning_rate": 2.884144478596179e-05, + "loss": 2.1487, + "step": 64665 + }, + { + "epoch": 0.12595468902745377, + "grad_norm": 3.0101606845855713, + "learning_rate": 2.8840914268056284e-05, + "loss": 2.0012, + "step": 64680 + }, + { + "epoch": 0.12598389929856402, + "grad_norm": 2.9421703815460205, + "learning_rate": 2.88403836335945e-05, + "loss": 1.899, + "step": 64695 + }, + { + "epoch": 0.1260131095696743, + "grad_norm": 2.595001459121704, + "learning_rate": 2.8839852882580894e-05, + "loss": 1.7682, + "step": 64710 + }, + { + "epoch": 0.12604231984078454, + "grad_norm": 3.492920160293579, + "learning_rate": 2.8839322015019938e-05, + "loss": 1.8176, + "step": 64725 + }, + { + "epoch": 0.12607153011189481, + "grad_norm": 4.069270133972168, + "learning_rate": 2.883879103091611e-05, + "loss": 1.9551, + "step": 64740 + }, + { + "epoch": 0.12610074038300506, + "grad_norm": 3.419351577758789, + "learning_rate": 2.8838259930273873e-05, + "loss": 1.9065, + "step": 64755 + }, + { + "epoch": 0.12612995065411534, + "grad_norm": 2.9626283645629883, + "learning_rate": 2.8837728713097706e-05, + "loss": 2.069, + "step": 64770 + }, + { + "epoch": 0.1261591609252256, + "grad_norm": 2.98925518989563, + "learning_rate": 2.8837197379392072e-05, + "loss": 1.9035, + "step": 64785 + }, + { + "epoch": 0.12618837119633586, + "grad_norm": 3.0914554595947266, + "learning_rate": 2.8836665929161464e-05, + "loss": 1.9422, + "step": 64800 + }, + { + "epoch": 0.12621758146744613, + "grad_norm": 2.896775722503662, + "learning_rate": 2.8836134362410337e-05, + "loss": 1.8179, + "step": 64815 + }, + { + "epoch": 0.12624679173855638, + "grad_norm": 2.040506601333618, + "learning_rate": 2.883560267914318e-05, + "loss": 1.8487, + "step": 64830 + }, + { + "epoch": 0.12627600200966665, + "grad_norm": 3.4945831298828125, + "learning_rate": 2.883507087936447e-05, + "loss": 1.823, + "step": 64845 + }, + { + "epoch": 0.1263052122807769, + "grad_norm": 3.0313773155212402, + "learning_rate": 2.8834538963078677e-05, + "loss": 2.0473, + "step": 64860 + }, + { + "epoch": 0.12633442255188718, + "grad_norm": 2.5263724327087402, + "learning_rate": 2.8834006930290284e-05, + "loss": 2.0135, + "step": 64875 + }, + { + "epoch": 0.12636363282299745, + "grad_norm": 3.2931931018829346, + "learning_rate": 2.883347478100378e-05, + "loss": 1.8289, + "step": 64890 + }, + { + "epoch": 0.1263928430941077, + "grad_norm": 2.162726402282715, + "learning_rate": 2.8832942515223638e-05, + "loss": 1.9084, + "step": 64905 + }, + { + "epoch": 0.12642205336521797, + "grad_norm": 3.2192437648773193, + "learning_rate": 2.8832410132954333e-05, + "loss": 1.9642, + "step": 64920 + }, + { + "epoch": 0.12645126363632822, + "grad_norm": 1.959382176399231, + "learning_rate": 2.8831877634200366e-05, + "loss": 1.8375, + "step": 64935 + }, + { + "epoch": 0.1264804739074385, + "grad_norm": 2.788815975189209, + "learning_rate": 2.883134501896621e-05, + "loss": 1.6245, + "step": 64950 + }, + { + "epoch": 0.12650968417854874, + "grad_norm": 2.264317750930786, + "learning_rate": 2.8830812287256355e-05, + "loss": 1.8792, + "step": 64965 + }, + { + "epoch": 0.12653889444965902, + "grad_norm": 2.6143927574157715, + "learning_rate": 2.8830279439075276e-05, + "loss": 1.8446, + "step": 64980 + }, + { + "epoch": 0.1265681047207693, + "grad_norm": 2.1117353439331055, + "learning_rate": 2.8829746474427476e-05, + "loss": 1.9703, + "step": 64995 + }, + { + "epoch": 0.12659731499187954, + "grad_norm": 4.056313991546631, + "learning_rate": 2.8829213393317432e-05, + "loss": 1.8611, + "step": 65010 + }, + { + "epoch": 0.12662652526298981, + "grad_norm": 3.005725383758545, + "learning_rate": 2.882868019574964e-05, + "loss": 1.858, + "step": 65025 + }, + { + "epoch": 0.12665573553410006, + "grad_norm": 4.17820405960083, + "learning_rate": 2.8828146881728584e-05, + "loss": 1.8271, + "step": 65040 + }, + { + "epoch": 0.12668494580521034, + "grad_norm": 2.2635533809661865, + "learning_rate": 2.882761345125876e-05, + "loss": 1.843, + "step": 65055 + }, + { + "epoch": 0.12671415607632058, + "grad_norm": 2.8097002506256104, + "learning_rate": 2.8827079904344656e-05, + "loss": 2.1206, + "step": 65070 + }, + { + "epoch": 0.12674336634743086, + "grad_norm": 5.229006767272949, + "learning_rate": 2.8826546240990772e-05, + "loss": 1.9803, + "step": 65085 + }, + { + "epoch": 0.12677257661854113, + "grad_norm": 3.5992817878723145, + "learning_rate": 2.8826012461201594e-05, + "loss": 1.8178, + "step": 65100 + }, + { + "epoch": 0.12680178688965138, + "grad_norm": 2.38820481300354, + "learning_rate": 2.8825478564981625e-05, + "loss": 1.9756, + "step": 65115 + }, + { + "epoch": 0.12683099716076165, + "grad_norm": 2.278277635574341, + "learning_rate": 2.882494455233535e-05, + "loss": 1.7746, + "step": 65130 + }, + { + "epoch": 0.1268602074318719, + "grad_norm": 3.2115724086761475, + "learning_rate": 2.8824410423267275e-05, + "loss": 2.0088, + "step": 65145 + }, + { + "epoch": 0.12688941770298218, + "grad_norm": 5.20640754699707, + "learning_rate": 2.8823876177781897e-05, + "loss": 1.9898, + "step": 65160 + }, + { + "epoch": 0.12691862797409242, + "grad_norm": 2.363861083984375, + "learning_rate": 2.8823341815883715e-05, + "loss": 1.574, + "step": 65175 + }, + { + "epoch": 0.1269478382452027, + "grad_norm": 4.3480377197265625, + "learning_rate": 2.8822807337577226e-05, + "loss": 1.9031, + "step": 65190 + }, + { + "epoch": 0.12697704851631297, + "grad_norm": 3.267199754714966, + "learning_rate": 2.882227274286693e-05, + "loss": 1.6944, + "step": 65205 + }, + { + "epoch": 0.12700625878742322, + "grad_norm": 2.3712382316589355, + "learning_rate": 2.882173803175733e-05, + "loss": 1.7736, + "step": 65220 + }, + { + "epoch": 0.1270354690585335, + "grad_norm": 1.8935054540634155, + "learning_rate": 2.8821203204252935e-05, + "loss": 1.8651, + "step": 65235 + }, + { + "epoch": 0.12706467932964374, + "grad_norm": 2.1789796352386475, + "learning_rate": 2.8820668260358242e-05, + "loss": 1.9417, + "step": 65250 + }, + { + "epoch": 0.12709388960075402, + "grad_norm": 2.480024814605713, + "learning_rate": 2.882013320007776e-05, + "loss": 1.8297, + "step": 65265 + }, + { + "epoch": 0.12712309987186426, + "grad_norm": 2.918729066848755, + "learning_rate": 2.8819598023415988e-05, + "loss": 1.8682, + "step": 65280 + }, + { + "epoch": 0.12715231014297454, + "grad_norm": 3.1942031383514404, + "learning_rate": 2.8819062730377438e-05, + "loss": 1.9975, + "step": 65295 + }, + { + "epoch": 0.12718152041408481, + "grad_norm": 4.053991794586182, + "learning_rate": 2.881852732096662e-05, + "loss": 1.8599, + "step": 65310 + }, + { + "epoch": 0.12721073068519506, + "grad_norm": 2.6643126010894775, + "learning_rate": 2.881799179518804e-05, + "loss": 1.9707, + "step": 65325 + }, + { + "epoch": 0.12723994095630534, + "grad_norm": 2.341019630432129, + "learning_rate": 2.8817456153046202e-05, + "loss": 1.9029, + "step": 65340 + }, + { + "epoch": 0.12726915122741558, + "grad_norm": 3.088487148284912, + "learning_rate": 2.881692039454563e-05, + "loss": 1.9615, + "step": 65355 + }, + { + "epoch": 0.12729836149852586, + "grad_norm": 3.1452345848083496, + "learning_rate": 2.8816384519690824e-05, + "loss": 1.9485, + "step": 65370 + }, + { + "epoch": 0.1273275717696361, + "grad_norm": 2.4688608646392822, + "learning_rate": 2.88158485284863e-05, + "loss": 1.8019, + "step": 65385 + }, + { + "epoch": 0.12735678204074638, + "grad_norm": 2.8671841621398926, + "learning_rate": 2.8815312420936572e-05, + "loss": 1.683, + "step": 65400 + }, + { + "epoch": 0.12738599231185666, + "grad_norm": 2.7107467651367188, + "learning_rate": 2.8814776197046157e-05, + "loss": 1.7119, + "step": 65415 + }, + { + "epoch": 0.1274152025829669, + "grad_norm": 3.93402361869812, + "learning_rate": 2.8814239856819572e-05, + "loss": 1.7754, + "step": 65430 + }, + { + "epoch": 0.12744441285407718, + "grad_norm": 2.0531046390533447, + "learning_rate": 2.8813703400261326e-05, + "loss": 1.8301, + "step": 65445 + }, + { + "epoch": 0.12747362312518742, + "grad_norm": 2.8359248638153076, + "learning_rate": 2.8813166827375943e-05, + "loss": 1.9635, + "step": 65460 + }, + { + "epoch": 0.1275028333962977, + "grad_norm": 3.6197519302368164, + "learning_rate": 2.8812630138167938e-05, + "loss": 1.8024, + "step": 65475 + }, + { + "epoch": 0.12753204366740795, + "grad_norm": 3.1808016300201416, + "learning_rate": 2.881209333264183e-05, + "loss": 1.7584, + "step": 65490 + }, + { + "epoch": 0.12756125393851822, + "grad_norm": 2.5192482471466064, + "learning_rate": 2.8811556410802143e-05, + "loss": 1.7153, + "step": 65505 + }, + { + "epoch": 0.1275904642096285, + "grad_norm": 2.254960775375366, + "learning_rate": 2.88110193726534e-05, + "loss": 1.8456, + "step": 65520 + }, + { + "epoch": 0.12761967448073874, + "grad_norm": 3.0390379428863525, + "learning_rate": 2.8810482218200114e-05, + "loss": 1.6019, + "step": 65535 + }, + { + "epoch": 0.12764888475184902, + "grad_norm": 3.328944206237793, + "learning_rate": 2.8809944947446823e-05, + "loss": 1.7163, + "step": 65550 + }, + { + "epoch": 0.12767809502295926, + "grad_norm": 2.3456404209136963, + "learning_rate": 2.8809407560398036e-05, + "loss": 1.9479, + "step": 65565 + }, + { + "epoch": 0.12770730529406954, + "grad_norm": 4.819161415100098, + "learning_rate": 2.880887005705829e-05, + "loss": 1.8769, + "step": 65580 + }, + { + "epoch": 0.1277365155651798, + "grad_norm": 3.071819305419922, + "learning_rate": 2.8808332437432104e-05, + "loss": 1.8851, + "step": 65595 + }, + { + "epoch": 0.12776572583629006, + "grad_norm": 3.7768914699554443, + "learning_rate": 2.8807794701524013e-05, + "loss": 1.9714, + "step": 65610 + }, + { + "epoch": 0.12779493610740034, + "grad_norm": 2.3370285034179688, + "learning_rate": 2.8807256849338538e-05, + "loss": 1.8949, + "step": 65625 + }, + { + "epoch": 0.12782414637851058, + "grad_norm": 4.984701633453369, + "learning_rate": 2.880671888088021e-05, + "loss": 1.8557, + "step": 65640 + }, + { + "epoch": 0.12785335664962086, + "grad_norm": 3.053363084793091, + "learning_rate": 2.880618079615356e-05, + "loss": 1.7479, + "step": 65655 + }, + { + "epoch": 0.1278825669207311, + "grad_norm": 5.252918243408203, + "learning_rate": 2.8805642595163122e-05, + "loss": 1.8732, + "step": 65670 + }, + { + "epoch": 0.12791177719184138, + "grad_norm": 3.567188024520874, + "learning_rate": 2.8805104277913425e-05, + "loss": 2.2211, + "step": 65685 + }, + { + "epoch": 0.12794098746295163, + "grad_norm": 3.3326799869537354, + "learning_rate": 2.8804565844409005e-05, + "loss": 1.7931, + "step": 65700 + }, + { + "epoch": 0.1279701977340619, + "grad_norm": 2.334365129470825, + "learning_rate": 2.880402729465439e-05, + "loss": 1.9189, + "step": 65715 + }, + { + "epoch": 0.12799940800517218, + "grad_norm": 1.6437816619873047, + "learning_rate": 2.8803488628654125e-05, + "loss": 1.8192, + "step": 65730 + }, + { + "epoch": 0.12802861827628242, + "grad_norm": 6.312033176422119, + "learning_rate": 2.880294984641274e-05, + "loss": 1.8375, + "step": 65745 + }, + { + "epoch": 0.1280578285473927, + "grad_norm": 2.6998863220214844, + "learning_rate": 2.880241094793477e-05, + "loss": 1.7736, + "step": 65760 + }, + { + "epoch": 0.12808703881850295, + "grad_norm": 4.209472179412842, + "learning_rate": 2.880187193322476e-05, + "loss": 1.9665, + "step": 65775 + }, + { + "epoch": 0.12811624908961322, + "grad_norm": 2.5797815322875977, + "learning_rate": 2.880133280228724e-05, + "loss": 1.6284, + "step": 65790 + }, + { + "epoch": 0.12814545936072347, + "grad_norm": 3.089334487915039, + "learning_rate": 2.8800793555126764e-05, + "loss": 1.7804, + "step": 65805 + }, + { + "epoch": 0.12817466963183374, + "grad_norm": 2.913644552230835, + "learning_rate": 2.8800254191747856e-05, + "loss": 1.8608, + "step": 65820 + }, + { + "epoch": 0.12820387990294402, + "grad_norm": 4.943084716796875, + "learning_rate": 2.8799714712155073e-05, + "loss": 1.9032, + "step": 65835 + }, + { + "epoch": 0.12823309017405427, + "grad_norm": 2.598860263824463, + "learning_rate": 2.8799175116352944e-05, + "loss": 1.9818, + "step": 65850 + }, + { + "epoch": 0.12826230044516454, + "grad_norm": 5.154215335845947, + "learning_rate": 2.8798635404346026e-05, + "loss": 1.8784, + "step": 65865 + }, + { + "epoch": 0.1282915107162748, + "grad_norm": 2.27248477935791, + "learning_rate": 2.8798095576138855e-05, + "loss": 2.0138, + "step": 65880 + }, + { + "epoch": 0.12832072098738506, + "grad_norm": 2.398949146270752, + "learning_rate": 2.8797555631735983e-05, + "loss": 1.8623, + "step": 65895 + }, + { + "epoch": 0.1283499312584953, + "grad_norm": 2.2456207275390625, + "learning_rate": 2.8797015571141956e-05, + "loss": 1.8461, + "step": 65910 + }, + { + "epoch": 0.12837914152960558, + "grad_norm": 3.295001983642578, + "learning_rate": 2.8796475394361318e-05, + "loss": 1.8851, + "step": 65925 + }, + { + "epoch": 0.12840835180071586, + "grad_norm": 2.2818961143493652, + "learning_rate": 2.879593510139862e-05, + "loss": 1.9005, + "step": 65940 + }, + { + "epoch": 0.1284375620718261, + "grad_norm": 3.6639366149902344, + "learning_rate": 2.8795394692258417e-05, + "loss": 2.018, + "step": 65955 + }, + { + "epoch": 0.12846677234293638, + "grad_norm": 2.1098251342773438, + "learning_rate": 2.879485416694525e-05, + "loss": 1.9125, + "step": 65970 + }, + { + "epoch": 0.12849598261404663, + "grad_norm": 4.794539451599121, + "learning_rate": 2.8794313525463677e-05, + "loss": 1.9403, + "step": 65985 + }, + { + "epoch": 0.1285251928851569, + "grad_norm": 3.6163485050201416, + "learning_rate": 2.879377276781825e-05, + "loss": 1.7901, + "step": 66000 + }, + { + "epoch": 0.12855440315626715, + "grad_norm": 3.6221139430999756, + "learning_rate": 2.8793231894013523e-05, + "loss": 1.9172, + "step": 66015 + }, + { + "epoch": 0.12858361342737742, + "grad_norm": 3.325624465942383, + "learning_rate": 2.879269090405405e-05, + "loss": 1.9139, + "step": 66030 + }, + { + "epoch": 0.1286128236984877, + "grad_norm": 2.292201280593872, + "learning_rate": 2.8792149797944382e-05, + "loss": 1.7878, + "step": 66045 + }, + { + "epoch": 0.12864203396959795, + "grad_norm": 4.61007022857666, + "learning_rate": 2.879160857568909e-05, + "loss": 1.821, + "step": 66060 + }, + { + "epoch": 0.12867124424070822, + "grad_norm": 1.8645203113555908, + "learning_rate": 2.8791067237292715e-05, + "loss": 1.9116, + "step": 66075 + }, + { + "epoch": 0.12870045451181847, + "grad_norm": 2.449690580368042, + "learning_rate": 2.8790525782759826e-05, + "loss": 2.0127, + "step": 66090 + }, + { + "epoch": 0.12872966478292874, + "grad_norm": 3.2927744388580322, + "learning_rate": 2.878998421209498e-05, + "loss": 1.8068, + "step": 66105 + }, + { + "epoch": 0.128758875054039, + "grad_norm": 3.9324254989624023, + "learning_rate": 2.8789442525302736e-05, + "loss": 1.8738, + "step": 66120 + }, + { + "epoch": 0.12878808532514927, + "grad_norm": 2.693402051925659, + "learning_rate": 2.878890072238766e-05, + "loss": 1.9898, + "step": 66135 + }, + { + "epoch": 0.12881729559625954, + "grad_norm": 3.9288182258605957, + "learning_rate": 2.878835880335431e-05, + "loss": 1.9142, + "step": 66150 + }, + { + "epoch": 0.1288465058673698, + "grad_norm": 4.016102313995361, + "learning_rate": 2.878781676820725e-05, + "loss": 1.84, + "step": 66165 + }, + { + "epoch": 0.12887571613848006, + "grad_norm": 3.676327705383301, + "learning_rate": 2.8787274616951047e-05, + "loss": 1.8891, + "step": 66180 + }, + { + "epoch": 0.1289049264095903, + "grad_norm": 3.4741785526275635, + "learning_rate": 2.8786732349590265e-05, + "loss": 1.6718, + "step": 66195 + }, + { + "epoch": 0.12893413668070058, + "grad_norm": 2.7453677654266357, + "learning_rate": 2.8786189966129475e-05, + "loss": 1.8173, + "step": 66210 + }, + { + "epoch": 0.12896334695181083, + "grad_norm": 2.2902612686157227, + "learning_rate": 2.8785647466573238e-05, + "loss": 1.7844, + "step": 66225 + }, + { + "epoch": 0.1289925572229211, + "grad_norm": 3.0282065868377686, + "learning_rate": 2.878510485092612e-05, + "loss": 1.9362, + "step": 66240 + }, + { + "epoch": 0.12902176749403138, + "grad_norm": 3.395966053009033, + "learning_rate": 2.8784562119192703e-05, + "loss": 2.0385, + "step": 66255 + }, + { + "epoch": 0.12905097776514163, + "grad_norm": 2.393310070037842, + "learning_rate": 2.878401927137754e-05, + "loss": 1.9473, + "step": 66270 + }, + { + "epoch": 0.1290801880362519, + "grad_norm": 3.5441393852233887, + "learning_rate": 2.8783476307485222e-05, + "loss": 2.0295, + "step": 66285 + }, + { + "epoch": 0.12910939830736215, + "grad_norm": 2.3071370124816895, + "learning_rate": 2.8782933227520302e-05, + "loss": 1.9038, + "step": 66300 + }, + { + "epoch": 0.12913860857847242, + "grad_norm": 1.9165912866592407, + "learning_rate": 2.878239003148737e-05, + "loss": 1.788, + "step": 66315 + }, + { + "epoch": 0.12916781884958267, + "grad_norm": 3.5178415775299072, + "learning_rate": 2.878184671939099e-05, + "loss": 1.7834, + "step": 66330 + }, + { + "epoch": 0.12919702912069295, + "grad_norm": 3.725377082824707, + "learning_rate": 2.878130329123574e-05, + "loss": 1.8542, + "step": 66345 + }, + { + "epoch": 0.12922623939180322, + "grad_norm": 4.175662517547607, + "learning_rate": 2.8780759747026198e-05, + "loss": 1.8923, + "step": 66360 + }, + { + "epoch": 0.12925544966291347, + "grad_norm": 2.5805375576019287, + "learning_rate": 2.878021608676694e-05, + "loss": 1.8448, + "step": 66375 + }, + { + "epoch": 0.12928465993402374, + "grad_norm": 2.706115961074829, + "learning_rate": 2.8779672310462544e-05, + "loss": 2.0845, + "step": 66390 + }, + { + "epoch": 0.129313870205134, + "grad_norm": 2.710679531097412, + "learning_rate": 2.877912841811759e-05, + "loss": 1.7479, + "step": 66405 + }, + { + "epoch": 0.12934308047624427, + "grad_norm": 3.627084493637085, + "learning_rate": 2.877858440973666e-05, + "loss": 1.8292, + "step": 66420 + }, + { + "epoch": 0.1293722907473545, + "grad_norm": 3.17694354057312, + "learning_rate": 2.877804028532433e-05, + "loss": 1.7593, + "step": 66435 + }, + { + "epoch": 0.1294015010184648, + "grad_norm": 3.346898317337036, + "learning_rate": 2.8777496044885183e-05, + "loss": 1.7555, + "step": 66450 + }, + { + "epoch": 0.12943071128957506, + "grad_norm": 3.465731382369995, + "learning_rate": 2.8776951688423807e-05, + "loss": 1.8564, + "step": 66465 + }, + { + "epoch": 0.1294599215606853, + "grad_norm": 3.05916690826416, + "learning_rate": 2.877640721594478e-05, + "loss": 1.891, + "step": 66480 + }, + { + "epoch": 0.12948913183179558, + "grad_norm": 3.16304349899292, + "learning_rate": 2.8775862627452696e-05, + "loss": 2.0304, + "step": 66495 + }, + { + "epoch": 0.12951834210290583, + "grad_norm": 3.4588115215301514, + "learning_rate": 2.877531792295213e-05, + "loss": 2.0573, + "step": 66510 + }, + { + "epoch": 0.1295475523740161, + "grad_norm": 2.639214515686035, + "learning_rate": 2.8774773102447677e-05, + "loss": 1.6277, + "step": 66525 + }, + { + "epoch": 0.12957676264512635, + "grad_norm": 2.7769949436187744, + "learning_rate": 2.8774228165943922e-05, + "loss": 1.8592, + "step": 66540 + }, + { + "epoch": 0.12960597291623663, + "grad_norm": 2.6024632453918457, + "learning_rate": 2.8773683113445452e-05, + "loss": 1.9916, + "step": 66555 + }, + { + "epoch": 0.1296351831873469, + "grad_norm": 3.6925864219665527, + "learning_rate": 2.877313794495686e-05, + "loss": 1.913, + "step": 66570 + }, + { + "epoch": 0.12966439345845715, + "grad_norm": 2.8301281929016113, + "learning_rate": 2.877259266048274e-05, + "loss": 1.9434, + "step": 66585 + }, + { + "epoch": 0.12969360372956742, + "grad_norm": 3.9472060203552246, + "learning_rate": 2.8772047260027677e-05, + "loss": 1.7998, + "step": 66600 + }, + { + "epoch": 0.12972281400067767, + "grad_norm": 3.631502151489258, + "learning_rate": 2.8771501743596265e-05, + "loss": 1.7505, + "step": 66615 + }, + { + "epoch": 0.12975202427178795, + "grad_norm": 2.6554458141326904, + "learning_rate": 2.8770956111193107e-05, + "loss": 1.8747, + "step": 66630 + }, + { + "epoch": 0.1297812345428982, + "grad_norm": 2.7983198165893555, + "learning_rate": 2.8770410362822784e-05, + "loss": 1.8788, + "step": 66645 + }, + { + "epoch": 0.12981044481400847, + "grad_norm": 1.874887466430664, + "learning_rate": 2.8769864498489903e-05, + "loss": 2.0088, + "step": 66660 + }, + { + "epoch": 0.12983965508511874, + "grad_norm": 2.888300657272339, + "learning_rate": 2.876931851819905e-05, + "loss": 1.9372, + "step": 66675 + }, + { + "epoch": 0.129868865356229, + "grad_norm": 1.908041000366211, + "learning_rate": 2.8768772421954834e-05, + "loss": 1.6132, + "step": 66690 + }, + { + "epoch": 0.12989807562733927, + "grad_norm": 2.139601707458496, + "learning_rate": 2.8768226209761853e-05, + "loss": 2.0994, + "step": 66705 + }, + { + "epoch": 0.1299272858984495, + "grad_norm": 3.5329482555389404, + "learning_rate": 2.8767679881624697e-05, + "loss": 2.0083, + "step": 66720 + }, + { + "epoch": 0.1299564961695598, + "grad_norm": 4.251967906951904, + "learning_rate": 2.8767133437547976e-05, + "loss": 1.7221, + "step": 66735 + }, + { + "epoch": 0.12998570644067003, + "grad_norm": 3.4373340606689453, + "learning_rate": 2.8766586877536283e-05, + "loss": 1.8573, + "step": 66750 + }, + { + "epoch": 0.1300149167117803, + "grad_norm": 2.3318285942077637, + "learning_rate": 2.8766040201594226e-05, + "loss": 1.8416, + "step": 66765 + }, + { + "epoch": 0.13004412698289058, + "grad_norm": 3.046931743621826, + "learning_rate": 2.8765493409726413e-05, + "loss": 1.9814, + "step": 66780 + }, + { + "epoch": 0.13007333725400083, + "grad_norm": 3.4320180416107178, + "learning_rate": 2.876494650193744e-05, + "loss": 1.9485, + "step": 66795 + }, + { + "epoch": 0.1301025475251111, + "grad_norm": 4.573573112487793, + "learning_rate": 2.8764399478231918e-05, + "loss": 1.7618, + "step": 66810 + }, + { + "epoch": 0.13013175779622135, + "grad_norm": 4.97303581237793, + "learning_rate": 2.876385233861445e-05, + "loss": 1.752, + "step": 66825 + }, + { + "epoch": 0.13016096806733163, + "grad_norm": 3.721406936645508, + "learning_rate": 2.876330508308965e-05, + "loss": 1.6955, + "step": 66840 + }, + { + "epoch": 0.13019017833844188, + "grad_norm": 2.616175413131714, + "learning_rate": 2.876275771166212e-05, + "loss": 1.755, + "step": 66855 + }, + { + "epoch": 0.13021938860955215, + "grad_norm": 2.9869017601013184, + "learning_rate": 2.8762210224336474e-05, + "loss": 1.8816, + "step": 66870 + }, + { + "epoch": 0.1302485988806624, + "grad_norm": 3.258497953414917, + "learning_rate": 2.8761662621117316e-05, + "loss": 1.9057, + "step": 66885 + }, + { + "epoch": 0.13027780915177267, + "grad_norm": 4.098228931427002, + "learning_rate": 2.8761114902009265e-05, + "loss": 1.8523, + "step": 66900 + }, + { + "epoch": 0.13030701942288295, + "grad_norm": 2.2816691398620605, + "learning_rate": 2.876056706701693e-05, + "loss": 1.8696, + "step": 66915 + }, + { + "epoch": 0.1303362296939932, + "grad_norm": 2.764667510986328, + "learning_rate": 2.8760019116144918e-05, + "loss": 1.6322, + "step": 66930 + }, + { + "epoch": 0.13036543996510347, + "grad_norm": 3.673103094100952, + "learning_rate": 2.8759471049397856e-05, + "loss": 1.8601, + "step": 66945 + }, + { + "epoch": 0.13039465023621372, + "grad_norm": 2.546541452407837, + "learning_rate": 2.8758922866780352e-05, + "loss": 1.8438, + "step": 66960 + }, + { + "epoch": 0.130423860507324, + "grad_norm": 3.561556816101074, + "learning_rate": 2.8758374568297025e-05, + "loss": 1.8338, + "step": 66975 + }, + { + "epoch": 0.13045307077843424, + "grad_norm": 2.477541923522949, + "learning_rate": 2.8757826153952486e-05, + "loss": 1.5823, + "step": 66990 + }, + { + "epoch": 0.1304822810495445, + "grad_norm": 1.9878603219985962, + "learning_rate": 2.875727762375136e-05, + "loss": 1.7307, + "step": 67005 + }, + { + "epoch": 0.1305114913206548, + "grad_norm": 2.126744031906128, + "learning_rate": 2.8756728977698264e-05, + "loss": 1.9878, + "step": 67020 + }, + { + "epoch": 0.13054070159176503, + "grad_norm": 5.064667701721191, + "learning_rate": 2.875618021579782e-05, + "loss": 1.6445, + "step": 67035 + }, + { + "epoch": 0.1305699118628753, + "grad_norm": 3.8306710720062256, + "learning_rate": 2.8755631338054645e-05, + "loss": 1.7151, + "step": 67050 + }, + { + "epoch": 0.13059912213398556, + "grad_norm": 3.468627691268921, + "learning_rate": 2.8755082344473367e-05, + "loss": 1.832, + "step": 67065 + }, + { + "epoch": 0.13062833240509583, + "grad_norm": 2.4451801776885986, + "learning_rate": 2.8754533235058606e-05, + "loss": 1.8397, + "step": 67080 + }, + { + "epoch": 0.13065754267620608, + "grad_norm": 2.9594390392303467, + "learning_rate": 2.8753984009814987e-05, + "loss": 1.8399, + "step": 67095 + }, + { + "epoch": 0.13068675294731635, + "grad_norm": 2.8104984760284424, + "learning_rate": 2.875343466874713e-05, + "loss": 1.9299, + "step": 67110 + }, + { + "epoch": 0.13071596321842663, + "grad_norm": 2.0295498371124268, + "learning_rate": 2.8752885211859667e-05, + "loss": 1.8094, + "step": 67125 + }, + { + "epoch": 0.13074517348953688, + "grad_norm": 2.9220430850982666, + "learning_rate": 2.8752335639157226e-05, + "loss": 1.8219, + "step": 67140 + }, + { + "epoch": 0.13077438376064715, + "grad_norm": 1.9838787317276, + "learning_rate": 2.8751785950644427e-05, + "loss": 1.8514, + "step": 67155 + }, + { + "epoch": 0.1308035940317574, + "grad_norm": 5.2550811767578125, + "learning_rate": 2.875123614632591e-05, + "loss": 1.8111, + "step": 67170 + }, + { + "epoch": 0.13083280430286767, + "grad_norm": 2.7518036365509033, + "learning_rate": 2.87506862262063e-05, + "loss": 1.958, + "step": 67185 + }, + { + "epoch": 0.13086201457397792, + "grad_norm": 3.9723989963531494, + "learning_rate": 2.875013619029022e-05, + "loss": 1.7816, + "step": 67200 + }, + { + "epoch": 0.1308912248450882, + "grad_norm": 2.8679094314575195, + "learning_rate": 2.8749586038582315e-05, + "loss": 1.674, + "step": 67215 + }, + { + "epoch": 0.13092043511619847, + "grad_norm": 5.053723335266113, + "learning_rate": 2.8749035771087212e-05, + "loss": 1.8904, + "step": 67230 + }, + { + "epoch": 0.13094964538730872, + "grad_norm": 2.5131609439849854, + "learning_rate": 2.8748485387809546e-05, + "loss": 1.9436, + "step": 67245 + }, + { + "epoch": 0.130978855658419, + "grad_norm": 3.221109628677368, + "learning_rate": 2.8747934888753944e-05, + "loss": 1.8688, + "step": 67260 + }, + { + "epoch": 0.13100806592952924, + "grad_norm": 3.8406662940979004, + "learning_rate": 2.8747384273925053e-05, + "loss": 1.8157, + "step": 67275 + }, + { + "epoch": 0.1310372762006395, + "grad_norm": 5.8072638511657715, + "learning_rate": 2.8746833543327506e-05, + "loss": 2.0254, + "step": 67290 + }, + { + "epoch": 0.13106648647174976, + "grad_norm": 2.596179485321045, + "learning_rate": 2.8746282696965943e-05, + "loss": 1.8614, + "step": 67305 + }, + { + "epoch": 0.13109569674286004, + "grad_norm": 2.5730509757995605, + "learning_rate": 2.8745731734844995e-05, + "loss": 2.0649, + "step": 67320 + }, + { + "epoch": 0.1311249070139703, + "grad_norm": 2.7249279022216797, + "learning_rate": 2.874518065696931e-05, + "loss": 1.7187, + "step": 67335 + }, + { + "epoch": 0.13115411728508056, + "grad_norm": 2.0172507762908936, + "learning_rate": 2.8744629463343524e-05, + "loss": 2.0294, + "step": 67350 + }, + { + "epoch": 0.13118332755619083, + "grad_norm": 3.5570526123046875, + "learning_rate": 2.8744078153972278e-05, + "loss": 1.9897, + "step": 67365 + }, + { + "epoch": 0.13121253782730108, + "grad_norm": 1.857101321220398, + "learning_rate": 2.8743526728860224e-05, + "loss": 1.7697, + "step": 67380 + }, + { + "epoch": 0.13124174809841135, + "grad_norm": 4.140279293060303, + "learning_rate": 2.8742975188011994e-05, + "loss": 1.7874, + "step": 67395 + }, + { + "epoch": 0.1312709583695216, + "grad_norm": 2.2621564865112305, + "learning_rate": 2.874242353143224e-05, + "loss": 1.8673, + "step": 67410 + }, + { + "epoch": 0.13130016864063188, + "grad_norm": 3.2192656993865967, + "learning_rate": 2.87418717591256e-05, + "loss": 2.0513, + "step": 67425 + }, + { + "epoch": 0.13132937891174215, + "grad_norm": 2.3391270637512207, + "learning_rate": 2.8741319871096724e-05, + "loss": 2.0733, + "step": 67440 + }, + { + "epoch": 0.1313585891828524, + "grad_norm": 2.8572590351104736, + "learning_rate": 2.8740767867350263e-05, + "loss": 1.8815, + "step": 67455 + }, + { + "epoch": 0.13138779945396267, + "grad_norm": 3.0698556900024414, + "learning_rate": 2.8740215747890865e-05, + "loss": 1.7972, + "step": 67470 + }, + { + "epoch": 0.13141700972507292, + "grad_norm": 3.0042874813079834, + "learning_rate": 2.873966351272318e-05, + "loss": 2.0349, + "step": 67485 + }, + { + "epoch": 0.1314462199961832, + "grad_norm": 2.179337501525879, + "learning_rate": 2.873911116185185e-05, + "loss": 1.9132, + "step": 67500 + }, + { + "epoch": 0.13147543026729344, + "grad_norm": 2.222003936767578, + "learning_rate": 2.8738558695281538e-05, + "loss": 1.9499, + "step": 67515 + }, + { + "epoch": 0.13150464053840372, + "grad_norm": 4.374329566955566, + "learning_rate": 2.8738006113016887e-05, + "loss": 1.9184, + "step": 67530 + }, + { + "epoch": 0.131533850809514, + "grad_norm": 3.095432996749878, + "learning_rate": 2.8737453415062555e-05, + "loss": 1.7847, + "step": 67545 + }, + { + "epoch": 0.13156306108062424, + "grad_norm": 4.952056884765625, + "learning_rate": 2.8736900601423197e-05, + "loss": 1.7146, + "step": 67560 + }, + { + "epoch": 0.1315922713517345, + "grad_norm": 3.800189971923828, + "learning_rate": 2.8736347672103466e-05, + "loss": 1.9003, + "step": 67575 + }, + { + "epoch": 0.13162148162284476, + "grad_norm": 2.564591646194458, + "learning_rate": 2.8735794627108022e-05, + "loss": 1.8199, + "step": 67590 + }, + { + "epoch": 0.13165069189395504, + "grad_norm": 2.126452922821045, + "learning_rate": 2.8735241466441516e-05, + "loss": 1.8407, + "step": 67605 + }, + { + "epoch": 0.13167990216506528, + "grad_norm": 2.4940192699432373, + "learning_rate": 2.8734688190108612e-05, + "loss": 2.0098, + "step": 67620 + }, + { + "epoch": 0.13170911243617556, + "grad_norm": 4.615790843963623, + "learning_rate": 2.8734134798113965e-05, + "loss": 1.8405, + "step": 67635 + }, + { + "epoch": 0.13173832270728583, + "grad_norm": 2.3933308124542236, + "learning_rate": 2.8733581290462238e-05, + "loss": 1.7982, + "step": 67650 + }, + { + "epoch": 0.13176753297839608, + "grad_norm": 3.4647879600524902, + "learning_rate": 2.8733027667158092e-05, + "loss": 1.7029, + "step": 67665 + }, + { + "epoch": 0.13179674324950635, + "grad_norm": 2.247746467590332, + "learning_rate": 2.8732473928206187e-05, + "loss": 1.9462, + "step": 67680 + }, + { + "epoch": 0.1318259535206166, + "grad_norm": 2.119633197784424, + "learning_rate": 2.873192007361119e-05, + "loss": 1.9116, + "step": 67695 + }, + { + "epoch": 0.13185516379172688, + "grad_norm": 2.2814173698425293, + "learning_rate": 2.873136610337776e-05, + "loss": 1.9661, + "step": 67710 + }, + { + "epoch": 0.13188437406283712, + "grad_norm": 2.4555954933166504, + "learning_rate": 2.873081201751057e-05, + "loss": 1.7874, + "step": 67725 + }, + { + "epoch": 0.1319135843339474, + "grad_norm": 3.7809746265411377, + "learning_rate": 2.873025781601428e-05, + "loss": 1.8942, + "step": 67740 + }, + { + "epoch": 0.13194279460505767, + "grad_norm": 4.395142555236816, + "learning_rate": 2.872970349889355e-05, + "loss": 1.8674, + "step": 67755 + }, + { + "epoch": 0.13197200487616792, + "grad_norm": 5.517259120941162, + "learning_rate": 2.8729149066153063e-05, + "loss": 1.7875, + "step": 67770 + }, + { + "epoch": 0.1320012151472782, + "grad_norm": 2.1210813522338867, + "learning_rate": 2.872859451779748e-05, + "loss": 1.8451, + "step": 67785 + }, + { + "epoch": 0.13203042541838844, + "grad_norm": 3.486795425415039, + "learning_rate": 2.872803985383147e-05, + "loss": 1.948, + "step": 67800 + }, + { + "epoch": 0.13205963568949872, + "grad_norm": 2.551523208618164, + "learning_rate": 2.872748507425971e-05, + "loss": 1.7632, + "step": 67815 + }, + { + "epoch": 0.13208884596060896, + "grad_norm": 3.7557811737060547, + "learning_rate": 2.8726930179086858e-05, + "loss": 1.6793, + "step": 67830 + }, + { + "epoch": 0.13211805623171924, + "grad_norm": 2.217620372772217, + "learning_rate": 2.8726375168317605e-05, + "loss": 1.8407, + "step": 67845 + }, + { + "epoch": 0.1321472665028295, + "grad_norm": 3.28259015083313, + "learning_rate": 2.872582004195661e-05, + "loss": 1.8631, + "step": 67860 + }, + { + "epoch": 0.13217647677393976, + "grad_norm": 2.7340691089630127, + "learning_rate": 2.8725264800008557e-05, + "loss": 2.0131, + "step": 67875 + }, + { + "epoch": 0.13220568704505004, + "grad_norm": 2.2978339195251465, + "learning_rate": 2.872470944247812e-05, + "loss": 1.8791, + "step": 67890 + }, + { + "epoch": 0.13223489731616028, + "grad_norm": 5.169447898864746, + "learning_rate": 2.8724153969369973e-05, + "loss": 2.0519, + "step": 67905 + }, + { + "epoch": 0.13226410758727056, + "grad_norm": 3.5658042430877686, + "learning_rate": 2.8723598380688792e-05, + "loss": 2.0561, + "step": 67920 + }, + { + "epoch": 0.1322933178583808, + "grad_norm": 3.061711311340332, + "learning_rate": 2.8723042676439262e-05, + "loss": 1.6826, + "step": 67935 + }, + { + "epoch": 0.13232252812949108, + "grad_norm": 4.970159530639648, + "learning_rate": 2.8722486856626063e-05, + "loss": 1.7423, + "step": 67950 + }, + { + "epoch": 0.13235173840060135, + "grad_norm": 4.6319169998168945, + "learning_rate": 2.8721930921253873e-05, + "loss": 1.9036, + "step": 67965 + }, + { + "epoch": 0.1323809486717116, + "grad_norm": 4.235183238983154, + "learning_rate": 2.8721374870327368e-05, + "loss": 1.8563, + "step": 67980 + }, + { + "epoch": 0.13241015894282188, + "grad_norm": 4.325774192810059, + "learning_rate": 2.8720818703851234e-05, + "loss": 1.9874, + "step": 67995 + }, + { + "epoch": 0.13243936921393212, + "grad_norm": 3.9221222400665283, + "learning_rate": 2.8720262421830163e-05, + "loss": 1.8196, + "step": 68010 + }, + { + "epoch": 0.1324685794850424, + "grad_norm": 4.31191349029541, + "learning_rate": 2.8719706024268824e-05, + "loss": 1.8825, + "step": 68025 + }, + { + "epoch": 0.13249778975615265, + "grad_norm": 3.0712528228759766, + "learning_rate": 2.8719149511171914e-05, + "loss": 1.9333, + "step": 68040 + }, + { + "epoch": 0.13252700002726292, + "grad_norm": 4.205140113830566, + "learning_rate": 2.8718592882544114e-05, + "loss": 1.8647, + "step": 68055 + }, + { + "epoch": 0.1325562102983732, + "grad_norm": 4.297982215881348, + "learning_rate": 2.8718036138390124e-05, + "loss": 1.9464, + "step": 68070 + }, + { + "epoch": 0.13258542056948344, + "grad_norm": 2.884995460510254, + "learning_rate": 2.8717479278714612e-05, + "loss": 1.8249, + "step": 68085 + }, + { + "epoch": 0.13261463084059372, + "grad_norm": 4.097220420837402, + "learning_rate": 2.871692230352228e-05, + "loss": 1.618, + "step": 68100 + }, + { + "epoch": 0.13264384111170396, + "grad_norm": 3.8072409629821777, + "learning_rate": 2.8716365212817817e-05, + "loss": 1.7722, + "step": 68115 + }, + { + "epoch": 0.13267305138281424, + "grad_norm": 2.440891742706299, + "learning_rate": 2.8715808006605914e-05, + "loss": 1.8661, + "step": 68130 + }, + { + "epoch": 0.13270226165392449, + "grad_norm": 2.6586554050445557, + "learning_rate": 2.871525068489126e-05, + "loss": 1.8815, + "step": 68145 + }, + { + "epoch": 0.13273147192503476, + "grad_norm": 3.1823229789733887, + "learning_rate": 2.8714693247678555e-05, + "loss": 1.7746, + "step": 68160 + }, + { + "epoch": 0.13276068219614504, + "grad_norm": 5.664133071899414, + "learning_rate": 2.8714135694972487e-05, + "loss": 1.8466, + "step": 68175 + }, + { + "epoch": 0.13278989246725528, + "grad_norm": 3.864992380142212, + "learning_rate": 2.8713578026777756e-05, + "loss": 2.0081, + "step": 68190 + }, + { + "epoch": 0.13281910273836556, + "grad_norm": 3.317230224609375, + "learning_rate": 2.8713020243099055e-05, + "loss": 1.8631, + "step": 68205 + }, + { + "epoch": 0.1328483130094758, + "grad_norm": 3.758924961090088, + "learning_rate": 2.871246234394108e-05, + "loss": 1.8402, + "step": 68220 + }, + { + "epoch": 0.13287752328058608, + "grad_norm": 2.1369099617004395, + "learning_rate": 2.871190432930853e-05, + "loss": 1.9708, + "step": 68235 + }, + { + "epoch": 0.13290673355169633, + "grad_norm": 4.580000400543213, + "learning_rate": 2.8711346199206106e-05, + "loss": 1.7985, + "step": 68250 + }, + { + "epoch": 0.1329359438228066, + "grad_norm": 2.259363889694214, + "learning_rate": 2.8710787953638506e-05, + "loss": 1.8259, + "step": 68265 + }, + { + "epoch": 0.13296515409391688, + "grad_norm": 3.068458318710327, + "learning_rate": 2.8710229592610438e-05, + "loss": 1.886, + "step": 68280 + }, + { + "epoch": 0.13299436436502712, + "grad_norm": 2.5178871154785156, + "learning_rate": 2.8709671116126595e-05, + "loss": 1.8417, + "step": 68295 + }, + { + "epoch": 0.1330235746361374, + "grad_norm": 2.892791271209717, + "learning_rate": 2.870911252419168e-05, + "loss": 1.765, + "step": 68310 + }, + { + "epoch": 0.13305278490724765, + "grad_norm": 3.5758793354034424, + "learning_rate": 2.8708553816810405e-05, + "loss": 1.8392, + "step": 68325 + }, + { + "epoch": 0.13308199517835792, + "grad_norm": 2.372753858566284, + "learning_rate": 2.870799499398747e-05, + "loss": 1.8749, + "step": 68340 + }, + { + "epoch": 0.13311120544946817, + "grad_norm": 2.760868787765503, + "learning_rate": 2.8707436055727576e-05, + "loss": 1.9776, + "step": 68355 + }, + { + "epoch": 0.13314041572057844, + "grad_norm": 2.2490272521972656, + "learning_rate": 2.870687700203544e-05, + "loss": 1.9387, + "step": 68370 + }, + { + "epoch": 0.13316962599168872, + "grad_norm": 5.308937072753906, + "learning_rate": 2.8706317832915764e-05, + "loss": 1.8691, + "step": 68385 + }, + { + "epoch": 0.13319883626279896, + "grad_norm": 3.532862901687622, + "learning_rate": 2.8705758548373257e-05, + "loss": 1.878, + "step": 68400 + }, + { + "epoch": 0.13322804653390924, + "grad_norm": 3.7335612773895264, + "learning_rate": 2.8705199148412632e-05, + "loss": 1.8424, + "step": 68415 + }, + { + "epoch": 0.13325725680501949, + "grad_norm": 3.213212013244629, + "learning_rate": 2.8704639633038594e-05, + "loss": 1.9356, + "step": 68430 + }, + { + "epoch": 0.13328646707612976, + "grad_norm": 5.062726974487305, + "learning_rate": 2.870408000225586e-05, + "loss": 1.8243, + "step": 68445 + }, + { + "epoch": 0.13331567734724, + "grad_norm": 3.6996593475341797, + "learning_rate": 2.870352025606914e-05, + "loss": 2.0953, + "step": 68460 + }, + { + "epoch": 0.13334488761835028, + "grad_norm": 3.797618865966797, + "learning_rate": 2.8702960394483152e-05, + "loss": 1.7371, + "step": 68475 + }, + { + "epoch": 0.13337409788946056, + "grad_norm": 2.501950740814209, + "learning_rate": 2.8702400417502606e-05, + "loss": 1.9201, + "step": 68490 + }, + { + "epoch": 0.1334033081605708, + "grad_norm": 4.612263202667236, + "learning_rate": 2.8701840325132217e-05, + "loss": 1.9872, + "step": 68505 + }, + { + "epoch": 0.13343251843168108, + "grad_norm": 3.0579094886779785, + "learning_rate": 2.87012801173767e-05, + "loss": 1.7953, + "step": 68520 + }, + { + "epoch": 0.13346172870279133, + "grad_norm": 2.8914904594421387, + "learning_rate": 2.8700719794240782e-05, + "loss": 1.8652, + "step": 68535 + }, + { + "epoch": 0.1334909389739016, + "grad_norm": 3.4576163291931152, + "learning_rate": 2.8700159355729174e-05, + "loss": 1.9123, + "step": 68550 + }, + { + "epoch": 0.13352014924501185, + "grad_norm": 4.8904547691345215, + "learning_rate": 2.86995988018466e-05, + "loss": 1.8009, + "step": 68565 + }, + { + "epoch": 0.13354935951612212, + "grad_norm": 2.8669166564941406, + "learning_rate": 2.869903813259777e-05, + "loss": 1.8889, + "step": 68580 + }, + { + "epoch": 0.1335785697872324, + "grad_norm": 5.823536396026611, + "learning_rate": 2.869847734798742e-05, + "loss": 1.9386, + "step": 68595 + }, + { + "epoch": 0.13360778005834265, + "grad_norm": 3.583139181137085, + "learning_rate": 2.869791644802026e-05, + "loss": 1.7596, + "step": 68610 + }, + { + "epoch": 0.13363699032945292, + "grad_norm": 2.540173292160034, + "learning_rate": 2.8697355432701026e-05, + "loss": 1.7587, + "step": 68625 + }, + { + "epoch": 0.13366620060056317, + "grad_norm": 1.6857051849365234, + "learning_rate": 2.8696794302034432e-05, + "loss": 1.9945, + "step": 68640 + }, + { + "epoch": 0.13369541087167344, + "grad_norm": 3.5034568309783936, + "learning_rate": 2.8696233056025207e-05, + "loss": 1.952, + "step": 68655 + }, + { + "epoch": 0.1337246211427837, + "grad_norm": 1.9393774271011353, + "learning_rate": 2.8695671694678077e-05, + "loss": 1.9962, + "step": 68670 + }, + { + "epoch": 0.13375383141389396, + "grad_norm": 4.289730548858643, + "learning_rate": 2.8695110217997766e-05, + "loss": 1.7797, + "step": 68685 + }, + { + "epoch": 0.13378304168500424, + "grad_norm": 2.334529161453247, + "learning_rate": 2.869454862598901e-05, + "loss": 2.0003, + "step": 68700 + }, + { + "epoch": 0.1338122519561145, + "grad_norm": 1.9498342275619507, + "learning_rate": 2.8693986918656537e-05, + "loss": 1.654, + "step": 68715 + }, + { + "epoch": 0.13384146222722476, + "grad_norm": 2.5142414569854736, + "learning_rate": 2.869342509600507e-05, + "loss": 1.8526, + "step": 68730 + }, + { + "epoch": 0.133870672498335, + "grad_norm": 5.777437686920166, + "learning_rate": 2.8692863158039343e-05, + "loss": 1.9045, + "step": 68745 + }, + { + "epoch": 0.13389988276944528, + "grad_norm": 3.1715190410614014, + "learning_rate": 2.869230110476409e-05, + "loss": 1.7892, + "step": 68760 + }, + { + "epoch": 0.13392909304055553, + "grad_norm": 2.183556079864502, + "learning_rate": 2.8691738936184045e-05, + "loss": 1.9545, + "step": 68775 + }, + { + "epoch": 0.1339583033116658, + "grad_norm": 3.0973060131073, + "learning_rate": 2.8691176652303944e-05, + "loss": 1.9848, + "step": 68790 + }, + { + "epoch": 0.13398751358277608, + "grad_norm": 3.7649102210998535, + "learning_rate": 2.8690614253128513e-05, + "loss": 1.8817, + "step": 68805 + }, + { + "epoch": 0.13401672385388633, + "grad_norm": 2.284191608428955, + "learning_rate": 2.8690051738662496e-05, + "loss": 1.8006, + "step": 68820 + }, + { + "epoch": 0.1340459341249966, + "grad_norm": 2.5201404094696045, + "learning_rate": 2.8689489108910626e-05, + "loss": 1.9512, + "step": 68835 + }, + { + "epoch": 0.13407514439610685, + "grad_norm": 3.9927327632904053, + "learning_rate": 2.8688926363877646e-05, + "loss": 1.5578, + "step": 68850 + }, + { + "epoch": 0.13410435466721712, + "grad_norm": 2.6085221767425537, + "learning_rate": 2.868836350356829e-05, + "loss": 1.9129, + "step": 68865 + }, + { + "epoch": 0.13413356493832737, + "grad_norm": 2.84348201751709, + "learning_rate": 2.86878005279873e-05, + "loss": 2.0387, + "step": 68880 + }, + { + "epoch": 0.13416277520943765, + "grad_norm": 3.226632595062256, + "learning_rate": 2.868723743713942e-05, + "loss": 1.8344, + "step": 68895 + }, + { + "epoch": 0.13419198548054792, + "grad_norm": 3.2206411361694336, + "learning_rate": 2.8686674231029385e-05, + "loss": 1.7023, + "step": 68910 + }, + { + "epoch": 0.13422119575165817, + "grad_norm": 2.05230975151062, + "learning_rate": 2.868611090966194e-05, + "loss": 1.8692, + "step": 68925 + }, + { + "epoch": 0.13425040602276844, + "grad_norm": 2.591548442840576, + "learning_rate": 2.8685547473041833e-05, + "loss": 1.8863, + "step": 68940 + }, + { + "epoch": 0.1342796162938787, + "grad_norm": 3.931533098220825, + "learning_rate": 2.8684983921173806e-05, + "loss": 1.9342, + "step": 68955 + }, + { + "epoch": 0.13430882656498896, + "grad_norm": 2.482069969177246, + "learning_rate": 2.8684420254062604e-05, + "loss": 1.642, + "step": 68970 + }, + { + "epoch": 0.1343380368360992, + "grad_norm": 3.0767180919647217, + "learning_rate": 2.8683856471712975e-05, + "loss": 1.789, + "step": 68985 + }, + { + "epoch": 0.1343672471072095, + "grad_norm": 3.8090362548828125, + "learning_rate": 2.8683292574129664e-05, + "loss": 1.7652, + "step": 69000 + }, + { + "epoch": 0.13439645737831973, + "grad_norm": 3.0935914516448975, + "learning_rate": 2.868272856131742e-05, + "loss": 1.9881, + "step": 69015 + }, + { + "epoch": 0.13442566764943, + "grad_norm": 1.9060760736465454, + "learning_rate": 2.8682164433280998e-05, + "loss": 1.8441, + "step": 69030 + }, + { + "epoch": 0.13445487792054028, + "grad_norm": 3.2525599002838135, + "learning_rate": 2.8681600190025147e-05, + "loss": 1.9291, + "step": 69045 + }, + { + "epoch": 0.13448408819165053, + "grad_norm": 4.030635356903076, + "learning_rate": 2.8681035831554614e-05, + "loss": 1.8524, + "step": 69060 + }, + { + "epoch": 0.1345132984627608, + "grad_norm": 3.5617623329162598, + "learning_rate": 2.8680471357874153e-05, + "loss": 1.9179, + "step": 69075 + }, + { + "epoch": 0.13454250873387105, + "grad_norm": 3.4556782245635986, + "learning_rate": 2.8679906768988523e-05, + "loss": 1.9186, + "step": 69090 + }, + { + "epoch": 0.13457171900498133, + "grad_norm": 3.6172924041748047, + "learning_rate": 2.8679342064902466e-05, + "loss": 2.0415, + "step": 69105 + }, + { + "epoch": 0.13460092927609157, + "grad_norm": 2.401454448699951, + "learning_rate": 2.867877724562075e-05, + "loss": 1.8363, + "step": 69120 + }, + { + "epoch": 0.13463013954720185, + "grad_norm": 4.511697769165039, + "learning_rate": 2.867821231114813e-05, + "loss": 1.8174, + "step": 69135 + }, + { + "epoch": 0.13465934981831212, + "grad_norm": 4.811645030975342, + "learning_rate": 2.8677647261489357e-05, + "loss": 2.0242, + "step": 69150 + }, + { + "epoch": 0.13468856008942237, + "grad_norm": 4.66762638092041, + "learning_rate": 2.867708209664919e-05, + "loss": 1.7449, + "step": 69165 + }, + { + "epoch": 0.13471777036053265, + "grad_norm": 1.8615055084228516, + "learning_rate": 2.8676516816632395e-05, + "loss": 1.9931, + "step": 69180 + }, + { + "epoch": 0.1347469806316429, + "grad_norm": 2.8960793018341064, + "learning_rate": 2.867595142144373e-05, + "loss": 1.9353, + "step": 69195 + }, + { + "epoch": 0.13477619090275317, + "grad_norm": 2.673841953277588, + "learning_rate": 2.8675385911087953e-05, + "loss": 1.8314, + "step": 69210 + }, + { + "epoch": 0.13480540117386342, + "grad_norm": 4.322967529296875, + "learning_rate": 2.8674820285569826e-05, + "loss": 1.8198, + "step": 69225 + }, + { + "epoch": 0.1348346114449737, + "grad_norm": 2.4058098793029785, + "learning_rate": 2.8674254544894115e-05, + "loss": 1.8773, + "step": 69240 + }, + { + "epoch": 0.13486382171608396, + "grad_norm": 2.5986292362213135, + "learning_rate": 2.8673688689065582e-05, + "loss": 1.8527, + "step": 69255 + }, + { + "epoch": 0.1348930319871942, + "grad_norm": 3.6617393493652344, + "learning_rate": 2.8673122718088998e-05, + "loss": 1.8906, + "step": 69270 + }, + { + "epoch": 0.1349222422583045, + "grad_norm": 2.539179563522339, + "learning_rate": 2.8672556631969117e-05, + "loss": 1.8227, + "step": 69285 + }, + { + "epoch": 0.13495145252941473, + "grad_norm": 3.938680648803711, + "learning_rate": 2.867199043071072e-05, + "loss": 2.0156, + "step": 69300 + }, + { + "epoch": 0.134980662800525, + "grad_norm": 2.546804428100586, + "learning_rate": 2.867142411431857e-05, + "loss": 2.089, + "step": 69315 + }, + { + "epoch": 0.13500987307163526, + "grad_norm": 4.415302276611328, + "learning_rate": 2.8670857682797432e-05, + "loss": 1.7945, + "step": 69330 + }, + { + "epoch": 0.13503908334274553, + "grad_norm": 2.914358615875244, + "learning_rate": 2.8670291136152077e-05, + "loss": 1.7852, + "step": 69345 + }, + { + "epoch": 0.1350682936138558, + "grad_norm": 2.9381892681121826, + "learning_rate": 2.8669724474387277e-05, + "loss": 1.8429, + "step": 69360 + }, + { + "epoch": 0.13509750388496605, + "grad_norm": 4.21970272064209, + "learning_rate": 2.8669157697507806e-05, + "loss": 1.896, + "step": 69375 + }, + { + "epoch": 0.13512671415607633, + "grad_norm": 3.651648759841919, + "learning_rate": 2.8668590805518434e-05, + "loss": 2.0095, + "step": 69390 + }, + { + "epoch": 0.13515592442718657, + "grad_norm": 2.8745598793029785, + "learning_rate": 2.8668023798423942e-05, + "loss": 1.8756, + "step": 69405 + }, + { + "epoch": 0.13518513469829685, + "grad_norm": 2.494096517562866, + "learning_rate": 2.8667456676229095e-05, + "loss": 1.902, + "step": 69420 + }, + { + "epoch": 0.1352143449694071, + "grad_norm": 3.6382057666778564, + "learning_rate": 2.8666889438938675e-05, + "loss": 1.7773, + "step": 69435 + }, + { + "epoch": 0.13524355524051737, + "grad_norm": 3.727525234222412, + "learning_rate": 2.8666322086557457e-05, + "loss": 1.9321, + "step": 69450 + }, + { + "epoch": 0.13527276551162765, + "grad_norm": 3.038733720779419, + "learning_rate": 2.8665754619090215e-05, + "loss": 1.8836, + "step": 69465 + }, + { + "epoch": 0.1353019757827379, + "grad_norm": 2.4349241256713867, + "learning_rate": 2.8665187036541736e-05, + "loss": 1.8838, + "step": 69480 + }, + { + "epoch": 0.13533118605384817, + "grad_norm": 3.3281540870666504, + "learning_rate": 2.8664619338916793e-05, + "loss": 1.792, + "step": 69495 + }, + { + "epoch": 0.13536039632495842, + "grad_norm": 4.144216537475586, + "learning_rate": 2.8664051526220165e-05, + "loss": 1.9561, + "step": 69510 + }, + { + "epoch": 0.1353896065960687, + "grad_norm": 4.028075218200684, + "learning_rate": 2.8663483598456642e-05, + "loss": 1.8629, + "step": 69525 + }, + { + "epoch": 0.13541881686717894, + "grad_norm": 2.8389856815338135, + "learning_rate": 2.8662915555631e-05, + "loss": 1.76, + "step": 69540 + }, + { + "epoch": 0.1354480271382892, + "grad_norm": 2.8980984687805176, + "learning_rate": 2.8662347397748027e-05, + "loss": 1.8237, + "step": 69555 + }, + { + "epoch": 0.1354772374093995, + "grad_norm": 4.472830772399902, + "learning_rate": 2.86617791248125e-05, + "loss": 1.8809, + "step": 69570 + }, + { + "epoch": 0.13550644768050973, + "grad_norm": 2.4350852966308594, + "learning_rate": 2.8661210736829214e-05, + "loss": 1.8251, + "step": 69585 + }, + { + "epoch": 0.13553565795162, + "grad_norm": 3.9808621406555176, + "learning_rate": 2.866064223380295e-05, + "loss": 1.814, + "step": 69600 + }, + { + "epoch": 0.13556486822273026, + "grad_norm": 2.541930913925171, + "learning_rate": 2.86600736157385e-05, + "loss": 1.9075, + "step": 69615 + }, + { + "epoch": 0.13559407849384053, + "grad_norm": 3.6149306297302246, + "learning_rate": 2.8659504882640644e-05, + "loss": 1.9216, + "step": 69630 + }, + { + "epoch": 0.13562328876495078, + "grad_norm": 3.375601053237915, + "learning_rate": 2.8658936034514173e-05, + "loss": 1.7895, + "step": 69645 + }, + { + "epoch": 0.13565249903606105, + "grad_norm": 2.1346988677978516, + "learning_rate": 2.8658367071363887e-05, + "loss": 1.8464, + "step": 69660 + }, + { + "epoch": 0.13568170930717133, + "grad_norm": 3.806905746459961, + "learning_rate": 2.8657797993194568e-05, + "loss": 1.8165, + "step": 69675 + }, + { + "epoch": 0.13571091957828157, + "grad_norm": 4.392396926879883, + "learning_rate": 2.865722880001101e-05, + "loss": 1.9268, + "step": 69690 + }, + { + "epoch": 0.13574012984939185, + "grad_norm": 3.2058327198028564, + "learning_rate": 2.8656659491818013e-05, + "loss": 1.6949, + "step": 69705 + }, + { + "epoch": 0.1357693401205021, + "grad_norm": 6.982804775238037, + "learning_rate": 2.865609006862036e-05, + "loss": 1.85, + "step": 69720 + }, + { + "epoch": 0.13579855039161237, + "grad_norm": 4.272575855255127, + "learning_rate": 2.8655520530422855e-05, + "loss": 1.7517, + "step": 69735 + }, + { + "epoch": 0.13582776066272262, + "grad_norm": 3.99265718460083, + "learning_rate": 2.865495087723029e-05, + "loss": 1.7822, + "step": 69750 + }, + { + "epoch": 0.1358569709338329, + "grad_norm": 5.301887035369873, + "learning_rate": 2.865438110904746e-05, + "loss": 2.0515, + "step": 69765 + }, + { + "epoch": 0.13588618120494317, + "grad_norm": 3.1535747051239014, + "learning_rate": 2.8653811225879174e-05, + "loss": 1.7036, + "step": 69780 + }, + { + "epoch": 0.13591539147605342, + "grad_norm": 3.844027280807495, + "learning_rate": 2.865324122773022e-05, + "loss": 1.7941, + "step": 69795 + }, + { + "epoch": 0.1359446017471637, + "grad_norm": 2.608506917953491, + "learning_rate": 2.86526711146054e-05, + "loss": 2.0675, + "step": 69810 + }, + { + "epoch": 0.13597381201827394, + "grad_norm": 4.972315311431885, + "learning_rate": 2.8652100886509517e-05, + "loss": 1.7046, + "step": 69825 + }, + { + "epoch": 0.1360030222893842, + "grad_norm": 2.5615687370300293, + "learning_rate": 2.865153054344737e-05, + "loss": 1.8108, + "step": 69840 + }, + { + "epoch": 0.13603223256049446, + "grad_norm": 3.6874258518218994, + "learning_rate": 2.8650960085423765e-05, + "loss": 1.8509, + "step": 69855 + }, + { + "epoch": 0.13606144283160473, + "grad_norm": 4.713318347930908, + "learning_rate": 2.8650389512443508e-05, + "loss": 1.8962, + "step": 69870 + }, + { + "epoch": 0.136090653102715, + "grad_norm": 1.7649924755096436, + "learning_rate": 2.86498188245114e-05, + "loss": 1.9218, + "step": 69885 + }, + { + "epoch": 0.13611986337382526, + "grad_norm": 5.523397922515869, + "learning_rate": 2.8649248021632244e-05, + "loss": 1.8151, + "step": 69900 + }, + { + "epoch": 0.13614907364493553, + "grad_norm": 3.470808982849121, + "learning_rate": 2.8648677103810856e-05, + "loss": 1.7345, + "step": 69915 + }, + { + "epoch": 0.13617828391604578, + "grad_norm": 3.577054262161255, + "learning_rate": 2.8648106071052038e-05, + "loss": 1.9622, + "step": 69930 + }, + { + "epoch": 0.13620749418715605, + "grad_norm": 2.668632745742798, + "learning_rate": 2.8647534923360597e-05, + "loss": 1.7735, + "step": 69945 + }, + { + "epoch": 0.1362367044582663, + "grad_norm": 3.424464225769043, + "learning_rate": 2.8646963660741343e-05, + "loss": 1.9655, + "step": 69960 + }, + { + "epoch": 0.13626591472937657, + "grad_norm": 2.502997636795044, + "learning_rate": 2.8646392283199095e-05, + "loss": 1.9703, + "step": 69975 + }, + { + "epoch": 0.13629512500048685, + "grad_norm": 3.864332675933838, + "learning_rate": 2.8645820790738654e-05, + "loss": 1.8135, + "step": 69990 + }, + { + "epoch": 0.1363243352715971, + "grad_norm": 2.123206377029419, + "learning_rate": 2.8645249183364834e-05, + "loss": 2.0135, + "step": 70005 + }, + { + "epoch": 0.13635354554270737, + "grad_norm": 4.530364513397217, + "learning_rate": 2.8644677461082457e-05, + "loss": 1.7662, + "step": 70020 + }, + { + "epoch": 0.13638275581381762, + "grad_norm": 2.3569891452789307, + "learning_rate": 2.8644105623896333e-05, + "loss": 1.7531, + "step": 70035 + }, + { + "epoch": 0.1364119660849279, + "grad_norm": 3.978269100189209, + "learning_rate": 2.864353367181127e-05, + "loss": 1.755, + "step": 70050 + }, + { + "epoch": 0.13644117635603814, + "grad_norm": 2.6547672748565674, + "learning_rate": 2.8642961604832096e-05, + "loss": 1.8935, + "step": 70065 + }, + { + "epoch": 0.13647038662714842, + "grad_norm": 2.4695494174957275, + "learning_rate": 2.864238942296362e-05, + "loss": 1.8694, + "step": 70080 + }, + { + "epoch": 0.1364995968982587, + "grad_norm": 2.127756357192993, + "learning_rate": 2.8641817126210668e-05, + "loss": 1.7671, + "step": 70095 + }, + { + "epoch": 0.13652880716936894, + "grad_norm": 3.58125901222229, + "learning_rate": 2.864124471457805e-05, + "loss": 1.8248, + "step": 70110 + }, + { + "epoch": 0.1365580174404792, + "grad_norm": 2.0331993103027344, + "learning_rate": 2.86406721880706e-05, + "loss": 2.1161, + "step": 70125 + }, + { + "epoch": 0.13658722771158946, + "grad_norm": 2.5416359901428223, + "learning_rate": 2.8640099546693127e-05, + "loss": 1.837, + "step": 70140 + }, + { + "epoch": 0.13661643798269973, + "grad_norm": 3.407165765762329, + "learning_rate": 2.8639526790450455e-05, + "loss": 1.8438, + "step": 70155 + }, + { + "epoch": 0.13664564825380998, + "grad_norm": 2.5498344898223877, + "learning_rate": 2.863895391934741e-05, + "loss": 2.008, + "step": 70170 + }, + { + "epoch": 0.13667485852492026, + "grad_norm": 3.6924312114715576, + "learning_rate": 2.8638380933388816e-05, + "loss": 1.8779, + "step": 70185 + }, + { + "epoch": 0.13670406879603053, + "grad_norm": 3.1848909854888916, + "learning_rate": 2.86378078325795e-05, + "loss": 2.0103, + "step": 70200 + }, + { + "epoch": 0.13673327906714078, + "grad_norm": 3.5813753604888916, + "learning_rate": 2.8637234616924288e-05, + "loss": 1.6701, + "step": 70215 + }, + { + "epoch": 0.13676248933825105, + "grad_norm": 3.0269784927368164, + "learning_rate": 2.8636661286428003e-05, + "loss": 1.8589, + "step": 70230 + }, + { + "epoch": 0.1367916996093613, + "grad_norm": 1.8303242921829224, + "learning_rate": 2.8636087841095473e-05, + "loss": 1.7957, + "step": 70245 + }, + { + "epoch": 0.13682090988047158, + "grad_norm": 5.159512042999268, + "learning_rate": 2.863551428093153e-05, + "loss": 2.0476, + "step": 70260 + }, + { + "epoch": 0.13685012015158182, + "grad_norm": 3.9471347332000732, + "learning_rate": 2.8634940605941006e-05, + "loss": 1.8627, + "step": 70275 + }, + { + "epoch": 0.1368793304226921, + "grad_norm": 2.1260244846343994, + "learning_rate": 2.8634366816128725e-05, + "loss": 2.0826, + "step": 70290 + }, + { + "epoch": 0.13690854069380237, + "grad_norm": 3.6818416118621826, + "learning_rate": 2.863379291149953e-05, + "loss": 1.7907, + "step": 70305 + }, + { + "epoch": 0.13693775096491262, + "grad_norm": 3.602248430252075, + "learning_rate": 2.8633218892058245e-05, + "loss": 1.9224, + "step": 70320 + }, + { + "epoch": 0.1369669612360229, + "grad_norm": 3.4093058109283447, + "learning_rate": 2.86326447578097e-05, + "loss": 1.7702, + "step": 70335 + }, + { + "epoch": 0.13699617150713314, + "grad_norm": 1.6547880172729492, + "learning_rate": 2.8632070508758743e-05, + "loss": 1.8019, + "step": 70350 + }, + { + "epoch": 0.13702538177824342, + "grad_norm": 2.2754833698272705, + "learning_rate": 2.86314961449102e-05, + "loss": 1.826, + "step": 70365 + }, + { + "epoch": 0.13705459204935366, + "grad_norm": 3.2182445526123047, + "learning_rate": 2.8630921666268918e-05, + "loss": 1.8688, + "step": 70380 + }, + { + "epoch": 0.13708380232046394, + "grad_norm": 2.528761625289917, + "learning_rate": 2.863034707283972e-05, + "loss": 1.911, + "step": 70395 + }, + { + "epoch": 0.1371130125915742, + "grad_norm": 2.8982722759246826, + "learning_rate": 2.8629772364627455e-05, + "loss": 2.0188, + "step": 70410 + }, + { + "epoch": 0.13714222286268446, + "grad_norm": 2.3159430027008057, + "learning_rate": 2.8629197541636955e-05, + "loss": 1.8784, + "step": 70425 + }, + { + "epoch": 0.13717143313379473, + "grad_norm": 3.244826555252075, + "learning_rate": 2.862862260387307e-05, + "loss": 1.9028, + "step": 70440 + }, + { + "epoch": 0.13720064340490498, + "grad_norm": 3.5103042125701904, + "learning_rate": 2.8628047551340636e-05, + "loss": 1.736, + "step": 70455 + }, + { + "epoch": 0.13722985367601526, + "grad_norm": 3.8966267108917236, + "learning_rate": 2.86274723840445e-05, + "loss": 1.762, + "step": 70470 + }, + { + "epoch": 0.1372590639471255, + "grad_norm": 2.7817652225494385, + "learning_rate": 2.86268971019895e-05, + "loss": 1.7712, + "step": 70485 + }, + { + "epoch": 0.13728827421823578, + "grad_norm": 5.372223854064941, + "learning_rate": 2.862632170518048e-05, + "loss": 1.8165, + "step": 70500 + }, + { + "epoch": 0.13731748448934605, + "grad_norm": 2.7599360942840576, + "learning_rate": 2.862574619362229e-05, + "loss": 2.0248, + "step": 70515 + }, + { + "epoch": 0.1373466947604563, + "grad_norm": 3.2481796741485596, + "learning_rate": 2.862517056731978e-05, + "loss": 1.797, + "step": 70530 + }, + { + "epoch": 0.13737590503156658, + "grad_norm": 3.2481703758239746, + "learning_rate": 2.8624594826277784e-05, + "loss": 1.6762, + "step": 70545 + }, + { + "epoch": 0.13740511530267682, + "grad_norm": 5.844207763671875, + "learning_rate": 2.862401897050117e-05, + "loss": 1.8943, + "step": 70560 + }, + { + "epoch": 0.1374343255737871, + "grad_norm": 3.037783145904541, + "learning_rate": 2.8623442999994767e-05, + "loss": 1.9261, + "step": 70575 + }, + { + "epoch": 0.13746353584489734, + "grad_norm": 2.8143088817596436, + "learning_rate": 2.8622866914763433e-05, + "loss": 1.9345, + "step": 70590 + }, + { + "epoch": 0.13749274611600762, + "grad_norm": 2.1399495601654053, + "learning_rate": 2.862229071481203e-05, + "loss": 1.7769, + "step": 70605 + }, + { + "epoch": 0.1375219563871179, + "grad_norm": 3.7307958602905273, + "learning_rate": 2.8621714400145394e-05, + "loss": 1.7935, + "step": 70620 + }, + { + "epoch": 0.13755116665822814, + "grad_norm": 2.177015781402588, + "learning_rate": 2.8621137970768385e-05, + "loss": 1.8755, + "step": 70635 + }, + { + "epoch": 0.13758037692933842, + "grad_norm": 3.256514310836792, + "learning_rate": 2.862056142668586e-05, + "loss": 2.0177, + "step": 70650 + }, + { + "epoch": 0.13760958720044866, + "grad_norm": 3.078580141067505, + "learning_rate": 2.861998476790267e-05, + "loss": 1.8304, + "step": 70665 + }, + { + "epoch": 0.13763879747155894, + "grad_norm": 3.839451313018799, + "learning_rate": 2.8619407994423675e-05, + "loss": 2.1288, + "step": 70680 + }, + { + "epoch": 0.13766800774266919, + "grad_norm": 2.0073304176330566, + "learning_rate": 2.861883110625373e-05, + "loss": 1.8201, + "step": 70695 + }, + { + "epoch": 0.13769721801377946, + "grad_norm": 2.4279634952545166, + "learning_rate": 2.861825410339769e-05, + "loss": 1.917, + "step": 70710 + }, + { + "epoch": 0.13772642828488973, + "grad_norm": 2.3362443447113037, + "learning_rate": 2.8617676985860417e-05, + "loss": 2.0894, + "step": 70725 + }, + { + "epoch": 0.13775563855599998, + "grad_norm": 3.2584242820739746, + "learning_rate": 2.8617099753646773e-05, + "loss": 1.8502, + "step": 70740 + }, + { + "epoch": 0.13778484882711026, + "grad_norm": 3.9391400814056396, + "learning_rate": 2.8616522406761616e-05, + "loss": 1.9063, + "step": 70755 + }, + { + "epoch": 0.1378140590982205, + "grad_norm": 2.10555362701416, + "learning_rate": 2.8615944945209807e-05, + "loss": 1.7109, + "step": 70770 + }, + { + "epoch": 0.13784326936933078, + "grad_norm": 1.9338374137878418, + "learning_rate": 2.861536736899621e-05, + "loss": 1.818, + "step": 70785 + }, + { + "epoch": 0.13787247964044103, + "grad_norm": 3.0186808109283447, + "learning_rate": 2.8614789678125693e-05, + "loss": 1.7597, + "step": 70800 + }, + { + "epoch": 0.1379016899115513, + "grad_norm": 4.825221538543701, + "learning_rate": 2.8614211872603115e-05, + "loss": 1.884, + "step": 70815 + }, + { + "epoch": 0.13793090018266158, + "grad_norm": 3.219672918319702, + "learning_rate": 2.8613633952433347e-05, + "loss": 1.7872, + "step": 70830 + }, + { + "epoch": 0.13796011045377182, + "grad_norm": 3.1833243370056152, + "learning_rate": 2.861305591762125e-05, + "loss": 1.9234, + "step": 70845 + }, + { + "epoch": 0.1379893207248821, + "grad_norm": 2.361645460128784, + "learning_rate": 2.8612477768171694e-05, + "loss": 1.9496, + "step": 70860 + }, + { + "epoch": 0.13801853099599234, + "grad_norm": 3.5741446018218994, + "learning_rate": 2.861189950408955e-05, + "loss": 1.8476, + "step": 70875 + }, + { + "epoch": 0.13804774126710262, + "grad_norm": 3.198500156402588, + "learning_rate": 2.8611321125379686e-05, + "loss": 1.8414, + "step": 70890 + }, + { + "epoch": 0.13807695153821287, + "grad_norm": 2.563737392425537, + "learning_rate": 2.861074263204697e-05, + "loss": 1.8466, + "step": 70905 + }, + { + "epoch": 0.13810616180932314, + "grad_norm": 3.325080394744873, + "learning_rate": 2.861016402409628e-05, + "loss": 1.9971, + "step": 70920 + }, + { + "epoch": 0.13813537208043342, + "grad_norm": 3.8434271812438965, + "learning_rate": 2.860958530153248e-05, + "loss": 1.7892, + "step": 70935 + }, + { + "epoch": 0.13816458235154366, + "grad_norm": 3.9635753631591797, + "learning_rate": 2.860900646436045e-05, + "loss": 1.7376, + "step": 70950 + }, + { + "epoch": 0.13819379262265394, + "grad_norm": 3.949159622192383, + "learning_rate": 2.8608427512585064e-05, + "loss": 1.8708, + "step": 70965 + }, + { + "epoch": 0.13822300289376419, + "grad_norm": 2.4773647785186768, + "learning_rate": 2.8607848446211196e-05, + "loss": 2.0197, + "step": 70980 + }, + { + "epoch": 0.13825221316487446, + "grad_norm": 3.007659673690796, + "learning_rate": 2.860726926524372e-05, + "loss": 1.7678, + "step": 70995 + }, + { + "epoch": 0.1382814234359847, + "grad_norm": 2.3068344593048096, + "learning_rate": 2.8606689969687518e-05, + "loss": 1.838, + "step": 71010 + }, + { + "epoch": 0.13831063370709498, + "grad_norm": 3.94028377532959, + "learning_rate": 2.8606110559547466e-05, + "loss": 1.8505, + "step": 71025 + }, + { + "epoch": 0.13833984397820526, + "grad_norm": 3.5514869689941406, + "learning_rate": 2.8605531034828437e-05, + "loss": 1.9161, + "step": 71040 + }, + { + "epoch": 0.1383690542493155, + "grad_norm": 3.0559253692626953, + "learning_rate": 2.8604951395535324e-05, + "loss": 1.951, + "step": 71055 + }, + { + "epoch": 0.13839826452042578, + "grad_norm": 3.690962314605713, + "learning_rate": 2.8604371641673005e-05, + "loss": 1.7942, + "step": 71070 + }, + { + "epoch": 0.13842747479153603, + "grad_norm": 1.867956280708313, + "learning_rate": 2.8603791773246353e-05, + "loss": 1.6268, + "step": 71085 + }, + { + "epoch": 0.1384566850626463, + "grad_norm": 3.255117416381836, + "learning_rate": 2.8603211790260257e-05, + "loss": 1.8889, + "step": 71100 + }, + { + "epoch": 0.13848589533375655, + "grad_norm": 2.8291800022125244, + "learning_rate": 2.8602631692719605e-05, + "loss": 1.9543, + "step": 71115 + }, + { + "epoch": 0.13851510560486682, + "grad_norm": 3.5518085956573486, + "learning_rate": 2.8602051480629274e-05, + "loss": 2.0465, + "step": 71130 + }, + { + "epoch": 0.13854431587597707, + "grad_norm": 3.7115893363952637, + "learning_rate": 2.860147115399416e-05, + "loss": 1.7979, + "step": 71145 + }, + { + "epoch": 0.13857352614708734, + "grad_norm": 2.1558685302734375, + "learning_rate": 2.860089071281914e-05, + "loss": 1.7327, + "step": 71160 + }, + { + "epoch": 0.13860273641819762, + "grad_norm": 3.2817349433898926, + "learning_rate": 2.8600310157109106e-05, + "loss": 1.6618, + "step": 71175 + }, + { + "epoch": 0.13863194668930787, + "grad_norm": 2.672372579574585, + "learning_rate": 2.859972948686895e-05, + "loss": 2.0392, + "step": 71190 + }, + { + "epoch": 0.13866115696041814, + "grad_norm": 3.65826153755188, + "learning_rate": 2.859914870210356e-05, + "loss": 1.9047, + "step": 71205 + }, + { + "epoch": 0.1386903672315284, + "grad_norm": 3.3695216178894043, + "learning_rate": 2.8598567802817825e-05, + "loss": 2.0386, + "step": 71220 + }, + { + "epoch": 0.13871957750263866, + "grad_norm": 5.822972297668457, + "learning_rate": 2.8597986789016633e-05, + "loss": 1.6654, + "step": 71235 + }, + { + "epoch": 0.1387487877737489, + "grad_norm": 3.310852527618408, + "learning_rate": 2.8597405660704888e-05, + "loss": 1.9656, + "step": 71250 + }, + { + "epoch": 0.13877799804485919, + "grad_norm": 2.0551564693450928, + "learning_rate": 2.8596824417887475e-05, + "loss": 1.9576, + "step": 71265 + }, + { + "epoch": 0.13880720831596946, + "grad_norm": 2.6878106594085693, + "learning_rate": 2.8596243060569293e-05, + "loss": 1.9214, + "step": 71280 + }, + { + "epoch": 0.1388364185870797, + "grad_norm": 3.410797119140625, + "learning_rate": 2.8595661588755235e-05, + "loss": 1.8431, + "step": 71295 + }, + { + "epoch": 0.13886562885818998, + "grad_norm": 3.6820807456970215, + "learning_rate": 2.85950800024502e-05, + "loss": 1.8654, + "step": 71310 + }, + { + "epoch": 0.13889483912930023, + "grad_norm": 2.0582315921783447, + "learning_rate": 2.8594498301659085e-05, + "loss": 1.9418, + "step": 71325 + }, + { + "epoch": 0.1389240494004105, + "grad_norm": 3.8680450916290283, + "learning_rate": 2.8593916486386784e-05, + "loss": 1.7728, + "step": 71340 + }, + { + "epoch": 0.13895325967152075, + "grad_norm": 3.46667742729187, + "learning_rate": 2.8593334556638204e-05, + "loss": 1.9932, + "step": 71355 + }, + { + "epoch": 0.13898246994263103, + "grad_norm": 2.202939510345459, + "learning_rate": 2.8592752512418233e-05, + "loss": 1.9344, + "step": 71370 + }, + { + "epoch": 0.1390116802137413, + "grad_norm": 2.7220945358276367, + "learning_rate": 2.859217035373179e-05, + "loss": 2.0671, + "step": 71385 + }, + { + "epoch": 0.13904089048485155, + "grad_norm": 2.4224140644073486, + "learning_rate": 2.859158808058377e-05, + "loss": 1.9264, + "step": 71400 + }, + { + "epoch": 0.13907010075596182, + "grad_norm": 2.667553186416626, + "learning_rate": 2.8591005692979072e-05, + "loss": 1.839, + "step": 71415 + }, + { + "epoch": 0.13909931102707207, + "grad_norm": 2.7434866428375244, + "learning_rate": 2.85904231909226e-05, + "loss": 1.9893, + "step": 71430 + }, + { + "epoch": 0.13912852129818234, + "grad_norm": 4.503064155578613, + "learning_rate": 2.858984057441927e-05, + "loss": 1.9206, + "step": 71445 + }, + { + "epoch": 0.1391577315692926, + "grad_norm": 5.560927391052246, + "learning_rate": 2.858925784347398e-05, + "loss": 1.8916, + "step": 71460 + }, + { + "epoch": 0.13918694184040287, + "grad_norm": 3.5854392051696777, + "learning_rate": 2.8588674998091633e-05, + "loss": 1.9012, + "step": 71475 + }, + { + "epoch": 0.13921615211151314, + "grad_norm": 4.194972515106201, + "learning_rate": 2.8588092038277148e-05, + "loss": 1.7605, + "step": 71490 + }, + { + "epoch": 0.1392453623826234, + "grad_norm": 2.6064321994781494, + "learning_rate": 2.8587508964035427e-05, + "loss": 1.92, + "step": 71505 + }, + { + "epoch": 0.13927457265373366, + "grad_norm": 2.2962450981140137, + "learning_rate": 2.858692577537138e-05, + "loss": 2.0298, + "step": 71520 + }, + { + "epoch": 0.1393037829248439, + "grad_norm": 4.937525749206543, + "learning_rate": 2.8586342472289924e-05, + "loss": 1.8273, + "step": 71535 + }, + { + "epoch": 0.13933299319595419, + "grad_norm": 2.760812520980835, + "learning_rate": 2.8585759054795962e-05, + "loss": 1.6509, + "step": 71550 + }, + { + "epoch": 0.13936220346706443, + "grad_norm": 3.2743563652038574, + "learning_rate": 2.8585175522894415e-05, + "loss": 1.8332, + "step": 71565 + }, + { + "epoch": 0.1393914137381747, + "grad_norm": 3.5828170776367188, + "learning_rate": 2.8584591876590195e-05, + "loss": 1.9604, + "step": 71580 + }, + { + "epoch": 0.13942062400928498, + "grad_norm": 3.5931100845336914, + "learning_rate": 2.8584008115888217e-05, + "loss": 2.0369, + "step": 71595 + }, + { + "epoch": 0.13944983428039523, + "grad_norm": 3.753509521484375, + "learning_rate": 2.8583424240793395e-05, + "loss": 1.9286, + "step": 71610 + }, + { + "epoch": 0.1394790445515055, + "grad_norm": 2.669769525527954, + "learning_rate": 2.8582840251310647e-05, + "loss": 1.7617, + "step": 71625 + }, + { + "epoch": 0.13950825482261575, + "grad_norm": 3.6030869483947754, + "learning_rate": 2.858225614744489e-05, + "loss": 1.7366, + "step": 71640 + }, + { + "epoch": 0.13953746509372603, + "grad_norm": 4.630536079406738, + "learning_rate": 2.8581671929201044e-05, + "loss": 2.0326, + "step": 71655 + }, + { + "epoch": 0.13956667536483627, + "grad_norm": 3.303692102432251, + "learning_rate": 2.858108759658403e-05, + "loss": 1.7888, + "step": 71670 + }, + { + "epoch": 0.13959588563594655, + "grad_norm": 3.2129344940185547, + "learning_rate": 2.8580503149598765e-05, + "loss": 1.9526, + "step": 71685 + }, + { + "epoch": 0.13962509590705682, + "grad_norm": 2.228689670562744, + "learning_rate": 2.8579918588250175e-05, + "loss": 1.9626, + "step": 71700 + }, + { + "epoch": 0.13965430617816707, + "grad_norm": 3.8349947929382324, + "learning_rate": 2.857933391254318e-05, + "loss": 1.866, + "step": 71715 + }, + { + "epoch": 0.13968351644927735, + "grad_norm": 4.557125568389893, + "learning_rate": 2.8578749122482704e-05, + "loss": 1.773, + "step": 71730 + }, + { + "epoch": 0.1397127267203876, + "grad_norm": 3.422135591506958, + "learning_rate": 2.857816421807367e-05, + "loss": 1.7933, + "step": 71745 + }, + { + "epoch": 0.13974193699149787, + "grad_norm": 2.453348398208618, + "learning_rate": 2.857757919932101e-05, + "loss": 1.9603, + "step": 71760 + }, + { + "epoch": 0.13977114726260811, + "grad_norm": 3.0494730472564697, + "learning_rate": 2.8576994066229643e-05, + "loss": 1.8024, + "step": 71775 + }, + { + "epoch": 0.1398003575337184, + "grad_norm": 3.5220632553100586, + "learning_rate": 2.8576408818804502e-05, + "loss": 1.9201, + "step": 71790 + }, + { + "epoch": 0.13982956780482866, + "grad_norm": 2.5408124923706055, + "learning_rate": 2.857582345705051e-05, + "loss": 1.8995, + "step": 71805 + }, + { + "epoch": 0.1398587780759389, + "grad_norm": 2.9946658611297607, + "learning_rate": 2.85752379809726e-05, + "loss": 1.8417, + "step": 71820 + }, + { + "epoch": 0.13988798834704919, + "grad_norm": 4.913395881652832, + "learning_rate": 2.85746523905757e-05, + "loss": 1.8443, + "step": 71835 + }, + { + "epoch": 0.13991719861815943, + "grad_norm": 2.408912181854248, + "learning_rate": 2.8574066685864744e-05, + "loss": 1.8901, + "step": 71850 + }, + { + "epoch": 0.1399464088892697, + "grad_norm": 2.9877641201019287, + "learning_rate": 2.8573480866844664e-05, + "loss": 1.9017, + "step": 71865 + }, + { + "epoch": 0.13997561916037995, + "grad_norm": 2.2591705322265625, + "learning_rate": 2.8572894933520394e-05, + "loss": 1.74, + "step": 71880 + }, + { + "epoch": 0.14000482943149023, + "grad_norm": 1.7987804412841797, + "learning_rate": 2.8572308885896865e-05, + "loss": 1.7682, + "step": 71895 + }, + { + "epoch": 0.1400340397026005, + "grad_norm": 2.5721595287323, + "learning_rate": 2.8571722723979016e-05, + "loss": 2.02, + "step": 71910 + }, + { + "epoch": 0.14006324997371075, + "grad_norm": 3.3154850006103516, + "learning_rate": 2.857113644777178e-05, + "loss": 1.993, + "step": 71925 + }, + { + "epoch": 0.14009246024482103, + "grad_norm": 3.714911937713623, + "learning_rate": 2.8570550057280092e-05, + "loss": 1.9497, + "step": 71940 + }, + { + "epoch": 0.14012167051593127, + "grad_norm": 4.850703239440918, + "learning_rate": 2.8569963552508897e-05, + "loss": 2.043, + "step": 71955 + }, + { + "epoch": 0.14015088078704155, + "grad_norm": 4.17022705078125, + "learning_rate": 2.8569376933463128e-05, + "loss": 1.9227, + "step": 71970 + }, + { + "epoch": 0.1401800910581518, + "grad_norm": 3.2848081588745117, + "learning_rate": 2.8568790200147732e-05, + "loss": 1.7643, + "step": 71985 + }, + { + "epoch": 0.14020930132926207, + "grad_norm": 2.454164505004883, + "learning_rate": 2.856820335256764e-05, + "loss": 1.7121, + "step": 72000 + }, + { + "epoch": 0.14023851160037235, + "grad_norm": 2.653320074081421, + "learning_rate": 2.8567616390727802e-05, + "loss": 1.876, + "step": 72015 + }, + { + "epoch": 0.1402677218714826, + "grad_norm": 2.751718521118164, + "learning_rate": 2.856702931463316e-05, + "loss": 1.9042, + "step": 72030 + }, + { + "epoch": 0.14029693214259287, + "grad_norm": 2.1671385765075684, + "learning_rate": 2.856644212428865e-05, + "loss": 1.8015, + "step": 72045 + }, + { + "epoch": 0.14032614241370311, + "grad_norm": 2.423429012298584, + "learning_rate": 2.856585481969923e-05, + "loss": 1.8018, + "step": 72060 + }, + { + "epoch": 0.1403553526848134, + "grad_norm": 5.615732669830322, + "learning_rate": 2.8565267400869834e-05, + "loss": 1.8395, + "step": 72075 + }, + { + "epoch": 0.14038456295592364, + "grad_norm": 2.5066416263580322, + "learning_rate": 2.856467986780541e-05, + "loss": 1.781, + "step": 72090 + }, + { + "epoch": 0.1404137732270339, + "grad_norm": 2.842067241668701, + "learning_rate": 2.8564092220510918e-05, + "loss": 1.9483, + "step": 72105 + }, + { + "epoch": 0.14044298349814419, + "grad_norm": 5.812524795532227, + "learning_rate": 2.856350445899129e-05, + "loss": 1.8804, + "step": 72120 + }, + { + "epoch": 0.14047219376925443, + "grad_norm": 3.507147789001465, + "learning_rate": 2.856291658325149e-05, + "loss": 1.9493, + "step": 72135 + }, + { + "epoch": 0.1405014040403647, + "grad_norm": 2.52659273147583, + "learning_rate": 2.8562328593296458e-05, + "loss": 1.7875, + "step": 72150 + }, + { + "epoch": 0.14053061431147496, + "grad_norm": 2.5239145755767822, + "learning_rate": 2.856174048913115e-05, + "loss": 1.9956, + "step": 72165 + }, + { + "epoch": 0.14055982458258523, + "grad_norm": 4.1702189445495605, + "learning_rate": 2.8561152270760518e-05, + "loss": 1.8297, + "step": 72180 + }, + { + "epoch": 0.14058903485369548, + "grad_norm": 2.927109718322754, + "learning_rate": 2.8560563938189513e-05, + "loss": 1.8601, + "step": 72195 + }, + { + "epoch": 0.14061824512480575, + "grad_norm": 1.9822522401809692, + "learning_rate": 2.8559975491423094e-05, + "loss": 2.026, + "step": 72210 + }, + { + "epoch": 0.14064745539591603, + "grad_norm": 3.1911373138427734, + "learning_rate": 2.8559386930466212e-05, + "loss": 1.7113, + "step": 72225 + }, + { + "epoch": 0.14067666566702627, + "grad_norm": 3.5574357509613037, + "learning_rate": 2.855879825532383e-05, + "loss": 2.0161, + "step": 72240 + }, + { + "epoch": 0.14070587593813655, + "grad_norm": 2.067826509475708, + "learning_rate": 2.8558209466000897e-05, + "loss": 1.825, + "step": 72255 + }, + { + "epoch": 0.1407350862092468, + "grad_norm": 3.21112322807312, + "learning_rate": 2.8557620562502377e-05, + "loss": 1.9053, + "step": 72270 + }, + { + "epoch": 0.14076429648035707, + "grad_norm": 3.5433173179626465, + "learning_rate": 2.855703154483323e-05, + "loss": 1.891, + "step": 72285 + }, + { + "epoch": 0.14079350675146732, + "grad_norm": 2.5802927017211914, + "learning_rate": 2.855644241299841e-05, + "loss": 1.9096, + "step": 72300 + }, + { + "epoch": 0.1408227170225776, + "grad_norm": 2.4555084705352783, + "learning_rate": 2.8555853167002886e-05, + "loss": 1.9266, + "step": 72315 + }, + { + "epoch": 0.14085192729368787, + "grad_norm": 3.553542375564575, + "learning_rate": 2.855526380685161e-05, + "loss": 1.8656, + "step": 72330 + }, + { + "epoch": 0.14088113756479811, + "grad_norm": 3.42453932762146, + "learning_rate": 2.855467433254955e-05, + "loss": 1.7774, + "step": 72345 + }, + { + "epoch": 0.1409103478359084, + "grad_norm": 3.7535057067871094, + "learning_rate": 2.855408474410168e-05, + "loss": 2.0529, + "step": 72360 + }, + { + "epoch": 0.14093955810701864, + "grad_norm": 2.661471366882324, + "learning_rate": 2.855349504151295e-05, + "loss": 1.8574, + "step": 72375 + }, + { + "epoch": 0.1409687683781289, + "grad_norm": 2.816754102706909, + "learning_rate": 2.8552905224788334e-05, + "loss": 1.8226, + "step": 72390 + }, + { + "epoch": 0.14099797864923916, + "grad_norm": 3.2662816047668457, + "learning_rate": 2.8552315293932795e-05, + "loss": 1.8502, + "step": 72405 + }, + { + "epoch": 0.14102718892034943, + "grad_norm": 1.646978497505188, + "learning_rate": 2.8551725248951306e-05, + "loss": 1.6915, + "step": 72420 + }, + { + "epoch": 0.1410563991914597, + "grad_norm": 2.7637829780578613, + "learning_rate": 2.8551135089848828e-05, + "loss": 1.8993, + "step": 72435 + }, + { + "epoch": 0.14108560946256996, + "grad_norm": 3.0363423824310303, + "learning_rate": 2.8550544816630337e-05, + "loss": 1.7698, + "step": 72450 + }, + { + "epoch": 0.14111481973368023, + "grad_norm": 4.07511568069458, + "learning_rate": 2.8549954429300808e-05, + "loss": 1.9219, + "step": 72465 + }, + { + "epoch": 0.14114403000479048, + "grad_norm": 2.8016772270202637, + "learning_rate": 2.8549363927865197e-05, + "loss": 2.0479, + "step": 72480 + }, + { + "epoch": 0.14117324027590075, + "grad_norm": 2.318236827850342, + "learning_rate": 2.8548773312328495e-05, + "loss": 1.8605, + "step": 72495 + }, + { + "epoch": 0.141202450547011, + "grad_norm": 4.731409072875977, + "learning_rate": 2.854818258269566e-05, + "loss": 2.048, + "step": 72510 + }, + { + "epoch": 0.14123166081812127, + "grad_norm": 3.2995524406433105, + "learning_rate": 2.8547591738971682e-05, + "loss": 1.6959, + "step": 72525 + }, + { + "epoch": 0.14126087108923155, + "grad_norm": 3.210134267807007, + "learning_rate": 2.8547000781161522e-05, + "loss": 1.8642, + "step": 72540 + }, + { + "epoch": 0.1412900813603418, + "grad_norm": 2.76589035987854, + "learning_rate": 2.854640970927017e-05, + "loss": 1.9711, + "step": 72555 + }, + { + "epoch": 0.14131929163145207, + "grad_norm": 5.192736625671387, + "learning_rate": 2.8545818523302592e-05, + "loss": 1.726, + "step": 72570 + }, + { + "epoch": 0.14134850190256232, + "grad_norm": 3.0749924182891846, + "learning_rate": 2.8545227223263773e-05, + "loss": 1.8705, + "step": 72585 + }, + { + "epoch": 0.1413777121736726, + "grad_norm": 4.19124698638916, + "learning_rate": 2.854463580915869e-05, + "loss": 1.7416, + "step": 72600 + }, + { + "epoch": 0.14140692244478284, + "grad_norm": 3.1728155612945557, + "learning_rate": 2.854404428099232e-05, + "loss": 1.8707, + "step": 72615 + }, + { + "epoch": 0.14143613271589311, + "grad_norm": 4.006614685058594, + "learning_rate": 2.8543452638769654e-05, + "loss": 1.8875, + "step": 72630 + }, + { + "epoch": 0.1414653429870034, + "grad_norm": 2.681722640991211, + "learning_rate": 2.8542860882495663e-05, + "loss": 1.7689, + "step": 72645 + }, + { + "epoch": 0.14149455325811364, + "grad_norm": 2.611468553543091, + "learning_rate": 2.854226901217534e-05, + "loss": 1.7835, + "step": 72660 + }, + { + "epoch": 0.1415237635292239, + "grad_norm": 2.6967124938964844, + "learning_rate": 2.8541677027813664e-05, + "loss": 1.822, + "step": 72675 + }, + { + "epoch": 0.14155297380033416, + "grad_norm": 3.5053014755249023, + "learning_rate": 2.8541084929415622e-05, + "loss": 1.9839, + "step": 72690 + }, + { + "epoch": 0.14158218407144443, + "grad_norm": 3.278634548187256, + "learning_rate": 2.8540492716986197e-05, + "loss": 1.8012, + "step": 72705 + }, + { + "epoch": 0.14161139434255468, + "grad_norm": 2.977452516555786, + "learning_rate": 2.853990039053038e-05, + "loss": 1.7916, + "step": 72720 + }, + { + "epoch": 0.14164060461366496, + "grad_norm": 3.2461748123168945, + "learning_rate": 2.8539307950053155e-05, + "loss": 1.9501, + "step": 72735 + }, + { + "epoch": 0.14166981488477523, + "grad_norm": 2.3273677825927734, + "learning_rate": 2.8538715395559517e-05, + "loss": 1.8374, + "step": 72750 + }, + { + "epoch": 0.14169902515588548, + "grad_norm": 3.406216621398926, + "learning_rate": 2.853812272705445e-05, + "loss": 1.8103, + "step": 72765 + }, + { + "epoch": 0.14172823542699575, + "grad_norm": 3.64847469329834, + "learning_rate": 2.853752994454295e-05, + "loss": 1.9675, + "step": 72780 + }, + { + "epoch": 0.141757445698106, + "grad_norm": 3.6286330223083496, + "learning_rate": 2.8536937048030006e-05, + "loss": 1.8864, + "step": 72795 + }, + { + "epoch": 0.14178665596921627, + "grad_norm": 3.850263833999634, + "learning_rate": 2.853634403752061e-05, + "loss": 1.8857, + "step": 72810 + }, + { + "epoch": 0.14181586624032652, + "grad_norm": 2.695934295654297, + "learning_rate": 2.8535750913019753e-05, + "loss": 1.9056, + "step": 72825 + }, + { + "epoch": 0.1418450765114368, + "grad_norm": 1.9541032314300537, + "learning_rate": 2.853515767453244e-05, + "loss": 1.9825, + "step": 72840 + }, + { + "epoch": 0.14187428678254707, + "grad_norm": 2.2406160831451416, + "learning_rate": 2.8534564322063662e-05, + "loss": 1.8271, + "step": 72855 + }, + { + "epoch": 0.14190349705365732, + "grad_norm": 4.775651454925537, + "learning_rate": 2.853397085561841e-05, + "loss": 1.7936, + "step": 72870 + }, + { + "epoch": 0.1419327073247676, + "grad_norm": 3.4600954055786133, + "learning_rate": 2.8533377275201686e-05, + "loss": 1.704, + "step": 72885 + }, + { + "epoch": 0.14196191759587784, + "grad_norm": 2.914008855819702, + "learning_rate": 2.8532783580818487e-05, + "loss": 1.776, + "step": 72900 + }, + { + "epoch": 0.14199112786698811, + "grad_norm": 2.9199490547180176, + "learning_rate": 2.8532189772473818e-05, + "loss": 1.793, + "step": 72915 + }, + { + "epoch": 0.14202033813809836, + "grad_norm": 2.5030198097229004, + "learning_rate": 2.853159585017267e-05, + "loss": 1.8021, + "step": 72930 + }, + { + "epoch": 0.14204954840920864, + "grad_norm": 2.1847662925720215, + "learning_rate": 2.8531001813920055e-05, + "loss": 1.9015, + "step": 72945 + }, + { + "epoch": 0.1420787586803189, + "grad_norm": 1.8469825983047485, + "learning_rate": 2.8530407663720973e-05, + "loss": 1.8369, + "step": 72960 + }, + { + "epoch": 0.14210796895142916, + "grad_norm": 2.576925754547119, + "learning_rate": 2.8529813399580422e-05, + "loss": 1.8099, + "step": 72975 + }, + { + "epoch": 0.14213717922253943, + "grad_norm": 2.152590274810791, + "learning_rate": 2.852921902150341e-05, + "loss": 1.808, + "step": 72990 + }, + { + "epoch": 0.14216638949364968, + "grad_norm": 2.7946550846099854, + "learning_rate": 2.852862452949494e-05, + "loss": 1.8326, + "step": 73005 + }, + { + "epoch": 0.14219559976475996, + "grad_norm": 3.9384844303131104, + "learning_rate": 2.8528029923560024e-05, + "loss": 1.8811, + "step": 73020 + }, + { + "epoch": 0.1422248100358702, + "grad_norm": 2.516542434692383, + "learning_rate": 2.852743520370366e-05, + "loss": 2.0782, + "step": 73035 + }, + { + "epoch": 0.14225402030698048, + "grad_norm": 3.010446786880493, + "learning_rate": 2.8526840369930868e-05, + "loss": 1.9166, + "step": 73050 + }, + { + "epoch": 0.14228323057809075, + "grad_norm": 3.3608057498931885, + "learning_rate": 2.8526245422246644e-05, + "loss": 1.7613, + "step": 73065 + }, + { + "epoch": 0.142312440849201, + "grad_norm": 1.8177303075790405, + "learning_rate": 2.852565036065601e-05, + "loss": 1.7569, + "step": 73080 + }, + { + "epoch": 0.14234165112031127, + "grad_norm": 5.860317707061768, + "learning_rate": 2.852505518516397e-05, + "loss": 1.8441, + "step": 73095 + }, + { + "epoch": 0.14237086139142152, + "grad_norm": 2.5878102779388428, + "learning_rate": 2.852445989577554e-05, + "loss": 1.8303, + "step": 73110 + }, + { + "epoch": 0.1424000716625318, + "grad_norm": 2.642291784286499, + "learning_rate": 2.8523864492495724e-05, + "loss": 1.9826, + "step": 73125 + }, + { + "epoch": 0.14242928193364204, + "grad_norm": 3.965019702911377, + "learning_rate": 2.8523268975329553e-05, + "loss": 1.9948, + "step": 73140 + }, + { + "epoch": 0.14245849220475232, + "grad_norm": 4.348023891448975, + "learning_rate": 2.8522673344282027e-05, + "loss": 1.7908, + "step": 73155 + }, + { + "epoch": 0.14248770247586257, + "grad_norm": 3.697197198867798, + "learning_rate": 2.8522077599358165e-05, + "loss": 1.8805, + "step": 73170 + }, + { + "epoch": 0.14251691274697284, + "grad_norm": 2.8124146461486816, + "learning_rate": 2.8521481740562992e-05, + "loss": 1.897, + "step": 73185 + }, + { + "epoch": 0.14254612301808312, + "grad_norm": 2.2915263175964355, + "learning_rate": 2.852088576790151e-05, + "loss": 2.0654, + "step": 73200 + }, + { + "epoch": 0.14257533328919336, + "grad_norm": 2.5295584201812744, + "learning_rate": 2.8520289681378756e-05, + "loss": 1.7831, + "step": 73215 + }, + { + "epoch": 0.14260454356030364, + "grad_norm": 3.6719937324523926, + "learning_rate": 2.8519693480999736e-05, + "loss": 2.0032, + "step": 73230 + }, + { + "epoch": 0.14263375383141388, + "grad_norm": 4.619721412658691, + "learning_rate": 2.8519097166769482e-05, + "loss": 1.8473, + "step": 73245 + }, + { + "epoch": 0.14266296410252416, + "grad_norm": 4.388256549835205, + "learning_rate": 2.8518500738693e-05, + "loss": 1.8777, + "step": 73260 + }, + { + "epoch": 0.1426921743736344, + "grad_norm": 3.7511298656463623, + "learning_rate": 2.851790419677533e-05, + "loss": 1.8411, + "step": 73275 + }, + { + "epoch": 0.14272138464474468, + "grad_norm": 4.122090816497803, + "learning_rate": 2.8517307541021483e-05, + "loss": 1.7702, + "step": 73290 + }, + { + "epoch": 0.14275059491585496, + "grad_norm": 2.7420620918273926, + "learning_rate": 2.8516710771436488e-05, + "loss": 1.9173, + "step": 73305 + }, + { + "epoch": 0.1427798051869652, + "grad_norm": 3.2522389888763428, + "learning_rate": 2.8516113888025376e-05, + "loss": 2.1851, + "step": 73320 + }, + { + "epoch": 0.14280901545807548, + "grad_norm": 3.34875750541687, + "learning_rate": 2.8515516890793162e-05, + "loss": 1.8475, + "step": 73335 + }, + { + "epoch": 0.14283822572918572, + "grad_norm": 3.603339910507202, + "learning_rate": 2.851491977974488e-05, + "loss": 1.963, + "step": 73350 + }, + { + "epoch": 0.142867436000296, + "grad_norm": 6.2788496017456055, + "learning_rate": 2.851432255488556e-05, + "loss": 2.0639, + "step": 73365 + }, + { + "epoch": 0.14289664627140625, + "grad_norm": 3.5592310428619385, + "learning_rate": 2.8513725216220226e-05, + "loss": 1.9551, + "step": 73380 + }, + { + "epoch": 0.14292585654251652, + "grad_norm": 2.7750020027160645, + "learning_rate": 2.8513127763753915e-05, + "loss": 1.8545, + "step": 73395 + }, + { + "epoch": 0.1429550668136268, + "grad_norm": 3.8551149368286133, + "learning_rate": 2.8512530197491655e-05, + "loss": 2.0404, + "step": 73410 + }, + { + "epoch": 0.14298427708473704, + "grad_norm": 5.746589660644531, + "learning_rate": 2.8511932517438473e-05, + "loss": 1.8263, + "step": 73425 + }, + { + "epoch": 0.14301348735584732, + "grad_norm": 2.104475259780884, + "learning_rate": 2.851133472359941e-05, + "loss": 1.8464, + "step": 73440 + }, + { + "epoch": 0.14304269762695757, + "grad_norm": 2.755807876586914, + "learning_rate": 2.85107368159795e-05, + "loss": 2.0467, + "step": 73455 + }, + { + "epoch": 0.14307190789806784, + "grad_norm": 3.3435897827148438, + "learning_rate": 2.851013879458377e-05, + "loss": 1.7152, + "step": 73470 + }, + { + "epoch": 0.1431011181691781, + "grad_norm": 2.9040682315826416, + "learning_rate": 2.8509540659417262e-05, + "loss": 1.7891, + "step": 73485 + }, + { + "epoch": 0.14313032844028836, + "grad_norm": 3.3197898864746094, + "learning_rate": 2.8508942410485016e-05, + "loss": 1.9664, + "step": 73500 + }, + { + "epoch": 0.14315953871139864, + "grad_norm": 5.075369834899902, + "learning_rate": 2.8508344047792062e-05, + "loss": 1.9992, + "step": 73515 + }, + { + "epoch": 0.14318874898250888, + "grad_norm": 5.03831672668457, + "learning_rate": 2.8507745571343443e-05, + "loss": 1.9112, + "step": 73530 + }, + { + "epoch": 0.14321795925361916, + "grad_norm": 4.1223039627075195, + "learning_rate": 2.8507146981144198e-05, + "loss": 1.9345, + "step": 73545 + }, + { + "epoch": 0.1432471695247294, + "grad_norm": 3.088857412338257, + "learning_rate": 2.850654827719937e-05, + "loss": 1.7773, + "step": 73560 + }, + { + "epoch": 0.14327637979583968, + "grad_norm": 2.367082357406616, + "learning_rate": 2.8505949459514e-05, + "loss": 1.9083, + "step": 73575 + }, + { + "epoch": 0.14330559006694993, + "grad_norm": 2.2321507930755615, + "learning_rate": 2.850535052809313e-05, + "loss": 1.7374, + "step": 73590 + }, + { + "epoch": 0.1433348003380602, + "grad_norm": 3.0865018367767334, + "learning_rate": 2.85047514829418e-05, + "loss": 1.8268, + "step": 73605 + }, + { + "epoch": 0.14336401060917048, + "grad_norm": 4.458057880401611, + "learning_rate": 2.8504152324065064e-05, + "loss": 1.7676, + "step": 73620 + }, + { + "epoch": 0.14339322088028073, + "grad_norm": 3.407834053039551, + "learning_rate": 2.8503553051467962e-05, + "loss": 1.7036, + "step": 73635 + }, + { + "epoch": 0.143422431151391, + "grad_norm": 3.198607921600342, + "learning_rate": 2.8502953665155535e-05, + "loss": 1.78, + "step": 73650 + }, + { + "epoch": 0.14345164142250125, + "grad_norm": 3.9906721115112305, + "learning_rate": 2.850235416513284e-05, + "loss": 1.918, + "step": 73665 + }, + { + "epoch": 0.14348085169361152, + "grad_norm": 3.416074275970459, + "learning_rate": 2.850175455140492e-05, + "loss": 1.9376, + "step": 73680 + }, + { + "epoch": 0.14351006196472177, + "grad_norm": 3.2589006423950195, + "learning_rate": 2.8501154823976824e-05, + "loss": 1.7923, + "step": 73695 + }, + { + "epoch": 0.14353927223583204, + "grad_norm": 4.395214557647705, + "learning_rate": 2.850055498285361e-05, + "loss": 1.9298, + "step": 73710 + }, + { + "epoch": 0.14356848250694232, + "grad_norm": 2.460036039352417, + "learning_rate": 2.8499955028040323e-05, + "loss": 1.8209, + "step": 73725 + }, + { + "epoch": 0.14359769277805257, + "grad_norm": 2.944854736328125, + "learning_rate": 2.8499354959542014e-05, + "loss": 1.7814, + "step": 73740 + }, + { + "epoch": 0.14362690304916284, + "grad_norm": 2.690488338470459, + "learning_rate": 2.8498754777363735e-05, + "loss": 2.0388, + "step": 73755 + }, + { + "epoch": 0.1436561133202731, + "grad_norm": 4.302223205566406, + "learning_rate": 2.849815448151055e-05, + "loss": 1.8864, + "step": 73770 + }, + { + "epoch": 0.14368532359138336, + "grad_norm": 1.946239709854126, + "learning_rate": 2.8497554071987505e-05, + "loss": 1.8, + "step": 73785 + }, + { + "epoch": 0.1437145338624936, + "grad_norm": 2.5286450386047363, + "learning_rate": 2.8496953548799653e-05, + "loss": 1.7292, + "step": 73800 + }, + { + "epoch": 0.14374374413360388, + "grad_norm": 2.502965211868286, + "learning_rate": 2.8496352911952062e-05, + "loss": 2.027, + "step": 73815 + }, + { + "epoch": 0.14377295440471416, + "grad_norm": 2.498934507369995, + "learning_rate": 2.8495752161449783e-05, + "loss": 1.8608, + "step": 73830 + }, + { + "epoch": 0.1438021646758244, + "grad_norm": 2.8029141426086426, + "learning_rate": 2.8495151297297882e-05, + "loss": 2.1363, + "step": 73845 + }, + { + "epoch": 0.14383137494693468, + "grad_norm": 3.0271644592285156, + "learning_rate": 2.849455031950141e-05, + "loss": 1.86, + "step": 73860 + }, + { + "epoch": 0.14386058521804493, + "grad_norm": 3.0142743587493896, + "learning_rate": 2.8493949228065433e-05, + "loss": 2.1117, + "step": 73875 + }, + { + "epoch": 0.1438897954891552, + "grad_norm": 2.441868782043457, + "learning_rate": 2.8493348022995008e-05, + "loss": 1.9914, + "step": 73890 + }, + { + "epoch": 0.14391900576026545, + "grad_norm": 3.0463695526123047, + "learning_rate": 2.849274670429521e-05, + "loss": 1.7735, + "step": 73905 + }, + { + "epoch": 0.14394821603137573, + "grad_norm": 3.954310655593872, + "learning_rate": 2.8492145271971086e-05, + "loss": 1.8172, + "step": 73920 + }, + { + "epoch": 0.143977426302486, + "grad_norm": 2.8609185218811035, + "learning_rate": 2.8491543726027715e-05, + "loss": 1.8304, + "step": 73935 + }, + { + "epoch": 0.14400663657359625, + "grad_norm": 2.3113441467285156, + "learning_rate": 2.8490942066470155e-05, + "loss": 1.9043, + "step": 73950 + }, + { + "epoch": 0.14403584684470652, + "grad_norm": 2.0025594234466553, + "learning_rate": 2.849034029330347e-05, + "loss": 1.7936, + "step": 73965 + }, + { + "epoch": 0.14406505711581677, + "grad_norm": 3.4090352058410645, + "learning_rate": 2.8489738406532738e-05, + "loss": 1.9495, + "step": 73980 + }, + { + "epoch": 0.14409426738692704, + "grad_norm": 3.351456642150879, + "learning_rate": 2.8489136406163018e-05, + "loss": 1.7941, + "step": 73995 + }, + { + "epoch": 0.1441234776580373, + "grad_norm": 4.966386795043945, + "learning_rate": 2.8488534292199383e-05, + "loss": 1.6873, + "step": 74010 + }, + { + "epoch": 0.14415268792914757, + "grad_norm": 3.6761229038238525, + "learning_rate": 2.8487932064646906e-05, + "loss": 2.0415, + "step": 74025 + }, + { + "epoch": 0.14418189820025784, + "grad_norm": 2.7794039249420166, + "learning_rate": 2.8487329723510654e-05, + "loss": 1.5494, + "step": 74040 + }, + { + "epoch": 0.1442111084713681, + "grad_norm": 3.7864420413970947, + "learning_rate": 2.8486727268795704e-05, + "loss": 2.078, + "step": 74055 + }, + { + "epoch": 0.14424031874247836, + "grad_norm": 2.605635643005371, + "learning_rate": 2.8486124700507124e-05, + "loss": 1.933, + "step": 74070 + }, + { + "epoch": 0.1442695290135886, + "grad_norm": 2.7622265815734863, + "learning_rate": 2.8485522018649996e-05, + "loss": 2.066, + "step": 74085 + }, + { + "epoch": 0.14429873928469888, + "grad_norm": 2.185512065887451, + "learning_rate": 2.848491922322938e-05, + "loss": 1.8183, + "step": 74100 + }, + { + "epoch": 0.14432794955580913, + "grad_norm": 4.003538608551025, + "learning_rate": 2.8484316314250373e-05, + "loss": 1.8887, + "step": 74115 + }, + { + "epoch": 0.1443571598269194, + "grad_norm": 1.8242509365081787, + "learning_rate": 2.8483713291718037e-05, + "loss": 1.6684, + "step": 74130 + }, + { + "epoch": 0.14438637009802968, + "grad_norm": 3.349139928817749, + "learning_rate": 2.8483110155637455e-05, + "loss": 1.8893, + "step": 74145 + }, + { + "epoch": 0.14441558036913993, + "grad_norm": 2.4987919330596924, + "learning_rate": 2.8482506906013708e-05, + "loss": 1.9281, + "step": 74160 + }, + { + "epoch": 0.1444447906402502, + "grad_norm": 5.021810531616211, + "learning_rate": 2.8481903542851873e-05, + "loss": 1.8334, + "step": 74175 + }, + { + "epoch": 0.14447400091136045, + "grad_norm": 2.732571601867676, + "learning_rate": 2.8481300066157032e-05, + "loss": 1.8669, + "step": 74190 + }, + { + "epoch": 0.14450321118247073, + "grad_norm": 2.6198461055755615, + "learning_rate": 2.8480696475934266e-05, + "loss": 2.0427, + "step": 74205 + }, + { + "epoch": 0.14453242145358097, + "grad_norm": 3.114480495452881, + "learning_rate": 2.8480092772188665e-05, + "loss": 1.7658, + "step": 74220 + }, + { + "epoch": 0.14456163172469125, + "grad_norm": 2.6460888385772705, + "learning_rate": 2.8479488954925305e-05, + "loss": 1.7866, + "step": 74235 + }, + { + "epoch": 0.14459084199580152, + "grad_norm": 2.9281113147735596, + "learning_rate": 2.8478885024149267e-05, + "loss": 1.6264, + "step": 74250 + }, + { + "epoch": 0.14462005226691177, + "grad_norm": 3.5315093994140625, + "learning_rate": 2.8478280979865643e-05, + "loss": 1.7175, + "step": 74265 + }, + { + "epoch": 0.14464926253802204, + "grad_norm": 3.094531297683716, + "learning_rate": 2.8477676822079524e-05, + "loss": 1.8942, + "step": 74280 + }, + { + "epoch": 0.1446784728091323, + "grad_norm": 3.203866481781006, + "learning_rate": 2.8477072550795994e-05, + "loss": 1.8425, + "step": 74295 + }, + { + "epoch": 0.14470768308024257, + "grad_norm": 2.6281557083129883, + "learning_rate": 2.8476468166020135e-05, + "loss": 2.1451, + "step": 74310 + }, + { + "epoch": 0.1447368933513528, + "grad_norm": 1.7994927167892456, + "learning_rate": 2.847586366775704e-05, + "loss": 1.8934, + "step": 74325 + }, + { + "epoch": 0.1447661036224631, + "grad_norm": 3.425593376159668, + "learning_rate": 2.847525905601181e-05, + "loss": 1.9483, + "step": 74340 + }, + { + "epoch": 0.14479531389357336, + "grad_norm": 5.228343486785889, + "learning_rate": 2.8474654330789525e-05, + "loss": 2.0232, + "step": 74355 + }, + { + "epoch": 0.1448245241646836, + "grad_norm": 3.656255006790161, + "learning_rate": 2.8474049492095276e-05, + "loss": 1.9476, + "step": 74370 + }, + { + "epoch": 0.14485373443579388, + "grad_norm": 2.6708178520202637, + "learning_rate": 2.8473444539934167e-05, + "loss": 1.7418, + "step": 74385 + }, + { + "epoch": 0.14488294470690413, + "grad_norm": 5.376559734344482, + "learning_rate": 2.8472839474311287e-05, + "loss": 1.8638, + "step": 74400 + }, + { + "epoch": 0.1449121549780144, + "grad_norm": 5.04163932800293, + "learning_rate": 2.847223429523173e-05, + "loss": 1.8921, + "step": 74415 + }, + { + "epoch": 0.14494136524912465, + "grad_norm": 2.8315744400024414, + "learning_rate": 2.8471629002700588e-05, + "loss": 2.1462, + "step": 74430 + }, + { + "epoch": 0.14497057552023493, + "grad_norm": 2.8156352043151855, + "learning_rate": 2.847102359672297e-05, + "loss": 1.9281, + "step": 74445 + }, + { + "epoch": 0.1449997857913452, + "grad_norm": 2.954503059387207, + "learning_rate": 2.8470418077303964e-05, + "loss": 1.8767, + "step": 74460 + }, + { + "epoch": 0.14502899606245545, + "grad_norm": 3.026890516281128, + "learning_rate": 2.8469812444448673e-05, + "loss": 1.8342, + "step": 74475 + }, + { + "epoch": 0.14505820633356573, + "grad_norm": 3.266925573348999, + "learning_rate": 2.84692066981622e-05, + "loss": 1.8007, + "step": 74490 + }, + { + "epoch": 0.14508741660467597, + "grad_norm": 3.231792688369751, + "learning_rate": 2.846860083844964e-05, + "loss": 2.0157, + "step": 74505 + }, + { + "epoch": 0.14511662687578625, + "grad_norm": 4.349849224090576, + "learning_rate": 2.84679948653161e-05, + "loss": 1.8478, + "step": 74520 + }, + { + "epoch": 0.1451458371468965, + "grad_norm": 4.3141093254089355, + "learning_rate": 2.846738877876668e-05, + "loss": 1.8996, + "step": 74535 + }, + { + "epoch": 0.14517504741800677, + "grad_norm": 4.4374165534973145, + "learning_rate": 2.8466782578806484e-05, + "loss": 1.9721, + "step": 74550 + }, + { + "epoch": 0.14520425768911704, + "grad_norm": 2.977494478225708, + "learning_rate": 2.846617626544062e-05, + "loss": 1.8242, + "step": 74565 + }, + { + "epoch": 0.1452334679602273, + "grad_norm": 2.8183326721191406, + "learning_rate": 2.846556983867419e-05, + "loss": 1.8529, + "step": 74580 + }, + { + "epoch": 0.14526267823133757, + "grad_norm": 2.8301045894622803, + "learning_rate": 2.84649632985123e-05, + "loss": 1.8368, + "step": 74595 + }, + { + "epoch": 0.1452918885024478, + "grad_norm": 3.769669771194458, + "learning_rate": 2.846435664496007e-05, + "loss": 1.6952, + "step": 74610 + }, + { + "epoch": 0.1453210987735581, + "grad_norm": 2.1358160972595215, + "learning_rate": 2.8463749878022587e-05, + "loss": 1.9827, + "step": 74625 + }, + { + "epoch": 0.14535030904466834, + "grad_norm": 2.610013723373413, + "learning_rate": 2.846314299770498e-05, + "loss": 1.8518, + "step": 74640 + }, + { + "epoch": 0.1453795193157786, + "grad_norm": 2.181931734085083, + "learning_rate": 2.846253600401235e-05, + "loss": 1.8017, + "step": 74655 + }, + { + "epoch": 0.14540872958688889, + "grad_norm": 3.9830493927001953, + "learning_rate": 2.8461928896949808e-05, + "loss": 1.7276, + "step": 74670 + }, + { + "epoch": 0.14543793985799913, + "grad_norm": 1.8338431119918823, + "learning_rate": 2.8461321676522476e-05, + "loss": 2.0653, + "step": 74685 + }, + { + "epoch": 0.1454671501291094, + "grad_norm": 4.270096778869629, + "learning_rate": 2.8460714342735458e-05, + "loss": 1.706, + "step": 74700 + }, + { + "epoch": 0.14549636040021965, + "grad_norm": 3.531867742538452, + "learning_rate": 2.846010689559387e-05, + "loss": 1.9466, + "step": 74715 + }, + { + "epoch": 0.14552557067132993, + "grad_norm": 3.1367979049682617, + "learning_rate": 2.845949933510283e-05, + "loss": 1.929, + "step": 74730 + }, + { + "epoch": 0.14555478094244018, + "grad_norm": 5.362056732177734, + "learning_rate": 2.8458891661267453e-05, + "loss": 1.9178, + "step": 74745 + }, + { + "epoch": 0.14558399121355045, + "grad_norm": 2.6152162551879883, + "learning_rate": 2.8458283874092856e-05, + "loss": 1.9403, + "step": 74760 + }, + { + "epoch": 0.14561320148466073, + "grad_norm": 2.050978183746338, + "learning_rate": 2.8457675973584157e-05, + "loss": 1.959, + "step": 74775 + }, + { + "epoch": 0.14564241175577097, + "grad_norm": 2.5266189575195312, + "learning_rate": 2.845706795974648e-05, + "loss": 1.9799, + "step": 74790 + }, + { + "epoch": 0.14567162202688125, + "grad_norm": 3.2171003818511963, + "learning_rate": 2.8456459832584932e-05, + "loss": 1.9757, + "step": 74805 + }, + { + "epoch": 0.1457008322979915, + "grad_norm": 3.7277936935424805, + "learning_rate": 2.8455851592104652e-05, + "loss": 1.9226, + "step": 74820 + }, + { + "epoch": 0.14573004256910177, + "grad_norm": 2.1250598430633545, + "learning_rate": 2.845524323831075e-05, + "loss": 1.8724, + "step": 74835 + }, + { + "epoch": 0.14575925284021202, + "grad_norm": 2.79555606842041, + "learning_rate": 2.8454634771208353e-05, + "loss": 1.9392, + "step": 74850 + }, + { + "epoch": 0.1457884631113223, + "grad_norm": 2.9395086765289307, + "learning_rate": 2.8454026190802584e-05, + "loss": 2.2047, + "step": 74865 + }, + { + "epoch": 0.14581767338243257, + "grad_norm": 3.125072956085205, + "learning_rate": 2.845341749709857e-05, + "loss": 1.8144, + "step": 74880 + }, + { + "epoch": 0.1458468836535428, + "grad_norm": 2.2944655418395996, + "learning_rate": 2.8452808690101427e-05, + "loss": 1.7972, + "step": 74895 + }, + { + "epoch": 0.1458760939246531, + "grad_norm": 2.545682907104492, + "learning_rate": 2.84521997698163e-05, + "loss": 1.7865, + "step": 74910 + }, + { + "epoch": 0.14590530419576334, + "grad_norm": 1.6764289140701294, + "learning_rate": 2.84515907362483e-05, + "loss": 1.9144, + "step": 74925 + }, + { + "epoch": 0.1459345144668736, + "grad_norm": 4.046501636505127, + "learning_rate": 2.8450981589402564e-05, + "loss": 1.9184, + "step": 74940 + }, + { + "epoch": 0.14596372473798386, + "grad_norm": 2.8961234092712402, + "learning_rate": 2.8450372329284222e-05, + "loss": 1.7575, + "step": 74955 + }, + { + "epoch": 0.14599293500909413, + "grad_norm": 4.405148983001709, + "learning_rate": 2.8449762955898405e-05, + "loss": 1.7266, + "step": 74970 + }, + { + "epoch": 0.1460221452802044, + "grad_norm": 4.16251277923584, + "learning_rate": 2.844915346925024e-05, + "loss": 1.9166, + "step": 74985 + }, + { + "epoch": 0.14605135555131465, + "grad_norm": 3.183410167694092, + "learning_rate": 2.844854386934486e-05, + "loss": 2.037, + "step": 75000 + }, + { + "epoch": 0.14608056582242493, + "grad_norm": 3.802748680114746, + "learning_rate": 2.8447934156187402e-05, + "loss": 1.921, + "step": 75015 + }, + { + "epoch": 0.14610977609353518, + "grad_norm": 4.682132244110107, + "learning_rate": 2.8447324329783e-05, + "loss": 1.7106, + "step": 75030 + }, + { + "epoch": 0.14613898636464545, + "grad_norm": 3.3939461708068848, + "learning_rate": 2.844671439013679e-05, + "loss": 1.942, + "step": 75045 + }, + { + "epoch": 0.1461681966357557, + "grad_norm": 4.50538444519043, + "learning_rate": 2.8446104337253906e-05, + "loss": 1.7352, + "step": 75060 + }, + { + "epoch": 0.14619740690686597, + "grad_norm": 3.8883001804351807, + "learning_rate": 2.8445494171139484e-05, + "loss": 1.9349, + "step": 75075 + }, + { + "epoch": 0.14622661717797625, + "grad_norm": 4.0128021240234375, + "learning_rate": 2.8444883891798665e-05, + "loss": 1.8628, + "step": 75090 + }, + { + "epoch": 0.1462558274490865, + "grad_norm": 3.2269062995910645, + "learning_rate": 2.8444273499236592e-05, + "loss": 1.7823, + "step": 75105 + }, + { + "epoch": 0.14628503772019677, + "grad_norm": 2.4312047958374023, + "learning_rate": 2.8443662993458397e-05, + "loss": 1.7056, + "step": 75120 + }, + { + "epoch": 0.14631424799130702, + "grad_norm": 3.8795266151428223, + "learning_rate": 2.8443052374469225e-05, + "loss": 1.9205, + "step": 75135 + }, + { + "epoch": 0.1463434582624173, + "grad_norm": 2.642686128616333, + "learning_rate": 2.8442441642274217e-05, + "loss": 1.8431, + "step": 75150 + }, + { + "epoch": 0.14637266853352754, + "grad_norm": 3.8588173389434814, + "learning_rate": 2.844183079687852e-05, + "loss": 1.9041, + "step": 75165 + }, + { + "epoch": 0.14640187880463781, + "grad_norm": 2.2779126167297363, + "learning_rate": 2.8441219838287274e-05, + "loss": 1.8913, + "step": 75180 + }, + { + "epoch": 0.1464310890757481, + "grad_norm": 2.095672369003296, + "learning_rate": 2.844060876650563e-05, + "loss": 1.8272, + "step": 75195 + }, + { + "epoch": 0.14646029934685834, + "grad_norm": 2.298053503036499, + "learning_rate": 2.843999758153872e-05, + "loss": 1.7922, + "step": 75210 + }, + { + "epoch": 0.1464895096179686, + "grad_norm": 3.258687734603882, + "learning_rate": 2.8439386283391707e-05, + "loss": 1.9292, + "step": 75225 + }, + { + "epoch": 0.14651871988907886, + "grad_norm": 2.1723039150238037, + "learning_rate": 2.8438774872069732e-05, + "loss": 1.6176, + "step": 75240 + }, + { + "epoch": 0.14654793016018913, + "grad_norm": 4.218878746032715, + "learning_rate": 2.8438163347577944e-05, + "loss": 1.7608, + "step": 75255 + }, + { + "epoch": 0.14657714043129938, + "grad_norm": 1.904353380203247, + "learning_rate": 2.8437551709921487e-05, + "loss": 1.7635, + "step": 75270 + }, + { + "epoch": 0.14660635070240965, + "grad_norm": 4.3778157234191895, + "learning_rate": 2.8436939959105522e-05, + "loss": 1.9764, + "step": 75285 + }, + { + "epoch": 0.1466355609735199, + "grad_norm": 2.6779420375823975, + "learning_rate": 2.8436328095135194e-05, + "loss": 1.7377, + "step": 75300 + }, + { + "epoch": 0.14666477124463018, + "grad_norm": 3.847527027130127, + "learning_rate": 2.843571611801566e-05, + "loss": 1.8064, + "step": 75315 + }, + { + "epoch": 0.14669398151574045, + "grad_norm": 2.658083915710449, + "learning_rate": 2.8435104027752065e-05, + "loss": 1.9527, + "step": 75330 + }, + { + "epoch": 0.1467231917868507, + "grad_norm": 4.472046375274658, + "learning_rate": 2.8434491824349573e-05, + "loss": 1.7327, + "step": 75345 + }, + { + "epoch": 0.14675240205796097, + "grad_norm": 2.7553508281707764, + "learning_rate": 2.843387950781334e-05, + "loss": 1.8601, + "step": 75360 + }, + { + "epoch": 0.14678161232907122, + "grad_norm": 3.9834704399108887, + "learning_rate": 2.843326707814851e-05, + "loss": 1.7919, + "step": 75375 + }, + { + "epoch": 0.1468108226001815, + "grad_norm": 2.2407164573669434, + "learning_rate": 2.8432654535360256e-05, + "loss": 1.8836, + "step": 75390 + }, + { + "epoch": 0.14684003287129174, + "grad_norm": 3.1211001873016357, + "learning_rate": 2.8432041879453727e-05, + "loss": 1.7624, + "step": 75405 + }, + { + "epoch": 0.14686924314240202, + "grad_norm": 2.6442675590515137, + "learning_rate": 2.8431429110434078e-05, + "loss": 1.7744, + "step": 75420 + }, + { + "epoch": 0.1468984534135123, + "grad_norm": 2.478588819503784, + "learning_rate": 2.8430816228306485e-05, + "loss": 1.9589, + "step": 75435 + }, + { + "epoch": 0.14692766368462254, + "grad_norm": 3.0382254123687744, + "learning_rate": 2.8430203233076094e-05, + "loss": 1.6242, + "step": 75450 + }, + { + "epoch": 0.14695687395573281, + "grad_norm": 3.256681203842163, + "learning_rate": 2.842959012474807e-05, + "loss": 1.7883, + "step": 75465 + }, + { + "epoch": 0.14698608422684306, + "grad_norm": 4.512953758239746, + "learning_rate": 2.8428976903327586e-05, + "loss": 1.9623, + "step": 75480 + }, + { + "epoch": 0.14701529449795334, + "grad_norm": 2.489250421524048, + "learning_rate": 2.842836356881979e-05, + "loss": 1.9268, + "step": 75495 + }, + { + "epoch": 0.14704450476906358, + "grad_norm": 3.805304765701294, + "learning_rate": 2.8427750121229865e-05, + "loss": 1.7599, + "step": 75510 + }, + { + "epoch": 0.14707371504017386, + "grad_norm": 4.435770511627197, + "learning_rate": 2.842713656056296e-05, + "loss": 1.7962, + "step": 75525 + }, + { + "epoch": 0.14710292531128413, + "grad_norm": 4.997632026672363, + "learning_rate": 2.8426522886824254e-05, + "loss": 1.7421, + "step": 75540 + }, + { + "epoch": 0.14713213558239438, + "grad_norm": 4.232823848724365, + "learning_rate": 2.842590910001891e-05, + "loss": 1.8794, + "step": 75555 + }, + { + "epoch": 0.14716134585350465, + "grad_norm": 3.2560341358184814, + "learning_rate": 2.8425295200152097e-05, + "loss": 2.0397, + "step": 75570 + }, + { + "epoch": 0.1471905561246149, + "grad_norm": 3.4824719429016113, + "learning_rate": 2.8424681187228983e-05, + "loss": 1.7778, + "step": 75585 + }, + { + "epoch": 0.14721976639572518, + "grad_norm": 2.115450143814087, + "learning_rate": 2.8424067061254744e-05, + "loss": 2.0553, + "step": 75600 + }, + { + "epoch": 0.14724897666683542, + "grad_norm": 3.9036290645599365, + "learning_rate": 2.842345282223455e-05, + "loss": 1.8187, + "step": 75615 + }, + { + "epoch": 0.1472781869379457, + "grad_norm": 2.39381742477417, + "learning_rate": 2.8422838470173568e-05, + "loss": 1.8001, + "step": 75630 + }, + { + "epoch": 0.14730739720905597, + "grad_norm": 2.6120264530181885, + "learning_rate": 2.8422224005076976e-05, + "loss": 1.919, + "step": 75645 + }, + { + "epoch": 0.14733660748016622, + "grad_norm": 2.1776349544525146, + "learning_rate": 2.8421609426949946e-05, + "loss": 1.8457, + "step": 75660 + }, + { + "epoch": 0.1473658177512765, + "grad_norm": 2.1389527320861816, + "learning_rate": 2.8420994735797655e-05, + "loss": 1.6331, + "step": 75675 + }, + { + "epoch": 0.14739502802238674, + "grad_norm": 2.425187110900879, + "learning_rate": 2.8420379931625283e-05, + "loss": 1.7988, + "step": 75690 + }, + { + "epoch": 0.14742423829349702, + "grad_norm": 3.734109878540039, + "learning_rate": 2.8419765014438e-05, + "loss": 1.9696, + "step": 75705 + }, + { + "epoch": 0.14745344856460726, + "grad_norm": 2.777010440826416, + "learning_rate": 2.8419149984240997e-05, + "loss": 1.7686, + "step": 75720 + }, + { + "epoch": 0.14748265883571754, + "grad_norm": 5.420701026916504, + "learning_rate": 2.8418534841039436e-05, + "loss": 1.8833, + "step": 75735 + }, + { + "epoch": 0.14751186910682781, + "grad_norm": 3.072200059890747, + "learning_rate": 2.8417919584838506e-05, + "loss": 2.0534, + "step": 75750 + }, + { + "epoch": 0.14754107937793806, + "grad_norm": 3.3708369731903076, + "learning_rate": 2.841730421564339e-05, + "loss": 1.8143, + "step": 75765 + }, + { + "epoch": 0.14757028964904834, + "grad_norm": 3.582489490509033, + "learning_rate": 2.8416688733459272e-05, + "loss": 1.8505, + "step": 75780 + }, + { + "epoch": 0.14759949992015858, + "grad_norm": 2.1890854835510254, + "learning_rate": 2.8416073138291325e-05, + "loss": 1.867, + "step": 75795 + }, + { + "epoch": 0.14762871019126886, + "grad_norm": 2.340660810470581, + "learning_rate": 2.8415457430144745e-05, + "loss": 1.9259, + "step": 75810 + }, + { + "epoch": 0.1476579204623791, + "grad_norm": 3.5146968364715576, + "learning_rate": 2.8414841609024705e-05, + "loss": 1.7676, + "step": 75825 + }, + { + "epoch": 0.14768713073348938, + "grad_norm": 4.4006218910217285, + "learning_rate": 2.8414225674936403e-05, + "loss": 2.0096, + "step": 75840 + }, + { + "epoch": 0.14771634100459965, + "grad_norm": 3.3753602504730225, + "learning_rate": 2.841360962788502e-05, + "loss": 1.8204, + "step": 75855 + }, + { + "epoch": 0.1477455512757099, + "grad_norm": 2.7762327194213867, + "learning_rate": 2.841299346787574e-05, + "loss": 2.1623, + "step": 75870 + }, + { + "epoch": 0.14777476154682018, + "grad_norm": 2.0558714866638184, + "learning_rate": 2.8412377194913756e-05, + "loss": 1.7151, + "step": 75885 + }, + { + "epoch": 0.14780397181793042, + "grad_norm": 2.7416152954101562, + "learning_rate": 2.841176080900426e-05, + "loss": 2.1504, + "step": 75900 + }, + { + "epoch": 0.1478331820890407, + "grad_norm": 4.107183456420898, + "learning_rate": 2.8411144310152437e-05, + "loss": 1.8993, + "step": 75915 + }, + { + "epoch": 0.14786239236015095, + "grad_norm": 2.4044148921966553, + "learning_rate": 2.8410527698363482e-05, + "loss": 1.7354, + "step": 75930 + }, + { + "epoch": 0.14789160263126122, + "grad_norm": 2.4700186252593994, + "learning_rate": 2.8409910973642585e-05, + "loss": 1.8139, + "step": 75945 + }, + { + "epoch": 0.1479208129023715, + "grad_norm": 3.8761379718780518, + "learning_rate": 2.8409294135994947e-05, + "loss": 1.824, + "step": 75960 + }, + { + "epoch": 0.14795002317348174, + "grad_norm": 2.421818256378174, + "learning_rate": 2.8408677185425753e-05, + "loss": 1.8645, + "step": 75975 + }, + { + "epoch": 0.14797923344459202, + "grad_norm": 3.022249460220337, + "learning_rate": 2.8408060121940202e-05, + "loss": 1.79, + "step": 75990 + }, + { + "epoch": 0.14800844371570226, + "grad_norm": 2.6644575595855713, + "learning_rate": 2.8407442945543492e-05, + "loss": 1.974, + "step": 76005 + }, + { + "epoch": 0.14803765398681254, + "grad_norm": 4.2788519859313965, + "learning_rate": 2.8406825656240825e-05, + "loss": 1.9263, + "step": 76020 + }, + { + "epoch": 0.1480668642579228, + "grad_norm": 5.908010482788086, + "learning_rate": 2.840620825403739e-05, + "loss": 1.9475, + "step": 76035 + }, + { + "epoch": 0.14809607452903306, + "grad_norm": 2.75813627243042, + "learning_rate": 2.8405590738938388e-05, + "loss": 1.8475, + "step": 76050 + }, + { + "epoch": 0.14812528480014334, + "grad_norm": 1.8042970895767212, + "learning_rate": 2.8404973110949017e-05, + "loss": 1.702, + "step": 76065 + }, + { + "epoch": 0.14815449507125358, + "grad_norm": 2.0125386714935303, + "learning_rate": 2.840435537007449e-05, + "loss": 1.6676, + "step": 76080 + }, + { + "epoch": 0.14818370534236386, + "grad_norm": 3.034642219543457, + "learning_rate": 2.8403737516319997e-05, + "loss": 1.847, + "step": 76095 + }, + { + "epoch": 0.1482129156134741, + "grad_norm": 4.536755084991455, + "learning_rate": 2.8403119549690747e-05, + "loss": 1.8028, + "step": 76110 + }, + { + "epoch": 0.14824212588458438, + "grad_norm": 2.9709532260894775, + "learning_rate": 2.840250147019194e-05, + "loss": 1.8201, + "step": 76125 + }, + { + "epoch": 0.14827133615569463, + "grad_norm": 2.2233574390411377, + "learning_rate": 2.8401883277828785e-05, + "loss": 1.8912, + "step": 76140 + }, + { + "epoch": 0.1483005464268049, + "grad_norm": 2.777071237564087, + "learning_rate": 2.8401264972606486e-05, + "loss": 1.8547, + "step": 76155 + }, + { + "epoch": 0.14832975669791518, + "grad_norm": 2.2497169971466064, + "learning_rate": 2.8400646554530247e-05, + "loss": 1.7606, + "step": 76170 + }, + { + "epoch": 0.14835896696902542, + "grad_norm": 4.904995441436768, + "learning_rate": 2.840002802360528e-05, + "loss": 1.8296, + "step": 76185 + }, + { + "epoch": 0.1483881772401357, + "grad_norm": 2.172206163406372, + "learning_rate": 2.8399409379836793e-05, + "loss": 2.0352, + "step": 76200 + }, + { + "epoch": 0.14841738751124595, + "grad_norm": 3.801562547683716, + "learning_rate": 2.8398790623229996e-05, + "loss": 1.8052, + "step": 76215 + }, + { + "epoch": 0.14844659778235622, + "grad_norm": 3.271150827407837, + "learning_rate": 2.83981717537901e-05, + "loss": 1.9231, + "step": 76230 + }, + { + "epoch": 0.14847580805346647, + "grad_norm": 2.0403850078582764, + "learning_rate": 2.8397552771522314e-05, + "loss": 1.7625, + "step": 76245 + }, + { + "epoch": 0.14850501832457674, + "grad_norm": 2.860950469970703, + "learning_rate": 2.839693367643185e-05, + "loss": 1.7842, + "step": 76260 + }, + { + "epoch": 0.14853422859568702, + "grad_norm": 3.5709025859832764, + "learning_rate": 2.8396314468523924e-05, + "loss": 1.8774, + "step": 76275 + }, + { + "epoch": 0.14856343886679726, + "grad_norm": 4.0893330574035645, + "learning_rate": 2.8395695147803752e-05, + "loss": 1.8418, + "step": 76290 + }, + { + "epoch": 0.14859264913790754, + "grad_norm": 4.498213291168213, + "learning_rate": 2.8395075714276542e-05, + "loss": 1.8143, + "step": 76305 + }, + { + "epoch": 0.1486218594090178, + "grad_norm": 3.7956056594848633, + "learning_rate": 2.839445616794752e-05, + "loss": 1.8904, + "step": 76320 + }, + { + "epoch": 0.14865106968012806, + "grad_norm": 2.955240249633789, + "learning_rate": 2.8393836508821895e-05, + "loss": 1.8976, + "step": 76335 + }, + { + "epoch": 0.1486802799512383, + "grad_norm": 2.197618246078491, + "learning_rate": 2.8393216736904893e-05, + "loss": 1.8629, + "step": 76350 + }, + { + "epoch": 0.14870949022234858, + "grad_norm": 2.430612325668335, + "learning_rate": 2.8392596852201728e-05, + "loss": 1.7856, + "step": 76365 + }, + { + "epoch": 0.14873870049345886, + "grad_norm": 3.529574394226074, + "learning_rate": 2.839197685471762e-05, + "loss": 1.7329, + "step": 76380 + }, + { + "epoch": 0.1487679107645691, + "grad_norm": 3.4793806076049805, + "learning_rate": 2.8391356744457793e-05, + "loss": 1.6756, + "step": 76395 + }, + { + "epoch": 0.14879712103567938, + "grad_norm": 4.20980167388916, + "learning_rate": 2.8390736521427466e-05, + "loss": 2.0696, + "step": 76410 + }, + { + "epoch": 0.14882633130678963, + "grad_norm": 2.8805341720581055, + "learning_rate": 2.8390116185631866e-05, + "loss": 1.8651, + "step": 76425 + }, + { + "epoch": 0.1488555415778999, + "grad_norm": 3.3854541778564453, + "learning_rate": 2.8389495737076212e-05, + "loss": 1.7348, + "step": 76440 + }, + { + "epoch": 0.14888475184901015, + "grad_norm": 3.9908287525177, + "learning_rate": 2.838887517576573e-05, + "loss": 1.583, + "step": 76455 + }, + { + "epoch": 0.14891396212012042, + "grad_norm": 2.500596284866333, + "learning_rate": 2.838825450170565e-05, + "loss": 1.9274, + "step": 76470 + }, + { + "epoch": 0.1489431723912307, + "grad_norm": 2.28163480758667, + "learning_rate": 2.8387633714901194e-05, + "loss": 2.0082, + "step": 76485 + }, + { + "epoch": 0.14897238266234095, + "grad_norm": 2.3963749408721924, + "learning_rate": 2.8387012815357595e-05, + "loss": 1.9358, + "step": 76500 + }, + { + "epoch": 0.14900159293345122, + "grad_norm": 2.4222412109375, + "learning_rate": 2.838639180308007e-05, + "loss": 1.7486, + "step": 76515 + }, + { + "epoch": 0.14903080320456147, + "grad_norm": 3.246101140975952, + "learning_rate": 2.8385770678073865e-05, + "loss": 1.93, + "step": 76530 + }, + { + "epoch": 0.14906001347567174, + "grad_norm": 3.9545979499816895, + "learning_rate": 2.83851494403442e-05, + "loss": 1.5622, + "step": 76545 + }, + { + "epoch": 0.149089223746782, + "grad_norm": 4.766272068023682, + "learning_rate": 2.838452808989631e-05, + "loss": 1.74, + "step": 76560 + }, + { + "epoch": 0.14911843401789227, + "grad_norm": 2.5779788494110107, + "learning_rate": 2.8383906626735427e-05, + "loss": 2.0482, + "step": 76575 + }, + { + "epoch": 0.14914764428900254, + "grad_norm": 3.1827492713928223, + "learning_rate": 2.838328505086678e-05, + "loss": 1.7136, + "step": 76590 + }, + { + "epoch": 0.1491768545601128, + "grad_norm": 3.90043044090271, + "learning_rate": 2.838266336229561e-05, + "loss": 1.9432, + "step": 76605 + }, + { + "epoch": 0.14920606483122306, + "grad_norm": 2.4096438884735107, + "learning_rate": 2.838204156102715e-05, + "loss": 1.9753, + "step": 76620 + }, + { + "epoch": 0.1492352751023333, + "grad_norm": 2.4965484142303467, + "learning_rate": 2.8381419647066637e-05, + "loss": 1.9384, + "step": 76635 + }, + { + "epoch": 0.14926448537344358, + "grad_norm": 2.8003644943237305, + "learning_rate": 2.83807976204193e-05, + "loss": 1.8295, + "step": 76650 + }, + { + "epoch": 0.14929369564455383, + "grad_norm": 4.554660320281982, + "learning_rate": 2.8380175481090392e-05, + "loss": 1.9126, + "step": 76665 + }, + { + "epoch": 0.1493229059156641, + "grad_norm": 3.4256391525268555, + "learning_rate": 2.8379553229085143e-05, + "loss": 1.706, + "step": 76680 + }, + { + "epoch": 0.14935211618677438, + "grad_norm": 2.792707920074463, + "learning_rate": 2.8378930864408794e-05, + "loss": 1.9174, + "step": 76695 + }, + { + "epoch": 0.14938132645788463, + "grad_norm": 3.020430326461792, + "learning_rate": 2.8378308387066583e-05, + "loss": 1.7867, + "step": 76710 + }, + { + "epoch": 0.1494105367289949, + "grad_norm": 2.443082571029663, + "learning_rate": 2.837768579706376e-05, + "loss": 1.8217, + "step": 76725 + }, + { + "epoch": 0.14943974700010515, + "grad_norm": 3.1390039920806885, + "learning_rate": 2.837706309440556e-05, + "loss": 1.7242, + "step": 76740 + }, + { + "epoch": 0.14946895727121542, + "grad_norm": 2.6781768798828125, + "learning_rate": 2.8376440279097232e-05, + "loss": 1.9066, + "step": 76755 + }, + { + "epoch": 0.14949816754232567, + "grad_norm": 3.1253488063812256, + "learning_rate": 2.837581735114402e-05, + "loss": 1.7119, + "step": 76770 + }, + { + "epoch": 0.14952737781343595, + "grad_norm": 2.453080415725708, + "learning_rate": 2.8375194310551168e-05, + "loss": 1.7544, + "step": 76785 + }, + { + "epoch": 0.14955658808454622, + "grad_norm": 2.51411509513855, + "learning_rate": 2.8374571157323924e-05, + "loss": 1.9772, + "step": 76800 + }, + { + "epoch": 0.14958579835565647, + "grad_norm": 3.996797561645508, + "learning_rate": 2.8373947891467538e-05, + "loss": 1.7716, + "step": 76815 + }, + { + "epoch": 0.14961500862676674, + "grad_norm": 3.2852976322174072, + "learning_rate": 2.837332451298725e-05, + "loss": 1.9317, + "step": 76830 + }, + { + "epoch": 0.149644218897877, + "grad_norm": 3.2640724182128906, + "learning_rate": 2.837270102188832e-05, + "loss": 1.6667, + "step": 76845 + }, + { + "epoch": 0.14967342916898727, + "grad_norm": 3.866288661956787, + "learning_rate": 2.837207741817599e-05, + "loss": 1.6664, + "step": 76860 + }, + { + "epoch": 0.1497026394400975, + "grad_norm": 2.8372108936309814, + "learning_rate": 2.8371453701855518e-05, + "loss": 1.681, + "step": 76875 + }, + { + "epoch": 0.1497318497112078, + "grad_norm": 3.8551697731018066, + "learning_rate": 2.8370829872932154e-05, + "loss": 1.9469, + "step": 76890 + }, + { + "epoch": 0.14976105998231806, + "grad_norm": 2.492412805557251, + "learning_rate": 2.837020593141115e-05, + "loss": 2.0397, + "step": 76905 + }, + { + "epoch": 0.1497902702534283, + "grad_norm": 4.147984504699707, + "learning_rate": 2.8369581877297758e-05, + "loss": 1.9413, + "step": 76920 + }, + { + "epoch": 0.14981948052453858, + "grad_norm": 4.334084510803223, + "learning_rate": 2.8368957710597242e-05, + "loss": 1.8286, + "step": 76935 + }, + { + "epoch": 0.14984869079564883, + "grad_norm": 4.103565216064453, + "learning_rate": 2.8368333431314853e-05, + "loss": 1.8566, + "step": 76950 + }, + { + "epoch": 0.1498779010667591, + "grad_norm": 1.927396535873413, + "learning_rate": 2.8367709039455846e-05, + "loss": 1.7684, + "step": 76965 + }, + { + "epoch": 0.14990711133786935, + "grad_norm": 1.884482741355896, + "learning_rate": 2.836708453502548e-05, + "loss": 1.988, + "step": 76980 + }, + { + "epoch": 0.14993632160897963, + "grad_norm": 2.1564292907714844, + "learning_rate": 2.8366459918029014e-05, + "loss": 2.028, + "step": 76995 + }, + { + "epoch": 0.1499655318800899, + "grad_norm": 3.0453295707702637, + "learning_rate": 2.8365835188471717e-05, + "loss": 1.8641, + "step": 77010 + }, + { + "epoch": 0.14999474215120015, + "grad_norm": 3.6438117027282715, + "learning_rate": 2.836521034635883e-05, + "loss": 1.7231, + "step": 77025 + }, + { + "epoch": 0.15002395242231042, + "grad_norm": 2.9053168296813965, + "learning_rate": 2.836458539169564e-05, + "loss": 1.6572, + "step": 77040 + }, + { + "epoch": 0.15005316269342067, + "grad_norm": 3.4861180782318115, + "learning_rate": 2.8363960324487387e-05, + "loss": 1.8296, + "step": 77055 + }, + { + "epoch": 0.15008237296453095, + "grad_norm": 3.07234263420105, + "learning_rate": 2.836333514473935e-05, + "loss": 1.9092, + "step": 77070 + }, + { + "epoch": 0.1501115832356412, + "grad_norm": 2.8184378147125244, + "learning_rate": 2.8362709852456784e-05, + "loss": 1.9142, + "step": 77085 + }, + { + "epoch": 0.15014079350675147, + "grad_norm": 2.632733106613159, + "learning_rate": 2.836208444764496e-05, + "loss": 1.9149, + "step": 77100 + }, + { + "epoch": 0.15017000377786174, + "grad_norm": 2.236257553100586, + "learning_rate": 2.8361458930309142e-05, + "loss": 1.936, + "step": 77115 + }, + { + "epoch": 0.150199214048972, + "grad_norm": 3.7179739475250244, + "learning_rate": 2.8360833300454602e-05, + "loss": 1.9802, + "step": 77130 + }, + { + "epoch": 0.15022842432008227, + "grad_norm": 3.400845766067505, + "learning_rate": 2.8360207558086606e-05, + "loss": 1.8955, + "step": 77145 + }, + { + "epoch": 0.1502576345911925, + "grad_norm": 4.1731438636779785, + "learning_rate": 2.8359581703210422e-05, + "loss": 2.0376, + "step": 77160 + }, + { + "epoch": 0.1502868448623028, + "grad_norm": 2.7194149494171143, + "learning_rate": 2.8358955735831325e-05, + "loss": 1.9258, + "step": 77175 + }, + { + "epoch": 0.15031605513341303, + "grad_norm": 2.3724169731140137, + "learning_rate": 2.8358329655954578e-05, + "loss": 1.8739, + "step": 77190 + }, + { + "epoch": 0.1503452654045233, + "grad_norm": 2.8573286533355713, + "learning_rate": 2.8357703463585458e-05, + "loss": 1.9252, + "step": 77205 + }, + { + "epoch": 0.15037447567563358, + "grad_norm": 2.595545768737793, + "learning_rate": 2.835707715872924e-05, + "loss": 1.843, + "step": 77220 + }, + { + "epoch": 0.15040368594674383, + "grad_norm": 3.5548083782196045, + "learning_rate": 2.83564507413912e-05, + "loss": 1.825, + "step": 77235 + }, + { + "epoch": 0.1504328962178541, + "grad_norm": 1.9573009014129639, + "learning_rate": 2.8355824211576605e-05, + "loss": 1.811, + "step": 77250 + }, + { + "epoch": 0.15046210648896435, + "grad_norm": 4.457601070404053, + "learning_rate": 2.835519756929074e-05, + "loss": 1.8464, + "step": 77265 + }, + { + "epoch": 0.15049131676007463, + "grad_norm": 3.799541473388672, + "learning_rate": 2.8354570814538875e-05, + "loss": 1.8774, + "step": 77280 + }, + { + "epoch": 0.15052052703118488, + "grad_norm": 3.6558761596679688, + "learning_rate": 2.8353943947326294e-05, + "loss": 1.7569, + "step": 77295 + }, + { + "epoch": 0.15054973730229515, + "grad_norm": 2.549957513809204, + "learning_rate": 2.8353316967658268e-05, + "loss": 1.8556, + "step": 77310 + }, + { + "epoch": 0.15057894757340543, + "grad_norm": 3.6615633964538574, + "learning_rate": 2.8352689875540082e-05, + "loss": 1.9283, + "step": 77325 + }, + { + "epoch": 0.15060815784451567, + "grad_norm": 2.2295496463775635, + "learning_rate": 2.835206267097702e-05, + "loss": 1.806, + "step": 77340 + }, + { + "epoch": 0.15063736811562595, + "grad_norm": 2.5141348838806152, + "learning_rate": 2.835143535397436e-05, + "loss": 1.9033, + "step": 77355 + }, + { + "epoch": 0.1506665783867362, + "grad_norm": 3.9742422103881836, + "learning_rate": 2.8350807924537386e-05, + "loss": 1.7533, + "step": 77370 + }, + { + "epoch": 0.15069578865784647, + "grad_norm": 3.1630399227142334, + "learning_rate": 2.8350180382671376e-05, + "loss": 1.7986, + "step": 77385 + }, + { + "epoch": 0.15072499892895672, + "grad_norm": 2.5035641193389893, + "learning_rate": 2.8349552728381625e-05, + "loss": 1.7167, + "step": 77400 + }, + { + "epoch": 0.150754209200067, + "grad_norm": 4.307298183441162, + "learning_rate": 2.834892496167341e-05, + "loss": 1.9659, + "step": 77415 + }, + { + "epoch": 0.15078341947117724, + "grad_norm": 2.9440340995788574, + "learning_rate": 2.834829708255202e-05, + "loss": 1.8477, + "step": 77430 + }, + { + "epoch": 0.1508126297422875, + "grad_norm": 2.251739978790283, + "learning_rate": 2.8347669091022745e-05, + "loss": 1.8551, + "step": 77445 + }, + { + "epoch": 0.1508418400133978, + "grad_norm": 3.8087217807769775, + "learning_rate": 2.834704098709087e-05, + "loss": 1.9413, + "step": 77460 + }, + { + "epoch": 0.15087105028450803, + "grad_norm": 3.292526960372925, + "learning_rate": 2.834641277076168e-05, + "loss": 2.0415, + "step": 77475 + }, + { + "epoch": 0.1509002605556183, + "grad_norm": 3.185164451599121, + "learning_rate": 2.834578444204048e-05, + "loss": 1.7934, + "step": 77490 + }, + { + "epoch": 0.15092947082672856, + "grad_norm": 2.0528712272644043, + "learning_rate": 2.834515600093255e-05, + "loss": 1.9574, + "step": 77505 + }, + { + "epoch": 0.15095868109783883, + "grad_norm": 5.014784336090088, + "learning_rate": 2.8344527447443182e-05, + "loss": 1.7146, + "step": 77520 + }, + { + "epoch": 0.15098789136894908, + "grad_norm": 4.068947792053223, + "learning_rate": 2.8343898781577673e-05, + "loss": 1.8002, + "step": 77535 + }, + { + "epoch": 0.15101710164005935, + "grad_norm": 5.826878547668457, + "learning_rate": 2.8343270003341316e-05, + "loss": 1.7434, + "step": 77550 + }, + { + "epoch": 0.15104631191116963, + "grad_norm": 2.1598801612854004, + "learning_rate": 2.8342641112739402e-05, + "loss": 1.8739, + "step": 77565 + }, + { + "epoch": 0.15107552218227988, + "grad_norm": 3.5090527534484863, + "learning_rate": 2.8342012109777235e-05, + "loss": 1.891, + "step": 77580 + }, + { + "epoch": 0.15110473245339015, + "grad_norm": 3.734327554702759, + "learning_rate": 2.8341382994460103e-05, + "loss": 1.9927, + "step": 77595 + }, + { + "epoch": 0.1511339427245004, + "grad_norm": 3.6260833740234375, + "learning_rate": 2.8340753766793312e-05, + "loss": 1.6587, + "step": 77610 + }, + { + "epoch": 0.15116315299561067, + "grad_norm": 3.6602845191955566, + "learning_rate": 2.8340124426782157e-05, + "loss": 1.9888, + "step": 77625 + }, + { + "epoch": 0.15119236326672092, + "grad_norm": 3.3770320415496826, + "learning_rate": 2.833949497443194e-05, + "loss": 1.8745, + "step": 77640 + }, + { + "epoch": 0.1512215735378312, + "grad_norm": 2.6772773265838623, + "learning_rate": 2.8338865409747955e-05, + "loss": 1.8572, + "step": 77655 + }, + { + "epoch": 0.15125078380894147, + "grad_norm": 2.6824944019317627, + "learning_rate": 2.833823573273551e-05, + "loss": 1.6882, + "step": 77670 + }, + { + "epoch": 0.15127999408005172, + "grad_norm": 4.292243957519531, + "learning_rate": 2.8337605943399908e-05, + "loss": 1.8524, + "step": 77685 + }, + { + "epoch": 0.151309204351162, + "grad_norm": 4.254456043243408, + "learning_rate": 2.833697604174645e-05, + "loss": 2.0289, + "step": 77700 + }, + { + "epoch": 0.15133841462227224, + "grad_norm": 3.6422243118286133, + "learning_rate": 2.833634602778044e-05, + "loss": 1.8613, + "step": 77715 + }, + { + "epoch": 0.1513676248933825, + "grad_norm": 4.528667449951172, + "learning_rate": 2.8335715901507183e-05, + "loss": 1.9831, + "step": 77730 + }, + { + "epoch": 0.15139683516449276, + "grad_norm": 4.303900718688965, + "learning_rate": 2.8335085662931988e-05, + "loss": 1.8412, + "step": 77745 + }, + { + "epoch": 0.15142604543560303, + "grad_norm": 3.0902206897735596, + "learning_rate": 2.8334455312060163e-05, + "loss": 2.0548, + "step": 77760 + }, + { + "epoch": 0.1514552557067133, + "grad_norm": 3.2652089595794678, + "learning_rate": 2.833382484889701e-05, + "loss": 1.986, + "step": 77775 + }, + { + "epoch": 0.15148446597782356, + "grad_norm": 2.8417513370513916, + "learning_rate": 2.8333194273447848e-05, + "loss": 1.8872, + "step": 77790 + }, + { + "epoch": 0.15151367624893383, + "grad_norm": 2.4151108264923096, + "learning_rate": 2.833256358571798e-05, + "loss": 1.8762, + "step": 77805 + }, + { + "epoch": 0.15154288652004408, + "grad_norm": 4.797358989715576, + "learning_rate": 2.8331932785712715e-05, + "loss": 1.9346, + "step": 77820 + }, + { + "epoch": 0.15157209679115435, + "grad_norm": 2.680054187774658, + "learning_rate": 2.8331301873437374e-05, + "loss": 1.7842, + "step": 77835 + }, + { + "epoch": 0.1516013070622646, + "grad_norm": 3.1099088191986084, + "learning_rate": 2.8330670848897264e-05, + "loss": 1.8131, + "step": 77850 + }, + { + "epoch": 0.15163051733337488, + "grad_norm": 3.914663314819336, + "learning_rate": 2.8330039712097697e-05, + "loss": 1.8042, + "step": 77865 + }, + { + "epoch": 0.15165972760448515, + "grad_norm": 2.1181468963623047, + "learning_rate": 2.8329408463043993e-05, + "loss": 1.8576, + "step": 77880 + }, + { + "epoch": 0.1516889378755954, + "grad_norm": 4.445952415466309, + "learning_rate": 2.8328777101741467e-05, + "loss": 1.9495, + "step": 77895 + }, + { + "epoch": 0.15171814814670567, + "grad_norm": 3.265141010284424, + "learning_rate": 2.832814562819543e-05, + "loss": 1.9069, + "step": 77910 + }, + { + "epoch": 0.15174735841781592, + "grad_norm": 2.6958353519439697, + "learning_rate": 2.832751404241121e-05, + "loss": 1.9092, + "step": 77925 + }, + { + "epoch": 0.1517765686889262, + "grad_norm": 1.9564762115478516, + "learning_rate": 2.8326882344394115e-05, + "loss": 1.8973, + "step": 77940 + }, + { + "epoch": 0.15180577896003644, + "grad_norm": 2.433993339538574, + "learning_rate": 2.8326250534149473e-05, + "loss": 1.8317, + "step": 77955 + }, + { + "epoch": 0.15183498923114672, + "grad_norm": 3.4923458099365234, + "learning_rate": 2.8325618611682598e-05, + "loss": 1.8179, + "step": 77970 + }, + { + "epoch": 0.151864199502257, + "grad_norm": 3.558544635772705, + "learning_rate": 2.8324986576998816e-05, + "loss": 1.8204, + "step": 77985 + }, + { + "epoch": 0.15189340977336724, + "grad_norm": 3.615591049194336, + "learning_rate": 2.8324354430103446e-05, + "loss": 1.8518, + "step": 78000 + }, + { + "epoch": 0.1519226200444775, + "grad_norm": 4.228688716888428, + "learning_rate": 2.8323722171001815e-05, + "loss": 1.7714, + "step": 78015 + }, + { + "epoch": 0.15195183031558776, + "grad_norm": 2.872915506362915, + "learning_rate": 2.832308979969924e-05, + "loss": 1.8045, + "step": 78030 + }, + { + "epoch": 0.15198104058669804, + "grad_norm": 2.5743587017059326, + "learning_rate": 2.832245731620106e-05, + "loss": 1.8226, + "step": 78045 + }, + { + "epoch": 0.15201025085780828, + "grad_norm": 3.8700995445251465, + "learning_rate": 2.8321824720512594e-05, + "loss": 1.8214, + "step": 78060 + }, + { + "epoch": 0.15203946112891856, + "grad_norm": 5.276907444000244, + "learning_rate": 2.8321192012639162e-05, + "loss": 2.0062, + "step": 78075 + }, + { + "epoch": 0.15206867140002883, + "grad_norm": 4.182982921600342, + "learning_rate": 2.8320559192586102e-05, + "loss": 1.9456, + "step": 78090 + }, + { + "epoch": 0.15209788167113908, + "grad_norm": 3.7196786403656006, + "learning_rate": 2.831992626035874e-05, + "loss": 1.8259, + "step": 78105 + }, + { + "epoch": 0.15212709194224935, + "grad_norm": 2.3887393474578857, + "learning_rate": 2.8319293215962403e-05, + "loss": 1.6931, + "step": 78120 + }, + { + "epoch": 0.1521563022133596, + "grad_norm": 2.1422173976898193, + "learning_rate": 2.831866005940243e-05, + "loss": 1.8647, + "step": 78135 + }, + { + "epoch": 0.15218551248446988, + "grad_norm": 2.7031822204589844, + "learning_rate": 2.8318026790684145e-05, + "loss": 1.9057, + "step": 78150 + }, + { + "epoch": 0.15221472275558012, + "grad_norm": 4.873050212860107, + "learning_rate": 2.8317393409812878e-05, + "loss": 1.8919, + "step": 78165 + }, + { + "epoch": 0.1522439330266904, + "grad_norm": 4.518728256225586, + "learning_rate": 2.8316759916793972e-05, + "loss": 1.8364, + "step": 78180 + }, + { + "epoch": 0.15227314329780067, + "grad_norm": 3.1657261848449707, + "learning_rate": 2.8316126311632758e-05, + "loss": 1.867, + "step": 78195 + }, + { + "epoch": 0.15230235356891092, + "grad_norm": 2.2308356761932373, + "learning_rate": 2.831549259433457e-05, + "loss": 2.1229, + "step": 78210 + }, + { + "epoch": 0.1523315638400212, + "grad_norm": 3.1004323959350586, + "learning_rate": 2.831485876490475e-05, + "loss": 2.0346, + "step": 78225 + }, + { + "epoch": 0.15236077411113144, + "grad_norm": 5.912225723266602, + "learning_rate": 2.8314224823348632e-05, + "loss": 1.817, + "step": 78240 + }, + { + "epoch": 0.15238998438224172, + "grad_norm": 4.399219989776611, + "learning_rate": 2.831359076967155e-05, + "loss": 1.8562, + "step": 78255 + }, + { + "epoch": 0.15241919465335196, + "grad_norm": 2.1678943634033203, + "learning_rate": 2.8312956603878846e-05, + "loss": 1.9265, + "step": 78270 + }, + { + "epoch": 0.15244840492446224, + "grad_norm": 7.064698219299316, + "learning_rate": 2.8312322325975866e-05, + "loss": 1.721, + "step": 78285 + }, + { + "epoch": 0.1524776151955725, + "grad_norm": 3.3917887210845947, + "learning_rate": 2.8311687935967944e-05, + "loss": 1.8232, + "step": 78300 + }, + { + "epoch": 0.15250682546668276, + "grad_norm": 2.3426544666290283, + "learning_rate": 2.831105343386043e-05, + "loss": 1.8651, + "step": 78315 + }, + { + "epoch": 0.15253603573779304, + "grad_norm": 2.822598457336426, + "learning_rate": 2.831041881965866e-05, + "loss": 1.9136, + "step": 78330 + }, + { + "epoch": 0.15256524600890328, + "grad_norm": 4.391366481781006, + "learning_rate": 2.8309784093367985e-05, + "loss": 2.1663, + "step": 78345 + }, + { + "epoch": 0.15259445628001356, + "grad_norm": 2.2223432064056396, + "learning_rate": 2.830914925499374e-05, + "loss": 1.7807, + "step": 78360 + }, + { + "epoch": 0.1526236665511238, + "grad_norm": 3.59328556060791, + "learning_rate": 2.830851430454128e-05, + "loss": 1.8543, + "step": 78375 + }, + { + "epoch": 0.15265287682223408, + "grad_norm": 2.2192344665527344, + "learning_rate": 2.830787924201595e-05, + "loss": 1.9838, + "step": 78390 + }, + { + "epoch": 0.15268208709334435, + "grad_norm": 5.444881439208984, + "learning_rate": 2.83072440674231e-05, + "loss": 1.6705, + "step": 78405 + }, + { + "epoch": 0.1527112973644546, + "grad_norm": 4.0161027908325195, + "learning_rate": 2.8306608780768072e-05, + "loss": 1.8973, + "step": 78420 + }, + { + "epoch": 0.15274050763556488, + "grad_norm": 3.008929967880249, + "learning_rate": 2.8305973382056223e-05, + "loss": 1.7605, + "step": 78435 + }, + { + "epoch": 0.15276971790667512, + "grad_norm": 2.6174750328063965, + "learning_rate": 2.83053378712929e-05, + "loss": 1.9428, + "step": 78450 + }, + { + "epoch": 0.1527989281777854, + "grad_norm": 3.401848793029785, + "learning_rate": 2.8304702248483453e-05, + "loss": 2.053, + "step": 78465 + }, + { + "epoch": 0.15282813844889565, + "grad_norm": 4.477932929992676, + "learning_rate": 2.8304066513633236e-05, + "loss": 1.7426, + "step": 78480 + }, + { + "epoch": 0.15285734872000592, + "grad_norm": 2.9167962074279785, + "learning_rate": 2.8303430666747605e-05, + "loss": 2.0137, + "step": 78495 + }, + { + "epoch": 0.1528865589911162, + "grad_norm": 3.704078197479248, + "learning_rate": 2.8302794707831915e-05, + "loss": 1.9082, + "step": 78510 + }, + { + "epoch": 0.15291576926222644, + "grad_norm": 2.659475326538086, + "learning_rate": 2.830215863689152e-05, + "loss": 1.9523, + "step": 78525 + }, + { + "epoch": 0.15294497953333672, + "grad_norm": 1.828421711921692, + "learning_rate": 2.8301522453931774e-05, + "loss": 1.6199, + "step": 78540 + }, + { + "epoch": 0.15297418980444696, + "grad_norm": 2.4483065605163574, + "learning_rate": 2.8300886158958042e-05, + "loss": 1.8224, + "step": 78555 + }, + { + "epoch": 0.15300340007555724, + "grad_norm": 3.4007294178009033, + "learning_rate": 2.830024975197567e-05, + "loss": 2.0813, + "step": 78570 + }, + { + "epoch": 0.15303261034666749, + "grad_norm": 2.7482893466949463, + "learning_rate": 2.829961323299003e-05, + "loss": 1.904, + "step": 78585 + }, + { + "epoch": 0.15306182061777776, + "grad_norm": 2.0441031455993652, + "learning_rate": 2.829897660200647e-05, + "loss": 1.7493, + "step": 78600 + }, + { + "epoch": 0.15309103088888804, + "grad_norm": 4.078094959259033, + "learning_rate": 2.8298339859030362e-05, + "loss": 2.0135, + "step": 78615 + }, + { + "epoch": 0.15312024115999828, + "grad_norm": 4.656898498535156, + "learning_rate": 2.8297703004067066e-05, + "loss": 1.6474, + "step": 78630 + }, + { + "epoch": 0.15314945143110856, + "grad_norm": 2.3605144023895264, + "learning_rate": 2.8297066037121938e-05, + "loss": 1.9051, + "step": 78645 + }, + { + "epoch": 0.1531786617022188, + "grad_norm": 4.127384662628174, + "learning_rate": 2.829642895820035e-05, + "loss": 1.9033, + "step": 78660 + }, + { + "epoch": 0.15320787197332908, + "grad_norm": 1.9077873229980469, + "learning_rate": 2.8295791767307658e-05, + "loss": 1.7017, + "step": 78675 + }, + { + "epoch": 0.15323708224443933, + "grad_norm": 2.2632439136505127, + "learning_rate": 2.829515446444924e-05, + "loss": 1.8146, + "step": 78690 + }, + { + "epoch": 0.1532662925155496, + "grad_norm": 5.7192230224609375, + "learning_rate": 2.8294517049630454e-05, + "loss": 1.7805, + "step": 78705 + }, + { + "epoch": 0.15329550278665988, + "grad_norm": 3.5982019901275635, + "learning_rate": 2.829387952285667e-05, + "loss": 2.0321, + "step": 78720 + }, + { + "epoch": 0.15332471305777012, + "grad_norm": 3.209650754928589, + "learning_rate": 2.829324188413326e-05, + "loss": 1.7485, + "step": 78735 + }, + { + "epoch": 0.1533539233288804, + "grad_norm": 2.90230393409729, + "learning_rate": 2.8292604133465587e-05, + "loss": 1.8029, + "step": 78750 + }, + { + "epoch": 0.15338313359999065, + "grad_norm": 4.478569030761719, + "learning_rate": 2.8291966270859025e-05, + "loss": 1.7969, + "step": 78765 + }, + { + "epoch": 0.15341234387110092, + "grad_norm": 2.163280487060547, + "learning_rate": 2.8291328296318947e-05, + "loss": 1.9994, + "step": 78780 + }, + { + "epoch": 0.15344155414221117, + "grad_norm": 2.539865255355835, + "learning_rate": 2.829069020985072e-05, + "loss": 1.8986, + "step": 78795 + }, + { + "epoch": 0.15347076441332144, + "grad_norm": 3.6060850620269775, + "learning_rate": 2.829005201145973e-05, + "loss": 1.7581, + "step": 78810 + }, + { + "epoch": 0.15349997468443172, + "grad_norm": 3.346775531768799, + "learning_rate": 2.8289413701151332e-05, + "loss": 1.7855, + "step": 78825 + }, + { + "epoch": 0.15352918495554196, + "grad_norm": 3.004331111907959, + "learning_rate": 2.828877527893092e-05, + "loss": 2.0464, + "step": 78840 + }, + { + "epoch": 0.15355839522665224, + "grad_norm": 4.538378715515137, + "learning_rate": 2.828813674480386e-05, + "loss": 1.8422, + "step": 78855 + }, + { + "epoch": 0.15358760549776249, + "grad_norm": 2.2600247859954834, + "learning_rate": 2.8287498098775532e-05, + "loss": 2.069, + "step": 78870 + }, + { + "epoch": 0.15361681576887276, + "grad_norm": 2.6263186931610107, + "learning_rate": 2.8286859340851313e-05, + "loss": 1.8724, + "step": 78885 + }, + { + "epoch": 0.153646026039983, + "grad_norm": 3.645336866378784, + "learning_rate": 2.8286220471036582e-05, + "loss": 1.8021, + "step": 78900 + }, + { + "epoch": 0.15367523631109328, + "grad_norm": 2.3809547424316406, + "learning_rate": 2.828558148933672e-05, + "loss": 1.8267, + "step": 78915 + }, + { + "epoch": 0.15370444658220356, + "grad_norm": 3.20304799079895, + "learning_rate": 2.828494239575711e-05, + "loss": 1.9897, + "step": 78930 + }, + { + "epoch": 0.1537336568533138, + "grad_norm": 6.630771160125732, + "learning_rate": 2.828430319030313e-05, + "loss": 1.8329, + "step": 78945 + }, + { + "epoch": 0.15376286712442408, + "grad_norm": 2.5891201496124268, + "learning_rate": 2.8283663872980166e-05, + "loss": 1.8601, + "step": 78960 + }, + { + "epoch": 0.15379207739553433, + "grad_norm": 3.903089761734009, + "learning_rate": 2.8283024443793598e-05, + "loss": 1.9186, + "step": 78975 + }, + { + "epoch": 0.1538212876666446, + "grad_norm": 4.2659783363342285, + "learning_rate": 2.8282384902748818e-05, + "loss": 1.7639, + "step": 78990 + }, + { + "epoch": 0.15385049793775485, + "grad_norm": 3.382220506668091, + "learning_rate": 2.82817452498512e-05, + "loss": 2.0548, + "step": 79005 + }, + { + "epoch": 0.15387970820886512, + "grad_norm": 2.5053818225860596, + "learning_rate": 2.828110548510614e-05, + "loss": 1.8697, + "step": 79020 + }, + { + "epoch": 0.1539089184799754, + "grad_norm": 3.027419090270996, + "learning_rate": 2.828046560851902e-05, + "loss": 1.6792, + "step": 79035 + }, + { + "epoch": 0.15393812875108565, + "grad_norm": 3.716770648956299, + "learning_rate": 2.8279825620095234e-05, + "loss": 1.8993, + "step": 79050 + }, + { + "epoch": 0.15396733902219592, + "grad_norm": 2.6639816761016846, + "learning_rate": 2.827918551984017e-05, + "loss": 1.8517, + "step": 79065 + }, + { + "epoch": 0.15399654929330617, + "grad_norm": 2.7598884105682373, + "learning_rate": 2.827854530775922e-05, + "loss": 1.7585, + "step": 79080 + }, + { + "epoch": 0.15402575956441644, + "grad_norm": 2.4604318141937256, + "learning_rate": 2.8277904983857764e-05, + "loss": 1.7874, + "step": 79095 + }, + { + "epoch": 0.1540549698355267, + "grad_norm": 4.252652168273926, + "learning_rate": 2.8277264548141206e-05, + "loss": 2.1106, + "step": 79110 + }, + { + "epoch": 0.15408418010663696, + "grad_norm": 4.358330726623535, + "learning_rate": 2.8276624000614938e-05, + "loss": 1.9435, + "step": 79125 + }, + { + "epoch": 0.15411339037774724, + "grad_norm": 2.592512369155884, + "learning_rate": 2.8275983341284352e-05, + "loss": 1.9553, + "step": 79140 + }, + { + "epoch": 0.1541426006488575, + "grad_norm": 2.6024131774902344, + "learning_rate": 2.827534257015484e-05, + "loss": 1.7518, + "step": 79155 + }, + { + "epoch": 0.15417181091996776, + "grad_norm": 5.048944473266602, + "learning_rate": 2.8274701687231805e-05, + "loss": 1.8137, + "step": 79170 + }, + { + "epoch": 0.154201021191078, + "grad_norm": 3.6628575325012207, + "learning_rate": 2.8274060692520634e-05, + "loss": 1.8706, + "step": 79185 + }, + { + "epoch": 0.15423023146218828, + "grad_norm": 3.3859686851501465, + "learning_rate": 2.8273419586026736e-05, + "loss": 1.8748, + "step": 79200 + }, + { + "epoch": 0.15425944173329853, + "grad_norm": 4.2406463623046875, + "learning_rate": 2.8272778367755503e-05, + "loss": 1.6969, + "step": 79215 + }, + { + "epoch": 0.1542886520044088, + "grad_norm": 2.8845160007476807, + "learning_rate": 2.8272137037712337e-05, + "loss": 1.8789, + "step": 79230 + }, + { + "epoch": 0.15431786227551908, + "grad_norm": 2.0660598278045654, + "learning_rate": 2.8271495595902642e-05, + "loss": 1.9238, + "step": 79245 + }, + { + "epoch": 0.15434707254662933, + "grad_norm": 2.2481045722961426, + "learning_rate": 2.827085404233181e-05, + "loss": 1.9048, + "step": 79260 + }, + { + "epoch": 0.1543762828177396, + "grad_norm": 2.678368091583252, + "learning_rate": 2.8270212377005253e-05, + "loss": 1.7065, + "step": 79275 + }, + { + "epoch": 0.15440549308884985, + "grad_norm": 5.001224040985107, + "learning_rate": 2.826957059992837e-05, + "loss": 1.9136, + "step": 79290 + }, + { + "epoch": 0.15443470335996012, + "grad_norm": 3.2356767654418945, + "learning_rate": 2.826892871110657e-05, + "loss": 1.7515, + "step": 79305 + }, + { + "epoch": 0.15446391363107037, + "grad_norm": 2.5089375972747803, + "learning_rate": 2.826828671054525e-05, + "loss": 2.0012, + "step": 79320 + }, + { + "epoch": 0.15449312390218065, + "grad_norm": 2.7956724166870117, + "learning_rate": 2.8267644598249823e-05, + "loss": 2.0048, + "step": 79335 + }, + { + "epoch": 0.15452233417329092, + "grad_norm": 2.7638449668884277, + "learning_rate": 2.8267002374225693e-05, + "loss": 1.8283, + "step": 79350 + }, + { + "epoch": 0.15455154444440117, + "grad_norm": 3.8453621864318848, + "learning_rate": 2.8266360038478276e-05, + "loss": 2.0384, + "step": 79365 + }, + { + "epoch": 0.15458075471551144, + "grad_norm": 2.4856810569763184, + "learning_rate": 2.8265717591012973e-05, + "loss": 1.7846, + "step": 79380 + }, + { + "epoch": 0.1546099649866217, + "grad_norm": 2.769892692565918, + "learning_rate": 2.8265075031835198e-05, + "loss": 1.961, + "step": 79395 + }, + { + "epoch": 0.15463917525773196, + "grad_norm": 2.300211191177368, + "learning_rate": 2.8264432360950355e-05, + "loss": 1.8821, + "step": 79410 + }, + { + "epoch": 0.1546683855288422, + "grad_norm": 2.995603084564209, + "learning_rate": 2.8263789578363863e-05, + "loss": 2.0535, + "step": 79425 + }, + { + "epoch": 0.1546975957999525, + "grad_norm": 3.6719794273376465, + "learning_rate": 2.8263146684081135e-05, + "loss": 1.793, + "step": 79440 + }, + { + "epoch": 0.15472680607106276, + "grad_norm": 2.3180785179138184, + "learning_rate": 2.8262503678107585e-05, + "loss": 1.9727, + "step": 79455 + }, + { + "epoch": 0.154756016342173, + "grad_norm": 3.0216362476348877, + "learning_rate": 2.8261860560448625e-05, + "loss": 1.8748, + "step": 79470 + }, + { + "epoch": 0.15478522661328328, + "grad_norm": 3.4041905403137207, + "learning_rate": 2.826121733110967e-05, + "loss": 2.0407, + "step": 79485 + }, + { + "epoch": 0.15481443688439353, + "grad_norm": 5.5904998779296875, + "learning_rate": 2.826057399009614e-05, + "loss": 1.878, + "step": 79500 + }, + { + "epoch": 0.1548436471555038, + "grad_norm": 1.790554404258728, + "learning_rate": 2.8259930537413452e-05, + "loss": 1.9139, + "step": 79515 + }, + { + "epoch": 0.15487285742661405, + "grad_norm": 4.14886999130249, + "learning_rate": 2.8259286973067022e-05, + "loss": 1.97, + "step": 79530 + }, + { + "epoch": 0.15490206769772433, + "grad_norm": 3.3898847103118896, + "learning_rate": 2.8258643297062276e-05, + "loss": 1.7091, + "step": 79545 + }, + { + "epoch": 0.15493127796883457, + "grad_norm": 2.1362826824188232, + "learning_rate": 2.825799950940463e-05, + "loss": 1.9073, + "step": 79560 + }, + { + "epoch": 0.15496048823994485, + "grad_norm": 3.1636834144592285, + "learning_rate": 2.82573556100995e-05, + "loss": 1.6895, + "step": 79575 + }, + { + "epoch": 0.15498969851105512, + "grad_norm": 3.876979351043701, + "learning_rate": 2.8256711599152316e-05, + "loss": 1.6817, + "step": 79590 + }, + { + "epoch": 0.15501890878216537, + "grad_norm": 3.0398216247558594, + "learning_rate": 2.82560674765685e-05, + "loss": 2.0609, + "step": 79605 + }, + { + "epoch": 0.15504811905327565, + "grad_norm": 3.3018503189086914, + "learning_rate": 2.8255423242353474e-05, + "loss": 1.8369, + "step": 79620 + }, + { + "epoch": 0.1550773293243859, + "grad_norm": 3.8785617351531982, + "learning_rate": 2.8254778896512667e-05, + "loss": 1.9356, + "step": 79635 + }, + { + "epoch": 0.15510653959549617, + "grad_norm": 2.8071508407592773, + "learning_rate": 2.8254134439051502e-05, + "loss": 1.8889, + "step": 79650 + }, + { + "epoch": 0.15513574986660641, + "grad_norm": 2.667494535446167, + "learning_rate": 2.8253489869975404e-05, + "loss": 1.6819, + "step": 79665 + }, + { + "epoch": 0.1551649601377167, + "grad_norm": 2.1014106273651123, + "learning_rate": 2.825284518928981e-05, + "loss": 1.8705, + "step": 79680 + }, + { + "epoch": 0.15519417040882696, + "grad_norm": 3.070732831954956, + "learning_rate": 2.8252200397000135e-05, + "loss": 1.9042, + "step": 79695 + }, + { + "epoch": 0.1552233806799372, + "grad_norm": 3.788663387298584, + "learning_rate": 2.8251555493111824e-05, + "loss": 1.6991, + "step": 79710 + }, + { + "epoch": 0.1552525909510475, + "grad_norm": 2.8029448986053467, + "learning_rate": 2.8250910477630293e-05, + "loss": 1.8334, + "step": 79725 + }, + { + "epoch": 0.15528180122215773, + "grad_norm": 3.6765177249908447, + "learning_rate": 2.8250265350560984e-05, + "loss": 1.8011, + "step": 79740 + }, + { + "epoch": 0.155311011493268, + "grad_norm": 2.792893648147583, + "learning_rate": 2.824962011190933e-05, + "loss": 1.8635, + "step": 79755 + }, + { + "epoch": 0.15534022176437826, + "grad_norm": 4.286826133728027, + "learning_rate": 2.824897476168076e-05, + "loss": 1.9719, + "step": 79770 + }, + { + "epoch": 0.15536943203548853, + "grad_norm": 2.677119255065918, + "learning_rate": 2.8248329299880707e-05, + "loss": 1.9105, + "step": 79785 + }, + { + "epoch": 0.1553986423065988, + "grad_norm": 3.5076262950897217, + "learning_rate": 2.8247683726514613e-05, + "loss": 1.7995, + "step": 79800 + }, + { + "epoch": 0.15542785257770905, + "grad_norm": 2.4521238803863525, + "learning_rate": 2.824703804158791e-05, + "loss": 1.8412, + "step": 79815 + }, + { + "epoch": 0.15545706284881933, + "grad_norm": 3.3617990016937256, + "learning_rate": 2.8246392245106035e-05, + "loss": 2.2377, + "step": 79830 + }, + { + "epoch": 0.15548627311992957, + "grad_norm": 2.606064796447754, + "learning_rate": 2.824574633707443e-05, + "loss": 2.0256, + "step": 79845 + }, + { + "epoch": 0.15551548339103985, + "grad_norm": 7.699562072753906, + "learning_rate": 2.824510031749853e-05, + "loss": 1.9742, + "step": 79860 + }, + { + "epoch": 0.1555446936621501, + "grad_norm": 4.098522186279297, + "learning_rate": 2.8244454186383784e-05, + "loss": 1.9751, + "step": 79875 + }, + { + "epoch": 0.15557390393326037, + "grad_norm": 3.5030972957611084, + "learning_rate": 2.824380794373562e-05, + "loss": 1.9082, + "step": 79890 + }, + { + "epoch": 0.15560311420437065, + "grad_norm": 3.721139669418335, + "learning_rate": 2.8243161589559486e-05, + "loss": 1.8799, + "step": 79905 + }, + { + "epoch": 0.1556323244754809, + "grad_norm": 3.2253010272979736, + "learning_rate": 2.824251512386083e-05, + "loss": 1.89, + "step": 79920 + }, + { + "epoch": 0.15566153474659117, + "grad_norm": 2.433263063430786, + "learning_rate": 2.824186854664509e-05, + "loss": 1.9504, + "step": 79935 + }, + { + "epoch": 0.15569074501770142, + "grad_norm": 3.994739294052124, + "learning_rate": 2.824122185791771e-05, + "loss": 1.9691, + "step": 79950 + }, + { + "epoch": 0.1557199552888117, + "grad_norm": 4.70686674118042, + "learning_rate": 2.8240575057684143e-05, + "loss": 1.8391, + "step": 79965 + }, + { + "epoch": 0.15574916555992194, + "grad_norm": 2.2091572284698486, + "learning_rate": 2.823992814594983e-05, + "loss": 1.6705, + "step": 79980 + }, + { + "epoch": 0.1557783758310322, + "grad_norm": 5.216940402984619, + "learning_rate": 2.8239281122720218e-05, + "loss": 1.9197, + "step": 79995 + }, + { + "epoch": 0.1558075861021425, + "grad_norm": 3.534649133682251, + "learning_rate": 2.823863398800076e-05, + "loss": 1.9008, + "step": 80010 + }, + { + "epoch": 0.15583679637325273, + "grad_norm": 2.2800536155700684, + "learning_rate": 2.82379867417969e-05, + "loss": 1.9644, + "step": 80025 + }, + { + "epoch": 0.155866006644363, + "grad_norm": 3.039250135421753, + "learning_rate": 2.8237339384114095e-05, + "loss": 2.0692, + "step": 80040 + }, + { + "epoch": 0.15589521691547326, + "grad_norm": 4.483944416046143, + "learning_rate": 2.8236691914957792e-05, + "loss": 1.7982, + "step": 80055 + }, + { + "epoch": 0.15592442718658353, + "grad_norm": 2.564791679382324, + "learning_rate": 2.823604433433344e-05, + "loss": 2.0609, + "step": 80070 + }, + { + "epoch": 0.15595363745769378, + "grad_norm": 2.87155818939209, + "learning_rate": 2.8235396642246503e-05, + "loss": 1.9072, + "step": 80085 + }, + { + "epoch": 0.15598284772880405, + "grad_norm": 2.6038753986358643, + "learning_rate": 2.823474883870243e-05, + "loss": 1.674, + "step": 80100 + }, + { + "epoch": 0.15601205799991433, + "grad_norm": 2.528726816177368, + "learning_rate": 2.8234100923706672e-05, + "loss": 1.6888, + "step": 80115 + }, + { + "epoch": 0.15604126827102457, + "grad_norm": 2.873342514038086, + "learning_rate": 2.823345289726469e-05, + "loss": 1.8763, + "step": 80130 + }, + { + "epoch": 0.15607047854213485, + "grad_norm": 2.239096164703369, + "learning_rate": 2.8232804759381943e-05, + "loss": 2.0389, + "step": 80145 + }, + { + "epoch": 0.1560996888132451, + "grad_norm": 2.548341751098633, + "learning_rate": 2.823215651006388e-05, + "loss": 1.8693, + "step": 80160 + }, + { + "epoch": 0.15612889908435537, + "grad_norm": 3.8689095973968506, + "learning_rate": 2.8231508149315972e-05, + "loss": 1.7703, + "step": 80175 + }, + { + "epoch": 0.15615810935546562, + "grad_norm": 2.5180134773254395, + "learning_rate": 2.8230859677143666e-05, + "loss": 1.92, + "step": 80190 + }, + { + "epoch": 0.1561873196265759, + "grad_norm": 3.2880468368530273, + "learning_rate": 2.8230211093552437e-05, + "loss": 2.0154, + "step": 80205 + }, + { + "epoch": 0.15621652989768617, + "grad_norm": 2.8187382221221924, + "learning_rate": 2.8229562398547736e-05, + "loss": 2.2022, + "step": 80220 + }, + { + "epoch": 0.15624574016879642, + "grad_norm": 2.151110887527466, + "learning_rate": 2.8228913592135033e-05, + "loss": 1.8486, + "step": 80235 + }, + { + "epoch": 0.1562749504399067, + "grad_norm": 3.39892315864563, + "learning_rate": 2.8228264674319782e-05, + "loss": 1.861, + "step": 80250 + }, + { + "epoch": 0.15630416071101694, + "grad_norm": 2.5439517498016357, + "learning_rate": 2.8227615645107457e-05, + "loss": 1.7235, + "step": 80265 + }, + { + "epoch": 0.1563333709821272, + "grad_norm": 2.7133395671844482, + "learning_rate": 2.822696650450352e-05, + "loss": 1.9489, + "step": 80280 + }, + { + "epoch": 0.15636258125323746, + "grad_norm": 3.359396457672119, + "learning_rate": 2.8226317252513435e-05, + "loss": 1.9399, + "step": 80295 + }, + { + "epoch": 0.15639179152434773, + "grad_norm": 2.342132329940796, + "learning_rate": 2.8225667889142678e-05, + "loss": 1.7111, + "step": 80310 + }, + { + "epoch": 0.156421001795458, + "grad_norm": 3.6000783443450928, + "learning_rate": 2.822501841439671e-05, + "loss": 1.9435, + "step": 80325 + }, + { + "epoch": 0.15645021206656826, + "grad_norm": 3.303403615951538, + "learning_rate": 2.8224368828281e-05, + "loss": 1.9636, + "step": 80340 + }, + { + "epoch": 0.15647942233767853, + "grad_norm": 2.425217866897583, + "learning_rate": 2.8223719130801017e-05, + "loss": 1.9308, + "step": 80355 + }, + { + "epoch": 0.15650863260878878, + "grad_norm": 3.51242995262146, + "learning_rate": 2.822306932196224e-05, + "loss": 1.7706, + "step": 80370 + }, + { + "epoch": 0.15653784287989905, + "grad_norm": 3.1173760890960693, + "learning_rate": 2.8222419401770137e-05, + "loss": 1.8252, + "step": 80385 + }, + { + "epoch": 0.1565670531510093, + "grad_norm": 2.3988189697265625, + "learning_rate": 2.8221769370230183e-05, + "loss": 1.8108, + "step": 80400 + }, + { + "epoch": 0.15659626342211957, + "grad_norm": 5.308299541473389, + "learning_rate": 2.8221119227347846e-05, + "loss": 1.8577, + "step": 80415 + }, + { + "epoch": 0.15662547369322985, + "grad_norm": 1.6429029703140259, + "learning_rate": 2.8220468973128602e-05, + "loss": 2.0189, + "step": 80430 + }, + { + "epoch": 0.1566546839643401, + "grad_norm": 3.271713972091675, + "learning_rate": 2.8219818607577932e-05, + "loss": 1.9731, + "step": 80445 + }, + { + "epoch": 0.15668389423545037, + "grad_norm": 2.580191135406494, + "learning_rate": 2.821916813070131e-05, + "loss": 1.9169, + "step": 80460 + }, + { + "epoch": 0.15671310450656062, + "grad_norm": 3.6132266521453857, + "learning_rate": 2.821851754250421e-05, + "loss": 2.0049, + "step": 80475 + }, + { + "epoch": 0.1567423147776709, + "grad_norm": 4.096710681915283, + "learning_rate": 2.821786684299212e-05, + "loss": 1.8497, + "step": 80490 + }, + { + "epoch": 0.15677152504878114, + "grad_norm": 2.9794976711273193, + "learning_rate": 2.8217216032170513e-05, + "loss": 1.9236, + "step": 80505 + }, + { + "epoch": 0.15680073531989142, + "grad_norm": 3.6211843490600586, + "learning_rate": 2.8216565110044868e-05, + "loss": 1.9154, + "step": 80520 + }, + { + "epoch": 0.1568299455910017, + "grad_norm": 3.4639501571655273, + "learning_rate": 2.8215914076620673e-05, + "loss": 1.8078, + "step": 80535 + }, + { + "epoch": 0.15685915586211194, + "grad_norm": 3.9276089668273926, + "learning_rate": 2.8215262931903406e-05, + "loss": 1.8165, + "step": 80550 + }, + { + "epoch": 0.1568883661332222, + "grad_norm": 2.472640037536621, + "learning_rate": 2.821461167589855e-05, + "loss": 1.7604, + "step": 80565 + }, + { + "epoch": 0.15691757640433246, + "grad_norm": 3.4727706909179688, + "learning_rate": 2.8213960308611594e-05, + "loss": 1.99, + "step": 80580 + }, + { + "epoch": 0.15694678667544273, + "grad_norm": 3.616140365600586, + "learning_rate": 2.8213308830048017e-05, + "loss": 1.8167, + "step": 80595 + }, + { + "epoch": 0.15697599694655298, + "grad_norm": 4.975011825561523, + "learning_rate": 2.8212657240213307e-05, + "loss": 1.7281, + "step": 80610 + }, + { + "epoch": 0.15700520721766326, + "grad_norm": 4.45720100402832, + "learning_rate": 2.821200553911295e-05, + "loss": 1.7145, + "step": 80625 + }, + { + "epoch": 0.15703441748877353, + "grad_norm": 2.319274663925171, + "learning_rate": 2.8211353726752443e-05, + "loss": 2.0067, + "step": 80640 + }, + { + "epoch": 0.15706362775988378, + "grad_norm": 3.2491366863250732, + "learning_rate": 2.8210701803137264e-05, + "loss": 1.7731, + "step": 80655 + }, + { + "epoch": 0.15709283803099405, + "grad_norm": 1.903242826461792, + "learning_rate": 2.8210049768272907e-05, + "loss": 1.9279, + "step": 80670 + }, + { + "epoch": 0.1571220483021043, + "grad_norm": 2.033527374267578, + "learning_rate": 2.8209397622164866e-05, + "loss": 1.6304, + "step": 80685 + }, + { + "epoch": 0.15715125857321458, + "grad_norm": 3.9363932609558105, + "learning_rate": 2.8208745364818625e-05, + "loss": 1.6908, + "step": 80700 + }, + { + "epoch": 0.15718046884432482, + "grad_norm": 2.696791172027588, + "learning_rate": 2.8208092996239688e-05, + "loss": 1.9093, + "step": 80715 + }, + { + "epoch": 0.1572096791154351, + "grad_norm": 2.7116141319274902, + "learning_rate": 2.8207440516433535e-05, + "loss": 1.8245, + "step": 80730 + }, + { + "epoch": 0.15723888938654537, + "grad_norm": 3.7848494052886963, + "learning_rate": 2.8206787925405674e-05, + "loss": 1.9194, + "step": 80745 + }, + { + "epoch": 0.15726809965765562, + "grad_norm": 5.806147575378418, + "learning_rate": 2.820613522316159e-05, + "loss": 1.9716, + "step": 80760 + }, + { + "epoch": 0.1572973099287659, + "grad_norm": 4.238548755645752, + "learning_rate": 2.8205482409706788e-05, + "loss": 1.8155, + "step": 80775 + }, + { + "epoch": 0.15732652019987614, + "grad_norm": 3.4387500286102295, + "learning_rate": 2.8204829485046754e-05, + "loss": 1.8225, + "step": 80790 + }, + { + "epoch": 0.15735573047098642, + "grad_norm": 2.0617027282714844, + "learning_rate": 2.8204176449187e-05, + "loss": 1.8651, + "step": 80805 + }, + { + "epoch": 0.15738494074209666, + "grad_norm": 2.27787446975708, + "learning_rate": 2.820352330213302e-05, + "loss": 1.7399, + "step": 80820 + }, + { + "epoch": 0.15741415101320694, + "grad_norm": 3.803090810775757, + "learning_rate": 2.820287004389031e-05, + "loss": 1.87, + "step": 80835 + }, + { + "epoch": 0.1574433612843172, + "grad_norm": 3.44406795501709, + "learning_rate": 2.8202216674464373e-05, + "loss": 1.8545, + "step": 80850 + }, + { + "epoch": 0.15747257155542746, + "grad_norm": 2.483839511871338, + "learning_rate": 2.8201563193860712e-05, + "loss": 1.8745, + "step": 80865 + }, + { + "epoch": 0.15750178182653773, + "grad_norm": 2.1469671726226807, + "learning_rate": 2.8200909602084837e-05, + "loss": 1.9102, + "step": 80880 + }, + { + "epoch": 0.15753099209764798, + "grad_norm": 3.3669514656066895, + "learning_rate": 2.8200255899142243e-05, + "loss": 1.7938, + "step": 80895 + }, + { + "epoch": 0.15756020236875826, + "grad_norm": 3.3053646087646484, + "learning_rate": 2.8199602085038434e-05, + "loss": 1.9703, + "step": 80910 + }, + { + "epoch": 0.1575894126398685, + "grad_norm": 2.778557538986206, + "learning_rate": 2.8198948159778925e-05, + "loss": 1.8247, + "step": 80925 + }, + { + "epoch": 0.15761862291097878, + "grad_norm": 3.980415105819702, + "learning_rate": 2.819829412336921e-05, + "loss": 1.8455, + "step": 80940 + }, + { + "epoch": 0.15764783318208905, + "grad_norm": 2.2206199169158936, + "learning_rate": 2.819763997581481e-05, + "loss": 1.8953, + "step": 80955 + }, + { + "epoch": 0.1576770434531993, + "grad_norm": 3.5980825424194336, + "learning_rate": 2.8196985717121228e-05, + "loss": 2.0573, + "step": 80970 + }, + { + "epoch": 0.15770625372430958, + "grad_norm": 2.140207290649414, + "learning_rate": 2.819633134729397e-05, + "loss": 1.7948, + "step": 80985 + }, + { + "epoch": 0.15773546399541982, + "grad_norm": 3.9434874057769775, + "learning_rate": 2.8195676866338552e-05, + "loss": 1.7207, + "step": 81000 + }, + { + "epoch": 0.1577646742665301, + "grad_norm": 3.0137832164764404, + "learning_rate": 2.8195022274260477e-05, + "loss": 1.9754, + "step": 81015 + }, + { + "epoch": 0.15779388453764034, + "grad_norm": 4.5265212059021, + "learning_rate": 2.819436757106527e-05, + "loss": 1.9262, + "step": 81030 + }, + { + "epoch": 0.15782309480875062, + "grad_norm": 2.1957404613494873, + "learning_rate": 2.8193712756758433e-05, + "loss": 1.7842, + "step": 81045 + }, + { + "epoch": 0.1578523050798609, + "grad_norm": 4.568048477172852, + "learning_rate": 2.8193057831345492e-05, + "loss": 1.8191, + "step": 81060 + }, + { + "epoch": 0.15788151535097114, + "grad_norm": 3.4339022636413574, + "learning_rate": 2.8192402794831953e-05, + "loss": 1.8252, + "step": 81075 + }, + { + "epoch": 0.15791072562208142, + "grad_norm": 3.2385871410369873, + "learning_rate": 2.819174764722333e-05, + "loss": 1.7918, + "step": 81090 + }, + { + "epoch": 0.15793993589319166, + "grad_norm": 3.356942653656006, + "learning_rate": 2.819109238852515e-05, + "loss": 1.671, + "step": 81105 + }, + { + "epoch": 0.15796914616430194, + "grad_norm": 4.490026473999023, + "learning_rate": 2.8190437018742923e-05, + "loss": 1.7759, + "step": 81120 + }, + { + "epoch": 0.15799835643541218, + "grad_norm": 4.227987289428711, + "learning_rate": 2.8189781537882175e-05, + "loss": 2.1198, + "step": 81135 + }, + { + "epoch": 0.15802756670652246, + "grad_norm": 3.9179623126983643, + "learning_rate": 2.8189125945948417e-05, + "loss": 1.7809, + "step": 81150 + }, + { + "epoch": 0.15805677697763273, + "grad_norm": 2.4316089153289795, + "learning_rate": 2.8188470242947177e-05, + "loss": 1.8624, + "step": 81165 + }, + { + "epoch": 0.15808598724874298, + "grad_norm": 2.131746292114258, + "learning_rate": 2.8187814428883972e-05, + "loss": 1.8554, + "step": 81180 + }, + { + "epoch": 0.15811519751985326, + "grad_norm": 4.319799423217773, + "learning_rate": 2.8187158503764326e-05, + "loss": 1.7918, + "step": 81195 + }, + { + "epoch": 0.1581444077909635, + "grad_norm": 2.7815661430358887, + "learning_rate": 2.8186502467593765e-05, + "loss": 1.91, + "step": 81210 + }, + { + "epoch": 0.15817361806207378, + "grad_norm": 2.4438674449920654, + "learning_rate": 2.818584632037781e-05, + "loss": 1.8411, + "step": 81225 + }, + { + "epoch": 0.15820282833318403, + "grad_norm": 2.317816734313965, + "learning_rate": 2.8185190062121994e-05, + "loss": 1.8286, + "step": 81240 + }, + { + "epoch": 0.1582320386042943, + "grad_norm": 3.226630687713623, + "learning_rate": 2.8184533692831834e-05, + "loss": 1.6863, + "step": 81255 + }, + { + "epoch": 0.15826124887540458, + "grad_norm": 2.3429477214813232, + "learning_rate": 2.8183877212512865e-05, + "loss": 1.8728, + "step": 81270 + }, + { + "epoch": 0.15829045914651482, + "grad_norm": 2.736405611038208, + "learning_rate": 2.818322062117061e-05, + "loss": 1.9654, + "step": 81285 + }, + { + "epoch": 0.1583196694176251, + "grad_norm": 3.84123158454895, + "learning_rate": 2.81825639188106e-05, + "loss": 1.7206, + "step": 81300 + }, + { + "epoch": 0.15834887968873534, + "grad_norm": 3.439379930496216, + "learning_rate": 2.8181907105438364e-05, + "loss": 1.7402, + "step": 81315 + }, + { + "epoch": 0.15837808995984562, + "grad_norm": 2.444263219833374, + "learning_rate": 2.8181250181059436e-05, + "loss": 1.792, + "step": 81330 + }, + { + "epoch": 0.15840730023095587, + "grad_norm": 3.232215642929077, + "learning_rate": 2.8180593145679343e-05, + "loss": 1.7666, + "step": 81345 + }, + { + "epoch": 0.15843651050206614, + "grad_norm": 5.52445650100708, + "learning_rate": 2.8179935999303627e-05, + "loss": 1.9929, + "step": 81360 + }, + { + "epoch": 0.15846572077317642, + "grad_norm": 3.5913734436035156, + "learning_rate": 2.817927874193781e-05, + "loss": 1.9121, + "step": 81375 + }, + { + "epoch": 0.15849493104428666, + "grad_norm": 3.4441328048706055, + "learning_rate": 2.8178621373587438e-05, + "loss": 1.9429, + "step": 81390 + }, + { + "epoch": 0.15852414131539694, + "grad_norm": 3.983950614929199, + "learning_rate": 2.817796389425804e-05, + "loss": 1.6985, + "step": 81405 + }, + { + "epoch": 0.15855335158650719, + "grad_norm": 3.0761523246765137, + "learning_rate": 2.8177306303955156e-05, + "loss": 1.9902, + "step": 81420 + }, + { + "epoch": 0.15858256185761746, + "grad_norm": 3.388519287109375, + "learning_rate": 2.8176648602684317e-05, + "loss": 1.8442, + "step": 81435 + }, + { + "epoch": 0.1586117721287277, + "grad_norm": 1.9653489589691162, + "learning_rate": 2.817599079045107e-05, + "loss": 1.9087, + "step": 81450 + }, + { + "epoch": 0.15864098239983798, + "grad_norm": 4.192715167999268, + "learning_rate": 2.8175332867260953e-05, + "loss": 1.8524, + "step": 81465 + }, + { + "epoch": 0.15867019267094826, + "grad_norm": 2.1585469245910645, + "learning_rate": 2.8174674833119508e-05, + "loss": 1.6773, + "step": 81480 + }, + { + "epoch": 0.1586994029420585, + "grad_norm": 4.216448783874512, + "learning_rate": 2.8174016688032265e-05, + "loss": 1.8415, + "step": 81495 + }, + { + "epoch": 0.15872861321316878, + "grad_norm": 3.49615478515625, + "learning_rate": 2.817335843200478e-05, + "loss": 1.8557, + "step": 81510 + }, + { + "epoch": 0.15875782348427903, + "grad_norm": 3.6683173179626465, + "learning_rate": 2.817270006504259e-05, + "loss": 1.757, + "step": 81525 + }, + { + "epoch": 0.1587870337553893, + "grad_norm": 3.743548631668091, + "learning_rate": 2.8172041587151238e-05, + "loss": 1.852, + "step": 81540 + }, + { + "epoch": 0.15881624402649955, + "grad_norm": 2.46884822845459, + "learning_rate": 2.8171382998336274e-05, + "loss": 1.8199, + "step": 81555 + }, + { + "epoch": 0.15884545429760982, + "grad_norm": 5.1284403800964355, + "learning_rate": 2.8170724298603244e-05, + "loss": 1.7776, + "step": 81570 + }, + { + "epoch": 0.1588746645687201, + "grad_norm": 2.0639679431915283, + "learning_rate": 2.8170065487957687e-05, + "loss": 1.8862, + "step": 81585 + }, + { + "epoch": 0.15890387483983034, + "grad_norm": 2.5327508449554443, + "learning_rate": 2.8169406566405157e-05, + "loss": 1.8155, + "step": 81600 + }, + { + "epoch": 0.15893308511094062, + "grad_norm": 2.421888828277588, + "learning_rate": 2.81687475339512e-05, + "loss": 1.8175, + "step": 81615 + }, + { + "epoch": 0.15896229538205087, + "grad_norm": 5.192629337310791, + "learning_rate": 2.8168088390601376e-05, + "loss": 1.8812, + "step": 81630 + }, + { + "epoch": 0.15899150565316114, + "grad_norm": 3.4022157192230225, + "learning_rate": 2.8167429136361222e-05, + "loss": 1.9217, + "step": 81645 + }, + { + "epoch": 0.1590207159242714, + "grad_norm": 3.3692498207092285, + "learning_rate": 2.8166769771236297e-05, + "loss": 1.7387, + "step": 81660 + }, + { + "epoch": 0.15904992619538166, + "grad_norm": 3.6598587036132812, + "learning_rate": 2.816611029523215e-05, + "loss": 1.7733, + "step": 81675 + }, + { + "epoch": 0.1590791364664919, + "grad_norm": 3.3557143211364746, + "learning_rate": 2.8165450708354334e-05, + "loss": 1.9061, + "step": 81690 + }, + { + "epoch": 0.15910834673760219, + "grad_norm": 1.6506764888763428, + "learning_rate": 2.816479101060841e-05, + "loss": 1.8483, + "step": 81705 + }, + { + "epoch": 0.15913755700871246, + "grad_norm": 3.343931198120117, + "learning_rate": 2.8164131201999928e-05, + "loss": 1.7593, + "step": 81720 + }, + { + "epoch": 0.1591667672798227, + "grad_norm": 4.288000106811523, + "learning_rate": 2.8163471282534446e-05, + "loss": 1.9112, + "step": 81735 + }, + { + "epoch": 0.15919597755093298, + "grad_norm": 2.1892921924591064, + "learning_rate": 2.816281125221752e-05, + "loss": 1.8529, + "step": 81750 + }, + { + "epoch": 0.15922518782204323, + "grad_norm": 2.477942705154419, + "learning_rate": 2.8162151111054712e-05, + "loss": 2.045, + "step": 81765 + }, + { + "epoch": 0.1592543980931535, + "grad_norm": 3.9407200813293457, + "learning_rate": 2.8161490859051576e-05, + "loss": 1.7453, + "step": 81780 + }, + { + "epoch": 0.15928360836426375, + "grad_norm": 3.1561851501464844, + "learning_rate": 2.8160830496213678e-05, + "loss": 2.077, + "step": 81795 + }, + { + "epoch": 0.15931281863537403, + "grad_norm": 2.5603551864624023, + "learning_rate": 2.8160170022546573e-05, + "loss": 1.7758, + "step": 81810 + }, + { + "epoch": 0.1593420289064843, + "grad_norm": 2.593172311782837, + "learning_rate": 2.8159509438055823e-05, + "loss": 1.7883, + "step": 81825 + }, + { + "epoch": 0.15937123917759455, + "grad_norm": 3.0641725063323975, + "learning_rate": 2.8158848742746993e-05, + "loss": 1.7223, + "step": 81840 + }, + { + "epoch": 0.15940044944870482, + "grad_norm": 3.310239791870117, + "learning_rate": 2.8158187936625654e-05, + "loss": 1.8615, + "step": 81855 + }, + { + "epoch": 0.15942965971981507, + "grad_norm": 3.176450729370117, + "learning_rate": 2.8157527019697355e-05, + "loss": 1.9685, + "step": 81870 + }, + { + "epoch": 0.15945886999092534, + "grad_norm": 2.5300955772399902, + "learning_rate": 2.8156865991967673e-05, + "loss": 1.9685, + "step": 81885 + }, + { + "epoch": 0.1594880802620356, + "grad_norm": 2.585354804992676, + "learning_rate": 2.8156204853442172e-05, + "loss": 1.7363, + "step": 81900 + }, + { + "epoch": 0.15951729053314587, + "grad_norm": 3.581352949142456, + "learning_rate": 2.8155543604126425e-05, + "loss": 2.0956, + "step": 81915 + }, + { + "epoch": 0.15954650080425614, + "grad_norm": 3.1380393505096436, + "learning_rate": 2.815488224402599e-05, + "loss": 1.9012, + "step": 81930 + }, + { + "epoch": 0.1595757110753664, + "grad_norm": 3.212256908416748, + "learning_rate": 2.8154220773146443e-05, + "loss": 1.771, + "step": 81945 + }, + { + "epoch": 0.15960492134647666, + "grad_norm": 2.554527759552002, + "learning_rate": 2.8153559191493352e-05, + "loss": 1.8714, + "step": 81960 + }, + { + "epoch": 0.1596341316175869, + "grad_norm": 5.6289801597595215, + "learning_rate": 2.8152897499072285e-05, + "loss": 1.8293, + "step": 81975 + }, + { + "epoch": 0.15966334188869719, + "grad_norm": 5.339416027069092, + "learning_rate": 2.8152235695888822e-05, + "loss": 2.0157, + "step": 81990 + }, + { + "epoch": 0.15969255215980743, + "grad_norm": 3.5716357231140137, + "learning_rate": 2.8151573781948538e-05, + "loss": 1.9813, + "step": 82005 + }, + { + "epoch": 0.1597217624309177, + "grad_norm": 2.4011945724487305, + "learning_rate": 2.815091175725699e-05, + "loss": 1.6411, + "step": 82020 + }, + { + "epoch": 0.15975097270202798, + "grad_norm": 3.6129086017608643, + "learning_rate": 2.8150249621819776e-05, + "loss": 1.8504, + "step": 82035 + }, + { + "epoch": 0.15978018297313823, + "grad_norm": 2.9883155822753906, + "learning_rate": 2.8149587375642456e-05, + "loss": 1.7648, + "step": 82050 + }, + { + "epoch": 0.1598093932442485, + "grad_norm": 3.764047384262085, + "learning_rate": 2.8148925018730608e-05, + "loss": 1.8213, + "step": 82065 + }, + { + "epoch": 0.15983860351535875, + "grad_norm": 3.1278605461120605, + "learning_rate": 2.8148262551089816e-05, + "loss": 1.7444, + "step": 82080 + }, + { + "epoch": 0.15986781378646903, + "grad_norm": 2.8085875511169434, + "learning_rate": 2.8147599972725652e-05, + "loss": 1.9187, + "step": 82095 + }, + { + "epoch": 0.15989702405757927, + "grad_norm": 2.910923957824707, + "learning_rate": 2.8146937283643708e-05, + "loss": 1.9519, + "step": 82110 + }, + { + "epoch": 0.15992623432868955, + "grad_norm": 2.8058440685272217, + "learning_rate": 2.8146274483849547e-05, + "loss": 1.7248, + "step": 82125 + }, + { + "epoch": 0.15995544459979982, + "grad_norm": 2.5292129516601562, + "learning_rate": 2.814561157334877e-05, + "loss": 1.9637, + "step": 82140 + }, + { + "epoch": 0.15998465487091007, + "grad_norm": 2.1737513542175293, + "learning_rate": 2.814494855214694e-05, + "loss": 1.8457, + "step": 82155 + }, + { + "epoch": 0.16001386514202035, + "grad_norm": 3.072577953338623, + "learning_rate": 2.814428542024965e-05, + "loss": 1.8208, + "step": 82170 + }, + { + "epoch": 0.1600430754131306, + "grad_norm": 3.3401997089385986, + "learning_rate": 2.814362217766249e-05, + "loss": 1.8177, + "step": 82185 + }, + { + "epoch": 0.16007228568424087, + "grad_norm": 3.402217388153076, + "learning_rate": 2.814295882439103e-05, + "loss": 2.0065, + "step": 82200 + }, + { + "epoch": 0.16010149595535111, + "grad_norm": 3.149379014968872, + "learning_rate": 2.814229536044087e-05, + "loss": 1.8819, + "step": 82215 + }, + { + "epoch": 0.1601307062264614, + "grad_norm": 4.362020969390869, + "learning_rate": 2.8141631785817595e-05, + "loss": 1.8351, + "step": 82230 + }, + { + "epoch": 0.16015991649757166, + "grad_norm": 2.816967010498047, + "learning_rate": 2.8140968100526788e-05, + "loss": 1.8598, + "step": 82245 + }, + { + "epoch": 0.1601891267686819, + "grad_norm": 4.589569091796875, + "learning_rate": 2.814030430457404e-05, + "loss": 1.7694, + "step": 82260 + }, + { + "epoch": 0.16021833703979219, + "grad_norm": 2.8394856452941895, + "learning_rate": 2.8139640397964942e-05, + "loss": 1.8338, + "step": 82275 + }, + { + "epoch": 0.16024754731090243, + "grad_norm": 4.5126848220825195, + "learning_rate": 2.8138976380705084e-05, + "loss": 1.9582, + "step": 82290 + }, + { + "epoch": 0.1602767575820127, + "grad_norm": 4.313023090362549, + "learning_rate": 2.813831225280006e-05, + "loss": 1.9391, + "step": 82305 + }, + { + "epoch": 0.16030596785312295, + "grad_norm": 4.92047119140625, + "learning_rate": 2.8137648014255454e-05, + "loss": 1.6457, + "step": 82320 + }, + { + "epoch": 0.16033517812423323, + "grad_norm": 3.7909910678863525, + "learning_rate": 2.8136983665076872e-05, + "loss": 1.8597, + "step": 82335 + }, + { + "epoch": 0.1603643883953435, + "grad_norm": 2.4445409774780273, + "learning_rate": 2.8136319205269902e-05, + "loss": 1.7861, + "step": 82350 + }, + { + "epoch": 0.16039359866645375, + "grad_norm": 3.8499512672424316, + "learning_rate": 2.8135654634840142e-05, + "loss": 1.6956, + "step": 82365 + }, + { + "epoch": 0.16042280893756403, + "grad_norm": 4.034378528594971, + "learning_rate": 2.8134989953793187e-05, + "loss": 1.9281, + "step": 82380 + }, + { + "epoch": 0.16045201920867427, + "grad_norm": 3.8575189113616943, + "learning_rate": 2.813432516213463e-05, + "loss": 1.8337, + "step": 82395 + }, + { + "epoch": 0.16048122947978455, + "grad_norm": 2.5840206146240234, + "learning_rate": 2.8133660259870076e-05, + "loss": 1.8565, + "step": 82410 + }, + { + "epoch": 0.1605104397508948, + "grad_norm": 2.3869524002075195, + "learning_rate": 2.8132995247005124e-05, + "loss": 1.9927, + "step": 82425 + }, + { + "epoch": 0.16053965002200507, + "grad_norm": 3.506598949432373, + "learning_rate": 2.8132330123545366e-05, + "loss": 1.8821, + "step": 82440 + }, + { + "epoch": 0.16056886029311535, + "grad_norm": 3.6460132598876953, + "learning_rate": 2.8131664889496415e-05, + "loss": 1.9739, + "step": 82455 + }, + { + "epoch": 0.1605980705642256, + "grad_norm": 4.48237419128418, + "learning_rate": 2.8130999544863863e-05, + "loss": 1.9475, + "step": 82470 + }, + { + "epoch": 0.16062728083533587, + "grad_norm": 2.6555659770965576, + "learning_rate": 2.8130334089653322e-05, + "loss": 1.9473, + "step": 82485 + }, + { + "epoch": 0.16065649110644611, + "grad_norm": 3.0035080909729004, + "learning_rate": 2.8129668523870388e-05, + "loss": 2.0306, + "step": 82500 + }, + { + "epoch": 0.1606857013775564, + "grad_norm": 2.0329442024230957, + "learning_rate": 2.8129002847520665e-05, + "loss": 1.898, + "step": 82515 + }, + { + "epoch": 0.16071491164866664, + "grad_norm": 3.0646984577178955, + "learning_rate": 2.812833706060977e-05, + "loss": 1.932, + "step": 82530 + }, + { + "epoch": 0.1607441219197769, + "grad_norm": 2.9754109382629395, + "learning_rate": 2.8127671163143295e-05, + "loss": 1.8651, + "step": 82545 + }, + { + "epoch": 0.16077333219088719, + "grad_norm": 4.017286777496338, + "learning_rate": 2.8127005155126858e-05, + "loss": 1.8633, + "step": 82560 + }, + { + "epoch": 0.16080254246199743, + "grad_norm": 3.6643784046173096, + "learning_rate": 2.8126339036566065e-05, + "loss": 1.8705, + "step": 82575 + }, + { + "epoch": 0.1608317527331077, + "grad_norm": 3.2265286445617676, + "learning_rate": 2.8125672807466528e-05, + "loss": 1.9005, + "step": 82590 + }, + { + "epoch": 0.16086096300421796, + "grad_norm": 4.928393840789795, + "learning_rate": 2.812500646783385e-05, + "loss": 1.8748, + "step": 82605 + }, + { + "epoch": 0.16089017327532823, + "grad_norm": 3.4344944953918457, + "learning_rate": 2.8124340017673644e-05, + "loss": 1.8592, + "step": 82620 + }, + { + "epoch": 0.16091938354643848, + "grad_norm": 2.9308063983917236, + "learning_rate": 2.8123673456991527e-05, + "loss": 1.8004, + "step": 82635 + }, + { + "epoch": 0.16094859381754875, + "grad_norm": 1.9366892576217651, + "learning_rate": 2.812300678579311e-05, + "loss": 2.087, + "step": 82650 + }, + { + "epoch": 0.16097780408865903, + "grad_norm": 3.438849925994873, + "learning_rate": 2.812234000408401e-05, + "loss": 1.7679, + "step": 82665 + }, + { + "epoch": 0.16100701435976927, + "grad_norm": 3.2245700359344482, + "learning_rate": 2.8121673111869835e-05, + "loss": 1.8697, + "step": 82680 + }, + { + "epoch": 0.16103622463087955, + "grad_norm": 3.9220566749572754, + "learning_rate": 2.8121006109156208e-05, + "loss": 1.7964, + "step": 82695 + }, + { + "epoch": 0.1610654349019898, + "grad_norm": 3.131941080093384, + "learning_rate": 2.8120338995948746e-05, + "loss": 1.9086, + "step": 82710 + }, + { + "epoch": 0.16109464517310007, + "grad_norm": 3.136931896209717, + "learning_rate": 2.811967177225306e-05, + "loss": 1.7904, + "step": 82725 + }, + { + "epoch": 0.16112385544421032, + "grad_norm": 2.3643617630004883, + "learning_rate": 2.811900443807477e-05, + "loss": 1.9116, + "step": 82740 + }, + { + "epoch": 0.1611530657153206, + "grad_norm": 2.3170998096466064, + "learning_rate": 2.8118336993419505e-05, + "loss": 1.9265, + "step": 82755 + }, + { + "epoch": 0.16118227598643087, + "grad_norm": 4.6217780113220215, + "learning_rate": 2.8117669438292874e-05, + "loss": 1.7115, + "step": 82770 + }, + { + "epoch": 0.16121148625754111, + "grad_norm": 3.7841084003448486, + "learning_rate": 2.811700177270051e-05, + "loss": 2.1325, + "step": 82785 + }, + { + "epoch": 0.1612406965286514, + "grad_norm": 4.044402599334717, + "learning_rate": 2.811633399664802e-05, + "loss": 1.8823, + "step": 82800 + }, + { + "epoch": 0.16126990679976164, + "grad_norm": 3.5792396068573, + "learning_rate": 2.8115666110141045e-05, + "loss": 1.7684, + "step": 82815 + }, + { + "epoch": 0.1612991170708719, + "grad_norm": 2.7184062004089355, + "learning_rate": 2.8114998113185197e-05, + "loss": 1.7934, + "step": 82830 + }, + { + "epoch": 0.16132832734198216, + "grad_norm": 2.9321513175964355, + "learning_rate": 2.8114330005786107e-05, + "loss": 2.1735, + "step": 82845 + }, + { + "epoch": 0.16135753761309243, + "grad_norm": 4.462427616119385, + "learning_rate": 2.8113661787949395e-05, + "loss": 1.9661, + "step": 82860 + }, + { + "epoch": 0.1613867478842027, + "grad_norm": 4.128498554229736, + "learning_rate": 2.81129934596807e-05, + "loss": 1.9328, + "step": 82875 + }, + { + "epoch": 0.16141595815531296, + "grad_norm": 5.243194103240967, + "learning_rate": 2.811232502098564e-05, + "loss": 1.8392, + "step": 82890 + }, + { + "epoch": 0.16144516842642323, + "grad_norm": 3.2173666954040527, + "learning_rate": 2.8111656471869848e-05, + "loss": 1.7717, + "step": 82905 + }, + { + "epoch": 0.16147437869753348, + "grad_norm": 4.007807731628418, + "learning_rate": 2.811098781233895e-05, + "loss": 1.8913, + "step": 82920 + }, + { + "epoch": 0.16150358896864375, + "grad_norm": 3.6915395259857178, + "learning_rate": 2.8110319042398576e-05, + "loss": 2.0185, + "step": 82935 + }, + { + "epoch": 0.161532799239754, + "grad_norm": 4.210644721984863, + "learning_rate": 2.810965016205437e-05, + "loss": 1.8776, + "step": 82950 + }, + { + "epoch": 0.16156200951086427, + "grad_norm": 2.3887219429016113, + "learning_rate": 2.8108981171311945e-05, + "loss": 1.8105, + "step": 82965 + }, + { + "epoch": 0.16159121978197455, + "grad_norm": 4.334285736083984, + "learning_rate": 2.8108312070176957e-05, + "loss": 1.8815, + "step": 82980 + }, + { + "epoch": 0.1616204300530848, + "grad_norm": 3.277202606201172, + "learning_rate": 2.8107642858655025e-05, + "loss": 1.895, + "step": 82995 + }, + { + "epoch": 0.16164964032419507, + "grad_norm": 3.016605854034424, + "learning_rate": 2.810697353675179e-05, + "loss": 1.7791, + "step": 83010 + }, + { + "epoch": 0.16167885059530532, + "grad_norm": 2.096148729324341, + "learning_rate": 2.8106304104472888e-05, + "loss": 1.8657, + "step": 83025 + }, + { + "epoch": 0.1617080608664156, + "grad_norm": 2.945565938949585, + "learning_rate": 2.8105634561823954e-05, + "loss": 2.0042, + "step": 83040 + }, + { + "epoch": 0.16173727113752584, + "grad_norm": 4.671420574188232, + "learning_rate": 2.810496490881063e-05, + "loss": 1.9059, + "step": 83055 + }, + { + "epoch": 0.16176648140863611, + "grad_norm": 3.68866229057312, + "learning_rate": 2.8104295145438553e-05, + "loss": 1.699, + "step": 83070 + }, + { + "epoch": 0.1617956916797464, + "grad_norm": 3.8480587005615234, + "learning_rate": 2.8103625271713358e-05, + "loss": 1.9795, + "step": 83085 + }, + { + "epoch": 0.16182490195085664, + "grad_norm": 3.6876132488250732, + "learning_rate": 2.81029552876407e-05, + "loss": 1.9073, + "step": 83100 + }, + { + "epoch": 0.1618541122219669, + "grad_norm": 3.1514546871185303, + "learning_rate": 2.8102285193226205e-05, + "loss": 1.7562, + "step": 83115 + }, + { + "epoch": 0.16188332249307716, + "grad_norm": 3.9040403366088867, + "learning_rate": 2.8101614988475527e-05, + "loss": 1.8073, + "step": 83130 + }, + { + "epoch": 0.16191253276418743, + "grad_norm": 2.33160400390625, + "learning_rate": 2.810094467339431e-05, + "loss": 1.5688, + "step": 83145 + }, + { + "epoch": 0.16194174303529768, + "grad_norm": 3.626812696456909, + "learning_rate": 2.8100274247988192e-05, + "loss": 1.9807, + "step": 83160 + }, + { + "epoch": 0.16197095330640796, + "grad_norm": 2.5435140132904053, + "learning_rate": 2.8099603712262825e-05, + "loss": 1.8384, + "step": 83175 + }, + { + "epoch": 0.16200016357751823, + "grad_norm": 3.039280891418457, + "learning_rate": 2.8098933066223848e-05, + "loss": 1.7198, + "step": 83190 + }, + { + "epoch": 0.16202937384862848, + "grad_norm": 4.861842632293701, + "learning_rate": 2.8098262309876915e-05, + "loss": 1.8001, + "step": 83205 + }, + { + "epoch": 0.16205858411973875, + "grad_norm": 3.1622912883758545, + "learning_rate": 2.809759144322767e-05, + "loss": 2.0221, + "step": 83220 + }, + { + "epoch": 0.162087794390849, + "grad_norm": 3.073633909225464, + "learning_rate": 2.809692046628177e-05, + "loss": 1.8744, + "step": 83235 + }, + { + "epoch": 0.16211700466195927, + "grad_norm": 3.17309308052063, + "learning_rate": 2.809624937904486e-05, + "loss": 1.8203, + "step": 83250 + }, + { + "epoch": 0.16214621493306952, + "grad_norm": 3.74810528755188, + "learning_rate": 2.8095578181522592e-05, + "loss": 1.8134, + "step": 83265 + }, + { + "epoch": 0.1621754252041798, + "grad_norm": 3.768850088119507, + "learning_rate": 2.809490687372062e-05, + "loss": 1.8538, + "step": 83280 + }, + { + "epoch": 0.16220463547529007, + "grad_norm": 2.3148956298828125, + "learning_rate": 2.809423545564459e-05, + "loss": 1.9453, + "step": 83295 + }, + { + "epoch": 0.16223384574640032, + "grad_norm": 4.133246421813965, + "learning_rate": 2.8093563927300168e-05, + "loss": 1.8728, + "step": 83310 + }, + { + "epoch": 0.1622630560175106, + "grad_norm": 5.001368045806885, + "learning_rate": 2.8092892288692997e-05, + "loss": 1.907, + "step": 83325 + }, + { + "epoch": 0.16229226628862084, + "grad_norm": 2.216571569442749, + "learning_rate": 2.8092220539828744e-05, + "loss": 1.8864, + "step": 83340 + }, + { + "epoch": 0.16232147655973111, + "grad_norm": 3.8686487674713135, + "learning_rate": 2.8091548680713056e-05, + "loss": 1.9975, + "step": 83355 + }, + { + "epoch": 0.16235068683084136, + "grad_norm": 3.394099712371826, + "learning_rate": 2.8090876711351598e-05, + "loss": 2.067, + "step": 83370 + }, + { + "epoch": 0.16237989710195164, + "grad_norm": 3.401331663131714, + "learning_rate": 2.8090204631750022e-05, + "loss": 1.7323, + "step": 83385 + }, + { + "epoch": 0.1624091073730619, + "grad_norm": 2.9316823482513428, + "learning_rate": 2.8089532441913996e-05, + "loss": 1.9813, + "step": 83400 + }, + { + "epoch": 0.16243831764417216, + "grad_norm": 3.4558181762695312, + "learning_rate": 2.8088860141849177e-05, + "loss": 1.7986, + "step": 83415 + }, + { + "epoch": 0.16246752791528243, + "grad_norm": 3.627876043319702, + "learning_rate": 2.8088187731561222e-05, + "loss": 2.0182, + "step": 83430 + }, + { + "epoch": 0.16249673818639268, + "grad_norm": 2.8163950443267822, + "learning_rate": 2.80875152110558e-05, + "loss": 1.6465, + "step": 83445 + }, + { + "epoch": 0.16252594845750296, + "grad_norm": 4.024330139160156, + "learning_rate": 2.808684258033857e-05, + "loss": 1.6848, + "step": 83460 + }, + { + "epoch": 0.1625551587286132, + "grad_norm": 3.535698652267456, + "learning_rate": 2.80861698394152e-05, + "loss": 1.8736, + "step": 83475 + }, + { + "epoch": 0.16258436899972348, + "grad_norm": 2.6665103435516357, + "learning_rate": 2.808549698829135e-05, + "loss": 1.9619, + "step": 83490 + }, + { + "epoch": 0.16261357927083375, + "grad_norm": 2.837829828262329, + "learning_rate": 2.8084824026972695e-05, + "loss": 1.9531, + "step": 83505 + }, + { + "epoch": 0.162642789541944, + "grad_norm": 2.7827441692352295, + "learning_rate": 2.8084150955464892e-05, + "loss": 1.8917, + "step": 83520 + }, + { + "epoch": 0.16267199981305427, + "grad_norm": 4.356003284454346, + "learning_rate": 2.8083477773773617e-05, + "loss": 1.8742, + "step": 83535 + }, + { + "epoch": 0.16270121008416452, + "grad_norm": 4.308392524719238, + "learning_rate": 2.808280448190453e-05, + "loss": 1.8339, + "step": 83550 + }, + { + "epoch": 0.1627304203552748, + "grad_norm": 2.3540122509002686, + "learning_rate": 2.808213107986331e-05, + "loss": 1.8978, + "step": 83565 + }, + { + "epoch": 0.16275963062638504, + "grad_norm": 2.8848860263824463, + "learning_rate": 2.8081457567655624e-05, + "loss": 1.8311, + "step": 83580 + }, + { + "epoch": 0.16278884089749532, + "grad_norm": 2.495410442352295, + "learning_rate": 2.8080783945287144e-05, + "loss": 2.0331, + "step": 83595 + }, + { + "epoch": 0.1628180511686056, + "grad_norm": 2.871223211288452, + "learning_rate": 2.8080110212763543e-05, + "loss": 1.8825, + "step": 83610 + }, + { + "epoch": 0.16284726143971584, + "grad_norm": 2.2787489891052246, + "learning_rate": 2.8079436370090493e-05, + "loss": 1.8961, + "step": 83625 + }, + { + "epoch": 0.16287647171082612, + "grad_norm": 2.0761358737945557, + "learning_rate": 2.8078762417273674e-05, + "loss": 1.7136, + "step": 83640 + }, + { + "epoch": 0.16290568198193636, + "grad_norm": 5.033082008361816, + "learning_rate": 2.8078088354318753e-05, + "loss": 1.9018, + "step": 83655 + }, + { + "epoch": 0.16293489225304664, + "grad_norm": 3.9279539585113525, + "learning_rate": 2.807741418123141e-05, + "loss": 1.9664, + "step": 83670 + }, + { + "epoch": 0.16296410252415688, + "grad_norm": 3.543733835220337, + "learning_rate": 2.8076739898017326e-05, + "loss": 1.9522, + "step": 83685 + }, + { + "epoch": 0.16299331279526716, + "grad_norm": 5.450058460235596, + "learning_rate": 2.8076065504682174e-05, + "loss": 1.6692, + "step": 83700 + }, + { + "epoch": 0.16302252306637743, + "grad_norm": 2.169194221496582, + "learning_rate": 2.8075391001231634e-05, + "loss": 1.8861, + "step": 83715 + }, + { + "epoch": 0.16305173333748768, + "grad_norm": 3.770310640335083, + "learning_rate": 2.8074716387671392e-05, + "loss": 1.8023, + "step": 83730 + }, + { + "epoch": 0.16308094360859796, + "grad_norm": 4.1261515617370605, + "learning_rate": 2.807404166400712e-05, + "loss": 1.8512, + "step": 83745 + }, + { + "epoch": 0.1631101538797082, + "grad_norm": 3.9217333793640137, + "learning_rate": 2.8073366830244505e-05, + "loss": 1.8588, + "step": 83760 + }, + { + "epoch": 0.16313936415081848, + "grad_norm": 2.7448298931121826, + "learning_rate": 2.8072691886389234e-05, + "loss": 1.8048, + "step": 83775 + }, + { + "epoch": 0.16316857442192872, + "grad_norm": 4.212766170501709, + "learning_rate": 2.8072016832446984e-05, + "loss": 1.9323, + "step": 83790 + }, + { + "epoch": 0.163197784693039, + "grad_norm": 3.3249781131744385, + "learning_rate": 2.807134166842344e-05, + "loss": 1.6668, + "step": 83805 + }, + { + "epoch": 0.16322699496414925, + "grad_norm": 2.0949337482452393, + "learning_rate": 2.807066639432429e-05, + "loss": 1.8893, + "step": 83820 + }, + { + "epoch": 0.16325620523525952, + "grad_norm": 3.0747997760772705, + "learning_rate": 2.806999101015522e-05, + "loss": 1.9608, + "step": 83835 + }, + { + "epoch": 0.1632854155063698, + "grad_norm": 4.438546180725098, + "learning_rate": 2.8069315515921914e-05, + "loss": 1.7931, + "step": 83850 + }, + { + "epoch": 0.16331462577748004, + "grad_norm": 3.1391921043395996, + "learning_rate": 2.8068639911630066e-05, + "loss": 2.0357, + "step": 83865 + }, + { + "epoch": 0.16334383604859032, + "grad_norm": 3.535644769668579, + "learning_rate": 2.8067964197285365e-05, + "loss": 1.7682, + "step": 83880 + }, + { + "epoch": 0.16337304631970057, + "grad_norm": 4.105393886566162, + "learning_rate": 2.8067288372893497e-05, + "loss": 1.809, + "step": 83895 + }, + { + "epoch": 0.16340225659081084, + "grad_norm": 2.9479217529296875, + "learning_rate": 2.806661243846016e-05, + "loss": 1.8905, + "step": 83910 + }, + { + "epoch": 0.1634314668619211, + "grad_norm": 3.3937618732452393, + "learning_rate": 2.806593639399104e-05, + "loss": 1.7645, + "step": 83925 + }, + { + "epoch": 0.16346067713303136, + "grad_norm": 4.6164751052856445, + "learning_rate": 2.806526023949183e-05, + "loss": 2.0139, + "step": 83940 + }, + { + "epoch": 0.16348988740414164, + "grad_norm": 2.9675981998443604, + "learning_rate": 2.8064583974968227e-05, + "loss": 1.8339, + "step": 83955 + }, + { + "epoch": 0.16351909767525188, + "grad_norm": 2.38981556892395, + "learning_rate": 2.8063907600425925e-05, + "loss": 1.8241, + "step": 83970 + }, + { + "epoch": 0.16354830794636216, + "grad_norm": 3.3857972621917725, + "learning_rate": 2.8063231115870618e-05, + "loss": 1.6785, + "step": 83985 + }, + { + "epoch": 0.1635775182174724, + "grad_norm": 4.350620269775391, + "learning_rate": 2.8062554521308005e-05, + "loss": 1.8442, + "step": 84000 + }, + { + "epoch": 0.16360672848858268, + "grad_norm": 3.306528091430664, + "learning_rate": 2.8061877816743788e-05, + "loss": 1.6842, + "step": 84015 + }, + { + "epoch": 0.16363593875969293, + "grad_norm": 3.0955023765563965, + "learning_rate": 2.806120100218366e-05, + "loss": 1.753, + "step": 84030 + }, + { + "epoch": 0.1636651490308032, + "grad_norm": 3.3505823612213135, + "learning_rate": 2.806052407763332e-05, + "loss": 1.9601, + "step": 84045 + }, + { + "epoch": 0.16369435930191348, + "grad_norm": 1.9739930629730225, + "learning_rate": 2.8059847043098463e-05, + "loss": 1.9025, + "step": 84060 + }, + { + "epoch": 0.16372356957302373, + "grad_norm": 2.947619676589966, + "learning_rate": 2.80591698985848e-05, + "loss": 1.8573, + "step": 84075 + }, + { + "epoch": 0.163752779844134, + "grad_norm": 6.246986389160156, + "learning_rate": 2.8058492644098036e-05, + "loss": 1.7682, + "step": 84090 + }, + { + "epoch": 0.16378199011524425, + "grad_norm": 4.133143901824951, + "learning_rate": 2.8057815279643867e-05, + "loss": 2.0101, + "step": 84105 + }, + { + "epoch": 0.16381120038635452, + "grad_norm": 5.530602931976318, + "learning_rate": 2.8057137805227996e-05, + "loss": 1.9433, + "step": 84120 + }, + { + "epoch": 0.16384041065746477, + "grad_norm": 4.0296783447265625, + "learning_rate": 2.805646022085614e-05, + "loss": 1.9851, + "step": 84135 + }, + { + "epoch": 0.16386962092857504, + "grad_norm": 2.62994122505188, + "learning_rate": 2.8055782526533987e-05, + "loss": 2.1243, + "step": 84150 + }, + { + "epoch": 0.16389883119968532, + "grad_norm": 3.3628041744232178, + "learning_rate": 2.8055104722267253e-05, + "loss": 1.6841, + "step": 84165 + }, + { + "epoch": 0.16392804147079557, + "grad_norm": 2.838013172149658, + "learning_rate": 2.8054426808061647e-05, + "loss": 1.9514, + "step": 84180 + }, + { + "epoch": 0.16395725174190584, + "grad_norm": 2.5751872062683105, + "learning_rate": 2.805374878392288e-05, + "loss": 1.9142, + "step": 84195 + }, + { + "epoch": 0.1639864620130161, + "grad_norm": 2.528268337249756, + "learning_rate": 2.805307064985666e-05, + "loss": 1.9408, + "step": 84210 + }, + { + "epoch": 0.16401567228412636, + "grad_norm": 3.542693853378296, + "learning_rate": 2.8052392405868695e-05, + "loss": 1.9624, + "step": 84225 + }, + { + "epoch": 0.1640448825552366, + "grad_norm": 3.478200674057007, + "learning_rate": 2.805171405196469e-05, + "loss": 1.8032, + "step": 84240 + }, + { + "epoch": 0.16407409282634688, + "grad_norm": 1.9695062637329102, + "learning_rate": 2.8051035588150377e-05, + "loss": 1.7407, + "step": 84255 + }, + { + "epoch": 0.16410330309745716, + "grad_norm": 3.080308675765991, + "learning_rate": 2.805035701443145e-05, + "loss": 1.6716, + "step": 84270 + }, + { + "epoch": 0.1641325133685674, + "grad_norm": 2.1596643924713135, + "learning_rate": 2.8049678330813634e-05, + "loss": 1.784, + "step": 84285 + }, + { + "epoch": 0.16416172363967768, + "grad_norm": 4.200150012969971, + "learning_rate": 2.804899953730264e-05, + "loss": 2.0034, + "step": 84300 + }, + { + "epoch": 0.16419093391078793, + "grad_norm": 2.5685014724731445, + "learning_rate": 2.8048320633904186e-05, + "loss": 1.7863, + "step": 84315 + }, + { + "epoch": 0.1642201441818982, + "grad_norm": 5.3111090660095215, + "learning_rate": 2.804764162062399e-05, + "loss": 1.8948, + "step": 84330 + }, + { + "epoch": 0.16424935445300845, + "grad_norm": 4.073383331298828, + "learning_rate": 2.804696249746777e-05, + "loss": 1.832, + "step": 84345 + }, + { + "epoch": 0.16427856472411873, + "grad_norm": 3.693546772003174, + "learning_rate": 2.804628326444124e-05, + "loss": 1.6886, + "step": 84360 + }, + { + "epoch": 0.164307774995229, + "grad_norm": 6.029183387756348, + "learning_rate": 2.8045603921550125e-05, + "loss": 1.9278, + "step": 84375 + }, + { + "epoch": 0.16433698526633925, + "grad_norm": 2.3653836250305176, + "learning_rate": 2.8044924468800145e-05, + "loss": 1.7383, + "step": 84390 + }, + { + "epoch": 0.16436619553744952, + "grad_norm": 5.183484077453613, + "learning_rate": 2.8044244906197018e-05, + "loss": 1.8933, + "step": 84405 + }, + { + "epoch": 0.16439540580855977, + "grad_norm": 3.3372654914855957, + "learning_rate": 2.8043565233746478e-05, + "loss": 1.956, + "step": 84420 + }, + { + "epoch": 0.16442461607967004, + "grad_norm": 2.2536230087280273, + "learning_rate": 2.8042885451454234e-05, + "loss": 1.9638, + "step": 84435 + }, + { + "epoch": 0.1644538263507803, + "grad_norm": 2.549088716506958, + "learning_rate": 2.804220555932602e-05, + "loss": 1.899, + "step": 84450 + }, + { + "epoch": 0.16448303662189057, + "grad_norm": 2.0314691066741943, + "learning_rate": 2.804152555736756e-05, + "loss": 1.8299, + "step": 84465 + }, + { + "epoch": 0.16451224689300084, + "grad_norm": 3.090559959411621, + "learning_rate": 2.8040845445584573e-05, + "loss": 2.0353, + "step": 84480 + }, + { + "epoch": 0.1645414571641111, + "grad_norm": 2.8737590312957764, + "learning_rate": 2.80401652239828e-05, + "loss": 1.8539, + "step": 84495 + }, + { + "epoch": 0.16457066743522136, + "grad_norm": 4.370304584503174, + "learning_rate": 2.8039484892567956e-05, + "loss": 1.9731, + "step": 84510 + }, + { + "epoch": 0.1645998777063316, + "grad_norm": 5.284047603607178, + "learning_rate": 2.803880445134578e-05, + "loss": 1.9863, + "step": 84525 + }, + { + "epoch": 0.16462908797744188, + "grad_norm": 4.027320861816406, + "learning_rate": 2.8038123900321994e-05, + "loss": 2.0336, + "step": 84540 + }, + { + "epoch": 0.16465829824855213, + "grad_norm": 2.6698293685913086, + "learning_rate": 2.8037443239502335e-05, + "loss": 1.9874, + "step": 84555 + }, + { + "epoch": 0.1646875085196624, + "grad_norm": 4.53780460357666, + "learning_rate": 2.8036762468892535e-05, + "loss": 1.7988, + "step": 84570 + }, + { + "epoch": 0.16471671879077268, + "grad_norm": 2.541128635406494, + "learning_rate": 2.803608158849832e-05, + "loss": 1.7507, + "step": 84585 + }, + { + "epoch": 0.16474592906188293, + "grad_norm": 4.037196636199951, + "learning_rate": 2.803540059832543e-05, + "loss": 2.0064, + "step": 84600 + }, + { + "epoch": 0.1647751393329932, + "grad_norm": 4.342711448669434, + "learning_rate": 2.80347194983796e-05, + "loss": 1.9037, + "step": 84615 + }, + { + "epoch": 0.16480434960410345, + "grad_norm": 2.2379393577575684, + "learning_rate": 2.8034038288666562e-05, + "loss": 1.793, + "step": 84630 + }, + { + "epoch": 0.16483355987521373, + "grad_norm": 1.749857783317566, + "learning_rate": 2.8033356969192057e-05, + "loss": 1.8663, + "step": 84645 + }, + { + "epoch": 0.16486277014632397, + "grad_norm": 2.5136685371398926, + "learning_rate": 2.8032675539961815e-05, + "loss": 2.0475, + "step": 84660 + }, + { + "epoch": 0.16489198041743425, + "grad_norm": 2.899177312850952, + "learning_rate": 2.803199400098158e-05, + "loss": 1.7951, + "step": 84675 + }, + { + "epoch": 0.16492119068854452, + "grad_norm": 5.255392551422119, + "learning_rate": 2.8031312352257096e-05, + "loss": 1.7738, + "step": 84690 + }, + { + "epoch": 0.16495040095965477, + "grad_norm": 5.20246696472168, + "learning_rate": 2.8030630593794092e-05, + "loss": 1.7164, + "step": 84705 + }, + { + "epoch": 0.16497961123076504, + "grad_norm": 1.9588276147842407, + "learning_rate": 2.802994872559832e-05, + "loss": 1.9388, + "step": 84720 + }, + { + "epoch": 0.1650088215018753, + "grad_norm": 4.184682846069336, + "learning_rate": 2.8029266747675515e-05, + "loss": 2.0472, + "step": 84735 + }, + { + "epoch": 0.16503803177298557, + "grad_norm": 2.0913069248199463, + "learning_rate": 2.802858466003142e-05, + "loss": 1.7788, + "step": 84750 + }, + { + "epoch": 0.1650672420440958, + "grad_norm": 2.3282485008239746, + "learning_rate": 2.8027902462671787e-05, + "loss": 1.9518, + "step": 84765 + }, + { + "epoch": 0.1650964523152061, + "grad_norm": 2.4634013175964355, + "learning_rate": 2.8027220155602354e-05, + "loss": 1.9039, + "step": 84780 + }, + { + "epoch": 0.16512566258631636, + "grad_norm": 2.5319318771362305, + "learning_rate": 2.8026537738828864e-05, + "loss": 1.8211, + "step": 84795 + }, + { + "epoch": 0.1651548728574266, + "grad_norm": 3.099039316177368, + "learning_rate": 2.8025855212357068e-05, + "loss": 1.9601, + "step": 84810 + }, + { + "epoch": 0.16518408312853688, + "grad_norm": 3.9690353870391846, + "learning_rate": 2.8025172576192716e-05, + "loss": 2.0471, + "step": 84825 + }, + { + "epoch": 0.16521329339964713, + "grad_norm": 1.9761989116668701, + "learning_rate": 2.802448983034155e-05, + "loss": 1.9529, + "step": 84840 + }, + { + "epoch": 0.1652425036707574, + "grad_norm": 2.8102729320526123, + "learning_rate": 2.802380697480933e-05, + "loss": 1.9122, + "step": 84855 + }, + { + "epoch": 0.16527171394186765, + "grad_norm": 3.583547353744507, + "learning_rate": 2.8023124009601788e-05, + "loss": 1.9556, + "step": 84870 + }, + { + "epoch": 0.16530092421297793, + "grad_norm": 3.332496404647827, + "learning_rate": 2.8022440934724697e-05, + "loss": 1.9243, + "step": 84885 + }, + { + "epoch": 0.1653301344840882, + "grad_norm": 3.8673999309539795, + "learning_rate": 2.8021757750183795e-05, + "loss": 1.7374, + "step": 84900 + }, + { + "epoch": 0.16535934475519845, + "grad_norm": 3.151942014694214, + "learning_rate": 2.8021074455984843e-05, + "loss": 1.8335, + "step": 84915 + }, + { + "epoch": 0.16538855502630873, + "grad_norm": 4.350759029388428, + "learning_rate": 2.8020391052133586e-05, + "loss": 1.8208, + "step": 84930 + }, + { + "epoch": 0.16541776529741897, + "grad_norm": 4.372674465179443, + "learning_rate": 2.801970753863579e-05, + "loss": 1.835, + "step": 84945 + }, + { + "epoch": 0.16544697556852925, + "grad_norm": 2.787752866744995, + "learning_rate": 2.8019023915497204e-05, + "loss": 1.7215, + "step": 84960 + }, + { + "epoch": 0.1654761858396395, + "grad_norm": 3.124127149581909, + "learning_rate": 2.801834018272359e-05, + "loss": 1.9894, + "step": 84975 + }, + { + "epoch": 0.16550539611074977, + "grad_norm": 3.131024122238159, + "learning_rate": 2.8017656340320695e-05, + "loss": 1.8775, + "step": 84990 + }, + { + "epoch": 0.16553460638186004, + "grad_norm": 4.088716983795166, + "learning_rate": 2.8016972388294292e-05, + "loss": 2.0014, + "step": 85005 + }, + { + "epoch": 0.1655638166529703, + "grad_norm": 4.504201412200928, + "learning_rate": 2.8016288326650127e-05, + "loss": 1.8986, + "step": 85020 + }, + { + "epoch": 0.16559302692408057, + "grad_norm": 3.474787950515747, + "learning_rate": 2.8015604155393972e-05, + "loss": 1.8153, + "step": 85035 + }, + { + "epoch": 0.1656222371951908, + "grad_norm": 2.7330682277679443, + "learning_rate": 2.8014919874531588e-05, + "loss": 1.8833, + "step": 85050 + }, + { + "epoch": 0.1656514474663011, + "grad_norm": 2.693739891052246, + "learning_rate": 2.8014235484068724e-05, + "loss": 1.7969, + "step": 85065 + }, + { + "epoch": 0.16568065773741134, + "grad_norm": 2.6607675552368164, + "learning_rate": 2.8013550984011157e-05, + "loss": 1.8868, + "step": 85080 + }, + { + "epoch": 0.1657098680085216, + "grad_norm": 3.3002219200134277, + "learning_rate": 2.8012866374364647e-05, + "loss": 1.954, + "step": 85095 + }, + { + "epoch": 0.16573907827963189, + "grad_norm": 4.921784400939941, + "learning_rate": 2.801218165513496e-05, + "loss": 2.0241, + "step": 85110 + }, + { + "epoch": 0.16576828855074213, + "grad_norm": 2.19203782081604, + "learning_rate": 2.8011496826327857e-05, + "loss": 1.8053, + "step": 85125 + }, + { + "epoch": 0.1657974988218524, + "grad_norm": 5.644973278045654, + "learning_rate": 2.8010811887949114e-05, + "loss": 1.8577, + "step": 85140 + }, + { + "epoch": 0.16582670909296265, + "grad_norm": 3.731236219406128, + "learning_rate": 2.8010126840004494e-05, + "loss": 1.912, + "step": 85155 + }, + { + "epoch": 0.16585591936407293, + "grad_norm": 4.182020664215088, + "learning_rate": 2.800944168249976e-05, + "loss": 2.0091, + "step": 85170 + }, + { + "epoch": 0.16588512963518318, + "grad_norm": 2.9865341186523438, + "learning_rate": 2.8008756415440697e-05, + "loss": 1.9213, + "step": 85185 + }, + { + "epoch": 0.16591433990629345, + "grad_norm": 2.382753849029541, + "learning_rate": 2.800807103883306e-05, + "loss": 1.9351, + "step": 85200 + }, + { + "epoch": 0.16594355017740373, + "grad_norm": 3.049863576889038, + "learning_rate": 2.8007385552682626e-05, + "loss": 1.8336, + "step": 85215 + }, + { + "epoch": 0.16597276044851397, + "grad_norm": 3.3488526344299316, + "learning_rate": 2.8006699956995176e-05, + "loss": 1.8814, + "step": 85230 + }, + { + "epoch": 0.16600197071962425, + "grad_norm": 2.8722083568573, + "learning_rate": 2.8006014251776472e-05, + "loss": 1.7149, + "step": 85245 + }, + { + "epoch": 0.1660311809907345, + "grad_norm": 3.152118444442749, + "learning_rate": 2.800532843703229e-05, + "loss": 1.8599, + "step": 85260 + }, + { + "epoch": 0.16606039126184477, + "grad_norm": 2.8969244956970215, + "learning_rate": 2.8004642512768414e-05, + "loss": 1.7813, + "step": 85275 + }, + { + "epoch": 0.16608960153295502, + "grad_norm": 4.032709121704102, + "learning_rate": 2.800395647899061e-05, + "loss": 1.8947, + "step": 85290 + }, + { + "epoch": 0.1661188118040653, + "grad_norm": 4.046450614929199, + "learning_rate": 2.8003270335704664e-05, + "loss": 1.8439, + "step": 85305 + }, + { + "epoch": 0.16614802207517557, + "grad_norm": 2.405841588973999, + "learning_rate": 2.8002584082916348e-05, + "loss": 1.7117, + "step": 85320 + }, + { + "epoch": 0.1661772323462858, + "grad_norm": 2.4156463146209717, + "learning_rate": 2.800189772063144e-05, + "loss": 1.7707, + "step": 85335 + }, + { + "epoch": 0.1662064426173961, + "grad_norm": 2.577035427093506, + "learning_rate": 2.8001211248855724e-05, + "loss": 1.7558, + "step": 85350 + }, + { + "epoch": 0.16623565288850634, + "grad_norm": 2.21968412399292, + "learning_rate": 2.800052466759498e-05, + "loss": 1.8039, + "step": 85365 + }, + { + "epoch": 0.1662648631596166, + "grad_norm": 2.626068592071533, + "learning_rate": 2.7999837976854987e-05, + "loss": 1.9606, + "step": 85380 + }, + { + "epoch": 0.16629407343072686, + "grad_norm": 2.9049072265625, + "learning_rate": 2.7999151176641532e-05, + "loss": 1.9331, + "step": 85395 + }, + { + "epoch": 0.16632328370183713, + "grad_norm": 2.110912561416626, + "learning_rate": 2.79984642669604e-05, + "loss": 1.8975, + "step": 85410 + }, + { + "epoch": 0.1663524939729474, + "grad_norm": 3.295287609100342, + "learning_rate": 2.799777724781737e-05, + "loss": 1.8991, + "step": 85425 + }, + { + "epoch": 0.16638170424405765, + "grad_norm": 2.418525218963623, + "learning_rate": 2.7997090119218226e-05, + "loss": 2.0626, + "step": 85440 + }, + { + "epoch": 0.16641091451516793, + "grad_norm": 4.826914310455322, + "learning_rate": 2.7996402881168762e-05, + "loss": 2.0044, + "step": 85455 + }, + { + "epoch": 0.16644012478627818, + "grad_norm": 2.187988758087158, + "learning_rate": 2.799571553367476e-05, + "loss": 2.045, + "step": 85470 + }, + { + "epoch": 0.16646933505738845, + "grad_norm": 2.367311716079712, + "learning_rate": 2.799502807674201e-05, + "loss": 2.0583, + "step": 85485 + }, + { + "epoch": 0.1664985453284987, + "grad_norm": 4.570099830627441, + "learning_rate": 2.79943405103763e-05, + "loss": 1.9508, + "step": 85500 + }, + { + "epoch": 0.16652775559960897, + "grad_norm": 3.6001203060150146, + "learning_rate": 2.799365283458342e-05, + "loss": 1.7592, + "step": 85515 + }, + { + "epoch": 0.16655696587071925, + "grad_norm": 4.19040060043335, + "learning_rate": 2.7992965049369165e-05, + "loss": 1.9915, + "step": 85530 + }, + { + "epoch": 0.1665861761418295, + "grad_norm": 3.1569197177886963, + "learning_rate": 2.7992277154739323e-05, + "loss": 1.7777, + "step": 85545 + }, + { + "epoch": 0.16661538641293977, + "grad_norm": 2.4647107124328613, + "learning_rate": 2.799158915069969e-05, + "loss": 1.8827, + "step": 85560 + }, + { + "epoch": 0.16664459668405002, + "grad_norm": 2.5181756019592285, + "learning_rate": 2.7990901037256056e-05, + "loss": 2.0253, + "step": 85575 + }, + { + "epoch": 0.1666738069551603, + "grad_norm": 5.391116619110107, + "learning_rate": 2.7990212814414215e-05, + "loss": 1.7319, + "step": 85590 + }, + { + "epoch": 0.16670301722627054, + "grad_norm": 2.2008304595947266, + "learning_rate": 2.7989524482179966e-05, + "loss": 2.1013, + "step": 85605 + }, + { + "epoch": 0.1667322274973808, + "grad_norm": 2.502542495727539, + "learning_rate": 2.7988836040559106e-05, + "loss": 1.9177, + "step": 85620 + }, + { + "epoch": 0.1667614377684911, + "grad_norm": 3.678185224533081, + "learning_rate": 2.798814748955743e-05, + "loss": 1.8713, + "step": 85635 + }, + { + "epoch": 0.16679064803960134, + "grad_norm": 4.678583145141602, + "learning_rate": 2.7987458829180736e-05, + "loss": 2.0709, + "step": 85650 + }, + { + "epoch": 0.1668198583107116, + "grad_norm": 3.7226016521453857, + "learning_rate": 2.7986770059434824e-05, + "loss": 2.0292, + "step": 85665 + }, + { + "epoch": 0.16684906858182186, + "grad_norm": 3.6722655296325684, + "learning_rate": 2.79860811803255e-05, + "loss": 1.8559, + "step": 85680 + }, + { + "epoch": 0.16687827885293213, + "grad_norm": 2.628980875015259, + "learning_rate": 2.7985392191858553e-05, + "loss": 1.9355, + "step": 85695 + }, + { + "epoch": 0.16690748912404238, + "grad_norm": 3.9263062477111816, + "learning_rate": 2.7984703094039798e-05, + "loss": 1.808, + "step": 85710 + }, + { + "epoch": 0.16693669939515265, + "grad_norm": 4.1697611808776855, + "learning_rate": 2.7984013886875026e-05, + "loss": 1.9379, + "step": 85725 + }, + { + "epoch": 0.16696590966626293, + "grad_norm": 3.749804735183716, + "learning_rate": 2.798332457037005e-05, + "loss": 1.7986, + "step": 85740 + }, + { + "epoch": 0.16699511993737318, + "grad_norm": 5.136580467224121, + "learning_rate": 2.7982635144530673e-05, + "loss": 1.8576, + "step": 85755 + }, + { + "epoch": 0.16702433020848345, + "grad_norm": 2.4272539615631104, + "learning_rate": 2.79819456093627e-05, + "loss": 1.9215, + "step": 85770 + }, + { + "epoch": 0.1670535404795937, + "grad_norm": 2.389630079269409, + "learning_rate": 2.7981255964871933e-05, + "loss": 1.8211, + "step": 85785 + }, + { + "epoch": 0.16708275075070397, + "grad_norm": 2.56075119972229, + "learning_rate": 2.798056621106419e-05, + "loss": 1.9378, + "step": 85800 + }, + { + "epoch": 0.16711196102181422, + "grad_norm": 3.3079729080200195, + "learning_rate": 2.797987634794527e-05, + "loss": 1.9439, + "step": 85815 + }, + { + "epoch": 0.1671411712929245, + "grad_norm": 2.6672608852386475, + "learning_rate": 2.7979186375520984e-05, + "loss": 1.8338, + "step": 85830 + }, + { + "epoch": 0.16717038156403477, + "grad_norm": 4.230236053466797, + "learning_rate": 2.797849629379714e-05, + "loss": 1.9227, + "step": 85845 + }, + { + "epoch": 0.16719959183514502, + "grad_norm": 4.147488594055176, + "learning_rate": 2.7977806102779565e-05, + "loss": 1.9357, + "step": 85860 + }, + { + "epoch": 0.1672288021062553, + "grad_norm": 2.81075119972229, + "learning_rate": 2.7977115802474055e-05, + "loss": 1.9495, + "step": 85875 + }, + { + "epoch": 0.16725801237736554, + "grad_norm": 1.843131184577942, + "learning_rate": 2.7976425392886424e-05, + "loss": 1.8318, + "step": 85890 + }, + { + "epoch": 0.16728722264847581, + "grad_norm": 2.802412271499634, + "learning_rate": 2.797573487402249e-05, + "loss": 1.9578, + "step": 85905 + }, + { + "epoch": 0.16731643291958606, + "grad_norm": 4.871231555938721, + "learning_rate": 2.797504424588807e-05, + "loss": 2.0298, + "step": 85920 + }, + { + "epoch": 0.16734564319069634, + "grad_norm": 2.4380788803100586, + "learning_rate": 2.7974353508488978e-05, + "loss": 1.6671, + "step": 85935 + }, + { + "epoch": 0.16737485346180658, + "grad_norm": 2.108659267425537, + "learning_rate": 2.797366266183103e-05, + "loss": 1.7478, + "step": 85950 + }, + { + "epoch": 0.16740406373291686, + "grad_norm": 2.5961341857910156, + "learning_rate": 2.7972971705920042e-05, + "loss": 1.9303, + "step": 85965 + }, + { + "epoch": 0.16743327400402713, + "grad_norm": 2.3773584365844727, + "learning_rate": 2.7972280640761836e-05, + "loss": 1.7674, + "step": 85980 + }, + { + "epoch": 0.16746248427513738, + "grad_norm": 3.761040687561035, + "learning_rate": 2.7971589466362234e-05, + "loss": 1.7722, + "step": 85995 + }, + { + "epoch": 0.16749169454624765, + "grad_norm": 2.698612928390503, + "learning_rate": 2.7970898182727044e-05, + "loss": 1.8031, + "step": 86010 + }, + { + "epoch": 0.1675209048173579, + "grad_norm": 2.611518621444702, + "learning_rate": 2.7970206789862104e-05, + "loss": 2.0427, + "step": 86025 + }, + { + "epoch": 0.16755011508846818, + "grad_norm": 4.3929667472839355, + "learning_rate": 2.7969515287773225e-05, + "loss": 2.0517, + "step": 86040 + }, + { + "epoch": 0.16757932535957842, + "grad_norm": 3.4562489986419678, + "learning_rate": 2.7968823676466233e-05, + "loss": 1.7637, + "step": 86055 + }, + { + "epoch": 0.1676085356306887, + "grad_norm": 2.392221450805664, + "learning_rate": 2.7968131955946953e-05, + "loss": 1.8937, + "step": 86070 + }, + { + "epoch": 0.16763774590179897, + "grad_norm": 3.572672128677368, + "learning_rate": 2.7967440126221213e-05, + "loss": 2.0033, + "step": 86085 + }, + { + "epoch": 0.16766695617290922, + "grad_norm": 3.0939183235168457, + "learning_rate": 2.7966748187294833e-05, + "loss": 1.8074, + "step": 86100 + }, + { + "epoch": 0.1676961664440195, + "grad_norm": 2.9930481910705566, + "learning_rate": 2.7966056139173645e-05, + "loss": 1.8568, + "step": 86115 + }, + { + "epoch": 0.16772537671512974, + "grad_norm": 4.93708610534668, + "learning_rate": 2.7965363981863473e-05, + "loss": 1.8265, + "step": 86130 + }, + { + "epoch": 0.16775458698624002, + "grad_norm": 4.270525932312012, + "learning_rate": 2.7964671715370147e-05, + "loss": 1.9474, + "step": 86145 + }, + { + "epoch": 0.16778379725735026, + "grad_norm": 2.666229248046875, + "learning_rate": 2.7963979339699502e-05, + "loss": 1.906, + "step": 86160 + }, + { + "epoch": 0.16781300752846054, + "grad_norm": 2.5731043815612793, + "learning_rate": 2.7963286854857355e-05, + "loss": 1.7171, + "step": 86175 + }, + { + "epoch": 0.16784221779957081, + "grad_norm": 2.852069139480591, + "learning_rate": 2.796259426084955e-05, + "loss": 1.9204, + "step": 86190 + }, + { + "epoch": 0.16787142807068106, + "grad_norm": 3.9128577709198, + "learning_rate": 2.796190155768192e-05, + "loss": 1.8817, + "step": 86205 + }, + { + "epoch": 0.16790063834179134, + "grad_norm": 3.5916457176208496, + "learning_rate": 2.796120874536029e-05, + "loss": 1.9411, + "step": 86220 + }, + { + "epoch": 0.16792984861290158, + "grad_norm": 3.4578700065612793, + "learning_rate": 2.7960515823890498e-05, + "loss": 1.7339, + "step": 86235 + }, + { + "epoch": 0.16795905888401186, + "grad_norm": 3.715704917907715, + "learning_rate": 2.795982279327838e-05, + "loss": 1.7846, + "step": 86250 + }, + { + "epoch": 0.1679882691551221, + "grad_norm": 2.2966363430023193, + "learning_rate": 2.7959129653529772e-05, + "loss": 1.8433, + "step": 86265 + }, + { + "epoch": 0.16801747942623238, + "grad_norm": 3.693284273147583, + "learning_rate": 2.795843640465051e-05, + "loss": 1.8179, + "step": 86280 + }, + { + "epoch": 0.16804668969734265, + "grad_norm": 3.669166326522827, + "learning_rate": 2.7957743046646432e-05, + "loss": 1.7746, + "step": 86295 + }, + { + "epoch": 0.1680758999684529, + "grad_norm": 4.729858875274658, + "learning_rate": 2.7957049579523375e-05, + "loss": 1.6939, + "step": 86310 + }, + { + "epoch": 0.16810511023956318, + "grad_norm": 2.1191747188568115, + "learning_rate": 2.7956356003287184e-05, + "loss": 1.73, + "step": 86325 + }, + { + "epoch": 0.16813432051067342, + "grad_norm": 2.40170955657959, + "learning_rate": 2.79556623179437e-05, + "loss": 1.9329, + "step": 86340 + }, + { + "epoch": 0.1681635307817837, + "grad_norm": 4.313083171844482, + "learning_rate": 2.7954968523498754e-05, + "loss": 1.8287, + "step": 86355 + }, + { + "epoch": 0.16819274105289395, + "grad_norm": 3.751596450805664, + "learning_rate": 2.7954274619958203e-05, + "loss": 2.0336, + "step": 86370 + }, + { + "epoch": 0.16822195132400422, + "grad_norm": 3.3805999755859375, + "learning_rate": 2.7953580607327883e-05, + "loss": 1.9203, + "step": 86385 + }, + { + "epoch": 0.1682511615951145, + "grad_norm": 1.9758504629135132, + "learning_rate": 2.7952886485613634e-05, + "loss": 1.7092, + "step": 86400 + }, + { + "epoch": 0.16828037186622474, + "grad_norm": 2.9824628829956055, + "learning_rate": 2.795219225482131e-05, + "loss": 1.8156, + "step": 86415 + }, + { + "epoch": 0.16830958213733502, + "grad_norm": 3.25563907623291, + "learning_rate": 2.7951497914956753e-05, + "loss": 1.7606, + "step": 86430 + }, + { + "epoch": 0.16833879240844526, + "grad_norm": 3.97023344039917, + "learning_rate": 2.795080346602581e-05, + "loss": 1.8869, + "step": 86445 + }, + { + "epoch": 0.16836800267955554, + "grad_norm": 2.7483327388763428, + "learning_rate": 2.7950108908034326e-05, + "loss": 1.8547, + "step": 86460 + }, + { + "epoch": 0.1683972129506658, + "grad_norm": 3.2617428302764893, + "learning_rate": 2.7949414240988157e-05, + "loss": 2.0444, + "step": 86475 + }, + { + "epoch": 0.16842642322177606, + "grad_norm": 3.017998456954956, + "learning_rate": 2.794871946489315e-05, + "loss": 2.0028, + "step": 86490 + }, + { + "epoch": 0.16845563349288634, + "grad_norm": 2.118093729019165, + "learning_rate": 2.7948024579755156e-05, + "loss": 1.9545, + "step": 86505 + }, + { + "epoch": 0.16848484376399658, + "grad_norm": 3.7398133277893066, + "learning_rate": 2.794732958558002e-05, + "loss": 1.8301, + "step": 86520 + }, + { + "epoch": 0.16851405403510686, + "grad_norm": 3.869276523590088, + "learning_rate": 2.794663448237361e-05, + "loss": 1.7938, + "step": 86535 + }, + { + "epoch": 0.1685432643062171, + "grad_norm": 2.869392156600952, + "learning_rate": 2.7945939270141764e-05, + "loss": 2.1531, + "step": 86550 + }, + { + "epoch": 0.16857247457732738, + "grad_norm": 4.394891262054443, + "learning_rate": 2.7945243948890342e-05, + "loss": 1.8988, + "step": 86565 + }, + { + "epoch": 0.16860168484843763, + "grad_norm": 3.7028937339782715, + "learning_rate": 2.79445485186252e-05, + "loss": 1.8252, + "step": 86580 + }, + { + "epoch": 0.1686308951195479, + "grad_norm": 2.433528423309326, + "learning_rate": 2.7943852979352196e-05, + "loss": 1.7823, + "step": 86595 + }, + { + "epoch": 0.16866010539065818, + "grad_norm": 2.6936898231506348, + "learning_rate": 2.7943157331077186e-05, + "loss": 1.8181, + "step": 86610 + }, + { + "epoch": 0.16868931566176842, + "grad_norm": 4.247588157653809, + "learning_rate": 2.794246157380603e-05, + "loss": 1.8574, + "step": 86625 + }, + { + "epoch": 0.1687185259328787, + "grad_norm": 2.947840452194214, + "learning_rate": 2.7941765707544583e-05, + "loss": 1.8778, + "step": 86640 + }, + { + "epoch": 0.16874773620398895, + "grad_norm": 2.997903823852539, + "learning_rate": 2.7941069732298704e-05, + "loss": 2.0352, + "step": 86655 + }, + { + "epoch": 0.16877694647509922, + "grad_norm": 4.1995439529418945, + "learning_rate": 2.794037364807426e-05, + "loss": 1.8076, + "step": 86670 + }, + { + "epoch": 0.16880615674620947, + "grad_norm": 1.9795687198638916, + "learning_rate": 2.793967745487711e-05, + "loss": 1.8211, + "step": 86685 + }, + { + "epoch": 0.16883536701731974, + "grad_norm": 3.173441171646118, + "learning_rate": 2.793898115271311e-05, + "loss": 1.8966, + "step": 86700 + }, + { + "epoch": 0.16886457728843002, + "grad_norm": 5.153031349182129, + "learning_rate": 2.7938284741588136e-05, + "loss": 1.7198, + "step": 86715 + }, + { + "epoch": 0.16889378755954026, + "grad_norm": 2.513739824295044, + "learning_rate": 2.7937588221508042e-05, + "loss": 1.8157, + "step": 86730 + }, + { + "epoch": 0.16892299783065054, + "grad_norm": 4.0160298347473145, + "learning_rate": 2.79368915924787e-05, + "loss": 1.8999, + "step": 86745 + }, + { + "epoch": 0.1689522081017608, + "grad_norm": 4.072794437408447, + "learning_rate": 2.793619485450598e-05, + "loss": 1.7462, + "step": 86760 + }, + { + "epoch": 0.16898141837287106, + "grad_norm": 3.2809090614318848, + "learning_rate": 2.7935498007595736e-05, + "loss": 1.9822, + "step": 86775 + }, + { + "epoch": 0.1690106286439813, + "grad_norm": 6.963658809661865, + "learning_rate": 2.793480105175385e-05, + "loss": 1.8834, + "step": 86790 + }, + { + "epoch": 0.16903983891509158, + "grad_norm": 3.769411087036133, + "learning_rate": 2.7934103986986184e-05, + "loss": 2.0618, + "step": 86805 + }, + { + "epoch": 0.16906904918620186, + "grad_norm": 2.0544545650482178, + "learning_rate": 2.7933406813298602e-05, + "loss": 1.8441, + "step": 86820 + }, + { + "epoch": 0.1690982594573121, + "grad_norm": 2.379471778869629, + "learning_rate": 2.7932709530696993e-05, + "loss": 1.6866, + "step": 86835 + }, + { + "epoch": 0.16912746972842238, + "grad_norm": 2.7043585777282715, + "learning_rate": 2.793201213918721e-05, + "loss": 1.7222, + "step": 86850 + }, + { + "epoch": 0.16915667999953263, + "grad_norm": 3.1886067390441895, + "learning_rate": 2.7931314638775135e-05, + "loss": 2.0303, + "step": 86865 + }, + { + "epoch": 0.1691858902706429, + "grad_norm": 3.3426759243011475, + "learning_rate": 2.7930617029466643e-05, + "loss": 1.7885, + "step": 86880 + }, + { + "epoch": 0.16921510054175315, + "grad_norm": 3.4579195976257324, + "learning_rate": 2.7929919311267607e-05, + "loss": 1.9343, + "step": 86895 + }, + { + "epoch": 0.16924431081286342, + "grad_norm": 6.206538200378418, + "learning_rate": 2.7929221484183898e-05, + "loss": 1.8751, + "step": 86910 + }, + { + "epoch": 0.1692735210839737, + "grad_norm": 3.4537949562072754, + "learning_rate": 2.79285235482214e-05, + "loss": 1.9107, + "step": 86925 + }, + { + "epoch": 0.16930273135508395, + "grad_norm": 2.4171297550201416, + "learning_rate": 2.7927825503385984e-05, + "loss": 1.7849, + "step": 86940 + }, + { + "epoch": 0.16933194162619422, + "grad_norm": 3.309356451034546, + "learning_rate": 2.7927127349683535e-05, + "loss": 1.8778, + "step": 86955 + }, + { + "epoch": 0.16936115189730447, + "grad_norm": 3.138411521911621, + "learning_rate": 2.7926429087119924e-05, + "loss": 1.8813, + "step": 86970 + }, + { + "epoch": 0.16939036216841474, + "grad_norm": 3.0347280502319336, + "learning_rate": 2.7925730715701042e-05, + "loss": 1.916, + "step": 86985 + }, + { + "epoch": 0.169419572439525, + "grad_norm": 2.392861843109131, + "learning_rate": 2.792503223543276e-05, + "loss": 1.903, + "step": 87000 + }, + { + "epoch": 0.16944878271063527, + "grad_norm": 2.4729256629943848, + "learning_rate": 2.7924333646320962e-05, + "loss": 2.0148, + "step": 87015 + }, + { + "epoch": 0.16947799298174554, + "grad_norm": 3.036146402359009, + "learning_rate": 2.7923634948371533e-05, + "loss": 1.8755, + "step": 87030 + }, + { + "epoch": 0.1695072032528558, + "grad_norm": 4.769365310668945, + "learning_rate": 2.7922936141590358e-05, + "loss": 1.9632, + "step": 87045 + }, + { + "epoch": 0.16953641352396606, + "grad_norm": 2.0792274475097656, + "learning_rate": 2.792223722598332e-05, + "loss": 1.8999, + "step": 87060 + }, + { + "epoch": 0.1695656237950763, + "grad_norm": 3.1346921920776367, + "learning_rate": 2.7921538201556305e-05, + "loss": 1.988, + "step": 87075 + }, + { + "epoch": 0.16959483406618658, + "grad_norm": 3.368657112121582, + "learning_rate": 2.7920839068315198e-05, + "loss": 1.8991, + "step": 87090 + }, + { + "epoch": 0.16962404433729683, + "grad_norm": 2.291987419128418, + "learning_rate": 2.792013982626589e-05, + "loss": 1.7781, + "step": 87105 + }, + { + "epoch": 0.1696532546084071, + "grad_norm": 5.38674259185791, + "learning_rate": 2.7919440475414265e-05, + "loss": 1.6992, + "step": 87120 + }, + { + "epoch": 0.16968246487951738, + "grad_norm": 3.8929696083068848, + "learning_rate": 2.7918741015766217e-05, + "loss": 1.7862, + "step": 87135 + }, + { + "epoch": 0.16971167515062763, + "grad_norm": 2.6595265865325928, + "learning_rate": 2.7918041447327635e-05, + "loss": 1.9221, + "step": 87150 + }, + { + "epoch": 0.1697408854217379, + "grad_norm": 3.4818854331970215, + "learning_rate": 2.791734177010441e-05, + "loss": 1.88, + "step": 87165 + }, + { + "epoch": 0.16977009569284815, + "grad_norm": 2.475863218307495, + "learning_rate": 2.791664198410243e-05, + "loss": 1.8032, + "step": 87180 + }, + { + "epoch": 0.16979930596395842, + "grad_norm": 5.91239595413208, + "learning_rate": 2.791594208932759e-05, + "loss": 1.6916, + "step": 87195 + }, + { + "epoch": 0.16982851623506867, + "grad_norm": 2.361457347869873, + "learning_rate": 2.7915242085785787e-05, + "loss": 1.8865, + "step": 87210 + }, + { + "epoch": 0.16985772650617895, + "grad_norm": 3.656527519226074, + "learning_rate": 2.7914541973482912e-05, + "loss": 1.8775, + "step": 87225 + }, + { + "epoch": 0.16988693677728922, + "grad_norm": 3.3165628910064697, + "learning_rate": 2.791384175242487e-05, + "loss": 2.0565, + "step": 87240 + }, + { + "epoch": 0.16991614704839947, + "grad_norm": 2.410724401473999, + "learning_rate": 2.7913141422617542e-05, + "loss": 1.7248, + "step": 87255 + }, + { + "epoch": 0.16994535731950974, + "grad_norm": 3.183039903640747, + "learning_rate": 2.7912440984066835e-05, + "loss": 1.9037, + "step": 87270 + }, + { + "epoch": 0.16997456759062, + "grad_norm": 4.178635120391846, + "learning_rate": 2.7911740436778647e-05, + "loss": 1.8069, + "step": 87285 + }, + { + "epoch": 0.17000377786173027, + "grad_norm": 3.9849953651428223, + "learning_rate": 2.7911039780758878e-05, + "loss": 1.777, + "step": 87300 + }, + { + "epoch": 0.1700329881328405, + "grad_norm": 2.700085401535034, + "learning_rate": 2.791033901601343e-05, + "loss": 2.1485, + "step": 87315 + }, + { + "epoch": 0.1700621984039508, + "grad_norm": 3.645005702972412, + "learning_rate": 2.7909638142548196e-05, + "loss": 1.9997, + "step": 87330 + }, + { + "epoch": 0.17009140867506106, + "grad_norm": 2.690925121307373, + "learning_rate": 2.7908937160369084e-05, + "loss": 2.0263, + "step": 87345 + }, + { + "epoch": 0.1701206189461713, + "grad_norm": 4.018752098083496, + "learning_rate": 2.7908236069482e-05, + "loss": 1.9072, + "step": 87360 + }, + { + "epoch": 0.17014982921728158, + "grad_norm": 3.6640520095825195, + "learning_rate": 2.7907534869892845e-05, + "loss": 2.0041, + "step": 87375 + }, + { + "epoch": 0.17017903948839183, + "grad_norm": 4.49193000793457, + "learning_rate": 2.7906833561607522e-05, + "loss": 1.8883, + "step": 87390 + }, + { + "epoch": 0.1702082497595021, + "grad_norm": 3.442638397216797, + "learning_rate": 2.7906132144631934e-05, + "loss": 1.9077, + "step": 87405 + }, + { + "epoch": 0.17023746003061235, + "grad_norm": 2.6391854286193848, + "learning_rate": 2.7905430618972e-05, + "loss": 1.9737, + "step": 87420 + }, + { + "epoch": 0.17026667030172263, + "grad_norm": 3.5197558403015137, + "learning_rate": 2.7904728984633616e-05, + "loss": 1.8316, + "step": 87435 + }, + { + "epoch": 0.1702958805728329, + "grad_norm": 5.3151631355285645, + "learning_rate": 2.7904027241622694e-05, + "loss": 1.9998, + "step": 87450 + }, + { + "epoch": 0.17032509084394315, + "grad_norm": 2.2073421478271484, + "learning_rate": 2.790332538994514e-05, + "loss": 1.6697, + "step": 87465 + }, + { + "epoch": 0.17035430111505342, + "grad_norm": 2.802800416946411, + "learning_rate": 2.7902623429606873e-05, + "loss": 1.9456, + "step": 87480 + }, + { + "epoch": 0.17038351138616367, + "grad_norm": 2.0173850059509277, + "learning_rate": 2.79019213606138e-05, + "loss": 1.6899, + "step": 87495 + }, + { + "epoch": 0.17041272165727395, + "grad_norm": 2.7733025550842285, + "learning_rate": 2.790121918297183e-05, + "loss": 1.9637, + "step": 87510 + }, + { + "epoch": 0.1704419319283842, + "grad_norm": 4.355037212371826, + "learning_rate": 2.7900516896686878e-05, + "loss": 1.9527, + "step": 87525 + }, + { + "epoch": 0.17047114219949447, + "grad_norm": 4.067016124725342, + "learning_rate": 2.789981450176486e-05, + "loss": 1.8496, + "step": 87540 + }, + { + "epoch": 0.17050035247060474, + "grad_norm": 3.396195888519287, + "learning_rate": 2.789911199821169e-05, + "loss": 1.9627, + "step": 87555 + }, + { + "epoch": 0.170529562741715, + "grad_norm": 2.533646583557129, + "learning_rate": 2.7898409386033285e-05, + "loss": 2.0675, + "step": 87570 + }, + { + "epoch": 0.17055877301282527, + "grad_norm": 3.9269025325775146, + "learning_rate": 2.7897706665235562e-05, + "loss": 1.706, + "step": 87585 + }, + { + "epoch": 0.1705879832839355, + "grad_norm": 1.9529690742492676, + "learning_rate": 2.789700383582443e-05, + "loss": 1.7483, + "step": 87600 + }, + { + "epoch": 0.1706171935550458, + "grad_norm": 2.301961898803711, + "learning_rate": 2.7896300897805826e-05, + "loss": 1.9401, + "step": 87615 + }, + { + "epoch": 0.17064640382615603, + "grad_norm": 3.272174835205078, + "learning_rate": 2.7895597851185648e-05, + "loss": 1.7303, + "step": 87630 + }, + { + "epoch": 0.1706756140972663, + "grad_norm": 2.8104374408721924, + "learning_rate": 2.7894894695969832e-05, + "loss": 1.8989, + "step": 87645 + }, + { + "epoch": 0.17070482436837658, + "grad_norm": 3.6130502223968506, + "learning_rate": 2.789419143216429e-05, + "loss": 1.871, + "step": 87660 + }, + { + "epoch": 0.17073403463948683, + "grad_norm": 4.250863552093506, + "learning_rate": 2.789348805977495e-05, + "loss": 1.8155, + "step": 87675 + }, + { + "epoch": 0.1707632449105971, + "grad_norm": 3.6635870933532715, + "learning_rate": 2.7892784578807735e-05, + "loss": 1.8352, + "step": 87690 + }, + { + "epoch": 0.17079245518170735, + "grad_norm": 2.2338929176330566, + "learning_rate": 2.789208098926857e-05, + "loss": 2.1082, + "step": 87705 + }, + { + "epoch": 0.17082166545281763, + "grad_norm": 4.359334468841553, + "learning_rate": 2.7891377291163373e-05, + "loss": 2.1074, + "step": 87720 + }, + { + "epoch": 0.17085087572392788, + "grad_norm": 4.065170764923096, + "learning_rate": 2.789067348449808e-05, + "loss": 1.9829, + "step": 87735 + }, + { + "epoch": 0.17088008599503815, + "grad_norm": 5.102344512939453, + "learning_rate": 2.7889969569278607e-05, + "loss": 1.5442, + "step": 87750 + }, + { + "epoch": 0.17090929626614842, + "grad_norm": 2.6355929374694824, + "learning_rate": 2.788926554551089e-05, + "loss": 1.9845, + "step": 87765 + }, + { + "epoch": 0.17093850653725867, + "grad_norm": 4.133533000946045, + "learning_rate": 2.7888561413200855e-05, + "loss": 1.7897, + "step": 87780 + }, + { + "epoch": 0.17096771680836895, + "grad_norm": 7.129817485809326, + "learning_rate": 2.788785717235443e-05, + "loss": 1.8117, + "step": 87795 + }, + { + "epoch": 0.1709969270794792, + "grad_norm": 2.9881539344787598, + "learning_rate": 2.788715282297755e-05, + "loss": 1.7876, + "step": 87810 + }, + { + "epoch": 0.17102613735058947, + "grad_norm": 5.115859031677246, + "learning_rate": 2.788644836507614e-05, + "loss": 1.8925, + "step": 87825 + }, + { + "epoch": 0.17105534762169972, + "grad_norm": 4.179714202880859, + "learning_rate": 2.7885743798656142e-05, + "loss": 1.8686, + "step": 87840 + }, + { + "epoch": 0.17108455789281, + "grad_norm": 2.2996532917022705, + "learning_rate": 2.788503912372348e-05, + "loss": 1.8676, + "step": 87855 + }, + { + "epoch": 0.17111376816392027, + "grad_norm": 4.52742338180542, + "learning_rate": 2.788433434028409e-05, + "loss": 1.8404, + "step": 87870 + }, + { + "epoch": 0.1711429784350305, + "grad_norm": 2.543853759765625, + "learning_rate": 2.788362944834391e-05, + "loss": 2.0198, + "step": 87885 + }, + { + "epoch": 0.1711721887061408, + "grad_norm": 2.5702719688415527, + "learning_rate": 2.788292444790887e-05, + "loss": 1.8805, + "step": 87900 + }, + { + "epoch": 0.17120139897725103, + "grad_norm": 3.5167903900146484, + "learning_rate": 2.7882219338984914e-05, + "loss": 1.8898, + "step": 87915 + }, + { + "epoch": 0.1712306092483613, + "grad_norm": 3.4654746055603027, + "learning_rate": 2.788151412157798e-05, + "loss": 1.9784, + "step": 87930 + }, + { + "epoch": 0.17125981951947156, + "grad_norm": 3.3887298107147217, + "learning_rate": 2.7880808795694004e-05, + "loss": 1.9004, + "step": 87945 + }, + { + "epoch": 0.17128902979058183, + "grad_norm": 2.475003242492676, + "learning_rate": 2.788010336133892e-05, + "loss": 1.7415, + "step": 87960 + }, + { + "epoch": 0.1713182400616921, + "grad_norm": 2.393444538116455, + "learning_rate": 2.787939781851868e-05, + "loss": 1.6666, + "step": 87975 + }, + { + "epoch": 0.17134745033280235, + "grad_norm": 2.5584492683410645, + "learning_rate": 2.7878692167239214e-05, + "loss": 1.9021, + "step": 87990 + }, + { + "epoch": 0.17137666060391263, + "grad_norm": 2.8385488986968994, + "learning_rate": 2.7877986407506475e-05, + "loss": 1.9218, + "step": 88005 + }, + { + "epoch": 0.17140587087502288, + "grad_norm": 3.4241373538970947, + "learning_rate": 2.7877280539326396e-05, + "loss": 1.9449, + "step": 88020 + }, + { + "epoch": 0.17143508114613315, + "grad_norm": 3.1694869995117188, + "learning_rate": 2.787657456270493e-05, + "loss": 1.8056, + "step": 88035 + }, + { + "epoch": 0.1714642914172434, + "grad_norm": 2.365126848220825, + "learning_rate": 2.787586847764802e-05, + "loss": 1.8256, + "step": 88050 + }, + { + "epoch": 0.17149350168835367, + "grad_norm": 4.6236419677734375, + "learning_rate": 2.787516228416161e-05, + "loss": 1.6848, + "step": 88065 + }, + { + "epoch": 0.17152271195946392, + "grad_norm": 4.975138187408447, + "learning_rate": 2.7874455982251646e-05, + "loss": 2.0252, + "step": 88080 + }, + { + "epoch": 0.1715519222305742, + "grad_norm": 2.200153350830078, + "learning_rate": 2.7873749571924077e-05, + "loss": 1.9271, + "step": 88095 + }, + { + "epoch": 0.17158113250168447, + "grad_norm": 2.604912042617798, + "learning_rate": 2.7873043053184853e-05, + "loss": 2.3016, + "step": 88110 + }, + { + "epoch": 0.17161034277279472, + "grad_norm": 3.819098472595215, + "learning_rate": 2.7872336426039923e-05, + "loss": 1.8302, + "step": 88125 + }, + { + "epoch": 0.171639553043905, + "grad_norm": 2.6062324047088623, + "learning_rate": 2.7871629690495236e-05, + "loss": 1.8468, + "step": 88140 + }, + { + "epoch": 0.17166876331501524, + "grad_norm": 2.733017683029175, + "learning_rate": 2.7870922846556747e-05, + "loss": 1.8896, + "step": 88155 + }, + { + "epoch": 0.1716979735861255, + "grad_norm": 2.310469627380371, + "learning_rate": 2.787021589423041e-05, + "loss": 1.9627, + "step": 88170 + }, + { + "epoch": 0.17172718385723576, + "grad_norm": 3.388695478439331, + "learning_rate": 2.786950883352217e-05, + "loss": 1.878, + "step": 88185 + }, + { + "epoch": 0.17175639412834603, + "grad_norm": 3.288750648498535, + "learning_rate": 2.7868801664437986e-05, + "loss": 1.6069, + "step": 88200 + }, + { + "epoch": 0.1717856043994563, + "grad_norm": 4.271388053894043, + "learning_rate": 2.7868094386983812e-05, + "loss": 1.9449, + "step": 88215 + }, + { + "epoch": 0.17181481467056656, + "grad_norm": 3.8591396808624268, + "learning_rate": 2.786738700116561e-05, + "loss": 1.8353, + "step": 88230 + }, + { + "epoch": 0.17184402494167683, + "grad_norm": 2.101454496383667, + "learning_rate": 2.786667950698933e-05, + "loss": 1.9545, + "step": 88245 + }, + { + "epoch": 0.17187323521278708, + "grad_norm": 2.3093912601470947, + "learning_rate": 2.7865971904460932e-05, + "loss": 1.9633, + "step": 88260 + }, + { + "epoch": 0.17190244548389735, + "grad_norm": 2.3513643741607666, + "learning_rate": 2.7865264193586375e-05, + "loss": 1.8648, + "step": 88275 + }, + { + "epoch": 0.1719316557550076, + "grad_norm": 3.0354158878326416, + "learning_rate": 2.7864556374371616e-05, + "loss": 2.0074, + "step": 88290 + }, + { + "epoch": 0.17196086602611788, + "grad_norm": 2.676938772201538, + "learning_rate": 2.7863848446822623e-05, + "loss": 2.0261, + "step": 88305 + }, + { + "epoch": 0.17199007629722815, + "grad_norm": 2.194063186645508, + "learning_rate": 2.786314041094535e-05, + "loss": 1.9386, + "step": 88320 + }, + { + "epoch": 0.1720192865683384, + "grad_norm": 2.1363637447357178, + "learning_rate": 2.7862432266745766e-05, + "loss": 1.7709, + "step": 88335 + }, + { + "epoch": 0.17204849683944867, + "grad_norm": 2.275808811187744, + "learning_rate": 2.7861724014229825e-05, + "loss": 1.8301, + "step": 88350 + }, + { + "epoch": 0.17207770711055892, + "grad_norm": 4.052717685699463, + "learning_rate": 2.7861015653403503e-05, + "loss": 1.7277, + "step": 88365 + }, + { + "epoch": 0.1721069173816692, + "grad_norm": 3.851738214492798, + "learning_rate": 2.7860307184272754e-05, + "loss": 1.7305, + "step": 88380 + }, + { + "epoch": 0.17213612765277944, + "grad_norm": 4.355624198913574, + "learning_rate": 2.7859598606843556e-05, + "loss": 2.0954, + "step": 88395 + }, + { + "epoch": 0.17216533792388972, + "grad_norm": 3.5033466815948486, + "learning_rate": 2.7858889921121867e-05, + "loss": 1.9174, + "step": 88410 + }, + { + "epoch": 0.172194548195, + "grad_norm": 2.6904313564300537, + "learning_rate": 2.7858181127113657e-05, + "loss": 1.8478, + "step": 88425 + }, + { + "epoch": 0.17222375846611024, + "grad_norm": 3.1379032135009766, + "learning_rate": 2.7857472224824895e-05, + "loss": 2.026, + "step": 88440 + }, + { + "epoch": 0.1722529687372205, + "grad_norm": 2.3870766162872314, + "learning_rate": 2.785676321426155e-05, + "loss": 1.9988, + "step": 88455 + }, + { + "epoch": 0.17228217900833076, + "grad_norm": 2.437161445617676, + "learning_rate": 2.7856054095429595e-05, + "loss": 1.9288, + "step": 88470 + }, + { + "epoch": 0.17231138927944104, + "grad_norm": 2.481391668319702, + "learning_rate": 2.7855344868335e-05, + "loss": 1.9899, + "step": 88485 + }, + { + "epoch": 0.17234059955055128, + "grad_norm": 3.16351580619812, + "learning_rate": 2.7854635532983737e-05, + "loss": 2.1198, + "step": 88500 + }, + { + "epoch": 0.17236980982166156, + "grad_norm": 4.559393882751465, + "learning_rate": 2.7853926089381782e-05, + "loss": 1.6827, + "step": 88515 + }, + { + "epoch": 0.17239902009277183, + "grad_norm": 2.597702741622925, + "learning_rate": 2.7853216537535108e-05, + "loss": 2.0348, + "step": 88530 + }, + { + "epoch": 0.17242823036388208, + "grad_norm": 4.713498115539551, + "learning_rate": 2.7852506877449688e-05, + "loss": 1.92, + "step": 88545 + }, + { + "epoch": 0.17245744063499235, + "grad_norm": 2.338606595993042, + "learning_rate": 2.78517971091315e-05, + "loss": 1.9943, + "step": 88560 + }, + { + "epoch": 0.1724866509061026, + "grad_norm": 2.3854386806488037, + "learning_rate": 2.7851087232586522e-05, + "loss": 1.9943, + "step": 88575 + }, + { + "epoch": 0.17251586117721288, + "grad_norm": 2.414715528488159, + "learning_rate": 2.785037724782073e-05, + "loss": 1.829, + "step": 88590 + }, + { + "epoch": 0.17254507144832312, + "grad_norm": 3.002180576324463, + "learning_rate": 2.7849667154840105e-05, + "loss": 1.9265, + "step": 88605 + }, + { + "epoch": 0.1725742817194334, + "grad_norm": 3.404930353164673, + "learning_rate": 2.7848956953650624e-05, + "loss": 1.6335, + "step": 88620 + }, + { + "epoch": 0.17260349199054367, + "grad_norm": 3.8675458431243896, + "learning_rate": 2.784824664425827e-05, + "loss": 1.8427, + "step": 88635 + }, + { + "epoch": 0.17263270226165392, + "grad_norm": 2.727069854736328, + "learning_rate": 2.7847536226669024e-05, + "loss": 1.7344, + "step": 88650 + }, + { + "epoch": 0.1726619125327642, + "grad_norm": 3.1589672565460205, + "learning_rate": 2.784682570088887e-05, + "loss": 2.0086, + "step": 88665 + }, + { + "epoch": 0.17269112280387444, + "grad_norm": 2.219796895980835, + "learning_rate": 2.7846115066923787e-05, + "loss": 1.9095, + "step": 88680 + }, + { + "epoch": 0.17272033307498472, + "grad_norm": 3.5037944316864014, + "learning_rate": 2.7845404324779764e-05, + "loss": 1.8684, + "step": 88695 + }, + { + "epoch": 0.17274954334609496, + "grad_norm": 2.7663533687591553, + "learning_rate": 2.7844693474462787e-05, + "loss": 1.8062, + "step": 88710 + }, + { + "epoch": 0.17277875361720524, + "grad_norm": 3.9269888401031494, + "learning_rate": 2.7843982515978835e-05, + "loss": 1.9698, + "step": 88725 + }, + { + "epoch": 0.1728079638883155, + "grad_norm": 4.455451011657715, + "learning_rate": 2.78432714493339e-05, + "loss": 1.947, + "step": 88740 + }, + { + "epoch": 0.17283717415942576, + "grad_norm": 1.977190375328064, + "learning_rate": 2.7842560274533974e-05, + "loss": 1.8181, + "step": 88755 + }, + { + "epoch": 0.17286638443053604, + "grad_norm": 2.457735776901245, + "learning_rate": 2.784184899158504e-05, + "loss": 1.9058, + "step": 88770 + }, + { + "epoch": 0.17289559470164628, + "grad_norm": 3.2907094955444336, + "learning_rate": 2.784113760049309e-05, + "loss": 1.9371, + "step": 88785 + }, + { + "epoch": 0.17292480497275656, + "grad_norm": 2.19822096824646, + "learning_rate": 2.784042610126411e-05, + "loss": 1.9304, + "step": 88800 + }, + { + "epoch": 0.1729540152438668, + "grad_norm": 3.5285351276397705, + "learning_rate": 2.78397144939041e-05, + "loss": 1.8351, + "step": 88815 + }, + { + "epoch": 0.17298322551497708, + "grad_norm": 2.4372923374176025, + "learning_rate": 2.7839002778419048e-05, + "loss": 1.879, + "step": 88830 + }, + { + "epoch": 0.17301243578608735, + "grad_norm": 4.532806396484375, + "learning_rate": 2.7838290954814946e-05, + "loss": 1.686, + "step": 88845 + }, + { + "epoch": 0.1730416460571976, + "grad_norm": 3.569352865219116, + "learning_rate": 2.7837579023097794e-05, + "loss": 1.8263, + "step": 88860 + }, + { + "epoch": 0.17307085632830788, + "grad_norm": 3.488201141357422, + "learning_rate": 2.783686698327358e-05, + "loss": 1.8357, + "step": 88875 + }, + { + "epoch": 0.17310006659941812, + "grad_norm": 2.5802218914031982, + "learning_rate": 2.7836154835348304e-05, + "loss": 1.901, + "step": 88890 + }, + { + "epoch": 0.1731292768705284, + "grad_norm": 2.843557596206665, + "learning_rate": 2.7835442579327967e-05, + "loss": 1.8512, + "step": 88905 + }, + { + "epoch": 0.17315848714163865, + "grad_norm": 2.6477513313293457, + "learning_rate": 2.7834730215218558e-05, + "loss": 1.739, + "step": 88920 + }, + { + "epoch": 0.17318769741274892, + "grad_norm": 5.536716461181641, + "learning_rate": 2.783401774302608e-05, + "loss": 1.9952, + "step": 88935 + }, + { + "epoch": 0.1732169076838592, + "grad_norm": 3.0617306232452393, + "learning_rate": 2.7833305162756537e-05, + "loss": 2.0108, + "step": 88950 + }, + { + "epoch": 0.17324611795496944, + "grad_norm": 3.800445795059204, + "learning_rate": 2.7832592474415923e-05, + "loss": 1.8455, + "step": 88965 + }, + { + "epoch": 0.17327532822607972, + "grad_norm": 2.4237465858459473, + "learning_rate": 2.783187967801025e-05, + "loss": 1.86, + "step": 88980 + }, + { + "epoch": 0.17330453849718996, + "grad_norm": 3.5917246341705322, + "learning_rate": 2.7831166773545506e-05, + "loss": 1.8303, + "step": 88995 + }, + { + "epoch": 0.17333374876830024, + "grad_norm": 2.956064462661743, + "learning_rate": 2.7830453761027704e-05, + "loss": 1.6996, + "step": 89010 + }, + { + "epoch": 0.17336295903941049, + "grad_norm": 2.6760928630828857, + "learning_rate": 2.7829740640462846e-05, + "loss": 1.6839, + "step": 89025 + }, + { + "epoch": 0.17339216931052076, + "grad_norm": 3.4546456336975098, + "learning_rate": 2.7829027411856938e-05, + "loss": 1.9683, + "step": 89040 + }, + { + "epoch": 0.17342137958163104, + "grad_norm": 3.5256564617156982, + "learning_rate": 2.7828314075215988e-05, + "loss": 1.8987, + "step": 89055 + }, + { + "epoch": 0.17345058985274128, + "grad_norm": 4.645074367523193, + "learning_rate": 2.7827600630546e-05, + "loss": 2.0539, + "step": 89070 + }, + { + "epoch": 0.17347980012385156, + "grad_norm": 2.23844313621521, + "learning_rate": 2.782688707785298e-05, + "loss": 1.779, + "step": 89085 + }, + { + "epoch": 0.1735090103949618, + "grad_norm": 3.6744942665100098, + "learning_rate": 2.782617341714294e-05, + "loss": 1.8119, + "step": 89100 + }, + { + "epoch": 0.17353822066607208, + "grad_norm": 7.358153343200684, + "learning_rate": 2.7825459648421893e-05, + "loss": 1.9034, + "step": 89115 + }, + { + "epoch": 0.17356743093718233, + "grad_norm": 4.005391597747803, + "learning_rate": 2.7824745771695846e-05, + "loss": 1.9932, + "step": 89130 + }, + { + "epoch": 0.1735966412082926, + "grad_norm": 3.448897361755371, + "learning_rate": 2.7824031786970808e-05, + "loss": 1.9704, + "step": 89145 + }, + { + "epoch": 0.17362585147940288, + "grad_norm": 2.8943889141082764, + "learning_rate": 2.78233176942528e-05, + "loss": 1.8422, + "step": 89160 + }, + { + "epoch": 0.17365506175051312, + "grad_norm": 5.02158260345459, + "learning_rate": 2.7822603493547825e-05, + "loss": 1.8167, + "step": 89175 + }, + { + "epoch": 0.1736842720216234, + "grad_norm": 2.590074062347412, + "learning_rate": 2.7821889184861904e-05, + "loss": 1.8462, + "step": 89190 + }, + { + "epoch": 0.17371348229273365, + "grad_norm": 3.7797505855560303, + "learning_rate": 2.782117476820105e-05, + "loss": 1.7262, + "step": 89205 + }, + { + "epoch": 0.17374269256384392, + "grad_norm": 3.689462661743164, + "learning_rate": 2.7820460243571286e-05, + "loss": 1.9908, + "step": 89220 + }, + { + "epoch": 0.17377190283495417, + "grad_norm": 2.5714001655578613, + "learning_rate": 2.781974561097862e-05, + "loss": 1.943, + "step": 89235 + }, + { + "epoch": 0.17380111310606444, + "grad_norm": 3.0879268646240234, + "learning_rate": 2.781903087042907e-05, + "loss": 1.9393, + "step": 89250 + }, + { + "epoch": 0.17383032337717472, + "grad_norm": 2.0955796241760254, + "learning_rate": 2.781831602192866e-05, + "loss": 1.7845, + "step": 89265 + }, + { + "epoch": 0.17385953364828496, + "grad_norm": 2.3010976314544678, + "learning_rate": 2.7817601065483408e-05, + "loss": 1.8344, + "step": 89280 + }, + { + "epoch": 0.17388874391939524, + "grad_norm": 4.152106285095215, + "learning_rate": 2.7816886001099337e-05, + "loss": 1.9133, + "step": 89295 + }, + { + "epoch": 0.17391795419050549, + "grad_norm": 3.1556005477905273, + "learning_rate": 2.781617082878246e-05, + "loss": 1.9849, + "step": 89310 + }, + { + "epoch": 0.17394716446161576, + "grad_norm": 2.472399950027466, + "learning_rate": 2.7815455548538813e-05, + "loss": 1.851, + "step": 89325 + }, + { + "epoch": 0.173976374732726, + "grad_norm": 3.3041281700134277, + "learning_rate": 2.7814740160374405e-05, + "loss": 1.8485, + "step": 89340 + }, + { + "epoch": 0.17400558500383628, + "grad_norm": 2.9651687145233154, + "learning_rate": 2.7814024664295273e-05, + "loss": 2.1436, + "step": 89355 + }, + { + "epoch": 0.17403479527494656, + "grad_norm": 2.975517749786377, + "learning_rate": 2.7813309060307436e-05, + "loss": 2.0152, + "step": 89370 + }, + { + "epoch": 0.1740640055460568, + "grad_norm": 3.3052468299865723, + "learning_rate": 2.7812593348416923e-05, + "loss": 1.7269, + "step": 89385 + }, + { + "epoch": 0.17409321581716708, + "grad_norm": 3.3983285427093506, + "learning_rate": 2.7811877528629757e-05, + "loss": 2.0297, + "step": 89400 + }, + { + "epoch": 0.17412242608827733, + "grad_norm": 5.843042850494385, + "learning_rate": 2.781116160095197e-05, + "loss": 1.8906, + "step": 89415 + }, + { + "epoch": 0.1741516363593876, + "grad_norm": 3.7618746757507324, + "learning_rate": 2.7810445565389587e-05, + "loss": 1.974, + "step": 89430 + }, + { + "epoch": 0.17418084663049785, + "grad_norm": 4.965116024017334, + "learning_rate": 2.7809729421948645e-05, + "loss": 1.849, + "step": 89445 + }, + { + "epoch": 0.17421005690160812, + "grad_norm": 2.4078760147094727, + "learning_rate": 2.7809013170635163e-05, + "loss": 1.9232, + "step": 89460 + }, + { + "epoch": 0.1742392671727184, + "grad_norm": 2.9732818603515625, + "learning_rate": 2.7808296811455183e-05, + "loss": 1.8322, + "step": 89475 + }, + { + "epoch": 0.17426847744382865, + "grad_norm": 4.797516822814941, + "learning_rate": 2.7807580344414736e-05, + "loss": 1.9388, + "step": 89490 + }, + { + "epoch": 0.17429768771493892, + "grad_norm": 2.3788983821868896, + "learning_rate": 2.7806863769519853e-05, + "loss": 1.789, + "step": 89505 + }, + { + "epoch": 0.17432689798604917, + "grad_norm": 2.8066797256469727, + "learning_rate": 2.7806147086776567e-05, + "loss": 1.8966, + "step": 89520 + }, + { + "epoch": 0.17435610825715944, + "grad_norm": 4.233401775360107, + "learning_rate": 2.7805430296190913e-05, + "loss": 2.0313, + "step": 89535 + }, + { + "epoch": 0.1743853185282697, + "grad_norm": 3.2195284366607666, + "learning_rate": 2.7804713397768936e-05, + "loss": 1.792, + "step": 89550 + }, + { + "epoch": 0.17441452879937996, + "grad_norm": 3.711766481399536, + "learning_rate": 2.7803996391516662e-05, + "loss": 1.7331, + "step": 89565 + }, + { + "epoch": 0.17444373907049024, + "grad_norm": 5.176243305206299, + "learning_rate": 2.7803279277440137e-05, + "loss": 1.9319, + "step": 89580 + }, + { + "epoch": 0.1744729493416005, + "grad_norm": 2.940187454223633, + "learning_rate": 2.7802562055545395e-05, + "loss": 1.9696, + "step": 89595 + }, + { + "epoch": 0.17450215961271076, + "grad_norm": 2.738393545150757, + "learning_rate": 2.7801844725838476e-05, + "loss": 1.9284, + "step": 89610 + }, + { + "epoch": 0.174531369883821, + "grad_norm": 3.630924701690674, + "learning_rate": 2.7801127288325422e-05, + "loss": 1.7365, + "step": 89625 + }, + { + "epoch": 0.17456058015493128, + "grad_norm": 3.1053097248077393, + "learning_rate": 2.7800409743012277e-05, + "loss": 1.8388, + "step": 89640 + }, + { + "epoch": 0.17458979042604153, + "grad_norm": 3.314096212387085, + "learning_rate": 2.779969208990508e-05, + "loss": 1.9964, + "step": 89655 + }, + { + "epoch": 0.1746190006971518, + "grad_norm": 4.199493408203125, + "learning_rate": 2.7798974329009875e-05, + "loss": 1.8458, + "step": 89670 + }, + { + "epoch": 0.17464821096826208, + "grad_norm": 3.3810462951660156, + "learning_rate": 2.7798256460332707e-05, + "loss": 1.9699, + "step": 89685 + }, + { + "epoch": 0.17467742123937233, + "grad_norm": 2.298717737197876, + "learning_rate": 2.7797538483879623e-05, + "loss": 1.8935, + "step": 89700 + }, + { + "epoch": 0.1747066315104826, + "grad_norm": 3.485271692276001, + "learning_rate": 2.7796820399656666e-05, + "loss": 1.8933, + "step": 89715 + }, + { + "epoch": 0.17473584178159285, + "grad_norm": 2.701718330383301, + "learning_rate": 2.7796102207669885e-05, + "loss": 1.9343, + "step": 89730 + }, + { + "epoch": 0.17476505205270312, + "grad_norm": 2.6345162391662598, + "learning_rate": 2.7795383907925328e-05, + "loss": 1.9531, + "step": 89745 + }, + { + "epoch": 0.17479426232381337, + "grad_norm": 3.044572114944458, + "learning_rate": 2.779466550042904e-05, + "loss": 1.8771, + "step": 89760 + }, + { + "epoch": 0.17482347259492365, + "grad_norm": 3.390333890914917, + "learning_rate": 2.7793946985187078e-05, + "loss": 1.695, + "step": 89775 + }, + { + "epoch": 0.17485268286603392, + "grad_norm": 2.1160192489624023, + "learning_rate": 2.7793228362205488e-05, + "loss": 1.8379, + "step": 89790 + }, + { + "epoch": 0.17488189313714417, + "grad_norm": 6.34581995010376, + "learning_rate": 2.7792509631490323e-05, + "loss": 1.925, + "step": 89805 + }, + { + "epoch": 0.17491110340825444, + "grad_norm": 8.139732360839844, + "learning_rate": 2.7791790793047634e-05, + "loss": 1.9349, + "step": 89820 + }, + { + "epoch": 0.1749403136793647, + "grad_norm": 2.525301218032837, + "learning_rate": 2.779107184688348e-05, + "loss": 1.75, + "step": 89835 + }, + { + "epoch": 0.17496952395047496, + "grad_norm": 2.51540470123291, + "learning_rate": 2.7790352793003907e-05, + "loss": 1.8986, + "step": 89850 + }, + { + "epoch": 0.1749987342215852, + "grad_norm": 2.993166208267212, + "learning_rate": 2.7789633631414974e-05, + "loss": 1.8666, + "step": 89865 + }, + { + "epoch": 0.1750279444926955, + "grad_norm": 3.18035626411438, + "learning_rate": 2.778891436212274e-05, + "loss": 1.8127, + "step": 89880 + }, + { + "epoch": 0.17505715476380576, + "grad_norm": 3.360891342163086, + "learning_rate": 2.7788194985133256e-05, + "loss": 1.9287, + "step": 89895 + }, + { + "epoch": 0.175086365034916, + "grad_norm": 4.014555931091309, + "learning_rate": 2.7787475500452587e-05, + "loss": 1.84, + "step": 89910 + }, + { + "epoch": 0.17511557530602628, + "grad_norm": 3.5245680809020996, + "learning_rate": 2.7786755908086787e-05, + "loss": 2.0551, + "step": 89925 + }, + { + "epoch": 0.17514478557713653, + "grad_norm": 6.240306377410889, + "learning_rate": 2.7786036208041917e-05, + "loss": 1.8978, + "step": 89940 + }, + { + "epoch": 0.1751739958482468, + "grad_norm": 3.7071139812469482, + "learning_rate": 2.7785316400324034e-05, + "loss": 1.7178, + "step": 89955 + }, + { + "epoch": 0.17520320611935705, + "grad_norm": 3.455045223236084, + "learning_rate": 2.778459648493921e-05, + "loss": 1.9111, + "step": 89970 + }, + { + "epoch": 0.17523241639046733, + "grad_norm": 2.523939371109009, + "learning_rate": 2.7783876461893498e-05, + "loss": 1.8382, + "step": 89985 + }, + { + "epoch": 0.1752616266615776, + "grad_norm": 3.7760586738586426, + "learning_rate": 2.7783156331192964e-05, + "loss": 1.8508, + "step": 90000 + }, + { + "epoch": 0.17529083693268785, + "grad_norm": 4.258102893829346, + "learning_rate": 2.7782436092843676e-05, + "loss": 1.7281, + "step": 90015 + }, + { + "epoch": 0.17532004720379812, + "grad_norm": 4.258599281311035, + "learning_rate": 2.778171574685169e-05, + "loss": 1.7334, + "step": 90030 + }, + { + "epoch": 0.17534925747490837, + "grad_norm": 2.721313238143921, + "learning_rate": 2.778099529322308e-05, + "loss": 1.8324, + "step": 90045 + }, + { + "epoch": 0.17537846774601865, + "grad_norm": 3.31717586517334, + "learning_rate": 2.7780274731963912e-05, + "loss": 2.0146, + "step": 90060 + }, + { + "epoch": 0.1754076780171289, + "grad_norm": 3.6453425884246826, + "learning_rate": 2.7779554063080256e-05, + "loss": 1.6734, + "step": 90075 + }, + { + "epoch": 0.17543688828823917, + "grad_norm": 3.1512513160705566, + "learning_rate": 2.777883328657817e-05, + "loss": 1.8565, + "step": 90090 + }, + { + "epoch": 0.17546609855934944, + "grad_norm": 5.4527716636657715, + "learning_rate": 2.7778112402463734e-05, + "loss": 1.7925, + "step": 90105 + }, + { + "epoch": 0.1754953088304597, + "grad_norm": 3.9396626949310303, + "learning_rate": 2.7777391410743022e-05, + "loss": 1.9329, + "step": 90120 + }, + { + "epoch": 0.17552451910156996, + "grad_norm": 3.2829508781433105, + "learning_rate": 2.7776670311422098e-05, + "loss": 2.0129, + "step": 90135 + }, + { + "epoch": 0.1755537293726802, + "grad_norm": 3.5777692794799805, + "learning_rate": 2.7775949104507034e-05, + "loss": 2.0102, + "step": 90150 + }, + { + "epoch": 0.1755829396437905, + "grad_norm": 2.6787233352661133, + "learning_rate": 2.7775227790003905e-05, + "loss": 1.8752, + "step": 90165 + }, + { + "epoch": 0.17561214991490073, + "grad_norm": 3.5650339126586914, + "learning_rate": 2.777450636791879e-05, + "loss": 2.0009, + "step": 90180 + }, + { + "epoch": 0.175641360186011, + "grad_norm": 3.0136802196502686, + "learning_rate": 2.7773784838257755e-05, + "loss": 1.8485, + "step": 90195 + }, + { + "epoch": 0.17567057045712126, + "grad_norm": 4.413090229034424, + "learning_rate": 2.7773063201026884e-05, + "loss": 1.873, + "step": 90210 + }, + { + "epoch": 0.17569978072823153, + "grad_norm": 1.978641152381897, + "learning_rate": 2.7772341456232254e-05, + "loss": 1.9009, + "step": 90225 + }, + { + "epoch": 0.1757289909993418, + "grad_norm": 2.154536485671997, + "learning_rate": 2.7771619603879938e-05, + "loss": 1.7712, + "step": 90240 + }, + { + "epoch": 0.17575820127045205, + "grad_norm": 3.6047170162200928, + "learning_rate": 2.7770897643976016e-05, + "loss": 2.1745, + "step": 90255 + }, + { + "epoch": 0.17578741154156233, + "grad_norm": 2.3465096950531006, + "learning_rate": 2.777017557652657e-05, + "loss": 2.0652, + "step": 90270 + }, + { + "epoch": 0.17581662181267257, + "grad_norm": 3.23372220993042, + "learning_rate": 2.7769453401537677e-05, + "loss": 1.9456, + "step": 90285 + }, + { + "epoch": 0.17584583208378285, + "grad_norm": 2.5508956909179688, + "learning_rate": 2.7768731119015422e-05, + "loss": 2.0536, + "step": 90300 + }, + { + "epoch": 0.1758750423548931, + "grad_norm": 4.362436294555664, + "learning_rate": 2.776800872896589e-05, + "loss": 1.8566, + "step": 90315 + }, + { + "epoch": 0.17590425262600337, + "grad_norm": 4.012520790100098, + "learning_rate": 2.776728623139516e-05, + "loss": 1.8553, + "step": 90330 + }, + { + "epoch": 0.17593346289711365, + "grad_norm": 3.6440787315368652, + "learning_rate": 2.7766563626309312e-05, + "loss": 1.9124, + "step": 90345 + }, + { + "epoch": 0.1759626731682239, + "grad_norm": 2.302351474761963, + "learning_rate": 2.7765840913714437e-05, + "loss": 1.727, + "step": 90360 + }, + { + "epoch": 0.17599188343933417, + "grad_norm": 4.8543500900268555, + "learning_rate": 2.7765118093616625e-05, + "loss": 1.6877, + "step": 90375 + }, + { + "epoch": 0.17602109371044442, + "grad_norm": 2.5577950477600098, + "learning_rate": 2.7764395166021955e-05, + "loss": 1.8706, + "step": 90390 + }, + { + "epoch": 0.1760503039815547, + "grad_norm": 5.435410499572754, + "learning_rate": 2.7763672130936518e-05, + "loss": 1.9875, + "step": 90405 + }, + { + "epoch": 0.17607951425266494, + "grad_norm": 2.796084403991699, + "learning_rate": 2.7762948988366404e-05, + "loss": 1.9757, + "step": 90420 + }, + { + "epoch": 0.1761087245237752, + "grad_norm": 2.1117942333221436, + "learning_rate": 2.77622257383177e-05, + "loss": 1.8782, + "step": 90435 + }, + { + "epoch": 0.1761379347948855, + "grad_norm": 4.877633571624756, + "learning_rate": 2.7761502380796496e-05, + "loss": 1.7457, + "step": 90450 + }, + { + "epoch": 0.17616714506599573, + "grad_norm": 5.352377414703369, + "learning_rate": 2.7760778915808892e-05, + "loss": 1.8886, + "step": 90465 + }, + { + "epoch": 0.176196355337106, + "grad_norm": 4.2299041748046875, + "learning_rate": 2.7760055343360972e-05, + "loss": 1.7708, + "step": 90480 + }, + { + "epoch": 0.17622556560821626, + "grad_norm": 2.7221109867095947, + "learning_rate": 2.7759331663458832e-05, + "loss": 1.7965, + "step": 90495 + }, + { + "epoch": 0.17625477587932653, + "grad_norm": 1.7758771181106567, + "learning_rate": 2.775860787610856e-05, + "loss": 1.7878, + "step": 90510 + }, + { + "epoch": 0.17628398615043678, + "grad_norm": 4.953612327575684, + "learning_rate": 2.7757883981316263e-05, + "loss": 1.7436, + "step": 90525 + }, + { + "epoch": 0.17631319642154705, + "grad_norm": 4.967999458312988, + "learning_rate": 2.7757159979088023e-05, + "loss": 1.9835, + "step": 90540 + }, + { + "epoch": 0.17634240669265733, + "grad_norm": 3.22163462638855, + "learning_rate": 2.7756435869429955e-05, + "loss": 1.5967, + "step": 90555 + }, + { + "epoch": 0.17637161696376757, + "grad_norm": 4.714684009552002, + "learning_rate": 2.775571165234814e-05, + "loss": 1.6631, + "step": 90570 + }, + { + "epoch": 0.17640082723487785, + "grad_norm": 4.441711902618408, + "learning_rate": 2.7754987327848682e-05, + "loss": 1.9158, + "step": 90585 + }, + { + "epoch": 0.1764300375059881, + "grad_norm": 2.8121848106384277, + "learning_rate": 2.7754262895937684e-05, + "loss": 1.8395, + "step": 90600 + }, + { + "epoch": 0.17645924777709837, + "grad_norm": 6.396861553192139, + "learning_rate": 2.7753538356621245e-05, + "loss": 1.9325, + "step": 90615 + }, + { + "epoch": 0.17648845804820862, + "grad_norm": 4.290360450744629, + "learning_rate": 2.7752813709905464e-05, + "loss": 1.9054, + "step": 90630 + }, + { + "epoch": 0.1765176683193189, + "grad_norm": 4.980266571044922, + "learning_rate": 2.775208895579645e-05, + "loss": 1.8907, + "step": 90645 + }, + { + "epoch": 0.17654687859042917, + "grad_norm": 3.91514253616333, + "learning_rate": 2.7751364094300295e-05, + "loss": 1.8424, + "step": 90660 + }, + { + "epoch": 0.17657608886153942, + "grad_norm": 2.4604971408843994, + "learning_rate": 2.775063912542311e-05, + "loss": 1.8166, + "step": 90675 + }, + { + "epoch": 0.1766052991326497, + "grad_norm": 4.065450668334961, + "learning_rate": 2.7749914049171003e-05, + "loss": 1.9248, + "step": 90690 + }, + { + "epoch": 0.17663450940375994, + "grad_norm": 3.4355249404907227, + "learning_rate": 2.7749188865550076e-05, + "loss": 1.8723, + "step": 90705 + }, + { + "epoch": 0.1766637196748702, + "grad_norm": 3.5076210498809814, + "learning_rate": 2.7748463574566435e-05, + "loss": 1.834, + "step": 90720 + }, + { + "epoch": 0.17669292994598046, + "grad_norm": 4.0915069580078125, + "learning_rate": 2.7747738176226186e-05, + "loss": 1.7553, + "step": 90735 + }, + { + "epoch": 0.17672214021709073, + "grad_norm": 3.6160566806793213, + "learning_rate": 2.774701267053545e-05, + "loss": 1.6606, + "step": 90750 + }, + { + "epoch": 0.176751350488201, + "grad_norm": 3.0106403827667236, + "learning_rate": 2.774628705750032e-05, + "loss": 1.7066, + "step": 90765 + }, + { + "epoch": 0.17678056075931126, + "grad_norm": 2.6574342250823975, + "learning_rate": 2.7745561337126917e-05, + "loss": 1.7961, + "step": 90780 + }, + { + "epoch": 0.17680977103042153, + "grad_norm": 4.0654168128967285, + "learning_rate": 2.7744835509421347e-05, + "loss": 1.9463, + "step": 90795 + }, + { + "epoch": 0.17683898130153178, + "grad_norm": 4.0692458152771, + "learning_rate": 2.7744109574389727e-05, + "loss": 1.9297, + "step": 90810 + }, + { + "epoch": 0.17686819157264205, + "grad_norm": 2.4688684940338135, + "learning_rate": 2.774338353203817e-05, + "loss": 1.8526, + "step": 90825 + }, + { + "epoch": 0.1768974018437523, + "grad_norm": 2.185706377029419, + "learning_rate": 2.7742657382372784e-05, + "loss": 1.8465, + "step": 90840 + }, + { + "epoch": 0.17692661211486257, + "grad_norm": 3.3587512969970703, + "learning_rate": 2.7741931125399688e-05, + "loss": 1.7678, + "step": 90855 + }, + { + "epoch": 0.17695582238597285, + "grad_norm": 2.019087791442871, + "learning_rate": 2.7741204761124998e-05, + "loss": 1.881, + "step": 90870 + }, + { + "epoch": 0.1769850326570831, + "grad_norm": 4.066291809082031, + "learning_rate": 2.7740478289554835e-05, + "loss": 1.9337, + "step": 90885 + }, + { + "epoch": 0.17701424292819337, + "grad_norm": 3.485825777053833, + "learning_rate": 2.7739751710695308e-05, + "loss": 1.8624, + "step": 90900 + }, + { + "epoch": 0.17704345319930362, + "grad_norm": 3.984581470489502, + "learning_rate": 2.773902502455254e-05, + "loss": 1.832, + "step": 90915 + }, + { + "epoch": 0.1770726634704139, + "grad_norm": 4.722860813140869, + "learning_rate": 2.773829823113265e-05, + "loss": 1.8632, + "step": 90930 + }, + { + "epoch": 0.17710187374152414, + "grad_norm": 2.307084083557129, + "learning_rate": 2.7737571330441767e-05, + "loss": 2.0044, + "step": 90945 + }, + { + "epoch": 0.17713108401263442, + "grad_norm": 1.8309338092803955, + "learning_rate": 2.7736844322485998e-05, + "loss": 1.7493, + "step": 90960 + }, + { + "epoch": 0.1771602942837447, + "grad_norm": 5.768820285797119, + "learning_rate": 2.773611720727147e-05, + "loss": 1.748, + "step": 90975 + }, + { + "epoch": 0.17718950455485494, + "grad_norm": 3.1634578704833984, + "learning_rate": 2.773538998480431e-05, + "loss": 1.9535, + "step": 90990 + }, + { + "epoch": 0.1772187148259652, + "grad_norm": 2.279015064239502, + "learning_rate": 2.7734662655090636e-05, + "loss": 1.9047, + "step": 91005 + }, + { + "epoch": 0.17724792509707546, + "grad_norm": 3.9165773391723633, + "learning_rate": 2.7733935218136583e-05, + "loss": 1.876, + "step": 91020 + }, + { + "epoch": 0.17727713536818573, + "grad_norm": 2.3547329902648926, + "learning_rate": 2.773320767394827e-05, + "loss": 1.7169, + "step": 91035 + }, + { + "epoch": 0.17730634563929598, + "grad_norm": 2.329373359680176, + "learning_rate": 2.773248002253182e-05, + "loss": 1.6965, + "step": 91050 + }, + { + "epoch": 0.17733555591040626, + "grad_norm": 3.249741792678833, + "learning_rate": 2.773175226389337e-05, + "loss": 1.843, + "step": 91065 + }, + { + "epoch": 0.17736476618151653, + "grad_norm": 4.3851823806762695, + "learning_rate": 2.7731024398039035e-05, + "loss": 1.6883, + "step": 91080 + }, + { + "epoch": 0.17739397645262678, + "grad_norm": 4.099958419799805, + "learning_rate": 2.7730296424974963e-05, + "loss": 1.8403, + "step": 91095 + }, + { + "epoch": 0.17742318672373705, + "grad_norm": 3.3914308547973633, + "learning_rate": 2.7729568344707267e-05, + "loss": 1.8545, + "step": 91110 + }, + { + "epoch": 0.1774523969948473, + "grad_norm": 3.6236979961395264, + "learning_rate": 2.772884015724209e-05, + "loss": 1.8896, + "step": 91125 + }, + { + "epoch": 0.17748160726595757, + "grad_norm": 1.9073833227157593, + "learning_rate": 2.772811186258556e-05, + "loss": 1.8144, + "step": 91140 + }, + { + "epoch": 0.17751081753706782, + "grad_norm": 3.121502161026001, + "learning_rate": 2.7727383460743807e-05, + "loss": 1.9667, + "step": 91155 + }, + { + "epoch": 0.1775400278081781, + "grad_norm": 2.1707286834716797, + "learning_rate": 2.7726654951722968e-05, + "loss": 1.8441, + "step": 91170 + }, + { + "epoch": 0.17756923807928837, + "grad_norm": 3.592103958129883, + "learning_rate": 2.772592633552918e-05, + "loss": 2.0384, + "step": 91185 + }, + { + "epoch": 0.17759844835039862, + "grad_norm": 4.06790828704834, + "learning_rate": 2.7725197612168575e-05, + "loss": 1.6831, + "step": 91200 + }, + { + "epoch": 0.1776276586215089, + "grad_norm": 3.032888412475586, + "learning_rate": 2.772446878164729e-05, + "loss": 2.0433, + "step": 91215 + }, + { + "epoch": 0.17765686889261914, + "grad_norm": 2.9623143672943115, + "learning_rate": 2.7723739843971465e-05, + "loss": 2.3134, + "step": 91230 + }, + { + "epoch": 0.17768607916372942, + "grad_norm": 4.133320331573486, + "learning_rate": 2.7723010799147237e-05, + "loss": 1.9292, + "step": 91245 + }, + { + "epoch": 0.17771528943483966, + "grad_norm": 2.772233009338379, + "learning_rate": 2.772228164718075e-05, + "loss": 1.9144, + "step": 91260 + }, + { + "epoch": 0.17774449970594994, + "grad_norm": 4.58173131942749, + "learning_rate": 2.7721552388078134e-05, + "loss": 1.9304, + "step": 91275 + }, + { + "epoch": 0.1777737099770602, + "grad_norm": 5.428915977478027, + "learning_rate": 2.7720823021845536e-05, + "loss": 1.8156, + "step": 91290 + }, + { + "epoch": 0.17780292024817046, + "grad_norm": 4.247468948364258, + "learning_rate": 2.77200935484891e-05, + "loss": 1.9841, + "step": 91305 + }, + { + "epoch": 0.17783213051928073, + "grad_norm": 4.9154510498046875, + "learning_rate": 2.7719363968014968e-05, + "loss": 1.8489, + "step": 91320 + }, + { + "epoch": 0.17786134079039098, + "grad_norm": 2.32688045501709, + "learning_rate": 2.7718634280429284e-05, + "loss": 1.6249, + "step": 91335 + }, + { + "epoch": 0.17789055106150126, + "grad_norm": 2.882077932357788, + "learning_rate": 2.771790448573819e-05, + "loss": 1.8065, + "step": 91350 + }, + { + "epoch": 0.1779197613326115, + "grad_norm": 4.619049549102783, + "learning_rate": 2.7717174583947833e-05, + "loss": 1.8465, + "step": 91365 + }, + { + "epoch": 0.17794897160372178, + "grad_norm": 3.607131242752075, + "learning_rate": 2.771644457506436e-05, + "loss": 1.8736, + "step": 91380 + }, + { + "epoch": 0.17797818187483205, + "grad_norm": 3.367401361465454, + "learning_rate": 2.7715714459093922e-05, + "loss": 1.8257, + "step": 91395 + }, + { + "epoch": 0.1780073921459423, + "grad_norm": 2.355822801589966, + "learning_rate": 2.7714984236042666e-05, + "loss": 1.7666, + "step": 91410 + }, + { + "epoch": 0.17803660241705258, + "grad_norm": 4.480250835418701, + "learning_rate": 2.771425390591673e-05, + "loss": 2.0408, + "step": 91425 + }, + { + "epoch": 0.17806581268816282, + "grad_norm": 5.397293567657471, + "learning_rate": 2.771352346872228e-05, + "loss": 1.8832, + "step": 91440 + }, + { + "epoch": 0.1780950229592731, + "grad_norm": 3.723433017730713, + "learning_rate": 2.7712792924465462e-05, + "loss": 1.8118, + "step": 91455 + }, + { + "epoch": 0.17812423323038334, + "grad_norm": 3.479463577270508, + "learning_rate": 2.7712062273152426e-05, + "loss": 1.9764, + "step": 91470 + }, + { + "epoch": 0.17815344350149362, + "grad_norm": 3.7407853603363037, + "learning_rate": 2.7711331514789327e-05, + "loss": 1.9089, + "step": 91485 + }, + { + "epoch": 0.1781826537726039, + "grad_norm": 1.7621113061904907, + "learning_rate": 2.771060064938231e-05, + "loss": 1.7896, + "step": 91500 + }, + { + "epoch": 0.17821186404371414, + "grad_norm": 2.797764778137207, + "learning_rate": 2.7709869676937546e-05, + "loss": 1.6483, + "step": 91515 + }, + { + "epoch": 0.17824107431482442, + "grad_norm": 3.6728739738464355, + "learning_rate": 2.7709138597461177e-05, + "loss": 1.7975, + "step": 91530 + }, + { + "epoch": 0.17827028458593466, + "grad_norm": 2.809462785720825, + "learning_rate": 2.770840741095937e-05, + "loss": 2.0825, + "step": 91545 + }, + { + "epoch": 0.17829949485704494, + "grad_norm": 2.7235429286956787, + "learning_rate": 2.770767611743827e-05, + "loss": 1.9106, + "step": 91560 + }, + { + "epoch": 0.17832870512815518, + "grad_norm": 2.5808889865875244, + "learning_rate": 2.7706944716904046e-05, + "loss": 2.1257, + "step": 91575 + }, + { + "epoch": 0.17835791539926546, + "grad_norm": 3.1290502548217773, + "learning_rate": 2.7706213209362853e-05, + "loss": 1.8771, + "step": 91590 + }, + { + "epoch": 0.17838712567037573, + "grad_norm": 2.4853668212890625, + "learning_rate": 2.770548159482085e-05, + "loss": 1.9719, + "step": 91605 + }, + { + "epoch": 0.17841633594148598, + "grad_norm": 2.088733673095703, + "learning_rate": 2.77047498732842e-05, + "loss": 1.6872, + "step": 91620 + }, + { + "epoch": 0.17844554621259626, + "grad_norm": 3.5094075202941895, + "learning_rate": 2.770401804475906e-05, + "loss": 2.0606, + "step": 91635 + }, + { + "epoch": 0.1784747564837065, + "grad_norm": 2.7712767124176025, + "learning_rate": 2.7703286109251605e-05, + "loss": 1.7808, + "step": 91650 + }, + { + "epoch": 0.17850396675481678, + "grad_norm": 2.0916554927825928, + "learning_rate": 2.7702554066767988e-05, + "loss": 1.9717, + "step": 91665 + }, + { + "epoch": 0.17853317702592703, + "grad_norm": 3.251678228378296, + "learning_rate": 2.7701821917314375e-05, + "loss": 1.792, + "step": 91680 + }, + { + "epoch": 0.1785623872970373, + "grad_norm": 4.187269687652588, + "learning_rate": 2.7701089660896933e-05, + "loss": 1.8538, + "step": 91695 + }, + { + "epoch": 0.17859159756814758, + "grad_norm": 1.955680012702942, + "learning_rate": 2.7700357297521833e-05, + "loss": 2.0169, + "step": 91710 + }, + { + "epoch": 0.17862080783925782, + "grad_norm": 2.465576410293579, + "learning_rate": 2.7699624827195236e-05, + "loss": 1.8574, + "step": 91725 + }, + { + "epoch": 0.1786500181103681, + "grad_norm": 3.3191077709198, + "learning_rate": 2.7698892249923314e-05, + "loss": 1.9591, + "step": 91740 + }, + { + "epoch": 0.17867922838147834, + "grad_norm": 2.276421308517456, + "learning_rate": 2.769815956571223e-05, + "loss": 1.8174, + "step": 91755 + }, + { + "epoch": 0.17870843865258862, + "grad_norm": 4.777490139007568, + "learning_rate": 2.769742677456816e-05, + "loss": 1.9214, + "step": 91770 + }, + { + "epoch": 0.17873764892369887, + "grad_norm": 2.302922487258911, + "learning_rate": 2.7696693876497276e-05, + "loss": 2.0762, + "step": 91785 + }, + { + "epoch": 0.17876685919480914, + "grad_norm": 4.496932506561279, + "learning_rate": 2.7695960871505743e-05, + "loss": 2.0063, + "step": 91800 + }, + { + "epoch": 0.17879606946591942, + "grad_norm": 1.8797622919082642, + "learning_rate": 2.769522775959974e-05, + "loss": 1.7944, + "step": 91815 + }, + { + "epoch": 0.17882527973702966, + "grad_norm": 4.22454833984375, + "learning_rate": 2.7694494540785438e-05, + "loss": 1.7519, + "step": 91830 + }, + { + "epoch": 0.17885449000813994, + "grad_norm": 5.21730899810791, + "learning_rate": 2.769376121506901e-05, + "loss": 1.7192, + "step": 91845 + }, + { + "epoch": 0.17888370027925019, + "grad_norm": 4.535948753356934, + "learning_rate": 2.7693027782456637e-05, + "loss": 1.8721, + "step": 91860 + }, + { + "epoch": 0.17891291055036046, + "grad_norm": 3.7082018852233887, + "learning_rate": 2.769229424295449e-05, + "loss": 2.0798, + "step": 91875 + }, + { + "epoch": 0.1789421208214707, + "grad_norm": 3.035527229309082, + "learning_rate": 2.769156059656875e-05, + "loss": 1.978, + "step": 91890 + }, + { + "epoch": 0.17897133109258098, + "grad_norm": 2.7884960174560547, + "learning_rate": 2.7690826843305593e-05, + "loss": 1.7372, + "step": 91905 + }, + { + "epoch": 0.17900054136369126, + "grad_norm": 4.397505760192871, + "learning_rate": 2.7690092983171195e-05, + "loss": 1.8849, + "step": 91920 + }, + { + "epoch": 0.1790297516348015, + "grad_norm": 2.2995827198028564, + "learning_rate": 2.768935901617174e-05, + "loss": 2.0106, + "step": 91935 + }, + { + "epoch": 0.17905896190591178, + "grad_norm": 4.365660667419434, + "learning_rate": 2.7688624942313407e-05, + "loss": 1.7988, + "step": 91950 + }, + { + "epoch": 0.17908817217702203, + "grad_norm": 2.288593053817749, + "learning_rate": 2.7687890761602386e-05, + "loss": 1.9101, + "step": 91965 + }, + { + "epoch": 0.1791173824481323, + "grad_norm": 4.755799293518066, + "learning_rate": 2.7687156474044843e-05, + "loss": 2.0132, + "step": 91980 + }, + { + "epoch": 0.17914659271924255, + "grad_norm": 2.694140911102295, + "learning_rate": 2.7686422079646976e-05, + "loss": 2.0839, + "step": 91995 + }, + { + "epoch": 0.17917580299035282, + "grad_norm": 2.3538320064544678, + "learning_rate": 2.7685687578414967e-05, + "loss": 1.7442, + "step": 92010 + }, + { + "epoch": 0.1792050132614631, + "grad_norm": 3.3404524326324463, + "learning_rate": 2.7684952970354994e-05, + "loss": 2.0237, + "step": 92025 + }, + { + "epoch": 0.17923422353257334, + "grad_norm": 2.922121286392212, + "learning_rate": 2.768421825547325e-05, + "loss": 1.9281, + "step": 92040 + }, + { + "epoch": 0.17926343380368362, + "grad_norm": 3.9325039386749268, + "learning_rate": 2.768348343377592e-05, + "loss": 1.7876, + "step": 92055 + }, + { + "epoch": 0.17929264407479387, + "grad_norm": 4.110378265380859, + "learning_rate": 2.768274850526919e-05, + "loss": 1.8982, + "step": 92070 + }, + { + "epoch": 0.17932185434590414, + "grad_norm": 4.18983793258667, + "learning_rate": 2.7682013469959255e-05, + "loss": 1.8976, + "step": 92085 + }, + { + "epoch": 0.1793510646170144, + "grad_norm": 3.115391492843628, + "learning_rate": 2.76812783278523e-05, + "loss": 2.22, + "step": 92100 + }, + { + "epoch": 0.17938027488812466, + "grad_norm": 3.306413173675537, + "learning_rate": 2.768054307895451e-05, + "loss": 1.8575, + "step": 92115 + }, + { + "epoch": 0.17940948515923494, + "grad_norm": 4.646861553192139, + "learning_rate": 2.767980772327209e-05, + "loss": 1.994, + "step": 92130 + }, + { + "epoch": 0.17943869543034519, + "grad_norm": 2.383268356323242, + "learning_rate": 2.7679072260811223e-05, + "loss": 1.8303, + "step": 92145 + }, + { + "epoch": 0.17946790570145546, + "grad_norm": 3.8319637775421143, + "learning_rate": 2.767833669157811e-05, + "loss": 1.9058, + "step": 92160 + }, + { + "epoch": 0.1794971159725657, + "grad_norm": 4.063842296600342, + "learning_rate": 2.7677601015578935e-05, + "loss": 1.9637, + "step": 92175 + }, + { + "epoch": 0.17952632624367598, + "grad_norm": 2.8375391960144043, + "learning_rate": 2.76768652328199e-05, + "loss": 1.8422, + "step": 92190 + }, + { + "epoch": 0.17955553651478623, + "grad_norm": 2.5808022022247314, + "learning_rate": 2.7676129343307204e-05, + "loss": 2.0232, + "step": 92205 + }, + { + "epoch": 0.1795847467858965, + "grad_norm": 1.9084349870681763, + "learning_rate": 2.7675393347047037e-05, + "loss": 1.9163, + "step": 92220 + }, + { + "epoch": 0.17961395705700678, + "grad_norm": 3.7782301902770996, + "learning_rate": 2.7674657244045598e-05, + "loss": 1.8936, + "step": 92235 + }, + { + "epoch": 0.17964316732811703, + "grad_norm": 4.687180042266846, + "learning_rate": 2.767392103430909e-05, + "loss": 1.8467, + "step": 92250 + }, + { + "epoch": 0.1796723775992273, + "grad_norm": 4.25228214263916, + "learning_rate": 2.767318471784371e-05, + "loss": 1.9247, + "step": 92265 + }, + { + "epoch": 0.17970158787033755, + "grad_norm": 4.55307674407959, + "learning_rate": 2.767244829465566e-05, + "loss": 1.7032, + "step": 92280 + }, + { + "epoch": 0.17973079814144782, + "grad_norm": 3.058771848678589, + "learning_rate": 2.767171176475114e-05, + "loss": 2.0241, + "step": 92295 + }, + { + "epoch": 0.17976000841255807, + "grad_norm": 2.196815013885498, + "learning_rate": 2.7670975128136354e-05, + "loss": 1.7656, + "step": 92310 + }, + { + "epoch": 0.17978921868366834, + "grad_norm": 3.44468355178833, + "learning_rate": 2.7670238384817502e-05, + "loss": 1.8441, + "step": 92325 + }, + { + "epoch": 0.1798184289547786, + "grad_norm": 3.1608753204345703, + "learning_rate": 2.7669501534800793e-05, + "loss": 2.0047, + "step": 92340 + }, + { + "epoch": 0.17984763922588887, + "grad_norm": 7.173932075500488, + "learning_rate": 2.7668764578092428e-05, + "loss": 1.872, + "step": 92355 + }, + { + "epoch": 0.17987684949699914, + "grad_norm": 5.875077247619629, + "learning_rate": 2.7668027514698616e-05, + "loss": 1.8713, + "step": 92370 + }, + { + "epoch": 0.1799060597681094, + "grad_norm": 3.0245134830474854, + "learning_rate": 2.766729034462556e-05, + "loss": 1.7123, + "step": 92385 + }, + { + "epoch": 0.17993527003921966, + "grad_norm": 3.8824782371520996, + "learning_rate": 2.766655306787947e-05, + "loss": 1.8699, + "step": 92400 + }, + { + "epoch": 0.1799644803103299, + "grad_norm": 2.154015302658081, + "learning_rate": 2.7665815684466562e-05, + "loss": 1.9598, + "step": 92415 + }, + { + "epoch": 0.17999369058144019, + "grad_norm": 2.327570915222168, + "learning_rate": 2.766507819439303e-05, + "loss": 1.8709, + "step": 92430 + }, + { + "epoch": 0.18002290085255043, + "grad_norm": 2.6921374797821045, + "learning_rate": 2.76643405976651e-05, + "loss": 1.7889, + "step": 92445 + }, + { + "epoch": 0.1800521111236607, + "grad_norm": 2.691473960876465, + "learning_rate": 2.7663602894288972e-05, + "loss": 2.0013, + "step": 92460 + }, + { + "epoch": 0.18008132139477098, + "grad_norm": 5.79668664932251, + "learning_rate": 2.7662865084270864e-05, + "loss": 1.9918, + "step": 92475 + }, + { + "epoch": 0.18011053166588123, + "grad_norm": 3.6742846965789795, + "learning_rate": 2.7662127167616993e-05, + "loss": 1.8611, + "step": 92490 + }, + { + "epoch": 0.1801397419369915, + "grad_norm": 2.91487455368042, + "learning_rate": 2.7661389144333563e-05, + "loss": 1.9964, + "step": 92505 + }, + { + "epoch": 0.18016895220810175, + "grad_norm": 5.073643207550049, + "learning_rate": 2.7660651014426794e-05, + "loss": 1.9636, + "step": 92520 + }, + { + "epoch": 0.18019816247921203, + "grad_norm": 2.6396565437316895, + "learning_rate": 2.7659912777902904e-05, + "loss": 1.9351, + "step": 92535 + }, + { + "epoch": 0.18022737275032227, + "grad_norm": 4.053412914276123, + "learning_rate": 2.765917443476811e-05, + "loss": 1.9839, + "step": 92550 + }, + { + "epoch": 0.18025658302143255, + "grad_norm": 4.9180755615234375, + "learning_rate": 2.7658435985028626e-05, + "loss": 1.7125, + "step": 92565 + }, + { + "epoch": 0.18028579329254282, + "grad_norm": 3.3816518783569336, + "learning_rate": 2.765769742869067e-05, + "loss": 1.9187, + "step": 92580 + }, + { + "epoch": 0.18031500356365307, + "grad_norm": 3.1389973163604736, + "learning_rate": 2.7656958765760463e-05, + "loss": 1.6982, + "step": 92595 + }, + { + "epoch": 0.18034421383476334, + "grad_norm": 1.9660248756408691, + "learning_rate": 2.7656219996244228e-05, + "loss": 1.7224, + "step": 92610 + }, + { + "epoch": 0.1803734241058736, + "grad_norm": 2.452911853790283, + "learning_rate": 2.7655481120148186e-05, + "loss": 1.9435, + "step": 92625 + }, + { + "epoch": 0.18040263437698387, + "grad_norm": 3.6187262535095215, + "learning_rate": 2.7654742137478556e-05, + "loss": 1.773, + "step": 92640 + }, + { + "epoch": 0.18043184464809411, + "grad_norm": 4.287827491760254, + "learning_rate": 2.7654003048241564e-05, + "loss": 1.9726, + "step": 92655 + }, + { + "epoch": 0.1804610549192044, + "grad_norm": 4.026311874389648, + "learning_rate": 2.765326385244343e-05, + "loss": 1.9733, + "step": 92670 + }, + { + "epoch": 0.18049026519031466, + "grad_norm": 2.239495277404785, + "learning_rate": 2.7652524550090385e-05, + "loss": 1.9743, + "step": 92685 + }, + { + "epoch": 0.1805194754614249, + "grad_norm": 4.623342514038086, + "learning_rate": 2.765178514118865e-05, + "loss": 1.8453, + "step": 92700 + }, + { + "epoch": 0.18054868573253519, + "grad_norm": 2.484206438064575, + "learning_rate": 2.7651045625744448e-05, + "loss": 1.9109, + "step": 92715 + }, + { + "epoch": 0.18057789600364543, + "grad_norm": 3.673715353012085, + "learning_rate": 2.7650306003764016e-05, + "loss": 1.9266, + "step": 92730 + }, + { + "epoch": 0.1806071062747557, + "grad_norm": 2.6208252906799316, + "learning_rate": 2.764956627525358e-05, + "loss": 1.8606, + "step": 92745 + }, + { + "epoch": 0.18063631654586595, + "grad_norm": 4.198822975158691, + "learning_rate": 2.7648826440219367e-05, + "loss": 2.0113, + "step": 92760 + }, + { + "epoch": 0.18066552681697623, + "grad_norm": 2.303046464920044, + "learning_rate": 2.76480864986676e-05, + "loss": 1.9884, + "step": 92775 + }, + { + "epoch": 0.1806947370880865, + "grad_norm": 3.672036647796631, + "learning_rate": 2.7647346450604524e-05, + "loss": 1.7954, + "step": 92790 + }, + { + "epoch": 0.18072394735919675, + "grad_norm": 3.3701744079589844, + "learning_rate": 2.7646606296036362e-05, + "loss": 1.6927, + "step": 92805 + }, + { + "epoch": 0.18075315763030703, + "grad_norm": 2.074408531188965, + "learning_rate": 2.7645866034969355e-05, + "loss": 1.8464, + "step": 92820 + }, + { + "epoch": 0.18078236790141727, + "grad_norm": 2.9515576362609863, + "learning_rate": 2.764512566740973e-05, + "loss": 1.9439, + "step": 92835 + }, + { + "epoch": 0.18081157817252755, + "grad_norm": 3.593489646911621, + "learning_rate": 2.764438519336372e-05, + "loss": 2.0924, + "step": 92850 + }, + { + "epoch": 0.1808407884436378, + "grad_norm": 4.0020952224731445, + "learning_rate": 2.7643644612837564e-05, + "loss": 1.947, + "step": 92865 + }, + { + "epoch": 0.18086999871474807, + "grad_norm": 5.371128559112549, + "learning_rate": 2.7642903925837502e-05, + "loss": 1.9112, + "step": 92880 + }, + { + "epoch": 0.18089920898585835, + "grad_norm": 4.9128828048706055, + "learning_rate": 2.7642163132369765e-05, + "loss": 1.9424, + "step": 92895 + }, + { + "epoch": 0.1809284192569686, + "grad_norm": 4.493983268737793, + "learning_rate": 2.7641422232440596e-05, + "loss": 1.7808, + "step": 92910 + }, + { + "epoch": 0.18095762952807887, + "grad_norm": 6.360769271850586, + "learning_rate": 2.764068122605623e-05, + "loss": 1.8462, + "step": 92925 + }, + { + "epoch": 0.18098683979918911, + "grad_norm": 3.6582531929016113, + "learning_rate": 2.763994011322291e-05, + "loss": 2.0381, + "step": 92940 + }, + { + "epoch": 0.1810160500702994, + "grad_norm": 2.1021182537078857, + "learning_rate": 2.763919889394688e-05, + "loss": 1.8617, + "step": 92955 + }, + { + "epoch": 0.18104526034140964, + "grad_norm": 3.7234206199645996, + "learning_rate": 2.7638457568234376e-05, + "loss": 1.8159, + "step": 92970 + }, + { + "epoch": 0.1810744706125199, + "grad_norm": 2.5171637535095215, + "learning_rate": 2.763771613609164e-05, + "loss": 2.0183, + "step": 92985 + }, + { + "epoch": 0.18110368088363019, + "grad_norm": 3.8474509716033936, + "learning_rate": 2.7636974597524924e-05, + "loss": 1.7629, + "step": 93000 + }, + { + "epoch": 0.18113289115474043, + "grad_norm": 3.9770400524139404, + "learning_rate": 2.7636232952540464e-05, + "loss": 1.9905, + "step": 93015 + }, + { + "epoch": 0.1811621014258507, + "grad_norm": 4.456263065338135, + "learning_rate": 2.7635491201144513e-05, + "loss": 1.8008, + "step": 93030 + }, + { + "epoch": 0.18119131169696095, + "grad_norm": 2.2288267612457275, + "learning_rate": 2.763474934334331e-05, + "loss": 2.0548, + "step": 93045 + }, + { + "epoch": 0.18122052196807123, + "grad_norm": 2.5032432079315186, + "learning_rate": 2.7634007379143107e-05, + "loss": 1.9155, + "step": 93060 + }, + { + "epoch": 0.18124973223918148, + "grad_norm": 2.7400872707366943, + "learning_rate": 2.763326530855015e-05, + "loss": 1.9574, + "step": 93075 + }, + { + "epoch": 0.18127894251029175, + "grad_norm": 2.4222750663757324, + "learning_rate": 2.763252313157069e-05, + "loss": 1.8935, + "step": 93090 + }, + { + "epoch": 0.18130815278140203, + "grad_norm": 3.762211322784424, + "learning_rate": 2.7631780848210978e-05, + "loss": 1.8148, + "step": 93105 + }, + { + "epoch": 0.18133736305251227, + "grad_norm": 3.675910472869873, + "learning_rate": 2.763103845847726e-05, + "loss": 2.0913, + "step": 93120 + }, + { + "epoch": 0.18136657332362255, + "grad_norm": 2.924429178237915, + "learning_rate": 2.7630295962375794e-05, + "loss": 1.7726, + "step": 93135 + }, + { + "epoch": 0.1813957835947328, + "grad_norm": 3.1411404609680176, + "learning_rate": 2.7629553359912826e-05, + "loss": 1.8925, + "step": 93150 + }, + { + "epoch": 0.18142499386584307, + "grad_norm": 3.419050693511963, + "learning_rate": 2.7628810651094612e-05, + "loss": 1.8582, + "step": 93165 + }, + { + "epoch": 0.18145420413695332, + "grad_norm": 2.889031410217285, + "learning_rate": 2.762806783592741e-05, + "loss": 1.8179, + "step": 93180 + }, + { + "epoch": 0.1814834144080636, + "grad_norm": 4.2747273445129395, + "learning_rate": 2.7627324914417472e-05, + "loss": 1.7593, + "step": 93195 + }, + { + "epoch": 0.18151262467917387, + "grad_norm": 5.497156620025635, + "learning_rate": 2.7626581886571055e-05, + "loss": 2.0412, + "step": 93210 + }, + { + "epoch": 0.18154183495028411, + "grad_norm": 3.0694739818573, + "learning_rate": 2.7625838752394413e-05, + "loss": 2.0692, + "step": 93225 + }, + { + "epoch": 0.1815710452213944, + "grad_norm": 2.726219415664673, + "learning_rate": 2.762509551189381e-05, + "loss": 1.8663, + "step": 93240 + }, + { + "epoch": 0.18160025549250464, + "grad_norm": 2.1628475189208984, + "learning_rate": 2.7624352165075508e-05, + "loss": 1.9499, + "step": 93255 + }, + { + "epoch": 0.1816294657636149, + "grad_norm": 2.3830838203430176, + "learning_rate": 2.762360871194575e-05, + "loss": 1.7766, + "step": 93270 + }, + { + "epoch": 0.18165867603472516, + "grad_norm": 2.210273027420044, + "learning_rate": 2.7622865152510813e-05, + "loss": 1.9099, + "step": 93285 + }, + { + "epoch": 0.18168788630583543, + "grad_norm": 3.482532024383545, + "learning_rate": 2.7622121486776955e-05, + "loss": 1.8971, + "step": 93300 + }, + { + "epoch": 0.1817170965769457, + "grad_norm": 3.518601417541504, + "learning_rate": 2.7621377714750433e-05, + "loss": 1.8988, + "step": 93315 + }, + { + "epoch": 0.18174630684805596, + "grad_norm": 2.806331157684326, + "learning_rate": 2.762063383643752e-05, + "loss": 1.9072, + "step": 93330 + }, + { + "epoch": 0.18177551711916623, + "grad_norm": 4.225208282470703, + "learning_rate": 2.7619889851844467e-05, + "loss": 2.2095, + "step": 93345 + }, + { + "epoch": 0.18180472739027648, + "grad_norm": 3.866720676422119, + "learning_rate": 2.7619145760977552e-05, + "loss": 1.9214, + "step": 93360 + }, + { + "epoch": 0.18183393766138675, + "grad_norm": 3.8077690601348877, + "learning_rate": 2.7618401563843032e-05, + "loss": 1.8767, + "step": 93375 + }, + { + "epoch": 0.181863147932497, + "grad_norm": 2.9681737422943115, + "learning_rate": 2.761765726044718e-05, + "loss": 2.0945, + "step": 93390 + }, + { + "epoch": 0.18189235820360727, + "grad_norm": 3.44258451461792, + "learning_rate": 2.7616912850796256e-05, + "loss": 1.9279, + "step": 93405 + }, + { + "epoch": 0.18192156847471755, + "grad_norm": 2.1440718173980713, + "learning_rate": 2.7616168334896543e-05, + "loss": 1.95, + "step": 93420 + }, + { + "epoch": 0.1819507787458278, + "grad_norm": 3.4202229976654053, + "learning_rate": 2.7615423712754298e-05, + "loss": 2.0584, + "step": 93435 + }, + { + "epoch": 0.18197998901693807, + "grad_norm": 4.576741695404053, + "learning_rate": 2.7614678984375794e-05, + "loss": 1.8119, + "step": 93450 + }, + { + "epoch": 0.18200919928804832, + "grad_norm": 4.168622016906738, + "learning_rate": 2.7613934149767307e-05, + "loss": 1.9024, + "step": 93465 + }, + { + "epoch": 0.1820384095591586, + "grad_norm": 2.2588460445404053, + "learning_rate": 2.7613189208935105e-05, + "loss": 1.8158, + "step": 93480 + }, + { + "epoch": 0.18206761983026884, + "grad_norm": 3.0095467567443848, + "learning_rate": 2.7612444161885463e-05, + "loss": 1.9585, + "step": 93495 + }, + { + "epoch": 0.18209683010137911, + "grad_norm": 3.6314756870269775, + "learning_rate": 2.7611699008624655e-05, + "loss": 1.9104, + "step": 93510 + }, + { + "epoch": 0.1821260403724894, + "grad_norm": 4.37233829498291, + "learning_rate": 2.7610953749158953e-05, + "loss": 1.9225, + "step": 93525 + }, + { + "epoch": 0.18215525064359964, + "grad_norm": 3.5955750942230225, + "learning_rate": 2.7610208383494645e-05, + "loss": 1.8617, + "step": 93540 + }, + { + "epoch": 0.1821844609147099, + "grad_norm": 4.51361083984375, + "learning_rate": 2.7609462911637986e-05, + "loss": 1.8058, + "step": 93555 + }, + { + "epoch": 0.18221367118582016, + "grad_norm": 4.442575931549072, + "learning_rate": 2.7608717333595275e-05, + "loss": 1.8778, + "step": 93570 + }, + { + "epoch": 0.18224288145693043, + "grad_norm": 1.7832722663879395, + "learning_rate": 2.760797164937278e-05, + "loss": 1.881, + "step": 93585 + }, + { + "epoch": 0.18227209172804068, + "grad_norm": 3.072748899459839, + "learning_rate": 2.7607225858976783e-05, + "loss": 1.7505, + "step": 93600 + }, + { + "epoch": 0.18230130199915096, + "grad_norm": 2.988489866256714, + "learning_rate": 2.760647996241356e-05, + "loss": 1.9309, + "step": 93615 + }, + { + "epoch": 0.18233051227026123, + "grad_norm": 2.770848035812378, + "learning_rate": 2.76057339596894e-05, + "loss": 1.7728, + "step": 93630 + }, + { + "epoch": 0.18235972254137148, + "grad_norm": 4.5652756690979, + "learning_rate": 2.7604987850810584e-05, + "loss": 1.6187, + "step": 93645 + }, + { + "epoch": 0.18238893281248175, + "grad_norm": 3.4148263931274414, + "learning_rate": 2.7604241635783385e-05, + "loss": 2.0196, + "step": 93660 + }, + { + "epoch": 0.182418143083592, + "grad_norm": 3.650883674621582, + "learning_rate": 2.76034953146141e-05, + "loss": 1.8581, + "step": 93675 + }, + { + "epoch": 0.18244735335470227, + "grad_norm": 2.640186071395874, + "learning_rate": 2.7602748887309005e-05, + "loss": 1.9419, + "step": 93690 + }, + { + "epoch": 0.18247656362581252, + "grad_norm": 5.353418827056885, + "learning_rate": 2.7602002353874394e-05, + "loss": 1.9653, + "step": 93705 + }, + { + "epoch": 0.1825057738969228, + "grad_norm": 2.1555888652801514, + "learning_rate": 2.7601255714316545e-05, + "loss": 2.0065, + "step": 93720 + }, + { + "epoch": 0.18253498416803307, + "grad_norm": 5.662516117095947, + "learning_rate": 2.760050896864175e-05, + "loss": 1.6292, + "step": 93735 + }, + { + "epoch": 0.18256419443914332, + "grad_norm": 3.6848344802856445, + "learning_rate": 2.7599762116856295e-05, + "loss": 1.9887, + "step": 93750 + }, + { + "epoch": 0.1825934047102536, + "grad_norm": 2.3457562923431396, + "learning_rate": 2.7599015158966474e-05, + "loss": 2.0472, + "step": 93765 + }, + { + "epoch": 0.18262261498136384, + "grad_norm": 4.555329322814941, + "learning_rate": 2.7598268094978572e-05, + "loss": 1.9983, + "step": 93780 + }, + { + "epoch": 0.18265182525247411, + "grad_norm": 2.9889895915985107, + "learning_rate": 2.7597520924898883e-05, + "loss": 1.8411, + "step": 93795 + }, + { + "epoch": 0.18268103552358436, + "grad_norm": 3.1734416484832764, + "learning_rate": 2.75967736487337e-05, + "loss": 2.0244, + "step": 93810 + }, + { + "epoch": 0.18271024579469464, + "grad_norm": 2.517132043838501, + "learning_rate": 2.7596026266489317e-05, + "loss": 1.9653, + "step": 93825 + }, + { + "epoch": 0.1827394560658049, + "grad_norm": 2.421205997467041, + "learning_rate": 2.759527877817202e-05, + "loss": 1.8639, + "step": 93840 + }, + { + "epoch": 0.18276866633691516, + "grad_norm": 1.9306482076644897, + "learning_rate": 2.759453118378811e-05, + "loss": 1.8266, + "step": 93855 + }, + { + "epoch": 0.18279787660802543, + "grad_norm": 3.2078773975372314, + "learning_rate": 2.759378348334388e-05, + "loss": 1.8212, + "step": 93870 + }, + { + "epoch": 0.18282708687913568, + "grad_norm": 4.434211730957031, + "learning_rate": 2.7593035676845635e-05, + "loss": 1.9236, + "step": 93885 + }, + { + "epoch": 0.18285629715024596, + "grad_norm": 2.363943099975586, + "learning_rate": 2.759228776429966e-05, + "loss": 2.0851, + "step": 93900 + }, + { + "epoch": 0.1828855074213562, + "grad_norm": 2.472825527191162, + "learning_rate": 2.7591539745712257e-05, + "loss": 1.7821, + "step": 93915 + }, + { + "epoch": 0.18291471769246648, + "grad_norm": 4.735840797424316, + "learning_rate": 2.7590791621089733e-05, + "loss": 1.8115, + "step": 93930 + }, + { + "epoch": 0.18294392796357675, + "grad_norm": 5.477010726928711, + "learning_rate": 2.7590043390438376e-05, + "loss": 1.8708, + "step": 93945 + }, + { + "epoch": 0.182973138234687, + "grad_norm": 3.9854114055633545, + "learning_rate": 2.7589295053764495e-05, + "loss": 1.969, + "step": 93960 + }, + { + "epoch": 0.18300234850579727, + "grad_norm": 1.7628816366195679, + "learning_rate": 2.758854661107439e-05, + "loss": 1.8541, + "step": 93975 + }, + { + "epoch": 0.18303155877690752, + "grad_norm": 6.822324752807617, + "learning_rate": 2.7587798062374366e-05, + "loss": 1.8846, + "step": 93990 + }, + { + "epoch": 0.1830607690480178, + "grad_norm": 2.451692819595337, + "learning_rate": 2.7587049407670723e-05, + "loss": 1.7942, + "step": 94005 + }, + { + "epoch": 0.18308997931912804, + "grad_norm": 1.7566745281219482, + "learning_rate": 2.7586300646969763e-05, + "loss": 1.9207, + "step": 94020 + }, + { + "epoch": 0.18311918959023832, + "grad_norm": 3.999807119369507, + "learning_rate": 2.7585551780277794e-05, + "loss": 1.8028, + "step": 94035 + }, + { + "epoch": 0.1831483998613486, + "grad_norm": 5.702386379241943, + "learning_rate": 2.758480280760113e-05, + "loss": 1.8814, + "step": 94050 + }, + { + "epoch": 0.18317761013245884, + "grad_norm": 3.393380880355835, + "learning_rate": 2.758405372894607e-05, + "loss": 2.0335, + "step": 94065 + }, + { + "epoch": 0.18320682040356912, + "grad_norm": 3.221254348754883, + "learning_rate": 2.758330454431892e-05, + "loss": 1.8658, + "step": 94080 + }, + { + "epoch": 0.18323603067467936, + "grad_norm": 6.44359016418457, + "learning_rate": 2.7582555253725995e-05, + "loss": 1.9036, + "step": 94095 + }, + { + "epoch": 0.18326524094578964, + "grad_norm": 1.8060381412506104, + "learning_rate": 2.7581805857173603e-05, + "loss": 1.7912, + "step": 94110 + }, + { + "epoch": 0.18329445121689988, + "grad_norm": 5.123130798339844, + "learning_rate": 2.7581056354668052e-05, + "loss": 1.7738, + "step": 94125 + }, + { + "epoch": 0.18332366148801016, + "grad_norm": 2.1349294185638428, + "learning_rate": 2.7580306746215654e-05, + "loss": 1.8936, + "step": 94140 + }, + { + "epoch": 0.18335287175912043, + "grad_norm": 3.465413808822632, + "learning_rate": 2.7579557031822728e-05, + "loss": 1.617, + "step": 94155 + }, + { + "epoch": 0.18338208203023068, + "grad_norm": 3.956171751022339, + "learning_rate": 2.7578807211495586e-05, + "loss": 1.7957, + "step": 94170 + }, + { + "epoch": 0.18341129230134096, + "grad_norm": 5.1210103034973145, + "learning_rate": 2.7578057285240532e-05, + "loss": 1.9538, + "step": 94185 + }, + { + "epoch": 0.1834405025724512, + "grad_norm": 3.114718198776245, + "learning_rate": 2.7577307253063894e-05, + "loss": 1.7473, + "step": 94200 + }, + { + "epoch": 0.18346971284356148, + "grad_norm": 4.184943675994873, + "learning_rate": 2.7576557114971982e-05, + "loss": 1.8848, + "step": 94215 + }, + { + "epoch": 0.18349892311467172, + "grad_norm": 2.8505895137786865, + "learning_rate": 2.7575806870971112e-05, + "loss": 1.9196, + "step": 94230 + }, + { + "epoch": 0.183528133385782, + "grad_norm": 2.0261776447296143, + "learning_rate": 2.75750565210676e-05, + "loss": 2.1848, + "step": 94245 + }, + { + "epoch": 0.18355734365689227, + "grad_norm": 3.711357355117798, + "learning_rate": 2.7574306065267776e-05, + "loss": 1.8269, + "step": 94260 + }, + { + "epoch": 0.18358655392800252, + "grad_norm": 2.082653045654297, + "learning_rate": 2.757355550357795e-05, + "loss": 1.929, + "step": 94275 + }, + { + "epoch": 0.1836157641991128, + "grad_norm": 3.7206952571868896, + "learning_rate": 2.757280483600444e-05, + "loss": 1.9571, + "step": 94290 + }, + { + "epoch": 0.18364497447022304, + "grad_norm": 2.2836344242095947, + "learning_rate": 2.7572054062553577e-05, + "loss": 1.6557, + "step": 94305 + }, + { + "epoch": 0.18367418474133332, + "grad_norm": 3.186068296432495, + "learning_rate": 2.7571303183231675e-05, + "loss": 1.772, + "step": 94320 + }, + { + "epoch": 0.18370339501244357, + "grad_norm": 4.397800445556641, + "learning_rate": 2.7570552198045063e-05, + "loss": 1.8295, + "step": 94335 + }, + { + "epoch": 0.18373260528355384, + "grad_norm": 3.3702023029327393, + "learning_rate": 2.7569801107000066e-05, + "loss": 1.9649, + "step": 94350 + }, + { + "epoch": 0.18376181555466412, + "grad_norm": 2.5428740978240967, + "learning_rate": 2.7569049910103e-05, + "loss": 1.8869, + "step": 94365 + }, + { + "epoch": 0.18379102582577436, + "grad_norm": 3.697800397872925, + "learning_rate": 2.75682986073602e-05, + "loss": 1.902, + "step": 94380 + }, + { + "epoch": 0.18382023609688464, + "grad_norm": 3.8650431632995605, + "learning_rate": 2.7567547198777987e-05, + "loss": 1.9771, + "step": 94395 + }, + { + "epoch": 0.18384944636799488, + "grad_norm": 3.848344326019287, + "learning_rate": 2.7566795684362697e-05, + "loss": 1.7689, + "step": 94410 + }, + { + "epoch": 0.18387865663910516, + "grad_norm": 3.8078856468200684, + "learning_rate": 2.756604406412065e-05, + "loss": 2.0858, + "step": 94425 + }, + { + "epoch": 0.1839078669102154, + "grad_norm": 5.627542495727539, + "learning_rate": 2.7565292338058178e-05, + "loss": 2.0789, + "step": 94440 + }, + { + "epoch": 0.18393707718132568, + "grad_norm": 4.422365665435791, + "learning_rate": 2.756454050618161e-05, + "loss": 1.9324, + "step": 94455 + }, + { + "epoch": 0.18396628745243593, + "grad_norm": 3.436211109161377, + "learning_rate": 2.7563788568497285e-05, + "loss": 1.8743, + "step": 94470 + }, + { + "epoch": 0.1839954977235462, + "grad_norm": 4.95798921585083, + "learning_rate": 2.7563036525011524e-05, + "loss": 2.0182, + "step": 94485 + }, + { + "epoch": 0.18402470799465648, + "grad_norm": 6.198334217071533, + "learning_rate": 2.7562284375730668e-05, + "loss": 1.6697, + "step": 94500 + }, + { + "epoch": 0.18405391826576672, + "grad_norm": 3.1994106769561768, + "learning_rate": 2.756153212066105e-05, + "loss": 1.9176, + "step": 94515 + }, + { + "epoch": 0.184083128536877, + "grad_norm": 4.196457386016846, + "learning_rate": 2.7560779759809e-05, + "loss": 1.8749, + "step": 94530 + }, + { + "epoch": 0.18411233880798725, + "grad_norm": 3.382073402404785, + "learning_rate": 2.756002729318086e-05, + "loss": 1.8393, + "step": 94545 + }, + { + "epoch": 0.18414154907909752, + "grad_norm": 4.312603950500488, + "learning_rate": 2.755927472078296e-05, + "loss": 1.9022, + "step": 94560 + }, + { + "epoch": 0.18417075935020777, + "grad_norm": 3.5769381523132324, + "learning_rate": 2.7558522042621645e-05, + "loss": 1.8863, + "step": 94575 + }, + { + "epoch": 0.18419996962131804, + "grad_norm": 3.8104968070983887, + "learning_rate": 2.7557769258703246e-05, + "loss": 1.9547, + "step": 94590 + }, + { + "epoch": 0.18422917989242832, + "grad_norm": 3.268310070037842, + "learning_rate": 2.7557016369034106e-05, + "loss": 1.9676, + "step": 94605 + }, + { + "epoch": 0.18425839016353857, + "grad_norm": 2.1242496967315674, + "learning_rate": 2.755626337362057e-05, + "loss": 2.006, + "step": 94620 + }, + { + "epoch": 0.18428760043464884, + "grad_norm": 2.2751739025115967, + "learning_rate": 2.755551027246897e-05, + "loss": 1.9436, + "step": 94635 + }, + { + "epoch": 0.1843168107057591, + "grad_norm": 2.62917423248291, + "learning_rate": 2.755475706558565e-05, + "loss": 1.842, + "step": 94650 + }, + { + "epoch": 0.18434602097686936, + "grad_norm": 4.473318099975586, + "learning_rate": 2.755400375297696e-05, + "loss": 1.7189, + "step": 94665 + }, + { + "epoch": 0.1843752312479796, + "grad_norm": 3.864413022994995, + "learning_rate": 2.7553250334649242e-05, + "loss": 1.8636, + "step": 94680 + }, + { + "epoch": 0.18440444151908988, + "grad_norm": 4.371596813201904, + "learning_rate": 2.755249681060883e-05, + "loss": 2.0511, + "step": 94695 + }, + { + "epoch": 0.18443365179020016, + "grad_norm": 7.783666610717773, + "learning_rate": 2.7551743180862073e-05, + "loss": 1.9481, + "step": 94710 + }, + { + "epoch": 0.1844628620613104, + "grad_norm": 4.631928443908691, + "learning_rate": 2.7550989445415332e-05, + "loss": 2.0301, + "step": 94725 + }, + { + "epoch": 0.18449207233242068, + "grad_norm": 4.134591579437256, + "learning_rate": 2.755023560427494e-05, + "loss": 1.793, + "step": 94740 + }, + { + "epoch": 0.18452128260353093, + "grad_norm": 2.5131123065948486, + "learning_rate": 2.7549481657447244e-05, + "loss": 1.9033, + "step": 94755 + }, + { + "epoch": 0.1845504928746412, + "grad_norm": 4.243981838226318, + "learning_rate": 2.7548727604938605e-05, + "loss": 1.9544, + "step": 94770 + }, + { + "epoch": 0.18457970314575145, + "grad_norm": 3.4515132904052734, + "learning_rate": 2.7547973446755362e-05, + "loss": 2.045, + "step": 94785 + }, + { + "epoch": 0.18460891341686173, + "grad_norm": 2.511699676513672, + "learning_rate": 2.754721918290387e-05, + "loss": 1.5695, + "step": 94800 + }, + { + "epoch": 0.184638123687972, + "grad_norm": 4.05910062789917, + "learning_rate": 2.7546464813390482e-05, + "loss": 1.9435, + "step": 94815 + }, + { + "epoch": 0.18466733395908225, + "grad_norm": 4.804703235626221, + "learning_rate": 2.7545710338221547e-05, + "loss": 1.9266, + "step": 94830 + }, + { + "epoch": 0.18469654423019252, + "grad_norm": 2.8074498176574707, + "learning_rate": 2.7544955757403426e-05, + "loss": 1.842, + "step": 94845 + }, + { + "epoch": 0.18472575450130277, + "grad_norm": 3.4078102111816406, + "learning_rate": 2.754420107094246e-05, + "loss": 1.8262, + "step": 94860 + }, + { + "epoch": 0.18475496477241304, + "grad_norm": 2.2840304374694824, + "learning_rate": 2.754344627884502e-05, + "loss": 1.6681, + "step": 94875 + }, + { + "epoch": 0.1847841750435233, + "grad_norm": 4.115520477294922, + "learning_rate": 2.7542691381117452e-05, + "loss": 1.728, + "step": 94890 + }, + { + "epoch": 0.18481338531463357, + "grad_norm": 2.944448471069336, + "learning_rate": 2.7541936377766113e-05, + "loss": 1.8659, + "step": 94905 + }, + { + "epoch": 0.18484259558574384, + "grad_norm": 3.002603054046631, + "learning_rate": 2.754118126879737e-05, + "loss": 1.8889, + "step": 94920 + }, + { + "epoch": 0.1848718058568541, + "grad_norm": 2.9134867191314697, + "learning_rate": 2.7540426054217572e-05, + "loss": 1.947, + "step": 94935 + }, + { + "epoch": 0.18490101612796436, + "grad_norm": 3.4980084896087646, + "learning_rate": 2.753967073403308e-05, + "loss": 1.8387, + "step": 94950 + }, + { + "epoch": 0.1849302263990746, + "grad_norm": 2.8111305236816406, + "learning_rate": 2.7538915308250258e-05, + "loss": 1.7648, + "step": 94965 + }, + { + "epoch": 0.18495943667018488, + "grad_norm": 2.7262823581695557, + "learning_rate": 2.753815977687547e-05, + "loss": 1.6165, + "step": 94980 + }, + { + "epoch": 0.18498864694129513, + "grad_norm": 3.0753395557403564, + "learning_rate": 2.7537404139915073e-05, + "loss": 1.8517, + "step": 94995 + }, + { + "epoch": 0.1850178572124054, + "grad_norm": 3.734402894973755, + "learning_rate": 2.753664839737543e-05, + "loss": 1.8539, + "step": 95010 + }, + { + "epoch": 0.18504706748351568, + "grad_norm": 3.8353652954101562, + "learning_rate": 2.7535892549262912e-05, + "loss": 1.9387, + "step": 95025 + }, + { + "epoch": 0.18507627775462593, + "grad_norm": 3.427877187728882, + "learning_rate": 2.753513659558388e-05, + "loss": 1.7842, + "step": 95040 + }, + { + "epoch": 0.1851054880257362, + "grad_norm": 4.751176834106445, + "learning_rate": 2.7534380536344695e-05, + "loss": 1.9154, + "step": 95055 + }, + { + "epoch": 0.18513469829684645, + "grad_norm": 2.9390766620635986, + "learning_rate": 2.7533624371551733e-05, + "loss": 1.6487, + "step": 95070 + }, + { + "epoch": 0.18516390856795673, + "grad_norm": 3.870060443878174, + "learning_rate": 2.7532868101211355e-05, + "loss": 1.7111, + "step": 95085 + }, + { + "epoch": 0.18519311883906697, + "grad_norm": 2.3689475059509277, + "learning_rate": 2.7532111725329932e-05, + "loss": 2.1714, + "step": 95100 + }, + { + "epoch": 0.18522232911017725, + "grad_norm": 5.204848289489746, + "learning_rate": 2.7531355243913836e-05, + "loss": 1.8574, + "step": 95115 + }, + { + "epoch": 0.18525153938128752, + "grad_norm": 2.498096227645874, + "learning_rate": 2.7530598656969435e-05, + "loss": 1.9812, + "step": 95130 + }, + { + "epoch": 0.18528074965239777, + "grad_norm": 3.860705852508545, + "learning_rate": 2.75298419645031e-05, + "loss": 2.0284, + "step": 95145 + }, + { + "epoch": 0.18530995992350804, + "grad_norm": 4.3888020515441895, + "learning_rate": 2.75290851665212e-05, + "loss": 1.8903, + "step": 95160 + }, + { + "epoch": 0.1853391701946183, + "grad_norm": 3.5124685764312744, + "learning_rate": 2.7528328263030112e-05, + "loss": 1.8711, + "step": 95175 + }, + { + "epoch": 0.18536838046572857, + "grad_norm": 4.521871089935303, + "learning_rate": 2.7527571254036214e-05, + "loss": 1.879, + "step": 95190 + }, + { + "epoch": 0.1853975907368388, + "grad_norm": 3.291149854660034, + "learning_rate": 2.7526814139545873e-05, + "loss": 1.843, + "step": 95205 + }, + { + "epoch": 0.1854268010079491, + "grad_norm": 3.5813679695129395, + "learning_rate": 2.7526056919565466e-05, + "loss": 2.0303, + "step": 95220 + }, + { + "epoch": 0.18545601127905936, + "grad_norm": 4.212502479553223, + "learning_rate": 2.7525299594101377e-05, + "loss": 1.9284, + "step": 95235 + }, + { + "epoch": 0.1854852215501696, + "grad_norm": 3.982229232788086, + "learning_rate": 2.752454216315998e-05, + "loss": 2.0001, + "step": 95250 + }, + { + "epoch": 0.18551443182127988, + "grad_norm": 3.2863729000091553, + "learning_rate": 2.752378462674765e-05, + "loss": 1.7623, + "step": 95265 + }, + { + "epoch": 0.18554364209239013, + "grad_norm": 2.418975830078125, + "learning_rate": 2.7523026984870766e-05, + "loss": 1.8103, + "step": 95280 + }, + { + "epoch": 0.1855728523635004, + "grad_norm": 3.2842280864715576, + "learning_rate": 2.752226923753571e-05, + "loss": 1.7849, + "step": 95295 + }, + { + "epoch": 0.18560206263461065, + "grad_norm": 1.9291599988937378, + "learning_rate": 2.7521511384748867e-05, + "loss": 1.9585, + "step": 95310 + }, + { + "epoch": 0.18563127290572093, + "grad_norm": 2.5627834796905518, + "learning_rate": 2.7520753426516614e-05, + "loss": 2.0234, + "step": 95325 + }, + { + "epoch": 0.1856604831768312, + "grad_norm": 2.6081185340881348, + "learning_rate": 2.7519995362845334e-05, + "loss": 1.9825, + "step": 95340 + }, + { + "epoch": 0.18568969344794145, + "grad_norm": 2.4659955501556396, + "learning_rate": 2.7519237193741413e-05, + "loss": 1.8185, + "step": 95355 + }, + { + "epoch": 0.18571890371905173, + "grad_norm": 3.792269229888916, + "learning_rate": 2.7518478919211238e-05, + "loss": 1.9446, + "step": 95370 + }, + { + "epoch": 0.18574811399016197, + "grad_norm": 3.6157641410827637, + "learning_rate": 2.7517720539261185e-05, + "loss": 2.0067, + "step": 95385 + }, + { + "epoch": 0.18577732426127225, + "grad_norm": 2.3287453651428223, + "learning_rate": 2.751696205389765e-05, + "loss": 1.9579, + "step": 95400 + }, + { + "epoch": 0.1858065345323825, + "grad_norm": 2.8468403816223145, + "learning_rate": 2.7516203463127018e-05, + "loss": 1.9921, + "step": 95415 + }, + { + "epoch": 0.18583574480349277, + "grad_norm": 4.485379219055176, + "learning_rate": 2.7515444766955673e-05, + "loss": 1.9612, + "step": 95430 + }, + { + "epoch": 0.18586495507460304, + "grad_norm": 5.000667095184326, + "learning_rate": 2.751468596539001e-05, + "loss": 1.9809, + "step": 95445 + }, + { + "epoch": 0.1858941653457133, + "grad_norm": 3.3073322772979736, + "learning_rate": 2.7513927058436418e-05, + "loss": 1.8718, + "step": 95460 + }, + { + "epoch": 0.18592337561682357, + "grad_norm": 2.5046308040618896, + "learning_rate": 2.751316804610128e-05, + "loss": 1.8077, + "step": 95475 + }, + { + "epoch": 0.1859525858879338, + "grad_norm": 2.0459272861480713, + "learning_rate": 2.7512408928391004e-05, + "loss": 2.0035, + "step": 95490 + }, + { + "epoch": 0.1859817961590441, + "grad_norm": 2.7898263931274414, + "learning_rate": 2.7511649705311965e-05, + "loss": 1.7812, + "step": 95505 + }, + { + "epoch": 0.18601100643015434, + "grad_norm": 2.4660818576812744, + "learning_rate": 2.7510890376870566e-05, + "loss": 1.991, + "step": 95520 + }, + { + "epoch": 0.1860402167012646, + "grad_norm": 4.31742525100708, + "learning_rate": 2.7510130943073204e-05, + "loss": 2.0983, + "step": 95535 + }, + { + "epoch": 0.18606942697237489, + "grad_norm": 2.7516841888427734, + "learning_rate": 2.7509371403926264e-05, + "loss": 1.7813, + "step": 95550 + }, + { + "epoch": 0.18609863724348513, + "grad_norm": 3.5403342247009277, + "learning_rate": 2.750861175943615e-05, + "loss": 1.9003, + "step": 95565 + }, + { + "epoch": 0.1861278475145954, + "grad_norm": 3.990453004837036, + "learning_rate": 2.7507852009609257e-05, + "loss": 1.8769, + "step": 95580 + }, + { + "epoch": 0.18615705778570565, + "grad_norm": 3.8055214881896973, + "learning_rate": 2.7507092154451983e-05, + "loss": 1.7388, + "step": 95595 + }, + { + "epoch": 0.18618626805681593, + "grad_norm": 3.023470878601074, + "learning_rate": 2.7506332193970727e-05, + "loss": 2.0101, + "step": 95610 + }, + { + "epoch": 0.18621547832792618, + "grad_norm": 2.360166311264038, + "learning_rate": 2.7505572128171887e-05, + "loss": 1.8564, + "step": 95625 + }, + { + "epoch": 0.18624468859903645, + "grad_norm": 4.901341438293457, + "learning_rate": 2.7504811957061868e-05, + "loss": 1.8126, + "step": 95640 + }, + { + "epoch": 0.18627389887014673, + "grad_norm": 3.840001344680786, + "learning_rate": 2.7504051680647063e-05, + "loss": 2.0874, + "step": 95655 + }, + { + "epoch": 0.18630310914125697, + "grad_norm": 2.170409917831421, + "learning_rate": 2.750329129893389e-05, + "loss": 2.1046, + "step": 95670 + }, + { + "epoch": 0.18633231941236725, + "grad_norm": 2.9182116985321045, + "learning_rate": 2.7502530811928736e-05, + "loss": 1.8236, + "step": 95685 + }, + { + "epoch": 0.1863615296834775, + "grad_norm": 9.392932891845703, + "learning_rate": 2.750177021963801e-05, + "loss": 1.9193, + "step": 95700 + }, + { + "epoch": 0.18639073995458777, + "grad_norm": 4.135622501373291, + "learning_rate": 2.7501009522068125e-05, + "loss": 2.0392, + "step": 95715 + }, + { + "epoch": 0.18641995022569802, + "grad_norm": 3.0941381454467773, + "learning_rate": 2.7500248719225477e-05, + "loss": 1.8182, + "step": 95730 + }, + { + "epoch": 0.1864491604968083, + "grad_norm": 3.8988256454467773, + "learning_rate": 2.7499487811116478e-05, + "loss": 1.623, + "step": 95745 + }, + { + "epoch": 0.18647837076791857, + "grad_norm": 4.095931529998779, + "learning_rate": 2.7498726797747528e-05, + "loss": 1.7905, + "step": 95760 + }, + { + "epoch": 0.1865075810390288, + "grad_norm": 1.8716729879379272, + "learning_rate": 2.7497965679125048e-05, + "loss": 1.9836, + "step": 95775 + }, + { + "epoch": 0.1865367913101391, + "grad_norm": 2.5921576023101807, + "learning_rate": 2.749720445525544e-05, + "loss": 1.7862, + "step": 95790 + }, + { + "epoch": 0.18656600158124934, + "grad_norm": 3.7627806663513184, + "learning_rate": 2.7496443126145114e-05, + "loss": 1.9243, + "step": 95805 + }, + { + "epoch": 0.1865952118523596, + "grad_norm": 3.7505106925964355, + "learning_rate": 2.7495681691800485e-05, + "loss": 1.6657, + "step": 95820 + }, + { + "epoch": 0.18662442212346986, + "grad_norm": 3.939401388168335, + "learning_rate": 2.7494920152227963e-05, + "loss": 2.0031, + "step": 95835 + }, + { + "epoch": 0.18665363239458013, + "grad_norm": 2.5329501628875732, + "learning_rate": 2.7494158507433963e-05, + "loss": 1.8708, + "step": 95850 + }, + { + "epoch": 0.1866828426656904, + "grad_norm": 2.9200894832611084, + "learning_rate": 2.749339675742489e-05, + "loss": 1.8048, + "step": 95865 + }, + { + "epoch": 0.18671205293680065, + "grad_norm": 2.675281524658203, + "learning_rate": 2.749263490220717e-05, + "loss": 1.8412, + "step": 95880 + }, + { + "epoch": 0.18674126320791093, + "grad_norm": 2.579969644546509, + "learning_rate": 2.7491872941787216e-05, + "loss": 2.0351, + "step": 95895 + }, + { + "epoch": 0.18677047347902118, + "grad_norm": 1.7147372961044312, + "learning_rate": 2.7491110876171442e-05, + "loss": 2.0603, + "step": 95910 + }, + { + "epoch": 0.18679968375013145, + "grad_norm": 2.2116317749023438, + "learning_rate": 2.7490348705366265e-05, + "loss": 1.7782, + "step": 95925 + }, + { + "epoch": 0.1868288940212417, + "grad_norm": 2.7448058128356934, + "learning_rate": 2.7489586429378108e-05, + "loss": 1.9326, + "step": 95940 + }, + { + "epoch": 0.18685810429235197, + "grad_norm": 2.7208070755004883, + "learning_rate": 2.7488824048213382e-05, + "loss": 2.0383, + "step": 95955 + }, + { + "epoch": 0.18688731456346225, + "grad_norm": 3.0263869762420654, + "learning_rate": 2.7488061561878515e-05, + "loss": 1.7971, + "step": 95970 + }, + { + "epoch": 0.1869165248345725, + "grad_norm": 3.1661837100982666, + "learning_rate": 2.7487298970379923e-05, + "loss": 2.025, + "step": 95985 + }, + { + "epoch": 0.18694573510568277, + "grad_norm": 2.48641300201416, + "learning_rate": 2.7486536273724037e-05, + "loss": 2.1373, + "step": 96000 + }, + { + "epoch": 0.18697494537679302, + "grad_norm": 3.8167295455932617, + "learning_rate": 2.7485773471917268e-05, + "loss": 1.8551, + "step": 96015 + }, + { + "epoch": 0.1870041556479033, + "grad_norm": 2.391829252243042, + "learning_rate": 2.7485010564966042e-05, + "loss": 1.8185, + "step": 96030 + }, + { + "epoch": 0.18703336591901354, + "grad_norm": 5.274075984954834, + "learning_rate": 2.7484247552876787e-05, + "loss": 2.1631, + "step": 96045 + }, + { + "epoch": 0.1870625761901238, + "grad_norm": 4.351299285888672, + "learning_rate": 2.7483484435655935e-05, + "loss": 1.8491, + "step": 96060 + }, + { + "epoch": 0.1870917864612341, + "grad_norm": 2.931455135345459, + "learning_rate": 2.7482721213309897e-05, + "loss": 1.7455, + "step": 96075 + }, + { + "epoch": 0.18712099673234434, + "grad_norm": 3.8535053730010986, + "learning_rate": 2.748195788584511e-05, + "loss": 1.7946, + "step": 96090 + }, + { + "epoch": 0.1871502070034546, + "grad_norm": 2.9708755016326904, + "learning_rate": 2.7481194453268e-05, + "loss": 1.8206, + "step": 96105 + }, + { + "epoch": 0.18717941727456486, + "grad_norm": 2.919200897216797, + "learning_rate": 2.7480430915584998e-05, + "loss": 1.991, + "step": 96120 + }, + { + "epoch": 0.18720862754567513, + "grad_norm": 2.9604036808013916, + "learning_rate": 2.7479667272802533e-05, + "loss": 1.9787, + "step": 96135 + }, + { + "epoch": 0.18723783781678538, + "grad_norm": 2.8988590240478516, + "learning_rate": 2.747890352492703e-05, + "loss": 1.6568, + "step": 96150 + }, + { + "epoch": 0.18726704808789565, + "grad_norm": 3.8487296104431152, + "learning_rate": 2.7478139671964926e-05, + "loss": 1.825, + "step": 96165 + }, + { + "epoch": 0.18729625835900593, + "grad_norm": 2.968209981918335, + "learning_rate": 2.7477375713922655e-05, + "loss": 2.0871, + "step": 96180 + }, + { + "epoch": 0.18732546863011618, + "grad_norm": 2.256697416305542, + "learning_rate": 2.747661165080665e-05, + "loss": 1.6965, + "step": 96195 + }, + { + "epoch": 0.18735467890122645, + "grad_norm": 2.7572877407073975, + "learning_rate": 2.7475847482623338e-05, + "loss": 1.8185, + "step": 96210 + }, + { + "epoch": 0.1873838891723367, + "grad_norm": 3.1118829250335693, + "learning_rate": 2.7475083209379166e-05, + "loss": 1.8298, + "step": 96225 + }, + { + "epoch": 0.18741309944344697, + "grad_norm": 4.69570255279541, + "learning_rate": 2.7474318831080557e-05, + "loss": 1.92, + "step": 96240 + }, + { + "epoch": 0.18744230971455722, + "grad_norm": 2.1464555263519287, + "learning_rate": 2.747355434773396e-05, + "loss": 2.0206, + "step": 96255 + }, + { + "epoch": 0.1874715199856675, + "grad_norm": 2.9638564586639404, + "learning_rate": 2.7472789759345805e-05, + "loss": 1.9897, + "step": 96270 + }, + { + "epoch": 0.18750073025677777, + "grad_norm": 2.7547013759613037, + "learning_rate": 2.7472025065922534e-05, + "loss": 1.8355, + "step": 96285 + }, + { + "epoch": 0.18752994052788802, + "grad_norm": 3.0415849685668945, + "learning_rate": 2.747126026747059e-05, + "loss": 2.0165, + "step": 96300 + }, + { + "epoch": 0.1875591507989983, + "grad_norm": 3.3349177837371826, + "learning_rate": 2.74704953639964e-05, + "loss": 2.0546, + "step": 96315 + }, + { + "epoch": 0.18758836107010854, + "grad_norm": 4.154085636138916, + "learning_rate": 2.746973035550642e-05, + "loss": 2.154, + "step": 96330 + }, + { + "epoch": 0.18761757134121881, + "grad_norm": 2.5753583908081055, + "learning_rate": 2.7468965242007086e-05, + "loss": 1.9134, + "step": 96345 + }, + { + "epoch": 0.18764678161232906, + "grad_norm": 4.860604763031006, + "learning_rate": 2.7468200023504845e-05, + "loss": 1.9773, + "step": 96360 + }, + { + "epoch": 0.18767599188343934, + "grad_norm": 4.454910755157471, + "learning_rate": 2.746743470000613e-05, + "loss": 1.8569, + "step": 96375 + }, + { + "epoch": 0.1877052021545496, + "grad_norm": 3.263760805130005, + "learning_rate": 2.74666692715174e-05, + "loss": 1.9735, + "step": 96390 + }, + { + "epoch": 0.18773441242565986, + "grad_norm": 1.6806658506393433, + "learning_rate": 2.7465903738045098e-05, + "loss": 1.8237, + "step": 96405 + }, + { + "epoch": 0.18776362269677013, + "grad_norm": 3.6400883197784424, + "learning_rate": 2.746513809959566e-05, + "loss": 1.7421, + "step": 96420 + }, + { + "epoch": 0.18779283296788038, + "grad_norm": 2.9631810188293457, + "learning_rate": 2.7464372356175546e-05, + "loss": 2.0036, + "step": 96435 + }, + { + "epoch": 0.18782204323899065, + "grad_norm": 3.117196798324585, + "learning_rate": 2.7463606507791196e-05, + "loss": 1.9593, + "step": 96450 + }, + { + "epoch": 0.1878512535101009, + "grad_norm": 4.621789455413818, + "learning_rate": 2.746284055444907e-05, + "loss": 1.904, + "step": 96465 + }, + { + "epoch": 0.18788046378121118, + "grad_norm": 4.2281575202941895, + "learning_rate": 2.7462074496155605e-05, + "loss": 1.8005, + "step": 96480 + }, + { + "epoch": 0.18790967405232145, + "grad_norm": 2.1505675315856934, + "learning_rate": 2.7461308332917253e-05, + "loss": 1.9305, + "step": 96495 + }, + { + "epoch": 0.1879388843234317, + "grad_norm": 3.8990657329559326, + "learning_rate": 2.7460542064740477e-05, + "loss": 1.987, + "step": 96510 + }, + { + "epoch": 0.18796809459454197, + "grad_norm": 4.247411251068115, + "learning_rate": 2.7459775691631725e-05, + "loss": 1.7434, + "step": 96525 + }, + { + "epoch": 0.18799730486565222, + "grad_norm": 3.308161735534668, + "learning_rate": 2.7459009213597447e-05, + "loss": 1.8159, + "step": 96540 + }, + { + "epoch": 0.1880265151367625, + "grad_norm": 4.592803478240967, + "learning_rate": 2.74582426306441e-05, + "loss": 1.8438, + "step": 96555 + }, + { + "epoch": 0.18805572540787274, + "grad_norm": 1.8546725511550903, + "learning_rate": 2.745747594277814e-05, + "loss": 2.0979, + "step": 96570 + }, + { + "epoch": 0.18808493567898302, + "grad_norm": 2.5075149536132812, + "learning_rate": 2.7456709150006024e-05, + "loss": 1.8124, + "step": 96585 + }, + { + "epoch": 0.18811414595009326, + "grad_norm": 3.6434648036956787, + "learning_rate": 2.7455942252334206e-05, + "loss": 1.8582, + "step": 96600 + }, + { + "epoch": 0.18814335622120354, + "grad_norm": 2.302992105484009, + "learning_rate": 2.7455175249769148e-05, + "loss": 1.9717, + "step": 96615 + }, + { + "epoch": 0.18817256649231381, + "grad_norm": 2.99843168258667, + "learning_rate": 2.7454408142317308e-05, + "loss": 1.9818, + "step": 96630 + }, + { + "epoch": 0.18820177676342406, + "grad_norm": 3.045596122741699, + "learning_rate": 2.7453640929985147e-05, + "loss": 2.0684, + "step": 96645 + }, + { + "epoch": 0.18823098703453434, + "grad_norm": 4.589967727661133, + "learning_rate": 2.745287361277912e-05, + "loss": 1.7995, + "step": 96660 + }, + { + "epoch": 0.18826019730564458, + "grad_norm": 3.3215739727020264, + "learning_rate": 2.7452106190705694e-05, + "loss": 1.9745, + "step": 96675 + }, + { + "epoch": 0.18828940757675486, + "grad_norm": 2.734424114227295, + "learning_rate": 2.745133866377133e-05, + "loss": 1.9319, + "step": 96690 + }, + { + "epoch": 0.1883186178478651, + "grad_norm": 4.794501781463623, + "learning_rate": 2.7450571031982495e-05, + "loss": 1.8915, + "step": 96705 + }, + { + "epoch": 0.18834782811897538, + "grad_norm": 4.0221428871154785, + "learning_rate": 2.7449803295345648e-05, + "loss": 2.0789, + "step": 96720 + }, + { + "epoch": 0.18837703839008565, + "grad_norm": 4.325769901275635, + "learning_rate": 2.7449035453867257e-05, + "loss": 2.0297, + "step": 96735 + }, + { + "epoch": 0.1884062486611959, + "grad_norm": 4.308589458465576, + "learning_rate": 2.7448267507553786e-05, + "loss": 1.8359, + "step": 96750 + }, + { + "epoch": 0.18843545893230618, + "grad_norm": 4.418067455291748, + "learning_rate": 2.7447499456411705e-05, + "loss": 1.8016, + "step": 96765 + }, + { + "epoch": 0.18846466920341642, + "grad_norm": 2.786647319793701, + "learning_rate": 2.744673130044748e-05, + "loss": 1.7825, + "step": 96780 + }, + { + "epoch": 0.1884938794745267, + "grad_norm": 3.794327974319458, + "learning_rate": 2.744596303966758e-05, + "loss": 2.0093, + "step": 96795 + }, + { + "epoch": 0.18852308974563695, + "grad_norm": 3.3645236492156982, + "learning_rate": 2.7445194674078475e-05, + "loss": 1.7588, + "step": 96810 + }, + { + "epoch": 0.18855230001674722, + "grad_norm": 3.456848621368408, + "learning_rate": 2.744442620368664e-05, + "loss": 2.0324, + "step": 96825 + }, + { + "epoch": 0.1885815102878575, + "grad_norm": 2.405742883682251, + "learning_rate": 2.7443657628498533e-05, + "loss": 1.7593, + "step": 96840 + }, + { + "epoch": 0.18861072055896774, + "grad_norm": 2.664844274520874, + "learning_rate": 2.744288894852064e-05, + "loss": 1.9035, + "step": 96855 + }, + { + "epoch": 0.18863993083007802, + "grad_norm": 4.586266994476318, + "learning_rate": 2.744212016375943e-05, + "loss": 2.1277, + "step": 96870 + }, + { + "epoch": 0.18866914110118826, + "grad_norm": 7.623379230499268, + "learning_rate": 2.7441351274221374e-05, + "loss": 1.8319, + "step": 96885 + }, + { + "epoch": 0.18869835137229854, + "grad_norm": 3.128723621368408, + "learning_rate": 2.7440582279912947e-05, + "loss": 2.0417, + "step": 96900 + }, + { + "epoch": 0.1887275616434088, + "grad_norm": 3.3000259399414062, + "learning_rate": 2.743981318084063e-05, + "loss": 1.8927, + "step": 96915 + }, + { + "epoch": 0.18875677191451906, + "grad_norm": 3.549215316772461, + "learning_rate": 2.7439043977010895e-05, + "loss": 1.992, + "step": 96930 + }, + { + "epoch": 0.18878598218562934, + "grad_norm": 2.7479825019836426, + "learning_rate": 2.7438274668430223e-05, + "loss": 1.8823, + "step": 96945 + }, + { + "epoch": 0.18881519245673958, + "grad_norm": 2.665654420852661, + "learning_rate": 2.743750525510509e-05, + "loss": 1.7312, + "step": 96960 + }, + { + "epoch": 0.18884440272784986, + "grad_norm": 3.302964687347412, + "learning_rate": 2.7436735737041975e-05, + "loss": 1.654, + "step": 96975 + }, + { + "epoch": 0.1888736129989601, + "grad_norm": 2.0254805088043213, + "learning_rate": 2.7435966114247363e-05, + "loss": 1.9524, + "step": 96990 + }, + { + "epoch": 0.18890282327007038, + "grad_norm": 2.481362819671631, + "learning_rate": 2.743519638672773e-05, + "loss": 1.9955, + "step": 97005 + }, + { + "epoch": 0.18893203354118063, + "grad_norm": 3.1424448490142822, + "learning_rate": 2.7434426554489554e-05, + "loss": 1.9553, + "step": 97020 + }, + { + "epoch": 0.1889612438122909, + "grad_norm": 3.681554079055786, + "learning_rate": 2.743365661753933e-05, + "loss": 1.9798, + "step": 97035 + }, + { + "epoch": 0.18899045408340118, + "grad_norm": 6.029425621032715, + "learning_rate": 2.743288657588353e-05, + "loss": 1.7448, + "step": 97050 + }, + { + "epoch": 0.18901966435451142, + "grad_norm": 4.390709400177002, + "learning_rate": 2.7432116429528644e-05, + "loss": 1.9508, + "step": 97065 + }, + { + "epoch": 0.1890488746256217, + "grad_norm": 3.148656129837036, + "learning_rate": 2.7431346178481157e-05, + "loss": 1.898, + "step": 97080 + }, + { + "epoch": 0.18907808489673195, + "grad_norm": 2.915015459060669, + "learning_rate": 2.743057582274756e-05, + "loss": 1.7686, + "step": 97095 + }, + { + "epoch": 0.18910729516784222, + "grad_norm": 3.1245813369750977, + "learning_rate": 2.742980536233433e-05, + "loss": 1.7821, + "step": 97110 + }, + { + "epoch": 0.18913650543895247, + "grad_norm": 2.577244758605957, + "learning_rate": 2.7429034797247967e-05, + "loss": 1.7129, + "step": 97125 + }, + { + "epoch": 0.18916571571006274, + "grad_norm": 2.9369568824768066, + "learning_rate": 2.7428264127494946e-05, + "loss": 1.9553, + "step": 97140 + }, + { + "epoch": 0.18919492598117302, + "grad_norm": 5.901589870452881, + "learning_rate": 2.7427493353081773e-05, + "loss": 2.1666, + "step": 97155 + }, + { + "epoch": 0.18922413625228326, + "grad_norm": 3.89601469039917, + "learning_rate": 2.7426722474014924e-05, + "loss": 1.8181, + "step": 97170 + }, + { + "epoch": 0.18925334652339354, + "grad_norm": 3.652013063430786, + "learning_rate": 2.74259514903009e-05, + "loss": 1.816, + "step": 97185 + }, + { + "epoch": 0.1892825567945038, + "grad_norm": 2.7386059761047363, + "learning_rate": 2.742518040194619e-05, + "loss": 1.9621, + "step": 97200 + }, + { + "epoch": 0.18931176706561406, + "grad_norm": 2.4260880947113037, + "learning_rate": 2.742440920895729e-05, + "loss": 1.8346, + "step": 97215 + }, + { + "epoch": 0.1893409773367243, + "grad_norm": 4.076447486877441, + "learning_rate": 2.7423637911340692e-05, + "loss": 2.0192, + "step": 97230 + }, + { + "epoch": 0.18937018760783458, + "grad_norm": 4.128701210021973, + "learning_rate": 2.7422866509102893e-05, + "loss": 1.8914, + "step": 97245 + }, + { + "epoch": 0.18939939787894486, + "grad_norm": 3.867953062057495, + "learning_rate": 2.7422095002250386e-05, + "loss": 1.883, + "step": 97260 + }, + { + "epoch": 0.1894286081500551, + "grad_norm": 5.5211992263793945, + "learning_rate": 2.7421323390789673e-05, + "loss": 1.9091, + "step": 97275 + }, + { + "epoch": 0.18945781842116538, + "grad_norm": 2.2246036529541016, + "learning_rate": 2.742055167472724e-05, + "loss": 1.8166, + "step": 97290 + }, + { + "epoch": 0.18948702869227563, + "grad_norm": 3.474224328994751, + "learning_rate": 2.7419779854069603e-05, + "loss": 1.9348, + "step": 97305 + }, + { + "epoch": 0.1895162389633859, + "grad_norm": 2.9693291187286377, + "learning_rate": 2.741900792882325e-05, + "loss": 1.849, + "step": 97320 + }, + { + "epoch": 0.18954544923449615, + "grad_norm": 2.077162504196167, + "learning_rate": 2.741823589899469e-05, + "loss": 1.7863, + "step": 97335 + }, + { + "epoch": 0.18957465950560642, + "grad_norm": 3.169417142868042, + "learning_rate": 2.741746376459041e-05, + "loss": 1.861, + "step": 97350 + }, + { + "epoch": 0.1896038697767167, + "grad_norm": 3.4711220264434814, + "learning_rate": 2.7416691525616924e-05, + "loss": 1.8907, + "step": 97365 + }, + { + "epoch": 0.18963308004782695, + "grad_norm": 4.315990924835205, + "learning_rate": 2.7415919182080735e-05, + "loss": 1.7817, + "step": 97380 + }, + { + "epoch": 0.18966229031893722, + "grad_norm": 4.196987628936768, + "learning_rate": 2.7415146733988344e-05, + "loss": 1.7574, + "step": 97395 + }, + { + "epoch": 0.18969150059004747, + "grad_norm": 4.502289295196533, + "learning_rate": 2.7414374181346253e-05, + "loss": 1.8566, + "step": 97410 + }, + { + "epoch": 0.18972071086115774, + "grad_norm": 2.7968826293945312, + "learning_rate": 2.7413601524160973e-05, + "loss": 1.9529, + "step": 97425 + }, + { + "epoch": 0.189749921132268, + "grad_norm": 2.5761303901672363, + "learning_rate": 2.7412828762439007e-05, + "loss": 1.834, + "step": 97440 + }, + { + "epoch": 0.18977913140337827, + "grad_norm": 2.548686981201172, + "learning_rate": 2.7412055896186867e-05, + "loss": 1.8549, + "step": 97455 + }, + { + "epoch": 0.18980834167448854, + "grad_norm": 2.964200019836426, + "learning_rate": 2.7411282925411058e-05, + "loss": 1.9426, + "step": 97470 + }, + { + "epoch": 0.1898375519455988, + "grad_norm": 3.5004358291625977, + "learning_rate": 2.741050985011809e-05, + "loss": 1.9806, + "step": 97485 + }, + { + "epoch": 0.18986676221670906, + "grad_norm": 2.998915433883667, + "learning_rate": 2.7409736670314473e-05, + "loss": 1.9187, + "step": 97500 + }, + { + "epoch": 0.1898959724878193, + "grad_norm": 2.5376968383789062, + "learning_rate": 2.7408963386006717e-05, + "loss": 1.8504, + "step": 97515 + }, + { + "epoch": 0.18992518275892958, + "grad_norm": 3.5255417823791504, + "learning_rate": 2.7408189997201338e-05, + "loss": 1.8449, + "step": 97530 + }, + { + "epoch": 0.18995439303003983, + "grad_norm": 3.004173517227173, + "learning_rate": 2.7407416503904845e-05, + "loss": 1.841, + "step": 97545 + }, + { + "epoch": 0.1899836033011501, + "grad_norm": 3.23822021484375, + "learning_rate": 2.7406642906123752e-05, + "loss": 1.8683, + "step": 97560 + }, + { + "epoch": 0.19001281357226038, + "grad_norm": 4.210470676422119, + "learning_rate": 2.740586920386457e-05, + "loss": 1.7629, + "step": 97575 + }, + { + "epoch": 0.19004202384337063, + "grad_norm": 5.122344970703125, + "learning_rate": 2.7405095397133822e-05, + "loss": 1.8879, + "step": 97590 + }, + { + "epoch": 0.1900712341144809, + "grad_norm": 4.214742183685303, + "learning_rate": 2.740432148593802e-05, + "loss": 2.0512, + "step": 97605 + }, + { + "epoch": 0.19010044438559115, + "grad_norm": 3.9678585529327393, + "learning_rate": 2.7403547470283688e-05, + "loss": 1.9101, + "step": 97620 + }, + { + "epoch": 0.19012965465670142, + "grad_norm": 2.191659688949585, + "learning_rate": 2.740277335017733e-05, + "loss": 1.8364, + "step": 97635 + }, + { + "epoch": 0.19015886492781167, + "grad_norm": 3.389317274093628, + "learning_rate": 2.740199912562548e-05, + "loss": 1.8048, + "step": 97650 + }, + { + "epoch": 0.19018807519892195, + "grad_norm": 4.079713821411133, + "learning_rate": 2.7401224796634643e-05, + "loss": 1.85, + "step": 97665 + }, + { + "epoch": 0.19021728547003222, + "grad_norm": 2.4994170665740967, + "learning_rate": 2.7400450363211356e-05, + "loss": 1.9945, + "step": 97680 + }, + { + "epoch": 0.19024649574114247, + "grad_norm": 2.838797092437744, + "learning_rate": 2.7399675825362127e-05, + "loss": 2.0209, + "step": 97695 + }, + { + "epoch": 0.19027570601225274, + "grad_norm": 5.547287940979004, + "learning_rate": 2.7398901183093482e-05, + "loss": 1.7824, + "step": 97710 + }, + { + "epoch": 0.190304916283363, + "grad_norm": 2.7238988876342773, + "learning_rate": 2.7398126436411953e-05, + "loss": 1.9496, + "step": 97725 + }, + { + "epoch": 0.19033412655447327, + "grad_norm": 4.330476760864258, + "learning_rate": 2.7397351585324055e-05, + "loss": 1.6926, + "step": 97740 + }, + { + "epoch": 0.1903633368255835, + "grad_norm": 3.3357396125793457, + "learning_rate": 2.7396576629836317e-05, + "loss": 1.9422, + "step": 97755 + }, + { + "epoch": 0.1903925470966938, + "grad_norm": 2.8375051021575928, + "learning_rate": 2.7395801569955262e-05, + "loss": 1.8387, + "step": 97770 + }, + { + "epoch": 0.19042175736780406, + "grad_norm": 2.7693121433258057, + "learning_rate": 2.739502640568742e-05, + "loss": 1.8185, + "step": 97785 + }, + { + "epoch": 0.1904509676389143, + "grad_norm": 2.830531358718872, + "learning_rate": 2.7394251137039315e-05, + "loss": 1.8547, + "step": 97800 + }, + { + "epoch": 0.19048017791002458, + "grad_norm": 2.4769906997680664, + "learning_rate": 2.7393475764017477e-05, + "loss": 1.8052, + "step": 97815 + }, + { + "epoch": 0.19050938818113483, + "grad_norm": 2.9135239124298096, + "learning_rate": 2.739270028662844e-05, + "loss": 1.9, + "step": 97830 + }, + { + "epoch": 0.1905385984522451, + "grad_norm": 4.1970696449279785, + "learning_rate": 2.739192470487873e-05, + "loss": 1.7013, + "step": 97845 + }, + { + "epoch": 0.19056780872335535, + "grad_norm": 3.5004892349243164, + "learning_rate": 2.7391149018774882e-05, + "loss": 2.1666, + "step": 97860 + }, + { + "epoch": 0.19059701899446563, + "grad_norm": 4.679566383361816, + "learning_rate": 2.739037322832342e-05, + "loss": 1.8901, + "step": 97875 + }, + { + "epoch": 0.1906262292655759, + "grad_norm": 4.124343395233154, + "learning_rate": 2.738959733353088e-05, + "loss": 1.8964, + "step": 97890 + }, + { + "epoch": 0.19065543953668615, + "grad_norm": 2.440563440322876, + "learning_rate": 2.7388821334403805e-05, + "loss": 1.7209, + "step": 97905 + }, + { + "epoch": 0.19068464980779642, + "grad_norm": 3.4518566131591797, + "learning_rate": 2.7388045230948723e-05, + "loss": 1.8846, + "step": 97920 + }, + { + "epoch": 0.19071386007890667, + "grad_norm": 1.9016444683074951, + "learning_rate": 2.7387269023172168e-05, + "loss": 2.2767, + "step": 97935 + }, + { + "epoch": 0.19074307035001695, + "grad_norm": 3.8910601139068604, + "learning_rate": 2.7386492711080677e-05, + "loss": 1.8349, + "step": 97950 + }, + { + "epoch": 0.1907722806211272, + "grad_norm": 2.518460273742676, + "learning_rate": 2.7385716294680792e-05, + "loss": 1.7221, + "step": 97965 + }, + { + "epoch": 0.19080149089223747, + "grad_norm": 4.1605401039123535, + "learning_rate": 2.7384939773979045e-05, + "loss": 1.9406, + "step": 97980 + }, + { + "epoch": 0.19083070116334774, + "grad_norm": 3.5834851264953613, + "learning_rate": 2.7384163148981976e-05, + "loss": 1.984, + "step": 97995 + }, + { + "epoch": 0.190859911434458, + "grad_norm": 2.4051644802093506, + "learning_rate": 2.738338641969613e-05, + "loss": 1.7082, + "step": 98010 + }, + { + "epoch": 0.19088912170556827, + "grad_norm": 3.2309203147888184, + "learning_rate": 2.738260958612805e-05, + "loss": 1.9846, + "step": 98025 + }, + { + "epoch": 0.1909183319766785, + "grad_norm": 6.118927955627441, + "learning_rate": 2.7381832648284266e-05, + "loss": 2.053, + "step": 98040 + }, + { + "epoch": 0.1909475422477888, + "grad_norm": 2.5381014347076416, + "learning_rate": 2.738105560617133e-05, + "loss": 1.9495, + "step": 98055 + }, + { + "epoch": 0.19097675251889903, + "grad_norm": 4.563059329986572, + "learning_rate": 2.7380278459795785e-05, + "loss": 1.6179, + "step": 98070 + }, + { + "epoch": 0.1910059627900093, + "grad_norm": 3.149650812149048, + "learning_rate": 2.7379501209164175e-05, + "loss": 2.1345, + "step": 98085 + }, + { + "epoch": 0.19103517306111958, + "grad_norm": 3.5917398929595947, + "learning_rate": 2.7378723854283043e-05, + "loss": 1.7976, + "step": 98100 + }, + { + "epoch": 0.19106438333222983, + "grad_norm": 4.222459316253662, + "learning_rate": 2.7377946395158932e-05, + "loss": 1.7862, + "step": 98115 + }, + { + "epoch": 0.1910935936033401, + "grad_norm": 2.7618374824523926, + "learning_rate": 2.7377168831798398e-05, + "loss": 2.0711, + "step": 98130 + }, + { + "epoch": 0.19112280387445035, + "grad_norm": 4.186985969543457, + "learning_rate": 2.737639116420799e-05, + "loss": 1.8207, + "step": 98145 + }, + { + "epoch": 0.19115201414556063, + "grad_norm": 4.633997440338135, + "learning_rate": 2.7375613392394242e-05, + "loss": 1.6984, + "step": 98160 + }, + { + "epoch": 0.19118122441667088, + "grad_norm": 2.2348268032073975, + "learning_rate": 2.7374835516363716e-05, + "loss": 1.7661, + "step": 98175 + }, + { + "epoch": 0.19121043468778115, + "grad_norm": 5.371476173400879, + "learning_rate": 2.7374057536122956e-05, + "loss": 1.9816, + "step": 98190 + }, + { + "epoch": 0.19123964495889142, + "grad_norm": 3.1411774158477783, + "learning_rate": 2.737327945167852e-05, + "loss": 1.9243, + "step": 98205 + }, + { + "epoch": 0.19126885523000167, + "grad_norm": 2.6102187633514404, + "learning_rate": 2.7372501263036963e-05, + "loss": 1.8374, + "step": 98220 + }, + { + "epoch": 0.19129806550111195, + "grad_norm": 4.532242774963379, + "learning_rate": 2.7371722970204827e-05, + "loss": 1.8869, + "step": 98235 + }, + { + "epoch": 0.1913272757722222, + "grad_norm": 2.0787672996520996, + "learning_rate": 2.7370944573188672e-05, + "loss": 1.9825, + "step": 98250 + }, + { + "epoch": 0.19135648604333247, + "grad_norm": 2.3189220428466797, + "learning_rate": 2.7370166071995054e-05, + "loss": 1.8875, + "step": 98265 + }, + { + "epoch": 0.19138569631444272, + "grad_norm": 4.265535354614258, + "learning_rate": 2.7369387466630532e-05, + "loss": 1.8543, + "step": 98280 + }, + { + "epoch": 0.191414906585553, + "grad_norm": 3.1422064304351807, + "learning_rate": 2.7368608757101653e-05, + "loss": 1.7058, + "step": 98295 + }, + { + "epoch": 0.19144411685666327, + "grad_norm": 4.074655055999756, + "learning_rate": 2.7367829943414978e-05, + "loss": 1.7893, + "step": 98310 + }, + { + "epoch": 0.1914733271277735, + "grad_norm": 3.4092836380004883, + "learning_rate": 2.736705102557707e-05, + "loss": 1.7863, + "step": 98325 + }, + { + "epoch": 0.1915025373988838, + "grad_norm": 3.297553777694702, + "learning_rate": 2.7366272003594485e-05, + "loss": 1.8567, + "step": 98340 + }, + { + "epoch": 0.19153174766999403, + "grad_norm": 5.096502780914307, + "learning_rate": 2.736549287747379e-05, + "loss": 1.8927, + "step": 98355 + }, + { + "epoch": 0.1915609579411043, + "grad_norm": 4.202325344085693, + "learning_rate": 2.7364713647221536e-05, + "loss": 1.9337, + "step": 98370 + }, + { + "epoch": 0.19159016821221456, + "grad_norm": 2.0702078342437744, + "learning_rate": 2.736393431284429e-05, + "loss": 1.9669, + "step": 98385 + }, + { + "epoch": 0.19161937848332483, + "grad_norm": 5.40820837020874, + "learning_rate": 2.736315487434861e-05, + "loss": 1.9961, + "step": 98400 + }, + { + "epoch": 0.1916485887544351, + "grad_norm": 2.0197336673736572, + "learning_rate": 2.7362375331741073e-05, + "loss": 1.9129, + "step": 98415 + }, + { + "epoch": 0.19167779902554535, + "grad_norm": 3.3395638465881348, + "learning_rate": 2.736159568502823e-05, + "loss": 1.7832, + "step": 98430 + }, + { + "epoch": 0.19170700929665563, + "grad_norm": 2.246309280395508, + "learning_rate": 2.736081593421665e-05, + "loss": 1.7512, + "step": 98445 + }, + { + "epoch": 0.19173621956776588, + "grad_norm": 2.5655367374420166, + "learning_rate": 2.73600360793129e-05, + "loss": 1.8966, + "step": 98460 + }, + { + "epoch": 0.19176542983887615, + "grad_norm": 2.062715768814087, + "learning_rate": 2.7359256120323548e-05, + "loss": 1.837, + "step": 98475 + }, + { + "epoch": 0.1917946401099864, + "grad_norm": 2.883363962173462, + "learning_rate": 2.735847605725516e-05, + "loss": 1.8393, + "step": 98490 + }, + { + "epoch": 0.19182385038109667, + "grad_norm": 2.9619500637054443, + "learning_rate": 2.735769589011431e-05, + "loss": 1.9786, + "step": 98505 + }, + { + "epoch": 0.19185306065220695, + "grad_norm": 3.9903581142425537, + "learning_rate": 2.7356915618907564e-05, + "loss": 1.744, + "step": 98520 + }, + { + "epoch": 0.1918822709233172, + "grad_norm": 4.258721828460693, + "learning_rate": 2.73561352436415e-05, + "loss": 1.6222, + "step": 98535 + }, + { + "epoch": 0.19191148119442747, + "grad_norm": 3.652784585952759, + "learning_rate": 2.7355354764322673e-05, + "loss": 1.6778, + "step": 98550 + }, + { + "epoch": 0.19194069146553772, + "grad_norm": 2.6614902019500732, + "learning_rate": 2.7354574180957672e-05, + "loss": 1.7881, + "step": 98565 + }, + { + "epoch": 0.191969901736648, + "grad_norm": 2.3420228958129883, + "learning_rate": 2.7353793493553064e-05, + "loss": 1.844, + "step": 98580 + }, + { + "epoch": 0.19199911200775824, + "grad_norm": 3.3624324798583984, + "learning_rate": 2.735301270211542e-05, + "loss": 2.0371, + "step": 98595 + }, + { + "epoch": 0.1920283222788685, + "grad_norm": 4.438570022583008, + "learning_rate": 2.735223180665132e-05, + "loss": 2.0159, + "step": 98610 + }, + { + "epoch": 0.19205753254997876, + "grad_norm": 4.7717976570129395, + "learning_rate": 2.7351450807167342e-05, + "loss": 1.8667, + "step": 98625 + }, + { + "epoch": 0.19208674282108903, + "grad_norm": 4.630250930786133, + "learning_rate": 2.7350669703670057e-05, + "loss": 1.7348, + "step": 98640 + }, + { + "epoch": 0.1921159530921993, + "grad_norm": 3.2538516521453857, + "learning_rate": 2.7349888496166043e-05, + "loss": 1.6179, + "step": 98655 + }, + { + "epoch": 0.19214516336330956, + "grad_norm": 2.664191484451294, + "learning_rate": 2.7349107184661884e-05, + "loss": 2.0296, + "step": 98670 + }, + { + "epoch": 0.19217437363441983, + "grad_norm": 3.005632162094116, + "learning_rate": 2.7348325769164157e-05, + "loss": 1.8226, + "step": 98685 + }, + { + "epoch": 0.19220358390553008, + "grad_norm": 4.0505571365356445, + "learning_rate": 2.734754424967944e-05, + "loss": 1.8148, + "step": 98700 + }, + { + "epoch": 0.19223279417664035, + "grad_norm": 4.892117023468018, + "learning_rate": 2.7346762626214317e-05, + "loss": 2.0752, + "step": 98715 + }, + { + "epoch": 0.1922620044477506, + "grad_norm": 3.4442551136016846, + "learning_rate": 2.7345980898775372e-05, + "loss": 1.8801, + "step": 98730 + }, + { + "epoch": 0.19229121471886088, + "grad_norm": 3.666714906692505, + "learning_rate": 2.7345199067369183e-05, + "loss": 2.0324, + "step": 98745 + }, + { + "epoch": 0.19232042498997115, + "grad_norm": 4.668206214904785, + "learning_rate": 2.734441713200233e-05, + "loss": 1.9221, + "step": 98760 + }, + { + "epoch": 0.1923496352610814, + "grad_norm": 2.067211151123047, + "learning_rate": 2.734363509268142e-05, + "loss": 1.8333, + "step": 98775 + }, + { + "epoch": 0.19237884553219167, + "grad_norm": 3.525721549987793, + "learning_rate": 2.734285294941301e-05, + "loss": 1.9011, + "step": 98790 + }, + { + "epoch": 0.19240805580330192, + "grad_norm": 2.5275559425354004, + "learning_rate": 2.73420707022037e-05, + "loss": 1.9435, + "step": 98805 + }, + { + "epoch": 0.1924372660744122, + "grad_norm": 2.951507329940796, + "learning_rate": 2.7341288351060082e-05, + "loss": 2.0245, + "step": 98820 + }, + { + "epoch": 0.19246647634552244, + "grad_norm": 3.4309134483337402, + "learning_rate": 2.7340505895988738e-05, + "loss": 2.0222, + "step": 98835 + }, + { + "epoch": 0.19249568661663272, + "grad_norm": 3.0236432552337646, + "learning_rate": 2.7339723336996254e-05, + "loss": 2.0786, + "step": 98850 + }, + { + "epoch": 0.192524896887743, + "grad_norm": 4.2948222160339355, + "learning_rate": 2.733894067408923e-05, + "loss": 1.8052, + "step": 98865 + }, + { + "epoch": 0.19255410715885324, + "grad_norm": 3.584643602371216, + "learning_rate": 2.7338157907274247e-05, + "loss": 1.9111, + "step": 98880 + }, + { + "epoch": 0.1925833174299635, + "grad_norm": 3.0921499729156494, + "learning_rate": 2.7337375036557903e-05, + "loss": 1.9217, + "step": 98895 + }, + { + "epoch": 0.19261252770107376, + "grad_norm": 3.0183024406433105, + "learning_rate": 2.7336592061946786e-05, + "loss": 1.969, + "step": 98910 + }, + { + "epoch": 0.19264173797218404, + "grad_norm": 2.444133996963501, + "learning_rate": 2.7335808983447494e-05, + "loss": 2.002, + "step": 98925 + }, + { + "epoch": 0.19267094824329428, + "grad_norm": 4.221524238586426, + "learning_rate": 2.733502580106662e-05, + "loss": 2.1443, + "step": 98940 + }, + { + "epoch": 0.19270015851440456, + "grad_norm": 2.1210296154022217, + "learning_rate": 2.733424251481076e-05, + "loss": 2.1478, + "step": 98955 + }, + { + "epoch": 0.19272936878551483, + "grad_norm": 2.3690314292907715, + "learning_rate": 2.7333459124686505e-05, + "loss": 1.9956, + "step": 98970 + }, + { + "epoch": 0.19275857905662508, + "grad_norm": 2.468149185180664, + "learning_rate": 2.733267563070046e-05, + "loss": 2.0285, + "step": 98985 + }, + { + "epoch": 0.19278778932773535, + "grad_norm": 4.145179748535156, + "learning_rate": 2.7331892032859217e-05, + "loss": 2.0328, + "step": 99000 + }, + { + "epoch": 0.1928169995988456, + "grad_norm": 3.5643739700317383, + "learning_rate": 2.7331108331169376e-05, + "loss": 1.8457, + "step": 99015 + }, + { + "epoch": 0.19284620986995588, + "grad_norm": 2.854128837585449, + "learning_rate": 2.733032452563754e-05, + "loss": 1.7519, + "step": 99030 + }, + { + "epoch": 0.19287542014106612, + "grad_norm": 1.7001069784164429, + "learning_rate": 2.7329540616270306e-05, + "loss": 2.0158, + "step": 99045 + }, + { + "epoch": 0.1929046304121764, + "grad_norm": 2.600231885910034, + "learning_rate": 2.7328756603074277e-05, + "loss": 1.834, + "step": 99060 + }, + { + "epoch": 0.19293384068328667, + "grad_norm": 4.988858222961426, + "learning_rate": 2.7327972486056054e-05, + "loss": 1.8587, + "step": 99075 + }, + { + "epoch": 0.19296305095439692, + "grad_norm": 3.4738333225250244, + "learning_rate": 2.732718826522224e-05, + "loss": 1.8133, + "step": 99090 + }, + { + "epoch": 0.1929922612255072, + "grad_norm": 4.5725884437561035, + "learning_rate": 2.732640394057944e-05, + "loss": 1.9143, + "step": 99105 + }, + { + "epoch": 0.19302147149661744, + "grad_norm": 3.9497525691986084, + "learning_rate": 2.732561951213426e-05, + "loss": 1.9292, + "step": 99120 + }, + { + "epoch": 0.19305068176772772, + "grad_norm": 3.5975871086120605, + "learning_rate": 2.7324834979893303e-05, + "loss": 1.8409, + "step": 99135 + }, + { + "epoch": 0.19307989203883796, + "grad_norm": 2.214834451675415, + "learning_rate": 2.732405034386318e-05, + "loss": 1.8517, + "step": 99150 + }, + { + "epoch": 0.19310910230994824, + "grad_norm": 2.8448970317840576, + "learning_rate": 2.732326560405049e-05, + "loss": 1.8687, + "step": 99165 + }, + { + "epoch": 0.1931383125810585, + "grad_norm": 2.991927146911621, + "learning_rate": 2.7322480760461857e-05, + "loss": 1.7387, + "step": 99180 + }, + { + "epoch": 0.19316752285216876, + "grad_norm": 2.2114076614379883, + "learning_rate": 2.732169581310387e-05, + "loss": 1.9305, + "step": 99195 + }, + { + "epoch": 0.19319673312327904, + "grad_norm": 5.362480640411377, + "learning_rate": 2.7320910761983154e-05, + "loss": 1.7242, + "step": 99210 + }, + { + "epoch": 0.19322594339438928, + "grad_norm": 3.048447847366333, + "learning_rate": 2.732012560710632e-05, + "loss": 1.9179, + "step": 99225 + }, + { + "epoch": 0.19325515366549956, + "grad_norm": 4.592885494232178, + "learning_rate": 2.731934034847997e-05, + "loss": 1.9377, + "step": 99240 + }, + { + "epoch": 0.1932843639366098, + "grad_norm": 3.9882423877716064, + "learning_rate": 2.731855498611072e-05, + "loss": 1.9332, + "step": 99255 + }, + { + "epoch": 0.19331357420772008, + "grad_norm": 3.3673927783966064, + "learning_rate": 2.731776952000519e-05, + "loss": 1.9364, + "step": 99270 + }, + { + "epoch": 0.19334278447883035, + "grad_norm": 3.6662065982818604, + "learning_rate": 2.7316983950169993e-05, + "loss": 1.7326, + "step": 99285 + }, + { + "epoch": 0.1933719947499406, + "grad_norm": 4.374509334564209, + "learning_rate": 2.731619827661174e-05, + "loss": 1.9286, + "step": 99300 + }, + { + "epoch": 0.19340120502105088, + "grad_norm": 3.4628331661224365, + "learning_rate": 2.7315412499337045e-05, + "loss": 1.7022, + "step": 99315 + }, + { + "epoch": 0.19343041529216112, + "grad_norm": 2.579603433609009, + "learning_rate": 2.731462661835253e-05, + "loss": 1.7704, + "step": 99330 + }, + { + "epoch": 0.1934596255632714, + "grad_norm": 3.091069221496582, + "learning_rate": 2.7313840633664815e-05, + "loss": 1.7778, + "step": 99345 + }, + { + "epoch": 0.19348883583438165, + "grad_norm": 5.090400695800781, + "learning_rate": 2.7313054545280514e-05, + "loss": 1.7285, + "step": 99360 + }, + { + "epoch": 0.19351804610549192, + "grad_norm": 2.546351671218872, + "learning_rate": 2.7312268353206253e-05, + "loss": 1.9417, + "step": 99375 + }, + { + "epoch": 0.1935472563766022, + "grad_norm": 4.185542583465576, + "learning_rate": 2.7311482057448643e-05, + "loss": 1.6706, + "step": 99390 + }, + { + "epoch": 0.19357646664771244, + "grad_norm": 3.260465383529663, + "learning_rate": 2.731069565801432e-05, + "loss": 2.1129, + "step": 99405 + }, + { + "epoch": 0.19360567691882272, + "grad_norm": 3.882298707962036, + "learning_rate": 2.7309909154909887e-05, + "loss": 2.0626, + "step": 99420 + }, + { + "epoch": 0.19363488718993296, + "grad_norm": 3.9773478507995605, + "learning_rate": 2.7309122548141982e-05, + "loss": 1.9497, + "step": 99435 + }, + { + "epoch": 0.19366409746104324, + "grad_norm": 4.090893268585205, + "learning_rate": 2.7308335837717228e-05, + "loss": 1.9007, + "step": 99450 + }, + { + "epoch": 0.19369330773215349, + "grad_norm": 2.3632359504699707, + "learning_rate": 2.730754902364224e-05, + "loss": 2.0099, + "step": 99465 + }, + { + "epoch": 0.19372251800326376, + "grad_norm": 4.095473289489746, + "learning_rate": 2.7306762105923652e-05, + "loss": 1.7044, + "step": 99480 + }, + { + "epoch": 0.19375172827437404, + "grad_norm": 6.2895379066467285, + "learning_rate": 2.7305975084568092e-05, + "loss": 1.9064, + "step": 99495 + }, + { + "epoch": 0.19378093854548428, + "grad_norm": 2.8147449493408203, + "learning_rate": 2.7305187959582188e-05, + "loss": 1.9864, + "step": 99510 + }, + { + "epoch": 0.19381014881659456, + "grad_norm": 3.9521355628967285, + "learning_rate": 2.7304400730972563e-05, + "loss": 2.043, + "step": 99525 + }, + { + "epoch": 0.1938393590877048, + "grad_norm": 2.527974843978882, + "learning_rate": 2.730361339874585e-05, + "loss": 1.865, + "step": 99540 + }, + { + "epoch": 0.19386856935881508, + "grad_norm": 4.282250881195068, + "learning_rate": 2.7302825962908675e-05, + "loss": 1.8109, + "step": 99555 + }, + { + "epoch": 0.19389777962992533, + "grad_norm": 3.112752914428711, + "learning_rate": 2.7302038423467672e-05, + "loss": 1.7857, + "step": 99570 + }, + { + "epoch": 0.1939269899010356, + "grad_norm": 4.542616844177246, + "learning_rate": 2.7301250780429475e-05, + "loss": 1.8044, + "step": 99585 + }, + { + "epoch": 0.19395620017214588, + "grad_norm": 4.9732160568237305, + "learning_rate": 2.7300463033800714e-05, + "loss": 1.9769, + "step": 99600 + }, + { + "epoch": 0.19398541044325612, + "grad_norm": 1.8190230131149292, + "learning_rate": 2.7299675183588026e-05, + "loss": 1.9834, + "step": 99615 + }, + { + "epoch": 0.1940146207143664, + "grad_norm": 3.0133800506591797, + "learning_rate": 2.729888722979804e-05, + "loss": 1.9321, + "step": 99630 + }, + { + "epoch": 0.19404383098547665, + "grad_norm": 5.051206111907959, + "learning_rate": 2.7298099172437398e-05, + "loss": 2.0088, + "step": 99645 + }, + { + "epoch": 0.19407304125658692, + "grad_norm": 2.936155080795288, + "learning_rate": 2.729731101151273e-05, + "loss": 1.8523, + "step": 99660 + }, + { + "epoch": 0.19410225152769717, + "grad_norm": 3.5338001251220703, + "learning_rate": 2.729652274703068e-05, + "loss": 2.1233, + "step": 99675 + }, + { + "epoch": 0.19413146179880744, + "grad_norm": 2.6141345500946045, + "learning_rate": 2.729573437899788e-05, + "loss": 1.8505, + "step": 99690 + }, + { + "epoch": 0.19416067206991772, + "grad_norm": 2.730879068374634, + "learning_rate": 2.7294945907420974e-05, + "loss": 1.6956, + "step": 99705 + }, + { + "epoch": 0.19418988234102796, + "grad_norm": 4.244674205780029, + "learning_rate": 2.7294157332306596e-05, + "loss": 1.641, + "step": 99720 + }, + { + "epoch": 0.19421909261213824, + "grad_norm": 2.3634984493255615, + "learning_rate": 2.729336865366139e-05, + "loss": 1.9858, + "step": 99735 + }, + { + "epoch": 0.19424830288324849, + "grad_norm": 4.2105231285095215, + "learning_rate": 2.7292579871492002e-05, + "loss": 1.8381, + "step": 99750 + }, + { + "epoch": 0.19427751315435876, + "grad_norm": 3.905348062515259, + "learning_rate": 2.7291790985805067e-05, + "loss": 1.7924, + "step": 99765 + }, + { + "epoch": 0.194306723425469, + "grad_norm": 3.042111873626709, + "learning_rate": 2.7291001996607232e-05, + "loss": 1.8397, + "step": 99780 + }, + { + "epoch": 0.19433593369657928, + "grad_norm": 4.014068126678467, + "learning_rate": 2.729021290390514e-05, + "loss": 2.2284, + "step": 99795 + }, + { + "epoch": 0.19436514396768956, + "grad_norm": 2.6055431365966797, + "learning_rate": 2.7289423707705436e-05, + "loss": 1.9419, + "step": 99810 + }, + { + "epoch": 0.1943943542387998, + "grad_norm": 2.497370719909668, + "learning_rate": 2.728863440801477e-05, + "loss": 1.8782, + "step": 99825 + }, + { + "epoch": 0.19442356450991008, + "grad_norm": 3.4753448963165283, + "learning_rate": 2.728784500483978e-05, + "loss": 1.9226, + "step": 99840 + }, + { + "epoch": 0.19445277478102033, + "grad_norm": 3.7824976444244385, + "learning_rate": 2.7287055498187122e-05, + "loss": 1.7604, + "step": 99855 + }, + { + "epoch": 0.1944819850521306, + "grad_norm": 3.9716172218322754, + "learning_rate": 2.728626588806344e-05, + "loss": 1.8012, + "step": 99870 + }, + { + "epoch": 0.19451119532324085, + "grad_norm": 2.4061131477355957, + "learning_rate": 2.728547617447539e-05, + "loss": 1.8795, + "step": 99885 + }, + { + "epoch": 0.19454040559435112, + "grad_norm": 3.517012596130371, + "learning_rate": 2.7284686357429613e-05, + "loss": 1.9715, + "step": 99900 + }, + { + "epoch": 0.1945696158654614, + "grad_norm": 4.13925313949585, + "learning_rate": 2.7283896436932765e-05, + "loss": 1.8814, + "step": 99915 + }, + { + "epoch": 0.19459882613657165, + "grad_norm": 2.505760669708252, + "learning_rate": 2.7283106412991496e-05, + "loss": 1.8949, + "step": 99930 + }, + { + "epoch": 0.19462803640768192, + "grad_norm": 2.5170321464538574, + "learning_rate": 2.728231628561246e-05, + "loss": 1.8203, + "step": 99945 + }, + { + "epoch": 0.19465724667879217, + "grad_norm": 3.488593101501465, + "learning_rate": 2.7281526054802314e-05, + "loss": 1.9206, + "step": 99960 + }, + { + "epoch": 0.19468645694990244, + "grad_norm": 5.53490686416626, + "learning_rate": 2.728073572056771e-05, + "loss": 1.8825, + "step": 99975 + }, + { + "epoch": 0.1947156672210127, + "grad_norm": 3.8407371044158936, + "learning_rate": 2.7279945282915302e-05, + "loss": 2.0369, + "step": 99990 + }, + { + "epoch": 0.19474487749212296, + "grad_norm": 3.424163818359375, + "learning_rate": 2.7279154741851746e-05, + "loss": 2.0105, + "step": 100005 + }, + { + "epoch": 0.19477408776323324, + "grad_norm": 5.5570573806762695, + "learning_rate": 2.72783640973837e-05, + "loss": 1.7985, + "step": 100020 + }, + { + "epoch": 0.1948032980343435, + "grad_norm": 2.386298418045044, + "learning_rate": 2.7277573349517826e-05, + "loss": 1.8656, + "step": 100035 + }, + { + "epoch": 0.19483250830545376, + "grad_norm": 4.574279308319092, + "learning_rate": 2.7276782498260782e-05, + "loss": 1.8927, + "step": 100050 + }, + { + "epoch": 0.194861718576564, + "grad_norm": 3.1473605632781982, + "learning_rate": 2.727599154361922e-05, + "loss": 1.7297, + "step": 100065 + }, + { + "epoch": 0.19489092884767428, + "grad_norm": 2.5923187732696533, + "learning_rate": 2.727520048559981e-05, + "loss": 1.7668, + "step": 100080 + }, + { + "epoch": 0.19492013911878453, + "grad_norm": 2.161724090576172, + "learning_rate": 2.7274409324209212e-05, + "loss": 1.8331, + "step": 100095 + }, + { + "epoch": 0.1949493493898948, + "grad_norm": 4.286500930786133, + "learning_rate": 2.7273618059454085e-05, + "loss": 1.8603, + "step": 100110 + }, + { + "epoch": 0.19497855966100508, + "grad_norm": 4.223363399505615, + "learning_rate": 2.727282669134109e-05, + "loss": 1.6897, + "step": 100125 + }, + { + "epoch": 0.19500776993211533, + "grad_norm": 3.794745445251465, + "learning_rate": 2.7272035219876903e-05, + "loss": 1.9935, + "step": 100140 + }, + { + "epoch": 0.1950369802032256, + "grad_norm": 4.161808967590332, + "learning_rate": 2.7271243645068175e-05, + "loss": 1.9776, + "step": 100155 + }, + { + "epoch": 0.19506619047433585, + "grad_norm": 3.5837323665618896, + "learning_rate": 2.7270451966921583e-05, + "loss": 1.8638, + "step": 100170 + }, + { + "epoch": 0.19509540074544612, + "grad_norm": 2.1057465076446533, + "learning_rate": 2.7269660185443788e-05, + "loss": 1.8915, + "step": 100185 + }, + { + "epoch": 0.19512461101655637, + "grad_norm": 5.059057712554932, + "learning_rate": 2.726886830064146e-05, + "loss": 1.7365, + "step": 100200 + }, + { + "epoch": 0.19515382128766665, + "grad_norm": 3.8220999240875244, + "learning_rate": 2.7268076312521264e-05, + "loss": 2.0257, + "step": 100215 + }, + { + "epoch": 0.19518303155877692, + "grad_norm": 3.4628665447235107, + "learning_rate": 2.7267284221089872e-05, + "loss": 1.7957, + "step": 100230 + }, + { + "epoch": 0.19521224182988717, + "grad_norm": 5.714832305908203, + "learning_rate": 2.7266492026353955e-05, + "loss": 1.8802, + "step": 100245 + }, + { + "epoch": 0.19524145210099744, + "grad_norm": 1.9526153802871704, + "learning_rate": 2.726569972832018e-05, + "loss": 2.0261, + "step": 100260 + }, + { + "epoch": 0.1952706623721077, + "grad_norm": 2.6683590412139893, + "learning_rate": 2.7264907326995233e-05, + "loss": 1.7891, + "step": 100275 + }, + { + "epoch": 0.19529987264321796, + "grad_norm": 2.4714248180389404, + "learning_rate": 2.7264114822385766e-05, + "loss": 1.8686, + "step": 100290 + }, + { + "epoch": 0.1953290829143282, + "grad_norm": 4.144501209259033, + "learning_rate": 2.7263322214498463e-05, + "loss": 1.8619, + "step": 100305 + }, + { + "epoch": 0.1953582931854385, + "grad_norm": 2.18325138092041, + "learning_rate": 2.7262529503340006e-05, + "loss": 1.7987, + "step": 100320 + }, + { + "epoch": 0.19538750345654876, + "grad_norm": 2.6011135578155518, + "learning_rate": 2.726173668891706e-05, + "loss": 1.7998, + "step": 100335 + }, + { + "epoch": 0.195416713727659, + "grad_norm": 5.343276500701904, + "learning_rate": 2.7260943771236305e-05, + "loss": 1.7828, + "step": 100350 + }, + { + "epoch": 0.19544592399876928, + "grad_norm": 2.661271572113037, + "learning_rate": 2.7260150750304414e-05, + "loss": 1.8616, + "step": 100365 + }, + { + "epoch": 0.19547513426987953, + "grad_norm": 1.8658612966537476, + "learning_rate": 2.7259357626128074e-05, + "loss": 1.8845, + "step": 100380 + }, + { + "epoch": 0.1955043445409898, + "grad_norm": 2.7381432056427, + "learning_rate": 2.725856439871396e-05, + "loss": 1.8771, + "step": 100395 + }, + { + "epoch": 0.19553355481210005, + "grad_norm": 3.811016082763672, + "learning_rate": 2.725777106806875e-05, + "loss": 1.9539, + "step": 100410 + }, + { + "epoch": 0.19556276508321033, + "grad_norm": 3.301814317703247, + "learning_rate": 2.7256977634199125e-05, + "loss": 1.7698, + "step": 100425 + }, + { + "epoch": 0.1955919753543206, + "grad_norm": 2.319570541381836, + "learning_rate": 2.7256184097111766e-05, + "loss": 1.8089, + "step": 100440 + }, + { + "epoch": 0.19562118562543085, + "grad_norm": 2.4597103595733643, + "learning_rate": 2.725539045681336e-05, + "loss": 1.9821, + "step": 100455 + }, + { + "epoch": 0.19565039589654112, + "grad_norm": 2.810140609741211, + "learning_rate": 2.7254596713310586e-05, + "loss": 1.9725, + "step": 100470 + }, + { + "epoch": 0.19567960616765137, + "grad_norm": 3.0905210971832275, + "learning_rate": 2.7253802866610127e-05, + "loss": 1.7908, + "step": 100485 + }, + { + "epoch": 0.19570881643876165, + "grad_norm": 3.9158473014831543, + "learning_rate": 2.7253008916718673e-05, + "loss": 1.8927, + "step": 100500 + }, + { + "epoch": 0.1957380267098719, + "grad_norm": 2.886533498764038, + "learning_rate": 2.725221486364291e-05, + "loss": 1.9053, + "step": 100515 + }, + { + "epoch": 0.19576723698098217, + "grad_norm": 3.3901984691619873, + "learning_rate": 2.7251420707389516e-05, + "loss": 1.8508, + "step": 100530 + }, + { + "epoch": 0.19579644725209244, + "grad_norm": 3.7472264766693115, + "learning_rate": 2.7250626447965193e-05, + "loss": 1.8289, + "step": 100545 + }, + { + "epoch": 0.1958256575232027, + "grad_norm": 2.9605493545532227, + "learning_rate": 2.7249832085376617e-05, + "loss": 1.8736, + "step": 100560 + }, + { + "epoch": 0.19585486779431296, + "grad_norm": 2.974295139312744, + "learning_rate": 2.724903761963048e-05, + "loss": 1.8893, + "step": 100575 + }, + { + "epoch": 0.1958840780654232, + "grad_norm": 2.5947487354278564, + "learning_rate": 2.724824305073348e-05, + "loss": 2.0498, + "step": 100590 + }, + { + "epoch": 0.1959132883365335, + "grad_norm": 3.8707077503204346, + "learning_rate": 2.72474483786923e-05, + "loss": 1.9041, + "step": 100605 + }, + { + "epoch": 0.19594249860764373, + "grad_norm": 2.7378580570220947, + "learning_rate": 2.7246653603513637e-05, + "loss": 1.7201, + "step": 100620 + }, + { + "epoch": 0.195971708878754, + "grad_norm": 3.4011311531066895, + "learning_rate": 2.7245858725204176e-05, + "loss": 1.7655, + "step": 100635 + }, + { + "epoch": 0.19600091914986428, + "grad_norm": 3.0134902000427246, + "learning_rate": 2.724506374377062e-05, + "loss": 1.712, + "step": 100650 + }, + { + "epoch": 0.19603012942097453, + "grad_norm": 3.073152542114258, + "learning_rate": 2.724426865921966e-05, + "loss": 1.9858, + "step": 100665 + }, + { + "epoch": 0.1960593396920848, + "grad_norm": 3.228030204772949, + "learning_rate": 2.724347347155799e-05, + "loss": 1.8776, + "step": 100680 + }, + { + "epoch": 0.19608854996319505, + "grad_norm": 3.795522451400757, + "learning_rate": 2.7242678180792307e-05, + "loss": 2.134, + "step": 100695 + }, + { + "epoch": 0.19611776023430533, + "grad_norm": 5.153021812438965, + "learning_rate": 2.7241882786929314e-05, + "loss": 2.0464, + "step": 100710 + }, + { + "epoch": 0.19614697050541557, + "grad_norm": 3.5811140537261963, + "learning_rate": 2.72410872899757e-05, + "loss": 1.8878, + "step": 100725 + }, + { + "epoch": 0.19617618077652585, + "grad_norm": 5.196473598480225, + "learning_rate": 2.724029168993817e-05, + "loss": 1.7658, + "step": 100740 + }, + { + "epoch": 0.1962053910476361, + "grad_norm": 3.976027488708496, + "learning_rate": 2.7239495986823425e-05, + "loss": 1.879, + "step": 100755 + }, + { + "epoch": 0.19623460131874637, + "grad_norm": 3.8692712783813477, + "learning_rate": 2.723870018063816e-05, + "loss": 1.8352, + "step": 100770 + }, + { + "epoch": 0.19626381158985665, + "grad_norm": 5.235087871551514, + "learning_rate": 2.7237904271389078e-05, + "loss": 1.8567, + "step": 100785 + }, + { + "epoch": 0.1962930218609669, + "grad_norm": 3.3791890144348145, + "learning_rate": 2.7237108259082885e-05, + "loss": 1.8126, + "step": 100800 + }, + { + "epoch": 0.19632223213207717, + "grad_norm": 1.9021275043487549, + "learning_rate": 2.723631214372628e-05, + "loss": 1.7514, + "step": 100815 + }, + { + "epoch": 0.19635144240318742, + "grad_norm": 3.4778172969818115, + "learning_rate": 2.723551592532597e-05, + "loss": 1.756, + "step": 100830 + }, + { + "epoch": 0.1963806526742977, + "grad_norm": 3.4496116638183594, + "learning_rate": 2.7234719603888664e-05, + "loss": 2.1882, + "step": 100845 + }, + { + "epoch": 0.19640986294540794, + "grad_norm": 3.6033730506896973, + "learning_rate": 2.7233923179421053e-05, + "loss": 1.9174, + "step": 100860 + }, + { + "epoch": 0.1964390732165182, + "grad_norm": 2.8869011402130127, + "learning_rate": 2.723312665192986e-05, + "loss": 1.5821, + "step": 100875 + }, + { + "epoch": 0.1964682834876285, + "grad_norm": 4.109874248504639, + "learning_rate": 2.7232330021421794e-05, + "loss": 1.7938, + "step": 100890 + }, + { + "epoch": 0.19649749375873873, + "grad_norm": 3.354278802871704, + "learning_rate": 2.723153328790355e-05, + "loss": 1.8709, + "step": 100905 + }, + { + "epoch": 0.196526704029849, + "grad_norm": 3.013465166091919, + "learning_rate": 2.7230736451381844e-05, + "loss": 1.8894, + "step": 100920 + }, + { + "epoch": 0.19655591430095926, + "grad_norm": 3.2619619369506836, + "learning_rate": 2.7229939511863387e-05, + "loss": 1.8466, + "step": 100935 + }, + { + "epoch": 0.19658512457206953, + "grad_norm": 2.6464648246765137, + "learning_rate": 2.7229142469354886e-05, + "loss": 1.9893, + "step": 100950 + }, + { + "epoch": 0.19661433484317978, + "grad_norm": 2.8995745182037354, + "learning_rate": 2.7228345323863066e-05, + "loss": 1.7353, + "step": 100965 + }, + { + "epoch": 0.19664354511429005, + "grad_norm": 3.9069721698760986, + "learning_rate": 2.7227548075394617e-05, + "loss": 1.8366, + "step": 100980 + }, + { + "epoch": 0.19667275538540033, + "grad_norm": 2.674079656600952, + "learning_rate": 2.7226750723956273e-05, + "loss": 1.7994, + "step": 100995 + }, + { + "epoch": 0.19670196565651057, + "grad_norm": 3.381258726119995, + "learning_rate": 2.7225953269554742e-05, + "loss": 1.9097, + "step": 101010 + }, + { + "epoch": 0.19673117592762085, + "grad_norm": 4.749322414398193, + "learning_rate": 2.7225155712196735e-05, + "loss": 1.9171, + "step": 101025 + }, + { + "epoch": 0.1967603861987311, + "grad_norm": 2.7452809810638428, + "learning_rate": 2.722435805188898e-05, + "loss": 2.0804, + "step": 101040 + }, + { + "epoch": 0.19678959646984137, + "grad_norm": 3.3356921672821045, + "learning_rate": 2.7223560288638178e-05, + "loss": 1.9619, + "step": 101055 + }, + { + "epoch": 0.19681880674095162, + "grad_norm": 1.8505098819732666, + "learning_rate": 2.722276242245106e-05, + "loss": 1.9478, + "step": 101070 + }, + { + "epoch": 0.1968480170120619, + "grad_norm": 3.203836441040039, + "learning_rate": 2.7221964453334344e-05, + "loss": 1.8802, + "step": 101085 + }, + { + "epoch": 0.19687722728317217, + "grad_norm": 2.7620935440063477, + "learning_rate": 2.7221166381294737e-05, + "loss": 1.8911, + "step": 101100 + }, + { + "epoch": 0.19690643755428242, + "grad_norm": 3.008974075317383, + "learning_rate": 2.7220368206338978e-05, + "loss": 1.7459, + "step": 101115 + }, + { + "epoch": 0.1969356478253927, + "grad_norm": 4.506922245025635, + "learning_rate": 2.7219569928473774e-05, + "loss": 1.7932, + "step": 101130 + }, + { + "epoch": 0.19696485809650294, + "grad_norm": 2.9958689212799072, + "learning_rate": 2.7218771547705858e-05, + "loss": 1.9051, + "step": 101145 + }, + { + "epoch": 0.1969940683676132, + "grad_norm": 4.214073181152344, + "learning_rate": 2.7217973064041944e-05, + "loss": 1.8098, + "step": 101160 + }, + { + "epoch": 0.19702327863872346, + "grad_norm": 2.3376572132110596, + "learning_rate": 2.7217174477488762e-05, + "loss": 1.7378, + "step": 101175 + }, + { + "epoch": 0.19705248890983373, + "grad_norm": 2.2366175651550293, + "learning_rate": 2.7216375788053038e-05, + "loss": 1.9518, + "step": 101190 + }, + { + "epoch": 0.197081699180944, + "grad_norm": 4.249980926513672, + "learning_rate": 2.7215576995741485e-05, + "loss": 1.8563, + "step": 101205 + }, + { + "epoch": 0.19711090945205426, + "grad_norm": 3.6357455253601074, + "learning_rate": 2.721477810056085e-05, + "loss": 1.972, + "step": 101220 + }, + { + "epoch": 0.19714011972316453, + "grad_norm": 3.260871648788452, + "learning_rate": 2.721397910251785e-05, + "loss": 1.6136, + "step": 101235 + }, + { + "epoch": 0.19716932999427478, + "grad_norm": 4.484538555145264, + "learning_rate": 2.7213180001619213e-05, + "loss": 1.9045, + "step": 101250 + }, + { + "epoch": 0.19719854026538505, + "grad_norm": 2.790937662124634, + "learning_rate": 2.721238079787167e-05, + "loss": 1.8332, + "step": 101265 + }, + { + "epoch": 0.1972277505364953, + "grad_norm": 4.38517427444458, + "learning_rate": 2.721158149128195e-05, + "loss": 1.7181, + "step": 101280 + }, + { + "epoch": 0.19725696080760557, + "grad_norm": 2.7955479621887207, + "learning_rate": 2.7210782081856784e-05, + "loss": 1.9955, + "step": 101295 + }, + { + "epoch": 0.19728617107871585, + "grad_norm": 2.502910852432251, + "learning_rate": 2.7209982569602904e-05, + "loss": 2.0606, + "step": 101310 + }, + { + "epoch": 0.1973153813498261, + "grad_norm": 3.5377182960510254, + "learning_rate": 2.7209182954527043e-05, + "loss": 2.0034, + "step": 101325 + }, + { + "epoch": 0.19734459162093637, + "grad_norm": 3.1391892433166504, + "learning_rate": 2.720838323663594e-05, + "loss": 2.0192, + "step": 101340 + }, + { + "epoch": 0.19737380189204662, + "grad_norm": 4.447723388671875, + "learning_rate": 2.7207583415936315e-05, + "loss": 1.8772, + "step": 101355 + }, + { + "epoch": 0.1974030121631569, + "grad_norm": 2.988149404525757, + "learning_rate": 2.7206783492434925e-05, + "loss": 1.9467, + "step": 101370 + }, + { + "epoch": 0.19743222243426714, + "grad_norm": 3.634883403778076, + "learning_rate": 2.7205983466138487e-05, + "loss": 2.2269, + "step": 101385 + }, + { + "epoch": 0.19746143270537742, + "grad_norm": 3.9800522327423096, + "learning_rate": 2.7205183337053748e-05, + "loss": 1.9508, + "step": 101400 + }, + { + "epoch": 0.1974906429764877, + "grad_norm": 3.017993927001953, + "learning_rate": 2.720438310518744e-05, + "loss": 2.078, + "step": 101415 + }, + { + "epoch": 0.19751985324759794, + "grad_norm": 2.509310245513916, + "learning_rate": 2.720358277054631e-05, + "loss": 1.7947, + "step": 101430 + }, + { + "epoch": 0.1975490635187082, + "grad_norm": 2.516083002090454, + "learning_rate": 2.7202782333137093e-05, + "loss": 1.8022, + "step": 101445 + }, + { + "epoch": 0.19757827378981846, + "grad_norm": 5.204488277435303, + "learning_rate": 2.720198179296653e-05, + "loss": 1.9213, + "step": 101460 + }, + { + "epoch": 0.19760748406092873, + "grad_norm": 3.5926268100738525, + "learning_rate": 2.7201181150041357e-05, + "loss": 1.9067, + "step": 101475 + }, + { + "epoch": 0.19763669433203898, + "grad_norm": 2.7383644580841064, + "learning_rate": 2.7200380404368328e-05, + "loss": 1.7887, + "step": 101490 + }, + { + "epoch": 0.19766590460314926, + "grad_norm": 2.4527342319488525, + "learning_rate": 2.7199579555954177e-05, + "loss": 2.0757, + "step": 101505 + }, + { + "epoch": 0.19769511487425953, + "grad_norm": 2.781235456466675, + "learning_rate": 2.719877860480565e-05, + "loss": 2.0055, + "step": 101520 + }, + { + "epoch": 0.19772432514536978, + "grad_norm": 3.6254844665527344, + "learning_rate": 2.7197977550929494e-05, + "loss": 1.964, + "step": 101535 + }, + { + "epoch": 0.19775353541648005, + "grad_norm": 2.7392122745513916, + "learning_rate": 2.719717639433245e-05, + "loss": 1.9413, + "step": 101550 + }, + { + "epoch": 0.1977827456875903, + "grad_norm": 4.468549728393555, + "learning_rate": 2.719637513502127e-05, + "loss": 1.982, + "step": 101565 + }, + { + "epoch": 0.19781195595870057, + "grad_norm": 4.49515962600708, + "learning_rate": 2.71955737730027e-05, + "loss": 1.5673, + "step": 101580 + }, + { + "epoch": 0.19784116622981082, + "grad_norm": 3.342057228088379, + "learning_rate": 2.719477230828349e-05, + "loss": 1.778, + "step": 101595 + }, + { + "epoch": 0.1978703765009211, + "grad_norm": 4.80141019821167, + "learning_rate": 2.7193970740870383e-05, + "loss": 1.92, + "step": 101610 + }, + { + "epoch": 0.19789958677203137, + "grad_norm": 4.502120018005371, + "learning_rate": 2.719316907077014e-05, + "loss": 1.9432, + "step": 101625 + }, + { + "epoch": 0.19792879704314162, + "grad_norm": 4.221505641937256, + "learning_rate": 2.71923672979895e-05, + "loss": 1.7853, + "step": 101640 + }, + { + "epoch": 0.1979580073142519, + "grad_norm": 2.541995048522949, + "learning_rate": 2.7191565422535223e-05, + "loss": 1.7709, + "step": 101655 + }, + { + "epoch": 0.19798721758536214, + "grad_norm": 4.294101715087891, + "learning_rate": 2.7190763444414057e-05, + "loss": 2.0716, + "step": 101670 + }, + { + "epoch": 0.19801642785647242, + "grad_norm": 2.2642786502838135, + "learning_rate": 2.7189961363632754e-05, + "loss": 1.8188, + "step": 101685 + }, + { + "epoch": 0.19804563812758266, + "grad_norm": 2.8210978507995605, + "learning_rate": 2.718915918019808e-05, + "loss": 1.9474, + "step": 101700 + }, + { + "epoch": 0.19807484839869294, + "grad_norm": 3.8412275314331055, + "learning_rate": 2.7188356894116776e-05, + "loss": 1.8954, + "step": 101715 + }, + { + "epoch": 0.1981040586698032, + "grad_norm": 2.886929750442505, + "learning_rate": 2.7187554505395606e-05, + "loss": 1.9654, + "step": 101730 + }, + { + "epoch": 0.19813326894091346, + "grad_norm": 3.2043139934539795, + "learning_rate": 2.7186752014041327e-05, + "loss": 1.8988, + "step": 101745 + }, + { + "epoch": 0.19816247921202373, + "grad_norm": 3.636002540588379, + "learning_rate": 2.7185949420060695e-05, + "loss": 1.665, + "step": 101760 + }, + { + "epoch": 0.19819168948313398, + "grad_norm": 3.24428653717041, + "learning_rate": 2.7185146723460467e-05, + "loss": 2.0371, + "step": 101775 + }, + { + "epoch": 0.19822089975424426, + "grad_norm": 4.483587741851807, + "learning_rate": 2.7184343924247404e-05, + "loss": 1.9315, + "step": 101790 + }, + { + "epoch": 0.1982501100253545, + "grad_norm": 2.1983368396759033, + "learning_rate": 2.7183541022428267e-05, + "loss": 1.9494, + "step": 101805 + }, + { + "epoch": 0.19827932029646478, + "grad_norm": 3.649796724319458, + "learning_rate": 2.718273801800982e-05, + "loss": 1.8671, + "step": 101820 + }, + { + "epoch": 0.19830853056757505, + "grad_norm": 4.211715221405029, + "learning_rate": 2.7181934910998823e-05, + "loss": 1.981, + "step": 101835 + }, + { + "epoch": 0.1983377408386853, + "grad_norm": 2.3952419757843018, + "learning_rate": 2.718113170140204e-05, + "loss": 1.8291, + "step": 101850 + }, + { + "epoch": 0.19836695110979558, + "grad_norm": 2.39406418800354, + "learning_rate": 2.718032838922623e-05, + "loss": 1.912, + "step": 101865 + }, + { + "epoch": 0.19839616138090582, + "grad_norm": 6.0903778076171875, + "learning_rate": 2.7179524974478163e-05, + "loss": 2.0882, + "step": 101880 + }, + { + "epoch": 0.1984253716520161, + "grad_norm": 3.103097915649414, + "learning_rate": 2.7178721457164605e-05, + "loss": 1.9958, + "step": 101895 + }, + { + "epoch": 0.19845458192312634, + "grad_norm": 3.587754011154175, + "learning_rate": 2.717791783729232e-05, + "loss": 1.8026, + "step": 101910 + }, + { + "epoch": 0.19848379219423662, + "grad_norm": 2.8163771629333496, + "learning_rate": 2.7177114114868072e-05, + "loss": 1.7944, + "step": 101925 + }, + { + "epoch": 0.1985130024653469, + "grad_norm": 3.1890029907226562, + "learning_rate": 2.7176310289898636e-05, + "loss": 1.8578, + "step": 101940 + }, + { + "epoch": 0.19854221273645714, + "grad_norm": 2.3553390502929688, + "learning_rate": 2.7175506362390782e-05, + "loss": 1.8265, + "step": 101955 + }, + { + "epoch": 0.19857142300756742, + "grad_norm": 4.193222522735596, + "learning_rate": 2.717470233235127e-05, + "loss": 1.628, + "step": 101970 + }, + { + "epoch": 0.19860063327867766, + "grad_norm": 2.6176865100860596, + "learning_rate": 2.717389819978688e-05, + "loss": 1.8012, + "step": 101985 + }, + { + "epoch": 0.19862984354978794, + "grad_norm": 1.9875918626785278, + "learning_rate": 2.7173093964704387e-05, + "loss": 1.9923, + "step": 102000 + }, + { + "epoch": 0.19865905382089818, + "grad_norm": 3.303596258163452, + "learning_rate": 2.717228962711055e-05, + "loss": 1.7089, + "step": 102015 + }, + { + "epoch": 0.19868826409200846, + "grad_norm": 2.6512694358825684, + "learning_rate": 2.7171485187012154e-05, + "loss": 1.9137, + "step": 102030 + }, + { + "epoch": 0.19871747436311873, + "grad_norm": 2.2207350730895996, + "learning_rate": 2.7170680644415967e-05, + "loss": 1.8648, + "step": 102045 + }, + { + "epoch": 0.19874668463422898, + "grad_norm": 4.580793380737305, + "learning_rate": 2.7169875999328767e-05, + "loss": 1.8338, + "step": 102060 + }, + { + "epoch": 0.19877589490533926, + "grad_norm": 3.498553514480591, + "learning_rate": 2.7169071251757333e-05, + "loss": 1.8381, + "step": 102075 + }, + { + "epoch": 0.1988051051764495, + "grad_norm": 3.8216021060943604, + "learning_rate": 2.7168266401708434e-05, + "loss": 1.8142, + "step": 102090 + }, + { + "epoch": 0.19883431544755978, + "grad_norm": 3.112802028656006, + "learning_rate": 2.7167461449188853e-05, + "loss": 1.9172, + "step": 102105 + }, + { + "epoch": 0.19886352571867003, + "grad_norm": 3.8839733600616455, + "learning_rate": 2.7166656394205368e-05, + "loss": 1.9819, + "step": 102120 + }, + { + "epoch": 0.1988927359897803, + "grad_norm": 2.4655919075012207, + "learning_rate": 2.7165851236764763e-05, + "loss": 1.9514, + "step": 102135 + }, + { + "epoch": 0.19892194626089058, + "grad_norm": 4.288333415985107, + "learning_rate": 2.716504597687381e-05, + "loss": 1.8178, + "step": 102150 + }, + { + "epoch": 0.19895115653200082, + "grad_norm": 3.875215530395508, + "learning_rate": 2.7164240614539292e-05, + "loss": 1.9124, + "step": 102165 + }, + { + "epoch": 0.1989803668031111, + "grad_norm": 3.0864641666412354, + "learning_rate": 2.7163435149767996e-05, + "loss": 2.0755, + "step": 102180 + }, + { + "epoch": 0.19900957707422134, + "grad_norm": 3.8712475299835205, + "learning_rate": 2.7162629582566702e-05, + "loss": 2.0153, + "step": 102195 + }, + { + "epoch": 0.19903878734533162, + "grad_norm": 5.153542518615723, + "learning_rate": 2.716182391294219e-05, + "loss": 1.7028, + "step": 102210 + }, + { + "epoch": 0.19906799761644187, + "grad_norm": 2.237034797668457, + "learning_rate": 2.7161018140901253e-05, + "loss": 1.8537, + "step": 102225 + }, + { + "epoch": 0.19909720788755214, + "grad_norm": 2.637420415878296, + "learning_rate": 2.7160212266450667e-05, + "loss": 2.0875, + "step": 102240 + }, + { + "epoch": 0.19912641815866242, + "grad_norm": 3.5613598823547363, + "learning_rate": 2.7159406289597227e-05, + "loss": 1.8819, + "step": 102255 + }, + { + "epoch": 0.19915562842977266, + "grad_norm": 6.260428428649902, + "learning_rate": 2.7158600210347715e-05, + "loss": 1.942, + "step": 102270 + }, + { + "epoch": 0.19918483870088294, + "grad_norm": 2.226191759109497, + "learning_rate": 2.7157794028708918e-05, + "loss": 2.0468, + "step": 102285 + }, + { + "epoch": 0.19921404897199319, + "grad_norm": 2.714599847793579, + "learning_rate": 2.715698774468763e-05, + "loss": 1.9678, + "step": 102300 + }, + { + "epoch": 0.19924325924310346, + "grad_norm": 4.426758766174316, + "learning_rate": 2.7156181358290638e-05, + "loss": 1.7769, + "step": 102315 + }, + { + "epoch": 0.1992724695142137, + "grad_norm": 2.7750377655029297, + "learning_rate": 2.7155374869524735e-05, + "loss": 1.7746, + "step": 102330 + }, + { + "epoch": 0.19930167978532398, + "grad_norm": 3.2409069538116455, + "learning_rate": 2.7154568278396712e-05, + "loss": 1.8149, + "step": 102345 + }, + { + "epoch": 0.19933089005643426, + "grad_norm": 2.874199151992798, + "learning_rate": 2.7153761584913354e-05, + "loss": 1.7783, + "step": 102360 + }, + { + "epoch": 0.1993601003275445, + "grad_norm": 3.955298900604248, + "learning_rate": 2.7152954789081467e-05, + "loss": 1.8125, + "step": 102375 + }, + { + "epoch": 0.19938931059865478, + "grad_norm": 2.1332504749298096, + "learning_rate": 2.7152147890907833e-05, + "loss": 1.7314, + "step": 102390 + }, + { + "epoch": 0.19941852086976503, + "grad_norm": 3.648003578186035, + "learning_rate": 2.7151340890399257e-05, + "loss": 1.8598, + "step": 102405 + }, + { + "epoch": 0.1994477311408753, + "grad_norm": 2.9315056800842285, + "learning_rate": 2.7150533787562527e-05, + "loss": 2.0115, + "step": 102420 + }, + { + "epoch": 0.19947694141198555, + "grad_norm": 2.950479507446289, + "learning_rate": 2.7149726582404445e-05, + "loss": 2.0912, + "step": 102435 + }, + { + "epoch": 0.19950615168309582, + "grad_norm": 3.925987958908081, + "learning_rate": 2.7148919274931806e-05, + "loss": 1.8702, + "step": 102450 + }, + { + "epoch": 0.1995353619542061, + "grad_norm": 3.4937009811401367, + "learning_rate": 2.7148111865151407e-05, + "loss": 1.916, + "step": 102465 + }, + { + "epoch": 0.19956457222531634, + "grad_norm": 4.474387168884277, + "learning_rate": 2.7147304353070052e-05, + "loss": 1.7941, + "step": 102480 + }, + { + "epoch": 0.19959378249642662, + "grad_norm": 2.228379726409912, + "learning_rate": 2.7146496738694537e-05, + "loss": 1.8904, + "step": 102495 + }, + { + "epoch": 0.19962299276753687, + "grad_norm": 3.820441961288452, + "learning_rate": 2.7145689022031666e-05, + "loss": 1.9109, + "step": 102510 + }, + { + "epoch": 0.19965220303864714, + "grad_norm": 2.4948506355285645, + "learning_rate": 2.714488120308824e-05, + "loss": 1.7549, + "step": 102525 + }, + { + "epoch": 0.1996814133097574, + "grad_norm": 3.0644490718841553, + "learning_rate": 2.714407328187106e-05, + "loss": 1.8017, + "step": 102540 + }, + { + "epoch": 0.19971062358086766, + "grad_norm": 2.6746768951416016, + "learning_rate": 2.7143265258386932e-05, + "loss": 1.9385, + "step": 102555 + }, + { + "epoch": 0.19973983385197794, + "grad_norm": 3.4077327251434326, + "learning_rate": 2.7142457132642655e-05, + "loss": 1.7609, + "step": 102570 + }, + { + "epoch": 0.19976904412308819, + "grad_norm": 3.0413081645965576, + "learning_rate": 2.7141648904645048e-05, + "loss": 1.8311, + "step": 102585 + }, + { + "epoch": 0.19979825439419846, + "grad_norm": 3.702873945236206, + "learning_rate": 2.7140840574400898e-05, + "loss": 1.7802, + "step": 102600 + }, + { + "epoch": 0.1998274646653087, + "grad_norm": 3.796261787414551, + "learning_rate": 2.7140032141917028e-05, + "loss": 1.8731, + "step": 102615 + }, + { + "epoch": 0.19985667493641898, + "grad_norm": 4.078587055206299, + "learning_rate": 2.713922360720024e-05, + "loss": 1.9422, + "step": 102630 + }, + { + "epoch": 0.19988588520752923, + "grad_norm": 4.734653949737549, + "learning_rate": 2.7138414970257338e-05, + "loss": 1.9146, + "step": 102645 + }, + { + "epoch": 0.1999150954786395, + "grad_norm": 4.855679988861084, + "learning_rate": 2.713760623109514e-05, + "loss": 1.8551, + "step": 102660 + }, + { + "epoch": 0.19994430574974978, + "grad_norm": 4.696156978607178, + "learning_rate": 2.7136797389720448e-05, + "loss": 1.9568, + "step": 102675 + }, + { + "epoch": 0.19997351602086003, + "grad_norm": 2.8986644744873047, + "learning_rate": 2.7135988446140083e-05, + "loss": 2.0761, + "step": 102690 + }, + { + "epoch": 0.2000027262919703, + "grad_norm": 3.2962539196014404, + "learning_rate": 2.7135179400360853e-05, + "loss": 1.9931, + "step": 102705 + }, + { + "epoch": 0.20003193656308055, + "grad_norm": 4.5320916175842285, + "learning_rate": 2.713437025238957e-05, + "loss": 1.8497, + "step": 102720 + }, + { + "epoch": 0.20006114683419082, + "grad_norm": 2.005916118621826, + "learning_rate": 2.7133561002233044e-05, + "loss": 2.0168, + "step": 102735 + }, + { + "epoch": 0.20009035710530107, + "grad_norm": 3.2702598571777344, + "learning_rate": 2.71327516498981e-05, + "loss": 1.8448, + "step": 102750 + }, + { + "epoch": 0.20011956737641134, + "grad_norm": 2.131328582763672, + "learning_rate": 2.7131942195391543e-05, + "loss": 1.9693, + "step": 102765 + }, + { + "epoch": 0.20014877764752162, + "grad_norm": 2.6156246662139893, + "learning_rate": 2.7131132638720196e-05, + "loss": 1.8124, + "step": 102780 + }, + { + "epoch": 0.20017798791863187, + "grad_norm": 2.9488697052001953, + "learning_rate": 2.7130322979890882e-05, + "loss": 1.8311, + "step": 102795 + }, + { + "epoch": 0.20020719818974214, + "grad_norm": 2.3783297538757324, + "learning_rate": 2.7129513218910407e-05, + "loss": 1.9323, + "step": 102810 + }, + { + "epoch": 0.2002364084608524, + "grad_norm": 2.0522263050079346, + "learning_rate": 2.712870335578559e-05, + "loss": 1.7895, + "step": 102825 + }, + { + "epoch": 0.20026561873196266, + "grad_norm": 3.9332668781280518, + "learning_rate": 2.7127893390523265e-05, + "loss": 1.7973, + "step": 102840 + }, + { + "epoch": 0.2002948290030729, + "grad_norm": 4.039329528808594, + "learning_rate": 2.712708332313024e-05, + "loss": 1.8541, + "step": 102855 + }, + { + "epoch": 0.20032403927418319, + "grad_norm": 3.6007204055786133, + "learning_rate": 2.712627315361334e-05, + "loss": 1.8732, + "step": 102870 + }, + { + "epoch": 0.20035324954529343, + "grad_norm": 2.704641342163086, + "learning_rate": 2.7125462881979388e-05, + "loss": 1.9446, + "step": 102885 + }, + { + "epoch": 0.2003824598164037, + "grad_norm": 3.416064500808716, + "learning_rate": 2.7124652508235208e-05, + "loss": 1.9198, + "step": 102900 + }, + { + "epoch": 0.20041167008751398, + "grad_norm": 3.379969358444214, + "learning_rate": 2.7123842032387624e-05, + "loss": 1.7285, + "step": 102915 + }, + { + "epoch": 0.20044088035862423, + "grad_norm": 3.459026336669922, + "learning_rate": 2.7123031454443462e-05, + "loss": 1.9199, + "step": 102930 + }, + { + "epoch": 0.2004700906297345, + "grad_norm": 4.292699813842773, + "learning_rate": 2.7122220774409546e-05, + "loss": 2.0013, + "step": 102945 + }, + { + "epoch": 0.20049930090084475, + "grad_norm": 1.6678625345230103, + "learning_rate": 2.7121409992292703e-05, + "loss": 1.8642, + "step": 102960 + }, + { + "epoch": 0.20052851117195503, + "grad_norm": 4.050575256347656, + "learning_rate": 2.7120599108099764e-05, + "loss": 1.6726, + "step": 102975 + }, + { + "epoch": 0.20055772144306527, + "grad_norm": 3.3889718055725098, + "learning_rate": 2.7119788121837556e-05, + "loss": 1.9645, + "step": 102990 + }, + { + "epoch": 0.20058693171417555, + "grad_norm": 3.196504831314087, + "learning_rate": 2.7118977033512905e-05, + "loss": 1.8479, + "step": 103005 + }, + { + "epoch": 0.20061614198528582, + "grad_norm": 3.099273681640625, + "learning_rate": 2.7118165843132647e-05, + "loss": 1.8896, + "step": 103020 + }, + { + "epoch": 0.20064535225639607, + "grad_norm": 3.556197166442871, + "learning_rate": 2.7117354550703607e-05, + "loss": 2.0707, + "step": 103035 + }, + { + "epoch": 0.20067456252750634, + "grad_norm": 4.080671310424805, + "learning_rate": 2.711654315623262e-05, + "loss": 1.9226, + "step": 103050 + }, + { + "epoch": 0.2007037727986166, + "grad_norm": 3.0881898403167725, + "learning_rate": 2.711573165972652e-05, + "loss": 1.8523, + "step": 103065 + }, + { + "epoch": 0.20073298306972687, + "grad_norm": 2.312800407409668, + "learning_rate": 2.711492006119214e-05, + "loss": 1.8602, + "step": 103080 + }, + { + "epoch": 0.20076219334083711, + "grad_norm": 4.4333319664001465, + "learning_rate": 2.7114108360636316e-05, + "loss": 1.825, + "step": 103095 + }, + { + "epoch": 0.2007914036119474, + "grad_norm": 3.437143564224243, + "learning_rate": 2.7113296558065878e-05, + "loss": 1.7797, + "step": 103110 + }, + { + "epoch": 0.20082061388305766, + "grad_norm": 2.5153558254241943, + "learning_rate": 2.711248465348767e-05, + "loss": 1.9479, + "step": 103125 + }, + { + "epoch": 0.2008498241541679, + "grad_norm": 3.522582530975342, + "learning_rate": 2.7111672646908518e-05, + "loss": 1.951, + "step": 103140 + }, + { + "epoch": 0.20087903442527819, + "grad_norm": 3.816594362258911, + "learning_rate": 2.7110860538335275e-05, + "loss": 1.8019, + "step": 103155 + }, + { + "epoch": 0.20090824469638843, + "grad_norm": 5.287302494049072, + "learning_rate": 2.711004832777477e-05, + "loss": 2.1064, + "step": 103170 + }, + { + "epoch": 0.2009374549674987, + "grad_norm": 3.466721296310425, + "learning_rate": 2.7109236015233843e-05, + "loss": 1.8129, + "step": 103185 + }, + { + "epoch": 0.20096666523860895, + "grad_norm": 3.488677978515625, + "learning_rate": 2.7108423600719336e-05, + "loss": 1.8422, + "step": 103200 + }, + { + "epoch": 0.20099587550971923, + "grad_norm": 5.367987155914307, + "learning_rate": 2.7107611084238092e-05, + "loss": 1.8231, + "step": 103215 + }, + { + "epoch": 0.2010250857808295, + "grad_norm": 4.408047199249268, + "learning_rate": 2.7106798465796954e-05, + "loss": 1.7306, + "step": 103230 + }, + { + "epoch": 0.20105429605193975, + "grad_norm": 5.00948429107666, + "learning_rate": 2.710598574540276e-05, + "loss": 1.894, + "step": 103245 + }, + { + "epoch": 0.20108350632305003, + "grad_norm": 3.5502610206604004, + "learning_rate": 2.7105172923062363e-05, + "loss": 1.7999, + "step": 103260 + }, + { + "epoch": 0.20111271659416027, + "grad_norm": 3.3690390586853027, + "learning_rate": 2.71043599987826e-05, + "loss": 1.9969, + "step": 103275 + }, + { + "epoch": 0.20114192686527055, + "grad_norm": 2.3844897747039795, + "learning_rate": 2.7103546972570313e-05, + "loss": 1.9513, + "step": 103290 + }, + { + "epoch": 0.2011711371363808, + "grad_norm": 5.026220798492432, + "learning_rate": 2.710273384443236e-05, + "loss": 2.0869, + "step": 103305 + }, + { + "epoch": 0.20120034740749107, + "grad_norm": 2.3385205268859863, + "learning_rate": 2.7101920614375582e-05, + "loss": 1.9342, + "step": 103320 + }, + { + "epoch": 0.20122955767860135, + "grad_norm": 4.285235404968262, + "learning_rate": 2.710110728240683e-05, + "loss": 1.8725, + "step": 103335 + }, + { + "epoch": 0.2012587679497116, + "grad_norm": 3.1919143199920654, + "learning_rate": 2.7100293848532947e-05, + "loss": 1.8921, + "step": 103350 + }, + { + "epoch": 0.20128797822082187, + "grad_norm": 2.3677237033843994, + "learning_rate": 2.7099480312760792e-05, + "loss": 1.8099, + "step": 103365 + }, + { + "epoch": 0.20131718849193211, + "grad_norm": 3.0225563049316406, + "learning_rate": 2.709866667509721e-05, + "loss": 1.8103, + "step": 103380 + }, + { + "epoch": 0.2013463987630424, + "grad_norm": 2.1988725662231445, + "learning_rate": 2.7097852935549058e-05, + "loss": 1.7229, + "step": 103395 + }, + { + "epoch": 0.20137560903415264, + "grad_norm": 3.9102203845977783, + "learning_rate": 2.709703909412318e-05, + "loss": 1.8743, + "step": 103410 + }, + { + "epoch": 0.2014048193052629, + "grad_norm": 2.8515665531158447, + "learning_rate": 2.7096225150826435e-05, + "loss": 2.0625, + "step": 103425 + }, + { + "epoch": 0.20143402957637319, + "grad_norm": 2.164712905883789, + "learning_rate": 2.7095411105665678e-05, + "loss": 1.8505, + "step": 103440 + }, + { + "epoch": 0.20146323984748343, + "grad_norm": 3.596843719482422, + "learning_rate": 2.709459695864776e-05, + "loss": 1.8719, + "step": 103455 + }, + { + "epoch": 0.2014924501185937, + "grad_norm": 3.7793238162994385, + "learning_rate": 2.7093782709779544e-05, + "loss": 1.9638, + "step": 103470 + }, + { + "epoch": 0.20152166038970395, + "grad_norm": 2.9691150188446045, + "learning_rate": 2.7092968359067887e-05, + "loss": 1.9263, + "step": 103485 + }, + { + "epoch": 0.20155087066081423, + "grad_norm": 4.105165004730225, + "learning_rate": 2.7092153906519637e-05, + "loss": 1.7756, + "step": 103500 + }, + { + "epoch": 0.20158008093192448, + "grad_norm": 2.3732666969299316, + "learning_rate": 2.7091339352141655e-05, + "loss": 1.7384, + "step": 103515 + }, + { + "epoch": 0.20160929120303475, + "grad_norm": 3.698559284210205, + "learning_rate": 2.709052469594081e-05, + "loss": 1.9224, + "step": 103530 + }, + { + "epoch": 0.20163850147414503, + "grad_norm": 1.9953960180282593, + "learning_rate": 2.708970993792395e-05, + "loss": 1.8934, + "step": 103545 + }, + { + "epoch": 0.20166771174525527, + "grad_norm": 4.226125717163086, + "learning_rate": 2.708889507809795e-05, + "loss": 1.9125, + "step": 103560 + }, + { + "epoch": 0.20169692201636555, + "grad_norm": 2.2995388507843018, + "learning_rate": 2.708808011646966e-05, + "loss": 1.6493, + "step": 103575 + }, + { + "epoch": 0.2017261322874758, + "grad_norm": 2.9458460807800293, + "learning_rate": 2.7087265053045948e-05, + "loss": 1.9539, + "step": 103590 + }, + { + "epoch": 0.20175534255858607, + "grad_norm": 2.137338161468506, + "learning_rate": 2.7086449887833677e-05, + "loss": 1.928, + "step": 103605 + }, + { + "epoch": 0.20178455282969632, + "grad_norm": 2.3770806789398193, + "learning_rate": 2.7085634620839707e-05, + "loss": 1.8384, + "step": 103620 + }, + { + "epoch": 0.2018137631008066, + "grad_norm": 2.7432820796966553, + "learning_rate": 2.708481925207091e-05, + "loss": 1.7668, + "step": 103635 + }, + { + "epoch": 0.20184297337191687, + "grad_norm": 3.921224355697632, + "learning_rate": 2.7084003781534154e-05, + "loss": 1.9747, + "step": 103650 + }, + { + "epoch": 0.20187218364302711, + "grad_norm": 3.390430450439453, + "learning_rate": 2.70831882092363e-05, + "loss": 1.8953, + "step": 103665 + }, + { + "epoch": 0.2019013939141374, + "grad_norm": 3.7544443607330322, + "learning_rate": 2.7082372535184222e-05, + "loss": 1.9268, + "step": 103680 + }, + { + "epoch": 0.20193060418524764, + "grad_norm": 2.534512758255005, + "learning_rate": 2.708155675938478e-05, + "loss": 1.8246, + "step": 103695 + }, + { + "epoch": 0.2019598144563579, + "grad_norm": 3.969911813735962, + "learning_rate": 2.7080740881844855e-05, + "loss": 1.7909, + "step": 103710 + }, + { + "epoch": 0.20198902472746816, + "grad_norm": 3.7282814979553223, + "learning_rate": 2.707992490257131e-05, + "loss": 1.9399, + "step": 103725 + }, + { + "epoch": 0.20201823499857843, + "grad_norm": 4.160645961761475, + "learning_rate": 2.7079108821571016e-05, + "loss": 1.8279, + "step": 103740 + }, + { + "epoch": 0.2020474452696887, + "grad_norm": 4.1503520011901855, + "learning_rate": 2.707829263885085e-05, + "loss": 1.9176, + "step": 103755 + }, + { + "epoch": 0.20207665554079896, + "grad_norm": 2.209500551223755, + "learning_rate": 2.707747635441768e-05, + "loss": 1.8911, + "step": 103770 + }, + { + "epoch": 0.20210586581190923, + "grad_norm": 2.6259777545928955, + "learning_rate": 2.707665996827839e-05, + "loss": 1.9738, + "step": 103785 + }, + { + "epoch": 0.20213507608301948, + "grad_norm": 4.037173748016357, + "learning_rate": 2.7075843480439842e-05, + "loss": 1.7143, + "step": 103800 + }, + { + "epoch": 0.20216428635412975, + "grad_norm": 3.297034502029419, + "learning_rate": 2.7075026890908917e-05, + "loss": 1.9287, + "step": 103815 + }, + { + "epoch": 0.20219349662524, + "grad_norm": 2.4206628799438477, + "learning_rate": 2.7074210199692496e-05, + "loss": 1.9603, + "step": 103830 + }, + { + "epoch": 0.20222270689635027, + "grad_norm": 2.2325990200042725, + "learning_rate": 2.7073393406797447e-05, + "loss": 1.8784, + "step": 103845 + }, + { + "epoch": 0.20225191716746055, + "grad_norm": 2.370084285736084, + "learning_rate": 2.7072576512230662e-05, + "loss": 1.7058, + "step": 103860 + }, + { + "epoch": 0.2022811274385708, + "grad_norm": 2.903735399246216, + "learning_rate": 2.707175951599901e-05, + "loss": 1.906, + "step": 103875 + }, + { + "epoch": 0.20231033770968107, + "grad_norm": 3.246626853942871, + "learning_rate": 2.7070942418109368e-05, + "loss": 1.8229, + "step": 103890 + }, + { + "epoch": 0.20233954798079132, + "grad_norm": 4.666037082672119, + "learning_rate": 2.7070125218568628e-05, + "loss": 1.8774, + "step": 103905 + }, + { + "epoch": 0.2023687582519016, + "grad_norm": 4.1519694328308105, + "learning_rate": 2.7069307917383663e-05, + "loss": 2.0587, + "step": 103920 + }, + { + "epoch": 0.20239796852301184, + "grad_norm": 3.9576058387756348, + "learning_rate": 2.7068490514561358e-05, + "loss": 1.7374, + "step": 103935 + }, + { + "epoch": 0.20242717879412211, + "grad_norm": 2.1355578899383545, + "learning_rate": 2.70676730101086e-05, + "loss": 1.7597, + "step": 103950 + }, + { + "epoch": 0.2024563890652324, + "grad_norm": 2.4091837406158447, + "learning_rate": 2.7066855404032266e-05, + "loss": 1.8022, + "step": 103965 + }, + { + "epoch": 0.20248559933634264, + "grad_norm": 2.44571852684021, + "learning_rate": 2.706603769633925e-05, + "loss": 1.8487, + "step": 103980 + }, + { + "epoch": 0.2025148096074529, + "grad_norm": 2.6934497356414795, + "learning_rate": 2.706521988703643e-05, + "loss": 1.7884, + "step": 103995 + }, + { + "epoch": 0.20254401987856316, + "grad_norm": 3.907776355743408, + "learning_rate": 2.70644019761307e-05, + "loss": 1.9073, + "step": 104010 + }, + { + "epoch": 0.20257323014967343, + "grad_norm": 2.401895761489868, + "learning_rate": 2.7063583963628943e-05, + "loss": 1.7977, + "step": 104025 + }, + { + "epoch": 0.20260244042078368, + "grad_norm": 4.842097282409668, + "learning_rate": 2.7062765849538046e-05, + "loss": 1.9172, + "step": 104040 + }, + { + "epoch": 0.20263165069189396, + "grad_norm": 3.4322097301483154, + "learning_rate": 2.7061947633864903e-05, + "loss": 1.8985, + "step": 104055 + }, + { + "epoch": 0.20266086096300423, + "grad_norm": 3.141848564147949, + "learning_rate": 2.7061129316616403e-05, + "loss": 1.6927, + "step": 104070 + }, + { + "epoch": 0.20269007123411448, + "grad_norm": 3.1912946701049805, + "learning_rate": 2.7060310897799436e-05, + "loss": 2.0986, + "step": 104085 + }, + { + "epoch": 0.20271928150522475, + "grad_norm": 4.239555358886719, + "learning_rate": 2.7059492377420895e-05, + "loss": 1.9034, + "step": 104100 + }, + { + "epoch": 0.202748491776335, + "grad_norm": 2.3730475902557373, + "learning_rate": 2.705867375548767e-05, + "loss": 2.0115, + "step": 104115 + }, + { + "epoch": 0.20277770204744527, + "grad_norm": 4.532721042633057, + "learning_rate": 2.7057855032006662e-05, + "loss": 1.857, + "step": 104130 + }, + { + "epoch": 0.20280691231855552, + "grad_norm": 5.284004211425781, + "learning_rate": 2.7057036206984763e-05, + "loss": 1.806, + "step": 104145 + }, + { + "epoch": 0.2028361225896658, + "grad_norm": 3.16776967048645, + "learning_rate": 2.705621728042886e-05, + "loss": 1.8424, + "step": 104160 + }, + { + "epoch": 0.20286533286077607, + "grad_norm": 3.300427198410034, + "learning_rate": 2.705539825234586e-05, + "loss": 1.9143, + "step": 104175 + }, + { + "epoch": 0.20289454313188632, + "grad_norm": 3.5622899532318115, + "learning_rate": 2.7054579122742653e-05, + "loss": 1.8092, + "step": 104190 + }, + { + "epoch": 0.2029237534029966, + "grad_norm": 2.274960517883301, + "learning_rate": 2.705375989162614e-05, + "loss": 1.8388, + "step": 104205 + }, + { + "epoch": 0.20295296367410684, + "grad_norm": 2.7221145629882812, + "learning_rate": 2.705294055900322e-05, + "loss": 1.7465, + "step": 104220 + }, + { + "epoch": 0.20298217394521711, + "grad_norm": 4.520534038543701, + "learning_rate": 2.705212112488079e-05, + "loss": 1.826, + "step": 104235 + }, + { + "epoch": 0.20301138421632736, + "grad_norm": 3.143634080886841, + "learning_rate": 2.705130158926576e-05, + "loss": 1.8376, + "step": 104250 + }, + { + "epoch": 0.20304059448743764, + "grad_norm": 4.016301155090332, + "learning_rate": 2.7050481952165016e-05, + "loss": 1.9989, + "step": 104265 + }, + { + "epoch": 0.2030698047585479, + "grad_norm": 2.420607805252075, + "learning_rate": 2.7049662213585472e-05, + "loss": 1.8995, + "step": 104280 + }, + { + "epoch": 0.20309901502965816, + "grad_norm": 3.2352330684661865, + "learning_rate": 2.7048842373534034e-05, + "loss": 1.9802, + "step": 104295 + }, + { + "epoch": 0.20312822530076843, + "grad_norm": 4.717593193054199, + "learning_rate": 2.704802243201759e-05, + "loss": 2.0489, + "step": 104310 + }, + { + "epoch": 0.20315743557187868, + "grad_norm": 3.2965643405914307, + "learning_rate": 2.704720238904306e-05, + "loss": 1.9178, + "step": 104325 + }, + { + "epoch": 0.20318664584298896, + "grad_norm": 2.39512038230896, + "learning_rate": 2.704638224461734e-05, + "loss": 1.9478, + "step": 104340 + }, + { + "epoch": 0.2032158561140992, + "grad_norm": 4.014657020568848, + "learning_rate": 2.7045561998747347e-05, + "loss": 2.0277, + "step": 104355 + }, + { + "epoch": 0.20324506638520948, + "grad_norm": 4.472046375274658, + "learning_rate": 2.704474165143998e-05, + "loss": 1.9822, + "step": 104370 + }, + { + "epoch": 0.20327427665631975, + "grad_norm": 3.159424304962158, + "learning_rate": 2.7043921202702148e-05, + "loss": 2.0528, + "step": 104385 + }, + { + "epoch": 0.20330348692743, + "grad_norm": 2.7689337730407715, + "learning_rate": 2.704310065254076e-05, + "loss": 1.7795, + "step": 104400 + }, + { + "epoch": 0.20333269719854027, + "grad_norm": 2.1698110103607178, + "learning_rate": 2.704228000096273e-05, + "loss": 1.9219, + "step": 104415 + }, + { + "epoch": 0.20336190746965052, + "grad_norm": 4.467400550842285, + "learning_rate": 2.704145924797497e-05, + "loss": 1.865, + "step": 104430 + }, + { + "epoch": 0.2033911177407608, + "grad_norm": 4.180427551269531, + "learning_rate": 2.7040638393584383e-05, + "loss": 1.9281, + "step": 104445 + }, + { + "epoch": 0.20342032801187104, + "grad_norm": 3.548189401626587, + "learning_rate": 2.7039817437797884e-05, + "loss": 1.8264, + "step": 104460 + }, + { + "epoch": 0.20344953828298132, + "grad_norm": 3.7842812538146973, + "learning_rate": 2.7038996380622395e-05, + "loss": 1.8274, + "step": 104475 + }, + { + "epoch": 0.2034787485540916, + "grad_norm": 2.923065662384033, + "learning_rate": 2.7038175222064825e-05, + "loss": 1.7776, + "step": 104490 + }, + { + "epoch": 0.20350795882520184, + "grad_norm": 5.125633239746094, + "learning_rate": 2.7037353962132085e-05, + "loss": 2.1145, + "step": 104505 + }, + { + "epoch": 0.20353716909631211, + "grad_norm": 3.130587100982666, + "learning_rate": 2.7036532600831093e-05, + "loss": 2.0625, + "step": 104520 + }, + { + "epoch": 0.20356637936742236, + "grad_norm": 3.0003304481506348, + "learning_rate": 2.703571113816877e-05, + "loss": 1.8489, + "step": 104535 + }, + { + "epoch": 0.20359558963853264, + "grad_norm": 4.782923698425293, + "learning_rate": 2.7034889574152028e-05, + "loss": 1.8088, + "step": 104550 + }, + { + "epoch": 0.20362479990964288, + "grad_norm": 4.285282611846924, + "learning_rate": 2.703406790878779e-05, + "loss": 1.9812, + "step": 104565 + }, + { + "epoch": 0.20365401018075316, + "grad_norm": 2.134176015853882, + "learning_rate": 2.7033246142082977e-05, + "loss": 1.7831, + "step": 104580 + }, + { + "epoch": 0.20368322045186343, + "grad_norm": 2.755265235900879, + "learning_rate": 2.7032424274044498e-05, + "loss": 1.9783, + "step": 104595 + }, + { + "epoch": 0.20371243072297368, + "grad_norm": 2.0685677528381348, + "learning_rate": 2.7031602304679285e-05, + "loss": 2.0031, + "step": 104610 + }, + { + "epoch": 0.20374164099408396, + "grad_norm": 1.9223815202713013, + "learning_rate": 2.7030780233994262e-05, + "loss": 2.0563, + "step": 104625 + }, + { + "epoch": 0.2037708512651942, + "grad_norm": 2.5546908378601074, + "learning_rate": 2.7029958061996346e-05, + "loss": 1.7657, + "step": 104640 + }, + { + "epoch": 0.20380006153630448, + "grad_norm": 4.194926738739014, + "learning_rate": 2.7029135788692456e-05, + "loss": 1.8606, + "step": 104655 + }, + { + "epoch": 0.20382927180741472, + "grad_norm": 3.4608054161071777, + "learning_rate": 2.7028313414089524e-05, + "loss": 1.7947, + "step": 104670 + }, + { + "epoch": 0.203858482078525, + "grad_norm": 2.4103293418884277, + "learning_rate": 2.7027490938194473e-05, + "loss": 2.0522, + "step": 104685 + }, + { + "epoch": 0.20388769234963527, + "grad_norm": 3.410761594772339, + "learning_rate": 2.7026668361014227e-05, + "loss": 2.0221, + "step": 104700 + }, + { + "epoch": 0.20391690262074552, + "grad_norm": 3.8220906257629395, + "learning_rate": 2.7025845682555718e-05, + "loss": 2.0778, + "step": 104715 + }, + { + "epoch": 0.2039461128918558, + "grad_norm": 3.373382568359375, + "learning_rate": 2.702502290282587e-05, + "loss": 1.8084, + "step": 104730 + }, + { + "epoch": 0.20397532316296604, + "grad_norm": 3.017993927001953, + "learning_rate": 2.7024200021831615e-05, + "loss": 2.0226, + "step": 104745 + }, + { + "epoch": 0.20400453343407632, + "grad_norm": 4.596261501312256, + "learning_rate": 2.7023377039579874e-05, + "loss": 2.0108, + "step": 104760 + }, + { + "epoch": 0.20403374370518657, + "grad_norm": 3.1485347747802734, + "learning_rate": 2.7022553956077593e-05, + "loss": 1.8845, + "step": 104775 + }, + { + "epoch": 0.20406295397629684, + "grad_norm": 2.4047646522521973, + "learning_rate": 2.7021730771331686e-05, + "loss": 1.7433, + "step": 104790 + }, + { + "epoch": 0.20409216424740712, + "grad_norm": 3.2625772953033447, + "learning_rate": 2.7020907485349096e-05, + "loss": 1.8587, + "step": 104805 + }, + { + "epoch": 0.20412137451851736, + "grad_norm": 2.0681827068328857, + "learning_rate": 2.7020084098136758e-05, + "loss": 1.8219, + "step": 104820 + }, + { + "epoch": 0.20415058478962764, + "grad_norm": 3.6618101596832275, + "learning_rate": 2.7019260609701594e-05, + "loss": 2.09, + "step": 104835 + }, + { + "epoch": 0.20417979506073788, + "grad_norm": 3.0171432495117188, + "learning_rate": 2.701843702005055e-05, + "loss": 1.957, + "step": 104850 + }, + { + "epoch": 0.20420900533184816, + "grad_norm": 4.452732563018799, + "learning_rate": 2.701761332919056e-05, + "loss": 2.0083, + "step": 104865 + }, + { + "epoch": 0.2042382156029584, + "grad_norm": 3.656525135040283, + "learning_rate": 2.7016789537128554e-05, + "loss": 1.8614, + "step": 104880 + }, + { + "epoch": 0.20426742587406868, + "grad_norm": 4.117002487182617, + "learning_rate": 2.7015965643871473e-05, + "loss": 1.8147, + "step": 104895 + }, + { + "epoch": 0.20429663614517896, + "grad_norm": 2.860213279724121, + "learning_rate": 2.701514164942626e-05, + "loss": 1.9346, + "step": 104910 + }, + { + "epoch": 0.2043258464162892, + "grad_norm": 2.736870527267456, + "learning_rate": 2.7014317553799844e-05, + "loss": 2.0484, + "step": 104925 + }, + { + "epoch": 0.20435505668739948, + "grad_norm": 3.932136058807373, + "learning_rate": 2.7013493356999172e-05, + "loss": 1.8952, + "step": 104940 + }, + { + "epoch": 0.20438426695850972, + "grad_norm": 2.1967592239379883, + "learning_rate": 2.7012669059031185e-05, + "loss": 1.8207, + "step": 104955 + }, + { + "epoch": 0.20441347722962, + "grad_norm": 5.245597839355469, + "learning_rate": 2.701184465990282e-05, + "loss": 1.9467, + "step": 104970 + }, + { + "epoch": 0.20444268750073025, + "grad_norm": 3.5043270587921143, + "learning_rate": 2.7011020159621022e-05, + "loss": 1.9864, + "step": 104985 + }, + { + "epoch": 0.20447189777184052, + "grad_norm": 3.504653215408325, + "learning_rate": 2.7010195558192734e-05, + "loss": 1.942, + "step": 105000 + }, + { + "epoch": 0.20450110804295077, + "grad_norm": 4.149007797241211, + "learning_rate": 2.70093708556249e-05, + "loss": 1.8938, + "step": 105015 + }, + { + "epoch": 0.20453031831406104, + "grad_norm": 2.8751494884490967, + "learning_rate": 2.7008546051924465e-05, + "loss": 1.6703, + "step": 105030 + }, + { + "epoch": 0.20455952858517132, + "grad_norm": 4.414788246154785, + "learning_rate": 2.7007721147098374e-05, + "loss": 1.8568, + "step": 105045 + }, + { + "epoch": 0.20458873885628157, + "grad_norm": 4.349208354949951, + "learning_rate": 2.7006896141153575e-05, + "loss": 1.9922, + "step": 105060 + }, + { + "epoch": 0.20461794912739184, + "grad_norm": 3.0307412147521973, + "learning_rate": 2.7006071034097016e-05, + "loss": 1.8679, + "step": 105075 + }, + { + "epoch": 0.2046471593985021, + "grad_norm": 4.721350193023682, + "learning_rate": 2.7005245825935644e-05, + "loss": 1.8604, + "step": 105090 + }, + { + "epoch": 0.20467636966961236, + "grad_norm": 2.6964800357818604, + "learning_rate": 2.7004420516676406e-05, + "loss": 1.9879, + "step": 105105 + }, + { + "epoch": 0.2047055799407226, + "grad_norm": 5.850434303283691, + "learning_rate": 2.7003595106326257e-05, + "loss": 1.8275, + "step": 105120 + }, + { + "epoch": 0.20473479021183288, + "grad_norm": 3.57669734954834, + "learning_rate": 2.7002769594892144e-05, + "loss": 1.9108, + "step": 105135 + }, + { + "epoch": 0.20476400048294316, + "grad_norm": 2.936375617980957, + "learning_rate": 2.700194398238102e-05, + "loss": 1.9061, + "step": 105150 + }, + { + "epoch": 0.2047932107540534, + "grad_norm": 2.5022313594818115, + "learning_rate": 2.7001118268799838e-05, + "loss": 1.8225, + "step": 105165 + }, + { + "epoch": 0.20482242102516368, + "grad_norm": 3.7294983863830566, + "learning_rate": 2.700029245415555e-05, + "loss": 1.9532, + "step": 105180 + }, + { + "epoch": 0.20485163129627393, + "grad_norm": 2.8929784297943115, + "learning_rate": 2.6999466538455116e-05, + "loss": 1.8597, + "step": 105195 + }, + { + "epoch": 0.2048808415673842, + "grad_norm": 2.210120916366577, + "learning_rate": 2.6998640521705484e-05, + "loss": 2.0459, + "step": 105210 + }, + { + "epoch": 0.20491005183849445, + "grad_norm": 3.9188921451568604, + "learning_rate": 2.6997814403913613e-05, + "loss": 1.8567, + "step": 105225 + }, + { + "epoch": 0.20493926210960473, + "grad_norm": 3.322479248046875, + "learning_rate": 2.6996988185086456e-05, + "loss": 2.0404, + "step": 105240 + }, + { + "epoch": 0.204968472380715, + "grad_norm": 3.046121120452881, + "learning_rate": 2.6996161865230977e-05, + "loss": 1.7884, + "step": 105255 + }, + { + "epoch": 0.20499768265182525, + "grad_norm": 6.27782678604126, + "learning_rate": 2.6995335444354133e-05, + "loss": 1.9162, + "step": 105270 + }, + { + "epoch": 0.20502689292293552, + "grad_norm": 2.7504706382751465, + "learning_rate": 2.699450892246288e-05, + "loss": 2.043, + "step": 105285 + }, + { + "epoch": 0.20505610319404577, + "grad_norm": 2.5467782020568848, + "learning_rate": 2.699368229956418e-05, + "loss": 1.6969, + "step": 105300 + }, + { + "epoch": 0.20508531346515604, + "grad_norm": 2.5343880653381348, + "learning_rate": 2.6992855575664998e-05, + "loss": 1.8864, + "step": 105315 + }, + { + "epoch": 0.2051145237362663, + "grad_norm": 3.6631815433502197, + "learning_rate": 2.699202875077229e-05, + "loss": 1.9435, + "step": 105330 + }, + { + "epoch": 0.20514373400737657, + "grad_norm": 2.9802725315093994, + "learning_rate": 2.6991201824893023e-05, + "loss": 1.8798, + "step": 105345 + }, + { + "epoch": 0.20517294427848684, + "grad_norm": 4.25886344909668, + "learning_rate": 2.6990374798034155e-05, + "loss": 1.9169, + "step": 105360 + }, + { + "epoch": 0.2052021545495971, + "grad_norm": 3.4540529251098633, + "learning_rate": 2.6989547670202658e-05, + "loss": 2.0239, + "step": 105375 + }, + { + "epoch": 0.20523136482070736, + "grad_norm": 2.622673988342285, + "learning_rate": 2.6988720441405494e-05, + "loss": 1.9153, + "step": 105390 + }, + { + "epoch": 0.2052605750918176, + "grad_norm": 3.2731680870056152, + "learning_rate": 2.698789311164963e-05, + "loss": 1.8395, + "step": 105405 + }, + { + "epoch": 0.20528978536292788, + "grad_norm": 2.665842294692993, + "learning_rate": 2.6987065680942023e-05, + "loss": 1.9024, + "step": 105420 + }, + { + "epoch": 0.20531899563403813, + "grad_norm": 4.578698635101318, + "learning_rate": 2.6986238149289662e-05, + "loss": 1.7924, + "step": 105435 + }, + { + "epoch": 0.2053482059051484, + "grad_norm": 4.171306610107422, + "learning_rate": 2.6985410516699495e-05, + "loss": 1.9609, + "step": 105450 + }, + { + "epoch": 0.20537741617625868, + "grad_norm": 2.9240505695343018, + "learning_rate": 2.69845827831785e-05, + "loss": 1.8032, + "step": 105465 + }, + { + "epoch": 0.20540662644736893, + "grad_norm": 3.5840320587158203, + "learning_rate": 2.6983754948733654e-05, + "loss": 2.0775, + "step": 105480 + }, + { + "epoch": 0.2054358367184792, + "grad_norm": 3.6236684322357178, + "learning_rate": 2.6982927013371922e-05, + "loss": 1.8801, + "step": 105495 + }, + { + "epoch": 0.20546504698958945, + "grad_norm": 4.0121049880981445, + "learning_rate": 2.6982098977100276e-05, + "loss": 1.84, + "step": 105510 + }, + { + "epoch": 0.20549425726069973, + "grad_norm": 3.368441581726074, + "learning_rate": 2.6981270839925687e-05, + "loss": 1.8816, + "step": 105525 + }, + { + "epoch": 0.20552346753180997, + "grad_norm": 4.0678300857543945, + "learning_rate": 2.698044260185513e-05, + "loss": 1.9054, + "step": 105540 + }, + { + "epoch": 0.20555267780292025, + "grad_norm": 3.091350793838501, + "learning_rate": 2.697961426289558e-05, + "loss": 2.0138, + "step": 105555 + }, + { + "epoch": 0.20558188807403052, + "grad_norm": 3.3561315536499023, + "learning_rate": 2.6978785823054016e-05, + "loss": 2.0233, + "step": 105570 + }, + { + "epoch": 0.20561109834514077, + "grad_norm": 3.110116958618164, + "learning_rate": 2.6977957282337414e-05, + "loss": 1.8429, + "step": 105585 + }, + { + "epoch": 0.20564030861625104, + "grad_norm": 2.644136905670166, + "learning_rate": 2.6977128640752743e-05, + "loss": 1.9269, + "step": 105600 + }, + { + "epoch": 0.2056695188873613, + "grad_norm": 2.190315008163452, + "learning_rate": 2.6976299898306994e-05, + "loss": 1.8551, + "step": 105615 + }, + { + "epoch": 0.20569872915847157, + "grad_norm": 2.293369770050049, + "learning_rate": 2.6975471055007136e-05, + "loss": 1.9136, + "step": 105630 + }, + { + "epoch": 0.2057279394295818, + "grad_norm": 4.882502555847168, + "learning_rate": 2.697464211086015e-05, + "loss": 2.0458, + "step": 105645 + }, + { + "epoch": 0.2057571497006921, + "grad_norm": 4.075145721435547, + "learning_rate": 2.6973813065873022e-05, + "loss": 1.9796, + "step": 105660 + }, + { + "epoch": 0.20578635997180236, + "grad_norm": 4.1603193283081055, + "learning_rate": 2.6972983920052728e-05, + "loss": 1.8569, + "step": 105675 + }, + { + "epoch": 0.2058155702429126, + "grad_norm": 2.5752010345458984, + "learning_rate": 2.6972154673406253e-05, + "loss": 1.9266, + "step": 105690 + }, + { + "epoch": 0.20584478051402288, + "grad_norm": 2.711348533630371, + "learning_rate": 2.697132532594058e-05, + "loss": 2.1029, + "step": 105705 + }, + { + "epoch": 0.20587399078513313, + "grad_norm": 4.228764533996582, + "learning_rate": 2.6970495877662694e-05, + "loss": 1.7632, + "step": 105720 + }, + { + "epoch": 0.2059032010562434, + "grad_norm": 2.9290924072265625, + "learning_rate": 2.6969666328579578e-05, + "loss": 1.9544, + "step": 105735 + }, + { + "epoch": 0.20593241132735365, + "grad_norm": 2.651179313659668, + "learning_rate": 2.6968836678698217e-05, + "loss": 2.1711, + "step": 105750 + }, + { + "epoch": 0.20596162159846393, + "grad_norm": 2.6075551509857178, + "learning_rate": 2.6968006928025597e-05, + "loss": 1.9571, + "step": 105765 + }, + { + "epoch": 0.2059908318695742, + "grad_norm": 2.347520351409912, + "learning_rate": 2.696717707656871e-05, + "loss": 1.825, + "step": 105780 + }, + { + "epoch": 0.20602004214068445, + "grad_norm": 3.01656436920166, + "learning_rate": 2.6966347124334545e-05, + "loss": 1.849, + "step": 105795 + }, + { + "epoch": 0.20604925241179473, + "grad_norm": 2.5775868892669678, + "learning_rate": 2.6965517071330082e-05, + "loss": 1.8596, + "step": 105810 + }, + { + "epoch": 0.20607846268290497, + "grad_norm": 3.0855631828308105, + "learning_rate": 2.696468691756232e-05, + "loss": 1.9145, + "step": 105825 + }, + { + "epoch": 0.20610767295401525, + "grad_norm": 2.4125802516937256, + "learning_rate": 2.696385666303825e-05, + "loss": 1.9105, + "step": 105840 + }, + { + "epoch": 0.2061368832251255, + "grad_norm": 3.250332832336426, + "learning_rate": 2.6963026307764854e-05, + "loss": 1.9376, + "step": 105855 + }, + { + "epoch": 0.20616609349623577, + "grad_norm": 4.559293270111084, + "learning_rate": 2.6962195851749136e-05, + "loss": 1.7326, + "step": 105870 + }, + { + "epoch": 0.20619530376734604, + "grad_norm": 3.674166202545166, + "learning_rate": 2.696136529499808e-05, + "loss": 1.9746, + "step": 105885 + }, + { + "epoch": 0.2062245140384563, + "grad_norm": 2.3247997760772705, + "learning_rate": 2.6960534637518687e-05, + "loss": 1.7364, + "step": 105900 + }, + { + "epoch": 0.20625372430956657, + "grad_norm": 2.8081724643707275, + "learning_rate": 2.6959703879317947e-05, + "loss": 1.7246, + "step": 105915 + }, + { + "epoch": 0.2062829345806768, + "grad_norm": 4.3130645751953125, + "learning_rate": 2.695887302040286e-05, + "loss": 1.8875, + "step": 105930 + }, + { + "epoch": 0.2063121448517871, + "grad_norm": 4.337838172912598, + "learning_rate": 2.695804206078042e-05, + "loss": 1.625, + "step": 105945 + }, + { + "epoch": 0.20634135512289734, + "grad_norm": 3.595670461654663, + "learning_rate": 2.6957211000457628e-05, + "loss": 1.6323, + "step": 105960 + }, + { + "epoch": 0.2063705653940076, + "grad_norm": 4.434267044067383, + "learning_rate": 2.6956379839441483e-05, + "loss": 2.0782, + "step": 105975 + }, + { + "epoch": 0.20639977566511788, + "grad_norm": 4.447153091430664, + "learning_rate": 2.695554857773898e-05, + "loss": 1.7594, + "step": 105990 + }, + { + "epoch": 0.20642898593622813, + "grad_norm": 3.490856170654297, + "learning_rate": 2.695471721535712e-05, + "loss": 1.9743, + "step": 106005 + }, + { + "epoch": 0.2064581962073384, + "grad_norm": 5.6722092628479, + "learning_rate": 2.6953885752302905e-05, + "loss": 1.992, + "step": 106020 + }, + { + "epoch": 0.20648740647844865, + "grad_norm": 2.944232702255249, + "learning_rate": 2.6953054188583336e-05, + "loss": 1.8636, + "step": 106035 + }, + { + "epoch": 0.20651661674955893, + "grad_norm": 2.4629054069519043, + "learning_rate": 2.6952222524205414e-05, + "loss": 1.8827, + "step": 106050 + }, + { + "epoch": 0.20654582702066918, + "grad_norm": 3.015078544616699, + "learning_rate": 2.6951390759176152e-05, + "loss": 1.8345, + "step": 106065 + }, + { + "epoch": 0.20657503729177945, + "grad_norm": 2.5578644275665283, + "learning_rate": 2.695055889350254e-05, + "loss": 1.8265, + "step": 106080 + }, + { + "epoch": 0.20660424756288973, + "grad_norm": 3.5897505283355713, + "learning_rate": 2.6949726927191597e-05, + "loss": 1.9626, + "step": 106095 + }, + { + "epoch": 0.20663345783399997, + "grad_norm": 2.7222673892974854, + "learning_rate": 2.694889486025032e-05, + "loss": 2.023, + "step": 106110 + }, + { + "epoch": 0.20666266810511025, + "grad_norm": 2.735933780670166, + "learning_rate": 2.694806269268572e-05, + "loss": 1.9089, + "step": 106125 + }, + { + "epoch": 0.2066918783762205, + "grad_norm": 3.455470561981201, + "learning_rate": 2.6947230424504802e-05, + "loss": 1.9776, + "step": 106140 + }, + { + "epoch": 0.20672108864733077, + "grad_norm": 3.8460419178009033, + "learning_rate": 2.694639805571458e-05, + "loss": 1.8384, + "step": 106155 + }, + { + "epoch": 0.20675029891844102, + "grad_norm": 4.549134254455566, + "learning_rate": 2.6945565586322053e-05, + "loss": 2.0014, + "step": 106170 + }, + { + "epoch": 0.2067795091895513, + "grad_norm": 2.118866205215454, + "learning_rate": 2.6944733016334244e-05, + "loss": 1.7076, + "step": 106185 + }, + { + "epoch": 0.20680871946066157, + "grad_norm": 2.973477840423584, + "learning_rate": 2.694390034575816e-05, + "loss": 1.8859, + "step": 106200 + }, + { + "epoch": 0.2068379297317718, + "grad_norm": 3.5064425468444824, + "learning_rate": 2.6943067574600805e-05, + "loss": 1.7944, + "step": 106215 + }, + { + "epoch": 0.2068671400028821, + "grad_norm": 2.8030622005462646, + "learning_rate": 2.69422347028692e-05, + "loss": 2.0078, + "step": 106230 + }, + { + "epoch": 0.20689635027399234, + "grad_norm": 2.5658998489379883, + "learning_rate": 2.694140173057036e-05, + "loss": 1.9043, + "step": 106245 + }, + { + "epoch": 0.2069255605451026, + "grad_norm": 1.9686204195022583, + "learning_rate": 2.6940568657711295e-05, + "loss": 1.7725, + "step": 106260 + }, + { + "epoch": 0.20695477081621286, + "grad_norm": 2.970914363861084, + "learning_rate": 2.693973548429902e-05, + "loss": 1.933, + "step": 106275 + }, + { + "epoch": 0.20698398108732313, + "grad_norm": 5.538846015930176, + "learning_rate": 2.6938902210340554e-05, + "loss": 1.8968, + "step": 106290 + }, + { + "epoch": 0.2070131913584334, + "grad_norm": 3.860184907913208, + "learning_rate": 2.6938068835842918e-05, + "loss": 1.8522, + "step": 106305 + }, + { + "epoch": 0.20704240162954365, + "grad_norm": 3.534608840942383, + "learning_rate": 2.6937235360813117e-05, + "loss": 1.9046, + "step": 106320 + }, + { + "epoch": 0.20707161190065393, + "grad_norm": 4.0080695152282715, + "learning_rate": 2.6936401785258185e-05, + "loss": 1.7485, + "step": 106335 + }, + { + "epoch": 0.20710082217176418, + "grad_norm": 3.2716498374938965, + "learning_rate": 2.6935568109185135e-05, + "loss": 1.8007, + "step": 106350 + }, + { + "epoch": 0.20713003244287445, + "grad_norm": 5.6668620109558105, + "learning_rate": 2.6934734332600983e-05, + "loss": 1.8513, + "step": 106365 + }, + { + "epoch": 0.2071592427139847, + "grad_norm": 3.56961989402771, + "learning_rate": 2.6933900455512757e-05, + "loss": 1.753, + "step": 106380 + }, + { + "epoch": 0.20718845298509497, + "grad_norm": 3.086578369140625, + "learning_rate": 2.6933066477927476e-05, + "loss": 1.9384, + "step": 106395 + }, + { + "epoch": 0.20721766325620525, + "grad_norm": 6.173356056213379, + "learning_rate": 2.6932232399852162e-05, + "loss": 1.8772, + "step": 106410 + }, + { + "epoch": 0.2072468735273155, + "grad_norm": 2.9165964126586914, + "learning_rate": 2.693139822129384e-05, + "loss": 1.8219, + "step": 106425 + }, + { + "epoch": 0.20727608379842577, + "grad_norm": 2.556832790374756, + "learning_rate": 2.6930563942259544e-05, + "loss": 1.8664, + "step": 106440 + }, + { + "epoch": 0.20730529406953602, + "grad_norm": 3.238513469696045, + "learning_rate": 2.692972956275628e-05, + "loss": 1.8597, + "step": 106455 + }, + { + "epoch": 0.2073345043406463, + "grad_norm": 2.1744565963745117, + "learning_rate": 2.6928895082791093e-05, + "loss": 1.7672, + "step": 106470 + }, + { + "epoch": 0.20736371461175654, + "grad_norm": 2.2890548706054688, + "learning_rate": 2.6928060502371e-05, + "loss": 1.8973, + "step": 106485 + }, + { + "epoch": 0.2073929248828668, + "grad_norm": 3.197258949279785, + "learning_rate": 2.6927225821503032e-05, + "loss": 1.9047, + "step": 106500 + }, + { + "epoch": 0.2074221351539771, + "grad_norm": 2.8001456260681152, + "learning_rate": 2.692639104019422e-05, + "loss": 1.912, + "step": 106515 + }, + { + "epoch": 0.20745134542508734, + "grad_norm": 2.3946948051452637, + "learning_rate": 2.692555615845159e-05, + "loss": 1.9169, + "step": 106530 + }, + { + "epoch": 0.2074805556961976, + "grad_norm": 3.0751821994781494, + "learning_rate": 2.6924721176282173e-05, + "loss": 1.9027, + "step": 106545 + }, + { + "epoch": 0.20750976596730786, + "grad_norm": 3.0150632858276367, + "learning_rate": 2.6923886093693008e-05, + "loss": 1.8933, + "step": 106560 + }, + { + "epoch": 0.20753897623841813, + "grad_norm": 4.3918867111206055, + "learning_rate": 2.692305091069111e-05, + "loss": 1.8474, + "step": 106575 + }, + { + "epoch": 0.20756818650952838, + "grad_norm": 3.9623892307281494, + "learning_rate": 2.6922215627283533e-05, + "loss": 1.7726, + "step": 106590 + }, + { + "epoch": 0.20759739678063865, + "grad_norm": 2.7390170097351074, + "learning_rate": 2.6921380243477298e-05, + "loss": 1.8701, + "step": 106605 + }, + { + "epoch": 0.20762660705174893, + "grad_norm": 4.33473539352417, + "learning_rate": 2.692054475927944e-05, + "loss": 1.9438, + "step": 106620 + }, + { + "epoch": 0.20765581732285918, + "grad_norm": 3.3458847999572754, + "learning_rate": 2.6919709174697002e-05, + "loss": 1.8969, + "step": 106635 + }, + { + "epoch": 0.20768502759396945, + "grad_norm": 2.873314380645752, + "learning_rate": 2.6918873489737015e-05, + "loss": 1.9334, + "step": 106650 + }, + { + "epoch": 0.2077142378650797, + "grad_norm": 3.013016700744629, + "learning_rate": 2.691803770440652e-05, + "loss": 1.9636, + "step": 106665 + }, + { + "epoch": 0.20774344813618997, + "grad_norm": 2.7383344173431396, + "learning_rate": 2.6917201818712556e-05, + "loss": 2.1135, + "step": 106680 + }, + { + "epoch": 0.20777265840730022, + "grad_norm": 2.9652326107025146, + "learning_rate": 2.6916365832662152e-05, + "loss": 1.8404, + "step": 106695 + }, + { + "epoch": 0.2078018686784105, + "grad_norm": 4.333414077758789, + "learning_rate": 2.691552974626236e-05, + "loss": 1.7716, + "step": 106710 + }, + { + "epoch": 0.20783107894952077, + "grad_norm": 4.0966796875, + "learning_rate": 2.6914693559520216e-05, + "loss": 1.8956, + "step": 106725 + }, + { + "epoch": 0.20786028922063102, + "grad_norm": 2.2255566120147705, + "learning_rate": 2.6913857272442763e-05, + "loss": 2.0079, + "step": 106740 + }, + { + "epoch": 0.2078894994917413, + "grad_norm": 3.1733102798461914, + "learning_rate": 2.6913020885037035e-05, + "loss": 1.9169, + "step": 106755 + }, + { + "epoch": 0.20791870976285154, + "grad_norm": 2.1505401134490967, + "learning_rate": 2.691218439731009e-05, + "loss": 1.9827, + "step": 106770 + }, + { + "epoch": 0.20794792003396181, + "grad_norm": 2.238618850708008, + "learning_rate": 2.691134780926896e-05, + "loss": 1.8806, + "step": 106785 + }, + { + "epoch": 0.20797713030507206, + "grad_norm": 2.8558506965637207, + "learning_rate": 2.6910511120920702e-05, + "loss": 2.0241, + "step": 106800 + }, + { + "epoch": 0.20800634057618234, + "grad_norm": 5.462651252746582, + "learning_rate": 2.690967433227235e-05, + "loss": 1.9035, + "step": 106815 + }, + { + "epoch": 0.2080355508472926, + "grad_norm": 3.4808349609375, + "learning_rate": 2.6908837443330956e-05, + "loss": 1.7569, + "step": 106830 + }, + { + "epoch": 0.20806476111840286, + "grad_norm": 4.48228645324707, + "learning_rate": 2.6908000454103562e-05, + "loss": 1.8084, + "step": 106845 + }, + { + "epoch": 0.20809397138951313, + "grad_norm": 5.470156192779541, + "learning_rate": 2.690716336459723e-05, + "loss": 2.0153, + "step": 106860 + }, + { + "epoch": 0.20812318166062338, + "grad_norm": 3.2285077571868896, + "learning_rate": 2.6906326174818996e-05, + "loss": 1.9555, + "step": 106875 + }, + { + "epoch": 0.20815239193173365, + "grad_norm": 2.42991042137146, + "learning_rate": 2.6905488884775914e-05, + "loss": 1.8748, + "step": 106890 + }, + { + "epoch": 0.2081816022028439, + "grad_norm": 2.36328387260437, + "learning_rate": 2.6904651494475035e-05, + "loss": 1.7416, + "step": 106905 + }, + { + "epoch": 0.20821081247395418, + "grad_norm": 3.36490797996521, + "learning_rate": 2.6903814003923416e-05, + "loss": 2.0952, + "step": 106920 + }, + { + "epoch": 0.20824002274506445, + "grad_norm": 5.104228496551514, + "learning_rate": 2.69029764131281e-05, + "loss": 1.8224, + "step": 106935 + }, + { + "epoch": 0.2082692330161747, + "grad_norm": 2.934242010116577, + "learning_rate": 2.6902138722096152e-05, + "loss": 1.9987, + "step": 106950 + }, + { + "epoch": 0.20829844328728497, + "grad_norm": 4.109807968139648, + "learning_rate": 2.6901300930834612e-05, + "loss": 1.873, + "step": 106965 + }, + { + "epoch": 0.20832765355839522, + "grad_norm": 3.8005120754241943, + "learning_rate": 2.690046303935055e-05, + "loss": 1.9187, + "step": 106980 + }, + { + "epoch": 0.2083568638295055, + "grad_norm": 3.5435643196105957, + "learning_rate": 2.6899625047651008e-05, + "loss": 1.7097, + "step": 106995 + }, + { + "epoch": 0.20838607410061574, + "grad_norm": 2.54154896736145, + "learning_rate": 2.6898786955743055e-05, + "loss": 2.0288, + "step": 107010 + }, + { + "epoch": 0.20841528437172602, + "grad_norm": 4.690099716186523, + "learning_rate": 2.6897948763633748e-05, + "loss": 1.7734, + "step": 107025 + }, + { + "epoch": 0.2084444946428363, + "grad_norm": 4.570140838623047, + "learning_rate": 2.689711047133013e-05, + "loss": 1.8931, + "step": 107040 + }, + { + "epoch": 0.20847370491394654, + "grad_norm": 4.336936950683594, + "learning_rate": 2.6896272078839278e-05, + "loss": 1.6671, + "step": 107055 + }, + { + "epoch": 0.20850291518505681, + "grad_norm": 3.243089437484741, + "learning_rate": 2.6895433586168247e-05, + "loss": 1.8534, + "step": 107070 + }, + { + "epoch": 0.20853212545616706, + "grad_norm": 3.6152029037475586, + "learning_rate": 2.6894594993324094e-05, + "loss": 1.7418, + "step": 107085 + }, + { + "epoch": 0.20856133572727734, + "grad_norm": 3.026970863342285, + "learning_rate": 2.6893756300313878e-05, + "loss": 1.8205, + "step": 107100 + }, + { + "epoch": 0.20859054599838758, + "grad_norm": 4.720921993255615, + "learning_rate": 2.6892917507144678e-05, + "loss": 1.813, + "step": 107115 + }, + { + "epoch": 0.20861975626949786, + "grad_norm": 2.6896779537200928, + "learning_rate": 2.689207861382354e-05, + "loss": 1.9657, + "step": 107130 + }, + { + "epoch": 0.2086489665406081, + "grad_norm": 4.970309257507324, + "learning_rate": 2.689123962035754e-05, + "loss": 1.8036, + "step": 107145 + }, + { + "epoch": 0.20867817681171838, + "grad_norm": 3.1065382957458496, + "learning_rate": 2.6890400526753733e-05, + "loss": 1.9866, + "step": 107160 + }, + { + "epoch": 0.20870738708282865, + "grad_norm": 3.377146005630493, + "learning_rate": 2.6889561333019197e-05, + "loss": 1.8573, + "step": 107175 + }, + { + "epoch": 0.2087365973539389, + "grad_norm": 2.895570993423462, + "learning_rate": 2.6888722039160992e-05, + "loss": 1.7443, + "step": 107190 + }, + { + "epoch": 0.20876580762504918, + "grad_norm": 4.223216533660889, + "learning_rate": 2.6887882645186183e-05, + "loss": 2.0703, + "step": 107205 + }, + { + "epoch": 0.20879501789615942, + "grad_norm": 3.10441517829895, + "learning_rate": 2.688704315110184e-05, + "loss": 1.796, + "step": 107220 + }, + { + "epoch": 0.2088242281672697, + "grad_norm": 2.790377378463745, + "learning_rate": 2.688620355691504e-05, + "loss": 2.042, + "step": 107235 + }, + { + "epoch": 0.20885343843837995, + "grad_norm": 2.9824726581573486, + "learning_rate": 2.688536386263285e-05, + "loss": 1.772, + "step": 107250 + }, + { + "epoch": 0.20888264870949022, + "grad_norm": 2.572432279586792, + "learning_rate": 2.6884524068262333e-05, + "loss": 2.0062, + "step": 107265 + }, + { + "epoch": 0.2089118589806005, + "grad_norm": 2.3125884532928467, + "learning_rate": 2.688368417381057e-05, + "loss": 1.9295, + "step": 107280 + }, + { + "epoch": 0.20894106925171074, + "grad_norm": 2.7519984245300293, + "learning_rate": 2.6882844179284633e-05, + "loss": 1.8322, + "step": 107295 + }, + { + "epoch": 0.20897027952282102, + "grad_norm": 2.9791460037231445, + "learning_rate": 2.6882004084691598e-05, + "loss": 1.6016, + "step": 107310 + }, + { + "epoch": 0.20899948979393126, + "grad_norm": 2.9997756481170654, + "learning_rate": 2.6881163890038526e-05, + "loss": 1.874, + "step": 107325 + }, + { + "epoch": 0.20902870006504154, + "grad_norm": 3.4577865600585938, + "learning_rate": 2.6880323595332503e-05, + "loss": 1.8632, + "step": 107340 + }, + { + "epoch": 0.2090579103361518, + "grad_norm": 5.106228828430176, + "learning_rate": 2.6879483200580613e-05, + "loss": 1.8564, + "step": 107355 + }, + { + "epoch": 0.20908712060726206, + "grad_norm": 3.4148411750793457, + "learning_rate": 2.6878642705789914e-05, + "loss": 1.9851, + "step": 107370 + }, + { + "epoch": 0.20911633087837234, + "grad_norm": 2.0911991596221924, + "learning_rate": 2.6877802110967497e-05, + "loss": 1.7245, + "step": 107385 + }, + { + "epoch": 0.20914554114948258, + "grad_norm": 3.9117510318756104, + "learning_rate": 2.687696141612044e-05, + "loss": 2.0586, + "step": 107400 + }, + { + "epoch": 0.20917475142059286, + "grad_norm": 3.0696544647216797, + "learning_rate": 2.687612062125582e-05, + "loss": 2.0095, + "step": 107415 + }, + { + "epoch": 0.2092039616917031, + "grad_norm": 4.36345100402832, + "learning_rate": 2.6875279726380716e-05, + "loss": 1.9072, + "step": 107430 + }, + { + "epoch": 0.20923317196281338, + "grad_norm": 6.023902416229248, + "learning_rate": 2.6874438731502212e-05, + "loss": 2.0951, + "step": 107445 + }, + { + "epoch": 0.20926238223392363, + "grad_norm": 2.545959711074829, + "learning_rate": 2.6873597636627393e-05, + "loss": 1.8624, + "step": 107460 + }, + { + "epoch": 0.2092915925050339, + "grad_norm": 4.882541179656982, + "learning_rate": 2.6872756441763336e-05, + "loss": 1.8944, + "step": 107475 + }, + { + "epoch": 0.20932080277614418, + "grad_norm": 3.6726503372192383, + "learning_rate": 2.6871915146917126e-05, + "loss": 1.7217, + "step": 107490 + }, + { + "epoch": 0.20935001304725442, + "grad_norm": 3.12795352935791, + "learning_rate": 2.687107375209585e-05, + "loss": 1.8234, + "step": 107505 + }, + { + "epoch": 0.2093792233183647, + "grad_norm": 4.487453460693359, + "learning_rate": 2.6870232257306593e-05, + "loss": 1.6221, + "step": 107520 + }, + { + "epoch": 0.20940843358947495, + "grad_norm": 2.4420924186706543, + "learning_rate": 2.6869390662556435e-05, + "loss": 1.9015, + "step": 107535 + }, + { + "epoch": 0.20943764386058522, + "grad_norm": 2.5996503829956055, + "learning_rate": 2.6868548967852477e-05, + "loss": 1.9223, + "step": 107550 + }, + { + "epoch": 0.20946685413169547, + "grad_norm": 5.126166343688965, + "learning_rate": 2.6867707173201798e-05, + "loss": 1.924, + "step": 107565 + }, + { + "epoch": 0.20949606440280574, + "grad_norm": 4.093094348907471, + "learning_rate": 2.686686527861148e-05, + "loss": 1.7501, + "step": 107580 + }, + { + "epoch": 0.20952527467391602, + "grad_norm": 3.0408244132995605, + "learning_rate": 2.686602328408863e-05, + "loss": 2.1084, + "step": 107595 + }, + { + "epoch": 0.20955448494502626, + "grad_norm": 4.517397403717041, + "learning_rate": 2.6865181189640326e-05, + "loss": 1.8813, + "step": 107610 + }, + { + "epoch": 0.20958369521613654, + "grad_norm": 1.9934262037277222, + "learning_rate": 2.6864338995273657e-05, + "loss": 1.9927, + "step": 107625 + }, + { + "epoch": 0.2096129054872468, + "grad_norm": 3.2410237789154053, + "learning_rate": 2.6863496700995726e-05, + "loss": 2.1794, + "step": 107640 + }, + { + "epoch": 0.20964211575835706, + "grad_norm": 2.6370344161987305, + "learning_rate": 2.6862654306813616e-05, + "loss": 1.9952, + "step": 107655 + }, + { + "epoch": 0.2096713260294673, + "grad_norm": 4.452397346496582, + "learning_rate": 2.686181181273443e-05, + "loss": 2.1973, + "step": 107670 + }, + { + "epoch": 0.20970053630057758, + "grad_norm": 2.5814452171325684, + "learning_rate": 2.686096921876526e-05, + "loss": 1.9822, + "step": 107685 + }, + { + "epoch": 0.20972974657168786, + "grad_norm": 2.0772507190704346, + "learning_rate": 2.6860126524913197e-05, + "loss": 1.8305, + "step": 107700 + }, + { + "epoch": 0.2097589568427981, + "grad_norm": 7.836738109588623, + "learning_rate": 2.685928373118534e-05, + "loss": 1.8655, + "step": 107715 + }, + { + "epoch": 0.20978816711390838, + "grad_norm": 2.5652856826782227, + "learning_rate": 2.6858440837588787e-05, + "loss": 2.1742, + "step": 107730 + }, + { + "epoch": 0.20981737738501863, + "grad_norm": 2.5728862285614014, + "learning_rate": 2.6857597844130637e-05, + "loss": 1.9772, + "step": 107745 + }, + { + "epoch": 0.2098465876561289, + "grad_norm": 3.747710704803467, + "learning_rate": 2.6856754750817988e-05, + "loss": 1.9774, + "step": 107760 + }, + { + "epoch": 0.20987579792723915, + "grad_norm": 2.4199023246765137, + "learning_rate": 2.685591155765794e-05, + "loss": 1.7714, + "step": 107775 + }, + { + "epoch": 0.20990500819834942, + "grad_norm": 3.970203399658203, + "learning_rate": 2.685506826465759e-05, + "loss": 1.5807, + "step": 107790 + }, + { + "epoch": 0.2099342184694597, + "grad_norm": 2.7192485332489014, + "learning_rate": 2.6854224871824047e-05, + "loss": 1.7476, + "step": 107805 + }, + { + "epoch": 0.20996342874056995, + "grad_norm": 3.78772234916687, + "learning_rate": 2.685338137916441e-05, + "loss": 1.886, + "step": 107820 + }, + { + "epoch": 0.20999263901168022, + "grad_norm": 4.273651599884033, + "learning_rate": 2.6852537786685775e-05, + "loss": 1.8407, + "step": 107835 + }, + { + "epoch": 0.21002184928279047, + "grad_norm": 4.023941516876221, + "learning_rate": 2.685169409439526e-05, + "loss": 1.9216, + "step": 107850 + }, + { + "epoch": 0.21005105955390074, + "grad_norm": 3.0339765548706055, + "learning_rate": 2.6850850302299954e-05, + "loss": 1.7766, + "step": 107865 + }, + { + "epoch": 0.210080269825011, + "grad_norm": 3.361754894256592, + "learning_rate": 2.6850006410406977e-05, + "loss": 1.9329, + "step": 107880 + }, + { + "epoch": 0.21010948009612126, + "grad_norm": 2.449267864227295, + "learning_rate": 2.6849162418723426e-05, + "loss": 1.8739, + "step": 107895 + }, + { + "epoch": 0.21013869036723154, + "grad_norm": 3.6207408905029297, + "learning_rate": 2.684831832725641e-05, + "loss": 2.003, + "step": 107910 + }, + { + "epoch": 0.2101679006383418, + "grad_norm": 3.3550431728363037, + "learning_rate": 2.6847474136013046e-05, + "loss": 1.9521, + "step": 107925 + }, + { + "epoch": 0.21019711090945206, + "grad_norm": 3.2379236221313477, + "learning_rate": 2.684662984500043e-05, + "loss": 1.9683, + "step": 107940 + }, + { + "epoch": 0.2102263211805623, + "grad_norm": 3.8896610736846924, + "learning_rate": 2.6845785454225682e-05, + "loss": 2.064, + "step": 107955 + }, + { + "epoch": 0.21025553145167258, + "grad_norm": 4.033323764801025, + "learning_rate": 2.6844940963695904e-05, + "loss": 1.8029, + "step": 107970 + }, + { + "epoch": 0.21028474172278283, + "grad_norm": 3.5761444568634033, + "learning_rate": 2.6844096373418217e-05, + "loss": 1.863, + "step": 107985 + }, + { + "epoch": 0.2103139519938931, + "grad_norm": 2.495152235031128, + "learning_rate": 2.6843251683399728e-05, + "loss": 1.776, + "step": 108000 + }, + { + "epoch": 0.21034316226500338, + "grad_norm": 2.161456823348999, + "learning_rate": 2.6842406893647544e-05, + "loss": 2.0229, + "step": 108015 + }, + { + "epoch": 0.21037237253611363, + "grad_norm": 2.982679605484009, + "learning_rate": 2.6841562004168795e-05, + "loss": 1.7819, + "step": 108030 + }, + { + "epoch": 0.2104015828072239, + "grad_norm": 4.42517614364624, + "learning_rate": 2.6840717014970584e-05, + "loss": 1.9854, + "step": 108045 + }, + { + "epoch": 0.21043079307833415, + "grad_norm": 3.315840244293213, + "learning_rate": 2.6839871926060028e-05, + "loss": 1.8822, + "step": 108060 + }, + { + "epoch": 0.21046000334944442, + "grad_norm": 2.20531964302063, + "learning_rate": 2.6839026737444248e-05, + "loss": 1.9339, + "step": 108075 + }, + { + "epoch": 0.21048921362055467, + "grad_norm": 2.6291005611419678, + "learning_rate": 2.6838181449130358e-05, + "loss": 1.9202, + "step": 108090 + }, + { + "epoch": 0.21051842389166495, + "grad_norm": 2.9249215126037598, + "learning_rate": 2.6837336061125475e-05, + "loss": 1.9807, + "step": 108105 + }, + { + "epoch": 0.21054763416277522, + "grad_norm": 4.139854431152344, + "learning_rate": 2.683649057343672e-05, + "loss": 1.8354, + "step": 108120 + }, + { + "epoch": 0.21057684443388547, + "grad_norm": 5.221275329589844, + "learning_rate": 2.6835644986071215e-05, + "loss": 1.9035, + "step": 108135 + }, + { + "epoch": 0.21060605470499574, + "grad_norm": 3.100968360900879, + "learning_rate": 2.6834799299036075e-05, + "loss": 1.7803, + "step": 108150 + }, + { + "epoch": 0.210635264976106, + "grad_norm": 2.1341257095336914, + "learning_rate": 2.6833953512338428e-05, + "loss": 1.7174, + "step": 108165 + }, + { + "epoch": 0.21066447524721627, + "grad_norm": 3.9657297134399414, + "learning_rate": 2.6833107625985396e-05, + "loss": 1.9325, + "step": 108180 + }, + { + "epoch": 0.2106936855183265, + "grad_norm": 4.380135536193848, + "learning_rate": 2.6832261639984102e-05, + "loss": 2.0563, + "step": 108195 + }, + { + "epoch": 0.2107228957894368, + "grad_norm": 2.7756996154785156, + "learning_rate": 2.6831415554341664e-05, + "loss": 1.7455, + "step": 108210 + }, + { + "epoch": 0.21075210606054706, + "grad_norm": 3.910353422164917, + "learning_rate": 2.6830569369065213e-05, + "loss": 2.035, + "step": 108225 + }, + { + "epoch": 0.2107813163316573, + "grad_norm": 3.9532132148742676, + "learning_rate": 2.6829723084161876e-05, + "loss": 1.9168, + "step": 108240 + }, + { + "epoch": 0.21081052660276758, + "grad_norm": 3.0819056034088135, + "learning_rate": 2.6828876699638777e-05, + "loss": 1.9076, + "step": 108255 + }, + { + "epoch": 0.21083973687387783, + "grad_norm": 3.3825392723083496, + "learning_rate": 2.682803021550304e-05, + "loss": 1.7789, + "step": 108270 + }, + { + "epoch": 0.2108689471449881, + "grad_norm": 3.8302061557769775, + "learning_rate": 2.6827183631761797e-05, + "loss": 1.8698, + "step": 108285 + }, + { + "epoch": 0.21089815741609835, + "grad_norm": 3.5132298469543457, + "learning_rate": 2.682633694842218e-05, + "loss": 1.8823, + "step": 108300 + }, + { + "epoch": 0.21092736768720863, + "grad_norm": 4.038513660430908, + "learning_rate": 2.6825490165491314e-05, + "loss": 1.8078, + "step": 108315 + }, + { + "epoch": 0.2109565779583189, + "grad_norm": 3.177614212036133, + "learning_rate": 2.6824643282976335e-05, + "loss": 1.833, + "step": 108330 + }, + { + "epoch": 0.21098578822942915, + "grad_norm": 3.6597914695739746, + "learning_rate": 2.682379630088437e-05, + "loss": 1.9826, + "step": 108345 + }, + { + "epoch": 0.21101499850053942, + "grad_norm": 3.4750118255615234, + "learning_rate": 2.682294921922255e-05, + "loss": 1.9665, + "step": 108360 + }, + { + "epoch": 0.21104420877164967, + "grad_norm": 4.199431896209717, + "learning_rate": 2.6822102037998017e-05, + "loss": 1.9105, + "step": 108375 + }, + { + "epoch": 0.21107341904275995, + "grad_norm": 2.8465330600738525, + "learning_rate": 2.6821254757217898e-05, + "loss": 1.8157, + "step": 108390 + }, + { + "epoch": 0.2111026293138702, + "grad_norm": 2.199895143508911, + "learning_rate": 2.6820407376889333e-05, + "loss": 1.8169, + "step": 108405 + }, + { + "epoch": 0.21113183958498047, + "grad_norm": 2.2706222534179688, + "learning_rate": 2.6819559897019452e-05, + "loss": 2.1085, + "step": 108420 + }, + { + "epoch": 0.21116104985609074, + "grad_norm": 4.982051372528076, + "learning_rate": 2.6818712317615394e-05, + "loss": 1.7786, + "step": 108435 + }, + { + "epoch": 0.211190260127201, + "grad_norm": 4.220059871673584, + "learning_rate": 2.6817864638684297e-05, + "loss": 1.869, + "step": 108450 + }, + { + "epoch": 0.21121947039831127, + "grad_norm": 2.8795595169067383, + "learning_rate": 2.68170168602333e-05, + "loss": 1.9574, + "step": 108465 + }, + { + "epoch": 0.2112486806694215, + "grad_norm": 3.3973727226257324, + "learning_rate": 2.6816168982269548e-05, + "loss": 1.9271, + "step": 108480 + }, + { + "epoch": 0.2112778909405318, + "grad_norm": 3.5588557720184326, + "learning_rate": 2.681532100480017e-05, + "loss": 2.0382, + "step": 108495 + }, + { + "epoch": 0.21130710121164203, + "grad_norm": 2.414324998855591, + "learning_rate": 2.6814472927832313e-05, + "loss": 1.7801, + "step": 108510 + }, + { + "epoch": 0.2113363114827523, + "grad_norm": 3.5288190841674805, + "learning_rate": 2.6813624751373115e-05, + "loss": 2.0411, + "step": 108525 + }, + { + "epoch": 0.21136552175386258, + "grad_norm": 3.6074609756469727, + "learning_rate": 2.6812776475429727e-05, + "loss": 1.8513, + "step": 108540 + }, + { + "epoch": 0.21139473202497283, + "grad_norm": 3.5598526000976562, + "learning_rate": 2.681192810000928e-05, + "loss": 1.9603, + "step": 108555 + }, + { + "epoch": 0.2114239422960831, + "grad_norm": 2.1765358448028564, + "learning_rate": 2.6811079625118932e-05, + "loss": 1.8816, + "step": 108570 + }, + { + "epoch": 0.21145315256719335, + "grad_norm": 2.8299851417541504, + "learning_rate": 2.6810231050765816e-05, + "loss": 1.8713, + "step": 108585 + }, + { + "epoch": 0.21148236283830363, + "grad_norm": 2.900864839553833, + "learning_rate": 2.6809382376957084e-05, + "loss": 1.9728, + "step": 108600 + }, + { + "epoch": 0.21151157310941388, + "grad_norm": 3.055288314819336, + "learning_rate": 2.6808533603699886e-05, + "loss": 1.8344, + "step": 108615 + }, + { + "epoch": 0.21154078338052415, + "grad_norm": 4.657290935516357, + "learning_rate": 2.6807684731001363e-05, + "loss": 1.8555, + "step": 108630 + }, + { + "epoch": 0.21156999365163442, + "grad_norm": 2.888864755630493, + "learning_rate": 2.6806835758868664e-05, + "loss": 2.0353, + "step": 108645 + }, + { + "epoch": 0.21159920392274467, + "grad_norm": 3.8315608501434326, + "learning_rate": 2.6805986687308943e-05, + "loss": 1.9459, + "step": 108660 + }, + { + "epoch": 0.21162841419385495, + "grad_norm": 4.430781364440918, + "learning_rate": 2.680513751632935e-05, + "loss": 1.8654, + "step": 108675 + }, + { + "epoch": 0.2116576244649652, + "grad_norm": 7.0257887840271, + "learning_rate": 2.6804288245937033e-05, + "loss": 1.9782, + "step": 108690 + }, + { + "epoch": 0.21168683473607547, + "grad_norm": 3.767902374267578, + "learning_rate": 2.680343887613914e-05, + "loss": 1.9449, + "step": 108705 + }, + { + "epoch": 0.21171604500718572, + "grad_norm": 3.821397542953491, + "learning_rate": 2.6802589406942834e-05, + "loss": 1.7686, + "step": 108720 + }, + { + "epoch": 0.211745255278296, + "grad_norm": 1.9738311767578125, + "learning_rate": 2.6801739838355255e-05, + "loss": 1.9972, + "step": 108735 + }, + { + "epoch": 0.21177446554940627, + "grad_norm": 4.802661418914795, + "learning_rate": 2.6800890170383573e-05, + "loss": 2.0047, + "step": 108750 + }, + { + "epoch": 0.2118036758205165, + "grad_norm": 2.1181414127349854, + "learning_rate": 2.6800040403034933e-05, + "loss": 1.9206, + "step": 108765 + }, + { + "epoch": 0.2118328860916268, + "grad_norm": 4.816725254058838, + "learning_rate": 2.6799190536316487e-05, + "loss": 1.6738, + "step": 108780 + }, + { + "epoch": 0.21186209636273703, + "grad_norm": 2.4658377170562744, + "learning_rate": 2.6798340570235408e-05, + "loss": 1.9751, + "step": 108795 + }, + { + "epoch": 0.2118913066338473, + "grad_norm": 2.4331045150756836, + "learning_rate": 2.6797490504798834e-05, + "loss": 1.7707, + "step": 108810 + }, + { + "epoch": 0.21192051690495756, + "grad_norm": 2.316786766052246, + "learning_rate": 2.679664034001394e-05, + "loss": 1.9114, + "step": 108825 + }, + { + "epoch": 0.21194972717606783, + "grad_norm": 5.081127166748047, + "learning_rate": 2.6795790075887875e-05, + "loss": 1.88, + "step": 108840 + }, + { + "epoch": 0.2119789374471781, + "grad_norm": 3.3721113204956055, + "learning_rate": 2.6794939712427804e-05, + "loss": 1.9934, + "step": 108855 + }, + { + "epoch": 0.21200814771828835, + "grad_norm": 2.942760944366455, + "learning_rate": 2.6794089249640886e-05, + "loss": 1.7847, + "step": 108870 + }, + { + "epoch": 0.21203735798939863, + "grad_norm": 3.8430488109588623, + "learning_rate": 2.6793238687534285e-05, + "loss": 1.8774, + "step": 108885 + }, + { + "epoch": 0.21206656826050888, + "grad_norm": 3.926471471786499, + "learning_rate": 2.679238802611516e-05, + "loss": 1.8817, + "step": 108900 + }, + { + "epoch": 0.21209577853161915, + "grad_norm": 3.7037887573242188, + "learning_rate": 2.6791537265390676e-05, + "loss": 1.8647, + "step": 108915 + }, + { + "epoch": 0.2121249888027294, + "grad_norm": 4.711849689483643, + "learning_rate": 2.6790686405368006e-05, + "loss": 1.836, + "step": 108930 + }, + { + "epoch": 0.21215419907383967, + "grad_norm": 2.770793914794922, + "learning_rate": 2.6789835446054297e-05, + "loss": 2.0589, + "step": 108945 + }, + { + "epoch": 0.21218340934494995, + "grad_norm": 2.980377674102783, + "learning_rate": 2.6788984387456736e-05, + "loss": 2.1003, + "step": 108960 + }, + { + "epoch": 0.2122126196160602, + "grad_norm": 2.83764386177063, + "learning_rate": 2.6788133229582475e-05, + "loss": 1.7176, + "step": 108975 + }, + { + "epoch": 0.21224182988717047, + "grad_norm": 1.9305791854858398, + "learning_rate": 2.6787281972438684e-05, + "loss": 2.0118, + "step": 108990 + }, + { + "epoch": 0.21227104015828072, + "grad_norm": 3.6987972259521484, + "learning_rate": 2.6786430616032535e-05, + "loss": 1.8129, + "step": 109005 + }, + { + "epoch": 0.212300250429391, + "grad_norm": 3.6342735290527344, + "learning_rate": 2.6785579160371197e-05, + "loss": 1.713, + "step": 109020 + }, + { + "epoch": 0.21232946070050124, + "grad_norm": 2.9494435787200928, + "learning_rate": 2.6784727605461836e-05, + "loss": 1.8563, + "step": 109035 + }, + { + "epoch": 0.2123586709716115, + "grad_norm": 2.458470344543457, + "learning_rate": 2.6783875951311632e-05, + "loss": 1.6347, + "step": 109050 + }, + { + "epoch": 0.2123878812427218, + "grad_norm": 5.270952224731445, + "learning_rate": 2.6783024197927747e-05, + "loss": 2.0101, + "step": 109065 + }, + { + "epoch": 0.21241709151383203, + "grad_norm": 2.424886703491211, + "learning_rate": 2.678217234531736e-05, + "loss": 1.8342, + "step": 109080 + }, + { + "epoch": 0.2124463017849423, + "grad_norm": 5.401460647583008, + "learning_rate": 2.6781320393487643e-05, + "loss": 1.929, + "step": 109095 + }, + { + "epoch": 0.21247551205605256, + "grad_norm": 4.5583648681640625, + "learning_rate": 2.678046834244577e-05, + "loss": 1.8967, + "step": 109110 + }, + { + "epoch": 0.21250472232716283, + "grad_norm": 2.6698524951934814, + "learning_rate": 2.6779616192198912e-05, + "loss": 2.0311, + "step": 109125 + }, + { + "epoch": 0.21253393259827308, + "grad_norm": 2.469175100326538, + "learning_rate": 2.677876394275425e-05, + "loss": 1.9063, + "step": 109140 + }, + { + "epoch": 0.21256314286938335, + "grad_norm": 5.225403308868408, + "learning_rate": 2.6777911594118965e-05, + "loss": 1.7878, + "step": 109155 + }, + { + "epoch": 0.21259235314049363, + "grad_norm": 3.9878222942352295, + "learning_rate": 2.6777059146300228e-05, + "loss": 1.9948, + "step": 109170 + }, + { + "epoch": 0.21262156341160388, + "grad_norm": 3.9668021202087402, + "learning_rate": 2.677620659930522e-05, + "loss": 2.0314, + "step": 109185 + }, + { + "epoch": 0.21265077368271415, + "grad_norm": 1.9054670333862305, + "learning_rate": 2.6775353953141116e-05, + "loss": 1.8134, + "step": 109200 + }, + { + "epoch": 0.2126799839538244, + "grad_norm": 2.9554741382598877, + "learning_rate": 2.6774501207815104e-05, + "loss": 1.7933, + "step": 109215 + }, + { + "epoch": 0.21270919422493467, + "grad_norm": 3.3570075035095215, + "learning_rate": 2.677364836333436e-05, + "loss": 2.0503, + "step": 109230 + }, + { + "epoch": 0.21273840449604492, + "grad_norm": 2.9707770347595215, + "learning_rate": 2.677279541970607e-05, + "loss": 1.8962, + "step": 109245 + }, + { + "epoch": 0.2127676147671552, + "grad_norm": 3.3821027278900146, + "learning_rate": 2.677194237693741e-05, + "loss": 1.8643, + "step": 109260 + }, + { + "epoch": 0.21279682503826544, + "grad_norm": 3.3962464332580566, + "learning_rate": 2.677108923503557e-05, + "loss": 1.9699, + "step": 109275 + }, + { + "epoch": 0.21282603530937572, + "grad_norm": 2.4688611030578613, + "learning_rate": 2.677023599400773e-05, + "loss": 1.8916, + "step": 109290 + }, + { + "epoch": 0.212855245580486, + "grad_norm": 3.4210448265075684, + "learning_rate": 2.6769382653861082e-05, + "loss": 1.819, + "step": 109305 + }, + { + "epoch": 0.21288445585159624, + "grad_norm": 2.842135429382324, + "learning_rate": 2.6768529214602803e-05, + "loss": 1.9127, + "step": 109320 + }, + { + "epoch": 0.2129136661227065, + "grad_norm": 3.7058370113372803, + "learning_rate": 2.6767675676240083e-05, + "loss": 2.0038, + "step": 109335 + }, + { + "epoch": 0.21294287639381676, + "grad_norm": 3.374720335006714, + "learning_rate": 2.6766822038780115e-05, + "loss": 2.0714, + "step": 109350 + }, + { + "epoch": 0.21297208666492703, + "grad_norm": 3.429506778717041, + "learning_rate": 2.676596830223008e-05, + "loss": 1.8986, + "step": 109365 + }, + { + "epoch": 0.21300129693603728, + "grad_norm": 2.2242162227630615, + "learning_rate": 2.6765114466597176e-05, + "loss": 1.7431, + "step": 109380 + }, + { + "epoch": 0.21303050720714756, + "grad_norm": 3.1171090602874756, + "learning_rate": 2.6764260531888584e-05, + "loss": 2.0133, + "step": 109395 + }, + { + "epoch": 0.21305971747825783, + "grad_norm": 5.756847381591797, + "learning_rate": 2.6763406498111502e-05, + "loss": 1.8746, + "step": 109410 + }, + { + "epoch": 0.21308892774936808, + "grad_norm": 3.6694235801696777, + "learning_rate": 2.6762552365273118e-05, + "loss": 1.9028, + "step": 109425 + }, + { + "epoch": 0.21311813802047835, + "grad_norm": 2.514770269393921, + "learning_rate": 2.6761698133380628e-05, + "loss": 1.9104, + "step": 109440 + }, + { + "epoch": 0.2131473482915886, + "grad_norm": 2.634536027908325, + "learning_rate": 2.676084380244122e-05, + "loss": 1.8284, + "step": 109455 + }, + { + "epoch": 0.21317655856269888, + "grad_norm": 2.7625274658203125, + "learning_rate": 2.6759989372462096e-05, + "loss": 1.7271, + "step": 109470 + }, + { + "epoch": 0.21320576883380912, + "grad_norm": 4.472266674041748, + "learning_rate": 2.6759134843450444e-05, + "loss": 1.7719, + "step": 109485 + }, + { + "epoch": 0.2132349791049194, + "grad_norm": 3.3584632873535156, + "learning_rate": 2.6758280215413468e-05, + "loss": 1.9736, + "step": 109500 + }, + { + "epoch": 0.21326418937602967, + "grad_norm": 2.929657220840454, + "learning_rate": 2.6757425488358358e-05, + "loss": 2.0031, + "step": 109515 + }, + { + "epoch": 0.21329339964713992, + "grad_norm": 2.7919819355010986, + "learning_rate": 2.6756570662292312e-05, + "loss": 1.8106, + "step": 109530 + }, + { + "epoch": 0.2133226099182502, + "grad_norm": 4.363703727722168, + "learning_rate": 2.6755715737222535e-05, + "loss": 1.8807, + "step": 109545 + }, + { + "epoch": 0.21335182018936044, + "grad_norm": 4.117763519287109, + "learning_rate": 2.675486071315622e-05, + "loss": 1.8795, + "step": 109560 + }, + { + "epoch": 0.21338103046047072, + "grad_norm": 4.1320109367370605, + "learning_rate": 2.6754005590100575e-05, + "loss": 1.8178, + "step": 109575 + }, + { + "epoch": 0.21341024073158096, + "grad_norm": 4.35907506942749, + "learning_rate": 2.675315036806279e-05, + "loss": 2.0663, + "step": 109590 + }, + { + "epoch": 0.21343945100269124, + "grad_norm": 4.172837734222412, + "learning_rate": 2.6752295047050072e-05, + "loss": 2.0024, + "step": 109605 + }, + { + "epoch": 0.2134686612738015, + "grad_norm": 3.445608139038086, + "learning_rate": 2.6751439627069627e-05, + "loss": 1.9501, + "step": 109620 + }, + { + "epoch": 0.21349787154491176, + "grad_norm": 3.9572298526763916, + "learning_rate": 2.675058410812866e-05, + "loss": 1.9268, + "step": 109635 + }, + { + "epoch": 0.21352708181602204, + "grad_norm": 2.6385724544525146, + "learning_rate": 2.674972849023436e-05, + "loss": 1.9157, + "step": 109650 + }, + { + "epoch": 0.21355629208713228, + "grad_norm": 3.2229907512664795, + "learning_rate": 2.6748872773393957e-05, + "loss": 1.9078, + "step": 109665 + }, + { + "epoch": 0.21358550235824256, + "grad_norm": 4.05169677734375, + "learning_rate": 2.674801695761464e-05, + "loss": 1.953, + "step": 109680 + }, + { + "epoch": 0.2136147126293528, + "grad_norm": 2.2267520427703857, + "learning_rate": 2.6747161042903615e-05, + "loss": 1.8417, + "step": 109695 + }, + { + "epoch": 0.21364392290046308, + "grad_norm": 2.2932324409484863, + "learning_rate": 2.6746305029268097e-05, + "loss": 2.0154, + "step": 109710 + }, + { + "epoch": 0.21367313317157335, + "grad_norm": 2.482072591781616, + "learning_rate": 2.6745448916715293e-05, + "loss": 1.845, + "step": 109725 + }, + { + "epoch": 0.2137023434426836, + "grad_norm": 2.7813568115234375, + "learning_rate": 2.674459270525241e-05, + "loss": 1.9386, + "step": 109740 + }, + { + "epoch": 0.21373155371379388, + "grad_norm": 2.9583349227905273, + "learning_rate": 2.6743736394886666e-05, + "loss": 1.8048, + "step": 109755 + }, + { + "epoch": 0.21376076398490412, + "grad_norm": 2.5913519859313965, + "learning_rate": 2.674287998562526e-05, + "loss": 1.9283, + "step": 109770 + }, + { + "epoch": 0.2137899742560144, + "grad_norm": 2.864767074584961, + "learning_rate": 2.6742023477475412e-05, + "loss": 1.7622, + "step": 109785 + }, + { + "epoch": 0.21381918452712464, + "grad_norm": 3.2527997493743896, + "learning_rate": 2.6741166870444336e-05, + "loss": 1.8105, + "step": 109800 + }, + { + "epoch": 0.21384839479823492, + "grad_norm": 3.5855205059051514, + "learning_rate": 2.674031016453924e-05, + "loss": 2.0207, + "step": 109815 + }, + { + "epoch": 0.2138776050693452, + "grad_norm": 2.5562195777893066, + "learning_rate": 2.6739453359767336e-05, + "loss": 1.9991, + "step": 109830 + }, + { + "epoch": 0.21390681534045544, + "grad_norm": 2.263134002685547, + "learning_rate": 2.673859645613585e-05, + "loss": 1.899, + "step": 109845 + }, + { + "epoch": 0.21393602561156572, + "grad_norm": 3.828584671020508, + "learning_rate": 2.673773945365199e-05, + "loss": 1.748, + "step": 109860 + }, + { + "epoch": 0.21396523588267596, + "grad_norm": 2.9502933025360107, + "learning_rate": 2.6736882352322977e-05, + "loss": 1.848, + "step": 109875 + }, + { + "epoch": 0.21399444615378624, + "grad_norm": 1.925807237625122, + "learning_rate": 2.6736025152156026e-05, + "loss": 1.7757, + "step": 109890 + }, + { + "epoch": 0.21402365642489649, + "grad_norm": 3.3970282077789307, + "learning_rate": 2.6735167853158355e-05, + "loss": 1.8596, + "step": 109905 + }, + { + "epoch": 0.21405286669600676, + "grad_norm": 3.7627885341644287, + "learning_rate": 2.6734310455337184e-05, + "loss": 1.7122, + "step": 109920 + }, + { + "epoch": 0.21408207696711704, + "grad_norm": 3.974048614501953, + "learning_rate": 2.6733452958699743e-05, + "loss": 1.9103, + "step": 109935 + }, + { + "epoch": 0.21411128723822728, + "grad_norm": 2.9546279907226562, + "learning_rate": 2.673259536325324e-05, + "loss": 1.8072, + "step": 109950 + }, + { + "epoch": 0.21414049750933756, + "grad_norm": 3.2573819160461426, + "learning_rate": 2.6731737669004895e-05, + "loss": 2.0734, + "step": 109965 + }, + { + "epoch": 0.2141697077804478, + "grad_norm": 5.183135032653809, + "learning_rate": 2.6730879875961942e-05, + "loss": 1.8578, + "step": 109980 + }, + { + "epoch": 0.21419891805155808, + "grad_norm": 3.53238582611084, + "learning_rate": 2.67300219841316e-05, + "loss": 1.7694, + "step": 109995 + }, + { + "epoch": 0.21422812832266833, + "grad_norm": 4.235926151275635, + "learning_rate": 2.6729163993521092e-05, + "loss": 1.7728, + "step": 110010 + }, + { + "epoch": 0.2142573385937786, + "grad_norm": 2.2322680950164795, + "learning_rate": 2.6728305904137642e-05, + "loss": 1.8301, + "step": 110025 + }, + { + "epoch": 0.21428654886488888, + "grad_norm": 2.4045603275299072, + "learning_rate": 2.6727447715988483e-05, + "loss": 1.8877, + "step": 110040 + }, + { + "epoch": 0.21431575913599912, + "grad_norm": 4.33298397064209, + "learning_rate": 2.6726589429080835e-05, + "loss": 1.9082, + "step": 110055 + }, + { + "epoch": 0.2143449694071094, + "grad_norm": 2.5505611896514893, + "learning_rate": 2.672573104342193e-05, + "loss": 1.7464, + "step": 110070 + }, + { + "epoch": 0.21437417967821965, + "grad_norm": 2.6292130947113037, + "learning_rate": 2.6724872559018993e-05, + "loss": 1.821, + "step": 110085 + }, + { + "epoch": 0.21440338994932992, + "grad_norm": 2.199423313140869, + "learning_rate": 2.6724013975879255e-05, + "loss": 1.9598, + "step": 110100 + }, + { + "epoch": 0.21443260022044017, + "grad_norm": 2.397352933883667, + "learning_rate": 2.6723155294009946e-05, + "loss": 1.9664, + "step": 110115 + }, + { + "epoch": 0.21446181049155044, + "grad_norm": 2.7526962757110596, + "learning_rate": 2.67222965134183e-05, + "loss": 1.8335, + "step": 110130 + }, + { + "epoch": 0.21449102076266072, + "grad_norm": 2.473390579223633, + "learning_rate": 2.6721437634111544e-05, + "loss": 2.0067, + "step": 110145 + }, + { + "epoch": 0.21452023103377096, + "grad_norm": 3.4258782863616943, + "learning_rate": 2.6720578656096914e-05, + "loss": 1.841, + "step": 110160 + }, + { + "epoch": 0.21454944130488124, + "grad_norm": 2.979444980621338, + "learning_rate": 2.6719719579381642e-05, + "loss": 1.837, + "step": 110175 + }, + { + "epoch": 0.21457865157599149, + "grad_norm": 3.292710304260254, + "learning_rate": 2.6718860403972964e-05, + "loss": 1.7188, + "step": 110190 + }, + { + "epoch": 0.21460786184710176, + "grad_norm": 3.954049825668335, + "learning_rate": 2.6718001129878113e-05, + "loss": 1.7877, + "step": 110205 + }, + { + "epoch": 0.214637072118212, + "grad_norm": 3.0645692348480225, + "learning_rate": 2.671714175710433e-05, + "loss": 2.0567, + "step": 110220 + }, + { + "epoch": 0.21466628238932228, + "grad_norm": 3.34306001663208, + "learning_rate": 2.6716282285658844e-05, + "loss": 1.9755, + "step": 110235 + }, + { + "epoch": 0.21469549266043256, + "grad_norm": 4.299754619598389, + "learning_rate": 2.67154227155489e-05, + "loss": 1.7277, + "step": 110250 + }, + { + "epoch": 0.2147247029315428, + "grad_norm": 3.3302488327026367, + "learning_rate": 2.6714563046781733e-05, + "loss": 1.7959, + "step": 110265 + }, + { + "epoch": 0.21475391320265308, + "grad_norm": 2.934659481048584, + "learning_rate": 2.671370327936458e-05, + "loss": 1.709, + "step": 110280 + }, + { + "epoch": 0.21478312347376333, + "grad_norm": 3.9981272220611572, + "learning_rate": 2.671284341330469e-05, + "loss": 1.697, + "step": 110295 + }, + { + "epoch": 0.2148123337448736, + "grad_norm": 3.4822757244110107, + "learning_rate": 2.671198344860929e-05, + "loss": 1.9373, + "step": 110310 + }, + { + "epoch": 0.21484154401598385, + "grad_norm": 3.239213705062866, + "learning_rate": 2.6711123385285637e-05, + "loss": 1.8117, + "step": 110325 + }, + { + "epoch": 0.21487075428709412, + "grad_norm": 3.415123701095581, + "learning_rate": 2.6710263223340968e-05, + "loss": 1.8672, + "step": 110340 + }, + { + "epoch": 0.2148999645582044, + "grad_norm": 3.2394487857818604, + "learning_rate": 2.6709402962782518e-05, + "loss": 1.8486, + "step": 110355 + }, + { + "epoch": 0.21492917482931465, + "grad_norm": 3.5315041542053223, + "learning_rate": 2.6708542603617547e-05, + "loss": 2.0867, + "step": 110370 + }, + { + "epoch": 0.21495838510042492, + "grad_norm": 2.5273640155792236, + "learning_rate": 2.6707682145853286e-05, + "loss": 1.8051, + "step": 110385 + }, + { + "epoch": 0.21498759537153517, + "grad_norm": 6.845281600952148, + "learning_rate": 2.670682158949699e-05, + "loss": 1.8445, + "step": 110400 + }, + { + "epoch": 0.21501680564264544, + "grad_norm": 2.6418023109436035, + "learning_rate": 2.6705960934555905e-05, + "loss": 1.7114, + "step": 110415 + }, + { + "epoch": 0.2150460159137557, + "grad_norm": 3.6489005088806152, + "learning_rate": 2.670510018103727e-05, + "loss": 1.8847, + "step": 110430 + }, + { + "epoch": 0.21507522618486596, + "grad_norm": 3.236877918243408, + "learning_rate": 2.6704239328948343e-05, + "loss": 1.8001, + "step": 110445 + }, + { + "epoch": 0.21510443645597624, + "grad_norm": 2.829045057296753, + "learning_rate": 2.6703378378296372e-05, + "loss": 1.7185, + "step": 110460 + }, + { + "epoch": 0.21513364672708649, + "grad_norm": 5.080995559692383, + "learning_rate": 2.6702517329088604e-05, + "loss": 1.8949, + "step": 110475 + }, + { + "epoch": 0.21516285699819676, + "grad_norm": 4.0229902267456055, + "learning_rate": 2.6701656181332293e-05, + "loss": 1.8308, + "step": 110490 + }, + { + "epoch": 0.215192067269307, + "grad_norm": 4.355954647064209, + "learning_rate": 2.670079493503469e-05, + "loss": 1.7957, + "step": 110505 + }, + { + "epoch": 0.21522127754041728, + "grad_norm": 5.61885929107666, + "learning_rate": 2.6699933590203046e-05, + "loss": 1.683, + "step": 110520 + }, + { + "epoch": 0.21525048781152753, + "grad_norm": 4.927652359008789, + "learning_rate": 2.6699072146844617e-05, + "loss": 1.8882, + "step": 110535 + }, + { + "epoch": 0.2152796980826378, + "grad_norm": 3.3163411617279053, + "learning_rate": 2.6698210604966654e-05, + "loss": 1.845, + "step": 110550 + }, + { + "epoch": 0.21530890835374808, + "grad_norm": 2.782061815261841, + "learning_rate": 2.6697348964576415e-05, + "loss": 1.6584, + "step": 110565 + }, + { + "epoch": 0.21533811862485833, + "grad_norm": 4.2090535163879395, + "learning_rate": 2.669648722568116e-05, + "loss": 1.7619, + "step": 110580 + }, + { + "epoch": 0.2153673288959686, + "grad_norm": 2.76629638671875, + "learning_rate": 2.6695625388288134e-05, + "loss": 1.9097, + "step": 110595 + }, + { + "epoch": 0.21539653916707885, + "grad_norm": 2.7745659351348877, + "learning_rate": 2.6694763452404603e-05, + "loss": 1.8785, + "step": 110610 + }, + { + "epoch": 0.21542574943818912, + "grad_norm": 2.462768793106079, + "learning_rate": 2.6693901418037825e-05, + "loss": 1.9038, + "step": 110625 + }, + { + "epoch": 0.21545495970929937, + "grad_norm": 3.777122735977173, + "learning_rate": 2.669303928519506e-05, + "loss": 1.7435, + "step": 110640 + }, + { + "epoch": 0.21548416998040965, + "grad_norm": 5.450116157531738, + "learning_rate": 2.6692177053883566e-05, + "loss": 1.9862, + "step": 110655 + }, + { + "epoch": 0.21551338025151992, + "grad_norm": 1.8337889909744263, + "learning_rate": 2.6691314724110602e-05, + "loss": 1.8983, + "step": 110670 + }, + { + "epoch": 0.21554259052263017, + "grad_norm": 2.3487162590026855, + "learning_rate": 2.6690452295883435e-05, + "loss": 1.9866, + "step": 110685 + }, + { + "epoch": 0.21557180079374044, + "grad_norm": 2.318474054336548, + "learning_rate": 2.668958976920932e-05, + "loss": 1.871, + "step": 110700 + }, + { + "epoch": 0.2156010110648507, + "grad_norm": 3.5330381393432617, + "learning_rate": 2.6688727144095535e-05, + "loss": 2.0554, + "step": 110715 + }, + { + "epoch": 0.21563022133596096, + "grad_norm": 1.7995198965072632, + "learning_rate": 2.6687864420549325e-05, + "loss": 1.7337, + "step": 110730 + }, + { + "epoch": 0.2156594316070712, + "grad_norm": 3.413686752319336, + "learning_rate": 2.6687001598577977e-05, + "loss": 2.022, + "step": 110745 + }, + { + "epoch": 0.2156886418781815, + "grad_norm": 4.160714149475098, + "learning_rate": 2.6686138678188733e-05, + "loss": 1.9797, + "step": 110760 + }, + { + "epoch": 0.21571785214929176, + "grad_norm": 2.5363166332244873, + "learning_rate": 2.6685275659388876e-05, + "loss": 2.0251, + "step": 110775 + }, + { + "epoch": 0.215747062420402, + "grad_norm": 4.317180156707764, + "learning_rate": 2.668441254218567e-05, + "loss": 1.9136, + "step": 110790 + }, + { + "epoch": 0.21577627269151228, + "grad_norm": 3.4366073608398438, + "learning_rate": 2.6683549326586377e-05, + "loss": 1.7775, + "step": 110805 + }, + { + "epoch": 0.21580548296262253, + "grad_norm": 3.6357247829437256, + "learning_rate": 2.668268601259828e-05, + "loss": 1.8456, + "step": 110820 + }, + { + "epoch": 0.2158346932337328, + "grad_norm": 3.6708462238311768, + "learning_rate": 2.6681822600228637e-05, + "loss": 1.8805, + "step": 110835 + }, + { + "epoch": 0.21586390350484305, + "grad_norm": 2.6389195919036865, + "learning_rate": 2.6680959089484723e-05, + "loss": 1.7781, + "step": 110850 + }, + { + "epoch": 0.21589311377595333, + "grad_norm": 2.4040143489837646, + "learning_rate": 2.6680095480373813e-05, + "loss": 1.8565, + "step": 110865 + }, + { + "epoch": 0.2159223240470636, + "grad_norm": 3.392080783843994, + "learning_rate": 2.667923177290317e-05, + "loss": 1.9619, + "step": 110880 + }, + { + "epoch": 0.21595153431817385, + "grad_norm": 2.229698896408081, + "learning_rate": 2.667836796708008e-05, + "loss": 1.9621, + "step": 110895 + }, + { + "epoch": 0.21598074458928412, + "grad_norm": 2.37629771232605, + "learning_rate": 2.6677504062911808e-05, + "loss": 1.9364, + "step": 110910 + }, + { + "epoch": 0.21600995486039437, + "grad_norm": 2.787151575088501, + "learning_rate": 2.667664006040563e-05, + "loss": 1.9532, + "step": 110925 + }, + { + "epoch": 0.21603916513150465, + "grad_norm": 2.8584609031677246, + "learning_rate": 2.6675775959568828e-05, + "loss": 1.9188, + "step": 110940 + }, + { + "epoch": 0.2160683754026149, + "grad_norm": 3.3912649154663086, + "learning_rate": 2.667491176040867e-05, + "loss": 1.9029, + "step": 110955 + }, + { + "epoch": 0.21609758567372517, + "grad_norm": 2.103898048400879, + "learning_rate": 2.6674047462932438e-05, + "loss": 1.8699, + "step": 110970 + }, + { + "epoch": 0.21612679594483544, + "grad_norm": 2.6994028091430664, + "learning_rate": 2.6673183067147416e-05, + "loss": 1.9353, + "step": 110985 + }, + { + "epoch": 0.2161560062159457, + "grad_norm": 3.1044652462005615, + "learning_rate": 2.6672318573060874e-05, + "loss": 1.8657, + "step": 111000 + }, + { + "epoch": 0.21618521648705596, + "grad_norm": 3.8638501167297363, + "learning_rate": 2.6671453980680094e-05, + "loss": 1.6295, + "step": 111015 + }, + { + "epoch": 0.2162144267581662, + "grad_norm": 2.773092746734619, + "learning_rate": 2.667058929001236e-05, + "loss": 2.1069, + "step": 111030 + }, + { + "epoch": 0.2162436370292765, + "grad_norm": 2.0622265338897705, + "learning_rate": 2.6669724501064954e-05, + "loss": 2.0181, + "step": 111045 + }, + { + "epoch": 0.21627284730038673, + "grad_norm": 2.7382068634033203, + "learning_rate": 2.6668859613845152e-05, + "loss": 1.9468, + "step": 111060 + }, + { + "epoch": 0.216302057571497, + "grad_norm": 3.0110533237457275, + "learning_rate": 2.6667994628360248e-05, + "loss": 1.6142, + "step": 111075 + }, + { + "epoch": 0.21633126784260728, + "grad_norm": 2.147174835205078, + "learning_rate": 2.6667129544617513e-05, + "loss": 1.9232, + "step": 111090 + }, + { + "epoch": 0.21636047811371753, + "grad_norm": 3.4371132850646973, + "learning_rate": 2.6666264362624245e-05, + "loss": 1.8315, + "step": 111105 + }, + { + "epoch": 0.2163896883848278, + "grad_norm": 2.041501522064209, + "learning_rate": 2.6665399082387725e-05, + "loss": 1.9564, + "step": 111120 + }, + { + "epoch": 0.21641889865593805, + "grad_norm": 2.935755491256714, + "learning_rate": 2.6664533703915237e-05, + "loss": 1.6756, + "step": 111135 + }, + { + "epoch": 0.21644810892704833, + "grad_norm": 2.4531726837158203, + "learning_rate": 2.666366822721407e-05, + "loss": 1.9623, + "step": 111150 + }, + { + "epoch": 0.21647731919815857, + "grad_norm": 4.068994522094727, + "learning_rate": 2.6662802652291515e-05, + "loss": 2.051, + "step": 111165 + }, + { + "epoch": 0.21650652946926885, + "grad_norm": 2.10845947265625, + "learning_rate": 2.6661936979154852e-05, + "loss": 2.0908, + "step": 111180 + }, + { + "epoch": 0.21653573974037912, + "grad_norm": 3.3636412620544434, + "learning_rate": 2.666107120781138e-05, + "loss": 1.8249, + "step": 111195 + }, + { + "epoch": 0.21656495001148937, + "grad_norm": 4.481722831726074, + "learning_rate": 2.666020533826839e-05, + "loss": 1.9305, + "step": 111210 + }, + { + "epoch": 0.21659416028259965, + "grad_norm": 4.108544826507568, + "learning_rate": 2.6659339370533168e-05, + "loss": 1.9846, + "step": 111225 + }, + { + "epoch": 0.2166233705537099, + "grad_norm": 3.3539793491363525, + "learning_rate": 2.6658473304613013e-05, + "loss": 1.7949, + "step": 111240 + }, + { + "epoch": 0.21665258082482017, + "grad_norm": 3.892812728881836, + "learning_rate": 2.665760714051521e-05, + "loss": 2.1411, + "step": 111255 + }, + { + "epoch": 0.21668179109593041, + "grad_norm": 3.8810596466064453, + "learning_rate": 2.665674087824706e-05, + "loss": 1.9584, + "step": 111270 + }, + { + "epoch": 0.2167110013670407, + "grad_norm": 4.72860050201416, + "learning_rate": 2.665587451781585e-05, + "loss": 1.7934, + "step": 111285 + }, + { + "epoch": 0.21674021163815096, + "grad_norm": 4.017111301422119, + "learning_rate": 2.6655008059228887e-05, + "loss": 1.7309, + "step": 111300 + }, + { + "epoch": 0.2167694219092612, + "grad_norm": 4.965656757354736, + "learning_rate": 2.6654141502493465e-05, + "loss": 1.8537, + "step": 111315 + }, + { + "epoch": 0.2167986321803715, + "grad_norm": 2.9801900386810303, + "learning_rate": 2.6653274847616875e-05, + "loss": 1.756, + "step": 111330 + }, + { + "epoch": 0.21682784245148173, + "grad_norm": 2.664911985397339, + "learning_rate": 2.665240809460642e-05, + "loss": 1.9229, + "step": 111345 + }, + { + "epoch": 0.216857052722592, + "grad_norm": 2.6732869148254395, + "learning_rate": 2.665154124346939e-05, + "loss": 1.9012, + "step": 111360 + }, + { + "epoch": 0.21688626299370226, + "grad_norm": 3.0438828468322754, + "learning_rate": 2.66506742942131e-05, + "loss": 2.0201, + "step": 111375 + }, + { + "epoch": 0.21691547326481253, + "grad_norm": 2.7617833614349365, + "learning_rate": 2.6649807246844845e-05, + "loss": 1.9451, + "step": 111390 + }, + { + "epoch": 0.21694468353592278, + "grad_norm": 2.828261613845825, + "learning_rate": 2.6648940101371918e-05, + "loss": 1.955, + "step": 111405 + }, + { + "epoch": 0.21697389380703305, + "grad_norm": 4.762531280517578, + "learning_rate": 2.6648072857801636e-05, + "loss": 1.8666, + "step": 111420 + }, + { + "epoch": 0.21700310407814333, + "grad_norm": 2.1395909786224365, + "learning_rate": 2.664720551614129e-05, + "loss": 1.7133, + "step": 111435 + }, + { + "epoch": 0.21703231434925357, + "grad_norm": 4.922176361083984, + "learning_rate": 2.6646338076398187e-05, + "loss": 2.0407, + "step": 111450 + }, + { + "epoch": 0.21706152462036385, + "grad_norm": 3.3287906646728516, + "learning_rate": 2.664547053857963e-05, + "loss": 1.9901, + "step": 111465 + }, + { + "epoch": 0.2170907348914741, + "grad_norm": 2.1105287075042725, + "learning_rate": 2.6644602902692933e-05, + "loss": 2.0853, + "step": 111480 + }, + { + "epoch": 0.21711994516258437, + "grad_norm": 3.8828330039978027, + "learning_rate": 2.66437351687454e-05, + "loss": 1.9507, + "step": 111495 + }, + { + "epoch": 0.21714915543369462, + "grad_norm": 4.26022481918335, + "learning_rate": 2.664286733674433e-05, + "loss": 1.7858, + "step": 111510 + }, + { + "epoch": 0.2171783657048049, + "grad_norm": 2.646829843521118, + "learning_rate": 2.6641999406697033e-05, + "loss": 1.6686, + "step": 111525 + }, + { + "epoch": 0.21720757597591517, + "grad_norm": 2.6157593727111816, + "learning_rate": 2.664113137861083e-05, + "loss": 1.9343, + "step": 111540 + }, + { + "epoch": 0.21723678624702542, + "grad_norm": 4.291440486907959, + "learning_rate": 2.6640263252493015e-05, + "loss": 1.5513, + "step": 111555 + }, + { + "epoch": 0.2172659965181357, + "grad_norm": 4.723598003387451, + "learning_rate": 2.6639395028350915e-05, + "loss": 1.7257, + "step": 111570 + }, + { + "epoch": 0.21729520678924594, + "grad_norm": 2.5793144702911377, + "learning_rate": 2.6638526706191826e-05, + "loss": 1.7492, + "step": 111585 + }, + { + "epoch": 0.2173244170603562, + "grad_norm": 3.023305654525757, + "learning_rate": 2.6637658286023066e-05, + "loss": 1.6867, + "step": 111600 + }, + { + "epoch": 0.21735362733146646, + "grad_norm": 3.2556352615356445, + "learning_rate": 2.6636789767851954e-05, + "loss": 2.0555, + "step": 111615 + }, + { + "epoch": 0.21738283760257673, + "grad_norm": 2.4452531337738037, + "learning_rate": 2.6635921151685794e-05, + "loss": 1.6032, + "step": 111630 + }, + { + "epoch": 0.217412047873687, + "grad_norm": 2.343571901321411, + "learning_rate": 2.6635052437531908e-05, + "loss": 1.9354, + "step": 111645 + }, + { + "epoch": 0.21744125814479726, + "grad_norm": 2.609999418258667, + "learning_rate": 2.6634183625397605e-05, + "loss": 1.8907, + "step": 111660 + }, + { + "epoch": 0.21747046841590753, + "grad_norm": 2.865074396133423, + "learning_rate": 2.663331471529021e-05, + "loss": 2.2591, + "step": 111675 + }, + { + "epoch": 0.21749967868701778, + "grad_norm": 4.334042549133301, + "learning_rate": 2.6632445707217033e-05, + "loss": 1.7237, + "step": 111690 + }, + { + "epoch": 0.21752888895812805, + "grad_norm": 3.3543589115142822, + "learning_rate": 2.6631576601185397e-05, + "loss": 2.0538, + "step": 111705 + }, + { + "epoch": 0.2175580992292383, + "grad_norm": 3.712894916534424, + "learning_rate": 2.6630707397202617e-05, + "loss": 1.9205, + "step": 111720 + }, + { + "epoch": 0.21758730950034857, + "grad_norm": 5.5721116065979, + "learning_rate": 2.6629838095276015e-05, + "loss": 1.9847, + "step": 111735 + }, + { + "epoch": 0.21761651977145885, + "grad_norm": 2.9323737621307373, + "learning_rate": 2.662896869541291e-05, + "loss": 1.8197, + "step": 111750 + }, + { + "epoch": 0.2176457300425691, + "grad_norm": 4.951112270355225, + "learning_rate": 2.6628099197620624e-05, + "loss": 2.0227, + "step": 111765 + }, + { + "epoch": 0.21767494031367937, + "grad_norm": 2.4222700595855713, + "learning_rate": 2.662722960190648e-05, + "loss": 2.0125, + "step": 111780 + }, + { + "epoch": 0.21770415058478962, + "grad_norm": 4.372117519378662, + "learning_rate": 2.66263599082778e-05, + "loss": 1.9179, + "step": 111795 + }, + { + "epoch": 0.2177333608558999, + "grad_norm": 4.406684875488281, + "learning_rate": 2.6625490116741906e-05, + "loss": 1.9395, + "step": 111810 + }, + { + "epoch": 0.21776257112701014, + "grad_norm": 4.304651737213135, + "learning_rate": 2.6624620227306127e-05, + "loss": 2.0086, + "step": 111825 + }, + { + "epoch": 0.21779178139812042, + "grad_norm": 2.456897020339966, + "learning_rate": 2.6623750239977787e-05, + "loss": 1.9596, + "step": 111840 + }, + { + "epoch": 0.2178209916692307, + "grad_norm": 2.245980978012085, + "learning_rate": 2.662288015476421e-05, + "loss": 1.8945, + "step": 111855 + }, + { + "epoch": 0.21785020194034094, + "grad_norm": 4.164735794067383, + "learning_rate": 2.6622009971672726e-05, + "loss": 1.8507, + "step": 111870 + }, + { + "epoch": 0.2178794122114512, + "grad_norm": 6.648113250732422, + "learning_rate": 2.6621139690710657e-05, + "loss": 1.9883, + "step": 111885 + }, + { + "epoch": 0.21790862248256146, + "grad_norm": 3.1719470024108887, + "learning_rate": 2.662026931188534e-05, + "loss": 1.7996, + "step": 111900 + }, + { + "epoch": 0.21793783275367173, + "grad_norm": 2.3592193126678467, + "learning_rate": 2.66193988352041e-05, + "loss": 1.8693, + "step": 111915 + }, + { + "epoch": 0.21796704302478198, + "grad_norm": 2.956803321838379, + "learning_rate": 2.6618528260674267e-05, + "loss": 1.8904, + "step": 111930 + }, + { + "epoch": 0.21799625329589226, + "grad_norm": 3.840829849243164, + "learning_rate": 2.6617657588303176e-05, + "loss": 1.9856, + "step": 111945 + }, + { + "epoch": 0.21802546356700253, + "grad_norm": 3.763444423675537, + "learning_rate": 2.6616786818098157e-05, + "loss": 2.0522, + "step": 111960 + }, + { + "epoch": 0.21805467383811278, + "grad_norm": 2.8035919666290283, + "learning_rate": 2.6615915950066538e-05, + "loss": 2.0617, + "step": 111975 + }, + { + "epoch": 0.21808388410922305, + "grad_norm": 3.990502119064331, + "learning_rate": 2.661504498421566e-05, + "loss": 1.9659, + "step": 111990 + }, + { + "epoch": 0.2181130943803333, + "grad_norm": 4.156723499298096, + "learning_rate": 2.6614173920552852e-05, + "loss": 1.8789, + "step": 112005 + }, + { + "epoch": 0.21814230465144357, + "grad_norm": 2.4059195518493652, + "learning_rate": 2.6613302759085456e-05, + "loss": 1.8283, + "step": 112020 + }, + { + "epoch": 0.21817151492255382, + "grad_norm": 2.690340518951416, + "learning_rate": 2.66124314998208e-05, + "loss": 1.7661, + "step": 112035 + }, + { + "epoch": 0.2182007251936641, + "grad_norm": 2.869263172149658, + "learning_rate": 2.6611560142766233e-05, + "loss": 1.9463, + "step": 112050 + }, + { + "epoch": 0.21822993546477437, + "grad_norm": 4.034856796264648, + "learning_rate": 2.6610688687929077e-05, + "loss": 2.053, + "step": 112065 + }, + { + "epoch": 0.21825914573588462, + "grad_norm": 3.4707658290863037, + "learning_rate": 2.6609817135316678e-05, + "loss": 1.8856, + "step": 112080 + }, + { + "epoch": 0.2182883560069949, + "grad_norm": 2.981048583984375, + "learning_rate": 2.660894548493638e-05, + "loss": 1.8925, + "step": 112095 + }, + { + "epoch": 0.21831756627810514, + "grad_norm": 3.1114957332611084, + "learning_rate": 2.660807373679552e-05, + "loss": 1.9322, + "step": 112110 + }, + { + "epoch": 0.21834677654921542, + "grad_norm": 4.338136196136475, + "learning_rate": 2.6607201890901435e-05, + "loss": 1.8071, + "step": 112125 + }, + { + "epoch": 0.21837598682032566, + "grad_norm": 2.855836868286133, + "learning_rate": 2.6606329947261474e-05, + "loss": 2.1236, + "step": 112140 + }, + { + "epoch": 0.21840519709143594, + "grad_norm": 4.288500785827637, + "learning_rate": 2.6605457905882974e-05, + "loss": 1.8106, + "step": 112155 + }, + { + "epoch": 0.2184344073625462, + "grad_norm": 3.234123468399048, + "learning_rate": 2.6604585766773277e-05, + "loss": 2.0132, + "step": 112170 + }, + { + "epoch": 0.21846361763365646, + "grad_norm": 2.5103724002838135, + "learning_rate": 2.6603713529939736e-05, + "loss": 1.8205, + "step": 112185 + }, + { + "epoch": 0.21849282790476673, + "grad_norm": 3.5800178050994873, + "learning_rate": 2.6602841195389692e-05, + "loss": 1.7844, + "step": 112200 + }, + { + "epoch": 0.21852203817587698, + "grad_norm": 3.130119562149048, + "learning_rate": 2.6601968763130488e-05, + "loss": 1.7626, + "step": 112215 + }, + { + "epoch": 0.21855124844698726, + "grad_norm": 4.282564640045166, + "learning_rate": 2.6601096233169476e-05, + "loss": 2.138, + "step": 112230 + }, + { + "epoch": 0.2185804587180975, + "grad_norm": 3.202270984649658, + "learning_rate": 2.6600223605513997e-05, + "loss": 1.9489, + "step": 112245 + }, + { + "epoch": 0.21860966898920778, + "grad_norm": 4.57750129699707, + "learning_rate": 2.659935088017141e-05, + "loss": 1.9905, + "step": 112260 + }, + { + "epoch": 0.21863887926031805, + "grad_norm": 3.757119655609131, + "learning_rate": 2.6598478057149048e-05, + "loss": 1.9843, + "step": 112275 + }, + { + "epoch": 0.2186680895314283, + "grad_norm": 4.264227867126465, + "learning_rate": 2.6597605136454278e-05, + "loss": 1.7862, + "step": 112290 + }, + { + "epoch": 0.21869729980253858, + "grad_norm": 3.738044023513794, + "learning_rate": 2.659673211809444e-05, + "loss": 1.9787, + "step": 112305 + }, + { + "epoch": 0.21872651007364882, + "grad_norm": 3.9246103763580322, + "learning_rate": 2.6595859002076893e-05, + "loss": 1.9985, + "step": 112320 + }, + { + "epoch": 0.2187557203447591, + "grad_norm": 3.810617208480835, + "learning_rate": 2.6594985788408985e-05, + "loss": 1.9279, + "step": 112335 + }, + { + "epoch": 0.21878493061586934, + "grad_norm": 2.54779052734375, + "learning_rate": 2.659411247709807e-05, + "loss": 1.8665, + "step": 112350 + }, + { + "epoch": 0.21881414088697962, + "grad_norm": 2.3314208984375, + "learning_rate": 2.6593239068151507e-05, + "loss": 1.8171, + "step": 112365 + }, + { + "epoch": 0.2188433511580899, + "grad_norm": 2.8644824028015137, + "learning_rate": 2.6592365561576644e-05, + "loss": 2.1908, + "step": 112380 + }, + { + "epoch": 0.21887256142920014, + "grad_norm": 2.667088747024536, + "learning_rate": 2.6591491957380842e-05, + "loss": 1.9942, + "step": 112395 + }, + { + "epoch": 0.21890177170031042, + "grad_norm": 2.7944681644439697, + "learning_rate": 2.6590618255571455e-05, + "loss": 2.0085, + "step": 112410 + }, + { + "epoch": 0.21893098197142066, + "grad_norm": 3.539339780807495, + "learning_rate": 2.6589744456155846e-05, + "loss": 1.797, + "step": 112425 + }, + { + "epoch": 0.21896019224253094, + "grad_norm": 4.621934413909912, + "learning_rate": 2.6588870559141363e-05, + "loss": 1.7653, + "step": 112440 + }, + { + "epoch": 0.21898940251364118, + "grad_norm": 2.362787961959839, + "learning_rate": 2.658799656453537e-05, + "loss": 1.8739, + "step": 112455 + }, + { + "epoch": 0.21901861278475146, + "grad_norm": 3.9668750762939453, + "learning_rate": 2.658712247234523e-05, + "loss": 1.8923, + "step": 112470 + }, + { + "epoch": 0.21904782305586173, + "grad_norm": 4.5454607009887695, + "learning_rate": 2.6586248282578307e-05, + "loss": 1.9908, + "step": 112485 + }, + { + "epoch": 0.21907703332697198, + "grad_norm": 2.259549856185913, + "learning_rate": 2.6585373995241952e-05, + "loss": 2.0473, + "step": 112500 + }, + { + "epoch": 0.21910624359808226, + "grad_norm": 2.121405601501465, + "learning_rate": 2.6584499610343532e-05, + "loss": 1.9556, + "step": 112515 + }, + { + "epoch": 0.2191354538691925, + "grad_norm": 3.1360301971435547, + "learning_rate": 2.6583625127890417e-05, + "loss": 2.2452, + "step": 112530 + }, + { + "epoch": 0.21916466414030278, + "grad_norm": 3.995178461074829, + "learning_rate": 2.6582750547889962e-05, + "loss": 1.7896, + "step": 112545 + }, + { + "epoch": 0.21919387441141303, + "grad_norm": 3.6822192668914795, + "learning_rate": 2.658187587034954e-05, + "loss": 1.9749, + "step": 112560 + }, + { + "epoch": 0.2192230846825233, + "grad_norm": 2.187448024749756, + "learning_rate": 2.6581001095276507e-05, + "loss": 1.9172, + "step": 112575 + }, + { + "epoch": 0.21925229495363358, + "grad_norm": 2.505924701690674, + "learning_rate": 2.6580126222678234e-05, + "loss": 1.9766, + "step": 112590 + }, + { + "epoch": 0.21928150522474382, + "grad_norm": 3.6174399852752686, + "learning_rate": 2.6579251252562095e-05, + "loss": 1.8112, + "step": 112605 + }, + { + "epoch": 0.2193107154958541, + "grad_norm": 3.675524950027466, + "learning_rate": 2.6578376184935448e-05, + "loss": 1.8173, + "step": 112620 + }, + { + "epoch": 0.21933992576696434, + "grad_norm": 2.6764540672302246, + "learning_rate": 2.6577501019805667e-05, + "loss": 1.8347, + "step": 112635 + }, + { + "epoch": 0.21936913603807462, + "grad_norm": 2.648153305053711, + "learning_rate": 2.6576625757180123e-05, + "loss": 1.9975, + "step": 112650 + }, + { + "epoch": 0.21939834630918487, + "grad_norm": 2.781615972518921, + "learning_rate": 2.657575039706619e-05, + "loss": 1.8405, + "step": 112665 + }, + { + "epoch": 0.21942755658029514, + "grad_norm": 3.7857301235198975, + "learning_rate": 2.6574874939471226e-05, + "loss": 1.79, + "step": 112680 + }, + { + "epoch": 0.21945676685140542, + "grad_norm": 2.2762792110443115, + "learning_rate": 2.6573999384402617e-05, + "loss": 1.8142, + "step": 112695 + }, + { + "epoch": 0.21948597712251566, + "grad_norm": 2.4613137245178223, + "learning_rate": 2.657312373186773e-05, + "loss": 1.9734, + "step": 112710 + }, + { + "epoch": 0.21951518739362594, + "grad_norm": 3.06398344039917, + "learning_rate": 2.6572247981873943e-05, + "loss": 1.9325, + "step": 112725 + }, + { + "epoch": 0.21954439766473619, + "grad_norm": 2.809394121170044, + "learning_rate": 2.6571372134428623e-05, + "loss": 1.834, + "step": 112740 + }, + { + "epoch": 0.21957360793584646, + "grad_norm": 2.4552106857299805, + "learning_rate": 2.6570496189539155e-05, + "loss": 1.8696, + "step": 112755 + }, + { + "epoch": 0.2196028182069567, + "grad_norm": 3.8817756175994873, + "learning_rate": 2.6569620147212915e-05, + "loss": 2.0167, + "step": 112770 + }, + { + "epoch": 0.21963202847806698, + "grad_norm": 2.8427846431732178, + "learning_rate": 2.6568744007457267e-05, + "loss": 1.967, + "step": 112785 + }, + { + "epoch": 0.21966123874917726, + "grad_norm": 6.188169002532959, + "learning_rate": 2.6567867770279606e-05, + "loss": 1.8614, + "step": 112800 + }, + { + "epoch": 0.2196904490202875, + "grad_norm": 3.0021584033966064, + "learning_rate": 2.65669914356873e-05, + "loss": 1.6175, + "step": 112815 + }, + { + "epoch": 0.21971965929139778, + "grad_norm": 4.5422749519348145, + "learning_rate": 2.6566115003687735e-05, + "loss": 1.9479, + "step": 112830 + }, + { + "epoch": 0.21974886956250803, + "grad_norm": 4.171680450439453, + "learning_rate": 2.656523847428829e-05, + "loss": 1.7884, + "step": 112845 + }, + { + "epoch": 0.2197780798336183, + "grad_norm": 4.223339557647705, + "learning_rate": 2.6564361847496338e-05, + "loss": 1.8472, + "step": 112860 + }, + { + "epoch": 0.21980729010472855, + "grad_norm": 2.315711498260498, + "learning_rate": 2.6563485123319276e-05, + "loss": 1.8577, + "step": 112875 + }, + { + "epoch": 0.21983650037583882, + "grad_norm": 2.759145498275757, + "learning_rate": 2.6562608301764473e-05, + "loss": 1.9021, + "step": 112890 + }, + { + "epoch": 0.2198657106469491, + "grad_norm": 4.039562225341797, + "learning_rate": 2.6561731382839324e-05, + "loss": 1.9747, + "step": 112905 + }, + { + "epoch": 0.21989492091805934, + "grad_norm": 3.0111336708068848, + "learning_rate": 2.6560854366551207e-05, + "loss": 1.9214, + "step": 112920 + }, + { + "epoch": 0.21992413118916962, + "grad_norm": 3.785029172897339, + "learning_rate": 2.655997725290751e-05, + "loss": 1.9245, + "step": 112935 + }, + { + "epoch": 0.21995334146027987, + "grad_norm": 2.4930436611175537, + "learning_rate": 2.655910004191562e-05, + "loss": 2.1533, + "step": 112950 + }, + { + "epoch": 0.21998255173139014, + "grad_norm": 3.1818413734436035, + "learning_rate": 2.655822273358292e-05, + "loss": 1.9321, + "step": 112965 + }, + { + "epoch": 0.2200117620025004, + "grad_norm": 2.975877523422241, + "learning_rate": 2.6557345327916802e-05, + "loss": 1.7307, + "step": 112980 + }, + { + "epoch": 0.22004097227361066, + "grad_norm": 2.2545995712280273, + "learning_rate": 2.6556467824924654e-05, + "loss": 1.9279, + "step": 112995 + }, + { + "epoch": 0.22007018254472094, + "grad_norm": 5.242068290710449, + "learning_rate": 2.6555590224613862e-05, + "loss": 1.7516, + "step": 113010 + }, + { + "epoch": 0.22009939281583119, + "grad_norm": 2.778902530670166, + "learning_rate": 2.6554712526991826e-05, + "loss": 2.0596, + "step": 113025 + }, + { + "epoch": 0.22012860308694146, + "grad_norm": 5.335712909698486, + "learning_rate": 2.6553834732065927e-05, + "loss": 1.9164, + "step": 113040 + }, + { + "epoch": 0.2201578133580517, + "grad_norm": 2.6981112957000732, + "learning_rate": 2.655295683984356e-05, + "loss": 1.8092, + "step": 113055 + }, + { + "epoch": 0.22018702362916198, + "grad_norm": 3.539572238922119, + "learning_rate": 2.6552078850332116e-05, + "loss": 1.8776, + "step": 113070 + }, + { + "epoch": 0.22021623390027223, + "grad_norm": 2.7010369300842285, + "learning_rate": 2.6551200763538996e-05, + "loss": 1.8627, + "step": 113085 + }, + { + "epoch": 0.2202454441713825, + "grad_norm": 2.7828550338745117, + "learning_rate": 2.655032257947159e-05, + "loss": 2.0137, + "step": 113100 + }, + { + "epoch": 0.22027465444249278, + "grad_norm": 2.542393445968628, + "learning_rate": 2.6549444298137292e-05, + "loss": 1.8451, + "step": 113115 + }, + { + "epoch": 0.22030386471360303, + "grad_norm": 2.2167718410491943, + "learning_rate": 2.6548565919543497e-05, + "loss": 1.8324, + "step": 113130 + }, + { + "epoch": 0.2203330749847133, + "grad_norm": 3.3027074337005615, + "learning_rate": 2.6547687443697607e-05, + "loss": 1.7749, + "step": 113145 + }, + { + "epoch": 0.22036228525582355, + "grad_norm": 4.517278671264648, + "learning_rate": 2.6546808870607016e-05, + "loss": 1.8196, + "step": 113160 + }, + { + "epoch": 0.22039149552693382, + "grad_norm": 3.0809736251831055, + "learning_rate": 2.6545930200279126e-05, + "loss": 1.9752, + "step": 113175 + }, + { + "epoch": 0.22042070579804407, + "grad_norm": 2.2590575218200684, + "learning_rate": 2.6545051432721327e-05, + "loss": 2.1471, + "step": 113190 + }, + { + "epoch": 0.22044991606915434, + "grad_norm": 3.4194178581237793, + "learning_rate": 2.654417256794103e-05, + "loss": 1.9553, + "step": 113205 + }, + { + "epoch": 0.22047912634026462, + "grad_norm": 3.0735700130462646, + "learning_rate": 2.654329360594563e-05, + "loss": 1.9012, + "step": 113220 + }, + { + "epoch": 0.22050833661137487, + "grad_norm": 4.397512912750244, + "learning_rate": 2.6542414546742536e-05, + "loss": 2.0142, + "step": 113235 + }, + { + "epoch": 0.22053754688248514, + "grad_norm": 4.984601974487305, + "learning_rate": 2.6541535390339144e-05, + "loss": 1.5275, + "step": 113250 + }, + { + "epoch": 0.2205667571535954, + "grad_norm": 3.1506595611572266, + "learning_rate": 2.654065613674286e-05, + "loss": 1.7682, + "step": 113265 + }, + { + "epoch": 0.22059596742470566, + "grad_norm": 3.166433811187744, + "learning_rate": 2.6539776785961083e-05, + "loss": 1.9154, + "step": 113280 + }, + { + "epoch": 0.2206251776958159, + "grad_norm": 5.1949992179870605, + "learning_rate": 2.6538897338001226e-05, + "loss": 1.8133, + "step": 113295 + }, + { + "epoch": 0.22065438796692619, + "grad_norm": 4.003324508666992, + "learning_rate": 2.653801779287069e-05, + "loss": 1.8913, + "step": 113310 + }, + { + "epoch": 0.22068359823803646, + "grad_norm": 4.067132949829102, + "learning_rate": 2.653713815057689e-05, + "loss": 1.8093, + "step": 113325 + }, + { + "epoch": 0.2207128085091467, + "grad_norm": 3.6921586990356445, + "learning_rate": 2.6536258411127218e-05, + "loss": 1.9677, + "step": 113340 + }, + { + "epoch": 0.22074201878025698, + "grad_norm": 5.583538055419922, + "learning_rate": 2.6535378574529095e-05, + "loss": 1.9509, + "step": 113355 + }, + { + "epoch": 0.22077122905136723, + "grad_norm": 2.923513889312744, + "learning_rate": 2.6534498640789928e-05, + "loss": 1.807, + "step": 113370 + }, + { + "epoch": 0.2208004393224775, + "grad_norm": 3.3624441623687744, + "learning_rate": 2.653361860991712e-05, + "loss": 2.0291, + "step": 113385 + }, + { + "epoch": 0.22082964959358775, + "grad_norm": 4.006463050842285, + "learning_rate": 2.6532738481918092e-05, + "loss": 2.1117, + "step": 113400 + }, + { + "epoch": 0.22085885986469803, + "grad_norm": 3.863485813140869, + "learning_rate": 2.653185825680025e-05, + "loss": 1.9284, + "step": 113415 + }, + { + "epoch": 0.2208880701358083, + "grad_norm": 2.093747615814209, + "learning_rate": 2.653097793457101e-05, + "loss": 1.8218, + "step": 113430 + }, + { + "epoch": 0.22091728040691855, + "grad_norm": 2.554866313934326, + "learning_rate": 2.6530097515237786e-05, + "loss": 1.9233, + "step": 113445 + }, + { + "epoch": 0.22094649067802882, + "grad_norm": 2.028089761734009, + "learning_rate": 2.6529216998807984e-05, + "loss": 2.0121, + "step": 113460 + }, + { + "epoch": 0.22097570094913907, + "grad_norm": 4.838136672973633, + "learning_rate": 2.6528336385289023e-05, + "loss": 1.8658, + "step": 113475 + }, + { + "epoch": 0.22100491122024934, + "grad_norm": 3.862055540084839, + "learning_rate": 2.6527455674688322e-05, + "loss": 2.0901, + "step": 113490 + }, + { + "epoch": 0.2210341214913596, + "grad_norm": 3.50127911567688, + "learning_rate": 2.6526574867013296e-05, + "loss": 1.7679, + "step": 113505 + }, + { + "epoch": 0.22106333176246987, + "grad_norm": 6.428659915924072, + "learning_rate": 2.6525693962271363e-05, + "loss": 1.9252, + "step": 113520 + }, + { + "epoch": 0.22109254203358011, + "grad_norm": 2.2549614906311035, + "learning_rate": 2.652481296046994e-05, + "loss": 1.8716, + "step": 113535 + }, + { + "epoch": 0.2211217523046904, + "grad_norm": 3.392184257507324, + "learning_rate": 2.6523931861616442e-05, + "loss": 2.06, + "step": 113550 + }, + { + "epoch": 0.22115096257580066, + "grad_norm": 4.30526876449585, + "learning_rate": 2.6523050665718298e-05, + "loss": 2.2171, + "step": 113565 + }, + { + "epoch": 0.2211801728469109, + "grad_norm": 4.607630729675293, + "learning_rate": 2.6522169372782924e-05, + "loss": 1.8445, + "step": 113580 + }, + { + "epoch": 0.22120938311802119, + "grad_norm": 2.6862292289733887, + "learning_rate": 2.6521287982817734e-05, + "loss": 2.0873, + "step": 113595 + }, + { + "epoch": 0.22123859338913143, + "grad_norm": 3.0994985103607178, + "learning_rate": 2.652040649583016e-05, + "loss": 1.6542, + "step": 113610 + }, + { + "epoch": 0.2212678036602417, + "grad_norm": 3.0458292961120605, + "learning_rate": 2.651952491182763e-05, + "loss": 1.8219, + "step": 113625 + }, + { + "epoch": 0.22129701393135195, + "grad_norm": 3.522198438644409, + "learning_rate": 2.6518643230817554e-05, + "loss": 2.0792, + "step": 113640 + }, + { + "epoch": 0.22132622420246223, + "grad_norm": 4.014333248138428, + "learning_rate": 2.6517761452807366e-05, + "loss": 1.802, + "step": 113655 + }, + { + "epoch": 0.2213554344735725, + "grad_norm": 3.216944456100464, + "learning_rate": 2.6516879577804483e-05, + "loss": 1.9393, + "step": 113670 + }, + { + "epoch": 0.22138464474468275, + "grad_norm": 3.3574652671813965, + "learning_rate": 2.6515997605816345e-05, + "loss": 1.8013, + "step": 113685 + }, + { + "epoch": 0.22141385501579303, + "grad_norm": 3.969902515411377, + "learning_rate": 2.6515115536850364e-05, + "loss": 1.9391, + "step": 113700 + }, + { + "epoch": 0.22144306528690327, + "grad_norm": 3.433708906173706, + "learning_rate": 2.6514233370913984e-05, + "loss": 1.9181, + "step": 113715 + }, + { + "epoch": 0.22147227555801355, + "grad_norm": 6.741209030151367, + "learning_rate": 2.651335110801462e-05, + "loss": 1.9908, + "step": 113730 + }, + { + "epoch": 0.2215014858291238, + "grad_norm": 2.779985189437866, + "learning_rate": 2.6512468748159706e-05, + "loss": 1.9151, + "step": 113745 + }, + { + "epoch": 0.22153069610023407, + "grad_norm": 4.84840726852417, + "learning_rate": 2.6511586291356678e-05, + "loss": 2.045, + "step": 113760 + }, + { + "epoch": 0.22155990637134435, + "grad_norm": 3.8712644577026367, + "learning_rate": 2.651070373761296e-05, + "loss": 2.0061, + "step": 113775 + }, + { + "epoch": 0.2215891166424546, + "grad_norm": 3.1947154998779297, + "learning_rate": 2.6509821086935988e-05, + "loss": 1.8212, + "step": 113790 + }, + { + "epoch": 0.22161832691356487, + "grad_norm": 4.376053810119629, + "learning_rate": 2.6508938339333193e-05, + "loss": 1.7868, + "step": 113805 + }, + { + "epoch": 0.22164753718467511, + "grad_norm": 13.27657699584961, + "learning_rate": 2.650805549481201e-05, + "loss": 1.9788, + "step": 113820 + }, + { + "epoch": 0.2216767474557854, + "grad_norm": 4.66534948348999, + "learning_rate": 2.650717255337987e-05, + "loss": 1.7078, + "step": 113835 + }, + { + "epoch": 0.22170595772689564, + "grad_norm": 4.2643866539001465, + "learning_rate": 2.650628951504422e-05, + "loss": 2.0727, + "step": 113850 + }, + { + "epoch": 0.2217351679980059, + "grad_norm": 4.936367511749268, + "learning_rate": 2.650540637981248e-05, + "loss": 1.8531, + "step": 113865 + }, + { + "epoch": 0.22176437826911619, + "grad_norm": 3.8592865467071533, + "learning_rate": 2.6504523147692097e-05, + "loss": 1.8413, + "step": 113880 + }, + { + "epoch": 0.22179358854022643, + "grad_norm": 2.509983777999878, + "learning_rate": 2.6503639818690505e-05, + "loss": 1.7214, + "step": 113895 + }, + { + "epoch": 0.2218227988113367, + "grad_norm": 2.59035325050354, + "learning_rate": 2.6502756392815144e-05, + "loss": 2.01, + "step": 113910 + }, + { + "epoch": 0.22185200908244695, + "grad_norm": 3.386475086212158, + "learning_rate": 2.6501872870073458e-05, + "loss": 1.9502, + "step": 113925 + }, + { + "epoch": 0.22188121935355723, + "grad_norm": 3.1410748958587646, + "learning_rate": 2.6500989250472876e-05, + "loss": 1.8361, + "step": 113940 + }, + { + "epoch": 0.22191042962466748, + "grad_norm": 2.7843382358551025, + "learning_rate": 2.6500105534020854e-05, + "loss": 1.7214, + "step": 113955 + }, + { + "epoch": 0.22193963989577775, + "grad_norm": 4.095170021057129, + "learning_rate": 2.649922172072482e-05, + "loss": 2.1239, + "step": 113970 + }, + { + "epoch": 0.22196885016688803, + "grad_norm": 2.6503350734710693, + "learning_rate": 2.6498337810592226e-05, + "loss": 1.9053, + "step": 113985 + }, + { + "epoch": 0.22199806043799827, + "grad_norm": 2.2811119556427, + "learning_rate": 2.6497453803630508e-05, + "loss": 1.9086, + "step": 114000 + }, + { + "epoch": 0.22202727070910855, + "grad_norm": 4.263707637786865, + "learning_rate": 2.6496569699847117e-05, + "loss": 2.1518, + "step": 114015 + }, + { + "epoch": 0.2220564809802188, + "grad_norm": 4.713109493255615, + "learning_rate": 2.6495685499249496e-05, + "loss": 1.6789, + "step": 114030 + }, + { + "epoch": 0.22208569125132907, + "grad_norm": 5.040444374084473, + "learning_rate": 2.6494801201845094e-05, + "loss": 1.7603, + "step": 114045 + }, + { + "epoch": 0.22211490152243932, + "grad_norm": 3.669161796569824, + "learning_rate": 2.649391680764135e-05, + "loss": 1.8863, + "step": 114060 + }, + { + "epoch": 0.2221441117935496, + "grad_norm": 5.1901726722717285, + "learning_rate": 2.6493032316645713e-05, + "loss": 1.9664, + "step": 114075 + }, + { + "epoch": 0.22217332206465987, + "grad_norm": 4.561544895172119, + "learning_rate": 2.6492147728865643e-05, + "loss": 1.9046, + "step": 114090 + }, + { + "epoch": 0.22220253233577011, + "grad_norm": 2.5348970890045166, + "learning_rate": 2.6491263044308575e-05, + "loss": 1.8957, + "step": 114105 + }, + { + "epoch": 0.2222317426068804, + "grad_norm": 2.31015944480896, + "learning_rate": 2.6490378262981964e-05, + "loss": 1.8117, + "step": 114120 + }, + { + "epoch": 0.22226095287799064, + "grad_norm": 2.8059120178222656, + "learning_rate": 2.648949338489326e-05, + "loss": 1.9301, + "step": 114135 + }, + { + "epoch": 0.2222901631491009, + "grad_norm": 2.276433229446411, + "learning_rate": 2.648860841004992e-05, + "loss": 2.1237, + "step": 114150 + }, + { + "epoch": 0.22231937342021116, + "grad_norm": 4.614841938018799, + "learning_rate": 2.6487723338459394e-05, + "loss": 1.9261, + "step": 114165 + }, + { + "epoch": 0.22234858369132143, + "grad_norm": 5.030738353729248, + "learning_rate": 2.648683817012913e-05, + "loss": 1.8865, + "step": 114180 + }, + { + "epoch": 0.2223777939624317, + "grad_norm": 2.616485595703125, + "learning_rate": 2.648595290506659e-05, + "loss": 1.8067, + "step": 114195 + }, + { + "epoch": 0.22240700423354196, + "grad_norm": 5.456234455108643, + "learning_rate": 2.6485067543279226e-05, + "loss": 2.0019, + "step": 114210 + }, + { + "epoch": 0.22243621450465223, + "grad_norm": 1.8005732297897339, + "learning_rate": 2.6484182084774487e-05, + "loss": 1.879, + "step": 114225 + }, + { + "epoch": 0.22246542477576248, + "grad_norm": 1.9899953603744507, + "learning_rate": 2.6483296529559842e-05, + "loss": 1.8708, + "step": 114240 + }, + { + "epoch": 0.22249463504687275, + "grad_norm": 2.9241819381713867, + "learning_rate": 2.648241087764274e-05, + "loss": 1.8293, + "step": 114255 + }, + { + "epoch": 0.222523845317983, + "grad_norm": 3.605571746826172, + "learning_rate": 2.6481525129030635e-05, + "loss": 1.7704, + "step": 114270 + }, + { + "epoch": 0.22255305558909327, + "grad_norm": 2.378146171569824, + "learning_rate": 2.6480639283730998e-05, + "loss": 2.0889, + "step": 114285 + }, + { + "epoch": 0.22258226586020355, + "grad_norm": 2.7937228679656982, + "learning_rate": 2.6479753341751284e-05, + "loss": 1.9458, + "step": 114300 + }, + { + "epoch": 0.2226114761313138, + "grad_norm": 2.1684587001800537, + "learning_rate": 2.6478867303098947e-05, + "loss": 1.8982, + "step": 114315 + }, + { + "epoch": 0.22264068640242407, + "grad_norm": 2.633378505706787, + "learning_rate": 2.6477981167781456e-05, + "loss": 1.839, + "step": 114330 + }, + { + "epoch": 0.22266989667353432, + "grad_norm": 4.398677825927734, + "learning_rate": 2.6477094935806274e-05, + "loss": 1.86, + "step": 114345 + }, + { + "epoch": 0.2226991069446446, + "grad_norm": 4.474298000335693, + "learning_rate": 2.647620860718086e-05, + "loss": 1.9691, + "step": 114360 + }, + { + "epoch": 0.22272831721575484, + "grad_norm": 3.602238893508911, + "learning_rate": 2.6475322181912683e-05, + "loss": 1.951, + "step": 114375 + }, + { + "epoch": 0.22275752748686511, + "grad_norm": 5.1650824546813965, + "learning_rate": 2.6474435660009198e-05, + "loss": 2.0023, + "step": 114390 + }, + { + "epoch": 0.2227867377579754, + "grad_norm": 3.876054048538208, + "learning_rate": 2.647354904147788e-05, + "loss": 2.0658, + "step": 114405 + }, + { + "epoch": 0.22281594802908564, + "grad_norm": 2.836332321166992, + "learning_rate": 2.6472662326326186e-05, + "loss": 1.6462, + "step": 114420 + }, + { + "epoch": 0.2228451583001959, + "grad_norm": 3.294875383377075, + "learning_rate": 2.6471775514561597e-05, + "loss": 1.7336, + "step": 114435 + }, + { + "epoch": 0.22287436857130616, + "grad_norm": 3.6717963218688965, + "learning_rate": 2.6470888606191568e-05, + "loss": 1.8759, + "step": 114450 + }, + { + "epoch": 0.22290357884241643, + "grad_norm": 4.466915130615234, + "learning_rate": 2.6470001601223578e-05, + "loss": 1.9719, + "step": 114465 + }, + { + "epoch": 0.22293278911352668, + "grad_norm": 2.533273220062256, + "learning_rate": 2.6469114499665087e-05, + "loss": 1.8358, + "step": 114480 + }, + { + "epoch": 0.22296199938463696, + "grad_norm": 3.8786473274230957, + "learning_rate": 2.6468227301523565e-05, + "loss": 1.9287, + "step": 114495 + }, + { + "epoch": 0.22299120965574723, + "grad_norm": 3.436058521270752, + "learning_rate": 2.6467340006806495e-05, + "loss": 2.047, + "step": 114510 + }, + { + "epoch": 0.22302041992685748, + "grad_norm": 3.75593900680542, + "learning_rate": 2.646645261552134e-05, + "loss": 1.912, + "step": 114525 + }, + { + "epoch": 0.22304963019796775, + "grad_norm": 3.7088215351104736, + "learning_rate": 2.6465565127675577e-05, + "loss": 1.9252, + "step": 114540 + }, + { + "epoch": 0.223078840469078, + "grad_norm": 2.377614736557007, + "learning_rate": 2.6464677543276674e-05, + "loss": 1.881, + "step": 114555 + }, + { + "epoch": 0.22310805074018827, + "grad_norm": 4.204943656921387, + "learning_rate": 2.646378986233211e-05, + "loss": 2.0063, + "step": 114570 + }, + { + "epoch": 0.22313726101129852, + "grad_norm": 3.2481517791748047, + "learning_rate": 2.6462902084849363e-05, + "loss": 1.7881, + "step": 114585 + }, + { + "epoch": 0.2231664712824088, + "grad_norm": 2.7306830883026123, + "learning_rate": 2.64620142108359e-05, + "loss": 1.8286, + "step": 114600 + }, + { + "epoch": 0.22319568155351907, + "grad_norm": 6.47460412979126, + "learning_rate": 2.6461126240299206e-05, + "loss": 1.8437, + "step": 114615 + }, + { + "epoch": 0.22322489182462932, + "grad_norm": 2.2643988132476807, + "learning_rate": 2.6460238173246754e-05, + "loss": 2.049, + "step": 114630 + }, + { + "epoch": 0.2232541020957396, + "grad_norm": 4.945465564727783, + "learning_rate": 2.6459350009686024e-05, + "loss": 1.8884, + "step": 114645 + }, + { + "epoch": 0.22328331236684984, + "grad_norm": 5.822000503540039, + "learning_rate": 2.64584617496245e-05, + "loss": 2.0719, + "step": 114660 + }, + { + "epoch": 0.22331252263796011, + "grad_norm": 4.093710422515869, + "learning_rate": 2.6457573393069656e-05, + "loss": 1.9691, + "step": 114675 + }, + { + "epoch": 0.22334173290907036, + "grad_norm": 4.628026008605957, + "learning_rate": 2.6456684940028972e-05, + "loss": 1.8426, + "step": 114690 + }, + { + "epoch": 0.22337094318018064, + "grad_norm": 3.885143280029297, + "learning_rate": 2.6455796390509935e-05, + "loss": 1.8886, + "step": 114705 + }, + { + "epoch": 0.2234001534512909, + "grad_norm": 2.7197821140289307, + "learning_rate": 2.6454907744520028e-05, + "loss": 1.9183, + "step": 114720 + }, + { + "epoch": 0.22342936372240116, + "grad_norm": 3.1269850730895996, + "learning_rate": 2.6454019002066733e-05, + "loss": 2.0711, + "step": 114735 + }, + { + "epoch": 0.22345857399351143, + "grad_norm": 2.586174249649048, + "learning_rate": 2.6453130163157524e-05, + "loss": 1.916, + "step": 114750 + }, + { + "epoch": 0.22348778426462168, + "grad_norm": 3.282273530960083, + "learning_rate": 2.6452241227799905e-05, + "loss": 1.944, + "step": 114765 + }, + { + "epoch": 0.22351699453573196, + "grad_norm": 2.32647442817688, + "learning_rate": 2.6451352196001348e-05, + "loss": 1.8747, + "step": 114780 + }, + { + "epoch": 0.2235462048068422, + "grad_norm": 3.102443218231201, + "learning_rate": 2.645046306776934e-05, + "loss": 1.8126, + "step": 114795 + }, + { + "epoch": 0.22357541507795248, + "grad_norm": 3.2475547790527344, + "learning_rate": 2.6449573843111376e-05, + "loss": 2.0632, + "step": 114810 + }, + { + "epoch": 0.22360462534906275, + "grad_norm": 3.1524431705474854, + "learning_rate": 2.644868452203494e-05, + "loss": 1.7539, + "step": 114825 + }, + { + "epoch": 0.223633835620173, + "grad_norm": 3.517913818359375, + "learning_rate": 2.6447795104547522e-05, + "loss": 1.9573, + "step": 114840 + }, + { + "epoch": 0.22366304589128327, + "grad_norm": 2.8643980026245117, + "learning_rate": 2.644690559065661e-05, + "loss": 1.9595, + "step": 114855 + }, + { + "epoch": 0.22369225616239352, + "grad_norm": 3.1263747215270996, + "learning_rate": 2.6446015980369697e-05, + "loss": 1.8697, + "step": 114870 + }, + { + "epoch": 0.2237214664335038, + "grad_norm": 4.165008544921875, + "learning_rate": 2.6445126273694275e-05, + "loss": 1.8163, + "step": 114885 + }, + { + "epoch": 0.22375067670461404, + "grad_norm": 3.842365026473999, + "learning_rate": 2.644423647063783e-05, + "loss": 1.965, + "step": 114900 + }, + { + "epoch": 0.22377988697572432, + "grad_norm": 3.2855610847473145, + "learning_rate": 2.6443346571207866e-05, + "loss": 1.8641, + "step": 114915 + }, + { + "epoch": 0.2238090972468346, + "grad_norm": 2.9903323650360107, + "learning_rate": 2.6442456575411866e-05, + "loss": 1.8039, + "step": 114930 + }, + { + "epoch": 0.22383830751794484, + "grad_norm": 4.708531856536865, + "learning_rate": 2.6441566483257333e-05, + "loss": 1.9915, + "step": 114945 + }, + { + "epoch": 0.22386751778905511, + "grad_norm": 6.297287464141846, + "learning_rate": 2.644067629475176e-05, + "loss": 1.7994, + "step": 114960 + }, + { + "epoch": 0.22389672806016536, + "grad_norm": 6.971879005432129, + "learning_rate": 2.6439786009902637e-05, + "loss": 1.859, + "step": 114975 + }, + { + "epoch": 0.22392593833127564, + "grad_norm": 4.154273986816406, + "learning_rate": 2.6438895628717478e-05, + "loss": 1.7338, + "step": 114990 + }, + { + "epoch": 0.22395514860238588, + "grad_norm": 4.905094623565674, + "learning_rate": 2.643800515120376e-05, + "loss": 2.0312, + "step": 115005 + }, + { + "epoch": 0.22398435887349616, + "grad_norm": 2.942640542984009, + "learning_rate": 2.6437114577368997e-05, + "loss": 1.7724, + "step": 115020 + }, + { + "epoch": 0.22401356914460643, + "grad_norm": 3.850083112716675, + "learning_rate": 2.6436223907220678e-05, + "loss": 1.7203, + "step": 115035 + }, + { + "epoch": 0.22404277941571668, + "grad_norm": 3.610121011734009, + "learning_rate": 2.6435333140766316e-05, + "loss": 1.8216, + "step": 115050 + }, + { + "epoch": 0.22407198968682696, + "grad_norm": 2.486565113067627, + "learning_rate": 2.6434442278013402e-05, + "loss": 1.7798, + "step": 115065 + }, + { + "epoch": 0.2241011999579372, + "grad_norm": 2.4288437366485596, + "learning_rate": 2.6433551318969443e-05, + "loss": 1.8131, + "step": 115080 + }, + { + "epoch": 0.22413041022904748, + "grad_norm": 3.3251547813415527, + "learning_rate": 2.6432660263641938e-05, + "loss": 1.8716, + "step": 115095 + }, + { + "epoch": 0.22415962050015772, + "grad_norm": 3.3394601345062256, + "learning_rate": 2.6431769112038398e-05, + "loss": 1.771, + "step": 115110 + }, + { + "epoch": 0.224188830771268, + "grad_norm": 4.8938822746276855, + "learning_rate": 2.6430877864166315e-05, + "loss": 1.967, + "step": 115125 + }, + { + "epoch": 0.22421804104237827, + "grad_norm": 3.5164246559143066, + "learning_rate": 2.6429986520033206e-05, + "loss": 2.0326, + "step": 115140 + }, + { + "epoch": 0.22424725131348852, + "grad_norm": 2.9261374473571777, + "learning_rate": 2.642909507964657e-05, + "loss": 1.8582, + "step": 115155 + }, + { + "epoch": 0.2242764615845988, + "grad_norm": 2.603302240371704, + "learning_rate": 2.642820354301392e-05, + "loss": 1.9366, + "step": 115170 + }, + { + "epoch": 0.22430567185570904, + "grad_norm": 4.585788726806641, + "learning_rate": 2.642731191014276e-05, + "loss": 1.7198, + "step": 115185 + }, + { + "epoch": 0.22433488212681932, + "grad_norm": 2.2846384048461914, + "learning_rate": 2.6426420181040604e-05, + "loss": 1.6936, + "step": 115200 + }, + { + "epoch": 0.22436409239792957, + "grad_norm": 3.354407548904419, + "learning_rate": 2.642552835571495e-05, + "loss": 1.9343, + "step": 115215 + }, + { + "epoch": 0.22439330266903984, + "grad_norm": 4.050466060638428, + "learning_rate": 2.6424636434173317e-05, + "loss": 1.95, + "step": 115230 + }, + { + "epoch": 0.22442251294015012, + "grad_norm": 5.36024808883667, + "learning_rate": 2.6423744416423213e-05, + "loss": 1.9112, + "step": 115245 + }, + { + "epoch": 0.22445172321126036, + "grad_norm": 3.209658145904541, + "learning_rate": 2.6422852302472148e-05, + "loss": 1.9278, + "step": 115260 + }, + { + "epoch": 0.22448093348237064, + "grad_norm": 3.0916571617126465, + "learning_rate": 2.642196009232764e-05, + "loss": 1.9806, + "step": 115275 + }, + { + "epoch": 0.22451014375348088, + "grad_norm": 3.5521481037139893, + "learning_rate": 2.64210677859972e-05, + "loss": 1.9803, + "step": 115290 + }, + { + "epoch": 0.22453935402459116, + "grad_norm": 4.086769104003906, + "learning_rate": 2.6420175383488343e-05, + "loss": 1.7626, + "step": 115305 + }, + { + "epoch": 0.2245685642957014, + "grad_norm": 3.1067583560943604, + "learning_rate": 2.641928288480858e-05, + "loss": 1.7372, + "step": 115320 + }, + { + "epoch": 0.22459777456681168, + "grad_norm": 2.6417720317840576, + "learning_rate": 2.6418390289965433e-05, + "loss": 1.9791, + "step": 115335 + }, + { + "epoch": 0.22462698483792196, + "grad_norm": 3.719069480895996, + "learning_rate": 2.6417497598966414e-05, + "loss": 1.8488, + "step": 115350 + }, + { + "epoch": 0.2246561951090322, + "grad_norm": 5.592521667480469, + "learning_rate": 2.641660481181904e-05, + "loss": 2.0829, + "step": 115365 + }, + { + "epoch": 0.22468540538014248, + "grad_norm": 3.166112184524536, + "learning_rate": 2.641571192853083e-05, + "loss": 1.8212, + "step": 115380 + }, + { + "epoch": 0.22471461565125272, + "grad_norm": 3.970258951187134, + "learning_rate": 2.641481894910931e-05, + "loss": 1.8667, + "step": 115395 + }, + { + "epoch": 0.224743825922363, + "grad_norm": 2.680864095687866, + "learning_rate": 2.6413925873561992e-05, + "loss": 2.03, + "step": 115410 + }, + { + "epoch": 0.22477303619347325, + "grad_norm": 3.127448320388794, + "learning_rate": 2.64130327018964e-05, + "loss": 1.9324, + "step": 115425 + }, + { + "epoch": 0.22480224646458352, + "grad_norm": 4.695576190948486, + "learning_rate": 2.641213943412005e-05, + "loss": 1.9821, + "step": 115440 + }, + { + "epoch": 0.2248314567356938, + "grad_norm": 4.29295539855957, + "learning_rate": 2.641124607024047e-05, + "loss": 1.9177, + "step": 115455 + }, + { + "epoch": 0.22486066700680404, + "grad_norm": 3.553468704223633, + "learning_rate": 2.6410352610265184e-05, + "loss": 2.1556, + "step": 115470 + }, + { + "epoch": 0.22488987727791432, + "grad_norm": 3.3158960342407227, + "learning_rate": 2.6409459054201716e-05, + "loss": 1.8865, + "step": 115485 + }, + { + "epoch": 0.22491908754902457, + "grad_norm": 2.954944133758545, + "learning_rate": 2.6408565402057583e-05, + "loss": 1.7485, + "step": 115500 + }, + { + "epoch": 0.22494829782013484, + "grad_norm": 2.656698226928711, + "learning_rate": 2.640767165384032e-05, + "loss": 1.8784, + "step": 115515 + }, + { + "epoch": 0.2249775080912451, + "grad_norm": 3.3731765747070312, + "learning_rate": 2.6406777809557448e-05, + "loss": 2.1023, + "step": 115530 + }, + { + "epoch": 0.22500671836235536, + "grad_norm": 3.9505627155303955, + "learning_rate": 2.6405883869216497e-05, + "loss": 1.9554, + "step": 115545 + }, + { + "epoch": 0.22503592863346564, + "grad_norm": 3.8175137042999268, + "learning_rate": 2.640498983282499e-05, + "loss": 1.8625, + "step": 115560 + }, + { + "epoch": 0.22506513890457588, + "grad_norm": 3.0920603275299072, + "learning_rate": 2.6404095700390466e-05, + "loss": 1.8109, + "step": 115575 + }, + { + "epoch": 0.22509434917568616, + "grad_norm": 3.4620914459228516, + "learning_rate": 2.6403201471920443e-05, + "loss": 1.9571, + "step": 115590 + }, + { + "epoch": 0.2251235594467964, + "grad_norm": 3.3099734783172607, + "learning_rate": 2.640230714742246e-05, + "loss": 1.9887, + "step": 115605 + }, + { + "epoch": 0.22515276971790668, + "grad_norm": 3.6615664958953857, + "learning_rate": 2.6401412726904045e-05, + "loss": 1.6493, + "step": 115620 + }, + { + "epoch": 0.22518197998901693, + "grad_norm": 5.245543003082275, + "learning_rate": 2.6400518210372723e-05, + "loss": 1.8557, + "step": 115635 + }, + { + "epoch": 0.2252111902601272, + "grad_norm": 2.610487937927246, + "learning_rate": 2.639962359783604e-05, + "loss": 1.8713, + "step": 115650 + }, + { + "epoch": 0.22524040053123745, + "grad_norm": 3.294870376586914, + "learning_rate": 2.6398728889301523e-05, + "loss": 1.7163, + "step": 115665 + }, + { + "epoch": 0.22526961080234773, + "grad_norm": 4.011672019958496, + "learning_rate": 2.639783408477671e-05, + "loss": 1.8247, + "step": 115680 + }, + { + "epoch": 0.225298821073458, + "grad_norm": 2.937358856201172, + "learning_rate": 2.6396939184269126e-05, + "loss": 1.979, + "step": 115695 + }, + { + "epoch": 0.22532803134456825, + "grad_norm": 3.453448534011841, + "learning_rate": 2.6396044187786316e-05, + "loss": 2.0449, + "step": 115710 + }, + { + "epoch": 0.22535724161567852, + "grad_norm": 5.526671409606934, + "learning_rate": 2.6395149095335816e-05, + "loss": 1.9897, + "step": 115725 + }, + { + "epoch": 0.22538645188678877, + "grad_norm": 4.461224555969238, + "learning_rate": 2.6394253906925163e-05, + "loss": 1.8533, + "step": 115740 + }, + { + "epoch": 0.22541566215789904, + "grad_norm": 5.735734462738037, + "learning_rate": 2.6393358622561897e-05, + "loss": 1.8861, + "step": 115755 + }, + { + "epoch": 0.2254448724290093, + "grad_norm": 4.691076278686523, + "learning_rate": 2.6392463242253555e-05, + "loss": 1.7308, + "step": 115770 + }, + { + "epoch": 0.22547408270011957, + "grad_norm": 4.628754615783691, + "learning_rate": 2.6391567766007676e-05, + "loss": 1.7458, + "step": 115785 + }, + { + "epoch": 0.22550329297122984, + "grad_norm": 4.187602519989014, + "learning_rate": 2.6390672193831806e-05, + "loss": 1.8061, + "step": 115800 + }, + { + "epoch": 0.2255325032423401, + "grad_norm": 3.9503650665283203, + "learning_rate": 2.638977652573348e-05, + "loss": 1.9526, + "step": 115815 + }, + { + "epoch": 0.22556171351345036, + "grad_norm": 9.63890552520752, + "learning_rate": 2.6388880761720246e-05, + "loss": 1.8816, + "step": 115830 + }, + { + "epoch": 0.2255909237845606, + "grad_norm": 4.613348484039307, + "learning_rate": 2.6387984901799643e-05, + "loss": 1.7638, + "step": 115845 + }, + { + "epoch": 0.22562013405567088, + "grad_norm": 4.483190536499023, + "learning_rate": 2.638708894597922e-05, + "loss": 1.7916, + "step": 115860 + }, + { + "epoch": 0.22564934432678113, + "grad_norm": 2.0357956886291504, + "learning_rate": 2.638619289426652e-05, + "loss": 1.8182, + "step": 115875 + }, + { + "epoch": 0.2256785545978914, + "grad_norm": 2.3006033897399902, + "learning_rate": 2.6385296746669083e-05, + "loss": 1.9208, + "step": 115890 + }, + { + "epoch": 0.22570776486900168, + "grad_norm": 2.481344223022461, + "learning_rate": 2.6384400503194465e-05, + "loss": 1.8007, + "step": 115905 + }, + { + "epoch": 0.22573697514011193, + "grad_norm": 2.620056390762329, + "learning_rate": 2.6383504163850206e-05, + "loss": 1.6959, + "step": 115920 + }, + { + "epoch": 0.2257661854112222, + "grad_norm": 2.7286269664764404, + "learning_rate": 2.638260772864386e-05, + "loss": 1.7272, + "step": 115935 + }, + { + "epoch": 0.22579539568233245, + "grad_norm": 5.485358238220215, + "learning_rate": 2.638171119758298e-05, + "loss": 2.0237, + "step": 115950 + }, + { + "epoch": 0.22582460595344273, + "grad_norm": 4.251539707183838, + "learning_rate": 2.6380814570675097e-05, + "loss": 1.9112, + "step": 115965 + }, + { + "epoch": 0.22585381622455297, + "grad_norm": 3.362433910369873, + "learning_rate": 2.637991784792778e-05, + "loss": 1.9069, + "step": 115980 + }, + { + "epoch": 0.22588302649566325, + "grad_norm": 1.8694603443145752, + "learning_rate": 2.6379021029348577e-05, + "loss": 1.9072, + "step": 115995 + }, + { + "epoch": 0.22591223676677352, + "grad_norm": 2.1426234245300293, + "learning_rate": 2.6378124114945034e-05, + "loss": 1.9914, + "step": 116010 + }, + { + "epoch": 0.22594144703788377, + "grad_norm": 3.50089430809021, + "learning_rate": 2.6377227104724707e-05, + "loss": 1.7776, + "step": 116025 + }, + { + "epoch": 0.22597065730899404, + "grad_norm": 2.899226427078247, + "learning_rate": 2.637632999869515e-05, + "loss": 1.7313, + "step": 116040 + }, + { + "epoch": 0.2259998675801043, + "grad_norm": 2.8428518772125244, + "learning_rate": 2.637543279686392e-05, + "loss": 1.7264, + "step": 116055 + }, + { + "epoch": 0.22602907785121457, + "grad_norm": 2.848050832748413, + "learning_rate": 2.6374535499238568e-05, + "loss": 1.9655, + "step": 116070 + }, + { + "epoch": 0.2260582881223248, + "grad_norm": 5.103193283081055, + "learning_rate": 2.6373638105826658e-05, + "loss": 2.0386, + "step": 116085 + }, + { + "epoch": 0.2260874983934351, + "grad_norm": 2.5761728286743164, + "learning_rate": 2.6372740616635735e-05, + "loss": 1.8287, + "step": 116100 + }, + { + "epoch": 0.22611670866454536, + "grad_norm": 3.3320114612579346, + "learning_rate": 2.6371843031673372e-05, + "loss": 1.9418, + "step": 116115 + }, + { + "epoch": 0.2261459189356556, + "grad_norm": 2.259507656097412, + "learning_rate": 2.637094535094711e-05, + "loss": 1.9047, + "step": 116130 + }, + { + "epoch": 0.22617512920676588, + "grad_norm": 4.561080455780029, + "learning_rate": 2.6370047574464523e-05, + "loss": 1.8292, + "step": 116145 + }, + { + "epoch": 0.22620433947787613, + "grad_norm": 2.499915599822998, + "learning_rate": 2.636914970223317e-05, + "loss": 1.9285, + "step": 116160 + }, + { + "epoch": 0.2262335497489864, + "grad_norm": 3.3140931129455566, + "learning_rate": 2.6368251734260606e-05, + "loss": 1.8257, + "step": 116175 + }, + { + "epoch": 0.22626276002009665, + "grad_norm": 3.8653433322906494, + "learning_rate": 2.636735367055439e-05, + "loss": 1.7764, + "step": 116190 + }, + { + "epoch": 0.22629197029120693, + "grad_norm": 2.983044147491455, + "learning_rate": 2.6366455511122092e-05, + "loss": 1.7937, + "step": 116205 + }, + { + "epoch": 0.2263211805623172, + "grad_norm": 4.0702643394470215, + "learning_rate": 2.6365557255971276e-05, + "loss": 1.8018, + "step": 116220 + }, + { + "epoch": 0.22635039083342745, + "grad_norm": 3.6693663597106934, + "learning_rate": 2.63646589051095e-05, + "loss": 1.7644, + "step": 116235 + }, + { + "epoch": 0.22637960110453773, + "grad_norm": 3.6486213207244873, + "learning_rate": 2.6363760458544336e-05, + "loss": 1.9207, + "step": 116250 + }, + { + "epoch": 0.22640881137564797, + "grad_norm": 4.4745001792907715, + "learning_rate": 2.6362861916283347e-05, + "loss": 1.6957, + "step": 116265 + }, + { + "epoch": 0.22643802164675825, + "grad_norm": 2.6561806201934814, + "learning_rate": 2.63619632783341e-05, + "loss": 1.8967, + "step": 116280 + }, + { + "epoch": 0.2264672319178685, + "grad_norm": 6.47855281829834, + "learning_rate": 2.6361064544704163e-05, + "loss": 1.9806, + "step": 116295 + }, + { + "epoch": 0.22649644218897877, + "grad_norm": 3.7569420337677, + "learning_rate": 2.6360165715401103e-05, + "loss": 1.876, + "step": 116310 + }, + { + "epoch": 0.22652565246008904, + "grad_norm": 2.2071704864501953, + "learning_rate": 2.635926679043249e-05, + "loss": 1.8935, + "step": 116325 + }, + { + "epoch": 0.2265548627311993, + "grad_norm": 3.371734380722046, + "learning_rate": 2.6358367769805892e-05, + "loss": 1.8993, + "step": 116340 + }, + { + "epoch": 0.22658407300230957, + "grad_norm": 2.497349262237549, + "learning_rate": 2.635746865352888e-05, + "loss": 1.8882, + "step": 116355 + }, + { + "epoch": 0.2266132832734198, + "grad_norm": 3.490945816040039, + "learning_rate": 2.635656944160903e-05, + "loss": 1.8716, + "step": 116370 + }, + { + "epoch": 0.2266424935445301, + "grad_norm": 2.3592095375061035, + "learning_rate": 2.635567013405391e-05, + "loss": 1.9403, + "step": 116385 + }, + { + "epoch": 0.22667170381564034, + "grad_norm": 3.6523334980010986, + "learning_rate": 2.6354770730871098e-05, + "loss": 1.9307, + "step": 116400 + }, + { + "epoch": 0.2267009140867506, + "grad_norm": 3.410024404525757, + "learning_rate": 2.6353871232068158e-05, + "loss": 1.7504, + "step": 116415 + }, + { + "epoch": 0.22673012435786088, + "grad_norm": 3.78347110748291, + "learning_rate": 2.6352971637652678e-05, + "loss": 1.9699, + "step": 116430 + }, + { + "epoch": 0.22675933462897113, + "grad_norm": 2.6357228755950928, + "learning_rate": 2.6352071947632227e-05, + "loss": 1.8698, + "step": 116445 + }, + { + "epoch": 0.2267885449000814, + "grad_norm": 4.487425327301025, + "learning_rate": 2.6351172162014377e-05, + "loss": 2.0628, + "step": 116460 + }, + { + "epoch": 0.22681775517119165, + "grad_norm": 4.3056535720825195, + "learning_rate": 2.635027228080671e-05, + "loss": 1.8732, + "step": 116475 + }, + { + "epoch": 0.22684696544230193, + "grad_norm": 2.44303297996521, + "learning_rate": 2.634937230401681e-05, + "loss": 1.8057, + "step": 116490 + } + ], + "logging_steps": 15, + "max_steps": 513518, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.8464223584708813e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}