{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.5071225071225074, "eval_steps": 500, "global_step": 440, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0056657223796034, "grad_norm": 24.973131796915897, "learning_rate": 1.0000000000000002e-06, "loss": 1.8537, "step": 1 }, { "epoch": 0.0113314447592068, "grad_norm": 32.79573813738381, "learning_rate": 2.0000000000000003e-06, "loss": 2.0212, "step": 2 }, { "epoch": 0.0169971671388102, "grad_norm": 23.800880905805656, "learning_rate": 3e-06, "loss": 2.1456, "step": 3 }, { "epoch": 0.0226628895184136, "grad_norm": 19.091198715081358, "learning_rate": 4.000000000000001e-06, "loss": 1.9808, "step": 4 }, { "epoch": 0.028328611898016998, "grad_norm": 14.124470348172405, "learning_rate": 5e-06, "loss": 2.1825, "step": 5 }, { "epoch": 0.0339943342776204, "grad_norm": 11.461608032959802, "learning_rate": 6e-06, "loss": 1.6353, "step": 6 }, { "epoch": 0.039660056657223795, "grad_norm": 10.354681496346823, "learning_rate": 7e-06, "loss": 1.9076, "step": 7 }, { "epoch": 0.0453257790368272, "grad_norm": 10.167669680172194, "learning_rate": 8.000000000000001e-06, "loss": 1.4754, "step": 8 }, { "epoch": 0.05099150141643059, "grad_norm": 7.5541696713086255, "learning_rate": 9e-06, "loss": 1.6213, "step": 9 }, { "epoch": 0.056657223796033995, "grad_norm": 4.087852973173369, "learning_rate": 1e-05, "loss": 1.5217, "step": 10 }, { "epoch": 0.06232294617563739, "grad_norm": 4.071392878063137, "learning_rate": 9.999948174819623e-06, "loss": 1.6551, "step": 11 }, { "epoch": 0.0679886685552408, "grad_norm": 5.2075402015034, "learning_rate": 9.999792700352826e-06, "loss": 1.4474, "step": 12 }, { "epoch": 0.07365439093484419, "grad_norm": 3.6492933345906637, "learning_rate": 9.999533579822611e-06, "loss": 1.5585, "step": 13 }, { "epoch": 0.07932011331444759, "grad_norm": 6.482920810973195, "learning_rate": 9.999170818600562e-06, "loss": 1.3317, "step": 14 }, { "epoch": 0.08498583569405099, "grad_norm": 4.137365745831386, "learning_rate": 9.998704424206747e-06, "loss": 1.4029, "step": 15 }, { "epoch": 0.0906515580736544, "grad_norm": 4.745717244720069, "learning_rate": 9.998134406309555e-06, "loss": 1.6586, "step": 16 }, { "epoch": 0.09631728045325778, "grad_norm": 5.4377770096801346, "learning_rate": 9.997460776725497e-06, "loss": 1.365, "step": 17 }, { "epoch": 0.10198300283286119, "grad_norm": 3.317130182493388, "learning_rate": 9.996683549418964e-06, "loss": 1.4956, "step": 18 }, { "epoch": 0.10764872521246459, "grad_norm": 1.7845609616841893, "learning_rate": 9.995802740501933e-06, "loss": 1.3472, "step": 19 }, { "epoch": 0.11331444759206799, "grad_norm": 14.387033772755194, "learning_rate": 9.994818368233639e-06, "loss": 1.4116, "step": 20 }, { "epoch": 0.11898016997167139, "grad_norm": 6.920700020611593, "learning_rate": 9.993730453020187e-06, "loss": 1.2776, "step": 21 }, { "epoch": 0.12464589235127478, "grad_norm": 6.05951274644599, "learning_rate": 9.99253901741414e-06, "loss": 1.4433, "step": 22 }, { "epoch": 0.13031161473087818, "grad_norm": 3.0541449788715935, "learning_rate": 9.991244086114046e-06, "loss": 1.3396, "step": 23 }, { "epoch": 0.1359773371104816, "grad_norm": 1.8438099140328046, "learning_rate": 9.989845685963917e-06, "loss": 1.3061, "step": 24 }, { "epoch": 0.141643059490085, "grad_norm": 4.048301070320613, "learning_rate": 9.988343845952697e-06, "loss": 1.2283, "step": 25 }, { "epoch": 0.14730878186968838, "grad_norm": 3.5627296346591457, "learning_rate": 9.986738597213633e-06, "loss": 1.2865, "step": 26 }, { "epoch": 0.1529745042492918, "grad_norm": 2.237494567304501, "learning_rate": 9.98502997302365e-06, "loss": 1.3233, "step": 27 }, { "epoch": 0.15864022662889518, "grad_norm": 3.479719952104877, "learning_rate": 9.983218008802648e-06, "loss": 1.3033, "step": 28 }, { "epoch": 0.1643059490084986, "grad_norm": 2.066121083229141, "learning_rate": 9.98130274211278e-06, "loss": 1.3326, "step": 29 }, { "epoch": 0.16997167138810199, "grad_norm": 4.090684571263736, "learning_rate": 9.979284212657658e-06, "loss": 1.3102, "step": 30 }, { "epoch": 0.17563739376770537, "grad_norm": 2.369637256277251, "learning_rate": 9.977162462281544e-06, "loss": 1.4067, "step": 31 }, { "epoch": 0.1813031161473088, "grad_norm": 1.4378564529803546, "learning_rate": 9.97493753496848e-06, "loss": 1.2409, "step": 32 }, { "epoch": 0.18696883852691218, "grad_norm": 1.810353068849482, "learning_rate": 9.972609476841368e-06, "loss": 1.2659, "step": 33 }, { "epoch": 0.19263456090651557, "grad_norm": 2.954930884156565, "learning_rate": 9.970178336161018e-06, "loss": 1.3727, "step": 34 }, { "epoch": 0.19830028328611898, "grad_norm": 2.053307140265503, "learning_rate": 9.967644163325157e-06, "loss": 1.3463, "step": 35 }, { "epoch": 0.20396600566572237, "grad_norm": 1.8032124432327943, "learning_rate": 9.965007010867366e-06, "loss": 1.1998, "step": 36 }, { "epoch": 0.2096317280453258, "grad_norm": 1.4952983263862012, "learning_rate": 9.962266933456008e-06, "loss": 1.2829, "step": 37 }, { "epoch": 0.21529745042492918, "grad_norm": 1.3649794008291625, "learning_rate": 9.959423987893086e-06, "loss": 1.2056, "step": 38 }, { "epoch": 0.22096317280453256, "grad_norm": 1.4380773398306634, "learning_rate": 9.956478233113066e-06, "loss": 1.29, "step": 39 }, { "epoch": 0.22662889518413598, "grad_norm": 1.6072540934424309, "learning_rate": 9.953429730181653e-06, "loss": 1.2593, "step": 40 }, { "epoch": 0.23229461756373937, "grad_norm": 1.6010739399694889, "learning_rate": 9.95027854229454e-06, "loss": 1.2117, "step": 41 }, { "epoch": 0.23796033994334279, "grad_norm": 1.2474393925785745, "learning_rate": 9.947024734776076e-06, "loss": 1.2022, "step": 42 }, { "epoch": 0.24362606232294617, "grad_norm": 1.4019264249340568, "learning_rate": 9.943668375077926e-06, "loss": 1.2365, "step": 43 }, { "epoch": 0.24929178470254956, "grad_norm": 1.5087040675714003, "learning_rate": 9.940209532777666e-06, "loss": 1.274, "step": 44 }, { "epoch": 0.254957507082153, "grad_norm": 1.1953570915609946, "learning_rate": 9.93664827957735e-06, "loss": 1.2526, "step": 45 }, { "epoch": 0.26062322946175637, "grad_norm": 1.4826450819224886, "learning_rate": 9.932984689302012e-06, "loss": 1.1978, "step": 46 }, { "epoch": 0.26628895184135976, "grad_norm": 1.1937833972167977, "learning_rate": 9.929218837898143e-06, "loss": 1.1816, "step": 47 }, { "epoch": 0.2719546742209632, "grad_norm": 1.1238100782353855, "learning_rate": 9.925350803432112e-06, "loss": 1.1931, "step": 48 }, { "epoch": 0.2776203966005666, "grad_norm": 1.3338900623153498, "learning_rate": 9.921380666088558e-06, "loss": 1.1978, "step": 49 }, { "epoch": 0.28328611898017, "grad_norm": 1.3236848667289738, "learning_rate": 9.917308508168712e-06, "loss": 1.2551, "step": 50 }, { "epoch": 0.28895184135977336, "grad_norm": 1.425578635546673, "learning_rate": 9.913134414088698e-06, "loss": 1.2441, "step": 51 }, { "epoch": 0.29461756373937675, "grad_norm": 1.171581674684746, "learning_rate": 9.908858470377793e-06, "loss": 1.2369, "step": 52 }, { "epoch": 0.3002832861189802, "grad_norm": 1.1564744150302062, "learning_rate": 9.904480765676617e-06, "loss": 1.209, "step": 53 }, { "epoch": 0.3059490084985836, "grad_norm": 1.1357504524893798, "learning_rate": 9.9000013907353e-06, "loss": 1.2152, "step": 54 }, { "epoch": 0.311614730878187, "grad_norm": 1.0498825437855333, "learning_rate": 9.895420438411616e-06, "loss": 1.2043, "step": 55 }, { "epoch": 0.31728045325779036, "grad_norm": 1.6465219316145685, "learning_rate": 9.890738003669029e-06, "loss": 1.2289, "step": 56 }, { "epoch": 0.32294617563739375, "grad_norm": 1.711551232749367, "learning_rate": 9.885954183574753e-06, "loss": 1.1831, "step": 57 }, { "epoch": 0.3286118980169972, "grad_norm": 1.2636664413259953, "learning_rate": 9.881069077297724e-06, "loss": 1.2061, "step": 58 }, { "epoch": 0.3342776203966006, "grad_norm": 1.4260407982081962, "learning_rate": 9.876082786106546e-06, "loss": 1.1998, "step": 59 }, { "epoch": 0.33994334277620397, "grad_norm": 1.95604739866899, "learning_rate": 9.870995413367397e-06, "loss": 1.2215, "step": 60 }, { "epoch": 0.34560906515580736, "grad_norm": 1.2316545141521473, "learning_rate": 9.865807064541878e-06, "loss": 1.1599, "step": 61 }, { "epoch": 0.35127478753541075, "grad_norm": 1.1178440688886253, "learning_rate": 9.860517847184837e-06, "loss": 1.1907, "step": 62 }, { "epoch": 0.35694050991501414, "grad_norm": 1.305376049095191, "learning_rate": 9.855127870942131e-06, "loss": 1.1474, "step": 63 }, { "epoch": 0.3626062322946176, "grad_norm": 1.0495122657744762, "learning_rate": 9.849637247548356e-06, "loss": 1.2424, "step": 64 }, { "epoch": 0.36827195467422097, "grad_norm": 1.141538926125254, "learning_rate": 9.844046090824533e-06, "loss": 1.1689, "step": 65 }, { "epoch": 0.37393767705382436, "grad_norm": 1.26961257521241, "learning_rate": 9.83835451667574e-06, "loss": 1.2106, "step": 66 }, { "epoch": 0.37960339943342775, "grad_norm": 1.081533609255719, "learning_rate": 9.832562643088724e-06, "loss": 1.1834, "step": 67 }, { "epoch": 0.38526912181303113, "grad_norm": 1.443083776392187, "learning_rate": 9.826670590129442e-06, "loss": 1.1505, "step": 68 }, { "epoch": 0.3909348441926346, "grad_norm": 1.135777382976375, "learning_rate": 9.820678479940573e-06, "loss": 1.1489, "step": 69 }, { "epoch": 0.39660056657223797, "grad_norm": 1.8779005247112062, "learning_rate": 9.814586436738998e-06, "loss": 1.1643, "step": 70 }, { "epoch": 0.40226628895184136, "grad_norm": 1.7980060811236744, "learning_rate": 9.808394586813209e-06, "loss": 1.1594, "step": 71 }, { "epoch": 0.40793201133144474, "grad_norm": 2.572405910372765, "learning_rate": 9.802103058520704e-06, "loss": 1.1854, "step": 72 }, { "epoch": 0.41359773371104813, "grad_norm": 2.0253448122778606, "learning_rate": 9.795711982285317e-06, "loss": 1.1826, "step": 73 }, { "epoch": 0.4192634560906516, "grad_norm": 6.483254642683073, "learning_rate": 9.78922149059452e-06, "loss": 1.1646, "step": 74 }, { "epoch": 0.42492917847025496, "grad_norm": 1.2964281102887218, "learning_rate": 9.782631717996675e-06, "loss": 1.2379, "step": 75 }, { "epoch": 0.43059490084985835, "grad_norm": 1.9517402996335103, "learning_rate": 9.775942801098241e-06, "loss": 1.164, "step": 76 }, { "epoch": 0.43626062322946174, "grad_norm": 3.064531007561859, "learning_rate": 9.76915487856095e-06, "loss": 1.1418, "step": 77 }, { "epoch": 0.44192634560906513, "grad_norm": 1.5009905490397355, "learning_rate": 9.762268091098926e-06, "loss": 1.1653, "step": 78 }, { "epoch": 0.4475920679886686, "grad_norm": 1.104518219439204, "learning_rate": 9.755282581475769e-06, "loss": 1.2025, "step": 79 }, { "epoch": 0.45325779036827196, "grad_norm": 7.807500502849419, "learning_rate": 9.748198494501598e-06, "loss": 1.148, "step": 80 }, { "epoch": 0.45892351274787535, "grad_norm": 6.196503908242147, "learning_rate": 9.741015977030046e-06, "loss": 1.1819, "step": 81 }, { "epoch": 0.46458923512747874, "grad_norm": 2.2714978855142736, "learning_rate": 9.733735177955219e-06, "loss": 1.1907, "step": 82 }, { "epoch": 0.4702549575070821, "grad_norm": 1.834743890260826, "learning_rate": 9.72635624820861e-06, "loss": 1.1381, "step": 83 }, { "epoch": 0.47592067988668557, "grad_norm": 1.28470626171519, "learning_rate": 9.71887934075596e-06, "loss": 1.2079, "step": 84 }, { "epoch": 0.48158640226628896, "grad_norm": 6.197048819949928, "learning_rate": 9.711304610594104e-06, "loss": 1.1272, "step": 85 }, { "epoch": 0.48725212464589235, "grad_norm": 3.412508821399008, "learning_rate": 9.703632214747742e-06, "loss": 1.2382, "step": 86 }, { "epoch": 0.49291784702549574, "grad_norm": 1.57336480270559, "learning_rate": 9.695862312266195e-06, "loss": 1.157, "step": 87 }, { "epoch": 0.4985835694050991, "grad_norm": 7.383065472181884, "learning_rate": 9.687995064220102e-06, "loss": 1.1684, "step": 88 }, { "epoch": 0.5042492917847026, "grad_norm": 7.508526165016783, "learning_rate": 9.680030633698083e-06, "loss": 1.155, "step": 89 }, { "epoch": 0.509915014164306, "grad_norm": 9.25317664016253, "learning_rate": 9.671969185803357e-06, "loss": 1.1452, "step": 90 }, { "epoch": 0.5155807365439093, "grad_norm": 2.2525643431971876, "learning_rate": 9.66381088765032e-06, "loss": 1.1505, "step": 91 }, { "epoch": 0.5212464589235127, "grad_norm": 1.4721293586733248, "learning_rate": 9.65555590836108e-06, "loss": 1.1812, "step": 92 }, { "epoch": 0.5269121813031161, "grad_norm": 2.6949100034582103, "learning_rate": 9.647204419061957e-06, "loss": 1.1739, "step": 93 }, { "epoch": 0.5325779036827195, "grad_norm": 2.027029228479332, "learning_rate": 9.638756592879923e-06, "loss": 1.1335, "step": 94 }, { "epoch": 0.5382436260623229, "grad_norm": 1.8382974162119243, "learning_rate": 9.630212604939026e-06, "loss": 1.1298, "step": 95 }, { "epoch": 0.5439093484419264, "grad_norm": 1.2086577711922202, "learning_rate": 9.621572632356754e-06, "loss": 1.167, "step": 96 }, { "epoch": 0.5495750708215298, "grad_norm": 1.2819489966676616, "learning_rate": 9.61283685424036e-06, "loss": 1.1151, "step": 97 }, { "epoch": 0.5552407932011332, "grad_norm": 1.6800709750196126, "learning_rate": 9.604005451683154e-06, "loss": 1.1945, "step": 98 }, { "epoch": 0.5609065155807366, "grad_norm": 1.3375384173734144, "learning_rate": 9.59507860776075e-06, "loss": 1.1621, "step": 99 }, { "epoch": 0.56657223796034, "grad_norm": 2.188062868326175, "learning_rate": 9.586056507527266e-06, "loss": 1.1555, "step": 100 }, { "epoch": 0.5722379603399433, "grad_norm": 1.3814102048227788, "learning_rate": 9.57693933801149e-06, "loss": 1.1733, "step": 101 }, { "epoch": 0.5779036827195467, "grad_norm": 1.8014483071872645, "learning_rate": 9.567727288213005e-06, "loss": 1.1964, "step": 102 }, { "epoch": 0.5835694050991501, "grad_norm": 1.1912746031738484, "learning_rate": 9.558420549098269e-06, "loss": 1.2144, "step": 103 }, { "epoch": 0.5892351274787535, "grad_norm": 3.034007485521762, "learning_rate": 9.549019313596652e-06, "loss": 1.1321, "step": 104 }, { "epoch": 0.5949008498583569, "grad_norm": 1.866729945439932, "learning_rate": 9.539523776596446e-06, "loss": 1.1539, "step": 105 }, { "epoch": 0.6005665722379604, "grad_norm": 1.5773392319922173, "learning_rate": 9.529934134940819e-06, "loss": 1.1373, "step": 106 }, { "epoch": 0.6062322946175638, "grad_norm": 1.6561757646401918, "learning_rate": 9.520250587423733e-06, "loss": 1.1788, "step": 107 }, { "epoch": 0.6118980169971672, "grad_norm": 1.2809743948171723, "learning_rate": 9.510473334785828e-06, "loss": 1.1509, "step": 108 }, { "epoch": 0.6175637393767706, "grad_norm": 3.3019220495325405, "learning_rate": 9.500602579710256e-06, "loss": 1.1879, "step": 109 }, { "epoch": 0.623229461756374, "grad_norm": 1.5241985081276304, "learning_rate": 9.490638526818482e-06, "loss": 1.1114, "step": 110 }, { "epoch": 0.6288951841359773, "grad_norm": 2.053104975498995, "learning_rate": 9.480581382666041e-06, "loss": 1.2417, "step": 111 }, { "epoch": 0.6345609065155807, "grad_norm": 1.450461775862418, "learning_rate": 9.470431355738257e-06, "loss": 1.0761, "step": 112 }, { "epoch": 0.6402266288951841, "grad_norm": 2.831772615909268, "learning_rate": 9.460188656445921e-06, "loss": 1.1684, "step": 113 }, { "epoch": 0.6458923512747875, "grad_norm": 1.5478096558601282, "learning_rate": 9.449853497120928e-06, "loss": 1.1695, "step": 114 }, { "epoch": 0.6515580736543909, "grad_norm": 1.6582616402814803, "learning_rate": 9.439426092011877e-06, "loss": 1.1099, "step": 115 }, { "epoch": 0.6572237960339944, "grad_norm": 1.0617767973732541, "learning_rate": 9.428906657279629e-06, "loss": 1.1584, "step": 116 }, { "epoch": 0.6628895184135978, "grad_norm": 1.6822664727814025, "learning_rate": 9.418295410992821e-06, "loss": 1.1527, "step": 117 }, { "epoch": 0.6685552407932012, "grad_norm": 1.1837357577931802, "learning_rate": 9.407592573123359e-06, "loss": 1.187, "step": 118 }, { "epoch": 0.6742209631728046, "grad_norm": 1.690006148754325, "learning_rate": 9.396798365541841e-06, "loss": 1.1023, "step": 119 }, { "epoch": 0.6798866855524079, "grad_norm": 1.2755747770023382, "learning_rate": 9.385913012012972e-06, "loss": 1.1779, "step": 120 }, { "epoch": 0.6855524079320113, "grad_norm": 1.0625930962823409, "learning_rate": 9.374936738190913e-06, "loss": 1.1586, "step": 121 }, { "epoch": 0.6912181303116147, "grad_norm": 1.4107647400186194, "learning_rate": 9.363869771614615e-06, "loss": 1.1227, "step": 122 }, { "epoch": 0.6968838526912181, "grad_norm": 1.4237393729227041, "learning_rate": 9.35271234170309e-06, "loss": 1.1526, "step": 123 }, { "epoch": 0.7025495750708215, "grad_norm": 1.239081728465614, "learning_rate": 9.341464679750669e-06, "loss": 1.1676, "step": 124 }, { "epoch": 0.7082152974504249, "grad_norm": 1.2250609811941313, "learning_rate": 9.330127018922195e-06, "loss": 1.1549, "step": 125 }, { "epoch": 0.7138810198300283, "grad_norm": 1.0079463118549998, "learning_rate": 9.318699594248192e-06, "loss": 1.0825, "step": 126 }, { "epoch": 0.7195467422096318, "grad_norm": 1.1822482076914111, "learning_rate": 9.307182642620001e-06, "loss": 1.1699, "step": 127 }, { "epoch": 0.7252124645892352, "grad_norm": 1.192585782341377, "learning_rate": 9.295576402784858e-06, "loss": 1.1864, "step": 128 }, { "epoch": 0.7308781869688386, "grad_norm": 1.1793876225801334, "learning_rate": 9.283881115340957e-06, "loss": 1.1592, "step": 129 }, { "epoch": 0.7365439093484419, "grad_norm": 1.4328581990598621, "learning_rate": 9.272097022732444e-06, "loss": 1.1264, "step": 130 }, { "epoch": 0.7422096317280453, "grad_norm": 1.4063460821599099, "learning_rate": 9.260224369244414e-06, "loss": 1.1582, "step": 131 }, { "epoch": 0.7478753541076487, "grad_norm": 1.3928551806399836, "learning_rate": 9.248263400997826e-06, "loss": 1.1036, "step": 132 }, { "epoch": 0.7535410764872521, "grad_norm": 1.0443812793505807, "learning_rate": 9.236214365944418e-06, "loss": 1.1809, "step": 133 }, { "epoch": 0.7592067988668555, "grad_norm": 7.4865021772015234, "learning_rate": 9.224077513861556e-06, "loss": 1.1432, "step": 134 }, { "epoch": 0.7648725212464589, "grad_norm": 4.687727924279942, "learning_rate": 9.211853096347059e-06, "loss": 1.1436, "step": 135 }, { "epoch": 0.7705382436260623, "grad_norm": 1.7813513129483227, "learning_rate": 9.199541366813984e-06, "loss": 1.2003, "step": 136 }, { "epoch": 0.7762039660056658, "grad_norm": 1.1574866856711652, "learning_rate": 9.18714258048537e-06, "loss": 1.0949, "step": 137 }, { "epoch": 0.7818696883852692, "grad_norm": 1.5923532949818175, "learning_rate": 9.174656994388957e-06, "loss": 1.1312, "step": 138 }, { "epoch": 0.7875354107648725, "grad_norm": 1.4090405021331738, "learning_rate": 9.16208486735184e-06, "loss": 1.1371, "step": 139 }, { "epoch": 0.7932011331444759, "grad_norm": 1.1066958591085674, "learning_rate": 9.149426459995127e-06, "loss": 1.1892, "step": 140 }, { "epoch": 0.7988668555240793, "grad_norm": 1.3806489023187403, "learning_rate": 9.136682034728508e-06, "loss": 1.1203, "step": 141 }, { "epoch": 0.8045325779036827, "grad_norm": 1.4492241915768966, "learning_rate": 9.123851855744842e-06, "loss": 1.1606, "step": 142 }, { "epoch": 0.8101983002832861, "grad_norm": 1.2880006738591805, "learning_rate": 9.110936189014668e-06, "loss": 1.1363, "step": 143 }, { "epoch": 0.8158640226628895, "grad_norm": 1.4252322295071467, "learning_rate": 9.097935302280682e-06, "loss": 1.1299, "step": 144 }, { "epoch": 0.8215297450424929, "grad_norm": 1.1051239821774794, "learning_rate": 9.08484946505221e-06, "loss": 1.1855, "step": 145 }, { "epoch": 0.8271954674220963, "grad_norm": 1.1582328438262173, "learning_rate": 9.0716789485996e-06, "loss": 1.1173, "step": 146 }, { "epoch": 0.8328611898016998, "grad_norm": 1.1514645858243073, "learning_rate": 9.058424025948609e-06, "loss": 1.0758, "step": 147 }, { "epoch": 0.8385269121813032, "grad_norm": 1.9099023373890425, "learning_rate": 9.045084971874738e-06, "loss": 1.1502, "step": 148 }, { "epoch": 0.8441926345609065, "grad_norm": 1.4883203974156398, "learning_rate": 9.03166206289754e-06, "loss": 1.1244, "step": 149 }, { "epoch": 0.8498583569405099, "grad_norm": 1.2439793782301596, "learning_rate": 9.018155577274891e-06, "loss": 1.1188, "step": 150 }, { "epoch": 0.8555240793201133, "grad_norm": 0.9842320904106822, "learning_rate": 9.004565794997209e-06, "loss": 1.0915, "step": 151 }, { "epoch": 0.8611898016997167, "grad_norm": 1.1256206443075392, "learning_rate": 8.990892997781661e-06, "loss": 1.1418, "step": 152 }, { "epoch": 0.8668555240793201, "grad_norm": 1.4668868690697237, "learning_rate": 8.977137469066321e-06, "loss": 1.1439, "step": 153 }, { "epoch": 0.8725212464589235, "grad_norm": 1.0357963651071045, "learning_rate": 8.963299494004292e-06, "loss": 1.1489, "step": 154 }, { "epoch": 0.8781869688385269, "grad_norm": 1.2279259538562963, "learning_rate": 8.949379359457795e-06, "loss": 1.148, "step": 155 }, { "epoch": 0.8838526912181303, "grad_norm": 1.279164021341607, "learning_rate": 8.935377353992222e-06, "loss": 1.1291, "step": 156 }, { "epoch": 0.8895184135977338, "grad_norm": 1.0117872914387078, "learning_rate": 8.921293767870157e-06, "loss": 1.1029, "step": 157 }, { "epoch": 0.8951841359773371, "grad_norm": 1.0385739682984056, "learning_rate": 8.907128893045359e-06, "loss": 1.1378, "step": 158 }, { "epoch": 0.9008498583569405, "grad_norm": 0.9862798736503189, "learning_rate": 8.892883023156703e-06, "loss": 1.1247, "step": 159 }, { "epoch": 0.9065155807365439, "grad_norm": 1.0052226052209343, "learning_rate": 8.8785564535221e-06, "loss": 1.1396, "step": 160 }, { "epoch": 0.9121813031161473, "grad_norm": 1.0025191403649947, "learning_rate": 8.86414948113237e-06, "loss": 1.1072, "step": 161 }, { "epoch": 0.9178470254957507, "grad_norm": 1.0190829556170014, "learning_rate": 8.849662404645097e-06, "loss": 1.0692, "step": 162 }, { "epoch": 0.9235127478753541, "grad_norm": 1.065083676666634, "learning_rate": 8.835095524378413e-06, "loss": 1.0839, "step": 163 }, { "epoch": 0.9291784702549575, "grad_norm": 2.75250829153078, "learning_rate": 8.820449142304805e-06, "loss": 1.0976, "step": 164 }, { "epoch": 0.9348441926345609, "grad_norm": 1.11457337735503, "learning_rate": 8.805723562044825e-06, "loss": 1.1383, "step": 165 }, { "epoch": 0.9405099150141643, "grad_norm": 1.223823647150824, "learning_rate": 8.790919088860815e-06, "loss": 1.1331, "step": 166 }, { "epoch": 0.9461756373937678, "grad_norm": 0.9688685956053592, "learning_rate": 8.776036029650573e-06, "loss": 1.1168, "step": 167 }, { "epoch": 0.9518413597733711, "grad_norm": 1.0407006447195224, "learning_rate": 8.76107469294099e-06, "loss": 1.1353, "step": 168 }, { "epoch": 0.9575070821529745, "grad_norm": 1.477166466547593, "learning_rate": 8.746035388881655e-06, "loss": 1.146, "step": 169 }, { "epoch": 0.9631728045325779, "grad_norm": 1.1923873158431406, "learning_rate": 8.730918429238429e-06, "loss": 1.1513, "step": 170 }, { "epoch": 0.9688385269121813, "grad_norm": 1.2104600261128056, "learning_rate": 8.715724127386971e-06, "loss": 1.0846, "step": 171 }, { "epoch": 0.9745042492917847, "grad_norm": 1.026649259168152, "learning_rate": 8.70045279830626e-06, "loss": 1.0987, "step": 172 }, { "epoch": 0.9801699716713881, "grad_norm": 1.1324270741577538, "learning_rate": 8.685104758572047e-06, "loss": 1.1884, "step": 173 }, { "epoch": 0.9858356940509915, "grad_norm": 1.1264630127825281, "learning_rate": 8.669680326350303e-06, "loss": 1.1505, "step": 174 }, { "epoch": 0.9915014164305949, "grad_norm": 1.0463584307162723, "learning_rate": 8.65417982139062e-06, "loss": 1.1194, "step": 175 }, { "epoch": 0.9971671388101983, "grad_norm": 1.1195551791308074, "learning_rate": 8.638603565019588e-06, "loss": 1.1228, "step": 176 }, { "epoch": 1.0113636363636365, "grad_norm": 1.7869848977800533, "learning_rate": 8.622951880134122e-06, "loss": 1.0017, "step": 177 }, { "epoch": 1.0170454545454546, "grad_norm": 1.8967548711721598, "learning_rate": 8.60722509119478e-06, "loss": 1.0646, "step": 178 }, { "epoch": 1.0227272727272727, "grad_norm": 2.7719840532515856, "learning_rate": 8.59142352421903e-06, "loss": 0.9887, "step": 179 }, { "epoch": 1.0284090909090908, "grad_norm": 1.8480101734746917, "learning_rate": 8.575547506774498e-06, "loss": 1.0262, "step": 180 }, { "epoch": 1.0340909090909092, "grad_norm": 1.4999444026158775, "learning_rate": 8.559597367972168e-06, "loss": 0.9829, "step": 181 }, { "epoch": 1.0397727272727273, "grad_norm": 1.38809085421665, "learning_rate": 8.543573438459573e-06, "loss": 1.0144, "step": 182 }, { "epoch": 1.0454545454545454, "grad_norm": 1.2624399470463477, "learning_rate": 8.527476050413922e-06, "loss": 0.9867, "step": 183 }, { "epoch": 1.0511363636363635, "grad_norm": 7.342610894443344, "learning_rate": 8.511305537535238e-06, "loss": 0.9866, "step": 184 }, { "epoch": 1.0568181818181819, "grad_norm": 8.705248219538825, "learning_rate": 8.49506223503941e-06, "loss": 0.9728, "step": 185 }, { "epoch": 1.0625, "grad_norm": 2.0263962989089936, "learning_rate": 8.47874647965128e-06, "loss": 0.9965, "step": 186 }, { "epoch": 1.0681818181818181, "grad_norm": 2.13351438929688, "learning_rate": 8.462358609597629e-06, "loss": 1.0024, "step": 187 }, { "epoch": 1.0738636363636365, "grad_norm": 2.0005753741817736, "learning_rate": 8.445898964600188e-06, "loss": 0.993, "step": 188 }, { "epoch": 1.0795454545454546, "grad_norm": 2.084050032615475, "learning_rate": 8.429367885868582e-06, "loss": 0.9958, "step": 189 }, { "epoch": 1.0852272727272727, "grad_norm": 1.7516330808766072, "learning_rate": 8.412765716093273e-06, "loss": 1.0554, "step": 190 }, { "epoch": 1.0909090909090908, "grad_norm": 1.2861019981619892, "learning_rate": 8.396092799438429e-06, "loss": 1.013, "step": 191 }, { "epoch": 1.0965909090909092, "grad_norm": 1.4381225932886976, "learning_rate": 8.379349481534822e-06, "loss": 0.9797, "step": 192 }, { "epoch": 1.1022727272727273, "grad_norm": 1.8623594079891328, "learning_rate": 8.362536109472637e-06, "loss": 1.0018, "step": 193 }, { "epoch": 1.1079545454545454, "grad_norm": 1.5115381108478676, "learning_rate": 8.345653031794292e-06, "loss": 1.016, "step": 194 }, { "epoch": 1.1136363636363635, "grad_norm": 1.193026650866575, "learning_rate": 8.328700598487203e-06, "loss": 0.9977, "step": 195 }, { "epoch": 1.1193181818181819, "grad_norm": 1.080840404605079, "learning_rate": 8.31167916097654e-06, "loss": 0.9982, "step": 196 }, { "epoch": 1.125, "grad_norm": 1.244418182887263, "learning_rate": 8.294589072117925e-06, "loss": 1.0206, "step": 197 }, { "epoch": 1.1306818181818181, "grad_norm": 1.054116651622593, "learning_rate": 8.277430686190137e-06, "loss": 0.9932, "step": 198 }, { "epoch": 1.1363636363636362, "grad_norm": 1.6708346020909142, "learning_rate": 8.260204358887753e-06, "loss": 0.9867, "step": 199 }, { "epoch": 1.1420454545454546, "grad_norm": 1.764380671950815, "learning_rate": 8.24291044731378e-06, "loss": 1.0255, "step": 200 }, { "epoch": 1.1477272727272727, "grad_norm": 1.4610852940462264, "learning_rate": 8.225549309972256e-06, "loss": 1.0016, "step": 201 }, { "epoch": 1.1534090909090908, "grad_norm": 1.3465974910520928, "learning_rate": 8.208121306760806e-06, "loss": 0.9942, "step": 202 }, { "epoch": 1.1590909090909092, "grad_norm": 3.407109598217383, "learning_rate": 8.190626798963198e-06, "loss": 0.9595, "step": 203 }, { "epoch": 1.1647727272727273, "grad_norm": 3.4569449045424228, "learning_rate": 8.173066149241839e-06, "loss": 0.9679, "step": 204 }, { "epoch": 1.1704545454545454, "grad_norm": 3.5722389574790623, "learning_rate": 8.155439721630265e-06, "loss": 1.0112, "step": 205 }, { "epoch": 1.1761363636363638, "grad_norm": 1.7368368324960894, "learning_rate": 8.137747881525593e-06, "loss": 0.9658, "step": 206 }, { "epoch": 1.1818181818181819, "grad_norm": 3.5425491105943365, "learning_rate": 8.119990995680942e-06, "loss": 1.0097, "step": 207 }, { "epoch": 1.1875, "grad_norm": 4.277519958399436, "learning_rate": 8.102169432197842e-06, "loss": 1.0525, "step": 208 }, { "epoch": 1.1931818181818181, "grad_norm": 1.5253776819790414, "learning_rate": 8.084283560518584e-06, "loss": 1.0257, "step": 209 }, { "epoch": 1.1988636363636362, "grad_norm": 2.393941181872517, "learning_rate": 8.066333751418582e-06, "loss": 0.9519, "step": 210 }, { "epoch": 1.2045454545454546, "grad_norm": 1.8648154402777406, "learning_rate": 8.048320376998675e-06, "loss": 1.0314, "step": 211 }, { "epoch": 1.2102272727272727, "grad_norm": 1.1560926115738988, "learning_rate": 8.030243810677408e-06, "loss": 1.0079, "step": 212 }, { "epoch": 1.2159090909090908, "grad_norm": 1.9861708806007312, "learning_rate": 8.012104427183313e-06, "loss": 0.9712, "step": 213 }, { "epoch": 1.2215909090909092, "grad_norm": 1.6176603802315128, "learning_rate": 7.993902602547113e-06, "loss": 1.0604, "step": 214 }, { "epoch": 1.2272727272727273, "grad_norm": 1.206136483858858, "learning_rate": 7.97563871409395e-06, "loss": 0.9968, "step": 215 }, { "epoch": 1.2329545454545454, "grad_norm": 1.0849650106469113, "learning_rate": 7.957313140435545e-06, "loss": 1.0013, "step": 216 }, { "epoch": 1.2386363636363638, "grad_norm": 1.2530592258144626, "learning_rate": 7.938926261462366e-06, "loss": 1.0392, "step": 217 }, { "epoch": 1.2443181818181819, "grad_norm": 1.4528013728950318, "learning_rate": 7.920478458335738e-06, "loss": 0.945, "step": 218 }, { "epoch": 1.25, "grad_norm": 1.1182010469150763, "learning_rate": 7.901970113479956e-06, "loss": 0.9755, "step": 219 }, { "epoch": 1.2556818181818181, "grad_norm": 1.274158214216111, "learning_rate": 7.883401610574338e-06, "loss": 0.9827, "step": 220 }, { "epoch": 1.2613636363636362, "grad_norm": 1.4460645426911298, "learning_rate": 7.86477333454529e-06, "loss": 1.0233, "step": 221 }, { "epoch": 1.2670454545454546, "grad_norm": 1.004043430975716, "learning_rate": 7.84608567155832e-06, "loss": 0.988, "step": 222 }, { "epoch": 1.2727272727272727, "grad_norm": 1.1277928768546195, "learning_rate": 7.82733900901003e-06, "loss": 1.0092, "step": 223 }, { "epoch": 1.2784090909090908, "grad_norm": 1.30174465678015, "learning_rate": 7.808533735520087e-06, "loss": 1.0023, "step": 224 }, { "epoch": 1.2840909090909092, "grad_norm": 1.155122280361969, "learning_rate": 7.789670240923169e-06, "loss": 0.9938, "step": 225 }, { "epoch": 1.2897727272727273, "grad_norm": 1.1535920929699675, "learning_rate": 7.770748916260875e-06, "loss": 1.0215, "step": 226 }, { "epoch": 1.2954545454545454, "grad_norm": 1.7495637702269113, "learning_rate": 7.751770153773635e-06, "loss": 0.9776, "step": 227 }, { "epoch": 1.3011363636363638, "grad_norm": 1.2776922576240242, "learning_rate": 7.732734346892561e-06, "loss": 0.9716, "step": 228 }, { "epoch": 1.3068181818181819, "grad_norm": 1.3172404492877499, "learning_rate": 7.71364189023131e-06, "loss": 0.9928, "step": 229 }, { "epoch": 1.3125, "grad_norm": 1.0320305867343866, "learning_rate": 7.69449317957788e-06, "loss": 0.9544, "step": 230 }, { "epoch": 1.3181818181818181, "grad_norm": 0.9917633137560159, "learning_rate": 7.675288611886423e-06, "loss": 0.9762, "step": 231 }, { "epoch": 1.3238636363636362, "grad_norm": 0.8750459875550817, "learning_rate": 7.656028585269017e-06, "loss": 0.9649, "step": 232 }, { "epoch": 1.3295454545454546, "grad_norm": 1.0172245413205394, "learning_rate": 7.636713498987405e-06, "loss": 0.9915, "step": 233 }, { "epoch": 1.3352272727272727, "grad_norm": 1.1026610095660114, "learning_rate": 7.617343753444714e-06, "loss": 0.9167, "step": 234 }, { "epoch": 1.3409090909090908, "grad_norm": 0.9838674494365538, "learning_rate": 7.597919750177168e-06, "loss": 0.9978, "step": 235 }, { "epoch": 1.3465909090909092, "grad_norm": 0.9922575875228704, "learning_rate": 7.5784418918457605e-06, "loss": 1.0052, "step": 236 }, { "epoch": 1.3522727272727273, "grad_norm": 0.9776223871792626, "learning_rate": 7.5589105822278944e-06, "loss": 1.0096, "step": 237 }, { "epoch": 1.3579545454545454, "grad_norm": 1.4258305295766374, "learning_rate": 7.539326226209032e-06, "loss": 1.0458, "step": 238 }, { "epoch": 1.3636363636363638, "grad_norm": 1.0015058561164187, "learning_rate": 7.519689229774282e-06, "loss": 1.0248, "step": 239 }, { "epoch": 1.3693181818181819, "grad_norm": 1.0082049852889665, "learning_rate": 7.500000000000001e-06, "loss": 0.9766, "step": 240 }, { "epoch": 1.375, "grad_norm": 0.904307095617801, "learning_rate": 7.4802589450453415e-06, "loss": 1.029, "step": 241 }, { "epoch": 1.3806818181818181, "grad_norm": 0.9709949750288794, "learning_rate": 7.4604664741437975e-06, "loss": 0.9803, "step": 242 }, { "epoch": 1.3863636363636362, "grad_norm": 0.9137049440782995, "learning_rate": 7.440622997594718e-06, "loss": 0.9838, "step": 243 }, { "epoch": 1.3920454545454546, "grad_norm": 0.955522616879317, "learning_rate": 7.420728926754803e-06, "loss": 0.9841, "step": 244 }, { "epoch": 1.3977272727272727, "grad_norm": 0.8924545271105511, "learning_rate": 7.400784674029579e-06, "loss": 0.9747, "step": 245 }, { "epoch": 1.4034090909090908, "grad_norm": 0.9275527221675671, "learning_rate": 7.380790652864842e-06, "loss": 1.0203, "step": 246 }, { "epoch": 1.4090909090909092, "grad_norm": 0.9480980891308645, "learning_rate": 7.360747277738094e-06, "loss": 0.9923, "step": 247 }, { "epoch": 1.4147727272727273, "grad_norm": 0.8427849664059336, "learning_rate": 7.340654964149947e-06, "loss": 0.9806, "step": 248 }, { "epoch": 1.4204545454545454, "grad_norm": 0.9076953250803492, "learning_rate": 7.320514128615511e-06, "loss": 0.9982, "step": 249 }, { "epoch": 1.4261363636363638, "grad_norm": 1.0540250139165377, "learning_rate": 7.300325188655762e-06, "loss": 0.9902, "step": 250 }, { "epoch": 1.4318181818181819, "grad_norm": 0.9954503040475974, "learning_rate": 7.280088562788879e-06, "loss": 0.9809, "step": 251 }, { "epoch": 1.4375, "grad_norm": 0.9967393104089797, "learning_rate": 7.259804670521579e-06, "loss": 1.0, "step": 252 }, { "epoch": 1.4431818181818181, "grad_norm": 0.9891797210154472, "learning_rate": 7.2394739323404105e-06, "loss": 1.0005, "step": 253 }, { "epoch": 1.4488636363636362, "grad_norm": 1.1178308003268749, "learning_rate": 7.219096769703045e-06, "loss": 0.9868, "step": 254 }, { "epoch": 1.4545454545454546, "grad_norm": 1.0000809761609377, "learning_rate": 7.198673605029529e-06, "loss": 0.9648, "step": 255 }, { "epoch": 1.4602272727272727, "grad_norm": 0.9396228245111997, "learning_rate": 7.178204861693546e-06, "loss": 1.0009, "step": 256 }, { "epoch": 1.4659090909090908, "grad_norm": 1.055214770002229, "learning_rate": 7.15769096401362e-06, "loss": 0.9478, "step": 257 }, { "epoch": 1.4715909090909092, "grad_norm": 1.0750160280057304, "learning_rate": 7.137132337244329e-06, "loss": 0.958, "step": 258 }, { "epoch": 1.4772727272727273, "grad_norm": 1.0648150711699151, "learning_rate": 7.116529407567489e-06, "loss": 0.9828, "step": 259 }, { "epoch": 1.4829545454545454, "grad_norm": 1.1192077304577122, "learning_rate": 7.095882602083321e-06, "loss": 0.9707, "step": 260 }, { "epoch": 1.4886363636363638, "grad_norm": 1.1092309283046025, "learning_rate": 7.075192348801591e-06, "loss": 0.9842, "step": 261 }, { "epoch": 1.4943181818181819, "grad_norm": 1.0585087928308756, "learning_rate": 7.054459076632742e-06, "loss": 1.0636, "step": 262 }, { "epoch": 1.5, "grad_norm": 1.041991357364786, "learning_rate": 7.033683215379002e-06, "loss": 0.9753, "step": 263 }, { "epoch": 1.5056818181818183, "grad_norm": 0.9720414152268064, "learning_rate": 7.012865195725473e-06, "loss": 0.9916, "step": 264 }, { "epoch": 1.5113636363636362, "grad_norm": 1.1265716150738212, "learning_rate": 6.9920054492312086e-06, "loss": 1.0678, "step": 265 }, { "epoch": 1.5170454545454546, "grad_norm": 1.0711823881169122, "learning_rate": 6.971104408320253e-06, "loss": 0.9776, "step": 266 }, { "epoch": 1.5227272727272727, "grad_norm": 1.1256078273217827, "learning_rate": 6.950162506272697e-06, "loss": 0.9904, "step": 267 }, { "epoch": 1.5284090909090908, "grad_norm": 0.9811471547098307, "learning_rate": 6.9291801772156775e-06, "loss": 0.987, "step": 268 }, { "epoch": 1.5340909090909092, "grad_norm": 1.205853115403329, "learning_rate": 6.9081578561143924e-06, "loss": 0.9352, "step": 269 }, { "epoch": 1.5397727272727273, "grad_norm": 0.9564252171879485, "learning_rate": 6.887095978763072e-06, "loss": 1.0099, "step": 270 }, { "epoch": 1.5454545454545454, "grad_norm": 0.9739638011221726, "learning_rate": 6.865994981775958e-06, "loss": 0.9186, "step": 271 }, { "epoch": 1.5511363636363638, "grad_norm": 1.3776679228140132, "learning_rate": 6.844855302578236e-06, "loss": 1.0077, "step": 272 }, { "epoch": 1.5568181818181817, "grad_norm": 1.0125445825014543, "learning_rate": 6.823677379396984e-06, "loss": 0.9993, "step": 273 }, { "epoch": 1.5625, "grad_norm": 0.9892499359106408, "learning_rate": 6.802461651252073e-06, "loss": 0.9571, "step": 274 }, { "epoch": 1.5681818181818183, "grad_norm": 1.0831674501266864, "learning_rate": 6.781208557947085e-06, "loss": 1.0061, "step": 275 }, { "epoch": 1.5738636363636362, "grad_norm": 0.9356751500366064, "learning_rate": 6.759918540060173e-06, "loss": 0.979, "step": 276 }, { "epoch": 1.5795454545454546, "grad_norm": 1.0557115003350075, "learning_rate": 6.738592038934946e-06, "loss": 0.9961, "step": 277 }, { "epoch": 1.5852272727272727, "grad_norm": 1.2599637679261655, "learning_rate": 6.717229496671307e-06, "loss": 0.9753, "step": 278 }, { "epoch": 1.5909090909090908, "grad_norm": 1.0507134323091725, "learning_rate": 6.6958313561163046e-06, "loss": 0.9425, "step": 279 }, { "epoch": 1.5965909090909092, "grad_norm": 0.9631905231298211, "learning_rate": 6.674398060854931e-06, "loss": 1.055, "step": 280 }, { "epoch": 1.6022727272727273, "grad_norm": 0.9131560827453628, "learning_rate": 6.652930055200948e-06, "loss": 0.9929, "step": 281 }, { "epoch": 1.6079545454545454, "grad_norm": 0.9138134537225251, "learning_rate": 6.631427784187658e-06, "loss": 0.952, "step": 282 }, { "epoch": 1.6136363636363638, "grad_norm": 0.9436608998471452, "learning_rate": 6.609891693558692e-06, "loss": 1.0371, "step": 283 }, { "epoch": 1.6193181818181817, "grad_norm": 1.077730549555469, "learning_rate": 6.588322229758764e-06, "loss": 1.0231, "step": 284 }, { "epoch": 1.625, "grad_norm": 0.8542525239275349, "learning_rate": 6.566719839924412e-06, "loss": 0.9908, "step": 285 }, { "epoch": 1.6306818181818183, "grad_norm": 0.9390889918397101, "learning_rate": 6.545084971874738e-06, "loss": 0.9965, "step": 286 }, { "epoch": 1.6363636363636362, "grad_norm": 1.3403721698995363, "learning_rate": 6.523418074102117e-06, "loss": 0.9865, "step": 287 }, { "epoch": 1.6420454545454546, "grad_norm": 0.9787534693003979, "learning_rate": 6.501719595762903e-06, "loss": 0.995, "step": 288 }, { "epoch": 1.6477272727272727, "grad_norm": 0.8866152592349634, "learning_rate": 6.479989986668118e-06, "loss": 0.9846, "step": 289 }, { "epoch": 1.6534090909090908, "grad_norm": 0.8915138418235523, "learning_rate": 6.458229697274125e-06, "loss": 1.0373, "step": 290 }, { "epoch": 1.6590909090909092, "grad_norm": 0.9633872591030624, "learning_rate": 6.436439178673296e-06, "loss": 0.9864, "step": 291 }, { "epoch": 1.6647727272727273, "grad_norm": 0.9836814915125117, "learning_rate": 6.41461888258465e-06, "loss": 0.9555, "step": 292 }, { "epoch": 1.6704545454545454, "grad_norm": 0.9708188501717393, "learning_rate": 6.392769261344502e-06, "loss": 0.9448, "step": 293 }, { "epoch": 1.6761363636363638, "grad_norm": 0.8777800692748914, "learning_rate": 6.370890767897078e-06, "loss": 1.0044, "step": 294 }, { "epoch": 1.6818181818181817, "grad_norm": 1.0244121250661828, "learning_rate": 6.348983855785122e-06, "loss": 0.9802, "step": 295 }, { "epoch": 1.6875, "grad_norm": 1.0027302545771752, "learning_rate": 6.3270489791405055e-06, "loss": 0.9562, "step": 296 }, { "epoch": 1.6931818181818183, "grad_norm": 1.7051161806513946, "learning_rate": 6.305086592674802e-06, "loss": 0.9892, "step": 297 }, { "epoch": 1.6988636363636362, "grad_norm": 1.12580729447642, "learning_rate": 6.283097151669869e-06, "loss": 0.9821, "step": 298 }, { "epoch": 1.7045454545454546, "grad_norm": 0.9839470381373491, "learning_rate": 6.261081111968403e-06, "loss": 0.9916, "step": 299 }, { "epoch": 1.7102272727272727, "grad_norm": 1.0613072641616672, "learning_rate": 6.2390389299645e-06, "loss": 0.9783, "step": 300 }, { "epoch": 1.7159090909090908, "grad_norm": 0.9792881716793711, "learning_rate": 6.216971062594179e-06, "loss": 1.0007, "step": 301 }, { "epoch": 1.7215909090909092, "grad_norm": 1.1054016241161089, "learning_rate": 6.1948779673259256e-06, "loss": 1.0079, "step": 302 }, { "epoch": 1.7272727272727273, "grad_norm": 1.2013950643084332, "learning_rate": 6.172760102151195e-06, "loss": 1.0137, "step": 303 }, { "epoch": 1.7329545454545454, "grad_norm": 1.0486842583129228, "learning_rate": 6.1506179255749335e-06, "loss": 0.9611, "step": 304 }, { "epoch": 1.7386363636363638, "grad_norm": 0.9879084512426718, "learning_rate": 6.128451896606054e-06, "loss": 0.987, "step": 305 }, { "epoch": 1.7443181818181817, "grad_norm": 0.8702171126549813, "learning_rate": 6.106262474747939e-06, "loss": 1.0354, "step": 306 }, { "epoch": 1.75, "grad_norm": 0.9479994120475482, "learning_rate": 6.084050119988905e-06, "loss": 0.9687, "step": 307 }, { "epoch": 1.7556818181818183, "grad_norm": 0.841865035975423, "learning_rate": 6.061815292792666e-06, "loss": 0.9692, "step": 308 }, { "epoch": 1.7613636363636362, "grad_norm": 1.1986107322286728, "learning_rate": 6.039558454088796e-06, "loss": 0.9869, "step": 309 }, { "epoch": 1.7670454545454546, "grad_norm": 0.9606223972077408, "learning_rate": 6.0172800652631706e-06, "loss": 1.0164, "step": 310 }, { "epoch": 1.7727272727272727, "grad_norm": 0.8967627253652938, "learning_rate": 5.994980588148391e-06, "loss": 1.043, "step": 311 }, { "epoch": 1.7784090909090908, "grad_norm": 0.7941576266062421, "learning_rate": 5.972660485014231e-06, "loss": 0.9485, "step": 312 }, { "epoch": 1.7840909090909092, "grad_norm": 1.0936763123716517, "learning_rate": 5.950320218558037e-06, "loss": 0.9886, "step": 313 }, { "epoch": 1.7897727272727273, "grad_norm": 1.0795280588915757, "learning_rate": 5.927960251895146e-06, "loss": 1.0174, "step": 314 }, { "epoch": 1.7954545454545454, "grad_norm": 0.8880700856278866, "learning_rate": 5.905581048549279e-06, "loss": 0.9825, "step": 315 }, { "epoch": 1.8011363636363638, "grad_norm": 0.8742464433982793, "learning_rate": 5.883183072442938e-06, "loss": 0.9392, "step": 316 }, { "epoch": 1.8068181818181817, "grad_norm": 0.9015845437433646, "learning_rate": 5.860766787887781e-06, "loss": 0.9507, "step": 317 }, { "epoch": 1.8125, "grad_norm": 0.8777902350206828, "learning_rate": 5.838332659575005e-06, "loss": 1.0214, "step": 318 }, { "epoch": 1.8181818181818183, "grad_norm": 0.9432419707404883, "learning_rate": 5.815881152565712e-06, "loss": 0.9913, "step": 319 }, { "epoch": 1.8238636363636362, "grad_norm": 1.554034736388586, "learning_rate": 5.793412732281258e-06, "loss": 0.9762, "step": 320 }, { "epoch": 1.8295454545454546, "grad_norm": 0.9581038943273897, "learning_rate": 5.7709278644936164e-06, "loss": 0.9848, "step": 321 }, { "epoch": 1.8352272727272727, "grad_norm": 0.8898637306384684, "learning_rate": 5.7484270153157215e-06, "loss": 0.9396, "step": 322 }, { "epoch": 1.8409090909090908, "grad_norm": 1.0203919143753812, "learning_rate": 5.725910651191798e-06, "loss": 1.0037, "step": 323 }, { "epoch": 1.8465909090909092, "grad_norm": 0.8907537657379099, "learning_rate": 5.703379238887703e-06, "loss": 0.9609, "step": 324 }, { "epoch": 1.8522727272727273, "grad_norm": 1.114214216754724, "learning_rate": 5.680833245481234e-06, "loss": 0.9412, "step": 325 }, { "epoch": 1.8579545454545454, "grad_norm": 1.0249614863719094, "learning_rate": 5.6582731383524625e-06, "loss": 1.0452, "step": 326 }, { "epoch": 1.8636363636363638, "grad_norm": 0.9715196988270898, "learning_rate": 5.63569938517404e-06, "loss": 1.0453, "step": 327 }, { "epoch": 1.8693181818181817, "grad_norm": 1.1613903786334339, "learning_rate": 5.613112453901493e-06, "loss": 0.9735, "step": 328 }, { "epoch": 1.875, "grad_norm": 1.059608988677026, "learning_rate": 5.590512812763541e-06, "loss": 0.9618, "step": 329 }, { "epoch": 1.8806818181818183, "grad_norm": 1.0952964220643884, "learning_rate": 5.567900930252375e-06, "loss": 0.9793, "step": 330 }, { "epoch": 1.8863636363636362, "grad_norm": 1.014146750998599, "learning_rate": 5.5452772751139496e-06, "loss": 0.9863, "step": 331 }, { "epoch": 1.8920454545454546, "grad_norm": 0.9663339556094782, "learning_rate": 5.522642316338268e-06, "loss": 1.0089, "step": 332 }, { "epoch": 1.8977272727272727, "grad_norm": 0.9872369642699137, "learning_rate": 5.49999652314966e-06, "loss": 1.0105, "step": 333 }, { "epoch": 1.9034090909090908, "grad_norm": 0.9388637738282897, "learning_rate": 5.477340364997051e-06, "loss": 0.9993, "step": 334 }, { "epoch": 1.9090909090909092, "grad_norm": 1.005111659331097, "learning_rate": 5.454674311544236e-06, "loss": 1.024, "step": 335 }, { "epoch": 1.9147727272727273, "grad_norm": 1.1189249784542552, "learning_rate": 5.431998832660136e-06, "loss": 0.9167, "step": 336 }, { "epoch": 1.9204545454545454, "grad_norm": 0.8754985353482484, "learning_rate": 5.409314398409067e-06, "loss": 0.9509, "step": 337 }, { "epoch": 1.9261363636363638, "grad_norm": 1.0077105144422567, "learning_rate": 5.386621479040985e-06, "loss": 0.9802, "step": 338 }, { "epoch": 1.9318181818181817, "grad_norm": 1.014077284312571, "learning_rate": 5.363920544981749e-06, "loss": 1.0046, "step": 339 }, { "epoch": 1.9375, "grad_norm": 0.8813929725147835, "learning_rate": 5.341212066823356e-06, "loss": 1.006, "step": 340 }, { "epoch": 1.9431818181818183, "grad_norm": 0.9749444900176537, "learning_rate": 5.3184965153142e-06, "loss": 0.987, "step": 341 }, { "epoch": 1.9488636363636362, "grad_norm": 0.9433156213620226, "learning_rate": 5.295774361349299e-06, "loss": 0.9846, "step": 342 }, { "epoch": 1.9545454545454546, "grad_norm": 0.9268456057648533, "learning_rate": 5.27304607596055e-06, "loss": 0.9845, "step": 343 }, { "epoch": 1.9602272727272727, "grad_norm": 0.8554873129583374, "learning_rate": 5.250312130306946e-06, "loss": 0.9835, "step": 344 }, { "epoch": 1.9659090909090908, "grad_norm": 1.018982780208351, "learning_rate": 5.227572995664819e-06, "loss": 0.9825, "step": 345 }, { "epoch": 1.9715909090909092, "grad_norm": 0.9391997048223797, "learning_rate": 5.204829143418072e-06, "loss": 1.0199, "step": 346 }, { "epoch": 1.9772727272727273, "grad_norm": 1.0146418881124983, "learning_rate": 5.182081045048404e-06, "loss": 1.0376, "step": 347 }, { "epoch": 1.9829545454545454, "grad_norm": 1.0574567491158355, "learning_rate": 5.159329172125533e-06, "loss": 0.9434, "step": 348 }, { "epoch": 1.9886363636363638, "grad_norm": 0.8123284335215641, "learning_rate": 5.136573996297431e-06, "loss": 0.9802, "step": 349 }, { "epoch": 1.9943181818181817, "grad_norm": 0.9618851741092689, "learning_rate": 5.113815989280528e-06, "loss": 1.0419, "step": 350 }, { "epoch": 2.0, "grad_norm": 0.8632945643175781, "learning_rate": 5.091055622849958e-06, "loss": 0.976, "step": 351 }, { "epoch": 2.005698005698006, "grad_norm": 1.6043377134817856, "learning_rate": 5.068293368829755e-06, "loss": 0.8913, "step": 352 }, { "epoch": 2.011396011396011, "grad_norm": 1.3331364304662667, "learning_rate": 5.045529699083092e-06, "loss": 0.8424, "step": 353 }, { "epoch": 2.017094017094017, "grad_norm": 1.100343372994173, "learning_rate": 5.022765085502478e-06, "loss": 0.8664, "step": 354 }, { "epoch": 2.022792022792023, "grad_norm": 1.2647408619538267, "learning_rate": 5e-06, "loss": 0.8975, "step": 355 }, { "epoch": 2.0284900284900287, "grad_norm": 1.3692030374819484, "learning_rate": 4.977234914497522e-06, "loss": 0.8659, "step": 356 }, { "epoch": 2.034188034188034, "grad_norm": 1.04165152843705, "learning_rate": 4.9544703009169115e-06, "loss": 0.8465, "step": 357 }, { "epoch": 2.03988603988604, "grad_norm": 1.069447973622135, "learning_rate": 4.931706631170246e-06, "loss": 0.8254, "step": 358 }, { "epoch": 2.0455840455840457, "grad_norm": 1.1882943942044963, "learning_rate": 4.9089443771500435e-06, "loss": 0.8759, "step": 359 }, { "epoch": 2.051282051282051, "grad_norm": 0.9445235142025882, "learning_rate": 4.886184010719472e-06, "loss": 0.8761, "step": 360 }, { "epoch": 2.056980056980057, "grad_norm": 0.9617221724763185, "learning_rate": 4.863426003702572e-06, "loss": 0.822, "step": 361 }, { "epoch": 2.0626780626780628, "grad_norm": 0.9901232814378744, "learning_rate": 4.840670827874468e-06, "loss": 0.8423, "step": 362 }, { "epoch": 2.0683760683760686, "grad_norm": 0.8710776051974528, "learning_rate": 4.817918954951598e-06, "loss": 0.8415, "step": 363 }, { "epoch": 2.074074074074074, "grad_norm": 1.2482792899259578, "learning_rate": 4.795170856581929e-06, "loss": 0.8921, "step": 364 }, { "epoch": 2.07977207977208, "grad_norm": 1.1169049347453446, "learning_rate": 4.772427004335183e-06, "loss": 0.8731, "step": 365 }, { "epoch": 2.0854700854700856, "grad_norm": 1.0557231424552356, "learning_rate": 4.749687869693056e-06, "loss": 0.8622, "step": 366 }, { "epoch": 2.091168091168091, "grad_norm": 0.9181343036612701, "learning_rate": 4.7269539240394505e-06, "loss": 0.8653, "step": 367 }, { "epoch": 2.096866096866097, "grad_norm": 0.9543401797100639, "learning_rate": 4.7042256386507e-06, "loss": 0.8419, "step": 368 }, { "epoch": 2.1025641025641026, "grad_norm": 1.192131842860604, "learning_rate": 4.681503484685803e-06, "loss": 0.9153, "step": 369 }, { "epoch": 2.1082621082621085, "grad_norm": 0.9650701175336839, "learning_rate": 4.6587879331766465e-06, "loss": 0.8422, "step": 370 }, { "epoch": 2.113960113960114, "grad_norm": 0.9343115020962703, "learning_rate": 4.636079455018253e-06, "loss": 0.8433, "step": 371 }, { "epoch": 2.1196581196581197, "grad_norm": 0.9058357605337869, "learning_rate": 4.613378520959016e-06, "loss": 0.8587, "step": 372 }, { "epoch": 2.1253561253561255, "grad_norm": 0.9303289966062062, "learning_rate": 4.5906856015909365e-06, "loss": 0.8799, "step": 373 }, { "epoch": 2.131054131054131, "grad_norm": 0.9993338551104146, "learning_rate": 4.568001167339866e-06, "loss": 0.8789, "step": 374 }, { "epoch": 2.1367521367521367, "grad_norm": 1.003313234824171, "learning_rate": 4.545325688455766e-06, "loss": 0.8285, "step": 375 }, { "epoch": 2.1424501424501425, "grad_norm": 0.9365672809002463, "learning_rate": 4.52265963500295e-06, "loss": 0.8561, "step": 376 }, { "epoch": 2.148148148148148, "grad_norm": 0.8712507036248811, "learning_rate": 4.500003476850341e-06, "loss": 0.8262, "step": 377 }, { "epoch": 2.1538461538461537, "grad_norm": 0.9228004881023822, "learning_rate": 4.477357683661734e-06, "loss": 0.8766, "step": 378 }, { "epoch": 2.1595441595441596, "grad_norm": 1.057083805253911, "learning_rate": 4.454722724886051e-06, "loss": 0.8653, "step": 379 }, { "epoch": 2.1652421652421654, "grad_norm": 0.9682059205532203, "learning_rate": 4.432099069747625e-06, "loss": 0.8305, "step": 380 }, { "epoch": 2.1709401709401708, "grad_norm": 0.7938300778290989, "learning_rate": 4.40948718723646e-06, "loss": 0.8526, "step": 381 }, { "epoch": 2.1766381766381766, "grad_norm": 0.992854757801764, "learning_rate": 4.386887546098509e-06, "loss": 0.7915, "step": 382 }, { "epoch": 2.1823361823361824, "grad_norm": 1.1405534353610247, "learning_rate": 4.364300614825963e-06, "loss": 0.8756, "step": 383 }, { "epoch": 2.1880341880341883, "grad_norm": 0.9074206322121355, "learning_rate": 4.341726861647537e-06, "loss": 0.8786, "step": 384 }, { "epoch": 2.1937321937321936, "grad_norm": 0.9106405803513904, "learning_rate": 4.319166754518768e-06, "loss": 0.8736, "step": 385 }, { "epoch": 2.1994301994301995, "grad_norm": 0.9498694178857152, "learning_rate": 4.296620761112299e-06, "loss": 0.8382, "step": 386 }, { "epoch": 2.2051282051282053, "grad_norm": 0.9662171207890898, "learning_rate": 4.274089348808202e-06, "loss": 0.846, "step": 387 }, { "epoch": 2.2108262108262107, "grad_norm": 0.9597347828021979, "learning_rate": 4.251572984684281e-06, "loss": 0.8565, "step": 388 }, { "epoch": 2.2165242165242165, "grad_norm": 1.0199048543960996, "learning_rate": 4.229072135506384e-06, "loss": 0.8634, "step": 389 }, { "epoch": 2.2222222222222223, "grad_norm": 0.8699841121610784, "learning_rate": 4.206587267718743e-06, "loss": 0.8704, "step": 390 }, { "epoch": 2.2279202279202277, "grad_norm": 0.9870860597778771, "learning_rate": 4.18411884743429e-06, "loss": 0.9155, "step": 391 }, { "epoch": 2.2336182336182335, "grad_norm": 0.9765675083733482, "learning_rate": 4.161667340424996e-06, "loss": 0.9111, "step": 392 }, { "epoch": 2.2393162393162394, "grad_norm": 1.0450993205368777, "learning_rate": 4.139233212112221e-06, "loss": 0.8791, "step": 393 }, { "epoch": 2.245014245014245, "grad_norm": 1.1146726034384589, "learning_rate": 4.116816927557063e-06, "loss": 0.8808, "step": 394 }, { "epoch": 2.2507122507122506, "grad_norm": 0.9072001670881498, "learning_rate": 4.094418951450721e-06, "loss": 0.855, "step": 395 }, { "epoch": 2.2564102564102564, "grad_norm": 0.928713607803712, "learning_rate": 4.072039748104856e-06, "loss": 0.8895, "step": 396 }, { "epoch": 2.262108262108262, "grad_norm": 0.9633556898613354, "learning_rate": 4.0496797814419655e-06, "loss": 0.8809, "step": 397 }, { "epoch": 2.267806267806268, "grad_norm": 0.8844497867372285, "learning_rate": 4.0273395149857705e-06, "loss": 0.841, "step": 398 }, { "epoch": 2.2735042735042734, "grad_norm": 0.9239145256816056, "learning_rate": 4.0050194118516095e-06, "loss": 0.8251, "step": 399 }, { "epoch": 2.2792022792022792, "grad_norm": 1.1068686883079584, "learning_rate": 3.982719934736832e-06, "loss": 0.8515, "step": 400 }, { "epoch": 2.284900284900285, "grad_norm": 1.178223126387429, "learning_rate": 3.960441545911205e-06, "loss": 0.886, "step": 401 }, { "epoch": 2.2905982905982905, "grad_norm": 0.8243442773624833, "learning_rate": 3.9381847072073346e-06, "loss": 0.8073, "step": 402 }, { "epoch": 2.2962962962962963, "grad_norm": 0.8877251522703663, "learning_rate": 3.915949880011096e-06, "loss": 0.8376, "step": 403 }, { "epoch": 2.301994301994302, "grad_norm": 1.1086289853786166, "learning_rate": 3.893737525252063e-06, "loss": 0.835, "step": 404 }, { "epoch": 2.3076923076923075, "grad_norm": 0.9736495968403257, "learning_rate": 3.871548103393947e-06, "loss": 0.8366, "step": 405 }, { "epoch": 2.3133903133903133, "grad_norm": 0.883727910369667, "learning_rate": 3.849382074425069e-06, "loss": 0.8788, "step": 406 }, { "epoch": 2.319088319088319, "grad_norm": 0.9302042209091447, "learning_rate": 3.827239897848805e-06, "loss": 0.8105, "step": 407 }, { "epoch": 2.324786324786325, "grad_norm": 0.9816375724049557, "learning_rate": 3.805122032674077e-06, "loss": 0.8801, "step": 408 }, { "epoch": 2.3304843304843303, "grad_norm": 0.9068093342113286, "learning_rate": 3.7830289374058214e-06, "loss": 0.8926, "step": 409 }, { "epoch": 2.336182336182336, "grad_norm": 0.970100166469761, "learning_rate": 3.7609610700355014e-06, "loss": 0.8172, "step": 410 }, { "epoch": 2.341880341880342, "grad_norm": 0.8283355970207111, "learning_rate": 3.7389188880315962e-06, "loss": 0.8541, "step": 411 }, { "epoch": 2.347578347578348, "grad_norm": 0.836387825954222, "learning_rate": 3.7169028483301333e-06, "loss": 0.8566, "step": 412 }, { "epoch": 2.353276353276353, "grad_norm": 0.9704274187846976, "learning_rate": 3.6949134073251993e-06, "loss": 0.856, "step": 413 }, { "epoch": 2.358974358974359, "grad_norm": 0.8667279540573334, "learning_rate": 3.6729510208594954e-06, "loss": 0.896, "step": 414 }, { "epoch": 2.364672364672365, "grad_norm": 0.9194321407732738, "learning_rate": 3.6510161442148783e-06, "loss": 0.8993, "step": 415 }, { "epoch": 2.3703703703703702, "grad_norm": 0.8956254209520699, "learning_rate": 3.6291092321029244e-06, "loss": 0.871, "step": 416 }, { "epoch": 2.376068376068376, "grad_norm": 0.8944268521885398, "learning_rate": 3.6072307386554983e-06, "loss": 0.8958, "step": 417 }, { "epoch": 2.381766381766382, "grad_norm": 0.8881931841978906, "learning_rate": 3.58538111741535e-06, "loss": 0.8718, "step": 418 }, { "epoch": 2.3874643874643873, "grad_norm": 0.8513595068343849, "learning_rate": 3.5635608213267063e-06, "loss": 0.8484, "step": 419 }, { "epoch": 2.393162393162393, "grad_norm": 0.9552616565495209, "learning_rate": 3.5417703027258752e-06, "loss": 0.8576, "step": 420 }, { "epoch": 2.398860398860399, "grad_norm": 0.884306660742374, "learning_rate": 3.5200100133318836e-06, "loss": 0.8623, "step": 421 }, { "epoch": 2.4045584045584047, "grad_norm": 0.8217549127604973, "learning_rate": 3.4982804042370977e-06, "loss": 0.8789, "step": 422 }, { "epoch": 2.41025641025641, "grad_norm": 0.9177953454550434, "learning_rate": 3.476581925897885e-06, "loss": 0.8761, "step": 423 }, { "epoch": 2.415954415954416, "grad_norm": 0.9191232531329524, "learning_rate": 3.4549150281252635e-06, "loss": 0.8381, "step": 424 }, { "epoch": 2.421652421652422, "grad_norm": 0.8942193186940697, "learning_rate": 3.4332801600755895e-06, "loss": 0.9022, "step": 425 }, { "epoch": 2.427350427350427, "grad_norm": 1.2155759035608542, "learning_rate": 3.4116777702412374e-06, "loss": 0.8673, "step": 426 }, { "epoch": 2.433048433048433, "grad_norm": 0.928253119658496, "learning_rate": 3.39010830644131e-06, "loss": 0.8412, "step": 427 }, { "epoch": 2.438746438746439, "grad_norm": 0.8976369958824371, "learning_rate": 3.3685722158123435e-06, "loss": 0.8572, "step": 428 }, { "epoch": 2.4444444444444446, "grad_norm": 0.9342007055562026, "learning_rate": 3.3470699447990527e-06, "loss": 0.8389, "step": 429 }, { "epoch": 2.45014245014245, "grad_norm": 0.9368785720862421, "learning_rate": 3.3256019391450696e-06, "loss": 0.8447, "step": 430 }, { "epoch": 2.455840455840456, "grad_norm": 0.8602147398886509, "learning_rate": 3.3041686438836984e-06, "loss": 0.8314, "step": 431 }, { "epoch": 2.4615384615384617, "grad_norm": 0.7971529130684335, "learning_rate": 3.2827705033286937e-06, "loss": 0.8075, "step": 432 }, { "epoch": 2.467236467236467, "grad_norm": 0.9022354930189497, "learning_rate": 3.261407961065056e-06, "loss": 0.864, "step": 433 }, { "epoch": 2.472934472934473, "grad_norm": 0.8412103377280404, "learning_rate": 3.2400814599398283e-06, "loss": 0.825, "step": 434 }, { "epoch": 2.4786324786324787, "grad_norm": 0.963324698161768, "learning_rate": 3.2187914420529176e-06, "loss": 0.8245, "step": 435 }, { "epoch": 2.484330484330484, "grad_norm": 0.8974616882015672, "learning_rate": 3.197538348747927e-06, "loss": 0.8574, "step": 436 }, { "epoch": 2.49002849002849, "grad_norm": 0.8375456208735425, "learning_rate": 3.176322620603018e-06, "loss": 0.8567, "step": 437 }, { "epoch": 2.4957264957264957, "grad_norm": 0.8637885686817552, "learning_rate": 3.1551446974217643e-06, "loss": 0.8348, "step": 438 }, { "epoch": 2.5014245014245016, "grad_norm": 0.8964567431940926, "learning_rate": 3.1340050182240438e-06, "loss": 0.8614, "step": 439 }, { "epoch": 2.5071225071225074, "grad_norm": 1.0153388506539311, "learning_rate": 3.1129040212369286e-06, "loss": 0.8288, "step": 440 } ], "logging_steps": 1, "max_steps": 700, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 88, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.6500642935537664e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }