{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.984771573604061, "eval_steps": 500, "global_step": 441, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00676818950930626, "grad_norm": 30.061185836791992, "learning_rate": 1.4285714285714286e-06, "loss": 3.5407, "step": 1 }, { "epoch": 0.01353637901861252, "grad_norm": 45.98220443725586, "learning_rate": 2.8571428571428573e-06, "loss": 3.8623, "step": 2 }, { "epoch": 0.02030456852791878, "grad_norm": 52.36494064331055, "learning_rate": 4.2857142857142855e-06, "loss": 4.0388, "step": 3 }, { "epoch": 0.02707275803722504, "grad_norm": 38.702606201171875, "learning_rate": 5.7142857142857145e-06, "loss": 3.9038, "step": 4 }, { "epoch": 0.0338409475465313, "grad_norm": 39.755348205566406, "learning_rate": 7.1428571428571436e-06, "loss": 3.7208, "step": 5 }, { "epoch": 0.04060913705583756, "grad_norm": 28.482959747314453, "learning_rate": 8.571428571428571e-06, "loss": 3.699, "step": 6 }, { "epoch": 0.047377326565143825, "grad_norm": 33.6556510925293, "learning_rate": 1e-05, "loss": 3.2901, "step": 7 }, { "epoch": 0.05414551607445008, "grad_norm": 10.973630905151367, "learning_rate": 1.1428571428571429e-05, "loss": 3.2131, "step": 8 }, { "epoch": 0.06091370558375635, "grad_norm": 8.036073684692383, "learning_rate": 1.2857142857142859e-05, "loss": 3.2309, "step": 9 }, { "epoch": 0.0676818950930626, "grad_norm": 7.630257606506348, "learning_rate": 1.4285714285714287e-05, "loss": 3.2666, "step": 10 }, { "epoch": 0.07445008460236886, "grad_norm": 7.535837650299072, "learning_rate": 1.5714285714285715e-05, "loss": 3.1925, "step": 11 }, { "epoch": 0.08121827411167512, "grad_norm": 10.59571361541748, "learning_rate": 1.7142857142857142e-05, "loss": 3.1352, "step": 12 }, { "epoch": 0.08798646362098139, "grad_norm": 8.021005630493164, "learning_rate": 1.8571428571428575e-05, "loss": 2.8961, "step": 13 }, { "epoch": 0.09475465313028765, "grad_norm": 5.240553855895996, "learning_rate": 2e-05, "loss": 3.0844, "step": 14 }, { "epoch": 0.10152284263959391, "grad_norm": 5.916426658630371, "learning_rate": 1.9999729347501484e-05, "loss": 3.1558, "step": 15 }, { "epoch": 0.10829103214890017, "grad_norm": 5.597356796264648, "learning_rate": 1.9998917404656488e-05, "loss": 3.091, "step": 16 }, { "epoch": 0.11505922165820642, "grad_norm": 6.339421272277832, "learning_rate": 1.9997564215415886e-05, "loss": 3.1522, "step": 17 }, { "epoch": 0.1218274111675127, "grad_norm": 8.770332336425781, "learning_rate": 1.9995669853028485e-05, "loss": 3.1213, "step": 18 }, { "epoch": 0.12859560067681894, "grad_norm": 8.278801918029785, "learning_rate": 1.9993234420037072e-05, "loss": 3.1599, "step": 19 }, { "epoch": 0.1353637901861252, "grad_norm": 4.014164447784424, "learning_rate": 1.999025804827285e-05, "loss": 2.9919, "step": 20 }, { "epoch": 0.14213197969543148, "grad_norm": 4.980748653411865, "learning_rate": 1.9986740898848306e-05, "loss": 3.0508, "step": 21 }, { "epoch": 0.14890016920473773, "grad_norm": 4.607733249664307, "learning_rate": 1.99826831621485e-05, "loss": 3.0403, "step": 22 }, { "epoch": 0.155668358714044, "grad_norm": 4.822920799255371, "learning_rate": 1.997808505782075e-05, "loss": 3.1426, "step": 23 }, { "epoch": 0.16243654822335024, "grad_norm": 6.66705846786499, "learning_rate": 1.9972946834762732e-05, "loss": 3.1822, "step": 24 }, { "epoch": 0.1692047377326565, "grad_norm": 8.534043312072754, "learning_rate": 1.9967268771109037e-05, "loss": 3.0409, "step": 25 }, 
{ "epoch": 0.17597292724196278, "grad_norm": 5.848859786987305, "learning_rate": 1.996105117421608e-05, "loss": 2.9325, "step": 26 }, { "epoch": 0.18274111675126903, "grad_norm": 3.849553108215332, "learning_rate": 1.9954294380645497e-05, "loss": 2.9975, "step": 27 }, { "epoch": 0.1895093062605753, "grad_norm": 5.086816787719727, "learning_rate": 1.9946998756145894e-05, "loss": 3.0159, "step": 28 }, { "epoch": 0.19627749576988154, "grad_norm": 5.160210132598877, "learning_rate": 1.9939164695633067e-05, "loss": 3.1489, "step": 29 }, { "epoch": 0.20304568527918782, "grad_norm": 5.439467430114746, "learning_rate": 1.9930792623168638e-05, "loss": 3.09, "step": 30 }, { "epoch": 0.2098138747884941, "grad_norm": 6.836553573608398, "learning_rate": 1.992188299193706e-05, "loss": 3.1017, "step": 31 }, { "epoch": 0.21658206429780033, "grad_norm": 6.904626846313477, "learning_rate": 1.9912436284221134e-05, "loss": 2.798, "step": 32 }, { "epoch": 0.2233502538071066, "grad_norm": 3.7659807205200195, "learning_rate": 1.9902453011375865e-05, "loss": 2.9905, "step": 33 }, { "epoch": 0.23011844331641285, "grad_norm": 4.325652122497559, "learning_rate": 1.98919337138008e-05, "loss": 3.004, "step": 34 }, { "epoch": 0.23688663282571912, "grad_norm": 5.1247100830078125, "learning_rate": 1.9880878960910772e-05, "loss": 2.9883, "step": 35 }, { "epoch": 0.2436548223350254, "grad_norm": 5.715439319610596, "learning_rate": 1.9869289351105087e-05, "loss": 3.106, "step": 36 }, { "epoch": 0.25042301184433163, "grad_norm": 6.596778392791748, "learning_rate": 1.9857165511735105e-05, "loss": 3.094, "step": 37 }, { "epoch": 0.2571912013536379, "grad_norm": 5.765414714813232, "learning_rate": 1.9844508099070313e-05, "loss": 2.9328, "step": 38 }, { "epoch": 0.2639593908629442, "grad_norm": 3.2168986797332764, "learning_rate": 1.9831317798262787e-05, "loss": 2.9776, "step": 39 }, { "epoch": 0.2707275803722504, "grad_norm": 3.423891067504883, "learning_rate": 1.98175953233101e-05, "loss": 2.9672, "step": 40 }, { "epoch": 0.27749576988155666, "grad_norm": 4.785609245300293, "learning_rate": 1.980334141701667e-05, "loss": 3.1273, "step": 41 }, { "epoch": 0.28426395939086296, "grad_norm": 5.194344997406006, "learning_rate": 1.978855685095358e-05, "loss": 3.0137, "step": 42 }, { "epoch": 0.2910321489001692, "grad_norm": 5.609189510345459, "learning_rate": 1.977324242541677e-05, "loss": 3.0037, "step": 43 }, { "epoch": 0.29780033840947545, "grad_norm": 6.650205612182617, "learning_rate": 1.9757398969383752e-05, "loss": 2.901, "step": 44 }, { "epoch": 0.30456852791878175, "grad_norm": 4.614665985107422, "learning_rate": 1.974102734046872e-05, "loss": 2.9467, "step": 45 }, { "epoch": 0.311336717428088, "grad_norm": 3.8143110275268555, "learning_rate": 1.9724128424876117e-05, "loss": 2.9703, "step": 46 }, { "epoch": 0.31810490693739424, "grad_norm": 5.5977067947387695, "learning_rate": 1.9706703137352695e-05, "loss": 2.9885, "step": 47 }, { "epoch": 0.3248730964467005, "grad_norm": 6.040333271026611, "learning_rate": 1.968875242113798e-05, "loss": 3.0303, "step": 48 }, { "epoch": 0.3316412859560068, "grad_norm": 6.05629825592041, "learning_rate": 1.9670277247913205e-05, "loss": 2.9001, "step": 49 }, { "epoch": 0.338409475465313, "grad_norm": 7.62821102142334, "learning_rate": 1.965127861774873e-05, "loss": 2.8845, "step": 50 }, { "epoch": 0.34517766497461927, "grad_norm": 3.4944276809692383, "learning_rate": 1.96317575590499e-05, "loss": 2.8731, "step": 51 }, { "epoch": 0.35194585448392557, "grad_norm": 3.9865429401397705, 
"learning_rate": 1.9611715128501378e-05, "loss": 2.936, "step": 52 }, { "epoch": 0.3587140439932318, "grad_norm": 4.1927385330200195, "learning_rate": 1.9591152411009942e-05, "loss": 2.9779, "step": 53 }, { "epoch": 0.36548223350253806, "grad_norm": 4.403099060058594, "learning_rate": 1.9570070519645767e-05, "loss": 2.9444, "step": 54 }, { "epoch": 0.37225042301184436, "grad_norm": 5.183342456817627, "learning_rate": 1.9548470595582166e-05, "loss": 2.9487, "step": 55 }, { "epoch": 0.3790186125211506, "grad_norm": 6.125980854034424, "learning_rate": 1.9526353808033827e-05, "loss": 2.7997, "step": 56 }, { "epoch": 0.38578680203045684, "grad_norm": 5.613026142120361, "learning_rate": 1.9503721354193507e-05, "loss": 2.8006, "step": 57 }, { "epoch": 0.3925549915397631, "grad_norm": 5.052882194519043, "learning_rate": 1.948057445916724e-05, "loss": 2.8919, "step": 58 }, { "epoch": 0.3993231810490694, "grad_norm": 3.8712987899780273, "learning_rate": 1.9456914375908026e-05, "loss": 2.912, "step": 59 }, { "epoch": 0.40609137055837563, "grad_norm": 5.594967365264893, "learning_rate": 1.9432742385147988e-05, "loss": 3.0025, "step": 60 }, { "epoch": 0.4128595600676819, "grad_norm": 5.751216411590576, "learning_rate": 1.9408059795329073e-05, "loss": 2.9587, "step": 61 }, { "epoch": 0.4196277495769882, "grad_norm": 6.2975993156433105, "learning_rate": 1.9382867942532195e-05, "loss": 2.8469, "step": 62 }, { "epoch": 0.4263959390862944, "grad_norm": 5.6724724769592285, "learning_rate": 1.9357168190404937e-05, "loss": 2.8086, "step": 63 }, { "epoch": 0.43316412859560066, "grad_norm": 3.814537525177002, "learning_rate": 1.9330961930087724e-05, "loss": 2.8346, "step": 64 }, { "epoch": 0.43993231810490696, "grad_norm": 4.25925874710083, "learning_rate": 1.9304250580138524e-05, "loss": 2.9784, "step": 65 }, { "epoch": 0.4467005076142132, "grad_norm": 3.799999237060547, "learning_rate": 1.9277035586456056e-05, "loss": 2.8535, "step": 66 }, { "epoch": 0.45346869712351945, "grad_norm": 6.35882568359375, "learning_rate": 1.9249318422201524e-05, "loss": 2.9694, "step": 67 }, { "epoch": 0.4602368866328257, "grad_norm": 6.477646827697754, "learning_rate": 1.9221100587718884e-05, "loss": 3.0061, "step": 68 }, { "epoch": 0.467005076142132, "grad_norm": 5.934814929962158, "learning_rate": 1.919238361045362e-05, "loss": 2.7579, "step": 69 }, { "epoch": 0.47377326565143824, "grad_norm": 3.154392719268799, "learning_rate": 1.916316904487005e-05, "loss": 2.7796, "step": 70 }, { "epoch": 0.4805414551607445, "grad_norm": 4.2975616455078125, "learning_rate": 1.9133458472367216e-05, "loss": 2.8438, "step": 71 }, { "epoch": 0.4873096446700508, "grad_norm": 3.5001091957092285, "learning_rate": 1.9103253501193256e-05, "loss": 2.9239, "step": 72 }, { "epoch": 0.494077834179357, "grad_norm": 5.083667278289795, "learning_rate": 1.9072555766358346e-05, "loss": 2.9237, "step": 73 }, { "epoch": 0.5008460236886633, "grad_norm": 5.127432346343994, "learning_rate": 1.904136692954622e-05, "loss": 2.9601, "step": 74 }, { "epoch": 0.5076142131979695, "grad_norm": 6.419186115264893, "learning_rate": 1.900968867902419e-05, "loss": 2.8173, "step": 75 }, { "epoch": 0.5143824027072758, "grad_norm": 3.2118561267852783, "learning_rate": 1.89775227295518e-05, "loss": 2.7624, "step": 76 }, { "epoch": 0.5211505922165821, "grad_norm": 4.4183807373046875, "learning_rate": 1.8944870822287957e-05, "loss": 2.84, "step": 77 }, { "epoch": 0.5279187817258884, "grad_norm": 4.880641460418701, "learning_rate": 1.891173472469672e-05, "loss": 2.8241, "step": 78 }, 
{ "epoch": 0.5346869712351946, "grad_norm": 4.362979888916016, "learning_rate": 1.8878116230451615e-05, "loss": 2.8896, "step": 79 }, { "epoch": 0.5414551607445008, "grad_norm": 5.762423515319824, "learning_rate": 1.884401715933853e-05, "loss": 2.8949, "step": 80 }, { "epoch": 0.5482233502538071, "grad_norm": 8.867072105407715, "learning_rate": 1.8809439357157226e-05, "loss": 2.9593, "step": 81 }, { "epoch": 0.5549915397631133, "grad_norm": 5.437422752380371, "learning_rate": 1.8774384695621407e-05, "loss": 2.6622, "step": 82 }, { "epoch": 0.5617597292724196, "grad_norm": 3.8561763763427734, "learning_rate": 1.8738855072257428e-05, "loss": 2.8984, "step": 83 }, { "epoch": 0.5685279187817259, "grad_norm": 4.806951522827148, "learning_rate": 1.8702852410301556e-05, "loss": 2.8228, "step": 84 }, { "epoch": 0.5752961082910322, "grad_norm": 4.01973295211792, "learning_rate": 1.8666378658595863e-05, "loss": 2.8602, "step": 85 }, { "epoch": 0.5820642978003384, "grad_norm": 4.8082170486450195, "learning_rate": 1.8629435791482765e-05, "loss": 2.9552, "step": 86 }, { "epoch": 0.5888324873096447, "grad_norm": 6.550163269042969, "learning_rate": 1.8592025808698116e-05, "loss": 2.7965, "step": 87 }, { "epoch": 0.5956006768189509, "grad_norm": 6.67849588394165, "learning_rate": 1.8554150735262975e-05, "loss": 2.7528, "step": 88 }, { "epoch": 0.6023688663282571, "grad_norm": 2.8771703243255615, "learning_rate": 1.8515812621373998e-05, "loss": 2.8308, "step": 89 }, { "epoch": 0.6091370558375635, "grad_norm": 5.142778396606445, "learning_rate": 1.8477013542292446e-05, "loss": 2.7588, "step": 90 }, { "epoch": 0.6159052453468697, "grad_norm": 6.1177873611450195, "learning_rate": 1.8437755598231857e-05, "loss": 2.8855, "step": 91 }, { "epoch": 0.622673434856176, "grad_norm": 6.153074741363525, "learning_rate": 1.8398040914244363e-05, "loss": 2.982, "step": 92 }, { "epoch": 0.6294416243654822, "grad_norm": 5.2112345695495605, "learning_rate": 1.8357871640105648e-05, "loss": 2.8087, "step": 93 }, { "epoch": 0.6362098138747885, "grad_norm": 8.335490226745605, "learning_rate": 1.8317249950198598e-05, "loss": 2.6842, "step": 94 }, { "epoch": 0.6429780033840947, "grad_norm": 5.8389668464660645, "learning_rate": 1.8276178043395588e-05, "loss": 2.7224, "step": 95 }, { "epoch": 0.649746192893401, "grad_norm": 3.2980704307556152, "learning_rate": 1.8234658142939454e-05, "loss": 2.8933, "step": 96 }, { "epoch": 0.6565143824027073, "grad_norm": 5.305524826049805, "learning_rate": 1.8192692496323158e-05, "loss": 2.8103, "step": 97 }, { "epoch": 0.6632825719120136, "grad_norm": 6.091310977935791, "learning_rate": 1.8150283375168112e-05, "loss": 2.9352, "step": 98 }, { "epoch": 0.6700507614213198, "grad_norm": 5.697042465209961, "learning_rate": 1.8107433075101254e-05, "loss": 2.8545, "step": 99 }, { "epoch": 0.676818950930626, "grad_norm": 7.473045349121094, "learning_rate": 1.8064143915630723e-05, "loss": 2.6375, "step": 100 }, { "epoch": 0.6835871404399323, "grad_norm": 2.685059070587158, "learning_rate": 1.8020418240020362e-05, "loss": 2.7562, "step": 101 }, { "epoch": 0.6903553299492385, "grad_norm": 3.2231831550598145, "learning_rate": 1.7976258415162836e-05, "loss": 2.8718, "step": 102 }, { "epoch": 0.6971235194585449, "grad_norm": 3.358761787414551, "learning_rate": 1.7931666831451536e-05, "loss": 2.8679, "step": 103 }, { "epoch": 0.7038917089678511, "grad_norm": 4.336738109588623, "learning_rate": 1.7886645902651166e-05, "loss": 2.797, "step": 104 }, { "epoch": 0.7106598984771574, "grad_norm": 4.629664421081543, 
"learning_rate": 1.7841198065767107e-05, "loss": 2.7675, "step": 105 }, { "epoch": 0.7174280879864636, "grad_norm": 6.125463485717773, "learning_rate": 1.779532578091347e-05, "loss": 2.627, "step": 106 }, { "epoch": 0.7241962774957699, "grad_norm": 4.7176361083984375, "learning_rate": 1.7749031531179962e-05, "loss": 2.6226, "step": 107 }, { "epoch": 0.7309644670050761, "grad_norm": 3.0627963542938232, "learning_rate": 1.7702317822497457e-05, "loss": 2.8128, "step": 108 }, { "epoch": 0.7377326565143824, "grad_norm": 4.201870918273926, "learning_rate": 1.7655187183502344e-05, "loss": 2.7452, "step": 109 }, { "epoch": 0.7445008460236887, "grad_norm": 4.618666648864746, "learning_rate": 1.7607642165399665e-05, "loss": 2.8431, "step": 110 }, { "epoch": 0.751269035532995, "grad_norm": 5.060817718505859, "learning_rate": 1.755968534182501e-05, "loss": 2.9154, "step": 111 }, { "epoch": 0.7580372250423012, "grad_norm": 5.751707553863525, "learning_rate": 1.7511319308705198e-05, "loss": 2.7316, "step": 112 }, { "epoch": 0.7648054145516074, "grad_norm": 5.403834342956543, "learning_rate": 1.746254668411778e-05, "loss": 2.6888, "step": 113 }, { "epoch": 0.7715736040609137, "grad_norm": 3.657097339630127, "learning_rate": 1.7413370108149288e-05, "loss": 2.7851, "step": 114 }, { "epoch": 0.7783417935702199, "grad_norm": 3.560981035232544, "learning_rate": 1.7363792242752354e-05, "loss": 2.8977, "step": 115 }, { "epoch": 0.7851099830795262, "grad_norm": 5.971733570098877, "learning_rate": 1.731381577160161e-05, "loss": 2.7807, "step": 116 }, { "epoch": 0.7918781725888325, "grad_norm": 5.342052459716797, "learning_rate": 1.726344339994841e-05, "loss": 2.8701, "step": 117 }, { "epoch": 0.7986463620981388, "grad_norm": 5.152158737182617, "learning_rate": 1.7212677854474402e-05, "loss": 2.6611, "step": 118 }, { "epoch": 0.805414551607445, "grad_norm": 5.67462682723999, "learning_rate": 1.7161521883143936e-05, "loss": 2.6134, "step": 119 }, { "epoch": 0.8121827411167513, "grad_norm": 4.32338285446167, "learning_rate": 1.7109978255055295e-05, "loss": 2.7922, "step": 120 }, { "epoch": 0.8189509306260575, "grad_norm": 4.190022945404053, "learning_rate": 1.705804976029083e-05, "loss": 2.7966, "step": 121 }, { "epoch": 0.8257191201353637, "grad_norm": 3.7118101119995117, "learning_rate": 1.7005739209765906e-05, "loss": 2.8186, "step": 122 }, { "epoch": 0.8324873096446701, "grad_norm": 5.160277366638184, "learning_rate": 1.6953049435076768e-05, "loss": 2.9102, "step": 123 }, { "epoch": 0.8392554991539763, "grad_norm": 5.411961078643799, "learning_rate": 1.6899983288347248e-05, "loss": 2.7617, "step": 124 }, { "epoch": 0.8460236886632826, "grad_norm": 7.095698356628418, "learning_rate": 1.6846543642074382e-05, "loss": 2.6926, "step": 125 }, { "epoch": 0.8527918781725888, "grad_norm": 2.7258427143096924, "learning_rate": 1.679273338897293e-05, "loss": 2.6996, "step": 126 }, { "epoch": 0.8595600676818951, "grad_norm": 3.8327107429504395, "learning_rate": 1.6738555441818785e-05, "loss": 2.7992, "step": 127 }, { "epoch": 0.8663282571912013, "grad_norm": 4.773505687713623, "learning_rate": 1.668401273329129e-05, "loss": 2.7682, "step": 128 }, { "epoch": 0.8730964467005076, "grad_norm": 4.107465744018555, "learning_rate": 1.6629108215814523e-05, "loss": 2.8903, "step": 129 }, { "epoch": 0.8798646362098139, "grad_norm": 4.615577220916748, "learning_rate": 1.6573844861397444e-05, "loss": 2.8723, "step": 130 }, { "epoch": 0.8866328257191202, "grad_norm": 7.497233867645264, "learning_rate": 1.6518225661473045e-05, "loss": 
2.8291, "step": 131 }, { "epoch": 0.8934010152284264, "grad_norm": 7.081593036651611, "learning_rate": 1.6462253626736413e-05, "loss": 2.5866, "step": 132 }, { "epoch": 0.9001692047377327, "grad_norm": 3.4112582206726074, "learning_rate": 1.6405931786981753e-05, "loss": 2.7011, "step": 133 }, { "epoch": 0.9069373942470389, "grad_norm": 4.411227226257324, "learning_rate": 1.63492631909384e-05, "loss": 2.7789, "step": 134 }, { "epoch": 0.9137055837563451, "grad_norm": 5.724678993225098, "learning_rate": 1.629225090610577e-05, "loss": 2.8353, "step": 135 }, { "epoch": 0.9204737732656514, "grad_norm": 5.374405860900879, "learning_rate": 1.6234898018587336e-05, "loss": 2.9269, "step": 136 }, { "epoch": 0.9272419627749577, "grad_norm": 5.2880072593688965, "learning_rate": 1.6177207632923558e-05, "loss": 2.7229, "step": 137 }, { "epoch": 0.934010152284264, "grad_norm": 5.647241115570068, "learning_rate": 1.6119182871923834e-05, "loss": 2.6128, "step": 138 }, { "epoch": 0.9407783417935702, "grad_norm": 3.1383461952209473, "learning_rate": 1.606082687649748e-05, "loss": 2.708, "step": 139 }, { "epoch": 0.9475465313028765, "grad_norm": 3.174626111984253, "learning_rate": 1.6002142805483686e-05, "loss": 2.8192, "step": 140 }, { "epoch": 0.9543147208121827, "grad_norm": 3.3176159858703613, "learning_rate": 1.5943133835480536e-05, "loss": 2.8202, "step": 141 }, { "epoch": 0.961082910321489, "grad_norm": 4.013696193695068, "learning_rate": 1.588380316067307e-05, "loss": 2.7887, "step": 142 }, { "epoch": 0.9678510998307953, "grad_norm": 5.064754009246826, "learning_rate": 1.582415399266036e-05, "loss": 2.8008, "step": 143 }, { "epoch": 0.9746192893401016, "grad_norm": 5.884125232696533, "learning_rate": 1.5764189560281677e-05, "loss": 2.6257, "step": 144 }, { "epoch": 0.9813874788494078, "grad_norm": 3.0231032371520996, "learning_rate": 1.5703913109441715e-05, "loss": 2.7147, "step": 145 }, { "epoch": 0.988155668358714, "grad_norm": 3.241084337234497, "learning_rate": 1.564332790293487e-05, "loss": 2.7612, "step": 146 }, { "epoch": 0.9949238578680203, "grad_norm": 4.9145121574401855, "learning_rate": 1.5582437220268648e-05, "loss": 2.8171, "step": 147 }, { "epoch": 1.0016920473773265, "grad_norm": 5.478322982788086, "learning_rate": 1.5521244357486132e-05, "loss": 2.6166, "step": 148 }, { "epoch": 1.0084602368866329, "grad_norm": 2.909008502960205, "learning_rate": 1.5459752626987563e-05, "loss": 2.4026, "step": 149 }, { "epoch": 1.015228426395939, "grad_norm": 3.355454206466675, "learning_rate": 1.5397965357351035e-05, "loss": 2.2265, "step": 150 }, { "epoch": 1.0219966159052454, "grad_norm": 3.659177541732788, "learning_rate": 1.5335885893152335e-05, "loss": 2.1872, "step": 151 }, { "epoch": 1.0287648054145515, "grad_norm": 4.308448791503906, "learning_rate": 1.5273517594783878e-05, "loss": 2.0188, "step": 152 }, { "epoch": 1.0355329949238579, "grad_norm": 4.801682949066162, "learning_rate": 1.521086383827282e-05, "loss": 1.9166, "step": 153 }, { "epoch": 1.0423011844331642, "grad_norm": 6.2991790771484375, "learning_rate": 1.5147928015098309e-05, "loss": 1.6925, "step": 154 }, { "epoch": 1.0490693739424704, "grad_norm": 7.9047417640686035, "learning_rate": 1.5084713532007906e-05, "loss": 2.5637, "step": 155 }, { "epoch": 1.0558375634517767, "grad_norm": 6.511372089385986, "learning_rate": 1.5021223810833165e-05, "loss": 2.3506, "step": 156 }, { "epoch": 1.0626057529610828, "grad_norm": 5.02034854888916, "learning_rate": 1.4957462288304421e-05, "loss": 2.1029, "step": 157 }, { "epoch": 
1.0693739424703892, "grad_norm": 5.005341529846191, "learning_rate": 1.489343241586475e-05, "loss": 2.0565, "step": 158 }, { "epoch": 1.0761421319796955, "grad_norm": 5.689651012420654, "learning_rate": 1.4829137659483144e-05, "loss": 1.9412, "step": 159 }, { "epoch": 1.0829103214890017, "grad_norm": 6.038967609405518, "learning_rate": 1.4764581499466895e-05, "loss": 1.675, "step": 160 }, { "epoch": 1.089678510998308, "grad_norm": 4.393552303314209, "learning_rate": 1.4699767430273202e-05, "loss": 2.1734, "step": 161 }, { "epoch": 1.0964467005076142, "grad_norm": 3.555631637573242, "learning_rate": 1.4634698960320018e-05, "loss": 2.187, "step": 162 }, { "epoch": 1.1032148900169205, "grad_norm": 3.7586710453033447, "learning_rate": 1.4569379611796137e-05, "loss": 1.9961, "step": 163 }, { "epoch": 1.1099830795262267, "grad_norm": 4.319566249847412, "learning_rate": 1.4503812920470535e-05, "loss": 1.958, "step": 164 }, { "epoch": 1.116751269035533, "grad_norm": 4.831964015960693, "learning_rate": 1.443800243550098e-05, "loss": 1.7072, "step": 165 }, { "epoch": 1.1235194585448394, "grad_norm": 6.157094478607178, "learning_rate": 1.4371951719241906e-05, "loss": 1.7674, "step": 166 }, { "epoch": 1.1302876480541455, "grad_norm": 4.833260536193848, "learning_rate": 1.4305664347051586e-05, "loss": 1.9227, "step": 167 }, { "epoch": 1.1370558375634519, "grad_norm": 3.5581912994384766, "learning_rate": 1.423914390709861e-05, "loss": 2.3748, "step": 168 }, { "epoch": 1.143824027072758, "grad_norm": 3.734834909439087, "learning_rate": 1.4172394000167625e-05, "loss": 2.0371, "step": 169 }, { "epoch": 1.1505922165820643, "grad_norm": 4.00279426574707, "learning_rate": 1.4105418239464452e-05, "loss": 2.0383, "step": 170 }, { "epoch": 1.1573604060913705, "grad_norm": 4.664214134216309, "learning_rate": 1.4038220250420487e-05, "loss": 1.9445, "step": 171 }, { "epoch": 1.1641285956006768, "grad_norm": 5.319397926330566, "learning_rate": 1.3970803670496453e-05, "loss": 1.7367, "step": 172 }, { "epoch": 1.1708967851099832, "grad_norm": 5.559267520904541, "learning_rate": 1.390317214898551e-05, "loss": 1.7855, "step": 173 }, { "epoch": 1.1776649746192893, "grad_norm": 3.4772238731384277, "learning_rate": 1.3835329346815716e-05, "loss": 2.3614, "step": 174 }, { "epoch": 1.1844331641285957, "grad_norm": 3.456766366958618, "learning_rate": 1.3767278936351853e-05, "loss": 2.1906, "step": 175 }, { "epoch": 1.1912013536379018, "grad_norm": 3.739302635192871, "learning_rate": 1.3699024601196641e-05, "loss": 2.0554, "step": 176 }, { "epoch": 1.1979695431472082, "grad_norm": 4.194780349731445, "learning_rate": 1.3630570035991352e-05, "loss": 1.8769, "step": 177 }, { "epoch": 1.2047377326565143, "grad_norm": 5.365659713745117, "learning_rate": 1.3561918946215807e-05, "loss": 1.7156, "step": 178 }, { "epoch": 1.2115059221658206, "grad_norm": 6.615947723388672, "learning_rate": 1.34930750479878e-05, "loss": 1.6489, "step": 179 }, { "epoch": 1.218274111675127, "grad_norm": 4.608173847198486, "learning_rate": 1.3424042067861944e-05, "loss": 2.4078, "step": 180 }, { "epoch": 1.2250423011844331, "grad_norm": 3.3148863315582275, "learning_rate": 1.335482374262795e-05, "loss": 2.2092, "step": 181 }, { "epoch": 1.2318104906937395, "grad_norm": 4.692728519439697, "learning_rate": 1.3285423819108349e-05, "loss": 1.9361, "step": 182 }, { "epoch": 1.2385786802030456, "grad_norm": 4.571840763092041, "learning_rate": 1.3215846053955683e-05, "loss": 1.9115, "step": 183 }, { "epoch": 1.245346869712352, "grad_norm": 5.025711536407471, 
"learning_rate": 1.3146094213449148e-05, "loss": 1.7432, "step": 184 }, { "epoch": 1.252115059221658, "grad_norm": 6.1127095222473145, "learning_rate": 1.3076172073290726e-05, "loss": 1.5802, "step": 185 }, { "epoch": 1.2588832487309645, "grad_norm": 5.005325794219971, "learning_rate": 1.3006083418400799e-05, "loss": 2.2672, "step": 186 }, { "epoch": 1.2656514382402708, "grad_norm": 3.2444660663604736, "learning_rate": 1.2935832042713288e-05, "loss": 2.2101, "step": 187 }, { "epoch": 1.272419627749577, "grad_norm": 3.3180994987487793, "learning_rate": 1.2865421748970257e-05, "loss": 2.1237, "step": 188 }, { "epoch": 1.2791878172588833, "grad_norm": 4.625007629394531, "learning_rate": 1.2794856348516095e-05, "loss": 1.9741, "step": 189 }, { "epoch": 1.2859560067681894, "grad_norm": 4.619353294372559, "learning_rate": 1.2724139661091188e-05, "loss": 1.9425, "step": 190 }, { "epoch": 1.2927241962774958, "grad_norm": 5.504361152648926, "learning_rate": 1.2653275514625165e-05, "loss": 1.7012, "step": 191 }, { "epoch": 1.299492385786802, "grad_norm": 4.399888515472412, "learning_rate": 1.2582267745029685e-05, "loss": 1.9316, "step": 192 }, { "epoch": 1.3062605752961083, "grad_norm": 3.53360915184021, "learning_rate": 1.2511120195990797e-05, "loss": 2.3907, "step": 193 }, { "epoch": 1.3130287648054146, "grad_norm": 3.4914515018463135, "learning_rate": 1.2439836718760887e-05, "loss": 2.0797, "step": 194 }, { "epoch": 1.3197969543147208, "grad_norm": 3.7882394790649414, "learning_rate": 1.2368421171950193e-05, "loss": 1.955, "step": 195 }, { "epoch": 1.3265651438240271, "grad_norm": 4.370715141296387, "learning_rate": 1.2296877421317958e-05, "loss": 1.8437, "step": 196 }, { "epoch": 1.3333333333333333, "grad_norm": 5.414830207824707, "learning_rate": 1.2225209339563144e-05, "loss": 1.8579, "step": 197 }, { "epoch": 1.3401015228426396, "grad_norm": 5.272250652313232, "learning_rate": 1.215342080611484e-05, "loss": 1.7614, "step": 198 }, { "epoch": 1.3468697123519457, "grad_norm": 4.075460910797119, "learning_rate": 1.2081515706922226e-05, "loss": 2.3666, "step": 199 }, { "epoch": 1.353637901861252, "grad_norm": 2.9030683040618896, "learning_rate": 1.2009497934244257e-05, "loss": 2.0487, "step": 200 }, { "epoch": 1.3604060913705585, "grad_norm": 4.147029876708984, "learning_rate": 1.1937371386438954e-05, "loss": 1.9878, "step": 201 }, { "epoch": 1.3671742808798646, "grad_norm": 5.0643439292907715, "learning_rate": 1.186513996775239e-05, "loss": 1.8252, "step": 202 }, { "epoch": 1.373942470389171, "grad_norm": 5.364940166473389, "learning_rate": 1.1792807588107358e-05, "loss": 1.7401, "step": 203 }, { "epoch": 1.380710659898477, "grad_norm": 6.356777191162109, "learning_rate": 1.1720378162891709e-05, "loss": 1.5169, "step": 204 }, { "epoch": 1.3874788494077834, "grad_norm": 3.031667709350586, "learning_rate": 1.1647855612746423e-05, "loss": 2.3757, "step": 205 }, { "epoch": 1.3942470389170896, "grad_norm": 3.478210926055908, "learning_rate": 1.1575243863353383e-05, "loss": 2.1897, "step": 206 }, { "epoch": 1.401015228426396, "grad_norm": 3.7287087440490723, "learning_rate": 1.150254684522286e-05, "loss": 2.0368, "step": 207 }, { "epoch": 1.4077834179357023, "grad_norm": 4.0293779373168945, "learning_rate": 1.142976849348078e-05, "loss": 1.9049, "step": 208 }, { "epoch": 1.4145516074450084, "grad_norm": 4.953205108642578, "learning_rate": 1.1356912747655687e-05, "loss": 1.7872, "step": 209 }, { "epoch": 1.4213197969543148, "grad_norm": 6.160380840301514, "learning_rate": 1.1283983551465512e-05, "loss": 
1.7295, "step": 210 }, { "epoch": 1.4280879864636211, "grad_norm": 5.149349212646484, "learning_rate": 1.1210984852604084e-05, "loss": 2.1102, "step": 211 }, { "epoch": 1.4348561759729273, "grad_norm": 3.172128915786743, "learning_rate": 1.1137920602527448e-05, "loss": 2.2288, "step": 212 }, { "epoch": 1.4416243654822334, "grad_norm": 3.4528701305389404, "learning_rate": 1.1064794756239978e-05, "loss": 2.0189, "step": 213 }, { "epoch": 1.4483925549915397, "grad_norm": 4.66202449798584, "learning_rate": 1.099161127208027e-05, "loss": 1.8742, "step": 214 }, { "epoch": 1.455160744500846, "grad_norm": 5.142988681793213, "learning_rate": 1.0918374111506893e-05, "loss": 1.9004, "step": 215 }, { "epoch": 1.4619289340101522, "grad_norm": 5.548466205596924, "learning_rate": 1.0845087238883945e-05, "loss": 1.5929, "step": 216 }, { "epoch": 1.4686971235194586, "grad_norm": 4.51755428314209, "learning_rate": 1.0771754621266466e-05, "loss": 1.9563, "step": 217 }, { "epoch": 1.475465313028765, "grad_norm": 3.1326138973236084, "learning_rate": 1.0698380228185685e-05, "loss": 2.2197, "step": 218 }, { "epoch": 1.482233502538071, "grad_norm": 3.594095468521118, "learning_rate": 1.0624968031434174e-05, "loss": 2.0466, "step": 219 }, { "epoch": 1.4890016920473772, "grad_norm": 3.841886281967163, "learning_rate": 1.0551522004850821e-05, "loss": 1.9612, "step": 220 }, { "epoch": 1.4957698815566836, "grad_norm": 4.422885417938232, "learning_rate": 1.0478046124105746e-05, "loss": 1.8449, "step": 221 }, { "epoch": 1.50253807106599, "grad_norm": 5.432779788970947, "learning_rate": 1.0404544366485094e-05, "loss": 1.7364, "step": 222 }, { "epoch": 1.509306260575296, "grad_norm": 5.873152256011963, "learning_rate": 1.033102071067573e-05, "loss": 1.6825, "step": 223 }, { "epoch": 1.5160744500846024, "grad_norm": 3.36773943901062, "learning_rate": 1.0257479136549889e-05, "loss": 2.3463, "step": 224 }, { "epoch": 1.5228426395939088, "grad_norm": 3.3323042392730713, "learning_rate": 1.0183923624949721e-05, "loss": 2.0683, "step": 225 }, { "epoch": 1.5296108291032149, "grad_norm": 3.8202672004699707, "learning_rate": 1.0110358157471825e-05, "loss": 1.9565, "step": 226 }, { "epoch": 1.536379018612521, "grad_norm": 4.67080545425415, "learning_rate": 1.0036786716251721e-05, "loss": 1.8865, "step": 227 }, { "epoch": 1.5431472081218274, "grad_norm": 5.312952995300293, "learning_rate": 9.963213283748282e-06, "loss": 1.7068, "step": 228 }, { "epoch": 1.5499153976311337, "grad_norm": 6.728119850158691, "learning_rate": 9.889641842528179e-06, "loss": 1.6627, "step": 229 }, { "epoch": 1.5566835871404399, "grad_norm": 2.4371559619903564, "learning_rate": 9.816076375050284e-06, "loss": 2.3459, "step": 230 }, { "epoch": 1.5634517766497462, "grad_norm": 2.8036484718322754, "learning_rate": 9.742520863450116e-06, "loss": 2.1804, "step": 231 }, { "epoch": 1.5702199661590526, "grad_norm": 3.5675642490386963, "learning_rate": 9.668979289324274e-06, "loss": 2.0749, "step": 232 }, { "epoch": 1.5769881556683587, "grad_norm": 4.099052906036377, "learning_rate": 9.595455633514908e-06, "loss": 1.8576, "step": 233 }, { "epoch": 1.5837563451776648, "grad_norm": 4.900853633880615, "learning_rate": 9.521953875894256e-06, "loss": 1.7174, "step": 234 }, { "epoch": 1.5905245346869712, "grad_norm": 5.890774726867676, "learning_rate": 9.448477995149182e-06, "loss": 1.4906, "step": 235 }, { "epoch": 1.5972927241962775, "grad_norm": 4.369800567626953, "learning_rate": 9.37503196856583e-06, "loss": 2.0832, "step": 236 }, { "epoch": 1.6040609137055837, 
"grad_norm": 3.1959829330444336, "learning_rate": 9.301619771814317e-06, "loss": 2.2265, "step": 237 }, { "epoch": 1.61082910321489, "grad_norm": 3.255842924118042, "learning_rate": 9.228245378733537e-06, "loss": 2.0659, "step": 238 }, { "epoch": 1.6175972927241964, "grad_norm": 3.865798234939575, "learning_rate": 9.154912761116056e-06, "loss": 1.9102, "step": 239 }, { "epoch": 1.6243654822335025, "grad_norm": 4.725029945373535, "learning_rate": 9.081625888493107e-06, "loss": 1.7442, "step": 240 }, { "epoch": 1.6311336717428087, "grad_norm": 8.740133285522461, "learning_rate": 9.00838872791973e-06, "loss": 1.6959, "step": 241 }, { "epoch": 1.637901861252115, "grad_norm": 4.786500930786133, "learning_rate": 8.935205243760022e-06, "loss": 1.8628, "step": 242 }, { "epoch": 1.6446700507614214, "grad_norm": 3.0056700706481934, "learning_rate": 8.862079397472552e-06, "loss": 2.2218, "step": 243 }, { "epoch": 1.6514382402707275, "grad_norm": 3.35292911529541, "learning_rate": 8.78901514739592e-06, "loss": 2.0775, "step": 244 }, { "epoch": 1.6582064297800339, "grad_norm": 3.997661590576172, "learning_rate": 8.71601644853449e-06, "loss": 1.9842, "step": 245 }, { "epoch": 1.6649746192893402, "grad_norm": 4.569092273712158, "learning_rate": 8.643087252344313e-06, "loss": 1.8055, "step": 246 }, { "epoch": 1.6717428087986463, "grad_norm": 5.217006683349609, "learning_rate": 8.57023150651922e-06, "loss": 1.5681, "step": 247 }, { "epoch": 1.6785109983079525, "grad_norm": 5.526303291320801, "learning_rate": 8.49745315477714e-06, "loss": 1.7118, "step": 248 }, { "epoch": 1.6852791878172588, "grad_norm": 2.5200791358947754, "learning_rate": 8.424756136646624e-06, "loss": 2.2932, "step": 249 }, { "epoch": 1.6920473773265652, "grad_norm": 3.4209508895874023, "learning_rate": 8.352144387253582e-06, "loss": 2.0515, "step": 250 }, { "epoch": 1.6988155668358713, "grad_norm": 3.7960565090179443, "learning_rate": 8.279621837108295e-06, "loss": 1.9207, "step": 251 }, { "epoch": 1.7055837563451777, "grad_norm": 4.094236373901367, "learning_rate": 8.207192411892645e-06, "loss": 1.7885, "step": 252 }, { "epoch": 1.712351945854484, "grad_norm": 4.837678909301758, "learning_rate": 8.134860032247613e-06, "loss": 1.6723, "step": 253 }, { "epoch": 1.7191201353637902, "grad_norm": 6.248587608337402, "learning_rate": 8.062628613561051e-06, "loss": 1.4528, "step": 254 }, { "epoch": 1.7258883248730963, "grad_norm": 2.59256911277771, "learning_rate": 7.990502065755748e-06, "loss": 2.3992, "step": 255 }, { "epoch": 1.7326565143824029, "grad_norm": 2.9640893936157227, "learning_rate": 7.918484293077777e-06, "loss": 2.1847, "step": 256 }, { "epoch": 1.739424703891709, "grad_norm": 3.4181110858917236, "learning_rate": 7.846579193885165e-06, "loss": 2.0231, "step": 257 }, { "epoch": 1.7461928934010151, "grad_norm": 4.158235549926758, "learning_rate": 7.774790660436857e-06, "loss": 1.9362, "step": 258 }, { "epoch": 1.7529610829103215, "grad_norm": 4.829765796661377, "learning_rate": 7.703122578682047e-06, "loss": 1.7278, "step": 259 }, { "epoch": 1.7597292724196278, "grad_norm": 5.691404819488525, "learning_rate": 7.631578828049809e-06, "loss": 1.6055, "step": 260 }, { "epoch": 1.766497461928934, "grad_norm": 3.2796614170074463, "learning_rate": 7.560163281239116e-06, "loss": 2.0519, "step": 261 }, { "epoch": 1.77326565143824, "grad_norm": 2.57660174369812, "learning_rate": 7.488879804009206e-06, "loss": 2.1934, "step": 262 }, { "epoch": 1.7800338409475467, "grad_norm": 3.3291141986846924, "learning_rate": 7.4177322549703165e-06, 
"loss": 2.0575, "step": 263 }, { "epoch": 1.7868020304568528, "grad_norm": 3.7897515296936035, "learning_rate": 7.346724485374837e-06, "loss": 1.7963, "step": 264 }, { "epoch": 1.793570219966159, "grad_norm": 4.585766315460205, "learning_rate": 7.275860338908815e-06, "loss": 1.765, "step": 265 }, { "epoch": 1.8003384094754653, "grad_norm": 5.705550670623779, "learning_rate": 7.2051436514839064e-06, "loss": 1.6657, "step": 266 }, { "epoch": 1.8071065989847717, "grad_norm": 4.510739326477051, "learning_rate": 7.134578251029745e-06, "loss": 1.8443, "step": 267 }, { "epoch": 1.8138747884940778, "grad_norm": 3.171539068222046, "learning_rate": 7.064167957286714e-06, "loss": 2.3002, "step": 268 }, { "epoch": 1.8206429780033841, "grad_norm": 3.5727908611297607, "learning_rate": 6.993916581599203e-06, "loss": 2.0323, "step": 269 }, { "epoch": 1.8274111675126905, "grad_norm": 3.5468742847442627, "learning_rate": 6.923827926709277e-06, "loss": 1.9025, "step": 270 }, { "epoch": 1.8341793570219966, "grad_norm": 4.465723037719727, "learning_rate": 6.853905786550855e-06, "loss": 1.8105, "step": 271 }, { "epoch": 1.8409475465313028, "grad_norm": 5.095712184906006, "learning_rate": 6.784153946044321e-06, "loss": 1.6591, "step": 272 }, { "epoch": 1.8477157360406091, "grad_norm": 5.340912818908691, "learning_rate": 6.714576180891653e-06, "loss": 1.6851, "step": 273 }, { "epoch": 1.8544839255499155, "grad_norm": 3.092374801635742, "learning_rate": 6.645176257372054e-06, "loss": 2.3331, "step": 274 }, { "epoch": 1.8612521150592216, "grad_norm": 2.924107551574707, "learning_rate": 6.5759579321380576e-06, "loss": 2.1575, "step": 275 }, { "epoch": 1.868020304568528, "grad_norm": 3.7559361457824707, "learning_rate": 6.5069249520122026e-06, "loss": 1.9893, "step": 276 }, { "epoch": 1.8747884940778343, "grad_norm": 4.786612510681152, "learning_rate": 6.438081053784197e-06, "loss": 1.7583, "step": 277 }, { "epoch": 1.8815566835871405, "grad_norm": 5.209157466888428, "learning_rate": 6.36942996400865e-06, "loss": 1.7118, "step": 278 }, { "epoch": 1.8883248730964466, "grad_norm": 6.413548469543457, "learning_rate": 6.300975398803362e-06, "loss": 1.5219, "step": 279 }, { "epoch": 1.895093062605753, "grad_norm": 2.4699904918670654, "learning_rate": 6.232721063648148e-06, "loss": 2.3592, "step": 280 }, { "epoch": 1.9018612521150593, "grad_norm": 2.732497453689575, "learning_rate": 6.1646706531842845e-06, "loss": 2.0984, "step": 281 }, { "epoch": 1.9086294416243654, "grad_norm": 3.4677207469940186, "learning_rate": 6.09682785101449e-06, "loss": 1.9303, "step": 282 }, { "epoch": 1.9153976311336718, "grad_norm": 3.888166666030884, "learning_rate": 6.029196329503548e-06, "loss": 1.8503, "step": 283 }, { "epoch": 1.9221658206429781, "grad_norm": 4.850317001342773, "learning_rate": 5.961779749579516e-06, "loss": 1.711, "step": 284 }, { "epoch": 1.9289340101522843, "grad_norm": 5.663942813873291, "learning_rate": 5.8945817605355495e-06, "loss": 1.5138, "step": 285 }, { "epoch": 1.9357021996615904, "grad_norm": 3.6366028785705566, "learning_rate": 5.827605999832375e-06, "loss": 2.0733, "step": 286 }, { "epoch": 1.9424703891708968, "grad_norm": 2.522986650466919, "learning_rate": 5.760856092901394e-06, "loss": 2.2134, "step": 287 }, { "epoch": 1.9492385786802031, "grad_norm": 3.196727991104126, "learning_rate": 5.694335652948415e-06, "loss": 2.0477, "step": 288 }, { "epoch": 1.9560067681895092, "grad_norm": 3.9079673290252686, "learning_rate": 5.628048280758096e-06, "loss": 1.963, "step": 289 }, { "epoch": 1.9627749576988156, 
"grad_norm": 4.57443380355835, "learning_rate": 5.561997564499024e-06, "loss": 1.7234, "step": 290 }, { "epoch": 1.969543147208122, "grad_norm": 5.271142482757568, "learning_rate": 5.4961870795294644e-06, "loss": 1.6151, "step": 291 }, { "epoch": 1.976311336717428, "grad_norm": 4.357114315032959, "learning_rate": 5.430620388203866e-06, "loss": 1.9279, "step": 292 }, { "epoch": 1.9830795262267342, "grad_norm": 2.8121213912963867, "learning_rate": 5.365301039679985e-06, "loss": 2.1234, "step": 293 }, { "epoch": 1.9898477157360406, "grad_norm": 4.057702541351318, "learning_rate": 5.300232569726805e-06, "loss": 1.9386, "step": 294 }, { "epoch": 1.996615905245347, "grad_norm": 5.310722351074219, "learning_rate": 5.2354185005331095e-06, "loss": 1.6403, "step": 295 }, { "epoch": 2.003384094754653, "grad_norm": 3.646991729736328, "learning_rate": 5.170862340516858e-06, "loss": 1.9625, "step": 296 }, { "epoch": 2.010152284263959, "grad_norm": 3.910515069961548, "learning_rate": 5.106567584135251e-06, "loss": 1.6498, "step": 297 }, { "epoch": 2.0169204737732658, "grad_norm": 5.364322662353516, "learning_rate": 5.042537711695584e-06, "loss": 1.4023, "step": 298 }, { "epoch": 2.023688663282572, "grad_norm": 6.2862396240234375, "learning_rate": 4.97877618916684e-06, "loss": 1.0733, "step": 299 }, { "epoch": 2.030456852791878, "grad_norm": 7.049383163452148, "learning_rate": 4.915286467992098e-06, "loss": 0.9028, "step": 300 }, { "epoch": 2.0372250423011846, "grad_norm": 6.528897762298584, "learning_rate": 4.852071984901696e-06, "loss": 0.6975, "step": 301 }, { "epoch": 2.0439932318104908, "grad_norm": 4.566006660461426, "learning_rate": 4.789136161727184e-06, "loss": 0.996, "step": 302 }, { "epoch": 2.050761421319797, "grad_norm": 4.8525590896606445, "learning_rate": 4.7264824052161255e-06, "loss": 1.9084, "step": 303 }, { "epoch": 2.057529610829103, "grad_norm": 8.884151458740234, "learning_rate": 4.664114106847667e-06, "loss": 1.2406, "step": 304 }, { "epoch": 2.0642978003384096, "grad_norm": 9.265266418457031, "learning_rate": 4.602034642648968e-06, "loss": 0.974, "step": 305 }, { "epoch": 2.0710659898477157, "grad_norm": 8.75934886932373, "learning_rate": 4.5402473730124395e-06, "loss": 0.8314, "step": 306 }, { "epoch": 2.077834179357022, "grad_norm": 7.045146942138672, "learning_rate": 4.478755642513868e-06, "loss": 0.7014, "step": 307 }, { "epoch": 2.0846023688663284, "grad_norm": 5.934751033782959, "learning_rate": 4.417562779731355e-06, "loss": 0.5536, "step": 308 }, { "epoch": 2.0913705583756346, "grad_norm": 4.2714314460754395, "learning_rate": 4.356672097065134e-06, "loss": 1.954, "step": 309 }, { "epoch": 2.0981387478849407, "grad_norm": 3.837898015975952, "learning_rate": 4.2960868905582895e-06, "loss": 1.4183, "step": 310 }, { "epoch": 2.104906937394247, "grad_norm": 4.858175277709961, "learning_rate": 4.235810439718327e-06, "loss": 1.0733, "step": 311 }, { "epoch": 2.1116751269035534, "grad_norm": 5.005491256713867, "learning_rate": 4.175846007339644e-06, "loss": 0.851, "step": 312 }, { "epoch": 2.1184433164128595, "grad_norm": 5.400625228881836, "learning_rate": 4.1161968393269324e-06, "loss": 0.7486, "step": 313 }, { "epoch": 2.1252115059221657, "grad_norm": 5.8098602294921875, "learning_rate": 4.0568661645194656e-06, "loss": 0.5741, "step": 314 }, { "epoch": 2.1319796954314723, "grad_norm": 3.7481307983398438, "learning_rate": 3.997857194516319e-06, "loss": 1.7741, "step": 315 }, { "epoch": 2.1387478849407784, "grad_norm": 3.240912675857544, "learning_rate": 3.939173123502523e-06, 
"loss": 1.5778, "step": 316 }, { "epoch": 2.1455160744500845, "grad_norm": 4.222574710845947, "learning_rate": 3.8808171280761665e-06, "loss": 1.0852, "step": 317 }, { "epoch": 2.152284263959391, "grad_norm": 4.52738618850708, "learning_rate": 3.822792367076446e-06, "loss": 0.9088, "step": 318 }, { "epoch": 2.1590524534686972, "grad_norm": 5.184245586395264, "learning_rate": 3.7651019814126656e-06, "loss": 0.7274, "step": 319 }, { "epoch": 2.1658206429780034, "grad_norm": 6.21406364440918, "learning_rate": 3.7077490938942307e-06, "loss": 0.6568, "step": 320 }, { "epoch": 2.1725888324873095, "grad_norm": 4.0037126541137695, "learning_rate": 3.6507368090616014e-06, "loss": 1.3107, "step": 321 }, { "epoch": 2.179357021996616, "grad_norm": 3.928704023361206, "learning_rate": 3.594068213018249e-06, "loss": 1.6134, "step": 322 }, { "epoch": 2.186125211505922, "grad_norm": 4.245754718780518, "learning_rate": 3.53774637326359e-06, "loss": 1.1365, "step": 323 }, { "epoch": 2.1928934010152283, "grad_norm": 4.95959997177124, "learning_rate": 3.481774338526954e-06, "loss": 0.9044, "step": 324 }, { "epoch": 2.199661590524535, "grad_norm": 5.446150779724121, "learning_rate": 3.426155138602558e-06, "loss": 0.7641, "step": 325 }, { "epoch": 2.206429780033841, "grad_norm": 6.039018630981445, "learning_rate": 3.3708917841854782e-06, "loss": 0.6246, "step": 326 }, { "epoch": 2.213197969543147, "grad_norm": 5.4581427574157715, "learning_rate": 3.3159872667087077e-06, "loss": 0.9867, "step": 327 }, { "epoch": 2.2199661590524533, "grad_norm": 3.5558555126190186, "learning_rate": 3.2614445581812183e-06, "loss": 1.8462, "step": 328 }, { "epoch": 2.22673434856176, "grad_norm": 3.6573843955993652, "learning_rate": 3.207266611027069e-06, "loss": 1.2127, "step": 329 }, { "epoch": 2.233502538071066, "grad_norm": 4.450440406799316, "learning_rate": 3.1534563579256172e-06, "loss": 0.8708, "step": 330 }, { "epoch": 2.240270727580372, "grad_norm": 4.951565742492676, "learning_rate": 3.1000167116527525e-06, "loss": 0.7292, "step": 331 }, { "epoch": 2.2470389170896787, "grad_norm": 5.573976516723633, "learning_rate": 3.0469505649232333e-06, "loss": 0.6443, "step": 332 }, { "epoch": 2.253807106598985, "grad_norm": 5.918398857116699, "learning_rate": 2.9942607902340946e-06, "loss": 0.5702, "step": 333 }, { "epoch": 2.260575296108291, "grad_norm": 3.47131085395813, "learning_rate": 2.9419502397091715e-06, "loss": 1.9211, "step": 334 }, { "epoch": 2.267343485617597, "grad_norm": 3.5219008922576904, "learning_rate": 2.8900217449447077e-06, "loss": 1.3083, "step": 335 }, { "epoch": 2.2741116751269037, "grad_norm": 4.355684280395508, "learning_rate": 2.8384781168560693e-06, "loss": 0.9792, "step": 336 }, { "epoch": 2.28087986463621, "grad_norm": 4.637706756591797, "learning_rate": 2.7873221455256006e-06, "loss": 0.8071, "step": 337 }, { "epoch": 2.287648054145516, "grad_norm": 5.174313068389893, "learning_rate": 2.736556600051593e-06, "loss": 0.658, "step": 338 }, { "epoch": 2.2944162436548226, "grad_norm": 5.701674461364746, "learning_rate": 2.6861842283983953e-06, "loss": 0.5716, "step": 339 }, { "epoch": 2.3011844331641287, "grad_norm": 3.603616714477539, "learning_rate": 2.6362077572476495e-06, "loss": 1.6573, "step": 340 }, { "epoch": 2.307952622673435, "grad_norm": 3.46175217628479, "learning_rate": 2.586629891850716e-06, "loss": 1.5, "step": 341 }, { "epoch": 2.314720812182741, "grad_norm": 4.137648105621338, "learning_rate": 2.5374533158822225e-06, "loss": 1.2069, "step": 342 }, { "epoch": 2.3214890016920475, "grad_norm": 
4.514110565185547, "learning_rate": 2.4886806912948034e-06, "loss": 0.8226, "step": 343 }, { "epoch": 2.3282571912013537, "grad_norm": 5.722095012664795, "learning_rate": 2.4403146581749925e-06, "loss": 0.7436, "step": 344 }, { "epoch": 2.33502538071066, "grad_norm": 5.680308818817139, "learning_rate": 2.392357834600336e-06, "loss": 0.583, "step": 345 }, { "epoch": 2.3417935702199664, "grad_norm": 4.119960784912109, "learning_rate": 2.3448128164976593e-06, "loss": 1.2645, "step": 346 }, { "epoch": 2.3485617597292725, "grad_norm": 3.0717074871063232, "learning_rate": 2.297682177502546e-06, "loss": 1.538, "step": 347 }, { "epoch": 2.3553299492385786, "grad_norm": 4.0398335456848145, "learning_rate": 2.2509684688200385e-06, "loss": 1.0585, "step": 348 }, { "epoch": 2.3620981387478848, "grad_norm": 4.792836666107178, "learning_rate": 2.204674219086531e-06, "loss": 0.8199, "step": 349 }, { "epoch": 2.3688663282571913, "grad_norm": 5.0710883140563965, "learning_rate": 2.158801934232897e-06, "loss": 0.6387, "step": 350 }, { "epoch": 2.3756345177664975, "grad_norm": 5.128052234649658, "learning_rate": 2.113354097348834e-06, "loss": 0.5425, "step": 351 }, { "epoch": 2.3824027072758036, "grad_norm": 4.384050369262695, "learning_rate": 2.0683331685484655e-06, "loss": 0.9253, "step": 352 }, { "epoch": 2.38917089678511, "grad_norm": 3.5379750728607178, "learning_rate": 2.0237415848371666e-06, "loss": 1.9209, "step": 353 }, { "epoch": 2.3959390862944163, "grad_norm": 3.754819631576538, "learning_rate": 1.979581759979642e-06, "loss": 1.2382, "step": 354 }, { "epoch": 2.4027072758037225, "grad_norm": 4.621876239776611, "learning_rate": 1.9358560843692787e-06, "loss": 0.9402, "step": 355 }, { "epoch": 2.4094754653130286, "grad_norm": 5.234630584716797, "learning_rate": 1.892566924898751e-06, "loss": 0.7772, "step": 356 }, { "epoch": 2.416243654822335, "grad_norm": 6.046688079833984, "learning_rate": 1.8497166248318876e-06, "loss": 0.6619, "step": 357 }, { "epoch": 2.4230118443316413, "grad_norm": 5.794624328613281, "learning_rate": 1.807307503676846e-06, "loss": 0.5626, "step": 358 }, { "epoch": 2.4297800338409474, "grad_norm": 3.271219253540039, "learning_rate": 1.7653418570605474e-06, "loss": 1.8406, "step": 359 }, { "epoch": 2.436548223350254, "grad_norm": 3.5998053550720215, "learning_rate": 1.7238219566044145e-06, "loss": 1.3465, "step": 360 }, { "epoch": 2.44331641285956, "grad_norm": 4.231540679931641, "learning_rate": 1.6827500498014026e-06, "loss": 1.0409, "step": 361 }, { "epoch": 2.4500846023688663, "grad_norm": 4.704120635986328, "learning_rate": 1.6421283598943526e-06, "loss": 0.7836, "step": 362 }, { "epoch": 2.4568527918781724, "grad_norm": 5.459336757659912, "learning_rate": 1.601959085755641e-06, "loss": 0.6894, "step": 363 }, { "epoch": 2.463620981387479, "grad_norm": 5.819806098937988, "learning_rate": 1.5622444017681438e-06, "loss": 0.5779, "step": 364 }, { "epoch": 2.470389170896785, "grad_norm": 3.3774378299713135, "learning_rate": 1.5229864577075548e-06, "loss": 1.6054, "step": 365 }, { "epoch": 2.4771573604060912, "grad_norm": 3.6015894412994385, "learning_rate": 1.4841873786260019e-06, "loss": 1.4402, "step": 366 }, { "epoch": 2.483925549915398, "grad_norm": 4.313451766967773, "learning_rate": 1.445849264737026e-06, "loss": 0.9478, "step": 367 }, { "epoch": 2.490693739424704, "grad_norm": 5.062134265899658, "learning_rate": 1.4079741913018863e-06, "loss": 0.8397, "step": 368 }, { "epoch": 2.49746192893401, "grad_norm": 5.357868194580078, "learning_rate": 1.3705642085172367e-06, 
"loss": 0.5926, "step": 369 }, { "epoch": 2.504230118443316, "grad_norm": 5.440720558166504, "learning_rate": 1.3336213414041387e-06, "loss": 0.5514, "step": 370 }, { "epoch": 2.510998307952623, "grad_norm": 3.912086248397827, "learning_rate": 1.2971475896984475e-06, "loss": 1.3175, "step": 371 }, { "epoch": 2.517766497461929, "grad_norm": 3.4451215267181396, "learning_rate": 1.2611449277425715e-06, "loss": 1.7101, "step": 372 }, { "epoch": 2.524534686971235, "grad_norm": 3.9773149490356445, "learning_rate": 1.2256153043785911e-06, "loss": 1.0656, "step": 373 }, { "epoch": 2.5313028764805416, "grad_norm": 4.944250106811523, "learning_rate": 1.1905606428427775e-06, "loss": 0.88, "step": 374 }, { "epoch": 2.5380710659898478, "grad_norm": 5.475653171539307, "learning_rate": 1.1559828406614716e-06, "loss": 0.6841, "step": 375 }, { "epoch": 2.544839255499154, "grad_norm": 6.01757287979126, "learning_rate": 1.1218837695483853e-06, "loss": 0.5779, "step": 376 }, { "epoch": 2.55160744500846, "grad_norm": 4.852456569671631, "learning_rate": 1.0882652753032797e-06, "loss": 0.9251, "step": 377 }, { "epoch": 2.5583756345177666, "grad_norm": 3.3849077224731445, "learning_rate": 1.0551291777120465e-06, "loss": 1.936, "step": 378 }, { "epoch": 2.5651438240270727, "grad_norm": 3.5754101276397705, "learning_rate": 1.0224772704482033e-06, "loss": 1.2097, "step": 379 }, { "epoch": 2.571912013536379, "grad_norm": 4.511597633361816, "learning_rate": 9.903113209758098e-07, "loss": 0.999, "step": 380 }, { "epoch": 2.5786802030456855, "grad_norm": 4.68709659576416, "learning_rate": 9.58633070453785e-07, "loss": 0.7136, "step": 381 }, { "epoch": 2.5854483925549916, "grad_norm": 6.031564712524414, "learning_rate": 9.274442336416567e-07, "loss": 0.6703, "step": 382 }, { "epoch": 2.5922165820642977, "grad_norm": 5.676982402801514, "learning_rate": 8.967464988067476e-07, "loss": 0.5741, "step": 383 }, { "epoch": 2.598984771573604, "grad_norm": 3.2455053329467773, "learning_rate": 8.665415276327871e-07, "loss": 2.0339, "step": 384 }, { "epoch": 2.6057529610829104, "grad_norm": 3.556863784790039, "learning_rate": 8.368309551299536e-07, "loss": 1.325, "step": 385 }, { "epoch": 2.6125211505922166, "grad_norm": 4.362199306488037, "learning_rate": 8.076163895463862e-07, "loss": 1.0078, "step": 386 }, { "epoch": 2.6192893401015227, "grad_norm": 4.831475257873535, "learning_rate": 7.788994122811178e-07, "loss": 0.8614, "step": 387 }, { "epoch": 2.6260575296108293, "grad_norm": 5.154886245727539, "learning_rate": 7.506815777984788e-07, "loss": 0.5961, "step": 388 }, { "epoch": 2.6328257191201354, "grad_norm": 5.211191654205322, "learning_rate": 7.229644135439473e-07, "loss": 0.5393, "step": 389 }, { "epoch": 2.6395939086294415, "grad_norm": 3.482469081878662, "learning_rate": 6.957494198614778e-07, "loss": 1.725, "step": 390 }, { "epoch": 2.6463620981387477, "grad_norm": 3.1785717010498047, "learning_rate": 6.690380699122767e-07, "loss": 1.58, "step": 391 }, { "epoch": 2.6531302876480543, "grad_norm": 3.89457368850708, "learning_rate": 6.428318095950648e-07, "loss": 1.0373, "step": 392 }, { "epoch": 2.6598984771573604, "grad_norm": 4.530887126922607, "learning_rate": 6.171320574678064e-07, "loss": 0.8817, "step": 393 }, { "epoch": 2.6666666666666665, "grad_norm": 5.074831008911133, "learning_rate": 5.919402046709288e-07, "loss": 0.6199, "step": 394 }, { "epoch": 2.673434856175973, "grad_norm": 5.216182231903076, "learning_rate": 5.672576148520136e-07, "loss": 0.5777, "step": 395 }, { "epoch": 2.6802030456852792, "grad_norm": 
4.10127067565918, "learning_rate": 5.430856240919779e-07, "loss": 1.3062, "step": 396 }, { "epoch": 2.6869712351945854, "grad_norm": 3.1244003772735596, "learning_rate": 5.19425540832762e-07, "loss": 1.6781, "step": 397 }, { "epoch": 2.6937394247038915, "grad_norm": 3.911149501800537, "learning_rate": 4.962786458064972e-07, "loss": 1.1371, "step": 398 }, { "epoch": 2.700507614213198, "grad_norm": 4.34928560256958, "learning_rate": 4.73646191966175e-07, "loss": 0.9204, "step": 399 }, { "epoch": 2.707275803722504, "grad_norm": 5.3247551918029785, "learning_rate": 4.515294044178331e-07, "loss": 0.7054, "step": 400 }, { "epoch": 2.7140439932318103, "grad_norm": 5.384613990783691, "learning_rate": 4.299294803542331e-07, "loss": 0.6055, "step": 401 }, { "epoch": 2.720812182741117, "grad_norm": 4.743732452392578, "learning_rate": 4.0884758899006007e-07, "loss": 0.9645, "step": 402 }, { "epoch": 2.727580372250423, "grad_norm": 3.20401930809021, "learning_rate": 3.882848714986243e-07, "loss": 1.8007, "step": 403 }, { "epoch": 2.734348561759729, "grad_norm": 3.646068811416626, "learning_rate": 3.6824244095010064e-07, "loss": 1.2693, "step": 404 }, { "epoch": 2.7411167512690353, "grad_norm": 4.450921058654785, "learning_rate": 3.4872138225127137e-07, "loss": 0.9719, "step": 405 }, { "epoch": 2.747884940778342, "grad_norm": 4.953512668609619, "learning_rate": 3.2972275208679625e-07, "loss": 0.8174, "step": 406 }, { "epoch": 2.754653130287648, "grad_norm": 5.153395175933838, "learning_rate": 3.112475788620217e-07, "loss": 0.6039, "step": 407 }, { "epoch": 2.761421319796954, "grad_norm": 5.268378257751465, "learning_rate": 2.932968626473065e-07, "loss": 0.5195, "step": 408 }, { "epoch": 2.7681895093062607, "grad_norm": 3.0056629180908203, "learning_rate": 2.758715751238872e-07, "loss": 1.8951, "step": 409 }, { "epoch": 2.774957698815567, "grad_norm": 3.4584174156188965, "learning_rate": 2.589726595312858e-07, "loss": 1.258, "step": 410 }, { "epoch": 2.781725888324873, "grad_norm": 4.383852481842041, "learning_rate": 2.426010306162485e-07, "loss": 0.947, "step": 411 }, { "epoch": 2.788494077834179, "grad_norm": 5.073363304138184, "learning_rate": 2.2675757458323066e-07, "loss": 0.7545, "step": 412 }, { "epoch": 2.7952622673434857, "grad_norm": 5.620312213897705, "learning_rate": 2.1144314904642194e-07, "loss": 0.612, "step": 413 }, { "epoch": 2.802030456852792, "grad_norm": 5.6198530197143555, "learning_rate": 1.9665858298333006e-07, "loss": 0.584, "step": 414 }, { "epoch": 2.808798646362098, "grad_norm": 3.5097062587738037, "learning_rate": 1.824046766899046e-07, "loss": 1.7121, "step": 415 }, { "epoch": 2.8155668358714045, "grad_norm": 3.444314479827881, "learning_rate": 1.6868220173721472e-07, "loss": 1.4931, "step": 416 }, { "epoch": 2.8223350253807107, "grad_norm": 4.086292743682861, "learning_rate": 1.5549190092968736e-07, "loss": 0.96, "step": 417 }, { "epoch": 2.829103214890017, "grad_norm": 4.669743061065674, "learning_rate": 1.4283448826489798e-07, "loss": 0.7836, "step": 418 }, { "epoch": 2.835871404399323, "grad_norm": 5.329158782958984, "learning_rate": 1.3071064889491723e-07, "loss": 0.713, "step": 419 }, { "epoch": 2.8426395939086295, "grad_norm": 5.7975664138793945, "learning_rate": 1.1912103908922945e-07, "loss": 0.5847, "step": 420 }, { "epoch": 2.8494077834179357, "grad_norm": 3.9617557525634766, "learning_rate": 1.0806628619920322e-07, "loss": 1.3451, "step": 421 }, { "epoch": 2.8561759729272422, "grad_norm": 3.1097750663757324, "learning_rate": 9.754698862413758e-08, "loss": 1.4706, 
"step": 422 }, { "epoch": 2.8629441624365484, "grad_norm": 3.8327267169952393, "learning_rate": 8.756371577886891e-08, "loss": 1.1243, "step": 423 }, { "epoch": 2.8697123519458545, "grad_norm": 4.473137855529785, "learning_rate": 7.81170080629412e-08, "loss": 0.9026, "step": 424 }, { "epoch": 2.8764805414551606, "grad_norm": 5.105331897735596, "learning_rate": 6.920737683136614e-08, "loss": 0.6808, "step": 425 }, { "epoch": 2.8832487309644668, "grad_norm": 6.36986780166626, "learning_rate": 6.083530436693408e-08, "loss": 0.6489, "step": 426 }, { "epoch": 2.8900169204737733, "grad_norm": 4.548642635345459, "learning_rate": 5.300124385410943e-08, "loss": 0.9101, "step": 427 }, { "epoch": 2.8967851099830795, "grad_norm": 3.0376877784729004, "learning_rate": 4.570561935450468e-08, "loss": 1.8025, "step": 428 }, { "epoch": 2.903553299492386, "grad_norm": 3.8811256885528564, "learning_rate": 3.894882578391879e-08, "loss": 1.2524, "step": 429 }, { "epoch": 2.910321489001692, "grad_norm": 4.729759693145752, "learning_rate": 3.273122889096536e-08, "loss": 0.8427, "step": 430 }, { "epoch": 2.9170896785109983, "grad_norm": 5.370926856994629, "learning_rate": 2.705316523726853e-08, "loss": 0.6915, "step": 431 }, { "epoch": 2.9238578680203045, "grad_norm": 5.328631401062012, "learning_rate": 2.1914942179253052e-08, "loss": 0.5848, "step": 432 }, { "epoch": 2.9306260575296106, "grad_norm": 5.493933200836182, "learning_rate": 1.7316837851499845e-08, "loss": 0.5372, "step": 433 }, { "epoch": 2.937394247038917, "grad_norm": 3.3873283863067627, "learning_rate": 1.325910115169471e-08, "loss": 1.9925, "step": 434 }, { "epoch": 2.9441624365482233, "grad_norm": 3.6963796615600586, "learning_rate": 9.74195172715242e-09, "loss": 1.3562, "step": 435 }, { "epoch": 2.95093062605753, "grad_norm": 4.348772048950195, "learning_rate": 6.7655799629284815e-09, "loss": 0.8997, "step": 436 }, { "epoch": 2.957698815566836, "grad_norm": 4.724446773529053, "learning_rate": 4.330146971515126e-09, "loss": 0.7332, "step": 437 }, { "epoch": 2.964467005076142, "grad_norm": 6.034689903259277, "learning_rate": 2.435784584114975e-09, "loss": 0.663, "step": 438 }, { "epoch": 2.9712351945854483, "grad_norm": 5.839619159698486, "learning_rate": 1.0825953435122938e-09, "loss": 0.5757, "step": 439 }, { "epoch": 2.9780033840947544, "grad_norm": 3.8592047691345215, "learning_rate": 2.706524985174319e-10, "loss": 1.5785, "step": 440 }, { "epoch": 2.984771573604061, "grad_norm": 3.915963888168335, "learning_rate": 0.0, "loss": 1.1236, "step": 441 }, { "epoch": 2.984771573604061, "step": 441, "total_flos": 4.015937399291904e+16, "train_loss": 1.9800935814710432, "train_runtime": 828.1657, "train_samples_per_second": 34.203, "train_steps_per_second": 0.533 } ], "logging_steps": 1, "max_steps": 441, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.015937399291904e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }