{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0786815998096173, "eval_steps": 200, "global_step": 105800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 7.43682417860277e-05, "grad_norm": 0.4972322881221771, "learning_rate": 1.9999999990147362e-05, "loss": 1.9714, "step": 100 }, { "epoch": 0.0001487364835720554, "grad_norm": 0.6138768792152405, "learning_rate": 1.9999999958487906e-05, "loss": 1.6983, "step": 200 }, { "epoch": 0.0002231047253580831, "grad_norm": 0.92356276512146, "learning_rate": 1.999999990499435e-05, "loss": 1.6566, "step": 300 }, { "epoch": 0.0002974729671441108, "grad_norm": 0.5427595376968384, "learning_rate": 1.9999999829666684e-05, "loss": 1.6238, "step": 400 }, { "epoch": 0.00037184120893013847, "grad_norm": 1.5316662788391113, "learning_rate": 1.9999999732504913e-05, "loss": 1.6102, "step": 500 }, { "epoch": 0.0004462094507161662, "grad_norm": 0.477271169424057, "learning_rate": 1.999999961350904e-05, "loss": 1.5936, "step": 600 }, { "epoch": 0.0005205776925021939, "grad_norm": 1.1669890880584717, "learning_rate": 1.9999999472679058e-05, "loss": 1.6411, "step": 700 }, { "epoch": 0.0005949459342882216, "grad_norm": 0.6108381748199463, "learning_rate": 1.9999999310014972e-05, "loss": 1.5256, "step": 800 }, { "epoch": 0.0006693141760742492, "grad_norm": 0.6316787004470825, "learning_rate": 1.9999999125516783e-05, "loss": 1.6363, "step": 900 }, { "epoch": 0.0007436824178602769, "grad_norm": 0.8376529216766357, "learning_rate": 1.999999891918449e-05, "loss": 1.5394, "step": 1000 }, { "epoch": 0.0008180506596463047, "grad_norm": 1.0385313034057617, "learning_rate": 1.9999998691018094e-05, "loss": 1.638, "step": 1100 }, { "epoch": 0.0008924189014323324, "grad_norm": 0.8692856431007385, "learning_rate": 1.9999998441017593e-05, "loss": 1.7291, "step": 1200 }, { "epoch": 0.00096678714321836, "grad_norm": 1.059537410736084, "learning_rate": 1.9999998169182993e-05, "loss": 1.6251, "step": 1300 }, { "epoch": 0.0010411553850043877, "grad_norm": 0.3969714045524597, "learning_rate": 1.999999787551429e-05, "loss": 1.5324, "step": 1400 }, { "epoch": 0.0011155236267904154, "grad_norm": 0.6760269403457642, "learning_rate": 1.9999997560011483e-05, "loss": 1.5262, "step": 1500 }, { "epoch": 0.0011898918685764432, "grad_norm": 0.6536991596221924, "learning_rate": 1.9999997222674577e-05, "loss": 1.6578, "step": 1600 }, { "epoch": 0.0012642601103624708, "grad_norm": 1.2318875789642334, "learning_rate": 1.999999686350357e-05, "loss": 1.5478, "step": 1700 }, { "epoch": 0.0013386283521484984, "grad_norm": 1.3172451257705688, "learning_rate": 1.999999648249847e-05, "loss": 1.5926, "step": 1800 }, { "epoch": 0.0014129965939345263, "grad_norm": 1.4219484329223633, "learning_rate": 1.9999996079659265e-05, "loss": 1.5595, "step": 1900 }, { "epoch": 0.0014873648357205539, "grad_norm": 0.6480392813682556, "learning_rate": 1.9999995654985968e-05, "loss": 1.5321, "step": 2000 }, { "epoch": 0.0015617330775065815, "grad_norm": 0.5489968061447144, "learning_rate": 1.999999520847857e-05, "loss": 1.5744, "step": 2100 }, { "epoch": 0.0016361013192926093, "grad_norm": 0.7695141434669495, "learning_rate": 1.999999474013708e-05, "loss": 1.5263, "step": 2200 }, { "epoch": 0.001710469561078637, "grad_norm": 0.7596250176429749, "learning_rate": 1.9999994249961495e-05, "loss": 1.5586, "step": 2300 }, { "epoch": 0.0017848378028646648, "grad_norm": 0.8226674795150757, "learning_rate": 1.9999993737951816e-05, "loss": 1.6021, "step": 2400 }, { "epoch": 0.0018592060446506924, "grad_norm": 0.5418084859848022, "learning_rate": 1.9999993204108044e-05, "loss": 1.6234, "step": 2500 }, { "epoch": 0.00193357428643672, "grad_norm": 0.5253565907478333, "learning_rate": 1.9999992648430182e-05, "loss": 1.5487, "step": 2600 }, { "epoch": 0.0020079425282227476, "grad_norm": 1.0812253952026367, "learning_rate": 1.999999207091823e-05, "loss": 1.6161, "step": 2700 }, { "epoch": 0.0020823107700087755, "grad_norm": 0.6357698440551758, "learning_rate": 1.999999147157219e-05, "loss": 1.5693, "step": 2800 }, { "epoch": 0.0021566790117948033, "grad_norm": 0.9794847369194031, "learning_rate": 1.9999990850392064e-05, "loss": 1.5337, "step": 2900 }, { "epoch": 0.0022310472535808307, "grad_norm": 0.5611212849617004, "learning_rate": 1.9999990207377848e-05, "loss": 1.6034, "step": 3000 }, { "epoch": 0.0023054154953668585, "grad_norm": 0.8199095129966736, "learning_rate": 1.999998954252955e-05, "loss": 1.5291, "step": 3100 }, { "epoch": 0.0023797837371528864, "grad_norm": 0.6310203075408936, "learning_rate": 1.999998885584717e-05, "loss": 1.5337, "step": 3200 }, { "epoch": 0.0024541519789389138, "grad_norm": 0.8682138919830322, "learning_rate": 1.9999988147330707e-05, "loss": 1.5383, "step": 3300 }, { "epoch": 0.0025285202207249416, "grad_norm": 0.6630149483680725, "learning_rate": 1.9999987416980167e-05, "loss": 1.6387, "step": 3400 }, { "epoch": 0.0026028884625109694, "grad_norm": 0.5285632014274597, "learning_rate": 1.9999986664795547e-05, "loss": 1.5507, "step": 3500 }, { "epoch": 0.002677256704296997, "grad_norm": 0.5242965221405029, "learning_rate": 1.9999985890776846e-05, "loss": 1.6422, "step": 3600 }, { "epoch": 0.0027516249460830247, "grad_norm": 0.4600646495819092, "learning_rate": 1.9999985094924076e-05, "loss": 1.6473, "step": 3700 }, { "epoch": 0.0028259931878690525, "grad_norm": 0.6593307256698608, "learning_rate": 1.999998427723723e-05, "loss": 1.5936, "step": 3800 }, { "epoch": 0.00290036142965508, "grad_norm": 0.3825130760669708, "learning_rate": 1.9999983437716315e-05, "loss": 1.5509, "step": 3900 }, { "epoch": 0.0029747296714411077, "grad_norm": 0.46043556928634644, "learning_rate": 1.999998257636133e-05, "loss": 1.5043, "step": 4000 }, { "epoch": 0.0030490979132271356, "grad_norm": 0.751379132270813, "learning_rate": 1.999998169317227e-05, "loss": 1.5258, "step": 4100 }, { "epoch": 0.003123466155013163, "grad_norm": 0.5719695687294006, "learning_rate": 1.9999980788149155e-05, "loss": 1.669, "step": 4200 }, { "epoch": 0.003197834396799191, "grad_norm": 0.5489699244499207, "learning_rate": 1.999997986129197e-05, "loss": 1.5581, "step": 4300 }, { "epoch": 0.0032722026385852187, "grad_norm": 0.5944995880126953, "learning_rate": 1.9999978912600722e-05, "loss": 1.5717, "step": 4400 }, { "epoch": 0.003346570880371246, "grad_norm": 0.4564272165298462, "learning_rate": 1.9999977942075416e-05, "loss": 1.5178, "step": 4500 }, { "epoch": 0.003420939122157274, "grad_norm": 1.082127571105957, "learning_rate": 1.9999976949716057e-05, "loss": 1.6077, "step": 4600 }, { "epoch": 0.0034953073639433017, "grad_norm": 0.7081079483032227, "learning_rate": 1.9999975935522635e-05, "loss": 1.6147, "step": 4700 }, { "epoch": 0.0035696756057293296, "grad_norm": 1.084369421005249, "learning_rate": 1.9999974899495163e-05, "loss": 1.5796, "step": 4800 }, { "epoch": 0.003644043847515357, "grad_norm": 0.5583994388580322, "learning_rate": 1.999997384163364e-05, "loss": 1.5099, "step": 4900 }, { "epoch": 0.003718412089301385, "grad_norm": 0.563099205493927, "learning_rate": 1.999997276193807e-05, "loss": 1.5222, "step": 5000 }, { "epoch": 0.0037927803310874126, "grad_norm": 0.6037421822547913, "learning_rate": 1.9999971660408454e-05, "loss": 1.5916, "step": 5100 }, { "epoch": 0.00386714857287344, "grad_norm": 0.5209466218948364, "learning_rate": 1.9999970537044787e-05, "loss": 1.6196, "step": 5200 }, { "epoch": 0.003941516814659467, "grad_norm": 1.0418217182159424, "learning_rate": 1.9999969391847088e-05, "loss": 1.601, "step": 5300 }, { "epoch": 0.004015885056445495, "grad_norm": 1.235737681388855, "learning_rate": 1.9999968224815345e-05, "loss": 1.4994, "step": 5400 }, { "epoch": 0.004090253298231523, "grad_norm": 1.1249513626098633, "learning_rate": 1.9999967035949567e-05, "loss": 1.5871, "step": 5500 }, { "epoch": 0.004164621540017551, "grad_norm": 0.8271663784980774, "learning_rate": 1.9999965825249753e-05, "loss": 1.5734, "step": 5600 }, { "epoch": 0.004238989781803579, "grad_norm": 0.6501545906066895, "learning_rate": 1.999996459271591e-05, "loss": 1.5859, "step": 5700 }, { "epoch": 0.004313358023589607, "grad_norm": 0.6576992273330688, "learning_rate": 1.9999963338348036e-05, "loss": 1.5457, "step": 5800 }, { "epoch": 0.004387726265375634, "grad_norm": 0.5684088468551636, "learning_rate": 1.9999962062146138e-05, "loss": 1.5518, "step": 5900 }, { "epoch": 0.004462094507161661, "grad_norm": 0.6332255005836487, "learning_rate": 1.9999960764110216e-05, "loss": 1.586, "step": 6000 }, { "epoch": 0.004536462748947689, "grad_norm": 0.4564649164676666, "learning_rate": 1.9999959444240276e-05, "loss": 1.5249, "step": 6100 }, { "epoch": 0.004610830990733717, "grad_norm": 0.5801929235458374, "learning_rate": 1.9999958102536316e-05, "loss": 1.5849, "step": 6200 }, { "epoch": 0.004685199232519745, "grad_norm": 0.8843029737472534, "learning_rate": 1.9999956738998345e-05, "loss": 1.5055, "step": 6300 }, { "epoch": 0.004759567474305773, "grad_norm": 0.7232934832572937, "learning_rate": 1.999995535362636e-05, "loss": 1.5706, "step": 6400 }, { "epoch": 0.004833935716091801, "grad_norm": 0.7958771586418152, "learning_rate": 1.9999953946420368e-05, "loss": 1.5943, "step": 6500 }, { "epoch": 0.0049083039578778275, "grad_norm": 0.7699094414710999, "learning_rate": 1.999995251738037e-05, "loss": 1.6173, "step": 6600 }, { "epoch": 0.004982672199663855, "grad_norm": 0.43996062874794006, "learning_rate": 1.9999951066506368e-05, "loss": 1.5154, "step": 6700 }, { "epoch": 0.005057040441449883, "grad_norm": 0.773326575756073, "learning_rate": 1.9999949593798372e-05, "loss": 1.5791, "step": 6800 }, { "epoch": 0.005131408683235911, "grad_norm": 0.42401251196861267, "learning_rate": 1.9999948099256374e-05, "loss": 1.5429, "step": 6900 }, { "epoch": 0.005205776925021939, "grad_norm": 0.44549378752708435, "learning_rate": 1.999994658288039e-05, "loss": 1.605, "step": 7000 }, { "epoch": 0.005280145166807967, "grad_norm": 0.5648560523986816, "learning_rate": 1.999994504467041e-05, "loss": 1.5536, "step": 7100 }, { "epoch": 0.005354513408593994, "grad_norm": 1.0245320796966553, "learning_rate": 1.999994348462645e-05, "loss": 1.5509, "step": 7200 }, { "epoch": 0.0054288816503800215, "grad_norm": 0.9695309996604919, "learning_rate": 1.9999941902748505e-05, "loss": 1.5892, "step": 7300 }, { "epoch": 0.005503249892166049, "grad_norm": 0.9779026508331299, "learning_rate": 1.9999940299036584e-05, "loss": 1.5901, "step": 7400 }, { "epoch": 0.005577618133952077, "grad_norm": 0.7186980247497559, "learning_rate": 1.999993867349068e-05, "loss": 1.5402, "step": 7500 }, { "epoch": 0.005651986375738105, "grad_norm": 0.751449704170227, "learning_rate": 1.9999937026110813e-05, "loss": 1.5217, "step": 7600 }, { "epoch": 0.005726354617524133, "grad_norm": 0.7808834314346313, "learning_rate": 1.999993535689697e-05, "loss": 1.5254, "step": 7700 }, { "epoch": 0.00580072285931016, "grad_norm": 0.529984176158905, "learning_rate": 1.999993366584917e-05, "loss": 1.6134, "step": 7800 }, { "epoch": 0.005875091101096188, "grad_norm": 0.8374336361885071, "learning_rate": 1.9999931952967404e-05, "loss": 1.4759, "step": 7900 }, { "epoch": 0.0059494593428822155, "grad_norm": 0.3483956754207611, "learning_rate": 1.9999930218251683e-05, "loss": 1.5905, "step": 8000 }, { "epoch": 0.006023827584668243, "grad_norm": 0.8897103667259216, "learning_rate": 1.9999928461702004e-05, "loss": 1.6492, "step": 8100 }, { "epoch": 0.006098195826454271, "grad_norm": 0.5743923783302307, "learning_rate": 1.999992668331838e-05, "loss": 1.5322, "step": 8200 }, { "epoch": 0.006172564068240299, "grad_norm": 1.3532215356826782, "learning_rate": 1.999992488310081e-05, "loss": 1.5155, "step": 8300 }, { "epoch": 0.006246932310026326, "grad_norm": 1.118270754814148, "learning_rate": 1.9999923061049298e-05, "loss": 1.517, "step": 8400 }, { "epoch": 0.006321300551812354, "grad_norm": 1.0752383470535278, "learning_rate": 1.9999921217163847e-05, "loss": 1.5654, "step": 8500 }, { "epoch": 0.006395668793598382, "grad_norm": 0.4761950671672821, "learning_rate": 1.999991935144446e-05, "loss": 1.588, "step": 8600 }, { "epoch": 0.0064700370353844095, "grad_norm": 0.4377930164337158, "learning_rate": 1.9999917463891147e-05, "loss": 1.5932, "step": 8700 }, { "epoch": 0.006544405277170437, "grad_norm": 0.5289610624313354, "learning_rate": 1.9999915554503908e-05, "loss": 1.5362, "step": 8800 }, { "epoch": 0.006618773518956465, "grad_norm": 0.6469466090202332, "learning_rate": 1.9999913623282747e-05, "loss": 1.5515, "step": 8900 }, { "epoch": 0.006693141760742492, "grad_norm": 0.8052897453308105, "learning_rate": 1.999991167022767e-05, "loss": 1.4691, "step": 9000 }, { "epoch": 0.00676751000252852, "grad_norm": 0.4677363932132721, "learning_rate": 1.999990969533868e-05, "loss": 1.5955, "step": 9100 }, { "epoch": 0.006841878244314548, "grad_norm": 0.9299643039703369, "learning_rate": 1.9999907698615777e-05, "loss": 1.5657, "step": 9200 }, { "epoch": 0.006916246486100576, "grad_norm": 0.5175402164459229, "learning_rate": 1.9999905680058974e-05, "loss": 1.5471, "step": 9300 }, { "epoch": 0.0069906147278866035, "grad_norm": 0.6280660033226013, "learning_rate": 1.999990363966827e-05, "loss": 1.5465, "step": 9400 }, { "epoch": 0.007064982969672631, "grad_norm": 0.5920536518096924, "learning_rate": 1.999990157744367e-05, "loss": 1.5587, "step": 9500 }, { "epoch": 0.007139351211458659, "grad_norm": 0.6226286292076111, "learning_rate": 1.999989949338518e-05, "loss": 1.5399, "step": 9600 }, { "epoch": 0.007213719453244686, "grad_norm": 0.757337749004364, "learning_rate": 1.9999897387492803e-05, "loss": 1.5142, "step": 9700 }, { "epoch": 0.007288087695030714, "grad_norm": 0.5596433281898499, "learning_rate": 1.9999895259766547e-05, "loss": 1.4845, "step": 9800 }, { "epoch": 0.007362455936816742, "grad_norm": 0.8564650416374207, "learning_rate": 1.999989311020641e-05, "loss": 1.5007, "step": 9900 }, { "epoch": 0.00743682417860277, "grad_norm": 0.7305134534835815, "learning_rate": 1.99998909388124e-05, "loss": 1.4579, "step": 10000 }, { "epoch": 0.007511192420388797, "grad_norm": 0.5316299200057983, "learning_rate": 1.9999888745584525e-05, "loss": 1.5686, "step": 10100 }, { "epoch": 0.007585560662174825, "grad_norm": 0.8033043742179871, "learning_rate": 1.9999886530522786e-05, "loss": 1.5322, "step": 10200 }, { "epoch": 0.007659928903960852, "grad_norm": 0.8600965738296509, "learning_rate": 1.999988429362719e-05, "loss": 1.5171, "step": 10300 }, { "epoch": 0.00773429714574688, "grad_norm": 0.7395327091217041, "learning_rate": 1.9999882034897743e-05, "loss": 1.5651, "step": 10400 }, { "epoch": 0.007808665387532908, "grad_norm": 0.7305371761322021, "learning_rate": 1.9999879754334445e-05, "loss": 1.5098, "step": 10500 }, { "epoch": 0.007883033629318935, "grad_norm": 0.6956737637519836, "learning_rate": 1.99998774519373e-05, "loss": 1.6477, "step": 10600 }, { "epoch": 0.007957401871104964, "grad_norm": 0.8382702469825745, "learning_rate": 1.9999875127706324e-05, "loss": 1.5233, "step": 10700 }, { "epoch": 0.00803177011289099, "grad_norm": 0.37894684076309204, "learning_rate": 1.999987278164151e-05, "loss": 1.5008, "step": 10800 }, { "epoch": 0.00810613835467702, "grad_norm": 0.5010106563568115, "learning_rate": 1.9999870413742868e-05, "loss": 1.5424, "step": 10900 }, { "epoch": 0.008180506596463046, "grad_norm": 0.6536372900009155, "learning_rate": 1.9999868024010403e-05, "loss": 1.5774, "step": 11000 }, { "epoch": 0.008254874838249075, "grad_norm": 0.43751344084739685, "learning_rate": 1.9999865612444122e-05, "loss": 1.5887, "step": 11100 }, { "epoch": 0.008329243080035102, "grad_norm": 0.4979201853275299, "learning_rate": 1.999986317904403e-05, "loss": 1.5715, "step": 11200 }, { "epoch": 0.008403611321821129, "grad_norm": 0.513481855392456, "learning_rate": 1.9999860723810127e-05, "loss": 1.5182, "step": 11300 }, { "epoch": 0.008477979563607158, "grad_norm": 0.5014403462409973, "learning_rate": 1.9999858246742425e-05, "loss": 1.5185, "step": 11400 }, { "epoch": 0.008552347805393185, "grad_norm": 0.5066354274749756, "learning_rate": 1.9999855747840925e-05, "loss": 1.5964, "step": 11500 }, { "epoch": 0.008626716047179213, "grad_norm": 0.7306295037269592, "learning_rate": 1.999985322710564e-05, "loss": 1.6196, "step": 11600 }, { "epoch": 0.00870108428896524, "grad_norm": 0.3036212623119354, "learning_rate": 1.9999850684536562e-05, "loss": 1.5308, "step": 11700 }, { "epoch": 0.008775452530751267, "grad_norm": 0.51576167345047, "learning_rate": 1.999984812013371e-05, "loss": 1.5954, "step": 11800 }, { "epoch": 0.008849820772537296, "grad_norm": 0.7507824301719666, "learning_rate": 1.999984553389708e-05, "loss": 1.5184, "step": 11900 }, { "epoch": 0.008924189014323323, "grad_norm": 0.43882057070732117, "learning_rate": 1.999984292582668e-05, "loss": 1.5507, "step": 12000 }, { "epoch": 0.008998557256109352, "grad_norm": 1.0746114253997803, "learning_rate": 1.9999840295922518e-05, "loss": 1.5196, "step": 12100 }, { "epoch": 0.009072925497895378, "grad_norm": 0.6190723180770874, "learning_rate": 1.99998376441846e-05, "loss": 1.5683, "step": 12200 }, { "epoch": 0.009147293739681407, "grad_norm": 0.7086498141288757, "learning_rate": 1.9999834970612934e-05, "loss": 1.6125, "step": 12300 }, { "epoch": 0.009221661981467434, "grad_norm": 0.9270760416984558, "learning_rate": 1.999983227520752e-05, "loss": 1.6108, "step": 12400 }, { "epoch": 0.009296030223253461, "grad_norm": 0.47269493341445923, "learning_rate": 1.9999829557968365e-05, "loss": 1.5784, "step": 12500 }, { "epoch": 0.00937039846503949, "grad_norm": 0.888103723526001, "learning_rate": 1.9999826818895477e-05, "loss": 1.5406, "step": 12600 }, { "epoch": 0.009444766706825517, "grad_norm": 0.44103074073791504, "learning_rate": 1.9999824057988865e-05, "loss": 1.6345, "step": 12700 }, { "epoch": 0.009519134948611545, "grad_norm": 0.8790387511253357, "learning_rate": 1.999982127524853e-05, "loss": 1.5069, "step": 12800 }, { "epoch": 0.009593503190397572, "grad_norm": 0.7071767449378967, "learning_rate": 1.9999818470674474e-05, "loss": 1.5656, "step": 12900 }, { "epoch": 0.009667871432183601, "grad_norm": 0.36154705286026, "learning_rate": 1.9999815644266713e-05, "loss": 1.5022, "step": 13000 }, { "epoch": 0.009742239673969628, "grad_norm": 0.8780633807182312, "learning_rate": 1.9999812796025247e-05, "loss": 1.5585, "step": 13100 }, { "epoch": 0.009816607915755655, "grad_norm": 0.45413634181022644, "learning_rate": 1.9999809925950084e-05, "loss": 1.5732, "step": 13200 }, { "epoch": 0.009890976157541684, "grad_norm": 0.6584810614585876, "learning_rate": 1.999980703404123e-05, "loss": 1.5572, "step": 13300 }, { "epoch": 0.00996534439932771, "grad_norm": 0.4910299479961395, "learning_rate": 1.9999804120298694e-05, "loss": 1.5544, "step": 13400 }, { "epoch": 0.01003971264111374, "grad_norm": 0.4675331115722656, "learning_rate": 1.9999801184722477e-05, "loss": 1.5055, "step": 13500 }, { "epoch": 0.010114080882899766, "grad_norm": 0.7481106519699097, "learning_rate": 1.999979822731259e-05, "loss": 1.5148, "step": 13600 }, { "epoch": 0.010188449124685793, "grad_norm": 0.5710633993148804, "learning_rate": 1.9999795248069036e-05, "loss": 1.6359, "step": 13700 }, { "epoch": 0.010262817366471822, "grad_norm": 0.5404725074768066, "learning_rate": 1.999979224699183e-05, "loss": 1.5919, "step": 13800 }, { "epoch": 0.010337185608257849, "grad_norm": 0.5491372346878052, "learning_rate": 1.9999789224080965e-05, "loss": 1.6065, "step": 13900 }, { "epoch": 0.010411553850043878, "grad_norm": 0.3632746934890747, "learning_rate": 1.9999786179336454e-05, "loss": 1.5333, "step": 14000 }, { "epoch": 0.010485922091829905, "grad_norm": 0.43190881609916687, "learning_rate": 1.9999783112758305e-05, "loss": 1.5359, "step": 14100 }, { "epoch": 0.010560290333615933, "grad_norm": 0.6655808687210083, "learning_rate": 1.9999780024346525e-05, "loss": 1.6012, "step": 14200 }, { "epoch": 0.01063465857540196, "grad_norm": 0.7489643692970276, "learning_rate": 1.999977691410112e-05, "loss": 1.5545, "step": 14300 }, { "epoch": 0.010709026817187987, "grad_norm": 0.4741237759590149, "learning_rate": 1.9999773782022095e-05, "loss": 1.5303, "step": 14400 }, { "epoch": 0.010783395058974016, "grad_norm": 0.7895578145980835, "learning_rate": 1.9999770628109458e-05, "loss": 1.5997, "step": 14500 }, { "epoch": 0.010857763300760043, "grad_norm": 0.6510291695594788, "learning_rate": 1.9999767452363215e-05, "loss": 1.483, "step": 14600 }, { "epoch": 0.010932131542546072, "grad_norm": 0.5989207029342651, "learning_rate": 1.9999764254783376e-05, "loss": 1.5073, "step": 14700 }, { "epoch": 0.011006499784332099, "grad_norm": 0.5995681881904602, "learning_rate": 1.9999761035369946e-05, "loss": 1.5439, "step": 14800 }, { "epoch": 0.011080868026118126, "grad_norm": 0.6359573602676392, "learning_rate": 1.9999757794122933e-05, "loss": 1.5821, "step": 14900 }, { "epoch": 0.011155236267904154, "grad_norm": 0.404085636138916, "learning_rate": 1.9999754531042338e-05, "loss": 1.556, "step": 15000 }, { "epoch": 0.011229604509690181, "grad_norm": 0.660020112991333, "learning_rate": 1.9999751246128175e-05, "loss": 1.5713, "step": 15100 }, { "epoch": 0.01130397275147621, "grad_norm": 0.7031283378601074, "learning_rate": 1.9999747939380453e-05, "loss": 1.4647, "step": 15200 }, { "epoch": 0.011378340993262237, "grad_norm": 0.5159358978271484, "learning_rate": 1.9999744610799173e-05, "loss": 1.5298, "step": 15300 }, { "epoch": 0.011452709235048266, "grad_norm": 0.5451757907867432, "learning_rate": 1.9999741260384345e-05, "loss": 1.6068, "step": 15400 }, { "epoch": 0.011527077476834293, "grad_norm": 0.9550883769989014, "learning_rate": 1.9999737888135975e-05, "loss": 1.5665, "step": 15500 }, { "epoch": 0.01160144571862032, "grad_norm": 0.3711983859539032, "learning_rate": 1.999973449405407e-05, "loss": 1.5189, "step": 15600 }, { "epoch": 0.011675813960406348, "grad_norm": 1.052902340888977, "learning_rate": 1.9999731078138643e-05, "loss": 1.6834, "step": 15700 }, { "epoch": 0.011750182202192375, "grad_norm": 0.6009785532951355, "learning_rate": 1.9999727640389697e-05, "loss": 1.5497, "step": 15800 }, { "epoch": 0.011824550443978404, "grad_norm": 0.5357051491737366, "learning_rate": 1.999972418080724e-05, "loss": 1.5163, "step": 15900 }, { "epoch": 0.011898918685764431, "grad_norm": 0.5712498426437378, "learning_rate": 1.9999720699391275e-05, "loss": 1.5611, "step": 16000 }, { "epoch": 0.01197328692755046, "grad_norm": 0.5744183659553528, "learning_rate": 1.999971719614182e-05, "loss": 1.5405, "step": 16100 }, { "epoch": 0.012047655169336487, "grad_norm": 0.42877888679504395, "learning_rate": 1.9999713671058874e-05, "loss": 1.5595, "step": 16200 }, { "epoch": 0.012122023411122514, "grad_norm": 0.7209616303443909, "learning_rate": 1.9999710124142445e-05, "loss": 1.5457, "step": 16300 }, { "epoch": 0.012196391652908542, "grad_norm": 0.7052120566368103, "learning_rate": 1.999970655539255e-05, "loss": 1.5724, "step": 16400 }, { "epoch": 0.01227075989469457, "grad_norm": 0.45960021018981934, "learning_rate": 1.9999702964809182e-05, "loss": 1.5479, "step": 16500 }, { "epoch": 0.012345128136480598, "grad_norm": 0.4394296407699585, "learning_rate": 1.9999699352392362e-05, "loss": 1.55, "step": 16600 }, { "epoch": 0.012419496378266625, "grad_norm": 1.227424144744873, "learning_rate": 1.999969571814209e-05, "loss": 1.5187, "step": 16700 }, { "epoch": 0.012493864620052652, "grad_norm": 0.8249584436416626, "learning_rate": 1.9999692062058376e-05, "loss": 1.5677, "step": 16800 }, { "epoch": 0.01256823286183868, "grad_norm": 0.8973199725151062, "learning_rate": 1.999968838414123e-05, "loss": 1.5501, "step": 16900 }, { "epoch": 0.012642601103624708, "grad_norm": 0.716529905796051, "learning_rate": 1.999968468439066e-05, "loss": 1.6245, "step": 17000 }, { "epoch": 0.012716969345410736, "grad_norm": 0.5941506624221802, "learning_rate": 1.999968096280667e-05, "loss": 1.4951, "step": 17100 }, { "epoch": 0.012791337587196763, "grad_norm": 1.8864718675613403, "learning_rate": 1.999967721938927e-05, "loss": 1.5397, "step": 17200 }, { "epoch": 0.012865705828982792, "grad_norm": 0.6418184638023376, "learning_rate": 1.999967345413847e-05, "loss": 1.4693, "step": 17300 }, { "epoch": 0.012940074070768819, "grad_norm": 0.6764699220657349, "learning_rate": 1.999966966705428e-05, "loss": 1.4635, "step": 17400 }, { "epoch": 0.013014442312554846, "grad_norm": 0.7185351848602295, "learning_rate": 1.9999665858136704e-05, "loss": 1.5252, "step": 17500 }, { "epoch": 0.013088810554340875, "grad_norm": 0.42110446095466614, "learning_rate": 1.9999662027385748e-05, "loss": 1.5908, "step": 17600 }, { "epoch": 0.013163178796126902, "grad_norm": 0.6807708144187927, "learning_rate": 1.999965817480143e-05, "loss": 1.6466, "step": 17700 }, { "epoch": 0.01323754703791293, "grad_norm": 0.5771286487579346, "learning_rate": 1.999965430038375e-05, "loss": 1.5361, "step": 17800 }, { "epoch": 0.013311915279698957, "grad_norm": 0.5322648882865906, "learning_rate": 1.9999650404132715e-05, "loss": 1.5638, "step": 17900 }, { "epoch": 0.013386283521484984, "grad_norm": 0.865608274936676, "learning_rate": 1.9999646486048342e-05, "loss": 1.4568, "step": 18000 }, { "epoch": 0.013460651763271013, "grad_norm": 0.7592107057571411, "learning_rate": 1.9999642546130634e-05, "loss": 1.5669, "step": 18100 }, { "epoch": 0.01353502000505704, "grad_norm": 0.673466145992279, "learning_rate": 1.9999638584379602e-05, "loss": 1.5141, "step": 18200 }, { "epoch": 0.013609388246843069, "grad_norm": 0.574698269367218, "learning_rate": 1.9999634600795252e-05, "loss": 1.5703, "step": 18300 }, { "epoch": 0.013683756488629096, "grad_norm": 0.6722753643989563, "learning_rate": 1.9999630595377595e-05, "loss": 1.5843, "step": 18400 }, { "epoch": 0.013758124730415124, "grad_norm": 0.9738336801528931, "learning_rate": 1.9999626568126636e-05, "loss": 1.4878, "step": 18500 }, { "epoch": 0.013832492972201151, "grad_norm": 0.5274741649627686, "learning_rate": 1.999962251904239e-05, "loss": 1.5254, "step": 18600 }, { "epoch": 0.013906861213987178, "grad_norm": 1.725870966911316, "learning_rate": 1.999961844812486e-05, "loss": 1.4785, "step": 18700 }, { "epoch": 0.013981229455773207, "grad_norm": 0.6889399886131287, "learning_rate": 1.9999614355374058e-05, "loss": 1.5437, "step": 18800 }, { "epoch": 0.014055597697559234, "grad_norm": 0.576836884021759, "learning_rate": 1.9999610240789994e-05, "loss": 1.4949, "step": 18900 }, { "epoch": 0.014129965939345263, "grad_norm": 0.3870568871498108, "learning_rate": 1.9999606104372674e-05, "loss": 1.5376, "step": 19000 }, { "epoch": 0.01420433418113129, "grad_norm": 1.1045247316360474, "learning_rate": 1.9999601946122107e-05, "loss": 1.6825, "step": 19100 }, { "epoch": 0.014278702422917318, "grad_norm": 0.49821707606315613, "learning_rate": 1.9999597766038304e-05, "loss": 1.4909, "step": 19200 }, { "epoch": 0.014353070664703345, "grad_norm": 0.4011678695678711, "learning_rate": 1.9999593564121275e-05, "loss": 1.4673, "step": 19300 }, { "epoch": 0.014427438906489372, "grad_norm": 0.46667736768722534, "learning_rate": 1.9999589340371026e-05, "loss": 1.5537, "step": 19400 }, { "epoch": 0.014501807148275401, "grad_norm": 0.4063940942287445, "learning_rate": 1.9999585094787567e-05, "loss": 1.4736, "step": 19500 }, { "epoch": 0.014576175390061428, "grad_norm": 0.5824026465415955, "learning_rate": 1.9999580827370906e-05, "loss": 1.6191, "step": 19600 }, { "epoch": 0.014650543631847457, "grad_norm": 0.5595284104347229, "learning_rate": 1.999957653812106e-05, "loss": 1.5294, "step": 19700 }, { "epoch": 0.014724911873633484, "grad_norm": 0.6950704455375671, "learning_rate": 1.9999572227038028e-05, "loss": 1.5015, "step": 19800 }, { "epoch": 0.01479928011541951, "grad_norm": 0.4345974028110504, "learning_rate": 1.999956789412183e-05, "loss": 1.4955, "step": 19900 }, { "epoch": 0.01487364835720554, "grad_norm": 0.475046306848526, "learning_rate": 1.9999563539372464e-05, "loss": 1.5663, "step": 20000 }, { "epoch": 0.014948016598991566, "grad_norm": 0.3211815357208252, "learning_rate": 1.9999559162789946e-05, "loss": 1.5379, "step": 20100 }, { "epoch": 0.015022384840777595, "grad_norm": 0.7868314981460571, "learning_rate": 1.9999554764374287e-05, "loss": 1.542, "step": 20200 }, { "epoch": 0.015096753082563622, "grad_norm": 0.3961299955844879, "learning_rate": 1.9999550344125492e-05, "loss": 1.4359, "step": 20300 }, { "epoch": 0.01517112132434965, "grad_norm": 0.7971549034118652, "learning_rate": 1.9999545902043577e-05, "loss": 1.5363, "step": 20400 }, { "epoch": 0.015245489566135677, "grad_norm": 1.1090092658996582, "learning_rate": 1.9999541438128543e-05, "loss": 1.5565, "step": 20500 }, { "epoch": 0.015319857807921704, "grad_norm": 1.0558898448944092, "learning_rate": 1.9999536952380406e-05, "loss": 1.5504, "step": 20600 }, { "epoch": 0.015394226049707733, "grad_norm": 0.5869760513305664, "learning_rate": 1.9999532444799174e-05, "loss": 1.5023, "step": 20700 }, { "epoch": 0.01546859429149376, "grad_norm": 0.5132299065589905, "learning_rate": 1.9999527915384858e-05, "loss": 1.472, "step": 20800 }, { "epoch": 0.015542962533279789, "grad_norm": 0.8405370116233826, "learning_rate": 1.999952336413747e-05, "loss": 1.479, "step": 20900 }, { "epoch": 0.015617330775065816, "grad_norm": 1.0692424774169922, "learning_rate": 1.9999518791057012e-05, "loss": 1.5734, "step": 21000 }, { "epoch": 0.015691699016851843, "grad_norm": 0.39929547905921936, "learning_rate": 1.99995141961435e-05, "loss": 1.5499, "step": 21100 }, { "epoch": 0.01576606725863787, "grad_norm": 0.5001465082168579, "learning_rate": 1.999950957939694e-05, "loss": 1.5728, "step": 21200 }, { "epoch": 0.0158404355004239, "grad_norm": 0.4564245045185089, "learning_rate": 1.999950494081735e-05, "loss": 1.4685, "step": 21300 }, { "epoch": 0.015914803742209927, "grad_norm": 0.945813775062561, "learning_rate": 1.999950028040473e-05, "loss": 1.5309, "step": 21400 }, { "epoch": 0.015989171983995954, "grad_norm": 0.5529621839523315, "learning_rate": 1.9999495598159102e-05, "loss": 1.5244, "step": 21500 }, { "epoch": 0.01606354022578198, "grad_norm": 0.7338210940361023, "learning_rate": 1.9999490894080467e-05, "loss": 1.6339, "step": 21600 }, { "epoch": 0.016137908467568008, "grad_norm": 1.0055419206619263, "learning_rate": 1.999948616816884e-05, "loss": 1.613, "step": 21700 }, { "epoch": 0.01621227670935404, "grad_norm": 0.5460941195487976, "learning_rate": 1.9999481420424223e-05, "loss": 1.5819, "step": 21800 }, { "epoch": 0.016286644951140065, "grad_norm": 1.005537509918213, "learning_rate": 1.9999476650846637e-05, "loss": 1.5636, "step": 21900 }, { "epoch": 0.016361013192926092, "grad_norm": 0.8599165678024292, "learning_rate": 1.9999471859436082e-05, "loss": 1.4977, "step": 22000 }, { "epoch": 0.01643538143471212, "grad_norm": 0.41388291120529175, "learning_rate": 1.9999467046192583e-05, "loss": 1.4243, "step": 22100 }, { "epoch": 0.01650974967649815, "grad_norm": 0.4443175494670868, "learning_rate": 1.9999462211116135e-05, "loss": 1.5419, "step": 22200 }, { "epoch": 0.016584117918284177, "grad_norm": 0.9959002733230591, "learning_rate": 1.999945735420676e-05, "loss": 1.588, "step": 22300 }, { "epoch": 0.016658486160070204, "grad_norm": 0.7721849679946899, "learning_rate": 1.999945247546446e-05, "loss": 1.4806, "step": 22400 }, { "epoch": 0.01673285440185623, "grad_norm": 0.5781850814819336, "learning_rate": 1.9999447574889253e-05, "loss": 1.5864, "step": 22500 }, { "epoch": 0.016807222643642258, "grad_norm": 0.6155378222465515, "learning_rate": 1.9999442652481143e-05, "loss": 1.6002, "step": 22600 }, { "epoch": 0.016881590885428288, "grad_norm": 0.8101166486740112, "learning_rate": 1.9999437708240146e-05, "loss": 1.5385, "step": 22700 }, { "epoch": 0.016955959127214315, "grad_norm": 0.8044368624687195, "learning_rate": 1.999943274216627e-05, "loss": 1.4563, "step": 22800 }, { "epoch": 0.017030327369000342, "grad_norm": 0.3784123361110687, "learning_rate": 1.9999427754259527e-05, "loss": 1.5844, "step": 22900 }, { "epoch": 0.01710469561078637, "grad_norm": 0.8152732253074646, "learning_rate": 1.9999422744519928e-05, "loss": 1.53, "step": 23000 }, { "epoch": 0.017179063852572396, "grad_norm": 0.8851474523544312, "learning_rate": 1.9999417712947486e-05, "loss": 1.5828, "step": 23100 }, { "epoch": 0.017253432094358426, "grad_norm": 0.8275689482688904, "learning_rate": 1.9999412659542208e-05, "loss": 1.5057, "step": 23200 }, { "epoch": 0.017327800336144453, "grad_norm": 0.5356424450874329, "learning_rate": 1.9999407584304106e-05, "loss": 1.5621, "step": 23300 }, { "epoch": 0.01740216857793048, "grad_norm": 0.35889101028442383, "learning_rate": 1.999940248723319e-05, "loss": 1.5884, "step": 23400 }, { "epoch": 0.017476536819716507, "grad_norm": 0.5190862417221069, "learning_rate": 1.9999397368329477e-05, "loss": 1.6021, "step": 23500 }, { "epoch": 0.017550905061502534, "grad_norm": 0.5140055418014526, "learning_rate": 1.9999392227592967e-05, "loss": 1.5474, "step": 23600 }, { "epoch": 0.017625273303288565, "grad_norm": 0.607276201248169, "learning_rate": 1.9999387065023685e-05, "loss": 1.5002, "step": 23700 }, { "epoch": 0.01769964154507459, "grad_norm": 0.7513449192047119, "learning_rate": 1.9999381880621634e-05, "loss": 1.5098, "step": 23800 }, { "epoch": 0.01777400978686062, "grad_norm": 0.7328070402145386, "learning_rate": 1.9999376674386824e-05, "loss": 1.5768, "step": 23900 }, { "epoch": 0.017848378028646646, "grad_norm": 0.817368745803833, "learning_rate": 1.9999371446319272e-05, "loss": 1.5178, "step": 24000 }, { "epoch": 0.017922746270432676, "grad_norm": 0.844530463218689, "learning_rate": 1.999936619641899e-05, "loss": 1.5331, "step": 24100 }, { "epoch": 0.017997114512218703, "grad_norm": 0.8772881627082825, "learning_rate": 1.9999360924685978e-05, "loss": 1.5564, "step": 24200 }, { "epoch": 0.01807148275400473, "grad_norm": 0.37944692373275757, "learning_rate": 1.999935563112026e-05, "loss": 1.4823, "step": 24300 }, { "epoch": 0.018145850995790757, "grad_norm": 0.34085753560066223, "learning_rate": 1.999935031572184e-05, "loss": 1.4741, "step": 24400 }, { "epoch": 0.018220219237576784, "grad_norm": 0.8616833686828613, "learning_rate": 1.9999344978490737e-05, "loss": 1.4642, "step": 24500 }, { "epoch": 0.018294587479362814, "grad_norm": 0.9431029558181763, "learning_rate": 1.9999339619426958e-05, "loss": 1.5507, "step": 24600 }, { "epoch": 0.01836895572114884, "grad_norm": 0.5803475975990295, "learning_rate": 1.9999334238530512e-05, "loss": 1.5617, "step": 24700 }, { "epoch": 0.01844332396293487, "grad_norm": 0.7339209318161011, "learning_rate": 1.9999328835801416e-05, "loss": 1.4881, "step": 24800 }, { "epoch": 0.018517692204720895, "grad_norm": 0.7969409823417664, "learning_rate": 1.9999323411239676e-05, "loss": 1.5438, "step": 24900 }, { "epoch": 0.018592060446506922, "grad_norm": 0.6049161553382874, "learning_rate": 1.9999317964845313e-05, "loss": 1.5352, "step": 25000 }, { "epoch": 0.018666428688292953, "grad_norm": 0.625723659992218, "learning_rate": 1.999931249661833e-05, "loss": 1.4817, "step": 25100 }, { "epoch": 0.01874079693007898, "grad_norm": 0.8167730569839478, "learning_rate": 1.9999307006558745e-05, "loss": 1.4863, "step": 25200 }, { "epoch": 0.018815165171865007, "grad_norm": 0.41490304470062256, "learning_rate": 1.9999301494666566e-05, "loss": 1.4653, "step": 25300 }, { "epoch": 0.018889533413651034, "grad_norm": 0.7005138397216797, "learning_rate": 1.9999295960941802e-05, "loss": 1.5417, "step": 25400 }, { "epoch": 0.01896390165543706, "grad_norm": 0.4145418405532837, "learning_rate": 1.9999290405384476e-05, "loss": 1.5818, "step": 25500 }, { "epoch": 0.01903826989722309, "grad_norm": 0.9620917439460754, "learning_rate": 1.999928482799459e-05, "loss": 1.5717, "step": 25600 }, { "epoch": 0.019112638139009118, "grad_norm": 0.518038272857666, "learning_rate": 1.999927922877216e-05, "loss": 1.4603, "step": 25700 }, { "epoch": 0.019187006380795145, "grad_norm": 1.0701864957809448, "learning_rate": 1.9999273607717198e-05, "loss": 1.5095, "step": 25800 }, { "epoch": 0.019261374622581172, "grad_norm": 1.2206807136535645, "learning_rate": 1.9999267964829717e-05, "loss": 1.5099, "step": 25900 }, { "epoch": 0.019335742864367202, "grad_norm": 0.4838850796222687, "learning_rate": 1.999926230010973e-05, "loss": 1.5856, "step": 26000 }, { "epoch": 0.01941011110615323, "grad_norm": 0.3916015625, "learning_rate": 1.9999256613557243e-05, "loss": 1.5198, "step": 26100 }, { "epoch": 0.019484479347939256, "grad_norm": 0.4921341836452484, "learning_rate": 1.9999250905172276e-05, "loss": 1.5517, "step": 26200 }, { "epoch": 0.019558847589725283, "grad_norm": 0.4124142527580261, "learning_rate": 1.999924517495484e-05, "loss": 1.6267, "step": 26300 }, { "epoch": 0.01963321583151131, "grad_norm": 0.6755162477493286, "learning_rate": 1.9999239422904946e-05, "loss": 1.5408, "step": 26400 }, { "epoch": 0.01970758407329734, "grad_norm": 0.8833709359169006, "learning_rate": 1.9999233649022604e-05, "loss": 1.5334, "step": 26500 }, { "epoch": 0.019781952315083368, "grad_norm": 0.5344982147216797, "learning_rate": 1.9999227853307832e-05, "loss": 1.5532, "step": 26600 }, { "epoch": 0.019856320556869395, "grad_norm": 0.5524909496307373, "learning_rate": 1.999922203576064e-05, "loss": 1.5672, "step": 26700 }, { "epoch": 0.01993068879865542, "grad_norm": 0.6802098751068115, "learning_rate": 1.999921619638104e-05, "loss": 1.57, "step": 26800 }, { "epoch": 0.02000505704044145, "grad_norm": 0.6773833632469177, "learning_rate": 1.9999210335169047e-05, "loss": 1.562, "step": 26900 }, { "epoch": 0.02007942528222748, "grad_norm": 0.5428286194801331, "learning_rate": 1.999920445212467e-05, "loss": 1.5683, "step": 27000 }, { "epoch": 0.020153793524013506, "grad_norm": 0.5180791020393372, "learning_rate": 1.9999198547247927e-05, "loss": 1.6216, "step": 27100 }, { "epoch": 0.020228161765799533, "grad_norm": 0.6695342659950256, "learning_rate": 1.9999192620538825e-05, "loss": 1.4601, "step": 27200 }, { "epoch": 0.02030253000758556, "grad_norm": 1.2745898962020874, "learning_rate": 1.999918667199738e-05, "loss": 1.5002, "step": 27300 }, { "epoch": 0.020376898249371587, "grad_norm": 0.5011482834815979, "learning_rate": 1.999918070162361e-05, "loss": 1.5048, "step": 27400 }, { "epoch": 0.020451266491157617, "grad_norm": 0.4430767297744751, "learning_rate": 1.999917470941752e-05, "loss": 1.5609, "step": 27500 }, { "epoch": 0.020525634732943644, "grad_norm": 0.540259838104248, "learning_rate": 1.9999168695379124e-05, "loss": 1.5115, "step": 27600 }, { "epoch": 0.02060000297472967, "grad_norm": 0.4208228886127472, "learning_rate": 1.999916265950844e-05, "loss": 1.5318, "step": 27700 }, { "epoch": 0.020674371216515698, "grad_norm": 0.5492777824401855, "learning_rate": 1.9999156601805477e-05, "loss": 1.5001, "step": 27800 }, { "epoch": 0.020748739458301725, "grad_norm": 0.8193747997283936, "learning_rate": 1.999915052227025e-05, "loss": 1.5795, "step": 27900 }, { "epoch": 0.020823107700087756, "grad_norm": 0.6221509575843811, "learning_rate": 1.999914442090277e-05, "loss": 1.5414, "step": 28000 }, { "epoch": 0.020897475941873783, "grad_norm": 0.8481204509735107, "learning_rate": 1.9999138297703055e-05, "loss": 1.5481, "step": 28100 }, { "epoch": 0.02097184418365981, "grad_norm": 0.8506454229354858, "learning_rate": 1.9999132152671116e-05, "loss": 1.536, "step": 28200 }, { "epoch": 0.021046212425445836, "grad_norm": 0.6849836111068726, "learning_rate": 1.9999125985806964e-05, "loss": 1.5236, "step": 28300 }, { "epoch": 0.021120580667231867, "grad_norm": 0.6328344345092773, "learning_rate": 1.999911979711062e-05, "loss": 1.4921, "step": 28400 }, { "epoch": 0.021194948909017894, "grad_norm": 0.44376102089881897, "learning_rate": 1.9999113586582085e-05, "loss": 1.5039, "step": 28500 }, { "epoch": 0.02126931715080392, "grad_norm": 0.6041997075080872, "learning_rate": 1.9999107354221385e-05, "loss": 1.5522, "step": 28600 }, { "epoch": 0.021343685392589948, "grad_norm": 0.5901020169258118, "learning_rate": 1.9999101100028522e-05, "loss": 1.5321, "step": 28700 }, { "epoch": 0.021418053634375975, "grad_norm": 1.058334231376648, "learning_rate": 1.999909482400352e-05, "loss": 1.5725, "step": 28800 }, { "epoch": 0.021492421876162005, "grad_norm": 0.9694022536277771, "learning_rate": 1.9999088526146387e-05, "loss": 1.545, "step": 28900 }, { "epoch": 0.021566790117948032, "grad_norm": 0.6166462898254395, "learning_rate": 1.999908220645714e-05, "loss": 1.481, "step": 29000 }, { "epoch": 0.02164115835973406, "grad_norm": 0.4764333963394165, "learning_rate": 1.999907586493579e-05, "loss": 1.5559, "step": 29100 }, { "epoch": 0.021715526601520086, "grad_norm": 0.5028481483459473, "learning_rate": 1.9999069501582352e-05, "loss": 1.5451, "step": 29200 }, { "epoch": 0.021789894843306113, "grad_norm": 0.7064079642295837, "learning_rate": 1.9999063116396844e-05, "loss": 1.5065, "step": 29300 }, { "epoch": 0.021864263085092144, "grad_norm": 0.8854705691337585, "learning_rate": 1.9999056709379268e-05, "loss": 1.5331, "step": 29400 }, { "epoch": 0.02193863132687817, "grad_norm": 1.1931555271148682, "learning_rate": 1.999905028052965e-05, "loss": 1.5629, "step": 29500 }, { "epoch": 0.022012999568664197, "grad_norm": 0.4196559190750122, "learning_rate": 1.9999043829848e-05, "loss": 1.5969, "step": 29600 }, { "epoch": 0.022087367810450224, "grad_norm": 0.661222517490387, "learning_rate": 1.999903735733433e-05, "loss": 1.4522, "step": 29700 }, { "epoch": 0.02216173605223625, "grad_norm": 1.0771206617355347, "learning_rate": 1.9999030862988658e-05, "loss": 1.6346, "step": 29800 }, { "epoch": 0.022236104294022282, "grad_norm": 0.4439813196659088, "learning_rate": 1.9999024346810995e-05, "loss": 1.4533, "step": 29900 }, { "epoch": 0.02231047253580831, "grad_norm": 0.3492225706577301, "learning_rate": 1.999901780880136e-05, "loss": 1.4831, "step": 30000 }, { "epoch": 0.022384840777594336, "grad_norm": 0.6220123171806335, "learning_rate": 1.9999011248959757e-05, "loss": 1.5165, "step": 30100 }, { "epoch": 0.022459209019380363, "grad_norm": 0.467629998922348, "learning_rate": 1.9999004667286214e-05, "loss": 1.5315, "step": 30200 }, { "epoch": 0.022533577261166393, "grad_norm": 0.6271420121192932, "learning_rate": 1.9998998063780735e-05, "loss": 1.5861, "step": 30300 }, { "epoch": 0.02260794550295242, "grad_norm": 0.404694139957428, "learning_rate": 1.9998991438443337e-05, "loss": 1.455, "step": 30400 }, { "epoch": 0.022682313744738447, "grad_norm": 0.6882662177085876, "learning_rate": 1.9998984791274038e-05, "loss": 1.5077, "step": 30500 }, { "epoch": 0.022756681986524474, "grad_norm": 0.4796554744243622, "learning_rate": 1.9998978122272844e-05, "loss": 1.4934, "step": 30600 }, { "epoch": 0.0228310502283105, "grad_norm": 0.7510641813278198, "learning_rate": 1.9998971431439783e-05, "loss": 1.5009, "step": 30700 }, { "epoch": 0.02290541847009653, "grad_norm": 0.5859106779098511, "learning_rate": 1.9998964718774857e-05, "loss": 1.529, "step": 30800 }, { "epoch": 0.02297978671188256, "grad_norm": 0.5376309156417847, "learning_rate": 1.999895798427809e-05, "loss": 1.5485, "step": 30900 }, { "epoch": 0.023054154953668585, "grad_norm": 0.18692457675933838, "learning_rate": 1.9998951227949487e-05, "loss": 1.4897, "step": 31000 }, { "epoch": 0.023128523195454612, "grad_norm": 0.4838975667953491, "learning_rate": 1.999894444978907e-05, "loss": 1.5244, "step": 31100 }, { "epoch": 0.02320289143724064, "grad_norm": 0.8973916172981262, "learning_rate": 1.9998937649796854e-05, "loss": 1.554, "step": 31200 }, { "epoch": 0.02327725967902667, "grad_norm": 0.5681875944137573, "learning_rate": 1.999893082797285e-05, "loss": 1.5503, "step": 31300 }, { "epoch": 0.023351627920812697, "grad_norm": 0.469427227973938, "learning_rate": 1.9998923984317075e-05, "loss": 1.4381, "step": 31400 }, { "epoch": 0.023425996162598724, "grad_norm": 0.43099313974380493, "learning_rate": 1.9998917118829543e-05, "loss": 1.5112, "step": 31500 }, { "epoch": 0.02350036440438475, "grad_norm": 0.8290247917175293, "learning_rate": 1.999891023151027e-05, "loss": 1.467, "step": 31600 }, { "epoch": 0.023574732646170778, "grad_norm": 0.7134198546409607, "learning_rate": 1.999890332235927e-05, "loss": 1.5312, "step": 31700 }, { "epoch": 0.023649100887956808, "grad_norm": 0.4312078356742859, "learning_rate": 1.999889639137656e-05, "loss": 1.5417, "step": 31800 }, { "epoch": 0.023723469129742835, "grad_norm": 0.5288392305374146, "learning_rate": 1.9998889438562153e-05, "loss": 1.5432, "step": 31900 }, { "epoch": 0.023797837371528862, "grad_norm": 0.5819665789604187, "learning_rate": 1.9998882463916062e-05, "loss": 1.5703, "step": 32000 }, { "epoch": 0.02387220561331489, "grad_norm": 0.6748378276824951, "learning_rate": 1.999887546743831e-05, "loss": 1.5674, "step": 32100 }, { "epoch": 0.02394657385510092, "grad_norm": 0.5860730409622192, "learning_rate": 1.9998868449128905e-05, "loss": 1.5775, "step": 32200 }, { "epoch": 0.024020942096886946, "grad_norm": 1.1641826629638672, "learning_rate": 1.9998861408987866e-05, "loss": 1.6354, "step": 32300 }, { "epoch": 0.024095310338672973, "grad_norm": 0.6446713209152222, "learning_rate": 1.9998854347015206e-05, "loss": 1.5508, "step": 32400 }, { "epoch": 0.024169678580459, "grad_norm": 0.8211930990219116, "learning_rate": 1.9998847263210942e-05, "loss": 1.4797, "step": 32500 }, { "epoch": 0.024244046822245027, "grad_norm": 0.9733643531799316, "learning_rate": 1.9998840157575093e-05, "loss": 1.5375, "step": 32600 }, { "epoch": 0.024318415064031058, "grad_norm": 0.7882494330406189, "learning_rate": 1.9998833030107663e-05, "loss": 1.5167, "step": 32700 }, { "epoch": 0.024392783305817085, "grad_norm": 0.8701611757278442, "learning_rate": 1.999882588080868e-05, "loss": 1.578, "step": 32800 }, { "epoch": 0.02446715154760311, "grad_norm": 0.5390304923057556, "learning_rate": 1.9998818709678157e-05, "loss": 1.4868, "step": 32900 }, { "epoch": 0.02454151978938914, "grad_norm": 0.8778117299079895, "learning_rate": 1.9998811516716104e-05, "loss": 1.4611, "step": 33000 }, { "epoch": 0.024615888031175166, "grad_norm": 1.267151951789856, "learning_rate": 1.999880430192254e-05, "loss": 1.4868, "step": 33100 }, { "epoch": 0.024690256272961196, "grad_norm": 0.6846994161605835, "learning_rate": 1.9998797065297483e-05, "loss": 1.5047, "step": 33200 }, { "epoch": 0.024764624514747223, "grad_norm": 0.6609792113304138, "learning_rate": 1.9998789806840945e-05, "loss": 1.5189, "step": 33300 }, { "epoch": 0.02483899275653325, "grad_norm": 0.5603302717208862, "learning_rate": 1.9998782526552946e-05, "loss": 1.5095, "step": 33400 }, { "epoch": 0.024913360998319277, "grad_norm": 0.7241900563240051, "learning_rate": 1.9998775224433493e-05, "loss": 1.5106, "step": 33500 }, { "epoch": 0.024987729240105304, "grad_norm": 1.149263620376587, "learning_rate": 1.9998767900482616e-05, "loss": 1.5778, "step": 33600 }, { "epoch": 0.025062097481891334, "grad_norm": 0.6764651536941528, "learning_rate": 1.9998760554700318e-05, "loss": 1.4944, "step": 33700 }, { "epoch": 0.02513646572367736, "grad_norm": 0.6464880704879761, "learning_rate": 1.999875318708662e-05, "loss": 1.5719, "step": 33800 }, { "epoch": 0.025210833965463388, "grad_norm": 0.6596807241439819, "learning_rate": 1.9998745797641543e-05, "loss": 1.6179, "step": 33900 }, { "epoch": 0.025285202207249415, "grad_norm": 0.8761606812477112, "learning_rate": 1.9998738386365096e-05, "loss": 1.5256, "step": 34000 }, { "epoch": 0.025359570449035442, "grad_norm": 0.43756160140037537, "learning_rate": 1.9998730953257297e-05, "loss": 1.5477, "step": 34100 }, { "epoch": 0.025433938690821473, "grad_norm": 0.4515778720378876, "learning_rate": 1.9998723498318165e-05, "loss": 1.5666, "step": 34200 }, { "epoch": 0.0255083069326075, "grad_norm": 0.5726724863052368, "learning_rate": 1.9998716021547714e-05, "loss": 1.4878, "step": 34300 }, { "epoch": 0.025582675174393527, "grad_norm": 0.5104209184646606, "learning_rate": 1.999870852294596e-05, "loss": 1.5678, "step": 34400 }, { "epoch": 0.025657043416179554, "grad_norm": 0.7009900808334351, "learning_rate": 1.999870100251292e-05, "loss": 1.4896, "step": 34500 }, { "epoch": 0.025731411657965584, "grad_norm": 0.46048620343208313, "learning_rate": 1.9998693460248613e-05, "loss": 1.5144, "step": 34600 }, { "epoch": 0.02580577989975161, "grad_norm": 0.6157929301261902, "learning_rate": 1.999868589615305e-05, "loss": 1.5178, "step": 34700 }, { "epoch": 0.025880148141537638, "grad_norm": 0.5260864496231079, "learning_rate": 1.9998678310226253e-05, "loss": 1.5046, "step": 34800 }, { "epoch": 0.025954516383323665, "grad_norm": 0.5624649524688721, "learning_rate": 1.999867070246823e-05, "loss": 1.5418, "step": 34900 }, { "epoch": 0.026028884625109692, "grad_norm": 0.5242325663566589, "learning_rate": 1.999866307287901e-05, "loss": 1.4936, "step": 35000 }, { "epoch": 0.026103252866895722, "grad_norm": 0.42132341861724854, "learning_rate": 1.9998655421458603e-05, "loss": 1.5528, "step": 35100 }, { "epoch": 0.02617762110868175, "grad_norm": 1.2333385944366455, "learning_rate": 1.9998647748207022e-05, "loss": 1.5343, "step": 35200 }, { "epoch": 0.026251989350467776, "grad_norm": 0.4847305417060852, "learning_rate": 1.9998640053124288e-05, "loss": 1.5256, "step": 35300 }, { "epoch": 0.026326357592253803, "grad_norm": 0.4797114133834839, "learning_rate": 1.999863233621042e-05, "loss": 1.5394, "step": 35400 }, { "epoch": 0.02640072583403983, "grad_norm": 0.8396820425987244, "learning_rate": 1.999862459746543e-05, "loss": 1.5643, "step": 35500 }, { "epoch": 0.02647509407582586, "grad_norm": 0.4638078808784485, "learning_rate": 1.999861683688934e-05, "loss": 1.5372, "step": 35600 }, { "epoch": 0.026549462317611888, "grad_norm": 0.44567036628723145, "learning_rate": 1.9998609054482162e-05, "loss": 1.5471, "step": 35700 }, { "epoch": 0.026623830559397915, "grad_norm": 0.7429941892623901, "learning_rate": 1.9998601250243915e-05, "loss": 1.5551, "step": 35800 }, { "epoch": 0.02669819880118394, "grad_norm": 0.4555191695690155, "learning_rate": 1.9998593424174618e-05, "loss": 1.6057, "step": 35900 }, { "epoch": 0.02677256704296997, "grad_norm": 1.1227898597717285, "learning_rate": 1.9998585576274286e-05, "loss": 1.5223, "step": 36000 }, { "epoch": 0.026846935284756, "grad_norm": 0.5287070870399475, "learning_rate": 1.9998577706542937e-05, "loss": 1.4566, "step": 36100 }, { "epoch": 0.026921303526542026, "grad_norm": 0.43527650833129883, "learning_rate": 1.9998569814980587e-05, "loss": 1.5472, "step": 36200 }, { "epoch": 0.026995671768328053, "grad_norm": 0.8627545237541199, "learning_rate": 1.999856190158725e-05, "loss": 1.5259, "step": 36300 }, { "epoch": 0.02707004001011408, "grad_norm": 0.5693374276161194, "learning_rate": 1.9998553966362952e-05, "loss": 1.5403, "step": 36400 }, { "epoch": 0.02714440825190011, "grad_norm": 0.43485864996910095, "learning_rate": 1.9998546009307707e-05, "loss": 1.55, "step": 36500 }, { "epoch": 0.027218776493686137, "grad_norm": 0.7903422713279724, "learning_rate": 1.9998538030421526e-05, "loss": 1.5793, "step": 36600 }, { "epoch": 0.027293144735472164, "grad_norm": 0.5814279317855835, "learning_rate": 1.9998530029704436e-05, "loss": 1.5068, "step": 36700 }, { "epoch": 0.02736751297725819, "grad_norm": 0.5183308124542236, "learning_rate": 1.9998522007156444e-05, "loss": 1.4984, "step": 36800 }, { "epoch": 0.027441881219044218, "grad_norm": 0.5316556692123413, "learning_rate": 1.9998513962777578e-05, "loss": 1.5973, "step": 36900 }, { "epoch": 0.02751624946083025, "grad_norm": 0.6409894824028015, "learning_rate": 1.999850589656785e-05, "loss": 1.5491, "step": 37000 }, { "epoch": 0.027590617702616275, "grad_norm": 0.7894346117973328, "learning_rate": 1.9998497808527273e-05, "loss": 1.5117, "step": 37100 }, { "epoch": 0.027664985944402302, "grad_norm": 0.6969322562217712, "learning_rate": 1.9998489698655877e-05, "loss": 1.5079, "step": 37200 }, { "epoch": 0.02773935418618833, "grad_norm": 1.1727479696273804, "learning_rate": 1.9998481566953673e-05, "loss": 1.4889, "step": 37300 }, { "epoch": 0.027813722427974356, "grad_norm": 0.7132461071014404, "learning_rate": 1.9998473413420672e-05, "loss": 1.5284, "step": 37400 }, { "epoch": 0.027888090669760387, "grad_norm": 0.3298719525337219, "learning_rate": 1.9998465238056905e-05, "loss": 1.5616, "step": 37500 }, { "epoch": 0.027962458911546414, "grad_norm": 0.6609339714050293, "learning_rate": 1.999845704086238e-05, "loss": 1.6174, "step": 37600 }, { "epoch": 0.02803682715333244, "grad_norm": 0.5007957220077515, "learning_rate": 1.9998448821837118e-05, "loss": 1.5016, "step": 37700 }, { "epoch": 0.028111195395118468, "grad_norm": 0.9311910271644592, "learning_rate": 1.9998440580981136e-05, "loss": 1.5351, "step": 37800 }, { "epoch": 0.028185563636904495, "grad_norm": 0.7796390056610107, "learning_rate": 1.9998432318294455e-05, "loss": 1.5461, "step": 37900 }, { "epoch": 0.028259931878690525, "grad_norm": 0.8507175445556641, "learning_rate": 1.9998424033777093e-05, "loss": 1.5247, "step": 38000 }, { "epoch": 0.028334300120476552, "grad_norm": 0.3990893065929413, "learning_rate": 1.9998415727429065e-05, "loss": 1.5629, "step": 38100 }, { "epoch": 0.02840866836226258, "grad_norm": 0.852613091468811, "learning_rate": 1.9998407399250386e-05, "loss": 1.5216, "step": 38200 }, { "epoch": 0.028483036604048606, "grad_norm": 0.4536173343658447, "learning_rate": 1.9998399049241083e-05, "loss": 1.5953, "step": 38300 }, { "epoch": 0.028557404845834636, "grad_norm": 0.5260401964187622, "learning_rate": 1.999839067740117e-05, "loss": 1.616, "step": 38400 }, { "epoch": 0.028631773087620663, "grad_norm": 0.6179829835891724, "learning_rate": 1.9998382283730663e-05, "loss": 1.5295, "step": 38500 }, { "epoch": 0.02870614132940669, "grad_norm": 0.5114478468894958, "learning_rate": 1.9998373868229582e-05, "loss": 1.5425, "step": 38600 }, { "epoch": 0.028780509571192717, "grad_norm": 0.593675971031189, "learning_rate": 1.9998365430897948e-05, "loss": 1.5474, "step": 38700 }, { "epoch": 0.028854877812978744, "grad_norm": 0.4959476888179779, "learning_rate": 1.999835697173577e-05, "loss": 1.5775, "step": 38800 }, { "epoch": 0.028929246054764775, "grad_norm": 0.6730287671089172, "learning_rate": 1.9998348490743082e-05, "loss": 1.6572, "step": 38900 }, { "epoch": 0.029003614296550802, "grad_norm": 0.9137376546859741, "learning_rate": 1.999833998791989e-05, "loss": 1.5007, "step": 39000 }, { "epoch": 0.02907798253833683, "grad_norm": 1.2021700143814087, "learning_rate": 1.999833146326622e-05, "loss": 1.5095, "step": 39100 }, { "epoch": 0.029152350780122856, "grad_norm": 0.5708747506141663, "learning_rate": 1.9998322916782083e-05, "loss": 1.5644, "step": 39200 }, { "epoch": 0.029226719021908883, "grad_norm": 0.6767252087593079, "learning_rate": 1.9998314348467508e-05, "loss": 1.5248, "step": 39300 }, { "epoch": 0.029301087263694913, "grad_norm": 0.4881773889064789, "learning_rate": 1.9998305758322504e-05, "loss": 1.4889, "step": 39400 }, { "epoch": 0.02937545550548094, "grad_norm": 0.4517097771167755, "learning_rate": 1.9998297146347093e-05, "loss": 1.5388, "step": 39500 }, { "epoch": 0.029449823747266967, "grad_norm": 0.6027237176895142, "learning_rate": 1.9998288512541295e-05, "loss": 1.5702, "step": 39600 }, { "epoch": 0.029524191989052994, "grad_norm": 0.4435807764530182, "learning_rate": 1.9998279856905127e-05, "loss": 1.5708, "step": 39700 }, { "epoch": 0.02959856023083902, "grad_norm": 0.5487297773361206, "learning_rate": 1.999827117943861e-05, "loss": 1.5184, "step": 39800 }, { "epoch": 0.02967292847262505, "grad_norm": 0.8344607949256897, "learning_rate": 1.9998262480141762e-05, "loss": 1.5454, "step": 39900 }, { "epoch": 0.02974729671441108, "grad_norm": 0.8898949027061462, "learning_rate": 1.9998253759014602e-05, "loss": 1.5409, "step": 40000 }, { "epoch": 0.029821664956197105, "grad_norm": 0.897030770778656, "learning_rate": 1.9998245016057147e-05, "loss": 1.5316, "step": 40100 }, { "epoch": 0.029896033197983132, "grad_norm": 0.6615723371505737, "learning_rate": 1.999823625126942e-05, "loss": 1.6079, "step": 40200 }, { "epoch": 0.02997040143976916, "grad_norm": 0.41309353709220886, "learning_rate": 1.9998227464651438e-05, "loss": 1.5077, "step": 40300 }, { "epoch": 0.03004476968155519, "grad_norm": 0.7121081352233887, "learning_rate": 1.9998218656203218e-05, "loss": 1.5346, "step": 40400 }, { "epoch": 0.030119137923341217, "grad_norm": 0.7162127494812012, "learning_rate": 1.9998209825924784e-05, "loss": 1.5369, "step": 40500 }, { "epoch": 0.030193506165127244, "grad_norm": 0.5943055748939514, "learning_rate": 1.9998200973816152e-05, "loss": 1.5852, "step": 40600 }, { "epoch": 0.03026787440691327, "grad_norm": 0.6746940612792969, "learning_rate": 1.9998192099877344e-05, "loss": 1.5407, "step": 40700 }, { "epoch": 0.0303422426486993, "grad_norm": 0.9628979563713074, "learning_rate": 1.9998183204108375e-05, "loss": 1.4937, "step": 40800 }, { "epoch": 0.030416610890485328, "grad_norm": 0.3971594274044037, "learning_rate": 1.9998174286509267e-05, "loss": 1.5628, "step": 40900 }, { "epoch": 0.030490979132271355, "grad_norm": 0.553767204284668, "learning_rate": 1.9998165347080043e-05, "loss": 1.5182, "step": 41000 }, { "epoch": 0.030565347374057382, "grad_norm": 0.4197104573249817, "learning_rate": 1.9998156385820716e-05, "loss": 1.4853, "step": 41100 }, { "epoch": 0.03063971561584341, "grad_norm": 0.7118240594863892, "learning_rate": 1.9998147402731308e-05, "loss": 1.4737, "step": 41200 }, { "epoch": 0.03071408385762944, "grad_norm": 0.7333774566650391, "learning_rate": 1.999813839781184e-05, "loss": 1.5183, "step": 41300 }, { "epoch": 0.030788452099415466, "grad_norm": 0.509201169013977, "learning_rate": 1.9998129371062332e-05, "loss": 1.4873, "step": 41400 }, { "epoch": 0.030862820341201493, "grad_norm": 0.3249990940093994, "learning_rate": 1.9998120322482803e-05, "loss": 1.4316, "step": 41500 }, { "epoch": 0.03093718858298752, "grad_norm": 0.5361568331718445, "learning_rate": 1.9998111252073272e-05, "loss": 1.5113, "step": 41600 }, { "epoch": 0.031011556824773547, "grad_norm": 1.3092052936553955, "learning_rate": 1.9998102159833758e-05, "loss": 1.5448, "step": 41700 }, { "epoch": 0.031085925066559578, "grad_norm": 0.6276385188102722, "learning_rate": 1.999809304576428e-05, "loss": 1.6223, "step": 41800 }, { "epoch": 0.031160293308345605, "grad_norm": 0.7364848256111145, "learning_rate": 1.9998083909864863e-05, "loss": 1.4543, "step": 41900 }, { "epoch": 0.03123466155013163, "grad_norm": 0.3654361367225647, "learning_rate": 1.9998074752135523e-05, "loss": 1.6071, "step": 42000 }, { "epoch": 0.03130902979191766, "grad_norm": 1.1972397565841675, "learning_rate": 1.999806557257628e-05, "loss": 1.4909, "step": 42100 }, { "epoch": 0.031383398033703686, "grad_norm": 0.5845790505409241, "learning_rate": 1.9998056371187155e-05, "loss": 1.5687, "step": 42200 }, { "epoch": 0.03145776627548971, "grad_norm": 0.7037214636802673, "learning_rate": 1.9998047147968168e-05, "loss": 1.5561, "step": 42300 }, { "epoch": 0.03153213451727574, "grad_norm": 0.5212551951408386, "learning_rate": 1.999803790291934e-05, "loss": 1.5063, "step": 42400 }, { "epoch": 0.03160650275906177, "grad_norm": 0.6110777854919434, "learning_rate": 1.999802863604069e-05, "loss": 1.4896, "step": 42500 }, { "epoch": 0.0316808710008478, "grad_norm": 0.6877493858337402, "learning_rate": 1.999801934733224e-05, "loss": 1.5779, "step": 42600 }, { "epoch": 0.03175523924263383, "grad_norm": 0.4461131989955902, "learning_rate": 1.9998010036794005e-05, "loss": 1.5973, "step": 42700 }, { "epoch": 0.031829607484419854, "grad_norm": 1.0050228834152222, "learning_rate": 1.999800070442601e-05, "loss": 1.5149, "step": 42800 }, { "epoch": 0.03190397572620588, "grad_norm": 0.46876657009124756, "learning_rate": 1.9997991350228275e-05, "loss": 1.6122, "step": 42900 }, { "epoch": 0.03197834396799191, "grad_norm": 0.4919954240322113, "learning_rate": 1.999798197420082e-05, "loss": 1.5411, "step": 43000 }, { "epoch": 0.032052712209777935, "grad_norm": 0.9554354548454285, "learning_rate": 1.9997972576343668e-05, "loss": 1.579, "step": 43100 }, { "epoch": 0.03212708045156396, "grad_norm": 1.0671650171279907, "learning_rate": 1.9997963156656835e-05, "loss": 1.5999, "step": 43200 }, { "epoch": 0.03220144869334999, "grad_norm": 0.8465139269828796, "learning_rate": 1.999795371514034e-05, "loss": 1.5703, "step": 43300 }, { "epoch": 0.032275816935136016, "grad_norm": 0.7047709822654724, "learning_rate": 1.9997944251794212e-05, "loss": 1.5814, "step": 43400 }, { "epoch": 0.03235018517692205, "grad_norm": 0.7836155891418457, "learning_rate": 1.9997934766618465e-05, "loss": 1.5464, "step": 43500 }, { "epoch": 0.03242455341870808, "grad_norm": 0.7335034012794495, "learning_rate": 1.9997925259613124e-05, "loss": 1.5278, "step": 43600 }, { "epoch": 0.032498921660494104, "grad_norm": 0.834950864315033, "learning_rate": 1.9997915730778202e-05, "loss": 1.559, "step": 43700 }, { "epoch": 0.03257328990228013, "grad_norm": 0.7446547150611877, "learning_rate": 1.9997906180113726e-05, "loss": 1.6256, "step": 43800 }, { "epoch": 0.03264765814406616, "grad_norm": 0.5306852459907532, "learning_rate": 1.9997896607619718e-05, "loss": 1.44, "step": 43900 }, { "epoch": 0.032722026385852185, "grad_norm": 0.500023365020752, "learning_rate": 1.9997887013296196e-05, "loss": 1.5355, "step": 44000 }, { "epoch": 0.03279639462763821, "grad_norm": 0.6218491196632385, "learning_rate": 1.9997877397143182e-05, "loss": 1.5741, "step": 44100 }, { "epoch": 0.03287076286942424, "grad_norm": 0.3754362463951111, "learning_rate": 1.9997867759160696e-05, "loss": 1.6125, "step": 44200 }, { "epoch": 0.032945131111210266, "grad_norm": 0.9419918656349182, "learning_rate": 1.999785809934876e-05, "loss": 1.4781, "step": 44300 }, { "epoch": 0.0330194993529963, "grad_norm": 0.503409743309021, "learning_rate": 1.9997848417707394e-05, "loss": 1.532, "step": 44400 }, { "epoch": 0.03309386759478233, "grad_norm": 0.6554058194160461, "learning_rate": 1.999783871423662e-05, "loss": 1.5704, "step": 44500 }, { "epoch": 0.033168235836568354, "grad_norm": 0.9691445231437683, "learning_rate": 1.9997828988936462e-05, "loss": 1.5278, "step": 44600 }, { "epoch": 0.03324260407835438, "grad_norm": 0.43620389699935913, "learning_rate": 1.999781924180694e-05, "loss": 1.5436, "step": 44700 }, { "epoch": 0.03331697232014041, "grad_norm": 0.6035354137420654, "learning_rate": 1.999780947284807e-05, "loss": 1.5899, "step": 44800 }, { "epoch": 0.033391340561926434, "grad_norm": 0.3441450595855713, "learning_rate": 1.9997799682059875e-05, "loss": 1.5443, "step": 44900 }, { "epoch": 0.03346570880371246, "grad_norm": 0.5419406294822693, "learning_rate": 1.999778986944238e-05, "loss": 1.4789, "step": 45000 }, { "epoch": 0.03354007704549849, "grad_norm": 0.7912573218345642, "learning_rate": 1.9997780034995605e-05, "loss": 1.4816, "step": 45100 }, { "epoch": 0.033614445287284515, "grad_norm": 0.8978769779205322, "learning_rate": 1.9997770178719573e-05, "loss": 1.5124, "step": 45200 }, { "epoch": 0.03368881352907054, "grad_norm": 0.6722145080566406, "learning_rate": 1.99977603006143e-05, "loss": 1.5229, "step": 45300 }, { "epoch": 0.033763181770856576, "grad_norm": 0.4918314218521118, "learning_rate": 1.9997750400679815e-05, "loss": 1.562, "step": 45400 }, { "epoch": 0.0338375500126426, "grad_norm": 0.9343436360359192, "learning_rate": 1.9997740478916138e-05, "loss": 1.6147, "step": 45500 }, { "epoch": 0.03391191825442863, "grad_norm": 1.0771671533584595, "learning_rate": 1.9997730535323287e-05, "loss": 1.4441, "step": 45600 }, { "epoch": 0.03398628649621466, "grad_norm": 0.666222095489502, "learning_rate": 1.999772056990128e-05, "loss": 1.536, "step": 45700 }, { "epoch": 0.034060654738000684, "grad_norm": 0.5621564388275146, "learning_rate": 1.9997710582650153e-05, "loss": 1.5652, "step": 45800 }, { "epoch": 0.03413502297978671, "grad_norm": 0.6601430177688599, "learning_rate": 1.9997700573569912e-05, "loss": 1.5309, "step": 45900 }, { "epoch": 0.03420939122157274, "grad_norm": 0.7411925792694092, "learning_rate": 1.9997690542660585e-05, "loss": 1.4918, "step": 46000 }, { "epoch": 0.034283759463358765, "grad_norm": 0.5674101710319519, "learning_rate": 1.99976804899222e-05, "loss": 1.5383, "step": 46100 }, { "epoch": 0.03435812770514479, "grad_norm": 0.8503201007843018, "learning_rate": 1.999767041535477e-05, "loss": 1.5298, "step": 46200 }, { "epoch": 0.034432495946930826, "grad_norm": 0.8432891368865967, "learning_rate": 1.999766031895832e-05, "loss": 1.5419, "step": 46300 }, { "epoch": 0.03450686418871685, "grad_norm": 0.41764137148857117, "learning_rate": 1.9997650200732876e-05, "loss": 1.5313, "step": 46400 }, { "epoch": 0.03458123243050288, "grad_norm": 1.0963222980499268, "learning_rate": 1.9997640060678455e-05, "loss": 1.5593, "step": 46500 }, { "epoch": 0.03465560067228891, "grad_norm": 0.5194404721260071, "learning_rate": 1.9997629898795082e-05, "loss": 1.6352, "step": 46600 }, { "epoch": 0.034729968914074934, "grad_norm": 0.6641316413879395, "learning_rate": 1.9997619715082777e-05, "loss": 1.552, "step": 46700 }, { "epoch": 0.03480433715586096, "grad_norm": 1.1054824590682983, "learning_rate": 1.999760950954156e-05, "loss": 1.5176, "step": 46800 }, { "epoch": 0.03487870539764699, "grad_norm": 0.44163691997528076, "learning_rate": 1.9997599282171466e-05, "loss": 1.5985, "step": 46900 }, { "epoch": 0.034953073639433015, "grad_norm": 0.8304015398025513, "learning_rate": 1.99975890329725e-05, "loss": 1.554, "step": 47000 }, { "epoch": 0.03502744188121904, "grad_norm": 0.8395280838012695, "learning_rate": 1.9997578761944693e-05, "loss": 1.6185, "step": 47100 }, { "epoch": 0.03510181012300507, "grad_norm": 0.8857927322387695, "learning_rate": 1.9997568469088068e-05, "loss": 1.5108, "step": 47200 }, { "epoch": 0.0351761783647911, "grad_norm": 0.6471524834632874, "learning_rate": 1.999755815440265e-05, "loss": 1.4693, "step": 47300 }, { "epoch": 0.03525054660657713, "grad_norm": 0.6664785146713257, "learning_rate": 1.9997547817888453e-05, "loss": 1.5391, "step": 47400 }, { "epoch": 0.035324914848363156, "grad_norm": 0.4979814887046814, "learning_rate": 1.9997537459545505e-05, "loss": 1.5367, "step": 47500 }, { "epoch": 0.03539928309014918, "grad_norm": 0.5753507614135742, "learning_rate": 1.9997527079373828e-05, "loss": 1.547, "step": 47600 }, { "epoch": 0.03547365133193521, "grad_norm": 0.5349861979484558, "learning_rate": 1.9997516677373444e-05, "loss": 1.5317, "step": 47700 }, { "epoch": 0.03554801957372124, "grad_norm": 0.49024057388305664, "learning_rate": 1.9997506253544377e-05, "loss": 1.4412, "step": 47800 }, { "epoch": 0.035622387815507264, "grad_norm": 0.4172305166721344, "learning_rate": 1.9997495807886648e-05, "loss": 1.4767, "step": 47900 }, { "epoch": 0.03569675605729329, "grad_norm": 0.8609969615936279, "learning_rate": 1.9997485340400283e-05, "loss": 1.5191, "step": 48000 }, { "epoch": 0.03577112429907932, "grad_norm": 0.46808651089668274, "learning_rate": 1.9997474851085304e-05, "loss": 1.5835, "step": 48100 }, { "epoch": 0.03584549254086535, "grad_norm": 1.0232137441635132, "learning_rate": 1.999746433994173e-05, "loss": 1.5322, "step": 48200 }, { "epoch": 0.03591986078265138, "grad_norm": 0.6745514273643494, "learning_rate": 1.9997453806969588e-05, "loss": 1.5706, "step": 48300 }, { "epoch": 0.035994229024437406, "grad_norm": 1.114139437675476, "learning_rate": 1.99974432521689e-05, "loss": 1.5895, "step": 48400 }, { "epoch": 0.03606859726622343, "grad_norm": 0.7316710948944092, "learning_rate": 1.9997432675539686e-05, "loss": 1.4594, "step": 48500 }, { "epoch": 0.03614296550800946, "grad_norm": 0.4518311023712158, "learning_rate": 1.9997422077081973e-05, "loss": 1.5374, "step": 48600 }, { "epoch": 0.03621733374979549, "grad_norm": 0.6804157495498657, "learning_rate": 1.999741145679578e-05, "loss": 1.5915, "step": 48700 }, { "epoch": 0.036291701991581514, "grad_norm": 0.4367609918117523, "learning_rate": 1.999740081468114e-05, "loss": 1.5092, "step": 48800 }, { "epoch": 0.03636607023336754, "grad_norm": 0.7415800094604492, "learning_rate": 1.9997390150738063e-05, "loss": 1.4548, "step": 48900 }, { "epoch": 0.03644043847515357, "grad_norm": 0.4981917440891266, "learning_rate": 1.999737946496658e-05, "loss": 1.53, "step": 49000 }, { "epoch": 0.036514806716939595, "grad_norm": 0.42718860507011414, "learning_rate": 1.9997368757366712e-05, "loss": 1.4911, "step": 49100 }, { "epoch": 0.03658917495872563, "grad_norm": 0.5294268131256104, "learning_rate": 1.999735802793849e-05, "loss": 1.5203, "step": 49200 }, { "epoch": 0.036663543200511656, "grad_norm": 0.5844396948814392, "learning_rate": 1.999734727668192e-05, "loss": 1.5442, "step": 49300 }, { "epoch": 0.03673791144229768, "grad_norm": 0.7835099697113037, "learning_rate": 1.9997336503597043e-05, "loss": 1.4786, "step": 49400 }, { "epoch": 0.03681227968408371, "grad_norm": 0.6045675873756409, "learning_rate": 1.9997325708683875e-05, "loss": 1.5386, "step": 49500 }, { "epoch": 0.03688664792586974, "grad_norm": 0.651782751083374, "learning_rate": 1.9997314891942442e-05, "loss": 1.4999, "step": 49600 }, { "epoch": 0.036961016167655764, "grad_norm": 0.45741456747055054, "learning_rate": 1.9997304053372762e-05, "loss": 1.4946, "step": 49700 }, { "epoch": 0.03703538440944179, "grad_norm": 0.5363433957099915, "learning_rate": 1.999729319297486e-05, "loss": 1.5421, "step": 49800 }, { "epoch": 0.03710975265122782, "grad_norm": 0.45601820945739746, "learning_rate": 1.9997282310748768e-05, "loss": 1.5149, "step": 49900 }, { "epoch": 0.037184120893013844, "grad_norm": 0.6308805346488953, "learning_rate": 1.9997271406694504e-05, "loss": 1.5464, "step": 50000 }, { "epoch": 0.03725848913479988, "grad_norm": 0.6946158409118652, "learning_rate": 1.999726048081209e-05, "loss": 1.5865, "step": 50100 }, { "epoch": 0.037332857376585905, "grad_norm": 0.4899694323539734, "learning_rate": 1.9997249533101554e-05, "loss": 1.5839, "step": 50200 }, { "epoch": 0.03740722561837193, "grad_norm": 0.7726057767868042, "learning_rate": 1.9997238563562912e-05, "loss": 1.4571, "step": 50300 }, { "epoch": 0.03748159386015796, "grad_norm": 0.9151437878608704, "learning_rate": 1.9997227572196197e-05, "loss": 1.4459, "step": 50400 }, { "epoch": 0.037555962101943986, "grad_norm": 0.6182152032852173, "learning_rate": 1.9997216559001433e-05, "loss": 1.4586, "step": 50500 }, { "epoch": 0.03763033034373001, "grad_norm": 0.7595764398574829, "learning_rate": 1.9997205523978636e-05, "loss": 1.5382, "step": 50600 }, { "epoch": 0.03770469858551604, "grad_norm": 0.8368933796882629, "learning_rate": 1.9997194467127838e-05, "loss": 1.636, "step": 50700 }, { "epoch": 0.03777906682730207, "grad_norm": 1.3371498584747314, "learning_rate": 1.9997183388449055e-05, "loss": 1.5225, "step": 50800 }, { "epoch": 0.037853435069088094, "grad_norm": 0.6440578103065491, "learning_rate": 1.999717228794232e-05, "loss": 1.5165, "step": 50900 }, { "epoch": 0.03792780331087412, "grad_norm": 0.5950276255607605, "learning_rate": 1.999716116560765e-05, "loss": 1.552, "step": 51000 }, { "epoch": 0.038002171552660155, "grad_norm": 0.7176305651664734, "learning_rate": 1.9997150021445074e-05, "loss": 1.5019, "step": 51100 }, { "epoch": 0.03807653979444618, "grad_norm": 0.7437789440155029, "learning_rate": 1.999713885545462e-05, "loss": 1.5632, "step": 51200 }, { "epoch": 0.03815090803623221, "grad_norm": 0.48256799578666687, "learning_rate": 1.9997127667636298e-05, "loss": 1.4943, "step": 51300 }, { "epoch": 0.038225276278018236, "grad_norm": 0.8726604580879211, "learning_rate": 1.9997116457990148e-05, "loss": 1.6079, "step": 51400 }, { "epoch": 0.03829964451980426, "grad_norm": 0.6188727617263794, "learning_rate": 1.9997105226516186e-05, "loss": 1.5845, "step": 51500 }, { "epoch": 0.03837401276159029, "grad_norm": 0.5416398048400879, "learning_rate": 1.9997093973214442e-05, "loss": 1.5297, "step": 51600 }, { "epoch": 0.03844838100337632, "grad_norm": 0.8704063296318054, "learning_rate": 1.999708269808493e-05, "loss": 1.5324, "step": 51700 }, { "epoch": 0.038522749245162344, "grad_norm": 0.9990126490592957, "learning_rate": 1.9997071401127688e-05, "loss": 1.5435, "step": 51800 }, { "epoch": 0.03859711748694837, "grad_norm": 0.402971476316452, "learning_rate": 1.9997060082342732e-05, "loss": 1.5755, "step": 51900 }, { "epoch": 0.038671485728734405, "grad_norm": 0.7084416747093201, "learning_rate": 1.9997048741730092e-05, "loss": 1.5153, "step": 52000 }, { "epoch": 0.03874585397052043, "grad_norm": 0.5382058620452881, "learning_rate": 1.9997037379289786e-05, "loss": 1.5378, "step": 52100 }, { "epoch": 0.03882022221230646, "grad_norm": 0.5846664905548096, "learning_rate": 1.9997025995021845e-05, "loss": 1.5118, "step": 52200 }, { "epoch": 0.038894590454092486, "grad_norm": 0.6125427484512329, "learning_rate": 1.999701458892629e-05, "loss": 1.5567, "step": 52300 }, { "epoch": 0.03896895869587851, "grad_norm": 0.4352121949195862, "learning_rate": 1.999700316100315e-05, "loss": 1.5665, "step": 52400 }, { "epoch": 0.03904332693766454, "grad_norm": 0.4972393810749054, "learning_rate": 1.9996991711252448e-05, "loss": 1.5728, "step": 52500 }, { "epoch": 0.039117695179450566, "grad_norm": 7.361449241638184, "learning_rate": 1.9996980239674207e-05, "loss": 1.4814, "step": 52600 }, { "epoch": 0.03919206342123659, "grad_norm": 0.4652218818664551, "learning_rate": 1.9996968746268452e-05, "loss": 1.5553, "step": 52700 }, { "epoch": 0.03926643166302262, "grad_norm": 0.7898038029670715, "learning_rate": 1.9996957231035213e-05, "loss": 1.5251, "step": 52800 }, { "epoch": 0.03934079990480865, "grad_norm": 0.5735042095184326, "learning_rate": 1.999694569397451e-05, "loss": 1.474, "step": 52900 }, { "epoch": 0.03941516814659468, "grad_norm": 0.6072685122489929, "learning_rate": 1.9996934135086367e-05, "loss": 1.6186, "step": 53000 }, { "epoch": 0.03948953638838071, "grad_norm": 0.5961938500404358, "learning_rate": 1.9996922554370818e-05, "loss": 1.5622, "step": 53100 }, { "epoch": 0.039563904630166735, "grad_norm": 0.6281226277351379, "learning_rate": 1.999691095182788e-05, "loss": 1.5508, "step": 53200 }, { "epoch": 0.03963827287195276, "grad_norm": 0.9202520847320557, "learning_rate": 1.9996899327457576e-05, "loss": 1.5623, "step": 53300 }, { "epoch": 0.03971264111373879, "grad_norm": 0.4792959988117218, "learning_rate": 1.9996887681259946e-05, "loss": 1.5363, "step": 53400 }, { "epoch": 0.039787009355524816, "grad_norm": 0.4955751299858093, "learning_rate": 1.9996876013234997e-05, "loss": 1.5026, "step": 53500 }, { "epoch": 0.03986137759731084, "grad_norm": 1.114710807800293, "learning_rate": 1.9996864323382766e-05, "loss": 1.5397, "step": 53600 }, { "epoch": 0.03993574583909687, "grad_norm": 0.5231966972351074, "learning_rate": 1.9996852611703278e-05, "loss": 1.4971, "step": 53700 }, { "epoch": 0.0400101140808829, "grad_norm": 0.8232663869857788, "learning_rate": 1.9996840878196554e-05, "loss": 1.5054, "step": 53800 }, { "epoch": 0.04008448232266893, "grad_norm": 0.5382178425788879, "learning_rate": 1.999682912286262e-05, "loss": 1.47, "step": 53900 }, { "epoch": 0.04015885056445496, "grad_norm": 0.8355742692947388, "learning_rate": 1.999681734570151e-05, "loss": 1.3635, "step": 54000 }, { "epoch": 0.040233218806240985, "grad_norm": 0.715268611907959, "learning_rate": 1.9996805546713237e-05, "loss": 1.5606, "step": 54100 }, { "epoch": 0.04030758704802701, "grad_norm": 0.9210833311080933, "learning_rate": 1.9996793725897836e-05, "loss": 1.5702, "step": 54200 }, { "epoch": 0.04038195528981304, "grad_norm": 0.5435687899589539, "learning_rate": 1.9996781883255328e-05, "loss": 1.5468, "step": 54300 }, { "epoch": 0.040456323531599066, "grad_norm": 0.517410159111023, "learning_rate": 1.9996770018785743e-05, "loss": 1.597, "step": 54400 }, { "epoch": 0.04053069177338509, "grad_norm": 0.5302937030792236, "learning_rate": 1.9996758132489102e-05, "loss": 1.4796, "step": 54500 }, { "epoch": 0.04060506001517112, "grad_norm": 0.8928558230400085, "learning_rate": 1.9996746224365435e-05, "loss": 1.5311, "step": 54600 }, { "epoch": 0.04067942825695715, "grad_norm": 0.9816573262214661, "learning_rate": 1.9996734294414765e-05, "loss": 1.5409, "step": 54700 }, { "epoch": 0.040753796498743174, "grad_norm": 0.5399038791656494, "learning_rate": 1.999672234263712e-05, "loss": 1.4705, "step": 54800 }, { "epoch": 0.04082816474052921, "grad_norm": 0.815388560295105, "learning_rate": 1.9996710369032528e-05, "loss": 1.5708, "step": 54900 }, { "epoch": 0.040902532982315234, "grad_norm": 0.765061616897583, "learning_rate": 1.999669837360101e-05, "loss": 1.4828, "step": 55000 }, { "epoch": 0.04097690122410126, "grad_norm": 0.8168923258781433, "learning_rate": 1.99966863563426e-05, "loss": 1.5167, "step": 55100 }, { "epoch": 0.04105126946588729, "grad_norm": 0.6376170516014099, "learning_rate": 1.9996674317257315e-05, "loss": 1.5286, "step": 55200 }, { "epoch": 0.041125637707673315, "grad_norm": 0.7510162591934204, "learning_rate": 1.9996662256345184e-05, "loss": 1.5737, "step": 55300 }, { "epoch": 0.04120000594945934, "grad_norm": 0.5505034327507019, "learning_rate": 1.9996650173606234e-05, "loss": 1.5388, "step": 55400 }, { "epoch": 0.04127437419124537, "grad_norm": 0.49698886275291443, "learning_rate": 1.99966380690405e-05, "loss": 1.5832, "step": 55500 }, { "epoch": 0.041348742433031396, "grad_norm": 0.5520877242088318, "learning_rate": 1.9996625942647994e-05, "loss": 1.5643, "step": 55600 }, { "epoch": 0.04142311067481742, "grad_norm": 0.6185612082481384, "learning_rate": 1.999661379442875e-05, "loss": 1.5087, "step": 55700 }, { "epoch": 0.04149747891660345, "grad_norm": 0.8302799463272095, "learning_rate": 1.9996601624382795e-05, "loss": 1.6283, "step": 55800 }, { "epoch": 0.041571847158389484, "grad_norm": 0.9720719456672668, "learning_rate": 1.9996589432510155e-05, "loss": 1.5064, "step": 55900 }, { "epoch": 0.04164621540017551, "grad_norm": 0.40555909276008606, "learning_rate": 1.9996577218810855e-05, "loss": 1.5138, "step": 56000 }, { "epoch": 0.04172058364196154, "grad_norm": 0.9815244674682617, "learning_rate": 1.9996564983284918e-05, "loss": 1.4913, "step": 56100 }, { "epoch": 0.041794951883747565, "grad_norm": 1.1608703136444092, "learning_rate": 1.9996552725932382e-05, "loss": 1.4939, "step": 56200 }, { "epoch": 0.04186932012553359, "grad_norm": 0.38927561044692993, "learning_rate": 1.9996540446753264e-05, "loss": 1.5115, "step": 56300 }, { "epoch": 0.04194368836731962, "grad_norm": 0.7470927834510803, "learning_rate": 1.9996528145747594e-05, "loss": 1.5539, "step": 56400 }, { "epoch": 0.042018056609105646, "grad_norm": 0.46110135316848755, "learning_rate": 1.99965158229154e-05, "loss": 1.4602, "step": 56500 }, { "epoch": 0.04209242485089167, "grad_norm": 0.5916282534599304, "learning_rate": 1.9996503478256705e-05, "loss": 1.5721, "step": 56600 }, { "epoch": 0.0421667930926777, "grad_norm": 0.5567501187324524, "learning_rate": 1.999649111177154e-05, "loss": 1.482, "step": 56700 }, { "epoch": 0.042241161334463734, "grad_norm": 0.8147956728935242, "learning_rate": 1.9996478723459928e-05, "loss": 1.5518, "step": 56800 }, { "epoch": 0.04231552957624976, "grad_norm": 1.0146034955978394, "learning_rate": 1.9996466313321906e-05, "loss": 1.52, "step": 56900 }, { "epoch": 0.04238989781803579, "grad_norm": 0.267634779214859, "learning_rate": 1.9996453881357486e-05, "loss": 1.598, "step": 57000 }, { "epoch": 0.042464266059821815, "grad_norm": 0.7316186428070068, "learning_rate": 1.9996441427566707e-05, "loss": 1.5306, "step": 57100 }, { "epoch": 0.04253863430160784, "grad_norm": 1.078633189201355, "learning_rate": 1.999642895194959e-05, "loss": 1.4847, "step": 57200 }, { "epoch": 0.04261300254339387, "grad_norm": 0.5316028594970703, "learning_rate": 1.9996416454506164e-05, "loss": 1.5928, "step": 57300 }, { "epoch": 0.042687370785179896, "grad_norm": 0.573113739490509, "learning_rate": 1.999640393523646e-05, "loss": 1.5609, "step": 57400 }, { "epoch": 0.04276173902696592, "grad_norm": 0.45106610655784607, "learning_rate": 1.9996391394140496e-05, "loss": 1.5693, "step": 57500 }, { "epoch": 0.04283610726875195, "grad_norm": 0.6011554002761841, "learning_rate": 1.9996378831218307e-05, "loss": 1.5968, "step": 57600 }, { "epoch": 0.042910475510537976, "grad_norm": 0.7017727494239807, "learning_rate": 1.9996366246469922e-05, "loss": 1.4973, "step": 57700 }, { "epoch": 0.04298484375232401, "grad_norm": 0.8994572758674622, "learning_rate": 1.9996353639895365e-05, "loss": 1.4624, "step": 57800 }, { "epoch": 0.04305921199411004, "grad_norm": 0.8928848505020142, "learning_rate": 1.9996341011494663e-05, "loss": 1.5755, "step": 57900 }, { "epoch": 0.043133580235896064, "grad_norm": 0.7762150168418884, "learning_rate": 1.999632836126784e-05, "loss": 1.4074, "step": 58000 }, { "epoch": 0.04320794847768209, "grad_norm": 0.5097442865371704, "learning_rate": 1.9996315689214932e-05, "loss": 1.5281, "step": 58100 }, { "epoch": 0.04328231671946812, "grad_norm": 0.803718626499176, "learning_rate": 1.999630299533596e-05, "loss": 1.499, "step": 58200 }, { "epoch": 0.043356684961254145, "grad_norm": 0.5989664196968079, "learning_rate": 1.9996290279630956e-05, "loss": 1.5286, "step": 58300 }, { "epoch": 0.04343105320304017, "grad_norm": 0.45334750413894653, "learning_rate": 1.999627754209995e-05, "loss": 1.5598, "step": 58400 }, { "epoch": 0.0435054214448262, "grad_norm": 0.9461644887924194, "learning_rate": 1.999626478274296e-05, "loss": 1.4569, "step": 58500 }, { "epoch": 0.043579789686612226, "grad_norm": 0.5558738112449646, "learning_rate": 1.999625200156002e-05, "loss": 1.5329, "step": 58600 }, { "epoch": 0.04365415792839826, "grad_norm": 0.49125516414642334, "learning_rate": 1.999623919855116e-05, "loss": 1.4805, "step": 58700 }, { "epoch": 0.04372852617018429, "grad_norm": 0.6038479208946228, "learning_rate": 1.9996226373716406e-05, "loss": 1.5589, "step": 58800 }, { "epoch": 0.043802894411970314, "grad_norm": 0.4560091197490692, "learning_rate": 1.9996213527055784e-05, "loss": 1.4538, "step": 58900 }, { "epoch": 0.04387726265375634, "grad_norm": 0.6255136728286743, "learning_rate": 1.9996200658569323e-05, "loss": 1.5959, "step": 59000 }, { "epoch": 0.04395163089554237, "grad_norm": 0.8603237867355347, "learning_rate": 1.999618776825705e-05, "loss": 1.4769, "step": 59100 }, { "epoch": 0.044025999137328395, "grad_norm": 1.027685523033142, "learning_rate": 1.9996174856119e-05, "loss": 1.485, "step": 59200 }, { "epoch": 0.04410036737911442, "grad_norm": 0.6371426582336426, "learning_rate": 1.999616192215519e-05, "loss": 1.5713, "step": 59300 }, { "epoch": 0.04417473562090045, "grad_norm": 0.8155677318572998, "learning_rate": 1.9996148966365664e-05, "loss": 1.5755, "step": 59400 }, { "epoch": 0.044249103862686476, "grad_norm": 0.9418515563011169, "learning_rate": 1.9996135988750432e-05, "loss": 1.51, "step": 59500 }, { "epoch": 0.0443234721044725, "grad_norm": 0.529082179069519, "learning_rate": 1.9996122989309536e-05, "loss": 1.5254, "step": 59600 }, { "epoch": 0.04439784034625854, "grad_norm": 0.5595930218696594, "learning_rate": 1.9996109968042992e-05, "loss": 1.5515, "step": 59700 }, { "epoch": 0.044472208588044564, "grad_norm": 0.8503856062889099, "learning_rate": 1.9996096924950843e-05, "loss": 1.5123, "step": 59800 }, { "epoch": 0.04454657682983059, "grad_norm": 0.6979494690895081, "learning_rate": 1.9996083860033107e-05, "loss": 1.5213, "step": 59900 }, { "epoch": 0.04462094507161662, "grad_norm": 0.5807011723518372, "learning_rate": 1.9996070773289816e-05, "loss": 1.5411, "step": 60000 }, { "epoch": 0.044695313313402645, "grad_norm": 0.6768651604652405, "learning_rate": 1.9996057664721e-05, "loss": 1.5252, "step": 60100 }, { "epoch": 0.04476968155518867, "grad_norm": 0.3594638407230377, "learning_rate": 1.9996044534326682e-05, "loss": 1.5126, "step": 60200 }, { "epoch": 0.0448440497969747, "grad_norm": 0.4025649130344391, "learning_rate": 1.9996031382106897e-05, "loss": 1.561, "step": 60300 }, { "epoch": 0.044918418038760725, "grad_norm": 0.8125213980674744, "learning_rate": 1.9996018208061675e-05, "loss": 1.5445, "step": 60400 }, { "epoch": 0.04499278628054675, "grad_norm": 0.5969058275222778, "learning_rate": 1.9996005012191037e-05, "loss": 1.582, "step": 60500 }, { "epoch": 0.045067154522332786, "grad_norm": 1.1144986152648926, "learning_rate": 1.9995991794495016e-05, "loss": 1.5563, "step": 60600 }, { "epoch": 0.04514152276411881, "grad_norm": 0.8091686367988586, "learning_rate": 1.999597855497364e-05, "loss": 1.5199, "step": 60700 }, { "epoch": 0.04521589100590484, "grad_norm": 1.3050564527511597, "learning_rate": 1.999596529362694e-05, "loss": 1.5034, "step": 60800 }, { "epoch": 0.04529025924769087, "grad_norm": 0.5470508933067322, "learning_rate": 1.9995952010454943e-05, "loss": 1.5684, "step": 60900 }, { "epoch": 0.045364627489476894, "grad_norm": 0.9612744450569153, "learning_rate": 1.9995938705457682e-05, "loss": 1.6064, "step": 61000 }, { "epoch": 0.04543899573126292, "grad_norm": 0.9011774659156799, "learning_rate": 1.9995925378635177e-05, "loss": 1.4553, "step": 61100 }, { "epoch": 0.04551336397304895, "grad_norm": 1.70448637008667, "learning_rate": 1.9995912029987466e-05, "loss": 1.4507, "step": 61200 }, { "epoch": 0.045587732214834975, "grad_norm": 1.7321926355361938, "learning_rate": 1.999589865951457e-05, "loss": 1.5071, "step": 61300 }, { "epoch": 0.045662100456621, "grad_norm": 0.5415388941764832, "learning_rate": 1.999588526721653e-05, "loss": 1.5518, "step": 61400 }, { "epoch": 0.04573646869840703, "grad_norm": 0.8833714127540588, "learning_rate": 1.9995871853093366e-05, "loss": 1.5299, "step": 61500 }, { "epoch": 0.04581083694019306, "grad_norm": 0.49804380536079407, "learning_rate": 1.999585841714511e-05, "loss": 1.5215, "step": 61600 }, { "epoch": 0.04588520518197909, "grad_norm": 1.3999980688095093, "learning_rate": 1.999584495937179e-05, "loss": 1.5028, "step": 61700 }, { "epoch": 0.04595957342376512, "grad_norm": 1.1679743528366089, "learning_rate": 1.9995831479773438e-05, "loss": 1.4767, "step": 61800 }, { "epoch": 0.046033941665551144, "grad_norm": 0.7388249635696411, "learning_rate": 1.999581797835008e-05, "loss": 1.558, "step": 61900 }, { "epoch": 0.04610830990733717, "grad_norm": 0.6812136769294739, "learning_rate": 1.9995804455101746e-05, "loss": 1.4495, "step": 62000 }, { "epoch": 0.0461826781491232, "grad_norm": 1.3702300786972046, "learning_rate": 1.999579091002847e-05, "loss": 1.4212, "step": 62100 }, { "epoch": 0.046257046390909225, "grad_norm": 0.42544421553611755, "learning_rate": 1.999577734313028e-05, "loss": 1.5603, "step": 62200 }, { "epoch": 0.04633141463269525, "grad_norm": 0.6235955357551575, "learning_rate": 1.99957637544072e-05, "loss": 1.5164, "step": 62300 }, { "epoch": 0.04640578287448128, "grad_norm": 0.30019888281822205, "learning_rate": 1.9995750143859262e-05, "loss": 1.4764, "step": 62400 }, { "epoch": 0.04648015111626731, "grad_norm": 0.509626567363739, "learning_rate": 1.99957365114865e-05, "loss": 1.535, "step": 62500 }, { "epoch": 0.04655451935805334, "grad_norm": 0.726915717124939, "learning_rate": 1.9995722857288943e-05, "loss": 1.5428, "step": 62600 }, { "epoch": 0.046628887599839366, "grad_norm": 0.5223472714424133, "learning_rate": 1.9995709181266613e-05, "loss": 1.548, "step": 62700 }, { "epoch": 0.04670325584162539, "grad_norm": 0.5914735794067383, "learning_rate": 1.9995695483419554e-05, "loss": 1.5433, "step": 62800 }, { "epoch": 0.04677762408341142, "grad_norm": 1.1892948150634766, "learning_rate": 1.999568176374778e-05, "loss": 1.5964, "step": 62900 }, { "epoch": 0.04685199232519745, "grad_norm": 0.47329986095428467, "learning_rate": 1.9995668022251333e-05, "loss": 1.4587, "step": 63000 }, { "epoch": 0.046926360566983474, "grad_norm": 0.7776244878768921, "learning_rate": 1.9995654258930237e-05, "loss": 1.5118, "step": 63100 }, { "epoch": 0.0470007288087695, "grad_norm": 0.4600290358066559, "learning_rate": 1.9995640473784526e-05, "loss": 1.5327, "step": 63200 }, { "epoch": 0.04707509705055553, "grad_norm": 0.785589873790741, "learning_rate": 1.9995626666814226e-05, "loss": 1.5346, "step": 63300 }, { "epoch": 0.047149465292341555, "grad_norm": 0.34471455216407776, "learning_rate": 1.999561283801937e-05, "loss": 1.5669, "step": 63400 }, { "epoch": 0.04722383353412759, "grad_norm": 0.8968401551246643, "learning_rate": 1.9995598987399988e-05, "loss": 1.4522, "step": 63500 }, { "epoch": 0.047298201775913616, "grad_norm": 0.5577977895736694, "learning_rate": 1.9995585114956104e-05, "loss": 1.5894, "step": 63600 }, { "epoch": 0.04737257001769964, "grad_norm": 0.8406354188919067, "learning_rate": 1.999557122068776e-05, "loss": 1.5585, "step": 63700 }, { "epoch": 0.04744693825948567, "grad_norm": 0.6812056303024292, "learning_rate": 1.9995557304594977e-05, "loss": 1.5531, "step": 63800 }, { "epoch": 0.0475213065012717, "grad_norm": 0.6341506242752075, "learning_rate": 1.999554336667779e-05, "loss": 1.5064, "step": 63900 }, { "epoch": 0.047595674743057724, "grad_norm": 0.7291605472564697, "learning_rate": 1.999552940693623e-05, "loss": 1.4924, "step": 64000 }, { "epoch": 0.04767004298484375, "grad_norm": 0.5496443510055542, "learning_rate": 1.9995515425370317e-05, "loss": 1.5276, "step": 64100 }, { "epoch": 0.04774441122662978, "grad_norm": 0.49453896284103394, "learning_rate": 1.9995501421980096e-05, "loss": 1.4673, "step": 64200 }, { "epoch": 0.047818779468415805, "grad_norm": 0.5134396553039551, "learning_rate": 1.999548739676559e-05, "loss": 1.5857, "step": 64300 }, { "epoch": 0.04789314771020184, "grad_norm": 1.035983681678772, "learning_rate": 1.9995473349726834e-05, "loss": 1.4617, "step": 64400 }, { "epoch": 0.047967515951987866, "grad_norm": 0.4110111594200134, "learning_rate": 1.999545928086385e-05, "loss": 1.599, "step": 64500 }, { "epoch": 0.04804188419377389, "grad_norm": 0.6466584205627441, "learning_rate": 1.999544519017668e-05, "loss": 1.5212, "step": 64600 }, { "epoch": 0.04811625243555992, "grad_norm": 0.501596212387085, "learning_rate": 1.9995431077665345e-05, "loss": 1.5215, "step": 64700 }, { "epoch": 0.04819062067734595, "grad_norm": 0.547459065914154, "learning_rate": 1.9995416943329882e-05, "loss": 1.5414, "step": 64800 }, { "epoch": 0.048264988919131974, "grad_norm": 0.9374314546585083, "learning_rate": 1.999540278717032e-05, "loss": 1.5104, "step": 64900 }, { "epoch": 0.048339357160918, "grad_norm": 0.5237802267074585, "learning_rate": 1.999538860918669e-05, "loss": 1.5065, "step": 65000 }, { "epoch": 0.04841372540270403, "grad_norm": 0.534058690071106, "learning_rate": 1.9995374409379023e-05, "loss": 1.5237, "step": 65100 }, { "epoch": 0.048488093644490055, "grad_norm": 0.5253255367279053, "learning_rate": 1.999536018774735e-05, "loss": 1.5591, "step": 65200 }, { "epoch": 0.04856246188627608, "grad_norm": 0.6362668871879578, "learning_rate": 1.99953459442917e-05, "loss": 1.5155, "step": 65300 }, { "epoch": 0.048636830128062115, "grad_norm": 0.4244192838668823, "learning_rate": 1.999533167901211e-05, "loss": 1.5238, "step": 65400 }, { "epoch": 0.04871119836984814, "grad_norm": 0.7062031030654907, "learning_rate": 1.99953173919086e-05, "loss": 1.637, "step": 65500 }, { "epoch": 0.04878556661163417, "grad_norm": 0.5232000946998596, "learning_rate": 1.9995303082981215e-05, "loss": 1.4824, "step": 65600 }, { "epoch": 0.048859934853420196, "grad_norm": 0.6280112862586975, "learning_rate": 1.9995288752229976e-05, "loss": 1.5882, "step": 65700 }, { "epoch": 0.04893430309520622, "grad_norm": 1.1615891456604004, "learning_rate": 1.999527439965492e-05, "loss": 1.4839, "step": 65800 }, { "epoch": 0.04900867133699225, "grad_norm": 0.6920228600502014, "learning_rate": 1.9995260025256075e-05, "loss": 1.5071, "step": 65900 }, { "epoch": 0.04908303957877828, "grad_norm": 0.7031546235084534, "learning_rate": 1.999524562903347e-05, "loss": 1.4983, "step": 66000 }, { "epoch": 0.049157407820564304, "grad_norm": 0.4306289553642273, "learning_rate": 1.999523121098714e-05, "loss": 1.519, "step": 66100 }, { "epoch": 0.04923177606235033, "grad_norm": 0.533328652381897, "learning_rate": 1.9995216771117123e-05, "loss": 1.5628, "step": 66200 }, { "epoch": 0.049306144304136365, "grad_norm": 0.6325706839561462, "learning_rate": 1.999520230942344e-05, "loss": 1.4369, "step": 66300 }, { "epoch": 0.04938051254592239, "grad_norm": 0.43968090415000916, "learning_rate": 1.9995187825906125e-05, "loss": 1.4506, "step": 66400 }, { "epoch": 0.04945488078770842, "grad_norm": 1.3659377098083496, "learning_rate": 1.9995173320565217e-05, "loss": 1.4786, "step": 66500 }, { "epoch": 0.049529249029494446, "grad_norm": 0.7602143883705139, "learning_rate": 1.9995158793400735e-05, "loss": 1.5922, "step": 66600 }, { "epoch": 0.04960361727128047, "grad_norm": 0.4866381287574768, "learning_rate": 1.999514424441272e-05, "loss": 1.5279, "step": 66700 }, { "epoch": 0.0496779855130665, "grad_norm": 0.9451634287834167, "learning_rate": 1.9995129673601203e-05, "loss": 1.5125, "step": 66800 }, { "epoch": 0.04975235375485253, "grad_norm": 0.49570247530937195, "learning_rate": 1.999511508096621e-05, "loss": 1.556, "step": 66900 }, { "epoch": 0.049826721996638554, "grad_norm": 0.5554278492927551, "learning_rate": 1.999510046650778e-05, "loss": 1.5644, "step": 67000 }, { "epoch": 0.04990109023842458, "grad_norm": 1.070966124534607, "learning_rate": 1.9995085830225943e-05, "loss": 1.4394, "step": 67100 }, { "epoch": 0.04997545848021061, "grad_norm": 0.47984740138053894, "learning_rate": 1.999507117212073e-05, "loss": 1.5499, "step": 67200 }, { "epoch": 0.05004982672199664, "grad_norm": 0.6557255983352661, "learning_rate": 1.999505649219217e-05, "loss": 1.5894, "step": 67300 }, { "epoch": 0.05012419496378267, "grad_norm": 0.4384523332118988, "learning_rate": 1.99950417904403e-05, "loss": 1.5454, "step": 67400 }, { "epoch": 0.050198563205568696, "grad_norm": 1.0821648836135864, "learning_rate": 1.9995027066865148e-05, "loss": 1.5872, "step": 67500 }, { "epoch": 0.05027293144735472, "grad_norm": 0.8576905727386475, "learning_rate": 1.9995012321466747e-05, "loss": 1.5024, "step": 67600 }, { "epoch": 0.05034729968914075, "grad_norm": 0.7904402613639832, "learning_rate": 1.9994997554245136e-05, "loss": 1.4259, "step": 67700 }, { "epoch": 0.050421667930926777, "grad_norm": 1.2858697175979614, "learning_rate": 1.9994982765200337e-05, "loss": 1.5907, "step": 67800 }, { "epoch": 0.050496036172712803, "grad_norm": 0.35552987456321716, "learning_rate": 1.9994967954332388e-05, "loss": 1.5593, "step": 67900 }, { "epoch": 0.05057040441449883, "grad_norm": 0.5848758220672607, "learning_rate": 1.999495312164132e-05, "loss": 1.5435, "step": 68000 }, { "epoch": 0.05064477265628486, "grad_norm": 1.5355236530303955, "learning_rate": 1.999493826712717e-05, "loss": 1.5301, "step": 68100 }, { "epoch": 0.050719140898070884, "grad_norm": 0.595832109451294, "learning_rate": 1.999492339078996e-05, "loss": 1.5225, "step": 68200 }, { "epoch": 0.05079350913985692, "grad_norm": 0.47388339042663574, "learning_rate": 1.999490849262973e-05, "loss": 1.5252, "step": 68300 }, { "epoch": 0.050867877381642945, "grad_norm": 0.48052307963371277, "learning_rate": 1.999489357264651e-05, "loss": 1.5274, "step": 68400 }, { "epoch": 0.05094224562342897, "grad_norm": 0.7523823380470276, "learning_rate": 1.9994878630840334e-05, "loss": 1.5485, "step": 68500 }, { "epoch": 0.051016613865215, "grad_norm": 0.5487205982208252, "learning_rate": 1.9994863667211237e-05, "loss": 1.5851, "step": 68600 }, { "epoch": 0.051090982107001026, "grad_norm": 0.899217963218689, "learning_rate": 1.999484868175925e-05, "loss": 1.5519, "step": 68700 }, { "epoch": 0.05116535034878705, "grad_norm": 0.6217190623283386, "learning_rate": 1.9994833674484398e-05, "loss": 1.465, "step": 68800 }, { "epoch": 0.05123971859057308, "grad_norm": 0.5816856026649475, "learning_rate": 1.9994818645386725e-05, "loss": 1.4822, "step": 68900 }, { "epoch": 0.05131408683235911, "grad_norm": 0.5480476021766663, "learning_rate": 1.999480359446626e-05, "loss": 1.5958, "step": 69000 }, { "epoch": 0.051388455074145134, "grad_norm": 0.6178867220878601, "learning_rate": 1.9994788521723033e-05, "loss": 1.4214, "step": 69100 }, { "epoch": 0.05146282331593117, "grad_norm": 0.639522135257721, "learning_rate": 1.999477342715708e-05, "loss": 1.5462, "step": 69200 }, { "epoch": 0.051537191557717195, "grad_norm": 0.8950421810150146, "learning_rate": 1.9994758310768432e-05, "loss": 1.5562, "step": 69300 }, { "epoch": 0.05161155979950322, "grad_norm": 0.9787744283676147, "learning_rate": 1.9994743172557123e-05, "loss": 1.5684, "step": 69400 }, { "epoch": 0.05168592804128925, "grad_norm": 0.34816843271255493, "learning_rate": 1.999472801252319e-05, "loss": 1.5693, "step": 69500 }, { "epoch": 0.051760296283075276, "grad_norm": 0.8306708931922913, "learning_rate": 1.9994712830666658e-05, "loss": 1.6258, "step": 69600 }, { "epoch": 0.0518346645248613, "grad_norm": 0.6776370406150818, "learning_rate": 1.9994697626987562e-05, "loss": 1.432, "step": 69700 }, { "epoch": 0.05190903276664733, "grad_norm": 0.3862565755844116, "learning_rate": 1.999468240148594e-05, "loss": 1.5958, "step": 69800 }, { "epoch": 0.05198340100843336, "grad_norm": 0.42656075954437256, "learning_rate": 1.9994667154161826e-05, "loss": 1.5192, "step": 69900 }, { "epoch": 0.052057769250219384, "grad_norm": 0.7187511920928955, "learning_rate": 1.9994651885015246e-05, "loss": 1.4779, "step": 70000 }, { "epoch": 0.05213213749200541, "grad_norm": 0.7468114495277405, "learning_rate": 1.9994636594046237e-05, "loss": 1.4672, "step": 70100 }, { "epoch": 0.052206505733791445, "grad_norm": 0.5125714540481567, "learning_rate": 1.9994621281254834e-05, "loss": 1.5607, "step": 70200 }, { "epoch": 0.05228087397557747, "grad_norm": 0.6202149987220764, "learning_rate": 1.999460594664107e-05, "loss": 1.5402, "step": 70300 }, { "epoch": 0.0523552422173635, "grad_norm": 1.1004749536514282, "learning_rate": 1.9994590590204974e-05, "loss": 1.449, "step": 70400 }, { "epoch": 0.052429610459149525, "grad_norm": 0.5230892896652222, "learning_rate": 1.9994575211946588e-05, "loss": 1.5675, "step": 70500 }, { "epoch": 0.05250397870093555, "grad_norm": 0.4736848771572113, "learning_rate": 1.9994559811865936e-05, "loss": 1.4462, "step": 70600 }, { "epoch": 0.05257834694272158, "grad_norm": 0.722038209438324, "learning_rate": 1.9994544389963063e-05, "loss": 1.5297, "step": 70700 }, { "epoch": 0.052652715184507606, "grad_norm": 0.38121601939201355, "learning_rate": 1.999452894623799e-05, "loss": 1.4976, "step": 70800 }, { "epoch": 0.05272708342629363, "grad_norm": 0.7381113767623901, "learning_rate": 1.999451348069076e-05, "loss": 1.6052, "step": 70900 }, { "epoch": 0.05280145166807966, "grad_norm": 0.48022809624671936, "learning_rate": 1.99944979933214e-05, "loss": 1.5308, "step": 71000 }, { "epoch": 0.052875819909865694, "grad_norm": 0.6746697425842285, "learning_rate": 1.9994482484129952e-05, "loss": 1.5776, "step": 71100 }, { "epoch": 0.05295018815165172, "grad_norm": 1.0217593908309937, "learning_rate": 1.999446695311644e-05, "loss": 1.5206, "step": 71200 }, { "epoch": 0.05302455639343775, "grad_norm": 0.9935411810874939, "learning_rate": 1.999445140028091e-05, "loss": 1.5362, "step": 71300 }, { "epoch": 0.053098924635223775, "grad_norm": 0.6215861439704895, "learning_rate": 1.9994435825623382e-05, "loss": 1.5598, "step": 71400 }, { "epoch": 0.0531732928770098, "grad_norm": 0.48527583479881287, "learning_rate": 1.99944202291439e-05, "loss": 1.5116, "step": 71500 }, { "epoch": 0.05324766111879583, "grad_norm": 1.3961418867111206, "learning_rate": 1.9994404610842496e-05, "loss": 1.5574, "step": 71600 }, { "epoch": 0.053322029360581856, "grad_norm": 1.0021171569824219, "learning_rate": 1.9994388970719202e-05, "loss": 1.5676, "step": 71700 }, { "epoch": 0.05339639760236788, "grad_norm": 0.5348053574562073, "learning_rate": 1.9994373308774052e-05, "loss": 1.4911, "step": 71800 }, { "epoch": 0.05347076584415391, "grad_norm": 0.5527310967445374, "learning_rate": 1.9994357625007087e-05, "loss": 1.5595, "step": 71900 }, { "epoch": 0.05354513408593994, "grad_norm": 1.1981103420257568, "learning_rate": 1.999434191941833e-05, "loss": 1.5239, "step": 72000 }, { "epoch": 0.05361950232772597, "grad_norm": 0.507123589515686, "learning_rate": 1.999432619200782e-05, "loss": 1.5324, "step": 72100 }, { "epoch": 0.053693870569512, "grad_norm": 0.4210796356201172, "learning_rate": 1.99943104427756e-05, "loss": 1.5681, "step": 72200 }, { "epoch": 0.053768238811298025, "grad_norm": 0.574341893196106, "learning_rate": 1.999429467172169e-05, "loss": 1.6575, "step": 72300 }, { "epoch": 0.05384260705308405, "grad_norm": 0.5402580499649048, "learning_rate": 1.9994278878846135e-05, "loss": 1.5097, "step": 72400 }, { "epoch": 0.05391697529487008, "grad_norm": 0.5868122577667236, "learning_rate": 1.9994263064148964e-05, "loss": 1.5158, "step": 72500 }, { "epoch": 0.053991343536656106, "grad_norm": 0.5461186170578003, "learning_rate": 1.9994247227630216e-05, "loss": 1.4676, "step": 72600 }, { "epoch": 0.05406571177844213, "grad_norm": 0.56854248046875, "learning_rate": 1.999423136928992e-05, "loss": 1.5803, "step": 72700 }, { "epoch": 0.05414008002022816, "grad_norm": 0.5925450325012207, "learning_rate": 1.9994215489128113e-05, "loss": 1.5622, "step": 72800 }, { "epoch": 0.05421444826201419, "grad_norm": 0.9310332536697388, "learning_rate": 1.999419958714483e-05, "loss": 1.5552, "step": 72900 }, { "epoch": 0.05428881650380022, "grad_norm": 0.6535036563873291, "learning_rate": 1.9994183663340106e-05, "loss": 1.5079, "step": 73000 }, { "epoch": 0.05436318474558625, "grad_norm": 0.759397029876709, "learning_rate": 1.9994167717713976e-05, "loss": 1.4763, "step": 73100 }, { "epoch": 0.054437552987372274, "grad_norm": 0.8042114973068237, "learning_rate": 1.999415175026648e-05, "loss": 1.5085, "step": 73200 }, { "epoch": 0.0545119212291583, "grad_norm": 0.5362099409103394, "learning_rate": 1.999413576099764e-05, "loss": 1.4936, "step": 73300 }, { "epoch": 0.05458628947094433, "grad_norm": 0.5755407214164734, "learning_rate": 1.9994119749907502e-05, "loss": 1.5056, "step": 73400 }, { "epoch": 0.054660657712730355, "grad_norm": 0.595206081867218, "learning_rate": 1.9994103716996097e-05, "loss": 1.4753, "step": 73500 }, { "epoch": 0.05473502595451638, "grad_norm": 0.8156918287277222, "learning_rate": 1.9994087662263457e-05, "loss": 1.5586, "step": 73600 }, { "epoch": 0.05480939419630241, "grad_norm": 0.739098310470581, "learning_rate": 1.999407158570962e-05, "loss": 1.5223, "step": 73700 }, { "epoch": 0.054883762438088436, "grad_norm": 1.348789095878601, "learning_rate": 1.999405548733463e-05, "loss": 1.5179, "step": 73800 }, { "epoch": 0.05495813067987446, "grad_norm": 0.671525776386261, "learning_rate": 1.99940393671385e-05, "loss": 1.5044, "step": 73900 }, { "epoch": 0.0550324989216605, "grad_norm": 1.034043312072754, "learning_rate": 1.9994023225121288e-05, "loss": 1.4223, "step": 74000 }, { "epoch": 0.055106867163446524, "grad_norm": 1.1060287952423096, "learning_rate": 1.9994007061283018e-05, "loss": 1.5573, "step": 74100 }, { "epoch": 0.05518123540523255, "grad_norm": 0.8618998527526855, "learning_rate": 1.999399087562373e-05, "loss": 1.4898, "step": 74200 }, { "epoch": 0.05525560364701858, "grad_norm": 0.714076817035675, "learning_rate": 1.9993974668143452e-05, "loss": 1.451, "step": 74300 }, { "epoch": 0.055329971888804605, "grad_norm": 0.4572731554508209, "learning_rate": 1.9993958438842224e-05, "loss": 1.5303, "step": 74400 }, { "epoch": 0.05540434013059063, "grad_norm": 0.496499627828598, "learning_rate": 1.9993942187720082e-05, "loss": 1.6219, "step": 74500 }, { "epoch": 0.05547870837237666, "grad_norm": 0.4714408218860626, "learning_rate": 1.9993925914777064e-05, "loss": 1.5501, "step": 74600 }, { "epoch": 0.055553076614162686, "grad_norm": 0.5283282995223999, "learning_rate": 1.9993909620013203e-05, "loss": 1.5221, "step": 74700 }, { "epoch": 0.05562744485594871, "grad_norm": 0.8781616687774658, "learning_rate": 1.999389330342853e-05, "loss": 1.5478, "step": 74800 }, { "epoch": 0.05570181309773475, "grad_norm": 0.5995525121688843, "learning_rate": 1.9993876965023084e-05, "loss": 1.5069, "step": 74900 }, { "epoch": 0.055776181339520774, "grad_norm": 0.533664882183075, "learning_rate": 1.9993860604796905e-05, "loss": 1.514, "step": 75000 }, { "epoch": 0.0558505495813068, "grad_norm": 1.012466311454773, "learning_rate": 1.9993844222750023e-05, "loss": 1.473, "step": 75100 }, { "epoch": 0.05592491782309283, "grad_norm": 0.7862614393234253, "learning_rate": 1.9993827818882473e-05, "loss": 1.4832, "step": 75200 }, { "epoch": 0.055999286064878855, "grad_norm": 0.7203556299209595, "learning_rate": 1.9993811393194302e-05, "loss": 1.5157, "step": 75300 }, { "epoch": 0.05607365430666488, "grad_norm": 1.1218525171279907, "learning_rate": 1.9993794945685528e-05, "loss": 1.4169, "step": 75400 }, { "epoch": 0.05614802254845091, "grad_norm": 0.46560999751091003, "learning_rate": 1.99937784763562e-05, "loss": 1.4688, "step": 75500 }, { "epoch": 0.056222390790236935, "grad_norm": 0.9627271294593811, "learning_rate": 1.999376198520635e-05, "loss": 1.5489, "step": 75600 }, { "epoch": 0.05629675903202296, "grad_norm": 0.9937626719474792, "learning_rate": 1.9993745472236018e-05, "loss": 1.5759, "step": 75700 }, { "epoch": 0.05637112727380899, "grad_norm": 0.6520542502403259, "learning_rate": 1.9993728937445232e-05, "loss": 1.4653, "step": 75800 }, { "epoch": 0.05644549551559502, "grad_norm": 1.1701862812042236, "learning_rate": 1.9993712380834034e-05, "loss": 1.4875, "step": 75900 }, { "epoch": 0.05651986375738105, "grad_norm": 0.9439906477928162, "learning_rate": 1.999369580240246e-05, "loss": 1.5524, "step": 76000 }, { "epoch": 0.05659423199916708, "grad_norm": 1.1177873611450195, "learning_rate": 1.9993679202150543e-05, "loss": 1.558, "step": 76100 }, { "epoch": 0.056668600240953104, "grad_norm": 0.4650721549987793, "learning_rate": 1.9993662580078317e-05, "loss": 1.5035, "step": 76200 }, { "epoch": 0.05674296848273913, "grad_norm": 0.5230388045310974, "learning_rate": 1.999364593618583e-05, "loss": 1.5027, "step": 76300 }, { "epoch": 0.05681733672452516, "grad_norm": 0.6694977879524231, "learning_rate": 1.9993629270473108e-05, "loss": 1.4642, "step": 76400 }, { "epoch": 0.056891704966311185, "grad_norm": 0.6857712268829346, "learning_rate": 1.999361258294019e-05, "loss": 1.6462, "step": 76500 }, { "epoch": 0.05696607320809721, "grad_norm": 0.708351731300354, "learning_rate": 1.9993595873587112e-05, "loss": 1.4773, "step": 76600 }, { "epoch": 0.05704044144988324, "grad_norm": 0.451820969581604, "learning_rate": 1.999357914241391e-05, "loss": 1.5512, "step": 76700 }, { "epoch": 0.05711480969166927, "grad_norm": 0.9653975963592529, "learning_rate": 1.9993562389420623e-05, "loss": 1.5231, "step": 76800 }, { "epoch": 0.0571891779334553, "grad_norm": 1.0675276517868042, "learning_rate": 1.9993545614607287e-05, "loss": 1.5519, "step": 76900 }, { "epoch": 0.05726354617524133, "grad_norm": 0.6132591366767883, "learning_rate": 1.9993528817973938e-05, "loss": 1.5634, "step": 77000 }, { "epoch": 0.057337914417027354, "grad_norm": 0.6499157547950745, "learning_rate": 1.999351199952061e-05, "loss": 1.4993, "step": 77100 }, { "epoch": 0.05741228265881338, "grad_norm": 0.8147251605987549, "learning_rate": 1.999349515924734e-05, "loss": 1.5336, "step": 77200 }, { "epoch": 0.05748665090059941, "grad_norm": 0.601445198059082, "learning_rate": 1.9993478297154175e-05, "loss": 1.5546, "step": 77300 }, { "epoch": 0.057561019142385435, "grad_norm": 0.7941200137138367, "learning_rate": 1.9993461413241138e-05, "loss": 1.5751, "step": 77400 }, { "epoch": 0.05763538738417146, "grad_norm": 0.5446432828903198, "learning_rate": 1.9993444507508272e-05, "loss": 1.5638, "step": 77500 }, { "epoch": 0.05770975562595749, "grad_norm": 1.241955280303955, "learning_rate": 1.9993427579955617e-05, "loss": 1.5663, "step": 77600 }, { "epoch": 0.057784123867743516, "grad_norm": 0.47920486330986023, "learning_rate": 1.99934106305832e-05, "loss": 1.4326, "step": 77700 }, { "epoch": 0.05785849210952955, "grad_norm": 0.8999041318893433, "learning_rate": 1.9993393659391068e-05, "loss": 1.5711, "step": 77800 }, { "epoch": 0.05793286035131558, "grad_norm": 0.6789896488189697, "learning_rate": 1.9993376666379256e-05, "loss": 1.5342, "step": 77900 }, { "epoch": 0.058007228593101604, "grad_norm": 0.5044109225273132, "learning_rate": 1.9993359651547798e-05, "loss": 1.4873, "step": 78000 }, { "epoch": 0.05808159683488763, "grad_norm": 0.7116490006446838, "learning_rate": 1.9993342614896733e-05, "loss": 1.453, "step": 78100 }, { "epoch": 0.05815596507667366, "grad_norm": 0.5207152962684631, "learning_rate": 1.9993325556426096e-05, "loss": 1.4711, "step": 78200 }, { "epoch": 0.058230333318459684, "grad_norm": 0.8057217001914978, "learning_rate": 1.999330847613593e-05, "loss": 1.5021, "step": 78300 }, { "epoch": 0.05830470156024571, "grad_norm": 1.1154026985168457, "learning_rate": 1.9993291374026266e-05, "loss": 1.4475, "step": 78400 }, { "epoch": 0.05837906980203174, "grad_norm": 0.4721396267414093, "learning_rate": 1.9993274250097146e-05, "loss": 1.5285, "step": 78500 }, { "epoch": 0.058453438043817765, "grad_norm": 1.0618817806243896, "learning_rate": 1.9993257104348604e-05, "loss": 1.5323, "step": 78600 }, { "epoch": 0.0585278062856038, "grad_norm": 1.1249905824661255, "learning_rate": 1.999323993678068e-05, "loss": 1.5252, "step": 78700 }, { "epoch": 0.058602174527389826, "grad_norm": 0.48599275946617126, "learning_rate": 1.999322274739341e-05, "loss": 1.5124, "step": 78800 }, { "epoch": 0.05867654276917585, "grad_norm": 0.5065784454345703, "learning_rate": 1.999320553618683e-05, "loss": 1.5858, "step": 78900 }, { "epoch": 0.05875091101096188, "grad_norm": 0.854963481426239, "learning_rate": 1.999318830316098e-05, "loss": 1.513, "step": 79000 }, { "epoch": 0.05882527925274791, "grad_norm": 0.556955099105835, "learning_rate": 1.9993171048315895e-05, "loss": 1.514, "step": 79100 }, { "epoch": 0.058899647494533934, "grad_norm": 0.6691248416900635, "learning_rate": 1.9993153771651618e-05, "loss": 1.4574, "step": 79200 }, { "epoch": 0.05897401573631996, "grad_norm": 0.5654352903366089, "learning_rate": 1.999313647316818e-05, "loss": 1.5046, "step": 79300 }, { "epoch": 0.05904838397810599, "grad_norm": 0.9016973972320557, "learning_rate": 1.9993119152865624e-05, "loss": 1.5465, "step": 79400 }, { "epoch": 0.059122752219892015, "grad_norm": 0.4756191670894623, "learning_rate": 1.9993101810743985e-05, "loss": 1.4944, "step": 79500 }, { "epoch": 0.05919712046167804, "grad_norm": 0.44962599873542786, "learning_rate": 1.9993084446803303e-05, "loss": 1.4853, "step": 79600 }, { "epoch": 0.059271488703464076, "grad_norm": 0.5768176913261414, "learning_rate": 1.9993067061043614e-05, "loss": 1.5246, "step": 79700 }, { "epoch": 0.0593458569452501, "grad_norm": 0.6383886933326721, "learning_rate": 1.9993049653464957e-05, "loss": 1.5407, "step": 79800 }, { "epoch": 0.05942022518703613, "grad_norm": 0.5047423243522644, "learning_rate": 1.999303222406737e-05, "loss": 1.5593, "step": 79900 }, { "epoch": 0.05949459342882216, "grad_norm": 0.5224947333335876, "learning_rate": 1.999301477285089e-05, "loss": 1.5501, "step": 80000 }, { "epoch": 0.059568961670608184, "grad_norm": 0.8568351864814758, "learning_rate": 1.9992997299815557e-05, "loss": 1.5291, "step": 80100 }, { "epoch": 0.05964332991239421, "grad_norm": 0.5065781474113464, "learning_rate": 1.9992979804961406e-05, "loss": 1.4743, "step": 80200 }, { "epoch": 0.05971769815418024, "grad_norm": 0.7506331205368042, "learning_rate": 1.999296228828848e-05, "loss": 1.575, "step": 80300 }, { "epoch": 0.059792066395966265, "grad_norm": 0.7313674092292786, "learning_rate": 1.999294474979681e-05, "loss": 1.4892, "step": 80400 }, { "epoch": 0.05986643463775229, "grad_norm": 0.6475706100463867, "learning_rate": 1.999292718948644e-05, "loss": 1.4716, "step": 80500 }, { "epoch": 0.05994080287953832, "grad_norm": 0.4502275586128235, "learning_rate": 1.999290960735741e-05, "loss": 1.5449, "step": 80600 }, { "epoch": 0.06001517112132435, "grad_norm": 0.7036411762237549, "learning_rate": 1.9992892003409753e-05, "loss": 1.4786, "step": 80700 }, { "epoch": 0.06008953936311038, "grad_norm": 0.5732350945472717, "learning_rate": 1.999287437764351e-05, "loss": 1.5548, "step": 80800 }, { "epoch": 0.060163907604896406, "grad_norm": 0.6757441759109497, "learning_rate": 1.999285673005872e-05, "loss": 1.4276, "step": 80900 }, { "epoch": 0.06023827584668243, "grad_norm": 0.8502363562583923, "learning_rate": 1.999283906065542e-05, "loss": 1.5078, "step": 81000 }, { "epoch": 0.06031264408846846, "grad_norm": 0.9248318672180176, "learning_rate": 1.9992821369433654e-05, "loss": 1.5352, "step": 81100 }, { "epoch": 0.06038701233025449, "grad_norm": 0.3702896535396576, "learning_rate": 1.999280365639345e-05, "loss": 1.5567, "step": 81200 }, { "epoch": 0.060461380572040514, "grad_norm": 0.8454656004905701, "learning_rate": 1.9992785921534853e-05, "loss": 1.5327, "step": 81300 }, { "epoch": 0.06053574881382654, "grad_norm": 1.452540397644043, "learning_rate": 1.9992768164857906e-05, "loss": 1.473, "step": 81400 }, { "epoch": 0.06061011705561257, "grad_norm": 0.5796297192573547, "learning_rate": 1.999275038636264e-05, "loss": 1.4551, "step": 81500 }, { "epoch": 0.0606844852973986, "grad_norm": 0.5252229571342468, "learning_rate": 1.9992732586049096e-05, "loss": 1.4727, "step": 81600 }, { "epoch": 0.06075885353918463, "grad_norm": 1.0359326601028442, "learning_rate": 1.999271476391732e-05, "loss": 1.5017, "step": 81700 }, { "epoch": 0.060833221780970656, "grad_norm": 0.49495527148246765, "learning_rate": 1.9992696919967337e-05, "loss": 1.5521, "step": 81800 }, { "epoch": 0.06090759002275668, "grad_norm": 0.548267662525177, "learning_rate": 1.9992679054199197e-05, "loss": 1.508, "step": 81900 }, { "epoch": 0.06098195826454271, "grad_norm": 0.5110555291175842, "learning_rate": 1.999266116661294e-05, "loss": 1.5644, "step": 82000 }, { "epoch": 0.06105632650632874, "grad_norm": 0.803193211555481, "learning_rate": 1.9992643257208595e-05, "loss": 1.4233, "step": 82100 }, { "epoch": 0.061130694748114764, "grad_norm": 1.2153749465942383, "learning_rate": 1.9992625325986207e-05, "loss": 1.5741, "step": 82200 }, { "epoch": 0.06120506298990079, "grad_norm": 0.48382624983787537, "learning_rate": 1.999260737294582e-05, "loss": 1.5272, "step": 82300 }, { "epoch": 0.06127943123168682, "grad_norm": 0.4789665937423706, "learning_rate": 1.9992589398087466e-05, "loss": 1.4757, "step": 82400 }, { "epoch": 0.061353799473472845, "grad_norm": 0.5505409240722656, "learning_rate": 1.9992571401411183e-05, "loss": 1.4968, "step": 82500 }, { "epoch": 0.06142816771525888, "grad_norm": 0.7146855592727661, "learning_rate": 1.999255338291702e-05, "loss": 1.4641, "step": 82600 }, { "epoch": 0.061502535957044906, "grad_norm": 1.2916581630706787, "learning_rate": 1.9992535342605008e-05, "loss": 1.4884, "step": 82700 }, { "epoch": 0.06157690419883093, "grad_norm": 0.5506526231765747, "learning_rate": 1.9992517280475186e-05, "loss": 1.4925, "step": 82800 }, { "epoch": 0.06165127244061696, "grad_norm": 1.0735949277877808, "learning_rate": 1.9992499196527598e-05, "loss": 1.456, "step": 82900 }, { "epoch": 0.06172564068240299, "grad_norm": 0.5877838134765625, "learning_rate": 1.9992481090762284e-05, "loss": 1.4362, "step": 83000 }, { "epoch": 0.061800008924189014, "grad_norm": 0.6066355109214783, "learning_rate": 1.9992462963179275e-05, "loss": 1.5472, "step": 83100 }, { "epoch": 0.06187437716597504, "grad_norm": 0.5328536629676819, "learning_rate": 1.9992444813778622e-05, "loss": 1.5712, "step": 83200 }, { "epoch": 0.06194874540776107, "grad_norm": 0.685464084148407, "learning_rate": 1.9992426642560356e-05, "loss": 1.531, "step": 83300 }, { "epoch": 0.062023113649547094, "grad_norm": 0.6651979684829712, "learning_rate": 1.999240844952452e-05, "loss": 1.5326, "step": 83400 }, { "epoch": 0.06209748189133313, "grad_norm": 0.9877690076828003, "learning_rate": 1.9992390234671157e-05, "loss": 1.5223, "step": 83500 }, { "epoch": 0.062171850133119155, "grad_norm": 0.4471887946128845, "learning_rate": 1.9992371998000303e-05, "loss": 1.5093, "step": 83600 }, { "epoch": 0.06224621837490518, "grad_norm": 0.8113996386528015, "learning_rate": 1.9992353739511994e-05, "loss": 1.4959, "step": 83700 }, { "epoch": 0.06232058661669121, "grad_norm": 0.5820923447608948, "learning_rate": 1.999233545920628e-05, "loss": 1.5134, "step": 83800 }, { "epoch": 0.062394954858477236, "grad_norm": 0.623708188533783, "learning_rate": 1.999231715708319e-05, "loss": 1.4944, "step": 83900 }, { "epoch": 0.06246932310026326, "grad_norm": 0.5685898065567017, "learning_rate": 1.9992298833142772e-05, "loss": 1.5297, "step": 84000 }, { "epoch": 0.06254369134204929, "grad_norm": 0.5108596682548523, "learning_rate": 1.999228048738506e-05, "loss": 1.4644, "step": 84100 }, { "epoch": 0.06261805958383532, "grad_norm": 0.636935293674469, "learning_rate": 1.99922621198101e-05, "loss": 1.5105, "step": 84200 }, { "epoch": 0.06269242782562134, "grad_norm": 0.7226575613021851, "learning_rate": 1.9992243730417926e-05, "loss": 1.5828, "step": 84300 }, { "epoch": 0.06276679606740737, "grad_norm": 0.7858364582061768, "learning_rate": 1.9992225319208584e-05, "loss": 1.5216, "step": 84400 }, { "epoch": 0.0628411643091934, "grad_norm": 0.5035095810890198, "learning_rate": 1.999220688618211e-05, "loss": 1.5342, "step": 84500 }, { "epoch": 0.06291553255097942, "grad_norm": 0.515177845954895, "learning_rate": 1.9992188431338547e-05, "loss": 1.5137, "step": 84600 }, { "epoch": 0.06298990079276545, "grad_norm": 0.6190256476402283, "learning_rate": 1.9992169954677933e-05, "loss": 1.4787, "step": 84700 }, { "epoch": 0.06306426903455148, "grad_norm": 0.42270639538764954, "learning_rate": 1.999215145620031e-05, "loss": 1.5532, "step": 84800 }, { "epoch": 0.06313863727633752, "grad_norm": 0.9928336143493652, "learning_rate": 1.999213293590572e-05, "loss": 1.5139, "step": 84900 }, { "epoch": 0.06321300551812355, "grad_norm": 0.6874875426292419, "learning_rate": 1.99921143937942e-05, "loss": 1.5843, "step": 85000 }, { "epoch": 0.06328737375990957, "grad_norm": 1.4590458869934082, "learning_rate": 1.9992095829865786e-05, "loss": 1.4197, "step": 85100 }, { "epoch": 0.0633617420016956, "grad_norm": 0.7171260714530945, "learning_rate": 1.999207724412053e-05, "loss": 1.5444, "step": 85200 }, { "epoch": 0.06343611024348163, "grad_norm": 0.7907926440238953, "learning_rate": 1.9992058636558466e-05, "loss": 1.4923, "step": 85300 }, { "epoch": 0.06351047848526765, "grad_norm": 0.5244536399841309, "learning_rate": 1.9992040007179635e-05, "loss": 1.4754, "step": 85400 }, { "epoch": 0.06358484672705368, "grad_norm": 0.7662790417671204, "learning_rate": 1.999202135598408e-05, "loss": 1.4759, "step": 85500 }, { "epoch": 0.06365921496883971, "grad_norm": 0.5479734539985657, "learning_rate": 1.9992002682971837e-05, "loss": 1.5631, "step": 85600 }, { "epoch": 0.06373358321062574, "grad_norm": 0.5378610491752625, "learning_rate": 1.9991983988142952e-05, "loss": 1.4574, "step": 85700 }, { "epoch": 0.06380795145241176, "grad_norm": 0.5130261182785034, "learning_rate": 1.9991965271497463e-05, "loss": 1.4096, "step": 85800 }, { "epoch": 0.06388231969419779, "grad_norm": 0.3913695812225342, "learning_rate": 1.9991946533035408e-05, "loss": 1.4662, "step": 85900 }, { "epoch": 0.06395668793598382, "grad_norm": 0.6080545783042908, "learning_rate": 1.9991927772756833e-05, "loss": 1.5168, "step": 86000 }, { "epoch": 0.06403105617776984, "grad_norm": 0.4266175627708435, "learning_rate": 1.9991908990661782e-05, "loss": 1.5904, "step": 86100 }, { "epoch": 0.06410542441955587, "grad_norm": 0.7378482818603516, "learning_rate": 1.9991890186750284e-05, "loss": 1.5445, "step": 86200 }, { "epoch": 0.0641797926613419, "grad_norm": 0.4735576808452606, "learning_rate": 1.999187136102239e-05, "loss": 1.5184, "step": 86300 }, { "epoch": 0.06425416090312792, "grad_norm": 1.1487786769866943, "learning_rate": 1.999185251347814e-05, "loss": 1.5623, "step": 86400 }, { "epoch": 0.06432852914491395, "grad_norm": 0.6790218353271484, "learning_rate": 1.9991833644117573e-05, "loss": 1.4743, "step": 86500 }, { "epoch": 0.06440289738669998, "grad_norm": 0.6615635752677917, "learning_rate": 1.9991814752940728e-05, "loss": 1.5226, "step": 86600 }, { "epoch": 0.064477265628486, "grad_norm": 0.5001498460769653, "learning_rate": 1.9991795839947652e-05, "loss": 1.5801, "step": 86700 }, { "epoch": 0.06455163387027203, "grad_norm": 0.880649983882904, "learning_rate": 1.9991776905138382e-05, "loss": 1.5611, "step": 86800 }, { "epoch": 0.06462600211205807, "grad_norm": 0.6761185526847839, "learning_rate": 1.9991757948512962e-05, "loss": 1.5622, "step": 86900 }, { "epoch": 0.0647003703538441, "grad_norm": 0.48481419682502747, "learning_rate": 1.999173897007143e-05, "loss": 1.4518, "step": 87000 }, { "epoch": 0.06477473859563013, "grad_norm": 0.5701479315757751, "learning_rate": 1.999171996981383e-05, "loss": 1.5306, "step": 87100 }, { "epoch": 0.06484910683741615, "grad_norm": 0.7284945845603943, "learning_rate": 1.99917009477402e-05, "loss": 1.4337, "step": 87200 }, { "epoch": 0.06492347507920218, "grad_norm": 0.7202057242393494, "learning_rate": 1.999168190385059e-05, "loss": 1.5605, "step": 87300 }, { "epoch": 0.06499784332098821, "grad_norm": 0.4802098274230957, "learning_rate": 1.9991662838145034e-05, "loss": 1.5428, "step": 87400 }, { "epoch": 0.06507221156277423, "grad_norm": 0.5068057775497437, "learning_rate": 1.9991643750623574e-05, "loss": 1.4441, "step": 87500 }, { "epoch": 0.06514657980456026, "grad_norm": 0.8798725605010986, "learning_rate": 1.9991624641286255e-05, "loss": 1.5766, "step": 87600 }, { "epoch": 0.06522094804634629, "grad_norm": 0.49363136291503906, "learning_rate": 1.9991605510133115e-05, "loss": 1.6196, "step": 87700 }, { "epoch": 0.06529531628813232, "grad_norm": 0.5400691628456116, "learning_rate": 1.99915863571642e-05, "loss": 1.5392, "step": 87800 }, { "epoch": 0.06536968452991834, "grad_norm": 0.5299246311187744, "learning_rate": 1.9991567182379546e-05, "loss": 1.5645, "step": 87900 }, { "epoch": 0.06544405277170437, "grad_norm": 0.6503016352653503, "learning_rate": 1.9991547985779202e-05, "loss": 1.4476, "step": 88000 }, { "epoch": 0.0655184210134904, "grad_norm": 0.5769862532615662, "learning_rate": 1.9991528767363207e-05, "loss": 1.5248, "step": 88100 }, { "epoch": 0.06559278925527642, "grad_norm": 0.8062888383865356, "learning_rate": 1.99915095271316e-05, "loss": 1.6394, "step": 88200 }, { "epoch": 0.06566715749706245, "grad_norm": 0.4004135727882385, "learning_rate": 1.999149026508443e-05, "loss": 1.5317, "step": 88300 }, { "epoch": 0.06574152573884848, "grad_norm": 0.6382884383201599, "learning_rate": 1.9991470981221727e-05, "loss": 1.5602, "step": 88400 }, { "epoch": 0.0658158939806345, "grad_norm": 0.535750150680542, "learning_rate": 1.9991451675543544e-05, "loss": 1.5113, "step": 88500 }, { "epoch": 0.06589026222242053, "grad_norm": 1.1604392528533936, "learning_rate": 1.999143234804992e-05, "loss": 1.4996, "step": 88600 }, { "epoch": 0.06596463046420656, "grad_norm": 0.7842492461204529, "learning_rate": 1.99914129987409e-05, "loss": 1.5054, "step": 88700 }, { "epoch": 0.0660389987059926, "grad_norm": 0.7460685968399048, "learning_rate": 1.999139362761652e-05, "loss": 1.433, "step": 88800 }, { "epoch": 0.06611336694777863, "grad_norm": 0.7984693050384521, "learning_rate": 1.9991374234676826e-05, "loss": 1.5551, "step": 88900 }, { "epoch": 0.06618773518956465, "grad_norm": 0.6733551621437073, "learning_rate": 1.999135481992186e-05, "loss": 1.4334, "step": 89000 }, { "epoch": 0.06626210343135068, "grad_norm": 0.8035016059875488, "learning_rate": 1.999133538335166e-05, "loss": 1.4872, "step": 89100 }, { "epoch": 0.06633647167313671, "grad_norm": 0.4339046776294708, "learning_rate": 1.9991315924966277e-05, "loss": 1.4869, "step": 89200 }, { "epoch": 0.06641083991492273, "grad_norm": 0.6680594086647034, "learning_rate": 1.9991296444765747e-05, "loss": 1.5103, "step": 89300 }, { "epoch": 0.06648520815670876, "grad_norm": 0.697487473487854, "learning_rate": 1.9991276942750117e-05, "loss": 1.4239, "step": 89400 }, { "epoch": 0.06655957639849479, "grad_norm": 0.587734043598175, "learning_rate": 1.9991257418919424e-05, "loss": 1.5856, "step": 89500 }, { "epoch": 0.06663394464028081, "grad_norm": 0.8574571013450623, "learning_rate": 1.999123787327372e-05, "loss": 1.4818, "step": 89600 }, { "epoch": 0.06670831288206684, "grad_norm": 1.0861676931381226, "learning_rate": 1.9991218305813035e-05, "loss": 1.4883, "step": 89700 }, { "epoch": 0.06678268112385287, "grad_norm": 1.0139306783676147, "learning_rate": 1.9991198716537422e-05, "loss": 1.5099, "step": 89800 }, { "epoch": 0.0668570493656389, "grad_norm": 0.6741511225700378, "learning_rate": 1.999117910544692e-05, "loss": 1.4746, "step": 89900 }, { "epoch": 0.06693141760742492, "grad_norm": 0.9702801704406738, "learning_rate": 1.999115947254157e-05, "loss": 1.5166, "step": 90000 }, { "epoch": 0.06700578584921095, "grad_norm": 0.7757803797721863, "learning_rate": 1.9991139817821416e-05, "loss": 1.5031, "step": 90100 }, { "epoch": 0.06708015409099698, "grad_norm": 0.7200698256492615, "learning_rate": 1.9991120141286502e-05, "loss": 1.5834, "step": 90200 }, { "epoch": 0.067154522332783, "grad_norm": 0.7415780425071716, "learning_rate": 1.999110044293687e-05, "loss": 1.5689, "step": 90300 }, { "epoch": 0.06722889057456903, "grad_norm": 0.5777677297592163, "learning_rate": 1.9991080722772564e-05, "loss": 1.5139, "step": 90400 }, { "epoch": 0.06730325881635506, "grad_norm": 0.6991866827011108, "learning_rate": 1.999106098079363e-05, "loss": 1.5073, "step": 90500 }, { "epoch": 0.06737762705814108, "grad_norm": 0.6112390160560608, "learning_rate": 1.9991041217000105e-05, "loss": 1.4773, "step": 90600 }, { "epoch": 0.06745199529992713, "grad_norm": 0.8287676572799683, "learning_rate": 1.9991021431392033e-05, "loss": 1.5425, "step": 90700 }, { "epoch": 0.06752636354171315, "grad_norm": 0.8582881689071655, "learning_rate": 1.999100162396946e-05, "loss": 1.5581, "step": 90800 }, { "epoch": 0.06760073178349918, "grad_norm": 0.5585276484489441, "learning_rate": 1.999098179473243e-05, "loss": 1.5015, "step": 90900 }, { "epoch": 0.0676751000252852, "grad_norm": 0.4237435460090637, "learning_rate": 1.9990961943680984e-05, "loss": 1.523, "step": 91000 }, { "epoch": 0.06774946826707123, "grad_norm": 0.5455594658851624, "learning_rate": 1.999094207081517e-05, "loss": 1.5448, "step": 91100 }, { "epoch": 0.06782383650885726, "grad_norm": 0.48855817317962646, "learning_rate": 1.999092217613502e-05, "loss": 1.4535, "step": 91200 }, { "epoch": 0.06789820475064329, "grad_norm": 0.5199916958808899, "learning_rate": 1.999090225964059e-05, "loss": 1.4921, "step": 91300 }, { "epoch": 0.06797257299242931, "grad_norm": 0.5790271162986755, "learning_rate": 1.9990882321331916e-05, "loss": 1.5773, "step": 91400 }, { "epoch": 0.06804694123421534, "grad_norm": 0.5524342656135559, "learning_rate": 1.9990862361209043e-05, "loss": 1.4619, "step": 91500 }, { "epoch": 0.06812130947600137, "grad_norm": 0.7153291702270508, "learning_rate": 1.999084237927202e-05, "loss": 1.6042, "step": 91600 }, { "epoch": 0.0681956777177874, "grad_norm": 0.957635223865509, "learning_rate": 1.9990822375520882e-05, "loss": 1.538, "step": 91700 }, { "epoch": 0.06827004595957342, "grad_norm": 0.38240477442741394, "learning_rate": 1.9990802349955678e-05, "loss": 1.5937, "step": 91800 }, { "epoch": 0.06834441420135945, "grad_norm": 0.8961233496665955, "learning_rate": 1.999078230257645e-05, "loss": 1.5119, "step": 91900 }, { "epoch": 0.06841878244314548, "grad_norm": 0.47433900833129883, "learning_rate": 1.999076223338324e-05, "loss": 1.5449, "step": 92000 }, { "epoch": 0.0684931506849315, "grad_norm": 0.8222399353981018, "learning_rate": 1.9990742142376098e-05, "loss": 1.5334, "step": 92100 }, { "epoch": 0.06856751892671753, "grad_norm": 0.464373916387558, "learning_rate": 1.999072202955506e-05, "loss": 1.5003, "step": 92200 }, { "epoch": 0.06864188716850356, "grad_norm": 0.8799763321876526, "learning_rate": 1.9990701894920176e-05, "loss": 1.581, "step": 92300 }, { "epoch": 0.06871625541028958, "grad_norm": 0.9567086100578308, "learning_rate": 1.999068173847149e-05, "loss": 1.4373, "step": 92400 }, { "epoch": 0.06879062365207561, "grad_norm": 0.440479576587677, "learning_rate": 1.999066156020904e-05, "loss": 1.5571, "step": 92500 }, { "epoch": 0.06886499189386165, "grad_norm": 0.7486180663108826, "learning_rate": 1.9990641360132876e-05, "loss": 1.4437, "step": 92600 }, { "epoch": 0.06893936013564768, "grad_norm": 0.7576742172241211, "learning_rate": 1.9990621138243037e-05, "loss": 1.5306, "step": 92700 }, { "epoch": 0.0690137283774337, "grad_norm": 0.6755186915397644, "learning_rate": 1.9990600894539574e-05, "loss": 1.5769, "step": 92800 }, { "epoch": 0.06908809661921973, "grad_norm": 0.6093853712081909, "learning_rate": 1.9990580629022526e-05, "loss": 1.5777, "step": 92900 }, { "epoch": 0.06916246486100576, "grad_norm": 0.5788242220878601, "learning_rate": 1.9990560341691938e-05, "loss": 1.494, "step": 93000 }, { "epoch": 0.06923683310279179, "grad_norm": 0.828676700592041, "learning_rate": 1.9990540032547855e-05, "loss": 1.5651, "step": 93100 }, { "epoch": 0.06931120134457781, "grad_norm": 0.5612863302230835, "learning_rate": 1.9990519701590322e-05, "loss": 1.5584, "step": 93200 }, { "epoch": 0.06938556958636384, "grad_norm": 0.965107262134552, "learning_rate": 1.999049934881938e-05, "loss": 1.497, "step": 93300 }, { "epoch": 0.06945993782814987, "grad_norm": 0.46939852833747864, "learning_rate": 1.9990478974235078e-05, "loss": 1.5716, "step": 93400 }, { "epoch": 0.0695343060699359, "grad_norm": 0.4986964464187622, "learning_rate": 1.999045857783746e-05, "loss": 1.5762, "step": 93500 }, { "epoch": 0.06960867431172192, "grad_norm": 0.4267128109931946, "learning_rate": 1.9990438159626566e-05, "loss": 1.5101, "step": 93600 }, { "epoch": 0.06968304255350795, "grad_norm": 0.411811888217926, "learning_rate": 1.9990417719602445e-05, "loss": 1.5623, "step": 93700 }, { "epoch": 0.06975741079529398, "grad_norm": 0.8761053681373596, "learning_rate": 1.999039725776514e-05, "loss": 1.4294, "step": 93800 }, { "epoch": 0.06983177903708, "grad_norm": 0.9531000852584839, "learning_rate": 1.99903767741147e-05, "loss": 1.4925, "step": 93900 }, { "epoch": 0.06990614727886603, "grad_norm": 0.516830325126648, "learning_rate": 1.999035626865116e-05, "loss": 1.5802, "step": 94000 }, { "epoch": 0.06998051552065206, "grad_norm": 0.47061294317245483, "learning_rate": 1.9990335741374572e-05, "loss": 1.5668, "step": 94100 }, { "epoch": 0.07005488376243808, "grad_norm": 0.7790777683258057, "learning_rate": 1.9990315192284978e-05, "loss": 1.5568, "step": 94200 }, { "epoch": 0.07012925200422411, "grad_norm": 0.75156170129776, "learning_rate": 1.9990294621382426e-05, "loss": 1.5217, "step": 94300 }, { "epoch": 0.07020362024601014, "grad_norm": 1.195028305053711, "learning_rate": 1.999027402866696e-05, "loss": 1.5662, "step": 94400 }, { "epoch": 0.07027798848779618, "grad_norm": 0.6215851306915283, "learning_rate": 1.999025341413862e-05, "loss": 1.5208, "step": 94500 }, { "epoch": 0.0703523567295822, "grad_norm": 0.509843647480011, "learning_rate": 1.9990232777797458e-05, "loss": 1.489, "step": 94600 }, { "epoch": 0.07042672497136823, "grad_norm": 1.2951029539108276, "learning_rate": 1.9990212119643516e-05, "loss": 1.4729, "step": 94700 }, { "epoch": 0.07050109321315426, "grad_norm": 0.5028135776519775, "learning_rate": 1.9990191439676838e-05, "loss": 1.5579, "step": 94800 }, { "epoch": 0.07057546145494029, "grad_norm": 0.7202877998352051, "learning_rate": 1.9990170737897473e-05, "loss": 1.5282, "step": 94900 }, { "epoch": 0.07064982969672631, "grad_norm": 0.9731516242027283, "learning_rate": 1.9990150014305462e-05, "loss": 1.5194, "step": 95000 }, { "epoch": 0.07072419793851234, "grad_norm": 0.7444689273834229, "learning_rate": 1.9990129268900848e-05, "loss": 1.5198, "step": 95100 }, { "epoch": 0.07079856618029837, "grad_norm": 0.9299377202987671, "learning_rate": 1.9990108501683685e-05, "loss": 1.5393, "step": 95200 }, { "epoch": 0.0708729344220844, "grad_norm": 0.6611402630805969, "learning_rate": 1.999008771265401e-05, "loss": 1.5351, "step": 95300 }, { "epoch": 0.07094730266387042, "grad_norm": 0.4772530496120453, "learning_rate": 1.9990066901811876e-05, "loss": 1.5243, "step": 95400 }, { "epoch": 0.07102167090565645, "grad_norm": 0.42998188734054565, "learning_rate": 1.9990046069157322e-05, "loss": 1.5877, "step": 95500 }, { "epoch": 0.07109603914744247, "grad_norm": 0.7415347099304199, "learning_rate": 1.9990025214690396e-05, "loss": 1.5633, "step": 95600 }, { "epoch": 0.0711704073892285, "grad_norm": 0.657112717628479, "learning_rate": 1.999000433841114e-05, "loss": 1.4555, "step": 95700 }, { "epoch": 0.07124477563101453, "grad_norm": 0.9188429713249207, "learning_rate": 1.998998344031961e-05, "loss": 1.4329, "step": 95800 }, { "epoch": 0.07131914387280056, "grad_norm": 0.8823667168617249, "learning_rate": 1.9989962520415836e-05, "loss": 1.4754, "step": 95900 }, { "epoch": 0.07139351211458658, "grad_norm": 0.7276200652122498, "learning_rate": 1.9989941578699878e-05, "loss": 1.5286, "step": 96000 }, { "epoch": 0.07146788035637261, "grad_norm": 0.941512405872345, "learning_rate": 1.998992061517177e-05, "loss": 1.5087, "step": 96100 }, { "epoch": 0.07154224859815864, "grad_norm": 1.0310442447662354, "learning_rate": 1.998989962983157e-05, "loss": 1.5895, "step": 96200 }, { "epoch": 0.07161661683994466, "grad_norm": 1.3620883226394653, "learning_rate": 1.9989878622679317e-05, "loss": 1.474, "step": 96300 }, { "epoch": 0.0716909850817307, "grad_norm": 0.5119801163673401, "learning_rate": 1.998985759371505e-05, "loss": 1.5112, "step": 96400 }, { "epoch": 0.07176535332351673, "grad_norm": 0.8966123461723328, "learning_rate": 1.998983654293883e-05, "loss": 1.4903, "step": 96500 }, { "epoch": 0.07183972156530276, "grad_norm": 0.5336944460868835, "learning_rate": 1.998981547035069e-05, "loss": 1.5673, "step": 96600 }, { "epoch": 0.07191408980708879, "grad_norm": 1.2533961534500122, "learning_rate": 1.9989794375950688e-05, "loss": 1.5039, "step": 96700 }, { "epoch": 0.07198845804887481, "grad_norm": 1.3317081928253174, "learning_rate": 1.9989773259738858e-05, "loss": 1.567, "step": 96800 }, { "epoch": 0.07206282629066084, "grad_norm": 0.49700722098350525, "learning_rate": 1.998975212171525e-05, "loss": 1.542, "step": 96900 }, { "epoch": 0.07213719453244687, "grad_norm": 0.5809246301651001, "learning_rate": 1.9989730961879913e-05, "loss": 1.5097, "step": 97000 }, { "epoch": 0.07221156277423289, "grad_norm": 0.6107625365257263, "learning_rate": 1.9989709780232894e-05, "loss": 1.536, "step": 97100 }, { "epoch": 0.07228593101601892, "grad_norm": 0.5271338820457458, "learning_rate": 1.9989688576774234e-05, "loss": 1.5819, "step": 97200 }, { "epoch": 0.07236029925780495, "grad_norm": 0.6692411303520203, "learning_rate": 1.9989667351503988e-05, "loss": 1.4833, "step": 97300 }, { "epoch": 0.07243466749959097, "grad_norm": 1.0627728700637817, "learning_rate": 1.998964610442219e-05, "loss": 1.5404, "step": 97400 }, { "epoch": 0.072509035741377, "grad_norm": 0.5696298480033875, "learning_rate": 1.9989624835528896e-05, "loss": 1.4491, "step": 97500 }, { "epoch": 0.07258340398316303, "grad_norm": 0.5105301141738892, "learning_rate": 1.998960354482415e-05, "loss": 1.5188, "step": 97600 }, { "epoch": 0.07265777222494905, "grad_norm": 0.53251713514328, "learning_rate": 1.9989582232307998e-05, "loss": 1.5367, "step": 97700 }, { "epoch": 0.07273214046673508, "grad_norm": 0.6559078693389893, "learning_rate": 1.9989560897980485e-05, "loss": 1.4773, "step": 97800 }, { "epoch": 0.07280650870852111, "grad_norm": 0.39833974838256836, "learning_rate": 1.998953954184166e-05, "loss": 1.6063, "step": 97900 }, { "epoch": 0.07288087695030714, "grad_norm": 1.0479645729064941, "learning_rate": 1.9989518163891566e-05, "loss": 1.565, "step": 98000 }, { "epoch": 0.07295524519209316, "grad_norm": 0.7905478477478027, "learning_rate": 1.9989496764130253e-05, "loss": 1.5266, "step": 98100 }, { "epoch": 0.07302961343387919, "grad_norm": 0.4569951295852661, "learning_rate": 1.998947534255777e-05, "loss": 1.5295, "step": 98200 }, { "epoch": 0.07310398167566523, "grad_norm": 0.5308849215507507, "learning_rate": 1.9989453899174158e-05, "loss": 1.5203, "step": 98300 }, { "epoch": 0.07317834991745126, "grad_norm": 0.906802773475647, "learning_rate": 1.998943243397947e-05, "loss": 1.556, "step": 98400 }, { "epoch": 0.07325271815923728, "grad_norm": 0.5071494579315186, "learning_rate": 1.9989410946973747e-05, "loss": 1.5627, "step": 98500 }, { "epoch": 0.07332708640102331, "grad_norm": 0.5252199172973633, "learning_rate": 1.9989389438157037e-05, "loss": 1.5181, "step": 98600 }, { "epoch": 0.07340145464280934, "grad_norm": 0.5738980174064636, "learning_rate": 1.9989367907529394e-05, "loss": 1.6101, "step": 98700 }, { "epoch": 0.07347582288459537, "grad_norm": 0.6898683309555054, "learning_rate": 1.9989346355090853e-05, "loss": 1.579, "step": 98800 }, { "epoch": 0.07355019112638139, "grad_norm": 0.5396860241889954, "learning_rate": 1.998932478084147e-05, "loss": 1.5645, "step": 98900 }, { "epoch": 0.07362455936816742, "grad_norm": 0.5482293367385864, "learning_rate": 1.998930318478129e-05, "loss": 1.5453, "step": 99000 }, { "epoch": 0.07369892760995345, "grad_norm": 0.8394240736961365, "learning_rate": 1.9989281566910363e-05, "loss": 1.5025, "step": 99100 }, { "epoch": 0.07377329585173947, "grad_norm": 0.9409950971603394, "learning_rate": 1.9989259927228725e-05, "loss": 1.5489, "step": 99200 }, { "epoch": 0.0738476640935255, "grad_norm": 0.5597321391105652, "learning_rate": 1.9989238265736437e-05, "loss": 1.5994, "step": 99300 }, { "epoch": 0.07392203233531153, "grad_norm": 0.5139235258102417, "learning_rate": 1.9989216582433538e-05, "loss": 1.5478, "step": 99400 }, { "epoch": 0.07399640057709755, "grad_norm": 0.6312362551689148, "learning_rate": 1.998919487732008e-05, "loss": 1.4989, "step": 99500 }, { "epoch": 0.07407076881888358, "grad_norm": 0.6924223303794861, "learning_rate": 1.9989173150396105e-05, "loss": 1.4491, "step": 99600 }, { "epoch": 0.07414513706066961, "grad_norm": 0.5490585565567017, "learning_rate": 1.9989151401661666e-05, "loss": 1.538, "step": 99700 }, { "epoch": 0.07421950530245564, "grad_norm": 0.630455732345581, "learning_rate": 1.998912963111681e-05, "loss": 1.5286, "step": 99800 }, { "epoch": 0.07429387354424166, "grad_norm": 0.8591504693031311, "learning_rate": 1.998910783876158e-05, "loss": 1.5612, "step": 99900 }, { "epoch": 0.07436824178602769, "grad_norm": 1.0016669034957886, "learning_rate": 1.9989086024596027e-05, "loss": 1.5154, "step": 100000 }, { "epoch": 0.07444261002781372, "grad_norm": 0.6513885259628296, "learning_rate": 1.9989064188620197e-05, "loss": 1.5446, "step": 100100 }, { "epoch": 0.07451697826959976, "grad_norm": 0.6838514804840088, "learning_rate": 1.998904233083414e-05, "loss": 1.5336, "step": 100200 }, { "epoch": 0.07459134651138578, "grad_norm": 0.46571242809295654, "learning_rate": 1.9989020451237903e-05, "loss": 1.4838, "step": 100300 }, { "epoch": 0.07466571475317181, "grad_norm": 0.9936356544494629, "learning_rate": 1.998899854983153e-05, "loss": 1.5929, "step": 100400 }, { "epoch": 0.07474008299495784, "grad_norm": 0.6591018438339233, "learning_rate": 1.9988976626615075e-05, "loss": 1.54, "step": 100500 }, { "epoch": 0.07481445123674386, "grad_norm": 0.8453909754753113, "learning_rate": 1.998895468158858e-05, "loss": 1.5191, "step": 100600 }, { "epoch": 0.07488881947852989, "grad_norm": 0.6555935144424438, "learning_rate": 1.9988932714752095e-05, "loss": 1.5734, "step": 100700 }, { "epoch": 0.07496318772031592, "grad_norm": 0.6445733308792114, "learning_rate": 1.998891072610567e-05, "loss": 1.5516, "step": 100800 }, { "epoch": 0.07503755596210195, "grad_norm": 0.534389078617096, "learning_rate": 1.9988888715649357e-05, "loss": 1.5441, "step": 100900 }, { "epoch": 0.07511192420388797, "grad_norm": 1.068562388420105, "learning_rate": 1.998886668338319e-05, "loss": 1.4998, "step": 101000 }, { "epoch": 0.075186292445674, "grad_norm": 0.6331286430358887, "learning_rate": 1.998884462930723e-05, "loss": 1.5633, "step": 101100 }, { "epoch": 0.07526066068746003, "grad_norm": 1.3566038608551025, "learning_rate": 1.998882255342152e-05, "loss": 1.4621, "step": 101200 }, { "epoch": 0.07533502892924605, "grad_norm": 0.9672004580497742, "learning_rate": 1.998880045572611e-05, "loss": 1.5249, "step": 101300 }, { "epoch": 0.07540939717103208, "grad_norm": 0.36732280254364014, "learning_rate": 1.9988778336221045e-05, "loss": 1.574, "step": 101400 }, { "epoch": 0.07548376541281811, "grad_norm": 0.4788234829902649, "learning_rate": 1.998875619490638e-05, "loss": 1.5418, "step": 101500 }, { "epoch": 0.07555813365460413, "grad_norm": 0.8955681324005127, "learning_rate": 1.9988734031782157e-05, "loss": 1.5568, "step": 101600 }, { "epoch": 0.07563250189639016, "grad_norm": 0.8049163222312927, "learning_rate": 1.9988711846848427e-05, "loss": 1.4838, "step": 101700 }, { "epoch": 0.07570687013817619, "grad_norm": 0.7558008432388306, "learning_rate": 1.9988689640105235e-05, "loss": 1.4955, "step": 101800 }, { "epoch": 0.07578123837996222, "grad_norm": 0.4749026596546173, "learning_rate": 1.9988667411552635e-05, "loss": 1.5929, "step": 101900 }, { "epoch": 0.07585560662174824, "grad_norm": 0.6597522497177124, "learning_rate": 1.998864516119067e-05, "loss": 1.5584, "step": 102000 }, { "epoch": 0.07592997486353428, "grad_norm": 0.7412188053131104, "learning_rate": 1.9988622889019395e-05, "loss": 1.5842, "step": 102100 }, { "epoch": 0.07600434310532031, "grad_norm": 0.5564984679222107, "learning_rate": 1.9988600595038853e-05, "loss": 1.5764, "step": 102200 }, { "epoch": 0.07607871134710634, "grad_norm": 1.0488529205322266, "learning_rate": 1.9988578279249097e-05, "loss": 1.458, "step": 102300 }, { "epoch": 0.07615307958889236, "grad_norm": 1.40269136428833, "learning_rate": 1.998855594165017e-05, "loss": 1.4588, "step": 102400 }, { "epoch": 0.07622744783067839, "grad_norm": 0.8488138318061829, "learning_rate": 1.9988533582242127e-05, "loss": 1.522, "step": 102500 }, { "epoch": 0.07630181607246442, "grad_norm": 0.5191701054573059, "learning_rate": 1.9988511201025015e-05, "loss": 1.5036, "step": 102600 }, { "epoch": 0.07637618431425044, "grad_norm": 0.6648279428482056, "learning_rate": 1.9988488797998878e-05, "loss": 1.4929, "step": 102700 }, { "epoch": 0.07645055255603647, "grad_norm": 1.8600202798843384, "learning_rate": 1.9988466373163774e-05, "loss": 1.5692, "step": 102800 }, { "epoch": 0.0765249207978225, "grad_norm": 0.7583739757537842, "learning_rate": 1.9988443926519743e-05, "loss": 1.5145, "step": 102900 }, { "epoch": 0.07659928903960853, "grad_norm": 0.6128048300743103, "learning_rate": 1.998842145806684e-05, "loss": 1.5729, "step": 103000 }, { "epoch": 0.07667365728139455, "grad_norm": 0.7574602365493774, "learning_rate": 1.998839896780511e-05, "loss": 1.4356, "step": 103100 }, { "epoch": 0.07674802552318058, "grad_norm": 1.4134727716445923, "learning_rate": 1.9988376455734606e-05, "loss": 1.5048, "step": 103200 }, { "epoch": 0.0768223937649666, "grad_norm": 0.7592337727546692, "learning_rate": 1.9988353921855374e-05, "loss": 1.4988, "step": 103300 }, { "epoch": 0.07689676200675263, "grad_norm": 0.522486686706543, "learning_rate": 1.9988331366167465e-05, "loss": 1.5654, "step": 103400 }, { "epoch": 0.07697113024853866, "grad_norm": 0.6535342335700989, "learning_rate": 1.9988308788670925e-05, "loss": 1.4593, "step": 103500 }, { "epoch": 0.07704549849032469, "grad_norm": 0.6663926243782043, "learning_rate": 1.9988286189365808e-05, "loss": 1.477, "step": 103600 }, { "epoch": 0.07711986673211071, "grad_norm": 0.5006215572357178, "learning_rate": 1.998826356825216e-05, "loss": 1.5326, "step": 103700 }, { "epoch": 0.07719423497389674, "grad_norm": 0.6826842427253723, "learning_rate": 1.9988240925330032e-05, "loss": 1.5102, "step": 103800 }, { "epoch": 0.07726860321568277, "grad_norm": 0.2680438756942749, "learning_rate": 1.9988218260599477e-05, "loss": 1.4773, "step": 103900 }, { "epoch": 0.07734297145746881, "grad_norm": 0.9159733057022095, "learning_rate": 1.9988195574060536e-05, "loss": 1.4984, "step": 104000 }, { "epoch": 0.07741733969925484, "grad_norm": 1.0930269956588745, "learning_rate": 1.9988172865713266e-05, "loss": 1.4196, "step": 104100 }, { "epoch": 0.07749170794104086, "grad_norm": 0.6656064391136169, "learning_rate": 1.998815013555771e-05, "loss": 1.5282, "step": 104200 }, { "epoch": 0.07756607618282689, "grad_norm": 0.6679131388664246, "learning_rate": 1.9988127383593923e-05, "loss": 1.4922, "step": 104300 }, { "epoch": 0.07764044442461292, "grad_norm": 0.5231404304504395, "learning_rate": 1.9988104609821953e-05, "loss": 1.4648, "step": 104400 }, { "epoch": 0.07771481266639894, "grad_norm": 0.6543662548065186, "learning_rate": 1.998808181424185e-05, "loss": 1.5349, "step": 104500 }, { "epoch": 0.07778918090818497, "grad_norm": 0.4422987997531891, "learning_rate": 1.9988058996853666e-05, "loss": 1.5031, "step": 104600 }, { "epoch": 0.077863549149971, "grad_norm": 0.74057537317276, "learning_rate": 1.9988036157657444e-05, "loss": 1.5373, "step": 104700 }, { "epoch": 0.07793791739175703, "grad_norm": 0.8893790245056152, "learning_rate": 1.998801329665324e-05, "loss": 1.5177, "step": 104800 }, { "epoch": 0.07801228563354305, "grad_norm": 0.898235559463501, "learning_rate": 1.9987990413841103e-05, "loss": 1.5938, "step": 104900 }, { "epoch": 0.07808665387532908, "grad_norm": 0.566254198551178, "learning_rate": 1.9987967509221082e-05, "loss": 1.4581, "step": 105000 }, { "epoch": 0.0781610221171151, "grad_norm": 0.6054997444152832, "learning_rate": 1.9987944582793226e-05, "loss": 1.5248, "step": 105100 }, { "epoch": 0.07823539035890113, "grad_norm": 0.6898595690727234, "learning_rate": 1.9987921634557588e-05, "loss": 1.5482, "step": 105200 }, { "epoch": 0.07830975860068716, "grad_norm": 0.7741703391075134, "learning_rate": 1.9987898664514213e-05, "loss": 1.5175, "step": 105300 }, { "epoch": 0.07838412684247319, "grad_norm": 0.649459958076477, "learning_rate": 1.9987875672663155e-05, "loss": 1.5702, "step": 105400 }, { "epoch": 0.07845849508425921, "grad_norm": 1.0062605142593384, "learning_rate": 1.9987852659004465e-05, "loss": 1.5077, "step": 105500 }, { "epoch": 0.07853286332604524, "grad_norm": 0.5658386945724487, "learning_rate": 1.9987829623538193e-05, "loss": 1.5682, "step": 105600 }, { "epoch": 0.07860723156783127, "grad_norm": 0.5370775461196899, "learning_rate": 1.9987806566264383e-05, "loss": 1.5635, "step": 105700 }, { "epoch": 0.0786815998096173, "grad_norm": 0.5227326154708862, "learning_rate": 1.9987783487183097e-05, "loss": 1.4999, "step": 105800 } ], "logging_steps": 100, "max_steps": 6723300, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.441297875537322e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }