MohamedAhmedAE's picture
Training in progress, step 105600, checkpoint
512c812 verified
raw
history blame contribute delete
No virus
187 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.07853286332604524,
"eval_steps": 200,
"global_step": 105600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 7.43682417860277e-05,
"grad_norm": 0.4972322881221771,
"learning_rate": 1.9999999990147362e-05,
"loss": 1.9714,
"step": 100
},
{
"epoch": 0.0001487364835720554,
"grad_norm": 0.6138768792152405,
"learning_rate": 1.9999999958487906e-05,
"loss": 1.6983,
"step": 200
},
{
"epoch": 0.0002231047253580831,
"grad_norm": 0.92356276512146,
"learning_rate": 1.999999990499435e-05,
"loss": 1.6566,
"step": 300
},
{
"epoch": 0.0002974729671441108,
"grad_norm": 0.5427595376968384,
"learning_rate": 1.9999999829666684e-05,
"loss": 1.6238,
"step": 400
},
{
"epoch": 0.00037184120893013847,
"grad_norm": 1.5316662788391113,
"learning_rate": 1.9999999732504913e-05,
"loss": 1.6102,
"step": 500
},
{
"epoch": 0.0004462094507161662,
"grad_norm": 0.477271169424057,
"learning_rate": 1.999999961350904e-05,
"loss": 1.5936,
"step": 600
},
{
"epoch": 0.0005205776925021939,
"grad_norm": 1.1669890880584717,
"learning_rate": 1.9999999472679058e-05,
"loss": 1.6411,
"step": 700
},
{
"epoch": 0.0005949459342882216,
"grad_norm": 0.6108381748199463,
"learning_rate": 1.9999999310014972e-05,
"loss": 1.5256,
"step": 800
},
{
"epoch": 0.0006693141760742492,
"grad_norm": 0.6316787004470825,
"learning_rate": 1.9999999125516783e-05,
"loss": 1.6363,
"step": 900
},
{
"epoch": 0.0007436824178602769,
"grad_norm": 0.8376529216766357,
"learning_rate": 1.999999891918449e-05,
"loss": 1.5394,
"step": 1000
},
{
"epoch": 0.0008180506596463047,
"grad_norm": 1.0385313034057617,
"learning_rate": 1.9999998691018094e-05,
"loss": 1.638,
"step": 1100
},
{
"epoch": 0.0008924189014323324,
"grad_norm": 0.8692856431007385,
"learning_rate": 1.9999998441017593e-05,
"loss": 1.7291,
"step": 1200
},
{
"epoch": 0.00096678714321836,
"grad_norm": 1.059537410736084,
"learning_rate": 1.9999998169182993e-05,
"loss": 1.6251,
"step": 1300
},
{
"epoch": 0.0010411553850043877,
"grad_norm": 0.3969714045524597,
"learning_rate": 1.999999787551429e-05,
"loss": 1.5324,
"step": 1400
},
{
"epoch": 0.0011155236267904154,
"grad_norm": 0.6760269403457642,
"learning_rate": 1.9999997560011483e-05,
"loss": 1.5262,
"step": 1500
},
{
"epoch": 0.0011898918685764432,
"grad_norm": 0.6536991596221924,
"learning_rate": 1.9999997222674577e-05,
"loss": 1.6578,
"step": 1600
},
{
"epoch": 0.0012642601103624708,
"grad_norm": 1.2318875789642334,
"learning_rate": 1.999999686350357e-05,
"loss": 1.5478,
"step": 1700
},
{
"epoch": 0.0013386283521484984,
"grad_norm": 1.3172451257705688,
"learning_rate": 1.999999648249847e-05,
"loss": 1.5926,
"step": 1800
},
{
"epoch": 0.0014129965939345263,
"grad_norm": 1.4219484329223633,
"learning_rate": 1.9999996079659265e-05,
"loss": 1.5595,
"step": 1900
},
{
"epoch": 0.0014873648357205539,
"grad_norm": 0.6480392813682556,
"learning_rate": 1.9999995654985968e-05,
"loss": 1.5321,
"step": 2000
},
{
"epoch": 0.0015617330775065815,
"grad_norm": 0.5489968061447144,
"learning_rate": 1.999999520847857e-05,
"loss": 1.5744,
"step": 2100
},
{
"epoch": 0.0016361013192926093,
"grad_norm": 0.7695141434669495,
"learning_rate": 1.999999474013708e-05,
"loss": 1.5263,
"step": 2200
},
{
"epoch": 0.001710469561078637,
"grad_norm": 0.7596250176429749,
"learning_rate": 1.9999994249961495e-05,
"loss": 1.5586,
"step": 2300
},
{
"epoch": 0.0017848378028646648,
"grad_norm": 0.8226674795150757,
"learning_rate": 1.9999993737951816e-05,
"loss": 1.6021,
"step": 2400
},
{
"epoch": 0.0018592060446506924,
"grad_norm": 0.5418084859848022,
"learning_rate": 1.9999993204108044e-05,
"loss": 1.6234,
"step": 2500
},
{
"epoch": 0.00193357428643672,
"grad_norm": 0.5253565907478333,
"learning_rate": 1.9999992648430182e-05,
"loss": 1.5487,
"step": 2600
},
{
"epoch": 0.0020079425282227476,
"grad_norm": 1.0812253952026367,
"learning_rate": 1.999999207091823e-05,
"loss": 1.6161,
"step": 2700
},
{
"epoch": 0.0020823107700087755,
"grad_norm": 0.6357698440551758,
"learning_rate": 1.999999147157219e-05,
"loss": 1.5693,
"step": 2800
},
{
"epoch": 0.0021566790117948033,
"grad_norm": 0.9794847369194031,
"learning_rate": 1.9999990850392064e-05,
"loss": 1.5337,
"step": 2900
},
{
"epoch": 0.0022310472535808307,
"grad_norm": 0.5611212849617004,
"learning_rate": 1.9999990207377848e-05,
"loss": 1.6034,
"step": 3000
},
{
"epoch": 0.0023054154953668585,
"grad_norm": 0.8199095129966736,
"learning_rate": 1.999998954252955e-05,
"loss": 1.5291,
"step": 3100
},
{
"epoch": 0.0023797837371528864,
"grad_norm": 0.6310203075408936,
"learning_rate": 1.999998885584717e-05,
"loss": 1.5337,
"step": 3200
},
{
"epoch": 0.0024541519789389138,
"grad_norm": 0.8682138919830322,
"learning_rate": 1.9999988147330707e-05,
"loss": 1.5383,
"step": 3300
},
{
"epoch": 0.0025285202207249416,
"grad_norm": 0.6630149483680725,
"learning_rate": 1.9999987416980167e-05,
"loss": 1.6387,
"step": 3400
},
{
"epoch": 0.0026028884625109694,
"grad_norm": 0.5285632014274597,
"learning_rate": 1.9999986664795547e-05,
"loss": 1.5507,
"step": 3500
},
{
"epoch": 0.002677256704296997,
"grad_norm": 0.5242965221405029,
"learning_rate": 1.9999985890776846e-05,
"loss": 1.6422,
"step": 3600
},
{
"epoch": 0.0027516249460830247,
"grad_norm": 0.4600646495819092,
"learning_rate": 1.9999985094924076e-05,
"loss": 1.6473,
"step": 3700
},
{
"epoch": 0.0028259931878690525,
"grad_norm": 0.6593307256698608,
"learning_rate": 1.999998427723723e-05,
"loss": 1.5936,
"step": 3800
},
{
"epoch": 0.00290036142965508,
"grad_norm": 0.3825130760669708,
"learning_rate": 1.9999983437716315e-05,
"loss": 1.5509,
"step": 3900
},
{
"epoch": 0.0029747296714411077,
"grad_norm": 0.46043556928634644,
"learning_rate": 1.999998257636133e-05,
"loss": 1.5043,
"step": 4000
},
{
"epoch": 0.0030490979132271356,
"grad_norm": 0.751379132270813,
"learning_rate": 1.999998169317227e-05,
"loss": 1.5258,
"step": 4100
},
{
"epoch": 0.003123466155013163,
"grad_norm": 0.5719695687294006,
"learning_rate": 1.9999980788149155e-05,
"loss": 1.669,
"step": 4200
},
{
"epoch": 0.003197834396799191,
"grad_norm": 0.5489699244499207,
"learning_rate": 1.999997986129197e-05,
"loss": 1.5581,
"step": 4300
},
{
"epoch": 0.0032722026385852187,
"grad_norm": 0.5944995880126953,
"learning_rate": 1.9999978912600722e-05,
"loss": 1.5717,
"step": 4400
},
{
"epoch": 0.003346570880371246,
"grad_norm": 0.4564272165298462,
"learning_rate": 1.9999977942075416e-05,
"loss": 1.5178,
"step": 4500
},
{
"epoch": 0.003420939122157274,
"grad_norm": 1.082127571105957,
"learning_rate": 1.9999976949716057e-05,
"loss": 1.6077,
"step": 4600
},
{
"epoch": 0.0034953073639433017,
"grad_norm": 0.7081079483032227,
"learning_rate": 1.9999975935522635e-05,
"loss": 1.6147,
"step": 4700
},
{
"epoch": 0.0035696756057293296,
"grad_norm": 1.084369421005249,
"learning_rate": 1.9999974899495163e-05,
"loss": 1.5796,
"step": 4800
},
{
"epoch": 0.003644043847515357,
"grad_norm": 0.5583994388580322,
"learning_rate": 1.999997384163364e-05,
"loss": 1.5099,
"step": 4900
},
{
"epoch": 0.003718412089301385,
"grad_norm": 0.563099205493927,
"learning_rate": 1.999997276193807e-05,
"loss": 1.5222,
"step": 5000
},
{
"epoch": 0.0037927803310874126,
"grad_norm": 0.6037421822547913,
"learning_rate": 1.9999971660408454e-05,
"loss": 1.5916,
"step": 5100
},
{
"epoch": 0.00386714857287344,
"grad_norm": 0.5209466218948364,
"learning_rate": 1.9999970537044787e-05,
"loss": 1.6196,
"step": 5200
},
{
"epoch": 0.003941516814659467,
"grad_norm": 1.0418217182159424,
"learning_rate": 1.9999969391847088e-05,
"loss": 1.601,
"step": 5300
},
{
"epoch": 0.004015885056445495,
"grad_norm": 1.235737681388855,
"learning_rate": 1.9999968224815345e-05,
"loss": 1.4994,
"step": 5400
},
{
"epoch": 0.004090253298231523,
"grad_norm": 1.1249513626098633,
"learning_rate": 1.9999967035949567e-05,
"loss": 1.5871,
"step": 5500
},
{
"epoch": 0.004164621540017551,
"grad_norm": 0.8271663784980774,
"learning_rate": 1.9999965825249753e-05,
"loss": 1.5734,
"step": 5600
},
{
"epoch": 0.004238989781803579,
"grad_norm": 0.6501545906066895,
"learning_rate": 1.999996459271591e-05,
"loss": 1.5859,
"step": 5700
},
{
"epoch": 0.004313358023589607,
"grad_norm": 0.6576992273330688,
"learning_rate": 1.9999963338348036e-05,
"loss": 1.5457,
"step": 5800
},
{
"epoch": 0.004387726265375634,
"grad_norm": 0.5684088468551636,
"learning_rate": 1.9999962062146138e-05,
"loss": 1.5518,
"step": 5900
},
{
"epoch": 0.004462094507161661,
"grad_norm": 0.6332255005836487,
"learning_rate": 1.9999960764110216e-05,
"loss": 1.586,
"step": 6000
},
{
"epoch": 0.004536462748947689,
"grad_norm": 0.4564649164676666,
"learning_rate": 1.9999959444240276e-05,
"loss": 1.5249,
"step": 6100
},
{
"epoch": 0.004610830990733717,
"grad_norm": 0.5801929235458374,
"learning_rate": 1.9999958102536316e-05,
"loss": 1.5849,
"step": 6200
},
{
"epoch": 0.004685199232519745,
"grad_norm": 0.8843029737472534,
"learning_rate": 1.9999956738998345e-05,
"loss": 1.5055,
"step": 6300
},
{
"epoch": 0.004759567474305773,
"grad_norm": 0.7232934832572937,
"learning_rate": 1.999995535362636e-05,
"loss": 1.5706,
"step": 6400
},
{
"epoch": 0.004833935716091801,
"grad_norm": 0.7958771586418152,
"learning_rate": 1.9999953946420368e-05,
"loss": 1.5943,
"step": 6500
},
{
"epoch": 0.0049083039578778275,
"grad_norm": 0.7699094414710999,
"learning_rate": 1.999995251738037e-05,
"loss": 1.6173,
"step": 6600
},
{
"epoch": 0.004982672199663855,
"grad_norm": 0.43996062874794006,
"learning_rate": 1.9999951066506368e-05,
"loss": 1.5154,
"step": 6700
},
{
"epoch": 0.005057040441449883,
"grad_norm": 0.773326575756073,
"learning_rate": 1.9999949593798372e-05,
"loss": 1.5791,
"step": 6800
},
{
"epoch": 0.005131408683235911,
"grad_norm": 0.42401251196861267,
"learning_rate": 1.9999948099256374e-05,
"loss": 1.5429,
"step": 6900
},
{
"epoch": 0.005205776925021939,
"grad_norm": 0.44549378752708435,
"learning_rate": 1.999994658288039e-05,
"loss": 1.605,
"step": 7000
},
{
"epoch": 0.005280145166807967,
"grad_norm": 0.5648560523986816,
"learning_rate": 1.999994504467041e-05,
"loss": 1.5536,
"step": 7100
},
{
"epoch": 0.005354513408593994,
"grad_norm": 1.0245320796966553,
"learning_rate": 1.999994348462645e-05,
"loss": 1.5509,
"step": 7200
},
{
"epoch": 0.0054288816503800215,
"grad_norm": 0.9695309996604919,
"learning_rate": 1.9999941902748505e-05,
"loss": 1.5892,
"step": 7300
},
{
"epoch": 0.005503249892166049,
"grad_norm": 0.9779026508331299,
"learning_rate": 1.9999940299036584e-05,
"loss": 1.5901,
"step": 7400
},
{
"epoch": 0.005577618133952077,
"grad_norm": 0.7186980247497559,
"learning_rate": 1.999993867349068e-05,
"loss": 1.5402,
"step": 7500
},
{
"epoch": 0.005651986375738105,
"grad_norm": 0.751449704170227,
"learning_rate": 1.9999937026110813e-05,
"loss": 1.5217,
"step": 7600
},
{
"epoch": 0.005726354617524133,
"grad_norm": 0.7808834314346313,
"learning_rate": 1.999993535689697e-05,
"loss": 1.5254,
"step": 7700
},
{
"epoch": 0.00580072285931016,
"grad_norm": 0.529984176158905,
"learning_rate": 1.999993366584917e-05,
"loss": 1.6134,
"step": 7800
},
{
"epoch": 0.005875091101096188,
"grad_norm": 0.8374336361885071,
"learning_rate": 1.9999931952967404e-05,
"loss": 1.4759,
"step": 7900
},
{
"epoch": 0.0059494593428822155,
"grad_norm": 0.3483956754207611,
"learning_rate": 1.9999930218251683e-05,
"loss": 1.5905,
"step": 8000
},
{
"epoch": 0.006023827584668243,
"grad_norm": 0.8897103667259216,
"learning_rate": 1.9999928461702004e-05,
"loss": 1.6492,
"step": 8100
},
{
"epoch": 0.006098195826454271,
"grad_norm": 0.5743923783302307,
"learning_rate": 1.999992668331838e-05,
"loss": 1.5322,
"step": 8200
},
{
"epoch": 0.006172564068240299,
"grad_norm": 1.3532215356826782,
"learning_rate": 1.999992488310081e-05,
"loss": 1.5155,
"step": 8300
},
{
"epoch": 0.006246932310026326,
"grad_norm": 1.118270754814148,
"learning_rate": 1.9999923061049298e-05,
"loss": 1.517,
"step": 8400
},
{
"epoch": 0.006321300551812354,
"grad_norm": 1.0752383470535278,
"learning_rate": 1.9999921217163847e-05,
"loss": 1.5654,
"step": 8500
},
{
"epoch": 0.006395668793598382,
"grad_norm": 0.4761950671672821,
"learning_rate": 1.999991935144446e-05,
"loss": 1.588,
"step": 8600
},
{
"epoch": 0.0064700370353844095,
"grad_norm": 0.4377930164337158,
"learning_rate": 1.9999917463891147e-05,
"loss": 1.5932,
"step": 8700
},
{
"epoch": 0.006544405277170437,
"grad_norm": 0.5289610624313354,
"learning_rate": 1.9999915554503908e-05,
"loss": 1.5362,
"step": 8800
},
{
"epoch": 0.006618773518956465,
"grad_norm": 0.6469466090202332,
"learning_rate": 1.9999913623282747e-05,
"loss": 1.5515,
"step": 8900
},
{
"epoch": 0.006693141760742492,
"grad_norm": 0.8052897453308105,
"learning_rate": 1.999991167022767e-05,
"loss": 1.4691,
"step": 9000
},
{
"epoch": 0.00676751000252852,
"grad_norm": 0.4677363932132721,
"learning_rate": 1.999990969533868e-05,
"loss": 1.5955,
"step": 9100
},
{
"epoch": 0.006841878244314548,
"grad_norm": 0.9299643039703369,
"learning_rate": 1.9999907698615777e-05,
"loss": 1.5657,
"step": 9200
},
{
"epoch": 0.006916246486100576,
"grad_norm": 0.5175402164459229,
"learning_rate": 1.9999905680058974e-05,
"loss": 1.5471,
"step": 9300
},
{
"epoch": 0.0069906147278866035,
"grad_norm": 0.6280660033226013,
"learning_rate": 1.999990363966827e-05,
"loss": 1.5465,
"step": 9400
},
{
"epoch": 0.007064982969672631,
"grad_norm": 0.5920536518096924,
"learning_rate": 1.999990157744367e-05,
"loss": 1.5587,
"step": 9500
},
{
"epoch": 0.007139351211458659,
"grad_norm": 0.6226286292076111,
"learning_rate": 1.999989949338518e-05,
"loss": 1.5399,
"step": 9600
},
{
"epoch": 0.007213719453244686,
"grad_norm": 0.757337749004364,
"learning_rate": 1.9999897387492803e-05,
"loss": 1.5142,
"step": 9700
},
{
"epoch": 0.007288087695030714,
"grad_norm": 0.5596433281898499,
"learning_rate": 1.9999895259766547e-05,
"loss": 1.4845,
"step": 9800
},
{
"epoch": 0.007362455936816742,
"grad_norm": 0.8564650416374207,
"learning_rate": 1.999989311020641e-05,
"loss": 1.5007,
"step": 9900
},
{
"epoch": 0.00743682417860277,
"grad_norm": 0.7305134534835815,
"learning_rate": 1.99998909388124e-05,
"loss": 1.4579,
"step": 10000
},
{
"epoch": 0.007511192420388797,
"grad_norm": 0.5316299200057983,
"learning_rate": 1.9999888745584525e-05,
"loss": 1.5686,
"step": 10100
},
{
"epoch": 0.007585560662174825,
"grad_norm": 0.8033043742179871,
"learning_rate": 1.9999886530522786e-05,
"loss": 1.5322,
"step": 10200
},
{
"epoch": 0.007659928903960852,
"grad_norm": 0.8600965738296509,
"learning_rate": 1.999988429362719e-05,
"loss": 1.5171,
"step": 10300
},
{
"epoch": 0.00773429714574688,
"grad_norm": 0.7395327091217041,
"learning_rate": 1.9999882034897743e-05,
"loss": 1.5651,
"step": 10400
},
{
"epoch": 0.007808665387532908,
"grad_norm": 0.7305371761322021,
"learning_rate": 1.9999879754334445e-05,
"loss": 1.5098,
"step": 10500
},
{
"epoch": 0.007883033629318935,
"grad_norm": 0.6956737637519836,
"learning_rate": 1.99998774519373e-05,
"loss": 1.6477,
"step": 10600
},
{
"epoch": 0.007957401871104964,
"grad_norm": 0.8382702469825745,
"learning_rate": 1.9999875127706324e-05,
"loss": 1.5233,
"step": 10700
},
{
"epoch": 0.00803177011289099,
"grad_norm": 0.37894684076309204,
"learning_rate": 1.999987278164151e-05,
"loss": 1.5008,
"step": 10800
},
{
"epoch": 0.00810613835467702,
"grad_norm": 0.5010106563568115,
"learning_rate": 1.9999870413742868e-05,
"loss": 1.5424,
"step": 10900
},
{
"epoch": 0.008180506596463046,
"grad_norm": 0.6536372900009155,
"learning_rate": 1.9999868024010403e-05,
"loss": 1.5774,
"step": 11000
},
{
"epoch": 0.008254874838249075,
"grad_norm": 0.43751344084739685,
"learning_rate": 1.9999865612444122e-05,
"loss": 1.5887,
"step": 11100
},
{
"epoch": 0.008329243080035102,
"grad_norm": 0.4979201853275299,
"learning_rate": 1.999986317904403e-05,
"loss": 1.5715,
"step": 11200
},
{
"epoch": 0.008403611321821129,
"grad_norm": 0.513481855392456,
"learning_rate": 1.9999860723810127e-05,
"loss": 1.5182,
"step": 11300
},
{
"epoch": 0.008477979563607158,
"grad_norm": 0.5014403462409973,
"learning_rate": 1.9999858246742425e-05,
"loss": 1.5185,
"step": 11400
},
{
"epoch": 0.008552347805393185,
"grad_norm": 0.5066354274749756,
"learning_rate": 1.9999855747840925e-05,
"loss": 1.5964,
"step": 11500
},
{
"epoch": 0.008626716047179213,
"grad_norm": 0.7306295037269592,
"learning_rate": 1.999985322710564e-05,
"loss": 1.6196,
"step": 11600
},
{
"epoch": 0.00870108428896524,
"grad_norm": 0.3036212623119354,
"learning_rate": 1.9999850684536562e-05,
"loss": 1.5308,
"step": 11700
},
{
"epoch": 0.008775452530751267,
"grad_norm": 0.51576167345047,
"learning_rate": 1.999984812013371e-05,
"loss": 1.5954,
"step": 11800
},
{
"epoch": 0.008849820772537296,
"grad_norm": 0.7507824301719666,
"learning_rate": 1.999984553389708e-05,
"loss": 1.5184,
"step": 11900
},
{
"epoch": 0.008924189014323323,
"grad_norm": 0.43882057070732117,
"learning_rate": 1.999984292582668e-05,
"loss": 1.5507,
"step": 12000
},
{
"epoch": 0.008998557256109352,
"grad_norm": 1.0746114253997803,
"learning_rate": 1.9999840295922518e-05,
"loss": 1.5196,
"step": 12100
},
{
"epoch": 0.009072925497895378,
"grad_norm": 0.6190723180770874,
"learning_rate": 1.99998376441846e-05,
"loss": 1.5683,
"step": 12200
},
{
"epoch": 0.009147293739681407,
"grad_norm": 0.7086498141288757,
"learning_rate": 1.9999834970612934e-05,
"loss": 1.6125,
"step": 12300
},
{
"epoch": 0.009221661981467434,
"grad_norm": 0.9270760416984558,
"learning_rate": 1.999983227520752e-05,
"loss": 1.6108,
"step": 12400
},
{
"epoch": 0.009296030223253461,
"grad_norm": 0.47269493341445923,
"learning_rate": 1.9999829557968365e-05,
"loss": 1.5784,
"step": 12500
},
{
"epoch": 0.00937039846503949,
"grad_norm": 0.888103723526001,
"learning_rate": 1.9999826818895477e-05,
"loss": 1.5406,
"step": 12600
},
{
"epoch": 0.009444766706825517,
"grad_norm": 0.44103074073791504,
"learning_rate": 1.9999824057988865e-05,
"loss": 1.6345,
"step": 12700
},
{
"epoch": 0.009519134948611545,
"grad_norm": 0.8790387511253357,
"learning_rate": 1.999982127524853e-05,
"loss": 1.5069,
"step": 12800
},
{
"epoch": 0.009593503190397572,
"grad_norm": 0.7071767449378967,
"learning_rate": 1.9999818470674474e-05,
"loss": 1.5656,
"step": 12900
},
{
"epoch": 0.009667871432183601,
"grad_norm": 0.36154705286026,
"learning_rate": 1.9999815644266713e-05,
"loss": 1.5022,
"step": 13000
},
{
"epoch": 0.009742239673969628,
"grad_norm": 0.8780633807182312,
"learning_rate": 1.9999812796025247e-05,
"loss": 1.5585,
"step": 13100
},
{
"epoch": 0.009816607915755655,
"grad_norm": 0.45413634181022644,
"learning_rate": 1.9999809925950084e-05,
"loss": 1.5732,
"step": 13200
},
{
"epoch": 0.009890976157541684,
"grad_norm": 0.6584810614585876,
"learning_rate": 1.999980703404123e-05,
"loss": 1.5572,
"step": 13300
},
{
"epoch": 0.00996534439932771,
"grad_norm": 0.4910299479961395,
"learning_rate": 1.9999804120298694e-05,
"loss": 1.5544,
"step": 13400
},
{
"epoch": 0.01003971264111374,
"grad_norm": 0.4675331115722656,
"learning_rate": 1.9999801184722477e-05,
"loss": 1.5055,
"step": 13500
},
{
"epoch": 0.010114080882899766,
"grad_norm": 0.7481106519699097,
"learning_rate": 1.999979822731259e-05,
"loss": 1.5148,
"step": 13600
},
{
"epoch": 0.010188449124685793,
"grad_norm": 0.5710633993148804,
"learning_rate": 1.9999795248069036e-05,
"loss": 1.6359,
"step": 13700
},
{
"epoch": 0.010262817366471822,
"grad_norm": 0.5404725074768066,
"learning_rate": 1.999979224699183e-05,
"loss": 1.5919,
"step": 13800
},
{
"epoch": 0.010337185608257849,
"grad_norm": 0.5491372346878052,
"learning_rate": 1.9999789224080965e-05,
"loss": 1.6065,
"step": 13900
},
{
"epoch": 0.010411553850043878,
"grad_norm": 0.3632746934890747,
"learning_rate": 1.9999786179336454e-05,
"loss": 1.5333,
"step": 14000
},
{
"epoch": 0.010485922091829905,
"grad_norm": 0.43190881609916687,
"learning_rate": 1.9999783112758305e-05,
"loss": 1.5359,
"step": 14100
},
{
"epoch": 0.010560290333615933,
"grad_norm": 0.6655808687210083,
"learning_rate": 1.9999780024346525e-05,
"loss": 1.6012,
"step": 14200
},
{
"epoch": 0.01063465857540196,
"grad_norm": 0.7489643692970276,
"learning_rate": 1.999977691410112e-05,
"loss": 1.5545,
"step": 14300
},
{
"epoch": 0.010709026817187987,
"grad_norm": 0.4741237759590149,
"learning_rate": 1.9999773782022095e-05,
"loss": 1.5303,
"step": 14400
},
{
"epoch": 0.010783395058974016,
"grad_norm": 0.7895578145980835,
"learning_rate": 1.9999770628109458e-05,
"loss": 1.5997,
"step": 14500
},
{
"epoch": 0.010857763300760043,
"grad_norm": 0.6510291695594788,
"learning_rate": 1.9999767452363215e-05,
"loss": 1.483,
"step": 14600
},
{
"epoch": 0.010932131542546072,
"grad_norm": 0.5989207029342651,
"learning_rate": 1.9999764254783376e-05,
"loss": 1.5073,
"step": 14700
},
{
"epoch": 0.011006499784332099,
"grad_norm": 0.5995681881904602,
"learning_rate": 1.9999761035369946e-05,
"loss": 1.5439,
"step": 14800
},
{
"epoch": 0.011080868026118126,
"grad_norm": 0.6359573602676392,
"learning_rate": 1.9999757794122933e-05,
"loss": 1.5821,
"step": 14900
},
{
"epoch": 0.011155236267904154,
"grad_norm": 0.404085636138916,
"learning_rate": 1.9999754531042338e-05,
"loss": 1.556,
"step": 15000
},
{
"epoch": 0.011229604509690181,
"grad_norm": 0.660020112991333,
"learning_rate": 1.9999751246128175e-05,
"loss": 1.5713,
"step": 15100
},
{
"epoch": 0.01130397275147621,
"grad_norm": 0.7031283378601074,
"learning_rate": 1.9999747939380453e-05,
"loss": 1.4647,
"step": 15200
},
{
"epoch": 0.011378340993262237,
"grad_norm": 0.5159358978271484,
"learning_rate": 1.9999744610799173e-05,
"loss": 1.5298,
"step": 15300
},
{
"epoch": 0.011452709235048266,
"grad_norm": 0.5451757907867432,
"learning_rate": 1.9999741260384345e-05,
"loss": 1.6068,
"step": 15400
},
{
"epoch": 0.011527077476834293,
"grad_norm": 0.9550883769989014,
"learning_rate": 1.9999737888135975e-05,
"loss": 1.5665,
"step": 15500
},
{
"epoch": 0.01160144571862032,
"grad_norm": 0.3711983859539032,
"learning_rate": 1.999973449405407e-05,
"loss": 1.5189,
"step": 15600
},
{
"epoch": 0.011675813960406348,
"grad_norm": 1.052902340888977,
"learning_rate": 1.9999731078138643e-05,
"loss": 1.6834,
"step": 15700
},
{
"epoch": 0.011750182202192375,
"grad_norm": 0.6009785532951355,
"learning_rate": 1.9999727640389697e-05,
"loss": 1.5497,
"step": 15800
},
{
"epoch": 0.011824550443978404,
"grad_norm": 0.5357051491737366,
"learning_rate": 1.999972418080724e-05,
"loss": 1.5163,
"step": 15900
},
{
"epoch": 0.011898918685764431,
"grad_norm": 0.5712498426437378,
"learning_rate": 1.9999720699391275e-05,
"loss": 1.5611,
"step": 16000
},
{
"epoch": 0.01197328692755046,
"grad_norm": 0.5744183659553528,
"learning_rate": 1.999971719614182e-05,
"loss": 1.5405,
"step": 16100
},
{
"epoch": 0.012047655169336487,
"grad_norm": 0.42877888679504395,
"learning_rate": 1.9999713671058874e-05,
"loss": 1.5595,
"step": 16200
},
{
"epoch": 0.012122023411122514,
"grad_norm": 0.7209616303443909,
"learning_rate": 1.9999710124142445e-05,
"loss": 1.5457,
"step": 16300
},
{
"epoch": 0.012196391652908542,
"grad_norm": 0.7052120566368103,
"learning_rate": 1.999970655539255e-05,
"loss": 1.5724,
"step": 16400
},
{
"epoch": 0.01227075989469457,
"grad_norm": 0.45960021018981934,
"learning_rate": 1.9999702964809182e-05,
"loss": 1.5479,
"step": 16500
},
{
"epoch": 0.012345128136480598,
"grad_norm": 0.4394296407699585,
"learning_rate": 1.9999699352392362e-05,
"loss": 1.55,
"step": 16600
},
{
"epoch": 0.012419496378266625,
"grad_norm": 1.227424144744873,
"learning_rate": 1.999969571814209e-05,
"loss": 1.5187,
"step": 16700
},
{
"epoch": 0.012493864620052652,
"grad_norm": 0.8249584436416626,
"learning_rate": 1.9999692062058376e-05,
"loss": 1.5677,
"step": 16800
},
{
"epoch": 0.01256823286183868,
"grad_norm": 0.8973199725151062,
"learning_rate": 1.999968838414123e-05,
"loss": 1.5501,
"step": 16900
},
{
"epoch": 0.012642601103624708,
"grad_norm": 0.716529905796051,
"learning_rate": 1.999968468439066e-05,
"loss": 1.6245,
"step": 17000
},
{
"epoch": 0.012716969345410736,
"grad_norm": 0.5941506624221802,
"learning_rate": 1.999968096280667e-05,
"loss": 1.4951,
"step": 17100
},
{
"epoch": 0.012791337587196763,
"grad_norm": 1.8864718675613403,
"learning_rate": 1.999967721938927e-05,
"loss": 1.5397,
"step": 17200
},
{
"epoch": 0.012865705828982792,
"grad_norm": 0.6418184638023376,
"learning_rate": 1.999967345413847e-05,
"loss": 1.4693,
"step": 17300
},
{
"epoch": 0.012940074070768819,
"grad_norm": 0.6764699220657349,
"learning_rate": 1.999966966705428e-05,
"loss": 1.4635,
"step": 17400
},
{
"epoch": 0.013014442312554846,
"grad_norm": 0.7185351848602295,
"learning_rate": 1.9999665858136704e-05,
"loss": 1.5252,
"step": 17500
},
{
"epoch": 0.013088810554340875,
"grad_norm": 0.42110446095466614,
"learning_rate": 1.9999662027385748e-05,
"loss": 1.5908,
"step": 17600
},
{
"epoch": 0.013163178796126902,
"grad_norm": 0.6807708144187927,
"learning_rate": 1.999965817480143e-05,
"loss": 1.6466,
"step": 17700
},
{
"epoch": 0.01323754703791293,
"grad_norm": 0.5771286487579346,
"learning_rate": 1.999965430038375e-05,
"loss": 1.5361,
"step": 17800
},
{
"epoch": 0.013311915279698957,
"grad_norm": 0.5322648882865906,
"learning_rate": 1.9999650404132715e-05,
"loss": 1.5638,
"step": 17900
},
{
"epoch": 0.013386283521484984,
"grad_norm": 0.865608274936676,
"learning_rate": 1.9999646486048342e-05,
"loss": 1.4568,
"step": 18000
},
{
"epoch": 0.013460651763271013,
"grad_norm": 0.7592107057571411,
"learning_rate": 1.9999642546130634e-05,
"loss": 1.5669,
"step": 18100
},
{
"epoch": 0.01353502000505704,
"grad_norm": 0.673466145992279,
"learning_rate": 1.9999638584379602e-05,
"loss": 1.5141,
"step": 18200
},
{
"epoch": 0.013609388246843069,
"grad_norm": 0.574698269367218,
"learning_rate": 1.9999634600795252e-05,
"loss": 1.5703,
"step": 18300
},
{
"epoch": 0.013683756488629096,
"grad_norm": 0.6722753643989563,
"learning_rate": 1.9999630595377595e-05,
"loss": 1.5843,
"step": 18400
},
{
"epoch": 0.013758124730415124,
"grad_norm": 0.9738336801528931,
"learning_rate": 1.9999626568126636e-05,
"loss": 1.4878,
"step": 18500
},
{
"epoch": 0.013832492972201151,
"grad_norm": 0.5274741649627686,
"learning_rate": 1.999962251904239e-05,
"loss": 1.5254,
"step": 18600
},
{
"epoch": 0.013906861213987178,
"grad_norm": 1.725870966911316,
"learning_rate": 1.999961844812486e-05,
"loss": 1.4785,
"step": 18700
},
{
"epoch": 0.013981229455773207,
"grad_norm": 0.6889399886131287,
"learning_rate": 1.9999614355374058e-05,
"loss": 1.5437,
"step": 18800
},
{
"epoch": 0.014055597697559234,
"grad_norm": 0.576836884021759,
"learning_rate": 1.9999610240789994e-05,
"loss": 1.4949,
"step": 18900
},
{
"epoch": 0.014129965939345263,
"grad_norm": 0.3870568871498108,
"learning_rate": 1.9999606104372674e-05,
"loss": 1.5376,
"step": 19000
},
{
"epoch": 0.01420433418113129,
"grad_norm": 1.1045247316360474,
"learning_rate": 1.9999601946122107e-05,
"loss": 1.6825,
"step": 19100
},
{
"epoch": 0.014278702422917318,
"grad_norm": 0.49821707606315613,
"learning_rate": 1.9999597766038304e-05,
"loss": 1.4909,
"step": 19200
},
{
"epoch": 0.014353070664703345,
"grad_norm": 0.4011678695678711,
"learning_rate": 1.9999593564121275e-05,
"loss": 1.4673,
"step": 19300
},
{
"epoch": 0.014427438906489372,
"grad_norm": 0.46667736768722534,
"learning_rate": 1.9999589340371026e-05,
"loss": 1.5537,
"step": 19400
},
{
"epoch": 0.014501807148275401,
"grad_norm": 0.4063940942287445,
"learning_rate": 1.9999585094787567e-05,
"loss": 1.4736,
"step": 19500
},
{
"epoch": 0.014576175390061428,
"grad_norm": 0.5824026465415955,
"learning_rate": 1.9999580827370906e-05,
"loss": 1.6191,
"step": 19600
},
{
"epoch": 0.014650543631847457,
"grad_norm": 0.5595284104347229,
"learning_rate": 1.999957653812106e-05,
"loss": 1.5294,
"step": 19700
},
{
"epoch": 0.014724911873633484,
"grad_norm": 0.6950704455375671,
"learning_rate": 1.9999572227038028e-05,
"loss": 1.5015,
"step": 19800
},
{
"epoch": 0.01479928011541951,
"grad_norm": 0.4345974028110504,
"learning_rate": 1.999956789412183e-05,
"loss": 1.4955,
"step": 19900
},
{
"epoch": 0.01487364835720554,
"grad_norm": 0.475046306848526,
"learning_rate": 1.9999563539372464e-05,
"loss": 1.5663,
"step": 20000
},
{
"epoch": 0.014948016598991566,
"grad_norm": 0.3211815357208252,
"learning_rate": 1.9999559162789946e-05,
"loss": 1.5379,
"step": 20100
},
{
"epoch": 0.015022384840777595,
"grad_norm": 0.7868314981460571,
"learning_rate": 1.9999554764374287e-05,
"loss": 1.542,
"step": 20200
},
{
"epoch": 0.015096753082563622,
"grad_norm": 0.3961299955844879,
"learning_rate": 1.9999550344125492e-05,
"loss": 1.4359,
"step": 20300
},
{
"epoch": 0.01517112132434965,
"grad_norm": 0.7971549034118652,
"learning_rate": 1.9999545902043577e-05,
"loss": 1.5363,
"step": 20400
},
{
"epoch": 0.015245489566135677,
"grad_norm": 1.1090092658996582,
"learning_rate": 1.9999541438128543e-05,
"loss": 1.5565,
"step": 20500
},
{
"epoch": 0.015319857807921704,
"grad_norm": 1.0558898448944092,
"learning_rate": 1.9999536952380406e-05,
"loss": 1.5504,
"step": 20600
},
{
"epoch": 0.015394226049707733,
"grad_norm": 0.5869760513305664,
"learning_rate": 1.9999532444799174e-05,
"loss": 1.5023,
"step": 20700
},
{
"epoch": 0.01546859429149376,
"grad_norm": 0.5132299065589905,
"learning_rate": 1.9999527915384858e-05,
"loss": 1.472,
"step": 20800
},
{
"epoch": 0.015542962533279789,
"grad_norm": 0.8405370116233826,
"learning_rate": 1.999952336413747e-05,
"loss": 1.479,
"step": 20900
},
{
"epoch": 0.015617330775065816,
"grad_norm": 1.0692424774169922,
"learning_rate": 1.9999518791057012e-05,
"loss": 1.5734,
"step": 21000
},
{
"epoch": 0.015691699016851843,
"grad_norm": 0.39929547905921936,
"learning_rate": 1.99995141961435e-05,
"loss": 1.5499,
"step": 21100
},
{
"epoch": 0.01576606725863787,
"grad_norm": 0.5001465082168579,
"learning_rate": 1.999950957939694e-05,
"loss": 1.5728,
"step": 21200
},
{
"epoch": 0.0158404355004239,
"grad_norm": 0.4564245045185089,
"learning_rate": 1.999950494081735e-05,
"loss": 1.4685,
"step": 21300
},
{
"epoch": 0.015914803742209927,
"grad_norm": 0.945813775062561,
"learning_rate": 1.999950028040473e-05,
"loss": 1.5309,
"step": 21400
},
{
"epoch": 0.015989171983995954,
"grad_norm": 0.5529621839523315,
"learning_rate": 1.9999495598159102e-05,
"loss": 1.5244,
"step": 21500
},
{
"epoch": 0.01606354022578198,
"grad_norm": 0.7338210940361023,
"learning_rate": 1.9999490894080467e-05,
"loss": 1.6339,
"step": 21600
},
{
"epoch": 0.016137908467568008,
"grad_norm": 1.0055419206619263,
"learning_rate": 1.999948616816884e-05,
"loss": 1.613,
"step": 21700
},
{
"epoch": 0.01621227670935404,
"grad_norm": 0.5460941195487976,
"learning_rate": 1.9999481420424223e-05,
"loss": 1.5819,
"step": 21800
},
{
"epoch": 0.016286644951140065,
"grad_norm": 1.005537509918213,
"learning_rate": 1.9999476650846637e-05,
"loss": 1.5636,
"step": 21900
},
{
"epoch": 0.016361013192926092,
"grad_norm": 0.8599165678024292,
"learning_rate": 1.9999471859436082e-05,
"loss": 1.4977,
"step": 22000
},
{
"epoch": 0.01643538143471212,
"grad_norm": 0.41388291120529175,
"learning_rate": 1.9999467046192583e-05,
"loss": 1.4243,
"step": 22100
},
{
"epoch": 0.01650974967649815,
"grad_norm": 0.4443175494670868,
"learning_rate": 1.9999462211116135e-05,
"loss": 1.5419,
"step": 22200
},
{
"epoch": 0.016584117918284177,
"grad_norm": 0.9959002733230591,
"learning_rate": 1.999945735420676e-05,
"loss": 1.588,
"step": 22300
},
{
"epoch": 0.016658486160070204,
"grad_norm": 0.7721849679946899,
"learning_rate": 1.999945247546446e-05,
"loss": 1.4806,
"step": 22400
},
{
"epoch": 0.01673285440185623,
"grad_norm": 0.5781850814819336,
"learning_rate": 1.9999447574889253e-05,
"loss": 1.5864,
"step": 22500
},
{
"epoch": 0.016807222643642258,
"grad_norm": 0.6155378222465515,
"learning_rate": 1.9999442652481143e-05,
"loss": 1.6002,
"step": 22600
},
{
"epoch": 0.016881590885428288,
"grad_norm": 0.8101166486740112,
"learning_rate": 1.9999437708240146e-05,
"loss": 1.5385,
"step": 22700
},
{
"epoch": 0.016955959127214315,
"grad_norm": 0.8044368624687195,
"learning_rate": 1.999943274216627e-05,
"loss": 1.4563,
"step": 22800
},
{
"epoch": 0.017030327369000342,
"grad_norm": 0.3784123361110687,
"learning_rate": 1.9999427754259527e-05,
"loss": 1.5844,
"step": 22900
},
{
"epoch": 0.01710469561078637,
"grad_norm": 0.8152732253074646,
"learning_rate": 1.9999422744519928e-05,
"loss": 1.53,
"step": 23000
},
{
"epoch": 0.017179063852572396,
"grad_norm": 0.8851474523544312,
"learning_rate": 1.9999417712947486e-05,
"loss": 1.5828,
"step": 23100
},
{
"epoch": 0.017253432094358426,
"grad_norm": 0.8275689482688904,
"learning_rate": 1.9999412659542208e-05,
"loss": 1.5057,
"step": 23200
},
{
"epoch": 0.017327800336144453,
"grad_norm": 0.5356424450874329,
"learning_rate": 1.9999407584304106e-05,
"loss": 1.5621,
"step": 23300
},
{
"epoch": 0.01740216857793048,
"grad_norm": 0.35889101028442383,
"learning_rate": 1.999940248723319e-05,
"loss": 1.5884,
"step": 23400
},
{
"epoch": 0.017476536819716507,
"grad_norm": 0.5190862417221069,
"learning_rate": 1.9999397368329477e-05,
"loss": 1.6021,
"step": 23500
},
{
"epoch": 0.017550905061502534,
"grad_norm": 0.5140055418014526,
"learning_rate": 1.9999392227592967e-05,
"loss": 1.5474,
"step": 23600
},
{
"epoch": 0.017625273303288565,
"grad_norm": 0.607276201248169,
"learning_rate": 1.9999387065023685e-05,
"loss": 1.5002,
"step": 23700
},
{
"epoch": 0.01769964154507459,
"grad_norm": 0.7513449192047119,
"learning_rate": 1.9999381880621634e-05,
"loss": 1.5098,
"step": 23800
},
{
"epoch": 0.01777400978686062,
"grad_norm": 0.7328070402145386,
"learning_rate": 1.9999376674386824e-05,
"loss": 1.5768,
"step": 23900
},
{
"epoch": 0.017848378028646646,
"grad_norm": 0.817368745803833,
"learning_rate": 1.9999371446319272e-05,
"loss": 1.5178,
"step": 24000
},
{
"epoch": 0.017922746270432676,
"grad_norm": 0.844530463218689,
"learning_rate": 1.999936619641899e-05,
"loss": 1.5331,
"step": 24100
},
{
"epoch": 0.017997114512218703,
"grad_norm": 0.8772881627082825,
"learning_rate": 1.9999360924685978e-05,
"loss": 1.5564,
"step": 24200
},
{
"epoch": 0.01807148275400473,
"grad_norm": 0.37944692373275757,
"learning_rate": 1.999935563112026e-05,
"loss": 1.4823,
"step": 24300
},
{
"epoch": 0.018145850995790757,
"grad_norm": 0.34085753560066223,
"learning_rate": 1.999935031572184e-05,
"loss": 1.4741,
"step": 24400
},
{
"epoch": 0.018220219237576784,
"grad_norm": 0.8616833686828613,
"learning_rate": 1.9999344978490737e-05,
"loss": 1.4642,
"step": 24500
},
{
"epoch": 0.018294587479362814,
"grad_norm": 0.9431029558181763,
"learning_rate": 1.9999339619426958e-05,
"loss": 1.5507,
"step": 24600
},
{
"epoch": 0.01836895572114884,
"grad_norm": 0.5803475975990295,
"learning_rate": 1.9999334238530512e-05,
"loss": 1.5617,
"step": 24700
},
{
"epoch": 0.01844332396293487,
"grad_norm": 0.7339209318161011,
"learning_rate": 1.9999328835801416e-05,
"loss": 1.4881,
"step": 24800
},
{
"epoch": 0.018517692204720895,
"grad_norm": 0.7969409823417664,
"learning_rate": 1.9999323411239676e-05,
"loss": 1.5438,
"step": 24900
},
{
"epoch": 0.018592060446506922,
"grad_norm": 0.6049161553382874,
"learning_rate": 1.9999317964845313e-05,
"loss": 1.5352,
"step": 25000
},
{
"epoch": 0.018666428688292953,
"grad_norm": 0.625723659992218,
"learning_rate": 1.999931249661833e-05,
"loss": 1.4817,
"step": 25100
},
{
"epoch": 0.01874079693007898,
"grad_norm": 0.8167730569839478,
"learning_rate": 1.9999307006558745e-05,
"loss": 1.4863,
"step": 25200
},
{
"epoch": 0.018815165171865007,
"grad_norm": 0.41490304470062256,
"learning_rate": 1.9999301494666566e-05,
"loss": 1.4653,
"step": 25300
},
{
"epoch": 0.018889533413651034,
"grad_norm": 0.7005138397216797,
"learning_rate": 1.9999295960941802e-05,
"loss": 1.5417,
"step": 25400
},
{
"epoch": 0.01896390165543706,
"grad_norm": 0.4145418405532837,
"learning_rate": 1.9999290405384476e-05,
"loss": 1.5818,
"step": 25500
},
{
"epoch": 0.01903826989722309,
"grad_norm": 0.9620917439460754,
"learning_rate": 1.999928482799459e-05,
"loss": 1.5717,
"step": 25600
},
{
"epoch": 0.019112638139009118,
"grad_norm": 0.518038272857666,
"learning_rate": 1.999927922877216e-05,
"loss": 1.4603,
"step": 25700
},
{
"epoch": 0.019187006380795145,
"grad_norm": 1.0701864957809448,
"learning_rate": 1.9999273607717198e-05,
"loss": 1.5095,
"step": 25800
},
{
"epoch": 0.019261374622581172,
"grad_norm": 1.2206807136535645,
"learning_rate": 1.9999267964829717e-05,
"loss": 1.5099,
"step": 25900
},
{
"epoch": 0.019335742864367202,
"grad_norm": 0.4838850796222687,
"learning_rate": 1.999926230010973e-05,
"loss": 1.5856,
"step": 26000
},
{
"epoch": 0.01941011110615323,
"grad_norm": 0.3916015625,
"learning_rate": 1.9999256613557243e-05,
"loss": 1.5198,
"step": 26100
},
{
"epoch": 0.019484479347939256,
"grad_norm": 0.4921341836452484,
"learning_rate": 1.9999250905172276e-05,
"loss": 1.5517,
"step": 26200
},
{
"epoch": 0.019558847589725283,
"grad_norm": 0.4124142527580261,
"learning_rate": 1.999924517495484e-05,
"loss": 1.6267,
"step": 26300
},
{
"epoch": 0.01963321583151131,
"grad_norm": 0.6755162477493286,
"learning_rate": 1.9999239422904946e-05,
"loss": 1.5408,
"step": 26400
},
{
"epoch": 0.01970758407329734,
"grad_norm": 0.8833709359169006,
"learning_rate": 1.9999233649022604e-05,
"loss": 1.5334,
"step": 26500
},
{
"epoch": 0.019781952315083368,
"grad_norm": 0.5344982147216797,
"learning_rate": 1.9999227853307832e-05,
"loss": 1.5532,
"step": 26600
},
{
"epoch": 0.019856320556869395,
"grad_norm": 0.5524909496307373,
"learning_rate": 1.999922203576064e-05,
"loss": 1.5672,
"step": 26700
},
{
"epoch": 0.01993068879865542,
"grad_norm": 0.6802098751068115,
"learning_rate": 1.999921619638104e-05,
"loss": 1.57,
"step": 26800
},
{
"epoch": 0.02000505704044145,
"grad_norm": 0.6773833632469177,
"learning_rate": 1.9999210335169047e-05,
"loss": 1.562,
"step": 26900
},
{
"epoch": 0.02007942528222748,
"grad_norm": 0.5428286194801331,
"learning_rate": 1.999920445212467e-05,
"loss": 1.5683,
"step": 27000
},
{
"epoch": 0.020153793524013506,
"grad_norm": 0.5180791020393372,
"learning_rate": 1.9999198547247927e-05,
"loss": 1.6216,
"step": 27100
},
{
"epoch": 0.020228161765799533,
"grad_norm": 0.6695342659950256,
"learning_rate": 1.9999192620538825e-05,
"loss": 1.4601,
"step": 27200
},
{
"epoch": 0.02030253000758556,
"grad_norm": 1.2745898962020874,
"learning_rate": 1.999918667199738e-05,
"loss": 1.5002,
"step": 27300
},
{
"epoch": 0.020376898249371587,
"grad_norm": 0.5011482834815979,
"learning_rate": 1.999918070162361e-05,
"loss": 1.5048,
"step": 27400
},
{
"epoch": 0.020451266491157617,
"grad_norm": 0.4430767297744751,
"learning_rate": 1.999917470941752e-05,
"loss": 1.5609,
"step": 27500
},
{
"epoch": 0.020525634732943644,
"grad_norm": 0.540259838104248,
"learning_rate": 1.9999168695379124e-05,
"loss": 1.5115,
"step": 27600
},
{
"epoch": 0.02060000297472967,
"grad_norm": 0.4208228886127472,
"learning_rate": 1.999916265950844e-05,
"loss": 1.5318,
"step": 27700
},
{
"epoch": 0.020674371216515698,
"grad_norm": 0.5492777824401855,
"learning_rate": 1.9999156601805477e-05,
"loss": 1.5001,
"step": 27800
},
{
"epoch": 0.020748739458301725,
"grad_norm": 0.8193747997283936,
"learning_rate": 1.999915052227025e-05,
"loss": 1.5795,
"step": 27900
},
{
"epoch": 0.020823107700087756,
"grad_norm": 0.6221509575843811,
"learning_rate": 1.999914442090277e-05,
"loss": 1.5414,
"step": 28000
},
{
"epoch": 0.020897475941873783,
"grad_norm": 0.8481204509735107,
"learning_rate": 1.9999138297703055e-05,
"loss": 1.5481,
"step": 28100
},
{
"epoch": 0.02097184418365981,
"grad_norm": 0.8506454229354858,
"learning_rate": 1.9999132152671116e-05,
"loss": 1.536,
"step": 28200
},
{
"epoch": 0.021046212425445836,
"grad_norm": 0.6849836111068726,
"learning_rate": 1.9999125985806964e-05,
"loss": 1.5236,
"step": 28300
},
{
"epoch": 0.021120580667231867,
"grad_norm": 0.6328344345092773,
"learning_rate": 1.999911979711062e-05,
"loss": 1.4921,
"step": 28400
},
{
"epoch": 0.021194948909017894,
"grad_norm": 0.44376102089881897,
"learning_rate": 1.9999113586582085e-05,
"loss": 1.5039,
"step": 28500
},
{
"epoch": 0.02126931715080392,
"grad_norm": 0.6041997075080872,
"learning_rate": 1.9999107354221385e-05,
"loss": 1.5522,
"step": 28600
},
{
"epoch": 0.021343685392589948,
"grad_norm": 0.5901020169258118,
"learning_rate": 1.9999101100028522e-05,
"loss": 1.5321,
"step": 28700
},
{
"epoch": 0.021418053634375975,
"grad_norm": 1.058334231376648,
"learning_rate": 1.999909482400352e-05,
"loss": 1.5725,
"step": 28800
},
{
"epoch": 0.021492421876162005,
"grad_norm": 0.9694022536277771,
"learning_rate": 1.9999088526146387e-05,
"loss": 1.545,
"step": 28900
},
{
"epoch": 0.021566790117948032,
"grad_norm": 0.6166462898254395,
"learning_rate": 1.999908220645714e-05,
"loss": 1.481,
"step": 29000
},
{
"epoch": 0.02164115835973406,
"grad_norm": 0.4764333963394165,
"learning_rate": 1.999907586493579e-05,
"loss": 1.5559,
"step": 29100
},
{
"epoch": 0.021715526601520086,
"grad_norm": 0.5028481483459473,
"learning_rate": 1.9999069501582352e-05,
"loss": 1.5451,
"step": 29200
},
{
"epoch": 0.021789894843306113,
"grad_norm": 0.7064079642295837,
"learning_rate": 1.9999063116396844e-05,
"loss": 1.5065,
"step": 29300
},
{
"epoch": 0.021864263085092144,
"grad_norm": 0.8854705691337585,
"learning_rate": 1.9999056709379268e-05,
"loss": 1.5331,
"step": 29400
},
{
"epoch": 0.02193863132687817,
"grad_norm": 1.1931555271148682,
"learning_rate": 1.999905028052965e-05,
"loss": 1.5629,
"step": 29500
},
{
"epoch": 0.022012999568664197,
"grad_norm": 0.4196559190750122,
"learning_rate": 1.9999043829848e-05,
"loss": 1.5969,
"step": 29600
},
{
"epoch": 0.022087367810450224,
"grad_norm": 0.661222517490387,
"learning_rate": 1.999903735733433e-05,
"loss": 1.4522,
"step": 29700
},
{
"epoch": 0.02216173605223625,
"grad_norm": 1.0771206617355347,
"learning_rate": 1.9999030862988658e-05,
"loss": 1.6346,
"step": 29800
},
{
"epoch": 0.022236104294022282,
"grad_norm": 0.4439813196659088,
"learning_rate": 1.9999024346810995e-05,
"loss": 1.4533,
"step": 29900
},
{
"epoch": 0.02231047253580831,
"grad_norm": 0.3492225706577301,
"learning_rate": 1.999901780880136e-05,
"loss": 1.4831,
"step": 30000
},
{
"epoch": 0.022384840777594336,
"grad_norm": 0.6220123171806335,
"learning_rate": 1.9999011248959757e-05,
"loss": 1.5165,
"step": 30100
},
{
"epoch": 0.022459209019380363,
"grad_norm": 0.467629998922348,
"learning_rate": 1.9999004667286214e-05,
"loss": 1.5315,
"step": 30200
},
{
"epoch": 0.022533577261166393,
"grad_norm": 0.6271420121192932,
"learning_rate": 1.9998998063780735e-05,
"loss": 1.5861,
"step": 30300
},
{
"epoch": 0.02260794550295242,
"grad_norm": 0.404694139957428,
"learning_rate": 1.9998991438443337e-05,
"loss": 1.455,
"step": 30400
},
{
"epoch": 0.022682313744738447,
"grad_norm": 0.6882662177085876,
"learning_rate": 1.9998984791274038e-05,
"loss": 1.5077,
"step": 30500
},
{
"epoch": 0.022756681986524474,
"grad_norm": 0.4796554744243622,
"learning_rate": 1.9998978122272844e-05,
"loss": 1.4934,
"step": 30600
},
{
"epoch": 0.0228310502283105,
"grad_norm": 0.7510641813278198,
"learning_rate": 1.9998971431439783e-05,
"loss": 1.5009,
"step": 30700
},
{
"epoch": 0.02290541847009653,
"grad_norm": 0.5859106779098511,
"learning_rate": 1.9998964718774857e-05,
"loss": 1.529,
"step": 30800
},
{
"epoch": 0.02297978671188256,
"grad_norm": 0.5376309156417847,
"learning_rate": 1.999895798427809e-05,
"loss": 1.5485,
"step": 30900
},
{
"epoch": 0.023054154953668585,
"grad_norm": 0.18692457675933838,
"learning_rate": 1.9998951227949487e-05,
"loss": 1.4897,
"step": 31000
},
{
"epoch": 0.023128523195454612,
"grad_norm": 0.4838975667953491,
"learning_rate": 1.999894444978907e-05,
"loss": 1.5244,
"step": 31100
},
{
"epoch": 0.02320289143724064,
"grad_norm": 0.8973916172981262,
"learning_rate": 1.9998937649796854e-05,
"loss": 1.554,
"step": 31200
},
{
"epoch": 0.02327725967902667,
"grad_norm": 0.5681875944137573,
"learning_rate": 1.999893082797285e-05,
"loss": 1.5503,
"step": 31300
},
{
"epoch": 0.023351627920812697,
"grad_norm": 0.469427227973938,
"learning_rate": 1.9998923984317075e-05,
"loss": 1.4381,
"step": 31400
},
{
"epoch": 0.023425996162598724,
"grad_norm": 0.43099313974380493,
"learning_rate": 1.9998917118829543e-05,
"loss": 1.5112,
"step": 31500
},
{
"epoch": 0.02350036440438475,
"grad_norm": 0.8290247917175293,
"learning_rate": 1.999891023151027e-05,
"loss": 1.467,
"step": 31600
},
{
"epoch": 0.023574732646170778,
"grad_norm": 0.7134198546409607,
"learning_rate": 1.999890332235927e-05,
"loss": 1.5312,
"step": 31700
},
{
"epoch": 0.023649100887956808,
"grad_norm": 0.4312078356742859,
"learning_rate": 1.999889639137656e-05,
"loss": 1.5417,
"step": 31800
},
{
"epoch": 0.023723469129742835,
"grad_norm": 0.5288392305374146,
"learning_rate": 1.9998889438562153e-05,
"loss": 1.5432,
"step": 31900
},
{
"epoch": 0.023797837371528862,
"grad_norm": 0.5819665789604187,
"learning_rate": 1.9998882463916062e-05,
"loss": 1.5703,
"step": 32000
},
{
"epoch": 0.02387220561331489,
"grad_norm": 0.6748378276824951,
"learning_rate": 1.999887546743831e-05,
"loss": 1.5674,
"step": 32100
},
{
"epoch": 0.02394657385510092,
"grad_norm": 0.5860730409622192,
"learning_rate": 1.9998868449128905e-05,
"loss": 1.5775,
"step": 32200
},
{
"epoch": 0.024020942096886946,
"grad_norm": 1.1641826629638672,
"learning_rate": 1.9998861408987866e-05,
"loss": 1.6354,
"step": 32300
},
{
"epoch": 0.024095310338672973,
"grad_norm": 0.6446713209152222,
"learning_rate": 1.9998854347015206e-05,
"loss": 1.5508,
"step": 32400
},
{
"epoch": 0.024169678580459,
"grad_norm": 0.8211930990219116,
"learning_rate": 1.9998847263210942e-05,
"loss": 1.4797,
"step": 32500
},
{
"epoch": 0.024244046822245027,
"grad_norm": 0.9733643531799316,
"learning_rate": 1.9998840157575093e-05,
"loss": 1.5375,
"step": 32600
},
{
"epoch": 0.024318415064031058,
"grad_norm": 0.7882494330406189,
"learning_rate": 1.9998833030107663e-05,
"loss": 1.5167,
"step": 32700
},
{
"epoch": 0.024392783305817085,
"grad_norm": 0.8701611757278442,
"learning_rate": 1.999882588080868e-05,
"loss": 1.578,
"step": 32800
},
{
"epoch": 0.02446715154760311,
"grad_norm": 0.5390304923057556,
"learning_rate": 1.9998818709678157e-05,
"loss": 1.4868,
"step": 32900
},
{
"epoch": 0.02454151978938914,
"grad_norm": 0.8778117299079895,
"learning_rate": 1.9998811516716104e-05,
"loss": 1.4611,
"step": 33000
},
{
"epoch": 0.024615888031175166,
"grad_norm": 1.267151951789856,
"learning_rate": 1.999880430192254e-05,
"loss": 1.4868,
"step": 33100
},
{
"epoch": 0.024690256272961196,
"grad_norm": 0.6846994161605835,
"learning_rate": 1.9998797065297483e-05,
"loss": 1.5047,
"step": 33200
},
{
"epoch": 0.024764624514747223,
"grad_norm": 0.6609792113304138,
"learning_rate": 1.9998789806840945e-05,
"loss": 1.5189,
"step": 33300
},
{
"epoch": 0.02483899275653325,
"grad_norm": 0.5603302717208862,
"learning_rate": 1.9998782526552946e-05,
"loss": 1.5095,
"step": 33400
},
{
"epoch": 0.024913360998319277,
"grad_norm": 0.7241900563240051,
"learning_rate": 1.9998775224433493e-05,
"loss": 1.5106,
"step": 33500
},
{
"epoch": 0.024987729240105304,
"grad_norm": 1.149263620376587,
"learning_rate": 1.9998767900482616e-05,
"loss": 1.5778,
"step": 33600
},
{
"epoch": 0.025062097481891334,
"grad_norm": 0.6764651536941528,
"learning_rate": 1.9998760554700318e-05,
"loss": 1.4944,
"step": 33700
},
{
"epoch": 0.02513646572367736,
"grad_norm": 0.6464880704879761,
"learning_rate": 1.999875318708662e-05,
"loss": 1.5719,
"step": 33800
},
{
"epoch": 0.025210833965463388,
"grad_norm": 0.6596807241439819,
"learning_rate": 1.9998745797641543e-05,
"loss": 1.6179,
"step": 33900
},
{
"epoch": 0.025285202207249415,
"grad_norm": 0.8761606812477112,
"learning_rate": 1.9998738386365096e-05,
"loss": 1.5256,
"step": 34000
},
{
"epoch": 0.025359570449035442,
"grad_norm": 0.43756160140037537,
"learning_rate": 1.9998730953257297e-05,
"loss": 1.5477,
"step": 34100
},
{
"epoch": 0.025433938690821473,
"grad_norm": 0.4515778720378876,
"learning_rate": 1.9998723498318165e-05,
"loss": 1.5666,
"step": 34200
},
{
"epoch": 0.0255083069326075,
"grad_norm": 0.5726724863052368,
"learning_rate": 1.9998716021547714e-05,
"loss": 1.4878,
"step": 34300
},
{
"epoch": 0.025582675174393527,
"grad_norm": 0.5104209184646606,
"learning_rate": 1.999870852294596e-05,
"loss": 1.5678,
"step": 34400
},
{
"epoch": 0.025657043416179554,
"grad_norm": 0.7009900808334351,
"learning_rate": 1.999870100251292e-05,
"loss": 1.4896,
"step": 34500
},
{
"epoch": 0.025731411657965584,
"grad_norm": 0.46048620343208313,
"learning_rate": 1.9998693460248613e-05,
"loss": 1.5144,
"step": 34600
},
{
"epoch": 0.02580577989975161,
"grad_norm": 0.6157929301261902,
"learning_rate": 1.999868589615305e-05,
"loss": 1.5178,
"step": 34700
},
{
"epoch": 0.025880148141537638,
"grad_norm": 0.5260864496231079,
"learning_rate": 1.9998678310226253e-05,
"loss": 1.5046,
"step": 34800
},
{
"epoch": 0.025954516383323665,
"grad_norm": 0.5624649524688721,
"learning_rate": 1.999867070246823e-05,
"loss": 1.5418,
"step": 34900
},
{
"epoch": 0.026028884625109692,
"grad_norm": 0.5242325663566589,
"learning_rate": 1.999866307287901e-05,
"loss": 1.4936,
"step": 35000
},
{
"epoch": 0.026103252866895722,
"grad_norm": 0.42132341861724854,
"learning_rate": 1.9998655421458603e-05,
"loss": 1.5528,
"step": 35100
},
{
"epoch": 0.02617762110868175,
"grad_norm": 1.2333385944366455,
"learning_rate": 1.9998647748207022e-05,
"loss": 1.5343,
"step": 35200
},
{
"epoch": 0.026251989350467776,
"grad_norm": 0.4847305417060852,
"learning_rate": 1.9998640053124288e-05,
"loss": 1.5256,
"step": 35300
},
{
"epoch": 0.026326357592253803,
"grad_norm": 0.4797114133834839,
"learning_rate": 1.999863233621042e-05,
"loss": 1.5394,
"step": 35400
},
{
"epoch": 0.02640072583403983,
"grad_norm": 0.8396820425987244,
"learning_rate": 1.999862459746543e-05,
"loss": 1.5643,
"step": 35500
},
{
"epoch": 0.02647509407582586,
"grad_norm": 0.4638078808784485,
"learning_rate": 1.999861683688934e-05,
"loss": 1.5372,
"step": 35600
},
{
"epoch": 0.026549462317611888,
"grad_norm": 0.44567036628723145,
"learning_rate": 1.9998609054482162e-05,
"loss": 1.5471,
"step": 35700
},
{
"epoch": 0.026623830559397915,
"grad_norm": 0.7429941892623901,
"learning_rate": 1.9998601250243915e-05,
"loss": 1.5551,
"step": 35800
},
{
"epoch": 0.02669819880118394,
"grad_norm": 0.4555191695690155,
"learning_rate": 1.9998593424174618e-05,
"loss": 1.6057,
"step": 35900
},
{
"epoch": 0.02677256704296997,
"grad_norm": 1.1227898597717285,
"learning_rate": 1.9998585576274286e-05,
"loss": 1.5223,
"step": 36000
},
{
"epoch": 0.026846935284756,
"grad_norm": 0.5287070870399475,
"learning_rate": 1.9998577706542937e-05,
"loss": 1.4566,
"step": 36100
},
{
"epoch": 0.026921303526542026,
"grad_norm": 0.43527650833129883,
"learning_rate": 1.9998569814980587e-05,
"loss": 1.5472,
"step": 36200
},
{
"epoch": 0.026995671768328053,
"grad_norm": 0.8627545237541199,
"learning_rate": 1.999856190158725e-05,
"loss": 1.5259,
"step": 36300
},
{
"epoch": 0.02707004001011408,
"grad_norm": 0.5693374276161194,
"learning_rate": 1.9998553966362952e-05,
"loss": 1.5403,
"step": 36400
},
{
"epoch": 0.02714440825190011,
"grad_norm": 0.43485864996910095,
"learning_rate": 1.9998546009307707e-05,
"loss": 1.55,
"step": 36500
},
{
"epoch": 0.027218776493686137,
"grad_norm": 0.7903422713279724,
"learning_rate": 1.9998538030421526e-05,
"loss": 1.5793,
"step": 36600
},
{
"epoch": 0.027293144735472164,
"grad_norm": 0.5814279317855835,
"learning_rate": 1.9998530029704436e-05,
"loss": 1.5068,
"step": 36700
},
{
"epoch": 0.02736751297725819,
"grad_norm": 0.5183308124542236,
"learning_rate": 1.9998522007156444e-05,
"loss": 1.4984,
"step": 36800
},
{
"epoch": 0.027441881219044218,
"grad_norm": 0.5316556692123413,
"learning_rate": 1.9998513962777578e-05,
"loss": 1.5973,
"step": 36900
},
{
"epoch": 0.02751624946083025,
"grad_norm": 0.6409894824028015,
"learning_rate": 1.999850589656785e-05,
"loss": 1.5491,
"step": 37000
},
{
"epoch": 0.027590617702616275,
"grad_norm": 0.7894346117973328,
"learning_rate": 1.9998497808527273e-05,
"loss": 1.5117,
"step": 37100
},
{
"epoch": 0.027664985944402302,
"grad_norm": 0.6969322562217712,
"learning_rate": 1.9998489698655877e-05,
"loss": 1.5079,
"step": 37200
},
{
"epoch": 0.02773935418618833,
"grad_norm": 1.1727479696273804,
"learning_rate": 1.9998481566953673e-05,
"loss": 1.4889,
"step": 37300
},
{
"epoch": 0.027813722427974356,
"grad_norm": 0.7132461071014404,
"learning_rate": 1.9998473413420672e-05,
"loss": 1.5284,
"step": 37400
},
{
"epoch": 0.027888090669760387,
"grad_norm": 0.3298719525337219,
"learning_rate": 1.9998465238056905e-05,
"loss": 1.5616,
"step": 37500
},
{
"epoch": 0.027962458911546414,
"grad_norm": 0.6609339714050293,
"learning_rate": 1.999845704086238e-05,
"loss": 1.6174,
"step": 37600
},
{
"epoch": 0.02803682715333244,
"grad_norm": 0.5007957220077515,
"learning_rate": 1.9998448821837118e-05,
"loss": 1.5016,
"step": 37700
},
{
"epoch": 0.028111195395118468,
"grad_norm": 0.9311910271644592,
"learning_rate": 1.9998440580981136e-05,
"loss": 1.5351,
"step": 37800
},
{
"epoch": 0.028185563636904495,
"grad_norm": 0.7796390056610107,
"learning_rate": 1.9998432318294455e-05,
"loss": 1.5461,
"step": 37900
},
{
"epoch": 0.028259931878690525,
"grad_norm": 0.8507175445556641,
"learning_rate": 1.9998424033777093e-05,
"loss": 1.5247,
"step": 38000
},
{
"epoch": 0.028334300120476552,
"grad_norm": 0.3990893065929413,
"learning_rate": 1.9998415727429065e-05,
"loss": 1.5629,
"step": 38100
},
{
"epoch": 0.02840866836226258,
"grad_norm": 0.852613091468811,
"learning_rate": 1.9998407399250386e-05,
"loss": 1.5216,
"step": 38200
},
{
"epoch": 0.028483036604048606,
"grad_norm": 0.4536173343658447,
"learning_rate": 1.9998399049241083e-05,
"loss": 1.5953,
"step": 38300
},
{
"epoch": 0.028557404845834636,
"grad_norm": 0.5260401964187622,
"learning_rate": 1.999839067740117e-05,
"loss": 1.616,
"step": 38400
},
{
"epoch": 0.028631773087620663,
"grad_norm": 0.6179829835891724,
"learning_rate": 1.9998382283730663e-05,
"loss": 1.5295,
"step": 38500
},
{
"epoch": 0.02870614132940669,
"grad_norm": 0.5114478468894958,
"learning_rate": 1.9998373868229582e-05,
"loss": 1.5425,
"step": 38600
},
{
"epoch": 0.028780509571192717,
"grad_norm": 0.593675971031189,
"learning_rate": 1.9998365430897948e-05,
"loss": 1.5474,
"step": 38700
},
{
"epoch": 0.028854877812978744,
"grad_norm": 0.4959476888179779,
"learning_rate": 1.999835697173577e-05,
"loss": 1.5775,
"step": 38800
},
{
"epoch": 0.028929246054764775,
"grad_norm": 0.6730287671089172,
"learning_rate": 1.9998348490743082e-05,
"loss": 1.6572,
"step": 38900
},
{
"epoch": 0.029003614296550802,
"grad_norm": 0.9137376546859741,
"learning_rate": 1.999833998791989e-05,
"loss": 1.5007,
"step": 39000
},
{
"epoch": 0.02907798253833683,
"grad_norm": 1.2021700143814087,
"learning_rate": 1.999833146326622e-05,
"loss": 1.5095,
"step": 39100
},
{
"epoch": 0.029152350780122856,
"grad_norm": 0.5708747506141663,
"learning_rate": 1.9998322916782083e-05,
"loss": 1.5644,
"step": 39200
},
{
"epoch": 0.029226719021908883,
"grad_norm": 0.6767252087593079,
"learning_rate": 1.9998314348467508e-05,
"loss": 1.5248,
"step": 39300
},
{
"epoch": 0.029301087263694913,
"grad_norm": 0.4881773889064789,
"learning_rate": 1.9998305758322504e-05,
"loss": 1.4889,
"step": 39400
},
{
"epoch": 0.02937545550548094,
"grad_norm": 0.4517097771167755,
"learning_rate": 1.9998297146347093e-05,
"loss": 1.5388,
"step": 39500
},
{
"epoch": 0.029449823747266967,
"grad_norm": 0.6027237176895142,
"learning_rate": 1.9998288512541295e-05,
"loss": 1.5702,
"step": 39600
},
{
"epoch": 0.029524191989052994,
"grad_norm": 0.4435807764530182,
"learning_rate": 1.9998279856905127e-05,
"loss": 1.5708,
"step": 39700
},
{
"epoch": 0.02959856023083902,
"grad_norm": 0.5487297773361206,
"learning_rate": 1.999827117943861e-05,
"loss": 1.5184,
"step": 39800
},
{
"epoch": 0.02967292847262505,
"grad_norm": 0.8344607949256897,
"learning_rate": 1.9998262480141762e-05,
"loss": 1.5454,
"step": 39900
},
{
"epoch": 0.02974729671441108,
"grad_norm": 0.8898949027061462,
"learning_rate": 1.9998253759014602e-05,
"loss": 1.5409,
"step": 40000
},
{
"epoch": 0.029821664956197105,
"grad_norm": 0.897030770778656,
"learning_rate": 1.9998245016057147e-05,
"loss": 1.5316,
"step": 40100
},
{
"epoch": 0.029896033197983132,
"grad_norm": 0.6615723371505737,
"learning_rate": 1.999823625126942e-05,
"loss": 1.6079,
"step": 40200
},
{
"epoch": 0.02997040143976916,
"grad_norm": 0.41309353709220886,
"learning_rate": 1.9998227464651438e-05,
"loss": 1.5077,
"step": 40300
},
{
"epoch": 0.03004476968155519,
"grad_norm": 0.7121081352233887,
"learning_rate": 1.9998218656203218e-05,
"loss": 1.5346,
"step": 40400
},
{
"epoch": 0.030119137923341217,
"grad_norm": 0.7162127494812012,
"learning_rate": 1.9998209825924784e-05,
"loss": 1.5369,
"step": 40500
},
{
"epoch": 0.030193506165127244,
"grad_norm": 0.5943055748939514,
"learning_rate": 1.9998200973816152e-05,
"loss": 1.5852,
"step": 40600
},
{
"epoch": 0.03026787440691327,
"grad_norm": 0.6746940612792969,
"learning_rate": 1.9998192099877344e-05,
"loss": 1.5407,
"step": 40700
},
{
"epoch": 0.0303422426486993,
"grad_norm": 0.9628979563713074,
"learning_rate": 1.9998183204108375e-05,
"loss": 1.4937,
"step": 40800
},
{
"epoch": 0.030416610890485328,
"grad_norm": 0.3971594274044037,
"learning_rate": 1.9998174286509267e-05,
"loss": 1.5628,
"step": 40900
},
{
"epoch": 0.030490979132271355,
"grad_norm": 0.553767204284668,
"learning_rate": 1.9998165347080043e-05,
"loss": 1.5182,
"step": 41000
},
{
"epoch": 0.030565347374057382,
"grad_norm": 0.4197104573249817,
"learning_rate": 1.9998156385820716e-05,
"loss": 1.4853,
"step": 41100
},
{
"epoch": 0.03063971561584341,
"grad_norm": 0.7118240594863892,
"learning_rate": 1.9998147402731308e-05,
"loss": 1.4737,
"step": 41200
},
{
"epoch": 0.03071408385762944,
"grad_norm": 0.7333774566650391,
"learning_rate": 1.999813839781184e-05,
"loss": 1.5183,
"step": 41300
},
{
"epoch": 0.030788452099415466,
"grad_norm": 0.509201169013977,
"learning_rate": 1.9998129371062332e-05,
"loss": 1.4873,
"step": 41400
},
{
"epoch": 0.030862820341201493,
"grad_norm": 0.3249990940093994,
"learning_rate": 1.9998120322482803e-05,
"loss": 1.4316,
"step": 41500
},
{
"epoch": 0.03093718858298752,
"grad_norm": 0.5361568331718445,
"learning_rate": 1.9998111252073272e-05,
"loss": 1.5113,
"step": 41600
},
{
"epoch": 0.031011556824773547,
"grad_norm": 1.3092052936553955,
"learning_rate": 1.9998102159833758e-05,
"loss": 1.5448,
"step": 41700
},
{
"epoch": 0.031085925066559578,
"grad_norm": 0.6276385188102722,
"learning_rate": 1.999809304576428e-05,
"loss": 1.6223,
"step": 41800
},
{
"epoch": 0.031160293308345605,
"grad_norm": 0.7364848256111145,
"learning_rate": 1.9998083909864863e-05,
"loss": 1.4543,
"step": 41900
},
{
"epoch": 0.03123466155013163,
"grad_norm": 0.3654361367225647,
"learning_rate": 1.9998074752135523e-05,
"loss": 1.6071,
"step": 42000
},
{
"epoch": 0.03130902979191766,
"grad_norm": 1.1972397565841675,
"learning_rate": 1.999806557257628e-05,
"loss": 1.4909,
"step": 42100
},
{
"epoch": 0.031383398033703686,
"grad_norm": 0.5845790505409241,
"learning_rate": 1.9998056371187155e-05,
"loss": 1.5687,
"step": 42200
},
{
"epoch": 0.03145776627548971,
"grad_norm": 0.7037214636802673,
"learning_rate": 1.9998047147968168e-05,
"loss": 1.5561,
"step": 42300
},
{
"epoch": 0.03153213451727574,
"grad_norm": 0.5212551951408386,
"learning_rate": 1.999803790291934e-05,
"loss": 1.5063,
"step": 42400
},
{
"epoch": 0.03160650275906177,
"grad_norm": 0.6110777854919434,
"learning_rate": 1.999802863604069e-05,
"loss": 1.4896,
"step": 42500
},
{
"epoch": 0.0316808710008478,
"grad_norm": 0.6877493858337402,
"learning_rate": 1.999801934733224e-05,
"loss": 1.5779,
"step": 42600
},
{
"epoch": 0.03175523924263383,
"grad_norm": 0.4461131989955902,
"learning_rate": 1.9998010036794005e-05,
"loss": 1.5973,
"step": 42700
},
{
"epoch": 0.031829607484419854,
"grad_norm": 1.0050228834152222,
"learning_rate": 1.999800070442601e-05,
"loss": 1.5149,
"step": 42800
},
{
"epoch": 0.03190397572620588,
"grad_norm": 0.46876657009124756,
"learning_rate": 1.9997991350228275e-05,
"loss": 1.6122,
"step": 42900
},
{
"epoch": 0.03197834396799191,
"grad_norm": 0.4919954240322113,
"learning_rate": 1.999798197420082e-05,
"loss": 1.5411,
"step": 43000
},
{
"epoch": 0.032052712209777935,
"grad_norm": 0.9554354548454285,
"learning_rate": 1.9997972576343668e-05,
"loss": 1.579,
"step": 43100
},
{
"epoch": 0.03212708045156396,
"grad_norm": 1.0671650171279907,
"learning_rate": 1.9997963156656835e-05,
"loss": 1.5999,
"step": 43200
},
{
"epoch": 0.03220144869334999,
"grad_norm": 0.8465139269828796,
"learning_rate": 1.999795371514034e-05,
"loss": 1.5703,
"step": 43300
},
{
"epoch": 0.032275816935136016,
"grad_norm": 0.7047709822654724,
"learning_rate": 1.9997944251794212e-05,
"loss": 1.5814,
"step": 43400
},
{
"epoch": 0.03235018517692205,
"grad_norm": 0.7836155891418457,
"learning_rate": 1.9997934766618465e-05,
"loss": 1.5464,
"step": 43500
},
{
"epoch": 0.03242455341870808,
"grad_norm": 0.7335034012794495,
"learning_rate": 1.9997925259613124e-05,
"loss": 1.5278,
"step": 43600
},
{
"epoch": 0.032498921660494104,
"grad_norm": 0.834950864315033,
"learning_rate": 1.9997915730778202e-05,
"loss": 1.559,
"step": 43700
},
{
"epoch": 0.03257328990228013,
"grad_norm": 0.7446547150611877,
"learning_rate": 1.9997906180113726e-05,
"loss": 1.6256,
"step": 43800
},
{
"epoch": 0.03264765814406616,
"grad_norm": 0.5306852459907532,
"learning_rate": 1.9997896607619718e-05,
"loss": 1.44,
"step": 43900
},
{
"epoch": 0.032722026385852185,
"grad_norm": 0.500023365020752,
"learning_rate": 1.9997887013296196e-05,
"loss": 1.5355,
"step": 44000
},
{
"epoch": 0.03279639462763821,
"grad_norm": 0.6218491196632385,
"learning_rate": 1.9997877397143182e-05,
"loss": 1.5741,
"step": 44100
},
{
"epoch": 0.03287076286942424,
"grad_norm": 0.3754362463951111,
"learning_rate": 1.9997867759160696e-05,
"loss": 1.6125,
"step": 44200
},
{
"epoch": 0.032945131111210266,
"grad_norm": 0.9419918656349182,
"learning_rate": 1.999785809934876e-05,
"loss": 1.4781,
"step": 44300
},
{
"epoch": 0.0330194993529963,
"grad_norm": 0.503409743309021,
"learning_rate": 1.9997848417707394e-05,
"loss": 1.532,
"step": 44400
},
{
"epoch": 0.03309386759478233,
"grad_norm": 0.6554058194160461,
"learning_rate": 1.999783871423662e-05,
"loss": 1.5704,
"step": 44500
},
{
"epoch": 0.033168235836568354,
"grad_norm": 0.9691445231437683,
"learning_rate": 1.9997828988936462e-05,
"loss": 1.5278,
"step": 44600
},
{
"epoch": 0.03324260407835438,
"grad_norm": 0.43620389699935913,
"learning_rate": 1.999781924180694e-05,
"loss": 1.5436,
"step": 44700
},
{
"epoch": 0.03331697232014041,
"grad_norm": 0.6035354137420654,
"learning_rate": 1.999780947284807e-05,
"loss": 1.5899,
"step": 44800
},
{
"epoch": 0.033391340561926434,
"grad_norm": 0.3441450595855713,
"learning_rate": 1.9997799682059875e-05,
"loss": 1.5443,
"step": 44900
},
{
"epoch": 0.03346570880371246,
"grad_norm": 0.5419406294822693,
"learning_rate": 1.999778986944238e-05,
"loss": 1.4789,
"step": 45000
},
{
"epoch": 0.03354007704549849,
"grad_norm": 0.7912573218345642,
"learning_rate": 1.9997780034995605e-05,
"loss": 1.4816,
"step": 45100
},
{
"epoch": 0.033614445287284515,
"grad_norm": 0.8978769779205322,
"learning_rate": 1.9997770178719573e-05,
"loss": 1.5124,
"step": 45200
},
{
"epoch": 0.03368881352907054,
"grad_norm": 0.6722145080566406,
"learning_rate": 1.99977603006143e-05,
"loss": 1.5229,
"step": 45300
},
{
"epoch": 0.033763181770856576,
"grad_norm": 0.4918314218521118,
"learning_rate": 1.9997750400679815e-05,
"loss": 1.562,
"step": 45400
},
{
"epoch": 0.0338375500126426,
"grad_norm": 0.9343436360359192,
"learning_rate": 1.9997740478916138e-05,
"loss": 1.6147,
"step": 45500
},
{
"epoch": 0.03391191825442863,
"grad_norm": 1.0771671533584595,
"learning_rate": 1.9997730535323287e-05,
"loss": 1.4441,
"step": 45600
},
{
"epoch": 0.03398628649621466,
"grad_norm": 0.666222095489502,
"learning_rate": 1.999772056990128e-05,
"loss": 1.536,
"step": 45700
},
{
"epoch": 0.034060654738000684,
"grad_norm": 0.5621564388275146,
"learning_rate": 1.9997710582650153e-05,
"loss": 1.5652,
"step": 45800
},
{
"epoch": 0.03413502297978671,
"grad_norm": 0.6601430177688599,
"learning_rate": 1.9997700573569912e-05,
"loss": 1.5309,
"step": 45900
},
{
"epoch": 0.03420939122157274,
"grad_norm": 0.7411925792694092,
"learning_rate": 1.9997690542660585e-05,
"loss": 1.4918,
"step": 46000
},
{
"epoch": 0.034283759463358765,
"grad_norm": 0.5674101710319519,
"learning_rate": 1.99976804899222e-05,
"loss": 1.5383,
"step": 46100
},
{
"epoch": 0.03435812770514479,
"grad_norm": 0.8503201007843018,
"learning_rate": 1.999767041535477e-05,
"loss": 1.5298,
"step": 46200
},
{
"epoch": 0.034432495946930826,
"grad_norm": 0.8432891368865967,
"learning_rate": 1.999766031895832e-05,
"loss": 1.5419,
"step": 46300
},
{
"epoch": 0.03450686418871685,
"grad_norm": 0.41764137148857117,
"learning_rate": 1.9997650200732876e-05,
"loss": 1.5313,
"step": 46400
},
{
"epoch": 0.03458123243050288,
"grad_norm": 1.0963222980499268,
"learning_rate": 1.9997640060678455e-05,
"loss": 1.5593,
"step": 46500
},
{
"epoch": 0.03465560067228891,
"grad_norm": 0.5194404721260071,
"learning_rate": 1.9997629898795082e-05,
"loss": 1.6352,
"step": 46600
},
{
"epoch": 0.034729968914074934,
"grad_norm": 0.6641316413879395,
"learning_rate": 1.9997619715082777e-05,
"loss": 1.552,
"step": 46700
},
{
"epoch": 0.03480433715586096,
"grad_norm": 1.1054824590682983,
"learning_rate": 1.999760950954156e-05,
"loss": 1.5176,
"step": 46800
},
{
"epoch": 0.03487870539764699,
"grad_norm": 0.44163691997528076,
"learning_rate": 1.9997599282171466e-05,
"loss": 1.5985,
"step": 46900
},
{
"epoch": 0.034953073639433015,
"grad_norm": 0.8304015398025513,
"learning_rate": 1.99975890329725e-05,
"loss": 1.554,
"step": 47000
},
{
"epoch": 0.03502744188121904,
"grad_norm": 0.8395280838012695,
"learning_rate": 1.9997578761944693e-05,
"loss": 1.6185,
"step": 47100
},
{
"epoch": 0.03510181012300507,
"grad_norm": 0.8857927322387695,
"learning_rate": 1.9997568469088068e-05,
"loss": 1.5108,
"step": 47200
},
{
"epoch": 0.0351761783647911,
"grad_norm": 0.6471524834632874,
"learning_rate": 1.999755815440265e-05,
"loss": 1.4693,
"step": 47300
},
{
"epoch": 0.03525054660657713,
"grad_norm": 0.6664785146713257,
"learning_rate": 1.9997547817888453e-05,
"loss": 1.5391,
"step": 47400
},
{
"epoch": 0.035324914848363156,
"grad_norm": 0.4979814887046814,
"learning_rate": 1.9997537459545505e-05,
"loss": 1.5367,
"step": 47500
},
{
"epoch": 0.03539928309014918,
"grad_norm": 0.5753507614135742,
"learning_rate": 1.9997527079373828e-05,
"loss": 1.547,
"step": 47600
},
{
"epoch": 0.03547365133193521,
"grad_norm": 0.5349861979484558,
"learning_rate": 1.9997516677373444e-05,
"loss": 1.5317,
"step": 47700
},
{
"epoch": 0.03554801957372124,
"grad_norm": 0.49024057388305664,
"learning_rate": 1.9997506253544377e-05,
"loss": 1.4412,
"step": 47800
},
{
"epoch": 0.035622387815507264,
"grad_norm": 0.4172305166721344,
"learning_rate": 1.9997495807886648e-05,
"loss": 1.4767,
"step": 47900
},
{
"epoch": 0.03569675605729329,
"grad_norm": 0.8609969615936279,
"learning_rate": 1.9997485340400283e-05,
"loss": 1.5191,
"step": 48000
},
{
"epoch": 0.03577112429907932,
"grad_norm": 0.46808651089668274,
"learning_rate": 1.9997474851085304e-05,
"loss": 1.5835,
"step": 48100
},
{
"epoch": 0.03584549254086535,
"grad_norm": 1.0232137441635132,
"learning_rate": 1.999746433994173e-05,
"loss": 1.5322,
"step": 48200
},
{
"epoch": 0.03591986078265138,
"grad_norm": 0.6745514273643494,
"learning_rate": 1.9997453806969588e-05,
"loss": 1.5706,
"step": 48300
},
{
"epoch": 0.035994229024437406,
"grad_norm": 1.114139437675476,
"learning_rate": 1.99974432521689e-05,
"loss": 1.5895,
"step": 48400
},
{
"epoch": 0.03606859726622343,
"grad_norm": 0.7316710948944092,
"learning_rate": 1.9997432675539686e-05,
"loss": 1.4594,
"step": 48500
},
{
"epoch": 0.03614296550800946,
"grad_norm": 0.4518311023712158,
"learning_rate": 1.9997422077081973e-05,
"loss": 1.5374,
"step": 48600
},
{
"epoch": 0.03621733374979549,
"grad_norm": 0.6804157495498657,
"learning_rate": 1.999741145679578e-05,
"loss": 1.5915,
"step": 48700
},
{
"epoch": 0.036291701991581514,
"grad_norm": 0.4367609918117523,
"learning_rate": 1.999740081468114e-05,
"loss": 1.5092,
"step": 48800
},
{
"epoch": 0.03636607023336754,
"grad_norm": 0.7415800094604492,
"learning_rate": 1.9997390150738063e-05,
"loss": 1.4548,
"step": 48900
},
{
"epoch": 0.03644043847515357,
"grad_norm": 0.4981917440891266,
"learning_rate": 1.999737946496658e-05,
"loss": 1.53,
"step": 49000
},
{
"epoch": 0.036514806716939595,
"grad_norm": 0.42718860507011414,
"learning_rate": 1.9997368757366712e-05,
"loss": 1.4911,
"step": 49100
},
{
"epoch": 0.03658917495872563,
"grad_norm": 0.5294268131256104,
"learning_rate": 1.999735802793849e-05,
"loss": 1.5203,
"step": 49200
},
{
"epoch": 0.036663543200511656,
"grad_norm": 0.5844396948814392,
"learning_rate": 1.999734727668192e-05,
"loss": 1.5442,
"step": 49300
},
{
"epoch": 0.03673791144229768,
"grad_norm": 0.7835099697113037,
"learning_rate": 1.9997336503597043e-05,
"loss": 1.4786,
"step": 49400
},
{
"epoch": 0.03681227968408371,
"grad_norm": 0.6045675873756409,
"learning_rate": 1.9997325708683875e-05,
"loss": 1.5386,
"step": 49500
},
{
"epoch": 0.03688664792586974,
"grad_norm": 0.651782751083374,
"learning_rate": 1.9997314891942442e-05,
"loss": 1.4999,
"step": 49600
},
{
"epoch": 0.036961016167655764,
"grad_norm": 0.45741456747055054,
"learning_rate": 1.9997304053372762e-05,
"loss": 1.4946,
"step": 49700
},
{
"epoch": 0.03703538440944179,
"grad_norm": 0.5363433957099915,
"learning_rate": 1.999729319297486e-05,
"loss": 1.5421,
"step": 49800
},
{
"epoch": 0.03710975265122782,
"grad_norm": 0.45601820945739746,
"learning_rate": 1.9997282310748768e-05,
"loss": 1.5149,
"step": 49900
},
{
"epoch": 0.037184120893013844,
"grad_norm": 0.6308805346488953,
"learning_rate": 1.9997271406694504e-05,
"loss": 1.5464,
"step": 50000
},
{
"epoch": 0.03725848913479988,
"grad_norm": 0.6946158409118652,
"learning_rate": 1.999726048081209e-05,
"loss": 1.5865,
"step": 50100
},
{
"epoch": 0.037332857376585905,
"grad_norm": 0.4899694323539734,
"learning_rate": 1.9997249533101554e-05,
"loss": 1.5839,
"step": 50200
},
{
"epoch": 0.03740722561837193,
"grad_norm": 0.7726057767868042,
"learning_rate": 1.9997238563562912e-05,
"loss": 1.4571,
"step": 50300
},
{
"epoch": 0.03748159386015796,
"grad_norm": 0.9151437878608704,
"learning_rate": 1.9997227572196197e-05,
"loss": 1.4459,
"step": 50400
},
{
"epoch": 0.037555962101943986,
"grad_norm": 0.6182152032852173,
"learning_rate": 1.9997216559001433e-05,
"loss": 1.4586,
"step": 50500
},
{
"epoch": 0.03763033034373001,
"grad_norm": 0.7595764398574829,
"learning_rate": 1.9997205523978636e-05,
"loss": 1.5382,
"step": 50600
},
{
"epoch": 0.03770469858551604,
"grad_norm": 0.8368933796882629,
"learning_rate": 1.9997194467127838e-05,
"loss": 1.636,
"step": 50700
},
{
"epoch": 0.03777906682730207,
"grad_norm": 1.3371498584747314,
"learning_rate": 1.9997183388449055e-05,
"loss": 1.5225,
"step": 50800
},
{
"epoch": 0.037853435069088094,
"grad_norm": 0.6440578103065491,
"learning_rate": 1.999717228794232e-05,
"loss": 1.5165,
"step": 50900
},
{
"epoch": 0.03792780331087412,
"grad_norm": 0.5950276255607605,
"learning_rate": 1.999716116560765e-05,
"loss": 1.552,
"step": 51000
},
{
"epoch": 0.038002171552660155,
"grad_norm": 0.7176305651664734,
"learning_rate": 1.9997150021445074e-05,
"loss": 1.5019,
"step": 51100
},
{
"epoch": 0.03807653979444618,
"grad_norm": 0.7437789440155029,
"learning_rate": 1.999713885545462e-05,
"loss": 1.5632,
"step": 51200
},
{
"epoch": 0.03815090803623221,
"grad_norm": 0.48256799578666687,
"learning_rate": 1.9997127667636298e-05,
"loss": 1.4943,
"step": 51300
},
{
"epoch": 0.038225276278018236,
"grad_norm": 0.8726604580879211,
"learning_rate": 1.9997116457990148e-05,
"loss": 1.6079,
"step": 51400
},
{
"epoch": 0.03829964451980426,
"grad_norm": 0.6188727617263794,
"learning_rate": 1.9997105226516186e-05,
"loss": 1.5845,
"step": 51500
},
{
"epoch": 0.03837401276159029,
"grad_norm": 0.5416398048400879,
"learning_rate": 1.9997093973214442e-05,
"loss": 1.5297,
"step": 51600
},
{
"epoch": 0.03844838100337632,
"grad_norm": 0.8704063296318054,
"learning_rate": 1.999708269808493e-05,
"loss": 1.5324,
"step": 51700
},
{
"epoch": 0.038522749245162344,
"grad_norm": 0.9990126490592957,
"learning_rate": 1.9997071401127688e-05,
"loss": 1.5435,
"step": 51800
},
{
"epoch": 0.03859711748694837,
"grad_norm": 0.402971476316452,
"learning_rate": 1.9997060082342732e-05,
"loss": 1.5755,
"step": 51900
},
{
"epoch": 0.038671485728734405,
"grad_norm": 0.7084416747093201,
"learning_rate": 1.9997048741730092e-05,
"loss": 1.5153,
"step": 52000
},
{
"epoch": 0.03874585397052043,
"grad_norm": 0.5382058620452881,
"learning_rate": 1.9997037379289786e-05,
"loss": 1.5378,
"step": 52100
},
{
"epoch": 0.03882022221230646,
"grad_norm": 0.5846664905548096,
"learning_rate": 1.9997025995021845e-05,
"loss": 1.5118,
"step": 52200
},
{
"epoch": 0.038894590454092486,
"grad_norm": 0.6125427484512329,
"learning_rate": 1.999701458892629e-05,
"loss": 1.5567,
"step": 52300
},
{
"epoch": 0.03896895869587851,
"grad_norm": 0.4352121949195862,
"learning_rate": 1.999700316100315e-05,
"loss": 1.5665,
"step": 52400
},
{
"epoch": 0.03904332693766454,
"grad_norm": 0.4972393810749054,
"learning_rate": 1.9996991711252448e-05,
"loss": 1.5728,
"step": 52500
},
{
"epoch": 0.039117695179450566,
"grad_norm": 7.361449241638184,
"learning_rate": 1.9996980239674207e-05,
"loss": 1.4814,
"step": 52600
},
{
"epoch": 0.03919206342123659,
"grad_norm": 0.4652218818664551,
"learning_rate": 1.9996968746268452e-05,
"loss": 1.5553,
"step": 52700
},
{
"epoch": 0.03926643166302262,
"grad_norm": 0.7898038029670715,
"learning_rate": 1.9996957231035213e-05,
"loss": 1.5251,
"step": 52800
},
{
"epoch": 0.03934079990480865,
"grad_norm": 0.5735042095184326,
"learning_rate": 1.999694569397451e-05,
"loss": 1.474,
"step": 52900
},
{
"epoch": 0.03941516814659468,
"grad_norm": 0.6072685122489929,
"learning_rate": 1.9996934135086367e-05,
"loss": 1.6186,
"step": 53000
},
{
"epoch": 0.03948953638838071,
"grad_norm": 0.5961938500404358,
"learning_rate": 1.9996922554370818e-05,
"loss": 1.5622,
"step": 53100
},
{
"epoch": 0.039563904630166735,
"grad_norm": 0.6281226277351379,
"learning_rate": 1.999691095182788e-05,
"loss": 1.5508,
"step": 53200
},
{
"epoch": 0.03963827287195276,
"grad_norm": 0.9202520847320557,
"learning_rate": 1.9996899327457576e-05,
"loss": 1.5623,
"step": 53300
},
{
"epoch": 0.03971264111373879,
"grad_norm": 0.4792959988117218,
"learning_rate": 1.9996887681259946e-05,
"loss": 1.5363,
"step": 53400
},
{
"epoch": 0.039787009355524816,
"grad_norm": 0.4955751299858093,
"learning_rate": 1.9996876013234997e-05,
"loss": 1.5026,
"step": 53500
},
{
"epoch": 0.03986137759731084,
"grad_norm": 1.114710807800293,
"learning_rate": 1.9996864323382766e-05,
"loss": 1.5397,
"step": 53600
},
{
"epoch": 0.03993574583909687,
"grad_norm": 0.5231966972351074,
"learning_rate": 1.9996852611703278e-05,
"loss": 1.4971,
"step": 53700
},
{
"epoch": 0.0400101140808829,
"grad_norm": 0.8232663869857788,
"learning_rate": 1.9996840878196554e-05,
"loss": 1.5054,
"step": 53800
},
{
"epoch": 0.04008448232266893,
"grad_norm": 0.5382178425788879,
"learning_rate": 1.999682912286262e-05,
"loss": 1.47,
"step": 53900
},
{
"epoch": 0.04015885056445496,
"grad_norm": 0.8355742692947388,
"learning_rate": 1.999681734570151e-05,
"loss": 1.3635,
"step": 54000
},
{
"epoch": 0.040233218806240985,
"grad_norm": 0.715268611907959,
"learning_rate": 1.9996805546713237e-05,
"loss": 1.5606,
"step": 54100
},
{
"epoch": 0.04030758704802701,
"grad_norm": 0.9210833311080933,
"learning_rate": 1.9996793725897836e-05,
"loss": 1.5702,
"step": 54200
},
{
"epoch": 0.04038195528981304,
"grad_norm": 0.5435687899589539,
"learning_rate": 1.9996781883255328e-05,
"loss": 1.5468,
"step": 54300
},
{
"epoch": 0.040456323531599066,
"grad_norm": 0.517410159111023,
"learning_rate": 1.9996770018785743e-05,
"loss": 1.597,
"step": 54400
},
{
"epoch": 0.04053069177338509,
"grad_norm": 0.5302937030792236,
"learning_rate": 1.9996758132489102e-05,
"loss": 1.4796,
"step": 54500
},
{
"epoch": 0.04060506001517112,
"grad_norm": 0.8928558230400085,
"learning_rate": 1.9996746224365435e-05,
"loss": 1.5311,
"step": 54600
},
{
"epoch": 0.04067942825695715,
"grad_norm": 0.9816573262214661,
"learning_rate": 1.9996734294414765e-05,
"loss": 1.5409,
"step": 54700
},
{
"epoch": 0.040753796498743174,
"grad_norm": 0.5399038791656494,
"learning_rate": 1.999672234263712e-05,
"loss": 1.4705,
"step": 54800
},
{
"epoch": 0.04082816474052921,
"grad_norm": 0.815388560295105,
"learning_rate": 1.9996710369032528e-05,
"loss": 1.5708,
"step": 54900
},
{
"epoch": 0.040902532982315234,
"grad_norm": 0.765061616897583,
"learning_rate": 1.999669837360101e-05,
"loss": 1.4828,
"step": 55000
},
{
"epoch": 0.04097690122410126,
"grad_norm": 0.8168923258781433,
"learning_rate": 1.99966863563426e-05,
"loss": 1.5167,
"step": 55100
},
{
"epoch": 0.04105126946588729,
"grad_norm": 0.6376170516014099,
"learning_rate": 1.9996674317257315e-05,
"loss": 1.5286,
"step": 55200
},
{
"epoch": 0.041125637707673315,
"grad_norm": 0.7510162591934204,
"learning_rate": 1.9996662256345184e-05,
"loss": 1.5737,
"step": 55300
},
{
"epoch": 0.04120000594945934,
"grad_norm": 0.5505034327507019,
"learning_rate": 1.9996650173606234e-05,
"loss": 1.5388,
"step": 55400
},
{
"epoch": 0.04127437419124537,
"grad_norm": 0.49698886275291443,
"learning_rate": 1.99966380690405e-05,
"loss": 1.5832,
"step": 55500
},
{
"epoch": 0.041348742433031396,
"grad_norm": 0.5520877242088318,
"learning_rate": 1.9996625942647994e-05,
"loss": 1.5643,
"step": 55600
},
{
"epoch": 0.04142311067481742,
"grad_norm": 0.6185612082481384,
"learning_rate": 1.999661379442875e-05,
"loss": 1.5087,
"step": 55700
},
{
"epoch": 0.04149747891660345,
"grad_norm": 0.8302799463272095,
"learning_rate": 1.9996601624382795e-05,
"loss": 1.6283,
"step": 55800
},
{
"epoch": 0.041571847158389484,
"grad_norm": 0.9720719456672668,
"learning_rate": 1.9996589432510155e-05,
"loss": 1.5064,
"step": 55900
},
{
"epoch": 0.04164621540017551,
"grad_norm": 0.40555909276008606,
"learning_rate": 1.9996577218810855e-05,
"loss": 1.5138,
"step": 56000
},
{
"epoch": 0.04172058364196154,
"grad_norm": 0.9815244674682617,
"learning_rate": 1.9996564983284918e-05,
"loss": 1.4913,
"step": 56100
},
{
"epoch": 0.041794951883747565,
"grad_norm": 1.1608703136444092,
"learning_rate": 1.9996552725932382e-05,
"loss": 1.4939,
"step": 56200
},
{
"epoch": 0.04186932012553359,
"grad_norm": 0.38927561044692993,
"learning_rate": 1.9996540446753264e-05,
"loss": 1.5115,
"step": 56300
},
{
"epoch": 0.04194368836731962,
"grad_norm": 0.7470927834510803,
"learning_rate": 1.9996528145747594e-05,
"loss": 1.5539,
"step": 56400
},
{
"epoch": 0.042018056609105646,
"grad_norm": 0.46110135316848755,
"learning_rate": 1.99965158229154e-05,
"loss": 1.4602,
"step": 56500
},
{
"epoch": 0.04209242485089167,
"grad_norm": 0.5916282534599304,
"learning_rate": 1.9996503478256705e-05,
"loss": 1.5721,
"step": 56600
},
{
"epoch": 0.0421667930926777,
"grad_norm": 0.5567501187324524,
"learning_rate": 1.999649111177154e-05,
"loss": 1.482,
"step": 56700
},
{
"epoch": 0.042241161334463734,
"grad_norm": 0.8147956728935242,
"learning_rate": 1.9996478723459928e-05,
"loss": 1.5518,
"step": 56800
},
{
"epoch": 0.04231552957624976,
"grad_norm": 1.0146034955978394,
"learning_rate": 1.9996466313321906e-05,
"loss": 1.52,
"step": 56900
},
{
"epoch": 0.04238989781803579,
"grad_norm": 0.267634779214859,
"learning_rate": 1.9996453881357486e-05,
"loss": 1.598,
"step": 57000
},
{
"epoch": 0.042464266059821815,
"grad_norm": 0.7316186428070068,
"learning_rate": 1.9996441427566707e-05,
"loss": 1.5306,
"step": 57100
},
{
"epoch": 0.04253863430160784,
"grad_norm": 1.078633189201355,
"learning_rate": 1.999642895194959e-05,
"loss": 1.4847,
"step": 57200
},
{
"epoch": 0.04261300254339387,
"grad_norm": 0.5316028594970703,
"learning_rate": 1.9996416454506164e-05,
"loss": 1.5928,
"step": 57300
},
{
"epoch": 0.042687370785179896,
"grad_norm": 0.573113739490509,
"learning_rate": 1.999640393523646e-05,
"loss": 1.5609,
"step": 57400
},
{
"epoch": 0.04276173902696592,
"grad_norm": 0.45106610655784607,
"learning_rate": 1.9996391394140496e-05,
"loss": 1.5693,
"step": 57500
},
{
"epoch": 0.04283610726875195,
"grad_norm": 0.6011554002761841,
"learning_rate": 1.9996378831218307e-05,
"loss": 1.5968,
"step": 57600
},
{
"epoch": 0.042910475510537976,
"grad_norm": 0.7017727494239807,
"learning_rate": 1.9996366246469922e-05,
"loss": 1.4973,
"step": 57700
},
{
"epoch": 0.04298484375232401,
"grad_norm": 0.8994572758674622,
"learning_rate": 1.9996353639895365e-05,
"loss": 1.4624,
"step": 57800
},
{
"epoch": 0.04305921199411004,
"grad_norm": 0.8928848505020142,
"learning_rate": 1.9996341011494663e-05,
"loss": 1.5755,
"step": 57900
},
{
"epoch": 0.043133580235896064,
"grad_norm": 0.7762150168418884,
"learning_rate": 1.999632836126784e-05,
"loss": 1.4074,
"step": 58000
},
{
"epoch": 0.04320794847768209,
"grad_norm": 0.5097442865371704,
"learning_rate": 1.9996315689214932e-05,
"loss": 1.5281,
"step": 58100
},
{
"epoch": 0.04328231671946812,
"grad_norm": 0.803718626499176,
"learning_rate": 1.999630299533596e-05,
"loss": 1.499,
"step": 58200
},
{
"epoch": 0.043356684961254145,
"grad_norm": 0.5989664196968079,
"learning_rate": 1.9996290279630956e-05,
"loss": 1.5286,
"step": 58300
},
{
"epoch": 0.04343105320304017,
"grad_norm": 0.45334750413894653,
"learning_rate": 1.999627754209995e-05,
"loss": 1.5598,
"step": 58400
},
{
"epoch": 0.0435054214448262,
"grad_norm": 0.9461644887924194,
"learning_rate": 1.999626478274296e-05,
"loss": 1.4569,
"step": 58500
},
{
"epoch": 0.043579789686612226,
"grad_norm": 0.5558738112449646,
"learning_rate": 1.999625200156002e-05,
"loss": 1.5329,
"step": 58600
},
{
"epoch": 0.04365415792839826,
"grad_norm": 0.49125516414642334,
"learning_rate": 1.999623919855116e-05,
"loss": 1.4805,
"step": 58700
},
{
"epoch": 0.04372852617018429,
"grad_norm": 0.6038479208946228,
"learning_rate": 1.9996226373716406e-05,
"loss": 1.5589,
"step": 58800
},
{
"epoch": 0.043802894411970314,
"grad_norm": 0.4560091197490692,
"learning_rate": 1.9996213527055784e-05,
"loss": 1.4538,
"step": 58900
},
{
"epoch": 0.04387726265375634,
"grad_norm": 0.6255136728286743,
"learning_rate": 1.9996200658569323e-05,
"loss": 1.5959,
"step": 59000
},
{
"epoch": 0.04395163089554237,
"grad_norm": 0.8603237867355347,
"learning_rate": 1.999618776825705e-05,
"loss": 1.4769,
"step": 59100
},
{
"epoch": 0.044025999137328395,
"grad_norm": 1.027685523033142,
"learning_rate": 1.9996174856119e-05,
"loss": 1.485,
"step": 59200
},
{
"epoch": 0.04410036737911442,
"grad_norm": 0.6371426582336426,
"learning_rate": 1.999616192215519e-05,
"loss": 1.5713,
"step": 59300
},
{
"epoch": 0.04417473562090045,
"grad_norm": 0.8155677318572998,
"learning_rate": 1.9996148966365664e-05,
"loss": 1.5755,
"step": 59400
},
{
"epoch": 0.044249103862686476,
"grad_norm": 0.9418515563011169,
"learning_rate": 1.9996135988750432e-05,
"loss": 1.51,
"step": 59500
},
{
"epoch": 0.0443234721044725,
"grad_norm": 0.529082179069519,
"learning_rate": 1.9996122989309536e-05,
"loss": 1.5254,
"step": 59600
},
{
"epoch": 0.04439784034625854,
"grad_norm": 0.5595930218696594,
"learning_rate": 1.9996109968042992e-05,
"loss": 1.5515,
"step": 59700
},
{
"epoch": 0.044472208588044564,
"grad_norm": 0.8503856062889099,
"learning_rate": 1.9996096924950843e-05,
"loss": 1.5123,
"step": 59800
},
{
"epoch": 0.04454657682983059,
"grad_norm": 0.6979494690895081,
"learning_rate": 1.9996083860033107e-05,
"loss": 1.5213,
"step": 59900
},
{
"epoch": 0.04462094507161662,
"grad_norm": 0.5807011723518372,
"learning_rate": 1.9996070773289816e-05,
"loss": 1.5411,
"step": 60000
},
{
"epoch": 0.044695313313402645,
"grad_norm": 0.6768651604652405,
"learning_rate": 1.9996057664721e-05,
"loss": 1.5252,
"step": 60100
},
{
"epoch": 0.04476968155518867,
"grad_norm": 0.3594638407230377,
"learning_rate": 1.9996044534326682e-05,
"loss": 1.5126,
"step": 60200
},
{
"epoch": 0.0448440497969747,
"grad_norm": 0.4025649130344391,
"learning_rate": 1.9996031382106897e-05,
"loss": 1.561,
"step": 60300
},
{
"epoch": 0.044918418038760725,
"grad_norm": 0.8125213980674744,
"learning_rate": 1.9996018208061675e-05,
"loss": 1.5445,
"step": 60400
},
{
"epoch": 0.04499278628054675,
"grad_norm": 0.5969058275222778,
"learning_rate": 1.9996005012191037e-05,
"loss": 1.582,
"step": 60500
},
{
"epoch": 0.045067154522332786,
"grad_norm": 1.1144986152648926,
"learning_rate": 1.9995991794495016e-05,
"loss": 1.5563,
"step": 60600
},
{
"epoch": 0.04514152276411881,
"grad_norm": 0.8091686367988586,
"learning_rate": 1.999597855497364e-05,
"loss": 1.5199,
"step": 60700
},
{
"epoch": 0.04521589100590484,
"grad_norm": 1.3050564527511597,
"learning_rate": 1.999596529362694e-05,
"loss": 1.5034,
"step": 60800
},
{
"epoch": 0.04529025924769087,
"grad_norm": 0.5470508933067322,
"learning_rate": 1.9995952010454943e-05,
"loss": 1.5684,
"step": 60900
},
{
"epoch": 0.045364627489476894,
"grad_norm": 0.9612744450569153,
"learning_rate": 1.9995938705457682e-05,
"loss": 1.6064,
"step": 61000
},
{
"epoch": 0.04543899573126292,
"grad_norm": 0.9011774659156799,
"learning_rate": 1.9995925378635177e-05,
"loss": 1.4553,
"step": 61100
},
{
"epoch": 0.04551336397304895,
"grad_norm": 1.70448637008667,
"learning_rate": 1.9995912029987466e-05,
"loss": 1.4507,
"step": 61200
},
{
"epoch": 0.045587732214834975,
"grad_norm": 1.7321926355361938,
"learning_rate": 1.999589865951457e-05,
"loss": 1.5071,
"step": 61300
},
{
"epoch": 0.045662100456621,
"grad_norm": 0.5415388941764832,
"learning_rate": 1.999588526721653e-05,
"loss": 1.5518,
"step": 61400
},
{
"epoch": 0.04573646869840703,
"grad_norm": 0.8833714127540588,
"learning_rate": 1.9995871853093366e-05,
"loss": 1.5299,
"step": 61500
},
{
"epoch": 0.04581083694019306,
"grad_norm": 0.49804380536079407,
"learning_rate": 1.999585841714511e-05,
"loss": 1.5215,
"step": 61600
},
{
"epoch": 0.04588520518197909,
"grad_norm": 1.3999980688095093,
"learning_rate": 1.999584495937179e-05,
"loss": 1.5028,
"step": 61700
},
{
"epoch": 0.04595957342376512,
"grad_norm": 1.1679743528366089,
"learning_rate": 1.9995831479773438e-05,
"loss": 1.4767,
"step": 61800
},
{
"epoch": 0.046033941665551144,
"grad_norm": 0.7388249635696411,
"learning_rate": 1.999581797835008e-05,
"loss": 1.558,
"step": 61900
},
{
"epoch": 0.04610830990733717,
"grad_norm": 0.6812136769294739,
"learning_rate": 1.9995804455101746e-05,
"loss": 1.4495,
"step": 62000
},
{
"epoch": 0.0461826781491232,
"grad_norm": 1.3702300786972046,
"learning_rate": 1.999579091002847e-05,
"loss": 1.4212,
"step": 62100
},
{
"epoch": 0.046257046390909225,
"grad_norm": 0.42544421553611755,
"learning_rate": 1.999577734313028e-05,
"loss": 1.5603,
"step": 62200
},
{
"epoch": 0.04633141463269525,
"grad_norm": 0.6235955357551575,
"learning_rate": 1.99957637544072e-05,
"loss": 1.5164,
"step": 62300
},
{
"epoch": 0.04640578287448128,
"grad_norm": 0.30019888281822205,
"learning_rate": 1.9995750143859262e-05,
"loss": 1.4764,
"step": 62400
},
{
"epoch": 0.04648015111626731,
"grad_norm": 0.509626567363739,
"learning_rate": 1.99957365114865e-05,
"loss": 1.535,
"step": 62500
},
{
"epoch": 0.04655451935805334,
"grad_norm": 0.726915717124939,
"learning_rate": 1.9995722857288943e-05,
"loss": 1.5428,
"step": 62600
},
{
"epoch": 0.046628887599839366,
"grad_norm": 0.5223472714424133,
"learning_rate": 1.9995709181266613e-05,
"loss": 1.548,
"step": 62700
},
{
"epoch": 0.04670325584162539,
"grad_norm": 0.5914735794067383,
"learning_rate": 1.9995695483419554e-05,
"loss": 1.5433,
"step": 62800
},
{
"epoch": 0.04677762408341142,
"grad_norm": 1.1892948150634766,
"learning_rate": 1.999568176374778e-05,
"loss": 1.5964,
"step": 62900
},
{
"epoch": 0.04685199232519745,
"grad_norm": 0.47329986095428467,
"learning_rate": 1.9995668022251333e-05,
"loss": 1.4587,
"step": 63000
},
{
"epoch": 0.046926360566983474,
"grad_norm": 0.7776244878768921,
"learning_rate": 1.9995654258930237e-05,
"loss": 1.5118,
"step": 63100
},
{
"epoch": 0.0470007288087695,
"grad_norm": 0.4600290358066559,
"learning_rate": 1.9995640473784526e-05,
"loss": 1.5327,
"step": 63200
},
{
"epoch": 0.04707509705055553,
"grad_norm": 0.785589873790741,
"learning_rate": 1.9995626666814226e-05,
"loss": 1.5346,
"step": 63300
},
{
"epoch": 0.047149465292341555,
"grad_norm": 0.34471455216407776,
"learning_rate": 1.999561283801937e-05,
"loss": 1.5669,
"step": 63400
},
{
"epoch": 0.04722383353412759,
"grad_norm": 0.8968401551246643,
"learning_rate": 1.9995598987399988e-05,
"loss": 1.4522,
"step": 63500
},
{
"epoch": 0.047298201775913616,
"grad_norm": 0.5577977895736694,
"learning_rate": 1.9995585114956104e-05,
"loss": 1.5894,
"step": 63600
},
{
"epoch": 0.04737257001769964,
"grad_norm": 0.8406354188919067,
"learning_rate": 1.999557122068776e-05,
"loss": 1.5585,
"step": 63700
},
{
"epoch": 0.04744693825948567,
"grad_norm": 0.6812056303024292,
"learning_rate": 1.9995557304594977e-05,
"loss": 1.5531,
"step": 63800
},
{
"epoch": 0.0475213065012717,
"grad_norm": 0.6341506242752075,
"learning_rate": 1.999554336667779e-05,
"loss": 1.5064,
"step": 63900
},
{
"epoch": 0.047595674743057724,
"grad_norm": 0.7291605472564697,
"learning_rate": 1.999552940693623e-05,
"loss": 1.4924,
"step": 64000
},
{
"epoch": 0.04767004298484375,
"grad_norm": 0.5496443510055542,
"learning_rate": 1.9995515425370317e-05,
"loss": 1.5276,
"step": 64100
},
{
"epoch": 0.04774441122662978,
"grad_norm": 0.49453896284103394,
"learning_rate": 1.9995501421980096e-05,
"loss": 1.4673,
"step": 64200
},
{
"epoch": 0.047818779468415805,
"grad_norm": 0.5134396553039551,
"learning_rate": 1.999548739676559e-05,
"loss": 1.5857,
"step": 64300
},
{
"epoch": 0.04789314771020184,
"grad_norm": 1.035983681678772,
"learning_rate": 1.9995473349726834e-05,
"loss": 1.4617,
"step": 64400
},
{
"epoch": 0.047967515951987866,
"grad_norm": 0.4110111594200134,
"learning_rate": 1.999545928086385e-05,
"loss": 1.599,
"step": 64500
},
{
"epoch": 0.04804188419377389,
"grad_norm": 0.6466584205627441,
"learning_rate": 1.999544519017668e-05,
"loss": 1.5212,
"step": 64600
},
{
"epoch": 0.04811625243555992,
"grad_norm": 0.501596212387085,
"learning_rate": 1.9995431077665345e-05,
"loss": 1.5215,
"step": 64700
},
{
"epoch": 0.04819062067734595,
"grad_norm": 0.547459065914154,
"learning_rate": 1.9995416943329882e-05,
"loss": 1.5414,
"step": 64800
},
{
"epoch": 0.048264988919131974,
"grad_norm": 0.9374314546585083,
"learning_rate": 1.999540278717032e-05,
"loss": 1.5104,
"step": 64900
},
{
"epoch": 0.048339357160918,
"grad_norm": 0.5237802267074585,
"learning_rate": 1.999538860918669e-05,
"loss": 1.5065,
"step": 65000
},
{
"epoch": 0.04841372540270403,
"grad_norm": 0.534058690071106,
"learning_rate": 1.9995374409379023e-05,
"loss": 1.5237,
"step": 65100
},
{
"epoch": 0.048488093644490055,
"grad_norm": 0.5253255367279053,
"learning_rate": 1.999536018774735e-05,
"loss": 1.5591,
"step": 65200
},
{
"epoch": 0.04856246188627608,
"grad_norm": 0.6362668871879578,
"learning_rate": 1.99953459442917e-05,
"loss": 1.5155,
"step": 65300
},
{
"epoch": 0.048636830128062115,
"grad_norm": 0.4244192838668823,
"learning_rate": 1.999533167901211e-05,
"loss": 1.5238,
"step": 65400
},
{
"epoch": 0.04871119836984814,
"grad_norm": 0.7062031030654907,
"learning_rate": 1.99953173919086e-05,
"loss": 1.637,
"step": 65500
},
{
"epoch": 0.04878556661163417,
"grad_norm": 0.5232000946998596,
"learning_rate": 1.9995303082981215e-05,
"loss": 1.4824,
"step": 65600
},
{
"epoch": 0.048859934853420196,
"grad_norm": 0.6280112862586975,
"learning_rate": 1.9995288752229976e-05,
"loss": 1.5882,
"step": 65700
},
{
"epoch": 0.04893430309520622,
"grad_norm": 1.1615891456604004,
"learning_rate": 1.999527439965492e-05,
"loss": 1.4839,
"step": 65800
},
{
"epoch": 0.04900867133699225,
"grad_norm": 0.6920228600502014,
"learning_rate": 1.9995260025256075e-05,
"loss": 1.5071,
"step": 65900
},
{
"epoch": 0.04908303957877828,
"grad_norm": 0.7031546235084534,
"learning_rate": 1.999524562903347e-05,
"loss": 1.4983,
"step": 66000
},
{
"epoch": 0.049157407820564304,
"grad_norm": 0.4306289553642273,
"learning_rate": 1.999523121098714e-05,
"loss": 1.519,
"step": 66100
},
{
"epoch": 0.04923177606235033,
"grad_norm": 0.533328652381897,
"learning_rate": 1.9995216771117123e-05,
"loss": 1.5628,
"step": 66200
},
{
"epoch": 0.049306144304136365,
"grad_norm": 0.6325706839561462,
"learning_rate": 1.999520230942344e-05,
"loss": 1.4369,
"step": 66300
},
{
"epoch": 0.04938051254592239,
"grad_norm": 0.43968090415000916,
"learning_rate": 1.9995187825906125e-05,
"loss": 1.4506,
"step": 66400
},
{
"epoch": 0.04945488078770842,
"grad_norm": 1.3659377098083496,
"learning_rate": 1.9995173320565217e-05,
"loss": 1.4786,
"step": 66500
},
{
"epoch": 0.049529249029494446,
"grad_norm": 0.7602143883705139,
"learning_rate": 1.9995158793400735e-05,
"loss": 1.5922,
"step": 66600
},
{
"epoch": 0.04960361727128047,
"grad_norm": 0.4866381287574768,
"learning_rate": 1.999514424441272e-05,
"loss": 1.5279,
"step": 66700
},
{
"epoch": 0.0496779855130665,
"grad_norm": 0.9451634287834167,
"learning_rate": 1.9995129673601203e-05,
"loss": 1.5125,
"step": 66800
},
{
"epoch": 0.04975235375485253,
"grad_norm": 0.49570247530937195,
"learning_rate": 1.999511508096621e-05,
"loss": 1.556,
"step": 66900
},
{
"epoch": 0.049826721996638554,
"grad_norm": 0.5554278492927551,
"learning_rate": 1.999510046650778e-05,
"loss": 1.5644,
"step": 67000
},
{
"epoch": 0.04990109023842458,
"grad_norm": 1.070966124534607,
"learning_rate": 1.9995085830225943e-05,
"loss": 1.4394,
"step": 67100
},
{
"epoch": 0.04997545848021061,
"grad_norm": 0.47984740138053894,
"learning_rate": 1.999507117212073e-05,
"loss": 1.5499,
"step": 67200
},
{
"epoch": 0.05004982672199664,
"grad_norm": 0.6557255983352661,
"learning_rate": 1.999505649219217e-05,
"loss": 1.5894,
"step": 67300
},
{
"epoch": 0.05012419496378267,
"grad_norm": 0.4384523332118988,
"learning_rate": 1.99950417904403e-05,
"loss": 1.5454,
"step": 67400
},
{
"epoch": 0.050198563205568696,
"grad_norm": 1.0821648836135864,
"learning_rate": 1.9995027066865148e-05,
"loss": 1.5872,
"step": 67500
},
{
"epoch": 0.05027293144735472,
"grad_norm": 0.8576905727386475,
"learning_rate": 1.9995012321466747e-05,
"loss": 1.5024,
"step": 67600
},
{
"epoch": 0.05034729968914075,
"grad_norm": 0.7904402613639832,
"learning_rate": 1.9994997554245136e-05,
"loss": 1.4259,
"step": 67700
},
{
"epoch": 0.050421667930926777,
"grad_norm": 1.2858697175979614,
"learning_rate": 1.9994982765200337e-05,
"loss": 1.5907,
"step": 67800
},
{
"epoch": 0.050496036172712803,
"grad_norm": 0.35552987456321716,
"learning_rate": 1.9994967954332388e-05,
"loss": 1.5593,
"step": 67900
},
{
"epoch": 0.05057040441449883,
"grad_norm": 0.5848758220672607,
"learning_rate": 1.999495312164132e-05,
"loss": 1.5435,
"step": 68000
},
{
"epoch": 0.05064477265628486,
"grad_norm": 1.5355236530303955,
"learning_rate": 1.999493826712717e-05,
"loss": 1.5301,
"step": 68100
},
{
"epoch": 0.050719140898070884,
"grad_norm": 0.595832109451294,
"learning_rate": 1.999492339078996e-05,
"loss": 1.5225,
"step": 68200
},
{
"epoch": 0.05079350913985692,
"grad_norm": 0.47388339042663574,
"learning_rate": 1.999490849262973e-05,
"loss": 1.5252,
"step": 68300
},
{
"epoch": 0.050867877381642945,
"grad_norm": 0.48052307963371277,
"learning_rate": 1.999489357264651e-05,
"loss": 1.5274,
"step": 68400
},
{
"epoch": 0.05094224562342897,
"grad_norm": 0.7523823380470276,
"learning_rate": 1.9994878630840334e-05,
"loss": 1.5485,
"step": 68500
},
{
"epoch": 0.051016613865215,
"grad_norm": 0.5487205982208252,
"learning_rate": 1.9994863667211237e-05,
"loss": 1.5851,
"step": 68600
},
{
"epoch": 0.051090982107001026,
"grad_norm": 0.899217963218689,
"learning_rate": 1.999484868175925e-05,
"loss": 1.5519,
"step": 68700
},
{
"epoch": 0.05116535034878705,
"grad_norm": 0.6217190623283386,
"learning_rate": 1.9994833674484398e-05,
"loss": 1.465,
"step": 68800
},
{
"epoch": 0.05123971859057308,
"grad_norm": 0.5816856026649475,
"learning_rate": 1.9994818645386725e-05,
"loss": 1.4822,
"step": 68900
},
{
"epoch": 0.05131408683235911,
"grad_norm": 0.5480476021766663,
"learning_rate": 1.999480359446626e-05,
"loss": 1.5958,
"step": 69000
},
{
"epoch": 0.051388455074145134,
"grad_norm": 0.6178867220878601,
"learning_rate": 1.9994788521723033e-05,
"loss": 1.4214,
"step": 69100
},
{
"epoch": 0.05146282331593117,
"grad_norm": 0.639522135257721,
"learning_rate": 1.999477342715708e-05,
"loss": 1.5462,
"step": 69200
},
{
"epoch": 0.051537191557717195,
"grad_norm": 0.8950421810150146,
"learning_rate": 1.9994758310768432e-05,
"loss": 1.5562,
"step": 69300
},
{
"epoch": 0.05161155979950322,
"grad_norm": 0.9787744283676147,
"learning_rate": 1.9994743172557123e-05,
"loss": 1.5684,
"step": 69400
},
{
"epoch": 0.05168592804128925,
"grad_norm": 0.34816843271255493,
"learning_rate": 1.999472801252319e-05,
"loss": 1.5693,
"step": 69500
},
{
"epoch": 0.051760296283075276,
"grad_norm": 0.8306708931922913,
"learning_rate": 1.9994712830666658e-05,
"loss": 1.6258,
"step": 69600
},
{
"epoch": 0.0518346645248613,
"grad_norm": 0.6776370406150818,
"learning_rate": 1.9994697626987562e-05,
"loss": 1.432,
"step": 69700
},
{
"epoch": 0.05190903276664733,
"grad_norm": 0.3862565755844116,
"learning_rate": 1.999468240148594e-05,
"loss": 1.5958,
"step": 69800
},
{
"epoch": 0.05198340100843336,
"grad_norm": 0.42656075954437256,
"learning_rate": 1.9994667154161826e-05,
"loss": 1.5192,
"step": 69900
},
{
"epoch": 0.052057769250219384,
"grad_norm": 0.7187511920928955,
"learning_rate": 1.9994651885015246e-05,
"loss": 1.4779,
"step": 70000
},
{
"epoch": 0.05213213749200541,
"grad_norm": 0.7468114495277405,
"learning_rate": 1.9994636594046237e-05,
"loss": 1.4672,
"step": 70100
},
{
"epoch": 0.052206505733791445,
"grad_norm": 0.5125714540481567,
"learning_rate": 1.9994621281254834e-05,
"loss": 1.5607,
"step": 70200
},
{
"epoch": 0.05228087397557747,
"grad_norm": 0.6202149987220764,
"learning_rate": 1.999460594664107e-05,
"loss": 1.5402,
"step": 70300
},
{
"epoch": 0.0523552422173635,
"grad_norm": 1.1004749536514282,
"learning_rate": 1.9994590590204974e-05,
"loss": 1.449,
"step": 70400
},
{
"epoch": 0.052429610459149525,
"grad_norm": 0.5230892896652222,
"learning_rate": 1.9994575211946588e-05,
"loss": 1.5675,
"step": 70500
},
{
"epoch": 0.05250397870093555,
"grad_norm": 0.4736848771572113,
"learning_rate": 1.9994559811865936e-05,
"loss": 1.4462,
"step": 70600
},
{
"epoch": 0.05257834694272158,
"grad_norm": 0.722038209438324,
"learning_rate": 1.9994544389963063e-05,
"loss": 1.5297,
"step": 70700
},
{
"epoch": 0.052652715184507606,
"grad_norm": 0.38121601939201355,
"learning_rate": 1.999452894623799e-05,
"loss": 1.4976,
"step": 70800
},
{
"epoch": 0.05272708342629363,
"grad_norm": 0.7381113767623901,
"learning_rate": 1.999451348069076e-05,
"loss": 1.6052,
"step": 70900
},
{
"epoch": 0.05280145166807966,
"grad_norm": 0.48022809624671936,
"learning_rate": 1.99944979933214e-05,
"loss": 1.5308,
"step": 71000
},
{
"epoch": 0.052875819909865694,
"grad_norm": 0.6746697425842285,
"learning_rate": 1.9994482484129952e-05,
"loss": 1.5776,
"step": 71100
},
{
"epoch": 0.05295018815165172,
"grad_norm": 1.0217593908309937,
"learning_rate": 1.999446695311644e-05,
"loss": 1.5206,
"step": 71200
},
{
"epoch": 0.05302455639343775,
"grad_norm": 0.9935411810874939,
"learning_rate": 1.999445140028091e-05,
"loss": 1.5362,
"step": 71300
},
{
"epoch": 0.053098924635223775,
"grad_norm": 0.6215861439704895,
"learning_rate": 1.9994435825623382e-05,
"loss": 1.5598,
"step": 71400
},
{
"epoch": 0.0531732928770098,
"grad_norm": 0.48527583479881287,
"learning_rate": 1.99944202291439e-05,
"loss": 1.5116,
"step": 71500
},
{
"epoch": 0.05324766111879583,
"grad_norm": 1.3961418867111206,
"learning_rate": 1.9994404610842496e-05,
"loss": 1.5574,
"step": 71600
},
{
"epoch": 0.053322029360581856,
"grad_norm": 1.0021171569824219,
"learning_rate": 1.9994388970719202e-05,
"loss": 1.5676,
"step": 71700
},
{
"epoch": 0.05339639760236788,
"grad_norm": 0.5348053574562073,
"learning_rate": 1.9994373308774052e-05,
"loss": 1.4911,
"step": 71800
},
{
"epoch": 0.05347076584415391,
"grad_norm": 0.5527310967445374,
"learning_rate": 1.9994357625007087e-05,
"loss": 1.5595,
"step": 71900
},
{
"epoch": 0.05354513408593994,
"grad_norm": 1.1981103420257568,
"learning_rate": 1.999434191941833e-05,
"loss": 1.5239,
"step": 72000
},
{
"epoch": 0.05361950232772597,
"grad_norm": 0.507123589515686,
"learning_rate": 1.999432619200782e-05,
"loss": 1.5324,
"step": 72100
},
{
"epoch": 0.053693870569512,
"grad_norm": 0.4210796356201172,
"learning_rate": 1.99943104427756e-05,
"loss": 1.5681,
"step": 72200
},
{
"epoch": 0.053768238811298025,
"grad_norm": 0.574341893196106,
"learning_rate": 1.999429467172169e-05,
"loss": 1.6575,
"step": 72300
},
{
"epoch": 0.05384260705308405,
"grad_norm": 0.5402580499649048,
"learning_rate": 1.9994278878846135e-05,
"loss": 1.5097,
"step": 72400
},
{
"epoch": 0.05391697529487008,
"grad_norm": 0.5868122577667236,
"learning_rate": 1.9994263064148964e-05,
"loss": 1.5158,
"step": 72500
},
{
"epoch": 0.053991343536656106,
"grad_norm": 0.5461186170578003,
"learning_rate": 1.9994247227630216e-05,
"loss": 1.4676,
"step": 72600
},
{
"epoch": 0.05406571177844213,
"grad_norm": 0.56854248046875,
"learning_rate": 1.999423136928992e-05,
"loss": 1.5803,
"step": 72700
},
{
"epoch": 0.05414008002022816,
"grad_norm": 0.5925450325012207,
"learning_rate": 1.9994215489128113e-05,
"loss": 1.5622,
"step": 72800
},
{
"epoch": 0.05421444826201419,
"grad_norm": 0.9310332536697388,
"learning_rate": 1.999419958714483e-05,
"loss": 1.5552,
"step": 72900
},
{
"epoch": 0.05428881650380022,
"grad_norm": 0.6535036563873291,
"learning_rate": 1.9994183663340106e-05,
"loss": 1.5079,
"step": 73000
},
{
"epoch": 0.05436318474558625,
"grad_norm": 0.759397029876709,
"learning_rate": 1.9994167717713976e-05,
"loss": 1.4763,
"step": 73100
},
{
"epoch": 0.054437552987372274,
"grad_norm": 0.8042114973068237,
"learning_rate": 1.999415175026648e-05,
"loss": 1.5085,
"step": 73200
},
{
"epoch": 0.0545119212291583,
"grad_norm": 0.5362099409103394,
"learning_rate": 1.999413576099764e-05,
"loss": 1.4936,
"step": 73300
},
{
"epoch": 0.05458628947094433,
"grad_norm": 0.5755407214164734,
"learning_rate": 1.9994119749907502e-05,
"loss": 1.5056,
"step": 73400
},
{
"epoch": 0.054660657712730355,
"grad_norm": 0.595206081867218,
"learning_rate": 1.9994103716996097e-05,
"loss": 1.4753,
"step": 73500
},
{
"epoch": 0.05473502595451638,
"grad_norm": 0.8156918287277222,
"learning_rate": 1.9994087662263457e-05,
"loss": 1.5586,
"step": 73600
},
{
"epoch": 0.05480939419630241,
"grad_norm": 0.739098310470581,
"learning_rate": 1.999407158570962e-05,
"loss": 1.5223,
"step": 73700
},
{
"epoch": 0.054883762438088436,
"grad_norm": 1.348789095878601,
"learning_rate": 1.999405548733463e-05,
"loss": 1.5179,
"step": 73800
},
{
"epoch": 0.05495813067987446,
"grad_norm": 0.671525776386261,
"learning_rate": 1.99940393671385e-05,
"loss": 1.5044,
"step": 73900
},
{
"epoch": 0.0550324989216605,
"grad_norm": 1.034043312072754,
"learning_rate": 1.9994023225121288e-05,
"loss": 1.4223,
"step": 74000
},
{
"epoch": 0.055106867163446524,
"grad_norm": 1.1060287952423096,
"learning_rate": 1.9994007061283018e-05,
"loss": 1.5573,
"step": 74100
},
{
"epoch": 0.05518123540523255,
"grad_norm": 0.8618998527526855,
"learning_rate": 1.999399087562373e-05,
"loss": 1.4898,
"step": 74200
},
{
"epoch": 0.05525560364701858,
"grad_norm": 0.714076817035675,
"learning_rate": 1.9993974668143452e-05,
"loss": 1.451,
"step": 74300
},
{
"epoch": 0.055329971888804605,
"grad_norm": 0.4572731554508209,
"learning_rate": 1.9993958438842224e-05,
"loss": 1.5303,
"step": 74400
},
{
"epoch": 0.05540434013059063,
"grad_norm": 0.496499627828598,
"learning_rate": 1.9993942187720082e-05,
"loss": 1.6219,
"step": 74500
},
{
"epoch": 0.05547870837237666,
"grad_norm": 0.4714408218860626,
"learning_rate": 1.9993925914777064e-05,
"loss": 1.5501,
"step": 74600
},
{
"epoch": 0.055553076614162686,
"grad_norm": 0.5283282995223999,
"learning_rate": 1.9993909620013203e-05,
"loss": 1.5221,
"step": 74700
},
{
"epoch": 0.05562744485594871,
"grad_norm": 0.8781616687774658,
"learning_rate": 1.999389330342853e-05,
"loss": 1.5478,
"step": 74800
},
{
"epoch": 0.05570181309773475,
"grad_norm": 0.5995525121688843,
"learning_rate": 1.9993876965023084e-05,
"loss": 1.5069,
"step": 74900
},
{
"epoch": 0.055776181339520774,
"grad_norm": 0.533664882183075,
"learning_rate": 1.9993860604796905e-05,
"loss": 1.514,
"step": 75000
},
{
"epoch": 0.0558505495813068,
"grad_norm": 1.012466311454773,
"learning_rate": 1.9993844222750023e-05,
"loss": 1.473,
"step": 75100
},
{
"epoch": 0.05592491782309283,
"grad_norm": 0.7862614393234253,
"learning_rate": 1.9993827818882473e-05,
"loss": 1.4832,
"step": 75200
},
{
"epoch": 0.055999286064878855,
"grad_norm": 0.7203556299209595,
"learning_rate": 1.9993811393194302e-05,
"loss": 1.5157,
"step": 75300
},
{
"epoch": 0.05607365430666488,
"grad_norm": 1.1218525171279907,
"learning_rate": 1.9993794945685528e-05,
"loss": 1.4169,
"step": 75400
},
{
"epoch": 0.05614802254845091,
"grad_norm": 0.46560999751091003,
"learning_rate": 1.99937784763562e-05,
"loss": 1.4688,
"step": 75500
},
{
"epoch": 0.056222390790236935,
"grad_norm": 0.9627271294593811,
"learning_rate": 1.999376198520635e-05,
"loss": 1.5489,
"step": 75600
},
{
"epoch": 0.05629675903202296,
"grad_norm": 0.9937626719474792,
"learning_rate": 1.9993745472236018e-05,
"loss": 1.5759,
"step": 75700
},
{
"epoch": 0.05637112727380899,
"grad_norm": 0.6520542502403259,
"learning_rate": 1.9993728937445232e-05,
"loss": 1.4653,
"step": 75800
},
{
"epoch": 0.05644549551559502,
"grad_norm": 1.1701862812042236,
"learning_rate": 1.9993712380834034e-05,
"loss": 1.4875,
"step": 75900
},
{
"epoch": 0.05651986375738105,
"grad_norm": 0.9439906477928162,
"learning_rate": 1.999369580240246e-05,
"loss": 1.5524,
"step": 76000
},
{
"epoch": 0.05659423199916708,
"grad_norm": 1.1177873611450195,
"learning_rate": 1.9993679202150543e-05,
"loss": 1.558,
"step": 76100
},
{
"epoch": 0.056668600240953104,
"grad_norm": 0.4650721549987793,
"learning_rate": 1.9993662580078317e-05,
"loss": 1.5035,
"step": 76200
},
{
"epoch": 0.05674296848273913,
"grad_norm": 0.5230388045310974,
"learning_rate": 1.999364593618583e-05,
"loss": 1.5027,
"step": 76300
},
{
"epoch": 0.05681733672452516,
"grad_norm": 0.6694977879524231,
"learning_rate": 1.9993629270473108e-05,
"loss": 1.4642,
"step": 76400
},
{
"epoch": 0.056891704966311185,
"grad_norm": 0.6857712268829346,
"learning_rate": 1.999361258294019e-05,
"loss": 1.6462,
"step": 76500
},
{
"epoch": 0.05696607320809721,
"grad_norm": 0.708351731300354,
"learning_rate": 1.9993595873587112e-05,
"loss": 1.4773,
"step": 76600
},
{
"epoch": 0.05704044144988324,
"grad_norm": 0.451820969581604,
"learning_rate": 1.999357914241391e-05,
"loss": 1.5512,
"step": 76700
},
{
"epoch": 0.05711480969166927,
"grad_norm": 0.9653975963592529,
"learning_rate": 1.9993562389420623e-05,
"loss": 1.5231,
"step": 76800
},
{
"epoch": 0.0571891779334553,
"grad_norm": 1.0675276517868042,
"learning_rate": 1.9993545614607287e-05,
"loss": 1.5519,
"step": 76900
},
{
"epoch": 0.05726354617524133,
"grad_norm": 0.6132591366767883,
"learning_rate": 1.9993528817973938e-05,
"loss": 1.5634,
"step": 77000
},
{
"epoch": 0.057337914417027354,
"grad_norm": 0.6499157547950745,
"learning_rate": 1.999351199952061e-05,
"loss": 1.4993,
"step": 77100
},
{
"epoch": 0.05741228265881338,
"grad_norm": 0.8147251605987549,
"learning_rate": 1.999349515924734e-05,
"loss": 1.5336,
"step": 77200
},
{
"epoch": 0.05748665090059941,
"grad_norm": 0.601445198059082,
"learning_rate": 1.9993478297154175e-05,
"loss": 1.5546,
"step": 77300
},
{
"epoch": 0.057561019142385435,
"grad_norm": 0.7941200137138367,
"learning_rate": 1.9993461413241138e-05,
"loss": 1.5751,
"step": 77400
},
{
"epoch": 0.05763538738417146,
"grad_norm": 0.5446432828903198,
"learning_rate": 1.9993444507508272e-05,
"loss": 1.5638,
"step": 77500
},
{
"epoch": 0.05770975562595749,
"grad_norm": 1.241955280303955,
"learning_rate": 1.9993427579955617e-05,
"loss": 1.5663,
"step": 77600
},
{
"epoch": 0.057784123867743516,
"grad_norm": 0.47920486330986023,
"learning_rate": 1.99934106305832e-05,
"loss": 1.4326,
"step": 77700
},
{
"epoch": 0.05785849210952955,
"grad_norm": 0.8999041318893433,
"learning_rate": 1.9993393659391068e-05,
"loss": 1.5711,
"step": 77800
},
{
"epoch": 0.05793286035131558,
"grad_norm": 0.6789896488189697,
"learning_rate": 1.9993376666379256e-05,
"loss": 1.5342,
"step": 77900
},
{
"epoch": 0.058007228593101604,
"grad_norm": 0.5044109225273132,
"learning_rate": 1.9993359651547798e-05,
"loss": 1.4873,
"step": 78000
},
{
"epoch": 0.05808159683488763,
"grad_norm": 0.7116490006446838,
"learning_rate": 1.9993342614896733e-05,
"loss": 1.453,
"step": 78100
},
{
"epoch": 0.05815596507667366,
"grad_norm": 0.5207152962684631,
"learning_rate": 1.9993325556426096e-05,
"loss": 1.4711,
"step": 78200
},
{
"epoch": 0.058230333318459684,
"grad_norm": 0.8057217001914978,
"learning_rate": 1.999330847613593e-05,
"loss": 1.5021,
"step": 78300
},
{
"epoch": 0.05830470156024571,
"grad_norm": 1.1154026985168457,
"learning_rate": 1.9993291374026266e-05,
"loss": 1.4475,
"step": 78400
},
{
"epoch": 0.05837906980203174,
"grad_norm": 0.4721396267414093,
"learning_rate": 1.9993274250097146e-05,
"loss": 1.5285,
"step": 78500
},
{
"epoch": 0.058453438043817765,
"grad_norm": 1.0618817806243896,
"learning_rate": 1.9993257104348604e-05,
"loss": 1.5323,
"step": 78600
},
{
"epoch": 0.0585278062856038,
"grad_norm": 1.1249905824661255,
"learning_rate": 1.999323993678068e-05,
"loss": 1.5252,
"step": 78700
},
{
"epoch": 0.058602174527389826,
"grad_norm": 0.48599275946617126,
"learning_rate": 1.999322274739341e-05,
"loss": 1.5124,
"step": 78800
},
{
"epoch": 0.05867654276917585,
"grad_norm": 0.5065784454345703,
"learning_rate": 1.999320553618683e-05,
"loss": 1.5858,
"step": 78900
},
{
"epoch": 0.05875091101096188,
"grad_norm": 0.854963481426239,
"learning_rate": 1.999318830316098e-05,
"loss": 1.513,
"step": 79000
},
{
"epoch": 0.05882527925274791,
"grad_norm": 0.556955099105835,
"learning_rate": 1.9993171048315895e-05,
"loss": 1.514,
"step": 79100
},
{
"epoch": 0.058899647494533934,
"grad_norm": 0.6691248416900635,
"learning_rate": 1.9993153771651618e-05,
"loss": 1.4574,
"step": 79200
},
{
"epoch": 0.05897401573631996,
"grad_norm": 0.5654352903366089,
"learning_rate": 1.999313647316818e-05,
"loss": 1.5046,
"step": 79300
},
{
"epoch": 0.05904838397810599,
"grad_norm": 0.9016973972320557,
"learning_rate": 1.9993119152865624e-05,
"loss": 1.5465,
"step": 79400
},
{
"epoch": 0.059122752219892015,
"grad_norm": 0.4756191670894623,
"learning_rate": 1.9993101810743985e-05,
"loss": 1.4944,
"step": 79500
},
{
"epoch": 0.05919712046167804,
"grad_norm": 0.44962599873542786,
"learning_rate": 1.9993084446803303e-05,
"loss": 1.4853,
"step": 79600
},
{
"epoch": 0.059271488703464076,
"grad_norm": 0.5768176913261414,
"learning_rate": 1.9993067061043614e-05,
"loss": 1.5246,
"step": 79700
},
{
"epoch": 0.0593458569452501,
"grad_norm": 0.6383886933326721,
"learning_rate": 1.9993049653464957e-05,
"loss": 1.5407,
"step": 79800
},
{
"epoch": 0.05942022518703613,
"grad_norm": 0.5047423243522644,
"learning_rate": 1.999303222406737e-05,
"loss": 1.5593,
"step": 79900
},
{
"epoch": 0.05949459342882216,
"grad_norm": 0.5224947333335876,
"learning_rate": 1.999301477285089e-05,
"loss": 1.5501,
"step": 80000
},
{
"epoch": 0.059568961670608184,
"grad_norm": 0.8568351864814758,
"learning_rate": 1.9992997299815557e-05,
"loss": 1.5291,
"step": 80100
},
{
"epoch": 0.05964332991239421,
"grad_norm": 0.5065781474113464,
"learning_rate": 1.9992979804961406e-05,
"loss": 1.4743,
"step": 80200
},
{
"epoch": 0.05971769815418024,
"grad_norm": 0.7506331205368042,
"learning_rate": 1.999296228828848e-05,
"loss": 1.575,
"step": 80300
},
{
"epoch": 0.059792066395966265,
"grad_norm": 0.7313674092292786,
"learning_rate": 1.999294474979681e-05,
"loss": 1.4892,
"step": 80400
},
{
"epoch": 0.05986643463775229,
"grad_norm": 0.6475706100463867,
"learning_rate": 1.999292718948644e-05,
"loss": 1.4716,
"step": 80500
},
{
"epoch": 0.05994080287953832,
"grad_norm": 0.4502275586128235,
"learning_rate": 1.999290960735741e-05,
"loss": 1.5449,
"step": 80600
},
{
"epoch": 0.06001517112132435,
"grad_norm": 0.7036411762237549,
"learning_rate": 1.9992892003409753e-05,
"loss": 1.4786,
"step": 80700
},
{
"epoch": 0.06008953936311038,
"grad_norm": 0.5732350945472717,
"learning_rate": 1.999287437764351e-05,
"loss": 1.5548,
"step": 80800
},
{
"epoch": 0.060163907604896406,
"grad_norm": 0.6757441759109497,
"learning_rate": 1.999285673005872e-05,
"loss": 1.4276,
"step": 80900
},
{
"epoch": 0.06023827584668243,
"grad_norm": 0.8502363562583923,
"learning_rate": 1.999283906065542e-05,
"loss": 1.5078,
"step": 81000
},
{
"epoch": 0.06031264408846846,
"grad_norm": 0.9248318672180176,
"learning_rate": 1.9992821369433654e-05,
"loss": 1.5352,
"step": 81100
},
{
"epoch": 0.06038701233025449,
"grad_norm": 0.3702896535396576,
"learning_rate": 1.999280365639345e-05,
"loss": 1.5567,
"step": 81200
},
{
"epoch": 0.060461380572040514,
"grad_norm": 0.8454656004905701,
"learning_rate": 1.9992785921534853e-05,
"loss": 1.5327,
"step": 81300
},
{
"epoch": 0.06053574881382654,
"grad_norm": 1.452540397644043,
"learning_rate": 1.9992768164857906e-05,
"loss": 1.473,
"step": 81400
},
{
"epoch": 0.06061011705561257,
"grad_norm": 0.5796297192573547,
"learning_rate": 1.999275038636264e-05,
"loss": 1.4551,
"step": 81500
},
{
"epoch": 0.0606844852973986,
"grad_norm": 0.5252229571342468,
"learning_rate": 1.9992732586049096e-05,
"loss": 1.4727,
"step": 81600
},
{
"epoch": 0.06075885353918463,
"grad_norm": 1.0359326601028442,
"learning_rate": 1.999271476391732e-05,
"loss": 1.5017,
"step": 81700
},
{
"epoch": 0.060833221780970656,
"grad_norm": 0.49495527148246765,
"learning_rate": 1.9992696919967337e-05,
"loss": 1.5521,
"step": 81800
},
{
"epoch": 0.06090759002275668,
"grad_norm": 0.548267662525177,
"learning_rate": 1.9992679054199197e-05,
"loss": 1.508,
"step": 81900
},
{
"epoch": 0.06098195826454271,
"grad_norm": 0.5110555291175842,
"learning_rate": 1.999266116661294e-05,
"loss": 1.5644,
"step": 82000
},
{
"epoch": 0.06105632650632874,
"grad_norm": 0.803193211555481,
"learning_rate": 1.9992643257208595e-05,
"loss": 1.4233,
"step": 82100
},
{
"epoch": 0.061130694748114764,
"grad_norm": 1.2153749465942383,
"learning_rate": 1.9992625325986207e-05,
"loss": 1.5741,
"step": 82200
},
{
"epoch": 0.06120506298990079,
"grad_norm": 0.48382624983787537,
"learning_rate": 1.999260737294582e-05,
"loss": 1.5272,
"step": 82300
},
{
"epoch": 0.06127943123168682,
"grad_norm": 0.4789665937423706,
"learning_rate": 1.9992589398087466e-05,
"loss": 1.4757,
"step": 82400
},
{
"epoch": 0.061353799473472845,
"grad_norm": 0.5505409240722656,
"learning_rate": 1.9992571401411183e-05,
"loss": 1.4968,
"step": 82500
},
{
"epoch": 0.06142816771525888,
"grad_norm": 0.7146855592727661,
"learning_rate": 1.999255338291702e-05,
"loss": 1.4641,
"step": 82600
},
{
"epoch": 0.061502535957044906,
"grad_norm": 1.2916581630706787,
"learning_rate": 1.9992535342605008e-05,
"loss": 1.4884,
"step": 82700
},
{
"epoch": 0.06157690419883093,
"grad_norm": 0.5506526231765747,
"learning_rate": 1.9992517280475186e-05,
"loss": 1.4925,
"step": 82800
},
{
"epoch": 0.06165127244061696,
"grad_norm": 1.0735949277877808,
"learning_rate": 1.9992499196527598e-05,
"loss": 1.456,
"step": 82900
},
{
"epoch": 0.06172564068240299,
"grad_norm": 0.5877838134765625,
"learning_rate": 1.9992481090762284e-05,
"loss": 1.4362,
"step": 83000
},
{
"epoch": 0.061800008924189014,
"grad_norm": 0.6066355109214783,
"learning_rate": 1.9992462963179275e-05,
"loss": 1.5472,
"step": 83100
},
{
"epoch": 0.06187437716597504,
"grad_norm": 0.5328536629676819,
"learning_rate": 1.9992444813778622e-05,
"loss": 1.5712,
"step": 83200
},
{
"epoch": 0.06194874540776107,
"grad_norm": 0.685464084148407,
"learning_rate": 1.9992426642560356e-05,
"loss": 1.531,
"step": 83300
},
{
"epoch": 0.062023113649547094,
"grad_norm": 0.6651979684829712,
"learning_rate": 1.999240844952452e-05,
"loss": 1.5326,
"step": 83400
},
{
"epoch": 0.06209748189133313,
"grad_norm": 0.9877690076828003,
"learning_rate": 1.9992390234671157e-05,
"loss": 1.5223,
"step": 83500
},
{
"epoch": 0.062171850133119155,
"grad_norm": 0.4471887946128845,
"learning_rate": 1.9992371998000303e-05,
"loss": 1.5093,
"step": 83600
},
{
"epoch": 0.06224621837490518,
"grad_norm": 0.8113996386528015,
"learning_rate": 1.9992353739511994e-05,
"loss": 1.4959,
"step": 83700
},
{
"epoch": 0.06232058661669121,
"grad_norm": 0.5820923447608948,
"learning_rate": 1.999233545920628e-05,
"loss": 1.5134,
"step": 83800
},
{
"epoch": 0.062394954858477236,
"grad_norm": 0.623708188533783,
"learning_rate": 1.999231715708319e-05,
"loss": 1.4944,
"step": 83900
},
{
"epoch": 0.06246932310026326,
"grad_norm": 0.5685898065567017,
"learning_rate": 1.9992298833142772e-05,
"loss": 1.5297,
"step": 84000
},
{
"epoch": 0.06254369134204929,
"grad_norm": 0.5108596682548523,
"learning_rate": 1.999228048738506e-05,
"loss": 1.4644,
"step": 84100
},
{
"epoch": 0.06261805958383532,
"grad_norm": 0.636935293674469,
"learning_rate": 1.99922621198101e-05,
"loss": 1.5105,
"step": 84200
},
{
"epoch": 0.06269242782562134,
"grad_norm": 0.7226575613021851,
"learning_rate": 1.9992243730417926e-05,
"loss": 1.5828,
"step": 84300
},
{
"epoch": 0.06276679606740737,
"grad_norm": 0.7858364582061768,
"learning_rate": 1.9992225319208584e-05,
"loss": 1.5216,
"step": 84400
},
{
"epoch": 0.0628411643091934,
"grad_norm": 0.5035095810890198,
"learning_rate": 1.999220688618211e-05,
"loss": 1.5342,
"step": 84500
},
{
"epoch": 0.06291553255097942,
"grad_norm": 0.515177845954895,
"learning_rate": 1.9992188431338547e-05,
"loss": 1.5137,
"step": 84600
},
{
"epoch": 0.06298990079276545,
"grad_norm": 0.6190256476402283,
"learning_rate": 1.9992169954677933e-05,
"loss": 1.4787,
"step": 84700
},
{
"epoch": 0.06306426903455148,
"grad_norm": 0.42270639538764954,
"learning_rate": 1.999215145620031e-05,
"loss": 1.5532,
"step": 84800
},
{
"epoch": 0.06313863727633752,
"grad_norm": 0.9928336143493652,
"learning_rate": 1.999213293590572e-05,
"loss": 1.5139,
"step": 84900
},
{
"epoch": 0.06321300551812355,
"grad_norm": 0.6874875426292419,
"learning_rate": 1.99921143937942e-05,
"loss": 1.5843,
"step": 85000
},
{
"epoch": 0.06328737375990957,
"grad_norm": 1.4590458869934082,
"learning_rate": 1.9992095829865786e-05,
"loss": 1.4197,
"step": 85100
},
{
"epoch": 0.0633617420016956,
"grad_norm": 0.7171260714530945,
"learning_rate": 1.999207724412053e-05,
"loss": 1.5444,
"step": 85200
},
{
"epoch": 0.06343611024348163,
"grad_norm": 0.7907926440238953,
"learning_rate": 1.9992058636558466e-05,
"loss": 1.4923,
"step": 85300
},
{
"epoch": 0.06351047848526765,
"grad_norm": 0.5244536399841309,
"learning_rate": 1.9992040007179635e-05,
"loss": 1.4754,
"step": 85400
},
{
"epoch": 0.06358484672705368,
"grad_norm": 0.7662790417671204,
"learning_rate": 1.999202135598408e-05,
"loss": 1.4759,
"step": 85500
},
{
"epoch": 0.06365921496883971,
"grad_norm": 0.5479734539985657,
"learning_rate": 1.9992002682971837e-05,
"loss": 1.5631,
"step": 85600
},
{
"epoch": 0.06373358321062574,
"grad_norm": 0.5378610491752625,
"learning_rate": 1.9991983988142952e-05,
"loss": 1.4574,
"step": 85700
},
{
"epoch": 0.06380795145241176,
"grad_norm": 0.5130261182785034,
"learning_rate": 1.9991965271497463e-05,
"loss": 1.4096,
"step": 85800
},
{
"epoch": 0.06388231969419779,
"grad_norm": 0.3913695812225342,
"learning_rate": 1.9991946533035408e-05,
"loss": 1.4662,
"step": 85900
},
{
"epoch": 0.06395668793598382,
"grad_norm": 0.6080545783042908,
"learning_rate": 1.9991927772756833e-05,
"loss": 1.5168,
"step": 86000
},
{
"epoch": 0.06403105617776984,
"grad_norm": 0.4266175627708435,
"learning_rate": 1.9991908990661782e-05,
"loss": 1.5904,
"step": 86100
},
{
"epoch": 0.06410542441955587,
"grad_norm": 0.7378482818603516,
"learning_rate": 1.9991890186750284e-05,
"loss": 1.5445,
"step": 86200
},
{
"epoch": 0.0641797926613419,
"grad_norm": 0.4735576808452606,
"learning_rate": 1.999187136102239e-05,
"loss": 1.5184,
"step": 86300
},
{
"epoch": 0.06425416090312792,
"grad_norm": 1.1487786769866943,
"learning_rate": 1.999185251347814e-05,
"loss": 1.5623,
"step": 86400
},
{
"epoch": 0.06432852914491395,
"grad_norm": 0.6790218353271484,
"learning_rate": 1.9991833644117573e-05,
"loss": 1.4743,
"step": 86500
},
{
"epoch": 0.06440289738669998,
"grad_norm": 0.6615635752677917,
"learning_rate": 1.9991814752940728e-05,
"loss": 1.5226,
"step": 86600
},
{
"epoch": 0.064477265628486,
"grad_norm": 0.5001498460769653,
"learning_rate": 1.9991795839947652e-05,
"loss": 1.5801,
"step": 86700
},
{
"epoch": 0.06455163387027203,
"grad_norm": 0.880649983882904,
"learning_rate": 1.9991776905138382e-05,
"loss": 1.5611,
"step": 86800
},
{
"epoch": 0.06462600211205807,
"grad_norm": 0.6761185526847839,
"learning_rate": 1.9991757948512962e-05,
"loss": 1.5622,
"step": 86900
},
{
"epoch": 0.0647003703538441,
"grad_norm": 0.48481419682502747,
"learning_rate": 1.999173897007143e-05,
"loss": 1.4518,
"step": 87000
},
{
"epoch": 0.06477473859563013,
"grad_norm": 0.5701479315757751,
"learning_rate": 1.999171996981383e-05,
"loss": 1.5306,
"step": 87100
},
{
"epoch": 0.06484910683741615,
"grad_norm": 0.7284945845603943,
"learning_rate": 1.99917009477402e-05,
"loss": 1.4337,
"step": 87200
},
{
"epoch": 0.06492347507920218,
"grad_norm": 0.7202057242393494,
"learning_rate": 1.999168190385059e-05,
"loss": 1.5605,
"step": 87300
},
{
"epoch": 0.06499784332098821,
"grad_norm": 0.4802098274230957,
"learning_rate": 1.9991662838145034e-05,
"loss": 1.5428,
"step": 87400
},
{
"epoch": 0.06507221156277423,
"grad_norm": 0.5068057775497437,
"learning_rate": 1.9991643750623574e-05,
"loss": 1.4441,
"step": 87500
},
{
"epoch": 0.06514657980456026,
"grad_norm": 0.8798725605010986,
"learning_rate": 1.9991624641286255e-05,
"loss": 1.5766,
"step": 87600
},
{
"epoch": 0.06522094804634629,
"grad_norm": 0.49363136291503906,
"learning_rate": 1.9991605510133115e-05,
"loss": 1.6196,
"step": 87700
},
{
"epoch": 0.06529531628813232,
"grad_norm": 0.5400691628456116,
"learning_rate": 1.99915863571642e-05,
"loss": 1.5392,
"step": 87800
},
{
"epoch": 0.06536968452991834,
"grad_norm": 0.5299246311187744,
"learning_rate": 1.9991567182379546e-05,
"loss": 1.5645,
"step": 87900
},
{
"epoch": 0.06544405277170437,
"grad_norm": 0.6503016352653503,
"learning_rate": 1.9991547985779202e-05,
"loss": 1.4476,
"step": 88000
},
{
"epoch": 0.0655184210134904,
"grad_norm": 0.5769862532615662,
"learning_rate": 1.9991528767363207e-05,
"loss": 1.5248,
"step": 88100
},
{
"epoch": 0.06559278925527642,
"grad_norm": 0.8062888383865356,
"learning_rate": 1.99915095271316e-05,
"loss": 1.6394,
"step": 88200
},
{
"epoch": 0.06566715749706245,
"grad_norm": 0.4004135727882385,
"learning_rate": 1.999149026508443e-05,
"loss": 1.5317,
"step": 88300
},
{
"epoch": 0.06574152573884848,
"grad_norm": 0.6382884383201599,
"learning_rate": 1.9991470981221727e-05,
"loss": 1.5602,
"step": 88400
},
{
"epoch": 0.0658158939806345,
"grad_norm": 0.535750150680542,
"learning_rate": 1.9991451675543544e-05,
"loss": 1.5113,
"step": 88500
},
{
"epoch": 0.06589026222242053,
"grad_norm": 1.1604392528533936,
"learning_rate": 1.999143234804992e-05,
"loss": 1.4996,
"step": 88600
},
{
"epoch": 0.06596463046420656,
"grad_norm": 0.7842492461204529,
"learning_rate": 1.99914129987409e-05,
"loss": 1.5054,
"step": 88700
},
{
"epoch": 0.0660389987059926,
"grad_norm": 0.7460685968399048,
"learning_rate": 1.999139362761652e-05,
"loss": 1.433,
"step": 88800
},
{
"epoch": 0.06611336694777863,
"grad_norm": 0.7984693050384521,
"learning_rate": 1.9991374234676826e-05,
"loss": 1.5551,
"step": 88900
},
{
"epoch": 0.06618773518956465,
"grad_norm": 0.6733551621437073,
"learning_rate": 1.999135481992186e-05,
"loss": 1.4334,
"step": 89000
},
{
"epoch": 0.06626210343135068,
"grad_norm": 0.8035016059875488,
"learning_rate": 1.999133538335166e-05,
"loss": 1.4872,
"step": 89100
},
{
"epoch": 0.06633647167313671,
"grad_norm": 0.4339046776294708,
"learning_rate": 1.9991315924966277e-05,
"loss": 1.4869,
"step": 89200
},
{
"epoch": 0.06641083991492273,
"grad_norm": 0.6680594086647034,
"learning_rate": 1.9991296444765747e-05,
"loss": 1.5103,
"step": 89300
},
{
"epoch": 0.06648520815670876,
"grad_norm": 0.697487473487854,
"learning_rate": 1.9991276942750117e-05,
"loss": 1.4239,
"step": 89400
},
{
"epoch": 0.06655957639849479,
"grad_norm": 0.587734043598175,
"learning_rate": 1.9991257418919424e-05,
"loss": 1.5856,
"step": 89500
},
{
"epoch": 0.06663394464028081,
"grad_norm": 0.8574571013450623,
"learning_rate": 1.999123787327372e-05,
"loss": 1.4818,
"step": 89600
},
{
"epoch": 0.06670831288206684,
"grad_norm": 1.0861676931381226,
"learning_rate": 1.9991218305813035e-05,
"loss": 1.4883,
"step": 89700
},
{
"epoch": 0.06678268112385287,
"grad_norm": 1.0139306783676147,
"learning_rate": 1.9991198716537422e-05,
"loss": 1.5099,
"step": 89800
},
{
"epoch": 0.0668570493656389,
"grad_norm": 0.6741511225700378,
"learning_rate": 1.999117910544692e-05,
"loss": 1.4746,
"step": 89900
},
{
"epoch": 0.06693141760742492,
"grad_norm": 0.9702801704406738,
"learning_rate": 1.999115947254157e-05,
"loss": 1.5166,
"step": 90000
},
{
"epoch": 0.06700578584921095,
"grad_norm": 0.7757803797721863,
"learning_rate": 1.9991139817821416e-05,
"loss": 1.5031,
"step": 90100
},
{
"epoch": 0.06708015409099698,
"grad_norm": 0.7200698256492615,
"learning_rate": 1.9991120141286502e-05,
"loss": 1.5834,
"step": 90200
},
{
"epoch": 0.067154522332783,
"grad_norm": 0.7415780425071716,
"learning_rate": 1.999110044293687e-05,
"loss": 1.5689,
"step": 90300
},
{
"epoch": 0.06722889057456903,
"grad_norm": 0.5777677297592163,
"learning_rate": 1.9991080722772564e-05,
"loss": 1.5139,
"step": 90400
},
{
"epoch": 0.06730325881635506,
"grad_norm": 0.6991866827011108,
"learning_rate": 1.999106098079363e-05,
"loss": 1.5073,
"step": 90500
},
{
"epoch": 0.06737762705814108,
"grad_norm": 0.6112390160560608,
"learning_rate": 1.9991041217000105e-05,
"loss": 1.4773,
"step": 90600
},
{
"epoch": 0.06745199529992713,
"grad_norm": 0.8287676572799683,
"learning_rate": 1.9991021431392033e-05,
"loss": 1.5425,
"step": 90700
},
{
"epoch": 0.06752636354171315,
"grad_norm": 0.8582881689071655,
"learning_rate": 1.999100162396946e-05,
"loss": 1.5581,
"step": 90800
},
{
"epoch": 0.06760073178349918,
"grad_norm": 0.5585276484489441,
"learning_rate": 1.999098179473243e-05,
"loss": 1.5015,
"step": 90900
},
{
"epoch": 0.0676751000252852,
"grad_norm": 0.4237435460090637,
"learning_rate": 1.9990961943680984e-05,
"loss": 1.523,
"step": 91000
},
{
"epoch": 0.06774946826707123,
"grad_norm": 0.5455594658851624,
"learning_rate": 1.999094207081517e-05,
"loss": 1.5448,
"step": 91100
},
{
"epoch": 0.06782383650885726,
"grad_norm": 0.48855817317962646,
"learning_rate": 1.999092217613502e-05,
"loss": 1.4535,
"step": 91200
},
{
"epoch": 0.06789820475064329,
"grad_norm": 0.5199916958808899,
"learning_rate": 1.999090225964059e-05,
"loss": 1.4921,
"step": 91300
},
{
"epoch": 0.06797257299242931,
"grad_norm": 0.5790271162986755,
"learning_rate": 1.9990882321331916e-05,
"loss": 1.5773,
"step": 91400
},
{
"epoch": 0.06804694123421534,
"grad_norm": 0.5524342656135559,
"learning_rate": 1.9990862361209043e-05,
"loss": 1.4619,
"step": 91500
},
{
"epoch": 0.06812130947600137,
"grad_norm": 0.7153291702270508,
"learning_rate": 1.999084237927202e-05,
"loss": 1.6042,
"step": 91600
},
{
"epoch": 0.0681956777177874,
"grad_norm": 0.957635223865509,
"learning_rate": 1.9990822375520882e-05,
"loss": 1.538,
"step": 91700
},
{
"epoch": 0.06827004595957342,
"grad_norm": 0.38240477442741394,
"learning_rate": 1.9990802349955678e-05,
"loss": 1.5937,
"step": 91800
},
{
"epoch": 0.06834441420135945,
"grad_norm": 0.8961233496665955,
"learning_rate": 1.999078230257645e-05,
"loss": 1.5119,
"step": 91900
},
{
"epoch": 0.06841878244314548,
"grad_norm": 0.47433900833129883,
"learning_rate": 1.999076223338324e-05,
"loss": 1.5449,
"step": 92000
},
{
"epoch": 0.0684931506849315,
"grad_norm": 0.8222399353981018,
"learning_rate": 1.9990742142376098e-05,
"loss": 1.5334,
"step": 92100
},
{
"epoch": 0.06856751892671753,
"grad_norm": 0.464373916387558,
"learning_rate": 1.999072202955506e-05,
"loss": 1.5003,
"step": 92200
},
{
"epoch": 0.06864188716850356,
"grad_norm": 0.8799763321876526,
"learning_rate": 1.9990701894920176e-05,
"loss": 1.581,
"step": 92300
},
{
"epoch": 0.06871625541028958,
"grad_norm": 0.9567086100578308,
"learning_rate": 1.999068173847149e-05,
"loss": 1.4373,
"step": 92400
},
{
"epoch": 0.06879062365207561,
"grad_norm": 0.440479576587677,
"learning_rate": 1.999066156020904e-05,
"loss": 1.5571,
"step": 92500
},
{
"epoch": 0.06886499189386165,
"grad_norm": 0.7486180663108826,
"learning_rate": 1.9990641360132876e-05,
"loss": 1.4437,
"step": 92600
},
{
"epoch": 0.06893936013564768,
"grad_norm": 0.7576742172241211,
"learning_rate": 1.9990621138243037e-05,
"loss": 1.5306,
"step": 92700
},
{
"epoch": 0.0690137283774337,
"grad_norm": 0.6755186915397644,
"learning_rate": 1.9990600894539574e-05,
"loss": 1.5769,
"step": 92800
},
{
"epoch": 0.06908809661921973,
"grad_norm": 0.6093853712081909,
"learning_rate": 1.9990580629022526e-05,
"loss": 1.5777,
"step": 92900
},
{
"epoch": 0.06916246486100576,
"grad_norm": 0.5788242220878601,
"learning_rate": 1.9990560341691938e-05,
"loss": 1.494,
"step": 93000
},
{
"epoch": 0.06923683310279179,
"grad_norm": 0.828676700592041,
"learning_rate": 1.9990540032547855e-05,
"loss": 1.5651,
"step": 93100
},
{
"epoch": 0.06931120134457781,
"grad_norm": 0.5612863302230835,
"learning_rate": 1.9990519701590322e-05,
"loss": 1.5584,
"step": 93200
},
{
"epoch": 0.06938556958636384,
"grad_norm": 0.965107262134552,
"learning_rate": 1.999049934881938e-05,
"loss": 1.497,
"step": 93300
},
{
"epoch": 0.06945993782814987,
"grad_norm": 0.46939852833747864,
"learning_rate": 1.9990478974235078e-05,
"loss": 1.5716,
"step": 93400
},
{
"epoch": 0.0695343060699359,
"grad_norm": 0.4986964464187622,
"learning_rate": 1.999045857783746e-05,
"loss": 1.5762,
"step": 93500
},
{
"epoch": 0.06960867431172192,
"grad_norm": 0.4267128109931946,
"learning_rate": 1.9990438159626566e-05,
"loss": 1.5101,
"step": 93600
},
{
"epoch": 0.06968304255350795,
"grad_norm": 0.411811888217926,
"learning_rate": 1.9990417719602445e-05,
"loss": 1.5623,
"step": 93700
},
{
"epoch": 0.06975741079529398,
"grad_norm": 0.8761053681373596,
"learning_rate": 1.999039725776514e-05,
"loss": 1.4294,
"step": 93800
},
{
"epoch": 0.06983177903708,
"grad_norm": 0.9531000852584839,
"learning_rate": 1.99903767741147e-05,
"loss": 1.4925,
"step": 93900
},
{
"epoch": 0.06990614727886603,
"grad_norm": 0.516830325126648,
"learning_rate": 1.999035626865116e-05,
"loss": 1.5802,
"step": 94000
},
{
"epoch": 0.06998051552065206,
"grad_norm": 0.47061294317245483,
"learning_rate": 1.9990335741374572e-05,
"loss": 1.5668,
"step": 94100
},
{
"epoch": 0.07005488376243808,
"grad_norm": 0.7790777683258057,
"learning_rate": 1.9990315192284978e-05,
"loss": 1.5568,
"step": 94200
},
{
"epoch": 0.07012925200422411,
"grad_norm": 0.75156170129776,
"learning_rate": 1.9990294621382426e-05,
"loss": 1.5217,
"step": 94300
},
{
"epoch": 0.07020362024601014,
"grad_norm": 1.195028305053711,
"learning_rate": 1.999027402866696e-05,
"loss": 1.5662,
"step": 94400
},
{
"epoch": 0.07027798848779618,
"grad_norm": 0.6215851306915283,
"learning_rate": 1.999025341413862e-05,
"loss": 1.5208,
"step": 94500
},
{
"epoch": 0.0703523567295822,
"grad_norm": 0.509843647480011,
"learning_rate": 1.9990232777797458e-05,
"loss": 1.489,
"step": 94600
},
{
"epoch": 0.07042672497136823,
"grad_norm": 1.2951029539108276,
"learning_rate": 1.9990212119643516e-05,
"loss": 1.4729,
"step": 94700
},
{
"epoch": 0.07050109321315426,
"grad_norm": 0.5028135776519775,
"learning_rate": 1.9990191439676838e-05,
"loss": 1.5579,
"step": 94800
},
{
"epoch": 0.07057546145494029,
"grad_norm": 0.7202877998352051,
"learning_rate": 1.9990170737897473e-05,
"loss": 1.5282,
"step": 94900
},
{
"epoch": 0.07064982969672631,
"grad_norm": 0.9731516242027283,
"learning_rate": 1.9990150014305462e-05,
"loss": 1.5194,
"step": 95000
},
{
"epoch": 0.07072419793851234,
"grad_norm": 0.7444689273834229,
"learning_rate": 1.9990129268900848e-05,
"loss": 1.5198,
"step": 95100
},
{
"epoch": 0.07079856618029837,
"grad_norm": 0.9299377202987671,
"learning_rate": 1.9990108501683685e-05,
"loss": 1.5393,
"step": 95200
},
{
"epoch": 0.0708729344220844,
"grad_norm": 0.6611402630805969,
"learning_rate": 1.999008771265401e-05,
"loss": 1.5351,
"step": 95300
},
{
"epoch": 0.07094730266387042,
"grad_norm": 0.4772530496120453,
"learning_rate": 1.9990066901811876e-05,
"loss": 1.5243,
"step": 95400
},
{
"epoch": 0.07102167090565645,
"grad_norm": 0.42998188734054565,
"learning_rate": 1.9990046069157322e-05,
"loss": 1.5877,
"step": 95500
},
{
"epoch": 0.07109603914744247,
"grad_norm": 0.7415347099304199,
"learning_rate": 1.9990025214690396e-05,
"loss": 1.5633,
"step": 95600
},
{
"epoch": 0.0711704073892285,
"grad_norm": 0.657112717628479,
"learning_rate": 1.999000433841114e-05,
"loss": 1.4555,
"step": 95700
},
{
"epoch": 0.07124477563101453,
"grad_norm": 0.9188429713249207,
"learning_rate": 1.998998344031961e-05,
"loss": 1.4329,
"step": 95800
},
{
"epoch": 0.07131914387280056,
"grad_norm": 0.8823667168617249,
"learning_rate": 1.9989962520415836e-05,
"loss": 1.4754,
"step": 95900
},
{
"epoch": 0.07139351211458658,
"grad_norm": 0.7276200652122498,
"learning_rate": 1.9989941578699878e-05,
"loss": 1.5286,
"step": 96000
},
{
"epoch": 0.07146788035637261,
"grad_norm": 0.941512405872345,
"learning_rate": 1.998992061517177e-05,
"loss": 1.5087,
"step": 96100
},
{
"epoch": 0.07154224859815864,
"grad_norm": 1.0310442447662354,
"learning_rate": 1.998989962983157e-05,
"loss": 1.5895,
"step": 96200
},
{
"epoch": 0.07161661683994466,
"grad_norm": 1.3620883226394653,
"learning_rate": 1.9989878622679317e-05,
"loss": 1.474,
"step": 96300
},
{
"epoch": 0.0716909850817307,
"grad_norm": 0.5119801163673401,
"learning_rate": 1.998985759371505e-05,
"loss": 1.5112,
"step": 96400
},
{
"epoch": 0.07176535332351673,
"grad_norm": 0.8966123461723328,
"learning_rate": 1.998983654293883e-05,
"loss": 1.4903,
"step": 96500
},
{
"epoch": 0.07183972156530276,
"grad_norm": 0.5336944460868835,
"learning_rate": 1.998981547035069e-05,
"loss": 1.5673,
"step": 96600
},
{
"epoch": 0.07191408980708879,
"grad_norm": 1.2533961534500122,
"learning_rate": 1.9989794375950688e-05,
"loss": 1.5039,
"step": 96700
},
{
"epoch": 0.07198845804887481,
"grad_norm": 1.3317081928253174,
"learning_rate": 1.9989773259738858e-05,
"loss": 1.567,
"step": 96800
},
{
"epoch": 0.07206282629066084,
"grad_norm": 0.49700722098350525,
"learning_rate": 1.998975212171525e-05,
"loss": 1.542,
"step": 96900
},
{
"epoch": 0.07213719453244687,
"grad_norm": 0.5809246301651001,
"learning_rate": 1.9989730961879913e-05,
"loss": 1.5097,
"step": 97000
},
{
"epoch": 0.07221156277423289,
"grad_norm": 0.6107625365257263,
"learning_rate": 1.9989709780232894e-05,
"loss": 1.536,
"step": 97100
},
{
"epoch": 0.07228593101601892,
"grad_norm": 0.5271338820457458,
"learning_rate": 1.9989688576774234e-05,
"loss": 1.5819,
"step": 97200
},
{
"epoch": 0.07236029925780495,
"grad_norm": 0.6692411303520203,
"learning_rate": 1.9989667351503988e-05,
"loss": 1.4833,
"step": 97300
},
{
"epoch": 0.07243466749959097,
"grad_norm": 1.0627728700637817,
"learning_rate": 1.998964610442219e-05,
"loss": 1.5404,
"step": 97400
},
{
"epoch": 0.072509035741377,
"grad_norm": 0.5696298480033875,
"learning_rate": 1.9989624835528896e-05,
"loss": 1.4491,
"step": 97500
},
{
"epoch": 0.07258340398316303,
"grad_norm": 0.5105301141738892,
"learning_rate": 1.998960354482415e-05,
"loss": 1.5188,
"step": 97600
},
{
"epoch": 0.07265777222494905,
"grad_norm": 0.53251713514328,
"learning_rate": 1.9989582232307998e-05,
"loss": 1.5367,
"step": 97700
},
{
"epoch": 0.07273214046673508,
"grad_norm": 0.6559078693389893,
"learning_rate": 1.9989560897980485e-05,
"loss": 1.4773,
"step": 97800
},
{
"epoch": 0.07280650870852111,
"grad_norm": 0.39833974838256836,
"learning_rate": 1.998953954184166e-05,
"loss": 1.6063,
"step": 97900
},
{
"epoch": 0.07288087695030714,
"grad_norm": 1.0479645729064941,
"learning_rate": 1.9989518163891566e-05,
"loss": 1.565,
"step": 98000
},
{
"epoch": 0.07295524519209316,
"grad_norm": 0.7905478477478027,
"learning_rate": 1.9989496764130253e-05,
"loss": 1.5266,
"step": 98100
},
{
"epoch": 0.07302961343387919,
"grad_norm": 0.4569951295852661,
"learning_rate": 1.998947534255777e-05,
"loss": 1.5295,
"step": 98200
},
{
"epoch": 0.07310398167566523,
"grad_norm": 0.5308849215507507,
"learning_rate": 1.9989453899174158e-05,
"loss": 1.5203,
"step": 98300
},
{
"epoch": 0.07317834991745126,
"grad_norm": 0.906802773475647,
"learning_rate": 1.998943243397947e-05,
"loss": 1.556,
"step": 98400
},
{
"epoch": 0.07325271815923728,
"grad_norm": 0.5071494579315186,
"learning_rate": 1.9989410946973747e-05,
"loss": 1.5627,
"step": 98500
},
{
"epoch": 0.07332708640102331,
"grad_norm": 0.5252199172973633,
"learning_rate": 1.9989389438157037e-05,
"loss": 1.5181,
"step": 98600
},
{
"epoch": 0.07340145464280934,
"grad_norm": 0.5738980174064636,
"learning_rate": 1.9989367907529394e-05,
"loss": 1.6101,
"step": 98700
},
{
"epoch": 0.07347582288459537,
"grad_norm": 0.6898683309555054,
"learning_rate": 1.9989346355090853e-05,
"loss": 1.579,
"step": 98800
},
{
"epoch": 0.07355019112638139,
"grad_norm": 0.5396860241889954,
"learning_rate": 1.998932478084147e-05,
"loss": 1.5645,
"step": 98900
},
{
"epoch": 0.07362455936816742,
"grad_norm": 0.5482293367385864,
"learning_rate": 1.998930318478129e-05,
"loss": 1.5453,
"step": 99000
},
{
"epoch": 0.07369892760995345,
"grad_norm": 0.8394240736961365,
"learning_rate": 1.9989281566910363e-05,
"loss": 1.5025,
"step": 99100
},
{
"epoch": 0.07377329585173947,
"grad_norm": 0.9409950971603394,
"learning_rate": 1.9989259927228725e-05,
"loss": 1.5489,
"step": 99200
},
{
"epoch": 0.0738476640935255,
"grad_norm": 0.5597321391105652,
"learning_rate": 1.9989238265736437e-05,
"loss": 1.5994,
"step": 99300
},
{
"epoch": 0.07392203233531153,
"grad_norm": 0.5139235258102417,
"learning_rate": 1.9989216582433538e-05,
"loss": 1.5478,
"step": 99400
},
{
"epoch": 0.07399640057709755,
"grad_norm": 0.6312362551689148,
"learning_rate": 1.998919487732008e-05,
"loss": 1.4989,
"step": 99500
},
{
"epoch": 0.07407076881888358,
"grad_norm": 0.6924223303794861,
"learning_rate": 1.9989173150396105e-05,
"loss": 1.4491,
"step": 99600
},
{
"epoch": 0.07414513706066961,
"grad_norm": 0.5490585565567017,
"learning_rate": 1.9989151401661666e-05,
"loss": 1.538,
"step": 99700
},
{
"epoch": 0.07421950530245564,
"grad_norm": 0.630455732345581,
"learning_rate": 1.998912963111681e-05,
"loss": 1.5286,
"step": 99800
},
{
"epoch": 0.07429387354424166,
"grad_norm": 0.8591504693031311,
"learning_rate": 1.998910783876158e-05,
"loss": 1.5612,
"step": 99900
},
{
"epoch": 0.07436824178602769,
"grad_norm": 1.0016669034957886,
"learning_rate": 1.9989086024596027e-05,
"loss": 1.5154,
"step": 100000
},
{
"epoch": 0.07444261002781372,
"grad_norm": 0.6513885259628296,
"learning_rate": 1.9989064188620197e-05,
"loss": 1.5446,
"step": 100100
},
{
"epoch": 0.07451697826959976,
"grad_norm": 0.6838514804840088,
"learning_rate": 1.998904233083414e-05,
"loss": 1.5336,
"step": 100200
},
{
"epoch": 0.07459134651138578,
"grad_norm": 0.46571242809295654,
"learning_rate": 1.9989020451237903e-05,
"loss": 1.4838,
"step": 100300
},
{
"epoch": 0.07466571475317181,
"grad_norm": 0.9936356544494629,
"learning_rate": 1.998899854983153e-05,
"loss": 1.5929,
"step": 100400
},
{
"epoch": 0.07474008299495784,
"grad_norm": 0.6591018438339233,
"learning_rate": 1.9988976626615075e-05,
"loss": 1.54,
"step": 100500
},
{
"epoch": 0.07481445123674386,
"grad_norm": 0.8453909754753113,
"learning_rate": 1.998895468158858e-05,
"loss": 1.5191,
"step": 100600
},
{
"epoch": 0.07488881947852989,
"grad_norm": 0.6555935144424438,
"learning_rate": 1.9988932714752095e-05,
"loss": 1.5734,
"step": 100700
},
{
"epoch": 0.07496318772031592,
"grad_norm": 0.6445733308792114,
"learning_rate": 1.998891072610567e-05,
"loss": 1.5516,
"step": 100800
},
{
"epoch": 0.07503755596210195,
"grad_norm": 0.534389078617096,
"learning_rate": 1.9988888715649357e-05,
"loss": 1.5441,
"step": 100900
},
{
"epoch": 0.07511192420388797,
"grad_norm": 1.068562388420105,
"learning_rate": 1.998886668338319e-05,
"loss": 1.4998,
"step": 101000
},
{
"epoch": 0.075186292445674,
"grad_norm": 0.6331286430358887,
"learning_rate": 1.998884462930723e-05,
"loss": 1.5633,
"step": 101100
},
{
"epoch": 0.07526066068746003,
"grad_norm": 1.3566038608551025,
"learning_rate": 1.998882255342152e-05,
"loss": 1.4621,
"step": 101200
},
{
"epoch": 0.07533502892924605,
"grad_norm": 0.9672004580497742,
"learning_rate": 1.998880045572611e-05,
"loss": 1.5249,
"step": 101300
},
{
"epoch": 0.07540939717103208,
"grad_norm": 0.36732280254364014,
"learning_rate": 1.9988778336221045e-05,
"loss": 1.574,
"step": 101400
},
{
"epoch": 0.07548376541281811,
"grad_norm": 0.4788234829902649,
"learning_rate": 1.998875619490638e-05,
"loss": 1.5418,
"step": 101500
},
{
"epoch": 0.07555813365460413,
"grad_norm": 0.8955681324005127,
"learning_rate": 1.9988734031782157e-05,
"loss": 1.5568,
"step": 101600
},
{
"epoch": 0.07563250189639016,
"grad_norm": 0.8049163222312927,
"learning_rate": 1.9988711846848427e-05,
"loss": 1.4838,
"step": 101700
},
{
"epoch": 0.07570687013817619,
"grad_norm": 0.7558008432388306,
"learning_rate": 1.9988689640105235e-05,
"loss": 1.4955,
"step": 101800
},
{
"epoch": 0.07578123837996222,
"grad_norm": 0.4749026596546173,
"learning_rate": 1.9988667411552635e-05,
"loss": 1.5929,
"step": 101900
},
{
"epoch": 0.07585560662174824,
"grad_norm": 0.6597522497177124,
"learning_rate": 1.998864516119067e-05,
"loss": 1.5584,
"step": 102000
},
{
"epoch": 0.07592997486353428,
"grad_norm": 0.7412188053131104,
"learning_rate": 1.9988622889019395e-05,
"loss": 1.5842,
"step": 102100
},
{
"epoch": 0.07600434310532031,
"grad_norm": 0.5564984679222107,
"learning_rate": 1.9988600595038853e-05,
"loss": 1.5764,
"step": 102200
},
{
"epoch": 0.07607871134710634,
"grad_norm": 1.0488529205322266,
"learning_rate": 1.9988578279249097e-05,
"loss": 1.458,
"step": 102300
},
{
"epoch": 0.07615307958889236,
"grad_norm": 1.40269136428833,
"learning_rate": 1.998855594165017e-05,
"loss": 1.4588,
"step": 102400
},
{
"epoch": 0.07622744783067839,
"grad_norm": 0.8488138318061829,
"learning_rate": 1.9988533582242127e-05,
"loss": 1.522,
"step": 102500
},
{
"epoch": 0.07630181607246442,
"grad_norm": 0.5191701054573059,
"learning_rate": 1.9988511201025015e-05,
"loss": 1.5036,
"step": 102600
},
{
"epoch": 0.07637618431425044,
"grad_norm": 0.6648279428482056,
"learning_rate": 1.9988488797998878e-05,
"loss": 1.4929,
"step": 102700
},
{
"epoch": 0.07645055255603647,
"grad_norm": 1.8600202798843384,
"learning_rate": 1.9988466373163774e-05,
"loss": 1.5692,
"step": 102800
},
{
"epoch": 0.0765249207978225,
"grad_norm": 0.7583739757537842,
"learning_rate": 1.9988443926519743e-05,
"loss": 1.5145,
"step": 102900
},
{
"epoch": 0.07659928903960853,
"grad_norm": 0.6128048300743103,
"learning_rate": 1.998842145806684e-05,
"loss": 1.5729,
"step": 103000
},
{
"epoch": 0.07667365728139455,
"grad_norm": 0.7574602365493774,
"learning_rate": 1.998839896780511e-05,
"loss": 1.4356,
"step": 103100
},
{
"epoch": 0.07674802552318058,
"grad_norm": 1.4134727716445923,
"learning_rate": 1.9988376455734606e-05,
"loss": 1.5048,
"step": 103200
},
{
"epoch": 0.0768223937649666,
"grad_norm": 0.7592337727546692,
"learning_rate": 1.9988353921855374e-05,
"loss": 1.4988,
"step": 103300
},
{
"epoch": 0.07689676200675263,
"grad_norm": 0.522486686706543,
"learning_rate": 1.9988331366167465e-05,
"loss": 1.5654,
"step": 103400
},
{
"epoch": 0.07697113024853866,
"grad_norm": 0.6535342335700989,
"learning_rate": 1.9988308788670925e-05,
"loss": 1.4593,
"step": 103500
},
{
"epoch": 0.07704549849032469,
"grad_norm": 0.6663926243782043,
"learning_rate": 1.9988286189365808e-05,
"loss": 1.477,
"step": 103600
},
{
"epoch": 0.07711986673211071,
"grad_norm": 0.5006215572357178,
"learning_rate": 1.998826356825216e-05,
"loss": 1.5326,
"step": 103700
},
{
"epoch": 0.07719423497389674,
"grad_norm": 0.6826842427253723,
"learning_rate": 1.9988240925330032e-05,
"loss": 1.5102,
"step": 103800
},
{
"epoch": 0.07726860321568277,
"grad_norm": 0.2680438756942749,
"learning_rate": 1.9988218260599477e-05,
"loss": 1.4773,
"step": 103900
},
{
"epoch": 0.07734297145746881,
"grad_norm": 0.9159733057022095,
"learning_rate": 1.9988195574060536e-05,
"loss": 1.4984,
"step": 104000
},
{
"epoch": 0.07741733969925484,
"grad_norm": 1.0930269956588745,
"learning_rate": 1.9988172865713266e-05,
"loss": 1.4196,
"step": 104100
},
{
"epoch": 0.07749170794104086,
"grad_norm": 0.6656064391136169,
"learning_rate": 1.998815013555771e-05,
"loss": 1.5282,
"step": 104200
},
{
"epoch": 0.07756607618282689,
"grad_norm": 0.6679131388664246,
"learning_rate": 1.9988127383593923e-05,
"loss": 1.4922,
"step": 104300
},
{
"epoch": 0.07764044442461292,
"grad_norm": 0.5231404304504395,
"learning_rate": 1.9988104609821953e-05,
"loss": 1.4648,
"step": 104400
},
{
"epoch": 0.07771481266639894,
"grad_norm": 0.6543662548065186,
"learning_rate": 1.998808181424185e-05,
"loss": 1.5349,
"step": 104500
},
{
"epoch": 0.07778918090818497,
"grad_norm": 0.4422987997531891,
"learning_rate": 1.9988058996853666e-05,
"loss": 1.5031,
"step": 104600
},
{
"epoch": 0.077863549149971,
"grad_norm": 0.74057537317276,
"learning_rate": 1.9988036157657444e-05,
"loss": 1.5373,
"step": 104700
},
{
"epoch": 0.07793791739175703,
"grad_norm": 0.8893790245056152,
"learning_rate": 1.998801329665324e-05,
"loss": 1.5177,
"step": 104800
},
{
"epoch": 0.07801228563354305,
"grad_norm": 0.898235559463501,
"learning_rate": 1.9987990413841103e-05,
"loss": 1.5938,
"step": 104900
},
{
"epoch": 0.07808665387532908,
"grad_norm": 0.566254198551178,
"learning_rate": 1.9987967509221082e-05,
"loss": 1.4581,
"step": 105000
},
{
"epoch": 0.0781610221171151,
"grad_norm": 0.6054997444152832,
"learning_rate": 1.9987944582793226e-05,
"loss": 1.5248,
"step": 105100
},
{
"epoch": 0.07823539035890113,
"grad_norm": 0.6898595690727234,
"learning_rate": 1.9987921634557588e-05,
"loss": 1.5482,
"step": 105200
},
{
"epoch": 0.07830975860068716,
"grad_norm": 0.7741703391075134,
"learning_rate": 1.9987898664514213e-05,
"loss": 1.5175,
"step": 105300
},
{
"epoch": 0.07838412684247319,
"grad_norm": 0.649459958076477,
"learning_rate": 1.9987875672663155e-05,
"loss": 1.5702,
"step": 105400
},
{
"epoch": 0.07845849508425921,
"grad_norm": 1.0062605142593384,
"learning_rate": 1.9987852659004465e-05,
"loss": 1.5077,
"step": 105500
},
{
"epoch": 0.07853286332604524,
"grad_norm": 0.5658386945724487,
"learning_rate": 1.9987829623538193e-05,
"loss": 1.5682,
"step": 105600
}
],
"logging_steps": 100,
"max_steps": 6723300,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.4384193697868513e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}