{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 20000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0032,
"grad_norm": null,
"learning_rate": 9.999948122981575e-05,
"loss": 1.0507,
"step": 32
},
{
"epoch": 0.0064,
"grad_norm": 1.0899442434310913,
"learning_rate": 9.999770471768777e-05,
"loss": 1.016,
"step": 64
},
{
"epoch": 0.0096,
"grad_norm": 2.957014799118042,
"learning_rate": 9.999466495684926e-05,
"loss": 0.9928,
"step": 96
},
{
"epoch": 0.0128,
"grad_norm": 1.3575732707977295,
"learning_rate": 9.999036202410325e-05,
"loss": 0.8757,
"step": 128
},
{
"epoch": 0.016,
"grad_norm": 1.0611408948898315,
"learning_rate": 9.998498908285819e-05,
"loss": 0.8615,
"step": 160
},
{
"epoch": 0.0192,
"grad_norm": 1.382992148399353,
"learning_rate": 9.997819962824957e-05,
"loss": 0.8216,
"step": 192
},
{
"epoch": 0.0224,
"grad_norm": 2.863276481628418,
"learning_rate": 9.997014741774866e-05,
"loss": 0.7406,
"step": 224
},
{
"epoch": 0.0256,
"grad_norm": 1.1629211902618408,
"learning_rate": 9.996083265480365e-05,
"loss": 0.8171,
"step": 256
},
{
"epoch": 0.0288,
"grad_norm": 2.2264232635498047,
"learning_rate": 9.995025557476261e-05,
"loss": 0.8835,
"step": 288
},
{
"epoch": 0.032,
"grad_norm": 1.7896003723144531,
"learning_rate": 9.993841644486747e-05,
"loss": 0.7303,
"step": 320
},
{
"epoch": 0.0352,
"grad_norm": 1.403260350227356,
"learning_rate": 9.992531556424726e-05,
"loss": 0.7384,
"step": 352
},
{
"epoch": 0.0384,
"grad_norm": 2.308896780014038,
"learning_rate": 9.99109532639106e-05,
"loss": 0.8211,
"step": 384
},
{
"epoch": 0.0416,
"grad_norm": 1.282929539680481,
"learning_rate": 9.989532990673728e-05,
"loss": 0.7211,
"step": 416
},
{
"epoch": 0.0448,
"grad_norm": 2.4921414852142334,
"learning_rate": 9.987844588746915e-05,
"loss": 0.8204,
"step": 448
},
{
"epoch": 0.048,
"grad_norm": 1.3490195274353027,
"learning_rate": 9.986030163270011e-05,
"loss": 0.7623,
"step": 480
},
{
"epoch": 0.0512,
"grad_norm": 1.436516523361206,
"learning_rate": 9.98408976008653e-05,
"loss": 0.7981,
"step": 512
},
{
"epoch": 0.0544,
"grad_norm": 2.3144304752349854,
"learning_rate": 9.982023428222962e-05,
"loss": 0.7422,
"step": 544
},
{
"epoch": 0.0576,
"grad_norm": 1.2702479362487793,
"learning_rate": 9.979831219887525e-05,
"loss": 0.8107,
"step": 576
},
{
"epoch": 0.0608,
"grad_norm": 3.110814332962036,
"learning_rate": 9.977513190468848e-05,
"loss": 0.8395,
"step": 608
},
{
"epoch": 0.064,
"grad_norm": 4.934881687164307,
"learning_rate": 9.975069398534574e-05,
"loss": 0.8456,
"step": 640
},
{
"epoch": 0.0672,
"grad_norm": 1.5248093605041504,
"learning_rate": 9.972499905829875e-05,
"loss": 0.7604,
"step": 672
},
{
"epoch": 0.0704,
"grad_norm": 1.5269616842269897,
"learning_rate": 9.9698047772759e-05,
"loss": 0.7557,
"step": 704
},
{
"epoch": 0.0736,
"grad_norm": 1.523474097251892,
"learning_rate": 9.966984080968128e-05,
"loss": 0.7622,
"step": 736
},
{
"epoch": 0.0768,
"grad_norm": 1.3121402263641357,
"learning_rate": 9.96403788817465e-05,
"loss": 0.6912,
"step": 768
},
{
"epoch": 0.08,
"grad_norm": 0.9180154800415039,
"learning_rate": 9.96096627333437e-05,
"loss": 0.8783,
"step": 800
},
{
"epoch": 0.0832,
"grad_norm": 2.254473924636841,
"learning_rate": 9.957769314055117e-05,
"loss": 0.7987,
"step": 832
},
{
"epoch": 0.0864,
"grad_norm": 1.9398365020751953,
"learning_rate": 9.954447091111694e-05,
"loss": 0.7703,
"step": 864
},
{
"epoch": 0.0896,
"grad_norm": 1.4880696535110474,
"learning_rate": 9.950999688443833e-05,
"loss": 0.7258,
"step": 896
},
{
"epoch": 0.0928,
"grad_norm": 1.8427962064743042,
"learning_rate": 9.947427193154071e-05,
"loss": 0.6981,
"step": 928
},
{
"epoch": 0.096,
"grad_norm": 3.3647401332855225,
"learning_rate": 9.943729695505552e-05,
"loss": 0.7862,
"step": 960
},
{
"epoch": 0.0992,
"grad_norm": 1.852992296218872,
"learning_rate": 9.939907288919747e-05,
"loss": 0.8016,
"step": 992
},
{
"epoch": 0.1024,
"grad_norm": 1.2231330871582031,
"learning_rate": 9.935960069974096e-05,
"loss": 0.8001,
"step": 1024
},
{
"epoch": 0.1056,
"grad_norm": 1.2329598665237427,
"learning_rate": 9.931888138399561e-05,
"loss": 0.7656,
"step": 1056
},
{
"epoch": 0.1088,
"grad_norm": 1.4887111186981201,
"learning_rate": 9.927691597078108e-05,
"loss": 0.7772,
"step": 1088
},
{
"epoch": 0.112,
"grad_norm": 1.1879202127456665,
"learning_rate": 9.923370552040116e-05,
"loss": 0.7368,
"step": 1120
},
{
"epoch": 0.1152,
"grad_norm": 1.4578642845153809,
"learning_rate": 9.918925112461688e-05,
"loss": 0.7226,
"step": 1152
},
{
"epoch": 0.1184,
"grad_norm": 3.8356716632843018,
"learning_rate": 9.914355390661896e-05,
"loss": 0.7468,
"step": 1184
},
{
"epoch": 0.1216,
"grad_norm": 3.390878200531006,
"learning_rate": 9.909661502099943e-05,
"loss": 0.7163,
"step": 1216
},
{
"epoch": 0.1248,
"grad_norm": 2.217479944229126,
"learning_rate": 9.904843565372248e-05,
"loss": 0.7805,
"step": 1248
},
{
"epoch": 0.128,
"grad_norm": 0.7309045195579529,
"learning_rate": 9.899901702209445e-05,
"loss": 0.6929,
"step": 1280
},
{
"epoch": 0.1312,
"grad_norm": 1.173700213432312,
"learning_rate": 9.89483603747331e-05,
"loss": 0.726,
"step": 1312
},
{
"epoch": 0.1344,
"grad_norm": 1.4089820384979248,
"learning_rate": 9.88964669915361e-05,
"loss": 0.8606,
"step": 1344
},
{
"epoch": 0.1376,
"grad_norm": 1.0375796556472778,
"learning_rate": 9.884333818364861e-05,
"loss": 0.721,
"step": 1376
},
{
"epoch": 0.1408,
"grad_norm": 2.082084894180298,
"learning_rate": 9.878897529343023e-05,
"loss": 0.7884,
"step": 1408
},
{
"epoch": 0.144,
"grad_norm": 0.7961512804031372,
"learning_rate": 9.873337969442101e-05,
"loss": 0.774,
"step": 1440
},
{
"epoch": 0.1472,
"grad_norm": 1.3074238300323486,
"learning_rate": 9.867655279130683e-05,
"loss": 0.7392,
"step": 1472
},
{
"epoch": 0.1504,
"grad_norm": 1.5205963850021362,
"learning_rate": 9.861849601988383e-05,
"loss": 0.7731,
"step": 1504
},
{
"epoch": 0.1536,
"grad_norm": 1.4995771646499634,
"learning_rate": 9.855921084702219e-05,
"loss": 0.8281,
"step": 1536
},
{
"epoch": 0.1568,
"grad_norm": 1.0279921293258667,
"learning_rate": 9.849869877062902e-05,
"loss": 0.6942,
"step": 1568
},
{
"epoch": 0.16,
"grad_norm": 2.8020853996276855,
"learning_rate": 9.843696131961058e-05,
"loss": 0.7389,
"step": 1600
},
{
"epoch": 0.1632,
"grad_norm": 2.99129056930542,
"learning_rate": 9.837400005383354e-05,
"loss": 0.7483,
"step": 1632
},
{
"epoch": 0.1664,
"grad_norm": 2.325167179107666,
"learning_rate": 9.830981656408574e-05,
"loss": 0.7483,
"step": 1664
},
{
"epoch": 0.1696,
"grad_norm": 0.7245140671730042,
"learning_rate": 9.824441247203579e-05,
"loss": 0.7633,
"step": 1696
},
{
"epoch": 0.1728,
"grad_norm": 2.7938778400421143,
"learning_rate": 9.817778943019228e-05,
"loss": 0.7812,
"step": 1728
},
{
"epoch": 0.176,
"grad_norm": 1.2263625860214233,
"learning_rate": 9.810994912186189e-05,
"loss": 0.7712,
"step": 1760
},
{
"epoch": 0.1792,
"grad_norm": 1.2694672346115112,
"learning_rate": 9.804089326110697e-05,
"loss": 0.7297,
"step": 1792
},
{
"epoch": 0.1824,
"grad_norm": 1.255414366722107,
"learning_rate": 9.797062359270215e-05,
"loss": 0.735,
"step": 1824
},
{
"epoch": 0.1856,
"grad_norm": 1.3175591230392456,
"learning_rate": 9.789914189209029e-05,
"loss": 0.7633,
"step": 1856
},
{
"epoch": 0.1888,
"grad_norm": 1.0326446294784546,
"learning_rate": 9.78264499653376e-05,
"loss": 0.7955,
"step": 1888
},
{
"epoch": 0.192,
"grad_norm": 1.093620777130127,
"learning_rate": 9.775254964908807e-05,
"loss": 0.766,
"step": 1920
},
{
"epoch": 0.1952,
"grad_norm": 1.4234970808029175,
"learning_rate": 9.767744281051701e-05,
"loss": 0.6725,
"step": 1952
},
{
"epoch": 0.1984,
"grad_norm": 0.7571769952774048,
"learning_rate": 9.760113134728384e-05,
"loss": 0.6953,
"step": 1984
},
{
"epoch": 0.2016,
"grad_norm": 1.7207865715026855,
"learning_rate": 9.752361718748423e-05,
"loss": 0.7356,
"step": 2016
},
{
"epoch": 0.2048,
"grad_norm": 2.240748882293701,
"learning_rate": 9.744490228960138e-05,
"loss": 0.8067,
"step": 2048
},
{
"epoch": 0.208,
"grad_norm": 1.2544214725494385,
"learning_rate": 9.736498864245638e-05,
"loss": 0.7618,
"step": 2080
},
{
"epoch": 0.2112,
"grad_norm": 5.976646900177002,
"learning_rate": 9.728387826515819e-05,
"loss": 0.6825,
"step": 2112
},
{
"epoch": 0.2144,
"grad_norm": 4.557011127471924,
"learning_rate": 9.72015732070525e-05,
"loss": 0.7623,
"step": 2144
},
{
"epoch": 0.2176,
"grad_norm": 0.8000884056091309,
"learning_rate": 9.71180755476699e-05,
"loss": 0.7719,
"step": 2176
},
{
"epoch": 0.2208,
"grad_norm": 1.115488052368164,
"learning_rate": 9.703338739667346e-05,
"loss": 0.7913,
"step": 2208
},
{
"epoch": 0.224,
"grad_norm": 1.3180317878723145,
"learning_rate": 9.694751089380536e-05,
"loss": 0.7452,
"step": 2240
},
{
"epoch": 0.2272,
"grad_norm": 2.9995932579040527,
"learning_rate": 9.686044820883285e-05,
"loss": 0.7962,
"step": 2272
},
{
"epoch": 0.2304,
"grad_norm": 1.234027624130249,
"learning_rate": 9.677220154149336e-05,
"loss": 0.828,
"step": 2304
},
{
"epoch": 0.2336,
"grad_norm": 1.6579309701919556,
"learning_rate": 9.668277312143907e-05,
"loss": 0.7569,
"step": 2336
},
{
"epoch": 0.2368,
"grad_norm": 1.5580084323883057,
"learning_rate": 9.65921652081804e-05,
"loss": 0.7947,
"step": 2368
},
{
"epoch": 0.24,
"grad_norm": 0.6711795330047607,
"learning_rate": 9.650038009102905e-05,
"loss": 0.7461,
"step": 2400
},
{
"epoch": 0.2432,
"grad_norm": 1.2285038232803345,
"learning_rate": 9.640742008904005e-05,
"loss": 0.6587,
"step": 2432
},
{
"epoch": 0.2464,
"grad_norm": 0.7901808619499207,
"learning_rate": 9.631328755095334e-05,
"loss": 0.7182,
"step": 2464
},
{
"epoch": 0.2496,
"grad_norm": 0.6125284433364868,
"learning_rate": 9.62179848551342e-05,
"loss": 0.709,
"step": 2496
},
{
"epoch": 0.2528,
"grad_norm": 1.1602981090545654,
"learning_rate": 9.612151440951334e-05,
"loss": 0.7039,
"step": 2528
},
{
"epoch": 0.256,
"grad_norm": 2.366184711456299,
"learning_rate": 9.602387865152597e-05,
"loss": 0.8669,
"step": 2560
},
{
"epoch": 0.2592,
"grad_norm": 2.583352565765381,
"learning_rate": 9.592508004805023e-05,
"loss": 0.7258,
"step": 2592
},
{
"epoch": 0.2624,
"grad_norm": 2.132749557495117,
"learning_rate": 9.58251210953449e-05,
"loss": 0.6971,
"step": 2624
},
{
"epoch": 0.2656,
"grad_norm": 1.4479436874389648,
"learning_rate": 9.572400431898627e-05,
"loss": 0.8086,
"step": 2656
},
{
"epoch": 0.2688,
"grad_norm": 1.2764617204666138,
"learning_rate": 9.562173227380436e-05,
"loss": 0.7426,
"step": 2688
},
{
"epoch": 0.272,
"grad_norm": 3.4120121002197266,
"learning_rate": 9.55183075438184e-05,
"loss": 0.7382,
"step": 2720
},
{
"epoch": 0.2752,
"grad_norm": 1.9773039817810059,
"learning_rate": 9.541373274217145e-05,
"loss": 0.7903,
"step": 2752
},
{
"epoch": 0.2784,
"grad_norm": 1.4097728729248047,
"learning_rate": 9.530801051106449e-05,
"loss": 0.7713,
"step": 2784
},
{
"epoch": 0.2816,
"grad_norm": 1.0817668437957764,
"learning_rate": 9.520114352168958e-05,
"loss": 0.7275,
"step": 2816
},
{
"epoch": 0.2848,
"grad_norm": 1.2667794227600098,
"learning_rate": 9.509313447416242e-05,
"loss": 0.6648,
"step": 2848
},
{
"epoch": 0.288,
"grad_norm": 1.8679159879684448,
"learning_rate": 9.498398609745405e-05,
"loss": 0.7445,
"step": 2880
},
{
"epoch": 0.2912,
"grad_norm": 2.8598544597625732,
"learning_rate": 9.487370114932202e-05,
"loss": 0.733,
"step": 2912
},
{
"epoch": 0.2944,
"grad_norm": 0.9554559588432312,
"learning_rate": 9.476228241624059e-05,
"loss": 0.7487,
"step": 2944
},
{
"epoch": 0.2976,
"grad_norm": 1.926672101020813,
"learning_rate": 9.464973271333042e-05,
"loss": 0.8864,
"step": 2976
},
{
"epoch": 0.3008,
"grad_norm": 0.8425309658050537,
"learning_rate": 9.45360548842874e-05,
"loss": 0.7295,
"step": 3008
},
{
"epoch": 0.304,
"grad_norm": 1.3110431432724,
"learning_rate": 9.442125180131078e-05,
"loss": 0.7547,
"step": 3040
},
{
"epoch": 0.3072,
"grad_norm": 0.9774306416511536,
"learning_rate": 9.430532636503068e-05,
"loss": 0.7099,
"step": 3072
},
{
"epoch": 0.3104,
"grad_norm": 0.6718234419822693,
"learning_rate": 9.418828150443469e-05,
"loss": 0.7636,
"step": 3104
},
{
"epoch": 0.3136,
"grad_norm": 1.2758376598358154,
"learning_rate": 9.407012017679393e-05,
"loss": 0.7066,
"step": 3136
},
{
"epoch": 0.3168,
"grad_norm": 1.3185311555862427,
"learning_rate": 9.395084536758838e-05,
"loss": 0.6785,
"step": 3168
},
{
"epoch": 0.32,
"grad_norm": 1.550795078277588,
"learning_rate": 9.383046009043134e-05,
"loss": 0.7451,
"step": 3200
},
{
"epoch": 0.3232,
"grad_norm": 1.1686354875564575,
"learning_rate": 9.370896738699339e-05,
"loss": 0.6652,
"step": 3232
},
{
"epoch": 0.3264,
"grad_norm": 0.848976194858551,
"learning_rate": 9.358637032692545e-05,
"loss": 0.7705,
"step": 3264
},
{
"epoch": 0.3296,
"grad_norm": 1.3812384605407715,
"learning_rate": 9.346267200778126e-05,
"loss": 0.7168,
"step": 3296
},
{
"epoch": 0.3328,
"grad_norm": 1.008135199546814,
"learning_rate": 9.333787555493914e-05,
"loss": 0.7352,
"step": 3328
},
{
"epoch": 0.336,
"grad_norm": 1.2273484468460083,
"learning_rate": 9.321198412152301e-05,
"loss": 0.7979,
"step": 3360
},
{
"epoch": 0.3392,
"grad_norm": 0.8740741610527039,
"learning_rate": 9.308500088832272e-05,
"loss": 0.6846,
"step": 3392
},
{
"epoch": 0.3424,
"grad_norm": 1.3684589862823486,
"learning_rate": 9.295692906371363e-05,
"loss": 0.7758,
"step": 3424
},
{
"epoch": 0.3456,
"grad_norm": 1.2861257791519165,
"learning_rate": 9.282777188357565e-05,
"loss": 0.6581,
"step": 3456
},
{
"epoch": 0.3488,
"grad_norm": 0.8915108442306519,
"learning_rate": 9.269753261121138e-05,
"loss": 0.7935,
"step": 3488
},
{
"epoch": 0.352,
"grad_norm": 1.1308799982070923,
"learning_rate": 9.256621453726379e-05,
"loss": 0.7759,
"step": 3520
},
{
"epoch": 0.3552,
"grad_norm": 1.0886152982711792,
"learning_rate": 9.243382097963291e-05,
"loss": 0.7207,
"step": 3552
},
{
"epoch": 0.3584,
"grad_norm": 0.675757110118866,
"learning_rate": 9.230035528339211e-05,
"loss": 0.6876,
"step": 3584
},
{
"epoch": 0.3616,
"grad_norm": 3.258622884750366,
"learning_rate": 9.216582082070358e-05,
"loss": 0.7498,
"step": 3616
},
{
"epoch": 0.3648,
"grad_norm": 3.8826818466186523,
"learning_rate": 9.203022099073309e-05,
"loss": 0.7993,
"step": 3648
},
{
"epoch": 0.368,
"grad_norm": 0.9782927632331848,
"learning_rate": 9.189355921956412e-05,
"loss": 0.7005,
"step": 3680
},
{
"epoch": 0.3712,
"grad_norm": 1.1662654876708984,
"learning_rate": 9.175583896011131e-05,
"loss": 0.6732,
"step": 3712
},
{
"epoch": 0.3744,
"grad_norm": 1.038501501083374,
"learning_rate": 9.161706369203317e-05,
"loss": 0.7414,
"step": 3744
},
{
"epoch": 0.3776,
"grad_norm": 2.3218936920166016,
"learning_rate": 9.147723692164427e-05,
"loss": 0.8008,
"step": 3776
},
{
"epoch": 0.3808,
"grad_norm": 2.1190292835235596,
"learning_rate": 9.13363621818265e-05,
"loss": 0.711,
"step": 3808
},
{
"epoch": 0.384,
"grad_norm": 1.8653652667999268,
"learning_rate": 9.119444303193996e-05,
"loss": 0.7641,
"step": 3840
},
{
"epoch": 0.3872,
"grad_norm": 1.760704517364502,
"learning_rate": 9.10514830577329e-05,
"loss": 0.7231,
"step": 3872
},
{
"epoch": 0.3904,
"grad_norm": 0.8437080979347229,
"learning_rate": 9.090748587125118e-05,
"loss": 0.7089,
"step": 3904
},
{
"epoch": 0.3936,
"grad_norm": 1.6417099237442017,
"learning_rate": 9.076245511074703e-05,
"loss": 0.7645,
"step": 3936
},
{
"epoch": 0.3968,
"grad_norm": 1.0280499458312988,
"learning_rate": 9.06163944405871e-05,
"loss": 0.78,
"step": 3968
},
{
"epoch": 0.4,
"grad_norm": 2.645205020904541,
"learning_rate": 9.046930755115985e-05,
"loss": 0.7443,
"step": 4000
},
{
"epoch": 0.4032,
"grad_norm": 1.4332572221755981,
"learning_rate": 9.032119815878236e-05,
"loss": 0.7138,
"step": 4032
},
{
"epoch": 0.4064,
"grad_norm": 1.3062529563903809,
"learning_rate": 9.017207000560639e-05,
"loss": 0.6866,
"step": 4064
},
{
"epoch": 0.4096,
"grad_norm": 1.559920072555542,
"learning_rate": 9.002192685952385e-05,
"loss": 0.7289,
"step": 4096
},
{
"epoch": 0.4128,
"grad_norm": 2.111950635910034,
"learning_rate": 8.987077251407158e-05,
"loss": 0.7011,
"step": 4128
},
{
"epoch": 0.416,
"grad_norm": 0.8812033534049988,
"learning_rate": 8.971861078833557e-05,
"loss": 0.7469,
"step": 4160
},
{
"epoch": 0.4192,
"grad_norm": 0.8479238748550415,
"learning_rate": 8.956544552685437e-05,
"loss": 0.7263,
"step": 4192
},
{
"epoch": 0.4224,
"grad_norm": 1.0125929117202759,
"learning_rate": 8.941128059952201e-05,
"loss": 0.6762,
"step": 4224
},
{
"epoch": 0.4256,
"grad_norm": 0.9122424721717834,
"learning_rate": 8.925611990149021e-05,
"loss": 0.7076,
"step": 4256
},
{
"epoch": 0.4288,
"grad_norm": 1.814253330230713,
"learning_rate": 8.909996735306996e-05,
"loss": 0.7143,
"step": 4288
},
{
"epoch": 0.432,
"grad_norm": 1.4890289306640625,
"learning_rate": 8.894282689963251e-05,
"loss": 0.6931,
"step": 4320
},
{
"epoch": 0.4352,
"grad_norm": 1.908116340637207,
"learning_rate": 8.878470251150959e-05,
"loss": 0.701,
"step": 4352
},
{
"epoch": 0.4384,
"grad_norm": 1.2831019163131714,
"learning_rate": 8.862559818389322e-05,
"loss": 0.7625,
"step": 4384
},
{
"epoch": 0.4416,
"grad_norm": 0.923768162727356,
"learning_rate": 8.846551793673467e-05,
"loss": 0.7902,
"step": 4416
},
{
"epoch": 0.4448,
"grad_norm": 1.6989527940750122,
"learning_rate": 8.83044658146429e-05,
"loss": 0.7006,
"step": 4448
},
{
"epoch": 0.448,
"grad_norm": 1.203029990196228,
"learning_rate": 8.814244588678245e-05,
"loss": 0.7588,
"step": 4480
},
{
"epoch": 0.4512,
"grad_norm": 1.8377019166946411,
"learning_rate": 8.797946224677052e-05,
"loss": 0.6975,
"step": 4512
},
{
"epoch": 0.4544,
"grad_norm": 1.4714457988739014,
"learning_rate": 8.78155190125736e-05,
"loss": 0.6502,
"step": 4544
},
{
"epoch": 0.4576,
"grad_norm": 1.6311497688293457,
"learning_rate": 8.765062032640346e-05,
"loss": 0.7536,
"step": 4576
},
{
"epoch": 0.4608,
"grad_norm": 1.8238953351974487,
"learning_rate": 8.748477035461238e-05,
"loss": 0.7899,
"step": 4608
},
{
"epoch": 0.464,
"grad_norm": 1.5541362762451172,
"learning_rate": 8.7317973287588e-05,
"loss": 0.6904,
"step": 4640
},
{
"epoch": 0.4672,
"grad_norm": 1.032272219657898,
"learning_rate": 8.715023333964736e-05,
"loss": 0.7395,
"step": 4672
},
{
"epoch": 0.4704,
"grad_norm": 1.3095510005950928,
"learning_rate": 8.69815547489305e-05,
"loss": 0.6854,
"step": 4704
},
{
"epoch": 0.4736,
"grad_norm": 1.5274263620376587,
"learning_rate": 8.681194177729327e-05,
"loss": 0.7498,
"step": 4736
},
{
"epoch": 0.4768,
"grad_norm": 1.4236122369766235,
"learning_rate": 8.66413987101998e-05,
"loss": 0.7356,
"step": 4768
},
{
"epoch": 0.48,
"grad_norm": 1.2118279933929443,
"learning_rate": 8.646992985661404e-05,
"loss": 0.7178,
"step": 4800
},
{
"epoch": 0.4832,
"grad_norm": 3.3495805263519287,
"learning_rate": 8.629753954889107e-05,
"loss": 0.7326,
"step": 4832
},
{
"epoch": 0.4864,
"grad_norm": 0.6829349398612976,
"learning_rate": 8.612423214266749e-05,
"loss": 0.7838,
"step": 4864
},
{
"epoch": 0.4896,
"grad_norm": 0.8314148187637329,
"learning_rate": 8.595001201675147e-05,
"loss": 0.7007,
"step": 4896
},
{
"epoch": 0.4928,
"grad_norm": 1.2672547101974487,
"learning_rate": 8.577488357301209e-05,
"loss": 0.7377,
"step": 4928
},
{
"epoch": 0.496,
"grad_norm": 1.3968323469161987,
"learning_rate": 8.559885123626807e-05,
"loss": 0.6774,
"step": 4960
},
{
"epoch": 0.4992,
"grad_norm": 1.2808008193969727,
"learning_rate": 8.542191945417601e-05,
"loss": 0.6807,
"step": 4992
},
{
"epoch": 0.5024,
"grad_norm": 1.9290404319763184,
"learning_rate": 8.524409269711807e-05,
"loss": 0.7376,
"step": 5024
},
{
"epoch": 0.5056,
"grad_norm": 1.3726913928985596,
"learning_rate": 8.506537545808892e-05,
"loss": 0.7402,
"step": 5056
},
{
"epoch": 0.5088,
"grad_norm": 1.7894905805587769,
"learning_rate": 8.48857722525823e-05,
"loss": 0.6991,
"step": 5088
},
{
"epoch": 0.512,
"grad_norm": 1.1462016105651855,
"learning_rate": 8.470528761847684e-05,
"loss": 0.7989,
"step": 5120
},
{
"epoch": 0.5152,
"grad_norm": 0.7457314729690552,
"learning_rate": 8.452392611592153e-05,
"loss": 0.7616,
"step": 5152
},
{
"epoch": 0.5184,
"grad_norm": 1.728968858718872,
"learning_rate": 8.434169232722043e-05,
"loss": 0.6324,
"step": 5184
},
{
"epoch": 0.5216,
"grad_norm": 0.9103218913078308,
"learning_rate": 8.415859085671683e-05,
"loss": 0.7222,
"step": 5216
},
{
"epoch": 0.5248,
"grad_norm": 1.602072834968567,
"learning_rate": 8.397462633067704e-05,
"loss": 0.7265,
"step": 5248
},
{
"epoch": 0.528,
"grad_norm": 0.9967379570007324,
"learning_rate": 8.378980339717349e-05,
"loss": 0.7042,
"step": 5280
},
{
"epoch": 0.5312,
"grad_norm": 1.9905532598495483,
"learning_rate": 8.360412672596712e-05,
"loss": 0.8098,
"step": 5312
},
{
"epoch": 0.5344,
"grad_norm": 1.1432929039001465,
"learning_rate": 8.341760100838965e-05,
"loss": 0.7591,
"step": 5344
},
{
"epoch": 0.5376,
"grad_norm": 2.1654775142669678,
"learning_rate": 8.323023095722486e-05,
"loss": 0.8071,
"step": 5376
},
{
"epoch": 0.5408,
"grad_norm": 1.2390097379684448,
"learning_rate": 8.304202130658959e-05,
"loss": 0.834,
"step": 5408
},
{
"epoch": 0.544,
"grad_norm": 1.0290433168411255,
"learning_rate": 8.285297681181408e-05,
"loss": 0.8228,
"step": 5440
},
{
"epoch": 0.5472,
"grad_norm": 1.299111008644104,
"learning_rate": 8.26631022493219e-05,
"loss": 0.7099,
"step": 5472
},
{
"epoch": 0.5504,
"grad_norm": 0.8850242495536804,
"learning_rate": 8.247240241650918e-05,
"loss": 0.7646,
"step": 5504
},
{
"epoch": 0.5536,
"grad_norm": 1.980812907218933,
"learning_rate": 8.22808821316235e-05,
"loss": 0.7312,
"step": 5536
},
{
"epoch": 0.5568,
"grad_norm": 1.0378026962280273,
"learning_rate": 8.208854623364202e-05,
"loss": 0.7277,
"step": 5568
},
{
"epoch": 0.56,
"grad_norm": 1.6820452213287354,
"learning_rate": 8.189539958214935e-05,
"loss": 0.7654,
"step": 5600
},
{
"epoch": 0.5632,
"grad_norm": 1.494661808013916,
"learning_rate": 8.170144705721465e-05,
"loss": 0.7208,
"step": 5632
},
{
"epoch": 0.5664,
"grad_norm": 0.9761049747467041,
"learning_rate": 8.150669355926846e-05,
"loss": 0.6898,
"step": 5664
},
{
"epoch": 0.5696,
"grad_norm": 1.3057583570480347,
"learning_rate": 8.131114400897874e-05,
"loss": 0.7887,
"step": 5696
},
{
"epoch": 0.5728,
"grad_norm": 1.0025156736373901,
"learning_rate": 8.111480334712665e-05,
"loss": 0.6483,
"step": 5728
},
{
"epoch": 0.576,
"grad_norm": 0.9818746447563171,
"learning_rate": 8.091767653448167e-05,
"loss": 0.8385,
"step": 5760
},
{
"epoch": 0.5792,
"grad_norm": 1.1921987533569336,
"learning_rate": 8.071976855167629e-05,
"loss": 0.6707,
"step": 5792
},
{
"epoch": 0.5824,
"grad_norm": 1.5055749416351318,
"learning_rate": 8.052108439908013e-05,
"loss": 0.7086,
"step": 5824
},
{
"epoch": 0.5856,
"grad_norm": 1.7581650018692017,
"learning_rate": 8.032162909667362e-05,
"loss": 0.6696,
"step": 5856
},
{
"epoch": 0.5888,
"grad_norm": 1.8909873962402344,
"learning_rate": 8.01214076839212e-05,
"loss": 0.7471,
"step": 5888
},
{
"epoch": 0.592,
"grad_norm": 1.3570644855499268,
"learning_rate": 7.992042521964389e-05,
"loss": 0.655,
"step": 5920
},
{
"epoch": 0.5952,
"grad_norm": 0.6561287641525269,
"learning_rate": 7.971868678189161e-05,
"loss": 0.719,
"step": 5952
},
{
"epoch": 0.5984,
"grad_norm": 1.3650476932525635,
"learning_rate": 7.951619746781474e-05,
"loss": 0.7405,
"step": 5984
},
{
"epoch": 0.6016,
"grad_norm": 2.8344266414642334,
"learning_rate": 7.931296239353544e-05,
"loss": 0.7192,
"step": 6016
},
{
"epoch": 0.6048,
"grad_norm": 1.6202623844146729,
"learning_rate": 7.910898669401839e-05,
"loss": 0.7671,
"step": 6048
},
{
"epoch": 0.608,
"grad_norm": 1.1194038391113281,
"learning_rate": 7.890427552294093e-05,
"loss": 0.7915,
"step": 6080
},
{
"epoch": 0.6112,
"grad_norm": 0.8267541527748108,
"learning_rate": 7.869883405256295e-05,
"loss": 0.7441,
"step": 6112
},
{
"epoch": 0.6144,
"grad_norm": 1.229134202003479,
"learning_rate": 7.849266747359619e-05,
"loss": 0.6548,
"step": 6144
},
{
"epoch": 0.6176,
"grad_norm": 1.151248812675476,
"learning_rate": 7.828578099507308e-05,
"loss": 0.6795,
"step": 6176
},
{
"epoch": 0.6208,
"grad_norm": 1.620975375175476,
"learning_rate": 7.80781798442151e-05,
"loss": 0.6352,
"step": 6208
},
{
"epoch": 0.624,
"grad_norm": 0.9030219912528992,
"learning_rate": 7.786986926630078e-05,
"loss": 0.7185,
"step": 6240
},
{
"epoch": 0.6272,
"grad_norm": 1.2997703552246094,
"learning_rate": 7.766085452453312e-05,
"loss": 0.6523,
"step": 6272
},
{
"epoch": 0.6304,
"grad_norm": 1.208347201347351,
"learning_rate": 7.74511408999066e-05,
"loss": 0.6928,
"step": 6304
},
{
"epoch": 0.6336,
"grad_norm": 0.723646879196167,
"learning_rate": 7.724073369107376e-05,
"loss": 0.6603,
"step": 6336
},
{
"epoch": 0.6368,
"grad_norm": 1.125978946685791,
"learning_rate": 7.702963821421133e-05,
"loss": 0.7328,
"step": 6368
},
{
"epoch": 0.64,
"grad_norm": 2.039461135864258,
"learning_rate": 7.6817859802886e-05,
"loss": 0.7545,
"step": 6400
},
{
"epoch": 0.6432,
"grad_norm": 1.3743586540222168,
"learning_rate": 7.660540380791942e-05,
"loss": 0.67,
"step": 6432
},
{
"epoch": 0.6464,
"grad_norm": 1.402256727218628,
"learning_rate": 7.639227559725332e-05,
"loss": 0.636,
"step": 6464
},
{
"epoch": 0.6496,
"grad_norm": 1.0240074396133423,
"learning_rate": 7.617848055581361e-05,
"loss": 0.8179,
"step": 6496
},
{
"epoch": 0.6528,
"grad_norm": 0.8905365467071533,
"learning_rate": 7.596402408537443e-05,
"loss": 0.7542,
"step": 6528
},
{
"epoch": 0.656,
"grad_norm": 1.8598270416259766,
"learning_rate": 7.574891160442179e-05,
"loss": 0.7266,
"step": 6560
},
{
"epoch": 0.6592,
"grad_norm": 0.9146720170974731,
"learning_rate": 7.553314854801641e-05,
"loss": 0.7861,
"step": 6592
},
{
"epoch": 0.6624,
"grad_norm": 1.8956897258758545,
"learning_rate": 7.531674036765662e-05,
"loss": 0.7113,
"step": 6624
},
{
"epoch": 0.6656,
"grad_norm": 1.0353283882141113,
"learning_rate": 7.509969253114055e-05,
"loss": 0.6984,
"step": 6656
},
{
"epoch": 0.6688,
"grad_norm": 1.890493631362915,
"learning_rate": 7.488201052242789e-05,
"loss": 0.6687,
"step": 6688
},
{
"epoch": 0.672,
"grad_norm": 0.9367122054100037,
"learning_rate": 7.46636998415015e-05,
"loss": 0.719,
"step": 6720
},
{
"epoch": 0.6752,
"grad_norm": 1.1989344358444214,
"learning_rate": 7.444476600422828e-05,
"loss": 0.775,
"step": 6752
},
{
"epoch": 0.6784,
"grad_norm": 0.8481733202934265,
"learning_rate": 7.42252145422199e-05,
"loss": 0.7667,
"step": 6784
},
{
"epoch": 0.6816,
"grad_norm": 1.0271095037460327,
"learning_rate": 7.400505100269307e-05,
"loss": 0.653,
"step": 6816
},
{
"epoch": 0.6848,
"grad_norm": 1.3998816013336182,
"learning_rate": 7.378428094832931e-05,
"loss": 0.6651,
"step": 6848
},
{
"epoch": 0.688,
"grad_norm": 1.3338642120361328,
"learning_rate": 7.356290995713437e-05,
"loss": 0.6266,
"step": 6880
},
{
"epoch": 0.6912,
"grad_norm": 0.8170168995857239,
"learning_rate": 7.334094362229739e-05,
"loss": 0.765,
"step": 6912
},
{
"epoch": 0.6944,
"grad_norm": 1.4982614517211914,
"learning_rate": 7.311838755204959e-05,
"loss": 0.641,
"step": 6944
},
{
"epoch": 0.6976,
"grad_norm": 1.623159646987915,
"learning_rate": 7.290222928580347e-05,
"loss": 0.6462,
"step": 6976
},
{
"epoch": 0.7008,
"grad_norm": 1.169145941734314,
"learning_rate": 7.267852862072673e-05,
"loss": 0.7506,
"step": 7008
},
{
"epoch": 0.704,
"grad_norm": 1.011816382408142,
"learning_rate": 7.245425495690538e-05,
"loss": 0.7183,
"step": 7040
},
{
"epoch": 0.7072,
"grad_norm": 3.0435078144073486,
"learning_rate": 7.222941396086789e-05,
"loss": 0.7948,
"step": 7072
},
{
"epoch": 0.7104,
"grad_norm": 0.802679717540741,
"learning_rate": 7.2004011313477e-05,
"loss": 0.8216,
"step": 7104
},
{
"epoch": 0.7136,
"grad_norm": 0.7551457285881042,
"learning_rate": 7.17780527097862e-05,
"loss": 0.7823,
"step": 7136
},
{
"epoch": 0.7168,
"grad_norm": 1.3118380308151245,
"learning_rate": 7.155154385889589e-05,
"loss": 0.7803,
"step": 7168
},
{
"epoch": 0.72,
"grad_norm": 1.1100643873214722,
"learning_rate": 7.132449048380907e-05,
"loss": 0.7425,
"step": 7200
},
{
"epoch": 0.7232,
"grad_norm": 0.8792561888694763,
"learning_rate": 7.109689832128673e-05,
"loss": 0.7515,
"step": 7232
},
{
"epoch": 0.7264,
"grad_norm": 0.8382082581520081,
"learning_rate": 7.0868773121703e-05,
"loss": 0.8134,
"step": 7264
},
{
"epoch": 0.7296,
"grad_norm": 1.7332772016525269,
"learning_rate": 7.064012064889971e-05,
"loss": 0.6971,
"step": 7296
},
{
"epoch": 0.7328,
"grad_norm": 1.4402042627334595,
"learning_rate": 7.041094668004093e-05,
"loss": 0.6845,
"step": 7328
},
{
"epoch": 0.736,
"grad_norm": 1.1810777187347412,
"learning_rate": 7.018125700546683e-05,
"loss": 0.7472,
"step": 7360
},
{
"epoch": 0.7392,
"grad_norm": 0.9390580058097839,
"learning_rate": 6.995105742854759e-05,
"loss": 0.8127,
"step": 7392
},
{
"epoch": 0.7424,
"grad_norm": 1.570432186126709,
"learning_rate": 6.972035376553656e-05,
"loss": 0.7071,
"step": 7424
},
{
"epoch": 0.7456,
"grad_norm": 1.168547511100769,
"learning_rate": 6.94891518454234e-05,
"loss": 0.7017,
"step": 7456
},
{
"epoch": 0.7488,
"grad_norm": 1.1337932348251343,
"learning_rate": 6.925745750978686e-05,
"loss": 0.6738,
"step": 7488
},
{
"epoch": 0.752,
"grad_norm": 1.351352334022522,
"learning_rate": 6.902527661264701e-05,
"loss": 0.7548,
"step": 7520
},
{
"epoch": 0.7552,
"grad_norm": 0.6679269671440125,
"learning_rate": 6.87926150203176e-05,
"loss": 0.7106,
"step": 7552
},
{
"epoch": 0.7584,
"grad_norm": 1.3825992345809937,
"learning_rate": 6.855947861125759e-05,
"loss": 0.6443,
"step": 7584
},
{
"epoch": 0.7616,
"grad_norm": 1.1650683879852295,
"learning_rate": 6.832587327592275e-05,
"loss": 0.7547,
"step": 7616
},
{
"epoch": 0.7648,
"grad_norm": 1.5112355947494507,
"learning_rate": 6.809180491661678e-05,
"loss": 0.7076,
"step": 7648
},
{
"epoch": 0.768,
"grad_norm": 0.8795199990272522,
"learning_rate": 6.785727944734228e-05,
"loss": 0.7345,
"step": 7680
},
{
"epoch": 0.7712,
"grad_norm": 1.6340776681900024,
"learning_rate": 6.762230279365114e-05,
"loss": 0.7517,
"step": 7712
},
{
"epoch": 0.7744,
"grad_norm": 1.022924542427063,
"learning_rate": 6.738688089249502e-05,
"loss": 0.6874,
"step": 7744
},
{
"epoch": 0.7776,
"grad_norm": 1.2930107116699219,
"learning_rate": 6.715101969207525e-05,
"loss": 0.7479,
"step": 7776
},
{
"epoch": 0.7808,
"grad_norm": 1.9842311143875122,
"learning_rate": 6.691472515169251e-05,
"loss": 0.7479,
"step": 7808
},
{
"epoch": 0.784,
"grad_norm": 1.5960675477981567,
"learning_rate": 6.667800324159636e-05,
"loss": 0.7928,
"step": 7840
},
{
"epoch": 0.7872,
"grad_norm": 3.447913885116577,
"learning_rate": 6.644085994283433e-05,
"loss": 0.6924,
"step": 7872
},
{
"epoch": 0.7904,
"grad_norm": 0.8809865713119507,
"learning_rate": 6.620330124710077e-05,
"loss": 0.7955,
"step": 7904
},
{
"epoch": 0.7936,
"grad_norm": 1.3761461973190308,
"learning_rate": 6.596533315658555e-05,
"loss": 0.6842,
"step": 7936
},
{
"epoch": 0.7968,
"grad_norm": 0.9557456374168396,
"learning_rate": 6.572696168382235e-05,
"loss": 0.7285,
"step": 7968
},
{
"epoch": 0.8,
"grad_norm": 0.7569695115089417,
"learning_rate": 6.548819285153676e-05,
"loss": 0.6431,
"step": 8000
},
{
"epoch": 0.8032,
"grad_norm": 1.2884209156036377,
"learning_rate": 6.524903269249411e-05,
"loss": 0.739,
"step": 8032
},
{
"epoch": 0.8064,
"grad_norm": 1.033050775527954,
"learning_rate": 6.500948724934703e-05,
"loss": 0.6759,
"step": 8064
},
{
"epoch": 0.8096,
"grad_norm": 0.9404661655426025,
"learning_rate": 6.47695625744828e-05,
"loss": 0.696,
"step": 8096
},
{
"epoch": 0.8128,
"grad_norm": 0.8363805413246155,
"learning_rate": 6.452926472987044e-05,
"loss": 0.7273,
"step": 8128
},
{
"epoch": 0.816,
"grad_norm": 0.7976164817810059,
"learning_rate": 6.428859978690748e-05,
"loss": 0.6671,
"step": 8160
},
{
"epoch": 0.8192,
"grad_norm": 1.6969666481018066,
"learning_rate": 6.404757382626669e-05,
"loss": 0.6968,
"step": 8192
},
{
"epoch": 0.8224,
"grad_norm": 1.061860203742981,
"learning_rate": 6.380619293774223e-05,
"loss": 0.7424,
"step": 8224
},
{
"epoch": 0.8256,
"grad_norm": 1.2336043119430542,
"learning_rate": 6.356446322009607e-05,
"loss": 0.6786,
"step": 8256
},
{
"epoch": 0.8288,
"grad_norm": 1.3530735969543457,
"learning_rate": 6.332239078090358e-05,
"loss": 0.7042,
"step": 8288
},
{
"epoch": 0.832,
"grad_norm": 0.9186837673187256,
"learning_rate": 6.307998173639954e-05,
"loss": 0.7433,
"step": 8320
},
{
"epoch": 0.8352,
"grad_norm": 1.0583479404449463,
"learning_rate": 6.283724221132333e-05,
"loss": 0.6515,
"step": 8352
},
{
"epoch": 0.8384,
"grad_norm": 1.468887209892273,
"learning_rate": 6.259417833876432e-05,
"loss": 0.7033,
"step": 8384
},
{
"epoch": 0.8416,
"grad_norm": 0.7726921439170837,
"learning_rate": 6.235079626000694e-05,
"loss": 0.721,
"step": 8416
},
{
"epoch": 0.8448,
"grad_norm": 1.8641211986541748,
"learning_rate": 6.21071021243754e-05,
"loss": 0.626,
"step": 8448
},
{
"epoch": 0.848,
"grad_norm": 1.9702180624008179,
"learning_rate": 6.186310208907839e-05,
"loss": 0.6017,
"step": 8480
},
{
"epoch": 0.8512,
"grad_norm": 2.057535171508789,
"learning_rate": 6.161880231905354e-05,
"loss": 0.7612,
"step": 8512
},
{
"epoch": 0.8544,
"grad_norm": 2.2840230464935303,
"learning_rate": 6.137420898681158e-05,
"loss": 0.6609,
"step": 8544
},
{
"epoch": 0.8576,
"grad_norm": 1.7856135368347168,
"learning_rate": 6.112932827228044e-05,
"loss": 0.7015,
"step": 8576
},
{
"epoch": 0.8608,
"grad_norm": 1.0354335308074951,
"learning_rate": 6.0884166362649075e-05,
"loss": 0.6714,
"step": 8608
},
{
"epoch": 0.864,
"grad_norm": 1.054237961769104,
"learning_rate": 6.063872945221118e-05,
"loss": 0.6928,
"step": 8640
},
{
"epoch": 0.8672,
"grad_norm": 1.004862904548645,
"learning_rate": 6.039302374220861e-05,
"loss": 0.7676,
"step": 8672
},
{
"epoch": 0.8704,
"grad_norm": 0.8693735003471375,
"learning_rate": 6.0147055440674795e-05,
"loss": 0.7562,
"step": 8704
},
{
"epoch": 0.8736,
"grad_norm": 1.6824612617492676,
"learning_rate": 5.990083076227782e-05,
"loss": 0.6509,
"step": 8736
},
{
"epoch": 0.8768,
"grad_norm": 3.1215667724609375,
"learning_rate": 5.9654355928163416e-05,
"loss": 0.7553,
"step": 8768
},
{
"epoch": 0.88,
"grad_norm": 1.4479137659072876,
"learning_rate": 5.9407637165797793e-05,
"loss": 0.8046,
"step": 8800
},
{
"epoch": 0.8832,
"grad_norm": 2.769347906112671,
"learning_rate": 5.916068070881026e-05,
"loss": 0.6869,
"step": 8832
},
{
"epoch": 0.8864,
"grad_norm": 1.338932752609253,
"learning_rate": 5.891349279683578e-05,
"loss": 0.6742,
"step": 8864
},
{
"epoch": 0.8896,
"grad_norm": 1.15195631980896,
"learning_rate": 5.8666079675357285e-05,
"loss": 0.6972,
"step": 8896
},
{
"epoch": 0.8928,
"grad_norm": 1.0247623920440674,
"learning_rate": 5.841844759554787e-05,
"loss": 0.7107,
"step": 8928
},
{
"epoch": 0.896,
"grad_norm": 1.4130921363830566,
"learning_rate": 5.817060281411284e-05,
"loss": 0.7327,
"step": 8960
},
{
"epoch": 0.8992,
"grad_norm": 0.6436507701873779,
"learning_rate": 5.792255159313169e-05,
"loss": 0.6418,
"step": 8992
},
{
"epoch": 0.9024,
"grad_norm": 0.9555985331535339,
"learning_rate": 5.7674300199899834e-05,
"loss": 0.7157,
"step": 9024
},
{
"epoch": 0.9056,
"grad_norm": 0.8774769306182861,
"learning_rate": 5.742585490677024e-05,
"loss": 0.6197,
"step": 9056
},
{
"epoch": 0.9088,
"grad_norm": 0.9347734451293945,
"learning_rate": 5.7177221990995e-05,
"loss": 0.6672,
"step": 9088
},
{
"epoch": 0.912,
"grad_norm": 1.2730952501296997,
"learning_rate": 5.692840773456669e-05,
"loss": 0.7524,
"step": 9120
},
{
"epoch": 0.9152,
"grad_norm": 1.3449304103851318,
"learning_rate": 5.667941842405968e-05,
"loss": 0.7106,
"step": 9152
},
{
"epoch": 0.9184,
"grad_norm": 2.288444757461548,
"learning_rate": 5.643026035047128e-05,
"loss": 0.7239,
"step": 9184
},
{
"epoch": 0.9216,
"grad_norm": 1.1817107200622559,
"learning_rate": 5.618093980906276e-05,
"loss": 0.7342,
"step": 9216
},
{
"epoch": 0.9248,
"grad_norm": 1.4276821613311768,
"learning_rate": 5.5931463099200355e-05,
"loss": 0.6198,
"step": 9248
},
{
"epoch": 0.928,
"grad_norm": 1.0878974199295044,
"learning_rate": 5.568183652419606e-05,
"loss": 0.7204,
"step": 9280
},
{
"epoch": 0.9312,
"grad_norm": 1.5497533082962036,
"learning_rate": 5.54320663911484e-05,
"loss": 0.7218,
"step": 9312
},
{
"epoch": 0.9344,
"grad_norm": 0.5286266207695007,
"learning_rate": 5.518215901078302e-05,
"loss": 0.8243,
"step": 9344
},
{
"epoch": 0.9376,
"grad_norm": 1.9889594316482544,
"learning_rate": 5.493212069729332e-05,
"loss": 0.6849,
"step": 9376
},
{
"epoch": 0.9408,
"grad_norm": 1.6639822721481323,
"learning_rate": 5.468195776818084e-05,
"loss": 0.682,
"step": 9408
},
{
"epoch": 0.944,
"grad_norm": 3.0651698112487793,
"learning_rate": 5.4431676544095676e-05,
"loss": 0.8112,
"step": 9440
},
{
"epoch": 0.9472,
"grad_norm": 1.0381174087524414,
"learning_rate": 5.4181283348676806e-05,
"loss": 0.6497,
"step": 9472
},
{
"epoch": 0.9504,
"grad_norm": 1.0353689193725586,
"learning_rate": 5.393078450839228e-05,
"loss": 0.6654,
"step": 9504
},
{
"epoch": 0.9536,
"grad_norm": 1.6130503416061401,
"learning_rate": 5.368018635237936e-05,
"loss": 0.7351,
"step": 9536
},
{
"epoch": 0.9568,
"grad_norm": 1.171970248222351,
"learning_rate": 5.3429495212284665e-05,
"loss": 0.7099,
"step": 9568
},
{
"epoch": 0.96,
"grad_norm": 1.937739610671997,
"learning_rate": 5.3178717422104144e-05,
"loss": 0.6366,
"step": 9600
},
{
"epoch": 0.9632,
"grad_norm": 1.8911631107330322,
"learning_rate": 5.2927859318023073e-05,
"loss": 0.6813,
"step": 9632
},
{
"epoch": 0.9664,
"grad_norm": 1.1599578857421875,
"learning_rate": 5.2676927238255946e-05,
"loss": 0.7155,
"step": 9664
},
{
"epoch": 0.9696,
"grad_norm": 1.2809479236602783,
"learning_rate": 5.242592752288632e-05,
"loss": 0.7051,
"step": 9696
},
{
"epoch": 0.9728,
"grad_norm": 2.0790278911590576,
"learning_rate": 5.2174866513706646e-05,
"loss": 0.7387,
"step": 9728
},
{
"epoch": 0.976,
"grad_norm": 1.0074536800384521,
"learning_rate": 5.1923750554058084e-05,
"loss": 0.6751,
"step": 9760
},
{
"epoch": 0.9792,
"grad_norm": 1.3937727212905884,
"learning_rate": 5.16725859886701e-05,
"loss": 0.6902,
"step": 9792
},
{
"epoch": 0.9824,
"grad_norm": 0.8866567015647888,
"learning_rate": 5.142137916350028e-05,
"loss": 0.7443,
"step": 9824
},
{
"epoch": 0.9856,
"grad_norm": 0.857765793800354,
"learning_rate": 5.1170136425573956e-05,
"loss": 0.7032,
"step": 9856
},
{
"epoch": 0.9888,
"grad_norm": 0.6846195459365845,
"learning_rate": 5.0918864122823816e-05,
"loss": 0.6508,
"step": 9888
},
{
"epoch": 0.992,
"grad_norm": 0.9779634475708008,
"learning_rate": 5.066756860392956e-05,
"loss": 0.7161,
"step": 9920
},
{
"epoch": 0.9952,
"grad_norm": 1.3198580741882324,
"learning_rate": 5.0416256218157476e-05,
"loss": 0.6885,
"step": 9952
},
{
"epoch": 0.9984,
"grad_norm": 0.8396392464637756,
"learning_rate": 5.0164933315199955e-05,
"loss": 0.7511,
"step": 9984
},
{
"epoch": 1.0016,
"grad_norm": 0.9331254363059998,
"learning_rate": 4.991360624501518e-05,
"loss": 0.7289,
"step": 10016
},
{
"epoch": 1.0048,
"grad_norm": 1.1577215194702148,
"learning_rate": 4.966228135766662e-05,
"loss": 0.7328,
"step": 10048
},
{
"epoch": 1.008,
"grad_norm": 2.2317230701446533,
"learning_rate": 4.941096500316253e-05,
"loss": 0.6988,
"step": 10080
},
{
"epoch": 1.0112,
"grad_norm": 0.8188418745994568,
"learning_rate": 4.915966353129567e-05,
"loss": 0.6093,
"step": 10112
},
{
"epoch": 1.0144,
"grad_norm": 0.7794582843780518,
"learning_rate": 4.890838329148268e-05,
"loss": 0.7017,
"step": 10144
},
{
"epoch": 1.0176,
"grad_norm": 1.402685284614563,
"learning_rate": 4.865713063260379e-05,
"loss": 0.5686,
"step": 10176
},
{
"epoch": 1.0208,
"grad_norm": 0.8832930326461792,
"learning_rate": 4.840591190284238e-05,
"loss": 0.7829,
"step": 10208
},
{
"epoch": 1.024,
"grad_norm": 1.5789443254470825,
"learning_rate": 4.8154733449524544e-05,
"loss": 0.7085,
"step": 10240
},
{
"epoch": 1.0272,
"grad_norm": 1.2192682027816772,
"learning_rate": 4.790360161895878e-05,
"loss": 0.7648,
"step": 10272
},
{
"epoch": 1.0304,
"grad_norm": 1.4799787998199463,
"learning_rate": 4.765252275627554e-05,
"loss": 0.6982,
"step": 10304
},
{
"epoch": 1.0336,
"grad_norm": 0.4716717004776001,
"learning_rate": 4.74015032052671e-05,
"loss": 0.6898,
"step": 10336
},
{
"epoch": 1.0368,
"grad_norm": 1.9356942176818848,
"learning_rate": 4.715054930822703e-05,
"loss": 0.7243,
"step": 10368
},
{
"epoch": 1.04,
"grad_norm": 0.943780243396759,
"learning_rate": 4.689966740579016e-05,
"loss": 0.6593,
"step": 10400
},
{
"epoch": 1.0432,
"grad_norm": 1.5752168893814087,
"learning_rate": 4.664886383677229e-05,
"loss": 0.7061,
"step": 10432
},
{
"epoch": 1.0464,
"grad_norm": 3.25386381149292,
"learning_rate": 4.639814493800998e-05,
"loss": 0.722,
"step": 10464
},
{
"epoch": 1.0496,
"grad_norm": 0.9472445249557495,
"learning_rate": 4.6147517044200576e-05,
"loss": 0.6558,
"step": 10496
},
{
"epoch": 1.0528,
"grad_norm": 1.0976086854934692,
"learning_rate": 4.5896986487742015e-05,
"loss": 0.7291,
"step": 10528
},
{
"epoch": 1.056,
"grad_norm": 1.4805541038513184,
"learning_rate": 4.564655959857295e-05,
"loss": 0.6784,
"step": 10560
},
{
"epoch": 1.0592,
"grad_norm": 0.9332720637321472,
"learning_rate": 4.5396242704012734e-05,
"loss": 0.7588,
"step": 10592
},
{
"epoch": 1.0624,
"grad_norm": 2.2907462120056152,
"learning_rate": 4.514604212860156e-05,
"loss": 0.6782,
"step": 10624
},
{
"epoch": 1.0656,
"grad_norm": 1.4787052869796753,
"learning_rate": 4.489596419394075e-05,
"loss": 0.7251,
"step": 10656
},
{
"epoch": 1.0688,
"grad_norm": 1.5562191009521484,
"learning_rate": 4.4646015218532874e-05,
"loss": 0.6026,
"step": 10688
},
{
"epoch": 1.072,
"grad_norm": 1.118370771408081,
"learning_rate": 4.439620151762232e-05,
"loss": 0.6583,
"step": 10720
},
{
"epoch": 1.0752,
"grad_norm": 1.5327064990997314,
"learning_rate": 4.41465294030355e-05,
"loss": 0.7432,
"step": 10752
},
{
"epoch": 1.0784,
"grad_norm": 0.7389672994613647,
"learning_rate": 4.3897005183021537e-05,
"loss": 0.7404,
"step": 10784
},
{
"epoch": 1.0816,
"grad_norm": 1.2267436981201172,
"learning_rate": 4.364763516209287e-05,
"loss": 0.7079,
"step": 10816
},
{
"epoch": 1.0848,
"grad_norm": 1.2490900754928589,
"learning_rate": 4.3398425640865815e-05,
"loss": 0.6579,
"step": 10848
},
{
"epoch": 1.088,
"grad_norm": 1.3137383460998535,
"learning_rate": 4.3149382915901606e-05,
"loss": 0.6725,
"step": 10880
},
{
"epoch": 1.0912,
"grad_norm": 1.0889027118682861,
"learning_rate": 4.290051327954708e-05,
"loss": 0.7875,
"step": 10912
},
{
"epoch": 1.0944,
"grad_norm": 0.8377600908279419,
"learning_rate": 4.2651823019775854e-05,
"loss": 0.6738,
"step": 10944
},
{
"epoch": 1.0976,
"grad_norm": 1.071028709411621,
"learning_rate": 4.240331842002938e-05,
"loss": 0.6673,
"step": 10976
},
{
"epoch": 1.1008,
"grad_norm": 0.9563510417938232,
"learning_rate": 4.2155005759058166e-05,
"loss": 0.6889,
"step": 11008
},
{
"epoch": 1.104,
"grad_norm": 1.0600881576538086,
"learning_rate": 4.190689131076323e-05,
"loss": 0.6657,
"step": 11040
},
{
"epoch": 1.1072,
"grad_norm": 1.4170634746551514,
"learning_rate": 4.1666725373083604e-05,
"loss": 0.6737,
"step": 11072
},
{
"epoch": 1.1104,
"grad_norm": 1.095928430557251,
"learning_rate": 4.14190194711032e-05,
"loss": 0.7647,
"step": 11104
},
{
"epoch": 1.1136,
"grad_norm": 1.0701088905334473,
"learning_rate": 4.117153037732726e-05,
"loss": 0.6774,
"step": 11136
},
{
"epoch": 1.1168,
"grad_norm": 1.4848291873931885,
"learning_rate": 4.0924264344848436e-05,
"loss": 0.722,
"step": 11168
},
{
"epoch": 1.12,
"grad_norm": 0.7388201355934143,
"learning_rate": 4.067722762112345e-05,
"loss": 0.6531,
"step": 11200
},
{
"epoch": 1.1232,
"grad_norm": 1.835442304611206,
"learning_rate": 4.043042644781526e-05,
"loss": 0.6337,
"step": 11232
},
{
"epoch": 1.1264,
"grad_norm": 1.07326340675354,
"learning_rate": 4.0183867060635446e-05,
"loss": 0.6441,
"step": 11264
},
{
"epoch": 1.1296,
"grad_norm": 1.252897024154663,
"learning_rate": 3.9937555689186486e-05,
"loss": 0.673,
"step": 11296
},
{
"epoch": 1.1328,
"grad_norm": 0.7107636332511902,
"learning_rate": 3.9691498556804554e-05,
"loss": 0.659,
"step": 11328
},
{
"epoch": 1.1360000000000001,
"grad_norm": 2.5162734985351562,
"learning_rate": 3.9445701880402126e-05,
"loss": 0.6303,
"step": 11360
},
{
"epoch": 1.1392,
"grad_norm": 1.3729472160339355,
"learning_rate": 3.920017187031098e-05,
"loss": 0.6804,
"step": 11392
},
{
"epoch": 1.1424,
"grad_norm": 0.5136198401451111,
"learning_rate": 3.8954914730125304e-05,
"loss": 0.7017,
"step": 11424
},
{
"epoch": 1.1456,
"grad_norm": 0.7279484272003174,
"learning_rate": 3.870993665654482e-05,
"loss": 0.6259,
"step": 11456
},
{
"epoch": 1.1488,
"grad_norm": 1.321696162223816,
"learning_rate": 3.8465243839218414e-05,
"loss": 0.7333,
"step": 11488
},
{
"epoch": 1.152,
"grad_norm": 1.254591703414917,
"learning_rate": 3.8220842460587636e-05,
"loss": 0.7907,
"step": 11520
},
{
"epoch": 1.1552,
"grad_norm": 1.9220726490020752,
"learning_rate": 3.7976738695730456e-05,
"loss": 0.7166,
"step": 11552
},
{
"epoch": 1.1584,
"grad_norm": 1.3406776189804077,
"learning_rate": 3.7732938712205336e-05,
"loss": 0.7136,
"step": 11584
},
{
"epoch": 1.1616,
"grad_norm": 2.079749822616577,
"learning_rate": 3.7489448669895324e-05,
"loss": 0.7138,
"step": 11616
},
{
"epoch": 1.1648,
"grad_norm": 1.3080401420593262,
"learning_rate": 3.72462747208525e-05,
"loss": 0.7515,
"step": 11648
},
{
"epoch": 1.168,
"grad_norm": 1.9912532567977905,
"learning_rate": 3.700342300914244e-05,
"loss": 0.6928,
"step": 11680
},
{
"epoch": 1.1712,
"grad_norm": 2.2096424102783203,
"learning_rate": 3.6760899670689076e-05,
"loss": 0.6403,
"step": 11712
},
{
"epoch": 1.1743999999999999,
"grad_norm": 1.913182020187378,
"learning_rate": 3.651871083311957e-05,
"loss": 0.6714,
"step": 11744
},
{
"epoch": 1.1776,
"grad_norm": 1.0321300029754639,
"learning_rate": 3.627686261560957e-05,
"loss": 0.7559,
"step": 11776
},
{
"epoch": 1.1808,
"grad_norm": 1.6351675987243652,
"learning_rate": 3.603536112872858e-05,
"loss": 0.6845,
"step": 11808
},
{
"epoch": 1.184,
"grad_norm": 0.8725209832191467,
"learning_rate": 3.5794212474285504e-05,
"loss": 0.6107,
"step": 11840
},
{
"epoch": 1.1872,
"grad_norm": 0.8211314678192139,
"learning_rate": 3.5553422745174604e-05,
"loss": 0.6728,
"step": 11872
},
{
"epoch": 1.1904,
"grad_norm": 1.5020124912261963,
"learning_rate": 3.531299802522148e-05,
"loss": 0.6603,
"step": 11904
},
{
"epoch": 1.1936,
"grad_norm": 1.5487534999847412,
"learning_rate": 3.507294438902929e-05,
"loss": 0.719,
"step": 11936
},
{
"epoch": 1.1968,
"grad_norm": 1.2901443243026733,
"learning_rate": 3.483326790182544e-05,
"loss": 0.6501,
"step": 11968
},
{
"epoch": 1.2,
"grad_norm": 1.5855425596237183,
"learning_rate": 3.4593974619308136e-05,
"loss": 0.8073,
"step": 12000
},
{
"epoch": 1.2032,
"grad_norm": 1.250167965888977,
"learning_rate": 3.435507058749358e-05,
"loss": 0.698,
"step": 12032
},
{
"epoch": 1.2064,
"grad_norm": 0.8152143359184265,
"learning_rate": 3.411656184256304e-05,
"loss": 0.7344,
"step": 12064
},
{
"epoch": 1.2096,
"grad_norm": 0.6146981120109558,
"learning_rate": 3.387845441071046e-05,
"loss": 0.6858,
"step": 12096
},
{
"epoch": 1.2128,
"grad_norm": 0.8901644349098206,
"learning_rate": 3.364075430799013e-05,
"loss": 0.7079,
"step": 12128
},
{
"epoch": 1.216,
"grad_norm": 2.3424062728881836,
"learning_rate": 3.340346754016471e-05,
"loss": 0.6571,
"step": 12160
},
{
"epoch": 1.2192,
"grad_norm": 1.396082878112793,
"learning_rate": 3.316660010255351e-05,
"loss": 0.6591,
"step": 12192
},
{
"epoch": 1.2224,
"grad_norm": 1.133885145187378,
"learning_rate": 3.2930157979880925e-05,
"loss": 0.6771,
"step": 12224
},
{
"epoch": 1.2256,
"grad_norm": 1.2843247652053833,
"learning_rate": 3.2694147146125345e-05,
"loss": 0.7611,
"step": 12256
},
{
"epoch": 1.2288000000000001,
"grad_norm": 1.4448399543762207,
"learning_rate": 3.245857356436817e-05,
"loss": 0.7749,
"step": 12288
},
{
"epoch": 1.232,
"grad_norm": 1.0012179613113403,
"learning_rate": 3.2223443186643044e-05,
"loss": 0.6742,
"step": 12320
},
{
"epoch": 1.2352,
"grad_norm": 1.815568447113037,
"learning_rate": 3.198876195378566e-05,
"loss": 0.6298,
"step": 12352
},
{
"epoch": 1.2384,
"grad_norm": 1.159679889678955,
"learning_rate": 3.175453579528347e-05,
"loss": 0.7449,
"step": 12384
},
{
"epoch": 1.2416,
"grad_norm": 1.4494870901107788,
"learning_rate": 3.152077062912602e-05,
"loss": 0.7288,
"step": 12416
},
{
"epoch": 1.2448,
"grad_norm": 0.5199709534645081,
"learning_rate": 3.128747236165535e-05,
"loss": 0.6784,
"step": 12448
},
{
"epoch": 1.248,
"grad_norm": 1.009386420249939,
"learning_rate": 3.105464688741674e-05,
"loss": 0.6768,
"step": 12480
},
{
"epoch": 1.2511999999999999,
"grad_norm": 1.244207501411438,
"learning_rate": 3.082230008900986e-05,
"loss": 0.6553,
"step": 12512
},
{
"epoch": 1.2544,
"grad_norm": 1.0833635330200195,
"learning_rate": 3.059043783694005e-05,
"loss": 0.6678,
"step": 12544
},
{
"epoch": 1.2576,
"grad_norm": 1.3821258544921875,
"learning_rate": 3.0359065989470072e-05,
"loss": 0.6889,
"step": 12576
},
{
"epoch": 1.2608,
"grad_norm": 1.3211382627487183,
"learning_rate": 3.012819039247201e-05,
"loss": 0.7779,
"step": 12608
},
{
"epoch": 1.264,
"grad_norm": 1.2246074676513672,
"learning_rate": 2.989781687927968e-05,
"loss": 0.6459,
"step": 12640
},
{
"epoch": 1.2671999999999999,
"grad_norm": 2.3037734031677246,
"learning_rate": 2.9667951270541162e-05,
"loss": 0.7138,
"step": 12672
},
{
"epoch": 1.2704,
"grad_norm": 2.228825330734253,
"learning_rate": 2.9438599374071725e-05,
"loss": 0.6843,
"step": 12704
},
{
"epoch": 1.2736,
"grad_norm": 3.4860639572143555,
"learning_rate": 2.9209766984707145e-05,
"loss": 0.6952,
"step": 12736
},
{
"epoch": 1.2768,
"grad_norm": 0.9082501530647278,
"learning_rate": 2.8981459884157214e-05,
"loss": 0.7082,
"step": 12768
},
{
"epoch": 1.28,
"grad_norm": 1.2819948196411133,
"learning_rate": 2.8753683840859807e-05,
"loss": 0.7196,
"step": 12800
},
{
"epoch": 1.2832,
"grad_norm": 1.5968257188796997,
"learning_rate": 2.8526444609834935e-05,
"loss": 0.7115,
"step": 12832
},
{
"epoch": 1.2864,
"grad_norm": 1.9902583360671997,
"learning_rate": 2.8299747932539468e-05,
"loss": 0.6307,
"step": 12864
},
{
"epoch": 1.2896,
"grad_norm": 1.1297131776809692,
"learning_rate": 2.807359953672206e-05,
"loss": 0.7802,
"step": 12896
},
{
"epoch": 1.2928,
"grad_norm": 1.4224005937576294,
"learning_rate": 2.7848005136278388e-05,
"loss": 0.7239,
"step": 12928
},
{
"epoch": 1.296,
"grad_norm": 0.8316251635551453,
"learning_rate": 2.7622970431106825e-05,
"loss": 0.6876,
"step": 12960
},
{
"epoch": 1.2992,
"grad_norm": 1.1183089017868042,
"learning_rate": 2.7398501106964427e-05,
"loss": 0.6967,
"step": 12992
},
{
"epoch": 1.3024,
"grad_norm": 1.4382871389389038,
"learning_rate": 2.7174602835323182e-05,
"loss": 0.6734,
"step": 13024
},
{
"epoch": 1.3056,
"grad_norm": 1.0771244764328003,
"learning_rate": 2.695128127322689e-05,
"loss": 0.6573,
"step": 13056
},
{
"epoch": 1.3088,
"grad_norm": 1.7864910364151,
"learning_rate": 2.6728542063148032e-05,
"loss": 0.5815,
"step": 13088
},
{
"epoch": 1.312,
"grad_norm": 1.222066879272461,
"learning_rate": 2.6506390832845403e-05,
"loss": 0.6817,
"step": 13120
},
{
"epoch": 1.3152,
"grad_norm": 1.3181254863739014,
"learning_rate": 2.6284833195221714e-05,
"loss": 0.7327,
"step": 13152
},
{
"epoch": 1.3184,
"grad_norm": 0.9396976828575134,
"learning_rate": 2.6063874748182e-05,
"loss": 0.6547,
"step": 13184
},
{
"epoch": 1.3216,
"grad_norm": 1.0401841402053833,
"learning_rate": 2.585039791716687e-05,
"loss": 0.693,
"step": 13216
},
{
"epoch": 1.3248,
"grad_norm": 1.1094788312911987,
"learning_rate": 2.5630635427040247e-05,
"loss": 0.787,
"step": 13248
},
{
"epoch": 1.328,
"grad_norm": 1.2137497663497925,
"learning_rate": 2.54114886565461e-05,
"loss": 0.728,
"step": 13280
},
{
"epoch": 1.3312,
"grad_norm": 1.508653163909912,
"learning_rate": 2.5192963142676086e-05,
"loss": 0.6142,
"step": 13312
},
{
"epoch": 1.3344,
"grad_norm": 2.5001964569091797,
"learning_rate": 2.4975064406725152e-05,
"loss": 0.7211,
"step": 13344
},
{
"epoch": 1.3376000000000001,
"grad_norm": 1.8579870462417603,
"learning_rate": 2.475779795415199e-05,
"loss": 0.6963,
"step": 13376
},
{
"epoch": 1.3408,
"grad_norm": 1.3078196048736572,
"learning_rate": 2.45411692744399e-05,
"loss": 0.6836,
"step": 13408
},
{
"epoch": 1.3439999999999999,
"grad_norm": 0.9487518072128296,
"learning_rate": 2.432518384095813e-05,
"loss": 0.6803,
"step": 13440
},
{
"epoch": 1.3472,
"grad_norm": 1.867445945739746,
"learning_rate": 2.4109847110823642e-05,
"loss": 0.6249,
"step": 13472
},
{
"epoch": 1.3504,
"grad_norm": 0.5121757388114929,
"learning_rate": 2.3895164524763104e-05,
"loss": 0.7344,
"step": 13504
},
{
"epoch": 1.3536000000000001,
"grad_norm": 1.702012062072754,
"learning_rate": 2.3681141506975502e-05,
"loss": 0.6328,
"step": 13536
},
{
"epoch": 1.3568,
"grad_norm": 2.03639554977417,
"learning_rate": 2.3467783464995107e-05,
"loss": 0.6805,
"step": 13568
},
{
"epoch": 1.3599999999999999,
"grad_norm": 2.0144524574279785,
"learning_rate": 2.3255095789554843e-05,
"loss": 0.6536,
"step": 13600
},
{
"epoch": 1.3632,
"grad_norm": 1.5898628234863281,
"learning_rate": 2.3043083854449988e-05,
"loss": 0.7368,
"step": 13632
},
{
"epoch": 1.3664,
"grad_norm": 1.2409743070602417,
"learning_rate": 2.2831753016402558e-05,
"loss": 0.7038,
"step": 13664
},
{
"epoch": 1.3696,
"grad_norm": 1.5496866703033447,
"learning_rate": 2.2621108614925806e-05,
"loss": 0.6668,
"step": 13696
},
{
"epoch": 1.3728,
"grad_norm": 1.3265410661697388,
"learning_rate": 2.2411155972189434e-05,
"loss": 0.6478,
"step": 13728
},
{
"epoch": 1.376,
"grad_norm": 1.752454161643982,
"learning_rate": 2.2201900392885077e-05,
"loss": 0.6099,
"step": 13760
},
{
"epoch": 1.3792,
"grad_norm": 1.9202802181243896,
"learning_rate": 2.1993347164092247e-05,
"loss": 0.7054,
"step": 13792
},
{
"epoch": 1.3824,
"grad_norm": 1.8838222026824951,
"learning_rate": 2.178550155514476e-05,
"loss": 0.6859,
"step": 13824
},
{
"epoch": 1.3856,
"grad_norm": 1.3007084131240845,
"learning_rate": 2.1578368817497673e-05,
"loss": 0.7375,
"step": 13856
},
{
"epoch": 1.3888,
"grad_norm": 0.8226228952407837,
"learning_rate": 2.137195418459449e-05,
"loss": 0.6373,
"step": 13888
},
{
"epoch": 1.392,
"grad_norm": 1.4098424911499023,
"learning_rate": 2.1166262871734976e-05,
"loss": 0.6691,
"step": 13920
},
{
"epoch": 1.3952,
"grad_norm": 1.1725754737854004,
"learning_rate": 2.0961300075943445e-05,
"loss": 0.7041,
"step": 13952
},
{
"epoch": 1.3984,
"grad_norm": 2.074549674987793,
"learning_rate": 2.0763441978165273e-05,
"loss": 0.6911,
"step": 13984
},
{
"epoch": 1.4016,
"grad_norm": 0.9322546124458313,
"learning_rate": 2.0559928566660237e-05,
"loss": 0.6174,
"step": 14016
},
{
"epoch": 1.4048,
"grad_norm": 1.16145920753479,
"learning_rate": 2.035715899194704e-05,
"loss": 0.7497,
"step": 14048
},
{
"epoch": 1.408,
"grad_norm": 2.1026079654693604,
"learning_rate": 2.0155138377228922e-05,
"loss": 0.6454,
"step": 14080
},
{
"epoch": 1.4112,
"grad_norm": 1.752734661102295,
"learning_rate": 1.9953871826785803e-05,
"loss": 0.7353,
"step": 14112
},
{
"epoch": 1.4144,
"grad_norm": 1.2497884035110474,
"learning_rate": 1.9753364425845368e-05,
"loss": 0.6479,
"step": 14144
},
{
"epoch": 1.4176,
"grad_norm": 2.7295846939086914,
"learning_rate": 1.9553621240454452e-05,
"loss": 0.7022,
"step": 14176
},
{
"epoch": 1.4208,
"grad_norm": 1.2454453706741333,
"learning_rate": 1.9354647317351188e-05,
"loss": 0.7192,
"step": 14208
},
{
"epoch": 1.424,
"grad_norm": 2.1699957847595215,
"learning_rate": 1.9156447683837363e-05,
"loss": 0.614,
"step": 14240
},
{
"epoch": 1.4272,
"grad_norm": 1.7152907848358154,
"learning_rate": 1.8959027347651527e-05,
"loss": 0.649,
"step": 14272
},
{
"epoch": 1.4304000000000001,
"grad_norm": 1.1111620664596558,
"learning_rate": 1.8762391296842317e-05,
"loss": 0.6757,
"step": 14304
},
{
"epoch": 1.4336,
"grad_norm": 1.8501092195510864,
"learning_rate": 1.8566544499642587e-05,
"loss": 0.689,
"step": 14336
},
{
"epoch": 1.4368,
"grad_norm": 1.2384352684020996,
"learning_rate": 1.837149190434378e-05,
"loss": 0.8523,
"step": 14368
},
{
"epoch": 1.44,
"grad_norm": 1.276906132698059,
"learning_rate": 1.8177238439170886e-05,
"loss": 0.7296,
"step": 14400
},
{
"epoch": 1.4432,
"grad_norm": 0.8352245688438416,
"learning_rate": 1.7983789012158035e-05,
"loss": 0.7488,
"step": 14432
},
{
"epoch": 1.4464000000000001,
"grad_norm": 1.441490650177002,
"learning_rate": 1.779114851102437e-05,
"loss": 0.8141,
"step": 14464
},
{
"epoch": 1.4496,
"grad_norm": 2.355762243270874,
"learning_rate": 1.7599321803050596e-05,
"loss": 0.7312,
"step": 14496
},
{
"epoch": 1.4527999999999999,
"grad_norm": 0.7539392709732056,
"learning_rate": 1.740831373495607e-05,
"loss": 0.6704,
"step": 14528
},
{
"epoch": 1.456,
"grad_norm": 2.1388750076293945,
"learning_rate": 1.7218129132776222e-05,
"loss": 0.7292,
"step": 14560
},
{
"epoch": 1.4592,
"grad_norm": 1.0650655031204224,
"learning_rate": 1.7028772801740746e-05,
"loss": 0.7597,
"step": 14592
},
{
"epoch": 1.4624,
"grad_norm": 1.2916077375411987,
"learning_rate": 1.6840249526152034e-05,
"loss": 0.7453,
"step": 14624
},
{
"epoch": 1.4656,
"grad_norm": 1.1462541818618774,
"learning_rate": 1.6652564069264475e-05,
"loss": 0.6779,
"step": 14656
},
{
"epoch": 1.4687999999999999,
"grad_norm": 3.1677587032318115,
"learning_rate": 1.6465721173164002e-05,
"loss": 0.6914,
"step": 14688
},
{
"epoch": 1.472,
"grad_norm": 1.2350428104400635,
"learning_rate": 1.627972555864824e-05,
"loss": 0.6588,
"step": 14720
},
{
"epoch": 1.4752,
"grad_norm": 2.9489924907684326,
"learning_rate": 1.6094581925107353e-05,
"loss": 0.7823,
"step": 14752
},
{
"epoch": 1.4784,
"grad_norm": 1.1057605743408203,
"learning_rate": 1.591029495040518e-05,
"loss": 0.7743,
"step": 14784
},
{
"epoch": 1.4816,
"grad_norm": 2.1350364685058594,
"learning_rate": 1.5726869290761158e-05,
"loss": 0.6946,
"step": 14816
},
{
"epoch": 1.4848,
"grad_norm": 1.1925487518310547,
"learning_rate": 1.554430958063259e-05,
"loss": 0.6952,
"step": 14848
},
{
"epoch": 1.488,
"grad_norm": 1.2516975402832031,
"learning_rate": 1.5362620432597557e-05,
"loss": 0.6579,
"step": 14880
},
{
"epoch": 1.4912,
"grad_norm": 1.4604252576828003,
"learning_rate": 1.5181806437238472e-05,
"loss": 0.6498,
"step": 14912
},
{
"epoch": 1.4944,
"grad_norm": 1.3046425580978394,
"learning_rate": 1.5001872163025954e-05,
"loss": 0.7083,
"step": 14944
},
{
"epoch": 1.4976,
"grad_norm": 1.975250482559204,
"learning_rate": 1.482282215620352e-05,
"loss": 0.7058,
"step": 14976
},
{
"epoch": 1.5008,
"grad_norm": 4.101298809051514,
"learning_rate": 1.4644660940672627e-05,
"loss": 0.6834,
"step": 15008
},
{
"epoch": 1.504,
"grad_norm": 1.0669056177139282,
"learning_rate": 1.4467393017878445e-05,
"loss": 0.6642,
"step": 15040
},
{
"epoch": 1.5072,
"grad_norm": 1.5664597749710083,
"learning_rate": 1.4291022866696085e-05,
"loss": 0.6576,
"step": 15072
},
{
"epoch": 1.5104,
"grad_norm": 0.7155601978302002,
"learning_rate": 1.4115554943317417e-05,
"loss": 0.6904,
"step": 15104
},
{
"epoch": 1.5135999999999998,
"grad_norm": 1.4340877532958984,
"learning_rate": 1.394099368113853e-05,
"loss": 0.6205,
"step": 15136
},
{
"epoch": 1.5168,
"grad_norm": 1.087167739868164,
"learning_rate": 1.3767343490647665e-05,
"loss": 0.6777,
"step": 15168
},
{
"epoch": 1.52,
"grad_norm": 1.225614070892334,
"learning_rate": 1.3594608759313831e-05,
"loss": 0.6338,
"step": 15200
},
{
"epoch": 1.5232,
"grad_norm": 1.9098353385925293,
"learning_rate": 1.3422793851475907e-05,
"loss": 0.6957,
"step": 15232
},
{
"epoch": 1.5264,
"grad_norm": 1.5703306198120117,
"learning_rate": 1.3251903108232361e-05,
"loss": 0.692,
"step": 15264
},
{
"epoch": 1.5295999999999998,
"grad_norm": 2.0928618907928467,
"learning_rate": 1.3081940847331659e-05,
"loss": 0.6351,
"step": 15296
},
{
"epoch": 1.5328,
"grad_norm": 0.9946128726005554,
"learning_rate": 1.291291136306304e-05,
"loss": 0.6889,
"step": 15328
},
{
"epoch": 1.536,
"grad_norm": 1.0102778673171997,
"learning_rate": 1.2744818926148155e-05,
"loss": 0.6934,
"step": 15360
},
{
"epoch": 1.5392000000000001,
"grad_norm": 1.1262110471725464,
"learning_rate": 1.2577667783633007e-05,
"loss": 0.7111,
"step": 15392
},
{
"epoch": 1.5424,
"grad_norm": 2.779313802719116,
"learning_rate": 1.241146215878079e-05,
"loss": 0.7366,
"step": 15424
},
{
"epoch": 1.5455999999999999,
"grad_norm": 1.0337574481964111,
"learning_rate": 1.2246206250965125e-05,
"loss": 0.7489,
"step": 15456
},
{
"epoch": 1.5488,
"grad_norm": 1.171697735786438,
"learning_rate": 1.2081904235563901e-05,
"loss": 0.6869,
"step": 15488
},
{
"epoch": 1.552,
"grad_norm": 0.9727947115898132,
"learning_rate": 1.19185602638539e-05,
"loss": 0.6787,
"step": 15520
},
{
"epoch": 1.5552000000000001,
"grad_norm": 2.387002944946289,
"learning_rate": 1.1756178462905782e-05,
"loss": 0.7071,
"step": 15552
},
{
"epoch": 1.5584,
"grad_norm": 1.6133655309677124,
"learning_rate": 1.159476293547992e-05,
"loss": 0.6605,
"step": 15584
},
{
"epoch": 1.5615999999999999,
"grad_norm": 2.7770400047302246,
"learning_rate": 1.1434317759922664e-05,
"loss": 0.6942,
"step": 15616
},
{
"epoch": 1.5648,
"grad_norm": 1.266869068145752,
"learning_rate": 1.1274846990063315e-05,
"loss": 0.5871,
"step": 15648
},
{
"epoch": 1.568,
"grad_norm": 1.802133321762085,
"learning_rate": 1.111635465511175e-05,
"loss": 0.7334,
"step": 15680
},
{
"epoch": 1.5712000000000002,
"grad_norm": 2.079975128173828,
"learning_rate": 1.0958844759556525e-05,
"loss": 0.6608,
"step": 15712
},
{
"epoch": 1.5744,
"grad_norm": 1.0633741617202759,
"learning_rate": 1.0802321283063794e-05,
"loss": 0.6987,
"step": 15744
},
{
"epoch": 1.5776,
"grad_norm": 2.0246541500091553,
"learning_rate": 1.0646788180376716e-05,
"loss": 0.7045,
"step": 15776
},
{
"epoch": 1.5808,
"grad_norm": 1.056842565536499,
"learning_rate": 1.049224938121548e-05,
"loss": 0.6995,
"step": 15808
},
{
"epoch": 1.584,
"grad_norm": 0.7846526503562927,
"learning_rate": 1.0338708790178136e-05,
"loss": 0.6526,
"step": 15840
},
{
"epoch": 1.5872000000000002,
"grad_norm": 1.433289885520935,
"learning_rate": 1.0186170286641816e-05,
"loss": 0.7284,
"step": 15872
},
{
"epoch": 1.5904,
"grad_norm": 1.2502738237380981,
"learning_rate": 1.003463772466483e-05,
"loss": 0.6619,
"step": 15904
},
{
"epoch": 1.5936,
"grad_norm": 1.7102774381637573,
"learning_rate": 9.884114932889171e-06,
"loss": 0.6693,
"step": 15936
},
{
"epoch": 1.5968,
"grad_norm": 1.107363224029541,
"learning_rate": 9.734605714443906e-06,
"loss": 0.7315,
"step": 15968
},
{
"epoch": 1.6,
"grad_norm": 1.9314860105514526,
"learning_rate": 9.586113846848982e-06,
"loss": 0.6206,
"step": 16000
},
{
"epoch": 1.6032,
"grad_norm": 1.2245968580245972,
"learning_rate": 9.438643081919818e-06,
"loss": 0.6928,
"step": 16032
},
{
"epoch": 1.6064,
"grad_norm": 1.1570390462875366,
"learning_rate": 9.29219714567256e-06,
"loss": 0.6487,
"step": 16064
},
{
"epoch": 1.6096,
"grad_norm": 2.6823019981384277,
"learning_rate": 9.146779738229838e-06,
"loss": 0.6707,
"step": 16096
},
{
"epoch": 1.6128,
"grad_norm": 0.8797982931137085,
"learning_rate": 9.002394533727382e-06,
"loss": 0.6555,
"step": 16128
},
{
"epoch": 1.616,
"grad_norm": 1.5554248094558716,
"learning_rate": 8.859045180221138e-06,
"loss": 0.7374,
"step": 16160
},
{
"epoch": 1.6192,
"grad_norm": 0.8423168659210205,
"learning_rate": 8.716735299595059e-06,
"loss": 0.7016,
"step": 16192
},
{
"epoch": 1.6223999999999998,
"grad_norm": 0.746976375579834,
"learning_rate": 8.575468487469696e-06,
"loss": 0.7187,
"step": 16224
},
{
"epoch": 1.6256,
"grad_norm": 0.968506395816803,
"learning_rate": 8.435248313111243e-06,
"loss": 0.7318,
"step": 16256
},
{
"epoch": 1.6288,
"grad_norm": 1.9923843145370483,
"learning_rate": 8.296078319341443e-06,
"loss": 0.6974,
"step": 16288
},
{
"epoch": 1.6320000000000001,
"grad_norm": 1.3819128274917603,
"learning_rate": 8.157962022448001e-06,
"loss": 0.6629,
"step": 16320
},
{
"epoch": 1.6352,
"grad_norm": 1.205120325088501,
"learning_rate": 8.020902912095806e-06,
"loss": 0.675,
"step": 16352
},
{
"epoch": 1.6383999999999999,
"grad_norm": 1.0177948474884033,
"learning_rate": 7.884904451238712e-06,
"loss": 0.6806,
"step": 16384
},
{
"epoch": 1.6416,
"grad_norm": 1.5206818580627441,
"learning_rate": 7.749970076032049e-06,
"loss": 0.6469,
"step": 16416
},
{
"epoch": 1.6448,
"grad_norm": 1.2392345666885376,
"learning_rate": 7.6161031957458494e-06,
"loss": 0.6733,
"step": 16448
},
{
"epoch": 1.6480000000000001,
"grad_norm": 1.3811380863189697,
"learning_rate": 7.48330719267864e-06,
"loss": 0.7082,
"step": 16480
},
{
"epoch": 1.6512,
"grad_norm": 2.9916181564331055,
"learning_rate": 7.351585422072049e-06,
"loss": 0.687,
"step": 16512
},
{
"epoch": 1.6543999999999999,
"grad_norm": 1.3270025253295898,
"learning_rate": 7.220941212026005e-06,
"loss": 0.6676,
"step": 16544
},
{
"epoch": 1.6576,
"grad_norm": 1.387866735458374,
"learning_rate": 7.091377863414611e-06,
"loss": 0.7144,
"step": 16576
},
{
"epoch": 1.6608,
"grad_norm": 3.1207494735717773,
"learning_rate": 6.962898649802823e-06,
"loss": 0.713,
"step": 16608
},
{
"epoch": 1.6640000000000001,
"grad_norm": 0.9926366806030273,
"learning_rate": 6.835506817363657e-06,
"loss": 0.7259,
"step": 16640
},
{
"epoch": 1.6672,
"grad_norm": 1.1651358604431152,
"learning_rate": 6.709205584796241e-06,
"loss": 0.72,
"step": 16672
},
{
"epoch": 1.6703999999999999,
"grad_norm": 1.572033166885376,
"learning_rate": 6.583998143244463e-06,
"loss": 0.7037,
"step": 16704
},
{
"epoch": 1.6736,
"grad_norm": 1.3364044427871704,
"learning_rate": 6.459887656216318e-06,
"loss": 0.6626,
"step": 16736
},
{
"epoch": 1.6768,
"grad_norm": 1.205981731414795,
"learning_rate": 6.336877259504004e-06,
"loss": 0.6653,
"step": 16768
},
{
"epoch": 1.6800000000000002,
"grad_norm": 1.0859547853469849,
"learning_rate": 6.214970061104686e-06,
"loss": 0.6433,
"step": 16800
},
{
"epoch": 1.6832,
"grad_norm": 1.1651371717453003,
"learning_rate": 6.094169141142014e-06,
"loss": 0.6532,
"step": 16832
},
{
"epoch": 1.6864,
"grad_norm": 2.472378730773926,
"learning_rate": 5.9744775517881935e-06,
"loss": 0.7404,
"step": 16864
},
{
"epoch": 1.6896,
"grad_norm": 1.8600549697875977,
"learning_rate": 5.855898317186992e-06,
"loss": 0.7189,
"step": 16896
},
{
"epoch": 1.6928,
"grad_norm": 1.219624400138855,
"learning_rate": 5.738434433377243e-06,
"loss": 0.7101,
"step": 16928
},
{
"epoch": 1.696,
"grad_norm": 1.0139923095703125,
"learning_rate": 5.622088868217179e-06,
"loss": 0.6955,
"step": 16960
},
{
"epoch": 1.6992,
"grad_norm": 1.0342754125595093,
"learning_rate": 5.506864561309455e-06,
"loss": 0.6473,
"step": 16992
},
{
"epoch": 1.7024,
"grad_norm": 1.0285016298294067,
"learning_rate": 5.3927644239268434e-06,
"loss": 0.6938,
"step": 17024
},
{
"epoch": 1.7056,
"grad_norm": 2.018785238265991,
"learning_rate": 5.279791338938716e-06,
"loss": 0.6869,
"step": 17056
},
{
"epoch": 1.7088,
"grad_norm": 1.0847994089126587,
"learning_rate": 5.1679481607382065e-06,
"loss": 0.6748,
"step": 17088
},
{
"epoch": 1.712,
"grad_norm": 1.3558719158172607,
"learning_rate": 5.057237715170033e-06,
"loss": 0.6666,
"step": 17120
},
{
"epoch": 1.7151999999999998,
"grad_norm": 1.9623054265975952,
"learning_rate": 4.9476627994591515e-06,
"loss": 0.708,
"step": 17152
},
{
"epoch": 1.7184,
"grad_norm": 3.0121994018554688,
"learning_rate": 4.839226182140072e-06,
"loss": 0.6868,
"step": 17184
},
{
"epoch": 1.7216,
"grad_norm": 1.1430203914642334,
"learning_rate": 4.731930602986906e-06,
"loss": 0.7228,
"step": 17216
},
{
"epoch": 1.7248,
"grad_norm": 0.7465403079986572,
"learning_rate": 4.625778772944156e-06,
"loss": 0.6882,
"step": 17248
},
{
"epoch": 1.728,
"grad_norm": 1.0914149284362793,
"learning_rate": 4.5207733740581795e-06,
"loss": 0.58,
"step": 17280
},
{
"epoch": 1.7311999999999999,
"grad_norm": 0.8385502696037292,
"learning_rate": 4.416917059409464e-06,
"loss": 0.6425,
"step": 17312
},
{
"epoch": 1.7344,
"grad_norm": 1.0544841289520264,
"learning_rate": 4.31421245304558e-06,
"loss": 0.6884,
"step": 17344
},
{
"epoch": 1.7376,
"grad_norm": 0.8974475264549255,
"learning_rate": 4.212662149914886e-06,
"loss": 0.6975,
"step": 17376
},
{
"epoch": 1.7408000000000001,
"grad_norm": 0.9501296281814575,
"learning_rate": 4.112268715800943e-06,
"loss": 0.7162,
"step": 17408
},
{
"epoch": 1.744,
"grad_norm": 1.0887449979782104,
"learning_rate": 4.013034687257727e-06,
"loss": 0.6118,
"step": 17440
},
{
"epoch": 1.7471999999999999,
"grad_norm": 1.264304518699646,
"learning_rate": 3.914962571545511e-06,
"loss": 0.7281,
"step": 17472
},
{
"epoch": 1.7504,
"grad_norm": 0.9120736718177795,
"learning_rate": 3.8180548465675144e-06,
"loss": 0.7174,
"step": 17504
},
{
"epoch": 1.7536,
"grad_norm": 1.4988112449645996,
"learning_rate": 3.7223139608073e-06,
"loss": 0.713,
"step": 17536
},
{
"epoch": 1.7568000000000001,
"grad_norm": 1.2108736038208008,
"learning_rate": 3.627742333266937e-06,
"loss": 0.6111,
"step": 17568
},
{
"epoch": 1.76,
"grad_norm": 2.0740504264831543,
"learning_rate": 3.534342353405834e-06,
"loss": 0.6936,
"step": 17600
},
{
"epoch": 1.7631999999999999,
"grad_norm": 1.0667200088500977,
"learning_rate": 3.442116381080418e-06,
"loss": 0.6292,
"step": 17632
},
{
"epoch": 1.7664,
"grad_norm": 1.067543864250183,
"learning_rate": 3.351066746484455e-06,
"loss": 0.6715,
"step": 17664
},
{
"epoch": 1.7696,
"grad_norm": 0.885524332523346,
"learning_rate": 3.2611957500902347e-06,
"loss": 0.7179,
"step": 17696
},
{
"epoch": 1.7728000000000002,
"grad_norm": 2.0609331130981445,
"learning_rate": 3.172505662590386e-06,
"loss": 0.6599,
"step": 17728
},
{
"epoch": 1.776,
"grad_norm": 4.125449180603027,
"learning_rate": 3.08499872484056e-06,
"loss": 0.6793,
"step": 17760
},
{
"epoch": 1.7792,
"grad_norm": 1.4737067222595215,
"learning_rate": 2.99867714780277e-06,
"loss": 0.7493,
"step": 17792
},
{
"epoch": 1.7824,
"grad_norm": 1.3752883672714233,
"learning_rate": 2.913543112489564e-06,
"loss": 0.6933,
"step": 17824
},
{
"epoch": 1.7856,
"grad_norm": 1.6619579792022705,
"learning_rate": 2.8295987699088923e-06,
"loss": 0.677,
"step": 17856
},
{
"epoch": 1.7888,
"grad_norm": 1.1567070484161377,
"learning_rate": 2.746846241009765e-06,
"loss": 0.706,
"step": 17888
},
{
"epoch": 1.792,
"grad_norm": 1.316402554512024,
"learning_rate": 2.665287616628659e-06,
"loss": 0.8213,
"step": 17920
},
{
"epoch": 1.7952,
"grad_norm": 0.5952889323234558,
"learning_rate": 2.584924957436735e-06,
"loss": 0.6228,
"step": 17952
},
{
"epoch": 1.7984,
"grad_norm": 1.3763988018035889,
"learning_rate": 2.505760293887699e-06,
"loss": 0.6694,
"step": 17984
},
{
"epoch": 1.8016,
"grad_norm": 0.7822681665420532,
"learning_rate": 2.4302138383881677e-06,
"loss": 0.6478,
"step": 18016
},
{
"epoch": 1.8048,
"grad_norm": 1.46699059009552,
"learning_rate": 2.353413545416977e-06,
"loss": 0.6399,
"step": 18048
},
{
"epoch": 1.808,
"grad_norm": 1.3913434743881226,
"learning_rate": 2.2778170974870673e-06,
"loss": 0.7223,
"step": 18080
},
{
"epoch": 1.8112,
"grad_norm": 2.242133378982544,
"learning_rate": 2.2034264046284e-06,
"loss": 0.6006,
"step": 18112
},
{
"epoch": 1.8144,
"grad_norm": 2.8192715644836426,
"learning_rate": 2.1302433464062186e-06,
"loss": 0.6911,
"step": 18144
},
{
"epoch": 1.8176,
"grad_norm": 0.9974599480628967,
"learning_rate": 2.0582697718734722e-06,
"loss": 0.7074,
"step": 18176
},
{
"epoch": 1.8208,
"grad_norm": 1.1070644855499268,
"learning_rate": 1.9875074995241328e-06,
"loss": 0.7339,
"step": 18208
},
{
"epoch": 1.8239999999999998,
"grad_norm": 2.572319507598877,
"learning_rate": 1.9179583172472815e-06,
"loss": 0.6539,
"step": 18240
},
{
"epoch": 1.8272,
"grad_norm": 1.3265892267227173,
"learning_rate": 1.8496239822818729e-06,
"loss": 0.6763,
"step": 18272
},
{
"epoch": 1.8304,
"grad_norm": 1.6043624877929688,
"learning_rate": 1.7825062211723753e-06,
"loss": 0.6849,
"step": 18304
},
{
"epoch": 1.8336000000000001,
"grad_norm": 1.4094398021697998,
"learning_rate": 1.7166067297251343e-06,
"loss": 0.7863,
"step": 18336
},
{
"epoch": 1.8368,
"grad_norm": 1.7544119358062744,
"learning_rate": 1.6519271729655395e-06,
"loss": 0.7,
"step": 18368
},
{
"epoch": 1.8399999999999999,
"grad_norm": 1.261846661567688,
"learning_rate": 1.5884691850959532e-06,
"loss": 0.7229,
"step": 18400
},
{
"epoch": 1.8432,
"grad_norm": 2.0065815448760986,
"learning_rate": 1.5262343694543935e-06,
"loss": 0.6832,
"step": 18432
},
{
"epoch": 1.8464,
"grad_norm": 2.157181978225708,
"learning_rate": 1.4652242984740661e-06,
"loss": 0.6816,
"step": 18464
},
{
"epoch": 1.8496000000000001,
"grad_norm": 1.5340476036071777,
"learning_rate": 1.4054405136435856e-06,
"loss": 0.6593,
"step": 18496
},
{
"epoch": 1.8528,
"grad_norm": 1.1297880411148071,
"learning_rate": 1.346884525468095e-06,
"loss": 0.6664,
"step": 18528
},
{
"epoch": 1.8559999999999999,
"grad_norm": 1.164290189743042,
"learning_rate": 1.2895578134310304e-06,
"loss": 0.7267,
"step": 18560
},
{
"epoch": 1.8592,
"grad_norm": 1.268489122390747,
"learning_rate": 1.2334618259567888e-06,
"loss": 0.7445,
"step": 18592
},
{
"epoch": 1.8624,
"grad_norm": 1.0596938133239746,
"learning_rate": 1.1785979803741077e-06,
"loss": 0.6559,
"step": 18624
},
{
"epoch": 1.8656000000000001,
"grad_norm": 1.8410592079162598,
"learning_rate": 1.1249676628802608e-06,
"loss": 0.7277,
"step": 18656
},
{
"epoch": 1.8688,
"grad_norm": 1.1427465677261353,
"learning_rate": 1.0725722285060469e-06,
"loss": 0.7248,
"step": 18688
},
{
"epoch": 1.8719999999999999,
"grad_norm": 2.2126009464263916,
"learning_rate": 1.0214130010815336e-06,
"loss": 0.6974,
"step": 18720
},
{
"epoch": 1.8752,
"grad_norm": 1.2691198587417603,
"learning_rate": 9.714912732026183e-07,
"loss": 0.7177,
"step": 18752
},
{
"epoch": 1.8784,
"grad_norm": 1.9507790803909302,
"learning_rate": 9.228083061983806e-07,
"loss": 0.7317,
"step": 18784
},
{
"epoch": 1.8816000000000002,
"grad_norm": 1.4309967756271362,
"learning_rate": 8.753653300991704e-07,
"loss": 0.7079,
"step": 18816
},
{
"epoch": 1.8848,
"grad_norm": 1.2619049549102783,
"learning_rate": 8.291635436056045e-07,
"loss": 0.6772,
"step": 18848
},
{
"epoch": 1.888,
"grad_norm": 1.6219273805618286,
"learning_rate": 7.842041140582013e-07,
"loss": 0.6626,
"step": 18880
},
{
"epoch": 1.8912,
"grad_norm": 1.1635990142822266,
"learning_rate": 7.404881774079442e-07,
"loss": 0.6658,
"step": 18912
},
{
"epoch": 1.8944,
"grad_norm": 1.2584396600723267,
"learning_rate": 6.98016838187543e-07,
"loss": 0.6998,
"step": 18944
},
{
"epoch": 1.8976,
"grad_norm": 2.075270652770996,
"learning_rate": 6.567911694835449e-07,
"loss": 0.7049,
"step": 18976
},
{
"epoch": 1.9008,
"grad_norm": 1.8197970390319824,
"learning_rate": 6.168122129092346e-07,
"loss": 0.7312,
"step": 19008
},
{
"epoch": 1.904,
"grad_norm": 1.1752382516860962,
"learning_rate": 5.780809785782771e-07,
"loss": 0.66,
"step": 19040
},
{
"epoch": 1.9072,
"grad_norm": 2.4053640365600586,
"learning_rate": 5.405984450792378e-07,
"loss": 0.6361,
"step": 19072
},
{
"epoch": 1.9104,
"grad_norm": 1.3622859716415405,
"learning_rate": 5.043655594508312e-07,
"loss": 0.6959,
"step": 19104
},
{
"epoch": 1.9136,
"grad_norm": 1.1638226509094238,
"learning_rate": 4.6938323715800534e-07,
"loss": 0.6464,
"step": 19136
},
{
"epoch": 1.9167999999999998,
"grad_norm": 1.3796526193618774,
"learning_rate": 4.3565236206880576e-07,
"loss": 0.6937,
"step": 19168
},
{
"epoch": 1.92,
"grad_norm": 2.0228989124298096,
"learning_rate": 4.031737864320373e-07,
"loss": 0.6535,
"step": 19200
},
{
"epoch": 1.9232,
"grad_norm": 1.9368504285812378,
"learning_rate": 3.719483308557592e-07,
"loss": 0.7266,
"step": 19232
},
{
"epoch": 1.9264000000000001,
"grad_norm": 1.539267897605896,
"learning_rate": 3.4197678428650183e-07,
"loss": 0.6832,
"step": 19264
},
{
"epoch": 1.9296,
"grad_norm": 0.7817739844322205,
"learning_rate": 3.132599039893991e-07,
"loss": 0.736,
"step": 19296
},
{
"epoch": 1.9327999999999999,
"grad_norm": 1.2418057918548584,
"learning_rate": 2.8579841552898166e-07,
"loss": 0.6632,
"step": 19328
},
{
"epoch": 1.936,
"grad_norm": 1.2179762125015259,
"learning_rate": 2.595930127509083e-07,
"loss": 0.6712,
"step": 19360
},
{
"epoch": 1.9392,
"grad_norm": 1.8442071676254272,
"learning_rate": 2.346443577643964e-07,
"loss": 0.6663,
"step": 19392
},
{
"epoch": 1.9424000000000001,
"grad_norm": 1.1954784393310547,
"learning_rate": 2.1095308092550226e-07,
"loss": 0.6471,
"step": 19424
},
{
"epoch": 1.9456,
"grad_norm": 1.3971970081329346,
"learning_rate": 1.885197808212058e-07,
"loss": 0.7487,
"step": 19456
},
{
"epoch": 1.9487999999999999,
"grad_norm": 2.3306965827941895,
"learning_rate": 1.67345024254284e-07,
"loss": 0.7119,
"step": 19488
},
{
"epoch": 1.952,
"grad_norm": 4.306802749633789,
"learning_rate": 1.474293462289611e-07,
"loss": 0.7447,
"step": 19520
},
{
"epoch": 1.9552,
"grad_norm": 1.2803548574447632,
"learning_rate": 1.2877324993744166e-07,
"loss": 0.6207,
"step": 19552
},
{
"epoch": 1.9584000000000001,
"grad_norm": 1.1023082733154297,
"learning_rate": 1.1137720674714302e-07,
"loss": 0.7359,
"step": 19584
},
{
"epoch": 1.9616,
"grad_norm": 1.6054505109786987,
"learning_rate": 9.524165618883252e-08,
"loss": 0.6773,
"step": 19616
},
{
"epoch": 1.9647999999999999,
"grad_norm": 2.317808151245117,
"learning_rate": 8.036700594549196e-08,
"loss": 0.6611,
"step": 19648
},
{
"epoch": 1.968,
"grad_norm": 1.5842328071594238,
"learning_rate": 6.675363184203143e-08,
"loss": 0.7052,
"step": 19680
},
{
"epoch": 1.9712,
"grad_norm": 1.1991004943847656,
"learning_rate": 5.440187783578021e-08,
"loss": 0.7092,
"step": 19712
},
{
"epoch": 1.9744000000000002,
"grad_norm": 2.410238742828369,
"learning_rate": 4.331205600781596e-08,
"loss": 0.6689,
"step": 19744
},
{
"epoch": 1.9776,
"grad_norm": 1.3391367197036743,
"learning_rate": 3.348444655505989e-08,
"loss": 0.6443,
"step": 19776
},
{
"epoch": 1.9808,
"grad_norm": 1.0821375846862793,
"learning_rate": 2.4919297783210227e-08,
"loss": 0.596,
"step": 19808
},
{
"epoch": 1.984,
"grad_norm": 1.2606804370880127,
"learning_rate": 1.7616826100469442e-08,
"loss": 0.6267,
"step": 19840
},
{
"epoch": 1.9872,
"grad_norm": 1.0765273571014404,
"learning_rate": 1.1577216012065296e-08,
"loss": 0.6782,
"step": 19872
},
{
"epoch": 1.9904,
"grad_norm": 0.8385388255119324,
"learning_rate": 6.800620115587908e-09,
"loss": 0.7122,
"step": 19904
},
{
"epoch": 1.9936,
"grad_norm": 1.4438368082046509,
"learning_rate": 3.2871590971594868e-09,
"loss": 0.7942,
"step": 19936
},
{
"epoch": 1.9968,
"grad_norm": 0.7762835025787354,
"learning_rate": 1.0369217283479061e-09,
"loss": 0.7374,
"step": 19968
},
{
"epoch": 2.0,
"grad_norm": 1.7044473886489868,
"learning_rate": 4.996486395736355e-11,
"loss": 0.7244,
"step": 20000
}
],
"logging_steps": 32,
"max_steps": 20000,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.072644132758323e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}