{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.984771573604061,
"eval_steps": 500,
"global_step": 441,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00676818950930626,
"grad_norm": 30.061185836791992,
"learning_rate": 1.4285714285714286e-06,
"loss": 3.5407,
"step": 1
},
{
"epoch": 0.01353637901861252,
"grad_norm": 45.98220443725586,
"learning_rate": 2.8571428571428573e-06,
"loss": 3.8623,
"step": 2
},
{
"epoch": 0.02030456852791878,
"grad_norm": 52.36494064331055,
"learning_rate": 4.2857142857142855e-06,
"loss": 4.0388,
"step": 3
},
{
"epoch": 0.02707275803722504,
"grad_norm": 38.702606201171875,
"learning_rate": 5.7142857142857145e-06,
"loss": 3.9038,
"step": 4
},
{
"epoch": 0.0338409475465313,
"grad_norm": 39.755348205566406,
"learning_rate": 7.1428571428571436e-06,
"loss": 3.7208,
"step": 5
},
{
"epoch": 0.04060913705583756,
"grad_norm": 28.482959747314453,
"learning_rate": 8.571428571428571e-06,
"loss": 3.699,
"step": 6
},
{
"epoch": 0.047377326565143825,
"grad_norm": 33.6556510925293,
"learning_rate": 1e-05,
"loss": 3.2901,
"step": 7
},
{
"epoch": 0.05414551607445008,
"grad_norm": 10.973630905151367,
"learning_rate": 1.1428571428571429e-05,
"loss": 3.2131,
"step": 8
},
{
"epoch": 0.06091370558375635,
"grad_norm": 8.036073684692383,
"learning_rate": 1.2857142857142859e-05,
"loss": 3.2309,
"step": 9
},
{
"epoch": 0.0676818950930626,
"grad_norm": 7.630257606506348,
"learning_rate": 1.4285714285714287e-05,
"loss": 3.2666,
"step": 10
},
{
"epoch": 0.07445008460236886,
"grad_norm": 7.535837650299072,
"learning_rate": 1.5714285714285715e-05,
"loss": 3.1925,
"step": 11
},
{
"epoch": 0.08121827411167512,
"grad_norm": 10.59571361541748,
"learning_rate": 1.7142857142857142e-05,
"loss": 3.1352,
"step": 12
},
{
"epoch": 0.08798646362098139,
"grad_norm": 8.021005630493164,
"learning_rate": 1.8571428571428575e-05,
"loss": 2.8961,
"step": 13
},
{
"epoch": 0.09475465313028765,
"grad_norm": 5.240553855895996,
"learning_rate": 2e-05,
"loss": 3.0844,
"step": 14
},
{
"epoch": 0.10152284263959391,
"grad_norm": 5.916426658630371,
"learning_rate": 1.9999729347501484e-05,
"loss": 3.1558,
"step": 15
},
{
"epoch": 0.10829103214890017,
"grad_norm": 5.597356796264648,
"learning_rate": 1.9998917404656488e-05,
"loss": 3.091,
"step": 16
},
{
"epoch": 0.11505922165820642,
"grad_norm": 6.339421272277832,
"learning_rate": 1.9997564215415886e-05,
"loss": 3.1522,
"step": 17
},
{
"epoch": 0.1218274111675127,
"grad_norm": 8.770332336425781,
"learning_rate": 1.9995669853028485e-05,
"loss": 3.1213,
"step": 18
},
{
"epoch": 0.12859560067681894,
"grad_norm": 8.278801918029785,
"learning_rate": 1.9993234420037072e-05,
"loss": 3.1599,
"step": 19
},
{
"epoch": 0.1353637901861252,
"grad_norm": 4.014164447784424,
"learning_rate": 1.999025804827285e-05,
"loss": 2.9919,
"step": 20
},
{
"epoch": 0.14213197969543148,
"grad_norm": 4.980748653411865,
"learning_rate": 1.9986740898848306e-05,
"loss": 3.0508,
"step": 21
},
{
"epoch": 0.14890016920473773,
"grad_norm": 4.607733249664307,
"learning_rate": 1.99826831621485e-05,
"loss": 3.0403,
"step": 22
},
{
"epoch": 0.155668358714044,
"grad_norm": 4.822920799255371,
"learning_rate": 1.997808505782075e-05,
"loss": 3.1426,
"step": 23
},
{
"epoch": 0.16243654822335024,
"grad_norm": 6.66705846786499,
"learning_rate": 1.9972946834762732e-05,
"loss": 3.1822,
"step": 24
},
{
"epoch": 0.1692047377326565,
"grad_norm": 8.534043312072754,
"learning_rate": 1.9967268771109037e-05,
"loss": 3.0409,
"step": 25
},
{
"epoch": 0.17597292724196278,
"grad_norm": 5.848859786987305,
"learning_rate": 1.996105117421608e-05,
"loss": 2.9325,
"step": 26
},
{
"epoch": 0.18274111675126903,
"grad_norm": 3.849553108215332,
"learning_rate": 1.9954294380645497e-05,
"loss": 2.9975,
"step": 27
},
{
"epoch": 0.1895093062605753,
"grad_norm": 5.086816787719727,
"learning_rate": 1.9946998756145894e-05,
"loss": 3.0159,
"step": 28
},
{
"epoch": 0.19627749576988154,
"grad_norm": 5.160210132598877,
"learning_rate": 1.9939164695633067e-05,
"loss": 3.1489,
"step": 29
},
{
"epoch": 0.20304568527918782,
"grad_norm": 5.439467430114746,
"learning_rate": 1.9930792623168638e-05,
"loss": 3.09,
"step": 30
},
{
"epoch": 0.2098138747884941,
"grad_norm": 6.836553573608398,
"learning_rate": 1.992188299193706e-05,
"loss": 3.1017,
"step": 31
},
{
"epoch": 0.21658206429780033,
"grad_norm": 6.904626846313477,
"learning_rate": 1.9912436284221134e-05,
"loss": 2.798,
"step": 32
},
{
"epoch": 0.2233502538071066,
"grad_norm": 3.7659807205200195,
"learning_rate": 1.9902453011375865e-05,
"loss": 2.9905,
"step": 33
},
{
"epoch": 0.23011844331641285,
"grad_norm": 4.325652122497559,
"learning_rate": 1.98919337138008e-05,
"loss": 3.004,
"step": 34
},
{
"epoch": 0.23688663282571912,
"grad_norm": 5.1247100830078125,
"learning_rate": 1.9880878960910772e-05,
"loss": 2.9883,
"step": 35
},
{
"epoch": 0.2436548223350254,
"grad_norm": 5.715439319610596,
"learning_rate": 1.9869289351105087e-05,
"loss": 3.106,
"step": 36
},
{
"epoch": 0.25042301184433163,
"grad_norm": 6.596778392791748,
"learning_rate": 1.9857165511735105e-05,
"loss": 3.094,
"step": 37
},
{
"epoch": 0.2571912013536379,
"grad_norm": 5.765414714813232,
"learning_rate": 1.9844508099070313e-05,
"loss": 2.9328,
"step": 38
},
{
"epoch": 0.2639593908629442,
"grad_norm": 3.2168986797332764,
"learning_rate": 1.9831317798262787e-05,
"loss": 2.9776,
"step": 39
},
{
"epoch": 0.2707275803722504,
"grad_norm": 3.423891067504883,
"learning_rate": 1.98175953233101e-05,
"loss": 2.9672,
"step": 40
},
{
"epoch": 0.27749576988155666,
"grad_norm": 4.785609245300293,
"learning_rate": 1.980334141701667e-05,
"loss": 3.1273,
"step": 41
},
{
"epoch": 0.28426395939086296,
"grad_norm": 5.194344997406006,
"learning_rate": 1.978855685095358e-05,
"loss": 3.0137,
"step": 42
},
{
"epoch": 0.2910321489001692,
"grad_norm": 5.609189510345459,
"learning_rate": 1.977324242541677e-05,
"loss": 3.0037,
"step": 43
},
{
"epoch": 0.29780033840947545,
"grad_norm": 6.650205612182617,
"learning_rate": 1.9757398969383752e-05,
"loss": 2.901,
"step": 44
},
{
"epoch": 0.30456852791878175,
"grad_norm": 4.614665985107422,
"learning_rate": 1.974102734046872e-05,
"loss": 2.9467,
"step": 45
},
{
"epoch": 0.311336717428088,
"grad_norm": 3.8143110275268555,
"learning_rate": 1.9724128424876117e-05,
"loss": 2.9703,
"step": 46
},
{
"epoch": 0.31810490693739424,
"grad_norm": 5.5977067947387695,
"learning_rate": 1.9706703137352695e-05,
"loss": 2.9885,
"step": 47
},
{
"epoch": 0.3248730964467005,
"grad_norm": 6.040333271026611,
"learning_rate": 1.968875242113798e-05,
"loss": 3.0303,
"step": 48
},
{
"epoch": 0.3316412859560068,
"grad_norm": 6.05629825592041,
"learning_rate": 1.9670277247913205e-05,
"loss": 2.9001,
"step": 49
},
{
"epoch": 0.338409475465313,
"grad_norm": 7.62821102142334,
"learning_rate": 1.965127861774873e-05,
"loss": 2.8845,
"step": 50
},
{
"epoch": 0.34517766497461927,
"grad_norm": 3.4944276809692383,
"learning_rate": 1.96317575590499e-05,
"loss": 2.8731,
"step": 51
},
{
"epoch": 0.35194585448392557,
"grad_norm": 3.9865429401397705,
"learning_rate": 1.9611715128501378e-05,
"loss": 2.936,
"step": 52
},
{
"epoch": 0.3587140439932318,
"grad_norm": 4.1927385330200195,
"learning_rate": 1.9591152411009942e-05,
"loss": 2.9779,
"step": 53
},
{
"epoch": 0.36548223350253806,
"grad_norm": 4.403099060058594,
"learning_rate": 1.9570070519645767e-05,
"loss": 2.9444,
"step": 54
},
{
"epoch": 0.37225042301184436,
"grad_norm": 5.183342456817627,
"learning_rate": 1.9548470595582166e-05,
"loss": 2.9487,
"step": 55
},
{
"epoch": 0.3790186125211506,
"grad_norm": 6.125980854034424,
"learning_rate": 1.9526353808033827e-05,
"loss": 2.7997,
"step": 56
},
{
"epoch": 0.38578680203045684,
"grad_norm": 5.613026142120361,
"learning_rate": 1.9503721354193507e-05,
"loss": 2.8006,
"step": 57
},
{
"epoch": 0.3925549915397631,
"grad_norm": 5.052882194519043,
"learning_rate": 1.948057445916724e-05,
"loss": 2.8919,
"step": 58
},
{
"epoch": 0.3993231810490694,
"grad_norm": 3.8712987899780273,
"learning_rate": 1.9456914375908026e-05,
"loss": 2.912,
"step": 59
},
{
"epoch": 0.40609137055837563,
"grad_norm": 5.594967365264893,
"learning_rate": 1.9432742385147988e-05,
"loss": 3.0025,
"step": 60
},
{
"epoch": 0.4128595600676819,
"grad_norm": 5.751216411590576,
"learning_rate": 1.9408059795329073e-05,
"loss": 2.9587,
"step": 61
},
{
"epoch": 0.4196277495769882,
"grad_norm": 6.2975993156433105,
"learning_rate": 1.9382867942532195e-05,
"loss": 2.8469,
"step": 62
},
{
"epoch": 0.4263959390862944,
"grad_norm": 5.6724724769592285,
"learning_rate": 1.9357168190404937e-05,
"loss": 2.8086,
"step": 63
},
{
"epoch": 0.43316412859560066,
"grad_norm": 3.814537525177002,
"learning_rate": 1.9330961930087724e-05,
"loss": 2.8346,
"step": 64
},
{
"epoch": 0.43993231810490696,
"grad_norm": 4.25925874710083,
"learning_rate": 1.9304250580138524e-05,
"loss": 2.9784,
"step": 65
},
{
"epoch": 0.4467005076142132,
"grad_norm": 3.799999237060547,
"learning_rate": 1.9277035586456056e-05,
"loss": 2.8535,
"step": 66
},
{
"epoch": 0.45346869712351945,
"grad_norm": 6.35882568359375,
"learning_rate": 1.9249318422201524e-05,
"loss": 2.9694,
"step": 67
},
{
"epoch": 0.4602368866328257,
"grad_norm": 6.477646827697754,
"learning_rate": 1.9221100587718884e-05,
"loss": 3.0061,
"step": 68
},
{
"epoch": 0.467005076142132,
"grad_norm": 5.934814929962158,
"learning_rate": 1.919238361045362e-05,
"loss": 2.7579,
"step": 69
},
{
"epoch": 0.47377326565143824,
"grad_norm": 3.154392719268799,
"learning_rate": 1.916316904487005e-05,
"loss": 2.7796,
"step": 70
},
{
"epoch": 0.4805414551607445,
"grad_norm": 4.2975616455078125,
"learning_rate": 1.9133458472367216e-05,
"loss": 2.8438,
"step": 71
},
{
"epoch": 0.4873096446700508,
"grad_norm": 3.5001091957092285,
"learning_rate": 1.9103253501193256e-05,
"loss": 2.9239,
"step": 72
},
{
"epoch": 0.494077834179357,
"grad_norm": 5.083667278289795,
"learning_rate": 1.9072555766358346e-05,
"loss": 2.9237,
"step": 73
},
{
"epoch": 0.5008460236886633,
"grad_norm": 5.127432346343994,
"learning_rate": 1.904136692954622e-05,
"loss": 2.9601,
"step": 74
},
{
"epoch": 0.5076142131979695,
"grad_norm": 6.419186115264893,
"learning_rate": 1.900968867902419e-05,
"loss": 2.8173,
"step": 75
},
{
"epoch": 0.5143824027072758,
"grad_norm": 3.2118561267852783,
"learning_rate": 1.89775227295518e-05,
"loss": 2.7624,
"step": 76
},
{
"epoch": 0.5211505922165821,
"grad_norm": 4.4183807373046875,
"learning_rate": 1.8944870822287957e-05,
"loss": 2.84,
"step": 77
},
{
"epoch": 0.5279187817258884,
"grad_norm": 4.880641460418701,
"learning_rate": 1.891173472469672e-05,
"loss": 2.8241,
"step": 78
},
{
"epoch": 0.5346869712351946,
"grad_norm": 4.362979888916016,
"learning_rate": 1.8878116230451615e-05,
"loss": 2.8896,
"step": 79
},
{
"epoch": 0.5414551607445008,
"grad_norm": 5.762423515319824,
"learning_rate": 1.884401715933853e-05,
"loss": 2.8949,
"step": 80
},
{
"epoch": 0.5482233502538071,
"grad_norm": 8.867072105407715,
"learning_rate": 1.8809439357157226e-05,
"loss": 2.9593,
"step": 81
},
{
"epoch": 0.5549915397631133,
"grad_norm": 5.437422752380371,
"learning_rate": 1.8774384695621407e-05,
"loss": 2.6622,
"step": 82
},
{
"epoch": 0.5617597292724196,
"grad_norm": 3.8561763763427734,
"learning_rate": 1.8738855072257428e-05,
"loss": 2.8984,
"step": 83
},
{
"epoch": 0.5685279187817259,
"grad_norm": 4.806951522827148,
"learning_rate": 1.8702852410301556e-05,
"loss": 2.8228,
"step": 84
},
{
"epoch": 0.5752961082910322,
"grad_norm": 4.01973295211792,
"learning_rate": 1.8666378658595863e-05,
"loss": 2.8602,
"step": 85
},
{
"epoch": 0.5820642978003384,
"grad_norm": 4.8082170486450195,
"learning_rate": 1.8629435791482765e-05,
"loss": 2.9552,
"step": 86
},
{
"epoch": 0.5888324873096447,
"grad_norm": 6.550163269042969,
"learning_rate": 1.8592025808698116e-05,
"loss": 2.7965,
"step": 87
},
{
"epoch": 0.5956006768189509,
"grad_norm": 6.67849588394165,
"learning_rate": 1.8554150735262975e-05,
"loss": 2.7528,
"step": 88
},
{
"epoch": 0.6023688663282571,
"grad_norm": 2.8771703243255615,
"learning_rate": 1.8515812621373998e-05,
"loss": 2.8308,
"step": 89
},
{
"epoch": 0.6091370558375635,
"grad_norm": 5.142778396606445,
"learning_rate": 1.8477013542292446e-05,
"loss": 2.7588,
"step": 90
},
{
"epoch": 0.6159052453468697,
"grad_norm": 6.1177873611450195,
"learning_rate": 1.8437755598231857e-05,
"loss": 2.8855,
"step": 91
},
{
"epoch": 0.622673434856176,
"grad_norm": 6.153074741363525,
"learning_rate": 1.8398040914244363e-05,
"loss": 2.982,
"step": 92
},
{
"epoch": 0.6294416243654822,
"grad_norm": 5.2112345695495605,
"learning_rate": 1.8357871640105648e-05,
"loss": 2.8087,
"step": 93
},
{
"epoch": 0.6362098138747885,
"grad_norm": 8.335490226745605,
"learning_rate": 1.8317249950198598e-05,
"loss": 2.6842,
"step": 94
},
{
"epoch": 0.6429780033840947,
"grad_norm": 5.8389668464660645,
"learning_rate": 1.8276178043395588e-05,
"loss": 2.7224,
"step": 95
},
{
"epoch": 0.649746192893401,
"grad_norm": 3.2980704307556152,
"learning_rate": 1.8234658142939454e-05,
"loss": 2.8933,
"step": 96
},
{
"epoch": 0.6565143824027073,
"grad_norm": 5.305524826049805,
"learning_rate": 1.8192692496323158e-05,
"loss": 2.8103,
"step": 97
},
{
"epoch": 0.6632825719120136,
"grad_norm": 6.091310977935791,
"learning_rate": 1.8150283375168112e-05,
"loss": 2.9352,
"step": 98
},
{
"epoch": 0.6700507614213198,
"grad_norm": 5.697042465209961,
"learning_rate": 1.8107433075101254e-05,
"loss": 2.8545,
"step": 99
},
{
"epoch": 0.676818950930626,
"grad_norm": 7.473045349121094,
"learning_rate": 1.8064143915630723e-05,
"loss": 2.6375,
"step": 100
},
{
"epoch": 0.6835871404399323,
"grad_norm": 2.685059070587158,
"learning_rate": 1.8020418240020362e-05,
"loss": 2.7562,
"step": 101
},
{
"epoch": 0.6903553299492385,
"grad_norm": 3.2231831550598145,
"learning_rate": 1.7976258415162836e-05,
"loss": 2.8718,
"step": 102
},
{
"epoch": 0.6971235194585449,
"grad_norm": 3.358761787414551,
"learning_rate": 1.7931666831451536e-05,
"loss": 2.8679,
"step": 103
},
{
"epoch": 0.7038917089678511,
"grad_norm": 4.336738109588623,
"learning_rate": 1.7886645902651166e-05,
"loss": 2.797,
"step": 104
},
{
"epoch": 0.7106598984771574,
"grad_norm": 4.629664421081543,
"learning_rate": 1.7841198065767107e-05,
"loss": 2.7675,
"step": 105
},
{
"epoch": 0.7174280879864636,
"grad_norm": 6.125463485717773,
"learning_rate": 1.779532578091347e-05,
"loss": 2.627,
"step": 106
},
{
"epoch": 0.7241962774957699,
"grad_norm": 4.7176361083984375,
"learning_rate": 1.7749031531179962e-05,
"loss": 2.6226,
"step": 107
},
{
"epoch": 0.7309644670050761,
"grad_norm": 3.0627963542938232,
"learning_rate": 1.7702317822497457e-05,
"loss": 2.8128,
"step": 108
},
{
"epoch": 0.7377326565143824,
"grad_norm": 4.201870918273926,
"learning_rate": 1.7655187183502344e-05,
"loss": 2.7452,
"step": 109
},
{
"epoch": 0.7445008460236887,
"grad_norm": 4.618666648864746,
"learning_rate": 1.7607642165399665e-05,
"loss": 2.8431,
"step": 110
},
{
"epoch": 0.751269035532995,
"grad_norm": 5.060817718505859,
"learning_rate": 1.755968534182501e-05,
"loss": 2.9154,
"step": 111
},
{
"epoch": 0.7580372250423012,
"grad_norm": 5.751707553863525,
"learning_rate": 1.7511319308705198e-05,
"loss": 2.7316,
"step": 112
},
{
"epoch": 0.7648054145516074,
"grad_norm": 5.403834342956543,
"learning_rate": 1.746254668411778e-05,
"loss": 2.6888,
"step": 113
},
{
"epoch": 0.7715736040609137,
"grad_norm": 3.657097339630127,
"learning_rate": 1.7413370108149288e-05,
"loss": 2.7851,
"step": 114
},
{
"epoch": 0.7783417935702199,
"grad_norm": 3.560981035232544,
"learning_rate": 1.7363792242752354e-05,
"loss": 2.8977,
"step": 115
},
{
"epoch": 0.7851099830795262,
"grad_norm": 5.971733570098877,
"learning_rate": 1.731381577160161e-05,
"loss": 2.7807,
"step": 116
},
{
"epoch": 0.7918781725888325,
"grad_norm": 5.342052459716797,
"learning_rate": 1.726344339994841e-05,
"loss": 2.8701,
"step": 117
},
{
"epoch": 0.7986463620981388,
"grad_norm": 5.152158737182617,
"learning_rate": 1.7212677854474402e-05,
"loss": 2.6611,
"step": 118
},
{
"epoch": 0.805414551607445,
"grad_norm": 5.67462682723999,
"learning_rate": 1.7161521883143936e-05,
"loss": 2.6134,
"step": 119
},
{
"epoch": 0.8121827411167513,
"grad_norm": 4.32338285446167,
"learning_rate": 1.7109978255055295e-05,
"loss": 2.7922,
"step": 120
},
{
"epoch": 0.8189509306260575,
"grad_norm": 4.190022945404053,
"learning_rate": 1.705804976029083e-05,
"loss": 2.7966,
"step": 121
},
{
"epoch": 0.8257191201353637,
"grad_norm": 3.7118101119995117,
"learning_rate": 1.7005739209765906e-05,
"loss": 2.8186,
"step": 122
},
{
"epoch": 0.8324873096446701,
"grad_norm": 5.160277366638184,
"learning_rate": 1.6953049435076768e-05,
"loss": 2.9102,
"step": 123
},
{
"epoch": 0.8392554991539763,
"grad_norm": 5.411961078643799,
"learning_rate": 1.6899983288347248e-05,
"loss": 2.7617,
"step": 124
},
{
"epoch": 0.8460236886632826,
"grad_norm": 7.095698356628418,
"learning_rate": 1.6846543642074382e-05,
"loss": 2.6926,
"step": 125
},
{
"epoch": 0.8527918781725888,
"grad_norm": 2.7258427143096924,
"learning_rate": 1.679273338897293e-05,
"loss": 2.6996,
"step": 126
},
{
"epoch": 0.8595600676818951,
"grad_norm": 3.8327107429504395,
"learning_rate": 1.6738555441818785e-05,
"loss": 2.7992,
"step": 127
},
{
"epoch": 0.8663282571912013,
"grad_norm": 4.773505687713623,
"learning_rate": 1.668401273329129e-05,
"loss": 2.7682,
"step": 128
},
{
"epoch": 0.8730964467005076,
"grad_norm": 4.107465744018555,
"learning_rate": 1.6629108215814523e-05,
"loss": 2.8903,
"step": 129
},
{
"epoch": 0.8798646362098139,
"grad_norm": 4.615577220916748,
"learning_rate": 1.6573844861397444e-05,
"loss": 2.8723,
"step": 130
},
{
"epoch": 0.8866328257191202,
"grad_norm": 7.497233867645264,
"learning_rate": 1.6518225661473045e-05,
"loss": 2.8291,
"step": 131
},
{
"epoch": 0.8934010152284264,
"grad_norm": 7.081593036651611,
"learning_rate": 1.6462253626736413e-05,
"loss": 2.5866,
"step": 132
},
{
"epoch": 0.9001692047377327,
"grad_norm": 3.4112582206726074,
"learning_rate": 1.6405931786981753e-05,
"loss": 2.7011,
"step": 133
},
{
"epoch": 0.9069373942470389,
"grad_norm": 4.411227226257324,
"learning_rate": 1.63492631909384e-05,
"loss": 2.7789,
"step": 134
},
{
"epoch": 0.9137055837563451,
"grad_norm": 5.724678993225098,
"learning_rate": 1.629225090610577e-05,
"loss": 2.8353,
"step": 135
},
{
"epoch": 0.9204737732656514,
"grad_norm": 5.374405860900879,
"learning_rate": 1.6234898018587336e-05,
"loss": 2.9269,
"step": 136
},
{
"epoch": 0.9272419627749577,
"grad_norm": 5.2880072593688965,
"learning_rate": 1.6177207632923558e-05,
"loss": 2.7229,
"step": 137
},
{
"epoch": 0.934010152284264,
"grad_norm": 5.647241115570068,
"learning_rate": 1.6119182871923834e-05,
"loss": 2.6128,
"step": 138
},
{
"epoch": 0.9407783417935702,
"grad_norm": 3.1383461952209473,
"learning_rate": 1.606082687649748e-05,
"loss": 2.708,
"step": 139
},
{
"epoch": 0.9475465313028765,
"grad_norm": 3.174626111984253,
"learning_rate": 1.6002142805483686e-05,
"loss": 2.8192,
"step": 140
},
{
"epoch": 0.9543147208121827,
"grad_norm": 3.3176159858703613,
"learning_rate": 1.5943133835480536e-05,
"loss": 2.8202,
"step": 141
},
{
"epoch": 0.961082910321489,
"grad_norm": 4.013696193695068,
"learning_rate": 1.588380316067307e-05,
"loss": 2.7887,
"step": 142
},
{
"epoch": 0.9678510998307953,
"grad_norm": 5.064754009246826,
"learning_rate": 1.582415399266036e-05,
"loss": 2.8008,
"step": 143
},
{
"epoch": 0.9746192893401016,
"grad_norm": 5.884125232696533,
"learning_rate": 1.5764189560281677e-05,
"loss": 2.6257,
"step": 144
},
{
"epoch": 0.9813874788494078,
"grad_norm": 3.0231032371520996,
"learning_rate": 1.5703913109441715e-05,
"loss": 2.7147,
"step": 145
},
{
"epoch": 0.988155668358714,
"grad_norm": 3.241084337234497,
"learning_rate": 1.564332790293487e-05,
"loss": 2.7612,
"step": 146
},
{
"epoch": 0.9949238578680203,
"grad_norm": 4.9145121574401855,
"learning_rate": 1.5582437220268648e-05,
"loss": 2.8171,
"step": 147
},
{
"epoch": 1.0016920473773265,
"grad_norm": 5.478322982788086,
"learning_rate": 1.5521244357486132e-05,
"loss": 2.6166,
"step": 148
},
{
"epoch": 1.0084602368866329,
"grad_norm": 2.909008502960205,
"learning_rate": 1.5459752626987563e-05,
"loss": 2.4026,
"step": 149
},
{
"epoch": 1.015228426395939,
"grad_norm": 3.355454206466675,
"learning_rate": 1.5397965357351035e-05,
"loss": 2.2265,
"step": 150
},
{
"epoch": 1.0219966159052454,
"grad_norm": 3.659177541732788,
"learning_rate": 1.5335885893152335e-05,
"loss": 2.1872,
"step": 151
},
{
"epoch": 1.0287648054145515,
"grad_norm": 4.308448791503906,
"learning_rate": 1.5273517594783878e-05,
"loss": 2.0188,
"step": 152
},
{
"epoch": 1.0355329949238579,
"grad_norm": 4.801682949066162,
"learning_rate": 1.521086383827282e-05,
"loss": 1.9166,
"step": 153
},
{
"epoch": 1.0423011844331642,
"grad_norm": 6.2991790771484375,
"learning_rate": 1.5147928015098309e-05,
"loss": 1.6925,
"step": 154
},
{
"epoch": 1.0490693739424704,
"grad_norm": 7.9047417640686035,
"learning_rate": 1.5084713532007906e-05,
"loss": 2.5637,
"step": 155
},
{
"epoch": 1.0558375634517767,
"grad_norm": 6.511372089385986,
"learning_rate": 1.5021223810833165e-05,
"loss": 2.3506,
"step": 156
},
{
"epoch": 1.0626057529610828,
"grad_norm": 5.02034854888916,
"learning_rate": 1.4957462288304421e-05,
"loss": 2.1029,
"step": 157
},
{
"epoch": 1.0693739424703892,
"grad_norm": 5.005341529846191,
"learning_rate": 1.489343241586475e-05,
"loss": 2.0565,
"step": 158
},
{
"epoch": 1.0761421319796955,
"grad_norm": 5.689651012420654,
"learning_rate": 1.4829137659483144e-05,
"loss": 1.9412,
"step": 159
},
{
"epoch": 1.0829103214890017,
"grad_norm": 6.038967609405518,
"learning_rate": 1.4764581499466895e-05,
"loss": 1.675,
"step": 160
},
{
"epoch": 1.089678510998308,
"grad_norm": 4.393552303314209,
"learning_rate": 1.4699767430273202e-05,
"loss": 2.1734,
"step": 161
},
{
"epoch": 1.0964467005076142,
"grad_norm": 3.555631637573242,
"learning_rate": 1.4634698960320018e-05,
"loss": 2.187,
"step": 162
},
{
"epoch": 1.1032148900169205,
"grad_norm": 3.7586710453033447,
"learning_rate": 1.4569379611796137e-05,
"loss": 1.9961,
"step": 163
},
{
"epoch": 1.1099830795262267,
"grad_norm": 4.319566249847412,
"learning_rate": 1.4503812920470535e-05,
"loss": 1.958,
"step": 164
},
{
"epoch": 1.116751269035533,
"grad_norm": 4.831964015960693,
"learning_rate": 1.443800243550098e-05,
"loss": 1.7072,
"step": 165
},
{
"epoch": 1.1235194585448394,
"grad_norm": 6.157094478607178,
"learning_rate": 1.4371951719241906e-05,
"loss": 1.7674,
"step": 166
},
{
"epoch": 1.1302876480541455,
"grad_norm": 4.833260536193848,
"learning_rate": 1.4305664347051586e-05,
"loss": 1.9227,
"step": 167
},
{
"epoch": 1.1370558375634519,
"grad_norm": 3.5581912994384766,
"learning_rate": 1.423914390709861e-05,
"loss": 2.3748,
"step": 168
},
{
"epoch": 1.143824027072758,
"grad_norm": 3.734834909439087,
"learning_rate": 1.4172394000167625e-05,
"loss": 2.0371,
"step": 169
},
{
"epoch": 1.1505922165820643,
"grad_norm": 4.00279426574707,
"learning_rate": 1.4105418239464452e-05,
"loss": 2.0383,
"step": 170
},
{
"epoch": 1.1573604060913705,
"grad_norm": 4.664214134216309,
"learning_rate": 1.4038220250420487e-05,
"loss": 1.9445,
"step": 171
},
{
"epoch": 1.1641285956006768,
"grad_norm": 5.319397926330566,
"learning_rate": 1.3970803670496453e-05,
"loss": 1.7367,
"step": 172
},
{
"epoch": 1.1708967851099832,
"grad_norm": 5.559267520904541,
"learning_rate": 1.390317214898551e-05,
"loss": 1.7855,
"step": 173
},
{
"epoch": 1.1776649746192893,
"grad_norm": 3.4772238731384277,
"learning_rate": 1.3835329346815716e-05,
"loss": 2.3614,
"step": 174
},
{
"epoch": 1.1844331641285957,
"grad_norm": 3.456766366958618,
"learning_rate": 1.3767278936351853e-05,
"loss": 2.1906,
"step": 175
},
{
"epoch": 1.1912013536379018,
"grad_norm": 3.739302635192871,
"learning_rate": 1.3699024601196641e-05,
"loss": 2.0554,
"step": 176
},
{
"epoch": 1.1979695431472082,
"grad_norm": 4.194780349731445,
"learning_rate": 1.3630570035991352e-05,
"loss": 1.8769,
"step": 177
},
{
"epoch": 1.2047377326565143,
"grad_norm": 5.365659713745117,
"learning_rate": 1.3561918946215807e-05,
"loss": 1.7156,
"step": 178
},
{
"epoch": 1.2115059221658206,
"grad_norm": 6.615947723388672,
"learning_rate": 1.34930750479878e-05,
"loss": 1.6489,
"step": 179
},
{
"epoch": 1.218274111675127,
"grad_norm": 4.608173847198486,
"learning_rate": 1.3424042067861944e-05,
"loss": 2.4078,
"step": 180
},
{
"epoch": 1.2250423011844331,
"grad_norm": 3.3148863315582275,
"learning_rate": 1.335482374262795e-05,
"loss": 2.2092,
"step": 181
},
{
"epoch": 1.2318104906937395,
"grad_norm": 4.692728519439697,
"learning_rate": 1.3285423819108349e-05,
"loss": 1.9361,
"step": 182
},
{
"epoch": 1.2385786802030456,
"grad_norm": 4.571840763092041,
"learning_rate": 1.3215846053955683e-05,
"loss": 1.9115,
"step": 183
},
{
"epoch": 1.245346869712352,
"grad_norm": 5.025711536407471,
"learning_rate": 1.3146094213449148e-05,
"loss": 1.7432,
"step": 184
},
{
"epoch": 1.252115059221658,
"grad_norm": 6.1127095222473145,
"learning_rate": 1.3076172073290726e-05,
"loss": 1.5802,
"step": 185
},
{
"epoch": 1.2588832487309645,
"grad_norm": 5.005325794219971,
"learning_rate": 1.3006083418400799e-05,
"loss": 2.2672,
"step": 186
},
{
"epoch": 1.2656514382402708,
"grad_norm": 3.2444660663604736,
"learning_rate": 1.2935832042713288e-05,
"loss": 2.2101,
"step": 187
},
{
"epoch": 1.272419627749577,
"grad_norm": 3.3180994987487793,
"learning_rate": 1.2865421748970257e-05,
"loss": 2.1237,
"step": 188
},
{
"epoch": 1.2791878172588833,
"grad_norm": 4.625007629394531,
"learning_rate": 1.2794856348516095e-05,
"loss": 1.9741,
"step": 189
},
{
"epoch": 1.2859560067681894,
"grad_norm": 4.619353294372559,
"learning_rate": 1.2724139661091188e-05,
"loss": 1.9425,
"step": 190
},
{
"epoch": 1.2927241962774958,
"grad_norm": 5.504361152648926,
"learning_rate": 1.2653275514625165e-05,
"loss": 1.7012,
"step": 191
},
{
"epoch": 1.299492385786802,
"grad_norm": 4.399888515472412,
"learning_rate": 1.2582267745029685e-05,
"loss": 1.9316,
"step": 192
},
{
"epoch": 1.3062605752961083,
"grad_norm": 3.53360915184021,
"learning_rate": 1.2511120195990797e-05,
"loss": 2.3907,
"step": 193
},
{
"epoch": 1.3130287648054146,
"grad_norm": 3.4914515018463135,
"learning_rate": 1.2439836718760887e-05,
"loss": 2.0797,
"step": 194
},
{
"epoch": 1.3197969543147208,
"grad_norm": 3.7882394790649414,
"learning_rate": 1.2368421171950193e-05,
"loss": 1.955,
"step": 195
},
{
"epoch": 1.3265651438240271,
"grad_norm": 4.370715141296387,
"learning_rate": 1.2296877421317958e-05,
"loss": 1.8437,
"step": 196
},
{
"epoch": 1.3333333333333333,
"grad_norm": 5.414830207824707,
"learning_rate": 1.2225209339563144e-05,
"loss": 1.8579,
"step": 197
},
{
"epoch": 1.3401015228426396,
"grad_norm": 5.272250652313232,
"learning_rate": 1.215342080611484e-05,
"loss": 1.7614,
"step": 198
},
{
"epoch": 1.3468697123519457,
"grad_norm": 4.075460910797119,
"learning_rate": 1.2081515706922226e-05,
"loss": 2.3666,
"step": 199
},
{
"epoch": 1.353637901861252,
"grad_norm": 2.9030683040618896,
"learning_rate": 1.2009497934244257e-05,
"loss": 2.0487,
"step": 200
},
{
"epoch": 1.3604060913705585,
"grad_norm": 4.147029876708984,
"learning_rate": 1.1937371386438954e-05,
"loss": 1.9878,
"step": 201
},
{
"epoch": 1.3671742808798646,
"grad_norm": 5.0643439292907715,
"learning_rate": 1.186513996775239e-05,
"loss": 1.8252,
"step": 202
},
{
"epoch": 1.373942470389171,
"grad_norm": 5.364940166473389,
"learning_rate": 1.1792807588107358e-05,
"loss": 1.7401,
"step": 203
},
{
"epoch": 1.380710659898477,
"grad_norm": 6.356777191162109,
"learning_rate": 1.1720378162891709e-05,
"loss": 1.5169,
"step": 204
},
{
"epoch": 1.3874788494077834,
"grad_norm": 3.031667709350586,
"learning_rate": 1.1647855612746423e-05,
"loss": 2.3757,
"step": 205
},
{
"epoch": 1.3942470389170896,
"grad_norm": 3.478210926055908,
"learning_rate": 1.1575243863353383e-05,
"loss": 2.1897,
"step": 206
},
{
"epoch": 1.401015228426396,
"grad_norm": 3.7287087440490723,
"learning_rate": 1.150254684522286e-05,
"loss": 2.0368,
"step": 207
},
{
"epoch": 1.4077834179357023,
"grad_norm": 4.0293779373168945,
"learning_rate": 1.142976849348078e-05,
"loss": 1.9049,
"step": 208
},
{
"epoch": 1.4145516074450084,
"grad_norm": 4.953205108642578,
"learning_rate": 1.1356912747655687e-05,
"loss": 1.7872,
"step": 209
},
{
"epoch": 1.4213197969543148,
"grad_norm": 6.160380840301514,
"learning_rate": 1.1283983551465512e-05,
"loss": 1.7295,
"step": 210
},
{
"epoch": 1.4280879864636211,
"grad_norm": 5.149349212646484,
"learning_rate": 1.1210984852604084e-05,
"loss": 2.1102,
"step": 211
},
{
"epoch": 1.4348561759729273,
"grad_norm": 3.172128915786743,
"learning_rate": 1.1137920602527448e-05,
"loss": 2.2288,
"step": 212
},
{
"epoch": 1.4416243654822334,
"grad_norm": 3.4528701305389404,
"learning_rate": 1.1064794756239978e-05,
"loss": 2.0189,
"step": 213
},
{
"epoch": 1.4483925549915397,
"grad_norm": 4.66202449798584,
"learning_rate": 1.099161127208027e-05,
"loss": 1.8742,
"step": 214
},
{
"epoch": 1.455160744500846,
"grad_norm": 5.142988681793213,
"learning_rate": 1.0918374111506893e-05,
"loss": 1.9004,
"step": 215
},
{
"epoch": 1.4619289340101522,
"grad_norm": 5.548466205596924,
"learning_rate": 1.0845087238883945e-05,
"loss": 1.5929,
"step": 216
},
{
"epoch": 1.4686971235194586,
"grad_norm": 4.51755428314209,
"learning_rate": 1.0771754621266466e-05,
"loss": 1.9563,
"step": 217
},
{
"epoch": 1.475465313028765,
"grad_norm": 3.1326138973236084,
"learning_rate": 1.0698380228185685e-05,
"loss": 2.2197,
"step": 218
},
{
"epoch": 1.482233502538071,
"grad_norm": 3.594095468521118,
"learning_rate": 1.0624968031434174e-05,
"loss": 2.0466,
"step": 219
},
{
"epoch": 1.4890016920473772,
"grad_norm": 3.841886281967163,
"learning_rate": 1.0551522004850821e-05,
"loss": 1.9612,
"step": 220
},
{
"epoch": 1.4957698815566836,
"grad_norm": 4.422885417938232,
"learning_rate": 1.0478046124105746e-05,
"loss": 1.8449,
"step": 221
},
{
"epoch": 1.50253807106599,
"grad_norm": 5.432779788970947,
"learning_rate": 1.0404544366485094e-05,
"loss": 1.7364,
"step": 222
},
{
"epoch": 1.509306260575296,
"grad_norm": 5.873152256011963,
"learning_rate": 1.033102071067573e-05,
"loss": 1.6825,
"step": 223
},
{
"epoch": 1.5160744500846024,
"grad_norm": 3.36773943901062,
"learning_rate": 1.0257479136549889e-05,
"loss": 2.3463,
"step": 224
},
{
"epoch": 1.5228426395939088,
"grad_norm": 3.3323042392730713,
"learning_rate": 1.0183923624949721e-05,
"loss": 2.0683,
"step": 225
},
{
"epoch": 1.5296108291032149,
"grad_norm": 3.8202672004699707,
"learning_rate": 1.0110358157471825e-05,
"loss": 1.9565,
"step": 226
},
{
"epoch": 1.536379018612521,
"grad_norm": 4.67080545425415,
"learning_rate": 1.0036786716251721e-05,
"loss": 1.8865,
"step": 227
},
{
"epoch": 1.5431472081218274,
"grad_norm": 5.312952995300293,
"learning_rate": 9.963213283748282e-06,
"loss": 1.7068,
"step": 228
},
{
"epoch": 1.5499153976311337,
"grad_norm": 6.728119850158691,
"learning_rate": 9.889641842528179e-06,
"loss": 1.6627,
"step": 229
},
{
"epoch": 1.5566835871404399,
"grad_norm": 2.4371559619903564,
"learning_rate": 9.816076375050284e-06,
"loss": 2.3459,
"step": 230
},
{
"epoch": 1.5634517766497462,
"grad_norm": 2.8036484718322754,
"learning_rate": 9.742520863450116e-06,
"loss": 2.1804,
"step": 231
},
{
"epoch": 1.5702199661590526,
"grad_norm": 3.5675642490386963,
"learning_rate": 9.668979289324274e-06,
"loss": 2.0749,
"step": 232
},
{
"epoch": 1.5769881556683587,
"grad_norm": 4.099052906036377,
"learning_rate": 9.595455633514908e-06,
"loss": 1.8576,
"step": 233
},
{
"epoch": 1.5837563451776648,
"grad_norm": 4.900853633880615,
"learning_rate": 9.521953875894256e-06,
"loss": 1.7174,
"step": 234
},
{
"epoch": 1.5905245346869712,
"grad_norm": 5.890774726867676,
"learning_rate": 9.448477995149182e-06,
"loss": 1.4906,
"step": 235
},
{
"epoch": 1.5972927241962775,
"grad_norm": 4.369800567626953,
"learning_rate": 9.37503196856583e-06,
"loss": 2.0832,
"step": 236
},
{
"epoch": 1.6040609137055837,
"grad_norm": 3.1959829330444336,
"learning_rate": 9.301619771814317e-06,
"loss": 2.2265,
"step": 237
},
{
"epoch": 1.61082910321489,
"grad_norm": 3.255842924118042,
"learning_rate": 9.228245378733537e-06,
"loss": 2.0659,
"step": 238
},
{
"epoch": 1.6175972927241964,
"grad_norm": 3.865798234939575,
"learning_rate": 9.154912761116056e-06,
"loss": 1.9102,
"step": 239
},
{
"epoch": 1.6243654822335025,
"grad_norm": 4.725029945373535,
"learning_rate": 9.081625888493107e-06,
"loss": 1.7442,
"step": 240
},
{
"epoch": 1.6311336717428087,
"grad_norm": 8.740133285522461,
"learning_rate": 9.00838872791973e-06,
"loss": 1.6959,
"step": 241
},
{
"epoch": 1.637901861252115,
"grad_norm": 4.786500930786133,
"learning_rate": 8.935205243760022e-06,
"loss": 1.8628,
"step": 242
},
{
"epoch": 1.6446700507614214,
"grad_norm": 3.0056700706481934,
"learning_rate": 8.862079397472552e-06,
"loss": 2.2218,
"step": 243
},
{
"epoch": 1.6514382402707275,
"grad_norm": 3.35292911529541,
"learning_rate": 8.78901514739592e-06,
"loss": 2.0775,
"step": 244
},
{
"epoch": 1.6582064297800339,
"grad_norm": 3.997661590576172,
"learning_rate": 8.71601644853449e-06,
"loss": 1.9842,
"step": 245
},
{
"epoch": 1.6649746192893402,
"grad_norm": 4.569092273712158,
"learning_rate": 8.643087252344313e-06,
"loss": 1.8055,
"step": 246
},
{
"epoch": 1.6717428087986463,
"grad_norm": 5.217006683349609,
"learning_rate": 8.57023150651922e-06,
"loss": 1.5681,
"step": 247
},
{
"epoch": 1.6785109983079525,
"grad_norm": 5.526303291320801,
"learning_rate": 8.49745315477714e-06,
"loss": 1.7118,
"step": 248
},
{
"epoch": 1.6852791878172588,
"grad_norm": 2.5200791358947754,
"learning_rate": 8.424756136646624e-06,
"loss": 2.2932,
"step": 249
},
{
"epoch": 1.6920473773265652,
"grad_norm": 3.4209508895874023,
"learning_rate": 8.352144387253582e-06,
"loss": 2.0515,
"step": 250
},
{
"epoch": 1.6988155668358713,
"grad_norm": 3.7960565090179443,
"learning_rate": 8.279621837108295e-06,
"loss": 1.9207,
"step": 251
},
{
"epoch": 1.7055837563451777,
"grad_norm": 4.094236373901367,
"learning_rate": 8.207192411892645e-06,
"loss": 1.7885,
"step": 252
},
{
"epoch": 1.712351945854484,
"grad_norm": 4.837678909301758,
"learning_rate": 8.134860032247613e-06,
"loss": 1.6723,
"step": 253
},
{
"epoch": 1.7191201353637902,
"grad_norm": 6.248587608337402,
"learning_rate": 8.062628613561051e-06,
"loss": 1.4528,
"step": 254
},
{
"epoch": 1.7258883248730963,
"grad_norm": 2.59256911277771,
"learning_rate": 7.990502065755748e-06,
"loss": 2.3992,
"step": 255
},
{
"epoch": 1.7326565143824029,
"grad_norm": 2.9640893936157227,
"learning_rate": 7.918484293077777e-06,
"loss": 2.1847,
"step": 256
},
{
"epoch": 1.739424703891709,
"grad_norm": 3.4181110858917236,
"learning_rate": 7.846579193885165e-06,
"loss": 2.0231,
"step": 257
},
{
"epoch": 1.7461928934010151,
"grad_norm": 4.158235549926758,
"learning_rate": 7.774790660436857e-06,
"loss": 1.9362,
"step": 258
},
{
"epoch": 1.7529610829103215,
"grad_norm": 4.829765796661377,
"learning_rate": 7.703122578682047e-06,
"loss": 1.7278,
"step": 259
},
{
"epoch": 1.7597292724196278,
"grad_norm": 5.691404819488525,
"learning_rate": 7.631578828049809e-06,
"loss": 1.6055,
"step": 260
},
{
"epoch": 1.766497461928934,
"grad_norm": 3.2796614170074463,
"learning_rate": 7.560163281239116e-06,
"loss": 2.0519,
"step": 261
},
{
"epoch": 1.77326565143824,
"grad_norm": 2.57660174369812,
"learning_rate": 7.488879804009206e-06,
"loss": 2.1934,
"step": 262
},
{
"epoch": 1.7800338409475467,
"grad_norm": 3.3291141986846924,
"learning_rate": 7.4177322549703165e-06,
"loss": 2.0575,
"step": 263
},
{
"epoch": 1.7868020304568528,
"grad_norm": 3.7897515296936035,
"learning_rate": 7.346724485374837e-06,
"loss": 1.7963,
"step": 264
},
{
"epoch": 1.793570219966159,
"grad_norm": 4.585766315460205,
"learning_rate": 7.275860338908815e-06,
"loss": 1.765,
"step": 265
},
{
"epoch": 1.8003384094754653,
"grad_norm": 5.705550670623779,
"learning_rate": 7.2051436514839064e-06,
"loss": 1.6657,
"step": 266
},
{
"epoch": 1.8071065989847717,
"grad_norm": 4.510739326477051,
"learning_rate": 7.134578251029745e-06,
"loss": 1.8443,
"step": 267
},
{
"epoch": 1.8138747884940778,
"grad_norm": 3.171539068222046,
"learning_rate": 7.064167957286714e-06,
"loss": 2.3002,
"step": 268
},
{
"epoch": 1.8206429780033841,
"grad_norm": 3.5727908611297607,
"learning_rate": 6.993916581599203e-06,
"loss": 2.0323,
"step": 269
},
{
"epoch": 1.8274111675126905,
"grad_norm": 3.5468742847442627,
"learning_rate": 6.923827926709277e-06,
"loss": 1.9025,
"step": 270
},
{
"epoch": 1.8341793570219966,
"grad_norm": 4.465723037719727,
"learning_rate": 6.853905786550855e-06,
"loss": 1.8105,
"step": 271
},
{
"epoch": 1.8409475465313028,
"grad_norm": 5.095712184906006,
"learning_rate": 6.784153946044321e-06,
"loss": 1.6591,
"step": 272
},
{
"epoch": 1.8477157360406091,
"grad_norm": 5.340912818908691,
"learning_rate": 6.714576180891653e-06,
"loss": 1.6851,
"step": 273
},
{
"epoch": 1.8544839255499155,
"grad_norm": 3.092374801635742,
"learning_rate": 6.645176257372054e-06,
"loss": 2.3331,
"step": 274
},
{
"epoch": 1.8612521150592216,
"grad_norm": 2.924107551574707,
"learning_rate": 6.5759579321380576e-06,
"loss": 2.1575,
"step": 275
},
{
"epoch": 1.868020304568528,
"grad_norm": 3.7559361457824707,
"learning_rate": 6.5069249520122026e-06,
"loss": 1.9893,
"step": 276
},
{
"epoch": 1.8747884940778343,
"grad_norm": 4.786612510681152,
"learning_rate": 6.438081053784197e-06,
"loss": 1.7583,
"step": 277
},
{
"epoch": 1.8815566835871405,
"grad_norm": 5.209157466888428,
"learning_rate": 6.36942996400865e-06,
"loss": 1.7118,
"step": 278
},
{
"epoch": 1.8883248730964466,
"grad_norm": 6.413548469543457,
"learning_rate": 6.300975398803362e-06,
"loss": 1.5219,
"step": 279
},
{
"epoch": 1.895093062605753,
"grad_norm": 2.4699904918670654,
"learning_rate": 6.232721063648148e-06,
"loss": 2.3592,
"step": 280
},
{
"epoch": 1.9018612521150593,
"grad_norm": 2.732497453689575,
"learning_rate": 6.1646706531842845e-06,
"loss": 2.0984,
"step": 281
},
{
"epoch": 1.9086294416243654,
"grad_norm": 3.4677207469940186,
"learning_rate": 6.09682785101449e-06,
"loss": 1.9303,
"step": 282
},
{
"epoch": 1.9153976311336718,
"grad_norm": 3.888166666030884,
"learning_rate": 6.029196329503548e-06,
"loss": 1.8503,
"step": 283
},
{
"epoch": 1.9221658206429781,
"grad_norm": 4.850317001342773,
"learning_rate": 5.961779749579516e-06,
"loss": 1.711,
"step": 284
},
{
"epoch": 1.9289340101522843,
"grad_norm": 5.663942813873291,
"learning_rate": 5.8945817605355495e-06,
"loss": 1.5138,
"step": 285
},
{
"epoch": 1.9357021996615904,
"grad_norm": 3.6366028785705566,
"learning_rate": 5.827605999832375e-06,
"loss": 2.0733,
"step": 286
},
{
"epoch": 1.9424703891708968,
"grad_norm": 2.522986650466919,
"learning_rate": 5.760856092901394e-06,
"loss": 2.2134,
"step": 287
},
{
"epoch": 1.9492385786802031,
"grad_norm": 3.196727991104126,
"learning_rate": 5.694335652948415e-06,
"loss": 2.0477,
"step": 288
},
{
"epoch": 1.9560067681895092,
"grad_norm": 3.9079673290252686,
"learning_rate": 5.628048280758096e-06,
"loss": 1.963,
"step": 289
},
{
"epoch": 1.9627749576988156,
"grad_norm": 4.57443380355835,
"learning_rate": 5.561997564499024e-06,
"loss": 1.7234,
"step": 290
},
{
"epoch": 1.969543147208122,
"grad_norm": 5.271142482757568,
"learning_rate": 5.4961870795294644e-06,
"loss": 1.6151,
"step": 291
},
{
"epoch": 1.976311336717428,
"grad_norm": 4.357114315032959,
"learning_rate": 5.430620388203866e-06,
"loss": 1.9279,
"step": 292
},
{
"epoch": 1.9830795262267342,
"grad_norm": 2.8121213912963867,
"learning_rate": 5.365301039679985e-06,
"loss": 2.1234,
"step": 293
},
{
"epoch": 1.9898477157360406,
"grad_norm": 4.057702541351318,
"learning_rate": 5.300232569726805e-06,
"loss": 1.9386,
"step": 294
},
{
"epoch": 1.996615905245347,
"grad_norm": 5.310722351074219,
"learning_rate": 5.2354185005331095e-06,
"loss": 1.6403,
"step": 295
},
{
"epoch": 2.003384094754653,
"grad_norm": 3.646991729736328,
"learning_rate": 5.170862340516858e-06,
"loss": 1.9625,
"step": 296
},
{
"epoch": 2.010152284263959,
"grad_norm": 3.910515069961548,
"learning_rate": 5.106567584135251e-06,
"loss": 1.6498,
"step": 297
},
{
"epoch": 2.0169204737732658,
"grad_norm": 5.364322662353516,
"learning_rate": 5.042537711695584e-06,
"loss": 1.4023,
"step": 298
},
{
"epoch": 2.023688663282572,
"grad_norm": 6.2862396240234375,
"learning_rate": 4.97877618916684e-06,
"loss": 1.0733,
"step": 299
},
{
"epoch": 2.030456852791878,
"grad_norm": 7.049383163452148,
"learning_rate": 4.915286467992098e-06,
"loss": 0.9028,
"step": 300
},
{
"epoch": 2.0372250423011846,
"grad_norm": 6.528897762298584,
"learning_rate": 4.852071984901696e-06,
"loss": 0.6975,
"step": 301
},
{
"epoch": 2.0439932318104908,
"grad_norm": 4.566006660461426,
"learning_rate": 4.789136161727184e-06,
"loss": 0.996,
"step": 302
},
{
"epoch": 2.050761421319797,
"grad_norm": 4.8525590896606445,
"learning_rate": 4.7264824052161255e-06,
"loss": 1.9084,
"step": 303
},
{
"epoch": 2.057529610829103,
"grad_norm": 8.884151458740234,
"learning_rate": 4.664114106847667e-06,
"loss": 1.2406,
"step": 304
},
{
"epoch": 2.0642978003384096,
"grad_norm": 9.265266418457031,
"learning_rate": 4.602034642648968e-06,
"loss": 0.974,
"step": 305
},
{
"epoch": 2.0710659898477157,
"grad_norm": 8.75934886932373,
"learning_rate": 4.5402473730124395e-06,
"loss": 0.8314,
"step": 306
},
{
"epoch": 2.077834179357022,
"grad_norm": 7.045146942138672,
"learning_rate": 4.478755642513868e-06,
"loss": 0.7014,
"step": 307
},
{
"epoch": 2.0846023688663284,
"grad_norm": 5.934751033782959,
"learning_rate": 4.417562779731355e-06,
"loss": 0.5536,
"step": 308
},
{
"epoch": 2.0913705583756346,
"grad_norm": 4.2714314460754395,
"learning_rate": 4.356672097065134e-06,
"loss": 1.954,
"step": 309
},
{
"epoch": 2.0981387478849407,
"grad_norm": 3.837898015975952,
"learning_rate": 4.2960868905582895e-06,
"loss": 1.4183,
"step": 310
},
{
"epoch": 2.104906937394247,
"grad_norm": 4.858175277709961,
"learning_rate": 4.235810439718327e-06,
"loss": 1.0733,
"step": 311
},
{
"epoch": 2.1116751269035534,
"grad_norm": 5.005491256713867,
"learning_rate": 4.175846007339644e-06,
"loss": 0.851,
"step": 312
},
{
"epoch": 2.1184433164128595,
"grad_norm": 5.400625228881836,
"learning_rate": 4.1161968393269324e-06,
"loss": 0.7486,
"step": 313
},
{
"epoch": 2.1252115059221657,
"grad_norm": 5.8098602294921875,
"learning_rate": 4.0568661645194656e-06,
"loss": 0.5741,
"step": 314
},
{
"epoch": 2.1319796954314723,
"grad_norm": 3.7481307983398438,
"learning_rate": 3.997857194516319e-06,
"loss": 1.7741,
"step": 315
},
{
"epoch": 2.1387478849407784,
"grad_norm": 3.240912675857544,
"learning_rate": 3.939173123502523e-06,
"loss": 1.5778,
"step": 316
},
{
"epoch": 2.1455160744500845,
"grad_norm": 4.222574710845947,
"learning_rate": 3.8808171280761665e-06,
"loss": 1.0852,
"step": 317
},
{
"epoch": 2.152284263959391,
"grad_norm": 4.52738618850708,
"learning_rate": 3.822792367076446e-06,
"loss": 0.9088,
"step": 318
},
{
"epoch": 2.1590524534686972,
"grad_norm": 5.184245586395264,
"learning_rate": 3.7651019814126656e-06,
"loss": 0.7274,
"step": 319
},
{
"epoch": 2.1658206429780034,
"grad_norm": 6.21406364440918,
"learning_rate": 3.7077490938942307e-06,
"loss": 0.6568,
"step": 320
},
{
"epoch": 2.1725888324873095,
"grad_norm": 4.0037126541137695,
"learning_rate": 3.6507368090616014e-06,
"loss": 1.3107,
"step": 321
},
{
"epoch": 2.179357021996616,
"grad_norm": 3.928704023361206,
"learning_rate": 3.594068213018249e-06,
"loss": 1.6134,
"step": 322
},
{
"epoch": 2.186125211505922,
"grad_norm": 4.245754718780518,
"learning_rate": 3.53774637326359e-06,
"loss": 1.1365,
"step": 323
},
{
"epoch": 2.1928934010152283,
"grad_norm": 4.95959997177124,
"learning_rate": 3.481774338526954e-06,
"loss": 0.9044,
"step": 324
},
{
"epoch": 2.199661590524535,
"grad_norm": 5.446150779724121,
"learning_rate": 3.426155138602558e-06,
"loss": 0.7641,
"step": 325
},
{
"epoch": 2.206429780033841,
"grad_norm": 6.039018630981445,
"learning_rate": 3.3708917841854782e-06,
"loss": 0.6246,
"step": 326
},
{
"epoch": 2.213197969543147,
"grad_norm": 5.4581427574157715,
"learning_rate": 3.3159872667087077e-06,
"loss": 0.9867,
"step": 327
},
{
"epoch": 2.2199661590524533,
"grad_norm": 3.5558555126190186,
"learning_rate": 3.2614445581812183e-06,
"loss": 1.8462,
"step": 328
},
{
"epoch": 2.22673434856176,
"grad_norm": 3.6573843955993652,
"learning_rate": 3.207266611027069e-06,
"loss": 1.2127,
"step": 329
},
{
"epoch": 2.233502538071066,
"grad_norm": 4.450440406799316,
"learning_rate": 3.1534563579256172e-06,
"loss": 0.8708,
"step": 330
},
{
"epoch": 2.240270727580372,
"grad_norm": 4.951565742492676,
"learning_rate": 3.1000167116527525e-06,
"loss": 0.7292,
"step": 331
},
{
"epoch": 2.2470389170896787,
"grad_norm": 5.573976516723633,
"learning_rate": 3.0469505649232333e-06,
"loss": 0.6443,
"step": 332
},
{
"epoch": 2.253807106598985,
"grad_norm": 5.918398857116699,
"learning_rate": 2.9942607902340946e-06,
"loss": 0.5702,
"step": 333
},
{
"epoch": 2.260575296108291,
"grad_norm": 3.47131085395813,
"learning_rate": 2.9419502397091715e-06,
"loss": 1.9211,
"step": 334
},
{
"epoch": 2.267343485617597,
"grad_norm": 3.5219008922576904,
"learning_rate": 2.8900217449447077e-06,
"loss": 1.3083,
"step": 335
},
{
"epoch": 2.2741116751269037,
"grad_norm": 4.355684280395508,
"learning_rate": 2.8384781168560693e-06,
"loss": 0.9792,
"step": 336
},
{
"epoch": 2.28087986463621,
"grad_norm": 4.637706756591797,
"learning_rate": 2.7873221455256006e-06,
"loss": 0.8071,
"step": 337
},
{
"epoch": 2.287648054145516,
"grad_norm": 5.174313068389893,
"learning_rate": 2.736556600051593e-06,
"loss": 0.658,
"step": 338
},
{
"epoch": 2.2944162436548226,
"grad_norm": 5.701674461364746,
"learning_rate": 2.6861842283983953e-06,
"loss": 0.5716,
"step": 339
},
{
"epoch": 2.3011844331641287,
"grad_norm": 3.603616714477539,
"learning_rate": 2.6362077572476495e-06,
"loss": 1.6573,
"step": 340
},
{
"epoch": 2.307952622673435,
"grad_norm": 3.46175217628479,
"learning_rate": 2.586629891850716e-06,
"loss": 1.5,
"step": 341
},
{
"epoch": 2.314720812182741,
"grad_norm": 4.137648105621338,
"learning_rate": 2.5374533158822225e-06,
"loss": 1.2069,
"step": 342
},
{
"epoch": 2.3214890016920475,
"grad_norm": 4.514110565185547,
"learning_rate": 2.4886806912948034e-06,
"loss": 0.8226,
"step": 343
},
{
"epoch": 2.3282571912013537,
"grad_norm": 5.722095012664795,
"learning_rate": 2.4403146581749925e-06,
"loss": 0.7436,
"step": 344
},
{
"epoch": 2.33502538071066,
"grad_norm": 5.680308818817139,
"learning_rate": 2.392357834600336e-06,
"loss": 0.583,
"step": 345
},
{
"epoch": 2.3417935702199664,
"grad_norm": 4.119960784912109,
"learning_rate": 2.3448128164976593e-06,
"loss": 1.2645,
"step": 346
},
{
"epoch": 2.3485617597292725,
"grad_norm": 3.0717074871063232,
"learning_rate": 2.297682177502546e-06,
"loss": 1.538,
"step": 347
},
{
"epoch": 2.3553299492385786,
"grad_norm": 4.0398335456848145,
"learning_rate": 2.2509684688200385e-06,
"loss": 1.0585,
"step": 348
},
{
"epoch": 2.3620981387478848,
"grad_norm": 4.792836666107178,
"learning_rate": 2.204674219086531e-06,
"loss": 0.8199,
"step": 349
},
{
"epoch": 2.3688663282571913,
"grad_norm": 5.0710883140563965,
"learning_rate": 2.158801934232897e-06,
"loss": 0.6387,
"step": 350
},
{
"epoch": 2.3756345177664975,
"grad_norm": 5.128052234649658,
"learning_rate": 2.113354097348834e-06,
"loss": 0.5425,
"step": 351
},
{
"epoch": 2.3824027072758036,
"grad_norm": 4.384050369262695,
"learning_rate": 2.0683331685484655e-06,
"loss": 0.9253,
"step": 352
},
{
"epoch": 2.38917089678511,
"grad_norm": 3.5379750728607178,
"learning_rate": 2.0237415848371666e-06,
"loss": 1.9209,
"step": 353
},
{
"epoch": 2.3959390862944163,
"grad_norm": 3.754819631576538,
"learning_rate": 1.979581759979642e-06,
"loss": 1.2382,
"step": 354
},
{
"epoch": 2.4027072758037225,
"grad_norm": 4.621876239776611,
"learning_rate": 1.9358560843692787e-06,
"loss": 0.9402,
"step": 355
},
{
"epoch": 2.4094754653130286,
"grad_norm": 5.234630584716797,
"learning_rate": 1.892566924898751e-06,
"loss": 0.7772,
"step": 356
},
{
"epoch": 2.416243654822335,
"grad_norm": 6.046688079833984,
"learning_rate": 1.8497166248318876e-06,
"loss": 0.6619,
"step": 357
},
{
"epoch": 2.4230118443316413,
"grad_norm": 5.794624328613281,
"learning_rate": 1.807307503676846e-06,
"loss": 0.5626,
"step": 358
},
{
"epoch": 2.4297800338409474,
"grad_norm": 3.271219253540039,
"learning_rate": 1.7653418570605474e-06,
"loss": 1.8406,
"step": 359
},
{
"epoch": 2.436548223350254,
"grad_norm": 3.5998053550720215,
"learning_rate": 1.7238219566044145e-06,
"loss": 1.3465,
"step": 360
},
{
"epoch": 2.44331641285956,
"grad_norm": 4.231540679931641,
"learning_rate": 1.6827500498014026e-06,
"loss": 1.0409,
"step": 361
},
{
"epoch": 2.4500846023688663,
"grad_norm": 4.704120635986328,
"learning_rate": 1.6421283598943526e-06,
"loss": 0.7836,
"step": 362
},
{
"epoch": 2.4568527918781724,
"grad_norm": 5.459336757659912,
"learning_rate": 1.601959085755641e-06,
"loss": 0.6894,
"step": 363
},
{
"epoch": 2.463620981387479,
"grad_norm": 5.819806098937988,
"learning_rate": 1.5622444017681438e-06,
"loss": 0.5779,
"step": 364
},
{
"epoch": 2.470389170896785,
"grad_norm": 3.3774378299713135,
"learning_rate": 1.5229864577075548e-06,
"loss": 1.6054,
"step": 365
},
{
"epoch": 2.4771573604060912,
"grad_norm": 3.6015894412994385,
"learning_rate": 1.4841873786260019e-06,
"loss": 1.4402,
"step": 366
},
{
"epoch": 2.483925549915398,
"grad_norm": 4.313451766967773,
"learning_rate": 1.445849264737026e-06,
"loss": 0.9478,
"step": 367
},
{
"epoch": 2.490693739424704,
"grad_norm": 5.062134265899658,
"learning_rate": 1.4079741913018863e-06,
"loss": 0.8397,
"step": 368
},
{
"epoch": 2.49746192893401,
"grad_norm": 5.357868194580078,
"learning_rate": 1.3705642085172367e-06,
"loss": 0.5926,
"step": 369
},
{
"epoch": 2.504230118443316,
"grad_norm": 5.440720558166504,
"learning_rate": 1.3336213414041387e-06,
"loss": 0.5514,
"step": 370
},
{
"epoch": 2.510998307952623,
"grad_norm": 3.912086248397827,
"learning_rate": 1.2971475896984475e-06,
"loss": 1.3175,
"step": 371
},
{
"epoch": 2.517766497461929,
"grad_norm": 3.4451215267181396,
"learning_rate": 1.2611449277425715e-06,
"loss": 1.7101,
"step": 372
},
{
"epoch": 2.524534686971235,
"grad_norm": 3.9773149490356445,
"learning_rate": 1.2256153043785911e-06,
"loss": 1.0656,
"step": 373
},
{
"epoch": 2.5313028764805416,
"grad_norm": 4.944250106811523,
"learning_rate": 1.1905606428427775e-06,
"loss": 0.88,
"step": 374
},
{
"epoch": 2.5380710659898478,
"grad_norm": 5.475653171539307,
"learning_rate": 1.1559828406614716e-06,
"loss": 0.6841,
"step": 375
},
{
"epoch": 2.544839255499154,
"grad_norm": 6.01757287979126,
"learning_rate": 1.1218837695483853e-06,
"loss": 0.5779,
"step": 376
},
{
"epoch": 2.55160744500846,
"grad_norm": 4.852456569671631,
"learning_rate": 1.0882652753032797e-06,
"loss": 0.9251,
"step": 377
},
{
"epoch": 2.5583756345177666,
"grad_norm": 3.3849077224731445,
"learning_rate": 1.0551291777120465e-06,
"loss": 1.936,
"step": 378
},
{
"epoch": 2.5651438240270727,
"grad_norm": 3.5754101276397705,
"learning_rate": 1.0224772704482033e-06,
"loss": 1.2097,
"step": 379
},
{
"epoch": 2.571912013536379,
"grad_norm": 4.511597633361816,
"learning_rate": 9.903113209758098e-07,
"loss": 0.999,
"step": 380
},
{
"epoch": 2.5786802030456855,
"grad_norm": 4.68709659576416,
"learning_rate": 9.58633070453785e-07,
"loss": 0.7136,
"step": 381
},
{
"epoch": 2.5854483925549916,
"grad_norm": 6.031564712524414,
"learning_rate": 9.274442336416567e-07,
"loss": 0.6703,
"step": 382
},
{
"epoch": 2.5922165820642977,
"grad_norm": 5.676982402801514,
"learning_rate": 8.967464988067476e-07,
"loss": 0.5741,
"step": 383
},
{
"epoch": 2.598984771573604,
"grad_norm": 3.2455053329467773,
"learning_rate": 8.665415276327871e-07,
"loss": 2.0339,
"step": 384
},
{
"epoch": 2.6057529610829104,
"grad_norm": 3.556863784790039,
"learning_rate": 8.368309551299536e-07,
"loss": 1.325,
"step": 385
},
{
"epoch": 2.6125211505922166,
"grad_norm": 4.362199306488037,
"learning_rate": 8.076163895463862e-07,
"loss": 1.0078,
"step": 386
},
{
"epoch": 2.6192893401015227,
"grad_norm": 4.831475257873535,
"learning_rate": 7.788994122811178e-07,
"loss": 0.8614,
"step": 387
},
{
"epoch": 2.6260575296108293,
"grad_norm": 5.154886245727539,
"learning_rate": 7.506815777984788e-07,
"loss": 0.5961,
"step": 388
},
{
"epoch": 2.6328257191201354,
"grad_norm": 5.211191654205322,
"learning_rate": 7.229644135439473e-07,
"loss": 0.5393,
"step": 389
},
{
"epoch": 2.6395939086294415,
"grad_norm": 3.482469081878662,
"learning_rate": 6.957494198614778e-07,
"loss": 1.725,
"step": 390
},
{
"epoch": 2.6463620981387477,
"grad_norm": 3.1785717010498047,
"learning_rate": 6.690380699122767e-07,
"loss": 1.58,
"step": 391
},
{
"epoch": 2.6531302876480543,
"grad_norm": 3.89457368850708,
"learning_rate": 6.428318095950648e-07,
"loss": 1.0373,
"step": 392
},
{
"epoch": 2.6598984771573604,
"grad_norm": 4.530887126922607,
"learning_rate": 6.171320574678064e-07,
"loss": 0.8817,
"step": 393
},
{
"epoch": 2.6666666666666665,
"grad_norm": 5.074831008911133,
"learning_rate": 5.919402046709288e-07,
"loss": 0.6199,
"step": 394
},
{
"epoch": 2.673434856175973,
"grad_norm": 5.216182231903076,
"learning_rate": 5.672576148520136e-07,
"loss": 0.5777,
"step": 395
},
{
"epoch": 2.6802030456852792,
"grad_norm": 4.10127067565918,
"learning_rate": 5.430856240919779e-07,
"loss": 1.3062,
"step": 396
},
{
"epoch": 2.6869712351945854,
"grad_norm": 3.1244003772735596,
"learning_rate": 5.19425540832762e-07,
"loss": 1.6781,
"step": 397
},
{
"epoch": 2.6937394247038915,
"grad_norm": 3.911149501800537,
"learning_rate": 4.962786458064972e-07,
"loss": 1.1371,
"step": 398
},
{
"epoch": 2.700507614213198,
"grad_norm": 4.34928560256958,
"learning_rate": 4.73646191966175e-07,
"loss": 0.9204,
"step": 399
},
{
"epoch": 2.707275803722504,
"grad_norm": 5.3247551918029785,
"learning_rate": 4.515294044178331e-07,
"loss": 0.7054,
"step": 400
},
{
"epoch": 2.7140439932318103,
"grad_norm": 5.384613990783691,
"learning_rate": 4.299294803542331e-07,
"loss": 0.6055,
"step": 401
},
{
"epoch": 2.720812182741117,
"grad_norm": 4.743732452392578,
"learning_rate": 4.0884758899006007e-07,
"loss": 0.9645,
"step": 402
},
{
"epoch": 2.727580372250423,
"grad_norm": 3.20401930809021,
"learning_rate": 3.882848714986243e-07,
"loss": 1.8007,
"step": 403
},
{
"epoch": 2.734348561759729,
"grad_norm": 3.646068811416626,
"learning_rate": 3.6824244095010064e-07,
"loss": 1.2693,
"step": 404
},
{
"epoch": 2.7411167512690353,
"grad_norm": 4.450921058654785,
"learning_rate": 3.4872138225127137e-07,
"loss": 0.9719,
"step": 405
},
{
"epoch": 2.747884940778342,
"grad_norm": 4.953512668609619,
"learning_rate": 3.2972275208679625e-07,
"loss": 0.8174,
"step": 406
},
{
"epoch": 2.754653130287648,
"grad_norm": 5.153395175933838,
"learning_rate": 3.112475788620217e-07,
"loss": 0.6039,
"step": 407
},
{
"epoch": 2.761421319796954,
"grad_norm": 5.268378257751465,
"learning_rate": 2.932968626473065e-07,
"loss": 0.5195,
"step": 408
},
{
"epoch": 2.7681895093062607,
"grad_norm": 3.0056629180908203,
"learning_rate": 2.758715751238872e-07,
"loss": 1.8951,
"step": 409
},
{
"epoch": 2.774957698815567,
"grad_norm": 3.4584174156188965,
"learning_rate": 2.589726595312858e-07,
"loss": 1.258,
"step": 410
},
{
"epoch": 2.781725888324873,
"grad_norm": 4.383852481842041,
"learning_rate": 2.426010306162485e-07,
"loss": 0.947,
"step": 411
},
{
"epoch": 2.788494077834179,
"grad_norm": 5.073363304138184,
"learning_rate": 2.2675757458323066e-07,
"loss": 0.7545,
"step": 412
},
{
"epoch": 2.7952622673434857,
"grad_norm": 5.620312213897705,
"learning_rate": 2.1144314904642194e-07,
"loss": 0.612,
"step": 413
},
{
"epoch": 2.802030456852792,
"grad_norm": 5.6198530197143555,
"learning_rate": 1.9665858298333006e-07,
"loss": 0.584,
"step": 414
},
{
"epoch": 2.808798646362098,
"grad_norm": 3.5097062587738037,
"learning_rate": 1.824046766899046e-07,
"loss": 1.7121,
"step": 415
},
{
"epoch": 2.8155668358714045,
"grad_norm": 3.444314479827881,
"learning_rate": 1.6868220173721472e-07,
"loss": 1.4931,
"step": 416
},
{
"epoch": 2.8223350253807107,
"grad_norm": 4.086292743682861,
"learning_rate": 1.5549190092968736e-07,
"loss": 0.96,
"step": 417
},
{
"epoch": 2.829103214890017,
"grad_norm": 4.669743061065674,
"learning_rate": 1.4283448826489798e-07,
"loss": 0.7836,
"step": 418
},
{
"epoch": 2.835871404399323,
"grad_norm": 5.329158782958984,
"learning_rate": 1.3071064889491723e-07,
"loss": 0.713,
"step": 419
},
{
"epoch": 2.8426395939086295,
"grad_norm": 5.7975664138793945,
"learning_rate": 1.1912103908922945e-07,
"loss": 0.5847,
"step": 420
},
{
"epoch": 2.8494077834179357,
"grad_norm": 3.9617557525634766,
"learning_rate": 1.0806628619920322e-07,
"loss": 1.3451,
"step": 421
},
{
"epoch": 2.8561759729272422,
"grad_norm": 3.1097750663757324,
"learning_rate": 9.754698862413758e-08,
"loss": 1.4706,
"step": 422
},
{
"epoch": 2.8629441624365484,
"grad_norm": 3.8327267169952393,
"learning_rate": 8.756371577886891e-08,
"loss": 1.1243,
"step": 423
},
{
"epoch": 2.8697123519458545,
"grad_norm": 4.473137855529785,
"learning_rate": 7.81170080629412e-08,
"loss": 0.9026,
"step": 424
},
{
"epoch": 2.8764805414551606,
"grad_norm": 5.105331897735596,
"learning_rate": 6.920737683136614e-08,
"loss": 0.6808,
"step": 425
},
{
"epoch": 2.8832487309644668,
"grad_norm": 6.36986780166626,
"learning_rate": 6.083530436693408e-08,
"loss": 0.6489,
"step": 426
},
{
"epoch": 2.8900169204737733,
"grad_norm": 4.548642635345459,
"learning_rate": 5.300124385410943e-08,
"loss": 0.9101,
"step": 427
},
{
"epoch": 2.8967851099830795,
"grad_norm": 3.0376877784729004,
"learning_rate": 4.570561935450468e-08,
"loss": 1.8025,
"step": 428
},
{
"epoch": 2.903553299492386,
"grad_norm": 3.8811256885528564,
"learning_rate": 3.894882578391879e-08,
"loss": 1.2524,
"step": 429
},
{
"epoch": 2.910321489001692,
"grad_norm": 4.729759693145752,
"learning_rate": 3.273122889096536e-08,
"loss": 0.8427,
"step": 430
},
{
"epoch": 2.9170896785109983,
"grad_norm": 5.370926856994629,
"learning_rate": 2.705316523726853e-08,
"loss": 0.6915,
"step": 431
},
{
"epoch": 2.9238578680203045,
"grad_norm": 5.328631401062012,
"learning_rate": 2.1914942179253052e-08,
"loss": 0.5848,
"step": 432
},
{
"epoch": 2.9306260575296106,
"grad_norm": 5.493933200836182,
"learning_rate": 1.7316837851499845e-08,
"loss": 0.5372,
"step": 433
},
{
"epoch": 2.937394247038917,
"grad_norm": 3.3873283863067627,
"learning_rate": 1.325910115169471e-08,
"loss": 1.9925,
"step": 434
},
{
"epoch": 2.9441624365482233,
"grad_norm": 3.6963796615600586,
"learning_rate": 9.74195172715242e-09,
"loss": 1.3562,
"step": 435
},
{
"epoch": 2.95093062605753,
"grad_norm": 4.348772048950195,
"learning_rate": 6.7655799629284815e-09,
"loss": 0.8997,
"step": 436
},
{
"epoch": 2.957698815566836,
"grad_norm": 4.724446773529053,
"learning_rate": 4.330146971515126e-09,
"loss": 0.7332,
"step": 437
},
{
"epoch": 2.964467005076142,
"grad_norm": 6.034689903259277,
"learning_rate": 2.435784584114975e-09,
"loss": 0.663,
"step": 438
},
{
"epoch": 2.9712351945854483,
"grad_norm": 5.839619159698486,
"learning_rate": 1.0825953435122938e-09,
"loss": 0.5757,
"step": 439
},
{
"epoch": 2.9780033840947544,
"grad_norm": 3.8592047691345215,
"learning_rate": 2.706524985174319e-10,
"loss": 1.5785,
"step": 440
},
{
"epoch": 2.984771573604061,
"grad_norm": 3.915963888168335,
"learning_rate": 0.0,
"loss": 1.1236,
"step": 441
},
{
"epoch": 2.984771573604061,
"step": 441,
"total_flos": 4.015937399291904e+16,
"train_loss": 1.9800935814710432,
"train_runtime": 828.1657,
"train_samples_per_second": 34.203,
"train_steps_per_second": 0.533
}
],
"logging_steps": 1,
"max_steps": 441,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.015937399291904e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}