{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.99574241617882, "eval_steps": 500, "global_step": 9390, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010643959552953698, "grad_norm": 464.0, "learning_rate": 2.129925452609159e-07, "loss": 40.7792, "step": 1 }, { "epoch": 0.005321979776476849, "grad_norm": 496.0, "learning_rate": 1.0649627263045794e-06, "loss": 41.7441, "step": 5 }, { "epoch": 0.010643959552953698, "grad_norm": 498.0, "learning_rate": 2.1299254526091587e-06, "loss": 43.442, "step": 10 }, { "epoch": 0.015965939329430547, "grad_norm": 422.0, "learning_rate": 3.194888178913738e-06, "loss": 41.5518, "step": 15 }, { "epoch": 0.021287919105907396, "grad_norm": 310.0, "learning_rate": 4.259850905218317e-06, "loss": 33.6959, "step": 20 }, { "epoch": 0.026609898882384245, "grad_norm": 253.0, "learning_rate": 5.324813631522897e-06, "loss": 29.756, "step": 25 }, { "epoch": 0.031931878658861094, "grad_norm": 99.0, "learning_rate": 6.389776357827476e-06, "loss": 23.7329, "step": 30 }, { "epoch": 0.03725385843533795, "grad_norm": 45.0, "learning_rate": 7.4547390841320565e-06, "loss": 19.2458, "step": 35 }, { "epoch": 0.04257583821181479, "grad_norm": 35.75, "learning_rate": 8.519701810436635e-06, "loss": 18.6755, "step": 40 }, { "epoch": 0.047897817988291645, "grad_norm": 29.75, "learning_rate": 9.584664536741214e-06, "loss": 18.2356, "step": 45 }, { "epoch": 0.05321979776476849, "grad_norm": 17.375, "learning_rate": 1.0649627263045795e-05, "loss": 16.7464, "step": 50 }, { "epoch": 0.05854177754124534, "grad_norm": 12.25, "learning_rate": 1.1714589989350374e-05, "loss": 15.8097, "step": 55 }, { "epoch": 0.06386375731772219, "grad_norm": 6.15625, "learning_rate": 1.2779552715654951e-05, "loss": 15.0801, "step": 60 }, { "epoch": 0.06918573709419905, "grad_norm": 5.125, "learning_rate": 1.3844515441959532e-05, "loss": 14.7544, "step": 65 }, { "epoch": 0.0745077168706759, "grad_norm": 4.21875, "learning_rate": 1.4909478168264113e-05, "loss": 13.8828, "step": 70 }, { "epoch": 0.07982969664715274, "grad_norm": 4.75, "learning_rate": 1.597444089456869e-05, "loss": 13.7324, "step": 75 }, { "epoch": 0.08515167642362959, "grad_norm": 5.34375, "learning_rate": 1.703940362087327e-05, "loss": 13.6977, "step": 80 }, { "epoch": 0.09047365620010644, "grad_norm": 7.34375, "learning_rate": 1.8104366347177852e-05, "loss": 13.4468, "step": 85 }, { "epoch": 0.09579563597658329, "grad_norm": 10.375, "learning_rate": 1.9169329073482428e-05, "loss": 12.726, "step": 90 }, { "epoch": 0.10111761575306014, "grad_norm": 17.625, "learning_rate": 2.0234291799787007e-05, "loss": 11.4668, "step": 95 }, { "epoch": 0.10643959552953698, "grad_norm": 30.375, "learning_rate": 2.129925452609159e-05, "loss": 9.8374, "step": 100 }, { "epoch": 0.11176157530601384, "grad_norm": 34.75, "learning_rate": 2.2364217252396165e-05, "loss": 7.0846, "step": 105 }, { "epoch": 0.11708355508249069, "grad_norm": 9.625, "learning_rate": 2.3429179978700748e-05, "loss": 3.6089, "step": 110 }, { "epoch": 0.12240553485896753, "grad_norm": 3.171875, "learning_rate": 2.4494142705005327e-05, "loss": 1.9872, "step": 115 }, { "epoch": 0.12772751463544438, "grad_norm": 2.875, "learning_rate": 2.5559105431309903e-05, "loss": 1.811, "step": 120 }, { "epoch": 0.13304949441192124, "grad_norm": 1.7265625, "learning_rate": 2.6624068157614485e-05, "loss": 1.5886, "step": 125 }, { "epoch": 0.1383714741883981, "grad_norm": 1.4375, "learning_rate": 2.7689030883919064e-05, "loss": 1.4966, "step": 130 }, { "epoch": 0.14369345396487493, "grad_norm": 1.359375, "learning_rate": 2.8753993610223644e-05, "loss": 1.3914, "step": 135 }, { "epoch": 0.1490154337413518, "grad_norm": 1.6171875, "learning_rate": 2.9818956336528226e-05, "loss": 1.3715, "step": 140 }, { "epoch": 0.15433741351782862, "grad_norm": 1.484375, "learning_rate": 3.08839190628328e-05, "loss": 1.3284, "step": 145 }, { "epoch": 0.15965939329430548, "grad_norm": 1.8203125, "learning_rate": 3.194888178913738e-05, "loss": 1.2243, "step": 150 }, { "epoch": 0.16498137307078234, "grad_norm": 1.453125, "learning_rate": 3.3013844515441964e-05, "loss": 1.203, "step": 155 }, { "epoch": 0.17030335284725917, "grad_norm": 2.640625, "learning_rate": 3.407880724174654e-05, "loss": 1.1315, "step": 160 }, { "epoch": 0.17562533262373603, "grad_norm": 2.1875, "learning_rate": 3.514376996805112e-05, "loss": 1.1368, "step": 165 }, { "epoch": 0.1809473124002129, "grad_norm": 1.609375, "learning_rate": 3.6208732694355704e-05, "loss": 1.1554, "step": 170 }, { "epoch": 0.18626929217668972, "grad_norm": 2.109375, "learning_rate": 3.727369542066027e-05, "loss": 1.0673, "step": 175 }, { "epoch": 0.19159127195316658, "grad_norm": 4.0625, "learning_rate": 3.8338658146964856e-05, "loss": 1.131, "step": 180 }, { "epoch": 0.19691325172964344, "grad_norm": 1.5625, "learning_rate": 3.940362087326944e-05, "loss": 1.0438, "step": 185 }, { "epoch": 0.20223523150612027, "grad_norm": 3.390625, "learning_rate": 4.0468583599574014e-05, "loss": 1.0149, "step": 190 }, { "epoch": 0.20755721128259713, "grad_norm": 4.4375, "learning_rate": 4.15335463258786e-05, "loss": 1.088, "step": 195 }, { "epoch": 0.21287919105907396, "grad_norm": 2.09375, "learning_rate": 4.259850905218318e-05, "loss": 1.0729, "step": 200 }, { "epoch": 0.21820117083555082, "grad_norm": 1.1796875, "learning_rate": 4.3663471778487755e-05, "loss": 1.0227, "step": 205 }, { "epoch": 0.22352315061202768, "grad_norm": 3.296875, "learning_rate": 4.472843450479233e-05, "loss": 1.0067, "step": 210 }, { "epoch": 0.22884513038850451, "grad_norm": 2.296875, "learning_rate": 4.579339723109691e-05, "loss": 1.0051, "step": 215 }, { "epoch": 0.23416711016498137, "grad_norm": 2.375, "learning_rate": 4.6858359957401496e-05, "loss": 0.9989, "step": 220 }, { "epoch": 0.23948908994145823, "grad_norm": 2.609375, "learning_rate": 4.792332268370607e-05, "loss": 0.9821, "step": 225 }, { "epoch": 0.24481106971793506, "grad_norm": 3.484375, "learning_rate": 4.8988285410010654e-05, "loss": 0.9889, "step": 230 }, { "epoch": 0.2501330494944119, "grad_norm": 3.046875, "learning_rate": 5.0053248136315237e-05, "loss": 0.9449, "step": 235 }, { "epoch": 0.25545502927088876, "grad_norm": 1.6328125, "learning_rate": 5.1118210862619806e-05, "loss": 0.9423, "step": 240 }, { "epoch": 0.2607770090473656, "grad_norm": 0.95703125, "learning_rate": 5.2183173588924395e-05, "loss": 0.9127, "step": 245 }, { "epoch": 0.2660989888238425, "grad_norm": 1.9921875, "learning_rate": 5.324813631522897e-05, "loss": 0.9345, "step": 250 }, { "epoch": 0.27142096860031933, "grad_norm": 2.484375, "learning_rate": 5.4313099041533546e-05, "loss": 0.9756, "step": 255 }, { "epoch": 0.2767429483767962, "grad_norm": 2.6875, "learning_rate": 5.537806176783813e-05, "loss": 0.9622, "step": 260 }, { "epoch": 0.282064928153273, "grad_norm": 0.9453125, "learning_rate": 5.6443024494142705e-05, "loss": 0.9019, "step": 265 }, { "epoch": 0.28738690792974986, "grad_norm": 1.8671875, "learning_rate": 5.750798722044729e-05, "loss": 0.9327, "step": 270 }, { "epoch": 0.2927088877062267, "grad_norm": 0.70703125, "learning_rate": 5.857294994675186e-05, "loss": 0.901, "step": 275 }, { "epoch": 0.2980308674827036, "grad_norm": 1.3046875, "learning_rate": 5.963791267305645e-05, "loss": 0.9153, "step": 280 }, { "epoch": 0.30335284725918044, "grad_norm": 0.76953125, "learning_rate": 6.070287539936103e-05, "loss": 0.886, "step": 285 }, { "epoch": 0.30867482703565724, "grad_norm": 20.125, "learning_rate": 6.17678381256656e-05, "loss": 0.9152, "step": 290 }, { "epoch": 0.3139968068121341, "grad_norm": 2.78125, "learning_rate": 6.283280085197019e-05, "loss": 0.8974, "step": 295 }, { "epoch": 0.31931878658861096, "grad_norm": 2.15625, "learning_rate": 6.389776357827476e-05, "loss": 0.9266, "step": 300 }, { "epoch": 0.3246407663650878, "grad_norm": 4.90625, "learning_rate": 6.496272630457935e-05, "loss": 0.9015, "step": 305 }, { "epoch": 0.3299627461415647, "grad_norm": 1.515625, "learning_rate": 6.602768903088393e-05, "loss": 0.9022, "step": 310 }, { "epoch": 0.33528472591804154, "grad_norm": 3.796875, "learning_rate": 6.70926517571885e-05, "loss": 0.9215, "step": 315 }, { "epoch": 0.34060670569451834, "grad_norm": 2.984375, "learning_rate": 6.815761448349308e-05, "loss": 0.9159, "step": 320 }, { "epoch": 0.3459286854709952, "grad_norm": 2.546875, "learning_rate": 6.922257720979765e-05, "loss": 0.8926, "step": 325 }, { "epoch": 0.35125066524747206, "grad_norm": 3.15625, "learning_rate": 7.028753993610224e-05, "loss": 0.9192, "step": 330 }, { "epoch": 0.3565726450239489, "grad_norm": 2.515625, "learning_rate": 7.135250266240682e-05, "loss": 0.88, "step": 335 }, { "epoch": 0.3618946248004258, "grad_norm": 3.03125, "learning_rate": 7.241746538871141e-05, "loss": 0.8753, "step": 340 }, { "epoch": 0.3672166045769026, "grad_norm": 2.28125, "learning_rate": 7.348242811501597e-05, "loss": 0.8975, "step": 345 }, { "epoch": 0.37253858435337944, "grad_norm": 0.8828125, "learning_rate": 7.454739084132055e-05, "loss": 0.9135, "step": 350 }, { "epoch": 0.3778605641298563, "grad_norm": 3.75, "learning_rate": 7.561235356762514e-05, "loss": 0.9116, "step": 355 }, { "epoch": 0.38318254390633316, "grad_norm": 1.828125, "learning_rate": 7.667731629392971e-05, "loss": 0.9198, "step": 360 }, { "epoch": 0.38850452368281, "grad_norm": 1.9921875, "learning_rate": 7.77422790202343e-05, "loss": 0.8648, "step": 365 }, { "epoch": 0.3938265034592869, "grad_norm": 1.125, "learning_rate": 7.880724174653888e-05, "loss": 0.9447, "step": 370 }, { "epoch": 0.3991484832357637, "grad_norm": 0.79296875, "learning_rate": 7.987220447284345e-05, "loss": 0.9195, "step": 375 }, { "epoch": 0.40447046301224054, "grad_norm": 13.6875, "learning_rate": 8.093716719914803e-05, "loss": 0.9055, "step": 380 }, { "epoch": 0.4097924427887174, "grad_norm": 2.296875, "learning_rate": 8.20021299254526e-05, "loss": 0.896, "step": 385 }, { "epoch": 0.41511442256519426, "grad_norm": 5.34375, "learning_rate": 8.30670926517572e-05, "loss": 0.8678, "step": 390 }, { "epoch": 0.4204364023416711, "grad_norm": 1.0234375, "learning_rate": 8.413205537806177e-05, "loss": 0.8409, "step": 395 }, { "epoch": 0.4257583821181479, "grad_norm": 1.265625, "learning_rate": 8.519701810436636e-05, "loss": 0.8628, "step": 400 }, { "epoch": 0.4310803618946248, "grad_norm": 0.578125, "learning_rate": 8.626198083067093e-05, "loss": 0.8319, "step": 405 }, { "epoch": 0.43640234167110165, "grad_norm": 1.296875, "learning_rate": 8.732694355697551e-05, "loss": 0.8847, "step": 410 }, { "epoch": 0.4417243214475785, "grad_norm": 1.3046875, "learning_rate": 8.839190628328009e-05, "loss": 0.8271, "step": 415 }, { "epoch": 0.44704630122405536, "grad_norm": 1.71875, "learning_rate": 8.945686900958466e-05, "loss": 0.8398, "step": 420 }, { "epoch": 0.4523682810005322, "grad_norm": 1.5, "learning_rate": 9.052183173588925e-05, "loss": 0.8594, "step": 425 }, { "epoch": 0.45769026077700903, "grad_norm": 2.0625, "learning_rate": 9.158679446219383e-05, "loss": 0.8827, "step": 430 }, { "epoch": 0.4630122405534859, "grad_norm": 3.1875, "learning_rate": 9.265175718849842e-05, "loss": 0.8546, "step": 435 }, { "epoch": 0.46833422032996275, "grad_norm": 0.58984375, "learning_rate": 9.371671991480299e-05, "loss": 0.8558, "step": 440 }, { "epoch": 0.4736562001064396, "grad_norm": 0.64453125, "learning_rate": 9.478168264110757e-05, "loss": 0.8066, "step": 445 }, { "epoch": 0.47897817988291647, "grad_norm": 0.57421875, "learning_rate": 9.584664536741214e-05, "loss": 0.8661, "step": 450 }, { "epoch": 0.48430015965939327, "grad_norm": 0.58203125, "learning_rate": 9.691160809371672e-05, "loss": 0.8376, "step": 455 }, { "epoch": 0.48962213943587013, "grad_norm": 1.3046875, "learning_rate": 9.797657082002131e-05, "loss": 0.8188, "step": 460 }, { "epoch": 0.494944119212347, "grad_norm": 0.53515625, "learning_rate": 9.904153354632588e-05, "loss": 0.8132, "step": 465 }, { "epoch": 0.5002660989888238, "grad_norm": 0.89453125, "learning_rate": 0.00010010649627263047, "loss": 0.864, "step": 470 }, { "epoch": 0.5055880787653007, "grad_norm": 0.8828125, "learning_rate": 0.00010117145899893504, "loss": 0.8482, "step": 475 }, { "epoch": 0.5109100585417775, "grad_norm": 1.1875, "learning_rate": 0.00010223642172523961, "loss": 0.8585, "step": 480 }, { "epoch": 0.5162320383182544, "grad_norm": 1.1171875, "learning_rate": 0.0001033013844515442, "loss": 0.894, "step": 485 }, { "epoch": 0.5215540180947312, "grad_norm": 1.5625, "learning_rate": 0.00010436634717784879, "loss": 0.8471, "step": 490 }, { "epoch": 0.5268759978712081, "grad_norm": 0.94140625, "learning_rate": 0.00010543130990415335, "loss": 0.8089, "step": 495 }, { "epoch": 0.532197977647685, "grad_norm": 0.75, "learning_rate": 0.00010649627263045794, "loss": 0.8199, "step": 500 }, { "epoch": 0.5375199574241618, "grad_norm": 1.5078125, "learning_rate": 0.00010756123535676253, "loss": 0.8541, "step": 505 }, { "epoch": 0.5428419372006387, "grad_norm": 1.3984375, "learning_rate": 0.00010862619808306709, "loss": 0.8126, "step": 510 }, { "epoch": 0.5481639169771155, "grad_norm": 0.63671875, "learning_rate": 0.00010969116080937167, "loss": 0.8341, "step": 515 }, { "epoch": 0.5534858967535924, "grad_norm": 0.859375, "learning_rate": 0.00011075612353567626, "loss": 0.8199, "step": 520 }, { "epoch": 0.5588078765300691, "grad_norm": 0.9296875, "learning_rate": 0.00011182108626198085, "loss": 0.8765, "step": 525 }, { "epoch": 0.564129856306546, "grad_norm": 0.9296875, "learning_rate": 0.00011288604898828541, "loss": 0.8084, "step": 530 }, { "epoch": 0.5694518360830229, "grad_norm": 2.09375, "learning_rate": 0.00011395101171459, "loss": 0.8739, "step": 535 }, { "epoch": 0.5747738158594997, "grad_norm": 0.88671875, "learning_rate": 0.00011501597444089457, "loss": 0.8715, "step": 540 }, { "epoch": 0.5800957956359766, "grad_norm": 2.546875, "learning_rate": 0.00011608093716719915, "loss": 0.8356, "step": 545 }, { "epoch": 0.5854177754124534, "grad_norm": 3.984375, "learning_rate": 0.00011714589989350373, "loss": 0.8146, "step": 550 }, { "epoch": 0.5907397551889303, "grad_norm": 1.78125, "learning_rate": 0.00011821086261980832, "loss": 0.8372, "step": 555 }, { "epoch": 0.5960617349654072, "grad_norm": 2.09375, "learning_rate": 0.0001192758253461129, "loss": 0.8279, "step": 560 }, { "epoch": 0.601383714741884, "grad_norm": 1.7265625, "learning_rate": 0.00012034078807241747, "loss": 0.7894, "step": 565 }, { "epoch": 0.6067056945183609, "grad_norm": 0.58203125, "learning_rate": 0.00012140575079872206, "loss": 0.8129, "step": 570 }, { "epoch": 0.6120276742948377, "grad_norm": 0.59375, "learning_rate": 0.00012247071352502665, "loss": 0.8015, "step": 575 }, { "epoch": 0.6173496540713145, "grad_norm": 0.62890625, "learning_rate": 0.0001235356762513312, "loss": 0.7973, "step": 580 }, { "epoch": 0.6226716338477913, "grad_norm": 0.68359375, "learning_rate": 0.0001246006389776358, "loss": 0.8268, "step": 585 }, { "epoch": 0.6279936136242682, "grad_norm": 1.8203125, "learning_rate": 0.00012566560170394037, "loss": 0.7672, "step": 590 }, { "epoch": 0.6333155934007451, "grad_norm": 1.6875, "learning_rate": 0.00012673056443024495, "loss": 0.8165, "step": 595 }, { "epoch": 0.6386375731772219, "grad_norm": 1.015625, "learning_rate": 0.00012779552715654952, "loss": 0.7955, "step": 600 }, { "epoch": 0.6439595529536988, "grad_norm": 0.63671875, "learning_rate": 0.0001288604898828541, "loss": 0.8404, "step": 605 }, { "epoch": 0.6492815327301756, "grad_norm": 3.53125, "learning_rate": 0.0001299254526091587, "loss": 0.8981, "step": 610 }, { "epoch": 0.6546035125066525, "grad_norm": 1.6484375, "learning_rate": 0.00013099041533546325, "loss": 0.903, "step": 615 }, { "epoch": 0.6599254922831294, "grad_norm": 2.328125, "learning_rate": 0.00013205537806176785, "loss": 0.8134, "step": 620 }, { "epoch": 0.6652474720596062, "grad_norm": 2.0625, "learning_rate": 0.00013312034078807243, "loss": 0.8003, "step": 625 }, { "epoch": 0.6705694518360831, "grad_norm": 0.81640625, "learning_rate": 0.000134185303514377, "loss": 0.8049, "step": 630 }, { "epoch": 0.6758914316125598, "grad_norm": 0.625, "learning_rate": 0.00013525026624068158, "loss": 0.8208, "step": 635 }, { "epoch": 0.6812134113890367, "grad_norm": 1.3359375, "learning_rate": 0.00013631522896698616, "loss": 0.8393, "step": 640 }, { "epoch": 0.6865353911655135, "grad_norm": 2.265625, "learning_rate": 0.00013738019169329076, "loss": 0.7827, "step": 645 }, { "epoch": 0.6918573709419904, "grad_norm": 1.4453125, "learning_rate": 0.0001384451544195953, "loss": 0.8265, "step": 650 }, { "epoch": 0.6971793507184673, "grad_norm": 0.453125, "learning_rate": 0.0001395101171458999, "loss": 0.8175, "step": 655 }, { "epoch": 0.7025013304949441, "grad_norm": 2.078125, "learning_rate": 0.0001405750798722045, "loss": 0.8762, "step": 660 }, { "epoch": 0.707823310271421, "grad_norm": 1.40625, "learning_rate": 0.00014164004259850904, "loss": 0.8505, "step": 665 }, { "epoch": 0.7131452900478978, "grad_norm": 1.6484375, "learning_rate": 0.00014270500532481364, "loss": 0.8183, "step": 670 }, { "epoch": 0.7184672698243747, "grad_norm": 1.8515625, "learning_rate": 0.00014376996805111821, "loss": 0.7557, "step": 675 }, { "epoch": 0.7237892496008516, "grad_norm": 1.046875, "learning_rate": 0.00014483493077742282, "loss": 0.8473, "step": 680 }, { "epoch": 0.7291112293773284, "grad_norm": 0.625, "learning_rate": 0.00014589989350372737, "loss": 0.7859, "step": 685 }, { "epoch": 0.7344332091538052, "grad_norm": 2.234375, "learning_rate": 0.00014696485623003194, "loss": 0.7998, "step": 690 }, { "epoch": 0.739755188930282, "grad_norm": 1.625, "learning_rate": 0.00014802981895633654, "loss": 0.8206, "step": 695 }, { "epoch": 0.7450771687067589, "grad_norm": 0.74609375, "learning_rate": 0.0001490947816826411, "loss": 0.764, "step": 700 }, { "epoch": 0.7503991484832357, "grad_norm": 1.5234375, "learning_rate": 0.0001501597444089457, "loss": 0.7756, "step": 705 }, { "epoch": 0.7557211282597126, "grad_norm": 0.7421875, "learning_rate": 0.00015122470713525027, "loss": 0.8288, "step": 710 }, { "epoch": 0.7610431080361895, "grad_norm": 1.7265625, "learning_rate": 0.00015228966986155487, "loss": 0.82, "step": 715 }, { "epoch": 0.7663650878126663, "grad_norm": 0.95703125, "learning_rate": 0.00015335463258785942, "loss": 0.7955, "step": 720 }, { "epoch": 0.7716870675891432, "grad_norm": 1.328125, "learning_rate": 0.000154419595314164, "loss": 0.8, "step": 725 }, { "epoch": 0.77700904736562, "grad_norm": 0.47265625, "learning_rate": 0.0001554845580404686, "loss": 0.8194, "step": 730 }, { "epoch": 0.7823310271420969, "grad_norm": 1.015625, "learning_rate": 0.00015654952076677315, "loss": 0.8267, "step": 735 }, { "epoch": 0.7876530069185738, "grad_norm": 1.5859375, "learning_rate": 0.00015761448349307775, "loss": 0.8489, "step": 740 }, { "epoch": 0.7929749866950505, "grad_norm": 0.5546875, "learning_rate": 0.00015867944621938233, "loss": 0.8024, "step": 745 }, { "epoch": 0.7982969664715274, "grad_norm": 0.9765625, "learning_rate": 0.0001597444089456869, "loss": 0.8234, "step": 750 }, { "epoch": 0.8036189462480042, "grad_norm": 1.3828125, "learning_rate": 0.00016080937167199148, "loss": 0.8292, "step": 755 }, { "epoch": 0.8089409260244811, "grad_norm": 0.73828125, "learning_rate": 0.00016187433439829606, "loss": 0.7655, "step": 760 }, { "epoch": 0.814262905800958, "grad_norm": 0.59765625, "learning_rate": 0.00016293929712460066, "loss": 0.8551, "step": 765 }, { "epoch": 0.8195848855774348, "grad_norm": 6.40625, "learning_rate": 0.0001640042598509052, "loss": 0.7744, "step": 770 }, { "epoch": 0.8249068653539117, "grad_norm": 0.77734375, "learning_rate": 0.0001650692225772098, "loss": 0.8563, "step": 775 }, { "epoch": 0.8302288451303885, "grad_norm": 1.046875, "learning_rate": 0.0001661341853035144, "loss": 0.7973, "step": 780 }, { "epoch": 0.8355508249068654, "grad_norm": 1.359375, "learning_rate": 0.00016719914802981896, "loss": 0.8302, "step": 785 }, { "epoch": 0.8408728046833422, "grad_norm": 1.9140625, "learning_rate": 0.00016826411075612354, "loss": 0.827, "step": 790 }, { "epoch": 0.8461947844598191, "grad_norm": 0.578125, "learning_rate": 0.00016932907348242811, "loss": 0.803, "step": 795 }, { "epoch": 0.8515167642362959, "grad_norm": 0.69140625, "learning_rate": 0.00017039403620873272, "loss": 0.7752, "step": 800 }, { "epoch": 0.8568387440127727, "grad_norm": 0.96484375, "learning_rate": 0.00017145899893503727, "loss": 0.7933, "step": 805 }, { "epoch": 0.8621607237892496, "grad_norm": 1.1484375, "learning_rate": 0.00017252396166134187, "loss": 0.8046, "step": 810 }, { "epoch": 0.8674827035657264, "grad_norm": 0.95703125, "learning_rate": 0.00017358892438764644, "loss": 0.8307, "step": 815 }, { "epoch": 0.8728046833422033, "grad_norm": 1.53125, "learning_rate": 0.00017465388711395102, "loss": 0.7823, "step": 820 }, { "epoch": 0.8781266631186802, "grad_norm": 2.296875, "learning_rate": 0.0001757188498402556, "loss": 0.7936, "step": 825 }, { "epoch": 0.883448642895157, "grad_norm": 1.4375, "learning_rate": 0.00017678381256656017, "loss": 0.8117, "step": 830 }, { "epoch": 0.8887706226716339, "grad_norm": 1.0546875, "learning_rate": 0.00017784877529286477, "loss": 0.7771, "step": 835 }, { "epoch": 0.8940926024481107, "grad_norm": 1.125, "learning_rate": 0.00017891373801916932, "loss": 0.7584, "step": 840 }, { "epoch": 0.8994145822245876, "grad_norm": 2.375, "learning_rate": 0.00017997870074547393, "loss": 0.7808, "step": 845 }, { "epoch": 0.9047365620010644, "grad_norm": 2.71875, "learning_rate": 0.0001810436634717785, "loss": 0.7821, "step": 850 }, { "epoch": 0.9100585417775412, "grad_norm": 2.1875, "learning_rate": 0.00018210862619808308, "loss": 0.813, "step": 855 }, { "epoch": 0.9153805215540181, "grad_norm": 2.6875, "learning_rate": 0.00018317358892438765, "loss": 0.7485, "step": 860 }, { "epoch": 0.9207025013304949, "grad_norm": 0.8203125, "learning_rate": 0.00018423855165069223, "loss": 0.7887, "step": 865 }, { "epoch": 0.9260244811069718, "grad_norm": 1.8125, "learning_rate": 0.00018530351437699683, "loss": 0.7465, "step": 870 }, { "epoch": 0.9313464608834486, "grad_norm": 1.1953125, "learning_rate": 0.00018636847710330138, "loss": 0.8011, "step": 875 }, { "epoch": 0.9366684406599255, "grad_norm": 1.234375, "learning_rate": 0.00018743343982960598, "loss": 0.7856, "step": 880 }, { "epoch": 0.9419904204364024, "grad_norm": 0.49609375, "learning_rate": 0.00018849840255591056, "loss": 0.8367, "step": 885 }, { "epoch": 0.9473124002128792, "grad_norm": 0.95703125, "learning_rate": 0.00018956336528221513, "loss": 0.831, "step": 890 }, { "epoch": 0.9526343799893561, "grad_norm": 1.875, "learning_rate": 0.0001906283280085197, "loss": 0.7564, "step": 895 }, { "epoch": 0.9579563597658329, "grad_norm": 0.498046875, "learning_rate": 0.00019169329073482429, "loss": 0.839, "step": 900 }, { "epoch": 0.9632783395423098, "grad_norm": 0.54296875, "learning_rate": 0.0001927582534611289, "loss": 0.8225, "step": 905 }, { "epoch": 0.9686003193187865, "grad_norm": 1.1875, "learning_rate": 0.00019382321618743344, "loss": 0.7952, "step": 910 }, { "epoch": 0.9739222990952634, "grad_norm": 0.81640625, "learning_rate": 0.00019488817891373804, "loss": 0.8452, "step": 915 }, { "epoch": 0.9792442788717403, "grad_norm": 0.65625, "learning_rate": 0.00019595314164004262, "loss": 0.7544, "step": 920 }, { "epoch": 0.9845662586482171, "grad_norm": 10.5625, "learning_rate": 0.00019701810436634716, "loss": 0.8087, "step": 925 }, { "epoch": 0.989888238424694, "grad_norm": 1.265625, "learning_rate": 0.00019808306709265177, "loss": 0.8145, "step": 930 }, { "epoch": 0.9952102182011708, "grad_norm": 1.6484375, "learning_rate": 0.00019914802981895634, "loss": 0.7837, "step": 935 }, { "epoch": 0.9994678020223523, "eval_loss": 1.973112940788269, "eval_runtime": 12.4076, "eval_samples_per_second": 5.964, "eval_steps_per_second": 0.564, "step": 939 }, { "epoch": 1.0005321979776476, "grad_norm": 1.1328125, "learning_rate": 0.00019999999309038992, "loss": 0.7933, "step": 940 }, { "epoch": 1.0058541777541246, "grad_norm": 0.8984375, "learning_rate": 0.00019999975125413672, "loss": 0.7369, "step": 945 }, { "epoch": 1.0111761575306013, "grad_norm": 1.0234375, "learning_rate": 0.0001999991639383335, "loss": 0.7584, "step": 950 }, { "epoch": 1.0164981373070783, "grad_norm": 0.57421875, "learning_rate": 0.00019999823114500923, "loss": 0.7295, "step": 955 }, { "epoch": 1.021820117083555, "grad_norm": 0.97265625, "learning_rate": 0.0001999969528773866, "loss": 0.7908, "step": 960 }, { "epoch": 1.027142096860032, "grad_norm": 1.359375, "learning_rate": 0.0001999953291398817, "loss": 0.7458, "step": 965 }, { "epoch": 1.0324640766365087, "grad_norm": 0.86328125, "learning_rate": 0.0001999933599381043, "loss": 0.775, "step": 970 }, { "epoch": 1.0377860564129857, "grad_norm": 0.765625, "learning_rate": 0.0001999910452788576, "loss": 0.7861, "step": 975 }, { "epoch": 1.0431080361894625, "grad_norm": 0.5390625, "learning_rate": 0.00019998838517013825, "loss": 0.7601, "step": 980 }, { "epoch": 1.0484300159659394, "grad_norm": 0.390625, "learning_rate": 0.00019998537962113644, "loss": 0.7846, "step": 985 }, { "epoch": 1.0537519957424162, "grad_norm": 0.66796875, "learning_rate": 0.00019998202864223572, "loss": 0.7524, "step": 990 }, { "epoch": 1.059073975518893, "grad_norm": 0.67578125, "learning_rate": 0.0001999783322450131, "loss": 0.7678, "step": 995 }, { "epoch": 1.06439595529537, "grad_norm": 0.94140625, "learning_rate": 0.0001999742904422389, "loss": 0.7478, "step": 1000 }, { "epoch": 1.0697179350718466, "grad_norm": 0.546875, "learning_rate": 0.00019996990324787673, "loss": 0.7714, "step": 1005 }, { "epoch": 1.0750399148483236, "grad_norm": 0.74609375, "learning_rate": 0.00019996517067708352, "loss": 0.7722, "step": 1010 }, { "epoch": 1.0803618946248004, "grad_norm": 0.83203125, "learning_rate": 0.00019996009274620937, "loss": 0.7474, "step": 1015 }, { "epoch": 1.0856838744012773, "grad_norm": 0.423828125, "learning_rate": 0.00019995466947279753, "loss": 0.7402, "step": 1020 }, { "epoch": 1.091005854177754, "grad_norm": 0.75390625, "learning_rate": 0.00019994890087558434, "loss": 0.6987, "step": 1025 }, { "epoch": 1.096327833954231, "grad_norm": 1.4921875, "learning_rate": 0.00019994278697449918, "loss": 0.723, "step": 1030 }, { "epoch": 1.1016498137307078, "grad_norm": 0.8515625, "learning_rate": 0.00019993632779066438, "loss": 0.727, "step": 1035 }, { "epoch": 1.1069717935071848, "grad_norm": 1.0390625, "learning_rate": 0.00019992952334639514, "loss": 0.7163, "step": 1040 }, { "epoch": 1.1122937732836615, "grad_norm": 1.265625, "learning_rate": 0.00019992237366519953, "loss": 0.7932, "step": 1045 }, { "epoch": 1.1176157530601383, "grad_norm": 1.4609375, "learning_rate": 0.0001999148787717782, "loss": 0.707, "step": 1050 }, { "epoch": 1.1229377328366152, "grad_norm": 0.515625, "learning_rate": 0.00019990703869202466, "loss": 0.747, "step": 1055 }, { "epoch": 1.128259712613092, "grad_norm": 0.9140625, "learning_rate": 0.00019989885345302478, "loss": 0.7729, "step": 1060 }, { "epoch": 1.133581692389569, "grad_norm": 1.0546875, "learning_rate": 0.00019989032308305698, "loss": 0.6929, "step": 1065 }, { "epoch": 1.1389036721660457, "grad_norm": 0.94921875, "learning_rate": 0.000199881447611592, "loss": 0.7327, "step": 1070 }, { "epoch": 1.1442256519425227, "grad_norm": 0.6171875, "learning_rate": 0.00019987222706929284, "loss": 0.7864, "step": 1075 }, { "epoch": 1.1495476317189994, "grad_norm": 0.8125, "learning_rate": 0.0001998626614880147, "loss": 0.7699, "step": 1080 }, { "epoch": 1.1548696114954764, "grad_norm": 1.1328125, "learning_rate": 0.0001998527509008048, "loss": 0.7721, "step": 1085 }, { "epoch": 1.1601915912719531, "grad_norm": 0.44921875, "learning_rate": 0.00019984249534190225, "loss": 0.7639, "step": 1090 }, { "epoch": 1.1655135710484301, "grad_norm": 1.0234375, "learning_rate": 0.000199831894846738, "loss": 0.7786, "step": 1095 }, { "epoch": 1.1708355508249069, "grad_norm": 0.431640625, "learning_rate": 0.00019982094945193468, "loss": 0.7042, "step": 1100 }, { "epoch": 1.1761575306013836, "grad_norm": 0.478515625, "learning_rate": 0.0001998096591953065, "loss": 0.7774, "step": 1105 }, { "epoch": 1.1814795103778606, "grad_norm": 0.59765625, "learning_rate": 0.00019979802411585905, "loss": 0.7261, "step": 1110 }, { "epoch": 1.1868014901543373, "grad_norm": 0.73046875, "learning_rate": 0.00019978604425378933, "loss": 0.7516, "step": 1115 }, { "epoch": 1.1921234699308143, "grad_norm": 1.4140625, "learning_rate": 0.00019977371965048532, "loss": 0.7676, "step": 1120 }, { "epoch": 1.197445449707291, "grad_norm": 0.875, "learning_rate": 0.00019976105034852614, "loss": 0.7588, "step": 1125 }, { "epoch": 1.202767429483768, "grad_norm": 0.671875, "learning_rate": 0.0001997480363916818, "loss": 0.7698, "step": 1130 }, { "epoch": 1.2080894092602448, "grad_norm": 0.953125, "learning_rate": 0.00019973467782491292, "loss": 0.7945, "step": 1135 }, { "epoch": 1.2134113890367217, "grad_norm": 0.50390625, "learning_rate": 0.00019972097469437073, "loss": 0.7654, "step": 1140 }, { "epoch": 1.2187333688131985, "grad_norm": 1.3046875, "learning_rate": 0.0001997069270473969, "loss": 0.7187, "step": 1145 }, { "epoch": 1.2240553485896752, "grad_norm": 0.44140625, "learning_rate": 0.00019969253493252326, "loss": 0.7237, "step": 1150 }, { "epoch": 1.2293773283661522, "grad_norm": 1.03125, "learning_rate": 0.00019967779839947172, "loss": 0.731, "step": 1155 }, { "epoch": 1.2346993081426292, "grad_norm": 3.28125, "learning_rate": 0.0001996627174991542, "loss": 0.7115, "step": 1160 }, { "epoch": 1.240021287919106, "grad_norm": 1.40625, "learning_rate": 0.00019964729228367218, "loss": 0.7252, "step": 1165 }, { "epoch": 1.2453432676955827, "grad_norm": 1.625, "learning_rate": 0.0001996315228063168, "loss": 0.7356, "step": 1170 }, { "epoch": 1.2506652474720596, "grad_norm": 1.078125, "learning_rate": 0.0001996154091215685, "loss": 0.7148, "step": 1175 }, { "epoch": 1.2559872272485364, "grad_norm": 0.5703125, "learning_rate": 0.00019959895128509691, "loss": 0.77, "step": 1180 }, { "epoch": 1.2613092070250134, "grad_norm": 1.1953125, "learning_rate": 0.00019958214935376066, "loss": 0.7815, "step": 1185 }, { "epoch": 1.2666311868014901, "grad_norm": 0.65234375, "learning_rate": 0.00019956500338560705, "loss": 0.7653, "step": 1190 }, { "epoch": 1.2719531665779669, "grad_norm": 0.41796875, "learning_rate": 0.0001995475134398721, "loss": 0.7318, "step": 1195 }, { "epoch": 1.2772751463544438, "grad_norm": 0.66015625, "learning_rate": 0.00019952967957698013, "loss": 0.7453, "step": 1200 }, { "epoch": 1.2825971261309208, "grad_norm": 0.8828125, "learning_rate": 0.00019951150185854372, "loss": 0.73, "step": 1205 }, { "epoch": 1.2879191059073976, "grad_norm": 1.1484375, "learning_rate": 0.00019949298034736316, "loss": 0.7422, "step": 1210 }, { "epoch": 1.2932410856838743, "grad_norm": 1.0703125, "learning_rate": 0.00019947411510742679, "loss": 0.7106, "step": 1215 }, { "epoch": 1.2985630654603513, "grad_norm": 0.412109375, "learning_rate": 0.00019945490620391026, "loss": 0.7347, "step": 1220 }, { "epoch": 1.303885045236828, "grad_norm": 0.5078125, "learning_rate": 0.00019943535370317653, "loss": 0.7204, "step": 1225 }, { "epoch": 1.309207025013305, "grad_norm": 0.40625, "learning_rate": 0.00019941545767277575, "loss": 0.796, "step": 1230 }, { "epoch": 1.3145290047897817, "grad_norm": 0.42578125, "learning_rate": 0.00019939521818144472, "loss": 0.7383, "step": 1235 }, { "epoch": 1.3198509845662587, "grad_norm": 0.96875, "learning_rate": 0.00019937463529910701, "loss": 0.7188, "step": 1240 }, { "epoch": 1.3251729643427355, "grad_norm": 0.4921875, "learning_rate": 0.00019935370909687235, "loss": 0.7501, "step": 1245 }, { "epoch": 1.3304949441192124, "grad_norm": 0.40234375, "learning_rate": 0.00019933243964703675, "loss": 0.7951, "step": 1250 }, { "epoch": 1.3358169238956892, "grad_norm": 0.53515625, "learning_rate": 0.00019931082702308195, "loss": 0.7835, "step": 1255 }, { "epoch": 1.341138903672166, "grad_norm": 0.44140625, "learning_rate": 0.0001992888712996753, "loss": 0.7177, "step": 1260 }, { "epoch": 1.346460883448643, "grad_norm": 0.6953125, "learning_rate": 0.00019926657255266958, "loss": 0.7606, "step": 1265 }, { "epoch": 1.3517828632251199, "grad_norm": 0.470703125, "learning_rate": 0.0001992439308591026, "loss": 0.7388, "step": 1270 }, { "epoch": 1.3571048430015966, "grad_norm": 0.7265625, "learning_rate": 0.0001992209462971969, "loss": 0.7789, "step": 1275 }, { "epoch": 1.3624268227780734, "grad_norm": 0.765625, "learning_rate": 0.00019919761894635977, "loss": 0.8055, "step": 1280 }, { "epoch": 1.3677488025545503, "grad_norm": 0.40234375, "learning_rate": 0.00019917394888718248, "loss": 0.7358, "step": 1285 }, { "epoch": 1.373070782331027, "grad_norm": 0.51171875, "learning_rate": 0.00019914993620144054, "loss": 0.773, "step": 1290 }, { "epoch": 1.378392762107504, "grad_norm": 0.3828125, "learning_rate": 0.00019912558097209305, "loss": 0.753, "step": 1295 }, { "epoch": 1.3837147418839808, "grad_norm": 0.439453125, "learning_rate": 0.00019910088328328258, "loss": 0.7989, "step": 1300 }, { "epoch": 1.3890367216604576, "grad_norm": 0.40234375, "learning_rate": 0.00019907584322033478, "loss": 0.7139, "step": 1305 }, { "epoch": 1.3943587014369345, "grad_norm": 0.69921875, "learning_rate": 0.00019905046086975817, "loss": 0.7423, "step": 1310 }, { "epoch": 1.3996806812134115, "grad_norm": 1.2890625, "learning_rate": 0.00019902473631924381, "loss": 0.7438, "step": 1315 }, { "epoch": 1.4050026609898882, "grad_norm": 1.3671875, "learning_rate": 0.00019899866965766497, "loss": 0.77, "step": 1320 }, { "epoch": 1.410324640766365, "grad_norm": 0.9609375, "learning_rate": 0.00019897226097507687, "loss": 0.7244, "step": 1325 }, { "epoch": 1.415646620542842, "grad_norm": 0.66015625, "learning_rate": 0.0001989455103627163, "loss": 0.7432, "step": 1330 }, { "epoch": 1.4209686003193187, "grad_norm": 0.41796875, "learning_rate": 0.00019891841791300143, "loss": 0.6941, "step": 1335 }, { "epoch": 1.4262905800957957, "grad_norm": 0.5078125, "learning_rate": 0.00019889098371953133, "loss": 0.7395, "step": 1340 }, { "epoch": 1.4316125598722724, "grad_norm": 0.76953125, "learning_rate": 0.0001988632078770858, "loss": 0.7506, "step": 1345 }, { "epoch": 1.4369345396487494, "grad_norm": 2.40625, "learning_rate": 0.00019883509048162492, "loss": 0.7703, "step": 1350 }, { "epoch": 1.4422565194252261, "grad_norm": 0.6015625, "learning_rate": 0.00019880663163028876, "loss": 0.7577, "step": 1355 }, { "epoch": 1.4475784992017031, "grad_norm": 1.0546875, "learning_rate": 0.00019877783142139709, "loss": 0.7409, "step": 1360 }, { "epoch": 1.4529004789781799, "grad_norm": 1.0390625, "learning_rate": 0.00019874868995444902, "loss": 0.7757, "step": 1365 }, { "epoch": 1.4582224587546566, "grad_norm": 0.67578125, "learning_rate": 0.00019871920733012259, "loss": 0.7562, "step": 1370 }, { "epoch": 1.4635444385311336, "grad_norm": 1.2265625, "learning_rate": 0.00019868938365027447, "loss": 0.7368, "step": 1375 }, { "epoch": 1.4688664183076106, "grad_norm": 1.1171875, "learning_rate": 0.00019865921901793962, "loss": 0.7536, "step": 1380 }, { "epoch": 1.4741883980840873, "grad_norm": 1.2109375, "learning_rate": 0.00019862871353733098, "loss": 0.7386, "step": 1385 }, { "epoch": 1.479510377860564, "grad_norm": 1.3046875, "learning_rate": 0.00019859786731383903, "loss": 0.7331, "step": 1390 }, { "epoch": 1.484832357637041, "grad_norm": 1.0859375, "learning_rate": 0.00019856668045403135, "loss": 0.7832, "step": 1395 }, { "epoch": 1.4901543374135178, "grad_norm": 1.90625, "learning_rate": 0.00019853515306565247, "loss": 0.6964, "step": 1400 }, { "epoch": 1.4954763171899947, "grad_norm": 1.453125, "learning_rate": 0.00019850328525762337, "loss": 0.7604, "step": 1405 }, { "epoch": 1.5007982969664715, "grad_norm": 0.7109375, "learning_rate": 0.00019847107714004102, "loss": 0.7593, "step": 1410 }, { "epoch": 1.5061202767429482, "grad_norm": 1.0625, "learning_rate": 0.00019843852882417821, "loss": 0.7519, "step": 1415 }, { "epoch": 1.5114422565194252, "grad_norm": 0.419921875, "learning_rate": 0.00019840564042248297, "loss": 0.7583, "step": 1420 }, { "epoch": 1.5167642362959022, "grad_norm": 0.56640625, "learning_rate": 0.0001983724120485783, "loss": 0.8278, "step": 1425 }, { "epoch": 1.522086216072379, "grad_norm": 0.47265625, "learning_rate": 0.00019833884381726172, "loss": 0.7561, "step": 1430 }, { "epoch": 1.5274081958488557, "grad_norm": 0.46484375, "learning_rate": 0.00019830493584450487, "loss": 0.717, "step": 1435 }, { "epoch": 1.5327301756253326, "grad_norm": 0.37890625, "learning_rate": 0.00019827068824745315, "loss": 0.7199, "step": 1440 }, { "epoch": 1.5380521554018096, "grad_norm": 0.51171875, "learning_rate": 0.00019823610114442537, "loss": 0.7577, "step": 1445 }, { "epoch": 1.5433741351782864, "grad_norm": 0.41015625, "learning_rate": 0.0001982011746549131, "loss": 0.7857, "step": 1450 }, { "epoch": 1.5486961149547631, "grad_norm": 0.59765625, "learning_rate": 0.0001981659088995806, "loss": 0.7424, "step": 1455 }, { "epoch": 1.5540180947312399, "grad_norm": 0.400390625, "learning_rate": 0.0001981303040002641, "loss": 0.7691, "step": 1460 }, { "epoch": 1.5593400745077168, "grad_norm": 0.423828125, "learning_rate": 0.00019809436007997154, "loss": 0.7371, "step": 1465 }, { "epoch": 1.5646620542841938, "grad_norm": 0.38671875, "learning_rate": 0.00019805807726288213, "loss": 0.7334, "step": 1470 }, { "epoch": 1.5699840340606706, "grad_norm": 0.4375, "learning_rate": 0.00019802145567434596, "loss": 0.7111, "step": 1475 }, { "epoch": 1.5753060138371473, "grad_norm": 0.396484375, "learning_rate": 0.00019798449544088337, "loss": 0.7515, "step": 1480 }, { "epoch": 1.5806279936136243, "grad_norm": 0.85546875, "learning_rate": 0.00019794719669018472, "loss": 0.7551, "step": 1485 }, { "epoch": 1.5859499733901012, "grad_norm": 1.1171875, "learning_rate": 0.00019790955955110992, "loss": 0.7628, "step": 1490 }, { "epoch": 1.591271953166578, "grad_norm": 0.67578125, "learning_rate": 0.00019787158415368795, "loss": 0.7375, "step": 1495 }, { "epoch": 1.5965939329430547, "grad_norm": 0.9453125, "learning_rate": 0.0001978332706291163, "loss": 0.7451, "step": 1500 }, { "epoch": 1.6019159127195317, "grad_norm": 0.52734375, "learning_rate": 0.0001977946191097607, "loss": 0.7147, "step": 1505 }, { "epoch": 1.6072378924960087, "grad_norm": 0.4140625, "learning_rate": 0.0001977556297291546, "loss": 0.7034, "step": 1510 }, { "epoch": 1.6125598722724854, "grad_norm": 0.4921875, "learning_rate": 0.00019771630262199868, "loss": 0.7225, "step": 1515 }, { "epoch": 1.6178818520489622, "grad_norm": 0.71875, "learning_rate": 0.0001976766379241604, "loss": 0.7171, "step": 1520 }, { "epoch": 1.623203831825439, "grad_norm": 0.5625, "learning_rate": 0.00019763663577267347, "loss": 0.7775, "step": 1525 }, { "epoch": 1.628525811601916, "grad_norm": 0.44140625, "learning_rate": 0.0001975962963057375, "loss": 0.8088, "step": 1530 }, { "epoch": 1.6338477913783929, "grad_norm": 0.42578125, "learning_rate": 0.00019755561966271746, "loss": 0.7424, "step": 1535 }, { "epoch": 1.6391697711548696, "grad_norm": 1.21875, "learning_rate": 0.00019751460598414322, "loss": 0.729, "step": 1540 }, { "epoch": 1.6444917509313464, "grad_norm": 0.671875, "learning_rate": 0.00019747325541170897, "loss": 0.7375, "step": 1545 }, { "epoch": 1.6498137307078233, "grad_norm": 0.50390625, "learning_rate": 0.0001974315680882728, "loss": 0.8017, "step": 1550 }, { "epoch": 1.6551357104843003, "grad_norm": 0.4453125, "learning_rate": 0.00019738954415785627, "loss": 0.7155, "step": 1555 }, { "epoch": 1.660457690260777, "grad_norm": 0.5, "learning_rate": 0.00019734718376564385, "loss": 0.7522, "step": 1560 }, { "epoch": 1.6657796700372538, "grad_norm": 0.439453125, "learning_rate": 0.00019730448705798239, "loss": 0.7624, "step": 1565 }, { "epoch": 1.6711016498137305, "grad_norm": 0.58984375, "learning_rate": 0.00019726145418238065, "loss": 0.7286, "step": 1570 }, { "epoch": 1.6764236295902075, "grad_norm": 0.625, "learning_rate": 0.00019721808528750878, "loss": 0.7936, "step": 1575 }, { "epoch": 1.6817456093666845, "grad_norm": 0.49609375, "learning_rate": 0.0001971743805231978, "loss": 0.7399, "step": 1580 }, { "epoch": 1.6870675891431612, "grad_norm": 0.59375, "learning_rate": 0.00019713034004043914, "loss": 0.7632, "step": 1585 }, { "epoch": 1.692389568919638, "grad_norm": 0.6953125, "learning_rate": 0.000197085963991384, "loss": 0.7721, "step": 1590 }, { "epoch": 1.697711548696115, "grad_norm": 1.3828125, "learning_rate": 0.00019704125252934296, "loss": 0.7678, "step": 1595 }, { "epoch": 1.703033528472592, "grad_norm": 0.7890625, "learning_rate": 0.00019699620580878536, "loss": 0.7745, "step": 1600 }, { "epoch": 1.7083555082490687, "grad_norm": 0.8046875, "learning_rate": 0.00019695082398533883, "loss": 0.7615, "step": 1605 }, { "epoch": 1.7136774880255454, "grad_norm": 0.859375, "learning_rate": 0.00019690510721578861, "loss": 0.7065, "step": 1610 }, { "epoch": 1.7189994678020224, "grad_norm": 1.0078125, "learning_rate": 0.00019685905565807723, "loss": 0.7443, "step": 1615 }, { "epoch": 1.7243214475784994, "grad_norm": 0.6796875, "learning_rate": 0.0001968126694713038, "loss": 0.6955, "step": 1620 }, { "epoch": 1.7296434273549761, "grad_norm": 0.5078125, "learning_rate": 0.00019676594881572345, "loss": 0.7418, "step": 1625 }, { "epoch": 1.7349654071314529, "grad_norm": 0.3984375, "learning_rate": 0.00019671889385274697, "loss": 0.7379, "step": 1630 }, { "epoch": 1.7402873869079296, "grad_norm": 0.416015625, "learning_rate": 0.00019667150474494003, "loss": 0.7381, "step": 1635 }, { "epoch": 1.7456093666844066, "grad_norm": 0.55078125, "learning_rate": 0.00019662378165602267, "loss": 0.7163, "step": 1640 }, { "epoch": 1.7509313464608836, "grad_norm": 0.515625, "learning_rate": 0.00019657572475086882, "loss": 0.7348, "step": 1645 }, { "epoch": 1.7562533262373603, "grad_norm": 0.412109375, "learning_rate": 0.0001965273341955057, "loss": 0.7521, "step": 1650 }, { "epoch": 1.761575306013837, "grad_norm": 0.640625, "learning_rate": 0.0001964786101571132, "loss": 0.7554, "step": 1655 }, { "epoch": 1.766897285790314, "grad_norm": 0.83203125, "learning_rate": 0.0001964295528040233, "loss": 0.703, "step": 1660 }, { "epoch": 1.772219265566791, "grad_norm": 0.55859375, "learning_rate": 0.00019638016230571955, "loss": 0.7592, "step": 1665 }, { "epoch": 1.7775412453432677, "grad_norm": 1.421875, "learning_rate": 0.00019633043883283643, "loss": 0.7379, "step": 1670 }, { "epoch": 1.7828632251197445, "grad_norm": 1.8125, "learning_rate": 0.00019628038255715887, "loss": 0.7531, "step": 1675 }, { "epoch": 1.7881852048962212, "grad_norm": 0.6796875, "learning_rate": 0.0001962299936516214, "loss": 0.7326, "step": 1680 }, { "epoch": 1.7935071846726982, "grad_norm": 2.375, "learning_rate": 0.0001961792722903079, "loss": 0.7935, "step": 1685 }, { "epoch": 1.7988291644491752, "grad_norm": 0.9296875, "learning_rate": 0.00019612821864845067, "loss": 0.7356, "step": 1690 }, { "epoch": 1.804151144225652, "grad_norm": 1.03125, "learning_rate": 0.00019607683290243007, "loss": 0.7077, "step": 1695 }, { "epoch": 1.8094731240021287, "grad_norm": 0.462890625, "learning_rate": 0.0001960251152297738, "loss": 0.733, "step": 1700 }, { "epoch": 1.8147951037786056, "grad_norm": 0.640625, "learning_rate": 0.00019597306580915628, "loss": 0.703, "step": 1705 }, { "epoch": 1.8201170835550826, "grad_norm": 0.75, "learning_rate": 0.00019592068482039804, "loss": 0.7143, "step": 1710 }, { "epoch": 1.8254390633315594, "grad_norm": 0.6796875, "learning_rate": 0.00019586797244446516, "loss": 0.7518, "step": 1715 }, { "epoch": 1.8307610431080361, "grad_norm": 0.80078125, "learning_rate": 0.00019581492886346858, "loss": 0.7418, "step": 1720 }, { "epoch": 1.836083022884513, "grad_norm": 0.9140625, "learning_rate": 0.00019576155426066346, "loss": 0.7585, "step": 1725 }, { "epoch": 1.84140500266099, "grad_norm": 0.65234375, "learning_rate": 0.00019570784882044857, "loss": 0.7638, "step": 1730 }, { "epoch": 1.8467269824374668, "grad_norm": 0.64453125, "learning_rate": 0.0001956538127283657, "loss": 0.6911, "step": 1735 }, { "epoch": 1.8520489622139436, "grad_norm": 0.412109375, "learning_rate": 0.00019559944617109903, "loss": 0.7524, "step": 1740 }, { "epoch": 1.8573709419904203, "grad_norm": 0.93359375, "learning_rate": 0.0001955447493364743, "loss": 0.7205, "step": 1745 }, { "epoch": 1.8626929217668973, "grad_norm": 1.1015625, "learning_rate": 0.00019548972241345834, "loss": 0.7125, "step": 1750 }, { "epoch": 1.8680149015433742, "grad_norm": 0.76171875, "learning_rate": 0.00019543436559215843, "loss": 0.7439, "step": 1755 }, { "epoch": 1.873336881319851, "grad_norm": 0.419921875, "learning_rate": 0.00019537867906382154, "loss": 0.7816, "step": 1760 }, { "epoch": 1.8786588610963277, "grad_norm": 0.76953125, "learning_rate": 0.0001953226630208337, "loss": 0.7388, "step": 1765 }, { "epoch": 1.8839808408728047, "grad_norm": 1.109375, "learning_rate": 0.00019526631765671938, "loss": 0.7467, "step": 1770 }, { "epoch": 1.8893028206492817, "grad_norm": 1.375, "learning_rate": 0.00019520964316614074, "loss": 0.7515, "step": 1775 }, { "epoch": 1.8946248004257584, "grad_norm": 0.474609375, "learning_rate": 0.0001951526397448971, "loss": 0.753, "step": 1780 }, { "epoch": 1.8999467802022352, "grad_norm": 0.6328125, "learning_rate": 0.00019509530758992405, "loss": 0.7295, "step": 1785 }, { "epoch": 1.905268759978712, "grad_norm": 0.92578125, "learning_rate": 0.000195037646899293, "loss": 0.7755, "step": 1790 }, { "epoch": 1.910590739755189, "grad_norm": 0.447265625, "learning_rate": 0.00019497965787221034, "loss": 0.7384, "step": 1795 }, { "epoch": 1.9159127195316659, "grad_norm": 0.625, "learning_rate": 0.00019492134070901674, "loss": 0.7392, "step": 1800 }, { "epoch": 1.9212346993081426, "grad_norm": 0.66796875, "learning_rate": 0.00019486269561118667, "loss": 0.7434, "step": 1805 }, { "epoch": 1.9265566790846194, "grad_norm": 0.470703125, "learning_rate": 0.00019480372278132737, "loss": 0.7346, "step": 1810 }, { "epoch": 1.9318786588610963, "grad_norm": 0.478515625, "learning_rate": 0.00019474442242317847, "loss": 0.7398, "step": 1815 }, { "epoch": 1.9372006386375733, "grad_norm": 0.4375, "learning_rate": 0.00019468479474161104, "loss": 0.7422, "step": 1820 }, { "epoch": 1.94252261841405, "grad_norm": 0.71484375, "learning_rate": 0.00019462483994262705, "loss": 0.8368, "step": 1825 }, { "epoch": 1.9478445981905268, "grad_norm": 0.6796875, "learning_rate": 0.00019456455823335867, "loss": 0.742, "step": 1830 }, { "epoch": 1.9531665779670038, "grad_norm": 1.2578125, "learning_rate": 0.00019450394982206725, "loss": 0.7315, "step": 1835 }, { "epoch": 1.9584885577434807, "grad_norm": 0.416015625, "learning_rate": 0.0001944430149181431, "loss": 0.7271, "step": 1840 }, { "epoch": 1.9638105375199575, "grad_norm": 0.4765625, "learning_rate": 0.0001943817537321043, "loss": 0.7597, "step": 1845 }, { "epoch": 1.9691325172964342, "grad_norm": 0.451171875, "learning_rate": 0.00019432016647559627, "loss": 0.7252, "step": 1850 }, { "epoch": 1.974454497072911, "grad_norm": 0.455078125, "learning_rate": 0.00019425825336139095, "loss": 0.7699, "step": 1855 }, { "epoch": 1.979776476849388, "grad_norm": 0.59765625, "learning_rate": 0.00019419601460338596, "loss": 0.7261, "step": 1860 }, { "epoch": 1.985098456625865, "grad_norm": 0.78515625, "learning_rate": 0.00019413345041660409, "loss": 0.7726, "step": 1865 }, { "epoch": 1.9904204364023417, "grad_norm": 0.396484375, "learning_rate": 0.00019407056101719227, "loss": 0.6764, "step": 1870 }, { "epoch": 1.9957424161788184, "grad_norm": 0.71484375, "learning_rate": 0.0001940073466224211, "loss": 0.7509, "step": 1875 }, { "epoch": 2.0, "eval_loss": 1.9718842506408691, "eval_runtime": 11.8004, "eval_samples_per_second": 6.271, "eval_steps_per_second": 0.593, "step": 1879 }, { "epoch": 2.001064395955295, "grad_norm": 0.6875, "learning_rate": 0.0001939438074506839, "loss": 0.6925, "step": 1880 }, { "epoch": 2.0063863757317724, "grad_norm": 0.80078125, "learning_rate": 0.00019387994372149612, "loss": 0.6744, "step": 1885 }, { "epoch": 2.011708355508249, "grad_norm": 0.6953125, "learning_rate": 0.00019381575565549438, "loss": 0.6553, "step": 1890 }, { "epoch": 2.017030335284726, "grad_norm": 1.1171875, "learning_rate": 0.00019375124347443588, "loss": 0.6365, "step": 1895 }, { "epoch": 2.0223523150612026, "grad_norm": 1.1953125, "learning_rate": 0.00019368640740119756, "loss": 0.6588, "step": 1900 }, { "epoch": 2.02767429483768, "grad_norm": 0.703125, "learning_rate": 0.0001936212476597754, "loss": 0.6305, "step": 1905 }, { "epoch": 2.0329962746141566, "grad_norm": 0.6953125, "learning_rate": 0.00019355576447528346, "loss": 0.6415, "step": 1910 }, { "epoch": 2.0383182543906333, "grad_norm": 0.546875, "learning_rate": 0.0001934899580739534, "loss": 0.6248, "step": 1915 }, { "epoch": 2.04364023416711, "grad_norm": 0.62109375, "learning_rate": 0.0001934238286831334, "loss": 0.6114, "step": 1920 }, { "epoch": 2.048962213943587, "grad_norm": 0.4765625, "learning_rate": 0.00019335737653128753, "loss": 0.6273, "step": 1925 }, { "epoch": 2.054284193720064, "grad_norm": 0.73046875, "learning_rate": 0.000193290601847995, "loss": 0.6697, "step": 1930 }, { "epoch": 2.0596061734965407, "grad_norm": 0.453125, "learning_rate": 0.00019322350486394927, "loss": 0.659, "step": 1935 }, { "epoch": 2.0649281532730175, "grad_norm": 0.71875, "learning_rate": 0.0001931560858109572, "loss": 0.6203, "step": 1940 }, { "epoch": 2.0702501330494942, "grad_norm": 0.62890625, "learning_rate": 0.0001930883449219385, "loss": 0.6613, "step": 1945 }, { "epoch": 2.0755721128259714, "grad_norm": 0.412109375, "learning_rate": 0.00019302028243092459, "loss": 0.6391, "step": 1950 }, { "epoch": 2.080894092602448, "grad_norm": 0.5, "learning_rate": 0.0001929518985730581, "loss": 0.6718, "step": 1955 }, { "epoch": 2.086216072378925, "grad_norm": 0.494140625, "learning_rate": 0.00019288319358459183, "loss": 0.6397, "step": 1960 }, { "epoch": 2.0915380521554017, "grad_norm": 0.5234375, "learning_rate": 0.00019281416770288806, "loss": 0.6484, "step": 1965 }, { "epoch": 2.096860031931879, "grad_norm": 0.451171875, "learning_rate": 0.00019274482116641775, "loss": 0.6803, "step": 1970 }, { "epoch": 2.1021820117083556, "grad_norm": 0.65234375, "learning_rate": 0.0001926751542147595, "loss": 0.6817, "step": 1975 }, { "epoch": 2.1075039914848324, "grad_norm": 0.4765625, "learning_rate": 0.00019260516708859902, "loss": 0.6462, "step": 1980 }, { "epoch": 2.112825971261309, "grad_norm": 0.42578125, "learning_rate": 0.00019253486002972814, "loss": 0.6466, "step": 1985 }, { "epoch": 2.118147951037786, "grad_norm": 0.44140625, "learning_rate": 0.00019246423328104397, "loss": 0.6931, "step": 1990 }, { "epoch": 2.123469930814263, "grad_norm": 0.453125, "learning_rate": 0.00019239328708654804, "loss": 0.646, "step": 1995 }, { "epoch": 2.12879191059074, "grad_norm": 0.75390625, "learning_rate": 0.00019232202169134564, "loss": 0.6191, "step": 2000 }, { "epoch": 2.1341138903672165, "grad_norm": 0.5625, "learning_rate": 0.00019225043734164468, "loss": 0.6753, "step": 2005 }, { "epoch": 2.1394358701436933, "grad_norm": 0.58984375, "learning_rate": 0.00019217853428475507, "loss": 0.6585, "step": 2010 }, { "epoch": 2.1447578499201705, "grad_norm": 0.53125, "learning_rate": 0.0001921063127690878, "loss": 0.6111, "step": 2015 }, { "epoch": 2.1500798296966472, "grad_norm": 0.75390625, "learning_rate": 0.00019203377304415412, "loss": 0.6258, "step": 2020 }, { "epoch": 2.155401809473124, "grad_norm": 0.64453125, "learning_rate": 0.00019196091536056447, "loss": 0.6153, "step": 2025 }, { "epoch": 2.1607237892496007, "grad_norm": 0.69140625, "learning_rate": 0.0001918877399700279, "loss": 0.636, "step": 2030 }, { "epoch": 2.166045769026078, "grad_norm": 0.6484375, "learning_rate": 0.00019181424712535106, "loss": 0.6401, "step": 2035 }, { "epoch": 2.1713677488025547, "grad_norm": 0.5625, "learning_rate": 0.00019174043708043733, "loss": 0.6385, "step": 2040 }, { "epoch": 2.1766897285790314, "grad_norm": 0.5546875, "learning_rate": 0.000191666310090286, "loss": 0.6676, "step": 2045 }, { "epoch": 2.182011708355508, "grad_norm": 0.8046875, "learning_rate": 0.00019159186641099123, "loss": 0.63, "step": 2050 }, { "epoch": 2.187333688131985, "grad_norm": 0.62890625, "learning_rate": 0.00019151710629974135, "loss": 0.6492, "step": 2055 }, { "epoch": 2.192655667908462, "grad_norm": 0.421875, "learning_rate": 0.00019144203001481794, "loss": 0.6504, "step": 2060 }, { "epoch": 2.197977647684939, "grad_norm": 0.396484375, "learning_rate": 0.00019136663781559485, "loss": 0.6106, "step": 2065 }, { "epoch": 2.2032996274614156, "grad_norm": 0.5, "learning_rate": 0.00019129092996253734, "loss": 0.6685, "step": 2070 }, { "epoch": 2.2086216072378924, "grad_norm": 0.5, "learning_rate": 0.00019121490671720123, "loss": 0.623, "step": 2075 }, { "epoch": 2.2139435870143696, "grad_norm": 0.52734375, "learning_rate": 0.00019113856834223193, "loss": 0.687, "step": 2080 }, { "epoch": 2.2192655667908463, "grad_norm": 0.43359375, "learning_rate": 0.00019106191510136355, "loss": 0.6478, "step": 2085 }, { "epoch": 2.224587546567323, "grad_norm": 0.46875, "learning_rate": 0.00019098494725941804, "loss": 0.6461, "step": 2090 }, { "epoch": 2.2299095263438, "grad_norm": 0.74609375, "learning_rate": 0.00019090766508230423, "loss": 0.6268, "step": 2095 }, { "epoch": 2.2352315061202765, "grad_norm": 0.73046875, "learning_rate": 0.00019083006883701686, "loss": 0.6656, "step": 2100 }, { "epoch": 2.2405534858967537, "grad_norm": 0.82421875, "learning_rate": 0.00019075215879163583, "loss": 0.6149, "step": 2105 }, { "epoch": 2.2458754656732305, "grad_norm": 0.62109375, "learning_rate": 0.00019067393521532498, "loss": 0.6526, "step": 2110 }, { "epoch": 2.2511974454497072, "grad_norm": 0.484375, "learning_rate": 0.00019059539837833152, "loss": 0.6255, "step": 2115 }, { "epoch": 2.256519425226184, "grad_norm": 0.5859375, "learning_rate": 0.00019051654855198482, "loss": 0.676, "step": 2120 }, { "epoch": 2.261841405002661, "grad_norm": 1.0390625, "learning_rate": 0.00019043738600869555, "loss": 0.648, "step": 2125 }, { "epoch": 2.267163384779138, "grad_norm": 0.79296875, "learning_rate": 0.00019035791102195484, "loss": 0.6352, "step": 2130 }, { "epoch": 2.2724853645556147, "grad_norm": 0.5390625, "learning_rate": 0.0001902781238663332, "loss": 0.6787, "step": 2135 }, { "epoch": 2.2778073443320914, "grad_norm": 0.80078125, "learning_rate": 0.00019019802481747954, "loss": 0.694, "step": 2140 }, { "epoch": 2.283129324108568, "grad_norm": 0.671875, "learning_rate": 0.00019011761415212047, "loss": 0.6612, "step": 2145 }, { "epoch": 2.2884513038850454, "grad_norm": 0.65234375, "learning_rate": 0.00019003689214805905, "loss": 0.6676, "step": 2150 }, { "epoch": 2.293773283661522, "grad_norm": 0.6640625, "learning_rate": 0.000189955859084174, "loss": 0.6678, "step": 2155 }, { "epoch": 2.299095263437999, "grad_norm": 0.58203125, "learning_rate": 0.00018987451524041867, "loss": 0.659, "step": 2160 }, { "epoch": 2.3044172432144756, "grad_norm": 0.7265625, "learning_rate": 0.00018979286089782012, "loss": 0.6676, "step": 2165 }, { "epoch": 2.309739222990953, "grad_norm": 0.5703125, "learning_rate": 0.00018971089633847807, "loss": 0.6572, "step": 2170 }, { "epoch": 2.3150612027674295, "grad_norm": 0.4453125, "learning_rate": 0.00018962862184556406, "loss": 0.6193, "step": 2175 }, { "epoch": 2.3203831825439063, "grad_norm": 0.7890625, "learning_rate": 0.00018954603770332033, "loss": 0.6453, "step": 2180 }, { "epoch": 2.325705162320383, "grad_norm": 0.70703125, "learning_rate": 0.0001894631441970589, "loss": 0.6557, "step": 2185 }, { "epoch": 2.3310271420968602, "grad_norm": 0.59375, "learning_rate": 0.00018937994161316062, "loss": 0.6718, "step": 2190 }, { "epoch": 2.336349121873337, "grad_norm": 0.4375, "learning_rate": 0.00018929643023907412, "loss": 0.6642, "step": 2195 }, { "epoch": 2.3416711016498137, "grad_norm": 0.5234375, "learning_rate": 0.00018921261036331476, "loss": 0.6345, "step": 2200 }, { "epoch": 2.3469930814262905, "grad_norm": 0.482421875, "learning_rate": 0.0001891284822754639, "loss": 0.6582, "step": 2205 }, { "epoch": 2.3523150612027672, "grad_norm": 0.4453125, "learning_rate": 0.00018904404626616752, "loss": 0.6575, "step": 2210 }, { "epoch": 2.3576370409792444, "grad_norm": 0.52734375, "learning_rate": 0.00018895930262713555, "loss": 0.6606, "step": 2215 }, { "epoch": 2.362959020755721, "grad_norm": 0.470703125, "learning_rate": 0.00018887425165114058, "loss": 0.6436, "step": 2220 }, { "epoch": 2.368281000532198, "grad_norm": 0.54296875, "learning_rate": 0.0001887888936320172, "loss": 0.6509, "step": 2225 }, { "epoch": 2.3736029803086747, "grad_norm": 0.490234375, "learning_rate": 0.0001887032288646605, "loss": 0.6399, "step": 2230 }, { "epoch": 2.378924960085152, "grad_norm": 0.78515625, "learning_rate": 0.00018861725764502557, "loss": 0.6702, "step": 2235 }, { "epoch": 2.3842469398616286, "grad_norm": 0.5078125, "learning_rate": 0.0001885309802701261, "loss": 0.6141, "step": 2240 }, { "epoch": 2.3895689196381054, "grad_norm": 0.50390625, "learning_rate": 0.00018844439703803354, "loss": 0.6306, "step": 2245 }, { "epoch": 2.394890899414582, "grad_norm": 0.384765625, "learning_rate": 0.00018835750824787591, "loss": 0.6449, "step": 2250 }, { "epoch": 2.4002128791910593, "grad_norm": 0.75, "learning_rate": 0.00018827031419983709, "loss": 0.6513, "step": 2255 }, { "epoch": 2.405534858967536, "grad_norm": 0.5390625, "learning_rate": 0.0001881828151951553, "loss": 0.6589, "step": 2260 }, { "epoch": 2.410856838744013, "grad_norm": 0.466796875, "learning_rate": 0.00018809501153612256, "loss": 0.6222, "step": 2265 }, { "epoch": 2.4161788185204895, "grad_norm": 0.66796875, "learning_rate": 0.00018800690352608324, "loss": 0.6575, "step": 2270 }, { "epoch": 2.4215007982969663, "grad_norm": 0.70703125, "learning_rate": 0.00018791849146943332, "loss": 0.6543, "step": 2275 }, { "epoch": 2.4268227780734435, "grad_norm": 0.48046875, "learning_rate": 0.0001878297756716191, "loss": 0.6628, "step": 2280 }, { "epoch": 2.4321447578499202, "grad_norm": 0.6640625, "learning_rate": 0.00018774075643913625, "loss": 0.6593, "step": 2285 }, { "epoch": 2.437466737626397, "grad_norm": 0.482421875, "learning_rate": 0.00018765143407952883, "loss": 0.6596, "step": 2290 }, { "epoch": 2.4427887174028737, "grad_norm": 0.62890625, "learning_rate": 0.0001875618089013881, "loss": 0.7221, "step": 2295 }, { "epoch": 2.4481106971793505, "grad_norm": 0.6640625, "learning_rate": 0.00018747188121435144, "loss": 0.5976, "step": 2300 }, { "epoch": 2.4534326769558277, "grad_norm": 0.453125, "learning_rate": 0.00018738165132910144, "loss": 0.6625, "step": 2305 }, { "epoch": 2.4587546567323044, "grad_norm": 0.515625, "learning_rate": 0.00018729111955736465, "loss": 0.6696, "step": 2310 }, { "epoch": 2.464076636508781, "grad_norm": 0.72265625, "learning_rate": 0.00018720028621191061, "loss": 0.6193, "step": 2315 }, { "epoch": 2.4693986162852584, "grad_norm": 0.6640625, "learning_rate": 0.00018710915160655075, "loss": 0.676, "step": 2320 }, { "epoch": 2.474720596061735, "grad_norm": 0.51171875, "learning_rate": 0.00018701771605613725, "loss": 0.6738, "step": 2325 }, { "epoch": 2.480042575838212, "grad_norm": 0.69140625, "learning_rate": 0.00018692597987656203, "loss": 0.6153, "step": 2330 }, { "epoch": 2.4853645556146886, "grad_norm": 0.51171875, "learning_rate": 0.00018683394338475564, "loss": 0.6602, "step": 2335 }, { "epoch": 2.4906865353911654, "grad_norm": 0.8984375, "learning_rate": 0.00018674160689868613, "loss": 0.6371, "step": 2340 }, { "epoch": 2.4960085151676425, "grad_norm": 0.46875, "learning_rate": 0.00018664897073735793, "loss": 0.6472, "step": 2345 }, { "epoch": 2.5013304949441193, "grad_norm": 0.458984375, "learning_rate": 0.00018655603522081088, "loss": 0.6531, "step": 2350 }, { "epoch": 2.506652474720596, "grad_norm": 0.63671875, "learning_rate": 0.00018646280067011894, "loss": 0.6093, "step": 2355 }, { "epoch": 2.511974454497073, "grad_norm": 0.8359375, "learning_rate": 0.00018636926740738926, "loss": 0.6992, "step": 2360 }, { "epoch": 2.5172964342735495, "grad_norm": 0.45703125, "learning_rate": 0.00018627543575576086, "loss": 0.6245, "step": 2365 }, { "epoch": 2.5226184140500267, "grad_norm": 0.57421875, "learning_rate": 0.00018618130603940386, "loss": 0.6746, "step": 2370 }, { "epoch": 2.5279403938265035, "grad_norm": 0.609375, "learning_rate": 0.00018608687858351783, "loss": 0.6396, "step": 2375 }, { "epoch": 2.5332623736029802, "grad_norm": 0.87109375, "learning_rate": 0.0001859921537143312, "loss": 0.6749, "step": 2380 }, { "epoch": 2.5385843533794574, "grad_norm": 0.5390625, "learning_rate": 0.00018589713175909986, "loss": 0.6895, "step": 2385 }, { "epoch": 2.5439063331559337, "grad_norm": 0.494140625, "learning_rate": 0.00018580181304610603, "loss": 0.6216, "step": 2390 }, { "epoch": 2.549228312932411, "grad_norm": 0.44921875, "learning_rate": 0.00018570619790465718, "loss": 0.6515, "step": 2395 }, { "epoch": 2.5545502927088877, "grad_norm": 0.58984375, "learning_rate": 0.00018561028666508488, "loss": 0.661, "step": 2400 }, { "epoch": 2.5598722724853644, "grad_norm": 0.578125, "learning_rate": 0.0001855140796587437, "loss": 0.7168, "step": 2405 }, { "epoch": 2.5651942522618416, "grad_norm": 0.6640625, "learning_rate": 0.00018541757721801002, "loss": 0.6646, "step": 2410 }, { "epoch": 2.5705162320383184, "grad_norm": 0.53515625, "learning_rate": 0.00018532077967628078, "loss": 0.6926, "step": 2415 }, { "epoch": 2.575838211814795, "grad_norm": 0.5546875, "learning_rate": 0.0001852236873679726, "loss": 0.6247, "step": 2420 }, { "epoch": 2.581160191591272, "grad_norm": 0.81640625, "learning_rate": 0.0001851263006285204, "loss": 0.6618, "step": 2425 }, { "epoch": 2.5864821713677486, "grad_norm": 0.6875, "learning_rate": 0.00018502861979437625, "loss": 0.6673, "step": 2430 }, { "epoch": 2.591804151144226, "grad_norm": 0.50390625, "learning_rate": 0.00018493064520300827, "loss": 0.6899, "step": 2435 }, { "epoch": 2.5971261309207025, "grad_norm": 0.5625, "learning_rate": 0.0001848323771928995, "loss": 0.6905, "step": 2440 }, { "epoch": 2.6024481106971793, "grad_norm": 0.5625, "learning_rate": 0.0001847338161035467, "loss": 0.6497, "step": 2445 }, { "epoch": 2.607770090473656, "grad_norm": 0.7109375, "learning_rate": 0.00018463496227545906, "loss": 0.6578, "step": 2450 }, { "epoch": 2.613092070250133, "grad_norm": 0.7578125, "learning_rate": 0.00018453581605015727, "loss": 0.7002, "step": 2455 }, { "epoch": 2.61841405002661, "grad_norm": 0.6015625, "learning_rate": 0.00018443637777017203, "loss": 0.6744, "step": 2460 }, { "epoch": 2.6237360298030867, "grad_norm": 0.84375, "learning_rate": 0.0001843366477790432, "loss": 0.6617, "step": 2465 }, { "epoch": 2.6290580095795635, "grad_norm": 0.63671875, "learning_rate": 0.00018423662642131828, "loss": 0.6388, "step": 2470 }, { "epoch": 2.6343799893560407, "grad_norm": 0.6953125, "learning_rate": 0.0001841363140425515, "loss": 0.6423, "step": 2475 }, { "epoch": 2.6397019691325174, "grad_norm": 0.49609375, "learning_rate": 0.00018403571098930252, "loss": 0.6503, "step": 2480 }, { "epoch": 2.645023948908994, "grad_norm": 0.53515625, "learning_rate": 0.00018393481760913514, "loss": 0.7427, "step": 2485 }, { "epoch": 2.650345928685471, "grad_norm": 1.5078125, "learning_rate": 0.00018383363425061622, "loss": 0.6905, "step": 2490 }, { "epoch": 2.6556679084619477, "grad_norm": 1.21875, "learning_rate": 0.0001837321612633144, "loss": 0.6969, "step": 2495 }, { "epoch": 2.660989888238425, "grad_norm": 1.3125, "learning_rate": 0.00018363039899779908, "loss": 0.6539, "step": 2500 }, { "epoch": 2.667376263970197, "grad_norm": 0.73828125, "learning_rate": 0.00018352834780563889, "loss": 0.6654, "step": 2505 }, { "epoch": 2.672698243746674, "grad_norm": 0.439453125, "learning_rate": 0.00018342600803940068, "loss": 0.6431, "step": 2510 }, { "epoch": 2.6780202235231507, "grad_norm": 1.3203125, "learning_rate": 0.0001833233800526483, "loss": 0.6596, "step": 2515 }, { "epoch": 2.6833422032996275, "grad_norm": 0.78125, "learning_rate": 0.0001832204641999414, "loss": 0.674, "step": 2520 }, { "epoch": 2.688664183076104, "grad_norm": 0.478515625, "learning_rate": 0.000183117260836834, "loss": 0.678, "step": 2525 }, { "epoch": 2.693986162852581, "grad_norm": 0.478515625, "learning_rate": 0.0001830137703198736, "loss": 0.6595, "step": 2530 }, { "epoch": 2.699308142629058, "grad_norm": 0.5625, "learning_rate": 0.00018290999300659962, "loss": 0.647, "step": 2535 }, { "epoch": 2.704630122405535, "grad_norm": 0.59765625, "learning_rate": 0.0001828059292555423, "loss": 0.6794, "step": 2540 }, { "epoch": 2.7099521021820117, "grad_norm": 0.51953125, "learning_rate": 0.00018270157942622156, "loss": 0.6635, "step": 2545 }, { "epoch": 2.7152740819584884, "grad_norm": 0.427734375, "learning_rate": 0.00018259694387914564, "loss": 0.6607, "step": 2550 }, { "epoch": 2.720596061734965, "grad_norm": 0.5546875, "learning_rate": 0.00018249202297580976, "loss": 0.6829, "step": 2555 }, { "epoch": 2.7259180415114423, "grad_norm": 0.4453125, "learning_rate": 0.00018238681707869522, "loss": 0.6534, "step": 2560 }, { "epoch": 2.731240021287919, "grad_norm": 0.9609375, "learning_rate": 0.00018228132655126767, "loss": 0.611, "step": 2565 }, { "epoch": 2.736562001064396, "grad_norm": 0.9921875, "learning_rate": 0.0001821755517579763, "loss": 0.6687, "step": 2570 }, { "epoch": 2.741883980840873, "grad_norm": 0.42578125, "learning_rate": 0.00018206949306425222, "loss": 0.653, "step": 2575 }, { "epoch": 2.74720596061735, "grad_norm": 0.87890625, "learning_rate": 0.00018196315083650753, "loss": 0.6585, "step": 2580 }, { "epoch": 2.7525279403938265, "grad_norm": 0.73046875, "learning_rate": 0.0001818565254421337, "loss": 0.6414, "step": 2585 }, { "epoch": 2.7578499201703033, "grad_norm": 1.1328125, "learning_rate": 0.00018174961724950065, "loss": 0.6452, "step": 2590 }, { "epoch": 2.76317189994678, "grad_norm": 0.56640625, "learning_rate": 0.0001816424266279552, "loss": 0.6282, "step": 2595 }, { "epoch": 2.7684938797232572, "grad_norm": 0.5, "learning_rate": 0.00018153495394781998, "loss": 0.6971, "step": 2600 }, { "epoch": 2.773815859499734, "grad_norm": 0.482421875, "learning_rate": 0.00018142719958039202, "loss": 0.6389, "step": 2605 }, { "epoch": 2.7791378392762107, "grad_norm": 0.5390625, "learning_rate": 0.00018131916389794154, "loss": 0.6886, "step": 2610 }, { "epoch": 2.7844598190526875, "grad_norm": 0.5625, "learning_rate": 0.0001812108472737107, "loss": 0.6414, "step": 2615 }, { "epoch": 2.789781798829164, "grad_norm": 0.48046875, "learning_rate": 0.00018110225008191217, "loss": 0.6715, "step": 2620 }, { "epoch": 2.7951037786056414, "grad_norm": 0.45703125, "learning_rate": 0.00018099337269772802, "loss": 0.6566, "step": 2625 }, { "epoch": 2.800425758382118, "grad_norm": 0.53515625, "learning_rate": 0.00018088421549730826, "loss": 0.6823, "step": 2630 }, { "epoch": 2.805747738158595, "grad_norm": 0.66015625, "learning_rate": 0.00018077477885776962, "loss": 0.6784, "step": 2635 }, { "epoch": 2.811069717935072, "grad_norm": 0.75, "learning_rate": 0.00018066506315719428, "loss": 0.6835, "step": 2640 }, { "epoch": 2.816391697711549, "grad_norm": 1.140625, "learning_rate": 0.00018055506877462846, "loss": 0.6497, "step": 2645 }, { "epoch": 2.8217136774880256, "grad_norm": 0.7734375, "learning_rate": 0.0001804447960900812, "loss": 0.6716, "step": 2650 }, { "epoch": 2.8270356572645023, "grad_norm": 0.78125, "learning_rate": 0.00018033424548452307, "loss": 0.6717, "step": 2655 }, { "epoch": 2.832357637040979, "grad_norm": 0.47265625, "learning_rate": 0.00018022341733988468, "loss": 0.6597, "step": 2660 }, { "epoch": 2.8376796168174563, "grad_norm": 0.4609375, "learning_rate": 0.00018011231203905563, "loss": 0.6526, "step": 2665 }, { "epoch": 2.843001596593933, "grad_norm": 0.51171875, "learning_rate": 0.00018000092996588296, "loss": 0.6938, "step": 2670 }, { "epoch": 2.84832357637041, "grad_norm": 0.55859375, "learning_rate": 0.00017988927150516985, "loss": 0.7044, "step": 2675 }, { "epoch": 2.8536455561468865, "grad_norm": 0.4375, "learning_rate": 0.00017977733704267447, "loss": 0.6482, "step": 2680 }, { "epoch": 2.8589675359233633, "grad_norm": 0.70703125, "learning_rate": 0.00017966512696510845, "loss": 0.6299, "step": 2685 }, { "epoch": 2.8642895156998405, "grad_norm": 0.640625, "learning_rate": 0.0001795526416601356, "loss": 0.661, "step": 2690 }, { "epoch": 2.869611495476317, "grad_norm": 0.69921875, "learning_rate": 0.00017943988151637063, "loss": 0.6521, "step": 2695 }, { "epoch": 2.874933475252794, "grad_norm": 1.09375, "learning_rate": 0.00017932684692337775, "loss": 0.6633, "step": 2700 }, { "epoch": 2.880255455029271, "grad_norm": 0.859375, "learning_rate": 0.00017921353827166932, "loss": 0.6762, "step": 2705 }, { "epoch": 2.8855774348057475, "grad_norm": 0.9140625, "learning_rate": 0.00017909995595270456, "loss": 0.6608, "step": 2710 }, { "epoch": 2.8908994145822247, "grad_norm": 0.5859375, "learning_rate": 0.00017898610035888807, "loss": 0.6675, "step": 2715 }, { "epoch": 2.8962213943587014, "grad_norm": 0.5078125, "learning_rate": 0.0001788719718835687, "loss": 0.6774, "step": 2720 }, { "epoch": 2.901543374135178, "grad_norm": 0.51953125, "learning_rate": 0.00017875757092103794, "loss": 0.6532, "step": 2725 }, { "epoch": 2.9068653539116553, "grad_norm": 0.46484375, "learning_rate": 0.00017864289786652865, "loss": 0.6671, "step": 2730 }, { "epoch": 2.912187333688132, "grad_norm": 0.69921875, "learning_rate": 0.00017852795311621388, "loss": 0.6792, "step": 2735 }, { "epoch": 2.917509313464609, "grad_norm": 0.498046875, "learning_rate": 0.00017841273706720512, "loss": 0.6545, "step": 2740 }, { "epoch": 2.9228312932410856, "grad_norm": 0.45703125, "learning_rate": 0.00017829725011755132, "loss": 0.624, "step": 2745 }, { "epoch": 2.9281532730175623, "grad_norm": 0.45703125, "learning_rate": 0.00017818149266623723, "loss": 0.6796, "step": 2750 }, { "epoch": 2.9334752527940395, "grad_norm": 0.51953125, "learning_rate": 0.00017806546511318216, "loss": 0.7187, "step": 2755 }, { "epoch": 2.9387972325705163, "grad_norm": 0.515625, "learning_rate": 0.0001779491678592386, "loss": 0.667, "step": 2760 }, { "epoch": 2.944119212346993, "grad_norm": 1.1015625, "learning_rate": 0.0001778326013061908, "loss": 0.6335, "step": 2765 }, { "epoch": 2.94944119212347, "grad_norm": 0.65625, "learning_rate": 0.0001777157658567533, "loss": 0.6619, "step": 2770 }, { "epoch": 2.9547631718999465, "grad_norm": 1.0, "learning_rate": 0.0001775986619145697, "loss": 0.6626, "step": 2775 }, { "epoch": 2.9600851516764237, "grad_norm": 1.1328125, "learning_rate": 0.00017748128988421126, "loss": 0.6814, "step": 2780 }, { "epoch": 2.9654071314529005, "grad_norm": 1.3046875, "learning_rate": 0.00017736365017117524, "loss": 0.689, "step": 2785 }, { "epoch": 2.970729111229377, "grad_norm": 0.490234375, "learning_rate": 0.00017724574318188387, "loss": 0.6237, "step": 2790 }, { "epoch": 2.9760510910058544, "grad_norm": 0.462890625, "learning_rate": 0.00017712756932368272, "loss": 0.6533, "step": 2795 }, { "epoch": 2.981373070782331, "grad_norm": 0.83203125, "learning_rate": 0.00017700912900483923, "loss": 0.6411, "step": 2800 }, { "epoch": 2.986695050558808, "grad_norm": 0.77734375, "learning_rate": 0.0001768904226345416, "loss": 0.6451, "step": 2805 }, { "epoch": 2.9920170303352847, "grad_norm": 0.66796875, "learning_rate": 0.000176771450622897, "loss": 0.6788, "step": 2810 }, { "epoch": 2.9973390101117614, "grad_norm": 0.7578125, "learning_rate": 0.00017665221338093048, "loss": 0.7086, "step": 2815 }, { "epoch": 2.999467802022352, "eval_loss": 2.0285937786102295, "eval_runtime": 12.3766, "eval_samples_per_second": 5.979, "eval_steps_per_second": 0.566, "step": 2817 }, { "epoch": 3.0026609898882386, "grad_norm": 0.6796875, "learning_rate": 0.00017653271132058332, "loss": 0.5998, "step": 2820 }, { "epoch": 3.0079829696647153, "grad_norm": 0.64453125, "learning_rate": 0.00017641294485471173, "loss": 0.527, "step": 2825 }, { "epoch": 3.013304949441192, "grad_norm": 0.8828125, "learning_rate": 0.0001762929143970854, "loss": 0.5497, "step": 2830 }, { "epoch": 3.018626929217669, "grad_norm": 0.62109375, "learning_rate": 0.00017617262036238605, "loss": 0.5321, "step": 2835 }, { "epoch": 3.023948908994146, "grad_norm": 1.0078125, "learning_rate": 0.00017605206316620602, "loss": 0.5764, "step": 2840 }, { "epoch": 3.029270888770623, "grad_norm": 1.25, "learning_rate": 0.00017593124322504673, "loss": 0.567, "step": 2845 }, { "epoch": 3.0345928685470995, "grad_norm": 1.0625, "learning_rate": 0.0001758101609563175, "loss": 0.5559, "step": 2850 }, { "epoch": 3.0399148483235763, "grad_norm": 0.6796875, "learning_rate": 0.00017568881677833376, "loss": 0.5004, "step": 2855 }, { "epoch": 3.045236828100053, "grad_norm": 0.498046875, "learning_rate": 0.00017556721111031592, "loss": 0.5113, "step": 2860 }, { "epoch": 3.05055880787653, "grad_norm": 0.482421875, "learning_rate": 0.00017544534437238777, "loss": 0.5485, "step": 2865 }, { "epoch": 3.055880787653007, "grad_norm": 0.58203125, "learning_rate": 0.00017532321698557498, "loss": 0.5475, "step": 2870 }, { "epoch": 3.0612027674294837, "grad_norm": 0.59375, "learning_rate": 0.00017520082937180374, "loss": 0.5618, "step": 2875 }, { "epoch": 3.0665247472059605, "grad_norm": 0.50390625, "learning_rate": 0.0001750781819538993, "loss": 0.5416, "step": 2880 }, { "epoch": 3.0718467269824377, "grad_norm": 0.58203125, "learning_rate": 0.00017495527515558443, "loss": 0.5332, "step": 2885 }, { "epoch": 3.0771687067589144, "grad_norm": 0.458984375, "learning_rate": 0.00017483210940147807, "loss": 0.5587, "step": 2890 }, { "epoch": 3.082490686535391, "grad_norm": 0.494140625, "learning_rate": 0.00017470868511709376, "loss": 0.5723, "step": 2895 }, { "epoch": 3.087812666311868, "grad_norm": 0.5, "learning_rate": 0.00017458500272883823, "loss": 0.5764, "step": 2900 }, { "epoch": 3.0931346460883447, "grad_norm": 0.435546875, "learning_rate": 0.00017446106266400988, "loss": 0.5275, "step": 2905 }, { "epoch": 3.098456625864822, "grad_norm": 0.60546875, "learning_rate": 0.00017433686535079737, "loss": 0.5389, "step": 2910 }, { "epoch": 3.1037786056412986, "grad_norm": 0.58203125, "learning_rate": 0.0001742124112182781, "loss": 0.5474, "step": 2915 }, { "epoch": 3.1091005854177753, "grad_norm": 0.828125, "learning_rate": 0.00017408770069641668, "loss": 0.5681, "step": 2920 }, { "epoch": 3.114422565194252, "grad_norm": 0.8828125, "learning_rate": 0.00017396273421606357, "loss": 0.5489, "step": 2925 }, { "epoch": 3.1197445449707293, "grad_norm": 0.953125, "learning_rate": 0.00017383751220895346, "loss": 0.5494, "step": 2930 }, { "epoch": 3.125066524747206, "grad_norm": 0.6328125, "learning_rate": 0.00017371203510770385, "loss": 0.5451, "step": 2935 }, { "epoch": 3.130388504523683, "grad_norm": 0.69140625, "learning_rate": 0.00017358630334581357, "loss": 0.5358, "step": 2940 }, { "epoch": 3.1357104843001595, "grad_norm": 0.59765625, "learning_rate": 0.00017346031735766118, "loss": 0.5682, "step": 2945 }, { "epoch": 3.1410324640766367, "grad_norm": 0.55078125, "learning_rate": 0.0001733340775785036, "loss": 0.5435, "step": 2950 }, { "epoch": 3.1463544438531135, "grad_norm": 0.5625, "learning_rate": 0.00017320758444447454, "loss": 0.5454, "step": 2955 }, { "epoch": 3.15167642362959, "grad_norm": 0.53515625, "learning_rate": 0.00017308083839258305, "loss": 0.5766, "step": 2960 }, { "epoch": 3.156998403406067, "grad_norm": 0.6953125, "learning_rate": 0.00017295383986071184, "loss": 0.5457, "step": 2965 }, { "epoch": 3.1623203831825437, "grad_norm": 0.5234375, "learning_rate": 0.00017282658928761597, "loss": 0.5535, "step": 2970 }, { "epoch": 3.167642362959021, "grad_norm": 0.671875, "learning_rate": 0.00017269908711292124, "loss": 0.5701, "step": 2975 }, { "epoch": 3.1729643427354977, "grad_norm": 0.50390625, "learning_rate": 0.00017257133377712275, "loss": 0.5877, "step": 2980 }, { "epoch": 3.1782863225119744, "grad_norm": 0.69140625, "learning_rate": 0.00017244332972158314, "loss": 0.5214, "step": 2985 }, { "epoch": 3.183608302288451, "grad_norm": 0.45703125, "learning_rate": 0.00017231507538853142, "loss": 0.5447, "step": 2990 }, { "epoch": 3.1889302820649283, "grad_norm": 0.63671875, "learning_rate": 0.00017218657122106116, "loss": 0.5619, "step": 2995 }, { "epoch": 3.194252261841405, "grad_norm": 0.53125, "learning_rate": 0.00017205781766312908, "loss": 0.5641, "step": 3000 }, { "epoch": 3.199574241617882, "grad_norm": 0.625, "learning_rate": 0.00017192881515955356, "loss": 0.558, "step": 3005 }, { "epoch": 3.2048962213943586, "grad_norm": 0.66015625, "learning_rate": 0.00017179956415601285, "loss": 0.5721, "step": 3010 }, { "epoch": 3.210218201170836, "grad_norm": 0.474609375, "learning_rate": 0.00017167006509904398, "loss": 0.5513, "step": 3015 }, { "epoch": 3.2155401809473125, "grad_norm": 0.7890625, "learning_rate": 0.00017154031843604072, "loss": 0.5482, "step": 3020 }, { "epoch": 3.2208621607237893, "grad_norm": 0.70703125, "learning_rate": 0.00017141032461525244, "loss": 0.5834, "step": 3025 }, { "epoch": 3.226184140500266, "grad_norm": 0.6796875, "learning_rate": 0.0001712800840857823, "loss": 0.5455, "step": 3030 }, { "epoch": 3.2315061202767428, "grad_norm": 0.49609375, "learning_rate": 0.00017114959729758582, "loss": 0.5782, "step": 3035 }, { "epoch": 3.23682810005322, "grad_norm": 0.60546875, "learning_rate": 0.00017101886470146925, "loss": 0.5694, "step": 3040 }, { "epoch": 3.2421500798296967, "grad_norm": 0.53515625, "learning_rate": 0.00017088788674908819, "loss": 0.5861, "step": 3045 }, { "epoch": 3.2474720596061735, "grad_norm": 0.67578125, "learning_rate": 0.0001707566638929457, "loss": 0.5492, "step": 3050 }, { "epoch": 3.25279403938265, "grad_norm": 0.734375, "learning_rate": 0.00017062519658639114, "loss": 0.5864, "step": 3055 }, { "epoch": 3.258116019159127, "grad_norm": 0.482421875, "learning_rate": 0.00017049348528361822, "loss": 0.5268, "step": 3060 }, { "epoch": 3.263437998935604, "grad_norm": 0.447265625, "learning_rate": 0.0001703615304396637, "loss": 0.5914, "step": 3065 }, { "epoch": 3.268759978712081, "grad_norm": 0.703125, "learning_rate": 0.00017022933251040575, "loss": 0.5649, "step": 3070 }, { "epoch": 3.2740819584885577, "grad_norm": 0.51953125, "learning_rate": 0.0001700968919525623, "loss": 0.5758, "step": 3075 }, { "epoch": 3.2794039382650344, "grad_norm": 0.546875, "learning_rate": 0.00016996420922368955, "loss": 0.5586, "step": 3080 }, { "epoch": 3.2847259180415116, "grad_norm": 0.5390625, "learning_rate": 0.0001698312847821803, "loss": 0.5826, "step": 3085 }, { "epoch": 3.2900478978179883, "grad_norm": 0.52734375, "learning_rate": 0.00016969811908726246, "loss": 0.5774, "step": 3090 }, { "epoch": 3.295369877594465, "grad_norm": 0.55078125, "learning_rate": 0.00016956471259899742, "loss": 0.5724, "step": 3095 }, { "epoch": 3.300691857370942, "grad_norm": 0.5234375, "learning_rate": 0.0001694310657782785, "loss": 0.5497, "step": 3100 }, { "epoch": 3.306013837147419, "grad_norm": 0.60546875, "learning_rate": 0.00016929717908682924, "loss": 0.5608, "step": 3105 }, { "epoch": 3.311335816923896, "grad_norm": 0.5859375, "learning_rate": 0.00016916305298720195, "loss": 0.5384, "step": 3110 }, { "epoch": 3.3166577967003725, "grad_norm": 0.5, "learning_rate": 0.000169028687942776, "loss": 0.5576, "step": 3115 }, { "epoch": 3.3219797764768493, "grad_norm": 0.64453125, "learning_rate": 0.00016889408441775637, "loss": 0.6127, "step": 3120 }, { "epoch": 3.327301756253326, "grad_norm": 0.625, "learning_rate": 0.00016875924287717179, "loss": 0.5582, "step": 3125 }, { "epoch": 3.332623736029803, "grad_norm": 0.6328125, "learning_rate": 0.0001686241637868734, "loss": 0.5577, "step": 3130 }, { "epoch": 3.33794571580628, "grad_norm": 0.70703125, "learning_rate": 0.00016848884761353294, "loss": 0.5816, "step": 3135 }, { "epoch": 3.3432676955827567, "grad_norm": 0.51171875, "learning_rate": 0.00016835329482464136, "loss": 0.5395, "step": 3140 }, { "epoch": 3.3485896753592335, "grad_norm": 0.55078125, "learning_rate": 0.00016821750588850697, "loss": 0.5929, "step": 3145 }, { "epoch": 3.3539116551357107, "grad_norm": 0.486328125, "learning_rate": 0.00016808148127425392, "loss": 0.5645, "step": 3150 }, { "epoch": 3.3592336349121874, "grad_norm": 0.5, "learning_rate": 0.0001679452214518206, "loss": 0.5864, "step": 3155 }, { "epoch": 3.364555614688664, "grad_norm": 0.56640625, "learning_rate": 0.00016780872689195806, "loss": 0.5862, "step": 3160 }, { "epoch": 3.369877594465141, "grad_norm": 0.466796875, "learning_rate": 0.0001676719980662282, "loss": 0.5796, "step": 3165 }, { "epoch": 3.375199574241618, "grad_norm": 0.83984375, "learning_rate": 0.0001675350354470023, "loss": 0.5762, "step": 3170 }, { "epoch": 3.380521554018095, "grad_norm": 0.609375, "learning_rate": 0.0001673978395074595, "loss": 0.5608, "step": 3175 }, { "epoch": 3.3858435337945716, "grad_norm": 0.68359375, "learning_rate": 0.00016726041072158478, "loss": 0.5552, "step": 3180 }, { "epoch": 3.3911655135710483, "grad_norm": 0.70703125, "learning_rate": 0.00016712274956416772, "loss": 0.5798, "step": 3185 }, { "epoch": 3.396487493347525, "grad_norm": 0.57421875, "learning_rate": 0.00016698485651080064, "loss": 0.579, "step": 3190 }, { "epoch": 3.4018094731240023, "grad_norm": 0.490234375, "learning_rate": 0.00016684673203787702, "loss": 0.5886, "step": 3195 }, { "epoch": 3.407131452900479, "grad_norm": 0.578125, "learning_rate": 0.00016670837662258986, "loss": 0.5794, "step": 3200 }, { "epoch": 3.4124534326769558, "grad_norm": 0.578125, "learning_rate": 0.00016656979074293005, "loss": 0.6158, "step": 3205 }, { "epoch": 3.4177754124534325, "grad_norm": 0.81640625, "learning_rate": 0.00016643097487768456, "loss": 0.5823, "step": 3210 }, { "epoch": 3.4230973922299097, "grad_norm": 0.73046875, "learning_rate": 0.00016629192950643512, "loss": 0.5586, "step": 3215 }, { "epoch": 3.4284193720063865, "grad_norm": 0.671875, "learning_rate": 0.0001661526551095562, "loss": 0.5575, "step": 3220 }, { "epoch": 3.433741351782863, "grad_norm": 0.55859375, "learning_rate": 0.00016601315216821356, "loss": 0.5887, "step": 3225 }, { "epoch": 3.43906333155934, "grad_norm": 0.55859375, "learning_rate": 0.00016587342116436248, "loss": 0.5732, "step": 3230 }, { "epoch": 3.444385311335817, "grad_norm": 0.53125, "learning_rate": 0.0001657334625807463, "loss": 0.5648, "step": 3235 }, { "epoch": 3.449707291112294, "grad_norm": 0.62890625, "learning_rate": 0.00016559327690089445, "loss": 0.5885, "step": 3240 }, { "epoch": 3.4550292708887707, "grad_norm": 0.51171875, "learning_rate": 0.000165452864609121, "loss": 0.5653, "step": 3245 }, { "epoch": 3.4603512506652474, "grad_norm": 0.609375, "learning_rate": 0.0001653122261905229, "loss": 0.5704, "step": 3250 }, { "epoch": 3.465673230441724, "grad_norm": 0.80078125, "learning_rate": 0.00016517136213097837, "loss": 0.6076, "step": 3255 }, { "epoch": 3.4709952102182013, "grad_norm": 1.0078125, "learning_rate": 0.00016503027291714508, "loss": 0.6105, "step": 3260 }, { "epoch": 3.476317189994678, "grad_norm": 0.5546875, "learning_rate": 0.00016488895903645867, "loss": 0.5698, "step": 3265 }, { "epoch": 3.481639169771155, "grad_norm": 0.6796875, "learning_rate": 0.00016474742097713092, "loss": 0.5822, "step": 3270 }, { "epoch": 3.4869611495476316, "grad_norm": 0.52734375, "learning_rate": 0.00016460565922814808, "loss": 0.5628, "step": 3275 }, { "epoch": 3.4922831293241083, "grad_norm": 0.57421875, "learning_rate": 0.00016446367427926922, "loss": 0.5588, "step": 3280 }, { "epoch": 3.4976051091005855, "grad_norm": 0.70703125, "learning_rate": 0.0001643214666210245, "loss": 0.5493, "step": 3285 }, { "epoch": 3.5029270888770623, "grad_norm": 0.6953125, "learning_rate": 0.00016417903674471356, "loss": 0.588, "step": 3290 }, { "epoch": 3.508249068653539, "grad_norm": 0.82421875, "learning_rate": 0.0001640363851424037, "loss": 0.5629, "step": 3295 }, { "epoch": 3.513571048430016, "grad_norm": 0.77734375, "learning_rate": 0.0001638935123069283, "loss": 0.597, "step": 3300 }, { "epoch": 3.518893028206493, "grad_norm": 0.78125, "learning_rate": 0.0001637504187318849, "loss": 0.563, "step": 3305 }, { "epoch": 3.5242150079829697, "grad_norm": 0.51953125, "learning_rate": 0.0001636071049116339, "loss": 0.5915, "step": 3310 }, { "epoch": 3.5295369877594465, "grad_norm": 0.51171875, "learning_rate": 0.0001634635713412964, "loss": 0.5308, "step": 3315 }, { "epoch": 3.534858967535923, "grad_norm": 0.79296875, "learning_rate": 0.00016331981851675282, "loss": 0.5425, "step": 3320 }, { "epoch": 3.5401809473124004, "grad_norm": 0.5390625, "learning_rate": 0.00016317584693464093, "loss": 0.5658, "step": 3325 }, { "epoch": 3.545502927088877, "grad_norm": 0.57421875, "learning_rate": 0.00016303165709235444, "loss": 0.5828, "step": 3330 }, { "epoch": 3.550824906865354, "grad_norm": 0.640625, "learning_rate": 0.00016288724948804095, "loss": 0.5912, "step": 3335 }, { "epoch": 3.5561468866418307, "grad_norm": 0.65234375, "learning_rate": 0.0001627426246206004, "loss": 0.56, "step": 3340 }, { "epoch": 3.5614688664183074, "grad_norm": 0.5625, "learning_rate": 0.00016259778298968345, "loss": 0.5752, "step": 3345 }, { "epoch": 3.5667908461947846, "grad_norm": 0.66796875, "learning_rate": 0.00016245272509568954, "loss": 0.5791, "step": 3350 }, { "epoch": 3.5721128259712613, "grad_norm": 0.75, "learning_rate": 0.00016230745143976525, "loss": 0.5523, "step": 3355 }, { "epoch": 3.577434805747738, "grad_norm": 0.82421875, "learning_rate": 0.00016216196252380257, "loss": 0.5962, "step": 3360 }, { "epoch": 3.5827567855242153, "grad_norm": 0.51171875, "learning_rate": 0.00016201625885043733, "loss": 0.5677, "step": 3365 }, { "epoch": 3.5880787653006916, "grad_norm": 0.6796875, "learning_rate": 0.00016187034092304708, "loss": 0.5759, "step": 3370 }, { "epoch": 3.593400745077169, "grad_norm": 0.55078125, "learning_rate": 0.00016172420924574966, "loss": 0.5639, "step": 3375 }, { "epoch": 3.5987227248536455, "grad_norm": 0.56640625, "learning_rate": 0.00016157786432340144, "loss": 0.5914, "step": 3380 }, { "epoch": 3.6040447046301223, "grad_norm": 0.52734375, "learning_rate": 0.00016143130666159542, "loss": 0.5585, "step": 3385 }, { "epoch": 3.6093666844065995, "grad_norm": 0.51953125, "learning_rate": 0.00016128453676665964, "loss": 0.5768, "step": 3390 }, { "epoch": 3.614688664183076, "grad_norm": 0.4375, "learning_rate": 0.0001611375551456553, "loss": 0.5511, "step": 3395 }, { "epoch": 3.620010643959553, "grad_norm": 0.6484375, "learning_rate": 0.00016099036230637513, "loss": 0.5846, "step": 3400 }, { "epoch": 3.6253326237360297, "grad_norm": 0.4921875, "learning_rate": 0.00016084295875734157, "loss": 0.5504, "step": 3405 }, { "epoch": 3.6306546035125065, "grad_norm": 0.53125, "learning_rate": 0.00016069534500780496, "loss": 0.5776, "step": 3410 }, { "epoch": 3.6359765832889837, "grad_norm": 0.97265625, "learning_rate": 0.0001605475215677419, "loss": 0.5976, "step": 3415 }, { "epoch": 3.6412985630654604, "grad_norm": 0.54296875, "learning_rate": 0.00016039948894785346, "loss": 0.5741, "step": 3420 }, { "epoch": 3.646620542841937, "grad_norm": 0.52734375, "learning_rate": 0.00016025124765956328, "loss": 0.5688, "step": 3425 }, { "epoch": 3.651942522618414, "grad_norm": 0.80859375, "learning_rate": 0.00016010279821501602, "loss": 0.5967, "step": 3430 }, { "epoch": 3.6572645023948906, "grad_norm": 0.55859375, "learning_rate": 0.00015995414112707538, "loss": 0.5487, "step": 3435 }, { "epoch": 3.662586482171368, "grad_norm": 0.609375, "learning_rate": 0.0001598052769093225, "loss": 0.564, "step": 3440 }, { "epoch": 3.6679084619478446, "grad_norm": 0.7109375, "learning_rate": 0.0001596562060760541, "loss": 0.6101, "step": 3445 }, { "epoch": 3.6732304417243213, "grad_norm": 0.490234375, "learning_rate": 0.00015950692914228068, "loss": 0.5994, "step": 3450 }, { "epoch": 3.6785524215007985, "grad_norm": 0.67578125, "learning_rate": 0.00015935744662372487, "loss": 0.5635, "step": 3455 }, { "epoch": 3.6838744012772753, "grad_norm": 0.5859375, "learning_rate": 0.00015920775903681937, "loss": 0.5396, "step": 3460 }, { "epoch": 3.689196381053752, "grad_norm": 0.51953125, "learning_rate": 0.00015905786689870556, "loss": 0.574, "step": 3465 }, { "epoch": 3.6945183608302288, "grad_norm": 0.45703125, "learning_rate": 0.00015890777072723137, "loss": 0.6024, "step": 3470 }, { "epoch": 3.6998403406067055, "grad_norm": 0.5, "learning_rate": 0.00015875747104094972, "loss": 0.5896, "step": 3475 }, { "epoch": 3.7051623203831827, "grad_norm": 0.73828125, "learning_rate": 0.00015860696835911652, "loss": 0.5638, "step": 3480 }, { "epoch": 3.7104843001596595, "grad_norm": 0.51171875, "learning_rate": 0.00015845626320168908, "loss": 0.6056, "step": 3485 }, { "epoch": 3.715806279936136, "grad_norm": 0.46875, "learning_rate": 0.00015830535608932417, "loss": 0.5773, "step": 3490 }, { "epoch": 3.721128259712613, "grad_norm": 0.515625, "learning_rate": 0.00015815424754337634, "loss": 0.5379, "step": 3495 }, { "epoch": 3.7264502394890897, "grad_norm": 0.83203125, "learning_rate": 0.00015800293808589601, "loss": 0.5911, "step": 3500 }, { "epoch": 3.731772219265567, "grad_norm": 0.6171875, "learning_rate": 0.00015785142823962768, "loss": 0.562, "step": 3505 }, { "epoch": 3.7370941990420437, "grad_norm": 0.640625, "learning_rate": 0.0001576997185280082, "loss": 0.5798, "step": 3510 }, { "epoch": 3.7424161788185204, "grad_norm": 0.57421875, "learning_rate": 0.0001575478094751649, "loss": 0.6132, "step": 3515 }, { "epoch": 3.7477381585949976, "grad_norm": 0.53515625, "learning_rate": 0.0001573957016059138, "loss": 0.626, "step": 3520 }, { "epoch": 3.7530601383714743, "grad_norm": 0.46484375, "learning_rate": 0.00015724339544575782, "loss": 0.5619, "step": 3525 }, { "epoch": 3.758382118147951, "grad_norm": 0.70703125, "learning_rate": 0.00015709089152088484, "loss": 0.5643, "step": 3530 }, { "epoch": 3.763704097924428, "grad_norm": 0.5546875, "learning_rate": 0.00015693819035816615, "loss": 0.5594, "step": 3535 }, { "epoch": 3.7690260777009046, "grad_norm": 0.55859375, "learning_rate": 0.00015678529248515423, "loss": 0.564, "step": 3540 }, { "epoch": 3.774348057477382, "grad_norm": 0.478515625, "learning_rate": 0.0001566321984300813, "loss": 0.5537, "step": 3545 }, { "epoch": 3.7796700372538585, "grad_norm": 0.546875, "learning_rate": 0.00015647890872185743, "loss": 0.582, "step": 3550 }, { "epoch": 3.7849920170303353, "grad_norm": 0.578125, "learning_rate": 0.0001563254238900685, "loss": 0.5859, "step": 3555 }, { "epoch": 3.790313996806812, "grad_norm": 0.462890625, "learning_rate": 0.00015617174446497443, "loss": 0.5509, "step": 3560 }, { "epoch": 3.7956359765832888, "grad_norm": 0.59765625, "learning_rate": 0.00015601787097750759, "loss": 0.573, "step": 3565 }, { "epoch": 3.800957956359766, "grad_norm": 0.703125, "learning_rate": 0.00015586380395927075, "loss": 0.5858, "step": 3570 }, { "epoch": 3.8062799361362427, "grad_norm": 0.51171875, "learning_rate": 0.0001557095439425353, "loss": 0.5679, "step": 3575 }, { "epoch": 3.8116019159127195, "grad_norm": 0.478515625, "learning_rate": 0.00015555509146023932, "loss": 0.5481, "step": 3580 }, { "epoch": 3.8169238956891967, "grad_norm": 0.8984375, "learning_rate": 0.00015540044704598585, "loss": 0.5643, "step": 3585 }, { "epoch": 3.822245875465673, "grad_norm": 0.53515625, "learning_rate": 0.00015524561123404117, "loss": 0.5872, "step": 3590 }, { "epoch": 3.82756785524215, "grad_norm": 0.5859375, "learning_rate": 0.0001550905845593326, "loss": 0.5669, "step": 3595 }, { "epoch": 3.832889835018627, "grad_norm": 0.55078125, "learning_rate": 0.0001549353675574469, "loss": 0.5921, "step": 3600 }, { "epoch": 3.8382118147951036, "grad_norm": 0.59765625, "learning_rate": 0.00015477996076462843, "loss": 0.5783, "step": 3605 }, { "epoch": 3.843533794571581, "grad_norm": 0.671875, "learning_rate": 0.00015462436471777726, "loss": 0.559, "step": 3610 }, { "epoch": 3.8488557743480576, "grad_norm": 0.53515625, "learning_rate": 0.00015446857995444718, "loss": 0.5822, "step": 3615 }, { "epoch": 3.8541777541245343, "grad_norm": 0.578125, "learning_rate": 0.00015431260701284408, "loss": 0.5454, "step": 3620 }, { "epoch": 3.859499733901011, "grad_norm": 0.53125, "learning_rate": 0.00015415644643182393, "loss": 0.5975, "step": 3625 }, { "epoch": 3.864821713677488, "grad_norm": 0.55859375, "learning_rate": 0.00015400009875089085, "loss": 0.5713, "step": 3630 }, { "epoch": 3.870143693453965, "grad_norm": 0.6015625, "learning_rate": 0.00015384356451019556, "loss": 0.5742, "step": 3635 }, { "epoch": 3.8754656732304418, "grad_norm": 0.6953125, "learning_rate": 0.0001536868442505331, "loss": 0.5468, "step": 3640 }, { "epoch": 3.8807876530069185, "grad_norm": 0.55859375, "learning_rate": 0.00015352993851334136, "loss": 0.6091, "step": 3645 }, { "epoch": 3.8861096327833953, "grad_norm": 0.546875, "learning_rate": 0.0001533728478406988, "loss": 0.5784, "step": 3650 }, { "epoch": 3.891431612559872, "grad_norm": 0.80859375, "learning_rate": 0.00015321557277532305, "loss": 0.6036, "step": 3655 }, { "epoch": 3.896753592336349, "grad_norm": 0.7265625, "learning_rate": 0.0001530581138605686, "loss": 0.5903, "step": 3660 }, { "epoch": 3.902075572112826, "grad_norm": 0.65234375, "learning_rate": 0.00015290047164042512, "loss": 0.5719, "step": 3665 }, { "epoch": 3.9073975518893027, "grad_norm": 0.51953125, "learning_rate": 0.0001527426466595156, "loss": 0.5834, "step": 3670 }, { "epoch": 3.91271953166578, "grad_norm": 0.52734375, "learning_rate": 0.0001525846394630945, "loss": 0.5489, "step": 3675 }, { "epoch": 3.9180415114422567, "grad_norm": 0.486328125, "learning_rate": 0.00015242645059704567, "loss": 0.5702, "step": 3680 }, { "epoch": 3.9233634912187334, "grad_norm": 0.70703125, "learning_rate": 0.00015226808060788067, "loss": 0.6205, "step": 3685 }, { "epoch": 3.92868547099521, "grad_norm": 0.83984375, "learning_rate": 0.00015210953004273677, "loss": 0.5689, "step": 3690 }, { "epoch": 3.934007450771687, "grad_norm": 0.5546875, "learning_rate": 0.00015195079944937514, "loss": 0.5555, "step": 3695 }, { "epoch": 3.939329430548164, "grad_norm": 0.53515625, "learning_rate": 0.00015179188937617885, "loss": 0.5535, "step": 3700 }, { "epoch": 3.944651410324641, "grad_norm": 0.54296875, "learning_rate": 0.0001516328003721511, "loss": 0.5526, "step": 3705 }, { "epoch": 3.9499733901011176, "grad_norm": 0.75, "learning_rate": 0.00015147353298691324, "loss": 0.5647, "step": 3710 }, { "epoch": 3.9552953698775943, "grad_norm": 0.515625, "learning_rate": 0.00015131408777070284, "loss": 0.6031, "step": 3715 }, { "epoch": 3.960617349654071, "grad_norm": 0.59375, "learning_rate": 0.00015115446527437192, "loss": 0.5935, "step": 3720 }, { "epoch": 3.9659393294305483, "grad_norm": 0.76171875, "learning_rate": 0.00015099466604938491, "loss": 0.5641, "step": 3725 }, { "epoch": 3.971261309207025, "grad_norm": 1.0078125, "learning_rate": 0.00015083469064781685, "loss": 0.5836, "step": 3730 }, { "epoch": 3.9765832889835018, "grad_norm": 0.8125, "learning_rate": 0.00015067453962235142, "loss": 0.6089, "step": 3735 }, { "epoch": 3.981905268759979, "grad_norm": 0.515625, "learning_rate": 0.00015051421352627902, "loss": 0.5586, "step": 3740 }, { "epoch": 3.9872272485364557, "grad_norm": 0.76953125, "learning_rate": 0.00015035371291349488, "loss": 0.5859, "step": 3745 }, { "epoch": 3.9925492283129325, "grad_norm": 0.5, "learning_rate": 0.00015019303833849722, "loss": 0.5784, "step": 3750 }, { "epoch": 3.997871208089409, "grad_norm": 0.7421875, "learning_rate": 0.0001500321903563852, "loss": 0.6156, "step": 3755 }, { "epoch": 4.0, "eval_loss": 2.1647000312805176, "eval_runtime": 11.7648, "eval_samples_per_second": 6.29, "eval_steps_per_second": 0.595, "step": 3757 }, { "epoch": 4.003193187865886, "grad_norm": 0.82421875, "learning_rate": 0.0001498711695228571, "loss": 0.5011, "step": 3760 }, { "epoch": 4.008515167642363, "grad_norm": 0.609375, "learning_rate": 0.00014970997639420834, "loss": 0.4581, "step": 3765 }, { "epoch": 4.0138371474188395, "grad_norm": 0.578125, "learning_rate": 0.0001495486115273296, "loss": 0.4708, "step": 3770 }, { "epoch": 4.019159127195317, "grad_norm": 0.59765625, "learning_rate": 0.00014938707547970488, "loss": 0.4529, "step": 3775 }, { "epoch": 4.024481106971794, "grad_norm": 0.6328125, "learning_rate": 0.0001492253688094096, "loss": 0.4527, "step": 3780 }, { "epoch": 4.02980308674827, "grad_norm": 0.62109375, "learning_rate": 0.00014906349207510856, "loss": 0.4484, "step": 3785 }, { "epoch": 4.035125066524747, "grad_norm": 0.55078125, "learning_rate": 0.00014890144583605428, "loss": 0.4464, "step": 3790 }, { "epoch": 4.040447046301224, "grad_norm": 0.578125, "learning_rate": 0.00014873923065208467, "loss": 0.4669, "step": 3795 }, { "epoch": 4.045769026077701, "grad_norm": 0.51953125, "learning_rate": 0.00014857684708362146, "loss": 0.4413, "step": 3800 }, { "epoch": 4.051091005854178, "grad_norm": 0.69921875, "learning_rate": 0.00014841429569166804, "loss": 0.4761, "step": 3805 }, { "epoch": 4.056412985630654, "grad_norm": 0.73046875, "learning_rate": 0.0001482515770378077, "loss": 0.438, "step": 3810 }, { "epoch": 4.0617349654071315, "grad_norm": 0.57421875, "learning_rate": 0.00014808869168420137, "loss": 0.47, "step": 3815 }, { "epoch": 4.067056945183609, "grad_norm": 0.51171875, "learning_rate": 0.00014792564019358614, "loss": 0.4716, "step": 3820 }, { "epoch": 4.072378924960085, "grad_norm": 0.65625, "learning_rate": 0.00014776242312927295, "loss": 0.4458, "step": 3825 }, { "epoch": 4.077700904736562, "grad_norm": 0.7578125, "learning_rate": 0.0001475990410551448, "loss": 0.4517, "step": 3830 }, { "epoch": 4.0830228845130385, "grad_norm": 0.8359375, "learning_rate": 0.0001474354945356547, "loss": 0.4845, "step": 3835 }, { "epoch": 4.088344864289516, "grad_norm": 0.68359375, "learning_rate": 0.0001472717841358239, "loss": 0.4574, "step": 3840 }, { "epoch": 4.093666844065993, "grad_norm": 0.59765625, "learning_rate": 0.00014710791042123967, "loss": 0.4738, "step": 3845 }, { "epoch": 4.098988823842469, "grad_norm": 0.51953125, "learning_rate": 0.00014694387395805368, "loss": 0.4535, "step": 3850 }, { "epoch": 4.104310803618946, "grad_norm": 0.58203125, "learning_rate": 0.0001467796753129797, "loss": 0.4581, "step": 3855 }, { "epoch": 4.109632783395423, "grad_norm": 0.68359375, "learning_rate": 0.00014661531505329192, "loss": 0.4781, "step": 3860 }, { "epoch": 4.1149547631719, "grad_norm": 0.66796875, "learning_rate": 0.00014645079374682284, "loss": 0.4586, "step": 3865 }, { "epoch": 4.120276742948377, "grad_norm": 0.5234375, "learning_rate": 0.00014628611196196135, "loss": 0.4543, "step": 3870 }, { "epoch": 4.125598722724853, "grad_norm": 0.75, "learning_rate": 0.00014612127026765074, "loss": 0.4593, "step": 3875 }, { "epoch": 4.130920702501331, "grad_norm": 0.52734375, "learning_rate": 0.00014595626923338676, "loss": 0.4671, "step": 3880 }, { "epoch": 4.136242682277807, "grad_norm": 0.53515625, "learning_rate": 0.00014579110942921568, "loss": 0.4699, "step": 3885 }, { "epoch": 4.141564662054284, "grad_norm": 0.55859375, "learning_rate": 0.00014562579142573227, "loss": 0.4598, "step": 3890 }, { "epoch": 4.146886641830761, "grad_norm": 0.6015625, "learning_rate": 0.00014546031579407782, "loss": 0.4517, "step": 3895 }, { "epoch": 4.152208621607238, "grad_norm": 0.5625, "learning_rate": 0.00014529468310593824, "loss": 0.4742, "step": 3900 }, { "epoch": 4.157530601383715, "grad_norm": 0.5078125, "learning_rate": 0.000145128893933542, "loss": 0.4618, "step": 3905 }, { "epoch": 4.162852581160192, "grad_norm": 0.5859375, "learning_rate": 0.00014496294884965827, "loss": 0.4878, "step": 3910 }, { "epoch": 4.168174560936668, "grad_norm": 0.52734375, "learning_rate": 0.00014479684842759471, "loss": 0.4877, "step": 3915 }, { "epoch": 4.1734965407131455, "grad_norm": 0.6171875, "learning_rate": 0.0001446305932411958, "loss": 0.4966, "step": 3920 }, { "epoch": 4.178818520489622, "grad_norm": 0.8359375, "learning_rate": 0.0001444641838648406, "loss": 0.4887, "step": 3925 }, { "epoch": 4.184140500266099, "grad_norm": 0.5625, "learning_rate": 0.000144297620873441, "loss": 0.481, "step": 3930 }, { "epoch": 4.189462480042576, "grad_norm": 0.55078125, "learning_rate": 0.00014413090484243938, "loss": 0.4957, "step": 3935 }, { "epoch": 4.1947844598190525, "grad_norm": 0.55078125, "learning_rate": 0.00014396403634780707, "loss": 0.4919, "step": 3940 }, { "epoch": 4.20010643959553, "grad_norm": 0.53125, "learning_rate": 0.00014379701596604197, "loss": 0.4785, "step": 3945 }, { "epoch": 4.205428419372007, "grad_norm": 0.53125, "learning_rate": 0.00014362984427416676, "loss": 0.4613, "step": 3950 }, { "epoch": 4.210750399148483, "grad_norm": 0.55078125, "learning_rate": 0.00014346252184972694, "loss": 0.4713, "step": 3955 }, { "epoch": 4.21607237892496, "grad_norm": 0.64453125, "learning_rate": 0.00014329504927078866, "loss": 0.4909, "step": 3960 }, { "epoch": 4.221394358701437, "grad_norm": 0.546875, "learning_rate": 0.0001431274271159369, "loss": 0.4615, "step": 3965 }, { "epoch": 4.226716338477914, "grad_norm": 0.56640625, "learning_rate": 0.00014295965596427342, "loss": 0.4778, "step": 3970 }, { "epoch": 4.232038318254391, "grad_norm": 0.4921875, "learning_rate": 0.00014279173639541453, "loss": 0.4688, "step": 3975 }, { "epoch": 4.237360298030867, "grad_norm": 0.64453125, "learning_rate": 0.0001426236689894896, "loss": 0.4725, "step": 3980 }, { "epoch": 4.2426822778073445, "grad_norm": 0.52734375, "learning_rate": 0.00014245545432713848, "loss": 0.4606, "step": 3985 }, { "epoch": 4.248004257583821, "grad_norm": 0.5390625, "learning_rate": 0.00014228709298950997, "loss": 0.4674, "step": 3990 }, { "epoch": 4.253326237360298, "grad_norm": 0.72265625, "learning_rate": 0.00014211858555825943, "loss": 0.472, "step": 3995 }, { "epoch": 4.258648217136775, "grad_norm": 0.69921875, "learning_rate": 0.0001419499326155471, "loss": 0.4665, "step": 4000 }, { "epoch": 4.2639701969132515, "grad_norm": 0.609375, "learning_rate": 0.00014178113474403578, "loss": 0.4955, "step": 4005 }, { "epoch": 4.269292176689729, "grad_norm": 0.578125, "learning_rate": 0.00014161219252688907, "loss": 0.4875, "step": 4010 }, { "epoch": 4.274614156466205, "grad_norm": 0.5234375, "learning_rate": 0.00014144310654776925, "loss": 0.4908, "step": 4015 }, { "epoch": 4.279936136242682, "grad_norm": 0.58203125, "learning_rate": 0.00014127387739083525, "loss": 0.4753, "step": 4020 }, { "epoch": 4.285258116019159, "grad_norm": 0.6640625, "learning_rate": 0.00014110450564074065, "loss": 0.4749, "step": 4025 }, { "epoch": 4.290580095795636, "grad_norm": 0.77734375, "learning_rate": 0.00014093499188263167, "loss": 0.5001, "step": 4030 }, { "epoch": 4.295902075572113, "grad_norm": 0.69921875, "learning_rate": 0.00014076533670214514, "loss": 0.5005, "step": 4035 }, { "epoch": 4.301224055348589, "grad_norm": 0.5625, "learning_rate": 0.00014059554068540645, "loss": 0.49, "step": 4040 }, { "epoch": 4.306546035125066, "grad_norm": 0.5625, "learning_rate": 0.0001404256044190276, "loss": 0.4607, "step": 4045 }, { "epoch": 4.311868014901544, "grad_norm": 0.6484375, "learning_rate": 0.00014025552849010508, "loss": 0.4621, "step": 4050 }, { "epoch": 4.31718999467802, "grad_norm": 0.6796875, "learning_rate": 0.00014008531348621789, "loss": 0.4774, "step": 4055 }, { "epoch": 4.322511974454497, "grad_norm": 0.71484375, "learning_rate": 0.0001399149599954256, "loss": 0.4622, "step": 4060 }, { "epoch": 4.327833954230974, "grad_norm": 0.76953125, "learning_rate": 0.00013974446860626604, "loss": 0.4715, "step": 4065 }, { "epoch": 4.333155934007451, "grad_norm": 0.64453125, "learning_rate": 0.0001395738399077536, "loss": 0.4846, "step": 4070 }, { "epoch": 4.338477913783928, "grad_norm": 0.7109375, "learning_rate": 0.00013940307448937706, "loss": 0.4937, "step": 4075 }, { "epoch": 4.343799893560404, "grad_norm": 0.59375, "learning_rate": 0.00013923217294109746, "loss": 0.4719, "step": 4080 }, { "epoch": 4.349121873336881, "grad_norm": 0.5546875, "learning_rate": 0.0001390611358533461, "loss": 0.4818, "step": 4085 }, { "epoch": 4.3544438531133585, "grad_norm": 0.65234375, "learning_rate": 0.00013888996381702267, "loss": 0.4665, "step": 4090 }, { "epoch": 4.359765832889835, "grad_norm": 0.6875, "learning_rate": 0.00013871865742349305, "loss": 0.4916, "step": 4095 }, { "epoch": 4.365087812666312, "grad_norm": 0.578125, "learning_rate": 0.00013854721726458725, "loss": 0.4621, "step": 4100 }, { "epoch": 4.370409792442789, "grad_norm": 0.60546875, "learning_rate": 0.00013837564393259736, "loss": 0.508, "step": 4105 }, { "epoch": 4.3757317722192655, "grad_norm": 0.69921875, "learning_rate": 0.0001382039380202757, "loss": 0.4731, "step": 4110 }, { "epoch": 4.381053751995743, "grad_norm": 0.62890625, "learning_rate": 0.0001380321001208326, "loss": 0.4841, "step": 4115 }, { "epoch": 4.386375731772219, "grad_norm": 0.55078125, "learning_rate": 0.00013786013082793422, "loss": 0.4635, "step": 4120 }, { "epoch": 4.391697711548696, "grad_norm": 0.51171875, "learning_rate": 0.0001376880307357009, "loss": 0.4678, "step": 4125 }, { "epoch": 4.397019691325173, "grad_norm": 0.76171875, "learning_rate": 0.00013751580043870466, "loss": 0.4966, "step": 4130 }, { "epoch": 4.40234167110165, "grad_norm": 0.640625, "learning_rate": 0.00013734344053196746, "loss": 0.4582, "step": 4135 }, { "epoch": 4.407663650878127, "grad_norm": 1.0234375, "learning_rate": 0.000137170951610959, "loss": 0.4699, "step": 4140 }, { "epoch": 4.412985630654603, "grad_norm": 0.66796875, "learning_rate": 0.00013699833427159474, "loss": 0.4967, "step": 4145 }, { "epoch": 4.41830761043108, "grad_norm": 0.7109375, "learning_rate": 0.00013682558911023375, "loss": 0.4699, "step": 4150 }, { "epoch": 4.4236295902075575, "grad_norm": 0.62109375, "learning_rate": 0.00013665271672367674, "loss": 0.4742, "step": 4155 }, { "epoch": 4.428951569984034, "grad_norm": 0.5390625, "learning_rate": 0.0001364797177091639, "loss": 0.4706, "step": 4160 }, { "epoch": 4.434273549760511, "grad_norm": 0.578125, "learning_rate": 0.00013630659266437295, "loss": 0.4903, "step": 4165 }, { "epoch": 4.439595529536987, "grad_norm": 0.61328125, "learning_rate": 0.00013613334218741705, "loss": 0.5098, "step": 4170 }, { "epoch": 4.4449175093134645, "grad_norm": 0.59375, "learning_rate": 0.0001359599668768426, "loss": 0.4781, "step": 4175 }, { "epoch": 4.450239489089942, "grad_norm": 0.5625, "learning_rate": 0.00013578646733162733, "loss": 0.5071, "step": 4180 }, { "epoch": 4.455561468866418, "grad_norm": 0.6328125, "learning_rate": 0.00013561284415117823, "loss": 0.4723, "step": 4185 }, { "epoch": 4.460883448642895, "grad_norm": 0.8828125, "learning_rate": 0.00013543909793532934, "loss": 0.4832, "step": 4190 }, { "epoch": 4.466205428419372, "grad_norm": 0.73828125, "learning_rate": 0.0001352652292843398, "loss": 0.4876, "step": 4195 }, { "epoch": 4.471527408195849, "grad_norm": 0.5859375, "learning_rate": 0.0001350912387988917, "loss": 0.4889, "step": 4200 }, { "epoch": 4.476849387972326, "grad_norm": 0.58984375, "learning_rate": 0.0001349171270800882, "loss": 0.4967, "step": 4205 }, { "epoch": 4.482171367748802, "grad_norm": 0.50390625, "learning_rate": 0.00013474289472945104, "loss": 0.5014, "step": 4210 }, { "epoch": 4.487493347525279, "grad_norm": 0.63671875, "learning_rate": 0.00013456854234891895, "loss": 0.5056, "step": 4215 }, { "epoch": 4.492815327301757, "grad_norm": 0.65234375, "learning_rate": 0.00013439407054084522, "loss": 0.4784, "step": 4220 }, { "epoch": 4.498137307078233, "grad_norm": 0.56640625, "learning_rate": 0.00013421947990799576, "loss": 0.4719, "step": 4225 }, { "epoch": 4.50345928685471, "grad_norm": 0.640625, "learning_rate": 0.00013404477105354699, "loss": 0.5116, "step": 4230 }, { "epoch": 4.508781266631187, "grad_norm": 0.625, "learning_rate": 0.0001338699445810838, "loss": 0.4809, "step": 4235 }, { "epoch": 4.514103246407664, "grad_norm": 0.57421875, "learning_rate": 0.0001336950010945974, "loss": 0.4653, "step": 4240 }, { "epoch": 4.519425226184141, "grad_norm": 0.55078125, "learning_rate": 0.00013351994119848327, "loss": 0.4739, "step": 4245 }, { "epoch": 4.524747205960617, "grad_norm": 0.6796875, "learning_rate": 0.000133344765497539, "loss": 0.4833, "step": 4250 }, { "epoch": 4.530069185737094, "grad_norm": 0.5390625, "learning_rate": 0.0001331694745969624, "loss": 0.5088, "step": 4255 }, { "epoch": 4.5353911655135715, "grad_norm": 0.5625, "learning_rate": 0.00013299406910234916, "loss": 0.4993, "step": 4260 }, { "epoch": 4.540713145290048, "grad_norm": 0.64453125, "learning_rate": 0.00013281854961969092, "loss": 0.4978, "step": 4265 }, { "epoch": 4.546035125066525, "grad_norm": 0.6015625, "learning_rate": 0.000132642916755373, "loss": 0.4695, "step": 4270 }, { "epoch": 4.551357104843001, "grad_norm": 0.51953125, "learning_rate": 0.0001324671711161727, "loss": 0.4945, "step": 4275 }, { "epoch": 4.5566790846194785, "grad_norm": 0.58203125, "learning_rate": 0.00013229131330925667, "loss": 0.5161, "step": 4280 }, { "epoch": 4.562001064395956, "grad_norm": 0.515625, "learning_rate": 0.0001321153439421792, "loss": 0.4903, "step": 4285 }, { "epoch": 4.567323044172432, "grad_norm": 0.6875, "learning_rate": 0.00013193926362287998, "loss": 0.4731, "step": 4290 }, { "epoch": 4.572645023948909, "grad_norm": 0.640625, "learning_rate": 0.00013176307295968207, "loss": 0.4992, "step": 4295 }, { "epoch": 4.5779670037253855, "grad_norm": 0.6171875, "learning_rate": 0.00013158677256128962, "loss": 0.4867, "step": 4300 }, { "epoch": 4.583288983501863, "grad_norm": 0.52734375, "learning_rate": 0.000131410363036786, "loss": 0.4847, "step": 4305 }, { "epoch": 4.58861096327834, "grad_norm": 0.77734375, "learning_rate": 0.00013123384499563154, "loss": 0.5157, "step": 4310 }, { "epoch": 4.593932943054816, "grad_norm": 0.51953125, "learning_rate": 0.00013105721904766154, "loss": 0.4867, "step": 4315 }, { "epoch": 4.599254922831293, "grad_norm": 0.515625, "learning_rate": 0.00013088048580308395, "loss": 0.4797, "step": 4320 }, { "epoch": 4.60457690260777, "grad_norm": 0.5703125, "learning_rate": 0.00013070364587247762, "loss": 0.4831, "step": 4325 }, { "epoch": 4.609898882384247, "grad_norm": 0.59765625, "learning_rate": 0.00013052669986678982, "loss": 0.4981, "step": 4330 }, { "epoch": 4.615220862160724, "grad_norm": 0.86328125, "learning_rate": 0.00013034964839733434, "loss": 0.492, "step": 4335 }, { "epoch": 4.6205428419372, "grad_norm": 0.5546875, "learning_rate": 0.0001301724920757893, "loss": 0.5086, "step": 4340 }, { "epoch": 4.6258648217136775, "grad_norm": 0.50390625, "learning_rate": 0.0001299952315141951, "loss": 0.4565, "step": 4345 }, { "epoch": 4.631186801490155, "grad_norm": 0.58203125, "learning_rate": 0.00012981786732495224, "loss": 0.5395, "step": 4350 }, { "epoch": 4.636508781266631, "grad_norm": 0.546875, "learning_rate": 0.0001296404001208193, "loss": 0.4897, "step": 4355 }, { "epoch": 4.641830761043108, "grad_norm": 0.55859375, "learning_rate": 0.00012946283051491067, "loss": 0.5144, "step": 4360 }, { "epoch": 4.6471527408195845, "grad_norm": 0.76953125, "learning_rate": 0.0001292851591206945, "loss": 0.473, "step": 4365 }, { "epoch": 4.652474720596062, "grad_norm": 1.0, "learning_rate": 0.00012910738655199072, "loss": 0.4786, "step": 4370 }, { "epoch": 4.657796700372539, "grad_norm": 0.8359375, "learning_rate": 0.0001289295134229687, "loss": 0.5002, "step": 4375 }, { "epoch": 4.663118680149015, "grad_norm": 0.6484375, "learning_rate": 0.00012875154034814524, "loss": 0.5096, "step": 4380 }, { "epoch": 4.668440659925492, "grad_norm": 0.62109375, "learning_rate": 0.0001285734679423825, "loss": 0.489, "step": 4385 }, { "epoch": 4.67376263970197, "grad_norm": 0.921875, "learning_rate": 0.0001283952968208857, "loss": 0.5028, "step": 4390 }, { "epoch": 4.679084619478446, "grad_norm": 1.046875, "learning_rate": 0.0001282170275992012, "loss": 0.5051, "step": 4395 }, { "epoch": 4.684406599254923, "grad_norm": 0.7578125, "learning_rate": 0.0001280386608932142, "loss": 0.512, "step": 4400 }, { "epoch": 4.689728579031399, "grad_norm": 0.58203125, "learning_rate": 0.00012786019731914677, "loss": 0.4749, "step": 4405 }, { "epoch": 4.695050558807877, "grad_norm": 0.58203125, "learning_rate": 0.00012768163749355554, "loss": 0.4909, "step": 4410 }, { "epoch": 4.700372538584354, "grad_norm": 0.62109375, "learning_rate": 0.0001275029820333298, "loss": 0.4688, "step": 4415 }, { "epoch": 4.70569451836083, "grad_norm": 0.53125, "learning_rate": 0.0001273242315556891, "loss": 0.4894, "step": 4420 }, { "epoch": 4.711016498137307, "grad_norm": 0.6484375, "learning_rate": 0.00012714538667818134, "loss": 0.4596, "step": 4425 }, { "epoch": 4.716338477913784, "grad_norm": 0.55078125, "learning_rate": 0.00012696644801868053, "loss": 0.5157, "step": 4430 }, { "epoch": 4.721660457690261, "grad_norm": 0.53515625, "learning_rate": 0.00012678741619538475, "loss": 0.4925, "step": 4435 }, { "epoch": 4.726982437466738, "grad_norm": 0.5625, "learning_rate": 0.00012660829182681378, "loss": 0.4697, "step": 4440 }, { "epoch": 4.732304417243214, "grad_norm": 0.69921875, "learning_rate": 0.00012642907553180724, "loss": 0.4802, "step": 4445 }, { "epoch": 4.7376263970196915, "grad_norm": 0.498046875, "learning_rate": 0.0001262497679295224, "loss": 0.5064, "step": 4450 }, { "epoch": 4.742948376796168, "grad_norm": 0.62890625, "learning_rate": 0.0001260703696394318, "loss": 0.473, "step": 4455 }, { "epoch": 4.748270356572645, "grad_norm": 0.609375, "learning_rate": 0.00012589088128132144, "loss": 0.5124, "step": 4460 }, { "epoch": 4.753592336349122, "grad_norm": 0.5546875, "learning_rate": 0.00012571130347528844, "loss": 0.5019, "step": 4465 }, { "epoch": 4.7589143161255985, "grad_norm": 0.55859375, "learning_rate": 0.0001255316368417389, "loss": 0.4592, "step": 4470 }, { "epoch": 4.764236295902076, "grad_norm": 0.90625, "learning_rate": 0.00012535188200138588, "loss": 0.501, "step": 4475 }, { "epoch": 4.769558275678552, "grad_norm": 0.76953125, "learning_rate": 0.00012517203957524713, "loss": 0.4961, "step": 4480 }, { "epoch": 4.774880255455029, "grad_norm": 0.63671875, "learning_rate": 0.00012499211018464297, "loss": 0.4957, "step": 4485 }, { "epoch": 4.780202235231506, "grad_norm": 0.6171875, "learning_rate": 0.0001248120944511942, "loss": 0.4839, "step": 4490 }, { "epoch": 4.785524215007983, "grad_norm": 0.5625, "learning_rate": 0.00012463199299681996, "loss": 0.498, "step": 4495 }, { "epoch": 4.79084619478446, "grad_norm": 0.55859375, "learning_rate": 0.0001244518064437355, "loss": 0.4709, "step": 4500 }, { "epoch": 4.796168174560937, "grad_norm": 0.6015625, "learning_rate": 0.00012427153541444996, "loss": 0.4881, "step": 4505 }, { "epoch": 4.801490154337413, "grad_norm": 0.625, "learning_rate": 0.0001240911805317645, "loss": 0.4899, "step": 4510 }, { "epoch": 4.8068121341138905, "grad_norm": 0.59765625, "learning_rate": 0.00012391074241876993, "loss": 0.514, "step": 4515 }, { "epoch": 4.812134113890368, "grad_norm": 0.546875, "learning_rate": 0.00012373022169884454, "loss": 0.5027, "step": 4520 }, { "epoch": 4.817456093666844, "grad_norm": 0.60546875, "learning_rate": 0.00012354961899565205, "loss": 0.4775, "step": 4525 }, { "epoch": 4.822778073443321, "grad_norm": 0.57421875, "learning_rate": 0.00012336893493313945, "loss": 0.4986, "step": 4530 }, { "epoch": 4.8281000532197975, "grad_norm": 0.54296875, "learning_rate": 0.00012318817013553477, "loss": 0.4728, "step": 4535 }, { "epoch": 4.833422032996275, "grad_norm": 0.57421875, "learning_rate": 0.00012300732522734498, "loss": 0.4797, "step": 4540 }, { "epoch": 4.838744012772752, "grad_norm": 0.6484375, "learning_rate": 0.00012282640083335374, "loss": 0.5078, "step": 4545 }, { "epoch": 4.844065992549228, "grad_norm": 0.60546875, "learning_rate": 0.00012264539757861947, "loss": 0.4934, "step": 4550 }, { "epoch": 4.849387972325705, "grad_norm": 0.5703125, "learning_rate": 0.00012246431608847294, "loss": 0.4861, "step": 4555 }, { "epoch": 4.854709952102182, "grad_norm": 0.70703125, "learning_rate": 0.0001222831569885152, "loss": 0.5071, "step": 4560 }, { "epoch": 4.860031931878659, "grad_norm": 0.703125, "learning_rate": 0.00012210192090461545, "loss": 0.4731, "step": 4565 }, { "epoch": 4.865353911655136, "grad_norm": 0.734375, "learning_rate": 0.0001219206084629089, "loss": 0.5144, "step": 4570 }, { "epoch": 4.870675891431612, "grad_norm": 0.69921875, "learning_rate": 0.00012173922028979443, "loss": 0.4964, "step": 4575 }, { "epoch": 4.87599787120809, "grad_norm": 0.6171875, "learning_rate": 0.00012155775701193276, "loss": 0.5059, "step": 4580 }, { "epoch": 4.881319850984566, "grad_norm": 0.7890625, "learning_rate": 0.00012137621925624383, "loss": 0.4933, "step": 4585 }, { "epoch": 4.886641830761043, "grad_norm": 0.5625, "learning_rate": 0.00012119460764990513, "loss": 0.4985, "step": 4590 }, { "epoch": 4.89196381053752, "grad_norm": 0.953125, "learning_rate": 0.00012101292282034915, "loss": 0.4946, "step": 4595 }, { "epoch": 4.897285790313997, "grad_norm": 0.61328125, "learning_rate": 0.00012083116539526135, "loss": 0.5139, "step": 4600 }, { "epoch": 4.902607770090474, "grad_norm": 1.0234375, "learning_rate": 0.00012064933600257806, "loss": 0.4795, "step": 4605 }, { "epoch": 4.90792974986695, "grad_norm": 0.64453125, "learning_rate": 0.00012046743527048423, "loss": 0.4912, "step": 4610 }, { "epoch": 4.913251729643427, "grad_norm": 0.66015625, "learning_rate": 0.00012028546382741119, "loss": 0.4901, "step": 4615 }, { "epoch": 4.9185737094199045, "grad_norm": 0.62109375, "learning_rate": 0.0001201034223020347, "loss": 0.4644, "step": 4620 }, { "epoch": 4.923895689196381, "grad_norm": 0.55078125, "learning_rate": 0.00011992131132327249, "loss": 0.4988, "step": 4625 }, { "epoch": 4.929217668972858, "grad_norm": 0.5390625, "learning_rate": 0.0001197391315202824, "loss": 0.4882, "step": 4630 }, { "epoch": 4.934539648749334, "grad_norm": 0.51171875, "learning_rate": 0.00011955688352245988, "loss": 0.4956, "step": 4635 }, { "epoch": 4.9398616285258115, "grad_norm": 0.58203125, "learning_rate": 0.00011937456795943611, "loss": 0.4761, "step": 4640 }, { "epoch": 4.945183608302289, "grad_norm": 0.61328125, "learning_rate": 0.0001191921854610756, "loss": 0.5084, "step": 4645 }, { "epoch": 4.950505588078765, "grad_norm": 0.60546875, "learning_rate": 0.00011900973665747424, "loss": 0.5138, "step": 4650 }, { "epoch": 4.955827567855242, "grad_norm": 0.71484375, "learning_rate": 0.00011882722217895678, "loss": 0.4896, "step": 4655 }, { "epoch": 4.961149547631719, "grad_norm": 0.56640625, "learning_rate": 0.00011864464265607507, "loss": 0.5275, "step": 4660 }, { "epoch": 4.966471527408196, "grad_norm": 0.63671875, "learning_rate": 0.00011846199871960557, "loss": 0.5084, "step": 4665 }, { "epoch": 4.971793507184673, "grad_norm": 0.57421875, "learning_rate": 0.00011827929100054733, "loss": 0.4898, "step": 4670 }, { "epoch": 4.97711548696115, "grad_norm": 0.54296875, "learning_rate": 0.0001180965201301197, "loss": 0.4784, "step": 4675 }, { "epoch": 4.982437466737626, "grad_norm": 0.609375, "learning_rate": 0.0001179136867397602, "loss": 0.4938, "step": 4680 }, { "epoch": 4.9877594465141035, "grad_norm": 0.53125, "learning_rate": 0.00011773079146112243, "loss": 0.4884, "step": 4685 }, { "epoch": 4.99308142629058, "grad_norm": 0.54296875, "learning_rate": 0.00011754783492607373, "loss": 0.5033, "step": 4690 }, { "epoch": 4.998403406067057, "grad_norm": 0.7578125, "learning_rate": 0.00011736481776669306, "loss": 0.4937, "step": 4695 }, { "epoch": 4.999467802022353, "eval_loss": 2.368579387664795, "eval_runtime": 11.9313, "eval_samples_per_second": 6.202, "eval_steps_per_second": 0.587, "step": 4696 }, { "epoch": 5.003725385843534, "grad_norm": 0.6875, "learning_rate": 0.00011718174061526885, "loss": 0.4493, "step": 4700 }, { "epoch": 5.0090473656200105, "grad_norm": 0.63671875, "learning_rate": 0.00011699860410429684, "loss": 0.3709, "step": 4705 }, { "epoch": 5.014369345396488, "grad_norm": 0.5859375, "learning_rate": 0.0001168154088664778, "loss": 0.3812, "step": 4710 }, { "epoch": 5.019691325172964, "grad_norm": 0.671875, "learning_rate": 0.00011663215553471532, "loss": 0.386, "step": 4715 }, { "epoch": 5.025013304949441, "grad_norm": 0.50390625, "learning_rate": 0.00011644884474211382, "loss": 0.3661, "step": 4720 }, { "epoch": 5.030335284725918, "grad_norm": 0.5625, "learning_rate": 0.00011626547712197615, "loss": 0.3877, "step": 4725 }, { "epoch": 5.035657264502395, "grad_norm": 0.6484375, "learning_rate": 0.00011608205330780154, "loss": 0.3784, "step": 4730 }, { "epoch": 5.040979244278872, "grad_norm": 0.68359375, "learning_rate": 0.00011589857393328327, "loss": 0.3836, "step": 4735 }, { "epoch": 5.046301224055348, "grad_norm": 0.58203125, "learning_rate": 0.00011571503963230674, "loss": 0.373, "step": 4740 }, { "epoch": 5.051623203831825, "grad_norm": 0.62890625, "learning_rate": 0.00011553145103894692, "loss": 0.3883, "step": 4745 }, { "epoch": 5.056945183608303, "grad_norm": 0.62890625, "learning_rate": 0.00011534780878746648, "loss": 0.3968, "step": 4750 }, { "epoch": 5.062267163384779, "grad_norm": 0.6171875, "learning_rate": 0.00011516411351231333, "loss": 0.3924, "step": 4755 }, { "epoch": 5.067589143161256, "grad_norm": 0.671875, "learning_rate": 0.0001149803658481188, "loss": 0.3828, "step": 4760 }, { "epoch": 5.072911122937732, "grad_norm": 0.57421875, "learning_rate": 0.00011479656642969498, "loss": 0.3807, "step": 4765 }, { "epoch": 5.07823310271421, "grad_norm": 0.84375, "learning_rate": 0.00011461271589203289, "loss": 0.3835, "step": 4770 }, { "epoch": 5.083555082490687, "grad_norm": 0.69140625, "learning_rate": 0.00011442881487030008, "loss": 0.4069, "step": 4775 }, { "epoch": 5.088877062267163, "grad_norm": 0.7578125, "learning_rate": 0.0001142448639998386, "loss": 0.3959, "step": 4780 }, { "epoch": 5.09419904204364, "grad_norm": 0.5390625, "learning_rate": 0.00011406086391616264, "loss": 0.408, "step": 4785 }, { "epoch": 5.0995210218201175, "grad_norm": 0.6875, "learning_rate": 0.00011387681525495646, "loss": 0.4052, "step": 4790 }, { "epoch": 5.104843001596594, "grad_norm": 0.578125, "learning_rate": 0.00011369271865207215, "loss": 0.3844, "step": 4795 }, { "epoch": 5.110164981373071, "grad_norm": 0.640625, "learning_rate": 0.00011350857474352735, "loss": 0.4077, "step": 4800 }, { "epoch": 5.115486961149547, "grad_norm": 0.515625, "learning_rate": 0.00011332438416550319, "loss": 0.3898, "step": 4805 }, { "epoch": 5.1208089409260245, "grad_norm": 0.8046875, "learning_rate": 0.00011314014755434208, "loss": 0.4156, "step": 4810 }, { "epoch": 5.126130920702502, "grad_norm": 0.64453125, "learning_rate": 0.00011295586554654543, "loss": 0.3846, "step": 4815 }, { "epoch": 5.131452900478978, "grad_norm": 0.55078125, "learning_rate": 0.0001127715387787714, "loss": 0.3868, "step": 4820 }, { "epoch": 5.136774880255455, "grad_norm": 0.71484375, "learning_rate": 0.00011258716788783292, "loss": 0.403, "step": 4825 }, { "epoch": 5.1420968600319314, "grad_norm": 0.66015625, "learning_rate": 0.00011240275351069529, "loss": 0.3787, "step": 4830 }, { "epoch": 5.147418839808409, "grad_norm": 0.7265625, "learning_rate": 0.00011221829628447406, "loss": 0.4079, "step": 4835 }, { "epoch": 5.152740819584886, "grad_norm": 0.65625, "learning_rate": 0.00011203379684643278, "loss": 0.3777, "step": 4840 }, { "epoch": 5.158062799361362, "grad_norm": 0.60546875, "learning_rate": 0.00011184925583398089, "loss": 0.3996, "step": 4845 }, { "epoch": 5.163384779137839, "grad_norm": 0.66015625, "learning_rate": 0.00011166467388467143, "loss": 0.3865, "step": 4850 }, { "epoch": 5.1687067589143165, "grad_norm": 0.67578125, "learning_rate": 0.00011148005163619891, "loss": 0.3798, "step": 4855 }, { "epoch": 5.174028738690793, "grad_norm": 0.62890625, "learning_rate": 0.00011129538972639694, "loss": 0.4037, "step": 4860 }, { "epoch": 5.17935071846727, "grad_norm": 0.7109375, "learning_rate": 0.00011111068879323631, "loss": 0.3932, "step": 4865 }, { "epoch": 5.184672698243746, "grad_norm": 0.55859375, "learning_rate": 0.00011092594947482255, "loss": 0.3989, "step": 4870 }, { "epoch": 5.1899946780202235, "grad_norm": 0.65234375, "learning_rate": 0.00011074117240939384, "loss": 0.3951, "step": 4875 }, { "epoch": 5.195316657796701, "grad_norm": 0.62890625, "learning_rate": 0.00011055635823531866, "loss": 0.3944, "step": 4880 }, { "epoch": 5.200638637573177, "grad_norm": 0.5390625, "learning_rate": 0.00011037150759109385, "loss": 0.4103, "step": 4885 }, { "epoch": 5.205960617349654, "grad_norm": 0.68359375, "learning_rate": 0.00011018662111534215, "loss": 0.3872, "step": 4890 }, { "epoch": 5.2112825971261305, "grad_norm": 0.58203125, "learning_rate": 0.00011000169944681014, "loss": 0.3797, "step": 4895 }, { "epoch": 5.216604576902608, "grad_norm": 0.546875, "learning_rate": 0.0001098167432243659, "loss": 0.3864, "step": 4900 }, { "epoch": 5.221926556679085, "grad_norm": 0.77734375, "learning_rate": 0.00010963175308699699, "loss": 0.3999, "step": 4905 }, { "epoch": 5.227248536455561, "grad_norm": 0.67578125, "learning_rate": 0.00010944672967380805, "loss": 0.4062, "step": 4910 }, { "epoch": 5.232570516232038, "grad_norm": 0.5625, "learning_rate": 0.00010926167362401879, "loss": 0.3931, "step": 4915 }, { "epoch": 5.237892496008516, "grad_norm": 0.7265625, "learning_rate": 0.00010907658557696152, "loss": 0.3983, "step": 4920 }, { "epoch": 5.243214475784992, "grad_norm": 0.73828125, "learning_rate": 0.0001088914661720793, "loss": 0.422, "step": 4925 }, { "epoch": 5.248536455561469, "grad_norm": 0.55078125, "learning_rate": 0.00010870631604892332, "loss": 0.3952, "step": 4930 }, { "epoch": 5.253858435337945, "grad_norm": 0.765625, "learning_rate": 0.00010852113584715102, "loss": 0.4116, "step": 4935 }, { "epoch": 5.259180415114423, "grad_norm": 0.61328125, "learning_rate": 0.00010833592620652367, "loss": 0.4049, "step": 4940 }, { "epoch": 5.2645023948909, "grad_norm": 0.60546875, "learning_rate": 0.00010815068776690435, "loss": 0.395, "step": 4945 }, { "epoch": 5.269824374667376, "grad_norm": 0.66015625, "learning_rate": 0.00010796542116825556, "loss": 0.403, "step": 4950 }, { "epoch": 5.275146354443853, "grad_norm": 0.6875, "learning_rate": 0.00010778012705063714, "loss": 0.4208, "step": 4955 }, { "epoch": 5.28046833422033, "grad_norm": 0.6484375, "learning_rate": 0.00010759480605420387, "loss": 0.4079, "step": 4960 }, { "epoch": 5.285790313996807, "grad_norm": 0.5390625, "learning_rate": 0.00010740945881920359, "loss": 0.3896, "step": 4965 }, { "epoch": 5.291112293773284, "grad_norm": 0.5546875, "learning_rate": 0.00010722408598597465, "loss": 0.3975, "step": 4970 }, { "epoch": 5.29643427354976, "grad_norm": 0.625, "learning_rate": 0.00010703868819494382, "loss": 0.3938, "step": 4975 }, { "epoch": 5.3017562533262375, "grad_norm": 0.6015625, "learning_rate": 0.00010685326608662423, "loss": 0.4127, "step": 4980 }, { "epoch": 5.307078233102715, "grad_norm": 0.5546875, "learning_rate": 0.00010666782030161293, "loss": 0.3876, "step": 4985 }, { "epoch": 5.312400212879191, "grad_norm": 0.57421875, "learning_rate": 0.00010648235148058872, "loss": 0.4126, "step": 4990 }, { "epoch": 5.317722192655668, "grad_norm": 0.671875, "learning_rate": 0.00010629686026431005, "loss": 0.3988, "step": 4995 }, { "epoch": 5.3230441724321444, "grad_norm": 0.59765625, "learning_rate": 0.00010611134729361279, "loss": 0.3916, "step": 5000 }, { "epoch": 5.328366152208622, "grad_norm": 0.6953125, "learning_rate": 0.0001059258132094079, "loss": 0.3936, "step": 5005 }, { "epoch": 5.333688131985099, "grad_norm": 0.640625, "learning_rate": 0.00010574025865267923, "loss": 0.4156, "step": 5010 }, { "epoch": 5.339010111761575, "grad_norm": 0.6640625, "learning_rate": 0.00010555468426448147, "loss": 0.4154, "step": 5015 }, { "epoch": 5.344332091538052, "grad_norm": 0.671875, "learning_rate": 0.00010536909068593776, "loss": 0.4029, "step": 5020 }, { "epoch": 5.349654071314529, "grad_norm": 0.6171875, "learning_rate": 0.0001051834785582376, "loss": 0.4131, "step": 5025 }, { "epoch": 5.354976051091006, "grad_norm": 0.7109375, "learning_rate": 0.00010499784852263448, "loss": 0.4088, "step": 5030 }, { "epoch": 5.360298030867483, "grad_norm": 0.578125, "learning_rate": 0.00010481220122044382, "loss": 0.4068, "step": 5035 }, { "epoch": 5.365620010643959, "grad_norm": 0.63671875, "learning_rate": 0.00010462653729304066, "loss": 0.4123, "step": 5040 }, { "epoch": 5.3709419904204365, "grad_norm": 0.5625, "learning_rate": 0.00010444085738185756, "loss": 0.3901, "step": 5045 }, { "epoch": 5.376263970196913, "grad_norm": 0.546875, "learning_rate": 0.00010425516212838213, "loss": 0.4129, "step": 5050 }, { "epoch": 5.38158594997339, "grad_norm": 0.91015625, "learning_rate": 0.00010406945217415522, "loss": 0.407, "step": 5055 }, { "epoch": 5.386907929749867, "grad_norm": 0.609375, "learning_rate": 0.00010388372816076826, "loss": 0.3895, "step": 5060 }, { "epoch": 5.3922299095263435, "grad_norm": 0.64453125, "learning_rate": 0.00010369799072986134, "loss": 0.4018, "step": 5065 }, { "epoch": 5.397551889302821, "grad_norm": 0.65625, "learning_rate": 0.00010351224052312089, "loss": 0.3991, "step": 5070 }, { "epoch": 5.402873869079298, "grad_norm": 0.625, "learning_rate": 0.00010332647818227754, "loss": 0.3984, "step": 5075 }, { "epoch": 5.408195848855774, "grad_norm": 0.69140625, "learning_rate": 0.00010314070434910372, "loss": 0.4086, "step": 5080 }, { "epoch": 5.413517828632251, "grad_norm": 0.55859375, "learning_rate": 0.00010295491966541169, "loss": 0.3859, "step": 5085 }, { "epoch": 5.418839808408728, "grad_norm": 0.5546875, "learning_rate": 0.00010276912477305105, "loss": 0.4041, "step": 5090 }, { "epoch": 5.424161788185205, "grad_norm": 0.58984375, "learning_rate": 0.00010258332031390683, "loss": 0.4089, "step": 5095 }, { "epoch": 5.429483767961682, "grad_norm": 0.62890625, "learning_rate": 0.00010239750692989703, "loss": 0.4134, "step": 5100 }, { "epoch": 5.434805747738158, "grad_norm": 0.61328125, "learning_rate": 0.00010221168526297045, "loss": 0.4026, "step": 5105 }, { "epoch": 5.440127727514636, "grad_norm": 0.61328125, "learning_rate": 0.00010202585595510454, "loss": 0.3891, "step": 5110 }, { "epoch": 5.445449707291112, "grad_norm": 0.62890625, "learning_rate": 0.00010184001964830324, "loss": 0.4012, "step": 5115 }, { "epoch": 5.450771687067589, "grad_norm": 0.62890625, "learning_rate": 0.0001016541769845945, "loss": 0.4125, "step": 5120 }, { "epoch": 5.456093666844066, "grad_norm": 0.73828125, "learning_rate": 0.00010146832860602833, "loss": 0.4073, "step": 5125 }, { "epoch": 5.461415646620543, "grad_norm": 0.734375, "learning_rate": 0.00010128247515467449, "loss": 0.4296, "step": 5130 }, { "epoch": 5.46673762639702, "grad_norm": 0.73046875, "learning_rate": 0.00010109661727262025, "loss": 0.4213, "step": 5135 }, { "epoch": 5.472059606173497, "grad_norm": 0.6171875, "learning_rate": 0.00010091075560196814, "loss": 0.386, "step": 5140 }, { "epoch": 5.477381585949973, "grad_norm": 0.54296875, "learning_rate": 0.00010072489078483384, "loss": 0.3849, "step": 5145 }, { "epoch": 5.4827035657264505, "grad_norm": 0.58203125, "learning_rate": 0.00010053902346334392, "loss": 0.3922, "step": 5150 }, { "epoch": 5.488025545502927, "grad_norm": 0.6484375, "learning_rate": 0.00010035315427963355, "loss": 0.4111, "step": 5155 }, { "epoch": 5.493347525279404, "grad_norm": 0.60546875, "learning_rate": 0.00010016728387584429, "loss": 0.4014, "step": 5160 }, { "epoch": 5.498669505055881, "grad_norm": 0.69921875, "learning_rate": 9.9981412894122e-05, "loss": 0.3995, "step": 5165 }, { "epoch": 5.5039914848323575, "grad_norm": 0.578125, "learning_rate": 9.979554197661456e-05, "loss": 0.4009, "step": 5170 }, { "epoch": 5.509313464608835, "grad_norm": 0.67578125, "learning_rate": 9.96096717654695e-05, "loss": 0.391, "step": 5175 }, { "epoch": 5.514635444385311, "grad_norm": 0.69921875, "learning_rate": 9.942380290283204e-05, "loss": 0.3896, "step": 5180 }, { "epoch": 5.519957424161788, "grad_norm": 0.65234375, "learning_rate": 9.923793603084268e-05, "loss": 0.4054, "step": 5185 }, { "epoch": 5.525279403938265, "grad_norm": 0.5703125, "learning_rate": 9.9052071791635e-05, "loss": 0.4015, "step": 5190 }, { "epoch": 5.530601383714742, "grad_norm": 0.62109375, "learning_rate": 9.88662108273336e-05, "loss": 0.388, "step": 5195 }, { "epoch": 5.535923363491219, "grad_norm": 0.6484375, "learning_rate": 9.86803537800517e-05, "loss": 0.3878, "step": 5200 }, { "epoch": 5.541245343267695, "grad_norm": 0.78515625, "learning_rate": 9.849450129188894e-05, "loss": 0.4179, "step": 5205 }, { "epoch": 5.546567323044172, "grad_norm": 0.61328125, "learning_rate": 9.830865400492928e-05, "loss": 0.4017, "step": 5210 }, { "epoch": 5.5518893028206495, "grad_norm": 0.6796875, "learning_rate": 9.81228125612387e-05, "loss": 0.3996, "step": 5215 }, { "epoch": 5.557211282597126, "grad_norm": 0.9296875, "learning_rate": 9.793697760286297e-05, "loss": 0.4082, "step": 5220 }, { "epoch": 5.562533262373603, "grad_norm": 0.6484375, "learning_rate": 9.775114977182547e-05, "loss": 0.3902, "step": 5225 }, { "epoch": 5.56785524215008, "grad_norm": 0.59765625, "learning_rate": 9.75653297101249e-05, "loss": 0.4093, "step": 5230 }, { "epoch": 5.5731772219265565, "grad_norm": 0.5859375, "learning_rate": 9.737951805973325e-05, "loss": 0.4048, "step": 5235 }, { "epoch": 5.578499201703034, "grad_norm": 0.71484375, "learning_rate": 9.719371546259335e-05, "loss": 0.4012, "step": 5240 }, { "epoch": 5.58382118147951, "grad_norm": 0.7578125, "learning_rate": 9.700792256061674e-05, "loss": 0.4275, "step": 5245 }, { "epoch": 5.589143161255987, "grad_norm": 0.56640625, "learning_rate": 9.682213999568151e-05, "loss": 0.4143, "step": 5250 }, { "epoch": 5.594465141032464, "grad_norm": 0.64453125, "learning_rate": 9.663636840963007e-05, "loss": 0.4076, "step": 5255 }, { "epoch": 5.599787120808941, "grad_norm": 0.66796875, "learning_rate": 9.645060844426678e-05, "loss": 0.4078, "step": 5260 }, { "epoch": 5.605109100585418, "grad_norm": 0.7734375, "learning_rate": 9.626486074135597e-05, "loss": 0.4124, "step": 5265 }, { "epoch": 5.610431080361895, "grad_norm": 0.6953125, "learning_rate": 9.607912594261961e-05, "loss": 0.4222, "step": 5270 }, { "epoch": 5.615753060138371, "grad_norm": 0.74609375, "learning_rate": 9.589340468973495e-05, "loss": 0.4042, "step": 5275 }, { "epoch": 5.621075039914849, "grad_norm": 0.6171875, "learning_rate": 9.570769762433262e-05, "loss": 0.3972, "step": 5280 }, { "epoch": 5.626397019691325, "grad_norm": 0.6015625, "learning_rate": 9.552200538799409e-05, "loss": 0.4004, "step": 5285 }, { "epoch": 5.631718999467802, "grad_norm": 0.62109375, "learning_rate": 9.533632862224968e-05, "loss": 0.4096, "step": 5290 }, { "epoch": 5.637040979244279, "grad_norm": 0.796875, "learning_rate": 9.515066796857624e-05, "loss": 0.423, "step": 5295 }, { "epoch": 5.642362959020756, "grad_norm": 0.6796875, "learning_rate": 9.496502406839496e-05, "loss": 0.398, "step": 5300 }, { "epoch": 5.647684938797233, "grad_norm": 0.62109375, "learning_rate": 9.477939756306912e-05, "loss": 0.4106, "step": 5305 }, { "epoch": 5.653006918573709, "grad_norm": 0.80078125, "learning_rate": 9.4593789093902e-05, "loss": 0.4045, "step": 5310 }, { "epoch": 5.658328898350186, "grad_norm": 0.61328125, "learning_rate": 9.440819930213442e-05, "loss": 0.3792, "step": 5315 }, { "epoch": 5.6636508781266635, "grad_norm": 0.60546875, "learning_rate": 9.422262882894278e-05, "loss": 0.3921, "step": 5320 }, { "epoch": 5.66897285790314, "grad_norm": 0.62890625, "learning_rate": 9.403707831543674e-05, "loss": 0.4052, "step": 5325 }, { "epoch": 5.674294837679617, "grad_norm": 0.6640625, "learning_rate": 9.385154840265694e-05, "loss": 0.4192, "step": 5330 }, { "epoch": 5.679616817456093, "grad_norm": 0.6171875, "learning_rate": 9.366603973157291e-05, "loss": 0.3968, "step": 5335 }, { "epoch": 5.6849387972325705, "grad_norm": 0.6484375, "learning_rate": 9.348055294308073e-05, "loss": 0.4315, "step": 5340 }, { "epoch": 5.690260777009048, "grad_norm": 0.70703125, "learning_rate": 9.32950886780009e-05, "loss": 0.4133, "step": 5345 }, { "epoch": 5.695582756785524, "grad_norm": 0.67578125, "learning_rate": 9.310964757707624e-05, "loss": 0.4155, "step": 5350 }, { "epoch": 5.700904736562001, "grad_norm": 0.6875, "learning_rate": 9.292423028096927e-05, "loss": 0.3969, "step": 5355 }, { "epoch": 5.7062267163384774, "grad_norm": 0.67578125, "learning_rate": 9.273883743026053e-05, "loss": 0.4141, "step": 5360 }, { "epoch": 5.711548696114955, "grad_norm": 0.7578125, "learning_rate": 9.255346966544597e-05, "loss": 0.3907, "step": 5365 }, { "epoch": 5.716870675891432, "grad_norm": 0.640625, "learning_rate": 9.236812762693488e-05, "loss": 0.4138, "step": 5370 }, { "epoch": 5.722192655667908, "grad_norm": 0.625, "learning_rate": 9.21828119550477e-05, "loss": 0.4092, "step": 5375 }, { "epoch": 5.727514635444385, "grad_norm": 0.625, "learning_rate": 9.199752329001383e-05, "loss": 0.4125, "step": 5380 }, { "epoch": 5.7328366152208625, "grad_norm": 0.625, "learning_rate": 9.181226227196922e-05, "loss": 0.414, "step": 5385 }, { "epoch": 5.738158594997339, "grad_norm": 0.58203125, "learning_rate": 9.162702954095448e-05, "loss": 0.4073, "step": 5390 }, { "epoch": 5.743480574773816, "grad_norm": 0.6484375, "learning_rate": 9.144182573691231e-05, "loss": 0.4141, "step": 5395 }, { "epoch": 5.748802554550292, "grad_norm": 0.5546875, "learning_rate": 9.125665149968566e-05, "loss": 0.3844, "step": 5400 }, { "epoch": 5.7541245343267695, "grad_norm": 0.59375, "learning_rate": 9.107150746901521e-05, "loss": 0.3961, "step": 5405 }, { "epoch": 5.759446514103247, "grad_norm": 0.734375, "learning_rate": 9.08863942845373e-05, "loss": 0.4253, "step": 5410 }, { "epoch": 5.764768493879723, "grad_norm": 0.56640625, "learning_rate": 9.070131258578169e-05, "loss": 0.4014, "step": 5415 }, { "epoch": 5.7700904736562, "grad_norm": 0.65234375, "learning_rate": 9.051626301216951e-05, "loss": 0.4292, "step": 5420 }, { "epoch": 5.775412453432677, "grad_norm": 0.62109375, "learning_rate": 9.033124620301063e-05, "loss": 0.4144, "step": 5425 }, { "epoch": 5.780734433209154, "grad_norm": 0.61328125, "learning_rate": 9.014626279750204e-05, "loss": 0.4018, "step": 5430 }, { "epoch": 5.786056412985631, "grad_norm": 0.7421875, "learning_rate": 8.996131343472501e-05, "loss": 0.4314, "step": 5435 }, { "epoch": 5.791378392762107, "grad_norm": 0.71484375, "learning_rate": 8.977639875364344e-05, "loss": 0.4078, "step": 5440 }, { "epoch": 5.796700372538584, "grad_norm": 0.65625, "learning_rate": 8.959151939310134e-05, "loss": 0.3866, "step": 5445 }, { "epoch": 5.802022352315062, "grad_norm": 0.64453125, "learning_rate": 8.940667599182063e-05, "loss": 0.4044, "step": 5450 }, { "epoch": 5.807344332091538, "grad_norm": 0.6171875, "learning_rate": 8.922186918839906e-05, "loss": 0.4243, "step": 5455 }, { "epoch": 5.812666311868015, "grad_norm": 0.69921875, "learning_rate": 8.903709962130802e-05, "loss": 0.4117, "step": 5460 }, { "epoch": 5.817988291644491, "grad_norm": 0.625, "learning_rate": 8.885236792889003e-05, "loss": 0.3994, "step": 5465 }, { "epoch": 5.823310271420969, "grad_norm": 0.66796875, "learning_rate": 8.866767474935697e-05, "loss": 0.4, "step": 5470 }, { "epoch": 5.828632251197446, "grad_norm": 0.671875, "learning_rate": 8.848302072078761e-05, "loss": 0.4109, "step": 5475 }, { "epoch": 5.833954230973922, "grad_norm": 0.6171875, "learning_rate": 8.82984064811254e-05, "loss": 0.4216, "step": 5480 }, { "epoch": 5.839276210750399, "grad_norm": 0.6640625, "learning_rate": 8.811383266817641e-05, "loss": 0.4135, "step": 5485 }, { "epoch": 5.844598190526876, "grad_norm": 0.61328125, "learning_rate": 8.792929991960696e-05, "loss": 0.3902, "step": 5490 }, { "epoch": 5.849920170303353, "grad_norm": 0.6171875, "learning_rate": 8.774480887294155e-05, "loss": 0.3943, "step": 5495 }, { "epoch": 5.85524215007983, "grad_norm": 0.59765625, "learning_rate": 8.75603601655607e-05, "loss": 0.4151, "step": 5500 }, { "epoch": 5.860564129856306, "grad_norm": 0.80859375, "learning_rate": 8.73759544346984e-05, "loss": 0.4301, "step": 5505 }, { "epoch": 5.8658861096327835, "grad_norm": 0.59765625, "learning_rate": 8.719159231744043e-05, "loss": 0.3967, "step": 5510 }, { "epoch": 5.87120808940926, "grad_norm": 0.578125, "learning_rate": 8.700727445072178e-05, "loss": 0.4155, "step": 5515 }, { "epoch": 5.876530069185737, "grad_norm": 0.6640625, "learning_rate": 8.682300147132455e-05, "loss": 0.3933, "step": 5520 }, { "epoch": 5.881852048962214, "grad_norm": 0.6484375, "learning_rate": 8.663877401587577e-05, "loss": 0.4045, "step": 5525 }, { "epoch": 5.8871740287386904, "grad_norm": 0.80859375, "learning_rate": 8.645459272084526e-05, "loss": 0.4024, "step": 5530 }, { "epoch": 5.892496008515168, "grad_norm": 0.67578125, "learning_rate": 8.627045822254326e-05, "loss": 0.4087, "step": 5535 }, { "epoch": 5.897817988291645, "grad_norm": 0.76171875, "learning_rate": 8.608637115711842e-05, "loss": 0.3988, "step": 5540 }, { "epoch": 5.903139968068121, "grad_norm": 0.76953125, "learning_rate": 8.590233216055543e-05, "loss": 0.3944, "step": 5545 }, { "epoch": 5.908461947844598, "grad_norm": 0.6171875, "learning_rate": 8.571834186867303e-05, "loss": 0.4277, "step": 5550 }, { "epoch": 5.9137839276210755, "grad_norm": 0.6328125, "learning_rate": 8.553440091712164e-05, "loss": 0.4042, "step": 5555 }, { "epoch": 5.919105907397552, "grad_norm": 0.59765625, "learning_rate": 8.535050994138117e-05, "loss": 0.4026, "step": 5560 }, { "epoch": 5.924427887174029, "grad_norm": 0.58984375, "learning_rate": 8.516666957675894e-05, "loss": 0.4023, "step": 5565 }, { "epoch": 5.929749866950505, "grad_norm": 0.65234375, "learning_rate": 8.49828804583874e-05, "loss": 0.409, "step": 5570 }, { "epoch": 5.9350718467269825, "grad_norm": 0.64453125, "learning_rate": 8.479914322122192e-05, "loss": 0.4075, "step": 5575 }, { "epoch": 5.94039382650346, "grad_norm": 0.625, "learning_rate": 8.461545850003865e-05, "loss": 0.4055, "step": 5580 }, { "epoch": 5.945715806279936, "grad_norm": 0.71484375, "learning_rate": 8.44318269294324e-05, "loss": 0.3925, "step": 5585 }, { "epoch": 5.951037786056413, "grad_norm": 0.6640625, "learning_rate": 8.424824914381419e-05, "loss": 0.4114, "step": 5590 }, { "epoch": 5.9563597658328895, "grad_norm": 0.88671875, "learning_rate": 8.406472577740935e-05, "loss": 0.4334, "step": 5595 }, { "epoch": 5.961681745609367, "grad_norm": 0.7265625, "learning_rate": 8.388125746425511e-05, "loss": 0.4032, "step": 5600 }, { "epoch": 5.967003725385844, "grad_norm": 0.64453125, "learning_rate": 8.36978448381986e-05, "loss": 0.4088, "step": 5605 }, { "epoch": 5.97232570516232, "grad_norm": 0.60546875, "learning_rate": 8.351448853289449e-05, "loss": 0.3934, "step": 5610 }, { "epoch": 5.977647684938797, "grad_norm": 0.5703125, "learning_rate": 8.333118918180287e-05, "loss": 0.3855, "step": 5615 }, { "epoch": 5.982969664715274, "grad_norm": 0.63671875, "learning_rate": 8.31479474181871e-05, "loss": 0.408, "step": 5620 }, { "epoch": 5.988291644491751, "grad_norm": 0.79296875, "learning_rate": 8.296476387511164e-05, "loss": 0.4017, "step": 5625 }, { "epoch": 5.993613624268228, "grad_norm": 0.7734375, "learning_rate": 8.278163918543966e-05, "loss": 0.4081, "step": 5630 }, { "epoch": 5.998935604044704, "grad_norm": 0.61328125, "learning_rate": 8.259857398183112e-05, "loss": 0.4075, "step": 5635 }, { "epoch": 6.0, "eval_loss": 2.726884603500366, "eval_runtime": 11.8338, "eval_samples_per_second": 6.253, "eval_steps_per_second": 0.592, "step": 5636 }, { "epoch": 6.004257583821182, "grad_norm": 0.5, "learning_rate": 8.241556889674046e-05, "loss": 0.331, "step": 5640 }, { "epoch": 6.009579563597659, "grad_norm": 0.6875, "learning_rate": 8.223262456241435e-05, "loss": 0.3157, "step": 5645 }, { "epoch": 6.014901543374135, "grad_norm": 0.54296875, "learning_rate": 8.204974161088967e-05, "loss": 0.3297, "step": 5650 }, { "epoch": 6.020223523150612, "grad_norm": 0.640625, "learning_rate": 8.186692067399114e-05, "loss": 0.3165, "step": 5655 }, { "epoch": 6.025545502927089, "grad_norm": 0.8984375, "learning_rate": 8.16841623833293e-05, "loss": 0.3297, "step": 5660 }, { "epoch": 6.030867482703566, "grad_norm": 0.84765625, "learning_rate": 8.150146737029828e-05, "loss": 0.3306, "step": 5665 }, { "epoch": 6.036189462480043, "grad_norm": 0.734375, "learning_rate": 8.13188362660735e-05, "loss": 0.3373, "step": 5670 }, { "epoch": 6.041511442256519, "grad_norm": 0.55859375, "learning_rate": 8.11362697016097e-05, "loss": 0.3182, "step": 5675 }, { "epoch": 6.0468334220329965, "grad_norm": 0.6015625, "learning_rate": 8.095376830763856e-05, "loss": 0.3259, "step": 5680 }, { "epoch": 6.052155401809473, "grad_norm": 0.65234375, "learning_rate": 8.077133271466665e-05, "loss": 0.3385, "step": 5685 }, { "epoch": 6.05747738158595, "grad_norm": 0.62890625, "learning_rate": 8.058896355297323e-05, "loss": 0.3255, "step": 5690 }, { "epoch": 6.062799361362427, "grad_norm": 0.74609375, "learning_rate": 8.040666145260798e-05, "loss": 0.3317, "step": 5695 }, { "epoch": 6.0681213411389034, "grad_norm": 0.6796875, "learning_rate": 8.022442704338894e-05, "loss": 0.3323, "step": 5700 }, { "epoch": 6.073443320915381, "grad_norm": 0.61328125, "learning_rate": 8.004226095490037e-05, "loss": 0.3426, "step": 5705 }, { "epoch": 6.078765300691857, "grad_norm": 0.65234375, "learning_rate": 7.986016381649033e-05, "loss": 0.3286, "step": 5710 }, { "epoch": 6.084087280468334, "grad_norm": 0.62109375, "learning_rate": 7.96781362572688e-05, "loss": 0.3293, "step": 5715 }, { "epoch": 6.089409260244811, "grad_norm": 0.703125, "learning_rate": 7.949617890610537e-05, "loss": 0.343, "step": 5720 }, { "epoch": 6.094731240021288, "grad_norm": 0.62109375, "learning_rate": 7.931429239162696e-05, "loss": 0.3153, "step": 5725 }, { "epoch": 6.100053219797765, "grad_norm": 0.57421875, "learning_rate": 7.91324773422159e-05, "loss": 0.3214, "step": 5730 }, { "epoch": 6.105375199574242, "grad_norm": 0.63671875, "learning_rate": 7.89507343860076e-05, "loss": 0.3336, "step": 5735 }, { "epoch": 6.110697179350718, "grad_norm": 0.64453125, "learning_rate": 7.876906415088825e-05, "loss": 0.3532, "step": 5740 }, { "epoch": 6.1160191591271955, "grad_norm": 0.6328125, "learning_rate": 7.858746726449309e-05, "loss": 0.3313, "step": 5745 }, { "epoch": 6.121341138903672, "grad_norm": 0.72265625, "learning_rate": 7.84059443542036e-05, "loss": 0.3202, "step": 5750 }, { "epoch": 6.126663118680149, "grad_norm": 0.65234375, "learning_rate": 7.822449604714602e-05, "loss": 0.335, "step": 5755 }, { "epoch": 6.131985098456626, "grad_norm": 0.68359375, "learning_rate": 7.804312297018865e-05, "loss": 0.3297, "step": 5760 }, { "epoch": 6.1373070782331025, "grad_norm": 0.734375, "learning_rate": 7.786182574993995e-05, "loss": 0.3236, "step": 5765 }, { "epoch": 6.14262905800958, "grad_norm": 0.5859375, "learning_rate": 7.768060501274624e-05, "loss": 0.3269, "step": 5770 }, { "epoch": 6.147951037786056, "grad_norm": 0.71875, "learning_rate": 7.74994613846898e-05, "loss": 0.327, "step": 5775 }, { "epoch": 6.153273017562533, "grad_norm": 0.609375, "learning_rate": 7.731839549158621e-05, "loss": 0.332, "step": 5780 }, { "epoch": 6.15859499733901, "grad_norm": 0.62109375, "learning_rate": 7.713740795898277e-05, "loss": 0.3315, "step": 5785 }, { "epoch": 6.163916977115487, "grad_norm": 0.70703125, "learning_rate": 7.695649941215594e-05, "loss": 0.3413, "step": 5790 }, { "epoch": 6.169238956891964, "grad_norm": 0.73046875, "learning_rate": 7.677567047610928e-05, "loss": 0.3178, "step": 5795 }, { "epoch": 6.174560936668441, "grad_norm": 0.6875, "learning_rate": 7.659492177557138e-05, "loss": 0.3246, "step": 5800 }, { "epoch": 6.179882916444917, "grad_norm": 0.625, "learning_rate": 7.64142539349935e-05, "loss": 0.3383, "step": 5805 }, { "epoch": 6.185204896221395, "grad_norm": 0.546875, "learning_rate": 7.62336675785477e-05, "loss": 0.3273, "step": 5810 }, { "epoch": 6.190526875997871, "grad_norm": 0.6328125, "learning_rate": 7.605316333012452e-05, "loss": 0.3288, "step": 5815 }, { "epoch": 6.195848855774348, "grad_norm": 0.6484375, "learning_rate": 7.587274181333063e-05, "loss": 0.3321, "step": 5820 }, { "epoch": 6.201170835550825, "grad_norm": 0.60546875, "learning_rate": 7.569240365148715e-05, "loss": 0.3397, "step": 5825 }, { "epoch": 6.206492815327302, "grad_norm": 0.65625, "learning_rate": 7.551214946762707e-05, "loss": 0.3271, "step": 5830 }, { "epoch": 6.211814795103779, "grad_norm": 0.6640625, "learning_rate": 7.533197988449325e-05, "loss": 0.3268, "step": 5835 }, { "epoch": 6.217136774880255, "grad_norm": 0.70703125, "learning_rate": 7.515189552453632e-05, "loss": 0.3239, "step": 5840 }, { "epoch": 6.222458754656732, "grad_norm": 0.61328125, "learning_rate": 7.49718970099125e-05, "loss": 0.3244, "step": 5845 }, { "epoch": 6.2277807344332095, "grad_norm": 0.73046875, "learning_rate": 7.479198496248132e-05, "loss": 0.3304, "step": 5850 }, { "epoch": 6.233102714209686, "grad_norm": 0.65234375, "learning_rate": 7.461216000380375e-05, "loss": 0.3414, "step": 5855 }, { "epoch": 6.238424693986163, "grad_norm": 0.6953125, "learning_rate": 7.443242275513971e-05, "loss": 0.3247, "step": 5860 }, { "epoch": 6.24374667376264, "grad_norm": 0.640625, "learning_rate": 7.425277383744622e-05, "loss": 0.3437, "step": 5865 }, { "epoch": 6.2490686535391164, "grad_norm": 0.625, "learning_rate": 7.407321387137511e-05, "loss": 0.3323, "step": 5870 }, { "epoch": 6.254390633315594, "grad_norm": 0.671875, "learning_rate": 7.389374347727087e-05, "loss": 0.3341, "step": 5875 }, { "epoch": 6.25971261309207, "grad_norm": 0.625, "learning_rate": 7.371436327516854e-05, "loss": 0.3405, "step": 5880 }, { "epoch": 6.265034592868547, "grad_norm": 0.6953125, "learning_rate": 7.353507388479162e-05, "loss": 0.3278, "step": 5885 }, { "epoch": 6.270356572645024, "grad_norm": 0.625, "learning_rate": 7.335587592554975e-05, "loss": 0.3304, "step": 5890 }, { "epoch": 6.275678552421501, "grad_norm": 0.65625, "learning_rate": 7.31767700165369e-05, "loss": 0.3433, "step": 5895 }, { "epoch": 6.281000532197978, "grad_norm": 0.6328125, "learning_rate": 7.299775677652872e-05, "loss": 0.3286, "step": 5900 }, { "epoch": 6.286322511974454, "grad_norm": 0.7265625, "learning_rate": 7.281883682398103e-05, "loss": 0.3269, "step": 5905 }, { "epoch": 6.291644491750931, "grad_norm": 0.6484375, "learning_rate": 7.26400107770272e-05, "loss": 0.3321, "step": 5910 }, { "epoch": 6.2969664715274085, "grad_norm": 0.66796875, "learning_rate": 7.246127925347612e-05, "loss": 0.3301, "step": 5915 }, { "epoch": 6.302288451303885, "grad_norm": 0.6015625, "learning_rate": 7.228264287081024e-05, "loss": 0.3274, "step": 5920 }, { "epoch": 6.307610431080362, "grad_norm": 0.66796875, "learning_rate": 7.210410224618327e-05, "loss": 0.3362, "step": 5925 }, { "epoch": 6.312932410856838, "grad_norm": 0.6484375, "learning_rate": 7.192565799641804e-05, "loss": 0.3361, "step": 5930 }, { "epoch": 6.3182543906333155, "grad_norm": 0.63671875, "learning_rate": 7.174731073800452e-05, "loss": 0.3342, "step": 5935 }, { "epoch": 6.323576370409793, "grad_norm": 0.59375, "learning_rate": 7.156906108709757e-05, "loss": 0.3272, "step": 5940 }, { "epoch": 6.328898350186269, "grad_norm": 0.7421875, "learning_rate": 7.139090965951475e-05, "loss": 0.3431, "step": 5945 }, { "epoch": 6.334220329962746, "grad_norm": 0.63671875, "learning_rate": 7.121285707073439e-05, "loss": 0.3305, "step": 5950 }, { "epoch": 6.339542309739223, "grad_norm": 0.65234375, "learning_rate": 7.10349039358933e-05, "loss": 0.3374, "step": 5955 }, { "epoch": 6.3448642895157, "grad_norm": 0.70703125, "learning_rate": 7.085705086978465e-05, "loss": 0.3316, "step": 5960 }, { "epoch": 6.350186269292177, "grad_norm": 0.64453125, "learning_rate": 7.067929848685602e-05, "loss": 0.3213, "step": 5965 }, { "epoch": 6.355508249068653, "grad_norm": 0.703125, "learning_rate": 7.050164740120699e-05, "loss": 0.3362, "step": 5970 }, { "epoch": 6.36083022884513, "grad_norm": 0.64453125, "learning_rate": 7.032409822658728e-05, "loss": 0.3289, "step": 5975 }, { "epoch": 6.366152208621608, "grad_norm": 0.640625, "learning_rate": 7.014665157639457e-05, "loss": 0.3248, "step": 5980 }, { "epoch": 6.371474188398084, "grad_norm": 0.65234375, "learning_rate": 6.99693080636722e-05, "loss": 0.3394, "step": 5985 }, { "epoch": 6.376796168174561, "grad_norm": 0.640625, "learning_rate": 6.979206830110727e-05, "loss": 0.3407, "step": 5990 }, { "epoch": 6.382118147951037, "grad_norm": 0.625, "learning_rate": 6.96149329010285e-05, "loss": 0.3312, "step": 5995 }, { "epoch": 6.387440127727515, "grad_norm": 0.69140625, "learning_rate": 6.943790247540392e-05, "loss": 0.3335, "step": 6000 }, { "epoch": 6.392762107503992, "grad_norm": 0.63671875, "learning_rate": 6.926097763583901e-05, "loss": 0.3271, "step": 6005 }, { "epoch": 6.398084087280468, "grad_norm": 0.7578125, "learning_rate": 6.908415899357441e-05, "loss": 0.3308, "step": 6010 }, { "epoch": 6.403406067056945, "grad_norm": 0.79296875, "learning_rate": 6.890744715948388e-05, "loss": 0.3434, "step": 6015 }, { "epoch": 6.4087280468334225, "grad_norm": 0.65234375, "learning_rate": 6.873084274407224e-05, "loss": 0.3489, "step": 6020 }, { "epoch": 6.414050026609899, "grad_norm": 0.5859375, "learning_rate": 6.855434635747309e-05, "loss": 0.338, "step": 6025 }, { "epoch": 6.419372006386376, "grad_norm": 0.62890625, "learning_rate": 6.837795860944691e-05, "loss": 0.3273, "step": 6030 }, { "epoch": 6.424693986162852, "grad_norm": 0.58203125, "learning_rate": 6.820168010937881e-05, "loss": 0.3247, "step": 6035 }, { "epoch": 6.4300159659393294, "grad_norm": 0.5703125, "learning_rate": 6.802551146627645e-05, "loss": 0.3311, "step": 6040 }, { "epoch": 6.435337945715807, "grad_norm": 0.55859375, "learning_rate": 6.784945328876795e-05, "loss": 0.325, "step": 6045 }, { "epoch": 6.440659925492283, "grad_norm": 0.6484375, "learning_rate": 6.767350618509994e-05, "loss": 0.3392, "step": 6050 }, { "epoch": 6.44598190526876, "grad_norm": 0.62890625, "learning_rate": 6.749767076313505e-05, "loss": 0.3208, "step": 6055 }, { "epoch": 6.451303885045236, "grad_norm": 0.609375, "learning_rate": 6.732194763035033e-05, "loss": 0.3262, "step": 6060 }, { "epoch": 6.456625864821714, "grad_norm": 0.62109375, "learning_rate": 6.714633739383471e-05, "loss": 0.3412, "step": 6065 }, { "epoch": 6.461947844598191, "grad_norm": 0.6953125, "learning_rate": 6.697084066028718e-05, "loss": 0.3345, "step": 6070 }, { "epoch": 6.467269824374667, "grad_norm": 0.609375, "learning_rate": 6.67954580360146e-05, "loss": 0.3462, "step": 6075 }, { "epoch": 6.472591804151144, "grad_norm": 0.7421875, "learning_rate": 6.662019012692951e-05, "loss": 0.3391, "step": 6080 }, { "epoch": 6.477913783927621, "grad_norm": 0.5546875, "learning_rate": 6.64450375385482e-05, "loss": 0.3373, "step": 6085 }, { "epoch": 6.483235763704098, "grad_norm": 0.7421875, "learning_rate": 6.627000087598868e-05, "loss": 0.3356, "step": 6090 }, { "epoch": 6.488557743480575, "grad_norm": 0.6953125, "learning_rate": 6.609508074396816e-05, "loss": 0.3297, "step": 6095 }, { "epoch": 6.493879723257051, "grad_norm": 0.64453125, "learning_rate": 6.592027774680154e-05, "loss": 0.351, "step": 6100 }, { "epoch": 6.4992017030335285, "grad_norm": 0.7109375, "learning_rate": 6.57455924883989e-05, "loss": 0.333, "step": 6105 }, { "epoch": 6.504523682810006, "grad_norm": 0.68359375, "learning_rate": 6.557102557226356e-05, "loss": 0.3439, "step": 6110 }, { "epoch": 6.509845662586482, "grad_norm": 0.6171875, "learning_rate": 6.539657760149007e-05, "loss": 0.3344, "step": 6115 }, { "epoch": 6.515167642362959, "grad_norm": 0.58984375, "learning_rate": 6.522224917876191e-05, "loss": 0.3318, "step": 6120 }, { "epoch": 6.5204896221394355, "grad_norm": 0.671875, "learning_rate": 6.504804090634971e-05, "loss": 0.331, "step": 6125 }, { "epoch": 6.525811601915913, "grad_norm": 0.69140625, "learning_rate": 6.487395338610891e-05, "loss": 0.3377, "step": 6130 }, { "epoch": 6.53113358169239, "grad_norm": 0.640625, "learning_rate": 6.469998721947771e-05, "loss": 0.3395, "step": 6135 }, { "epoch": 6.536455561468866, "grad_norm": 0.6171875, "learning_rate": 6.452614300747523e-05, "loss": 0.3383, "step": 6140 }, { "epoch": 6.541777541245343, "grad_norm": 0.7109375, "learning_rate": 6.435242135069916e-05, "loss": 0.3254, "step": 6145 }, { "epoch": 6.547099521021821, "grad_norm": 0.63671875, "learning_rate": 6.417882284932373e-05, "loss": 0.33, "step": 6150 }, { "epoch": 6.552421500798297, "grad_norm": 0.70703125, "learning_rate": 6.40053481030978e-05, "loss": 0.3387, "step": 6155 }, { "epoch": 6.557743480574774, "grad_norm": 0.66015625, "learning_rate": 6.383199771134261e-05, "loss": 0.3236, "step": 6160 }, { "epoch": 6.56306546035125, "grad_norm": 0.609375, "learning_rate": 6.365877227294978e-05, "loss": 0.3457, "step": 6165 }, { "epoch": 6.568387440127728, "grad_norm": 0.6328125, "learning_rate": 6.348567238637937e-05, "loss": 0.3257, "step": 6170 }, { "epoch": 6.573709419904205, "grad_norm": 0.74609375, "learning_rate": 6.331269864965742e-05, "loss": 0.3363, "step": 6175 }, { "epoch": 6.579031399680681, "grad_norm": 0.7578125, "learning_rate": 6.313985166037442e-05, "loss": 0.3382, "step": 6180 }, { "epoch": 6.584353379457158, "grad_norm": 0.6875, "learning_rate": 6.296713201568285e-05, "loss": 0.3333, "step": 6185 }, { "epoch": 6.589675359233635, "grad_norm": 0.671875, "learning_rate": 6.279454031229517e-05, "loss": 0.3325, "step": 6190 }, { "epoch": 6.594997339010112, "grad_norm": 0.65625, "learning_rate": 6.262207714648195e-05, "loss": 0.327, "step": 6195 }, { "epoch": 6.600319318786589, "grad_norm": 0.63671875, "learning_rate": 6.244974311406971e-05, "loss": 0.3334, "step": 6200 }, { "epoch": 6.605641298563065, "grad_norm": 0.6015625, "learning_rate": 6.227753881043864e-05, "loss": 0.3328, "step": 6205 }, { "epoch": 6.6109632783395424, "grad_norm": 0.62109375, "learning_rate": 6.210546483052097e-05, "loss": 0.3255, "step": 6210 }, { "epoch": 6.616285258116019, "grad_norm": 0.65234375, "learning_rate": 6.193352176879851e-05, "loss": 0.3385, "step": 6215 }, { "epoch": 6.621607237892496, "grad_norm": 0.671875, "learning_rate": 6.176171021930094e-05, "loss": 0.3436, "step": 6220 }, { "epoch": 6.626929217668973, "grad_norm": 0.6640625, "learning_rate": 6.159003077560347e-05, "loss": 0.3333, "step": 6225 }, { "epoch": 6.632251197445449, "grad_norm": 0.66796875, "learning_rate": 6.141848403082493e-05, "loss": 0.3319, "step": 6230 }, { "epoch": 6.637573177221927, "grad_norm": 0.671875, "learning_rate": 6.124707057762571e-05, "loss": 0.3409, "step": 6235 }, { "epoch": 6.642895156998403, "grad_norm": 0.671875, "learning_rate": 6.107579100820575e-05, "loss": 0.3223, "step": 6240 }, { "epoch": 6.64821713677488, "grad_norm": 0.59765625, "learning_rate": 6.090464591430236e-05, "loss": 0.3459, "step": 6245 }, { "epoch": 6.653539116551357, "grad_norm": 0.64453125, "learning_rate": 6.0733635887188346e-05, "loss": 0.3403, "step": 6250 }, { "epoch": 6.658861096327834, "grad_norm": 0.6328125, "learning_rate": 6.056276151766985e-05, "loss": 0.3509, "step": 6255 }, { "epoch": 6.664183076104311, "grad_norm": 0.6015625, "learning_rate": 6.039202339608432e-05, "loss": 0.3351, "step": 6260 }, { "epoch": 6.669505055880788, "grad_norm": 0.5546875, "learning_rate": 6.022142211229855e-05, "loss": 0.3348, "step": 6265 }, { "epoch": 6.674827035657264, "grad_norm": 0.65625, "learning_rate": 6.0050958255706524e-05, "loss": 0.3365, "step": 6270 }, { "epoch": 6.6801490154337415, "grad_norm": 0.59765625, "learning_rate": 5.988063241522747e-05, "loss": 0.3318, "step": 6275 }, { "epoch": 6.685470995210218, "grad_norm": 0.71484375, "learning_rate": 5.971044517930382e-05, "loss": 0.3417, "step": 6280 }, { "epoch": 6.690792974986695, "grad_norm": 0.5859375, "learning_rate": 5.954039713589909e-05, "loss": 0.3313, "step": 6285 }, { "epoch": 6.696114954763172, "grad_norm": 0.578125, "learning_rate": 5.9370488872496036e-05, "loss": 0.3293, "step": 6290 }, { "epoch": 6.7014369345396485, "grad_norm": 0.6328125, "learning_rate": 5.9200720976094373e-05, "loss": 0.3277, "step": 6295 }, { "epoch": 6.706758914316126, "grad_norm": 0.68359375, "learning_rate": 5.903109403320891e-05, "loss": 0.3411, "step": 6300 }, { "epoch": 6.712080894092603, "grad_norm": 0.53125, "learning_rate": 5.8861608629867524e-05, "loss": 0.3254, "step": 6305 }, { "epoch": 6.717402873869079, "grad_norm": 0.66796875, "learning_rate": 5.86922653516091e-05, "loss": 0.338, "step": 6310 }, { "epoch": 6.722724853645556, "grad_norm": 0.5859375, "learning_rate": 5.852306478348144e-05, "loss": 0.3315, "step": 6315 }, { "epoch": 6.728046833422033, "grad_norm": 0.71875, "learning_rate": 5.83540075100394e-05, "loss": 0.3518, "step": 6320 }, { "epoch": 6.73336881319851, "grad_norm": 0.88671875, "learning_rate": 5.818509411534272e-05, "loss": 0.3522, "step": 6325 }, { "epoch": 6.738690792974987, "grad_norm": 0.640625, "learning_rate": 5.80163251829541e-05, "loss": 0.3321, "step": 6330 }, { "epoch": 6.744012772751463, "grad_norm": 0.61328125, "learning_rate": 5.784770129593717e-05, "loss": 0.3379, "step": 6335 }, { "epoch": 6.749334752527941, "grad_norm": 0.671875, "learning_rate": 5.767922303685435e-05, "loss": 0.3417, "step": 6340 }, { "epoch": 6.754656732304417, "grad_norm": 0.66015625, "learning_rate": 5.751089098776501e-05, "loss": 0.3381, "step": 6345 }, { "epoch": 6.759978712080894, "grad_norm": 0.64453125, "learning_rate": 5.7342705730223536e-05, "loss": 0.3346, "step": 6350 }, { "epoch": 6.765300691857371, "grad_norm": 0.70703125, "learning_rate": 5.717466784527692e-05, "loss": 0.3407, "step": 6355 }, { "epoch": 6.770622671633848, "grad_norm": 0.71875, "learning_rate": 5.700677791346318e-05, "loss": 0.3348, "step": 6360 }, { "epoch": 6.775944651410325, "grad_norm": 0.671875, "learning_rate": 5.683903651480913e-05, "loss": 0.3372, "step": 6365 }, { "epoch": 6.781266631186801, "grad_norm": 0.58984375, "learning_rate": 5.667144422882844e-05, "loss": 0.3389, "step": 6370 }, { "epoch": 6.786588610963278, "grad_norm": 0.8984375, "learning_rate": 5.6504001634519675e-05, "loss": 0.3394, "step": 6375 }, { "epoch": 6.7919105907397554, "grad_norm": 0.58203125, "learning_rate": 5.63367093103641e-05, "loss": 0.3299, "step": 6380 }, { "epoch": 6.797232570516232, "grad_norm": 0.6171875, "learning_rate": 5.6169567834323944e-05, "loss": 0.3272, "step": 6385 }, { "epoch": 6.802554550292709, "grad_norm": 0.7109375, "learning_rate": 5.600257778384037e-05, "loss": 0.3345, "step": 6390 }, { "epoch": 6.807876530069185, "grad_norm": 0.80859375, "learning_rate": 5.5835739735831174e-05, "loss": 0.3329, "step": 6395 }, { "epoch": 6.813198509845662, "grad_norm": 0.57421875, "learning_rate": 5.5669054266689156e-05, "loss": 0.3301, "step": 6400 }, { "epoch": 6.81852048962214, "grad_norm": 0.5859375, "learning_rate": 5.550252195227996e-05, "loss": 0.3308, "step": 6405 }, { "epoch": 6.823842469398616, "grad_norm": 0.703125, "learning_rate": 5.5336143367940106e-05, "loss": 0.3365, "step": 6410 }, { "epoch": 6.829164449175093, "grad_norm": 0.6484375, "learning_rate": 5.5169919088475065e-05, "loss": 0.3355, "step": 6415 }, { "epoch": 6.83448642895157, "grad_norm": 0.6015625, "learning_rate": 5.500384968815707e-05, "loss": 0.3432, "step": 6420 }, { "epoch": 6.839808408728047, "grad_norm": 0.734375, "learning_rate": 5.4837935740723354e-05, "loss": 0.3258, "step": 6425 }, { "epoch": 6.845130388504524, "grad_norm": 0.69140625, "learning_rate": 5.467217781937424e-05, "loss": 0.3267, "step": 6430 }, { "epoch": 6.850452368281001, "grad_norm": 0.67578125, "learning_rate": 5.4506576496770734e-05, "loss": 0.3277, "step": 6435 }, { "epoch": 6.855774348057477, "grad_norm": 0.5703125, "learning_rate": 5.434113234503302e-05, "loss": 0.32, "step": 6440 }, { "epoch": 6.8610963278339545, "grad_norm": 0.64453125, "learning_rate": 5.417584593573821e-05, "loss": 0.3358, "step": 6445 }, { "epoch": 6.866418307610431, "grad_norm": 0.66015625, "learning_rate": 5.401071783991849e-05, "loss": 0.3335, "step": 6450 }, { "epoch": 6.871740287386908, "grad_norm": 0.65234375, "learning_rate": 5.384574862805908e-05, "loss": 0.325, "step": 6455 }, { "epoch": 6.877062267163385, "grad_norm": 0.7421875, "learning_rate": 5.3680938870096306e-05, "loss": 0.3282, "step": 6460 }, { "epoch": 6.8823842469398615, "grad_norm": 0.69921875, "learning_rate": 5.351628913541551e-05, "loss": 0.3413, "step": 6465 }, { "epoch": 6.887706226716339, "grad_norm": 0.5859375, "learning_rate": 5.3351799992849395e-05, "loss": 0.3283, "step": 6470 }, { "epoch": 6.893028206492815, "grad_norm": 0.6484375, "learning_rate": 5.318747201067562e-05, "loss": 0.3317, "step": 6475 }, { "epoch": 6.898350186269292, "grad_norm": 0.65625, "learning_rate": 5.3023305756615214e-05, "loss": 0.334, "step": 6480 }, { "epoch": 6.903672166045769, "grad_norm": 0.671875, "learning_rate": 5.2859301797830416e-05, "loss": 0.3366, "step": 6485 }, { "epoch": 6.908994145822246, "grad_norm": 0.640625, "learning_rate": 5.2695460700922774e-05, "loss": 0.3475, "step": 6490 }, { "epoch": 6.914316125598723, "grad_norm": 0.66015625, "learning_rate": 5.2531783031931195e-05, "loss": 0.3353, "step": 6495 }, { "epoch": 6.919638105375199, "grad_norm": 0.6640625, "learning_rate": 5.236826935632997e-05, "loss": 0.3388, "step": 6500 }, { "epoch": 6.924960085151676, "grad_norm": 0.58984375, "learning_rate": 5.220492023902672e-05, "loss": 0.3244, "step": 6505 }, { "epoch": 6.930282064928154, "grad_norm": 0.58984375, "learning_rate": 5.204173624436077e-05, "loss": 0.329, "step": 6510 }, { "epoch": 6.93560404470463, "grad_norm": 0.6640625, "learning_rate": 5.187871793610084e-05, "loss": 0.3315, "step": 6515 }, { "epoch": 6.940926024481107, "grad_norm": 0.6171875, "learning_rate": 5.171586587744316e-05, "loss": 0.3396, "step": 6520 }, { "epoch": 6.946248004257583, "grad_norm": 0.69921875, "learning_rate": 5.1553180631009735e-05, "loss": 0.3472, "step": 6525 }, { "epoch": 6.951569984034061, "grad_norm": 0.65625, "learning_rate": 5.139066275884623e-05, "loss": 0.3428, "step": 6530 }, { "epoch": 6.956891963810538, "grad_norm": 0.68359375, "learning_rate": 5.122831282242005e-05, "loss": 0.3333, "step": 6535 }, { "epoch": 6.962213943587014, "grad_norm": 0.58984375, "learning_rate": 5.1066131382618464e-05, "loss": 0.3301, "step": 6540 }, { "epoch": 6.967535923363491, "grad_norm": 0.640625, "learning_rate": 5.0904118999746444e-05, "loss": 0.329, "step": 6545 }, { "epoch": 6.9728579031399684, "grad_norm": 0.5859375, "learning_rate": 5.0742276233525166e-05, "loss": 0.3298, "step": 6550 }, { "epoch": 6.978179882916445, "grad_norm": 0.6328125, "learning_rate": 5.058060364308965e-05, "loss": 0.3311, "step": 6555 }, { "epoch": 6.983501862692922, "grad_norm": 0.6640625, "learning_rate": 5.0419101786987e-05, "loss": 0.3373, "step": 6560 }, { "epoch": 6.988823842469398, "grad_norm": 0.60546875, "learning_rate": 5.0257771223174444e-05, "loss": 0.315, "step": 6565 }, { "epoch": 6.994145822245875, "grad_norm": 0.68359375, "learning_rate": 5.00966125090176e-05, "loss": 0.334, "step": 6570 }, { "epoch": 6.999467802022353, "grad_norm": 0.68359375, "learning_rate": 4.993562620128815e-05, "loss": 0.3395, "step": 6575 }, { "epoch": 6.999467802022353, "eval_loss": 3.1681480407714844, "eval_runtime": 11.9702, "eval_samples_per_second": 6.182, "eval_steps_per_second": 0.585, "step": 6575 }, { "epoch": 7.004789781798829, "grad_norm": 0.61328125, "learning_rate": 4.9774812856162325e-05, "loss": 0.2899, "step": 6580 }, { "epoch": 7.010111761575306, "grad_norm": 0.64453125, "learning_rate": 4.96141730292186e-05, "loss": 0.2954, "step": 6585 }, { "epoch": 7.015433741351782, "grad_norm": 0.73046875, "learning_rate": 4.945370727543623e-05, "loss": 0.2888, "step": 6590 }, { "epoch": 7.02075572112826, "grad_norm": 0.59375, "learning_rate": 4.929341614919296e-05, "loss": 0.2957, "step": 6595 }, { "epoch": 7.026077700904737, "grad_norm": 0.61328125, "learning_rate": 4.9133300204263147e-05, "loss": 0.2918, "step": 6600 }, { "epoch": 7.031399680681213, "grad_norm": 0.5546875, "learning_rate": 4.897335999381606e-05, "loss": 0.2899, "step": 6605 }, { "epoch": 7.03672166045769, "grad_norm": 0.703125, "learning_rate": 4.881359607041378e-05, "loss": 0.2961, "step": 6610 }, { "epoch": 7.0420436402341675, "grad_norm": 0.5703125, "learning_rate": 4.865400898600937e-05, "loss": 0.2875, "step": 6615 }, { "epoch": 7.047365620010644, "grad_norm": 0.65625, "learning_rate": 4.849459929194499e-05, "loss": 0.2815, "step": 6620 }, { "epoch": 7.052687599787121, "grad_norm": 0.609375, "learning_rate": 4.833536753894977e-05, "loss": 0.2973, "step": 6625 }, { "epoch": 7.058009579563597, "grad_norm": 0.53125, "learning_rate": 4.817631427713836e-05, "loss": 0.2855, "step": 6630 }, { "epoch": 7.0633315593400745, "grad_norm": 0.5859375, "learning_rate": 4.80174400560086e-05, "loss": 0.2923, "step": 6635 }, { "epoch": 7.068653539116552, "grad_norm": 0.6796875, "learning_rate": 4.785874542443973e-05, "loss": 0.2905, "step": 6640 }, { "epoch": 7.073975518893028, "grad_norm": 0.62109375, "learning_rate": 4.770023093069065e-05, "loss": 0.2825, "step": 6645 }, { "epoch": 7.079297498669505, "grad_norm": 0.61328125, "learning_rate": 4.754189712239789e-05, "loss": 0.2911, "step": 6650 }, { "epoch": 7.0846194784459815, "grad_norm": 0.6640625, "learning_rate": 4.738374454657373e-05, "loss": 0.2892, "step": 6655 }, { "epoch": 7.089941458222459, "grad_norm": 0.60546875, "learning_rate": 4.722577374960434e-05, "loss": 0.2764, "step": 6660 }, { "epoch": 7.095263437998936, "grad_norm": 0.69921875, "learning_rate": 4.7067985277247886e-05, "loss": 0.2881, "step": 6665 }, { "epoch": 7.100585417775412, "grad_norm": 0.6640625, "learning_rate": 4.69103796746326e-05, "loss": 0.2873, "step": 6670 }, { "epoch": 7.105907397551889, "grad_norm": 0.5859375, "learning_rate": 4.675295748625503e-05, "loss": 0.2864, "step": 6675 }, { "epoch": 7.111229377328367, "grad_norm": 0.69921875, "learning_rate": 4.659571925597791e-05, "loss": 0.2841, "step": 6680 }, { "epoch": 7.116551357104843, "grad_norm": 0.57421875, "learning_rate": 4.643866552702856e-05, "loss": 0.2904, "step": 6685 }, { "epoch": 7.12187333688132, "grad_norm": 0.75, "learning_rate": 4.628179684199685e-05, "loss": 0.2957, "step": 6690 }, { "epoch": 7.127195316657796, "grad_norm": 0.6015625, "learning_rate": 4.612511374283334e-05, "loss": 0.2912, "step": 6695 }, { "epoch": 7.132517296434274, "grad_norm": 0.671875, "learning_rate": 4.596861677084746e-05, "loss": 0.2881, "step": 6700 }, { "epoch": 7.137839276210751, "grad_norm": 0.71484375, "learning_rate": 4.581230646670559e-05, "loss": 0.2876, "step": 6705 }, { "epoch": 7.143161255987227, "grad_norm": 0.6640625, "learning_rate": 4.56561833704292e-05, "loss": 0.2951, "step": 6710 }, { "epoch": 7.148483235763704, "grad_norm": 0.66015625, "learning_rate": 4.550024802139299e-05, "loss": 0.2926, "step": 6715 }, { "epoch": 7.153805215540181, "grad_norm": 0.6796875, "learning_rate": 4.53445009583231e-05, "loss": 0.2935, "step": 6720 }, { "epoch": 7.159127195316658, "grad_norm": 0.578125, "learning_rate": 4.5188942719295044e-05, "loss": 0.2984, "step": 6725 }, { "epoch": 7.164449175093135, "grad_norm": 0.625, "learning_rate": 4.50335738417321e-05, "loss": 0.2921, "step": 6730 }, { "epoch": 7.169771154869611, "grad_norm": 0.71484375, "learning_rate": 4.487839486240329e-05, "loss": 0.2948, "step": 6735 }, { "epoch": 7.1750931346460884, "grad_norm": 0.640625, "learning_rate": 4.4723406317421615e-05, "loss": 0.2889, "step": 6740 }, { "epoch": 7.180415114422566, "grad_norm": 0.546875, "learning_rate": 4.4568608742242104e-05, "loss": 0.2887, "step": 6745 }, { "epoch": 7.185737094199042, "grad_norm": 0.65234375, "learning_rate": 4.441400267166006e-05, "loss": 0.2837, "step": 6750 }, { "epoch": 7.191059073975519, "grad_norm": 0.64453125, "learning_rate": 4.425958863980918e-05, "loss": 0.291, "step": 6755 }, { "epoch": 7.196381053751995, "grad_norm": 0.58984375, "learning_rate": 4.410536718015973e-05, "loss": 0.2907, "step": 6760 }, { "epoch": 7.201703033528473, "grad_norm": 0.78515625, "learning_rate": 4.395133882551655e-05, "loss": 0.297, "step": 6765 }, { "epoch": 7.20702501330495, "grad_norm": 0.671875, "learning_rate": 4.379750410801745e-05, "loss": 0.2892, "step": 6770 }, { "epoch": 7.212346993081426, "grad_norm": 0.625, "learning_rate": 4.364386355913135e-05, "loss": 0.2912, "step": 6775 }, { "epoch": 7.217668972857903, "grad_norm": 0.65234375, "learning_rate": 4.3490417709656115e-05, "loss": 0.2849, "step": 6780 }, { "epoch": 7.22299095263438, "grad_norm": 0.55859375, "learning_rate": 4.333716708971716e-05, "loss": 0.2789, "step": 6785 }, { "epoch": 7.228312932410857, "grad_norm": 0.60546875, "learning_rate": 4.318411222876533e-05, "loss": 0.2981, "step": 6790 }, { "epoch": 7.233634912187334, "grad_norm": 0.63671875, "learning_rate": 4.30312536555752e-05, "loss": 0.2894, "step": 6795 }, { "epoch": 7.23895689196381, "grad_norm": 0.58984375, "learning_rate": 4.2878591898243226e-05, "loss": 0.2783, "step": 6800 }, { "epoch": 7.2442788717402875, "grad_norm": 0.65234375, "learning_rate": 4.272612748418581e-05, "loss": 0.2931, "step": 6805 }, { "epoch": 7.249600851516764, "grad_norm": 0.65625, "learning_rate": 4.257386094013761e-05, "loss": 0.2974, "step": 6810 }, { "epoch": 7.254922831293241, "grad_norm": 0.66796875, "learning_rate": 4.242179279214984e-05, "loss": 0.2903, "step": 6815 }, { "epoch": 7.260244811069718, "grad_norm": 0.56640625, "learning_rate": 4.226992356558803e-05, "loss": 0.294, "step": 6820 }, { "epoch": 7.2655667908461945, "grad_norm": 0.61328125, "learning_rate": 4.211825378513066e-05, "loss": 0.2867, "step": 6825 }, { "epoch": 7.270888770622672, "grad_norm": 0.72265625, "learning_rate": 4.19667839747671e-05, "loss": 0.2879, "step": 6830 }, { "epoch": 7.276210750399149, "grad_norm": 0.70703125, "learning_rate": 4.181551465779587e-05, "loss": 0.3017, "step": 6835 }, { "epoch": 7.281532730175625, "grad_norm": 0.671875, "learning_rate": 4.1664446356822874e-05, "loss": 0.2941, "step": 6840 }, { "epoch": 7.286854709952102, "grad_norm": 0.65234375, "learning_rate": 4.15135795937594e-05, "loss": 0.2913, "step": 6845 }, { "epoch": 7.292176689728579, "grad_norm": 0.6796875, "learning_rate": 4.136291488982058e-05, "loss": 0.2957, "step": 6850 }, { "epoch": 7.297498669505056, "grad_norm": 0.64453125, "learning_rate": 4.1212452765523544e-05, "loss": 0.2843, "step": 6855 }, { "epoch": 7.302820649281533, "grad_norm": 0.640625, "learning_rate": 4.1062193740685374e-05, "loss": 0.2851, "step": 6860 }, { "epoch": 7.308142629058009, "grad_norm": 0.65234375, "learning_rate": 4.091213833442156e-05, "loss": 0.2861, "step": 6865 }, { "epoch": 7.313464608834487, "grad_norm": 0.6015625, "learning_rate": 4.0762287065144164e-05, "loss": 0.2941, "step": 6870 }, { "epoch": 7.318786588610963, "grad_norm": 0.6953125, "learning_rate": 4.061264045055996e-05, "loss": 0.2964, "step": 6875 }, { "epoch": 7.32410856838744, "grad_norm": 0.67578125, "learning_rate": 4.0463199007668704e-05, "loss": 0.2875, "step": 6880 }, { "epoch": 7.329430548163917, "grad_norm": 0.62890625, "learning_rate": 4.0313963252761244e-05, "loss": 0.2951, "step": 6885 }, { "epoch": 7.334752527940394, "grad_norm": 0.5703125, "learning_rate": 4.016493370141788e-05, "loss": 0.2938, "step": 6890 }, { "epoch": 7.340074507716871, "grad_norm": 0.6484375, "learning_rate": 4.001611086850663e-05, "loss": 0.2993, "step": 6895 }, { "epoch": 7.345396487493348, "grad_norm": 0.61328125, "learning_rate": 3.986749526818111e-05, "loss": 0.2893, "step": 6900 }, { "epoch": 7.350718467269824, "grad_norm": 0.65234375, "learning_rate": 3.9719087413879145e-05, "loss": 0.2922, "step": 6905 }, { "epoch": 7.3560404470463014, "grad_norm": 0.671875, "learning_rate": 3.957088781832078e-05, "loss": 0.2933, "step": 6910 }, { "epoch": 7.361362426822778, "grad_norm": 0.58203125, "learning_rate": 3.9422896993506595e-05, "loss": 0.2936, "step": 6915 }, { "epoch": 7.366684406599255, "grad_norm": 0.59375, "learning_rate": 3.9275115450715915e-05, "loss": 0.2926, "step": 6920 }, { "epoch": 7.372006386375732, "grad_norm": 0.59765625, "learning_rate": 3.9127543700505035e-05, "loss": 0.2818, "step": 6925 }, { "epoch": 7.377328366152208, "grad_norm": 0.65625, "learning_rate": 3.898018225270534e-05, "loss": 0.2988, "step": 6930 }, { "epoch": 7.382650345928686, "grad_norm": 0.62109375, "learning_rate": 3.88330316164219e-05, "loss": 0.2848, "step": 6935 }, { "epoch": 7.387972325705162, "grad_norm": 0.63671875, "learning_rate": 3.868609230003124e-05, "loss": 0.297, "step": 6940 }, { "epoch": 7.393294305481639, "grad_norm": 0.55078125, "learning_rate": 3.853936481117995e-05, "loss": 0.2801, "step": 6945 }, { "epoch": 7.398616285258116, "grad_norm": 0.61328125, "learning_rate": 3.839284965678274e-05, "loss": 0.2892, "step": 6950 }, { "epoch": 7.403938265034593, "grad_norm": 0.59765625, "learning_rate": 3.824654734302078e-05, "loss": 0.2929, "step": 6955 }, { "epoch": 7.40926024481107, "grad_norm": 0.61328125, "learning_rate": 3.81004583753399e-05, "loss": 0.2906, "step": 6960 }, { "epoch": 7.414582224587546, "grad_norm": 0.60546875, "learning_rate": 3.7954583258448906e-05, "loss": 0.2877, "step": 6965 }, { "epoch": 7.419904204364023, "grad_norm": 0.6328125, "learning_rate": 3.7808922496317624e-05, "loss": 0.2849, "step": 6970 }, { "epoch": 7.4252261841405005, "grad_norm": 0.57421875, "learning_rate": 3.766347659217557e-05, "loss": 0.2978, "step": 6975 }, { "epoch": 7.430548163916977, "grad_norm": 0.65625, "learning_rate": 3.751824604850985e-05, "loss": 0.2899, "step": 6980 }, { "epoch": 7.435870143693454, "grad_norm": 0.69140625, "learning_rate": 3.737323136706348e-05, "loss": 0.2883, "step": 6985 }, { "epoch": 7.441192123469931, "grad_norm": 0.67578125, "learning_rate": 3.722843304883379e-05, "loss": 0.2824, "step": 6990 }, { "epoch": 7.4465141032464075, "grad_norm": 0.53125, "learning_rate": 3.708385159407061e-05, "loss": 0.2878, "step": 6995 }, { "epoch": 7.451836083022885, "grad_norm": 0.671875, "learning_rate": 3.693948750227454e-05, "loss": 0.2977, "step": 7000 }, { "epoch": 7.457158062799361, "grad_norm": 0.61328125, "learning_rate": 3.679534127219527e-05, "loss": 0.2977, "step": 7005 }, { "epoch": 7.462480042575838, "grad_norm": 0.59375, "learning_rate": 3.6651413401829694e-05, "loss": 0.2766, "step": 7010 }, { "epoch": 7.467802022352315, "grad_norm": 0.66796875, "learning_rate": 3.6507704388420504e-05, "loss": 0.2932, "step": 7015 }, { "epoch": 7.473124002128792, "grad_norm": 0.6953125, "learning_rate": 3.636421472845418e-05, "loss": 0.3004, "step": 7020 }, { "epoch": 7.478445981905269, "grad_norm": 0.671875, "learning_rate": 3.622094491765934e-05, "loss": 0.2844, "step": 7025 }, { "epoch": 7.483767961681746, "grad_norm": 0.64453125, "learning_rate": 3.6077895451005114e-05, "loss": 0.2859, "step": 7030 }, { "epoch": 7.489089941458222, "grad_norm": 0.70703125, "learning_rate": 3.593506682269941e-05, "loss": 0.2857, "step": 7035 }, { "epoch": 7.4944119212347, "grad_norm": 0.5859375, "learning_rate": 3.579245952618715e-05, "loss": 0.2844, "step": 7040 }, { "epoch": 7.499733901011176, "grad_norm": 0.68359375, "learning_rate": 3.565007405414865e-05, "loss": 0.3005, "step": 7045 }, { "epoch": 7.505055880787653, "grad_norm": 0.6484375, "learning_rate": 3.55079108984977e-05, "loss": 0.2963, "step": 7050 }, { "epoch": 7.51037786056413, "grad_norm": 0.60546875, "learning_rate": 3.536597055038027e-05, "loss": 0.2997, "step": 7055 }, { "epoch": 7.515699840340607, "grad_norm": 0.625, "learning_rate": 3.5224253500172446e-05, "loss": 0.2895, "step": 7060 }, { "epoch": 7.521021820117084, "grad_norm": 0.6484375, "learning_rate": 3.508276023747882e-05, "loss": 0.3035, "step": 7065 }, { "epoch": 7.52634379989356, "grad_norm": 0.53515625, "learning_rate": 3.494149125113091e-05, "loss": 0.2953, "step": 7070 }, { "epoch": 7.531665779670037, "grad_norm": 0.60546875, "learning_rate": 3.48004470291854e-05, "loss": 0.2834, "step": 7075 }, { "epoch": 7.5369877594465144, "grad_norm": 0.62890625, "learning_rate": 3.4659628058922444e-05, "loss": 0.2942, "step": 7080 }, { "epoch": 7.542309739222991, "grad_norm": 0.5625, "learning_rate": 3.451903482684399e-05, "loss": 0.284, "step": 7085 }, { "epoch": 7.547631718999468, "grad_norm": 0.62109375, "learning_rate": 3.437866781867213e-05, "loss": 0.295, "step": 7090 }, { "epoch": 7.552953698775944, "grad_norm": 0.58203125, "learning_rate": 3.423852751934735e-05, "loss": 0.2992, "step": 7095 }, { "epoch": 7.558275678552421, "grad_norm": 0.68359375, "learning_rate": 3.4098614413027e-05, "loss": 0.2933, "step": 7100 }, { "epoch": 7.563597658328899, "grad_norm": 0.62890625, "learning_rate": 3.3958928983083346e-05, "loss": 0.2949, "step": 7105 }, { "epoch": 7.568919638105375, "grad_norm": 0.625, "learning_rate": 3.381947171210224e-05, "loss": 0.2811, "step": 7110 }, { "epoch": 7.574241617881852, "grad_norm": 0.515625, "learning_rate": 3.3680243081881246e-05, "loss": 0.2885, "step": 7115 }, { "epoch": 7.579563597658328, "grad_norm": 0.64453125, "learning_rate": 3.3541243573427984e-05, "loss": 0.2969, "step": 7120 }, { "epoch": 7.584885577434806, "grad_norm": 0.625, "learning_rate": 3.3402473666958524e-05, "loss": 0.3025, "step": 7125 }, { "epoch": 7.590207557211283, "grad_norm": 0.65625, "learning_rate": 3.326393384189571e-05, "loss": 0.2973, "step": 7130 }, { "epoch": 7.595529536987759, "grad_norm": 0.56640625, "learning_rate": 3.31256245768675e-05, "loss": 0.286, "step": 7135 }, { "epoch": 7.600851516764236, "grad_norm": 0.625, "learning_rate": 3.2987546349705346e-05, "loss": 0.2939, "step": 7140 }, { "epoch": 7.6061734965407135, "grad_norm": 0.58984375, "learning_rate": 3.28496996374424e-05, "loss": 0.2873, "step": 7145 }, { "epoch": 7.61149547631719, "grad_norm": 0.58984375, "learning_rate": 3.271208491631208e-05, "loss": 0.2839, "step": 7150 }, { "epoch": 7.616817456093667, "grad_norm": 0.63671875, "learning_rate": 3.257470266174629e-05, "loss": 0.3026, "step": 7155 }, { "epoch": 7.622139435870143, "grad_norm": 0.66796875, "learning_rate": 3.243755334837382e-05, "loss": 0.2905, "step": 7160 }, { "epoch": 7.6274614156466205, "grad_norm": 0.66796875, "learning_rate": 3.230063745001866e-05, "loss": 0.2884, "step": 7165 }, { "epoch": 7.632783395423098, "grad_norm": 0.65234375, "learning_rate": 3.216395543969841e-05, "loss": 0.2887, "step": 7170 }, { "epoch": 7.638105375199574, "grad_norm": 0.6015625, "learning_rate": 3.202750778962267e-05, "loss": 0.3034, "step": 7175 }, { "epoch": 7.643427354976051, "grad_norm": 0.53515625, "learning_rate": 3.189129497119131e-05, "loss": 0.2943, "step": 7180 }, { "epoch": 7.648749334752528, "grad_norm": 0.6328125, "learning_rate": 3.175531745499298e-05, "loss": 0.2948, "step": 7185 }, { "epoch": 7.654071314529005, "grad_norm": 0.60546875, "learning_rate": 3.161957571080329e-05, "loss": 0.2756, "step": 7190 }, { "epoch": 7.659393294305482, "grad_norm": 0.6015625, "learning_rate": 3.148407020758342e-05, "loss": 0.2897, "step": 7195 }, { "epoch": 7.664715274081958, "grad_norm": 0.68359375, "learning_rate": 3.13488014134783e-05, "loss": 0.3001, "step": 7200 }, { "epoch": 7.670037253858435, "grad_norm": 0.65625, "learning_rate": 3.121376979581516e-05, "loss": 0.2977, "step": 7205 }, { "epoch": 7.675359233634913, "grad_norm": 0.64453125, "learning_rate": 3.1078975821101755e-05, "loss": 0.2801, "step": 7210 }, { "epoch": 7.680681213411389, "grad_norm": 0.55859375, "learning_rate": 3.094441995502487e-05, "loss": 0.2809, "step": 7215 }, { "epoch": 7.686003193187866, "grad_norm": 0.6015625, "learning_rate": 3.081010266244867e-05, "loss": 0.2929, "step": 7220 }, { "epoch": 7.691325172964342, "grad_norm": 0.8203125, "learning_rate": 3.06760244074131e-05, "loss": 0.2929, "step": 7225 }, { "epoch": 7.69664715274082, "grad_norm": 0.640625, "learning_rate": 3.054218565313222e-05, "loss": 0.2856, "step": 7230 }, { "epoch": 7.701969132517297, "grad_norm": 0.57421875, "learning_rate": 3.0408586861992695e-05, "loss": 0.2894, "step": 7235 }, { "epoch": 7.707291112293773, "grad_norm": 0.6171875, "learning_rate": 3.0275228495552266e-05, "loss": 0.2951, "step": 7240 }, { "epoch": 7.71261309207025, "grad_norm": 0.5625, "learning_rate": 3.0142111014537888e-05, "loss": 0.2889, "step": 7245 }, { "epoch": 7.717935071846727, "grad_norm": 0.64453125, "learning_rate": 3.0009234878844395e-05, "loss": 0.2882, "step": 7250 }, { "epoch": 7.723257051623204, "grad_norm": 0.609375, "learning_rate": 2.9876600547532808e-05, "loss": 0.2821, "step": 7255 }, { "epoch": 7.728579031399681, "grad_norm": 0.59765625, "learning_rate": 2.9744208478828763e-05, "loss": 0.2919, "step": 7260 }, { "epoch": 7.733901011176157, "grad_norm": 0.63671875, "learning_rate": 2.961205913012095e-05, "loss": 0.2859, "step": 7265 }, { "epoch": 7.739222990952634, "grad_norm": 0.609375, "learning_rate": 2.9480152957959408e-05, "loss": 0.2995, "step": 7270 }, { "epoch": 7.744544970729111, "grad_norm": 0.65625, "learning_rate": 2.9348490418054132e-05, "loss": 0.2924, "step": 7275 }, { "epoch": 7.749866950505588, "grad_norm": 0.62109375, "learning_rate": 2.921707196527349e-05, "loss": 0.2915, "step": 7280 }, { "epoch": 7.755188930282065, "grad_norm": 0.671875, "learning_rate": 2.9085898053642414e-05, "loss": 0.3016, "step": 7285 }, { "epoch": 7.760510910058541, "grad_norm": 0.7578125, "learning_rate": 2.895496913634107e-05, "loss": 0.2886, "step": 7290 }, { "epoch": 7.765832889835019, "grad_norm": 0.68359375, "learning_rate": 2.8824285665703254e-05, "loss": 0.2859, "step": 7295 }, { "epoch": 7.771154869611496, "grad_norm": 0.65625, "learning_rate": 2.8693848093214727e-05, "loss": 0.2944, "step": 7300 }, { "epoch": 7.776476849387972, "grad_norm": 0.73828125, "learning_rate": 2.85636568695118e-05, "loss": 0.2911, "step": 7305 }, { "epoch": 7.781798829164449, "grad_norm": 0.6484375, "learning_rate": 2.8433712444379567e-05, "loss": 0.2874, "step": 7310 }, { "epoch": 7.7871208089409265, "grad_norm": 0.5625, "learning_rate": 2.8304015266750573e-05, "loss": 0.288, "step": 7315 }, { "epoch": 7.792442788717403, "grad_norm": 0.58203125, "learning_rate": 2.8174565784703246e-05, "loss": 0.2948, "step": 7320 }, { "epoch": 7.79776476849388, "grad_norm": 0.64453125, "learning_rate": 2.8045364445460088e-05, "loss": 0.2913, "step": 7325 }, { "epoch": 7.803086748270356, "grad_norm": 0.6640625, "learning_rate": 2.7916411695386446e-05, "loss": 0.2903, "step": 7330 }, { "epoch": 7.8084087280468335, "grad_norm": 0.67578125, "learning_rate": 2.778770797998882e-05, "loss": 0.2892, "step": 7335 }, { "epoch": 7.813730707823311, "grad_norm": 0.59375, "learning_rate": 2.765925374391334e-05, "loss": 0.2974, "step": 7340 }, { "epoch": 7.819052687599787, "grad_norm": 0.59765625, "learning_rate": 2.7531049430944223e-05, "loss": 0.2925, "step": 7345 }, { "epoch": 7.824374667376264, "grad_norm": 0.625, "learning_rate": 2.7403095484002227e-05, "loss": 0.2853, "step": 7350 }, { "epoch": 7.8296966471527405, "grad_norm": 0.5703125, "learning_rate": 2.727539234514316e-05, "loss": 0.2867, "step": 7355 }, { "epoch": 7.835018626929218, "grad_norm": 0.609375, "learning_rate": 2.714794045555643e-05, "loss": 0.2991, "step": 7360 }, { "epoch": 7.840340606705695, "grad_norm": 0.66796875, "learning_rate": 2.702074025556327e-05, "loss": 0.2899, "step": 7365 }, { "epoch": 7.845662586482171, "grad_norm": 0.62890625, "learning_rate": 2.6893792184615484e-05, "loss": 0.2922, "step": 7370 }, { "epoch": 7.850984566258648, "grad_norm": 0.67578125, "learning_rate": 2.676709668129378e-05, "loss": 0.2828, "step": 7375 }, { "epoch": 7.856306546035125, "grad_norm": 0.58203125, "learning_rate": 2.664065418330629e-05, "loss": 0.2926, "step": 7380 }, { "epoch": 7.861628525811602, "grad_norm": 0.73046875, "learning_rate": 2.6514465127487098e-05, "loss": 0.2904, "step": 7385 }, { "epoch": 7.866950505588079, "grad_norm": 0.64453125, "learning_rate": 2.6388529949794682e-05, "loss": 0.2911, "step": 7390 }, { "epoch": 7.872272485364555, "grad_norm": 0.69921875, "learning_rate": 2.6262849085310336e-05, "loss": 0.2905, "step": 7395 }, { "epoch": 7.877594465141033, "grad_norm": 0.6328125, "learning_rate": 2.6137422968236923e-05, "loss": 0.2969, "step": 7400 }, { "epoch": 7.882916444917509, "grad_norm": 0.66796875, "learning_rate": 2.6012252031897032e-05, "loss": 0.2999, "step": 7405 }, { "epoch": 7.888238424693986, "grad_norm": 0.62890625, "learning_rate": 2.5887336708731756e-05, "loss": 0.2861, "step": 7410 }, { "epoch": 7.893560404470463, "grad_norm": 0.61328125, "learning_rate": 2.5762677430299063e-05, "loss": 0.292, "step": 7415 }, { "epoch": 7.89888238424694, "grad_norm": 0.59765625, "learning_rate": 2.563827462727234e-05, "loss": 0.2922, "step": 7420 }, { "epoch": 7.904204364023417, "grad_norm": 0.6484375, "learning_rate": 2.5514128729438892e-05, "loss": 0.2968, "step": 7425 }, { "epoch": 7.909526343799894, "grad_norm": 0.6640625, "learning_rate": 2.539024016569851e-05, "loss": 0.2962, "step": 7430 }, { "epoch": 7.91484832357637, "grad_norm": 0.59765625, "learning_rate": 2.5266609364061823e-05, "loss": 0.281, "step": 7435 }, { "epoch": 7.920170303352847, "grad_norm": 0.640625, "learning_rate": 2.5143236751649125e-05, "loss": 0.2981, "step": 7440 }, { "epoch": 7.925492283129324, "grad_norm": 0.67578125, "learning_rate": 2.5020122754688614e-05, "loss": 0.2922, "step": 7445 }, { "epoch": 7.930814262905801, "grad_norm": 0.6015625, "learning_rate": 2.4897267798514967e-05, "loss": 0.2979, "step": 7450 }, { "epoch": 7.936136242682278, "grad_norm": 0.57421875, "learning_rate": 2.4774672307568036e-05, "loss": 0.2936, "step": 7455 }, { "epoch": 7.941458222458754, "grad_norm": 0.63671875, "learning_rate": 2.465233670539121e-05, "loss": 0.2962, "step": 7460 }, { "epoch": 7.946780202235232, "grad_norm": 0.6875, "learning_rate": 2.453026141463003e-05, "loss": 0.2965, "step": 7465 }, { "epoch": 7.952102182011709, "grad_norm": 0.59375, "learning_rate": 2.4408446857030752e-05, "loss": 0.2951, "step": 7470 }, { "epoch": 7.957424161788185, "grad_norm": 0.6640625, "learning_rate": 2.428689345343871e-05, "loss": 0.2946, "step": 7475 }, { "epoch": 7.962746141564662, "grad_norm": 0.65234375, "learning_rate": 2.4165601623797184e-05, "loss": 0.2916, "step": 7480 }, { "epoch": 7.968068121341139, "grad_norm": 0.5703125, "learning_rate": 2.4044571787145686e-05, "loss": 0.2904, "step": 7485 }, { "epoch": 7.973390101117616, "grad_norm": 0.55859375, "learning_rate": 2.3923804361618552e-05, "loss": 0.2876, "step": 7490 }, { "epoch": 7.978712080894093, "grad_norm": 0.65234375, "learning_rate": 2.3803299764443576e-05, "loss": 0.2979, "step": 7495 }, { "epoch": 7.984034060670569, "grad_norm": 0.69921875, "learning_rate": 2.368305841194056e-05, "loss": 0.2954, "step": 7500 }, { "epoch": 7.9893560404470465, "grad_norm": 0.67578125, "learning_rate": 2.3563080719519815e-05, "loss": 0.284, "step": 7505 }, { "epoch": 7.994678020223523, "grad_norm": 0.6484375, "learning_rate": 2.344336710168078e-05, "loss": 0.2959, "step": 7510 }, { "epoch": 8.0, "grad_norm": 0.55859375, "learning_rate": 2.33239179720105e-05, "loss": 0.2962, "step": 7515 }, { "epoch": 8.0, "eval_loss": 3.6134331226348877, "eval_runtime": 11.8084, "eval_samples_per_second": 6.267, "eval_steps_per_second": 0.593, "step": 7515 }, { "epoch": 8.005321979776477, "grad_norm": 0.625, "learning_rate": 2.320473374318233e-05, "loss": 0.272, "step": 7520 }, { "epoch": 8.010643959552954, "grad_norm": 0.546875, "learning_rate": 2.30858148269545e-05, "loss": 0.278, "step": 7525 }, { "epoch": 8.01596593932943, "grad_norm": 0.6015625, "learning_rate": 2.2967161634168488e-05, "loss": 0.2694, "step": 7530 }, { "epoch": 8.021287919105907, "grad_norm": 0.57421875, "learning_rate": 2.2848774574747867e-05, "loss": 0.2697, "step": 7535 }, { "epoch": 8.026609898882384, "grad_norm": 0.6015625, "learning_rate": 2.2730654057696733e-05, "loss": 0.2708, "step": 7540 }, { "epoch": 8.031931878658861, "grad_norm": 0.55078125, "learning_rate": 2.2612800491098317e-05, "loss": 0.2705, "step": 7545 }, { "epoch": 8.037253858435339, "grad_norm": 0.58984375, "learning_rate": 2.249521428211363e-05, "loss": 0.281, "step": 7550 }, { "epoch": 8.042575838211814, "grad_norm": 0.66015625, "learning_rate": 2.237789583697999e-05, "loss": 0.2694, "step": 7555 }, { "epoch": 8.047897817988291, "grad_norm": 0.5859375, "learning_rate": 2.226084556100957e-05, "loss": 0.2743, "step": 7560 }, { "epoch": 8.053219797764768, "grad_norm": 0.5859375, "learning_rate": 2.2144063858588248e-05, "loss": 0.2789, "step": 7565 }, { "epoch": 8.058541777541246, "grad_norm": 0.66015625, "learning_rate": 2.2027551133173852e-05, "loss": 0.2779, "step": 7570 }, { "epoch": 8.063863757317723, "grad_norm": 0.6171875, "learning_rate": 2.1911307787295055e-05, "loss": 0.28, "step": 7575 }, { "epoch": 8.069185737094198, "grad_norm": 0.546875, "learning_rate": 2.1795334222549824e-05, "loss": 0.28, "step": 7580 }, { "epoch": 8.074507716870675, "grad_norm": 0.57421875, "learning_rate": 2.1679630839604126e-05, "loss": 0.282, "step": 7585 }, { "epoch": 8.079829696647153, "grad_norm": 0.60546875, "learning_rate": 2.1564198038190477e-05, "loss": 0.2716, "step": 7590 }, { "epoch": 8.08515167642363, "grad_norm": 0.63671875, "learning_rate": 2.1449036217106612e-05, "loss": 0.2684, "step": 7595 }, { "epoch": 8.090473656200107, "grad_norm": 0.58984375, "learning_rate": 2.1334145774213988e-05, "loss": 0.2761, "step": 7600 }, { "epoch": 8.095795635976584, "grad_norm": 0.65234375, "learning_rate": 2.1219527106436676e-05, "loss": 0.2792, "step": 7605 }, { "epoch": 8.10111761575306, "grad_norm": 0.6484375, "learning_rate": 2.1105180609759646e-05, "loss": 0.2856, "step": 7610 }, { "epoch": 8.106439595529537, "grad_norm": 0.6171875, "learning_rate": 2.0991106679227666e-05, "loss": 0.2739, "step": 7615 }, { "epoch": 8.111761575306014, "grad_norm": 0.6328125, "learning_rate": 2.0877305708943827e-05, "loss": 0.2799, "step": 7620 }, { "epoch": 8.117083555082491, "grad_norm": 0.63671875, "learning_rate": 2.0763778092068188e-05, "loss": 0.2754, "step": 7625 }, { "epoch": 8.122405534858968, "grad_norm": 0.66015625, "learning_rate": 2.0650524220816404e-05, "loss": 0.277, "step": 7630 }, { "epoch": 8.127727514635444, "grad_norm": 0.6328125, "learning_rate": 2.053754448645846e-05, "loss": 0.276, "step": 7635 }, { "epoch": 8.133049494411921, "grad_norm": 0.58203125, "learning_rate": 2.042483927931711e-05, "loss": 0.2717, "step": 7640 }, { "epoch": 8.138371474188398, "grad_norm": 0.72265625, "learning_rate": 2.0312408988766852e-05, "loss": 0.2708, "step": 7645 }, { "epoch": 8.143693453964875, "grad_norm": 0.53125, "learning_rate": 2.0200254003232323e-05, "loss": 0.277, "step": 7650 }, { "epoch": 8.149015433741353, "grad_norm": 0.546875, "learning_rate": 2.0088374710186963e-05, "loss": 0.2704, "step": 7655 }, { "epoch": 8.154337413517828, "grad_norm": 0.62890625, "learning_rate": 1.9976771496151835e-05, "loss": 0.278, "step": 7660 }, { "epoch": 8.159659393294305, "grad_norm": 0.5546875, "learning_rate": 1.9865444746694195e-05, "loss": 0.2735, "step": 7665 }, { "epoch": 8.164981373070782, "grad_norm": 0.66015625, "learning_rate": 1.975439484642614e-05, "loss": 0.2783, "step": 7670 }, { "epoch": 8.17030335284726, "grad_norm": 0.5625, "learning_rate": 1.964362217900334e-05, "loss": 0.2794, "step": 7675 }, { "epoch": 8.175625332623737, "grad_norm": 0.59375, "learning_rate": 1.9533127127123597e-05, "loss": 0.2721, "step": 7680 }, { "epoch": 8.180947312400212, "grad_norm": 0.671875, "learning_rate": 1.9422910072525747e-05, "loss": 0.2745, "step": 7685 }, { "epoch": 8.18626929217669, "grad_norm": 0.6171875, "learning_rate": 1.93129713959881e-05, "loss": 0.2739, "step": 7690 }, { "epoch": 8.191591271953166, "grad_norm": 0.6484375, "learning_rate": 1.9203311477327223e-05, "loss": 0.2775, "step": 7695 }, { "epoch": 8.196913251729644, "grad_norm": 0.62890625, "learning_rate": 1.9093930695396634e-05, "loss": 0.2738, "step": 7700 }, { "epoch": 8.20223523150612, "grad_norm": 0.62109375, "learning_rate": 1.8984829428085584e-05, "loss": 0.2782, "step": 7705 }, { "epoch": 8.207557211282596, "grad_norm": 0.578125, "learning_rate": 1.8876008052317507e-05, "loss": 0.2855, "step": 7710 }, { "epoch": 8.212879191059073, "grad_norm": 0.54296875, "learning_rate": 1.8767466944048984e-05, "loss": 0.275, "step": 7715 }, { "epoch": 8.21820117083555, "grad_norm": 0.58984375, "learning_rate": 1.865920647826821e-05, "loss": 0.2687, "step": 7720 }, { "epoch": 8.223523150612028, "grad_norm": 0.65625, "learning_rate": 1.8551227028993955e-05, "loss": 0.2721, "step": 7725 }, { "epoch": 8.228845130388505, "grad_norm": 0.6484375, "learning_rate": 1.844352896927407e-05, "loss": 0.2799, "step": 7730 }, { "epoch": 8.234167110164982, "grad_norm": 0.61328125, "learning_rate": 1.8336112671184193e-05, "loss": 0.2787, "step": 7735 }, { "epoch": 8.239489089941458, "grad_norm": 0.5703125, "learning_rate": 1.822897850582659e-05, "loss": 0.2723, "step": 7740 }, { "epoch": 8.244811069717935, "grad_norm": 0.640625, "learning_rate": 1.81221268433289e-05, "loss": 0.2895, "step": 7745 }, { "epoch": 8.250133049494412, "grad_norm": 0.6328125, "learning_rate": 1.8015558052842607e-05, "loss": 0.2811, "step": 7750 }, { "epoch": 8.25545502927089, "grad_norm": 0.578125, "learning_rate": 1.7909272502542017e-05, "loss": 0.2728, "step": 7755 }, { "epoch": 8.260777009047366, "grad_norm": 0.58203125, "learning_rate": 1.78032705596229e-05, "loss": 0.2751, "step": 7760 }, { "epoch": 8.266098988823842, "grad_norm": 0.625, "learning_rate": 1.769755259030119e-05, "loss": 0.2714, "step": 7765 }, { "epoch": 8.271420968600319, "grad_norm": 0.6171875, "learning_rate": 1.75921189598118e-05, "loss": 0.2764, "step": 7770 }, { "epoch": 8.276742948376796, "grad_norm": 0.63671875, "learning_rate": 1.74869700324072e-05, "loss": 0.274, "step": 7775 }, { "epoch": 8.282064928153273, "grad_norm": 0.7109375, "learning_rate": 1.7382106171356372e-05, "loss": 0.28, "step": 7780 }, { "epoch": 8.28738690792975, "grad_norm": 0.6953125, "learning_rate": 1.727752773894341e-05, "loss": 0.2835, "step": 7785 }, { "epoch": 8.292708887706226, "grad_norm": 0.59765625, "learning_rate": 1.717323509646631e-05, "loss": 0.2691, "step": 7790 }, { "epoch": 8.298030867482703, "grad_norm": 0.6640625, "learning_rate": 1.7069228604235722e-05, "loss": 0.2718, "step": 7795 }, { "epoch": 8.30335284725918, "grad_norm": 0.6171875, "learning_rate": 1.696550862157369e-05, "loss": 0.2724, "step": 7800 }, { "epoch": 8.308674827035658, "grad_norm": 0.5703125, "learning_rate": 1.6862075506812458e-05, "loss": 0.2788, "step": 7805 }, { "epoch": 8.313996806812135, "grad_norm": 0.578125, "learning_rate": 1.6758929617293152e-05, "loss": 0.2771, "step": 7810 }, { "epoch": 8.31931878658861, "grad_norm": 0.59375, "learning_rate": 1.665607130936465e-05, "loss": 0.2735, "step": 7815 }, { "epoch": 8.324640766365087, "grad_norm": 0.63671875, "learning_rate": 1.6553500938382205e-05, "loss": 0.2759, "step": 7820 }, { "epoch": 8.329962746141565, "grad_norm": 0.72265625, "learning_rate": 1.6451218858706374e-05, "loss": 0.2779, "step": 7825 }, { "epoch": 8.335284725918042, "grad_norm": 0.55078125, "learning_rate": 1.6349225423701697e-05, "loss": 0.2835, "step": 7830 }, { "epoch": 8.340606705694519, "grad_norm": 0.6015625, "learning_rate": 1.6247520985735542e-05, "loss": 0.2724, "step": 7835 }, { "epoch": 8.345928685470994, "grad_norm": 0.6015625, "learning_rate": 1.6146105896176787e-05, "loss": 0.2744, "step": 7840 }, { "epoch": 8.351250665247472, "grad_norm": 0.67578125, "learning_rate": 1.6044980505394702e-05, "loss": 0.286, "step": 7845 }, { "epoch": 8.356572645023949, "grad_norm": 0.6484375, "learning_rate": 1.5944145162757718e-05, "loss": 0.2833, "step": 7850 }, { "epoch": 8.361894624800426, "grad_norm": 0.59375, "learning_rate": 1.5843600216632205e-05, "loss": 0.2747, "step": 7855 }, { "epoch": 8.367216604576903, "grad_norm": 0.609375, "learning_rate": 1.5743346014381198e-05, "loss": 0.2796, "step": 7860 }, { "epoch": 8.372538584353379, "grad_norm": 0.63671875, "learning_rate": 1.5643382902363356e-05, "loss": 0.2764, "step": 7865 }, { "epoch": 8.377860564129856, "grad_norm": 0.73828125, "learning_rate": 1.5543711225931658e-05, "loss": 0.271, "step": 7870 }, { "epoch": 8.383182543906333, "grad_norm": 0.64453125, "learning_rate": 1.5444331329432203e-05, "loss": 0.2782, "step": 7875 }, { "epoch": 8.38850452368281, "grad_norm": 0.5390625, "learning_rate": 1.534524355620307e-05, "loss": 0.2641, "step": 7880 }, { "epoch": 8.393826503459287, "grad_norm": 0.578125, "learning_rate": 1.524644824857311e-05, "loss": 0.2661, "step": 7885 }, { "epoch": 8.399148483235763, "grad_norm": 0.64453125, "learning_rate": 1.5147945747860748e-05, "loss": 0.2841, "step": 7890 }, { "epoch": 8.40447046301224, "grad_norm": 0.63671875, "learning_rate": 1.504973639437286e-05, "loss": 0.2709, "step": 7895 }, { "epoch": 8.409792442788717, "grad_norm": 0.62109375, "learning_rate": 1.495182052740346e-05, "loss": 0.2764, "step": 7900 }, { "epoch": 8.415114422565194, "grad_norm": 0.578125, "learning_rate": 1.4854198485232696e-05, "loss": 0.2809, "step": 7905 }, { "epoch": 8.420436402341672, "grad_norm": 0.67578125, "learning_rate": 1.4756870605125673e-05, "loss": 0.279, "step": 7910 }, { "epoch": 8.425758382118149, "grad_norm": 0.71484375, "learning_rate": 1.4659837223331075e-05, "loss": 0.2712, "step": 7915 }, { "epoch": 8.431080361894624, "grad_norm": 0.68359375, "learning_rate": 1.4563098675080245e-05, "loss": 0.2769, "step": 7920 }, { "epoch": 8.436402341671101, "grad_norm": 0.65625, "learning_rate": 1.446665529458593e-05, "loss": 0.2833, "step": 7925 }, { "epoch": 8.441724321447579, "grad_norm": 0.609375, "learning_rate": 1.43705074150411e-05, "loss": 0.2766, "step": 7930 }, { "epoch": 8.447046301224056, "grad_norm": 0.5859375, "learning_rate": 1.4274655368617872e-05, "loss": 0.2791, "step": 7935 }, { "epoch": 8.452368281000533, "grad_norm": 0.5234375, "learning_rate": 1.4179099486466241e-05, "loss": 0.272, "step": 7940 }, { "epoch": 8.457690260777008, "grad_norm": 0.62109375, "learning_rate": 1.4083840098713064e-05, "loss": 0.2831, "step": 7945 }, { "epoch": 8.463012240553486, "grad_norm": 0.625, "learning_rate": 1.3988877534460932e-05, "loss": 0.2772, "step": 7950 }, { "epoch": 8.468334220329963, "grad_norm": 0.609375, "learning_rate": 1.3894212121786832e-05, "loss": 0.27, "step": 7955 }, { "epoch": 8.47365620010644, "grad_norm": 0.55859375, "learning_rate": 1.379984418774124e-05, "loss": 0.2682, "step": 7960 }, { "epoch": 8.478978179882917, "grad_norm": 0.625, "learning_rate": 1.3705774058346876e-05, "loss": 0.2759, "step": 7965 }, { "epoch": 8.484300159659393, "grad_norm": 0.6875, "learning_rate": 1.3612002058597617e-05, "loss": 0.2736, "step": 7970 }, { "epoch": 8.48962213943587, "grad_norm": 0.609375, "learning_rate": 1.3518528512457374e-05, "loss": 0.2738, "step": 7975 }, { "epoch": 8.494944119212347, "grad_norm": 0.67578125, "learning_rate": 1.3425353742858893e-05, "loss": 0.287, "step": 7980 }, { "epoch": 8.500266098988824, "grad_norm": 0.609375, "learning_rate": 1.3332478071702736e-05, "loss": 0.2659, "step": 7985 }, { "epoch": 8.505588078765301, "grad_norm": 0.734375, "learning_rate": 1.3239901819856237e-05, "loss": 0.2784, "step": 7990 }, { "epoch": 8.510910058541777, "grad_norm": 0.6796875, "learning_rate": 1.3147625307152123e-05, "loss": 0.2715, "step": 7995 }, { "epoch": 8.516232038318254, "grad_norm": 0.6171875, "learning_rate": 1.3055648852387714e-05, "loss": 0.2811, "step": 8000 }, { "epoch": 8.521554018094731, "grad_norm": 0.6328125, "learning_rate": 1.2963972773323618e-05, "loss": 0.2733, "step": 8005 }, { "epoch": 8.526875997871208, "grad_norm": 0.59765625, "learning_rate": 1.2872597386682749e-05, "loss": 0.2698, "step": 8010 }, { "epoch": 8.532197977647686, "grad_norm": 0.64453125, "learning_rate": 1.2781523008149154e-05, "loss": 0.282, "step": 8015 }, { "epoch": 8.537519957424163, "grad_norm": 0.73828125, "learning_rate": 1.2690749952366998e-05, "loss": 0.2688, "step": 8020 }, { "epoch": 8.542841937200638, "grad_norm": 0.6171875, "learning_rate": 1.2600278532939336e-05, "loss": 0.2776, "step": 8025 }, { "epoch": 8.548163916977115, "grad_norm": 0.65625, "learning_rate": 1.2510109062427289e-05, "loss": 0.2772, "step": 8030 }, { "epoch": 8.553485896753592, "grad_norm": 0.498046875, "learning_rate": 1.2420241852348647e-05, "loss": 0.2817, "step": 8035 }, { "epoch": 8.55880787653007, "grad_norm": 0.60546875, "learning_rate": 1.2330677213177033e-05, "loss": 0.2774, "step": 8040 }, { "epoch": 8.564129856306547, "grad_norm": 0.65234375, "learning_rate": 1.2241415454340731e-05, "loss": 0.2689, "step": 8045 }, { "epoch": 8.569451836083022, "grad_norm": 0.625, "learning_rate": 1.215245688422163e-05, "loss": 0.2781, "step": 8050 }, { "epoch": 8.5747738158595, "grad_norm": 0.7109375, "learning_rate": 1.2063801810154163e-05, "loss": 0.2874, "step": 8055 }, { "epoch": 8.580095795635977, "grad_norm": 0.6953125, "learning_rate": 1.1975450538424271e-05, "loss": 0.2846, "step": 8060 }, { "epoch": 8.585417775412454, "grad_norm": 0.58984375, "learning_rate": 1.1887403374268213e-05, "loss": 0.2783, "step": 8065 }, { "epoch": 8.590739755188931, "grad_norm": 0.59375, "learning_rate": 1.1799660621871788e-05, "loss": 0.2789, "step": 8070 }, { "epoch": 8.596061734965406, "grad_norm": 0.61328125, "learning_rate": 1.1712222584368948e-05, "loss": 0.2758, "step": 8075 }, { "epoch": 8.601383714741884, "grad_norm": 0.6796875, "learning_rate": 1.1625089563840996e-05, "loss": 0.2684, "step": 8080 }, { "epoch": 8.60670569451836, "grad_norm": 0.6796875, "learning_rate": 1.1538261861315469e-05, "loss": 0.2818, "step": 8085 }, { "epoch": 8.612027674294838, "grad_norm": 0.609375, "learning_rate": 1.1451739776765046e-05, "loss": 0.2689, "step": 8090 }, { "epoch": 8.617349654071315, "grad_norm": 0.5390625, "learning_rate": 1.13655236091066e-05, "loss": 0.2688, "step": 8095 }, { "epoch": 8.62267163384779, "grad_norm": 0.61328125, "learning_rate": 1.1279613656200106e-05, "loss": 0.2798, "step": 8100 }, { "epoch": 8.627993613624268, "grad_norm": 0.6171875, "learning_rate": 1.1194010214847561e-05, "loss": 0.2702, "step": 8105 }, { "epoch": 8.633315593400745, "grad_norm": 0.54296875, "learning_rate": 1.1108713580792152e-05, "loss": 0.2857, "step": 8110 }, { "epoch": 8.638637573177222, "grad_norm": 0.625, "learning_rate": 1.1023724048717032e-05, "loss": 0.2749, "step": 8115 }, { "epoch": 8.6439595529537, "grad_norm": 0.609375, "learning_rate": 1.0939041912244352e-05, "loss": 0.2681, "step": 8120 }, { "epoch": 8.649281532730175, "grad_norm": 0.71875, "learning_rate": 1.085466746393432e-05, "loss": 0.2811, "step": 8125 }, { "epoch": 8.654603512506652, "grad_norm": 0.55078125, "learning_rate": 1.077060099528412e-05, "loss": 0.2756, "step": 8130 }, { "epoch": 8.65992549228313, "grad_norm": 0.6171875, "learning_rate": 1.0686842796726936e-05, "loss": 0.2809, "step": 8135 }, { "epoch": 8.665247472059606, "grad_norm": 0.62890625, "learning_rate": 1.060339315763097e-05, "loss": 0.2767, "step": 8140 }, { "epoch": 8.670569451836084, "grad_norm": 0.71484375, "learning_rate": 1.05202523662983e-05, "loss": 0.28, "step": 8145 }, { "epoch": 8.675891431612559, "grad_norm": 0.6328125, "learning_rate": 1.043742070996413e-05, "loss": 0.2834, "step": 8150 }, { "epoch": 8.681213411389036, "grad_norm": 0.69140625, "learning_rate": 1.0354898474795615e-05, "loss": 0.2788, "step": 8155 }, { "epoch": 8.686535391165513, "grad_norm": 0.58203125, "learning_rate": 1.027268594589087e-05, "loss": 0.2687, "step": 8160 }, { "epoch": 8.69185737094199, "grad_norm": 0.55078125, "learning_rate": 1.0190783407278082e-05, "loss": 0.2691, "step": 8165 }, { "epoch": 8.697179350718468, "grad_norm": 0.54296875, "learning_rate": 1.0109191141914499e-05, "loss": 0.2673, "step": 8170 }, { "epoch": 8.702501330494943, "grad_norm": 0.57421875, "learning_rate": 1.00279094316854e-05, "loss": 0.2779, "step": 8175 }, { "epoch": 8.70782331027142, "grad_norm": 0.62109375, "learning_rate": 9.946938557403196e-06, "loss": 0.2729, "step": 8180 }, { "epoch": 8.713145290047898, "grad_norm": 0.6640625, "learning_rate": 9.86627879880634e-06, "loss": 0.2809, "step": 8185 }, { "epoch": 8.718467269824375, "grad_norm": 0.703125, "learning_rate": 9.785930434558532e-06, "loss": 0.2672, "step": 8190 }, { "epoch": 8.723789249600852, "grad_norm": 0.58984375, "learning_rate": 9.705893742247652e-06, "loss": 0.2737, "step": 8195 }, { "epoch": 8.72911122937733, "grad_norm": 0.62109375, "learning_rate": 9.626168998384755e-06, "loss": 0.283, "step": 8200 }, { "epoch": 8.734433209153805, "grad_norm": 0.67578125, "learning_rate": 9.546756478403219e-06, "loss": 0.2821, "step": 8205 }, { "epoch": 8.739755188930282, "grad_norm": 0.65625, "learning_rate": 9.467656456657748e-06, "loss": 0.2765, "step": 8210 }, { "epoch": 8.745077168706759, "grad_norm": 0.5859375, "learning_rate": 9.388869206423423e-06, "loss": 0.2726, "step": 8215 }, { "epoch": 8.750399148483236, "grad_norm": 0.6875, "learning_rate": 9.310394999894756e-06, "loss": 0.2776, "step": 8220 }, { "epoch": 8.755721128259713, "grad_norm": 0.53125, "learning_rate": 9.232234108184756e-06, "loss": 0.2866, "step": 8225 }, { "epoch": 8.761043108036189, "grad_norm": 0.609375, "learning_rate": 9.154386801324e-06, "loss": 0.2744, "step": 8230 }, { "epoch": 8.766365087812666, "grad_norm": 0.7734375, "learning_rate": 9.076853348259684e-06, "loss": 0.2788, "step": 8235 }, { "epoch": 8.771687067589143, "grad_norm": 0.6328125, "learning_rate": 8.999634016854686e-06, "loss": 0.2854, "step": 8240 }, { "epoch": 8.77700904736562, "grad_norm": 0.54296875, "learning_rate": 8.92272907388666e-06, "loss": 0.2712, "step": 8245 }, { "epoch": 8.782331027142098, "grad_norm": 0.5703125, "learning_rate": 8.846138785047143e-06, "loss": 0.2809, "step": 8250 }, { "epoch": 8.787653006918573, "grad_norm": 0.60546875, "learning_rate": 8.76986341494056e-06, "loss": 0.2791, "step": 8255 }, { "epoch": 8.79297498669505, "grad_norm": 0.66796875, "learning_rate": 8.693903227083389e-06, "loss": 0.2731, "step": 8260 }, { "epoch": 8.798296966471527, "grad_norm": 0.5625, "learning_rate": 8.618258483903174e-06, "loss": 0.2677, "step": 8265 }, { "epoch": 8.803618946248005, "grad_norm": 0.64453125, "learning_rate": 8.542929446737702e-06, "loss": 0.276, "step": 8270 }, { "epoch": 8.808940926024482, "grad_norm": 0.57421875, "learning_rate": 8.467916375834039e-06, "loss": 0.2743, "step": 8275 }, { "epoch": 8.814262905800957, "grad_norm": 0.578125, "learning_rate": 8.393219530347662e-06, "loss": 0.2722, "step": 8280 }, { "epoch": 8.819584885577434, "grad_norm": 0.61328125, "learning_rate": 8.318839168341497e-06, "loss": 0.2869, "step": 8285 }, { "epoch": 8.824906865353912, "grad_norm": 0.6875, "learning_rate": 8.244775546785156e-06, "loss": 0.2832, "step": 8290 }, { "epoch": 8.830228845130389, "grad_norm": 0.6484375, "learning_rate": 8.171028921553935e-06, "loss": 0.2841, "step": 8295 }, { "epoch": 8.835550824906866, "grad_norm": 0.65625, "learning_rate": 8.09759954742797e-06, "loss": 0.2826, "step": 8300 }, { "epoch": 8.840872804683343, "grad_norm": 0.58203125, "learning_rate": 8.024487678091374e-06, "loss": 0.2676, "step": 8305 }, { "epoch": 8.846194784459819, "grad_norm": 0.58984375, "learning_rate": 7.951693566131324e-06, "loss": 0.2792, "step": 8310 }, { "epoch": 8.851516764236296, "grad_norm": 0.625, "learning_rate": 7.879217463037214e-06, "loss": 0.2744, "step": 8315 }, { "epoch": 8.856838744012773, "grad_norm": 0.6171875, "learning_rate": 7.807059619199809e-06, "loss": 0.2783, "step": 8320 }, { "epoch": 8.86216072378925, "grad_norm": 0.62109375, "learning_rate": 7.735220283910284e-06, "loss": 0.2814, "step": 8325 }, { "epoch": 8.867482703565727, "grad_norm": 0.52734375, "learning_rate": 7.663699705359495e-06, "loss": 0.2698, "step": 8330 }, { "epoch": 8.872804683342203, "grad_norm": 0.62890625, "learning_rate": 7.592498130637005e-06, "loss": 0.2775, "step": 8335 }, { "epoch": 8.87812666311868, "grad_norm": 0.58984375, "learning_rate": 7.521615805730331e-06, "loss": 0.271, "step": 8340 }, { "epoch": 8.883448642895157, "grad_norm": 0.70703125, "learning_rate": 7.451052975524009e-06, "loss": 0.2772, "step": 8345 }, { "epoch": 8.888770622671634, "grad_norm": 0.65625, "learning_rate": 7.3808098837988004e-06, "loss": 0.2683, "step": 8350 }, { "epoch": 8.894092602448112, "grad_norm": 0.5390625, "learning_rate": 7.310886773230807e-06, "loss": 0.2783, "step": 8355 }, { "epoch": 8.899414582224587, "grad_norm": 0.625, "learning_rate": 7.241283885390715e-06, "loss": 0.2724, "step": 8360 }, { "epoch": 8.904736562001064, "grad_norm": 0.64453125, "learning_rate": 7.1720014607428275e-06, "loss": 0.2736, "step": 8365 }, { "epoch": 8.910058541777541, "grad_norm": 0.62890625, "learning_rate": 7.103039738644346e-06, "loss": 0.2754, "step": 8370 }, { "epoch": 8.915380521554019, "grad_norm": 0.5546875, "learning_rate": 7.034398957344557e-06, "loss": 0.2662, "step": 8375 }, { "epoch": 8.920702501330496, "grad_norm": 0.61328125, "learning_rate": 6.966079353983879e-06, "loss": 0.2817, "step": 8380 }, { "epoch": 8.926024481106971, "grad_norm": 0.58203125, "learning_rate": 6.898081164593151e-06, "loss": 0.2783, "step": 8385 }, { "epoch": 8.931346460883448, "grad_norm": 0.6875, "learning_rate": 6.8304046240928125e-06, "loss": 0.2794, "step": 8390 }, { "epoch": 8.936668440659925, "grad_norm": 0.6484375, "learning_rate": 6.763049966292046e-06, "loss": 0.2713, "step": 8395 }, { "epoch": 8.941990420436403, "grad_norm": 0.546875, "learning_rate": 6.6960174238880125e-06, "loss": 0.2748, "step": 8400 }, { "epoch": 8.94731240021288, "grad_norm": 0.61328125, "learning_rate": 6.629307228464987e-06, "loss": 0.2892, "step": 8405 }, { "epoch": 8.952634379989355, "grad_norm": 0.625, "learning_rate": 6.562919610493623e-06, "loss": 0.274, "step": 8410 }, { "epoch": 8.957956359765832, "grad_norm": 0.63671875, "learning_rate": 6.4968547993301875e-06, "loss": 0.2834, "step": 8415 }, { "epoch": 8.96327833954231, "grad_norm": 0.578125, "learning_rate": 6.431113023215607e-06, "loss": 0.2627, "step": 8420 }, { "epoch": 8.968600319318787, "grad_norm": 0.6171875, "learning_rate": 6.365694509274867e-06, "loss": 0.279, "step": 8425 }, { "epoch": 8.973922299095264, "grad_norm": 0.57421875, "learning_rate": 6.300599483516101e-06, "loss": 0.2765, "step": 8430 }, { "epoch": 8.97924427887174, "grad_norm": 0.54296875, "learning_rate": 6.235828170829871e-06, "loss": 0.2788, "step": 8435 }, { "epoch": 8.984566258648217, "grad_norm": 0.578125, "learning_rate": 6.171380794988401e-06, "loss": 0.2753, "step": 8440 }, { "epoch": 8.989888238424694, "grad_norm": 0.67578125, "learning_rate": 6.107257578644721e-06, "loss": 0.2905, "step": 8445 }, { "epoch": 8.995210218201171, "grad_norm": 0.640625, "learning_rate": 6.043458743331965e-06, "loss": 0.284, "step": 8450 }, { "epoch": 8.999467802022352, "eval_loss": 3.8100473880767822, "eval_runtime": 11.9757, "eval_samples_per_second": 6.179, "eval_steps_per_second": 0.585, "step": 8454 }, { "epoch": 9.000532197977648, "grad_norm": 0.73046875, "learning_rate": 5.979984509462655e-06, "loss": 0.2773, "step": 8455 }, { "epoch": 9.005854177754124, "grad_norm": 0.56640625, "learning_rate": 5.916835096327811e-06, "loss": 0.2717, "step": 8460 }, { "epoch": 9.0111761575306, "grad_norm": 0.59375, "learning_rate": 5.85401072209627e-06, "loss": 0.2693, "step": 8465 }, { "epoch": 9.016498137307078, "grad_norm": 0.55078125, "learning_rate": 5.791511603813971e-06, "loss": 0.2757, "step": 8470 }, { "epoch": 9.021820117083555, "grad_norm": 0.68359375, "learning_rate": 5.729337957403092e-06, "loss": 0.2773, "step": 8475 }, { "epoch": 9.027142096860032, "grad_norm": 0.66015625, "learning_rate": 5.667489997661424e-06, "loss": 0.2801, "step": 8480 }, { "epoch": 9.03246407663651, "grad_norm": 0.609375, "learning_rate": 5.605967938261547e-06, "loss": 0.2742, "step": 8485 }, { "epoch": 9.037786056412985, "grad_norm": 0.5859375, "learning_rate": 5.544771991750097e-06, "loss": 0.2689, "step": 8490 }, { "epoch": 9.043108036189462, "grad_norm": 0.625, "learning_rate": 5.483902369547123e-06, "loss": 0.2713, "step": 8495 }, { "epoch": 9.04843001596594, "grad_norm": 0.61328125, "learning_rate": 5.423359281945217e-06, "loss": 0.2791, "step": 8500 }, { "epoch": 9.053751995742417, "grad_norm": 0.59375, "learning_rate": 5.3631429381089005e-06, "loss": 0.2774, "step": 8505 }, { "epoch": 9.059073975518894, "grad_norm": 0.58203125, "learning_rate": 5.303253546073828e-06, "loss": 0.2612, "step": 8510 }, { "epoch": 9.06439595529537, "grad_norm": 0.5234375, "learning_rate": 5.243691312746124e-06, "loss": 0.28, "step": 8515 }, { "epoch": 9.069717935071846, "grad_norm": 0.6171875, "learning_rate": 5.184456443901642e-06, "loss": 0.2761, "step": 8520 }, { "epoch": 9.075039914848324, "grad_norm": 0.66015625, "learning_rate": 5.1255491441852595e-06, "loss": 0.269, "step": 8525 }, { "epoch": 9.0803618946248, "grad_norm": 0.546875, "learning_rate": 5.066969617110118e-06, "loss": 0.273, "step": 8530 }, { "epoch": 9.085683874401278, "grad_norm": 0.64453125, "learning_rate": 5.008718065057039e-06, "loss": 0.2741, "step": 8535 }, { "epoch": 9.091005854177753, "grad_norm": 0.5703125, "learning_rate": 4.95079468927373e-06, "loss": 0.273, "step": 8540 }, { "epoch": 9.09632783395423, "grad_norm": 0.6015625, "learning_rate": 4.893199689874084e-06, "loss": 0.2809, "step": 8545 }, { "epoch": 9.101649813730708, "grad_norm": 0.62890625, "learning_rate": 4.835933265837544e-06, "loss": 0.2685, "step": 8550 }, { "epoch": 9.106971793507185, "grad_norm": 0.69140625, "learning_rate": 4.778995615008397e-06, "loss": 0.2691, "step": 8555 }, { "epoch": 9.112293773283662, "grad_norm": 0.58203125, "learning_rate": 4.7223869340950686e-06, "loss": 0.2859, "step": 8560 }, { "epoch": 9.117615753060138, "grad_norm": 0.6640625, "learning_rate": 4.666107418669474e-06, "loss": 0.2741, "step": 8565 }, { "epoch": 9.122937732836615, "grad_norm": 0.71875, "learning_rate": 4.61015726316627e-06, "loss": 0.2805, "step": 8570 }, { "epoch": 9.128259712613092, "grad_norm": 0.61328125, "learning_rate": 4.554536660882325e-06, "loss": 0.2852, "step": 8575 }, { "epoch": 9.13358169238957, "grad_norm": 0.58984375, "learning_rate": 4.499245803975926e-06, "loss": 0.269, "step": 8580 }, { "epoch": 9.138903672166046, "grad_norm": 0.5546875, "learning_rate": 4.444284883466121e-06, "loss": 0.2676, "step": 8585 }, { "epoch": 9.144225651942522, "grad_norm": 0.6328125, "learning_rate": 4.389654089232142e-06, "loss": 0.2739, "step": 8590 }, { "epoch": 9.149547631718999, "grad_norm": 0.5703125, "learning_rate": 4.335353610012671e-06, "loss": 0.2704, "step": 8595 }, { "epoch": 9.154869611495476, "grad_norm": 0.625, "learning_rate": 4.281383633405245e-06, "loss": 0.2723, "step": 8600 }, { "epoch": 9.160191591271953, "grad_norm": 0.609375, "learning_rate": 4.227744345865548e-06, "loss": 0.2816, "step": 8605 }, { "epoch": 9.16551357104843, "grad_norm": 0.5625, "learning_rate": 4.1744359327067885e-06, "loss": 0.2717, "step": 8610 }, { "epoch": 9.170835550824908, "grad_norm": 0.6484375, "learning_rate": 4.121458578099125e-06, "loss": 0.2753, "step": 8615 }, { "epoch": 9.176157530601383, "grad_norm": 0.6171875, "learning_rate": 4.068812465068939e-06, "loss": 0.2772, "step": 8620 }, { "epoch": 9.18147951037786, "grad_norm": 0.51171875, "learning_rate": 4.016497775498207e-06, "loss": 0.2809, "step": 8625 }, { "epoch": 9.186801490154338, "grad_norm": 0.6328125, "learning_rate": 3.964514690123955e-06, "loss": 0.2828, "step": 8630 }, { "epoch": 9.192123469930815, "grad_norm": 0.5859375, "learning_rate": 3.912863388537558e-06, "loss": 0.286, "step": 8635 }, { "epoch": 9.197445449707292, "grad_norm": 0.65625, "learning_rate": 3.86154404918414e-06, "loss": 0.2716, "step": 8640 }, { "epoch": 9.202767429483767, "grad_norm": 0.57421875, "learning_rate": 3.810556849361968e-06, "loss": 0.2656, "step": 8645 }, { "epoch": 9.208089409260245, "grad_norm": 0.5703125, "learning_rate": 3.7599019652218125e-06, "loss": 0.2699, "step": 8650 }, { "epoch": 9.213411389036722, "grad_norm": 0.55078125, "learning_rate": 3.7095795717663862e-06, "loss": 0.263, "step": 8655 }, { "epoch": 9.218733368813199, "grad_norm": 0.55859375, "learning_rate": 3.659589842849731e-06, "loss": 0.2712, "step": 8660 }, { "epoch": 9.224055348589676, "grad_norm": 0.6171875, "learning_rate": 3.609932951176509e-06, "loss": 0.2759, "step": 8665 }, { "epoch": 9.229377328366152, "grad_norm": 0.5859375, "learning_rate": 3.5606090683015904e-06, "loss": 0.2742, "step": 8670 }, { "epoch": 9.234699308142629, "grad_norm": 0.65234375, "learning_rate": 3.511618364629321e-06, "loss": 0.2622, "step": 8675 }, { "epoch": 9.240021287919106, "grad_norm": 0.5859375, "learning_rate": 3.4629610094129794e-06, "loss": 0.2737, "step": 8680 }, { "epoch": 9.245343267695583, "grad_norm": 0.5703125, "learning_rate": 3.414637170754198e-06, "loss": 0.2706, "step": 8685 }, { "epoch": 9.25066524747206, "grad_norm": 0.6015625, "learning_rate": 3.3666470156023866e-06, "loss": 0.2775, "step": 8690 }, { "epoch": 9.255987227248536, "grad_norm": 0.62109375, "learning_rate": 3.3189907097540884e-06, "loss": 0.279, "step": 8695 }, { "epoch": 9.261309207025013, "grad_norm": 0.58203125, "learning_rate": 3.271668417852569e-06, "loss": 0.2766, "step": 8700 }, { "epoch": 9.26663118680149, "grad_norm": 0.640625, "learning_rate": 3.224680303387018e-06, "loss": 0.2715, "step": 8705 }, { "epoch": 9.271953166577967, "grad_norm": 0.6015625, "learning_rate": 3.1780265286921796e-06, "loss": 0.2726, "step": 8710 }, { "epoch": 9.277275146354445, "grad_norm": 0.73828125, "learning_rate": 3.1317072549477243e-06, "loss": 0.2723, "step": 8715 }, { "epoch": 9.28259712613092, "grad_norm": 0.5703125, "learning_rate": 3.0857226421776462e-06, "loss": 0.2742, "step": 8720 }, { "epoch": 9.287919105907397, "grad_norm": 0.6953125, "learning_rate": 3.0400728492497866e-06, "loss": 0.2733, "step": 8725 }, { "epoch": 9.293241085683874, "grad_norm": 0.67578125, "learning_rate": 2.9947580338752555e-06, "loss": 0.2837, "step": 8730 }, { "epoch": 9.298563065460351, "grad_norm": 0.75, "learning_rate": 2.9497783526078126e-06, "loss": 0.2812, "step": 8735 }, { "epoch": 9.303885045236829, "grad_norm": 0.6484375, "learning_rate": 2.9051339608434867e-06, "loss": 0.2695, "step": 8740 }, { "epoch": 9.309207025013304, "grad_norm": 0.625, "learning_rate": 2.860825012819912e-06, "loss": 0.27, "step": 8745 }, { "epoch": 9.314529004789781, "grad_norm": 0.59765625, "learning_rate": 2.8168516616157823e-06, "loss": 0.2642, "step": 8750 }, { "epoch": 9.319850984566258, "grad_norm": 0.63671875, "learning_rate": 2.7732140591504418e-06, "loss": 0.2722, "step": 8755 }, { "epoch": 9.325172964342736, "grad_norm": 0.64453125, "learning_rate": 2.7299123561832397e-06, "loss": 0.2641, "step": 8760 }, { "epoch": 9.330494944119213, "grad_norm": 0.62109375, "learning_rate": 2.686946702313098e-06, "loss": 0.2744, "step": 8765 }, { "epoch": 9.335816923895688, "grad_norm": 0.6171875, "learning_rate": 2.6443172459779233e-06, "loss": 0.27, "step": 8770 }, { "epoch": 9.341138903672165, "grad_norm": 0.61328125, "learning_rate": 2.6020241344541176e-06, "loss": 0.2773, "step": 8775 }, { "epoch": 9.346460883448643, "grad_norm": 0.61328125, "learning_rate": 2.5600675138561238e-06, "loss": 0.2763, "step": 8780 }, { "epoch": 9.35178286322512, "grad_norm": 0.57421875, "learning_rate": 2.518447529135848e-06, "loss": 0.2691, "step": 8785 }, { "epoch": 9.357104843001597, "grad_norm": 0.546875, "learning_rate": 2.4771643240821706e-06, "loss": 0.2728, "step": 8790 }, { "epoch": 9.362426822778074, "grad_norm": 0.5859375, "learning_rate": 2.4362180413204704e-06, "loss": 0.2755, "step": 8795 }, { "epoch": 9.36774880255455, "grad_norm": 0.57421875, "learning_rate": 2.395608822312134e-06, "loss": 0.2685, "step": 8800 }, { "epoch": 9.373070782331027, "grad_norm": 0.5859375, "learning_rate": 2.3553368073540694e-06, "loss": 0.2698, "step": 8805 }, { "epoch": 9.378392762107504, "grad_norm": 0.609375, "learning_rate": 2.3154021355782153e-06, "loss": 0.2753, "step": 8810 }, { "epoch": 9.383714741883981, "grad_norm": 0.55859375, "learning_rate": 2.2758049449509876e-06, "loss": 0.2807, "step": 8815 }, { "epoch": 9.389036721660458, "grad_norm": 0.5625, "learning_rate": 2.236545372272958e-06, "loss": 0.2741, "step": 8820 }, { "epoch": 9.394358701436934, "grad_norm": 0.61328125, "learning_rate": 2.197623553178274e-06, "loss": 0.2714, "step": 8825 }, { "epoch": 9.399680681213411, "grad_norm": 0.58984375, "learning_rate": 2.1590396221341623e-06, "loss": 0.273, "step": 8830 }, { "epoch": 9.405002660989888, "grad_norm": 0.58203125, "learning_rate": 2.1207937124405496e-06, "loss": 0.2648, "step": 8835 }, { "epoch": 9.410324640766365, "grad_norm": 0.578125, "learning_rate": 2.0828859562295856e-06, "loss": 0.2745, "step": 8840 }, { "epoch": 9.415646620542843, "grad_norm": 0.56640625, "learning_rate": 2.045316484465121e-06, "loss": 0.2718, "step": 8845 }, { "epoch": 9.420968600319318, "grad_norm": 0.71484375, "learning_rate": 2.00808542694233e-06, "loss": 0.2675, "step": 8850 }, { "epoch": 9.426290580095795, "grad_norm": 0.57421875, "learning_rate": 1.971192912287201e-06, "loss": 0.2733, "step": 8855 }, { "epoch": 9.431612559872272, "grad_norm": 0.625, "learning_rate": 1.9346390679561787e-06, "loss": 0.2754, "step": 8860 }, { "epoch": 9.43693453964875, "grad_norm": 0.63671875, "learning_rate": 1.898424020235623e-06, "loss": 0.2742, "step": 8865 }, { "epoch": 9.442256519425227, "grad_norm": 0.57421875, "learning_rate": 1.8625478942414176e-06, "loss": 0.2717, "step": 8870 }, { "epoch": 9.447578499201702, "grad_norm": 0.6015625, "learning_rate": 1.8270108139185505e-06, "loss": 0.2689, "step": 8875 }, { "epoch": 9.45290047897818, "grad_norm": 0.55859375, "learning_rate": 1.7918129020406903e-06, "loss": 0.2692, "step": 8880 }, { "epoch": 9.458222458754657, "grad_norm": 0.56640625, "learning_rate": 1.7569542802097106e-06, "loss": 0.2749, "step": 8885 }, { "epoch": 9.463544438531134, "grad_norm": 0.5859375, "learning_rate": 1.7224350688553215e-06, "loss": 0.2853, "step": 8890 }, { "epoch": 9.468866418307611, "grad_norm": 0.609375, "learning_rate": 1.6882553872346386e-06, "loss": 0.2725, "step": 8895 }, { "epoch": 9.474188398084088, "grad_norm": 0.671875, "learning_rate": 1.6544153534317597e-06, "loss": 0.2698, "step": 8900 }, { "epoch": 9.479510377860564, "grad_norm": 0.60546875, "learning_rate": 1.6209150843573995e-06, "loss": 0.262, "step": 8905 }, { "epoch": 9.48483235763704, "grad_norm": 0.6171875, "learning_rate": 1.5877546957483891e-06, "loss": 0.2677, "step": 8910 }, { "epoch": 9.490154337413518, "grad_norm": 0.66015625, "learning_rate": 1.5549343021673767e-06, "loss": 0.269, "step": 8915 }, { "epoch": 9.495476317189995, "grad_norm": 0.6015625, "learning_rate": 1.5224540170024282e-06, "loss": 0.2724, "step": 8920 }, { "epoch": 9.500798296966472, "grad_norm": 0.578125, "learning_rate": 1.4903139524665376e-06, "loss": 0.2768, "step": 8925 }, { "epoch": 9.506120276742948, "grad_norm": 0.56640625, "learning_rate": 1.458514219597329e-06, "loss": 0.2667, "step": 8930 }, { "epoch": 9.511442256519425, "grad_norm": 0.62890625, "learning_rate": 1.4270549282566659e-06, "loss": 0.2785, "step": 8935 }, { "epoch": 9.516764236295902, "grad_norm": 0.58984375, "learning_rate": 1.39593618713022e-06, "loss": 0.2638, "step": 8940 }, { "epoch": 9.52208621607238, "grad_norm": 0.6328125, "learning_rate": 1.3651581037271598e-06, "loss": 0.2662, "step": 8945 }, { "epoch": 9.527408195848857, "grad_norm": 0.51953125, "learning_rate": 1.3347207843797394e-06, "loss": 0.2759, "step": 8950 }, { "epoch": 9.532730175625332, "grad_norm": 0.53125, "learning_rate": 1.30462433424291e-06, "loss": 0.2813, "step": 8955 }, { "epoch": 9.53805215540181, "grad_norm": 0.58203125, "learning_rate": 1.2748688572940205e-06, "loss": 0.2783, "step": 8960 }, { "epoch": 9.543374135178286, "grad_norm": 0.73046875, "learning_rate": 1.2454544563324288e-06, "loss": 0.2765, "step": 8965 }, { "epoch": 9.548696114954764, "grad_norm": 0.65234375, "learning_rate": 1.2163812329791025e-06, "loss": 0.2768, "step": 8970 }, { "epoch": 9.55401809473124, "grad_norm": 0.55078125, "learning_rate": 1.1876492876763624e-06, "loss": 0.2739, "step": 8975 }, { "epoch": 9.559340074507716, "grad_norm": 0.5859375, "learning_rate": 1.1592587196874284e-06, "loss": 0.2718, "step": 8980 }, { "epoch": 9.564662054284193, "grad_norm": 0.54296875, "learning_rate": 1.1312096270961526e-06, "loss": 0.2882, "step": 8985 }, { "epoch": 9.56998403406067, "grad_norm": 0.5546875, "learning_rate": 1.1035021068066753e-06, "loss": 0.2757, "step": 8990 }, { "epoch": 9.575306013837148, "grad_norm": 0.55859375, "learning_rate": 1.0761362545430254e-06, "loss": 0.2679, "step": 8995 }, { "epoch": 9.580627993613625, "grad_norm": 0.59765625, "learning_rate": 1.0491121648488644e-06, "loss": 0.2695, "step": 9000 }, { "epoch": 9.5859499733901, "grad_norm": 0.6328125, "learning_rate": 1.0224299310871544e-06, "loss": 0.2782, "step": 9005 }, { "epoch": 9.591271953166578, "grad_norm": 0.6015625, "learning_rate": 9.960896454397572e-07, "loss": 0.269, "step": 9010 }, { "epoch": 9.596593932943055, "grad_norm": 0.5859375, "learning_rate": 9.700913989072025e-07, "loss": 0.2754, "step": 9015 }, { "epoch": 9.601915912719532, "grad_norm": 0.62890625, "learning_rate": 9.444352813083534e-07, "loss": 0.2688, "step": 9020 }, { "epoch": 9.60723789249601, "grad_norm": 0.61328125, "learning_rate": 9.191213812800637e-07, "loss": 0.2764, "step": 9025 }, { "epoch": 9.612559872272485, "grad_norm": 0.6796875, "learning_rate": 8.941497862768988e-07, "loss": 0.2768, "step": 9030 }, { "epoch": 9.617881852048962, "grad_norm": 0.546875, "learning_rate": 8.695205825708153e-07, "loss": 0.2705, "step": 9035 }, { "epoch": 9.623203831825439, "grad_norm": 0.65234375, "learning_rate": 8.452338552509042e-07, "loss": 0.2702, "step": 9040 }, { "epoch": 9.628525811601916, "grad_norm": 0.62890625, "learning_rate": 8.212896882230481e-07, "loss": 0.2752, "step": 9045 }, { "epoch": 9.633847791378393, "grad_norm": 0.5859375, "learning_rate": 7.976881642096423e-07, "loss": 0.2821, "step": 9050 }, { "epoch": 9.639169771154869, "grad_norm": 0.60546875, "learning_rate": 7.744293647493406e-07, "loss": 0.2776, "step": 9055 }, { "epoch": 9.644491750931346, "grad_norm": 0.484375, "learning_rate": 7.515133701967325e-07, "loss": 0.2635, "step": 9060 }, { "epoch": 9.649813730707823, "grad_norm": 0.71484375, "learning_rate": 7.289402597220884e-07, "loss": 0.2742, "step": 9065 }, { "epoch": 9.6551357104843, "grad_norm": 0.5625, "learning_rate": 7.06710111311093e-07, "loss": 0.2695, "step": 9070 }, { "epoch": 9.660457690260777, "grad_norm": 0.625, "learning_rate": 6.848230017645452e-07, "loss": 0.2735, "step": 9075 }, { "epoch": 9.665779670037255, "grad_norm": 0.578125, "learning_rate": 6.632790066981142e-07, "loss": 0.2731, "step": 9080 }, { "epoch": 9.67110164981373, "grad_norm": 0.640625, "learning_rate": 6.420782005420956e-07, "loss": 0.2805, "step": 9085 }, { "epoch": 9.676423629590207, "grad_norm": 0.56640625, "learning_rate": 6.212206565411105e-07, "loss": 0.2675, "step": 9090 }, { "epoch": 9.681745609366684, "grad_norm": 0.55859375, "learning_rate": 6.007064467538959e-07, "loss": 0.273, "step": 9095 }, { "epoch": 9.687067589143162, "grad_norm": 0.6171875, "learning_rate": 5.805356420530151e-07, "loss": 0.2818, "step": 9100 }, { "epoch": 9.692389568919639, "grad_norm": 0.6953125, "learning_rate": 5.607083121246692e-07, "loss": 0.2797, "step": 9105 }, { "epoch": 9.697711548696114, "grad_norm": 0.63671875, "learning_rate": 5.412245254683868e-07, "loss": 0.2735, "step": 9110 }, { "epoch": 9.703033528472591, "grad_norm": 0.578125, "learning_rate": 5.22084349396823e-07, "loss": 0.2763, "step": 9115 }, { "epoch": 9.708355508249069, "grad_norm": 0.71484375, "learning_rate": 5.032878500355497e-07, "loss": 0.2665, "step": 9120 }, { "epoch": 9.713677488025546, "grad_norm": 0.62890625, "learning_rate": 4.848350923227884e-07, "loss": 0.2638, "step": 9125 }, { "epoch": 9.718999467802023, "grad_norm": 0.59765625, "learning_rate": 4.6672614000919933e-07, "loss": 0.2741, "step": 9130 }, { "epoch": 9.724321447578498, "grad_norm": 0.546875, "learning_rate": 4.4896105565765955e-07, "loss": 0.275, "step": 9135 }, { "epoch": 9.729643427354976, "grad_norm": 0.63671875, "learning_rate": 4.3153990064306314e-07, "loss": 0.2765, "step": 9140 }, { "epoch": 9.734965407131453, "grad_norm": 0.59765625, "learning_rate": 4.14462735152088e-07, "loss": 0.2732, "step": 9145 }, { "epoch": 9.74028738690793, "grad_norm": 0.6953125, "learning_rate": 3.977296181829959e-07, "loss": 0.2816, "step": 9150 }, { "epoch": 9.745609366684407, "grad_norm": 0.56640625, "learning_rate": 3.813406075454218e-07, "loss": 0.2702, "step": 9155 }, { "epoch": 9.750931346460883, "grad_norm": 0.546875, "learning_rate": 3.6529575986019583e-07, "loss": 0.2829, "step": 9160 }, { "epoch": 9.75625332623736, "grad_norm": 0.59375, "learning_rate": 3.4959513055911054e-07, "loss": 0.2788, "step": 9165 }, { "epoch": 9.761575306013837, "grad_norm": 0.7109375, "learning_rate": 3.342387738847763e-07, "loss": 0.2759, "step": 9170 }, { "epoch": 9.766897285790314, "grad_norm": 0.57421875, "learning_rate": 3.1922674289038835e-07, "loss": 0.2669, "step": 9175 }, { "epoch": 9.772219265566791, "grad_norm": 0.61328125, "learning_rate": 3.045590894395822e-07, "loss": 0.2753, "step": 9180 }, { "epoch": 9.777541245343269, "grad_norm": 0.61328125, "learning_rate": 2.90235864206212e-07, "loss": 0.2749, "step": 9185 }, { "epoch": 9.782863225119744, "grad_norm": 0.69140625, "learning_rate": 2.7625711667422783e-07, "loss": 0.27, "step": 9190 }, { "epoch": 9.788185204896221, "grad_norm": 0.57421875, "learning_rate": 2.626228951374765e-07, "loss": 0.2735, "step": 9195 }, { "epoch": 9.793507184672698, "grad_norm": 0.62109375, "learning_rate": 2.4933324669949023e-07, "loss": 0.2677, "step": 9200 }, { "epoch": 9.798829164449176, "grad_norm": 0.6484375, "learning_rate": 2.3638821727341996e-07, "loss": 0.2758, "step": 9205 }, { "epoch": 9.804151144225653, "grad_norm": 0.66796875, "learning_rate": 2.2378785158180259e-07, "loss": 0.2648, "step": 9210 }, { "epoch": 9.809473124002128, "grad_norm": 0.55859375, "learning_rate": 2.1153219315643847e-07, "loss": 0.2854, "step": 9215 }, { "epoch": 9.814795103778605, "grad_norm": 0.58984375, "learning_rate": 1.996212843382139e-07, "loss": 0.2769, "step": 9220 }, { "epoch": 9.820117083555083, "grad_norm": 0.5625, "learning_rate": 1.8805516627700137e-07, "loss": 0.2696, "step": 9225 }, { "epoch": 9.82543906333156, "grad_norm": 0.69140625, "learning_rate": 1.768338789314594e-07, "loss": 0.2748, "step": 9230 }, { "epoch": 9.830761043108037, "grad_norm": 0.59765625, "learning_rate": 1.6595746106895517e-07, "loss": 0.2809, "step": 9235 }, { "epoch": 9.836083022884512, "grad_norm": 0.5859375, "learning_rate": 1.5542595026535322e-07, "loss": 0.2594, "step": 9240 }, { "epoch": 9.84140500266099, "grad_norm": 0.625, "learning_rate": 1.4523938290498252e-07, "loss": 0.2792, "step": 9245 }, { "epoch": 9.846726982437467, "grad_norm": 0.609375, "learning_rate": 1.353977941804363e-07, "loss": 0.2692, "step": 9250 }, { "epoch": 9.852048962213944, "grad_norm": 0.625, "learning_rate": 1.2590121809247234e-07, "loss": 0.2715, "step": 9255 }, { "epoch": 9.857370941990421, "grad_norm": 0.65234375, "learning_rate": 1.167496874499019e-07, "loss": 0.276, "step": 9260 }, { "epoch": 9.862692921766897, "grad_norm": 0.55859375, "learning_rate": 1.0794323386945637e-07, "loss": 0.2683, "step": 9265 }, { "epoch": 9.868014901543374, "grad_norm": 0.578125, "learning_rate": 9.948188777572088e-08, "loss": 0.2782, "step": 9270 }, { "epoch": 9.873336881319851, "grad_norm": 0.578125, "learning_rate": 9.136567840100085e-08, "loss": 0.29, "step": 9275 }, { "epoch": 9.878658861096328, "grad_norm": 0.54296875, "learning_rate": 8.359463378517784e-08, "loss": 0.2626, "step": 9280 }, { "epoch": 9.883980840872805, "grad_norm": 0.65234375, "learning_rate": 7.616878077572054e-08, "loss": 0.2797, "step": 9285 }, { "epoch": 9.88930282064928, "grad_norm": 0.546875, "learning_rate": 6.908814502747384e-08, "loss": 0.281, "step": 9290 }, { "epoch": 9.894624800425758, "grad_norm": 0.640625, "learning_rate": 6.235275100265892e-08, "loss": 0.2723, "step": 9295 }, { "epoch": 9.899946780202235, "grad_norm": 0.73046875, "learning_rate": 5.596262197073987e-08, "loss": 0.2812, "step": 9300 }, { "epoch": 9.905268759978712, "grad_norm": 0.72265625, "learning_rate": 4.991778000835723e-08, "loss": 0.2761, "step": 9305 }, { "epoch": 9.91059073975519, "grad_norm": 0.609375, "learning_rate": 4.421824599926127e-08, "loss": 0.273, "step": 9310 }, { "epoch": 9.915912719531665, "grad_norm": 0.56640625, "learning_rate": 3.886403963422325e-08, "loss": 0.2708, "step": 9315 }, { "epoch": 9.921234699308142, "grad_norm": 0.62890625, "learning_rate": 3.385517941097982e-08, "loss": 0.2665, "step": 9320 }, { "epoch": 9.92655667908462, "grad_norm": 0.5859375, "learning_rate": 2.919168263414429e-08, "loss": 0.2754, "step": 9325 }, { "epoch": 9.931878658861097, "grad_norm": 0.58984375, "learning_rate": 2.487356541521768e-08, "loss": 0.2764, "step": 9330 }, { "epoch": 9.937200638637574, "grad_norm": 0.546875, "learning_rate": 2.0900842672422206e-08, "loss": 0.276, "step": 9335 }, { "epoch": 9.94252261841405, "grad_norm": 0.59765625, "learning_rate": 1.7273528130734574e-08, "loss": 0.2696, "step": 9340 }, { "epoch": 9.947844598190526, "grad_norm": 0.62109375, "learning_rate": 1.3991634321841585e-08, "loss": 0.2751, "step": 9345 }, { "epoch": 9.953166577967004, "grad_norm": 0.51953125, "learning_rate": 1.1055172584018003e-08, "loss": 0.2732, "step": 9350 }, { "epoch": 9.95848855774348, "grad_norm": 0.6953125, "learning_rate": 8.464153062182068e-09, "loss": 0.2879, "step": 9355 }, { "epoch": 9.963810537519958, "grad_norm": 0.578125, "learning_rate": 6.2185847077844695e-09, "loss": 0.2763, "step": 9360 }, { "epoch": 9.969132517296433, "grad_norm": 0.578125, "learning_rate": 4.318475278830558e-09, "loss": 0.2722, "step": 9365 }, { "epoch": 9.97445449707291, "grad_norm": 0.71875, "learning_rate": 2.7638313398248293e-09, "loss": 0.2729, "step": 9370 }, { "epoch": 9.979776476849388, "grad_norm": 0.62890625, "learning_rate": 1.5546582617598226e-09, "loss": 0.2756, "step": 9375 }, { "epoch": 9.985098456625865, "grad_norm": 0.5703125, "learning_rate": 6.909602220828148e-10, "loss": 0.2769, "step": 9380 }, { "epoch": 9.990420436402342, "grad_norm": 0.5859375, "learning_rate": 1.7274020471802488e-10, "loss": 0.2747, "step": 9385 }, { "epoch": 9.99574241617882, "grad_norm": 0.58984375, "learning_rate": 0.0, "loss": 0.2782, "step": 9390 }, { "epoch": 9.99574241617882, "eval_loss": 3.8291501998901367, "eval_runtime": 11.7635, "eval_samples_per_second": 6.291, "eval_steps_per_second": 0.595, "step": 9390 }, { "epoch": 9.99574241617882, "step": 9390, "total_flos": 1.0870943703560618e+19, "train_loss": 0.28536031966011366, "train_runtime": 35929.5875, "train_samples_per_second": 3.136, "train_steps_per_second": 0.261 } ], "logging_steps": 5, "max_steps": 9390, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "total_flos": 1.0870943703560618e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }