{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.999328768962277, "eval_steps": 500, "global_step": 27930, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.27031534910202026, "learning_rate": 4.9999996046262024e-05, "loss": 2.2625, "step": 5 }, { "epoch": 0.0, "grad_norm": 0.3651030957698822, "learning_rate": 4.999998418504935e-05, "loss": 2.1664, "step": 10 }, { "epoch": 0.0, "grad_norm": 0.4536094069480896, "learning_rate": 4.999996441636572e-05, "loss": 2.1371, "step": 15 }, { "epoch": 0.0, "grad_norm": 0.3164426386356354, "learning_rate": 4.999993674021739e-05, "loss": 2.2912, "step": 20 }, { "epoch": 0.0, "grad_norm": 0.9441952109336853, "learning_rate": 4.9999908905929944e-05, "loss": 2.5717, "step": 25 }, { "epoch": 0.01, "grad_norm": 0.45403921604156494, "learning_rate": 4.99998669963689e-05, "loss": 2.2767, "step": 30 }, { "epoch": 0.01, "grad_norm": 0.38720938563346863, "learning_rate": 4.9999817179373974e-05, "loss": 2.2304, "step": 35 }, { "epoch": 0.01, "grad_norm": 0.2565215826034546, "learning_rate": 4.9999759454960915e-05, "loss": 1.9965, "step": 40 }, { "epoch": 0.01, "grad_norm": 0.41144421696662903, "learning_rate": 4.9999693823147996e-05, "loss": 2.2625, "step": 45 }, { "epoch": 0.01, "grad_norm": 0.7766994833946228, "learning_rate": 4.999962028395596e-05, "loss": 2.253, "step": 50 }, { "epoch": 0.01, "grad_norm": 0.4188031852245331, "learning_rate": 4.9999538837408077e-05, "loss": 2.132, "step": 55 }, { "epoch": 0.01, "grad_norm": 0.6002323031425476, "learning_rate": 4.999944948353011e-05, "loss": 2.2579, "step": 60 }, { "epoch": 0.01, "grad_norm": 0.3897973597049713, "learning_rate": 4.999935222235031e-05, "loss": 2.0546, "step": 65 }, { "epoch": 0.01, "grad_norm": 3.2654597759246826, "learning_rate": 4.9999247053899453e-05, "loss": 2.1811, "step": 70 }, { "epoch": 0.01, "grad_norm": 0.29960134625434875, "learning_rate": 4.99991339782108e-05, "loss": 1.9428, "step": 75 }, { "epoch": 0.01, "grad_norm": 0.48218828439712524, "learning_rate": 4.999901299532012e-05, "loss": 2.2476, "step": 80 }, { "epoch": 0.02, "grad_norm": 1.6963359117507935, "learning_rate": 4.9998884105265666e-05, "loss": 1.9934, "step": 85 }, { "epoch": 0.02, "grad_norm": 2.0964760780334473, "learning_rate": 4.9998747308088226e-05, "loss": 2.1915, "step": 90 }, { "epoch": 0.02, "grad_norm": 0.5373502969741821, "learning_rate": 4.999860260383104e-05, "loss": 1.988, "step": 95 }, { "epoch": 0.02, "grad_norm": 0.4756152331829071, "learning_rate": 4.999844999253991e-05, "loss": 2.0278, "step": 100 }, { "epoch": 0.02, "grad_norm": 0.35103705525398254, "learning_rate": 4.9998289474263094e-05, "loss": 2.1245, "step": 105 }, { "epoch": 0.02, "grad_norm": 0.6553156971931458, "learning_rate": 4.999812104905136e-05, "loss": 2.0964, "step": 110 }, { "epoch": 0.02, "grad_norm": 0.5029746890068054, "learning_rate": 4.9997944716957985e-05, "loss": 2.356, "step": 115 }, { "epoch": 0.02, "grad_norm": 0.3125253915786743, "learning_rate": 4.999776047803873e-05, "loss": 2.1678, "step": 120 }, { "epoch": 0.02, "grad_norm": 0.8542938232421875, "learning_rate": 4.99975683323519e-05, "loss": 2.1263, "step": 125 }, { "epoch": 0.02, "grad_norm": 0.3991641700267792, "learning_rate": 4.9997368279958236e-05, "loss": 2.0609, "step": 130 }, { "epoch": 0.02, "grad_norm": 0.4593062698841095, "learning_rate": 4.999716032092103e-05, "loss": 2.0401, "step": 135 }, { "epoch": 0.03, "grad_norm": 0.3167307674884796, "learning_rate": 4.999694445530607e-05, "loss": 1.9401, "step": 140 }, { "epoch": 0.03, "grad_norm": 0.7320476174354553, "learning_rate": 4.9996720683181617e-05, "loss": 2.0134, "step": 145 }, { "epoch": 0.03, "grad_norm": 0.5831002593040466, "learning_rate": 4.9996489004618455e-05, "loss": 1.922, "step": 150 }, { "epoch": 0.03, "grad_norm": 1.0691373348236084, "learning_rate": 4.999624941968986e-05, "loss": 2.0451, "step": 155 }, { "epoch": 0.03, "grad_norm": 0.5325329899787903, "learning_rate": 4.999600192847162e-05, "loss": 2.0023, "step": 160 }, { "epoch": 0.03, "grad_norm": 1.7082959413528442, "learning_rate": 4.9995746531042006e-05, "loss": 1.9057, "step": 165 }, { "epoch": 0.03, "grad_norm": 1.2313815355300903, "learning_rate": 4.9995483227481824e-05, "loss": 1.8793, "step": 170 }, { "epoch": 0.03, "grad_norm": 0.24114160239696503, "learning_rate": 4.9995212017874325e-05, "loss": 2.1221, "step": 175 }, { "epoch": 0.03, "grad_norm": 0.4924792945384979, "learning_rate": 4.9994932902305315e-05, "loss": 2.0354, "step": 180 }, { "epoch": 0.03, "grad_norm": 3.932770013809204, "learning_rate": 4.9994645880863064e-05, "loss": 2.0849, "step": 185 }, { "epoch": 0.03, "grad_norm": 0.9561445713043213, "learning_rate": 4.9994350953638366e-05, "loss": 1.9205, "step": 190 }, { "epoch": 0.03, "grad_norm": 0.7701564431190491, "learning_rate": 4.999404812072451e-05, "loss": 2.2077, "step": 195 }, { "epoch": 0.04, "grad_norm": 0.6854715943336487, "learning_rate": 4.9993737382217265e-05, "loss": 2.2886, "step": 200 }, { "epoch": 0.04, "grad_norm": 0.7976120710372925, "learning_rate": 4.999341873821493e-05, "loss": 2.3515, "step": 205 }, { "epoch": 0.04, "grad_norm": 1.1986960172653198, "learning_rate": 4.999309218881829e-05, "loss": 1.7781, "step": 210 }, { "epoch": 0.04, "grad_norm": 0.5879755020141602, "learning_rate": 4.999275773413063e-05, "loss": 2.2107, "step": 215 }, { "epoch": 0.04, "grad_norm": 1.0343559980392456, "learning_rate": 4.9992415374257734e-05, "loss": 1.8219, "step": 220 }, { "epoch": 0.04, "grad_norm": 0.6155093312263489, "learning_rate": 4.9992065109307904e-05, "loss": 2.3041, "step": 225 }, { "epoch": 0.04, "grad_norm": 1.1242767572402954, "learning_rate": 4.9991706939391916e-05, "loss": 2.2295, "step": 230 }, { "epoch": 0.04, "grad_norm": 0.5417547225952148, "learning_rate": 4.9991340864623057e-05, "loss": 1.7807, "step": 235 }, { "epoch": 0.04, "grad_norm": 0.36535757780075073, "learning_rate": 4.999096688511712e-05, "loss": 1.8687, "step": 240 }, { "epoch": 0.04, "grad_norm": 0.8204144835472107, "learning_rate": 4.99905850009924e-05, "loss": 2.1915, "step": 245 }, { "epoch": 0.04, "grad_norm": 0.5473398566246033, "learning_rate": 4.999019521236969e-05, "loss": 1.9105, "step": 250 }, { "epoch": 0.05, "grad_norm": 0.5916422605514526, "learning_rate": 4.998979751937226e-05, "loss": 1.8488, "step": 255 }, { "epoch": 0.05, "grad_norm": 0.40376555919647217, "learning_rate": 4.998939192212591e-05, "loss": 1.9348, "step": 260 }, { "epoch": 0.05, "grad_norm": 0.4245920479297638, "learning_rate": 4.998897842075894e-05, "loss": 1.8736, "step": 265 }, { "epoch": 0.05, "grad_norm": 3.863102436065674, "learning_rate": 4.998855701540213e-05, "loss": 1.8208, "step": 270 }, { "epoch": 0.05, "grad_norm": 0.9868331551551819, "learning_rate": 4.998812770618877e-05, "loss": 2.0812, "step": 275 }, { "epoch": 0.05, "grad_norm": 0.3408450484275818, "learning_rate": 4.998769049325465e-05, "loss": 2.2077, "step": 280 }, { "epoch": 0.05, "grad_norm": 0.3527214229106903, "learning_rate": 4.998724537673807e-05, "loss": 2.1004, "step": 285 }, { "epoch": 0.05, "grad_norm": 1.1077073812484741, "learning_rate": 4.9986792356779803e-05, "loss": 2.3449, "step": 290 }, { "epoch": 0.05, "grad_norm": 0.5574270486831665, "learning_rate": 4.9986331433523156e-05, "loss": 1.8885, "step": 295 }, { "epoch": 0.05, "grad_norm": 0.4326357841491699, "learning_rate": 4.99858626071139e-05, "loss": 1.9964, "step": 300 }, { "epoch": 0.05, "grad_norm": 0.9008758068084717, "learning_rate": 4.998538587770034e-05, "loss": 1.9349, "step": 305 }, { "epoch": 0.06, "grad_norm": 0.8824496865272522, "learning_rate": 4.998490124543327e-05, "loss": 1.9985, "step": 310 }, { "epoch": 0.06, "grad_norm": 0.6680270433425903, "learning_rate": 4.998440871046596e-05, "loss": 1.8583, "step": 315 }, { "epoch": 0.06, "grad_norm": 0.392929345369339, "learning_rate": 4.99839082729542e-05, "loss": 1.926, "step": 320 }, { "epoch": 0.06, "grad_norm": 0.6838160753250122, "learning_rate": 4.998339993305629e-05, "loss": 2.0671, "step": 325 }, { "epoch": 0.06, "grad_norm": 0.6205262541770935, "learning_rate": 4.9982883690933014e-05, "loss": 1.8439, "step": 330 }, { "epoch": 0.06, "grad_norm": 0.5349203944206238, "learning_rate": 4.998235954674766e-05, "loss": 2.1677, "step": 335 }, { "epoch": 0.06, "grad_norm": 0.5297353267669678, "learning_rate": 4.9981827500666e-05, "loss": 2.1915, "step": 340 }, { "epoch": 0.06, "grad_norm": 0.5714792013168335, "learning_rate": 4.9981287552856344e-05, "loss": 1.8878, "step": 345 }, { "epoch": 0.06, "grad_norm": 0.546989917755127, "learning_rate": 4.998073970348945e-05, "loss": 2.1013, "step": 350 }, { "epoch": 0.06, "grad_norm": 0.7602975964546204, "learning_rate": 4.9980183952738634e-05, "loss": 2.2048, "step": 355 }, { "epoch": 0.06, "grad_norm": 1.0656648874282837, "learning_rate": 4.997962030077965e-05, "loss": 2.1493, "step": 360 }, { "epoch": 0.07, "grad_norm": 0.6455523371696472, "learning_rate": 4.997904874779079e-05, "loss": 1.6774, "step": 365 }, { "epoch": 0.07, "grad_norm": 0.34377214312553406, "learning_rate": 4.997846929395285e-05, "loss": 1.8235, "step": 370 }, { "epoch": 0.07, "grad_norm": 0.6402696967124939, "learning_rate": 4.997788193944909e-05, "loss": 1.9533, "step": 375 }, { "epoch": 0.07, "grad_norm": 0.5523343086242676, "learning_rate": 4.99772866844653e-05, "loss": 2.0183, "step": 380 }, { "epoch": 0.07, "grad_norm": 0.6676031947135925, "learning_rate": 4.997668352918975e-05, "loss": 1.8157, "step": 385 }, { "epoch": 0.07, "grad_norm": 0.9743964672088623, "learning_rate": 4.997607247381323e-05, "loss": 2.1711, "step": 390 }, { "epoch": 0.07, "grad_norm": 0.7307085990905762, "learning_rate": 4.9975453518529014e-05, "loss": 1.8769, "step": 395 }, { "epoch": 0.07, "grad_norm": 1.004335641860962, "learning_rate": 4.997482666353287e-05, "loss": 1.6377, "step": 400 }, { "epoch": 0.07, "grad_norm": 0.7974880337715149, "learning_rate": 4.997419190902307e-05, "loss": 1.8962, "step": 405 }, { "epoch": 0.07, "grad_norm": 0.4197213649749756, "learning_rate": 4.99735492552004e-05, "loss": 2.0266, "step": 410 }, { "epoch": 0.07, "grad_norm": 0.5465844869613647, "learning_rate": 4.997289870226812e-05, "loss": 1.9882, "step": 415 }, { "epoch": 0.08, "grad_norm": 0.7714999914169312, "learning_rate": 4.9972240250432e-05, "loss": 1.7723, "step": 420 }, { "epoch": 0.08, "grad_norm": 0.4609231650829315, "learning_rate": 4.9971573899900306e-05, "loss": 2.1264, "step": 425 }, { "epoch": 0.08, "grad_norm": 2.4901442527770996, "learning_rate": 4.997089965088381e-05, "loss": 1.8215, "step": 430 }, { "epoch": 0.08, "grad_norm": 1.3669675588607788, "learning_rate": 4.997021750359577e-05, "loss": 1.9637, "step": 435 }, { "epoch": 0.08, "grad_norm": 0.6169967651367188, "learning_rate": 4.9969527458251964e-05, "loss": 2.2501, "step": 440 }, { "epoch": 0.08, "grad_norm": 0.862839937210083, "learning_rate": 4.9968829515070626e-05, "loss": 2.0921, "step": 445 }, { "epoch": 0.08, "grad_norm": 0.5367420315742493, "learning_rate": 4.996812367427254e-05, "loss": 1.9298, "step": 450 }, { "epoch": 0.08, "grad_norm": 0.4946227967739105, "learning_rate": 4.996740993608094e-05, "loss": 2.0265, "step": 455 }, { "epoch": 0.08, "grad_norm": 0.68914395570755, "learning_rate": 4.9966688300721603e-05, "loss": 1.9182, "step": 460 }, { "epoch": 0.08, "grad_norm": 0.44019490480422974, "learning_rate": 4.9965958768422775e-05, "loss": 2.1147, "step": 465 }, { "epoch": 0.08, "grad_norm": 0.7905745506286621, "learning_rate": 4.9965221339415194e-05, "loss": 1.909, "step": 470 }, { "epoch": 0.09, "grad_norm": 0.5773944854736328, "learning_rate": 4.996447601393211e-05, "loss": 1.8616, "step": 475 }, { "epoch": 0.09, "grad_norm": 2.7853548526763916, "learning_rate": 4.9963722792209286e-05, "loss": 2.2229, "step": 480 }, { "epoch": 0.09, "grad_norm": 0.6674456596374512, "learning_rate": 4.996296167448495e-05, "loss": 1.9064, "step": 485 }, { "epoch": 0.09, "grad_norm": 2.360712766647339, "learning_rate": 4.9962192660999866e-05, "loss": 1.932, "step": 490 }, { "epoch": 0.09, "grad_norm": 0.3946886360645294, "learning_rate": 4.996141575199723e-05, "loss": 2.112, "step": 495 }, { "epoch": 0.09, "grad_norm": 1.100876808166504, "learning_rate": 4.996063094772281e-05, "loss": 2.0544, "step": 500 }, { "epoch": 0.09, "grad_norm": 1.3931337594985962, "learning_rate": 4.9959838248424836e-05, "loss": 1.7863, "step": 505 }, { "epoch": 0.09, "grad_norm": 0.4513574540615082, "learning_rate": 4.995903765435402e-05, "loss": 2.1613, "step": 510 }, { "epoch": 0.09, "grad_norm": 0.6865633130073547, "learning_rate": 4.9958229165763614e-05, "loss": 2.0138, "step": 515 }, { "epoch": 0.09, "grad_norm": 0.6642078757286072, "learning_rate": 4.9957412782909324e-05, "loss": 1.8269, "step": 520 }, { "epoch": 0.09, "grad_norm": 1.0124841928482056, "learning_rate": 4.9956588506049374e-05, "loss": 2.0759, "step": 525 }, { "epoch": 0.09, "grad_norm": 0.8257017731666565, "learning_rate": 4.9955756335444495e-05, "loss": 1.7972, "step": 530 }, { "epoch": 0.1, "grad_norm": 0.7027864456176758, "learning_rate": 4.9954916271357876e-05, "loss": 2.1897, "step": 535 }, { "epoch": 0.1, "grad_norm": 0.6689014434814453, "learning_rate": 4.9954068314055255e-05, "loss": 2.0164, "step": 540 }, { "epoch": 0.1, "grad_norm": 0.6984819769859314, "learning_rate": 4.9953212463804824e-05, "loss": 1.8525, "step": 545 }, { "epoch": 0.1, "grad_norm": 0.8252110481262207, "learning_rate": 4.9952348720877294e-05, "loss": 1.906, "step": 550 }, { "epoch": 0.1, "grad_norm": 0.7641300559043884, "learning_rate": 4.995147708554587e-05, "loss": 2.179, "step": 555 }, { "epoch": 0.1, "grad_norm": 0.5277909636497498, "learning_rate": 4.9950597558086246e-05, "loss": 2.078, "step": 560 }, { "epoch": 0.1, "grad_norm": 1.3785237073898315, "learning_rate": 4.99497101387766e-05, "loss": 2.1335, "step": 565 }, { "epoch": 0.1, "grad_norm": 0.9279726147651672, "learning_rate": 4.994881482789764e-05, "loss": 1.823, "step": 570 }, { "epoch": 0.1, "grad_norm": 0.6728343963623047, "learning_rate": 4.994791162573256e-05, "loss": 2.1131, "step": 575 }, { "epoch": 0.1, "grad_norm": 0.7369157075881958, "learning_rate": 4.9947000532567014e-05, "loss": 1.7872, "step": 580 }, { "epoch": 0.1, "grad_norm": 1.544488549232483, "learning_rate": 4.994608154868921e-05, "loss": 2.2275, "step": 585 }, { "epoch": 0.11, "grad_norm": 1.0794744491577148, "learning_rate": 4.9945154674389796e-05, "loss": 2.0575, "step": 590 }, { "epoch": 0.11, "grad_norm": 0.5891600847244263, "learning_rate": 4.994421990996195e-05, "loss": 2.0061, "step": 595 }, { "epoch": 0.11, "grad_norm": 0.5102590322494507, "learning_rate": 4.994327725570135e-05, "loss": 1.8905, "step": 600 }, { "epoch": 0.11, "grad_norm": 0.9400714635848999, "learning_rate": 4.994232671190614e-05, "loss": 1.9824, "step": 605 }, { "epoch": 0.11, "grad_norm": 0.7190917730331421, "learning_rate": 4.994136827887699e-05, "loss": 1.9437, "step": 610 }, { "epoch": 0.11, "grad_norm": 0.5271459221839905, "learning_rate": 4.994040195691703e-05, "loss": 1.9421, "step": 615 }, { "epoch": 0.11, "grad_norm": 1.935598611831665, "learning_rate": 4.9939427746331937e-05, "loss": 2.1908, "step": 620 }, { "epoch": 0.11, "grad_norm": 14.462722778320312, "learning_rate": 4.993844564742982e-05, "loss": 2.0414, "step": 625 }, { "epoch": 0.11, "grad_norm": 2.596210479736328, "learning_rate": 4.9937455660521345e-05, "loss": 1.966, "step": 630 }, { "epoch": 0.11, "grad_norm": 0.547702431678772, "learning_rate": 4.993645778591963e-05, "loss": 1.8967, "step": 635 }, { "epoch": 0.11, "grad_norm": 2.006232976913452, "learning_rate": 4.99354520239403e-05, "loss": 1.8511, "step": 640 }, { "epoch": 0.12, "grad_norm": 1.9103983640670776, "learning_rate": 4.993443837490148e-05, "loss": 1.7829, "step": 645 }, { "epoch": 0.12, "grad_norm": 0.8740873336791992, "learning_rate": 4.993341683912378e-05, "loss": 2.0906, "step": 650 }, { "epoch": 0.12, "grad_norm": 0.9075211882591248, "learning_rate": 4.9932387416930326e-05, "loss": 2.1868, "step": 655 }, { "epoch": 0.12, "grad_norm": 0.5851132273674011, "learning_rate": 4.993135010864671e-05, "loss": 1.8202, "step": 660 }, { "epoch": 0.12, "grad_norm": 0.7303404808044434, "learning_rate": 4.993030491460104e-05, "loss": 1.8032, "step": 665 }, { "epoch": 0.12, "grad_norm": 0.5029420256614685, "learning_rate": 4.9929251835123904e-05, "loss": 1.8013, "step": 670 }, { "epoch": 0.12, "grad_norm": 0.8666778802871704, "learning_rate": 4.9928190870548384e-05, "loss": 2.1201, "step": 675 }, { "epoch": 0.12, "grad_norm": 1.3739973306655884, "learning_rate": 4.9927122021210076e-05, "loss": 1.8149, "step": 680 }, { "epoch": 0.12, "grad_norm": 0.6640814542770386, "learning_rate": 4.992604528744705e-05, "loss": 1.9143, "step": 685 }, { "epoch": 0.12, "grad_norm": 0.9875686764717102, "learning_rate": 4.992496066959987e-05, "loss": 1.7527, "step": 690 }, { "epoch": 0.12, "grad_norm": 0.5057778358459473, "learning_rate": 4.992386816801161e-05, "loss": 1.9771, "step": 695 }, { "epoch": 0.13, "grad_norm": 0.7259483337402344, "learning_rate": 4.992276778302782e-05, "loss": 1.891, "step": 700 }, { "epoch": 0.13, "grad_norm": 0.7736199498176575, "learning_rate": 4.9921659514996555e-05, "loss": 1.8235, "step": 705 }, { "epoch": 0.13, "grad_norm": 0.6016276478767395, "learning_rate": 4.992054336426836e-05, "loss": 1.868, "step": 710 }, { "epoch": 0.13, "grad_norm": 0.6753050088882446, "learning_rate": 4.991941933119626e-05, "loss": 1.8746, "step": 715 }, { "epoch": 0.13, "grad_norm": 0.6114783883094788, "learning_rate": 4.99182874161358e-05, "loss": 1.9586, "step": 720 }, { "epoch": 0.13, "grad_norm": 1.7114779949188232, "learning_rate": 4.9917147619445005e-05, "loss": 2.3431, "step": 725 }, { "epoch": 0.13, "grad_norm": 0.6027891635894775, "learning_rate": 4.991599994148438e-05, "loss": 2.1642, "step": 730 }, { "epoch": 0.13, "grad_norm": 0.7011446952819824, "learning_rate": 4.991484438261693e-05, "loss": 1.9181, "step": 735 }, { "epoch": 0.13, "grad_norm": 0.5473892092704773, "learning_rate": 4.991368094320818e-05, "loss": 2.1816, "step": 740 }, { "epoch": 0.13, "grad_norm": 0.4778834879398346, "learning_rate": 4.991250962362611e-05, "loss": 1.7557, "step": 745 }, { "epoch": 0.13, "grad_norm": 0.6992874145507812, "learning_rate": 4.99113304242412e-05, "loss": 2.0037, "step": 750 }, { "epoch": 0.14, "grad_norm": 1.2139089107513428, "learning_rate": 4.9910143345426446e-05, "loss": 1.9781, "step": 755 }, { "epoch": 0.14, "grad_norm": 0.7376035451889038, "learning_rate": 4.9908948387557306e-05, "loss": 1.9145, "step": 760 }, { "epoch": 0.14, "grad_norm": 0.9813640117645264, "learning_rate": 4.9907745551011764e-05, "loss": 1.9012, "step": 765 }, { "epoch": 0.14, "grad_norm": 0.6064789295196533, "learning_rate": 4.990653483617025e-05, "loss": 2.0977, "step": 770 }, { "epoch": 0.14, "grad_norm": 0.8322803378105164, "learning_rate": 4.990531624341573e-05, "loss": 2.0924, "step": 775 }, { "epoch": 0.14, "grad_norm": 0.39520594477653503, "learning_rate": 4.990408977313363e-05, "loss": 2.0109, "step": 780 }, { "epoch": 0.14, "grad_norm": 0.7123391628265381, "learning_rate": 4.9902855425711905e-05, "loss": 2.1073, "step": 785 }, { "epoch": 0.14, "grad_norm": 0.617445170879364, "learning_rate": 4.9901613201540945e-05, "loss": 2.081, "step": 790 }, { "epoch": 0.14, "grad_norm": 0.2326832264661789, "learning_rate": 4.990036310101369e-05, "loss": 2.0825, "step": 795 }, { "epoch": 0.14, "grad_norm": 0.828574001789093, "learning_rate": 4.989910512452554e-05, "loss": 1.9948, "step": 800 }, { "epoch": 0.14, "grad_norm": 0.5718833804130554, "learning_rate": 4.989783927247439e-05, "loss": 1.9836, "step": 805 }, { "epoch": 0.14, "grad_norm": 0.5843701958656311, "learning_rate": 4.989656554526062e-05, "loss": 1.8482, "step": 810 }, { "epoch": 0.15, "grad_norm": 0.36369892954826355, "learning_rate": 4.989528394328712e-05, "loss": 1.9934, "step": 815 }, { "epoch": 0.15, "grad_norm": 0.5329487323760986, "learning_rate": 4.9893994466959246e-05, "loss": 1.7943, "step": 820 }, { "epoch": 0.15, "grad_norm": 0.9195080399513245, "learning_rate": 4.989269711668487e-05, "loss": 1.8812, "step": 825 }, { "epoch": 0.15, "grad_norm": 0.639167845249176, "learning_rate": 4.9891391892874354e-05, "loss": 2.0915, "step": 830 }, { "epoch": 0.15, "grad_norm": 0.9392881393432617, "learning_rate": 4.989007879594051e-05, "loss": 1.9084, "step": 835 }, { "epoch": 0.15, "grad_norm": 3.4047281742095947, "learning_rate": 4.988875782629869e-05, "loss": 2.1699, "step": 840 }, { "epoch": 0.15, "grad_norm": 1.1853729486465454, "learning_rate": 4.9887428984366705e-05, "loss": 2.3892, "step": 845 }, { "epoch": 0.15, "grad_norm": 4.790334701538086, "learning_rate": 4.9886092270564876e-05, "loss": 1.9039, "step": 850 }, { "epoch": 0.15, "grad_norm": 0.5937879681587219, "learning_rate": 4.9884747685315996e-05, "loss": 2.0544, "step": 855 }, { "epoch": 0.15, "grad_norm": 0.8132315278053284, "learning_rate": 4.988339522904536e-05, "loss": 1.9541, "step": 860 }, { "epoch": 0.15, "grad_norm": 0.7449702620506287, "learning_rate": 4.988203490218075e-05, "loss": 1.9042, "step": 865 }, { "epoch": 0.16, "grad_norm": 0.5091820359230042, "learning_rate": 4.9880666705152434e-05, "loss": 1.9421, "step": 870 }, { "epoch": 0.16, "grad_norm": 0.45351913571357727, "learning_rate": 4.987929063839317e-05, "loss": 1.8925, "step": 875 }, { "epoch": 0.16, "grad_norm": 0.5173969268798828, "learning_rate": 4.987790670233821e-05, "loss": 1.9209, "step": 880 }, { "epoch": 0.16, "grad_norm": 0.5571516156196594, "learning_rate": 4.987651489742528e-05, "loss": 1.9871, "step": 885 }, { "epoch": 0.16, "grad_norm": 0.9875330328941345, "learning_rate": 4.987511522409463e-05, "loss": 1.7102, "step": 890 }, { "epoch": 0.16, "grad_norm": 0.513702392578125, "learning_rate": 4.987370768278895e-05, "loss": 2.0374, "step": 895 }, { "epoch": 0.16, "grad_norm": 0.7539028525352478, "learning_rate": 4.987229227395346e-05, "loss": 1.9778, "step": 900 }, { "epoch": 0.16, "grad_norm": 0.5421642661094666, "learning_rate": 4.9870868998035845e-05, "loss": 1.7694, "step": 905 }, { "epoch": 0.16, "grad_norm": 0.8099106550216675, "learning_rate": 4.986943785548629e-05, "loss": 1.7683, "step": 910 }, { "epoch": 0.16, "grad_norm": 0.797715425491333, "learning_rate": 4.986799884675747e-05, "loss": 1.8887, "step": 915 }, { "epoch": 0.16, "grad_norm": 0.6067102551460266, "learning_rate": 4.9866551972304526e-05, "loss": 1.9329, "step": 920 }, { "epoch": 0.17, "grad_norm": 0.49789664149284363, "learning_rate": 4.986509723258511e-05, "loss": 2.0683, "step": 925 }, { "epoch": 0.17, "grad_norm": 0.9681997299194336, "learning_rate": 4.9863634628059366e-05, "loss": 1.9902, "step": 930 }, { "epoch": 0.17, "grad_norm": 0.5609699487686157, "learning_rate": 4.98621641591899e-05, "loss": 1.8425, "step": 935 }, { "epoch": 0.17, "grad_norm": 0.5346946120262146, "learning_rate": 4.986068582644182e-05, "loss": 2.0863, "step": 940 }, { "epoch": 0.17, "grad_norm": 0.893808126449585, "learning_rate": 4.985919963028274e-05, "loss": 2.0174, "step": 945 }, { "epoch": 0.17, "grad_norm": 0.4146747887134552, "learning_rate": 4.985770557118272e-05, "loss": 1.9704, "step": 950 }, { "epoch": 0.17, "grad_norm": 1.030465006828308, "learning_rate": 4.985620364961433e-05, "loss": 2.1478, "step": 955 }, { "epoch": 0.17, "grad_norm": 0.6539037823677063, "learning_rate": 4.985469386605265e-05, "loss": 2.1638, "step": 960 }, { "epoch": 0.17, "grad_norm": 0.698945164680481, "learning_rate": 4.9853176220975195e-05, "loss": 1.93, "step": 965 }, { "epoch": 0.17, "grad_norm": 0.6615301370620728, "learning_rate": 4.9851650714862006e-05, "loss": 2.0131, "step": 970 }, { "epoch": 0.17, "grad_norm": 0.6794114112854004, "learning_rate": 4.9850117348195606e-05, "loss": 1.9938, "step": 975 }, { "epoch": 0.18, "grad_norm": 0.4842730164527893, "learning_rate": 4.984857612146099e-05, "loss": 1.9502, "step": 980 }, { "epoch": 0.18, "grad_norm": 1.0093274116516113, "learning_rate": 4.984702703514565e-05, "loss": 1.6196, "step": 985 }, { "epoch": 0.18, "grad_norm": 0.6907001733779907, "learning_rate": 4.984547008973956e-05, "loss": 1.951, "step": 990 }, { "epoch": 0.18, "grad_norm": 0.6231794357299805, "learning_rate": 4.984390528573517e-05, "loss": 2.0397, "step": 995 }, { "epoch": 0.18, "grad_norm": 0.7842473387718201, "learning_rate": 4.984233262362745e-05, "loss": 2.0751, "step": 1000 }, { "epoch": 0.18, "grad_norm": 1.2024447917938232, "learning_rate": 4.98407521039138e-05, "loss": 1.5978, "step": 1005 }, { "epoch": 0.18, "grad_norm": 0.8643293976783752, "learning_rate": 4.983916372709416e-05, "loss": 2.2393, "step": 1010 }, { "epoch": 0.18, "grad_norm": 0.6391337513923645, "learning_rate": 4.9837567493670936e-05, "loss": 1.9106, "step": 1015 }, { "epoch": 0.18, "grad_norm": 0.7475038766860962, "learning_rate": 4.983596340414899e-05, "loss": 2.0462, "step": 1020 }, { "epoch": 0.18, "grad_norm": 1.4926930665969849, "learning_rate": 4.9834351459035714e-05, "loss": 1.7617, "step": 1025 }, { "epoch": 0.18, "grad_norm": 0.6800340414047241, "learning_rate": 4.9832731658840956e-05, "loss": 1.7933, "step": 1030 }, { "epoch": 0.19, "grad_norm": 0.5612903237342834, "learning_rate": 4.983110400407707e-05, "loss": 1.9801, "step": 1035 }, { "epoch": 0.19, "grad_norm": 0.5487276315689087, "learning_rate": 4.982946849525886e-05, "loss": 1.8137, "step": 1040 }, { "epoch": 0.19, "grad_norm": 1.1689997911453247, "learning_rate": 4.982782513290365e-05, "loss": 2.0348, "step": 1045 }, { "epoch": 0.19, "grad_norm": 0.5050978660583496, "learning_rate": 4.9826173917531235e-05, "loss": 2.1974, "step": 1050 }, { "epoch": 0.19, "grad_norm": 0.6541104912757874, "learning_rate": 4.982451484966389e-05, "loss": 1.824, "step": 1055 }, { "epoch": 0.19, "grad_norm": 0.4202798306941986, "learning_rate": 4.9822847929826374e-05, "loss": 1.9036, "step": 1060 }, { "epoch": 0.19, "grad_norm": 0.6690078377723694, "learning_rate": 4.9821173158545936e-05, "loss": 1.8924, "step": 1065 }, { "epoch": 0.19, "grad_norm": 0.41221898794174194, "learning_rate": 4.98194905363523e-05, "loss": 2.2559, "step": 1070 }, { "epoch": 0.19, "grad_norm": 1.146337628364563, "learning_rate": 4.981780006377769e-05, "loss": 1.7768, "step": 1075 }, { "epoch": 0.19, "grad_norm": 0.376248836517334, "learning_rate": 4.981610174135678e-05, "loss": 1.8807, "step": 1080 }, { "epoch": 0.19, "grad_norm": 0.8797820806503296, "learning_rate": 4.981439556962676e-05, "loss": 1.5606, "step": 1085 }, { "epoch": 0.2, "grad_norm": 0.9308553338050842, "learning_rate": 4.981268154912729e-05, "loss": 1.7338, "step": 1090 }, { "epoch": 0.2, "grad_norm": 0.8552083969116211, "learning_rate": 4.981095968040053e-05, "loss": 1.7534, "step": 1095 }, { "epoch": 0.2, "grad_norm": 0.9000282287597656, "learning_rate": 4.9809229963991064e-05, "loss": 1.7684, "step": 1100 }, { "epoch": 0.2, "grad_norm": 1.6335359811782837, "learning_rate": 4.980749240044604e-05, "loss": 1.9627, "step": 1105 }, { "epoch": 0.2, "grad_norm": 0.8010488152503967, "learning_rate": 4.9805746990315024e-05, "loss": 2.0931, "step": 1110 }, { "epoch": 0.2, "grad_norm": 0.5501634478569031, "learning_rate": 4.980399373415009e-05, "loss": 2.1531, "step": 1115 }, { "epoch": 0.2, "grad_norm": 0.876747190952301, "learning_rate": 4.980223263250581e-05, "loss": 1.755, "step": 1120 }, { "epoch": 0.2, "grad_norm": 0.8235836625099182, "learning_rate": 4.98004636859392e-05, "loss": 2.2103, "step": 1125 }, { "epoch": 0.2, "grad_norm": 0.3459777534008026, "learning_rate": 4.979868689500978e-05, "loss": 1.8597, "step": 1130 }, { "epoch": 0.2, "grad_norm": 0.6202408075332642, "learning_rate": 4.9796902260279546e-05, "loss": 1.8657, "step": 1135 }, { "epoch": 0.2, "grad_norm": 0.912277340888977, "learning_rate": 4.9795109782312974e-05, "loss": 1.7297, "step": 1140 }, { "epoch": 0.2, "grad_norm": 0.5778829455375671, "learning_rate": 4.979330946167704e-05, "loss": 1.939, "step": 1145 }, { "epoch": 0.21, "grad_norm": 0.8134537935256958, "learning_rate": 4.9791501298941165e-05, "loss": 1.9645, "step": 1150 }, { "epoch": 0.21, "grad_norm": 0.8636032938957214, "learning_rate": 4.978968529467728e-05, "loss": 1.6756, "step": 1155 }, { "epoch": 0.21, "grad_norm": 0.87062668800354, "learning_rate": 4.978786144945977e-05, "loss": 1.7352, "step": 1160 }, { "epoch": 0.21, "grad_norm": 0.9337445497512817, "learning_rate": 4.978602976386554e-05, "loss": 1.6599, "step": 1165 }, { "epoch": 0.21, "grad_norm": 0.8104198575019836, "learning_rate": 4.978419023847393e-05, "loss": 1.7557, "step": 1170 }, { "epoch": 0.21, "grad_norm": 0.3826814591884613, "learning_rate": 4.978234287386678e-05, "loss": 2.0881, "step": 1175 }, { "epoch": 0.21, "grad_norm": 0.47126176953315735, "learning_rate": 4.978048767062843e-05, "loss": 2.2003, "step": 1180 }, { "epoch": 0.21, "grad_norm": 0.4971998333930969, "learning_rate": 4.977862462934566e-05, "loss": 1.8297, "step": 1185 }, { "epoch": 0.21, "grad_norm": 0.5127992033958435, "learning_rate": 4.977675375060775e-05, "loss": 1.8892, "step": 1190 }, { "epoch": 0.21, "grad_norm": 0.7723473310470581, "learning_rate": 4.9774875035006464e-05, "loss": 1.9711, "step": 1195 }, { "epoch": 0.21, "grad_norm": 0.5480608344078064, "learning_rate": 4.977298848313604e-05, "loss": 1.7975, "step": 1200 }, { "epoch": 0.22, "grad_norm": 0.4244174659252167, "learning_rate": 4.9771094095593176e-05, "loss": 2.1068, "step": 1205 }, { "epoch": 0.22, "grad_norm": 0.5436844825744629, "learning_rate": 4.9769191872977085e-05, "loss": 2.09, "step": 1210 }, { "epoch": 0.22, "grad_norm": 0.4076632261276245, "learning_rate": 4.9767281815889425e-05, "loss": 2.038, "step": 1215 }, { "epoch": 0.22, "grad_norm": 0.5819802284240723, "learning_rate": 4.976536392493435e-05, "loss": 1.8444, "step": 1220 }, { "epoch": 0.22, "grad_norm": 0.7633818984031677, "learning_rate": 4.976343820071849e-05, "loss": 2.0922, "step": 1225 }, { "epoch": 0.22, "grad_norm": 0.6349738836288452, "learning_rate": 4.9761504643850945e-05, "loss": 1.9147, "step": 1230 }, { "epoch": 0.22, "grad_norm": 0.5177556276321411, "learning_rate": 4.9759563254943306e-05, "loss": 1.7667, "step": 1235 }, { "epoch": 0.22, "grad_norm": 1.4259908199310303, "learning_rate": 4.9757614034609615e-05, "loss": 1.9067, "step": 1240 }, { "epoch": 0.22, "grad_norm": 0.8151448369026184, "learning_rate": 4.975565698346642e-05, "loss": 2.0555, "step": 1245 }, { "epoch": 0.22, "grad_norm": 0.9140625, "learning_rate": 4.9753692102132735e-05, "loss": 1.9399, "step": 1250 }, { "epoch": 0.22, "grad_norm": 0.86171954870224, "learning_rate": 4.975171939123005e-05, "loss": 2.203, "step": 1255 }, { "epoch": 0.23, "grad_norm": 0.751837432384491, "learning_rate": 4.9749738851382326e-05, "loss": 1.7247, "step": 1260 }, { "epoch": 0.23, "grad_norm": 3.416966438293457, "learning_rate": 4.9747750483216015e-05, "loss": 2.1371, "step": 1265 }, { "epoch": 0.23, "grad_norm": 0.8290925621986389, "learning_rate": 4.974575428736002e-05, "loss": 1.9527, "step": 1270 }, { "epoch": 0.23, "grad_norm": 0.5154220461845398, "learning_rate": 4.974375026444575e-05, "loss": 2.2402, "step": 1275 }, { "epoch": 0.23, "grad_norm": 1.5878034830093384, "learning_rate": 4.974173841510708e-05, "loss": 2.008, "step": 1280 }, { "epoch": 0.23, "grad_norm": 0.9623072743415833, "learning_rate": 4.9739718739980346e-05, "loss": 1.734, "step": 1285 }, { "epoch": 0.23, "grad_norm": 0.6699478030204773, "learning_rate": 4.9737691239704366e-05, "loss": 1.9048, "step": 1290 }, { "epoch": 0.23, "grad_norm": 0.6063868403434753, "learning_rate": 4.9735655914920445e-05, "loss": 2.3764, "step": 1295 }, { "epoch": 0.23, "grad_norm": 0.598061740398407, "learning_rate": 4.973361276627235e-05, "loss": 1.901, "step": 1300 }, { "epoch": 0.23, "grad_norm": 0.7591137886047363, "learning_rate": 4.9731561794406326e-05, "loss": 1.8706, "step": 1305 }, { "epoch": 0.23, "grad_norm": 0.5705850124359131, "learning_rate": 4.9729502999971086e-05, "loss": 2.0533, "step": 1310 }, { "epoch": 0.24, "grad_norm": 1.9160748720169067, "learning_rate": 4.9727436383617853e-05, "loss": 1.8877, "step": 1315 }, { "epoch": 0.24, "grad_norm": 0.8184042572975159, "learning_rate": 4.9725361946000267e-05, "loss": 1.8283, "step": 1320 }, { "epoch": 0.24, "grad_norm": 0.9235120415687561, "learning_rate": 4.9723279687774485e-05, "loss": 2.1376, "step": 1325 }, { "epoch": 0.24, "grad_norm": 0.4321657121181488, "learning_rate": 4.9721189609599106e-05, "loss": 1.9237, "step": 1330 }, { "epoch": 0.24, "grad_norm": 0.47391876578330994, "learning_rate": 4.9719091712135244e-05, "loss": 1.9186, "step": 1335 }, { "epoch": 0.24, "grad_norm": 2.293057441711426, "learning_rate": 4.9716985996046443e-05, "loss": 1.8054, "step": 1340 }, { "epoch": 0.24, "grad_norm": 0.6522160768508911, "learning_rate": 4.971487246199875e-05, "loss": 1.9026, "step": 1345 }, { "epoch": 0.24, "grad_norm": 0.6272959113121033, "learning_rate": 4.971275111066067e-05, "loss": 2.1646, "step": 1350 }, { "epoch": 0.24, "grad_norm": 0.32946717739105225, "learning_rate": 4.9710621942703186e-05, "loss": 1.8629, "step": 1355 }, { "epoch": 0.24, "grad_norm": 0.5091928839683533, "learning_rate": 4.970848495879975e-05, "loss": 2.0348, "step": 1360 }, { "epoch": 0.24, "grad_norm": 2.6853437423706055, "learning_rate": 4.9706340159626284e-05, "loss": 1.6108, "step": 1365 }, { "epoch": 0.25, "grad_norm": 0.7039760947227478, "learning_rate": 4.970418754586119e-05, "loss": 2.1027, "step": 1370 }, { "epoch": 0.25, "grad_norm": 0.4680666923522949, "learning_rate": 4.970202711818535e-05, "loss": 2.0027, "step": 1375 }, { "epoch": 0.25, "grad_norm": 0.49407467246055603, "learning_rate": 4.969985887728208e-05, "loss": 2.144, "step": 1380 }, { "epoch": 0.25, "grad_norm": 0.6088310480117798, "learning_rate": 4.96976828238372e-05, "loss": 2.2039, "step": 1385 }, { "epoch": 0.25, "grad_norm": 0.39288827776908875, "learning_rate": 4.9695498958539014e-05, "loss": 1.9803, "step": 1390 }, { "epoch": 0.25, "grad_norm": 0.5070251226425171, "learning_rate": 4.969330728207825e-05, "loss": 2.078, "step": 1395 }, { "epoch": 0.25, "grad_norm": 0.4578852653503418, "learning_rate": 4.969110779514815e-05, "loss": 1.7425, "step": 1400 }, { "epoch": 0.25, "grad_norm": 0.8231094479560852, "learning_rate": 4.96889004984444e-05, "loss": 2.1139, "step": 1405 }, { "epoch": 0.25, "grad_norm": 0.5976428985595703, "learning_rate": 4.9686685392665174e-05, "loss": 2.0963, "step": 1410 }, { "epoch": 0.25, "grad_norm": 0.8335079550743103, "learning_rate": 4.968446247851111e-05, "loss": 1.9815, "step": 1415 }, { "epoch": 0.25, "grad_norm": 0.8027464747428894, "learning_rate": 4.96822317566853e-05, "loss": 1.878, "step": 1420 }, { "epoch": 0.26, "grad_norm": 1.3042505979537964, "learning_rate": 4.9679993227893326e-05, "loss": 2.0152, "step": 1425 }, { "epoch": 0.26, "grad_norm": 0.6054438352584839, "learning_rate": 4.967774689284323e-05, "loss": 2.0464, "step": 1430 }, { "epoch": 0.26, "grad_norm": 0.845056414604187, "learning_rate": 4.9675492752245536e-05, "loss": 2.0495, "step": 1435 }, { "epoch": 0.26, "grad_norm": 0.930960476398468, "learning_rate": 4.967323080681322e-05, "loss": 2.0878, "step": 1440 }, { "epoch": 0.26, "grad_norm": 3.2143394947052, "learning_rate": 4.967096105726173e-05, "loss": 2.1571, "step": 1445 }, { "epoch": 0.26, "grad_norm": 0.4864540696144104, "learning_rate": 4.966868350430899e-05, "loss": 1.9011, "step": 1450 }, { "epoch": 0.26, "grad_norm": 0.6188135147094727, "learning_rate": 4.9666398148675374e-05, "loss": 2.0124, "step": 1455 }, { "epoch": 0.26, "grad_norm": 1.174637794494629, "learning_rate": 4.9664104991083757e-05, "loss": 1.8437, "step": 1460 }, { "epoch": 0.26, "grad_norm": 0.6095223426818848, "learning_rate": 4.9661804032259464e-05, "loss": 1.8675, "step": 1465 }, { "epoch": 0.26, "grad_norm": 0.4776439964771271, "learning_rate": 4.9659495272930266e-05, "loss": 2.1693, "step": 1470 }, { "epoch": 0.26, "grad_norm": 0.6089404225349426, "learning_rate": 4.9657178713826434e-05, "loss": 2.2899, "step": 1475 }, { "epoch": 0.26, "grad_norm": 1.6431376934051514, "learning_rate": 4.96548543556807e-05, "loss": 1.9234, "step": 1480 }, { "epoch": 0.27, "grad_norm": 0.5474340319633484, "learning_rate": 4.965252219922825e-05, "loss": 2.0113, "step": 1485 }, { "epoch": 0.27, "grad_norm": 0.44193366169929504, "learning_rate": 4.9650182245206726e-05, "loss": 1.8976, "step": 1490 }, { "epoch": 0.27, "grad_norm": 0.8061282634735107, "learning_rate": 4.9647834494356274e-05, "loss": 2.0584, "step": 1495 }, { "epoch": 0.27, "grad_norm": 0.5918030738830566, "learning_rate": 4.964547894741949e-05, "loss": 2.2051, "step": 1500 }, { "epoch": 0.27, "grad_norm": 0.7036905288696289, "learning_rate": 4.964311560514141e-05, "loss": 2.1186, "step": 1505 }, { "epoch": 0.27, "grad_norm": 0.5520045757293701, "learning_rate": 4.964074446826957e-05, "loss": 2.0825, "step": 1510 }, { "epoch": 0.27, "grad_norm": 0.34511449933052063, "learning_rate": 4.9638365537553956e-05, "loss": 1.9604, "step": 1515 }, { "epoch": 0.27, "grad_norm": 1.0205286741256714, "learning_rate": 4.963597881374702e-05, "loss": 1.5741, "step": 1520 }, { "epoch": 0.27, "grad_norm": 1.0340920686721802, "learning_rate": 4.9633584297603686e-05, "loss": 2.076, "step": 1525 }, { "epoch": 0.27, "grad_norm": 0.757640540599823, "learning_rate": 4.963118198988133e-05, "loss": 2.2935, "step": 1530 }, { "epoch": 0.27, "grad_norm": 1.603891134262085, "learning_rate": 4.9628771891339806e-05, "loss": 2.0039, "step": 1535 }, { "epoch": 0.28, "grad_norm": 0.5811793804168701, "learning_rate": 4.962635400274142e-05, "loss": 1.8798, "step": 1540 }, { "epoch": 0.28, "grad_norm": 0.6387256383895874, "learning_rate": 4.962392832485095e-05, "loss": 1.814, "step": 1545 }, { "epoch": 0.28, "grad_norm": 0.7429436445236206, "learning_rate": 4.962149485843564e-05, "loss": 1.8334, "step": 1550 }, { "epoch": 0.28, "grad_norm": 0.6524866819381714, "learning_rate": 4.9619053604265185e-05, "loss": 1.9859, "step": 1555 }, { "epoch": 0.28, "grad_norm": 0.7228694558143616, "learning_rate": 4.961660456311176e-05, "loss": 1.8701, "step": 1560 }, { "epoch": 0.28, "grad_norm": 0.49972599744796753, "learning_rate": 4.961414773574998e-05, "loss": 2.0535, "step": 1565 }, { "epoch": 0.28, "grad_norm": 0.3244457542896271, "learning_rate": 4.961168312295696e-05, "loss": 1.7632, "step": 1570 }, { "epoch": 0.28, "grad_norm": 0.6015215516090393, "learning_rate": 4.9609210725512234e-05, "loss": 1.8848, "step": 1575 }, { "epoch": 0.28, "grad_norm": 0.5646616220474243, "learning_rate": 4.9606730544197834e-05, "loss": 1.8899, "step": 1580 }, { "epoch": 0.28, "grad_norm": 0.66986483335495, "learning_rate": 4.960424257979822e-05, "loss": 2.0326, "step": 1585 }, { "epoch": 0.28, "grad_norm": 1.0103802680969238, "learning_rate": 4.960174683310035e-05, "loss": 1.9453, "step": 1590 }, { "epoch": 0.29, "grad_norm": 0.6305530667304993, "learning_rate": 4.9599243304893625e-05, "loss": 1.6763, "step": 1595 }, { "epoch": 0.29, "grad_norm": 0.6252381801605225, "learning_rate": 4.95967319959699e-05, "loss": 1.7975, "step": 1600 }, { "epoch": 0.29, "grad_norm": 1.2989176511764526, "learning_rate": 4.95942129071235e-05, "loss": 2.024, "step": 1605 }, { "epoch": 0.29, "grad_norm": 0.5174963474273682, "learning_rate": 4.959168603915122e-05, "loss": 1.7532, "step": 1610 }, { "epoch": 0.29, "grad_norm": 0.730141282081604, "learning_rate": 4.95891513928523e-05, "loss": 1.7375, "step": 1615 }, { "epoch": 0.29, "grad_norm": 0.690668523311615, "learning_rate": 4.958660896902844e-05, "loss": 2.0428, "step": 1620 }, { "epoch": 0.29, "grad_norm": 0.5279943346977234, "learning_rate": 4.958405876848382e-05, "loss": 1.8309, "step": 1625 }, { "epoch": 0.29, "grad_norm": 0.4510974586009979, "learning_rate": 4.958150079202505e-05, "loss": 1.7792, "step": 1630 }, { "epoch": 0.29, "grad_norm": 0.628743052482605, "learning_rate": 4.957893504046123e-05, "loss": 2.0351, "step": 1635 }, { "epoch": 0.29, "grad_norm": 0.7251464128494263, "learning_rate": 4.95763615146039e-05, "loss": 1.9267, "step": 1640 }, { "epoch": 0.29, "grad_norm": 0.6540181040763855, "learning_rate": 4.957378021526705e-05, "loss": 1.8565, "step": 1645 }, { "epoch": 0.3, "grad_norm": 0.511382520198822, "learning_rate": 4.957119114326717e-05, "loss": 2.0029, "step": 1650 }, { "epoch": 0.3, "grad_norm": 0.8582594990730286, "learning_rate": 4.9568594299423154e-05, "loss": 1.9714, "step": 1655 }, { "epoch": 0.3, "grad_norm": 0.29603156447410583, "learning_rate": 4.9565989684556405e-05, "loss": 2.0099, "step": 1660 }, { "epoch": 0.3, "grad_norm": 0.5270999073982239, "learning_rate": 4.956337729949074e-05, "loss": 1.9188, "step": 1665 }, { "epoch": 0.3, "grad_norm": 0.8146567940711975, "learning_rate": 4.9560757145052465e-05, "loss": 2.0585, "step": 1670 }, { "epoch": 0.3, "grad_norm": 0.967383861541748, "learning_rate": 4.955812922207033e-05, "loss": 1.86, "step": 1675 }, { "epoch": 0.3, "grad_norm": 0.5086208581924438, "learning_rate": 4.955549353137554e-05, "loss": 1.8033, "step": 1680 }, { "epoch": 0.3, "grad_norm": 0.7942809462547302, "learning_rate": 4.955285007380177e-05, "loss": 1.9515, "step": 1685 }, { "epoch": 0.3, "grad_norm": 0.9034421443939209, "learning_rate": 4.9550198850185136e-05, "loss": 1.747, "step": 1690 }, { "epoch": 0.3, "grad_norm": 0.7966691851615906, "learning_rate": 4.954753986136422e-05, "loss": 2.2449, "step": 1695 }, { "epoch": 0.3, "grad_norm": 0.7726231813430786, "learning_rate": 4.954487310818006e-05, "loss": 2.1002, "step": 1700 }, { "epoch": 0.31, "grad_norm": 0.5616759061813354, "learning_rate": 4.9542198591476144e-05, "loss": 2.0595, "step": 1705 }, { "epoch": 0.31, "grad_norm": 0.9209871888160706, "learning_rate": 4.953951631209842e-05, "loss": 1.7772, "step": 1710 }, { "epoch": 0.31, "grad_norm": 0.5064948201179504, "learning_rate": 4.953682627089529e-05, "loss": 1.9936, "step": 1715 }, { "epoch": 0.31, "grad_norm": 0.5640893578529358, "learning_rate": 4.953412846871761e-05, "loss": 2.0519, "step": 1720 }, { "epoch": 0.31, "grad_norm": 0.8482123017311096, "learning_rate": 4.95314229064187e-05, "loss": 1.573, "step": 1725 }, { "epoch": 0.31, "grad_norm": 1.2031537294387817, "learning_rate": 4.952870958485432e-05, "loss": 1.8774, "step": 1730 }, { "epoch": 0.31, "grad_norm": 0.6398987174034119, "learning_rate": 4.952598850488269e-05, "loss": 2.3118, "step": 1735 }, { "epoch": 0.31, "grad_norm": 1.2966880798339844, "learning_rate": 4.952325966736449e-05, "loss": 1.9382, "step": 1740 }, { "epoch": 0.31, "grad_norm": 1.0261720418930054, "learning_rate": 4.952052307316284e-05, "loss": 1.8073, "step": 1745 }, { "epoch": 0.31, "grad_norm": 1.920386791229248, "learning_rate": 4.951777872314333e-05, "loss": 1.752, "step": 1750 }, { "epoch": 0.31, "grad_norm": 0.49061957001686096, "learning_rate": 4.951502661817399e-05, "loss": 2.0995, "step": 1755 }, { "epoch": 0.32, "grad_norm": 0.8655161261558533, "learning_rate": 4.951226675912532e-05, "loss": 1.8597, "step": 1760 }, { "epoch": 0.32, "grad_norm": 0.9834558367729187, "learning_rate": 4.9509499146870236e-05, "loss": 1.9147, "step": 1765 }, { "epoch": 0.32, "grad_norm": 1.557944893836975, "learning_rate": 4.950672378228416e-05, "loss": 1.8975, "step": 1770 }, { "epoch": 0.32, "grad_norm": 0.3454476296901703, "learning_rate": 4.950394066624492e-05, "loss": 1.9411, "step": 1775 }, { "epoch": 0.32, "grad_norm": 0.5218179225921631, "learning_rate": 4.950114979963282e-05, "loss": 1.8548, "step": 1780 }, { "epoch": 0.32, "grad_norm": 0.9810051321983337, "learning_rate": 4.94983511833306e-05, "loss": 2.064, "step": 1785 }, { "epoch": 0.32, "grad_norm": 0.9594024419784546, "learning_rate": 4.949554481822347e-05, "loss": 1.996, "step": 1790 }, { "epoch": 0.32, "grad_norm": 0.664357602596283, "learning_rate": 4.949273070519907e-05, "loss": 1.8579, "step": 1795 }, { "epoch": 0.32, "grad_norm": 0.5438924431800842, "learning_rate": 4.948990884514752e-05, "loss": 1.829, "step": 1800 }, { "epoch": 0.32, "grad_norm": 0.6013981699943542, "learning_rate": 4.948707923896134e-05, "loss": 1.6491, "step": 1805 }, { "epoch": 0.32, "grad_norm": 0.6347360014915466, "learning_rate": 4.948424188753556e-05, "loss": 2.038, "step": 1810 }, { "epoch": 0.32, "grad_norm": 0.27879294753074646, "learning_rate": 4.948139679176762e-05, "loss": 1.9321, "step": 1815 }, { "epoch": 0.33, "grad_norm": 0.48399272561073303, "learning_rate": 4.9478543952557425e-05, "loss": 1.7748, "step": 1820 }, { "epoch": 0.33, "grad_norm": 0.5349772572517395, "learning_rate": 4.9475683370807326e-05, "loss": 2.0395, "step": 1825 }, { "epoch": 0.33, "grad_norm": 1.5943443775177002, "learning_rate": 4.9472815047422115e-05, "loss": 1.759, "step": 1830 }, { "epoch": 0.33, "grad_norm": 0.6108463406562805, "learning_rate": 4.9469938983309045e-05, "loss": 2.1658, "step": 1835 }, { "epoch": 0.33, "grad_norm": 1.8649053573608398, "learning_rate": 4.946705517937782e-05, "loss": 1.9245, "step": 1840 }, { "epoch": 0.33, "grad_norm": 1.3017230033874512, "learning_rate": 4.946416363654056e-05, "loss": 1.7443, "step": 1845 }, { "epoch": 0.33, "grad_norm": 0.48545828461647034, "learning_rate": 4.946126435571188e-05, "loss": 1.9818, "step": 1850 }, { "epoch": 0.33, "grad_norm": 0.7874136567115784, "learning_rate": 4.945835733780881e-05, "loss": 1.8089, "step": 1855 }, { "epoch": 0.33, "grad_norm": 0.6153427958488464, "learning_rate": 4.945544258375086e-05, "loss": 1.896, "step": 1860 }, { "epoch": 0.33, "grad_norm": 2.7674331665039062, "learning_rate": 4.945252009445992e-05, "loss": 2.0585, "step": 1865 }, { "epoch": 0.33, "grad_norm": 1.3186115026474, "learning_rate": 4.94495898708604e-05, "loss": 2.1453, "step": 1870 }, { "epoch": 0.34, "grad_norm": 0.5109472274780273, "learning_rate": 4.944665191387913e-05, "loss": 1.7282, "step": 1875 }, { "epoch": 0.34, "grad_norm": 0.5344210267066956, "learning_rate": 4.9443706224445366e-05, "loss": 1.8201, "step": 1880 }, { "epoch": 0.34, "grad_norm": 10.217705726623535, "learning_rate": 4.9440752803490844e-05, "loss": 2.2501, "step": 1885 }, { "epoch": 0.34, "grad_norm": 0.9253365993499756, "learning_rate": 4.9437791651949704e-05, "loss": 1.9486, "step": 1890 }, { "epoch": 0.34, "grad_norm": 0.6712260842323303, "learning_rate": 4.943482277075858e-05, "loss": 1.8439, "step": 1895 }, { "epoch": 0.34, "grad_norm": 0.6539827585220337, "learning_rate": 4.943184616085652e-05, "loss": 1.9844, "step": 1900 }, { "epoch": 0.34, "grad_norm": 0.4726671278476715, "learning_rate": 4.9428861823185016e-05, "loss": 1.7578, "step": 1905 }, { "epoch": 0.34, "grad_norm": 0.7064640522003174, "learning_rate": 4.942586975868801e-05, "loss": 2.0314, "step": 1910 }, { "epoch": 0.34, "grad_norm": 1.1073232889175415, "learning_rate": 4.94228699683119e-05, "loss": 1.9822, "step": 1915 }, { "epoch": 0.34, "grad_norm": 0.4439001679420471, "learning_rate": 4.941986245300552e-05, "loss": 1.9749, "step": 1920 }, { "epoch": 0.34, "grad_norm": 0.8952843546867371, "learning_rate": 4.941684721372012e-05, "loss": 1.9725, "step": 1925 }, { "epoch": 0.35, "grad_norm": 0.6618747115135193, "learning_rate": 4.941382425140944e-05, "loss": 1.7512, "step": 1930 }, { "epoch": 0.35, "grad_norm": 0.7568296790122986, "learning_rate": 4.941079356702963e-05, "loss": 1.9417, "step": 1935 }, { "epoch": 0.35, "grad_norm": 0.7739149332046509, "learning_rate": 4.9407755161539295e-05, "loss": 2.0415, "step": 1940 }, { "epoch": 0.35, "grad_norm": 0.6339311003684998, "learning_rate": 4.9404709035899475e-05, "loss": 2.0725, "step": 1945 }, { "epoch": 0.35, "grad_norm": 0.8696390390396118, "learning_rate": 4.940165519107367e-05, "loss": 1.9008, "step": 1950 }, { "epoch": 0.35, "grad_norm": 0.9669596552848816, "learning_rate": 4.939859362802779e-05, "loss": 2.1317, "step": 1955 }, { "epoch": 0.35, "grad_norm": 1.293050765991211, "learning_rate": 4.9395524347730224e-05, "loss": 2.0905, "step": 1960 }, { "epoch": 0.35, "grad_norm": 3.0222151279449463, "learning_rate": 4.9392447351151766e-05, "loss": 2.1604, "step": 1965 }, { "epoch": 0.35, "grad_norm": 1.7745553255081177, "learning_rate": 4.9389362639265673e-05, "loss": 1.7385, "step": 1970 }, { "epoch": 0.35, "grad_norm": 0.5706347227096558, "learning_rate": 4.938627021304764e-05, "loss": 1.6271, "step": 1975 }, { "epoch": 0.35, "grad_norm": 0.8289479613304138, "learning_rate": 4.9383170073475795e-05, "loss": 2.0758, "step": 1980 }, { "epoch": 0.36, "grad_norm": 0.584398090839386, "learning_rate": 4.938006222153071e-05, "loss": 2.0172, "step": 1985 }, { "epoch": 0.36, "grad_norm": 1.0959614515304565, "learning_rate": 4.9376946658195385e-05, "loss": 1.963, "step": 1990 }, { "epoch": 0.36, "grad_norm": 0.6466994881629944, "learning_rate": 4.93738233844553e-05, "loss": 2.0027, "step": 1995 }, { "epoch": 0.36, "grad_norm": 1.194799780845642, "learning_rate": 4.93706924012983e-05, "loss": 2.0338, "step": 2000 }, { "epoch": 0.36, "grad_norm": 4.551344871520996, "learning_rate": 4.9367553709714754e-05, "loss": 2.0308, "step": 2005 }, { "epoch": 0.36, "grad_norm": 0.7082229852676392, "learning_rate": 4.9364407310697394e-05, "loss": 1.6489, "step": 2010 }, { "epoch": 0.36, "grad_norm": 0.5904958248138428, "learning_rate": 4.9361253205241446e-05, "loss": 1.8373, "step": 2015 }, { "epoch": 0.36, "grad_norm": 1.1805599927902222, "learning_rate": 4.9358091394344543e-05, "loss": 1.8094, "step": 2020 }, { "epoch": 0.36, "grad_norm": 0.45778775215148926, "learning_rate": 4.9354921879006755e-05, "loss": 1.8818, "step": 2025 }, { "epoch": 0.36, "grad_norm": 0.8848242163658142, "learning_rate": 4.93517446602306e-05, "loss": 1.923, "step": 2030 }, { "epoch": 0.36, "grad_norm": 0.4579658508300781, "learning_rate": 4.934855973902105e-05, "loss": 2.0447, "step": 2035 }, { "epoch": 0.37, "grad_norm": 0.5983306169509888, "learning_rate": 4.934536711638546e-05, "loss": 2.0134, "step": 2040 }, { "epoch": 0.37, "grad_norm": 1.4230965375900269, "learning_rate": 4.934216679333367e-05, "loss": 2.1751, "step": 2045 }, { "epoch": 0.37, "grad_norm": 0.8285879492759705, "learning_rate": 4.933895877087794e-05, "loss": 1.8737, "step": 2050 }, { "epoch": 0.37, "grad_norm": 0.829188346862793, "learning_rate": 4.933574305003296e-05, "loss": 1.9408, "step": 2055 }, { "epoch": 0.37, "grad_norm": 0.6285657286643982, "learning_rate": 4.933251963181586e-05, "loss": 1.8272, "step": 2060 }, { "epoch": 0.37, "grad_norm": 0.5883930921554565, "learning_rate": 4.932928851724621e-05, "loss": 1.8055, "step": 2065 }, { "epoch": 0.37, "grad_norm": 0.6375488042831421, "learning_rate": 4.9326049707346e-05, "loss": 1.8569, "step": 2070 }, { "epoch": 0.37, "grad_norm": 0.6939455270767212, "learning_rate": 4.9322803203139666e-05, "loss": 1.5819, "step": 2075 }, { "epoch": 0.37, "grad_norm": 0.8837314248085022, "learning_rate": 4.931954900565408e-05, "loss": 1.8026, "step": 2080 }, { "epoch": 0.37, "grad_norm": 0.8050704598426819, "learning_rate": 4.931628711591854e-05, "loss": 1.9522, "step": 2085 }, { "epoch": 0.37, "grad_norm": 0.8270391821861267, "learning_rate": 4.931301753496476e-05, "loss": 1.9587, "step": 2090 }, { "epoch": 0.37, "grad_norm": 1.1010689735412598, "learning_rate": 4.930974026382693e-05, "loss": 2.0557, "step": 2095 }, { "epoch": 0.38, "grad_norm": 4.389054298400879, "learning_rate": 4.930645530354163e-05, "loss": 1.7961, "step": 2100 }, { "epoch": 0.38, "grad_norm": 0.7086299061775208, "learning_rate": 4.93031626551479e-05, "loss": 1.8631, "step": 2105 }, { "epoch": 0.38, "grad_norm": 0.6452075242996216, "learning_rate": 4.9299862319687204e-05, "loss": 2.1335, "step": 2110 }, { "epoch": 0.38, "grad_norm": 0.5691851377487183, "learning_rate": 4.9296554298203423e-05, "loss": 2.1724, "step": 2115 }, { "epoch": 0.38, "grad_norm": 0.40636715292930603, "learning_rate": 4.929323859174289e-05, "loss": 1.908, "step": 2120 }, { "epoch": 0.38, "grad_norm": 0.7919933795928955, "learning_rate": 4.928991520135436e-05, "loss": 1.8257, "step": 2125 }, { "epoch": 0.38, "grad_norm": 1.567937970161438, "learning_rate": 4.928658412808901e-05, "loss": 1.8642, "step": 2130 }, { "epoch": 0.38, "grad_norm": 0.49311545491218567, "learning_rate": 4.9283245373000465e-05, "loss": 1.8398, "step": 2135 }, { "epoch": 0.38, "grad_norm": 0.7059115767478943, "learning_rate": 4.927989893714477e-05, "loss": 1.879, "step": 2140 }, { "epoch": 0.38, "grad_norm": 0.5949462652206421, "learning_rate": 4.927654482158039e-05, "loss": 1.9516, "step": 2145 }, { "epoch": 0.38, "grad_norm": 0.8240839838981628, "learning_rate": 4.927318302736824e-05, "loss": 2.192, "step": 2150 }, { "epoch": 0.39, "grad_norm": 0.7169739603996277, "learning_rate": 4.926981355557164e-05, "loss": 1.7331, "step": 2155 }, { "epoch": 0.39, "grad_norm": 3.85520076751709, "learning_rate": 4.926643640725635e-05, "loss": 1.9134, "step": 2160 }, { "epoch": 0.39, "grad_norm": 0.6009440422058105, "learning_rate": 4.9263051583490584e-05, "loss": 1.8528, "step": 2165 }, { "epoch": 0.39, "grad_norm": 0.7495572566986084, "learning_rate": 4.9259659085344925e-05, "loss": 1.7285, "step": 2170 }, { "epoch": 0.39, "grad_norm": 0.4452281892299652, "learning_rate": 4.925625891389244e-05, "loss": 1.9832, "step": 2175 }, { "epoch": 0.39, "grad_norm": 0.6736578345298767, "learning_rate": 4.925285107020859e-05, "loss": 2.4924, "step": 2180 }, { "epoch": 0.39, "grad_norm": 0.6530075073242188, "learning_rate": 4.924943555537128e-05, "loss": 2.2028, "step": 2185 }, { "epoch": 0.39, "grad_norm": 0.629514753818512, "learning_rate": 4.924601237046082e-05, "loss": 1.9472, "step": 2190 }, { "epoch": 0.39, "grad_norm": 0.8466399908065796, "learning_rate": 4.924258151655997e-05, "loss": 1.8294, "step": 2195 }, { "epoch": 0.39, "grad_norm": 0.8130293488502502, "learning_rate": 4.92391429947539e-05, "loss": 2.1553, "step": 2200 }, { "epoch": 0.39, "grad_norm": 0.7702954411506653, "learning_rate": 4.923569680613023e-05, "loss": 2.0827, "step": 2205 }, { "epoch": 0.4, "grad_norm": 0.4999507963657379, "learning_rate": 4.923224295177897e-05, "loss": 1.736, "step": 2210 }, { "epoch": 0.4, "grad_norm": 1.1567188501358032, "learning_rate": 4.922878143279256e-05, "loss": 1.7352, "step": 2215 }, { "epoch": 0.4, "grad_norm": 0.736792802810669, "learning_rate": 4.92253122502659e-05, "loss": 1.606, "step": 2220 }, { "epoch": 0.4, "grad_norm": 0.5883451700210571, "learning_rate": 4.922183540529627e-05, "loss": 1.8151, "step": 2225 }, { "epoch": 0.4, "grad_norm": 0.6268717646598816, "learning_rate": 4.921835089898341e-05, "loss": 2.3119, "step": 2230 }, { "epoch": 0.4, "grad_norm": 2.0409209728240967, "learning_rate": 4.9214858732429444e-05, "loss": 1.9868, "step": 2235 }, { "epoch": 0.4, "grad_norm": 0.6836051344871521, "learning_rate": 4.921135890673896e-05, "loss": 1.8223, "step": 2240 }, { "epoch": 0.4, "grad_norm": 0.6672108173370361, "learning_rate": 4.920785142301893e-05, "loss": 2.0905, "step": 2245 }, { "epoch": 0.4, "grad_norm": 0.7968783378601074, "learning_rate": 4.920433628237879e-05, "loss": 1.8637, "step": 2250 }, { "epoch": 0.4, "grad_norm": 0.7804060578346252, "learning_rate": 4.9200813485930375e-05, "loss": 1.749, "step": 2255 }, { "epoch": 0.4, "grad_norm": 1.4104400873184204, "learning_rate": 4.9197283034787925e-05, "loss": 1.8482, "step": 2260 }, { "epoch": 0.41, "grad_norm": 0.480487585067749, "learning_rate": 4.919374493006812e-05, "loss": 1.966, "step": 2265 }, { "epoch": 0.41, "grad_norm": 0.7626844644546509, "learning_rate": 4.919019917289006e-05, "loss": 1.796, "step": 2270 }, { "epoch": 0.41, "grad_norm": 0.4077138304710388, "learning_rate": 4.918664576437528e-05, "loss": 2.1519, "step": 2275 }, { "epoch": 0.41, "grad_norm": 0.5030271410942078, "learning_rate": 4.918308470564771e-05, "loss": 1.8834, "step": 2280 }, { "epoch": 0.41, "grad_norm": 0.5887728929519653, "learning_rate": 4.9179515997833706e-05, "loss": 2.0374, "step": 2285 }, { "epoch": 0.41, "grad_norm": 0.766281247138977, "learning_rate": 4.917593964206205e-05, "loss": 1.8359, "step": 2290 }, { "epoch": 0.41, "grad_norm": 0.6358659267425537, "learning_rate": 4.917235563946394e-05, "loss": 1.7785, "step": 2295 }, { "epoch": 0.41, "grad_norm": 0.4064447283744812, "learning_rate": 4.916876399117299e-05, "loss": 2.0286, "step": 2300 }, { "epoch": 0.41, "grad_norm": 0.7041420936584473, "learning_rate": 4.916516469832524e-05, "loss": 1.7674, "step": 2305 }, { "epoch": 0.41, "grad_norm": 0.7361471056938171, "learning_rate": 4.916155776205913e-05, "loss": 1.7543, "step": 2310 }, { "epoch": 0.41, "grad_norm": 1.4390116930007935, "learning_rate": 4.915794318351555e-05, "loss": 1.7741, "step": 2315 }, { "epoch": 0.42, "grad_norm": 0.5468139052391052, "learning_rate": 4.915432096383779e-05, "loss": 1.8634, "step": 2320 }, { "epoch": 0.42, "grad_norm": 0.36798736453056335, "learning_rate": 4.915069110417152e-05, "loss": 1.9809, "step": 2325 }, { "epoch": 0.42, "grad_norm": 0.6476161479949951, "learning_rate": 4.914705360566489e-05, "loss": 1.8003, "step": 2330 }, { "epoch": 0.42, "grad_norm": 0.4177202880382538, "learning_rate": 4.914340846946844e-05, "loss": 1.8988, "step": 2335 }, { "epoch": 0.42, "grad_norm": 0.6585730314254761, "learning_rate": 4.91397556967351e-05, "loss": 2.0532, "step": 2340 }, { "epoch": 0.42, "grad_norm": 0.7709575295448303, "learning_rate": 4.913609528862027e-05, "loss": 1.8792, "step": 2345 }, { "epoch": 0.42, "grad_norm": 0.782526969909668, "learning_rate": 4.9132427246281705e-05, "loss": 1.7209, "step": 2350 }, { "epoch": 0.42, "grad_norm": 0.9264500737190247, "learning_rate": 4.9128751570879616e-05, "loss": 1.9869, "step": 2355 }, { "epoch": 0.42, "grad_norm": 0.5954318642616272, "learning_rate": 4.912506826357661e-05, "loss": 1.9642, "step": 2360 }, { "epoch": 0.42, "grad_norm": 0.49041104316711426, "learning_rate": 4.9121377325537724e-05, "loss": 1.6972, "step": 2365 }, { "epoch": 0.42, "grad_norm": 0.8424311280250549, "learning_rate": 4.911767875793039e-05, "loss": 1.7111, "step": 2370 }, { "epoch": 0.43, "grad_norm": 0.7687463760375977, "learning_rate": 4.911397256192446e-05, "loss": 1.8755, "step": 2375 }, { "epoch": 0.43, "grad_norm": 0.6133802533149719, "learning_rate": 4.9110258738692204e-05, "loss": 2.5453, "step": 2380 }, { "epoch": 0.43, "grad_norm": 0.6727443337440491, "learning_rate": 4.9106537289408305e-05, "loss": 1.6229, "step": 2385 }, { "epoch": 0.43, "grad_norm": 1.0047177076339722, "learning_rate": 4.910280821524985e-05, "loss": 2.0174, "step": 2390 }, { "epoch": 0.43, "grad_norm": 0.6900597810745239, "learning_rate": 4.909907151739633e-05, "loss": 1.8986, "step": 2395 }, { "epoch": 0.43, "grad_norm": 0.730687141418457, "learning_rate": 4.909532719702968e-05, "loss": 1.8659, "step": 2400 }, { "epoch": 0.43, "grad_norm": 0.8315011858940125, "learning_rate": 4.909157525533421e-05, "loss": 1.6193, "step": 2405 }, { "epoch": 0.43, "grad_norm": 2.3859171867370605, "learning_rate": 4.908781569349666e-05, "loss": 2.1098, "step": 2410 }, { "epoch": 0.43, "grad_norm": 0.7538830637931824, "learning_rate": 4.9084048512706174e-05, "loss": 1.7888, "step": 2415 }, { "epoch": 0.43, "grad_norm": 0.840951681137085, "learning_rate": 4.908027371415431e-05, "loss": 1.6484, "step": 2420 }, { "epoch": 0.43, "grad_norm": 1.1713069677352905, "learning_rate": 4.907649129903504e-05, "loss": 1.9293, "step": 2425 }, { "epoch": 0.43, "grad_norm": 0.6097638010978699, "learning_rate": 4.9072701268544726e-05, "loss": 1.9388, "step": 2430 }, { "epoch": 0.44, "grad_norm": 0.6106088161468506, "learning_rate": 4.906890362388215e-05, "loss": 1.6406, "step": 2435 }, { "epoch": 0.44, "grad_norm": 1.9904814958572388, "learning_rate": 4.906509836624852e-05, "loss": 1.6748, "step": 2440 }, { "epoch": 0.44, "grad_norm": 1.0274256467819214, "learning_rate": 4.906128549684741e-05, "loss": 1.9952, "step": 2445 }, { "epoch": 0.44, "grad_norm": 0.5849592685699463, "learning_rate": 4.905746501688485e-05, "loss": 1.7149, "step": 2450 }, { "epoch": 0.44, "grad_norm": 1.4269081354141235, "learning_rate": 4.9053636927569246e-05, "loss": 2.0261, "step": 2455 }, { "epoch": 0.44, "grad_norm": 0.9268386363983154, "learning_rate": 4.904980123011142e-05, "loss": 1.6845, "step": 2460 }, { "epoch": 0.44, "grad_norm": 0.6730044484138489, "learning_rate": 4.904595792572459e-05, "loss": 1.9066, "step": 2465 }, { "epoch": 0.44, "grad_norm": 0.5785349011421204, "learning_rate": 4.9042107015624405e-05, "loss": 1.9116, "step": 2470 }, { "epoch": 0.44, "grad_norm": 7.519561767578125, "learning_rate": 4.903824850102889e-05, "loss": 1.9645, "step": 2475 }, { "epoch": 0.44, "grad_norm": 0.5183913111686707, "learning_rate": 4.903438238315851e-05, "loss": 2.0975, "step": 2480 }, { "epoch": 0.44, "grad_norm": 0.6047394871711731, "learning_rate": 4.903050866323609e-05, "loss": 1.9343, "step": 2485 }, { "epoch": 0.45, "grad_norm": 1.560421109199524, "learning_rate": 4.9026627342486886e-05, "loss": 1.9887, "step": 2490 }, { "epoch": 0.45, "grad_norm": 0.5357934236526489, "learning_rate": 4.902273842213857e-05, "loss": 2.06, "step": 2495 }, { "epoch": 0.45, "grad_norm": 0.7645955085754395, "learning_rate": 4.901884190342121e-05, "loss": 1.7665, "step": 2500 }, { "epoch": 0.45, "grad_norm": 0.7442848086357117, "learning_rate": 4.901493778756725e-05, "loss": 1.7705, "step": 2505 }, { "epoch": 0.45, "grad_norm": 0.5181974768638611, "learning_rate": 4.901102607581156e-05, "loss": 2.1552, "step": 2510 }, { "epoch": 0.45, "grad_norm": 0.5318804979324341, "learning_rate": 4.900710676939143e-05, "loss": 1.925, "step": 2515 }, { "epoch": 0.45, "grad_norm": 0.42624104022979736, "learning_rate": 4.900317986954651e-05, "loss": 1.8521, "step": 2520 }, { "epoch": 0.45, "grad_norm": 0.8136389255523682, "learning_rate": 4.89992453775189e-05, "loss": 1.8125, "step": 2525 }, { "epoch": 0.45, "grad_norm": 0.8504977226257324, "learning_rate": 4.8995303294553054e-05, "loss": 1.704, "step": 2530 }, { "epoch": 0.45, "grad_norm": 0.5383654236793518, "learning_rate": 4.899135362189585e-05, "loss": 1.8018, "step": 2535 }, { "epoch": 0.45, "grad_norm": 0.590053915977478, "learning_rate": 4.898739636079658e-05, "loss": 1.7633, "step": 2540 }, { "epoch": 0.46, "grad_norm": 0.6471716165542603, "learning_rate": 4.898343151250691e-05, "loss": 1.8805, "step": 2545 }, { "epoch": 0.46, "grad_norm": 0.8511287569999695, "learning_rate": 4.8979459078280924e-05, "loss": 1.5846, "step": 2550 }, { "epoch": 0.46, "grad_norm": 0.3268200755119324, "learning_rate": 4.8975479059375093e-05, "loss": 1.6611, "step": 2555 }, { "epoch": 0.46, "grad_norm": 0.7143041491508484, "learning_rate": 4.8971491457048305e-05, "loss": 1.9187, "step": 2560 }, { "epoch": 0.46, "grad_norm": 0.5510767698287964, "learning_rate": 4.896749627256182e-05, "loss": 2.0012, "step": 2565 }, { "epoch": 0.46, "grad_norm": 0.34194180369377136, "learning_rate": 4.8963493507179314e-05, "loss": 1.8626, "step": 2570 }, { "epoch": 0.46, "grad_norm": 1.2060723304748535, "learning_rate": 4.895948316216687e-05, "loss": 1.7144, "step": 2575 }, { "epoch": 0.46, "grad_norm": 0.33225440979003906, "learning_rate": 4.895546523879294e-05, "loss": 1.709, "step": 2580 }, { "epoch": 0.46, "grad_norm": 0.596155047416687, "learning_rate": 4.895143973832841e-05, "loss": 2.3011, "step": 2585 }, { "epoch": 0.46, "grad_norm": 0.8214730620384216, "learning_rate": 4.8947406662046516e-05, "loss": 1.7885, "step": 2590 }, { "epoch": 0.46, "grad_norm": 0.9236437082290649, "learning_rate": 4.894336601122293e-05, "loss": 2.1127, "step": 2595 }, { "epoch": 0.47, "grad_norm": 0.6400870680809021, "learning_rate": 4.893931778713572e-05, "loss": 2.0813, "step": 2600 }, { "epoch": 0.47, "grad_norm": 0.41277340054512024, "learning_rate": 4.893526199106531e-05, "loss": 2.1293, "step": 2605 }, { "epoch": 0.47, "grad_norm": 1.0775052309036255, "learning_rate": 4.893119862429455e-05, "loss": 1.453, "step": 2610 }, { "epoch": 0.47, "grad_norm": 0.6560648083686829, "learning_rate": 4.892712768810869e-05, "loss": 1.6368, "step": 2615 }, { "epoch": 0.47, "grad_norm": 0.7044461369514465, "learning_rate": 4.8923049183795355e-05, "loss": 1.9884, "step": 2620 }, { "epoch": 0.47, "grad_norm": 0.6141453385353088, "learning_rate": 4.8918963112644576e-05, "loss": 1.7104, "step": 2625 }, { "epoch": 0.47, "grad_norm": 0.2917144298553467, "learning_rate": 4.891486947594878e-05, "loss": 1.9332, "step": 2630 }, { "epoch": 0.47, "grad_norm": 0.97393798828125, "learning_rate": 4.8910768275002763e-05, "loss": 1.9389, "step": 2635 }, { "epoch": 0.47, "grad_norm": 0.5923995971679688, "learning_rate": 4.890665951110374e-05, "loss": 2.0709, "step": 2640 }, { "epoch": 0.47, "grad_norm": 0.865432620048523, "learning_rate": 4.890254318555131e-05, "loss": 2.1132, "step": 2645 }, { "epoch": 0.47, "grad_norm": 0.7784671187400818, "learning_rate": 4.889841929964746e-05, "loss": 1.7297, "step": 2650 }, { "epoch": 0.48, "grad_norm": 0.5125192999839783, "learning_rate": 4.889428785469657e-05, "loss": 1.9602, "step": 2655 }, { "epoch": 0.48, "grad_norm": 0.7674616575241089, "learning_rate": 4.8890148852005416e-05, "loss": 1.9787, "step": 2660 }, { "epoch": 0.48, "grad_norm": 1.3875999450683594, "learning_rate": 4.888600229288317e-05, "loss": 1.7716, "step": 2665 }, { "epoch": 0.48, "grad_norm": 0.8313968181610107, "learning_rate": 4.8881848178641364e-05, "loss": 1.4916, "step": 2670 }, { "epoch": 0.48, "grad_norm": 0.8791585564613342, "learning_rate": 4.887768651059395e-05, "loss": 1.9918, "step": 2675 }, { "epoch": 0.48, "grad_norm": 0.9071698188781738, "learning_rate": 4.887351729005726e-05, "loss": 2.0969, "step": 2680 }, { "epoch": 0.48, "grad_norm": 0.9274541139602661, "learning_rate": 4.8869340518350024e-05, "loss": 2.0151, "step": 2685 }, { "epoch": 0.48, "grad_norm": 0.5907272100448608, "learning_rate": 4.8865156196793334e-05, "loss": 2.1632, "step": 2690 }, { "epoch": 0.48, "grad_norm": 0.831947922706604, "learning_rate": 4.88609643267107e-05, "loss": 2.2977, "step": 2695 }, { "epoch": 0.48, "grad_norm": 0.7203566431999207, "learning_rate": 4.885676490942799e-05, "loss": 2.071, "step": 2700 }, { "epoch": 0.48, "grad_norm": 1.22580087184906, "learning_rate": 4.885255794627349e-05, "loss": 1.9375, "step": 2705 }, { "epoch": 0.49, "grad_norm": 0.7402857542037964, "learning_rate": 4.884834343857786e-05, "loss": 2.1846, "step": 2710 }, { "epoch": 0.49, "grad_norm": 0.610673189163208, "learning_rate": 4.884412138767414e-05, "loss": 1.9636, "step": 2715 }, { "epoch": 0.49, "grad_norm": 0.7960859537124634, "learning_rate": 4.883989179489775e-05, "loss": 2.0377, "step": 2720 }, { "epoch": 0.49, "grad_norm": 0.5835393071174622, "learning_rate": 4.883565466158653e-05, "loss": 2.1319, "step": 2725 }, { "epoch": 0.49, "grad_norm": 0.7329149842262268, "learning_rate": 4.8831409989080656e-05, "loss": 1.7587, "step": 2730 }, { "epoch": 0.49, "grad_norm": 0.5767766833305359, "learning_rate": 4.882715777872273e-05, "loss": 1.9912, "step": 2735 }, { "epoch": 0.49, "grad_norm": 0.826280951499939, "learning_rate": 4.882289803185772e-05, "loss": 1.6964, "step": 2740 }, { "epoch": 0.49, "grad_norm": 0.6336571574211121, "learning_rate": 4.881863074983298e-05, "loss": 2.1097, "step": 2745 }, { "epoch": 0.49, "grad_norm": 0.4215265214443207, "learning_rate": 4.8814355933998235e-05, "loss": 1.8692, "step": 2750 }, { "epoch": 0.49, "grad_norm": 0.4438379108905792, "learning_rate": 4.881007358570562e-05, "loss": 1.797, "step": 2755 }, { "epoch": 0.49, "grad_norm": 0.6177186369895935, "learning_rate": 4.880578370630963e-05, "loss": 1.7992, "step": 2760 }, { "epoch": 0.49, "grad_norm": 0.6064658761024475, "learning_rate": 4.8801486297167156e-05, "loss": 1.9718, "step": 2765 }, { "epoch": 0.5, "grad_norm": 0.5778130888938904, "learning_rate": 4.879718135963746e-05, "loss": 2.0216, "step": 2770 }, { "epoch": 0.5, "grad_norm": 1.0478150844573975, "learning_rate": 4.879286889508219e-05, "loss": 1.6562, "step": 2775 }, { "epoch": 0.5, "grad_norm": 0.9771472811698914, "learning_rate": 4.8788548904865375e-05, "loss": 1.8386, "step": 2780 }, { "epoch": 0.5, "grad_norm": 0.5626159310340881, "learning_rate": 4.878422139035341e-05, "loss": 2.2397, "step": 2785 }, { "epoch": 0.5, "grad_norm": 0.8984270691871643, "learning_rate": 4.877988635291511e-05, "loss": 1.8268, "step": 2790 }, { "epoch": 0.5, "grad_norm": 0.7879625558853149, "learning_rate": 4.877554379392163e-05, "loss": 1.9463, "step": 2795 }, { "epoch": 0.5, "grad_norm": 0.40900498628616333, "learning_rate": 4.877119371474651e-05, "loss": 1.8477, "step": 2800 }, { "epoch": 0.5, "grad_norm": 0.685408353805542, "learning_rate": 4.876683611676569e-05, "loss": 1.9898, "step": 2805 }, { "epoch": 0.5, "grad_norm": 0.5986139178276062, "learning_rate": 4.876247100135746e-05, "loss": 2.0433, "step": 2810 }, { "epoch": 0.5, "grad_norm": 0.7480238676071167, "learning_rate": 4.875809836990252e-05, "loss": 1.9656, "step": 2815 }, { "epoch": 0.5, "grad_norm": 7.595890522003174, "learning_rate": 4.8753718223783905e-05, "loss": 1.8644, "step": 2820 }, { "epoch": 0.51, "grad_norm": 1.7732789516448975, "learning_rate": 4.874933056438707e-05, "loss": 1.8836, "step": 2825 }, { "epoch": 0.51, "grad_norm": 0.520476222038269, "learning_rate": 4.874493539309982e-05, "loss": 2.0753, "step": 2830 }, { "epoch": 0.51, "grad_norm": 0.2731196880340576, "learning_rate": 4.874053271131235e-05, "loss": 2.1824, "step": 2835 }, { "epoch": 0.51, "grad_norm": 1.522787094116211, "learning_rate": 4.8736122520417215e-05, "loss": 1.776, "step": 2840 }, { "epoch": 0.51, "grad_norm": 0.6246161460876465, "learning_rate": 4.8731704821809354e-05, "loss": 1.8492, "step": 2845 }, { "epoch": 0.51, "grad_norm": 0.5303625464439392, "learning_rate": 4.872727961688609e-05, "loss": 1.7064, "step": 2850 }, { "epoch": 0.51, "grad_norm": 0.939147412776947, "learning_rate": 4.87228469070471e-05, "loss": 1.8514, "step": 2855 }, { "epoch": 0.51, "grad_norm": 0.5877857208251953, "learning_rate": 4.8718406693694465e-05, "loss": 1.7117, "step": 2860 }, { "epoch": 0.51, "grad_norm": 0.4529130458831787, "learning_rate": 4.87139589782326e-05, "loss": 1.8436, "step": 2865 }, { "epoch": 0.51, "grad_norm": 0.8624293208122253, "learning_rate": 4.870950376206831e-05, "loss": 1.8674, "step": 2870 }, { "epoch": 0.51, "grad_norm": 0.6913080811500549, "learning_rate": 4.87050410466108e-05, "loss": 2.4175, "step": 2875 }, { "epoch": 0.52, "grad_norm": 1.0130589008331299, "learning_rate": 4.87005708332716e-05, "loss": 1.8629, "step": 2880 }, { "epoch": 0.52, "grad_norm": 0.40573757886886597, "learning_rate": 4.869609312346465e-05, "loss": 2.1532, "step": 2885 }, { "epoch": 0.52, "grad_norm": 1.0451394319534302, "learning_rate": 4.869160791860623e-05, "loss": 1.9334, "step": 2890 }, { "epoch": 0.52, "grad_norm": 3.0864038467407227, "learning_rate": 4.868711522011502e-05, "loss": 1.7183, "step": 2895 }, { "epoch": 0.52, "grad_norm": 0.2843054234981537, "learning_rate": 4.868261502941204e-05, "loss": 2.0684, "step": 2900 }, { "epoch": 0.52, "grad_norm": 0.6705717444419861, "learning_rate": 4.867810734792072e-05, "loss": 1.8852, "step": 2905 }, { "epoch": 0.52, "grad_norm": 0.8164358735084534, "learning_rate": 4.867359217706681e-05, "loss": 2.0016, "step": 2910 }, { "epoch": 0.52, "grad_norm": 0.6517994403839111, "learning_rate": 4.866906951827847e-05, "loss": 1.936, "step": 2915 }, { "epoch": 0.52, "grad_norm": 0.3796793818473816, "learning_rate": 4.866453937298621e-05, "loss": 1.9839, "step": 2920 }, { "epoch": 0.52, "grad_norm": 0.5019554495811462, "learning_rate": 4.866000174262291e-05, "loss": 1.6889, "step": 2925 }, { "epoch": 0.52, "grad_norm": 1.4469722509384155, "learning_rate": 4.865545662862381e-05, "loss": 1.9799, "step": 2930 }, { "epoch": 0.53, "grad_norm": 1.0865347385406494, "learning_rate": 4.865090403242654e-05, "loss": 1.7499, "step": 2935 }, { "epoch": 0.53, "grad_norm": 0.9820341467857361, "learning_rate": 4.864634395547106e-05, "loss": 1.9783, "step": 2940 }, { "epoch": 0.53, "grad_norm": 0.41876184940338135, "learning_rate": 4.8641776399199743e-05, "loss": 1.991, "step": 2945 }, { "epoch": 0.53, "grad_norm": 1.171912431716919, "learning_rate": 4.863720136505729e-05, "loss": 1.7386, "step": 2950 }, { "epoch": 0.53, "grad_norm": 0.8185043931007385, "learning_rate": 4.863261885449078e-05, "loss": 1.852, "step": 2955 }, { "epoch": 0.53, "grad_norm": 0.5740297436714172, "learning_rate": 4.862802886894966e-05, "loss": 1.4815, "step": 2960 }, { "epoch": 0.53, "grad_norm": 0.5629348754882812, "learning_rate": 4.862343140988573e-05, "loss": 1.9167, "step": 2965 }, { "epoch": 0.53, "grad_norm": 0.5648974180221558, "learning_rate": 4.861882647875318e-05, "loss": 2.1185, "step": 2970 }, { "epoch": 0.53, "grad_norm": 0.46517616510391235, "learning_rate": 4.861421407700853e-05, "loss": 1.8994, "step": 2975 }, { "epoch": 0.53, "grad_norm": 0.4326724410057068, "learning_rate": 4.860959420611067e-05, "loss": 1.9796, "step": 2980 }, { "epoch": 0.53, "grad_norm": 1.0473462343215942, "learning_rate": 4.860496686752088e-05, "loss": 1.5259, "step": 2985 }, { "epoch": 0.54, "grad_norm": 0.6728710532188416, "learning_rate": 4.860033206270277e-05, "loss": 2.0169, "step": 2990 }, { "epoch": 0.54, "grad_norm": 1.2514013051986694, "learning_rate": 4.859568979312233e-05, "loss": 1.8368, "step": 2995 }, { "epoch": 0.54, "grad_norm": 0.6908515095710754, "learning_rate": 4.8591040060247914e-05, "loss": 2.2935, "step": 3000 }, { "epoch": 0.54, "grad_norm": 0.934448778629303, "learning_rate": 4.8586382865550216e-05, "loss": 1.9923, "step": 3005 }, { "epoch": 0.54, "grad_norm": 0.7105411887168884, "learning_rate": 4.85817182105023e-05, "loss": 1.8267, "step": 3010 }, { "epoch": 0.54, "grad_norm": 0.6507242321968079, "learning_rate": 4.8577046096579596e-05, "loss": 1.7661, "step": 3015 }, { "epoch": 0.54, "grad_norm": 0.576221227645874, "learning_rate": 4.857236652525989e-05, "loss": 1.7314, "step": 3020 }, { "epoch": 0.54, "grad_norm": 1.374879002571106, "learning_rate": 4.856767949802333e-05, "loss": 2.1544, "step": 3025 }, { "epoch": 0.54, "grad_norm": 0.6787958145141602, "learning_rate": 4.856298501635241e-05, "loss": 1.5669, "step": 3030 }, { "epoch": 0.54, "grad_norm": 0.8718627095222473, "learning_rate": 4.8558283081732004e-05, "loss": 1.8672, "step": 3035 }, { "epoch": 0.54, "grad_norm": 0.5774152278900146, "learning_rate": 4.855357369564931e-05, "loss": 1.7822, "step": 3040 }, { "epoch": 0.55, "grad_norm": 0.5398827791213989, "learning_rate": 4.854885685959391e-05, "loss": 1.974, "step": 3045 }, { "epoch": 0.55, "grad_norm": 1.1588878631591797, "learning_rate": 4.8544132575057745e-05, "loss": 1.5734, "step": 3050 }, { "epoch": 0.55, "grad_norm": 0.9843418598175049, "learning_rate": 4.8539400843535086e-05, "loss": 2.267, "step": 3055 }, { "epoch": 0.55, "grad_norm": 0.5535194277763367, "learning_rate": 4.8534661666522584e-05, "loss": 1.5894, "step": 3060 }, { "epoch": 0.55, "grad_norm": 1.1449483633041382, "learning_rate": 4.852991504551924e-05, "loss": 1.7232, "step": 3065 }, { "epoch": 0.55, "grad_norm": 0.4027251601219177, "learning_rate": 4.852516098202639e-05, "loss": 1.934, "step": 3070 }, { "epoch": 0.55, "grad_norm": 3.9327101707458496, "learning_rate": 4.852039947754775e-05, "loss": 1.9557, "step": 3075 }, { "epoch": 0.55, "grad_norm": 0.7538480162620544, "learning_rate": 4.851563053358938e-05, "loss": 1.759, "step": 3080 }, { "epoch": 0.55, "grad_norm": 0.788642942905426, "learning_rate": 4.85108541516597e-05, "loss": 1.9517, "step": 3085 }, { "epoch": 0.55, "grad_norm": 0.7686721086502075, "learning_rate": 4.850607033326946e-05, "loss": 1.676, "step": 3090 }, { "epoch": 0.55, "grad_norm": 1.1780756711959839, "learning_rate": 4.850127907993178e-05, "loss": 2.0255, "step": 3095 }, { "epoch": 0.55, "grad_norm": 0.5100767612457275, "learning_rate": 4.8496480393162144e-05, "loss": 1.7199, "step": 3100 }, { "epoch": 0.56, "grad_norm": 1.0231378078460693, "learning_rate": 4.849167427447835e-05, "loss": 2.0379, "step": 3105 }, { "epoch": 0.56, "grad_norm": 0.6608045697212219, "learning_rate": 4.8486860725400584e-05, "loss": 1.9066, "step": 3110 }, { "epoch": 0.56, "grad_norm": 0.8580523133277893, "learning_rate": 4.848203974745136e-05, "loss": 1.842, "step": 3115 }, { "epoch": 0.56, "grad_norm": 1.7334781885147095, "learning_rate": 4.847721134215555e-05, "loss": 2.1667, "step": 3120 }, { "epoch": 0.56, "grad_norm": 0.904914379119873, "learning_rate": 4.847237551104037e-05, "loss": 2.0194, "step": 3125 }, { "epoch": 0.56, "grad_norm": 0.9545987248420715, "learning_rate": 4.846753225563541e-05, "loss": 1.9362, "step": 3130 }, { "epoch": 0.56, "grad_norm": 0.530463457107544, "learning_rate": 4.846268157747256e-05, "loss": 2.1253, "step": 3135 }, { "epoch": 0.56, "grad_norm": 0.36319640278816223, "learning_rate": 4.845782347808609e-05, "loss": 1.9925, "step": 3140 }, { "epoch": 0.56, "grad_norm": 1.2937252521514893, "learning_rate": 4.845295795901262e-05, "loss": 2.2135, "step": 3145 }, { "epoch": 0.56, "grad_norm": 0.6018854379653931, "learning_rate": 4.8448085021791115e-05, "loss": 1.7548, "step": 3150 }, { "epoch": 0.56, "grad_norm": 0.49405333399772644, "learning_rate": 4.844320466796286e-05, "loss": 1.9319, "step": 3155 }, { "epoch": 0.57, "grad_norm": 1.0258450508117676, "learning_rate": 4.8438316899071525e-05, "loss": 1.969, "step": 3160 }, { "epoch": 0.57, "grad_norm": 1.1361362934112549, "learning_rate": 4.84334217166631e-05, "loss": 1.9683, "step": 3165 }, { "epoch": 0.57, "grad_norm": 0.8203340768814087, "learning_rate": 4.842851912228592e-05, "loss": 2.0704, "step": 3170 }, { "epoch": 0.57, "grad_norm": 0.7416129112243652, "learning_rate": 4.8423609117490686e-05, "loss": 1.8224, "step": 3175 }, { "epoch": 0.57, "grad_norm": 1.2014813423156738, "learning_rate": 4.841869170383041e-05, "loss": 1.8246, "step": 3180 }, { "epoch": 0.57, "grad_norm": 0.5674131512641907, "learning_rate": 4.841376688286048e-05, "loss": 2.114, "step": 3185 }, { "epoch": 0.57, "grad_norm": 0.6258872151374817, "learning_rate": 4.8408834656138605e-05, "loss": 1.9057, "step": 3190 }, { "epoch": 0.57, "grad_norm": 0.6485921144485474, "learning_rate": 4.840389502522484e-05, "loss": 1.9694, "step": 3195 }, { "epoch": 0.57, "grad_norm": 0.786283016204834, "learning_rate": 4.83989479916816e-05, "loss": 1.9653, "step": 3200 }, { "epoch": 0.57, "grad_norm": 0.5875077843666077, "learning_rate": 4.8393993557073605e-05, "loss": 1.8774, "step": 3205 }, { "epoch": 0.57, "grad_norm": 0.5160438418388367, "learning_rate": 4.838903172296796e-05, "loss": 1.9262, "step": 3210 }, { "epoch": 0.58, "grad_norm": 0.4956705868244171, "learning_rate": 4.838406249093408e-05, "loss": 2.0985, "step": 3215 }, { "epoch": 0.58, "grad_norm": 0.6459410190582275, "learning_rate": 4.837908586254372e-05, "loss": 1.9046, "step": 3220 }, { "epoch": 0.58, "grad_norm": 0.8042646050453186, "learning_rate": 4.837410183937099e-05, "loss": 2.1288, "step": 3225 }, { "epoch": 0.58, "grad_norm": 0.6269195079803467, "learning_rate": 4.836911042299233e-05, "loss": 2.0259, "step": 3230 }, { "epoch": 0.58, "grad_norm": 0.8424185514450073, "learning_rate": 4.8364111614986527e-05, "loss": 1.6815, "step": 3235 }, { "epoch": 0.58, "grad_norm": 0.9412960410118103, "learning_rate": 4.835910541693469e-05, "loss": 1.8376, "step": 3240 }, { "epoch": 0.58, "grad_norm": 0.6458824276924133, "learning_rate": 4.835409183042028e-05, "loss": 1.803, "step": 3245 }, { "epoch": 0.58, "grad_norm": 0.5122315287590027, "learning_rate": 4.834907085702908e-05, "loss": 2.0321, "step": 3250 }, { "epoch": 0.58, "grad_norm": 0.6900401711463928, "learning_rate": 4.834404249834924e-05, "loss": 2.0708, "step": 3255 }, { "epoch": 0.58, "grad_norm": 0.9199984669685364, "learning_rate": 4.83390067559712e-05, "loss": 1.7106, "step": 3260 }, { "epoch": 0.58, "grad_norm": 0.7172281742095947, "learning_rate": 4.8333963631487776e-05, "loss": 2.0546, "step": 3265 }, { "epoch": 0.59, "grad_norm": 0.753948986530304, "learning_rate": 4.8328913126494104e-05, "loss": 1.8142, "step": 3270 }, { "epoch": 0.59, "grad_norm": 0.8077734708786011, "learning_rate": 4.8323855242587644e-05, "loss": 1.908, "step": 3275 }, { "epoch": 0.59, "grad_norm": 0.6292251944541931, "learning_rate": 4.831878998136821e-05, "loss": 2.0195, "step": 3280 }, { "epoch": 0.59, "grad_norm": 0.6925063133239746, "learning_rate": 4.8313717344437936e-05, "loss": 1.9374, "step": 3285 }, { "epoch": 0.59, "grad_norm": 0.5431442856788635, "learning_rate": 4.8308637333401284e-05, "loss": 1.9119, "step": 3290 }, { "epoch": 0.59, "grad_norm": 0.8268663883209229, "learning_rate": 4.830354994986507e-05, "loss": 1.8918, "step": 3295 }, { "epoch": 0.59, "grad_norm": 0.5958759784698486, "learning_rate": 4.829845519543842e-05, "loss": 1.6231, "step": 3300 }, { "epoch": 0.59, "grad_norm": 0.7670718431472778, "learning_rate": 4.82933530717328e-05, "loss": 1.8069, "step": 3305 }, { "epoch": 0.59, "grad_norm": 0.5632594227790833, "learning_rate": 4.828824358036201e-05, "loss": 1.8654, "step": 3310 }, { "epoch": 0.59, "grad_norm": 1.0903352499008179, "learning_rate": 4.8283126722942176e-05, "loss": 2.0945, "step": 3315 }, { "epoch": 0.59, "grad_norm": 0.4936974346637726, "learning_rate": 4.8278002501091755e-05, "loss": 2.1163, "step": 3320 }, { "epoch": 0.6, "grad_norm": 0.2784171402454376, "learning_rate": 4.827287091643154e-05, "loss": 1.9931, "step": 3325 }, { "epoch": 0.6, "grad_norm": 0.8564805388450623, "learning_rate": 4.8267731970584626e-05, "loss": 2.1167, "step": 3330 }, { "epoch": 0.6, "grad_norm": 2.2154970169067383, "learning_rate": 4.8262585665176485e-05, "loss": 1.7391, "step": 3335 }, { "epoch": 0.6, "grad_norm": 0.9126550555229187, "learning_rate": 4.825743200183486e-05, "loss": 2.0901, "step": 3340 }, { "epoch": 0.6, "grad_norm": 0.4065916836261749, "learning_rate": 4.825227098218987e-05, "loss": 1.991, "step": 3345 }, { "epoch": 0.6, "grad_norm": 0.732071578502655, "learning_rate": 4.824710260787394e-05, "loss": 2.0214, "step": 3350 }, { "epoch": 0.6, "grad_norm": 0.5489442348480225, "learning_rate": 4.824192688052181e-05, "loss": 1.8027, "step": 3355 }, { "epoch": 0.6, "grad_norm": 0.5446940660476685, "learning_rate": 4.8236743801770565e-05, "loss": 1.7853, "step": 3360 }, { "epoch": 0.6, "grad_norm": 0.5271531343460083, "learning_rate": 4.823155337325961e-05, "loss": 1.8151, "step": 3365 }, { "epoch": 0.6, "grad_norm": 0.5761736631393433, "learning_rate": 4.8226355596630655e-05, "loss": 1.6954, "step": 3370 }, { "epoch": 0.6, "grad_norm": 0.9606974124908447, "learning_rate": 4.8221150473527774e-05, "loss": 1.7548, "step": 3375 }, { "epoch": 0.61, "grad_norm": 0.4853847026824951, "learning_rate": 4.821593800559734e-05, "loss": 1.9427, "step": 3380 }, { "epoch": 0.61, "grad_norm": 0.9224936366081238, "learning_rate": 4.8210718194488046e-05, "loss": 1.707, "step": 3385 }, { "epoch": 0.61, "grad_norm": 0.6053968071937561, "learning_rate": 4.82054910418509e-05, "loss": 2.1525, "step": 3390 }, { "epoch": 0.61, "grad_norm": 0.5924646258354187, "learning_rate": 4.8200256549339274e-05, "loss": 1.6875, "step": 3395 }, { "epoch": 0.61, "grad_norm": 8.748364448547363, "learning_rate": 4.8195014718608814e-05, "loss": 1.9114, "step": 3400 }, { "epoch": 0.61, "grad_norm": 0.8054745197296143, "learning_rate": 4.81897655513175e-05, "loss": 1.935, "step": 3405 }, { "epoch": 0.61, "grad_norm": 0.8999778032302856, "learning_rate": 4.8184509049125645e-05, "loss": 1.9618, "step": 3410 }, { "epoch": 0.61, "grad_norm": 0.8038251996040344, "learning_rate": 4.8179245213695887e-05, "loss": 1.5852, "step": 3415 }, { "epoch": 0.61, "grad_norm": 1.1532466411590576, "learning_rate": 4.817397404669317e-05, "loss": 1.7721, "step": 3420 }, { "epoch": 0.61, "grad_norm": 1.0493075847625732, "learning_rate": 4.816869554978475e-05, "loss": 1.8461, "step": 3425 }, { "epoch": 0.61, "grad_norm": 0.7406529784202576, "learning_rate": 4.81634097246402e-05, "loss": 1.9881, "step": 3430 }, { "epoch": 0.61, "grad_norm": 0.6479585766792297, "learning_rate": 4.8158116572931445e-05, "loss": 1.7853, "step": 3435 }, { "epoch": 0.62, "grad_norm": 0.9244452118873596, "learning_rate": 4.815281609633269e-05, "loss": 1.7463, "step": 3440 }, { "epoch": 0.62, "grad_norm": 0.5907859802246094, "learning_rate": 4.814750829652048e-05, "loss": 1.9086, "step": 3445 }, { "epoch": 0.62, "grad_norm": 0.5566309094429016, "learning_rate": 4.814219317517365e-05, "loss": 2.0991, "step": 3450 }, { "epoch": 0.62, "grad_norm": 0.7224152088165283, "learning_rate": 4.813687073397339e-05, "loss": 1.9487, "step": 3455 }, { "epoch": 0.62, "grad_norm": 0.6035553216934204, "learning_rate": 4.8131540974603164e-05, "loss": 2.0636, "step": 3460 }, { "epoch": 0.62, "grad_norm": 0.7073687314987183, "learning_rate": 4.8126203898748776e-05, "loss": 1.9388, "step": 3465 }, { "epoch": 0.62, "grad_norm": 1.752922534942627, "learning_rate": 4.812085950809835e-05, "loss": 1.5911, "step": 3470 }, { "epoch": 0.62, "grad_norm": 0.6370113492012024, "learning_rate": 4.811550780434229e-05, "loss": 1.7502, "step": 3475 }, { "epoch": 0.62, "grad_norm": 0.7335951924324036, "learning_rate": 4.811014878917335e-05, "loss": 1.8573, "step": 3480 }, { "epoch": 0.62, "grad_norm": 0.6506394743919373, "learning_rate": 4.810478246428658e-05, "loss": 1.7608, "step": 3485 }, { "epoch": 0.62, "grad_norm": 0.5935482382774353, "learning_rate": 4.8099408831379335e-05, "loss": 1.9053, "step": 3490 }, { "epoch": 0.63, "grad_norm": 1.0178114175796509, "learning_rate": 4.80940278921513e-05, "loss": 1.9641, "step": 3495 }, { "epoch": 0.63, "grad_norm": 0.5880244970321655, "learning_rate": 4.8088639648304455e-05, "loss": 1.6671, "step": 3500 }, { "epoch": 0.63, "grad_norm": 0.6311240792274475, "learning_rate": 4.80832441015431e-05, "loss": 1.6607, "step": 3505 }, { "epoch": 0.63, "grad_norm": 0.4380526542663574, "learning_rate": 4.807784125357383e-05, "loss": 2.2915, "step": 3510 }, { "epoch": 0.63, "grad_norm": 0.7547699213027954, "learning_rate": 4.8072431106105577e-05, "loss": 1.7945, "step": 3515 }, { "epoch": 0.63, "grad_norm": 1.0797021389007568, "learning_rate": 4.806701366084956e-05, "loss": 1.9746, "step": 3520 }, { "epoch": 0.63, "grad_norm": 1.3622939586639404, "learning_rate": 4.8061588919519304e-05, "loss": 1.7607, "step": 3525 }, { "epoch": 0.63, "grad_norm": 0.4650992751121521, "learning_rate": 4.805615688383066e-05, "loss": 1.9469, "step": 3530 }, { "epoch": 0.63, "grad_norm": 0.421215683221817, "learning_rate": 4.805071755550177e-05, "loss": 1.7305, "step": 3535 }, { "epoch": 0.63, "grad_norm": 3.6225435733795166, "learning_rate": 4.8045270936253086e-05, "loss": 1.6456, "step": 3540 }, { "epoch": 0.63, "grad_norm": 0.6064786314964294, "learning_rate": 4.8039817027807374e-05, "loss": 1.8573, "step": 3545 }, { "epoch": 0.64, "grad_norm": 0.45190367102622986, "learning_rate": 4.803435583188969e-05, "loss": 1.9488, "step": 3550 }, { "epoch": 0.64, "grad_norm": 0.44525858759880066, "learning_rate": 4.8028887350227424e-05, "loss": 2.1564, "step": 3555 }, { "epoch": 0.64, "grad_norm": 0.6965821385383606, "learning_rate": 4.802341158455024e-05, "loss": 1.7877, "step": 3560 }, { "epoch": 0.64, "grad_norm": 0.4658873975276947, "learning_rate": 4.80179285365901e-05, "loss": 1.9438, "step": 3565 }, { "epoch": 0.64, "grad_norm": 0.8804203867912292, "learning_rate": 4.801243820808132e-05, "loss": 1.8994, "step": 3570 }, { "epoch": 0.64, "grad_norm": 0.5602685809135437, "learning_rate": 4.8006940600760475e-05, "loss": 1.8585, "step": 3575 }, { "epoch": 0.64, "grad_norm": 0.7118222713470459, "learning_rate": 4.800143571636644e-05, "loss": 1.9065, "step": 3580 }, { "epoch": 0.64, "grad_norm": 0.5043877959251404, "learning_rate": 4.799592355664041e-05, "loss": 1.8838, "step": 3585 }, { "epoch": 0.64, "grad_norm": 0.5556797981262207, "learning_rate": 4.799040412332588e-05, "loss": 1.8743, "step": 3590 }, { "epoch": 0.64, "grad_norm": 0.5163377523422241, "learning_rate": 4.798487741816864e-05, "loss": 1.888, "step": 3595 }, { "epoch": 0.64, "grad_norm": 0.7609609961509705, "learning_rate": 4.797934344291678e-05, "loss": 2.0205, "step": 3600 }, { "epoch": 0.65, "grad_norm": 0.6983132362365723, "learning_rate": 4.797380219932069e-05, "loss": 1.9689, "step": 3605 }, { "epoch": 0.65, "grad_norm": 1.0645403861999512, "learning_rate": 4.7968253689133074e-05, "loss": 1.8991, "step": 3610 }, { "epoch": 0.65, "grad_norm": 0.5911131501197815, "learning_rate": 4.79626979141089e-05, "loss": 2.0719, "step": 3615 }, { "epoch": 0.65, "grad_norm": 0.25667804479599, "learning_rate": 4.795713487600547e-05, "loss": 2.1776, "step": 3620 }, { "epoch": 0.65, "grad_norm": 0.4448702037334442, "learning_rate": 4.795156457658235e-05, "loss": 2.1112, "step": 3625 }, { "epoch": 0.65, "grad_norm": 0.5844559669494629, "learning_rate": 4.7945987017601435e-05, "loss": 2.0799, "step": 3630 }, { "epoch": 0.65, "grad_norm": 0.5194575786590576, "learning_rate": 4.79404022008269e-05, "loss": 1.9454, "step": 3635 }, { "epoch": 0.65, "grad_norm": 0.4168377220630646, "learning_rate": 4.7934810128025216e-05, "loss": 1.5136, "step": 3640 }, { "epoch": 0.65, "grad_norm": 0.6346775889396667, "learning_rate": 4.792921080096514e-05, "loss": 2.225, "step": 3645 }, { "epoch": 0.65, "grad_norm": 1.2728739976882935, "learning_rate": 4.7923604221417754e-05, "loss": 1.9655, "step": 3650 }, { "epoch": 0.65, "grad_norm": 0.6890284419059753, "learning_rate": 4.79179903911564e-05, "loss": 1.9297, "step": 3655 }, { "epoch": 0.66, "grad_norm": 3.341615676879883, "learning_rate": 4.7912369311956726e-05, "loss": 2.0387, "step": 3660 }, { "epoch": 0.66, "grad_norm": 0.8217077851295471, "learning_rate": 4.790674098559668e-05, "loss": 1.8114, "step": 3665 }, { "epoch": 0.66, "grad_norm": 0.9442257881164551, "learning_rate": 4.790110541385649e-05, "loss": 2.011, "step": 3670 }, { "epoch": 0.66, "grad_norm": 1.0342832803726196, "learning_rate": 4.789546259851869e-05, "loss": 1.9695, "step": 3675 }, { "epoch": 0.66, "grad_norm": 0.5166803598403931, "learning_rate": 4.788981254136809e-05, "loss": 1.6962, "step": 3680 }, { "epoch": 0.66, "grad_norm": 0.49156954884529114, "learning_rate": 4.78841552441918e-05, "loss": 1.9125, "step": 3685 }, { "epoch": 0.66, "grad_norm": 1.0156267881393433, "learning_rate": 4.7878490708779225e-05, "loss": 2.0294, "step": 3690 }, { "epoch": 0.66, "grad_norm": 0.5214945673942566, "learning_rate": 4.787281893692204e-05, "loss": 2.1405, "step": 3695 }, { "epoch": 0.66, "grad_norm": 0.5293641090393066, "learning_rate": 4.7867139930414214e-05, "loss": 1.8728, "step": 3700 }, { "epoch": 0.66, "grad_norm": 0.5196399688720703, "learning_rate": 4.786145369105204e-05, "loss": 1.8972, "step": 3705 }, { "epoch": 0.66, "grad_norm": 0.6026878952980042, "learning_rate": 4.785576022063405e-05, "loss": 1.9923, "step": 3710 }, { "epoch": 0.66, "grad_norm": 0.5810438394546509, "learning_rate": 4.785005952096109e-05, "loss": 1.9926, "step": 3715 }, { "epoch": 0.67, "grad_norm": 0.6136882305145264, "learning_rate": 4.784435159383627e-05, "loss": 1.9561, "step": 3720 }, { "epoch": 0.67, "grad_norm": 0.7523034811019897, "learning_rate": 4.783863644106502e-05, "loss": 1.9247, "step": 3725 }, { "epoch": 0.67, "grad_norm": 0.477094441652298, "learning_rate": 4.783291406445504e-05, "loss": 2.2932, "step": 3730 }, { "epoch": 0.67, "grad_norm": 0.7570971250534058, "learning_rate": 4.78271844658163e-05, "loss": 2.1548, "step": 3735 }, { "epoch": 0.67, "grad_norm": 0.6923122406005859, "learning_rate": 4.7821447646961066e-05, "loss": 1.7722, "step": 3740 }, { "epoch": 0.67, "grad_norm": 0.4171823561191559, "learning_rate": 4.7815703609703896e-05, "loss": 2.1501, "step": 3745 }, { "epoch": 0.67, "grad_norm": 0.8493704199790955, "learning_rate": 4.7809952355861614e-05, "loss": 2.1935, "step": 3750 }, { "epoch": 0.67, "grad_norm": 1.7510491609573364, "learning_rate": 4.7804193887253344e-05, "loss": 1.7398, "step": 3755 }, { "epoch": 0.67, "grad_norm": 0.6981149315834045, "learning_rate": 4.779842820570049e-05, "loss": 1.857, "step": 3760 }, { "epoch": 0.67, "grad_norm": 0.6277537941932678, "learning_rate": 4.7792655313026715e-05, "loss": 1.9971, "step": 3765 }, { "epoch": 0.67, "grad_norm": 0.963358461856842, "learning_rate": 4.7786875211058e-05, "loss": 1.7726, "step": 3770 }, { "epoch": 0.68, "grad_norm": 0.5438962578773499, "learning_rate": 4.778108790162256e-05, "loss": 1.6827, "step": 3775 }, { "epoch": 0.68, "grad_norm": 0.7283356189727783, "learning_rate": 4.777529338655094e-05, "loss": 1.915, "step": 3780 }, { "epoch": 0.68, "grad_norm": 1.1045600175857544, "learning_rate": 4.776949166767592e-05, "loss": 2.3648, "step": 3785 }, { "epoch": 0.68, "grad_norm": 0.6527469754219055, "learning_rate": 4.77636827468326e-05, "loss": 1.8445, "step": 3790 }, { "epoch": 0.68, "grad_norm": 0.3629080057144165, "learning_rate": 4.775786662585831e-05, "loss": 1.949, "step": 3795 }, { "epoch": 0.68, "grad_norm": 0.49108001589775085, "learning_rate": 4.77520433065927e-05, "loss": 1.8602, "step": 3800 }, { "epoch": 0.68, "grad_norm": 0.3771425783634186, "learning_rate": 4.774621279087768e-05, "loss": 1.8913, "step": 3805 }, { "epoch": 0.68, "grad_norm": 0.9145960807800293, "learning_rate": 4.7740375080557434e-05, "loss": 1.9314, "step": 3810 }, { "epoch": 0.68, "grad_norm": 1.6728583574295044, "learning_rate": 4.773453017747842e-05, "loss": 1.6274, "step": 3815 }, { "epoch": 0.68, "grad_norm": 0.5668622255325317, "learning_rate": 4.772867808348938e-05, "loss": 2.0638, "step": 3820 }, { "epoch": 0.68, "grad_norm": 0.6772801876068115, "learning_rate": 4.7722818800441316e-05, "loss": 1.9404, "step": 3825 }, { "epoch": 0.69, "grad_norm": 10.99435043334961, "learning_rate": 4.771695233018754e-05, "loss": 1.8604, "step": 3830 }, { "epoch": 0.69, "grad_norm": 0.8190693259239197, "learning_rate": 4.7711078674583576e-05, "loss": 1.8342, "step": 3835 }, { "epoch": 0.69, "grad_norm": 3.4974558353424072, "learning_rate": 4.770519783548728e-05, "loss": 2.1466, "step": 3840 }, { "epoch": 0.69, "grad_norm": 0.8280129432678223, "learning_rate": 4.7699309814758746e-05, "loss": 1.845, "step": 3845 }, { "epoch": 0.69, "grad_norm": 0.6876835823059082, "learning_rate": 4.7693414614260356e-05, "loss": 2.0733, "step": 3850 }, { "epoch": 0.69, "grad_norm": 1.2137017250061035, "learning_rate": 4.7687512235856744e-05, "loss": 2.1651, "step": 3855 }, { "epoch": 0.69, "grad_norm": 0.7566338181495667, "learning_rate": 4.768160268141484e-05, "loss": 1.515, "step": 3860 }, { "epoch": 0.69, "grad_norm": 0.49978703260421753, "learning_rate": 4.767568595280383e-05, "loss": 1.8929, "step": 3865 }, { "epoch": 0.69, "grad_norm": 0.3817705512046814, "learning_rate": 4.766976205189516e-05, "loss": 1.8465, "step": 3870 }, { "epoch": 0.69, "grad_norm": 0.4288356900215149, "learning_rate": 4.766383098056255e-05, "loss": 1.8791, "step": 3875 }, { "epoch": 0.69, "grad_norm": 0.5867508053779602, "learning_rate": 4.7657892740682006e-05, "loss": 1.9564, "step": 3880 }, { "epoch": 0.7, "grad_norm": 1.2050591707229614, "learning_rate": 4.765194733413178e-05, "loss": 1.9315, "step": 3885 }, { "epoch": 0.7, "grad_norm": 0.9066573977470398, "learning_rate": 4.764599476279241e-05, "loss": 1.9297, "step": 3890 }, { "epoch": 0.7, "grad_norm": 1.0444132089614868, "learning_rate": 4.764003502854668e-05, "loss": 1.7961, "step": 3895 }, { "epoch": 0.7, "grad_norm": 0.708714485168457, "learning_rate": 4.763406813327964e-05, "loss": 1.9499, "step": 3900 }, { "epoch": 0.7, "grad_norm": 0.7246438264846802, "learning_rate": 4.762809407887862e-05, "loss": 1.8408, "step": 3905 }, { "epoch": 0.7, "grad_norm": 0.7492526769638062, "learning_rate": 4.762211286723321e-05, "loss": 2.181, "step": 3910 }, { "epoch": 0.7, "grad_norm": 0.6265992522239685, "learning_rate": 4.761612450023526e-05, "loss": 2.1916, "step": 3915 }, { "epoch": 0.7, "grad_norm": 1.4639782905578613, "learning_rate": 4.7610128979778883e-05, "loss": 2.1112, "step": 3920 }, { "epoch": 0.7, "grad_norm": 0.7836752533912659, "learning_rate": 4.760412630776046e-05, "loss": 1.8764, "step": 3925 }, { "epoch": 0.7, "grad_norm": 0.8531836867332458, "learning_rate": 4.759811648607862e-05, "loss": 2.0257, "step": 3930 }, { "epoch": 0.7, "grad_norm": 0.540393054485321, "learning_rate": 4.7592099516634294e-05, "loss": 1.6083, "step": 3935 }, { "epoch": 0.71, "grad_norm": 0.5300696492195129, "learning_rate": 4.75860754013306e-05, "loss": 1.8257, "step": 3940 }, { "epoch": 0.71, "grad_norm": 0.6871602535247803, "learning_rate": 4.758004414207299e-05, "loss": 2.0006, "step": 3945 }, { "epoch": 0.71, "grad_norm": 0.5659440159797668, "learning_rate": 4.7574005740769136e-05, "loss": 1.6773, "step": 3950 }, { "epoch": 0.71, "grad_norm": 0.466205358505249, "learning_rate": 4.7567960199328984e-05, "loss": 2.129, "step": 3955 }, { "epoch": 0.71, "grad_norm": 1.5387719869613647, "learning_rate": 4.7561907519664724e-05, "loss": 1.8874, "step": 3960 }, { "epoch": 0.71, "grad_norm": 0.7125961184501648, "learning_rate": 4.755584770369081e-05, "loss": 1.919, "step": 3965 }, { "epoch": 0.71, "grad_norm": 0.9576629400253296, "learning_rate": 4.754978075332398e-05, "loss": 1.9866, "step": 3970 }, { "epoch": 0.71, "grad_norm": 0.6893014311790466, "learning_rate": 4.754370667048317e-05, "loss": 1.6943, "step": 3975 }, { "epoch": 0.71, "grad_norm": 1.0120683908462524, "learning_rate": 4.753762545708964e-05, "loss": 1.7324, "step": 3980 }, { "epoch": 0.71, "grad_norm": 1.0890415906906128, "learning_rate": 4.753153711506685e-05, "loss": 1.8863, "step": 3985 }, { "epoch": 0.71, "grad_norm": 0.5287376642227173, "learning_rate": 4.752544164634054e-05, "loss": 2.111, "step": 3990 }, { "epoch": 0.72, "grad_norm": 1.014571189880371, "learning_rate": 4.7519339052838706e-05, "loss": 1.9179, "step": 3995 }, { "epoch": 0.72, "grad_norm": 0.6377423405647278, "learning_rate": 4.7513229336491594e-05, "loss": 1.736, "step": 4000 }, { "epoch": 0.72, "grad_norm": 0.7745433449745178, "learning_rate": 4.7507112499231696e-05, "loss": 2.0101, "step": 4005 }, { "epoch": 0.72, "grad_norm": 0.7528572678565979, "learning_rate": 4.750098854299376e-05, "loss": 1.9943, "step": 4010 }, { "epoch": 0.72, "grad_norm": 1.8742436170578003, "learning_rate": 4.7494857469714796e-05, "loss": 1.9835, "step": 4015 }, { "epoch": 0.72, "grad_norm": 1.895379662513733, "learning_rate": 4.748871928133405e-05, "loss": 1.7243, "step": 4020 }, { "epoch": 0.72, "grad_norm": 1.6951816082000732, "learning_rate": 4.748257397979302e-05, "loss": 1.7516, "step": 4025 }, { "epoch": 0.72, "grad_norm": 0.9127728939056396, "learning_rate": 4.7476421567035475e-05, "loss": 2.2468, "step": 4030 }, { "epoch": 0.72, "grad_norm": 0.9135558605194092, "learning_rate": 4.747026204500741e-05, "loss": 1.8177, "step": 4035 }, { "epoch": 0.72, "grad_norm": 0.3471672236919403, "learning_rate": 4.746409541565706e-05, "loss": 1.8712, "step": 4040 }, { "epoch": 0.72, "grad_norm": 0.621139407157898, "learning_rate": 4.7457921680934946e-05, "loss": 1.9565, "step": 4045 }, { "epoch": 0.72, "grad_norm": 1.2294261455535889, "learning_rate": 4.745174084279381e-05, "loss": 2.0262, "step": 4050 }, { "epoch": 0.73, "grad_norm": 0.7085707783699036, "learning_rate": 4.744555290318864e-05, "loss": 1.8709, "step": 4055 }, { "epoch": 0.73, "grad_norm": 0.5060424208641052, "learning_rate": 4.7439357864076674e-05, "loss": 1.8826, "step": 4060 }, { "epoch": 0.73, "grad_norm": 0.5154075622558594, "learning_rate": 4.74331557274174e-05, "loss": 2.1846, "step": 4065 }, { "epoch": 0.73, "grad_norm": 0.9464437961578369, "learning_rate": 4.7426946495172545e-05, "loss": 2.0822, "step": 4070 }, { "epoch": 0.73, "grad_norm": 1.2713757753372192, "learning_rate": 4.7420730169306084e-05, "loss": 1.9661, "step": 4075 }, { "epoch": 0.73, "grad_norm": 7.036674976348877, "learning_rate": 4.7414506751784236e-05, "loss": 1.8481, "step": 4080 }, { "epoch": 0.73, "grad_norm": 0.6142359972000122, "learning_rate": 4.740827624457547e-05, "loss": 1.8596, "step": 4085 }, { "epoch": 0.73, "grad_norm": 0.9795764088630676, "learning_rate": 4.740203864965048e-05, "loss": 2.054, "step": 4090 }, { "epoch": 0.73, "grad_norm": 0.9507243633270264, "learning_rate": 4.739579396898222e-05, "loss": 1.9746, "step": 4095 }, { "epoch": 0.73, "grad_norm": 0.5483179092407227, "learning_rate": 4.738954220454586e-05, "loss": 2.0328, "step": 4100 }, { "epoch": 0.73, "grad_norm": 0.9164968729019165, "learning_rate": 4.738328335831883e-05, "loss": 2.0429, "step": 4105 }, { "epoch": 0.74, "grad_norm": 0.672351062297821, "learning_rate": 4.7377017432280815e-05, "loss": 1.8414, "step": 4110 }, { "epoch": 0.74, "grad_norm": 0.5230313539505005, "learning_rate": 4.7370744428413704e-05, "loss": 2.2537, "step": 4115 }, { "epoch": 0.74, "grad_norm": 0.480780690908432, "learning_rate": 4.736446434870166e-05, "loss": 1.8738, "step": 4120 }, { "epoch": 0.74, "grad_norm": 0.4368326663970947, "learning_rate": 4.7358177195131035e-05, "loss": 1.8399, "step": 4125 }, { "epoch": 0.74, "grad_norm": 1.0797245502471924, "learning_rate": 4.735188296969048e-05, "loss": 1.8166, "step": 4130 }, { "epoch": 0.74, "grad_norm": 0.6688716411590576, "learning_rate": 4.734558167437084e-05, "loss": 1.8741, "step": 4135 }, { "epoch": 0.74, "grad_norm": 0.3694940507411957, "learning_rate": 4.733927331116521e-05, "loss": 2.2319, "step": 4140 }, { "epoch": 0.74, "grad_norm": 0.4561655521392822, "learning_rate": 4.7332957882068925e-05, "loss": 1.6936, "step": 4145 }, { "epoch": 0.74, "grad_norm": 0.8536581993103027, "learning_rate": 4.732663538907954e-05, "loss": 1.6583, "step": 4150 }, { "epoch": 0.74, "grad_norm": 0.9004095196723938, "learning_rate": 4.7320305834196856e-05, "loss": 1.9843, "step": 4155 }, { "epoch": 0.74, "grad_norm": 1.3630378246307373, "learning_rate": 4.7313969219422906e-05, "loss": 1.6481, "step": 4160 }, { "epoch": 0.75, "grad_norm": 0.9355126023292542, "learning_rate": 4.7307625546761945e-05, "loss": 1.9516, "step": 4165 }, { "epoch": 0.75, "grad_norm": 1.281752347946167, "learning_rate": 4.73012748182205e-05, "loss": 1.5777, "step": 4170 }, { "epoch": 0.75, "grad_norm": 0.5159614086151123, "learning_rate": 4.729491703580727e-05, "loss": 2.0937, "step": 4175 }, { "epoch": 0.75, "grad_norm": 0.5500282049179077, "learning_rate": 4.728855220153322e-05, "loss": 1.8505, "step": 4180 }, { "epoch": 0.75, "grad_norm": 0.47913625836372375, "learning_rate": 4.728218031741156e-05, "loss": 2.1684, "step": 4185 }, { "epoch": 0.75, "grad_norm": 1.0718451738357544, "learning_rate": 4.72758013854577e-05, "loss": 1.9026, "step": 4190 }, { "epoch": 0.75, "grad_norm": 1.538801908493042, "learning_rate": 4.7269415407689276e-05, "loss": 1.8334, "step": 4195 }, { "epoch": 0.75, "grad_norm": 1.0258709192276, "learning_rate": 4.726302238612619e-05, "loss": 1.8611, "step": 4200 }, { "epoch": 0.75, "grad_norm": 0.7120585441589355, "learning_rate": 4.725662232279053e-05, "loss": 1.8925, "step": 4205 }, { "epoch": 0.75, "grad_norm": 0.7516504526138306, "learning_rate": 4.7250215219706636e-05, "loss": 2.3299, "step": 4210 }, { "epoch": 0.75, "grad_norm": 2.125640630722046, "learning_rate": 4.7243801078901084e-05, "loss": 2.3689, "step": 4215 }, { "epoch": 0.76, "grad_norm": 0.7650467157363892, "learning_rate": 4.7237379902402636e-05, "loss": 1.7783, "step": 4220 }, { "epoch": 0.76, "grad_norm": 0.7223068475723267, "learning_rate": 4.723095169224231e-05, "loss": 2.2361, "step": 4225 }, { "epoch": 0.76, "grad_norm": 0.856893002986908, "learning_rate": 4.722451645045336e-05, "loss": 1.9778, "step": 4230 }, { "epoch": 0.76, "grad_norm": 0.5695852041244507, "learning_rate": 4.7218074179071225e-05, "loss": 1.8546, "step": 4235 }, { "epoch": 0.76, "grad_norm": 0.7213203310966492, "learning_rate": 4.72116248801336e-05, "loss": 1.9171, "step": 4240 }, { "epoch": 0.76, "grad_norm": 1.4641231298446655, "learning_rate": 4.720516855568039e-05, "loss": 2.0134, "step": 4245 }, { "epoch": 0.76, "grad_norm": 0.9336233139038086, "learning_rate": 4.719870520775372e-05, "loss": 1.7665, "step": 4250 }, { "epoch": 0.76, "grad_norm": 0.6485480070114136, "learning_rate": 4.719223483839795e-05, "loss": 1.6734, "step": 4255 }, { "epoch": 0.76, "grad_norm": 0.33693981170654297, "learning_rate": 4.718575744965965e-05, "loss": 1.7652, "step": 4260 }, { "epoch": 0.76, "grad_norm": 0.7903716564178467, "learning_rate": 4.717927304358761e-05, "loss": 2.0551, "step": 4265 }, { "epoch": 0.76, "grad_norm": 0.736282467842102, "learning_rate": 4.7172781622232834e-05, "loss": 1.9317, "step": 4270 }, { "epoch": 0.77, "grad_norm": 0.5668120384216309, "learning_rate": 4.716628318764856e-05, "loss": 1.8764, "step": 4275 }, { "epoch": 0.77, "grad_norm": 0.4904122054576874, "learning_rate": 4.715977774189023e-05, "loss": 1.8657, "step": 4280 }, { "epoch": 0.77, "grad_norm": 0.4836113154888153, "learning_rate": 4.715326528701553e-05, "loss": 2.1163, "step": 4285 }, { "epoch": 0.77, "grad_norm": 1.0537573099136353, "learning_rate": 4.7146745825084315e-05, "loss": 1.8157, "step": 4290 }, { "epoch": 0.77, "grad_norm": 0.7334871888160706, "learning_rate": 4.71402193581587e-05, "loss": 1.9083, "step": 4295 }, { "epoch": 0.77, "grad_norm": 0.6025909185409546, "learning_rate": 4.7133685888303e-05, "loss": 1.6309, "step": 4300 }, { "epoch": 0.77, "grad_norm": 0.5878241658210754, "learning_rate": 4.712714541758374e-05, "loss": 1.7923, "step": 4305 }, { "epoch": 0.77, "grad_norm": 0.9317494630813599, "learning_rate": 4.7120597948069676e-05, "loss": 1.6716, "step": 4310 }, { "epoch": 0.77, "grad_norm": 1.2589433193206787, "learning_rate": 4.711404348183175e-05, "loss": 1.9239, "step": 4315 }, { "epoch": 0.77, "grad_norm": 1.0758674144744873, "learning_rate": 4.710748202094315e-05, "loss": 1.8766, "step": 4320 }, { "epoch": 0.77, "grad_norm": 0.7382737994194031, "learning_rate": 4.710091356747924e-05, "loss": 1.8116, "step": 4325 }, { "epoch": 0.78, "grad_norm": 0.790633499622345, "learning_rate": 4.709433812351764e-05, "loss": 2.1118, "step": 4330 }, { "epoch": 0.78, "grad_norm": Infinity, "learning_rate": 4.708907273658756e-05, "loss": 1.9079, "step": 4335 }, { "epoch": 0.78, "grad_norm": 0.3426882028579712, "learning_rate": 4.708248471497269e-05, "loss": 1.958, "step": 4340 }, { "epoch": 0.78, "grad_norm": 0.6428537368774414, "learning_rate": 4.707588970868914e-05, "loss": 1.9855, "step": 4345 }, { "epoch": 0.78, "grad_norm": 0.4597220718860626, "learning_rate": 4.7069287719822915e-05, "loss": 1.875, "step": 4350 }, { "epoch": 0.78, "grad_norm": 0.8507035970687866, "learning_rate": 4.7062678750462205e-05, "loss": 1.7406, "step": 4355 }, { "epoch": 0.78, "grad_norm": 0.8498551845550537, "learning_rate": 4.7056062802697435e-05, "loss": 1.8201, "step": 4360 }, { "epoch": 0.78, "grad_norm": 1.037539005279541, "learning_rate": 4.704943987862121e-05, "loss": 2.0407, "step": 4365 }, { "epoch": 0.78, "grad_norm": 0.6955700516700745, "learning_rate": 4.704280998032836e-05, "loss": 1.8599, "step": 4370 }, { "epoch": 0.78, "grad_norm": 0.9754367470741272, "learning_rate": 4.703617310991592e-05, "loss": 2.0508, "step": 4375 }, { "epoch": 0.78, "grad_norm": 0.6729751825332642, "learning_rate": 4.702952926948312e-05, "loss": 1.8073, "step": 4380 }, { "epoch": 0.78, "grad_norm": 0.47259873151779175, "learning_rate": 4.7022878461131404e-05, "loss": 1.7583, "step": 4385 }, { "epoch": 0.79, "grad_norm": 2.3793861865997314, "learning_rate": 4.701622068696441e-05, "loss": 1.8839, "step": 4390 }, { "epoch": 0.79, "grad_norm": 1.0509848594665527, "learning_rate": 4.700955594908799e-05, "loss": 1.8347, "step": 4395 }, { "epoch": 0.79, "grad_norm": 0.6066625118255615, "learning_rate": 4.700288424961019e-05, "loss": 1.9021, "step": 4400 }, { "epoch": 0.79, "grad_norm": 0.6687719821929932, "learning_rate": 4.6996205590641274e-05, "loss": 1.9797, "step": 4405 }, { "epoch": 0.79, "grad_norm": 0.5127503871917725, "learning_rate": 4.6989519974293684e-05, "loss": 1.7509, "step": 4410 }, { "epoch": 0.79, "grad_norm": 0.6416090130805969, "learning_rate": 4.698282740268208e-05, "loss": 1.8623, "step": 4415 }, { "epoch": 0.79, "grad_norm": 0.7178260087966919, "learning_rate": 4.697612787792331e-05, "loss": 1.8798, "step": 4420 }, { "epoch": 0.79, "grad_norm": 0.5021282434463501, "learning_rate": 4.696942140213643e-05, "loss": 2.0592, "step": 4425 }, { "epoch": 0.79, "grad_norm": 0.5925451517105103, "learning_rate": 4.696270797744269e-05, "loss": 1.859, "step": 4430 }, { "epoch": 0.79, "grad_norm": 0.7658485770225525, "learning_rate": 4.6955987605965555e-05, "loss": 1.9042, "step": 4435 }, { "epoch": 0.79, "grad_norm": 0.7695988416671753, "learning_rate": 4.6949260289830644e-05, "loss": 1.482, "step": 4440 }, { "epoch": 0.8, "grad_norm": 0.6327041387557983, "learning_rate": 4.694252603116582e-05, "loss": 2.0297, "step": 4445 }, { "epoch": 0.8, "grad_norm": 0.7171325087547302, "learning_rate": 4.693578483210113e-05, "loss": 1.9515, "step": 4450 }, { "epoch": 0.8, "grad_norm": 0.562306821346283, "learning_rate": 4.692903669476878e-05, "loss": 1.9341, "step": 4455 }, { "epoch": 0.8, "grad_norm": 0.8432119488716125, "learning_rate": 4.692228162130322e-05, "loss": 2.1466, "step": 4460 }, { "epoch": 0.8, "grad_norm": 0.496180921792984, "learning_rate": 4.691551961384108e-05, "loss": 2.0088, "step": 4465 }, { "epoch": 0.8, "grad_norm": 1.1345199346542358, "learning_rate": 4.690875067452116e-05, "loss": 2.2793, "step": 4470 }, { "epoch": 0.8, "grad_norm": 0.6553413271903992, "learning_rate": 4.690197480548447e-05, "loss": 2.2163, "step": 4475 }, { "epoch": 0.8, "grad_norm": 0.8875722289085388, "learning_rate": 4.6895192008874225e-05, "loss": 1.7843, "step": 4480 }, { "epoch": 0.8, "grad_norm": 0.8258360624313354, "learning_rate": 4.6888402286835804e-05, "loss": 1.8453, "step": 4485 }, { "epoch": 0.8, "grad_norm": 0.9482008814811707, "learning_rate": 4.688160564151679e-05, "loss": 1.6878, "step": 4490 }, { "epoch": 0.8, "grad_norm": 0.45167991518974304, "learning_rate": 4.687480207506697e-05, "loss": 2.0793, "step": 4495 }, { "epoch": 0.81, "grad_norm": 0.2857486605644226, "learning_rate": 4.6867991589638284e-05, "loss": 2.0167, "step": 4500 }, { "epoch": 0.81, "grad_norm": 0.6648562550544739, "learning_rate": 4.686117418738489e-05, "loss": 1.7389, "step": 4505 }, { "epoch": 0.81, "grad_norm": 0.9790322780609131, "learning_rate": 4.685434987046314e-05, "loss": 2.0685, "step": 4510 }, { "epoch": 0.81, "grad_norm": 0.4371451437473297, "learning_rate": 4.6847518641031544e-05, "loss": 1.9618, "step": 4515 }, { "epoch": 0.81, "grad_norm": 0.7518529891967773, "learning_rate": 4.6840680501250814e-05, "loss": 1.7586, "step": 4520 }, { "epoch": 0.81, "grad_norm": 0.6136543154716492, "learning_rate": 4.683383545328385e-05, "loss": 1.9332, "step": 4525 }, { "epoch": 0.81, "grad_norm": 0.8958885073661804, "learning_rate": 4.682698349929573e-05, "loss": 1.6817, "step": 4530 }, { "epoch": 0.81, "grad_norm": 0.6221014857292175, "learning_rate": 4.682012464145373e-05, "loss": 2.2389, "step": 4535 }, { "epoch": 0.81, "grad_norm": 1.043770432472229, "learning_rate": 4.681325888192729e-05, "loss": 1.7041, "step": 4540 }, { "epoch": 0.81, "grad_norm": 0.9577236175537109, "learning_rate": 4.680638622288804e-05, "loss": 1.9666, "step": 4545 }, { "epoch": 0.81, "grad_norm": 0.5839371681213379, "learning_rate": 4.679950666650982e-05, "loss": 1.9035, "step": 4550 }, { "epoch": 0.82, "grad_norm": 1.314014196395874, "learning_rate": 4.67926202149686e-05, "loss": 1.9786, "step": 4555 }, { "epoch": 0.82, "grad_norm": 0.44664058089256287, "learning_rate": 4.678572687044257e-05, "loss": 1.9205, "step": 4560 }, { "epoch": 0.82, "grad_norm": 0.5979194641113281, "learning_rate": 4.6778826635112085e-05, "loss": 1.8723, "step": 4565 }, { "epoch": 0.82, "grad_norm": 0.8216999173164368, "learning_rate": 4.677191951115968e-05, "loss": 1.9851, "step": 4570 }, { "epoch": 0.82, "grad_norm": 0.9313204288482666, "learning_rate": 4.676638885365804e-05, "loss": 1.9628, "step": 4575 }, { "epoch": 0.82, "grad_norm": 0.9604901075363159, "learning_rate": 4.675946933569314e-05, "loss": 1.6638, "step": 4580 }, { "epoch": 0.82, "grad_norm": 0.9496554732322693, "learning_rate": 4.675254293522902e-05, "loss": 1.7364, "step": 4585 }, { "epoch": 0.82, "grad_norm": 0.6214568018913269, "learning_rate": 4.674560965445649e-05, "loss": 1.7255, "step": 4590 }, { "epoch": 0.82, "grad_norm": 0.6437748670578003, "learning_rate": 4.673866949556854e-05, "loss": 2.1914, "step": 4595 }, { "epoch": 0.82, "grad_norm": 1.4943290948867798, "learning_rate": 4.6731722460760355e-05, "loss": 1.9668, "step": 4600 }, { "epoch": 0.82, "grad_norm": 0.41138797998428345, "learning_rate": 4.672476855222924e-05, "loss": 1.8255, "step": 4605 }, { "epoch": 0.83, "grad_norm": 0.6955471634864807, "learning_rate": 4.671780777217474e-05, "loss": 1.9333, "step": 4610 }, { "epoch": 0.83, "grad_norm": 1.1246094703674316, "learning_rate": 4.671084012279853e-05, "loss": 2.0143, "step": 4615 }, { "epoch": 0.83, "grad_norm": 0.7304584980010986, "learning_rate": 4.6703865606304465e-05, "loss": 1.9142, "step": 4620 }, { "epoch": 0.83, "grad_norm": 1.188251256942749, "learning_rate": 4.6696884224898584e-05, "loss": 1.8847, "step": 4625 }, { "epoch": 0.83, "grad_norm": 0.9002482295036316, "learning_rate": 4.66898959807891e-05, "loss": 2.1051, "step": 4630 }, { "epoch": 0.83, "grad_norm": 0.9018658995628357, "learning_rate": 4.668290087618638e-05, "loss": 1.8925, "step": 4635 }, { "epoch": 0.83, "grad_norm": 0.6331837177276611, "learning_rate": 4.667589891330297e-05, "loss": 2.0296, "step": 4640 }, { "epoch": 0.83, "grad_norm": 0.5853480696678162, "learning_rate": 4.666889009435358e-05, "loss": 1.9348, "step": 4645 }, { "epoch": 0.83, "grad_norm": 1.261925458908081, "learning_rate": 4.6661874421555094e-05, "loss": 2.1714, "step": 4650 }, { "epoch": 0.83, "grad_norm": 0.5459082126617432, "learning_rate": 4.665485189712657e-05, "loss": 1.7488, "step": 4655 }, { "epoch": 0.83, "grad_norm": 0.5560827851295471, "learning_rate": 4.664782252328922e-05, "loss": 1.8829, "step": 4660 }, { "epoch": 0.84, "grad_norm": 12.994915008544922, "learning_rate": 4.664078630226643e-05, "loss": 2.2486, "step": 4665 }, { "epoch": 0.84, "grad_norm": 0.5184732675552368, "learning_rate": 4.663374323628374e-05, "loss": 1.5356, "step": 4670 }, { "epoch": 0.84, "grad_norm": 0.5672849416732788, "learning_rate": 4.662669332756887e-05, "loss": 1.6331, "step": 4675 }, { "epoch": 0.84, "grad_norm": 0.5396742224693298, "learning_rate": 4.661963657835172e-05, "loss": 2.1822, "step": 4680 }, { "epoch": 0.84, "grad_norm": 0.557628333568573, "learning_rate": 4.6612572990864303e-05, "loss": 1.9965, "step": 4685 }, { "epoch": 0.84, "grad_norm": 0.8263502717018127, "learning_rate": 4.6605502567340844e-05, "loss": 2.1538, "step": 4690 }, { "epoch": 0.84, "grad_norm": 2.053760051727295, "learning_rate": 4.6598425310017704e-05, "loss": 2.0905, "step": 4695 }, { "epoch": 0.84, "grad_norm": 1.3697140216827393, "learning_rate": 4.659134122113341e-05, "loss": 1.7339, "step": 4700 }, { "epoch": 0.84, "grad_norm": 0.7235055565834045, "learning_rate": 4.658425030292866e-05, "loss": 1.8672, "step": 4705 }, { "epoch": 0.84, "grad_norm": 0.7260095477104187, "learning_rate": 4.657715255764629e-05, "loss": 1.7353, "step": 4710 }, { "epoch": 0.84, "grad_norm": 0.5732203125953674, "learning_rate": 4.657004798753133e-05, "loss": 2.0318, "step": 4715 }, { "epoch": 0.84, "grad_norm": 13.018949508666992, "learning_rate": 4.656293659483093e-05, "loss": 2.1095, "step": 4720 }, { "epoch": 0.85, "grad_norm": 0.6857671737670898, "learning_rate": 4.655581838179444e-05, "loss": 1.8601, "step": 4725 }, { "epoch": 0.85, "grad_norm": 0.5136845707893372, "learning_rate": 4.6548693350673325e-05, "loss": 1.9605, "step": 4730 }, { "epoch": 0.85, "grad_norm": 0.44918909668922424, "learning_rate": 4.654156150372123e-05, "loss": 1.9067, "step": 4735 }, { "epoch": 0.85, "grad_norm": 0.4144088327884674, "learning_rate": 4.653442284319395e-05, "loss": 2.0102, "step": 4740 }, { "epoch": 0.85, "grad_norm": 1.849292516708374, "learning_rate": 4.652727737134944e-05, "loss": 2.1593, "step": 4745 }, { "epoch": 0.85, "grad_norm": 0.5621273517608643, "learning_rate": 4.65201250904478e-05, "loss": 2.0483, "step": 4750 }, { "epoch": 0.85, "grad_norm": 1.3969671726226807, "learning_rate": 4.65129660027513e-05, "loss": 1.9204, "step": 4755 }, { "epoch": 0.85, "grad_norm": 0.598132312297821, "learning_rate": 4.6505800110524347e-05, "loss": 2.0326, "step": 4760 }, { "epoch": 0.85, "grad_norm": 0.7579675316810608, "learning_rate": 4.649862741603349e-05, "loss": 1.7728, "step": 4765 }, { "epoch": 0.85, "grad_norm": 0.7204775810241699, "learning_rate": 4.649144792154747e-05, "loss": 1.9556, "step": 4770 }, { "epoch": 0.85, "grad_norm": 1.0572607517242432, "learning_rate": 4.648426162933716e-05, "loss": 2.1159, "step": 4775 }, { "epoch": 0.86, "grad_norm": 0.5266518592834473, "learning_rate": 4.647706854167554e-05, "loss": 1.9842, "step": 4780 }, { "epoch": 0.86, "grad_norm": 0.8113346695899963, "learning_rate": 4.6469868660837805e-05, "loss": 1.7724, "step": 4785 }, { "epoch": 0.86, "grad_norm": 0.8336856961250305, "learning_rate": 4.646266198910126e-05, "loss": 1.8038, "step": 4790 }, { "epoch": 0.86, "grad_norm": 0.7910550832748413, "learning_rate": 4.645544852874538e-05, "loss": 1.8405, "step": 4795 }, { "epoch": 0.86, "grad_norm": 0.7810003161430359, "learning_rate": 4.644822828205176e-05, "loss": 1.825, "step": 4800 }, { "epoch": 0.86, "grad_norm": 0.8987298607826233, "learning_rate": 4.644100125130418e-05, "loss": 1.9643, "step": 4805 }, { "epoch": 0.86, "grad_norm": 0.5562641620635986, "learning_rate": 4.643376743878852e-05, "loss": 1.7666, "step": 4810 }, { "epoch": 0.86, "grad_norm": 0.8323341012001038, "learning_rate": 4.642652684679283e-05, "loss": 1.6686, "step": 4815 }, { "epoch": 0.86, "grad_norm": 1.0062700510025024, "learning_rate": 4.6419279477607314e-05, "loss": 1.9944, "step": 4820 }, { "epoch": 0.86, "grad_norm": 0.6746718883514404, "learning_rate": 4.64120253335243e-05, "loss": 1.91, "step": 4825 }, { "epoch": 0.86, "grad_norm": 0.5174027681350708, "learning_rate": 4.6404764416838264e-05, "loss": 1.8239, "step": 4830 }, { "epoch": 0.87, "grad_norm": 0.3364204466342926, "learning_rate": 4.6397496729845844e-05, "loss": 1.7488, "step": 4835 }, { "epoch": 0.87, "grad_norm": 1.2048060894012451, "learning_rate": 4.639022227484578e-05, "loss": 1.7477, "step": 4840 }, { "epoch": 0.87, "grad_norm": 0.4208517372608185, "learning_rate": 4.638294105413898e-05, "loss": 1.9285, "step": 4845 }, { "epoch": 0.87, "grad_norm": 1.080003261566162, "learning_rate": 4.6375653070028505e-05, "loss": 1.8835, "step": 4850 }, { "epoch": 0.87, "grad_norm": 1.1424204111099243, "learning_rate": 4.6368358324819524e-05, "loss": 1.6948, "step": 4855 }, { "epoch": 0.87, "grad_norm": 0.866698682308197, "learning_rate": 4.636105682081935e-05, "loss": 1.8874, "step": 4860 }, { "epoch": 0.87, "grad_norm": 0.8074659705162048, "learning_rate": 4.6353748560337456e-05, "loss": 1.7727, "step": 4865 }, { "epoch": 0.87, "grad_norm": 2.238422393798828, "learning_rate": 4.634643354568543e-05, "loss": 1.9305, "step": 4870 }, { "epoch": 0.87, "grad_norm": 0.5514354109764099, "learning_rate": 4.633911177917701e-05, "loss": 1.9675, "step": 4875 }, { "epoch": 0.87, "grad_norm": 0.41248565912246704, "learning_rate": 4.633178326312806e-05, "loss": 1.7863, "step": 4880 }, { "epoch": 0.87, "grad_norm": 0.5374939441680908, "learning_rate": 4.632444799985657e-05, "loss": 1.8905, "step": 4885 }, { "epoch": 0.88, "grad_norm": 0.9222282767295837, "learning_rate": 4.63171059916827e-05, "loss": 1.9385, "step": 4890 }, { "epoch": 0.88, "grad_norm": 3.0097501277923584, "learning_rate": 4.630975724092871e-05, "loss": 1.772, "step": 4895 }, { "epoch": 0.88, "grad_norm": 1.815949559211731, "learning_rate": 4.6302401749919e-05, "loss": 1.9147, "step": 4900 }, { "epoch": 0.88, "grad_norm": 0.4607148766517639, "learning_rate": 4.629503952098011e-05, "loss": 1.8251, "step": 4905 }, { "epoch": 0.88, "grad_norm": 0.6612110137939453, "learning_rate": 4.62876705564407e-05, "loss": 2.0799, "step": 4910 }, { "epoch": 0.88, "grad_norm": 0.9041725993156433, "learning_rate": 4.628029485863157e-05, "loss": 1.8727, "step": 4915 }, { "epoch": 0.88, "grad_norm": 0.5865108370780945, "learning_rate": 4.627291242988564e-05, "loss": 2.0539, "step": 4920 }, { "epoch": 0.88, "grad_norm": 0.646685779094696, "learning_rate": 4.6265523272537976e-05, "loss": 2.1545, "step": 4925 }, { "epoch": 0.88, "grad_norm": 0.8874340057373047, "learning_rate": 4.625812738892575e-05, "loss": 2.0838, "step": 4930 }, { "epoch": 0.88, "grad_norm": 0.48262983560562134, "learning_rate": 4.625072478138828e-05, "loss": 1.7378, "step": 4935 }, { "epoch": 0.88, "grad_norm": 0.4273892641067505, "learning_rate": 4.6243315452267014e-05, "loss": 1.6208, "step": 4940 }, { "epoch": 0.89, "grad_norm": 0.860099732875824, "learning_rate": 4.623589940390549e-05, "loss": 1.5971, "step": 4945 }, { "epoch": 0.89, "grad_norm": 0.6735519170761108, "learning_rate": 4.622847663864941e-05, "loss": 1.8806, "step": 4950 }, { "epoch": 0.89, "grad_norm": 0.47200706601142883, "learning_rate": 4.6221047158846594e-05, "loss": 1.8463, "step": 4955 }, { "epoch": 0.89, "grad_norm": 1.0925625562667847, "learning_rate": 4.621361096684698e-05, "loss": 1.9257, "step": 4960 }, { "epoch": 0.89, "grad_norm": 0.9255615472793579, "learning_rate": 4.6206168065002613e-05, "loss": 2.2559, "step": 4965 }, { "epoch": 0.89, "grad_norm": 0.8095393776893616, "learning_rate": 4.619871845566769e-05, "loss": 1.8567, "step": 4970 }, { "epoch": 0.89, "grad_norm": 0.8361244201660156, "learning_rate": 4.619126214119851e-05, "loss": 1.4803, "step": 4975 }, { "epoch": 0.89, "grad_norm": 0.6798988580703735, "learning_rate": 4.618379912395349e-05, "loss": 2.0151, "step": 4980 }, { "epoch": 0.89, "grad_norm": 0.3742920756340027, "learning_rate": 4.617632940629319e-05, "loss": 2.137, "step": 4985 }, { "epoch": 0.89, "grad_norm": 0.6546251177787781, "learning_rate": 4.616885299058027e-05, "loss": 2.2428, "step": 4990 }, { "epoch": 0.89, "grad_norm": 0.6916753649711609, "learning_rate": 4.6161369879179504e-05, "loss": 2.0984, "step": 4995 }, { "epoch": 0.89, "grad_norm": 1.0272783041000366, "learning_rate": 4.61538800744578e-05, "loss": 1.6963, "step": 5000 }, { "epoch": 0.9, "grad_norm": 0.4335922598838806, "learning_rate": 4.614638357878418e-05, "loss": 2.0127, "step": 5005 }, { "epoch": 0.9, "grad_norm": 1.4880688190460205, "learning_rate": 4.613888039452978e-05, "loss": 1.786, "step": 5010 }, { "epoch": 0.9, "grad_norm": 0.689954400062561, "learning_rate": 4.613137052406784e-05, "loss": 1.6233, "step": 5015 }, { "epoch": 0.9, "grad_norm": 1.206842303276062, "learning_rate": 4.6123853969773726e-05, "loss": 1.8923, "step": 5020 }, { "epoch": 0.9, "grad_norm": 0.6934424042701721, "learning_rate": 4.611633073402492e-05, "loss": 1.8961, "step": 5025 }, { "epoch": 0.9, "grad_norm": 0.48485127091407776, "learning_rate": 4.610880081920101e-05, "loss": 1.6494, "step": 5030 }, { "epoch": 0.9, "grad_norm": 0.7396721243858337, "learning_rate": 4.610126422768372e-05, "loss": 1.9127, "step": 5035 }, { "epoch": 0.9, "grad_norm": 2.117673873901367, "learning_rate": 4.609372096185683e-05, "loss": 2.0358, "step": 5040 }, { "epoch": 0.9, "grad_norm": 0.5403966307640076, "learning_rate": 4.608617102410631e-05, "loss": 1.7811, "step": 5045 }, { "epoch": 0.9, "grad_norm": 0.5867857933044434, "learning_rate": 4.6078614416820164e-05, "loss": 1.9641, "step": 5050 }, { "epoch": 0.9, "grad_norm": 0.7298117876052856, "learning_rate": 4.6071051142388555e-05, "loss": 1.7933, "step": 5055 }, { "epoch": 0.91, "grad_norm": 1.0257443189620972, "learning_rate": 4.606348120320374e-05, "loss": 1.8427, "step": 5060 }, { "epoch": 0.91, "grad_norm": 0.8619400262832642, "learning_rate": 4.6055904601660084e-05, "loss": 1.9523, "step": 5065 }, { "epoch": 0.91, "grad_norm": 0.7043263912200928, "learning_rate": 4.6048321340154054e-05, "loss": 1.9647, "step": 5070 }, { "epoch": 0.91, "grad_norm": 0.8900740742683411, "learning_rate": 4.604073142108423e-05, "loss": 1.8617, "step": 5075 }, { "epoch": 0.91, "grad_norm": 0.5621470808982849, "learning_rate": 4.6033134846851294e-05, "loss": 2.0817, "step": 5080 }, { "epoch": 0.91, "grad_norm": 0.5144213438034058, "learning_rate": 4.602553161985804e-05, "loss": 1.9585, "step": 5085 }, { "epoch": 0.91, "grad_norm": 0.5859657526016235, "learning_rate": 4.601792174250935e-05, "loss": 1.9843, "step": 5090 }, { "epoch": 0.91, "grad_norm": 0.48220011591911316, "learning_rate": 4.6010305217212244e-05, "loss": 2.3189, "step": 5095 }, { "epoch": 0.91, "grad_norm": 0.6994110941886902, "learning_rate": 4.60026820463758e-05, "loss": 1.699, "step": 5100 }, { "epoch": 0.91, "grad_norm": 4.361588478088379, "learning_rate": 4.5995052232411216e-05, "loss": 1.8276, "step": 5105 }, { "epoch": 0.91, "grad_norm": 0.627723753452301, "learning_rate": 4.598741577773182e-05, "loss": 2.045, "step": 5110 }, { "epoch": 0.92, "grad_norm": 1.4922670125961304, "learning_rate": 4.5979772684752995e-05, "loss": 1.7747, "step": 5115 }, { "epoch": 0.92, "grad_norm": 0.6467026472091675, "learning_rate": 4.597212295589225e-05, "loss": 1.7939, "step": 5120 }, { "epoch": 0.92, "grad_norm": 0.6912665367126465, "learning_rate": 4.596446659356919e-05, "loss": 1.8892, "step": 5125 }, { "epoch": 0.92, "grad_norm": 1.146193265914917, "learning_rate": 4.595680360020551e-05, "loss": 1.7835, "step": 5130 }, { "epoch": 0.92, "grad_norm": 0.549444317817688, "learning_rate": 4.594913397822501e-05, "loss": 1.8231, "step": 5135 }, { "epoch": 0.92, "grad_norm": 0.6164882779121399, "learning_rate": 4.594145773005358e-05, "loss": 1.8279, "step": 5140 }, { "epoch": 0.92, "grad_norm": 0.3416946232318878, "learning_rate": 4.593377485811922e-05, "loss": 2.243, "step": 5145 }, { "epoch": 0.92, "grad_norm": 0.5608302354812622, "learning_rate": 4.5926085364852e-05, "loss": 1.7795, "step": 5150 }, { "epoch": 0.92, "grad_norm": 0.5376662015914917, "learning_rate": 4.591838925268411e-05, "loss": 1.7712, "step": 5155 }, { "epoch": 0.92, "grad_norm": 0.6536192893981934, "learning_rate": 4.591068652404982e-05, "loss": 2.2747, "step": 5160 }, { "epoch": 0.92, "grad_norm": 0.40830197930336, "learning_rate": 4.59029771813855e-05, "loss": 1.9315, "step": 5165 }, { "epoch": 0.93, "grad_norm": 0.8668363094329834, "learning_rate": 4.58952612271296e-05, "loss": 2.0715, "step": 5170 }, { "epoch": 0.93, "grad_norm": 0.6481930613517761, "learning_rate": 4.588753866372267e-05, "loss": 1.8249, "step": 5175 }, { "epoch": 0.93, "grad_norm": 0.3583790063858032, "learning_rate": 4.5879809493607364e-05, "loss": 1.8025, "step": 5180 }, { "epoch": 0.93, "grad_norm": 0.5957528352737427, "learning_rate": 4.587207371922838e-05, "loss": 2.1159, "step": 5185 }, { "epoch": 0.93, "grad_norm": 0.6677088141441345, "learning_rate": 4.586433134303257e-05, "loss": 1.9156, "step": 5190 }, { "epoch": 0.93, "grad_norm": 0.8160932660102844, "learning_rate": 4.585658236746881e-05, "loss": 2.0241, "step": 5195 }, { "epoch": 0.93, "grad_norm": 0.6950777769088745, "learning_rate": 4.584882679498813e-05, "loss": 1.9214, "step": 5200 }, { "epoch": 0.93, "grad_norm": 0.726952314376831, "learning_rate": 4.5841064628043575e-05, "loss": 1.7244, "step": 5205 }, { "epoch": 0.93, "grad_norm": 0.7783024907112122, "learning_rate": 4.583329586909033e-05, "loss": 1.8253, "step": 5210 }, { "epoch": 0.93, "grad_norm": 0.8075493574142456, "learning_rate": 4.5825520520585635e-05, "loss": 1.7076, "step": 5215 }, { "epoch": 0.93, "grad_norm": 1.0622972249984741, "learning_rate": 4.5817738584988835e-05, "loss": 1.9551, "step": 5220 }, { "epoch": 0.94, "grad_norm": 0.6024557948112488, "learning_rate": 4.580995006476134e-05, "loss": 1.8075, "step": 5225 }, { "epoch": 0.94, "grad_norm": 0.5445288419723511, "learning_rate": 4.580215496236666e-05, "loss": 1.7501, "step": 5230 }, { "epoch": 0.94, "grad_norm": 1.8697842359542847, "learning_rate": 4.5794353280270376e-05, "loss": 1.7579, "step": 5235 }, { "epoch": 0.94, "grad_norm": 0.6215696334838867, "learning_rate": 4.5786545020940155e-05, "loss": 1.6428, "step": 5240 }, { "epoch": 0.94, "grad_norm": 0.6145104169845581, "learning_rate": 4.577873018684573e-05, "loss": 1.7593, "step": 5245 }, { "epoch": 0.94, "grad_norm": 0.7944318056106567, "learning_rate": 4.577090878045893e-05, "loss": 2.1836, "step": 5250 }, { "epoch": 0.94, "grad_norm": 0.7711953520774841, "learning_rate": 4.576308080425367e-05, "loss": 1.9731, "step": 5255 }, { "epoch": 0.94, "grad_norm": 0.5569404363632202, "learning_rate": 4.575524626070592e-05, "loss": 2.1212, "step": 5260 }, { "epoch": 0.94, "grad_norm": 0.6812270283699036, "learning_rate": 4.574740515229374e-05, "loss": 2.0945, "step": 5265 }, { "epoch": 0.94, "grad_norm": 0.8699807524681091, "learning_rate": 4.5739557481497275e-05, "loss": 2.0372, "step": 5270 }, { "epoch": 0.94, "grad_norm": 4.8972649574279785, "learning_rate": 4.573170325079872e-05, "loss": 2.3673, "step": 5275 }, { "epoch": 0.95, "grad_norm": 0.6763632297515869, "learning_rate": 4.5723842462682375e-05, "loss": 1.907, "step": 5280 }, { "epoch": 0.95, "grad_norm": 0.44302767515182495, "learning_rate": 4.571597511963459e-05, "loss": 1.939, "step": 5285 }, { "epoch": 0.95, "grad_norm": 0.5313250422477722, "learning_rate": 4.5708101224143796e-05, "loss": 1.8028, "step": 5290 }, { "epoch": 0.95, "grad_norm": 0.4340111315250397, "learning_rate": 4.5700220778700504e-05, "loss": 1.7074, "step": 5295 }, { "epoch": 0.95, "grad_norm": 0.5250926613807678, "learning_rate": 4.569233378579729e-05, "loss": 1.956, "step": 5300 }, { "epoch": 0.95, "grad_norm": 0.8783259391784668, "learning_rate": 4.568444024792881e-05, "loss": 2.1352, "step": 5305 }, { "epoch": 0.95, "grad_norm": 0.6077532172203064, "learning_rate": 4.5676540167591776e-05, "loss": 1.6879, "step": 5310 }, { "epoch": 0.95, "grad_norm": 0.854468584060669, "learning_rate": 4.5668633547284964e-05, "loss": 2.0933, "step": 5315 }, { "epoch": 0.95, "grad_norm": 4.812654972076416, "learning_rate": 4.566072038950925e-05, "loss": 1.6729, "step": 5320 }, { "epoch": 0.95, "grad_norm": 0.6669471859931946, "learning_rate": 4.565280069676755e-05, "loss": 2.0241, "step": 5325 }, { "epoch": 0.95, "grad_norm": 0.4599258303642273, "learning_rate": 4.5644874471564854e-05, "loss": 1.802, "step": 5330 }, { "epoch": 0.95, "grad_norm": 1.0719258785247803, "learning_rate": 4.5636941716408224e-05, "loss": 2.0428, "step": 5335 }, { "epoch": 0.96, "grad_norm": 0.8084414601325989, "learning_rate": 4.5629002433806764e-05, "loss": 1.8502, "step": 5340 }, { "epoch": 0.96, "grad_norm": 0.715983510017395, "learning_rate": 4.5621056626271694e-05, "loss": 1.9962, "step": 5345 }, { "epoch": 0.96, "grad_norm": 0.682518720626831, "learning_rate": 4.5613104296316245e-05, "loss": 1.5369, "step": 5350 }, { "epoch": 0.96, "grad_norm": 1.020588994026184, "learning_rate": 4.560514544645573e-05, "loss": 1.8893, "step": 5355 }, { "epoch": 0.96, "grad_norm": 1.0426610708236694, "learning_rate": 4.559718007920753e-05, "loss": 1.7818, "step": 5360 }, { "epoch": 0.96, "grad_norm": 0.8958699703216553, "learning_rate": 4.5589208197091084e-05, "loss": 1.9423, "step": 5365 }, { "epoch": 0.96, "grad_norm": 1.6348686218261719, "learning_rate": 4.558122980262789e-05, "loss": 1.9759, "step": 5370 }, { "epoch": 0.96, "grad_norm": 1.073533058166504, "learning_rate": 4.557324489834151e-05, "loss": 1.9607, "step": 5375 }, { "epoch": 0.96, "grad_norm": 0.9768711924552917, "learning_rate": 4.556525348675755e-05, "loss": 1.8728, "step": 5380 }, { "epoch": 0.96, "grad_norm": 0.6651968359947205, "learning_rate": 4.55572555704037e-05, "loss": 1.8073, "step": 5385 }, { "epoch": 0.96, "grad_norm": 0.7141188979148865, "learning_rate": 4.554925115180968e-05, "loss": 1.7795, "step": 5390 }, { "epoch": 0.97, "grad_norm": 1.1313745975494385, "learning_rate": 4.5541240233507285e-05, "loss": 1.8481, "step": 5395 }, { "epoch": 0.97, "grad_norm": 0.4851437211036682, "learning_rate": 4.5533222818030376e-05, "loss": 1.9045, "step": 5400 }, { "epoch": 0.97, "grad_norm": 0.3471333384513855, "learning_rate": 4.5525198907914826e-05, "loss": 2.006, "step": 5405 }, { "epoch": 0.97, "grad_norm": 0.5303329229354858, "learning_rate": 4.5517168505698616e-05, "loss": 2.0183, "step": 5410 }, { "epoch": 0.97, "grad_norm": 0.40987589955329895, "learning_rate": 4.550913161392173e-05, "loss": 2.2088, "step": 5415 }, { "epoch": 0.97, "grad_norm": 0.5597338676452637, "learning_rate": 4.5501088235126254e-05, "loss": 1.9469, "step": 5420 }, { "epoch": 0.97, "grad_norm": 0.5752057433128357, "learning_rate": 4.549303837185628e-05, "loss": 1.9666, "step": 5425 }, { "epoch": 0.97, "grad_norm": 0.7234611511230469, "learning_rate": 4.548498202665798e-05, "loss": 1.6851, "step": 5430 }, { "epoch": 0.97, "grad_norm": 0.3815643787384033, "learning_rate": 4.5476919202079574e-05, "loss": 2.0537, "step": 5435 }, { "epoch": 0.97, "grad_norm": 0.5907308459281921, "learning_rate": 4.5468849900671324e-05, "loss": 1.8174, "step": 5440 }, { "epoch": 0.97, "grad_norm": 0.5893926024436951, "learning_rate": 4.546077412498553e-05, "loss": 1.6955, "step": 5445 }, { "epoch": 0.98, "grad_norm": 0.5866884589195251, "learning_rate": 4.545269187757657e-05, "loss": 1.7888, "step": 5450 }, { "epoch": 0.98, "grad_norm": 0.43314459919929504, "learning_rate": 4.5444603161000834e-05, "loss": 1.848, "step": 5455 }, { "epoch": 0.98, "grad_norm": 0.9520348906517029, "learning_rate": 4.543650797781679e-05, "loss": 2.004, "step": 5460 }, { "epoch": 0.98, "grad_norm": 5.312831401824951, "learning_rate": 4.542840633058493e-05, "loss": 2.1996, "step": 5465 }, { "epoch": 0.98, "grad_norm": 0.944259524345398, "learning_rate": 4.542029822186779e-05, "loss": 1.7677, "step": 5470 }, { "epoch": 0.98, "grad_norm": 0.5071392059326172, "learning_rate": 4.5412183654229965e-05, "loss": 1.67, "step": 5475 }, { "epoch": 0.98, "grad_norm": 0.7843353152275085, "learning_rate": 4.5404062630238094e-05, "loss": 2.2598, "step": 5480 }, { "epoch": 0.98, "grad_norm": 0.4275512397289276, "learning_rate": 4.5395935152460834e-05, "loss": 1.7225, "step": 5485 }, { "epoch": 0.98, "grad_norm": 0.9836548566818237, "learning_rate": 4.53878012234689e-05, "loss": 1.9897, "step": 5490 }, { "epoch": 0.98, "grad_norm": 2.572413206100464, "learning_rate": 4.5379660845835045e-05, "loss": 1.7828, "step": 5495 }, { "epoch": 0.98, "grad_norm": 0.5451618432998657, "learning_rate": 4.5371514022134085e-05, "loss": 2.0577, "step": 5500 }, { "epoch": 0.99, "grad_norm": 0.5373215675354004, "learning_rate": 4.536336075494282e-05, "loss": 1.6858, "step": 5505 }, { "epoch": 0.99, "grad_norm": 0.5195179581642151, "learning_rate": 4.535520104684014e-05, "loss": 1.8985, "step": 5510 }, { "epoch": 0.99, "grad_norm": 0.5343700647354126, "learning_rate": 4.534703490040695e-05, "loss": 2.0989, "step": 5515 }, { "epoch": 0.99, "grad_norm": 0.7476509213447571, "learning_rate": 4.533886231822619e-05, "loss": 1.7542, "step": 5520 }, { "epoch": 0.99, "grad_norm": 0.7178905010223389, "learning_rate": 4.533068330288284e-05, "loss": 2.0403, "step": 5525 }, { "epoch": 0.99, "grad_norm": 0.4951605796813965, "learning_rate": 4.532249785696392e-05, "loss": 1.9205, "step": 5530 }, { "epoch": 0.99, "grad_norm": 0.46740642189979553, "learning_rate": 4.531430598305848e-05, "loss": 2.0617, "step": 5535 }, { "epoch": 0.99, "grad_norm": 1.046222448348999, "learning_rate": 4.5306107683757584e-05, "loss": 1.6893, "step": 5540 }, { "epoch": 0.99, "grad_norm": 0.4250253736972809, "learning_rate": 4.529790296165437e-05, "loss": 2.0393, "step": 5545 }, { "epoch": 0.99, "grad_norm": 1.2679554224014282, "learning_rate": 4.528969181934397e-05, "loss": 1.7908, "step": 5550 }, { "epoch": 0.99, "grad_norm": 0.8419818878173828, "learning_rate": 4.5281474259423565e-05, "loss": 1.8416, "step": 5555 }, { "epoch": 1.0, "grad_norm": 0.6362389922142029, "learning_rate": 4.527325028449236e-05, "loss": 2.0143, "step": 5560 }, { "epoch": 1.0, "grad_norm": 0.8798420429229736, "learning_rate": 4.5265019897151595e-05, "loss": 1.7087, "step": 5565 }, { "epoch": 1.0, "grad_norm": 0.9879406690597534, "learning_rate": 4.525678310000452e-05, "loss": 2.0423, "step": 5570 }, { "epoch": 1.0, "grad_norm": 1.0143115520477295, "learning_rate": 4.524853989565644e-05, "loss": 1.9527, "step": 5575 }, { "epoch": 1.0, "grad_norm": 0.6878682971000671, "learning_rate": 4.524029028671467e-05, "loss": 1.9395, "step": 5580 }, { "epoch": 1.0, "grad_norm": 0.6117766499519348, "learning_rate": 4.523203427578855e-05, "loss": 1.7447, "step": 5585 }, { "epoch": 1.0, "grad_norm": 0.42814651131629944, "learning_rate": 4.522377186548944e-05, "loss": 1.8254, "step": 5590 }, { "epoch": 1.0, "grad_norm": 0.6543037295341492, "learning_rate": 4.5215503058430754e-05, "loss": 2.0367, "step": 5595 }, { "epoch": 1.0, "grad_norm": 0.5894500017166138, "learning_rate": 4.520722785722789e-05, "loss": 1.8911, "step": 5600 }, { "epoch": 1.0, "grad_norm": 0.7925021052360535, "learning_rate": 4.519894626449829e-05, "loss": 1.7694, "step": 5605 }, { "epoch": 1.0, "grad_norm": 0.6827127933502197, "learning_rate": 4.519065828286142e-05, "loss": 1.7906, "step": 5610 }, { "epoch": 1.01, "grad_norm": 0.5993368029594421, "learning_rate": 4.5182363914938754e-05, "loss": 2.142, "step": 5615 }, { "epoch": 1.01, "grad_norm": 0.5092318654060364, "learning_rate": 4.517406316335379e-05, "loss": 2.0453, "step": 5620 }, { "epoch": 1.01, "grad_norm": 0.5682955980300903, "learning_rate": 4.5165756030732056e-05, "loss": 1.866, "step": 5625 }, { "epoch": 1.01, "grad_norm": 0.5514644980430603, "learning_rate": 4.515744251970109e-05, "loss": 1.9906, "step": 5630 }, { "epoch": 1.01, "grad_norm": 0.839066207408905, "learning_rate": 4.514912263289044e-05, "loss": 1.9757, "step": 5635 }, { "epoch": 1.01, "grad_norm": 0.632759153842926, "learning_rate": 4.514079637293168e-05, "loss": 1.8389, "step": 5640 }, { "epoch": 1.01, "grad_norm": 0.5392670631408691, "learning_rate": 4.513246374245841e-05, "loss": 2.0059, "step": 5645 }, { "epoch": 1.01, "grad_norm": 0.7347068190574646, "learning_rate": 4.5124124744106216e-05, "loss": 1.8033, "step": 5650 }, { "epoch": 1.01, "grad_norm": 1.1633368730545044, "learning_rate": 4.511577938051272e-05, "loss": 2.0502, "step": 5655 }, { "epoch": 1.01, "grad_norm": 0.4993457794189453, "learning_rate": 4.510742765431756e-05, "loss": 2.0386, "step": 5660 }, { "epoch": 1.01, "grad_norm": 0.9043493270874023, "learning_rate": 4.509906956816238e-05, "loss": 2.1084, "step": 5665 }, { "epoch": 1.01, "grad_norm": 0.4216392934322357, "learning_rate": 4.509070512469081e-05, "loss": 1.8757, "step": 5670 }, { "epoch": 1.02, "grad_norm": 0.5536708235740662, "learning_rate": 4.508233432654855e-05, "loss": 1.8147, "step": 5675 }, { "epoch": 1.02, "grad_norm": 1.015234112739563, "learning_rate": 4.507395717638325e-05, "loss": 1.9113, "step": 5680 }, { "epoch": 1.02, "grad_norm": 0.9013848900794983, "learning_rate": 4.506557367684461e-05, "loss": 1.6582, "step": 5685 }, { "epoch": 1.02, "grad_norm": 0.41156238317489624, "learning_rate": 4.505718383058431e-05, "loss": 1.8162, "step": 5690 }, { "epoch": 1.02, "grad_norm": 0.8023267984390259, "learning_rate": 4.504878764025606e-05, "loss": 1.9033, "step": 5695 }, { "epoch": 1.02, "grad_norm": 0.4735568165779114, "learning_rate": 4.504038510851557e-05, "loss": 1.604, "step": 5700 }, { "epoch": 1.02, "grad_norm": 1.373860478401184, "learning_rate": 4.503197623802054e-05, "loss": 1.9613, "step": 5705 }, { "epoch": 1.02, "grad_norm": 0.47717657685279846, "learning_rate": 4.5023561031430696e-05, "loss": 1.8763, "step": 5710 }, { "epoch": 1.02, "grad_norm": 0.9874609708786011, "learning_rate": 4.501513949140775e-05, "loss": 1.7237, "step": 5715 }, { "epoch": 1.02, "grad_norm": 0.4677652418613434, "learning_rate": 4.500671162061545e-05, "loss": 2.2116, "step": 5720 }, { "epoch": 1.02, "grad_norm": 0.7714501023292542, "learning_rate": 4.4998277421719506e-05, "loss": 1.9144, "step": 5725 }, { "epoch": 1.03, "grad_norm": 0.6441267132759094, "learning_rate": 4.498983689738764e-05, "loss": 1.7153, "step": 5730 }, { "epoch": 1.03, "grad_norm": 1.421938419342041, "learning_rate": 4.4981390050289595e-05, "loss": 1.882, "step": 5735 }, { "epoch": 1.03, "grad_norm": 0.5032657980918884, "learning_rate": 4.497293688309709e-05, "loss": 1.9654, "step": 5740 }, { "epoch": 1.03, "grad_norm": 0.5165999531745911, "learning_rate": 4.4964477398483875e-05, "loss": 1.7969, "step": 5745 }, { "epoch": 1.03, "grad_norm": 0.9800401329994202, "learning_rate": 4.495601159912566e-05, "loss": 1.7086, "step": 5750 }, { "epoch": 1.03, "grad_norm": 0.6153430342674255, "learning_rate": 4.494753948770016e-05, "loss": 1.641, "step": 5755 }, { "epoch": 1.03, "grad_norm": 0.5942739844322205, "learning_rate": 4.493906106688712e-05, "loss": 1.9742, "step": 5760 }, { "epoch": 1.03, "grad_norm": 0.4884500205516815, "learning_rate": 4.493057633936824e-05, "loss": 2.0754, "step": 5765 }, { "epoch": 1.03, "grad_norm": 0.8295217156410217, "learning_rate": 4.492208530782722e-05, "loss": 1.7546, "step": 5770 }, { "epoch": 1.03, "grad_norm": 0.7193713784217834, "learning_rate": 4.49135879749498e-05, "loss": 1.9467, "step": 5775 }, { "epoch": 1.03, "grad_norm": 0.7422948479652405, "learning_rate": 4.490508434342364e-05, "loss": 1.8656, "step": 5780 }, { "epoch": 1.04, "grad_norm": 3.331986665725708, "learning_rate": 4.4896574415938465e-05, "loss": 1.7584, "step": 5785 }, { "epoch": 1.04, "grad_norm": 0.6991448402404785, "learning_rate": 4.4888058195185935e-05, "loss": 1.6829, "step": 5790 }, { "epoch": 1.04, "grad_norm": 0.7543762922286987, "learning_rate": 4.487953568385972e-05, "loss": 1.7909, "step": 5795 }, { "epoch": 1.04, "grad_norm": 0.5476166605949402, "learning_rate": 4.4871006884655495e-05, "loss": 2.1132, "step": 5800 }, { "epoch": 1.04, "grad_norm": 0.785237729549408, "learning_rate": 4.486247180027091e-05, "loss": 1.6932, "step": 5805 }, { "epoch": 1.04, "grad_norm": 0.8334198594093323, "learning_rate": 4.485393043340559e-05, "loss": 1.9682, "step": 5810 }, { "epoch": 1.04, "grad_norm": 1.4550981521606445, "learning_rate": 4.484538278676118e-05, "loss": 1.4896, "step": 5815 }, { "epoch": 1.04, "grad_norm": 1.184747576713562, "learning_rate": 4.483682886304129e-05, "loss": 1.6799, "step": 5820 }, { "epoch": 1.04, "grad_norm": 0.6847954392433167, "learning_rate": 4.48282686649515e-05, "loss": 1.8183, "step": 5825 }, { "epoch": 1.04, "grad_norm": 0.5142964124679565, "learning_rate": 4.4819702195199406e-05, "loss": 2.0018, "step": 5830 }, { "epoch": 1.04, "grad_norm": 1.4494707584381104, "learning_rate": 4.481112945649457e-05, "loss": 1.7658, "step": 5835 }, { "epoch": 1.05, "grad_norm": 0.7955223321914673, "learning_rate": 4.480255045154854e-05, "loss": 1.7644, "step": 5840 }, { "epoch": 1.05, "grad_norm": 0.46225401759147644, "learning_rate": 4.479396518307486e-05, "loss": 1.846, "step": 5845 }, { "epoch": 1.05, "grad_norm": 0.8068023920059204, "learning_rate": 4.478537365378902e-05, "loss": 1.9631, "step": 5850 }, { "epoch": 1.05, "grad_norm": 0.9534473419189453, "learning_rate": 4.477677586640854e-05, "loss": 1.9243, "step": 5855 }, { "epoch": 1.05, "grad_norm": 1.4949363470077515, "learning_rate": 4.476817182365286e-05, "loss": 1.7548, "step": 5860 }, { "epoch": 1.05, "grad_norm": 0.6866559386253357, "learning_rate": 4.4759561528243455e-05, "loss": 1.7041, "step": 5865 }, { "epoch": 1.05, "grad_norm": 0.4342908561229706, "learning_rate": 4.475094498290374e-05, "loss": 1.9776, "step": 5870 }, { "epoch": 1.05, "grad_norm": 0.4420872628688812, "learning_rate": 4.474232219035913e-05, "loss": 2.1547, "step": 5875 }, { "epoch": 1.05, "grad_norm": 0.4919748902320862, "learning_rate": 4.4733693153336995e-05, "loss": 1.8431, "step": 5880 }, { "epoch": 1.05, "grad_norm": 0.8170517683029175, "learning_rate": 4.472505787456671e-05, "loss": 1.7578, "step": 5885 }, { "epoch": 1.05, "grad_norm": 0.8107596039772034, "learning_rate": 4.4716416356779576e-05, "loss": 1.8205, "step": 5890 }, { "epoch": 1.06, "grad_norm": 0.5859491229057312, "learning_rate": 4.470776860270892e-05, "loss": 1.5296, "step": 5895 }, { "epoch": 1.06, "grad_norm": 0.41586703062057495, "learning_rate": 4.469911461509002e-05, "loss": 2.2296, "step": 5900 }, { "epoch": 1.06, "grad_norm": 0.7300328016281128, "learning_rate": 4.4690454396660104e-05, "loss": 1.9785, "step": 5905 }, { "epoch": 1.06, "grad_norm": 1.0052844285964966, "learning_rate": 4.46817879501584e-05, "loss": 1.8619, "step": 5910 }, { "epoch": 1.06, "grad_norm": 0.6375717520713806, "learning_rate": 4.467311527832611e-05, "loss": 1.8155, "step": 5915 }, { "epoch": 1.06, "grad_norm": 0.5199123620986938, "learning_rate": 4.466443638390637e-05, "loss": 1.8485, "step": 5920 }, { "epoch": 1.06, "grad_norm": 0.6076669692993164, "learning_rate": 4.465575126964433e-05, "loss": 1.644, "step": 5925 }, { "epoch": 1.06, "grad_norm": 0.4569863975048065, "learning_rate": 4.464705993828706e-05, "loss": 1.9771, "step": 5930 }, { "epoch": 1.06, "grad_norm": 0.5174528360366821, "learning_rate": 4.4638362392583636e-05, "loss": 1.8152, "step": 5935 }, { "epoch": 1.06, "grad_norm": 1.4930440187454224, "learning_rate": 4.462965863528507e-05, "loss": 1.7984, "step": 5940 }, { "epoch": 1.06, "grad_norm": 0.42374441027641296, "learning_rate": 4.4620948669144356e-05, "loss": 1.7553, "step": 5945 }, { "epoch": 1.07, "grad_norm": 0.7552632093429565, "learning_rate": 4.461223249691645e-05, "loss": 2.0744, "step": 5950 }, { "epoch": 1.07, "grad_norm": 0.7354189157485962, "learning_rate": 4.460351012135828e-05, "loss": 1.9068, "step": 5955 }, { "epoch": 1.07, "grad_norm": 0.44200262427330017, "learning_rate": 4.4594781545228715e-05, "loss": 1.8694, "step": 5960 }, { "epoch": 1.07, "grad_norm": 0.45945167541503906, "learning_rate": 4.4586046771288593e-05, "loss": 2.0557, "step": 5965 }, { "epoch": 1.07, "grad_norm": 0.506199061870575, "learning_rate": 4.4577305802300715e-05, "loss": 1.8714, "step": 5970 }, { "epoch": 1.07, "grad_norm": 0.6749559640884399, "learning_rate": 4.456855864102984e-05, "loss": 2.0554, "step": 5975 }, { "epoch": 1.07, "grad_norm": 0.7233107686042786, "learning_rate": 4.455980529024269e-05, "loss": 1.9416, "step": 5980 }, { "epoch": 1.07, "grad_norm": 0.6712331771850586, "learning_rate": 4.4551045752707945e-05, "loss": 1.9445, "step": 5985 }, { "epoch": 1.07, "grad_norm": 1.0653775930404663, "learning_rate": 4.4542280031196224e-05, "loss": 1.9905, "step": 5990 }, { "epoch": 1.07, "grad_norm": 1.2245427370071411, "learning_rate": 4.453350812848014e-05, "loss": 1.5629, "step": 5995 }, { "epoch": 1.07, "grad_norm": 0.6561611890792847, "learning_rate": 4.452473004733422e-05, "loss": 2.0267, "step": 6000 }, { "epoch": 1.07, "grad_norm": 1.8881734609603882, "learning_rate": 4.4515945790534964e-05, "loss": 1.7464, "step": 6005 }, { "epoch": 1.08, "grad_norm": 0.6296302080154419, "learning_rate": 4.450715536086082e-05, "loss": 1.8395, "step": 6010 }, { "epoch": 1.08, "grad_norm": 0.5000322461128235, "learning_rate": 4.4498358761092204e-05, "loss": 1.8501, "step": 6015 }, { "epoch": 1.08, "grad_norm": 0.5313418507575989, "learning_rate": 4.448955599401147e-05, "loss": 1.879, "step": 6020 }, { "epoch": 1.08, "grad_norm": 0.8610731363296509, "learning_rate": 4.448074706240292e-05, "loss": 1.8825, "step": 6025 }, { "epoch": 1.08, "grad_norm": 0.30168166756629944, "learning_rate": 4.4471931969052816e-05, "loss": 1.7678, "step": 6030 }, { "epoch": 1.08, "grad_norm": 0.8175820708274841, "learning_rate": 4.446311071674936e-05, "loss": 1.8989, "step": 6035 }, { "epoch": 1.08, "grad_norm": 0.7665614485740662, "learning_rate": 4.4454283308282695e-05, "loss": 1.8553, "step": 6040 }, { "epoch": 1.08, "grad_norm": 0.42481526732444763, "learning_rate": 4.444544974644493e-05, "loss": 2.1974, "step": 6045 }, { "epoch": 1.08, "grad_norm": 0.5651946067810059, "learning_rate": 4.443661003403013e-05, "loss": 2.1612, "step": 6050 }, { "epoch": 1.08, "grad_norm": 0.8740639686584473, "learning_rate": 4.4427764173834264e-05, "loss": 1.6421, "step": 6055 }, { "epoch": 1.08, "grad_norm": 0.6985417604446411, "learning_rate": 4.441891216865528e-05, "loss": 2.0725, "step": 6060 }, { "epoch": 1.09, "grad_norm": 0.8365675210952759, "learning_rate": 4.441005402129306e-05, "loss": 1.9568, "step": 6065 }, { "epoch": 1.09, "grad_norm": 0.5997393727302551, "learning_rate": 4.4401189734549405e-05, "loss": 2.0727, "step": 6070 }, { "epoch": 1.09, "grad_norm": 1.2542804479599, "learning_rate": 4.439231931122811e-05, "loss": 2.039, "step": 6075 }, { "epoch": 1.09, "grad_norm": 0.8525872230529785, "learning_rate": 4.4383442754134874e-05, "loss": 1.6291, "step": 6080 }, { "epoch": 1.09, "grad_norm": 0.5538986325263977, "learning_rate": 4.4374560066077336e-05, "loss": 2.0365, "step": 6085 }, { "epoch": 1.09, "grad_norm": 0.48354509472846985, "learning_rate": 4.436567124986508e-05, "loss": 2.0723, "step": 6090 }, { "epoch": 1.09, "grad_norm": 1.0979423522949219, "learning_rate": 4.435677630830964e-05, "loss": 1.984, "step": 6095 }, { "epoch": 1.09, "grad_norm": 0.6961491107940674, "learning_rate": 4.434787524422448e-05, "loss": 1.7996, "step": 6100 }, { "epoch": 1.09, "grad_norm": 1.0703699588775635, "learning_rate": 4.433896806042498e-05, "loss": 1.904, "step": 6105 }, { "epoch": 1.09, "grad_norm": 0.8863664269447327, "learning_rate": 4.433005475972849e-05, "loss": 1.7778, "step": 6110 }, { "epoch": 1.09, "grad_norm": 0.995624303817749, "learning_rate": 4.432113534495427e-05, "loss": 2.0512, "step": 6115 }, { "epoch": 1.1, "grad_norm": 0.5747253894805908, "learning_rate": 4.4312209818923526e-05, "loss": 1.7252, "step": 6120 }, { "epoch": 1.1, "grad_norm": 0.899549663066864, "learning_rate": 4.430327818445939e-05, "loss": 1.8408, "step": 6125 }, { "epoch": 1.1, "grad_norm": 0.6088781356811523, "learning_rate": 4.429434044438693e-05, "loss": 1.7636, "step": 6130 }, { "epoch": 1.1, "grad_norm": 0.7908390164375305, "learning_rate": 4.428539660153315e-05, "loss": 1.7863, "step": 6135 }, { "epoch": 1.1, "grad_norm": 0.4704549312591553, "learning_rate": 4.427644665872697e-05, "loss": 1.8403, "step": 6140 }, { "epoch": 1.1, "grad_norm": 0.4836312532424927, "learning_rate": 4.426749061879926e-05, "loss": 1.8234, "step": 6145 }, { "epoch": 1.1, "grad_norm": 0.6054650545120239, "learning_rate": 4.425852848458279e-05, "loss": 1.9921, "step": 6150 }, { "epoch": 1.1, "grad_norm": 0.598057210445404, "learning_rate": 4.42495602589123e-05, "loss": 1.9285, "step": 6155 }, { "epoch": 1.1, "grad_norm": 0.6998239755630493, "learning_rate": 4.42405859446244e-05, "loss": 1.7761, "step": 6160 }, { "epoch": 1.1, "grad_norm": 0.4462050199508667, "learning_rate": 4.4231605544557684e-05, "loss": 1.7563, "step": 6165 }, { "epoch": 1.1, "grad_norm": 1.1818965673446655, "learning_rate": 4.422261906155263e-05, "loss": 2.1757, "step": 6170 }, { "epoch": 1.11, "grad_norm": 1.3994632959365845, "learning_rate": 4.421362649845165e-05, "loss": 1.8819, "step": 6175 }, { "epoch": 1.11, "grad_norm": 0.699133038520813, "learning_rate": 4.42046278580991e-05, "loss": 1.9893, "step": 6180 }, { "epoch": 1.11, "grad_norm": 0.4134710431098938, "learning_rate": 4.419562314334123e-05, "loss": 2.0212, "step": 6185 }, { "epoch": 1.11, "grad_norm": 0.8407548666000366, "learning_rate": 4.4186612357026225e-05, "loss": 1.8084, "step": 6190 }, { "epoch": 1.11, "grad_norm": 0.4963303506374359, "learning_rate": 4.4177595502004175e-05, "loss": 1.9406, "step": 6195 }, { "epoch": 1.11, "grad_norm": 0.6621739864349365, "learning_rate": 4.416857258112712e-05, "loss": 1.9083, "step": 6200 }, { "epoch": 1.11, "grad_norm": 0.6170079708099365, "learning_rate": 4.4159543597248996e-05, "loss": 1.9234, "step": 6205 }, { "epoch": 1.11, "grad_norm": 0.6613223552703857, "learning_rate": 4.415050855322567e-05, "loss": 2.0523, "step": 6210 }, { "epoch": 1.11, "grad_norm": 1.43656325340271, "learning_rate": 4.41414674519149e-05, "loss": 1.9491, "step": 6215 }, { "epoch": 1.11, "grad_norm": 0.7505949139595032, "learning_rate": 4.4132420296176386e-05, "loss": 1.9439, "step": 6220 }, { "epoch": 1.11, "grad_norm": 0.665459394454956, "learning_rate": 4.4123367088871735e-05, "loss": 1.7187, "step": 6225 }, { "epoch": 1.12, "grad_norm": 0.3662698566913605, "learning_rate": 4.411430783286447e-05, "loss": 1.6875, "step": 6230 }, { "epoch": 1.12, "grad_norm": 0.3431253731250763, "learning_rate": 4.410524253102003e-05, "loss": 1.9651, "step": 6235 }, { "epoch": 1.12, "grad_norm": 0.5281440615653992, "learning_rate": 4.409617118620574e-05, "loss": 1.9109, "step": 6240 }, { "epoch": 1.12, "grad_norm": 0.659487247467041, "learning_rate": 4.408709380129088e-05, "loss": 1.9324, "step": 6245 }, { "epoch": 1.12, "grad_norm": 0.7130793333053589, "learning_rate": 4.407801037914662e-05, "loss": 1.813, "step": 6250 }, { "epoch": 1.12, "grad_norm": 0.6839697360992432, "learning_rate": 4.4068920922646015e-05, "loss": 1.8927, "step": 6255 }, { "epoch": 1.12, "grad_norm": 1.225645661354065, "learning_rate": 4.405982543466406e-05, "loss": 1.9195, "step": 6260 }, { "epoch": 1.12, "grad_norm": 0.6370910406112671, "learning_rate": 4.4050723918077664e-05, "loss": 2.0735, "step": 6265 }, { "epoch": 1.12, "grad_norm": 0.8523099422454834, "learning_rate": 4.4041616375765614e-05, "loss": 1.8858, "step": 6270 }, { "epoch": 1.12, "grad_norm": 1.1719610691070557, "learning_rate": 4.4032502810608614e-05, "loss": 1.7128, "step": 6275 }, { "epoch": 1.12, "grad_norm": 0.9267966747283936, "learning_rate": 4.402338322548929e-05, "loss": 1.6958, "step": 6280 }, { "epoch": 1.12, "grad_norm": 0.4360824525356293, "learning_rate": 4.401425762329214e-05, "loss": 1.9678, "step": 6285 }, { "epoch": 1.13, "grad_norm": 0.7322136163711548, "learning_rate": 4.400512600690361e-05, "loss": 1.8844, "step": 6290 }, { "epoch": 1.13, "grad_norm": 1.1388806104660034, "learning_rate": 4.399598837921199e-05, "loss": 1.645, "step": 6295 }, { "epoch": 1.13, "grad_norm": 0.42380547523498535, "learning_rate": 4.3986844743107525e-05, "loss": 1.9983, "step": 6300 }, { "epoch": 1.13, "grad_norm": 0.9260616302490234, "learning_rate": 4.397769510148233e-05, "loss": 1.6449, "step": 6305 }, { "epoch": 1.13, "grad_norm": 0.5340147614479065, "learning_rate": 4.396853945723043e-05, "loss": 1.7605, "step": 6310 }, { "epoch": 1.13, "grad_norm": 0.5331189632415771, "learning_rate": 4.395937781324774e-05, "loss": 1.8059, "step": 6315 }, { "epoch": 1.13, "grad_norm": 0.6908158659934998, "learning_rate": 4.395021017243208e-05, "loss": 1.9612, "step": 6320 }, { "epoch": 1.13, "grad_norm": 0.8586407899856567, "learning_rate": 4.3941036537683176e-05, "loss": 1.9027, "step": 6325 }, { "epoch": 1.13, "grad_norm": 0.7058882713317871, "learning_rate": 4.393185691190264e-05, "loss": 1.9719, "step": 6330 }, { "epoch": 1.13, "grad_norm": 0.7748107314109802, "learning_rate": 4.392267129799397e-05, "loss": 2.0075, "step": 6335 }, { "epoch": 1.13, "grad_norm": 0.6403058767318726, "learning_rate": 4.391347969886256e-05, "loss": 2.2573, "step": 6340 }, { "epoch": 1.14, "grad_norm": 0.7095941305160522, "learning_rate": 4.3904282117415724e-05, "loss": 1.8924, "step": 6345 }, { "epoch": 1.14, "grad_norm": 0.880134642124176, "learning_rate": 4.389507855656263e-05, "loss": 1.6801, "step": 6350 }, { "epoch": 1.14, "grad_norm": 0.8959532976150513, "learning_rate": 4.3885869019214374e-05, "loss": 1.6508, "step": 6355 }, { "epoch": 1.14, "grad_norm": 1.0066273212432861, "learning_rate": 4.38766535082839e-05, "loss": 1.9506, "step": 6360 }, { "epoch": 1.14, "grad_norm": 0.32858455181121826, "learning_rate": 4.386743202668609e-05, "loss": 2.0022, "step": 6365 }, { "epoch": 1.14, "grad_norm": 1.6246061325073242, "learning_rate": 4.3858204577337674e-05, "loss": 1.8174, "step": 6370 }, { "epoch": 1.14, "grad_norm": 0.4996178448200226, "learning_rate": 4.3848971163157296e-05, "loss": 1.7198, "step": 6375 }, { "epoch": 1.14, "grad_norm": 0.6967160105705261, "learning_rate": 4.383973178706545e-05, "loss": 1.9285, "step": 6380 }, { "epoch": 1.14, "grad_norm": 1.0754936933517456, "learning_rate": 4.383048645198458e-05, "loss": 2.0293, "step": 6385 }, { "epoch": 1.14, "grad_norm": 0.49468663334846497, "learning_rate": 4.3821235160838955e-05, "loss": 1.8142, "step": 6390 }, { "epoch": 1.14, "grad_norm": 0.9518940448760986, "learning_rate": 4.3811977916554746e-05, "loss": 2.1539, "step": 6395 }, { "epoch": 1.15, "grad_norm": 0.5778300166130066, "learning_rate": 4.3802714722060024e-05, "loss": 1.8289, "step": 6400 }, { "epoch": 1.15, "grad_norm": 0.8983391523361206, "learning_rate": 4.379344558028471e-05, "loss": 1.6746, "step": 6405 }, { "epoch": 1.15, "grad_norm": 0.6165648102760315, "learning_rate": 4.3784170494160646e-05, "loss": 2.0491, "step": 6410 }, { "epoch": 1.15, "grad_norm": 0.9584977626800537, "learning_rate": 4.377488946662152e-05, "loss": 1.8224, "step": 6415 }, { "epoch": 1.15, "grad_norm": 1.769154667854309, "learning_rate": 4.376560250060292e-05, "loss": 1.8792, "step": 6420 }, { "epoch": 1.15, "grad_norm": 0.5984408855438232, "learning_rate": 4.375630959904229e-05, "loss": 1.9362, "step": 6425 }, { "epoch": 1.15, "grad_norm": 0.7467585206031799, "learning_rate": 4.3747010764878985e-05, "loss": 1.4512, "step": 6430 }, { "epoch": 1.15, "grad_norm": 1.2350887060165405, "learning_rate": 4.373770600105419e-05, "loss": 1.8879, "step": 6435 }, { "epoch": 1.15, "grad_norm": 3.794597625732422, "learning_rate": 4.372839531051103e-05, "loss": 1.7501, "step": 6440 }, { "epoch": 1.15, "grad_norm": 0.3860040605068207, "learning_rate": 4.371907869619444e-05, "loss": 1.7878, "step": 6445 }, { "epoch": 1.15, "grad_norm": 0.4788179099559784, "learning_rate": 4.3709756161051266e-05, "loss": 2.0143, "step": 6450 }, { "epoch": 1.16, "grad_norm": 0.5330110192298889, "learning_rate": 4.3700427708030205e-05, "loss": 1.6505, "step": 6455 }, { "epoch": 1.16, "grad_norm": 0.7788562178611755, "learning_rate": 4.369109334008186e-05, "loss": 1.9456, "step": 6460 }, { "epoch": 1.16, "grad_norm": 0.4161795675754547, "learning_rate": 4.3681753060158656e-05, "loss": 2.1514, "step": 6465 }, { "epoch": 1.16, "grad_norm": 0.8420090079307556, "learning_rate": 4.367240687121495e-05, "loss": 1.8194, "step": 6470 }, { "epoch": 1.16, "grad_norm": 0.37459495663642883, "learning_rate": 4.366305477620689e-05, "loss": 1.8492, "step": 6475 }, { "epoch": 1.16, "grad_norm": 0.8855760097503662, "learning_rate": 4.365369677809257e-05, "loss": 1.6327, "step": 6480 }, { "epoch": 1.16, "grad_norm": 1.2534199953079224, "learning_rate": 4.36443328798319e-05, "loss": 1.9092, "step": 6485 }, { "epoch": 1.16, "grad_norm": 0.44040653109550476, "learning_rate": 4.363496308438666e-05, "loss": 1.9596, "step": 6490 }, { "epoch": 1.16, "grad_norm": 0.6483808755874634, "learning_rate": 4.3625587394720534e-05, "loss": 1.8081, "step": 6495 }, { "epoch": 1.16, "grad_norm": 1.1044362783432007, "learning_rate": 4.3616205813799025e-05, "loss": 1.8881, "step": 6500 }, { "epoch": 1.16, "grad_norm": 1.2582200765609741, "learning_rate": 4.360681834458953e-05, "loss": 2.0978, "step": 6505 }, { "epoch": 1.17, "grad_norm": 0.6778598427772522, "learning_rate": 4.359742499006129e-05, "loss": 2.1348, "step": 6510 }, { "epoch": 1.17, "grad_norm": 0.45896151661872864, "learning_rate": 4.35880257531854e-05, "loss": 1.8157, "step": 6515 }, { "epoch": 1.17, "grad_norm": 0.8110174536705017, "learning_rate": 4.357862063693486e-05, "loss": 1.9552, "step": 6520 }, { "epoch": 1.17, "grad_norm": 0.5105950236320496, "learning_rate": 4.356920964428448e-05, "loss": 2.0259, "step": 6525 }, { "epoch": 1.17, "grad_norm": 0.7677923440933228, "learning_rate": 4.3559792778210945e-05, "loss": 1.9829, "step": 6530 }, { "epoch": 1.17, "grad_norm": 2.0216026306152344, "learning_rate": 4.3550370041692805e-05, "loss": 1.8117, "step": 6535 }, { "epoch": 1.17, "grad_norm": 0.8491010069847107, "learning_rate": 4.354094143771047e-05, "loss": 1.9182, "step": 6540 }, { "epoch": 1.17, "grad_norm": 0.7707613706588745, "learning_rate": 4.353150696924619e-05, "loss": 1.9682, "step": 6545 }, { "epoch": 1.17, "grad_norm": 0.6204885840415955, "learning_rate": 4.3522066639284085e-05, "loss": 1.641, "step": 6550 }, { "epoch": 1.17, "grad_norm": 0.7384446263313293, "learning_rate": 4.3512620450810114e-05, "loss": 1.9758, "step": 6555 }, { "epoch": 1.17, "grad_norm": 0.7315991520881653, "learning_rate": 4.35031684068121e-05, "loss": 1.6545, "step": 6560 }, { "epoch": 1.18, "grad_norm": 0.5005430579185486, "learning_rate": 4.349371051027972e-05, "loss": 1.9201, "step": 6565 }, { "epoch": 1.18, "grad_norm": 1.0951027870178223, "learning_rate": 4.348424676420449e-05, "loss": 1.9609, "step": 6570 }, { "epoch": 1.18, "grad_norm": 0.5726012587547302, "learning_rate": 4.3474777171579794e-05, "loss": 1.835, "step": 6575 }, { "epoch": 1.18, "grad_norm": 0.7009186744689941, "learning_rate": 4.3465301735400844e-05, "loss": 2.0274, "step": 6580 }, { "epoch": 1.18, "grad_norm": 0.8929822444915771, "learning_rate": 4.345582045866472e-05, "loss": 1.8169, "step": 6585 }, { "epoch": 1.18, "grad_norm": 0.615864098072052, "learning_rate": 4.3446333344370326e-05, "loss": 1.7477, "step": 6590 }, { "epoch": 1.18, "grad_norm": 0.7010105848312378, "learning_rate": 4.343684039551845e-05, "loss": 1.6822, "step": 6595 }, { "epoch": 1.18, "grad_norm": 0.7396606206893921, "learning_rate": 4.3427341615111696e-05, "loss": 1.7189, "step": 6600 }, { "epoch": 1.18, "grad_norm": 0.8250526189804077, "learning_rate": 4.3417837006154505e-05, "loss": 1.8397, "step": 6605 }, { "epoch": 1.18, "grad_norm": 0.417876660823822, "learning_rate": 4.340832657165318e-05, "loss": 1.7366, "step": 6610 }, { "epoch": 1.18, "grad_norm": 0.5691598653793335, "learning_rate": 4.3398810314615876e-05, "loss": 1.678, "step": 6615 }, { "epoch": 1.18, "grad_norm": 0.3957613706588745, "learning_rate": 4.338928823805256e-05, "loss": 2.2207, "step": 6620 }, { "epoch": 1.19, "grad_norm": 0.5127105712890625, "learning_rate": 4.337976034497507e-05, "loss": 2.0833, "step": 6625 }, { "epoch": 1.19, "grad_norm": 0.7685470581054688, "learning_rate": 4.337022663839706e-05, "loss": 1.6234, "step": 6630 }, { "epoch": 1.19, "grad_norm": 0.8764786124229431, "learning_rate": 4.336068712133403e-05, "loss": 1.8451, "step": 6635 }, { "epoch": 1.19, "grad_norm": 0.5960032939910889, "learning_rate": 4.335114179680333e-05, "loss": 1.9596, "step": 6640 }, { "epoch": 1.19, "grad_norm": 0.9183564782142639, "learning_rate": 4.334159066782413e-05, "loss": 1.9834, "step": 6645 }, { "epoch": 1.19, "grad_norm": 1.964171051979065, "learning_rate": 4.333203373741744e-05, "loss": 1.8609, "step": 6650 }, { "epoch": 1.19, "grad_norm": 0.5139592885971069, "learning_rate": 4.332247100860612e-05, "loss": 1.8969, "step": 6655 }, { "epoch": 1.19, "grad_norm": 0.40804627537727356, "learning_rate": 4.331290248441483e-05, "loss": 1.6632, "step": 6660 }, { "epoch": 1.19, "grad_norm": 0.5056566596031189, "learning_rate": 4.3303328167870105e-05, "loss": 1.7616, "step": 6665 }, { "epoch": 1.19, "grad_norm": 2.207601547241211, "learning_rate": 4.32937480620003e-05, "loss": 1.9739, "step": 6670 }, { "epoch": 1.19, "grad_norm": 0.5137903094291687, "learning_rate": 4.3284162169835565e-05, "loss": 1.8736, "step": 6675 }, { "epoch": 1.2, "grad_norm": 0.5918135046958923, "learning_rate": 4.327457049440792e-05, "loss": 2.1611, "step": 6680 }, { "epoch": 1.2, "grad_norm": 0.4136945903301239, "learning_rate": 4.326497303875121e-05, "loss": 2.0066, "step": 6685 }, { "epoch": 1.2, "grad_norm": 0.7371619343757629, "learning_rate": 4.3255369805901096e-05, "loss": 1.5216, "step": 6690 }, { "epoch": 1.2, "grad_norm": 7.495757579803467, "learning_rate": 4.324576079889507e-05, "loss": 1.752, "step": 6695 }, { "epoch": 1.2, "grad_norm": 0.8859977722167969, "learning_rate": 4.323614602077246e-05, "loss": 1.7231, "step": 6700 }, { "epoch": 1.2, "grad_norm": 0.48695892095565796, "learning_rate": 4.3226525474574405e-05, "loss": 1.612, "step": 6705 }, { "epoch": 1.2, "grad_norm": 0.5769939422607422, "learning_rate": 4.321689916334387e-05, "loss": 1.9836, "step": 6710 }, { "epoch": 1.2, "grad_norm": 0.7489534616470337, "learning_rate": 4.3207267090125646e-05, "loss": 1.6957, "step": 6715 }, { "epoch": 1.2, "grad_norm": 1.0079684257507324, "learning_rate": 4.3197629257966365e-05, "loss": 1.89, "step": 6720 }, { "epoch": 1.2, "grad_norm": 0.8229489326477051, "learning_rate": 4.3187985669914454e-05, "loss": 1.929, "step": 6725 }, { "epoch": 1.2, "grad_norm": 0.6666339039802551, "learning_rate": 4.317833632902016e-05, "loss": 1.823, "step": 6730 }, { "epoch": 1.21, "grad_norm": 0.8089262247085571, "learning_rate": 4.316868123833558e-05, "loss": 1.9112, "step": 6735 }, { "epoch": 1.21, "grad_norm": 0.6247695684432983, "learning_rate": 4.31590204009146e-05, "loss": 1.637, "step": 6740 }, { "epoch": 1.21, "grad_norm": 0.4710233211517334, "learning_rate": 4.314935381981293e-05, "loss": 2.0317, "step": 6745 }, { "epoch": 1.21, "grad_norm": 0.6251475214958191, "learning_rate": 4.31396814980881e-05, "loss": 2.1173, "step": 6750 }, { "epoch": 1.21, "grad_norm": 0.8826267719268799, "learning_rate": 4.3130003438799466e-05, "loss": 2.0483, "step": 6755 }, { "epoch": 1.21, "grad_norm": 0.6971041560173035, "learning_rate": 4.312031964500818e-05, "loss": 1.921, "step": 6760 }, { "epoch": 1.21, "grad_norm": 0.6338077187538147, "learning_rate": 4.311063011977723e-05, "loss": 1.7711, "step": 6765 }, { "epoch": 1.21, "grad_norm": 1.1039173603057861, "learning_rate": 4.3100934866171385e-05, "loss": 1.5759, "step": 6770 }, { "epoch": 1.21, "grad_norm": 1.1805471181869507, "learning_rate": 4.3091233887257256e-05, "loss": 1.7043, "step": 6775 }, { "epoch": 1.21, "grad_norm": 0.7616626024246216, "learning_rate": 4.308152718610324e-05, "loss": 1.739, "step": 6780 }, { "epoch": 1.21, "grad_norm": 0.3182941675186157, "learning_rate": 4.307181476577957e-05, "loss": 1.8309, "step": 6785 }, { "epoch": 1.22, "grad_norm": 1.0036624670028687, "learning_rate": 4.3062096629358285e-05, "loss": 1.7662, "step": 6790 }, { "epoch": 1.22, "grad_norm": 4.941795349121094, "learning_rate": 4.3052372779913194e-05, "loss": 1.7195, "step": 6795 }, { "epoch": 1.22, "grad_norm": 0.5514867901802063, "learning_rate": 4.304264322051997e-05, "loss": 1.899, "step": 6800 }, { "epoch": 1.22, "grad_norm": 0.600469708442688, "learning_rate": 4.303290795425605e-05, "loss": 2.0107, "step": 6805 }, { "epoch": 1.22, "grad_norm": 0.567573070526123, "learning_rate": 4.302316698420068e-05, "loss": 1.89, "step": 6810 }, { "epoch": 1.22, "grad_norm": 0.5658138394355774, "learning_rate": 4.301342031343493e-05, "loss": 2.2113, "step": 6815 }, { "epoch": 1.22, "grad_norm": 1.3791340589523315, "learning_rate": 4.300366794504167e-05, "loss": 2.0512, "step": 6820 }, { "epoch": 1.22, "grad_norm": 0.7716684937477112, "learning_rate": 4.2993909882105546e-05, "loss": 1.9361, "step": 6825 }, { "epoch": 1.22, "grad_norm": 0.4616861939430237, "learning_rate": 4.2984146127713046e-05, "loss": 2.2654, "step": 6830 }, { "epoch": 1.22, "grad_norm": 1.1081273555755615, "learning_rate": 4.2974376684952414e-05, "loss": 2.059, "step": 6835 }, { "epoch": 1.22, "grad_norm": 0.547261118888855, "learning_rate": 4.296460155691373e-05, "loss": 1.8463, "step": 6840 }, { "epoch": 1.23, "grad_norm": 0.597197949886322, "learning_rate": 4.295482074668885e-05, "loss": 1.8631, "step": 6845 }, { "epoch": 1.23, "grad_norm": 0.4494568705558777, "learning_rate": 4.2945034257371445e-05, "loss": 1.693, "step": 6850 }, { "epoch": 1.23, "grad_norm": 0.7037421464920044, "learning_rate": 4.293524209205696e-05, "loss": 1.7386, "step": 6855 }, { "epoch": 1.23, "grad_norm": 0.509117066860199, "learning_rate": 4.292544425384265e-05, "loss": 1.7397, "step": 6860 }, { "epoch": 1.23, "grad_norm": 0.8861694931983948, "learning_rate": 4.2915640745827576e-05, "loss": 1.8453, "step": 6865 }, { "epoch": 1.23, "grad_norm": 0.8716956973075867, "learning_rate": 4.290583157111255e-05, "loss": 1.8874, "step": 6870 }, { "epoch": 1.23, "grad_norm": 0.608605682849884, "learning_rate": 4.2896016732800235e-05, "loss": 1.6868, "step": 6875 }, { "epoch": 1.23, "grad_norm": 0.43817174434661865, "learning_rate": 4.288619623399503e-05, "loss": 1.7757, "step": 6880 }, { "epoch": 1.23, "grad_norm": 0.7979812622070312, "learning_rate": 4.2876370077803174e-05, "loss": 1.9506, "step": 6885 }, { "epoch": 1.23, "grad_norm": 0.8355962634086609, "learning_rate": 4.286653826733265e-05, "loss": 1.8282, "step": 6890 }, { "epoch": 1.23, "grad_norm": 0.4820092022418976, "learning_rate": 4.285670080569326e-05, "loss": 1.7797, "step": 6895 }, { "epoch": 1.24, "grad_norm": 0.9699495434761047, "learning_rate": 4.284685769599658e-05, "loss": 1.8737, "step": 6900 }, { "epoch": 1.24, "grad_norm": 0.6602120995521545, "learning_rate": 4.283700894135597e-05, "loss": 1.8891, "step": 6905 }, { "epoch": 1.24, "grad_norm": 0.5803684592247009, "learning_rate": 4.282715454488661e-05, "loss": 2.1173, "step": 6910 }, { "epoch": 1.24, "grad_norm": 0.8737105131149292, "learning_rate": 4.2817294509705394e-05, "loss": 1.9188, "step": 6915 }, { "epoch": 1.24, "grad_norm": 1.0067740678787231, "learning_rate": 4.280742883893106e-05, "loss": 1.9328, "step": 6920 }, { "epoch": 1.24, "grad_norm": 0.7453013062477112, "learning_rate": 4.279755753568413e-05, "loss": 2.0628, "step": 6925 }, { "epoch": 1.24, "grad_norm": 0.7748093008995056, "learning_rate": 4.278768060308686e-05, "loss": 1.8465, "step": 6930 }, { "epoch": 1.24, "grad_norm": 0.8429641723632812, "learning_rate": 4.277779804426333e-05, "loss": 1.6647, "step": 6935 }, { "epoch": 1.24, "grad_norm": 0.9702791571617126, "learning_rate": 4.2767909862339374e-05, "loss": 2.1712, "step": 6940 }, { "epoch": 1.24, "grad_norm": 3.3242080211639404, "learning_rate": 4.275801606044262e-05, "loss": 1.812, "step": 6945 }, { "epoch": 1.24, "grad_norm": 0.5527956485748291, "learning_rate": 4.2748116641702466e-05, "loss": 1.7873, "step": 6950 }, { "epoch": 1.24, "grad_norm": 0.6284675002098083, "learning_rate": 4.27382116092501e-05, "loss": 1.9656, "step": 6955 }, { "epoch": 1.25, "grad_norm": 10.499287605285645, "learning_rate": 4.272830096621845e-05, "loss": 1.724, "step": 6960 }, { "epoch": 1.25, "grad_norm": 0.9335476756095886, "learning_rate": 4.271838471574227e-05, "loss": 1.9242, "step": 6965 }, { "epoch": 1.25, "grad_norm": 0.3686840832233429, "learning_rate": 4.270846286095803e-05, "loss": 2.0095, "step": 6970 }, { "epoch": 1.25, "grad_norm": 0.7806186676025391, "learning_rate": 4.2698535405004034e-05, "loss": 1.7676, "step": 6975 }, { "epoch": 1.25, "grad_norm": 1.4177755117416382, "learning_rate": 4.2688602351020315e-05, "loss": 1.824, "step": 6980 }, { "epoch": 1.25, "grad_norm": 0.5843412280082703, "learning_rate": 4.267866370214868e-05, "loss": 1.9222, "step": 6985 }, { "epoch": 1.25, "grad_norm": 0.8569367527961731, "learning_rate": 4.266871946153272e-05, "loss": 1.8849, "step": 6990 }, { "epoch": 1.25, "grad_norm": 0.5695585012435913, "learning_rate": 4.265876963231779e-05, "loss": 2.0957, "step": 6995 }, { "epoch": 1.25, "grad_norm": 6.371403694152832, "learning_rate": 4.2648814217651013e-05, "loss": 1.9637, "step": 7000 }, { "epoch": 1.25, "grad_norm": 0.9307239651679993, "learning_rate": 4.2638853220681276e-05, "loss": 2.0494, "step": 7005 }, { "epoch": 1.25, "grad_norm": 1.0416122674942017, "learning_rate": 4.262888664455923e-05, "loss": 1.9864, "step": 7010 }, { "epoch": 1.26, "grad_norm": 0.673014223575592, "learning_rate": 4.261891449243729e-05, "loss": 1.7286, "step": 7015 }, { "epoch": 1.26, "grad_norm": 0.7358102202415466, "learning_rate": 4.2608936767469655e-05, "loss": 1.7191, "step": 7020 }, { "epoch": 1.26, "grad_norm": 0.7470425367355347, "learning_rate": 4.259895347281225e-05, "loss": 1.6247, "step": 7025 }, { "epoch": 1.26, "grad_norm": 0.47723567485809326, "learning_rate": 4.258896461162279e-05, "loss": 1.7392, "step": 7030 }, { "epoch": 1.26, "grad_norm": 0.9144250154495239, "learning_rate": 4.257897018706074e-05, "loss": 1.8239, "step": 7035 }, { "epoch": 1.26, "grad_norm": 0.484836608171463, "learning_rate": 4.2568970202287334e-05, "loss": 1.6783, "step": 7040 }, { "epoch": 1.26, "grad_norm": 1.3341679573059082, "learning_rate": 4.255896466046555e-05, "loss": 1.8347, "step": 7045 }, { "epoch": 1.26, "grad_norm": 0.928821861743927, "learning_rate": 4.254895356476013e-05, "loss": 1.8622, "step": 7050 }, { "epoch": 1.26, "grad_norm": 0.6659140586853027, "learning_rate": 4.253893691833758e-05, "loss": 2.1149, "step": 7055 }, { "epoch": 1.26, "grad_norm": 0.630410373210907, "learning_rate": 4.252891472436616e-05, "loss": 1.8716, "step": 7060 }, { "epoch": 1.26, "grad_norm": 0.746567964553833, "learning_rate": 4.251888698601587e-05, "loss": 1.8274, "step": 7065 }, { "epoch": 1.27, "grad_norm": 0.6550970673561096, "learning_rate": 4.250885370645847e-05, "loss": 1.9049, "step": 7070 }, { "epoch": 1.27, "grad_norm": 0.8050583004951477, "learning_rate": 4.249881488886749e-05, "loss": 1.9531, "step": 7075 }, { "epoch": 1.27, "grad_norm": 0.8060429692268372, "learning_rate": 4.248877053641819e-05, "loss": 1.84, "step": 7080 }, { "epoch": 1.27, "grad_norm": 0.6672247648239136, "learning_rate": 4.2478720652287596e-05, "loss": 1.9365, "step": 7085 }, { "epoch": 1.27, "grad_norm": 0.9606338739395142, "learning_rate": 4.246866523965446e-05, "loss": 1.8334, "step": 7090 }, { "epoch": 1.27, "grad_norm": 0.8220806121826172, "learning_rate": 4.2458604301699324e-05, "loss": 2.2147, "step": 7095 }, { "epoch": 1.27, "grad_norm": 0.5984706282615662, "learning_rate": 4.2448537841604435e-05, "loss": 1.9588, "step": 7100 }, { "epoch": 1.27, "grad_norm": 0.6690345406532288, "learning_rate": 4.243846586255381e-05, "loss": 1.8757, "step": 7105 }, { "epoch": 1.27, "grad_norm": 2.075214385986328, "learning_rate": 4.242838836773321e-05, "loss": 1.7474, "step": 7110 }, { "epoch": 1.27, "grad_norm": 0.8031553626060486, "learning_rate": 4.2418305360330134e-05, "loss": 1.6749, "step": 7115 }, { "epoch": 1.27, "grad_norm": 0.6923268437385559, "learning_rate": 4.240821684353382e-05, "loss": 1.7302, "step": 7120 }, { "epoch": 1.28, "grad_norm": 0.5841795206069946, "learning_rate": 4.2398122820535266e-05, "loss": 1.7478, "step": 7125 }, { "epoch": 1.28, "grad_norm": 0.6711515188217163, "learning_rate": 4.2388023294527204e-05, "loss": 2.0452, "step": 7130 }, { "epoch": 1.28, "grad_norm": 0.6497505903244019, "learning_rate": 4.237791826870409e-05, "loss": 1.7706, "step": 7135 }, { "epoch": 1.28, "grad_norm": 0.7864512801170349, "learning_rate": 4.236780774626215e-05, "loss": 2.1351, "step": 7140 }, { "epoch": 1.28, "grad_norm": 0.8343969583511353, "learning_rate": 4.235769173039932e-05, "loss": 1.7934, "step": 7145 }, { "epoch": 1.28, "grad_norm": 0.734727144241333, "learning_rate": 4.234959496459615e-05, "loss": 1.9032, "step": 7150 }, { "epoch": 1.28, "grad_norm": 1.37156343460083, "learning_rate": 4.233946906864011e-05, "loss": 2.0043, "step": 7155 }, { "epoch": 1.28, "grad_norm": 0.48946473002433777, "learning_rate": 4.232933768822668e-05, "loss": 2.1292, "step": 7160 }, { "epoch": 1.28, "grad_norm": 0.64715576171875, "learning_rate": 4.231920082656041e-05, "loss": 1.7692, "step": 7165 }, { "epoch": 1.28, "grad_norm": 0.7342175245285034, "learning_rate": 4.230905848684757e-05, "loss": 1.8173, "step": 7170 }, { "epoch": 1.28, "grad_norm": 1.2754048109054565, "learning_rate": 4.2298910672296186e-05, "loss": 1.6126, "step": 7175 }, { "epoch": 1.29, "grad_norm": 0.9194117784500122, "learning_rate": 4.2288757386115994e-05, "loss": 1.9669, "step": 7180 }, { "epoch": 1.29, "grad_norm": 3.632533550262451, "learning_rate": 4.227859863151847e-05, "loss": 1.7307, "step": 7185 }, { "epoch": 1.29, "grad_norm": 0.4630647599697113, "learning_rate": 4.226843441171682e-05, "loss": 1.6671, "step": 7190 }, { "epoch": 1.29, "grad_norm": 0.5320610404014587, "learning_rate": 4.225826472992598e-05, "loss": 1.8363, "step": 7195 }, { "epoch": 1.29, "grad_norm": 0.6387033462524414, "learning_rate": 4.2248089589362593e-05, "loss": 2.0306, "step": 7200 }, { "epoch": 1.29, "grad_norm": 0.5456752181053162, "learning_rate": 4.223790899324507e-05, "loss": 1.7391, "step": 7205 }, { "epoch": 1.29, "grad_norm": 6.061885833740234, "learning_rate": 4.2227722944793515e-05, "loss": 1.9154, "step": 7210 }, { "epoch": 1.29, "grad_norm": 0.6948996186256409, "learning_rate": 4.221753144722976e-05, "loss": 1.8669, "step": 7215 }, { "epoch": 1.29, "grad_norm": 0.5726954936981201, "learning_rate": 4.220733450377737e-05, "loss": 2.0606, "step": 7220 }, { "epoch": 1.29, "grad_norm": 0.9554140567779541, "learning_rate": 4.219713211766162e-05, "loss": 1.8438, "step": 7225 }, { "epoch": 1.29, "grad_norm": 0.6293787360191345, "learning_rate": 4.218692429210953e-05, "loss": 1.825, "step": 7230 }, { "epoch": 1.3, "grad_norm": 1.0556727647781372, "learning_rate": 4.217671103034982e-05, "loss": 1.7117, "step": 7235 }, { "epoch": 1.3, "grad_norm": 0.6158782243728638, "learning_rate": 4.216649233561293e-05, "loss": 1.9096, "step": 7240 }, { "epoch": 1.3, "grad_norm": 0.9434940218925476, "learning_rate": 4.2156268211131027e-05, "loss": 1.8082, "step": 7245 }, { "epoch": 1.3, "grad_norm": 0.7455072402954102, "learning_rate": 4.214603866013799e-05, "loss": 1.9597, "step": 7250 }, { "epoch": 1.3, "grad_norm": 0.912811279296875, "learning_rate": 4.213580368586942e-05, "loss": 1.8557, "step": 7255 }, { "epoch": 1.3, "grad_norm": 0.6361580491065979, "learning_rate": 4.212556329156263e-05, "loss": 2.0177, "step": 7260 }, { "epoch": 1.3, "grad_norm": 6.6606597900390625, "learning_rate": 4.211531748045664e-05, "loss": 1.7674, "step": 7265 }, { "epoch": 1.3, "grad_norm": 0.5496176481246948, "learning_rate": 4.2105066255792185e-05, "loss": 2.1907, "step": 7270 }, { "epoch": 1.3, "grad_norm": 0.6689378023147583, "learning_rate": 4.209480962081174e-05, "loss": 1.7675, "step": 7275 }, { "epoch": 1.3, "grad_norm": 0.8176302909851074, "learning_rate": 4.208454757875945e-05, "loss": 1.6746, "step": 7280 }, { "epoch": 1.3, "grad_norm": 0.5315504670143127, "learning_rate": 4.2074280132881184e-05, "loss": 1.6911, "step": 7285 }, { "epoch": 1.3, "grad_norm": 0.6745172739028931, "learning_rate": 4.2064007286424545e-05, "loss": 1.786, "step": 7290 }, { "epoch": 1.31, "grad_norm": 0.8373345136642456, "learning_rate": 4.205372904263881e-05, "loss": 2.0911, "step": 7295 }, { "epoch": 1.31, "grad_norm": 0.54158616065979, "learning_rate": 4.204344540477499e-05, "loss": 1.7611, "step": 7300 }, { "epoch": 1.31, "grad_norm": 0.6230586767196655, "learning_rate": 4.203315637608578e-05, "loss": 1.7406, "step": 7305 }, { "epoch": 1.31, "grad_norm": 2.31952166557312, "learning_rate": 4.202286195982559e-05, "loss": 1.8494, "step": 7310 }, { "epoch": 1.31, "grad_norm": 0.6334202885627747, "learning_rate": 4.201256215925054e-05, "loss": 1.6764, "step": 7315 }, { "epoch": 1.31, "grad_norm": 0.43206092715263367, "learning_rate": 4.200225697761844e-05, "loss": 1.9862, "step": 7320 }, { "epoch": 1.31, "grad_norm": 0.6146365404129028, "learning_rate": 4.1991946418188805e-05, "loss": 2.0393, "step": 7325 }, { "epoch": 1.31, "grad_norm": 0.48394322395324707, "learning_rate": 4.198163048422287e-05, "loss": 1.8087, "step": 7330 }, { "epoch": 1.31, "grad_norm": 0.8574031591415405, "learning_rate": 4.197130917898355e-05, "loss": 1.9837, "step": 7335 }, { "epoch": 1.31, "grad_norm": 0.4108412563800812, "learning_rate": 4.196098250573545e-05, "loss": 2.2631, "step": 7340 }, { "epoch": 1.31, "grad_norm": 0.6564179062843323, "learning_rate": 4.195065046774491e-05, "loss": 2.0216, "step": 7345 }, { "epoch": 1.32, "grad_norm": 0.3432505130767822, "learning_rate": 4.194031306827993e-05, "loss": 2.1613, "step": 7350 }, { "epoch": 1.32, "grad_norm": 0.463270366191864, "learning_rate": 4.192997031061021e-05, "loss": 1.6699, "step": 7355 }, { "epoch": 1.32, "grad_norm": 0.4397541284561157, "learning_rate": 4.1919622198007176e-05, "loss": 1.7908, "step": 7360 }, { "epoch": 1.32, "grad_norm": 0.724583089351654, "learning_rate": 4.190926873374391e-05, "loss": 1.9014, "step": 7365 }, { "epoch": 1.32, "grad_norm": 1.0059683322906494, "learning_rate": 4.189890992109521e-05, "loss": 1.6797, "step": 7370 }, { "epoch": 1.32, "grad_norm": 1.4795385599136353, "learning_rate": 4.1888545763337554e-05, "loss": 1.8065, "step": 7375 }, { "epoch": 1.32, "grad_norm": 0.9191558361053467, "learning_rate": 4.187817626374912e-05, "loss": 1.9115, "step": 7380 }, { "epoch": 1.32, "grad_norm": 0.3600243628025055, "learning_rate": 4.186780142560977e-05, "loss": 2.096, "step": 7385 }, { "epoch": 1.32, "grad_norm": 0.8530924320220947, "learning_rate": 4.185742125220106e-05, "loss": 1.5549, "step": 7390 }, { "epoch": 1.32, "grad_norm": 0.6997917890548706, "learning_rate": 4.1847035746806205e-05, "loss": 1.9537, "step": 7395 }, { "epoch": 1.32, "grad_norm": 0.5921128392219543, "learning_rate": 4.1836644912710166e-05, "loss": 2.0546, "step": 7400 }, { "epoch": 1.33, "grad_norm": 0.5461764335632324, "learning_rate": 4.1826248753199526e-05, "loss": 1.5721, "step": 7405 }, { "epoch": 1.33, "grad_norm": 0.8535491228103638, "learning_rate": 4.1815847271562594e-05, "loss": 1.8103, "step": 7410 }, { "epoch": 1.33, "grad_norm": 0.9533465504646301, "learning_rate": 4.180544047108934e-05, "loss": 1.87, "step": 7415 }, { "epoch": 1.33, "grad_norm": 0.49469488859176636, "learning_rate": 4.179502835507143e-05, "loss": 1.8931, "step": 7420 }, { "epoch": 1.33, "grad_norm": 0.4135587215423584, "learning_rate": 4.1784610926802214e-05, "loss": 2.1653, "step": 7425 }, { "epoch": 1.33, "grad_norm": 0.6948500871658325, "learning_rate": 4.1774188189576705e-05, "loss": 1.8297, "step": 7430 }, { "epoch": 1.33, "grad_norm": 0.7468460202217102, "learning_rate": 4.17637601466916e-05, "loss": 1.9556, "step": 7435 }, { "epoch": 1.33, "grad_norm": 0.8151171207427979, "learning_rate": 4.1753326801445285e-05, "loss": 1.6892, "step": 7440 }, { "epoch": 1.33, "grad_norm": 0.8748107552528381, "learning_rate": 4.174288815713783e-05, "loss": 1.7057, "step": 7445 }, { "epoch": 1.33, "grad_norm": 0.40320098400115967, "learning_rate": 4.1732444217070943e-05, "loss": 1.8913, "step": 7450 }, { "epoch": 1.33, "grad_norm": 0.8582679629325867, "learning_rate": 4.172199498454804e-05, "loss": 1.9204, "step": 7455 }, { "epoch": 1.34, "grad_norm": 0.35141122341156006, "learning_rate": 4.1711540462874224e-05, "loss": 1.9048, "step": 7460 }, { "epoch": 1.34, "grad_norm": 0.42094314098358154, "learning_rate": 4.170108065535622e-05, "loss": 1.9885, "step": 7465 }, { "epoch": 1.34, "grad_norm": 0.6282821297645569, "learning_rate": 4.169061556530248e-05, "loss": 1.7992, "step": 7470 }, { "epoch": 1.34, "grad_norm": 0.7469441890716553, "learning_rate": 4.168014519602309e-05, "loss": 1.9007, "step": 7475 }, { "epoch": 1.34, "grad_norm": 0.5168988704681396, "learning_rate": 4.1669669550829815e-05, "loss": 1.8918, "step": 7480 }, { "epoch": 1.34, "grad_norm": 0.7953056693077087, "learning_rate": 4.165918863303609e-05, "loss": 2.1881, "step": 7485 }, { "epoch": 1.34, "grad_norm": 0.6245642900466919, "learning_rate": 4.164870244595703e-05, "loss": 1.9878, "step": 7490 }, { "epoch": 1.34, "grad_norm": 1.3559062480926514, "learning_rate": 4.16382109929094e-05, "loss": 2.1096, "step": 7495 }, { "epoch": 1.34, "grad_norm": 1.1444271802902222, "learning_rate": 4.162771427721164e-05, "loss": 2.078, "step": 7500 }, { "epoch": 1.34, "grad_norm": 0.5032902359962463, "learning_rate": 4.161721230218384e-05, "loss": 1.814, "step": 7505 }, { "epoch": 1.34, "grad_norm": 1.2069705724716187, "learning_rate": 4.1606705071147775e-05, "loss": 1.8121, "step": 7510 }, { "epoch": 1.35, "grad_norm": 0.5158296823501587, "learning_rate": 4.1596192587426874e-05, "loss": 1.7489, "step": 7515 }, { "epoch": 1.35, "grad_norm": 0.6261261701583862, "learning_rate": 4.158567485434622e-05, "loss": 1.8973, "step": 7520 }, { "epoch": 1.35, "grad_norm": 1.6703084707260132, "learning_rate": 4.1575151875232565e-05, "loss": 2.0307, "step": 7525 }, { "epoch": 1.35, "grad_norm": 0.8123947381973267, "learning_rate": 4.156462365341431e-05, "loss": 1.6317, "step": 7530 }, { "epoch": 1.35, "grad_norm": 1.7036327123641968, "learning_rate": 4.1554090192221525e-05, "loss": 1.4194, "step": 7535 }, { "epoch": 1.35, "grad_norm": 0.5714519023895264, "learning_rate": 4.154355149498594e-05, "loss": 2.0492, "step": 7540 }, { "epoch": 1.35, "grad_norm": 0.5328961610794067, "learning_rate": 4.153300756504093e-05, "loss": 1.8173, "step": 7545 }, { "epoch": 1.35, "grad_norm": 0.6683387756347656, "learning_rate": 4.152245840572153e-05, "loss": 2.0118, "step": 7550 }, { "epoch": 1.35, "grad_norm": 0.5071770548820496, "learning_rate": 4.151190402036443e-05, "loss": 1.9307, "step": 7555 }, { "epoch": 1.35, "grad_norm": 0.5669893622398376, "learning_rate": 4.150134441230797e-05, "loss": 2.0048, "step": 7560 }, { "epoch": 1.35, "grad_norm": 0.7352867722511292, "learning_rate": 4.149077958489214e-05, "loss": 2.0041, "step": 7565 }, { "epoch": 1.35, "grad_norm": 0.7905187010765076, "learning_rate": 4.148020954145859e-05, "loss": 2.0104, "step": 7570 }, { "epoch": 1.36, "grad_norm": 0.7344765067100525, "learning_rate": 4.146963428535062e-05, "loss": 1.8857, "step": 7575 }, { "epoch": 1.36, "grad_norm": 0.9353702664375305, "learning_rate": 4.1459053819913164e-05, "loss": 1.9718, "step": 7580 }, { "epoch": 1.36, "grad_norm": 0.8625378012657166, "learning_rate": 4.144846814849282e-05, "loss": 1.9804, "step": 7585 }, { "epoch": 1.36, "grad_norm": 0.47366541624069214, "learning_rate": 4.1437877274437817e-05, "loss": 1.7501, "step": 7590 }, { "epoch": 1.36, "grad_norm": 0.9116063117980957, "learning_rate": 4.1427281201098046e-05, "loss": 1.4518, "step": 7595 }, { "epoch": 1.36, "grad_norm": 1.8927298784255981, "learning_rate": 4.141667993182503e-05, "loss": 1.7712, "step": 7600 }, { "epoch": 1.36, "grad_norm": 0.8056049942970276, "learning_rate": 4.140607346997194e-05, "loss": 1.6893, "step": 7605 }, { "epoch": 1.36, "grad_norm": 2.4493560791015625, "learning_rate": 4.13954618188936e-05, "loss": 1.5022, "step": 7610 }, { "epoch": 1.36, "grad_norm": 0.4495934844017029, "learning_rate": 4.138484498194645e-05, "loss": 2.0982, "step": 7615 }, { "epoch": 1.36, "grad_norm": 0.6876751780509949, "learning_rate": 4.1374222962488594e-05, "loss": 1.8517, "step": 7620 }, { "epoch": 1.36, "grad_norm": 0.509681224822998, "learning_rate": 4.136359576387976e-05, "loss": 1.9499, "step": 7625 }, { "epoch": 1.37, "grad_norm": 0.3566737771034241, "learning_rate": 4.135296338948134e-05, "loss": 2.1347, "step": 7630 }, { "epoch": 1.37, "grad_norm": 0.6193088889122009, "learning_rate": 4.1342325842656315e-05, "loss": 1.9541, "step": 7635 }, { "epoch": 1.37, "grad_norm": 0.8318669199943542, "learning_rate": 4.133168312676935e-05, "loss": 1.6854, "step": 7640 }, { "epoch": 1.37, "grad_norm": 0.5070250034332275, "learning_rate": 4.1321035245186726e-05, "loss": 1.8134, "step": 7645 }, { "epoch": 1.37, "grad_norm": 6.577232360839844, "learning_rate": 4.1310382201276354e-05, "loss": 2.0211, "step": 7650 }, { "epoch": 1.37, "grad_norm": 1.0688321590423584, "learning_rate": 4.1299723998407774e-05, "loss": 1.5149, "step": 7655 }, { "epoch": 1.37, "grad_norm": 3.326978921890259, "learning_rate": 4.128906063995217e-05, "loss": 2.042, "step": 7660 }, { "epoch": 1.37, "grad_norm": 0.7662460803985596, "learning_rate": 4.127839212928236e-05, "loss": 1.7589, "step": 7665 }, { "epoch": 1.37, "grad_norm": 0.7111654877662659, "learning_rate": 4.126771846977277e-05, "loss": 1.8719, "step": 7670 }, { "epoch": 1.37, "grad_norm": 0.8834118843078613, "learning_rate": 4.125703966479948e-05, "loss": 2.0589, "step": 7675 }, { "epoch": 1.37, "grad_norm": 0.599921464920044, "learning_rate": 4.124635571774018e-05, "loss": 1.9423, "step": 7680 }, { "epoch": 1.38, "grad_norm": 0.6738287806510925, "learning_rate": 4.12356666319742e-05, "loss": 1.9728, "step": 7685 }, { "epoch": 1.38, "grad_norm": 0.6264171004295349, "learning_rate": 4.122497241088247e-05, "loss": 1.961, "step": 7690 }, { "epoch": 1.38, "grad_norm": 0.831802487373352, "learning_rate": 4.121427305784758e-05, "loss": 2.0417, "step": 7695 }, { "epoch": 1.38, "grad_norm": 0.9367872476577759, "learning_rate": 4.120356857625372e-05, "loss": 1.8298, "step": 7700 }, { "epoch": 1.38, "grad_norm": 0.6761164665222168, "learning_rate": 4.11928589694867e-05, "loss": 1.6786, "step": 7705 }, { "epoch": 1.38, "grad_norm": 0.7731988430023193, "learning_rate": 4.118214424093396e-05, "loss": 1.8711, "step": 7710 }, { "epoch": 1.38, "grad_norm": 0.530742347240448, "learning_rate": 4.117142439398457e-05, "loss": 2.009, "step": 7715 }, { "epoch": 1.38, "grad_norm": 0.6078718900680542, "learning_rate": 4.116069943202919e-05, "loss": 1.9871, "step": 7720 }, { "epoch": 1.38, "grad_norm": 0.6330639719963074, "learning_rate": 4.114996935846014e-05, "loss": 1.7457, "step": 7725 }, { "epoch": 1.38, "grad_norm": 0.661787211894989, "learning_rate": 4.113923417667131e-05, "loss": 1.7891, "step": 7730 }, { "epoch": 1.38, "grad_norm": 0.9162965416908264, "learning_rate": 4.112849389005823e-05, "loss": 1.9484, "step": 7735 }, { "epoch": 1.39, "grad_norm": 0.8108046054840088, "learning_rate": 4.1117748502018036e-05, "loss": 1.5814, "step": 7740 }, { "epoch": 1.39, "grad_norm": 0.5153409838676453, "learning_rate": 4.110699801594951e-05, "loss": 1.4343, "step": 7745 }, { "epoch": 1.39, "grad_norm": 0.30527204275131226, "learning_rate": 4.1096242435253e-05, "loss": 2.0052, "step": 7750 }, { "epoch": 1.39, "grad_norm": 0.5223674178123474, "learning_rate": 4.108548176333049e-05, "loss": 2.1677, "step": 7755 }, { "epoch": 1.39, "grad_norm": 0.5189133286476135, "learning_rate": 4.107471600358557e-05, "loss": 1.9642, "step": 7760 }, { "epoch": 1.39, "grad_norm": 0.6149393320083618, "learning_rate": 4.1063945159423433e-05, "loss": 1.8314, "step": 7765 }, { "epoch": 1.39, "grad_norm": 0.6912689805030823, "learning_rate": 4.105316923425091e-05, "loss": 1.9248, "step": 7770 }, { "epoch": 1.39, "grad_norm": 0.5106381177902222, "learning_rate": 4.1042388231476384e-05, "loss": 2.0442, "step": 7775 }, { "epoch": 1.39, "grad_norm": 0.4784005880355835, "learning_rate": 4.10316021545099e-05, "loss": 1.7323, "step": 7780 }, { "epoch": 1.39, "grad_norm": 0.8782150149345398, "learning_rate": 4.102081100676307e-05, "loss": 1.8736, "step": 7785 }, { "epoch": 1.39, "grad_norm": 1.3147215843200684, "learning_rate": 4.101001479164913e-05, "loss": 1.7307, "step": 7790 }, { "epoch": 1.4, "grad_norm": 0.5481510758399963, "learning_rate": 4.099921351258292e-05, "loss": 1.6104, "step": 7795 }, { "epoch": 1.4, "grad_norm": 0.617058515548706, "learning_rate": 4.098840717298085e-05, "loss": 1.9166, "step": 7800 }, { "epoch": 1.4, "grad_norm": 2.2154157161712646, "learning_rate": 4.097759577626098e-05, "loss": 1.8153, "step": 7805 }, { "epoch": 1.4, "grad_norm": 0.7002790570259094, "learning_rate": 4.096677932584293e-05, "loss": 1.7273, "step": 7810 }, { "epoch": 1.4, "grad_norm": 1.020469069480896, "learning_rate": 4.095595782514794e-05, "loss": 2.0839, "step": 7815 }, { "epoch": 1.4, "grad_norm": 0.7469928860664368, "learning_rate": 4.094513127759883e-05, "loss": 1.7802, "step": 7820 }, { "epoch": 1.4, "grad_norm": 0.54106605052948, "learning_rate": 4.093429968662005e-05, "loss": 1.7512, "step": 7825 }, { "epoch": 1.4, "grad_norm": 0.8594502210617065, "learning_rate": 4.09234630556376e-05, "loss": 1.9866, "step": 7830 }, { "epoch": 1.4, "grad_norm": 0.5994930267333984, "learning_rate": 4.09126213880791e-05, "loss": 1.8757, "step": 7835 }, { "epoch": 1.4, "grad_norm": 1.0593057870864868, "learning_rate": 4.090177468737375e-05, "loss": 1.6099, "step": 7840 }, { "epoch": 1.4, "grad_norm": 0.458686888217926, "learning_rate": 4.089092295695238e-05, "loss": 2.0074, "step": 7845 }, { "epoch": 1.41, "grad_norm": 0.5471193790435791, "learning_rate": 4.0880066200247346e-05, "loss": 2.0962, "step": 7850 }, { "epoch": 1.41, "grad_norm": 0.5086942315101624, "learning_rate": 4.0869204420692665e-05, "loss": 1.5895, "step": 7855 }, { "epoch": 1.41, "grad_norm": 0.7652096748352051, "learning_rate": 4.085833762172387e-05, "loss": 2.0743, "step": 7860 }, { "epoch": 1.41, "grad_norm": 2.0196499824523926, "learning_rate": 4.084746580677816e-05, "loss": 2.0508, "step": 7865 }, { "epoch": 1.41, "grad_norm": 0.541129469871521, "learning_rate": 4.083658897929426e-05, "loss": 1.857, "step": 7870 }, { "epoch": 1.41, "grad_norm": 2.4016308784484863, "learning_rate": 4.082570714271248e-05, "loss": 1.669, "step": 7875 }, { "epoch": 1.41, "grad_norm": 0.5275091528892517, "learning_rate": 4.081482030047476e-05, "loss": 1.9097, "step": 7880 }, { "epoch": 1.41, "grad_norm": 0.7216492295265198, "learning_rate": 4.080392845602459e-05, "loss": 1.9592, "step": 7885 }, { "epoch": 1.41, "grad_norm": 0.5998548865318298, "learning_rate": 4.079303161280706e-05, "loss": 1.7357, "step": 7890 }, { "epoch": 1.41, "grad_norm": 0.7786138653755188, "learning_rate": 4.0782129774268815e-05, "loss": 2.0231, "step": 7895 }, { "epoch": 1.41, "grad_norm": 0.4815676510334015, "learning_rate": 4.0771222943858104e-05, "loss": 1.8351, "step": 7900 }, { "epoch": 1.41, "grad_norm": 0.5540018677711487, "learning_rate": 4.076031112502474e-05, "loss": 1.8817, "step": 7905 }, { "epoch": 1.42, "grad_norm": 1.1450433731079102, "learning_rate": 4.074939432122014e-05, "loss": 2.1311, "step": 7910 }, { "epoch": 1.42, "grad_norm": 0.612622857093811, "learning_rate": 4.073847253589725e-05, "loss": 1.9964, "step": 7915 }, { "epoch": 1.42, "grad_norm": 0.6572409868240356, "learning_rate": 4.0727545772510645e-05, "loss": 1.8965, "step": 7920 }, { "epoch": 1.42, "grad_norm": 0.8887383937835693, "learning_rate": 4.071661403451643e-05, "loss": 1.9419, "step": 7925 }, { "epoch": 1.42, "grad_norm": 0.8722497820854187, "learning_rate": 4.070567732537232e-05, "loss": 2.0361, "step": 7930 }, { "epoch": 1.42, "grad_norm": 0.48383402824401855, "learning_rate": 4.0694735648537584e-05, "loss": 1.7524, "step": 7935 }, { "epoch": 1.42, "grad_norm": 0.5321690440177917, "learning_rate": 4.0683789007473055e-05, "loss": 1.8623, "step": 7940 }, { "epoch": 1.42, "grad_norm": 0.6886895298957825, "learning_rate": 4.067283740564115e-05, "loss": 1.5339, "step": 7945 }, { "epoch": 1.42, "grad_norm": 1.0236868858337402, "learning_rate": 4.066188084650584e-05, "loss": 1.8861, "step": 7950 }, { "epoch": 1.42, "grad_norm": 0.5006799697875977, "learning_rate": 4.065091933353269e-05, "loss": 2.1672, "step": 7955 }, { "epoch": 1.42, "grad_norm": 1.0537916421890259, "learning_rate": 4.0639952870188815e-05, "loss": 1.8532, "step": 7960 }, { "epoch": 1.43, "grad_norm": 0.6310824155807495, "learning_rate": 4.062898145994288e-05, "loss": 1.9918, "step": 7965 }, { "epoch": 1.43, "grad_norm": 0.6223093271255493, "learning_rate": 4.061800510626515e-05, "loss": 1.8719, "step": 7970 }, { "epoch": 1.43, "grad_norm": 2.3217649459838867, "learning_rate": 4.060702381262742e-05, "loss": 1.9299, "step": 7975 }, { "epoch": 1.43, "grad_norm": 0.998820960521698, "learning_rate": 4.0596037582503084e-05, "loss": 1.8601, "step": 7980 }, { "epoch": 1.43, "grad_norm": 0.5109826922416687, "learning_rate": 4.0585046419367053e-05, "loss": 1.8392, "step": 7985 }, { "epoch": 1.43, "grad_norm": 0.45510098338127136, "learning_rate": 4.057405032669582e-05, "loss": 1.8877, "step": 7990 }, { "epoch": 1.43, "grad_norm": 0.46888962388038635, "learning_rate": 4.056304930796746e-05, "loss": 1.783, "step": 7995 }, { "epoch": 1.43, "grad_norm": 6.063878059387207, "learning_rate": 4.055204336666158e-05, "loss": 2.0542, "step": 8000 }, { "epoch": 1.43, "grad_norm": 0.7420514822006226, "learning_rate": 4.054103250625933e-05, "loss": 2.0758, "step": 8005 }, { "epoch": 1.43, "grad_norm": 0.3912978768348694, "learning_rate": 4.053001673024346e-05, "loss": 1.8226, "step": 8010 }, { "epoch": 1.43, "grad_norm": 0.851407527923584, "learning_rate": 4.051899604209823e-05, "loss": 1.8753, "step": 8015 }, { "epoch": 1.44, "grad_norm": 1.0191857814788818, "learning_rate": 4.0507970445309476e-05, "loss": 2.0584, "step": 8020 }, { "epoch": 1.44, "grad_norm": 0.5699238181114197, "learning_rate": 4.049693994336459e-05, "loss": 1.9835, "step": 8025 }, { "epoch": 1.44, "grad_norm": 0.5822308659553528, "learning_rate": 4.0485904539752503e-05, "loss": 1.8653, "step": 8030 }, { "epoch": 1.44, "grad_norm": 0.7040959000587463, "learning_rate": 4.0474864237963704e-05, "loss": 1.7044, "step": 8035 }, { "epoch": 1.44, "grad_norm": 0.6993011832237244, "learning_rate": 4.046381904149024e-05, "loss": 1.733, "step": 8040 }, { "epoch": 1.44, "grad_norm": 0.9968368411064148, "learning_rate": 4.045276895382568e-05, "loss": 1.8138, "step": 8045 }, { "epoch": 1.44, "grad_norm": 0.7696248888969421, "learning_rate": 4.044171397846517e-05, "loss": 1.8775, "step": 8050 }, { "epoch": 1.44, "grad_norm": 0.6908630728721619, "learning_rate": 4.0430654118905374e-05, "loss": 1.8886, "step": 8055 }, { "epoch": 1.44, "grad_norm": 0.7053937315940857, "learning_rate": 4.041958937864453e-05, "loss": 1.6934, "step": 8060 }, { "epoch": 1.44, "grad_norm": 0.7439703941345215, "learning_rate": 4.040851976118239e-05, "loss": 1.8191, "step": 8065 }, { "epoch": 1.44, "grad_norm": 0.9062896370887756, "learning_rate": 4.039744527002027e-05, "loss": 1.6822, "step": 8070 }, { "epoch": 1.45, "grad_norm": 0.5621775388717651, "learning_rate": 4.038636590866103e-05, "loss": 1.7791, "step": 8075 }, { "epoch": 1.45, "grad_norm": 0.6721243858337402, "learning_rate": 4.0375281680609046e-05, "loss": 1.9383, "step": 8080 }, { "epoch": 1.45, "grad_norm": 0.5987261533737183, "learning_rate": 4.036419258937026e-05, "loss": 2.1479, "step": 8085 }, { "epoch": 1.45, "grad_norm": 0.3930743634700775, "learning_rate": 4.0353098638452134e-05, "loss": 2.0115, "step": 8090 }, { "epoch": 1.45, "grad_norm": 0.7518051862716675, "learning_rate": 4.0341999831363676e-05, "loss": 2.1983, "step": 8095 }, { "epoch": 1.45, "grad_norm": 1.2611430883407593, "learning_rate": 4.0330896171615434e-05, "loss": 1.9812, "step": 8100 }, { "epoch": 1.45, "grad_norm": 0.8846983313560486, "learning_rate": 4.031978766271948e-05, "loss": 1.7216, "step": 8105 }, { "epoch": 1.45, "grad_norm": 1.0537869930267334, "learning_rate": 4.030867430818941e-05, "loss": 1.9612, "step": 8110 }, { "epoch": 1.45, "grad_norm": 0.5428566336631775, "learning_rate": 4.0297556111540393e-05, "loss": 1.9787, "step": 8115 }, { "epoch": 1.45, "grad_norm": 0.7369449138641357, "learning_rate": 4.028643307628909e-05, "loss": 1.8887, "step": 8120 }, { "epoch": 1.45, "grad_norm": 1.3817769289016724, "learning_rate": 4.027530520595371e-05, "loss": 1.8445, "step": 8125 }, { "epoch": 1.46, "grad_norm": 0.8463249802589417, "learning_rate": 4.026417250405399e-05, "loss": 2.0274, "step": 8130 }, { "epoch": 1.46, "grad_norm": 1.6427886486053467, "learning_rate": 4.025303497411118e-05, "loss": 2.0033, "step": 8135 }, { "epoch": 1.46, "grad_norm": 0.6956122517585754, "learning_rate": 4.024189261964808e-05, "loss": 1.5285, "step": 8140 }, { "epoch": 1.46, "grad_norm": 0.7735291123390198, "learning_rate": 4.023074544418901e-05, "loss": 1.6345, "step": 8145 }, { "epoch": 1.46, "grad_norm": 0.6430752873420715, "learning_rate": 4.021959345125981e-05, "loss": 1.7802, "step": 8150 }, { "epoch": 1.46, "grad_norm": 0.8027344942092896, "learning_rate": 4.0208436644387834e-05, "loss": 1.6531, "step": 8155 }, { "epoch": 1.46, "grad_norm": 2.263455867767334, "learning_rate": 4.019727502710197e-05, "loss": 1.8208, "step": 8160 }, { "epoch": 1.46, "grad_norm": 0.4575340747833252, "learning_rate": 4.018610860293264e-05, "loss": 1.7946, "step": 8165 }, { "epoch": 1.46, "grad_norm": 0.8558323383331299, "learning_rate": 4.017493737541177e-05, "loss": 1.9303, "step": 8170 }, { "epoch": 1.46, "grad_norm": 0.5351092219352722, "learning_rate": 4.0163761348072804e-05, "loss": 1.8344, "step": 8175 }, { "epoch": 1.46, "grad_norm": 0.9965373873710632, "learning_rate": 4.01525805244507e-05, "loss": 1.7591, "step": 8180 }, { "epoch": 1.47, "grad_norm": 0.5350607633590698, "learning_rate": 4.014139490808196e-05, "loss": 1.8909, "step": 8185 }, { "epoch": 1.47, "grad_norm": 0.6248508095741272, "learning_rate": 4.0130204502504575e-05, "loss": 1.7733, "step": 8190 }, { "epoch": 1.47, "grad_norm": 0.5423145890235901, "learning_rate": 4.0119009311258057e-05, "loss": 1.9219, "step": 8195 }, { "epoch": 1.47, "grad_norm": 0.892905056476593, "learning_rate": 4.0107809337883435e-05, "loss": 1.7403, "step": 8200 }, { "epoch": 1.47, "grad_norm": 0.8622453808784485, "learning_rate": 4.0096604585923246e-05, "loss": 1.7071, "step": 8205 }, { "epoch": 1.47, "grad_norm": 0.459806352853775, "learning_rate": 4.008539505892156e-05, "loss": 1.8587, "step": 8210 }, { "epoch": 1.47, "grad_norm": 2.5942025184631348, "learning_rate": 4.007418076042392e-05, "loss": 1.7092, "step": 8215 }, { "epoch": 1.47, "grad_norm": 0.9066646695137024, "learning_rate": 4.0062961693977406e-05, "loss": 1.9558, "step": 8220 }, { "epoch": 1.47, "grad_norm": 1.168616533279419, "learning_rate": 4.0051737863130594e-05, "loss": 1.7656, "step": 8225 }, { "epoch": 1.47, "grad_norm": 0.33232593536376953, "learning_rate": 4.004050927143358e-05, "loss": 1.9195, "step": 8230 }, { "epoch": 1.47, "grad_norm": 0.546242892742157, "learning_rate": 4.002927592243794e-05, "loss": 1.9144, "step": 8235 }, { "epoch": 1.47, "grad_norm": 0.4449472427368164, "learning_rate": 4.00180378196968e-05, "loss": 1.5383, "step": 8240 }, { "epoch": 1.48, "grad_norm": 0.6070382595062256, "learning_rate": 4.000679496676473e-05, "loss": 1.5511, "step": 8245 }, { "epoch": 1.48, "grad_norm": 0.3525921404361725, "learning_rate": 3.9995547367197845e-05, "loss": 1.9672, "step": 8250 }, { "epoch": 1.48, "grad_norm": 2.3565738201141357, "learning_rate": 3.998429502455375e-05, "loss": 1.61, "step": 8255 }, { "epoch": 1.48, "grad_norm": 0.6370993256568909, "learning_rate": 3.9973037942391564e-05, "loss": 1.9055, "step": 8260 }, { "epoch": 1.48, "grad_norm": 0.565390408039093, "learning_rate": 3.996177612427187e-05, "loss": 1.8377, "step": 8265 }, { "epoch": 1.48, "grad_norm": 0.6728577017784119, "learning_rate": 3.995050957375678e-05, "loss": 1.9646, "step": 8270 }, { "epoch": 1.48, "grad_norm": 1.3911441564559937, "learning_rate": 3.99392382944099e-05, "loss": 1.8547, "step": 8275 }, { "epoch": 1.48, "grad_norm": 1.7410084009170532, "learning_rate": 3.992796228979631e-05, "loss": 1.803, "step": 8280 }, { "epoch": 1.48, "grad_norm": 0.7153067588806152, "learning_rate": 3.991668156348261e-05, "loss": 1.7957, "step": 8285 }, { "epoch": 1.48, "grad_norm": 0.6121552586555481, "learning_rate": 3.9905396119036876e-05, "loss": 1.8041, "step": 8290 }, { "epoch": 1.48, "grad_norm": 0.8138649463653564, "learning_rate": 3.989410596002869e-05, "loss": 1.8954, "step": 8295 }, { "epoch": 1.49, "grad_norm": 1.1422274112701416, "learning_rate": 3.9882811090029106e-05, "loss": 1.8859, "step": 8300 }, { "epoch": 1.49, "grad_norm": 0.29510730504989624, "learning_rate": 3.98715115126107e-05, "loss": 1.9913, "step": 8305 }, { "epoch": 1.49, "grad_norm": 0.8496959209442139, "learning_rate": 3.986020723134751e-05, "loss": 1.8877, "step": 8310 }, { "epoch": 1.49, "grad_norm": 0.5750755071640015, "learning_rate": 3.984889824981506e-05, "loss": 1.6277, "step": 8315 }, { "epoch": 1.49, "grad_norm": 0.6985676288604736, "learning_rate": 3.983758457159037e-05, "loss": 1.823, "step": 8320 }, { "epoch": 1.49, "grad_norm": 1.0394619703292847, "learning_rate": 3.982626620025196e-05, "loss": 2.0183, "step": 8325 }, { "epoch": 1.49, "grad_norm": 0.700357973575592, "learning_rate": 3.9814943139379815e-05, "loss": 1.825, "step": 8330 }, { "epoch": 1.49, "grad_norm": 0.3674771785736084, "learning_rate": 3.980361539255541e-05, "loss": 2.0308, "step": 8335 }, { "epoch": 1.49, "grad_norm": 0.899864912033081, "learning_rate": 3.979228296336168e-05, "loss": 1.9477, "step": 8340 }, { "epoch": 1.49, "grad_norm": 0.9185581803321838, "learning_rate": 3.978094585538308e-05, "loss": 1.8853, "step": 8345 }, { "epoch": 1.49, "grad_norm": 0.47985824942588806, "learning_rate": 3.9769604072205524e-05, "loss": 1.742, "step": 8350 }, { "epoch": 1.5, "grad_norm": 0.42667725682258606, "learning_rate": 3.9758257617416414e-05, "loss": 1.935, "step": 8355 }, { "epoch": 1.5, "grad_norm": 1.3752597570419312, "learning_rate": 3.974690649460461e-05, "loss": 1.9357, "step": 8360 }, { "epoch": 1.5, "grad_norm": 0.3676702380180359, "learning_rate": 3.973555070736047e-05, "loss": 1.7689, "step": 8365 }, { "epoch": 1.5, "grad_norm": 0.7576048374176025, "learning_rate": 3.9724190259275804e-05, "loss": 1.9353, "step": 8370 }, { "epoch": 1.5, "grad_norm": 0.8016186356544495, "learning_rate": 3.9712825153943934e-05, "loss": 1.8762, "step": 8375 }, { "epoch": 1.5, "grad_norm": 0.8371986150741577, "learning_rate": 3.970145539495962e-05, "loss": 1.7327, "step": 8380 }, { "epoch": 1.5, "grad_norm": 0.9057819843292236, "learning_rate": 3.9690080985919096e-05, "loss": 1.5471, "step": 8385 }, { "epoch": 1.5, "grad_norm": 1.966324806213379, "learning_rate": 3.9678701930420095e-05, "loss": 1.915, "step": 8390 }, { "epoch": 1.5, "grad_norm": 1.0594321489334106, "learning_rate": 3.966731823206179e-05, "loss": 2.0922, "step": 8395 }, { "epoch": 1.5, "grad_norm": 0.3403851389884949, "learning_rate": 3.965592989444484e-05, "loss": 1.8441, "step": 8400 }, { "epoch": 1.5, "grad_norm": 0.611869215965271, "learning_rate": 3.964453692117136e-05, "loss": 2.1253, "step": 8405 }, { "epoch": 1.51, "grad_norm": 0.7241839170455933, "learning_rate": 3.963313931584493e-05, "loss": 1.7705, "step": 8410 }, { "epoch": 1.51, "grad_norm": 1.3061637878417969, "learning_rate": 3.9621737082070616e-05, "loss": 1.9305, "step": 8415 }, { "epoch": 1.51, "grad_norm": 1.108342170715332, "learning_rate": 3.9610330223454926e-05, "loss": 1.8427, "step": 8420 }, { "epoch": 1.51, "grad_norm": 0.5697190761566162, "learning_rate": 3.959891874360584e-05, "loss": 1.7063, "step": 8425 }, { "epoch": 1.51, "grad_norm": 0.7562960982322693, "learning_rate": 3.9587502646132804e-05, "loss": 2.0017, "step": 8430 }, { "epoch": 1.51, "grad_norm": 1.1858628988265991, "learning_rate": 3.95760819346467e-05, "loss": 1.8896, "step": 8435 }, { "epoch": 1.51, "grad_norm": 0.5219876766204834, "learning_rate": 3.95646566127599e-05, "loss": 1.8532, "step": 8440 }, { "epoch": 1.51, "grad_norm": 0.36081287264823914, "learning_rate": 3.955322668408623e-05, "loss": 1.7424, "step": 8445 }, { "epoch": 1.51, "grad_norm": 1.1630758047103882, "learning_rate": 3.9541792152240965e-05, "loss": 1.6692, "step": 8450 }, { "epoch": 1.51, "grad_norm": 0.41798633337020874, "learning_rate": 3.953035302084082e-05, "loss": 1.8561, "step": 8455 }, { "epoch": 1.51, "grad_norm": 0.932482898235321, "learning_rate": 3.9518909293503994e-05, "loss": 1.9695, "step": 8460 }, { "epoch": 1.52, "grad_norm": 0.6918326616287231, "learning_rate": 3.950746097385012e-05, "loss": 1.8665, "step": 8465 }, { "epoch": 1.52, "grad_norm": 0.8644784688949585, "learning_rate": 3.9496008065500303e-05, "loss": 1.9897, "step": 8470 }, { "epoch": 1.52, "grad_norm": 0.774658203125, "learning_rate": 3.9484550572077075e-05, "loss": 1.7163, "step": 8475 }, { "epoch": 1.52, "grad_norm": 0.9765104651451111, "learning_rate": 3.9473088497204424e-05, "loss": 1.848, "step": 8480 }, { "epoch": 1.52, "grad_norm": 0.5454521775245667, "learning_rate": 3.946162184450781e-05, "loss": 1.8886, "step": 8485 }, { "epoch": 1.52, "grad_norm": 0.514015793800354, "learning_rate": 3.945015061761412e-05, "loss": 2.0871, "step": 8490 }, { "epoch": 1.52, "grad_norm": 0.8409627079963684, "learning_rate": 3.943867482015169e-05, "loss": 1.8131, "step": 8495 }, { "epoch": 1.52, "grad_norm": 0.5961334705352783, "learning_rate": 3.94271944557503e-05, "loss": 1.9685, "step": 8500 }, { "epoch": 1.52, "grad_norm": 1.0173949003219604, "learning_rate": 3.941570952804118e-05, "loss": 1.8924, "step": 8505 }, { "epoch": 1.52, "grad_norm": 0.6633391380310059, "learning_rate": 3.9404220040657e-05, "loss": 1.8846, "step": 8510 }, { "epoch": 1.52, "grad_norm": 12.116934776306152, "learning_rate": 3.9392725997231874e-05, "loss": 1.9804, "step": 8515 }, { "epoch": 1.53, "grad_norm": 0.48671266436576843, "learning_rate": 3.938122740140137e-05, "loss": 1.921, "step": 8520 }, { "epoch": 1.53, "grad_norm": 0.747097373008728, "learning_rate": 3.936972425680246e-05, "loss": 1.962, "step": 8525 }, { "epoch": 1.53, "grad_norm": 0.5572634935379028, "learning_rate": 3.935821656707359e-05, "loss": 1.6872, "step": 8530 }, { "epoch": 1.53, "grad_norm": 0.41499078273773193, "learning_rate": 3.934670433585464e-05, "loss": 1.7176, "step": 8535 }, { "epoch": 1.53, "grad_norm": 0.8987358212471008, "learning_rate": 3.93351875667869e-05, "loss": 2.0023, "step": 8540 }, { "epoch": 1.53, "grad_norm": 0.6154505610466003, "learning_rate": 3.932366626351313e-05, "loss": 2.1713, "step": 8545 }, { "epoch": 1.53, "grad_norm": 0.38522958755493164, "learning_rate": 3.931214042967749e-05, "loss": 1.9133, "step": 8550 }, { "epoch": 1.53, "grad_norm": 0.7890143990516663, "learning_rate": 3.93006100689256e-05, "loss": 2.0062, "step": 8555 }, { "epoch": 1.53, "grad_norm": 0.5682582855224609, "learning_rate": 3.92890751849045e-05, "loss": 1.9966, "step": 8560 }, { "epoch": 1.53, "grad_norm": 0.6989255547523499, "learning_rate": 3.9277535781262676e-05, "loss": 1.6446, "step": 8565 }, { "epoch": 1.53, "grad_norm": 0.769821286201477, "learning_rate": 3.926599186165001e-05, "loss": 1.9349, "step": 8570 }, { "epoch": 1.53, "grad_norm": 0.46688029170036316, "learning_rate": 3.925444342971783e-05, "loss": 2.1585, "step": 8575 }, { "epoch": 1.54, "grad_norm": 0.609228789806366, "learning_rate": 3.9242890489118914e-05, "loss": 1.8908, "step": 8580 }, { "epoch": 1.54, "grad_norm": 0.7954393625259399, "learning_rate": 3.9231333043507436e-05, "loss": 1.9862, "step": 8585 }, { "epoch": 1.54, "grad_norm": 0.45553261041641235, "learning_rate": 3.9219771096539e-05, "loss": 2.0167, "step": 8590 }, { "epoch": 1.54, "grad_norm": 0.3282541334629059, "learning_rate": 3.9208204651870653e-05, "loss": 2.0051, "step": 8595 }, { "epoch": 1.54, "grad_norm": 0.7702216506004333, "learning_rate": 3.9196633713160834e-05, "loss": 1.8989, "step": 8600 }, { "epoch": 1.54, "grad_norm": 1.1692415475845337, "learning_rate": 3.918505828406942e-05, "loss": 1.9096, "step": 8605 }, { "epoch": 1.54, "grad_norm": 0.8676744699478149, "learning_rate": 3.917347836825773e-05, "loss": 1.9068, "step": 8610 }, { "epoch": 1.54, "grad_norm": 0.6879375576972961, "learning_rate": 3.9161893969388465e-05, "loss": 1.8098, "step": 8615 }, { "epoch": 1.54, "grad_norm": 0.6627622842788696, "learning_rate": 3.9150305091125746e-05, "loss": 1.7859, "step": 8620 }, { "epoch": 1.54, "grad_norm": 3.1366701126098633, "learning_rate": 3.913871173713515e-05, "loss": 2.2024, "step": 8625 }, { "epoch": 1.54, "grad_norm": 0.37004053592681885, "learning_rate": 3.912711391108362e-05, "loss": 1.688, "step": 8630 }, { "epoch": 1.55, "grad_norm": 1.0141103267669678, "learning_rate": 3.911551161663956e-05, "loss": 2.1577, "step": 8635 }, { "epoch": 1.55, "grad_norm": 0.5730118155479431, "learning_rate": 3.9103904857472745e-05, "loss": 1.8459, "step": 8640 }, { "epoch": 1.55, "grad_norm": 0.5071932077407837, "learning_rate": 3.9092293637254395e-05, "loss": 2.0975, "step": 8645 }, { "epoch": 1.55, "grad_norm": 1.376279354095459, "learning_rate": 3.908067795965712e-05, "loss": 2.0358, "step": 8650 }, { "epoch": 1.55, "grad_norm": 1.1163078546524048, "learning_rate": 3.9069057828354947e-05, "loss": 2.0103, "step": 8655 }, { "epoch": 1.55, "grad_norm": 0.7586190700531006, "learning_rate": 3.905743324702332e-05, "loss": 2.0341, "step": 8660 }, { "epoch": 1.55, "grad_norm": 0.7790800333023071, "learning_rate": 3.9045804219339076e-05, "loss": 1.836, "step": 8665 }, { "epoch": 1.55, "grad_norm": 0.8625195622444153, "learning_rate": 3.903417074898047e-05, "loss": 2.0557, "step": 8670 }, { "epoch": 1.55, "grad_norm": 0.7535802721977234, "learning_rate": 3.902253283962716e-05, "loss": 1.9474, "step": 8675 }, { "epoch": 1.55, "grad_norm": 0.9609790444374084, "learning_rate": 3.9010890494960195e-05, "loss": 1.8005, "step": 8680 }, { "epoch": 1.55, "grad_norm": 0.8396172523498535, "learning_rate": 3.8999243718662036e-05, "loss": 1.8434, "step": 8685 }, { "epoch": 1.56, "grad_norm": 0.6551474332809448, "learning_rate": 3.8987592514416564e-05, "loss": 1.8392, "step": 8690 }, { "epoch": 1.56, "grad_norm": 0.9734852313995361, "learning_rate": 3.8975936885909035e-05, "loss": 1.8953, "step": 8695 }, { "epoch": 1.56, "grad_norm": 0.37571898102760315, "learning_rate": 3.896427683682611e-05, "loss": 1.7285, "step": 8700 }, { "epoch": 1.56, "grad_norm": 0.8209627866744995, "learning_rate": 3.895261237085585e-05, "loss": 1.833, "step": 8705 }, { "epoch": 1.56, "grad_norm": 0.3994194269180298, "learning_rate": 3.894094349168772e-05, "loss": 2.1348, "step": 8710 }, { "epoch": 1.56, "grad_norm": 0.766764223575592, "learning_rate": 3.892927020301257e-05, "loss": 1.9434, "step": 8715 }, { "epoch": 1.56, "grad_norm": 0.7344279885292053, "learning_rate": 3.891759250852266e-05, "loss": 1.9841, "step": 8720 }, { "epoch": 1.56, "grad_norm": 0.8418555855751038, "learning_rate": 3.8905910411911625e-05, "loss": 1.9083, "step": 8725 }, { "epoch": 1.56, "grad_norm": 0.784453272819519, "learning_rate": 3.8894223916874494e-05, "loss": 2.1818, "step": 8730 }, { "epoch": 1.56, "grad_norm": 0.8457757830619812, "learning_rate": 3.8882533027107713e-05, "loss": 2.121, "step": 8735 }, { "epoch": 1.56, "grad_norm": 0.5709465742111206, "learning_rate": 3.887083774630908e-05, "loss": 1.8507, "step": 8740 }, { "epoch": 1.57, "grad_norm": 0.5889132022857666, "learning_rate": 3.885913807817781e-05, "loss": 1.7544, "step": 8745 }, { "epoch": 1.57, "grad_norm": 2.483522653579712, "learning_rate": 3.8847434026414516e-05, "loss": 2.0034, "step": 8750 }, { "epoch": 1.57, "grad_norm": 0.6778923869132996, "learning_rate": 3.883572559472114e-05, "loss": 1.7116, "step": 8755 }, { "epoch": 1.57, "grad_norm": 0.8211702704429626, "learning_rate": 3.882401278680107e-05, "loss": 1.7848, "step": 8760 }, { "epoch": 1.57, "grad_norm": 1.0367469787597656, "learning_rate": 3.881229560635905e-05, "loss": 1.9696, "step": 8765 }, { "epoch": 1.57, "grad_norm": 0.47565755248069763, "learning_rate": 3.8800574057101227e-05, "loss": 1.8395, "step": 8770 }, { "epoch": 1.57, "grad_norm": 5.1431565284729, "learning_rate": 3.878884814273509e-05, "loss": 1.5831, "step": 8775 }, { "epoch": 1.57, "grad_norm": 0.6216502785682678, "learning_rate": 3.877711786696956e-05, "loss": 2.0567, "step": 8780 }, { "epoch": 1.57, "grad_norm": 1.0751134157180786, "learning_rate": 3.8765383233514895e-05, "loss": 1.9522, "step": 8785 }, { "epoch": 1.57, "grad_norm": 0.7744889259338379, "learning_rate": 3.875364424608275e-05, "loss": 1.8294, "step": 8790 }, { "epoch": 1.57, "grad_norm": 0.7963035106658936, "learning_rate": 3.874190090838616e-05, "loss": 2.0646, "step": 8795 }, { "epoch": 1.58, "grad_norm": 0.6751787662506104, "learning_rate": 3.873015322413954e-05, "loss": 1.864, "step": 8800 }, { "epoch": 1.58, "grad_norm": 1.275324821472168, "learning_rate": 3.871840119705866e-05, "loss": 1.6613, "step": 8805 }, { "epoch": 1.58, "grad_norm": 11.015603065490723, "learning_rate": 3.870664483086067e-05, "loss": 1.9226, "step": 8810 }, { "epoch": 1.58, "grad_norm": 0.6475034952163696, "learning_rate": 3.869488412926411e-05, "loss": 1.6963, "step": 8815 }, { "epoch": 1.58, "grad_norm": 0.9630057215690613, "learning_rate": 3.8683119095988865e-05, "loss": 2.0588, "step": 8820 }, { "epoch": 1.58, "grad_norm": 0.8182287812232971, "learning_rate": 3.867134973475622e-05, "loss": 2.4044, "step": 8825 }, { "epoch": 1.58, "grad_norm": 1.1581512689590454, "learning_rate": 3.8659576049288806e-05, "loss": 1.6287, "step": 8830 }, { "epoch": 1.58, "grad_norm": 0.7855355143547058, "learning_rate": 3.864779804331061e-05, "loss": 1.6889, "step": 8835 }, { "epoch": 1.58, "grad_norm": 0.5677868723869324, "learning_rate": 3.863601572054704e-05, "loss": 1.8155, "step": 8840 }, { "epoch": 1.58, "grad_norm": 0.6653741002082825, "learning_rate": 3.8624229084724804e-05, "loss": 2.2333, "step": 8845 }, { "epoch": 1.58, "grad_norm": 0.5179418325424194, "learning_rate": 3.861243813957201e-05, "loss": 1.884, "step": 8850 }, { "epoch": 1.59, "grad_norm": 0.8245850801467896, "learning_rate": 3.8600642888818125e-05, "loss": 1.9703, "step": 8855 }, { "epoch": 1.59, "grad_norm": 0.5349978804588318, "learning_rate": 3.8588843336193994e-05, "loss": 2.0805, "step": 8860 }, { "epoch": 1.59, "grad_norm": 0.5746064782142639, "learning_rate": 3.857703948543176e-05, "loss": 1.5872, "step": 8865 }, { "epoch": 1.59, "grad_norm": 0.7160893082618713, "learning_rate": 3.856523134026502e-05, "loss": 1.8665, "step": 8870 }, { "epoch": 1.59, "grad_norm": 1.8419469594955444, "learning_rate": 3.8553418904428644e-05, "loss": 1.4556, "step": 8875 }, { "epoch": 1.59, "grad_norm": 0.6989808678627014, "learning_rate": 3.854160218165891e-05, "loss": 1.6613, "step": 8880 }, { "epoch": 1.59, "grad_norm": 1.752234697341919, "learning_rate": 3.8529781175693423e-05, "loss": 1.8097, "step": 8885 }, { "epoch": 1.59, "grad_norm": 0.7804884910583496, "learning_rate": 3.851795589027117e-05, "loss": 1.7908, "step": 8890 }, { "epoch": 1.59, "grad_norm": 0.6645786762237549, "learning_rate": 3.850612632913247e-05, "loss": 2.0249, "step": 8895 }, { "epoch": 1.59, "grad_norm": 0.454006165266037, "learning_rate": 3.849429249601901e-05, "loss": 1.7901, "step": 8900 }, { "epoch": 1.59, "grad_norm": 0.7006505131721497, "learning_rate": 3.8482454394673806e-05, "loss": 1.9092, "step": 8905 }, { "epoch": 1.59, "grad_norm": 0.7154279947280884, "learning_rate": 3.8470612028841245e-05, "loss": 1.8582, "step": 8910 }, { "epoch": 1.6, "grad_norm": 0.37950852513313293, "learning_rate": 3.845876540226706e-05, "loss": 1.9758, "step": 8915 }, { "epoch": 1.6, "grad_norm": 0.5836817622184753, "learning_rate": 3.844691451869832e-05, "loss": 1.7204, "step": 8920 }, { "epoch": 1.6, "grad_norm": 0.4787684679031372, "learning_rate": 3.843505938188346e-05, "loss": 1.781, "step": 8925 }, { "epoch": 1.6, "grad_norm": 0.5430890917778015, "learning_rate": 3.8423199995572235e-05, "loss": 2.0103, "step": 8930 }, { "epoch": 1.6, "grad_norm": 0.5893062353134155, "learning_rate": 3.841133636351576e-05, "loss": 1.9518, "step": 8935 }, { "epoch": 1.6, "grad_norm": 0.7547013163566589, "learning_rate": 3.83994684894665e-05, "loss": 1.7754, "step": 8940 }, { "epoch": 1.6, "grad_norm": 2.5084164142608643, "learning_rate": 3.8387596377178236e-05, "loss": 1.6288, "step": 8945 }, { "epoch": 1.6, "grad_norm": 0.533129870891571, "learning_rate": 3.837572003040612e-05, "loss": 1.7496, "step": 8950 }, { "epoch": 1.6, "grad_norm": 0.6257514357566833, "learning_rate": 3.8363839452906627e-05, "loss": 1.8528, "step": 8955 }, { "epoch": 1.6, "grad_norm": 0.8380822539329529, "learning_rate": 3.835195464843757e-05, "loss": 1.8115, "step": 8960 }, { "epoch": 1.6, "grad_norm": 0.9693661332130432, "learning_rate": 3.834006562075809e-05, "loss": 1.6759, "step": 8965 }, { "epoch": 1.61, "grad_norm": 0.6388950943946838, "learning_rate": 3.83281723736287e-05, "loss": 1.7906, "step": 8970 }, { "epoch": 1.61, "grad_norm": 0.9138167500495911, "learning_rate": 3.83162749108112e-05, "loss": 1.7656, "step": 8975 }, { "epoch": 1.61, "grad_norm": 0.5847556591033936, "learning_rate": 3.830437323606876e-05, "loss": 1.7698, "step": 8980 }, { "epoch": 1.61, "grad_norm": 0.6280438899993896, "learning_rate": 3.8292467353165864e-05, "loss": 2.0533, "step": 8985 }, { "epoch": 1.61, "grad_norm": 9.531767845153809, "learning_rate": 3.828055726586832e-05, "loss": 2.0169, "step": 8990 }, { "epoch": 1.61, "grad_norm": 0.7199810743331909, "learning_rate": 3.826864297794329e-05, "loss": 1.8667, "step": 8995 }, { "epoch": 1.61, "grad_norm": 0.5033280849456787, "learning_rate": 3.8256724493159246e-05, "loss": 1.6972, "step": 9000 }, { "epoch": 1.61, "grad_norm": 1.0077893733978271, "learning_rate": 3.8244801815286e-05, "loss": 1.8997, "step": 9005 }, { "epoch": 1.61, "grad_norm": 0.8881186246871948, "learning_rate": 3.823287494809469e-05, "loss": 2.0588, "step": 9010 }, { "epoch": 1.61, "grad_norm": 0.8047612309455872, "learning_rate": 3.822094389535775e-05, "loss": 2.0178, "step": 9015 }, { "epoch": 1.61, "grad_norm": 0.4620480239391327, "learning_rate": 3.8209008660848974e-05, "loss": 2.0099, "step": 9020 }, { "epoch": 1.62, "grad_norm": 0.5921469926834106, "learning_rate": 3.819706924834346e-05, "loss": 1.8967, "step": 9025 }, { "epoch": 1.62, "grad_norm": 0.6395788192749023, "learning_rate": 3.818512566161765e-05, "loss": 1.7851, "step": 9030 }, { "epoch": 1.62, "grad_norm": 0.7558723092079163, "learning_rate": 3.8173177904449265e-05, "loss": 1.6899, "step": 9035 }, { "epoch": 1.62, "grad_norm": 1.2595165967941284, "learning_rate": 3.816122598061739e-05, "loss": 1.9932, "step": 9040 }, { "epoch": 1.62, "grad_norm": 0.8348121643066406, "learning_rate": 3.814926989390238e-05, "loss": 1.7467, "step": 9045 }, { "epoch": 1.62, "grad_norm": 0.8437023162841797, "learning_rate": 3.8137309648085965e-05, "loss": 1.5459, "step": 9050 }, { "epoch": 1.62, "grad_norm": 0.6804198622703552, "learning_rate": 3.8125345246951137e-05, "loss": 1.9587, "step": 9055 }, { "epoch": 1.62, "grad_norm": 1.2239028215408325, "learning_rate": 3.8113376694282245e-05, "loss": 1.8473, "step": 9060 }, { "epoch": 1.62, "grad_norm": 0.9916425943374634, "learning_rate": 3.8101403993864905e-05, "loss": 1.7582, "step": 9065 }, { "epoch": 1.62, "grad_norm": 0.7374111413955688, "learning_rate": 3.808942714948609e-05, "loss": 1.8658, "step": 9070 }, { "epoch": 1.62, "grad_norm": 0.6606670022010803, "learning_rate": 3.807744616493405e-05, "loss": 1.7126, "step": 9075 }, { "epoch": 1.63, "grad_norm": 0.7266793847084045, "learning_rate": 3.806546104399837e-05, "loss": 2.1215, "step": 9080 }, { "epoch": 1.63, "grad_norm": 0.6009851098060608, "learning_rate": 3.8053471790469935e-05, "loss": 1.691, "step": 9085 }, { "epoch": 1.63, "grad_norm": 0.6415896415710449, "learning_rate": 3.8041478408140926e-05, "loss": 1.4786, "step": 9090 }, { "epoch": 1.63, "grad_norm": 0.6692395806312561, "learning_rate": 3.802948090080484e-05, "loss": 1.8868, "step": 9095 }, { "epoch": 1.63, "grad_norm": 2.1220877170562744, "learning_rate": 3.8017479272256476e-05, "loss": 1.8618, "step": 9100 }, { "epoch": 1.63, "grad_norm": 0.643251895904541, "learning_rate": 3.800547352629195e-05, "loss": 1.9866, "step": 9105 }, { "epoch": 1.63, "grad_norm": 0.6525068283081055, "learning_rate": 3.799346366670864e-05, "loss": 1.7077, "step": 9110 }, { "epoch": 1.63, "grad_norm": 0.711988091468811, "learning_rate": 3.798144969730528e-05, "loss": 1.7389, "step": 9115 }, { "epoch": 1.63, "grad_norm": 0.5814893841743469, "learning_rate": 3.796943162188186e-05, "loss": 2.0051, "step": 9120 }, { "epoch": 1.63, "grad_norm": 1.351395845413208, "learning_rate": 3.79574094442397e-05, "loss": 2.038, "step": 9125 }, { "epoch": 1.63, "grad_norm": 0.42251327633857727, "learning_rate": 3.7945383168181405e-05, "loss": 1.7681, "step": 9130 }, { "epoch": 1.64, "grad_norm": 1.0346791744232178, "learning_rate": 3.793335279751085e-05, "loss": 2.0539, "step": 9135 }, { "epoch": 1.64, "grad_norm": 0.7705110311508179, "learning_rate": 3.792131833603325e-05, "loss": 2.0802, "step": 9140 }, { "epoch": 1.64, "grad_norm": 0.7287485003471375, "learning_rate": 3.790927978755509e-05, "loss": 1.7397, "step": 9145 }, { "epoch": 1.64, "grad_norm": 0.5948812365531921, "learning_rate": 3.789723715588416e-05, "loss": 1.8581, "step": 9150 }, { "epoch": 1.64, "grad_norm": 0.4935462772846222, "learning_rate": 3.7885190444829503e-05, "loss": 1.7997, "step": 9155 }, { "epoch": 1.64, "grad_norm": 0.9329087138175964, "learning_rate": 3.7873139658201516e-05, "loss": 2.0113, "step": 9160 }, { "epoch": 1.64, "grad_norm": 0.4054046869277954, "learning_rate": 3.786108479981183e-05, "loss": 2.0666, "step": 9165 }, { "epoch": 1.64, "grad_norm": 0.6794940829277039, "learning_rate": 3.7849025873473404e-05, "loss": 2.0231, "step": 9170 }, { "epoch": 1.64, "grad_norm": 0.5740146636962891, "learning_rate": 3.7836962883000446e-05, "loss": 1.82, "step": 9175 }, { "epoch": 1.64, "grad_norm": 0.6178922057151794, "learning_rate": 3.782489583220847e-05, "loss": 2.0086, "step": 9180 }, { "epoch": 1.64, "grad_norm": 0.6487544775009155, "learning_rate": 3.781282472491429e-05, "loss": 1.7158, "step": 9185 }, { "epoch": 1.64, "grad_norm": 0.6807827353477478, "learning_rate": 3.780074956493597e-05, "loss": 2.0595, "step": 9190 }, { "epoch": 1.65, "grad_norm": 0.687058687210083, "learning_rate": 3.778867035609288e-05, "loss": 1.5461, "step": 9195 }, { "epoch": 1.65, "grad_norm": 0.5961658954620361, "learning_rate": 3.777658710220564e-05, "loss": 2.0018, "step": 9200 }, { "epoch": 1.65, "grad_norm": 0.5087503790855408, "learning_rate": 3.776449980709621e-05, "loss": 2.0864, "step": 9205 }, { "epoch": 1.65, "grad_norm": 0.8616182804107666, "learning_rate": 3.775240847458775e-05, "loss": 1.7007, "step": 9210 }, { "epoch": 1.65, "grad_norm": 0.5542956590652466, "learning_rate": 3.774031310850477e-05, "loss": 2.0111, "step": 9215 }, { "epoch": 1.65, "grad_norm": 0.8051539063453674, "learning_rate": 3.772821371267301e-05, "loss": 1.7596, "step": 9220 }, { "epoch": 1.65, "grad_norm": 1.0442497730255127, "learning_rate": 3.771611029091948e-05, "loss": 1.6548, "step": 9225 }, { "epoch": 1.65, "grad_norm": 0.7157022953033447, "learning_rate": 3.77040028470725e-05, "loss": 1.5846, "step": 9230 }, { "epoch": 1.65, "grad_norm": 0.660575807094574, "learning_rate": 3.7691891384961654e-05, "loss": 2.0322, "step": 9235 }, { "epoch": 1.65, "grad_norm": 1.2163981199264526, "learning_rate": 3.767977590841776e-05, "loss": 2.1145, "step": 9240 }, { "epoch": 1.65, "grad_norm": 0.7265014052391052, "learning_rate": 3.7667656421272946e-05, "loss": 1.7375, "step": 9245 }, { "epoch": 1.66, "grad_norm": 1.1795376539230347, "learning_rate": 3.765553292736059e-05, "loss": 1.9286, "step": 9250 }, { "epoch": 1.66, "grad_norm": 0.5306074023246765, "learning_rate": 3.7643405430515344e-05, "loss": 1.9642, "step": 9255 }, { "epoch": 1.66, "grad_norm": 0.6961469650268555, "learning_rate": 3.763127393457311e-05, "loss": 1.9714, "step": 9260 }, { "epoch": 1.66, "grad_norm": 0.5733311772346497, "learning_rate": 3.76191384433711e-05, "loss": 1.6244, "step": 9265 }, { "epoch": 1.66, "grad_norm": 0.6173118352890015, "learning_rate": 3.7606998960747726e-05, "loss": 1.9729, "step": 9270 }, { "epoch": 1.66, "grad_norm": 0.7085820436477661, "learning_rate": 3.7594855490542704e-05, "loss": 1.8824, "step": 9275 }, { "epoch": 1.66, "grad_norm": 1.8288882970809937, "learning_rate": 3.758270803659701e-05, "loss": 1.7626, "step": 9280 }, { "epoch": 1.66, "grad_norm": 0.7774258255958557, "learning_rate": 3.757055660275286e-05, "loss": 1.7774, "step": 9285 }, { "epoch": 1.66, "grad_norm": 1.5239425897598267, "learning_rate": 3.755840119285377e-05, "loss": 2.165, "step": 9290 }, { "epoch": 1.66, "grad_norm": 0.4820414185523987, "learning_rate": 3.7546241810744445e-05, "loss": 2.2087, "step": 9295 }, { "epoch": 1.66, "grad_norm": 0.6915215849876404, "learning_rate": 3.75340784602709e-05, "loss": 1.6879, "step": 9300 }, { "epoch": 1.67, "grad_norm": 0.5616482496261597, "learning_rate": 3.7521911145280395e-05, "loss": 2.1273, "step": 9305 }, { "epoch": 1.67, "grad_norm": 0.9838537573814392, "learning_rate": 3.750973986962145e-05, "loss": 1.7339, "step": 9310 }, { "epoch": 1.67, "grad_norm": 0.5095938444137573, "learning_rate": 3.7497564637143814e-05, "loss": 1.8669, "step": 9315 }, { "epoch": 1.67, "grad_norm": 0.5213245153427124, "learning_rate": 3.74853854516985e-05, "loss": 1.8368, "step": 9320 }, { "epoch": 1.67, "grad_norm": 0.6392638087272644, "learning_rate": 3.747320231713778e-05, "loss": 1.8477, "step": 9325 }, { "epoch": 1.67, "grad_norm": 8.093245506286621, "learning_rate": 3.746101523731515e-05, "loss": 1.8944, "step": 9330 }, { "epoch": 1.67, "grad_norm": 0.5117234587669373, "learning_rate": 3.744882421608541e-05, "loss": 1.73, "step": 9335 }, { "epoch": 1.67, "grad_norm": 0.5771470665931702, "learning_rate": 3.743662925730453e-05, "loss": 2.0571, "step": 9340 }, { "epoch": 1.67, "grad_norm": 0.6059357523918152, "learning_rate": 3.7424430364829785e-05, "loss": 1.735, "step": 9345 }, { "epoch": 1.67, "grad_norm": 0.48577621579170227, "learning_rate": 3.7412227542519664e-05, "loss": 1.8274, "step": 9350 }, { "epoch": 1.67, "grad_norm": 0.7163307070732117, "learning_rate": 3.740002079423392e-05, "loss": 2.0151, "step": 9355 }, { "epoch": 1.68, "grad_norm": 0.6120556592941284, "learning_rate": 3.738781012383351e-05, "loss": 1.8655, "step": 9360 }, { "epoch": 1.68, "grad_norm": 0.6820749640464783, "learning_rate": 3.737559553518067e-05, "loss": 1.8406, "step": 9365 }, { "epoch": 1.68, "grad_norm": 0.6751840114593506, "learning_rate": 3.736337703213888e-05, "loss": 1.8045, "step": 9370 }, { "epoch": 1.68, "grad_norm": 0.5991414785385132, "learning_rate": 3.735115461857282e-05, "loss": 1.7511, "step": 9375 }, { "epoch": 1.68, "grad_norm": 0.6772866249084473, "learning_rate": 3.733892829834845e-05, "loss": 2.0622, "step": 9380 }, { "epoch": 1.68, "grad_norm": 0.37456727027893066, "learning_rate": 3.732669807533291e-05, "loss": 1.7938, "step": 9385 }, { "epoch": 1.68, "grad_norm": 0.5582310557365417, "learning_rate": 3.731446395339463e-05, "loss": 1.7566, "step": 9390 }, { "epoch": 1.68, "grad_norm": 0.5218795537948608, "learning_rate": 3.730222593640324e-05, "loss": 1.6361, "step": 9395 }, { "epoch": 1.68, "grad_norm": 0.6812488436698914, "learning_rate": 3.7289984028229636e-05, "loss": 1.9274, "step": 9400 }, { "epoch": 1.68, "grad_norm": 0.6946431398391724, "learning_rate": 3.72777382327459e-05, "loss": 1.8467, "step": 9405 }, { "epoch": 1.68, "grad_norm": 2.7319812774658203, "learning_rate": 3.726548855382536e-05, "loss": 1.696, "step": 9410 }, { "epoch": 1.69, "grad_norm": 0.6396403312683105, "learning_rate": 3.725323499534259e-05, "loss": 1.8228, "step": 9415 }, { "epoch": 1.69, "grad_norm": 0.4877493977546692, "learning_rate": 3.724097756117338e-05, "loss": 2.1406, "step": 9420 }, { "epoch": 1.69, "grad_norm": 0.5904247164726257, "learning_rate": 3.722871625519475e-05, "loss": 1.9487, "step": 9425 }, { "epoch": 1.69, "grad_norm": 0.6823410391807556, "learning_rate": 3.721645108128493e-05, "loss": 1.7107, "step": 9430 }, { "epoch": 1.69, "grad_norm": 0.688463568687439, "learning_rate": 3.720418204332338e-05, "loss": 1.7925, "step": 9435 }, { "epoch": 1.69, "grad_norm": 1.0077917575836182, "learning_rate": 3.719190914519078e-05, "loss": 1.8868, "step": 9440 }, { "epoch": 1.69, "grad_norm": 1.1595532894134521, "learning_rate": 3.7179632390769055e-05, "loss": 1.6869, "step": 9445 }, { "epoch": 1.69, "grad_norm": 0.8011316657066345, "learning_rate": 3.716735178394131e-05, "loss": 1.7987, "step": 9450 }, { "epoch": 1.69, "grad_norm": 0.5646600723266602, "learning_rate": 3.7155067328591906e-05, "loss": 1.8537, "step": 9455 }, { "epoch": 1.69, "grad_norm": 0.5149386525154114, "learning_rate": 3.7142779028606405e-05, "loss": 1.8974, "step": 9460 }, { "epoch": 1.69, "grad_norm": 1.7945959568023682, "learning_rate": 3.713048688787157e-05, "loss": 1.6476, "step": 9465 }, { "epoch": 1.7, "grad_norm": 0.4259641766548157, "learning_rate": 3.7118190910275394e-05, "loss": 1.8111, "step": 9470 }, { "epoch": 1.7, "grad_norm": 0.5765038728713989, "learning_rate": 3.71058910997071e-05, "loss": 1.571, "step": 9475 }, { "epoch": 1.7, "grad_norm": 0.6007339954376221, "learning_rate": 3.7093587460057087e-05, "loss": 1.8102, "step": 9480 }, { "epoch": 1.7, "grad_norm": 0.3550678491592407, "learning_rate": 3.708127999521701e-05, "loss": 1.8255, "step": 9485 }, { "epoch": 1.7, "grad_norm": 1.5561153888702393, "learning_rate": 3.706896870907967e-05, "loss": 1.8103, "step": 9490 }, { "epoch": 1.7, "grad_norm": 0.6592022180557251, "learning_rate": 3.705665360553914e-05, "loss": 1.7564, "step": 9495 }, { "epoch": 1.7, "grad_norm": 0.9379505515098572, "learning_rate": 3.704433468849068e-05, "loss": 1.9543, "step": 9500 }, { "epoch": 1.7, "grad_norm": 0.7542968392372131, "learning_rate": 3.7032011961830744e-05, "loss": 1.9984, "step": 9505 }, { "epoch": 1.7, "grad_norm": 1.1859865188598633, "learning_rate": 3.7019685429456986e-05, "loss": 1.8548, "step": 9510 }, { "epoch": 1.7, "grad_norm": 1.0546746253967285, "learning_rate": 3.70073550952683e-05, "loss": 1.6335, "step": 9515 }, { "epoch": 1.7, "grad_norm": 0.41656044125556946, "learning_rate": 3.699502096316474e-05, "loss": 1.817, "step": 9520 }, { "epoch": 1.7, "grad_norm": 0.3669623136520386, "learning_rate": 3.698268303704759e-05, "loss": 1.8767, "step": 9525 }, { "epoch": 1.71, "grad_norm": 0.5422895550727844, "learning_rate": 3.69728099670865e-05, "loss": 2.1527, "step": 9530 }, { "epoch": 1.71, "grad_norm": 0.6797967553138733, "learning_rate": 3.6960465221579927e-05, "loss": 1.7675, "step": 9535 }, { "epoch": 1.71, "grad_norm": 0.9210271239280701, "learning_rate": 3.69481166929897e-05, "loss": 1.6502, "step": 9540 }, { "epoch": 1.71, "grad_norm": 0.8029117584228516, "learning_rate": 3.693576438522167e-05, "loss": 1.8089, "step": 9545 }, { "epoch": 1.71, "grad_norm": 0.47192418575286865, "learning_rate": 3.6923408302182834e-05, "loss": 2.0336, "step": 9550 }, { "epoch": 1.71, "grad_norm": 0.66648268699646, "learning_rate": 3.691104844778142e-05, "loss": 1.7003, "step": 9555 }, { "epoch": 1.71, "grad_norm": 0.6902146935462952, "learning_rate": 3.689868482592684e-05, "loss": 1.8137, "step": 9560 }, { "epoch": 1.71, "grad_norm": 0.8713681697845459, "learning_rate": 3.688631744052969e-05, "loss": 2.1054, "step": 9565 }, { "epoch": 1.71, "grad_norm": 0.8756383657455444, "learning_rate": 3.687394629550178e-05, "loss": 2.0403, "step": 9570 }, { "epoch": 1.71, "grad_norm": 0.8325067758560181, "learning_rate": 3.6861571394756066e-05, "loss": 1.8843, "step": 9575 }, { "epoch": 1.71, "grad_norm": 0.5879186987876892, "learning_rate": 3.684919274220674e-05, "loss": 1.9288, "step": 9580 }, { "epoch": 1.72, "grad_norm": 0.5836779475212097, "learning_rate": 3.683681034176914e-05, "loss": 2.024, "step": 9585 }, { "epoch": 1.72, "grad_norm": 0.7767724990844727, "learning_rate": 3.682442419735982e-05, "loss": 2.3006, "step": 9590 }, { "epoch": 1.72, "grad_norm": 0.923039972782135, "learning_rate": 3.681203431289649e-05, "loss": 1.9507, "step": 9595 }, { "epoch": 1.72, "grad_norm": 0.46944764256477356, "learning_rate": 3.6799640692298075e-05, "loss": 1.9669, "step": 9600 }, { "epoch": 1.72, "grad_norm": 0.29924699664115906, "learning_rate": 3.678724333948466e-05, "loss": 1.8226, "step": 9605 }, { "epoch": 1.72, "grad_norm": 0.4777134954929352, "learning_rate": 3.6774842258377515e-05, "loss": 2.3745, "step": 9610 }, { "epoch": 1.72, "grad_norm": 0.8692833781242371, "learning_rate": 3.676243745289909e-05, "loss": 1.9515, "step": 9615 }, { "epoch": 1.72, "grad_norm": 0.9050225019454956, "learning_rate": 3.675002892697301e-05, "loss": 1.9985, "step": 9620 }, { "epoch": 1.72, "grad_norm": 0.6240570545196533, "learning_rate": 3.673761668452408e-05, "loss": 1.8733, "step": 9625 }, { "epoch": 1.72, "grad_norm": 0.6922764778137207, "learning_rate": 3.6725200729478285e-05, "loss": 1.6951, "step": 9630 }, { "epoch": 1.72, "grad_norm": 0.4850800931453705, "learning_rate": 3.671278106576278e-05, "loss": 1.9839, "step": 9635 }, { "epoch": 1.73, "grad_norm": 0.7302365303039551, "learning_rate": 3.670035769730589e-05, "loss": 2.0729, "step": 9640 }, { "epoch": 1.73, "grad_norm": 0.43896591663360596, "learning_rate": 3.668793062803711e-05, "loss": 1.8726, "step": 9645 }, { "epoch": 1.73, "grad_norm": 0.8171257972717285, "learning_rate": 3.667549986188712e-05, "loss": 1.9601, "step": 9650 }, { "epoch": 1.73, "grad_norm": 0.631952166557312, "learning_rate": 3.666306540278775e-05, "loss": 1.745, "step": 9655 }, { "epoch": 1.73, "grad_norm": 0.7533524036407471, "learning_rate": 3.665062725467201e-05, "loss": 1.9608, "step": 9660 }, { "epoch": 1.73, "grad_norm": 0.5362209677696228, "learning_rate": 3.6638185421474084e-05, "loss": 1.5627, "step": 9665 }, { "epoch": 1.73, "grad_norm": 0.6645787358283997, "learning_rate": 3.6625739907129295e-05, "loss": 1.5983, "step": 9670 }, { "epoch": 1.73, "grad_norm": 0.5133046507835388, "learning_rate": 3.6613290715574155e-05, "loss": 1.9776, "step": 9675 }, { "epoch": 1.73, "grad_norm": 1.0483719110488892, "learning_rate": 3.660083785074634e-05, "loss": 2.0728, "step": 9680 }, { "epoch": 1.73, "grad_norm": 0.8712103366851807, "learning_rate": 3.6588381316584674e-05, "loss": 1.7619, "step": 9685 }, { "epoch": 1.73, "grad_norm": 1.295785665512085, "learning_rate": 3.6575921117029136e-05, "loss": 1.8996, "step": 9690 }, { "epoch": 1.74, "grad_norm": 0.5898334383964539, "learning_rate": 3.6563457256020884e-05, "loss": 1.8068, "step": 9695 }, { "epoch": 1.74, "grad_norm": 0.5830318331718445, "learning_rate": 3.655098973750223e-05, "loss": 1.8471, "step": 9700 }, { "epoch": 1.74, "grad_norm": 0.6544398069381714, "learning_rate": 3.653851856541662e-05, "loss": 1.9984, "step": 9705 }, { "epoch": 1.74, "grad_norm": 0.37488725781440735, "learning_rate": 3.6526043743708704e-05, "loss": 1.8106, "step": 9710 }, { "epoch": 1.74, "grad_norm": 1.239549160003662, "learning_rate": 3.651356527632423e-05, "loss": 2.1417, "step": 9715 }, { "epoch": 1.74, "grad_norm": 0.47542452812194824, "learning_rate": 3.650108316721013e-05, "loss": 1.6287, "step": 9720 }, { "epoch": 1.74, "grad_norm": 0.43621698021888733, "learning_rate": 3.648859742031449e-05, "loss": 1.96, "step": 9725 }, { "epoch": 1.74, "grad_norm": 0.5922963619232178, "learning_rate": 3.6476108039586535e-05, "loss": 1.8913, "step": 9730 }, { "epoch": 1.74, "grad_norm": 0.6242755055427551, "learning_rate": 3.646361502897665e-05, "loss": 1.8241, "step": 9735 }, { "epoch": 1.74, "grad_norm": 1.1280851364135742, "learning_rate": 3.645111839243636e-05, "loss": 2.1662, "step": 9740 }, { "epoch": 1.74, "grad_norm": 0.6557629704475403, "learning_rate": 3.643861813391833e-05, "loss": 2.0015, "step": 9745 }, { "epoch": 1.75, "grad_norm": 0.6168567538261414, "learning_rate": 3.642611425737639e-05, "loss": 1.5228, "step": 9750 }, { "epoch": 1.75, "grad_norm": 0.4185939133167267, "learning_rate": 3.6413606766765506e-05, "loss": 2.1472, "step": 9755 }, { "epoch": 1.75, "grad_norm": 0.49710220098495483, "learning_rate": 3.6401095666041765e-05, "loss": 1.9794, "step": 9760 }, { "epoch": 1.75, "grad_norm": 0.6820828914642334, "learning_rate": 3.638858095916244e-05, "loss": 1.7363, "step": 9765 }, { "epoch": 1.75, "grad_norm": 0.7763912081718445, "learning_rate": 3.637606265008592e-05, "loss": 1.9053, "step": 9770 }, { "epoch": 1.75, "grad_norm": 0.9690907001495361, "learning_rate": 3.636354074277172e-05, "loss": 1.6437, "step": 9775 }, { "epoch": 1.75, "grad_norm": 0.8117865324020386, "learning_rate": 3.635101524118051e-05, "loss": 1.7684, "step": 9780 }, { "epoch": 1.75, "grad_norm": 0.878116250038147, "learning_rate": 3.63384861492741e-05, "loss": 1.9089, "step": 9785 }, { "epoch": 1.75, "grad_norm": 0.3111972510814667, "learning_rate": 3.632595347101543e-05, "loss": 2.0609, "step": 9790 }, { "epoch": 1.75, "grad_norm": 0.38425540924072266, "learning_rate": 3.6313417210368564e-05, "loss": 1.7993, "step": 9795 }, { "epoch": 1.75, "grad_norm": 0.6652374267578125, "learning_rate": 3.630087737129872e-05, "loss": 2.0774, "step": 9800 }, { "epoch": 1.76, "grad_norm": 0.9802791476249695, "learning_rate": 3.628833395777224e-05, "loss": 1.8374, "step": 9805 }, { "epoch": 1.76, "grad_norm": 1.093260407447815, "learning_rate": 3.6275786973756575e-05, "loss": 1.945, "step": 9810 }, { "epoch": 1.76, "grad_norm": 0.5013108253479004, "learning_rate": 3.626323642322035e-05, "loss": 1.8702, "step": 9815 }, { "epoch": 1.76, "grad_norm": 0.7069824934005737, "learning_rate": 3.6250682310133266e-05, "loss": 2.0702, "step": 9820 }, { "epoch": 1.76, "grad_norm": 0.539284348487854, "learning_rate": 3.6238124638466195e-05, "loss": 1.7797, "step": 9825 }, { "epoch": 1.76, "grad_norm": 0.642426609992981, "learning_rate": 3.622556341219111e-05, "loss": 1.8979, "step": 9830 }, { "epoch": 1.76, "grad_norm": 0.7472493052482605, "learning_rate": 3.6212998635281116e-05, "loss": 1.8212, "step": 9835 }, { "epoch": 1.76, "grad_norm": 1.1682623624801636, "learning_rate": 3.620043031171043e-05, "loss": 1.7595, "step": 9840 }, { "epoch": 1.76, "grad_norm": 0.6802049279212952, "learning_rate": 3.618785844545443e-05, "loss": 1.9368, "step": 9845 }, { "epoch": 1.76, "grad_norm": 1.0584665536880493, "learning_rate": 3.617528304048955e-05, "loss": 1.91, "step": 9850 }, { "epoch": 1.76, "grad_norm": 0.6849753260612488, "learning_rate": 3.616270410079339e-05, "loss": 1.8772, "step": 9855 }, { "epoch": 1.76, "grad_norm": 0.9579459428787231, "learning_rate": 3.6150121630344676e-05, "loss": 1.9455, "step": 9860 }, { "epoch": 1.77, "grad_norm": 0.8555591702461243, "learning_rate": 3.61375356331232e-05, "loss": 1.4362, "step": 9865 }, { "epoch": 1.77, "grad_norm": 0.5340035557746887, "learning_rate": 3.612494611310992e-05, "loss": 1.9012, "step": 9870 }, { "epoch": 1.77, "grad_norm": 0.8525397777557373, "learning_rate": 3.61123530742869e-05, "loss": 1.9174, "step": 9875 }, { "epoch": 1.77, "grad_norm": 0.6355140209197998, "learning_rate": 3.6099756520637274e-05, "loss": 1.7693, "step": 9880 }, { "epoch": 1.77, "grad_norm": 0.794104814529419, "learning_rate": 3.608715645614534e-05, "loss": 1.9271, "step": 9885 }, { "epoch": 1.77, "grad_norm": 0.5898195505142212, "learning_rate": 3.6074552884796485e-05, "loss": 1.969, "step": 9890 }, { "epoch": 1.77, "grad_norm": 0.8139387965202332, "learning_rate": 3.606194581057721e-05, "loss": 1.8805, "step": 9895 }, { "epoch": 1.77, "grad_norm": 0.7168921828269958, "learning_rate": 3.60493352374751e-05, "loss": 1.7252, "step": 9900 }, { "epoch": 1.77, "grad_norm": 0.9840201735496521, "learning_rate": 3.6036721169478896e-05, "loss": 1.8256, "step": 9905 }, { "epoch": 1.77, "grad_norm": 0.7537431120872498, "learning_rate": 3.60241036105784e-05, "loss": 2.0456, "step": 9910 }, { "epoch": 1.77, "grad_norm": 1.142112374305725, "learning_rate": 3.601148256476454e-05, "loss": 1.987, "step": 9915 }, { "epoch": 1.78, "grad_norm": 0.9110394716262817, "learning_rate": 3.599885803602933e-05, "loss": 1.7752, "step": 9920 }, { "epoch": 1.78, "grad_norm": 0.6123466491699219, "learning_rate": 3.59862300283659e-05, "loss": 2.0613, "step": 9925 }, { "epoch": 1.78, "grad_norm": 0.7708951234817505, "learning_rate": 3.597359854576848e-05, "loss": 1.8793, "step": 9930 }, { "epoch": 1.78, "grad_norm": 0.6696035861968994, "learning_rate": 3.596096359223241e-05, "loss": 1.8185, "step": 9935 }, { "epoch": 1.78, "grad_norm": 1.329271674156189, "learning_rate": 3.594832517175408e-05, "loss": 2.0871, "step": 9940 }, { "epoch": 1.78, "grad_norm": 1.1870595216751099, "learning_rate": 3.593568328833104e-05, "loss": 1.6247, "step": 9945 }, { "epoch": 1.78, "grad_norm": 0.8780333399772644, "learning_rate": 3.59230379459619e-05, "loss": 1.6585, "step": 9950 }, { "epoch": 1.78, "grad_norm": 16.52887725830078, "learning_rate": 3.591038914864635e-05, "loss": 2.0053, "step": 9955 }, { "epoch": 1.78, "grad_norm": 0.8829307556152344, "learning_rate": 3.5897736900385213e-05, "loss": 1.7779, "step": 9960 }, { "epoch": 1.78, "grad_norm": 0.539667546749115, "learning_rate": 3.5885081205180377e-05, "loss": 1.761, "step": 9965 }, { "epoch": 1.78, "grad_norm": 0.6431927680969238, "learning_rate": 3.5872422067034824e-05, "loss": 1.908, "step": 9970 }, { "epoch": 1.79, "grad_norm": 1.2214795351028442, "learning_rate": 3.5859759489952635e-05, "loss": 1.8322, "step": 9975 }, { "epoch": 1.79, "grad_norm": 1.3363548517227173, "learning_rate": 3.5847093477938956e-05, "loss": 1.9769, "step": 9980 }, { "epoch": 1.79, "grad_norm": 0.6570048928260803, "learning_rate": 3.583442403500005e-05, "loss": 1.7981, "step": 9985 }, { "epoch": 1.79, "grad_norm": 1.1348395347595215, "learning_rate": 3.582175116514324e-05, "loss": 1.7122, "step": 9990 }, { "epoch": 1.79, "grad_norm": 0.6550275683403015, "learning_rate": 3.5809074872376943e-05, "loss": 1.8589, "step": 9995 }, { "epoch": 1.79, "grad_norm": 1.4769278764724731, "learning_rate": 3.5796395160710666e-05, "loss": 1.8013, "step": 10000 }, { "epoch": 1.79, "grad_norm": 0.5346553921699524, "learning_rate": 3.5783712034154985e-05, "loss": 1.988, "step": 10005 }, { "epoch": 1.79, "grad_norm": 0.48149240016937256, "learning_rate": 3.5771025496721556e-05, "loss": 1.7784, "step": 10010 }, { "epoch": 1.79, "grad_norm": 0.6305123567581177, "learning_rate": 3.5758335552423127e-05, "loss": 2.0311, "step": 10015 }, { "epoch": 1.79, "grad_norm": 0.48739558458328247, "learning_rate": 3.574564220527351e-05, "loss": 2.1409, "step": 10020 }, { "epoch": 1.79, "grad_norm": 1.4256173372268677, "learning_rate": 3.57329454592876e-05, "loss": 1.6745, "step": 10025 }, { "epoch": 1.8, "grad_norm": 0.9247502088546753, "learning_rate": 3.572024531848136e-05, "loss": 1.7922, "step": 10030 }, { "epoch": 1.8, "grad_norm": 5.438934326171875, "learning_rate": 3.5707541786871845e-05, "loss": 1.6375, "step": 10035 }, { "epoch": 1.8, "grad_norm": 0.6272771954536438, "learning_rate": 3.5694834868477154e-05, "loss": 1.8961, "step": 10040 }, { "epoch": 1.8, "grad_norm": 0.6681824326515198, "learning_rate": 3.5682124567316485e-05, "loss": 1.716, "step": 10045 }, { "epoch": 1.8, "grad_norm": 0.6500905156135559, "learning_rate": 3.5669410887410095e-05, "loss": 1.8117, "step": 10050 }, { "epoch": 1.8, "grad_norm": 0.7739772796630859, "learning_rate": 3.5656693832779295e-05, "loss": 1.7639, "step": 10055 }, { "epoch": 1.8, "grad_norm": 1.4250242710113525, "learning_rate": 3.5643973407446496e-05, "loss": 1.9421, "step": 10060 }, { "epoch": 1.8, "grad_norm": 0.9519471526145935, "learning_rate": 3.5631249615435145e-05, "loss": 1.9734, "step": 10065 }, { "epoch": 1.8, "grad_norm": 0.9607334733009338, "learning_rate": 3.561852246076976e-05, "loss": 1.7387, "step": 10070 }, { "epoch": 1.8, "grad_norm": 0.9569016695022583, "learning_rate": 3.560579194747593e-05, "loss": 1.6301, "step": 10075 }, { "epoch": 1.8, "grad_norm": 0.48008155822753906, "learning_rate": 3.559305807958032e-05, "loss": 1.9027, "step": 10080 }, { "epoch": 1.81, "grad_norm": 0.8056952953338623, "learning_rate": 3.5580320861110625e-05, "loss": 1.9209, "step": 10085 }, { "epoch": 1.81, "grad_norm": 0.4301571547985077, "learning_rate": 3.556758029609561e-05, "loss": 1.9884, "step": 10090 }, { "epoch": 1.81, "grad_norm": 1.0041254758834839, "learning_rate": 3.555483638856513e-05, "loss": 1.6986, "step": 10095 }, { "epoch": 1.81, "grad_norm": 0.7964014410972595, "learning_rate": 3.554208914255004e-05, "loss": 1.7893, "step": 10100 }, { "epoch": 1.81, "grad_norm": 0.7074928283691406, "learning_rate": 3.552933856208229e-05, "loss": 1.8304, "step": 10105 }, { "epoch": 1.81, "grad_norm": 1.151492953300476, "learning_rate": 3.55165846511949e-05, "loss": 1.6251, "step": 10110 }, { "epoch": 1.81, "grad_norm": 0.4866889417171478, "learning_rate": 3.5503827413921885e-05, "loss": 1.7865, "step": 10115 }, { "epoch": 1.81, "grad_norm": 0.7697849273681641, "learning_rate": 3.549106685429836e-05, "loss": 1.669, "step": 10120 }, { "epoch": 1.81, "grad_norm": 0.7082263827323914, "learning_rate": 3.5478302976360486e-05, "loss": 1.9564, "step": 10125 }, { "epoch": 1.81, "grad_norm": 1.869172215461731, "learning_rate": 3.5465535784145464e-05, "loss": 2.0057, "step": 10130 }, { "epoch": 1.81, "grad_norm": 0.6560725569725037, "learning_rate": 3.545276528169154e-05, "loss": 1.9155, "step": 10135 }, { "epoch": 1.82, "grad_norm": 1.027157187461853, "learning_rate": 3.5439991473038006e-05, "loss": 1.6909, "step": 10140 }, { "epoch": 1.82, "grad_norm": 0.7855679392814636, "learning_rate": 3.542721436222521e-05, "loss": 1.8473, "step": 10145 }, { "epoch": 1.82, "grad_norm": 0.6946498155593872, "learning_rate": 3.541443395329454e-05, "loss": 2.1364, "step": 10150 }, { "epoch": 1.82, "grad_norm": 1.001931071281433, "learning_rate": 3.540165025028844e-05, "loss": 1.8402, "step": 10155 }, { "epoch": 1.82, "grad_norm": 0.9406866431236267, "learning_rate": 3.538886325725036e-05, "loss": 1.9564, "step": 10160 }, { "epoch": 1.82, "grad_norm": 0.3592713177204132, "learning_rate": 3.5376072978224824e-05, "loss": 1.5525, "step": 10165 }, { "epoch": 1.82, "grad_norm": 1.2049317359924316, "learning_rate": 3.536327941725739e-05, "loss": 1.9862, "step": 10170 }, { "epoch": 1.82, "grad_norm": 0.8280571699142456, "learning_rate": 3.5350482578394636e-05, "loss": 1.745, "step": 10175 }, { "epoch": 1.82, "grad_norm": 0.7504371404647827, "learning_rate": 3.533768246568421e-05, "loss": 1.9558, "step": 10180 }, { "epoch": 1.82, "grad_norm": 0.9918588995933533, "learning_rate": 3.532487908317476e-05, "loss": 1.979, "step": 10185 }, { "epoch": 1.82, "grad_norm": 0.46797382831573486, "learning_rate": 3.5312072434915986e-05, "loss": 1.99, "step": 10190 }, { "epoch": 1.82, "grad_norm": 0.5668039917945862, "learning_rate": 3.5299262524958615e-05, "loss": 1.7308, "step": 10195 }, { "epoch": 1.83, "grad_norm": 1.981918454170227, "learning_rate": 3.528644935735442e-05, "loss": 1.7743, "step": 10200 }, { "epoch": 1.83, "grad_norm": 0.9254132509231567, "learning_rate": 3.527363293615619e-05, "loss": 2.0246, "step": 10205 }, { "epoch": 1.83, "grad_norm": 0.9930950403213501, "learning_rate": 3.5260813265417735e-05, "loss": 1.9832, "step": 10210 }, { "epoch": 1.83, "grad_norm": 0.7647066116333008, "learning_rate": 3.524799034919392e-05, "loss": 2.1351, "step": 10215 }, { "epoch": 1.83, "grad_norm": 1.070019245147705, "learning_rate": 3.523516419154062e-05, "loss": 1.942, "step": 10220 }, { "epoch": 1.83, "grad_norm": 0.79027259349823, "learning_rate": 3.522233479651472e-05, "loss": 1.9146, "step": 10225 }, { "epoch": 1.83, "grad_norm": 0.4503646194934845, "learning_rate": 3.520950216817417e-05, "loss": 2.0948, "step": 10230 }, { "epoch": 1.83, "grad_norm": 0.7014816999435425, "learning_rate": 3.519666631057789e-05, "loss": 1.7254, "step": 10235 }, { "epoch": 1.83, "grad_norm": 1.0421819686889648, "learning_rate": 3.518382722778587e-05, "loss": 1.8633, "step": 10240 }, { "epoch": 1.83, "grad_norm": 0.7871792316436768, "learning_rate": 3.5170984923859095e-05, "loss": 1.7731, "step": 10245 }, { "epoch": 1.83, "grad_norm": 0.5907774567604065, "learning_rate": 3.515813940285957e-05, "loss": 1.9179, "step": 10250 }, { "epoch": 1.84, "grad_norm": 0.4600268304347992, "learning_rate": 3.514529066885032e-05, "loss": 1.7114, "step": 10255 }, { "epoch": 1.84, "grad_norm": 0.6806623339653015, "learning_rate": 3.51324387258954e-05, "loss": 1.6823, "step": 10260 }, { "epoch": 1.84, "grad_norm": 0.7156978249549866, "learning_rate": 3.5119583578059846e-05, "loss": 1.709, "step": 10265 }, { "epoch": 1.84, "grad_norm": 0.7115353345870972, "learning_rate": 3.510672522940975e-05, "loss": 1.7488, "step": 10270 }, { "epoch": 1.84, "grad_norm": 0.8930706977844238, "learning_rate": 3.509386368401218e-05, "loss": 1.6939, "step": 10275 }, { "epoch": 1.84, "grad_norm": 0.555728018283844, "learning_rate": 3.508099894593523e-05, "loss": 1.7732, "step": 10280 }, { "epoch": 1.84, "grad_norm": 0.6877692341804504, "learning_rate": 3.506813101924802e-05, "loss": 1.799, "step": 10285 }, { "epoch": 1.84, "grad_norm": 0.9091320633888245, "learning_rate": 3.5055259908020645e-05, "loss": 1.6652, "step": 10290 }, { "epoch": 1.84, "grad_norm": 0.6716402173042297, "learning_rate": 3.504238561632424e-05, "loss": 1.6759, "step": 10295 }, { "epoch": 1.84, "grad_norm": 0.615459144115448, "learning_rate": 3.502950814823092e-05, "loss": 1.7817, "step": 10300 }, { "epoch": 1.84, "grad_norm": 0.8127800822257996, "learning_rate": 3.501662750781382e-05, "loss": 2.2239, "step": 10305 }, { "epoch": 1.85, "grad_norm": 0.9452939033508301, "learning_rate": 3.5003743699147076e-05, "loss": 2.1167, "step": 10310 }, { "epoch": 1.85, "grad_norm": 0.7668461799621582, "learning_rate": 3.499085672630582e-05, "loss": 1.6316, "step": 10315 }, { "epoch": 1.85, "grad_norm": 0.7267149090766907, "learning_rate": 3.497796659336619e-05, "loss": 2.0752, "step": 10320 }, { "epoch": 1.85, "grad_norm": 0.7957209944725037, "learning_rate": 3.4965073304405326e-05, "loss": 1.8228, "step": 10325 }, { "epoch": 1.85, "grad_norm": 0.5732484459877014, "learning_rate": 3.495217686350136e-05, "loss": 1.8593, "step": 10330 }, { "epoch": 1.85, "grad_norm": 1.3122698068618774, "learning_rate": 3.493927727473343e-05, "loss": 1.6822, "step": 10335 }, { "epoch": 1.85, "grad_norm": 0.5948165655136108, "learning_rate": 3.492637454218166e-05, "loss": 1.9183, "step": 10340 }, { "epoch": 1.85, "grad_norm": 0.938048243522644, "learning_rate": 3.491346866992716e-05, "loss": 2.0546, "step": 10345 }, { "epoch": 1.85, "grad_norm": 0.6968057751655579, "learning_rate": 3.490055966205207e-05, "loss": 1.8256, "step": 10350 }, { "epoch": 1.85, "grad_norm": 0.5820090174674988, "learning_rate": 3.4887647522639464e-05, "loss": 1.9695, "step": 10355 }, { "epoch": 1.85, "grad_norm": 0.7669956088066101, "learning_rate": 3.4874732255773465e-05, "loss": 1.7784, "step": 10360 }, { "epoch": 1.86, "grad_norm": 0.5515414476394653, "learning_rate": 3.486181386553916e-05, "loss": 1.9826, "step": 10365 }, { "epoch": 1.86, "grad_norm": 7.995153427124023, "learning_rate": 3.484889235602261e-05, "loss": 1.8934, "step": 10370 }, { "epoch": 1.86, "grad_norm": 0.8782067894935608, "learning_rate": 3.483596773131088e-05, "loss": 1.7218, "step": 10375 }, { "epoch": 1.86, "grad_norm": 0.6286051273345947, "learning_rate": 3.482303999549201e-05, "loss": 1.9686, "step": 10380 }, { "epoch": 1.86, "grad_norm": 0.4949547350406647, "learning_rate": 3.481010915265504e-05, "loss": 2.0401, "step": 10385 }, { "epoch": 1.86, "grad_norm": 0.9229368567466736, "learning_rate": 3.4797175206889984e-05, "loss": 1.6785, "step": 10390 }, { "epoch": 1.86, "grad_norm": 0.5106030702590942, "learning_rate": 3.478423816228784e-05, "loss": 1.9079, "step": 10395 }, { "epoch": 1.86, "grad_norm": 0.9068930745124817, "learning_rate": 3.477129802294057e-05, "loss": 1.9197, "step": 10400 }, { "epoch": 1.86, "grad_norm": 0.7757366299629211, "learning_rate": 3.475835479294114e-05, "loss": 1.9806, "step": 10405 }, { "epoch": 1.86, "grad_norm": 0.814594566822052, "learning_rate": 3.4745408476383476e-05, "loss": 2.0118, "step": 10410 }, { "epoch": 1.86, "grad_norm": 0.643321692943573, "learning_rate": 3.473245907736248e-05, "loss": 1.9629, "step": 10415 }, { "epoch": 1.87, "grad_norm": 0.7608657479286194, "learning_rate": 3.471950659997404e-05, "loss": 1.8221, "step": 10420 }, { "epoch": 1.87, "grad_norm": 1.1012566089630127, "learning_rate": 3.470655104831501e-05, "loss": 1.87, "step": 10425 }, { "epoch": 1.87, "grad_norm": 0.7623602151870728, "learning_rate": 3.469359242648321e-05, "loss": 1.9752, "step": 10430 }, { "epoch": 1.87, "grad_norm": 0.9718965888023376, "learning_rate": 3.468063073857747e-05, "loss": 1.784, "step": 10435 }, { "epoch": 1.87, "grad_norm": 3.79907488822937, "learning_rate": 3.466766598869753e-05, "loss": 2.1134, "step": 10440 }, { "epoch": 1.87, "grad_norm": 15.143911361694336, "learning_rate": 3.4654698180944134e-05, "loss": 2.1167, "step": 10445 }, { "epoch": 1.87, "grad_norm": 0.5659872889518738, "learning_rate": 3.4641727319418995e-05, "loss": 2.2435, "step": 10450 }, { "epoch": 1.87, "grad_norm": 0.6032043695449829, "learning_rate": 3.4628753408224765e-05, "loss": 2.0959, "step": 10455 }, { "epoch": 1.87, "grad_norm": 0.6183554530143738, "learning_rate": 3.461577645146511e-05, "loss": 1.8351, "step": 10460 }, { "epoch": 1.87, "grad_norm": 0.806084156036377, "learning_rate": 3.4602796453244615e-05, "loss": 1.6052, "step": 10465 }, { "epoch": 1.87, "grad_norm": 0.7526884078979492, "learning_rate": 3.4589813417668823e-05, "loss": 1.9665, "step": 10470 }, { "epoch": 1.87, "grad_norm": 0.9556746482849121, "learning_rate": 3.457682734884428e-05, "loss": 1.8647, "step": 10475 }, { "epoch": 1.88, "grad_norm": 0.5155915021896362, "learning_rate": 3.456383825087846e-05, "loss": 1.9263, "step": 10480 }, { "epoch": 1.88, "grad_norm": 0.661363422870636, "learning_rate": 3.45508461278798e-05, "loss": 1.7769, "step": 10485 }, { "epoch": 1.88, "grad_norm": 0.5568251609802246, "learning_rate": 3.453785098395769e-05, "loss": 1.9676, "step": 10490 }, { "epoch": 1.88, "grad_norm": 0.6141484975814819, "learning_rate": 3.45248528232225e-05, "loss": 2.0722, "step": 10495 }, { "epoch": 1.88, "grad_norm": 0.6515774726867676, "learning_rate": 3.4511851649785506e-05, "loss": 1.9458, "step": 10500 }, { "epoch": 1.88, "grad_norm": 0.6424517631530762, "learning_rate": 3.4498847467759e-05, "loss": 1.9665, "step": 10505 }, { "epoch": 1.88, "grad_norm": 0.39145955443382263, "learning_rate": 3.4485840281256176e-05, "loss": 1.9062, "step": 10510 }, { "epoch": 1.88, "grad_norm": 0.9824689030647278, "learning_rate": 3.447283009439119e-05, "loss": 1.6084, "step": 10515 }, { "epoch": 1.88, "grad_norm": 0.44701799750328064, "learning_rate": 3.445981691127915e-05, "loss": 1.9367, "step": 10520 }, { "epoch": 1.88, "grad_norm": 0.8132763504981995, "learning_rate": 3.4446800736036134e-05, "loss": 1.8618, "step": 10525 }, { "epoch": 1.88, "grad_norm": 0.5754613876342773, "learning_rate": 3.4433781572779126e-05, "loss": 1.72, "step": 10530 }, { "epoch": 1.89, "grad_norm": 0.46498289704322815, "learning_rate": 3.442075942562608e-05, "loss": 1.9345, "step": 10535 }, { "epoch": 1.89, "grad_norm": 0.9286674857139587, "learning_rate": 3.4407734298695895e-05, "loss": 1.7018, "step": 10540 }, { "epoch": 1.89, "grad_norm": 0.5865563750267029, "learning_rate": 3.439470619610839e-05, "loss": 1.9787, "step": 10545 }, { "epoch": 1.89, "grad_norm": 1.0381340980529785, "learning_rate": 3.438167512198436e-05, "loss": 1.7573, "step": 10550 }, { "epoch": 1.89, "grad_norm": 1.0353657007217407, "learning_rate": 3.4368641080445504e-05, "loss": 1.7011, "step": 10555 }, { "epoch": 1.89, "grad_norm": 0.5357837677001953, "learning_rate": 3.4355604075614493e-05, "loss": 2.1718, "step": 10560 }, { "epoch": 1.89, "grad_norm": 0.7515667080879211, "learning_rate": 3.434256411161491e-05, "loss": 1.9217, "step": 10565 }, { "epoch": 1.89, "grad_norm": 0.7363743185997009, "learning_rate": 3.432952119257128e-05, "loss": 1.9062, "step": 10570 }, { "epoch": 1.89, "grad_norm": 0.6471055746078491, "learning_rate": 3.431647532260908e-05, "loss": 2.0589, "step": 10575 }, { "epoch": 1.89, "grad_norm": 0.6963765621185303, "learning_rate": 3.4303426505854695e-05, "loss": 1.9334, "step": 10580 }, { "epoch": 1.89, "grad_norm": 1.006638765335083, "learning_rate": 3.4290374746435454e-05, "loss": 1.8345, "step": 10585 }, { "epoch": 1.9, "grad_norm": 0.5938763618469238, "learning_rate": 3.427732004847961e-05, "loss": 1.7264, "step": 10590 }, { "epoch": 1.9, "grad_norm": 0.7721897959709167, "learning_rate": 3.4264262416116366e-05, "loss": 1.7195, "step": 10595 }, { "epoch": 1.9, "grad_norm": 0.6883012652397156, "learning_rate": 3.425120185347584e-05, "loss": 1.5873, "step": 10600 }, { "epoch": 1.9, "grad_norm": 0.9483616352081299, "learning_rate": 3.423813836468904e-05, "loss": 1.8211, "step": 10605 }, { "epoch": 1.9, "grad_norm": 0.6309101581573486, "learning_rate": 3.4225071953887976e-05, "loss": 1.9336, "step": 10610 }, { "epoch": 1.9, "grad_norm": 0.8639585971832275, "learning_rate": 3.421200262520553e-05, "loss": 2.0485, "step": 10615 }, { "epoch": 1.9, "grad_norm": 0.8056395649909973, "learning_rate": 3.419893038277552e-05, "loss": 1.8187, "step": 10620 }, { "epoch": 1.9, "grad_norm": 1.7254058122634888, "learning_rate": 3.418585523073266e-05, "loss": 1.8431, "step": 10625 }, { "epoch": 1.9, "grad_norm": 0.5998656153678894, "learning_rate": 3.417277717321264e-05, "loss": 1.8515, "step": 10630 }, { "epoch": 1.9, "grad_norm": 0.8533723950386047, "learning_rate": 3.4159696214352014e-05, "loss": 1.9317, "step": 10635 }, { "epoch": 1.9, "grad_norm": 0.44307661056518555, "learning_rate": 3.414661235828829e-05, "loss": 1.7573, "step": 10640 }, { "epoch": 1.91, "grad_norm": 0.5623877644538879, "learning_rate": 3.413352560915988e-05, "loss": 1.9952, "step": 10645 }, { "epoch": 1.91, "grad_norm": 0.5950538516044617, "learning_rate": 3.4120435971106105e-05, "loss": 2.0118, "step": 10650 }, { "epoch": 1.91, "grad_norm": 0.7914562225341797, "learning_rate": 3.410734344826719e-05, "loss": 1.6905, "step": 10655 }, { "epoch": 1.91, "grad_norm": 2.2145016193389893, "learning_rate": 3.409424804478431e-05, "loss": 1.9919, "step": 10660 }, { "epoch": 1.91, "grad_norm": 1.1050101518630981, "learning_rate": 3.408114976479953e-05, "loss": 1.8972, "step": 10665 }, { "epoch": 1.91, "grad_norm": 0.7993802428245544, "learning_rate": 3.406804861245581e-05, "loss": 1.7432, "step": 10670 }, { "epoch": 1.91, "grad_norm": 0.7174950838088989, "learning_rate": 3.4054944591897034e-05, "loss": 1.7941, "step": 10675 }, { "epoch": 1.91, "grad_norm": 2.201014518737793, "learning_rate": 3.4041837707267996e-05, "loss": 1.8024, "step": 10680 }, { "epoch": 1.91, "grad_norm": 0.560925304889679, "learning_rate": 3.402872796271438e-05, "loss": 1.6103, "step": 10685 }, { "epoch": 1.91, "grad_norm": 0.6595713496208191, "learning_rate": 3.4015615362382814e-05, "loss": 1.983, "step": 10690 }, { "epoch": 1.91, "grad_norm": 0.6843532919883728, "learning_rate": 3.400249991042076e-05, "loss": 2.0138, "step": 10695 }, { "epoch": 1.92, "grad_norm": 0.5372428297996521, "learning_rate": 3.398938161097665e-05, "loss": 2.0141, "step": 10700 }, { "epoch": 1.92, "grad_norm": 0.6535273790359497, "learning_rate": 3.3976260468199785e-05, "loss": 2.0042, "step": 10705 }, { "epoch": 1.92, "grad_norm": 0.6673722863197327, "learning_rate": 3.3963136486240366e-05, "loss": 1.8941, "step": 10710 }, { "epoch": 1.92, "grad_norm": 0.6364368200302124, "learning_rate": 3.3950009669249497e-05, "loss": 1.7138, "step": 10715 }, { "epoch": 1.92, "grad_norm": 0.6928640007972717, "learning_rate": 3.393688002137919e-05, "loss": 1.6929, "step": 10720 }, { "epoch": 1.92, "grad_norm": 0.8325533270835876, "learning_rate": 3.392374754678231e-05, "loss": 1.9077, "step": 10725 }, { "epoch": 1.92, "grad_norm": 1.1905181407928467, "learning_rate": 3.3910612249612674e-05, "loss": 1.9971, "step": 10730 }, { "epoch": 1.92, "grad_norm": 0.6571048498153687, "learning_rate": 3.3897474134024953e-05, "loss": 2.2925, "step": 10735 }, { "epoch": 1.92, "grad_norm": 0.5732169151306152, "learning_rate": 3.3884333204174724e-05, "loss": 1.9575, "step": 10740 }, { "epoch": 1.92, "grad_norm": 0.4101000130176544, "learning_rate": 3.3871189464218445e-05, "loss": 2.1045, "step": 10745 }, { "epoch": 1.92, "grad_norm": 0.73232102394104, "learning_rate": 3.385804291831347e-05, "loss": 1.9257, "step": 10750 }, { "epoch": 1.93, "grad_norm": 0.6931615471839905, "learning_rate": 3.384489357061804e-05, "loss": 1.878, "step": 10755 }, { "epoch": 1.93, "grad_norm": 0.5579245686531067, "learning_rate": 3.3831741425291294e-05, "loss": 1.9672, "step": 10760 }, { "epoch": 1.93, "grad_norm": 0.762345016002655, "learning_rate": 3.381858648649322e-05, "loss": 1.8285, "step": 10765 }, { "epoch": 1.93, "grad_norm": 1.3375375270843506, "learning_rate": 3.380542875838472e-05, "loss": 2.3338, "step": 10770 }, { "epoch": 1.93, "grad_norm": 1.3226951360702515, "learning_rate": 3.379226824512758e-05, "loss": 1.9114, "step": 10775 }, { "epoch": 1.93, "grad_norm": 0.7157413363456726, "learning_rate": 3.3779104950884446e-05, "loss": 1.8751, "step": 10780 }, { "epoch": 1.93, "grad_norm": 0.9126617312431335, "learning_rate": 3.376593887981887e-05, "loss": 2.0082, "step": 10785 }, { "epoch": 1.93, "grad_norm": 0.915809690952301, "learning_rate": 3.3752770036095236e-05, "loss": 2.2042, "step": 10790 }, { "epoch": 1.93, "grad_norm": 1.350906252861023, "learning_rate": 3.373959842387888e-05, "loss": 2.0439, "step": 10795 }, { "epoch": 1.93, "grad_norm": 0.6320156455039978, "learning_rate": 3.3726424047335943e-05, "loss": 2.0582, "step": 10800 }, { "epoch": 1.93, "grad_norm": 0.42162302136421204, "learning_rate": 3.3713246910633473e-05, "loss": 2.2003, "step": 10805 }, { "epoch": 1.93, "grad_norm": 0.7747112512588501, "learning_rate": 3.370006701793939e-05, "loss": 1.7979, "step": 10810 }, { "epoch": 1.94, "grad_norm": 0.5144717693328857, "learning_rate": 3.3686884373422465e-05, "loss": 1.8804, "step": 10815 }, { "epoch": 1.94, "grad_norm": 0.6446430683135986, "learning_rate": 3.367369898125238e-05, "loss": 2.1296, "step": 10820 }, { "epoch": 1.94, "grad_norm": 0.71882164478302, "learning_rate": 3.366051084559965e-05, "loss": 2.0616, "step": 10825 }, { "epoch": 1.94, "grad_norm": 1.0291924476623535, "learning_rate": 3.3647319970635665e-05, "loss": 1.9716, "step": 10830 }, { "epoch": 1.94, "grad_norm": 1.1064375638961792, "learning_rate": 3.363412636053269e-05, "loss": 1.7938, "step": 10835 }, { "epoch": 1.94, "grad_norm": 1.4092391729354858, "learning_rate": 3.362093001946386e-05, "loss": 2.3134, "step": 10840 }, { "epoch": 1.94, "grad_norm": 0.5512030124664307, "learning_rate": 3.360773095160315e-05, "loss": 1.8288, "step": 10845 }, { "epoch": 1.94, "grad_norm": 2.6880881786346436, "learning_rate": 3.3594529161125424e-05, "loss": 1.606, "step": 10850 }, { "epoch": 1.94, "grad_norm": 0.32284656167030334, "learning_rate": 3.358132465220639e-05, "loss": 1.8226, "step": 10855 }, { "epoch": 1.94, "grad_norm": 1.3082622289657593, "learning_rate": 3.356811742902262e-05, "loss": 1.9702, "step": 10860 }, { "epoch": 1.94, "grad_norm": 0.579648494720459, "learning_rate": 3.355490749575155e-05, "loss": 1.8468, "step": 10865 }, { "epoch": 1.95, "grad_norm": 0.6377948522567749, "learning_rate": 3.354169485657147e-05, "loss": 2.1329, "step": 10870 }, { "epoch": 1.95, "grad_norm": 0.47102659940719604, "learning_rate": 3.3528479515661514e-05, "loss": 1.7664, "step": 10875 }, { "epoch": 1.95, "grad_norm": 0.959218442440033, "learning_rate": 3.35152614772017e-05, "loss": 1.4881, "step": 10880 }, { "epoch": 1.95, "grad_norm": 0.5902130007743835, "learning_rate": 3.3502040745372874e-05, "loss": 1.9162, "step": 10885 }, { "epoch": 1.95, "grad_norm": 1.0470603704452515, "learning_rate": 3.348881732435673e-05, "loss": 1.4373, "step": 10890 }, { "epoch": 1.95, "grad_norm": 0.8185494542121887, "learning_rate": 3.347559121833584e-05, "loss": 1.8553, "step": 10895 }, { "epoch": 1.95, "grad_norm": 0.7984978556632996, "learning_rate": 3.34623624314936e-05, "loss": 1.7071, "step": 10900 }, { "epoch": 1.95, "grad_norm": 0.5953229665756226, "learning_rate": 3.344913096801425e-05, "loss": 1.9628, "step": 10905 }, { "epoch": 1.95, "grad_norm": 0.4407386779785156, "learning_rate": 3.3435896832082916e-05, "loss": 1.8533, "step": 10910 }, { "epoch": 1.95, "grad_norm": 9.959710121154785, "learning_rate": 3.3422660027885515e-05, "loss": 1.8898, "step": 10915 }, { "epoch": 1.95, "grad_norm": 1.026064157485962, "learning_rate": 3.340942055960886e-05, "loss": 1.9131, "step": 10920 }, { "epoch": 1.96, "grad_norm": 0.8007922172546387, "learning_rate": 3.339617843144057e-05, "loss": 1.7598, "step": 10925 }, { "epoch": 1.96, "grad_norm": 0.5460824966430664, "learning_rate": 3.3382933647569115e-05, "loss": 2.0906, "step": 10930 }, { "epoch": 1.96, "grad_norm": 0.6225911378860474, "learning_rate": 3.33696862121838e-05, "loss": 1.8498, "step": 10935 }, { "epoch": 1.96, "grad_norm": 0.6068547368049622, "learning_rate": 3.33564361294748e-05, "loss": 1.8413, "step": 10940 }, { "epoch": 1.96, "grad_norm": 2.0755717754364014, "learning_rate": 3.334318340363309e-05, "loss": 2.0584, "step": 10945 }, { "epoch": 1.96, "grad_norm": 0.5796096920967102, "learning_rate": 3.332992803885049e-05, "loss": 2.1057, "step": 10950 }, { "epoch": 1.96, "grad_norm": 2.20377254486084, "learning_rate": 3.331667003931967e-05, "loss": 1.8257, "step": 10955 }, { "epoch": 1.96, "grad_norm": 0.8372711539268494, "learning_rate": 3.330340940923411e-05, "loss": 1.9423, "step": 10960 }, { "epoch": 1.96, "grad_norm": 0.5656445622444153, "learning_rate": 3.3290146152788136e-05, "loss": 1.7365, "step": 10965 }, { "epoch": 1.96, "grad_norm": 0.3391028344631195, "learning_rate": 3.3276880274176927e-05, "loss": 1.7596, "step": 10970 }, { "epoch": 1.96, "grad_norm": 1.2373957633972168, "learning_rate": 3.326361177759643e-05, "loss": 1.7008, "step": 10975 }, { "epoch": 1.97, "grad_norm": 0.6096294522285461, "learning_rate": 3.325034066724348e-05, "loss": 1.8067, "step": 10980 }, { "epoch": 1.97, "grad_norm": 0.7483140230178833, "learning_rate": 3.323706694731572e-05, "loss": 1.6209, "step": 10985 }, { "epoch": 1.97, "grad_norm": 0.4939500391483307, "learning_rate": 3.3223790622011595e-05, "loss": 1.7462, "step": 10990 }, { "epoch": 1.97, "grad_norm": 1.016813039779663, "learning_rate": 3.321051169553042e-05, "loss": 1.8239, "step": 10995 }, { "epoch": 1.97, "grad_norm": 0.5926358103752136, "learning_rate": 3.3197230172072294e-05, "loss": 2.0223, "step": 11000 }, { "epoch": 1.97, "grad_norm": 0.7104119062423706, "learning_rate": 3.318394605583813e-05, "loss": 1.7247, "step": 11005 }, { "epoch": 1.97, "grad_norm": 1.1022257804870605, "learning_rate": 3.317065935102972e-05, "loss": 1.6968, "step": 11010 }, { "epoch": 1.97, "grad_norm": 0.7626497745513916, "learning_rate": 3.315737006184961e-05, "loss": 1.9668, "step": 11015 }, { "epoch": 1.97, "grad_norm": 0.606796383857727, "learning_rate": 3.314407819250119e-05, "loss": 1.8814, "step": 11020 }, { "epoch": 1.97, "grad_norm": 0.34029826521873474, "learning_rate": 3.313078374718868e-05, "loss": 1.6823, "step": 11025 }, { "epoch": 1.97, "grad_norm": 0.5755228400230408, "learning_rate": 3.311748673011709e-05, "loss": 1.6421, "step": 11030 }, { "epoch": 1.98, "grad_norm": 0.7753984332084656, "learning_rate": 3.310418714549225e-05, "loss": 2.0978, "step": 11035 }, { "epoch": 1.98, "grad_norm": 0.5059102177619934, "learning_rate": 3.309088499752082e-05, "loss": 2.0625, "step": 11040 }, { "epoch": 1.98, "grad_norm": 0.6016696691513062, "learning_rate": 3.307758029041024e-05, "loss": 1.8624, "step": 11045 }, { "epoch": 1.98, "grad_norm": 0.7924507856369019, "learning_rate": 3.306427302836879e-05, "loss": 1.8362, "step": 11050 }, { "epoch": 1.98, "grad_norm": 0.7843905687332153, "learning_rate": 3.3050963215605526e-05, "loss": 2.0905, "step": 11055 }, { "epoch": 1.98, "grad_norm": 1.0598030090332031, "learning_rate": 3.3037650856330354e-05, "loss": 1.7577, "step": 11060 }, { "epoch": 1.98, "grad_norm": 0.7119802832603455, "learning_rate": 3.3024335954753946e-05, "loss": 1.896, "step": 11065 }, { "epoch": 1.98, "grad_norm": 0.5891833901405334, "learning_rate": 3.301101851508779e-05, "loss": 2.03, "step": 11070 }, { "epoch": 1.98, "grad_norm": 0.3929714262485504, "learning_rate": 3.2997698541544184e-05, "loss": 1.6645, "step": 11075 }, { "epoch": 1.98, "grad_norm": 0.6735256910324097, "learning_rate": 3.298437603833622e-05, "loss": 2.2231, "step": 11080 }, { "epoch": 1.98, "grad_norm": 0.9890113472938538, "learning_rate": 3.29710510096778e-05, "loss": 1.8691, "step": 11085 }, { "epoch": 1.99, "grad_norm": 0.49276089668273926, "learning_rate": 3.295772345978361e-05, "loss": 1.8855, "step": 11090 }, { "epoch": 1.99, "grad_norm": 0.7664940357208252, "learning_rate": 3.2944393392869145e-05, "loss": 2.1009, "step": 11095 }, { "epoch": 1.99, "grad_norm": 0.7249503135681152, "learning_rate": 3.2931060813150685e-05, "loss": 1.8206, "step": 11100 }, { "epoch": 1.99, "grad_norm": 1.5629411935806274, "learning_rate": 3.291772572484533e-05, "loss": 1.7675, "step": 11105 }, { "epoch": 1.99, "grad_norm": 0.38072723150253296, "learning_rate": 3.2904388132170936e-05, "loss": 1.8236, "step": 11110 }, { "epoch": 1.99, "grad_norm": 0.7046849727630615, "learning_rate": 3.2891048039346177e-05, "loss": 1.7859, "step": 11115 }, { "epoch": 1.99, "grad_norm": 0.5897841453552246, "learning_rate": 3.2877705450590526e-05, "loss": 1.9604, "step": 11120 }, { "epoch": 1.99, "grad_norm": 0.6484891772270203, "learning_rate": 3.2864360370124206e-05, "loss": 1.8623, "step": 11125 }, { "epoch": 1.99, "grad_norm": 0.7585556507110596, "learning_rate": 3.285101280216827e-05, "loss": 1.746, "step": 11130 }, { "epoch": 1.99, "grad_norm": 0.6930608153343201, "learning_rate": 3.2837662750944535e-05, "loss": 1.5794, "step": 11135 }, { "epoch": 1.99, "grad_norm": 0.684201717376709, "learning_rate": 3.282431022067561e-05, "loss": 1.8985, "step": 11140 }, { "epoch": 1.99, "grad_norm": 0.8467980027198792, "learning_rate": 3.281095521558488e-05, "loss": 1.8468, "step": 11145 }, { "epoch": 2.0, "grad_norm": 2.533331871032715, "learning_rate": 3.2797597739896545e-05, "loss": 1.6749, "step": 11150 }, { "epoch": 2.0, "grad_norm": 0.5449894666671753, "learning_rate": 3.278423779783554e-05, "loss": 1.7722, "step": 11155 }, { "epoch": 2.0, "grad_norm": 0.5651440024375916, "learning_rate": 3.27708753936276e-05, "loss": 1.8541, "step": 11160 }, { "epoch": 2.0, "grad_norm": 0.9599629640579224, "learning_rate": 3.2757510531499256e-05, "loss": 1.9124, "step": 11165 }, { "epoch": 2.0, "grad_norm": 0.530594527721405, "learning_rate": 3.274414321567779e-05, "loss": 2.0135, "step": 11170 }, { "epoch": 2.0, "grad_norm": 0.8075482249259949, "learning_rate": 3.273077345039127e-05, "loss": 1.8081, "step": 11175 }, { "epoch": 2.0, "grad_norm": 0.5179402232170105, "learning_rate": 3.271740123986856e-05, "loss": 1.8615, "step": 11180 }, { "epoch": 2.0, "grad_norm": 0.6782111525535583, "learning_rate": 3.270402658833924e-05, "loss": 1.9938, "step": 11185 }, { "epoch": 2.0, "grad_norm": 0.5111522674560547, "learning_rate": 3.2690649500033726e-05, "loss": 1.5514, "step": 11190 }, { "epoch": 2.0, "grad_norm": 1.3096226453781128, "learning_rate": 3.267726997918318e-05, "loss": 1.7645, "step": 11195 }, { "epoch": 2.0, "grad_norm": 0.4817447364330292, "learning_rate": 3.266388803001951e-05, "loss": 1.7603, "step": 11200 }, { "epoch": 2.01, "grad_norm": 0.8163765668869019, "learning_rate": 3.2650503656775446e-05, "loss": 1.8734, "step": 11205 }, { "epoch": 2.01, "grad_norm": 0.504024863243103, "learning_rate": 3.2637116863684426e-05, "loss": 1.5327, "step": 11210 }, { "epoch": 2.01, "grad_norm": 0.4556671380996704, "learning_rate": 3.2623727654980686e-05, "loss": 1.9113, "step": 11215 }, { "epoch": 2.01, "grad_norm": 0.9641478657722473, "learning_rate": 3.261033603489923e-05, "loss": 1.8351, "step": 11220 }, { "epoch": 2.01, "grad_norm": 0.5921196937561035, "learning_rate": 3.259694200767579e-05, "loss": 1.9184, "step": 11225 }, { "epoch": 2.01, "grad_norm": 0.8275460004806519, "learning_rate": 3.258354557754691e-05, "loss": 1.7349, "step": 11230 }, { "epoch": 2.01, "grad_norm": 0.7848659753799438, "learning_rate": 3.257014674874986e-05, "loss": 1.7097, "step": 11235 }, { "epoch": 2.01, "grad_norm": 0.5022772550582886, "learning_rate": 3.255674552552267e-05, "loss": 1.7641, "step": 11240 }, { "epoch": 2.01, "grad_norm": 1.0205901861190796, "learning_rate": 3.254334191210414e-05, "loss": 1.9206, "step": 11245 }, { "epoch": 2.01, "grad_norm": 0.4597795009613037, "learning_rate": 3.252993591273382e-05, "loss": 1.9408, "step": 11250 }, { "epoch": 2.01, "grad_norm": 0.4975505769252777, "learning_rate": 3.251652753165202e-05, "loss": 1.9817, "step": 11255 }, { "epoch": 2.02, "grad_norm": 0.8713247776031494, "learning_rate": 3.2503116773099786e-05, "loss": 1.69, "step": 11260 }, { "epoch": 2.02, "grad_norm": 0.6077904105186462, "learning_rate": 3.248970364131894e-05, "loss": 1.8712, "step": 11265 }, { "epoch": 2.02, "grad_norm": 0.489361971616745, "learning_rate": 3.2476288140552026e-05, "loss": 1.7067, "step": 11270 }, { "epoch": 2.02, "grad_norm": 0.7086228728294373, "learning_rate": 3.246287027504237e-05, "loss": 1.7504, "step": 11275 }, { "epoch": 2.02, "grad_norm": 0.7792586088180542, "learning_rate": 3.2449450049034024e-05, "loss": 1.874, "step": 11280 }, { "epoch": 2.02, "grad_norm": 0.6796438097953796, "learning_rate": 3.243602746677179e-05, "loss": 1.8736, "step": 11285 }, { "epoch": 2.02, "grad_norm": 1.0525963306427002, "learning_rate": 3.242260253250122e-05, "loss": 1.7056, "step": 11290 }, { "epoch": 2.02, "grad_norm": 0.3810064196586609, "learning_rate": 3.240917525046862e-05, "loss": 1.7772, "step": 11295 }, { "epoch": 2.02, "grad_norm": 0.3621659278869629, "learning_rate": 3.2395745624921e-05, "loss": 1.8997, "step": 11300 }, { "epoch": 2.02, "grad_norm": 0.7644492387771606, "learning_rate": 3.238231366010616e-05, "loss": 1.7473, "step": 11305 }, { "epoch": 2.02, "grad_norm": 0.8346506953239441, "learning_rate": 3.2368879360272606e-05, "loss": 1.7338, "step": 11310 }, { "epoch": 2.03, "grad_norm": 0.7360906004905701, "learning_rate": 3.235544272966961e-05, "loss": 1.6659, "step": 11315 }, { "epoch": 2.03, "grad_norm": 0.8705644607543945, "learning_rate": 3.2342003772547145e-05, "loss": 1.9399, "step": 11320 }, { "epoch": 2.03, "grad_norm": 1.1617865562438965, "learning_rate": 3.232856249315595e-05, "loss": 1.7525, "step": 11325 }, { "epoch": 2.03, "grad_norm": 0.6664028167724609, "learning_rate": 3.231511889574748e-05, "loss": 1.7085, "step": 11330 }, { "epoch": 2.03, "grad_norm": 0.378801167011261, "learning_rate": 3.230167298457395e-05, "loss": 2.1498, "step": 11335 }, { "epoch": 2.03, "grad_norm": 2.0433850288391113, "learning_rate": 3.2288224763888274e-05, "loss": 1.9798, "step": 11340 }, { "epoch": 2.03, "grad_norm": 0.8449577689170837, "learning_rate": 3.227477423794412e-05, "loss": 1.9539, "step": 11345 }, { "epoch": 2.03, "grad_norm": 1.6414170265197754, "learning_rate": 3.226132141099586e-05, "loss": 1.8589, "step": 11350 }, { "epoch": 2.03, "grad_norm": 0.9575155377388, "learning_rate": 3.2247866287298634e-05, "loss": 1.749, "step": 11355 }, { "epoch": 2.03, "grad_norm": 1.6495996713638306, "learning_rate": 3.2234408871108266e-05, "loss": 1.7329, "step": 11360 }, { "epoch": 2.03, "grad_norm": 0.5812102556228638, "learning_rate": 3.2220949166681335e-05, "loss": 1.9318, "step": 11365 }, { "epoch": 2.04, "grad_norm": 0.8048359155654907, "learning_rate": 3.220748717827513e-05, "loss": 1.853, "step": 11370 }, { "epoch": 2.04, "grad_norm": 0.7342902421951294, "learning_rate": 3.219402291014766e-05, "loss": 1.9342, "step": 11375 }, { "epoch": 2.04, "grad_norm": 0.7486286163330078, "learning_rate": 3.218055636655766e-05, "loss": 2.1832, "step": 11380 }, { "epoch": 2.04, "grad_norm": 1.1646840572357178, "learning_rate": 3.21670875517646e-05, "loss": 1.7443, "step": 11385 }, { "epoch": 2.04, "grad_norm": 1.269112467765808, "learning_rate": 3.215361647002863e-05, "loss": 1.9097, "step": 11390 }, { "epoch": 2.04, "grad_norm": 0.7798238396644592, "learning_rate": 3.2140143125610654e-05, "loss": 1.9486, "step": 11395 }, { "epoch": 2.04, "grad_norm": 0.9379304647445679, "learning_rate": 3.212666752277228e-05, "loss": 1.7346, "step": 11400 }, { "epoch": 2.04, "grad_norm": 1.0831714868545532, "learning_rate": 3.211318966577581e-05, "loss": 2.0097, "step": 11405 }, { "epoch": 2.04, "grad_norm": 0.3674708902835846, "learning_rate": 3.20997095588843e-05, "loss": 2.05, "step": 11410 }, { "epoch": 2.04, "grad_norm": 0.9015005826950073, "learning_rate": 3.20862272063615e-05, "loss": 1.7768, "step": 11415 }, { "epoch": 2.04, "grad_norm": 0.8480490446090698, "learning_rate": 3.207274261247183e-05, "loss": 1.768, "step": 11420 }, { "epoch": 2.05, "grad_norm": 0.48236319422721863, "learning_rate": 3.2059255781480475e-05, "loss": 1.9133, "step": 11425 }, { "epoch": 2.05, "grad_norm": 1.2355780601501465, "learning_rate": 3.204576671765331e-05, "loss": 1.9076, "step": 11430 }, { "epoch": 2.05, "grad_norm": 0.6403537392616272, "learning_rate": 3.2032275425256916e-05, "loss": 1.7401, "step": 11435 }, { "epoch": 2.05, "grad_norm": 0.7580591440200806, "learning_rate": 3.201878190855857e-05, "loss": 1.8118, "step": 11440 }, { "epoch": 2.05, "grad_norm": 0.9600961208343506, "learning_rate": 3.2005286171826256e-05, "loss": 2.1253, "step": 11445 }, { "epoch": 2.05, "grad_norm": 0.496980756521225, "learning_rate": 3.199178821932865e-05, "loss": 1.9104, "step": 11450 }, { "epoch": 2.05, "grad_norm": 0.6483522653579712, "learning_rate": 3.1978288055335164e-05, "loss": 1.7537, "step": 11455 }, { "epoch": 2.05, "grad_norm": 0.5102001428604126, "learning_rate": 3.196478568411589e-05, "loss": 1.8392, "step": 11460 }, { "epoch": 2.05, "grad_norm": 0.5300964713096619, "learning_rate": 3.195128110994159e-05, "loss": 2.1472, "step": 11465 }, { "epoch": 2.05, "grad_norm": 0.7955127358436584, "learning_rate": 3.193777433708376e-05, "loss": 1.9474, "step": 11470 }, { "epoch": 2.05, "grad_norm": 0.7563099265098572, "learning_rate": 3.192426536981459e-05, "loss": 1.9155, "step": 11475 }, { "epoch": 2.05, "grad_norm": 0.6697816252708435, "learning_rate": 3.191075421240694e-05, "loss": 1.9573, "step": 11480 }, { "epoch": 2.06, "grad_norm": 0.7206788063049316, "learning_rate": 3.189724086913438e-05, "loss": 1.8042, "step": 11485 }, { "epoch": 2.06, "grad_norm": 0.4096161127090454, "learning_rate": 3.188372534427117e-05, "loss": 2.0095, "step": 11490 }, { "epoch": 2.06, "grad_norm": 0.6973406672477722, "learning_rate": 3.1870207642092246e-05, "loss": 1.4596, "step": 11495 }, { "epoch": 2.06, "grad_norm": 0.4881574213504791, "learning_rate": 3.185668776687326e-05, "loss": 1.9685, "step": 11500 }, { "epoch": 2.06, "grad_norm": 0.6438494920730591, "learning_rate": 3.184316572289053e-05, "loss": 1.9556, "step": 11505 }, { "epoch": 2.06, "grad_norm": 0.7263823747634888, "learning_rate": 3.182964151442107e-05, "loss": 2.1189, "step": 11510 }, { "epoch": 2.06, "grad_norm": 0.909968912601471, "learning_rate": 3.181611514574255e-05, "loss": 1.7608, "step": 11515 }, { "epoch": 2.06, "grad_norm": 0.6028372645378113, "learning_rate": 3.180258662113338e-05, "loss": 1.7227, "step": 11520 }, { "epoch": 2.06, "grad_norm": 0.7164033651351929, "learning_rate": 3.17890559448726e-05, "loss": 1.978, "step": 11525 }, { "epoch": 2.06, "grad_norm": 0.8094786405563354, "learning_rate": 3.177552312123995e-05, "loss": 1.957, "step": 11530 }, { "epoch": 2.06, "grad_norm": 0.6405372619628906, "learning_rate": 3.1761988154515864e-05, "loss": 1.9312, "step": 11535 }, { "epoch": 2.07, "grad_norm": 0.37368497252464294, "learning_rate": 3.1748451048981424e-05, "loss": 2.1636, "step": 11540 }, { "epoch": 2.07, "grad_norm": 0.5130252242088318, "learning_rate": 3.1734911808918406e-05, "loss": 1.5754, "step": 11545 }, { "epoch": 2.07, "grad_norm": 0.6798210740089417, "learning_rate": 3.172137043860927e-05, "loss": 1.8592, "step": 11550 }, { "epoch": 2.07, "grad_norm": 0.8292738199234009, "learning_rate": 3.170782694233712e-05, "loss": 1.6516, "step": 11555 }, { "epoch": 2.07, "grad_norm": 0.5600850582122803, "learning_rate": 3.169428132438576e-05, "loss": 1.7959, "step": 11560 }, { "epoch": 2.07, "grad_norm": 0.9339403510093689, "learning_rate": 3.168073358903966e-05, "loss": 1.8377, "step": 11565 }, { "epoch": 2.07, "grad_norm": 0.5029419660568237, "learning_rate": 3.166718374058395e-05, "loss": 2.0719, "step": 11570 }, { "epoch": 2.07, "grad_norm": 1.2558355331420898, "learning_rate": 3.165363178330444e-05, "loss": 1.8537, "step": 11575 }, { "epoch": 2.07, "grad_norm": 0.9240514039993286, "learning_rate": 3.16400777214876e-05, "loss": 1.5651, "step": 11580 }, { "epoch": 2.07, "grad_norm": 0.5042521357536316, "learning_rate": 3.1626521559420556e-05, "loss": 1.6923, "step": 11585 }, { "epoch": 2.07, "grad_norm": 0.8794254064559937, "learning_rate": 3.161296330139111e-05, "loss": 1.6979, "step": 11590 }, { "epoch": 2.08, "grad_norm": 0.6789154410362244, "learning_rate": 3.1599402951687744e-05, "loss": 1.9248, "step": 11595 }, { "epoch": 2.08, "grad_norm": 0.4555959403514862, "learning_rate": 3.1585840514599574e-05, "loss": 1.7823, "step": 11600 }, { "epoch": 2.08, "grad_norm": 0.4320109784603119, "learning_rate": 3.1572275994416376e-05, "loss": 1.9124, "step": 11605 }, { "epoch": 2.08, "grad_norm": 0.5282646417617798, "learning_rate": 3.155870939542861e-05, "loss": 1.8452, "step": 11610 }, { "epoch": 2.08, "grad_norm": 1.2354097366333008, "learning_rate": 3.154514072192736e-05, "loss": 2.0078, "step": 11615 }, { "epoch": 2.08, "grad_norm": 0.7362090945243835, "learning_rate": 3.153156997820441e-05, "loss": 1.7717, "step": 11620 }, { "epoch": 2.08, "grad_norm": 0.5220533609390259, "learning_rate": 3.151799716855215e-05, "loss": 1.6834, "step": 11625 }, { "epoch": 2.08, "grad_norm": 0.6556394696235657, "learning_rate": 3.150442229726366e-05, "loss": 1.7327, "step": 11630 }, { "epoch": 2.08, "grad_norm": 0.4095345139503479, "learning_rate": 3.1490845368632645e-05, "loss": 1.852, "step": 11635 }, { "epoch": 2.08, "grad_norm": 0.6544451117515564, "learning_rate": 3.147726638695349e-05, "loss": 1.8067, "step": 11640 }, { "epoch": 2.08, "grad_norm": 0.5166847705841064, "learning_rate": 3.14636853565212e-05, "loss": 1.7826, "step": 11645 }, { "epoch": 2.09, "grad_norm": 0.8674813508987427, "learning_rate": 3.145010228163145e-05, "loss": 1.9693, "step": 11650 }, { "epoch": 2.09, "grad_norm": 0.7544355988502502, "learning_rate": 3.1436517166580565e-05, "loss": 1.6409, "step": 11655 }, { "epoch": 2.09, "grad_norm": 0.8396444916725159, "learning_rate": 3.1422930015665484e-05, "loss": 1.9227, "step": 11660 }, { "epoch": 2.09, "grad_norm": 0.7315942049026489, "learning_rate": 3.140934083318382e-05, "loss": 1.6208, "step": 11665 }, { "epoch": 2.09, "grad_norm": 3.2649600505828857, "learning_rate": 3.139574962343381e-05, "loss": 1.7472, "step": 11670 }, { "epoch": 2.09, "grad_norm": 0.7523483037948608, "learning_rate": 3.138215639071435e-05, "loss": 1.7216, "step": 11675 }, { "epoch": 2.09, "grad_norm": 0.9226972460746765, "learning_rate": 3.1368561139324956e-05, "loss": 1.6117, "step": 11680 }, { "epoch": 2.09, "grad_norm": 1.0279871225357056, "learning_rate": 3.13549638735658e-05, "loss": 2.0684, "step": 11685 }, { "epoch": 2.09, "grad_norm": 0.8119601607322693, "learning_rate": 3.1341364597737686e-05, "loss": 1.7426, "step": 11690 }, { "epoch": 2.09, "grad_norm": 0.5219517350196838, "learning_rate": 3.132776331614205e-05, "loss": 2.1323, "step": 11695 }, { "epoch": 2.09, "grad_norm": 0.8303288221359253, "learning_rate": 3.131416003308097e-05, "loss": 1.7259, "step": 11700 }, { "epoch": 2.1, "grad_norm": 0.494863897562027, "learning_rate": 3.130055475285714e-05, "loss": 1.9558, "step": 11705 }, { "epoch": 2.1, "grad_norm": 1.1602771282196045, "learning_rate": 3.12869474797739e-05, "loss": 1.8207, "step": 11710 }, { "epoch": 2.1, "grad_norm": 0.6153246164321899, "learning_rate": 3.127333821813522e-05, "loss": 1.8699, "step": 11715 }, { "epoch": 2.1, "grad_norm": 0.7908428907394409, "learning_rate": 3.1259726972245694e-05, "loss": 1.8281, "step": 11720 }, { "epoch": 2.1, "grad_norm": 0.8119539618492126, "learning_rate": 3.124611374641056e-05, "loss": 1.8657, "step": 11725 }, { "epoch": 2.1, "grad_norm": 0.5964948534965515, "learning_rate": 3.1232498544935635e-05, "loss": 1.8478, "step": 11730 }, { "epoch": 2.1, "grad_norm": 1.046675682067871, "learning_rate": 3.121888137212742e-05, "loss": 1.7418, "step": 11735 }, { "epoch": 2.1, "grad_norm": 0.6229978799819946, "learning_rate": 3.120526223229302e-05, "loss": 1.8828, "step": 11740 }, { "epoch": 2.1, "grad_norm": 1.7325795888900757, "learning_rate": 3.119164112974014e-05, "loss": 1.6943, "step": 11745 }, { "epoch": 2.1, "grad_norm": 0.5110823512077332, "learning_rate": 3.1178018068777125e-05, "loss": 2.0465, "step": 11750 }, { "epoch": 2.1, "grad_norm": 0.5617272257804871, "learning_rate": 3.1164393053712944e-05, "loss": 2.0785, "step": 11755 }, { "epoch": 2.1, "grad_norm": 0.6913292407989502, "learning_rate": 3.115076608885716e-05, "loss": 1.8045, "step": 11760 }, { "epoch": 2.11, "grad_norm": 1.4254094362258911, "learning_rate": 3.1137137178519985e-05, "loss": 1.7095, "step": 11765 }, { "epoch": 2.11, "grad_norm": 0.7044830322265625, "learning_rate": 3.112350632701222e-05, "loss": 1.9113, "step": 11770 }, { "epoch": 2.11, "grad_norm": 0.994310200214386, "learning_rate": 3.110987353864529e-05, "loss": 1.6473, "step": 11775 }, { "epoch": 2.11, "grad_norm": 3.0846924781799316, "learning_rate": 3.109623881773124e-05, "loss": 1.7661, "step": 11780 }, { "epoch": 2.11, "grad_norm": 1.5573928356170654, "learning_rate": 3.108260216858272e-05, "loss": 1.8612, "step": 11785 }, { "epoch": 2.11, "grad_norm": 1.210252046585083, "learning_rate": 3.106896359551299e-05, "loss": 1.9547, "step": 11790 }, { "epoch": 2.11, "grad_norm": 0.9023343920707703, "learning_rate": 3.1055323102835895e-05, "loss": 1.7782, "step": 11795 }, { "epoch": 2.11, "grad_norm": 21.55355453491211, "learning_rate": 3.1041680694865935e-05, "loss": 1.7996, "step": 11800 }, { "epoch": 2.11, "grad_norm": 0.7996496558189392, "learning_rate": 3.102803637591818e-05, "loss": 2.1707, "step": 11805 }, { "epoch": 2.11, "grad_norm": 0.8086062073707581, "learning_rate": 3.1014390150308326e-05, "loss": 2.1546, "step": 11810 }, { "epoch": 2.11, "grad_norm": 3.054373025894165, "learning_rate": 3.1003471799924244e-05, "loss": 2.0085, "step": 11815 }, { "epoch": 2.12, "grad_norm": 0.8012299537658691, "learning_rate": 3.098982215320005e-05, "loss": 1.6379, "step": 11820 }, { "epoch": 2.12, "grad_norm": 1.008513331413269, "learning_rate": 3.097617061190086e-05, "loss": 1.9324, "step": 11825 }, { "epoch": 2.12, "grad_norm": 0.827659547328949, "learning_rate": 3.096251718034466e-05, "loss": 2.1212, "step": 11830 }, { "epoch": 2.12, "grad_norm": 0.8780943155288696, "learning_rate": 3.0948861862850005e-05, "loss": 1.4271, "step": 11835 }, { "epoch": 2.12, "grad_norm": 0.5806859731674194, "learning_rate": 3.093520466373607e-05, "loss": 1.9781, "step": 11840 }, { "epoch": 2.12, "grad_norm": 0.8726419806480408, "learning_rate": 3.0921545587322605e-05, "loss": 1.7361, "step": 11845 }, { "epoch": 2.12, "grad_norm": 0.8282438516616821, "learning_rate": 3.090788463792996e-05, "loss": 1.4147, "step": 11850 }, { "epoch": 2.12, "grad_norm": 1.0596107244491577, "learning_rate": 3.0894221819879094e-05, "loss": 1.8455, "step": 11855 }, { "epoch": 2.12, "grad_norm": 0.9032155275344849, "learning_rate": 3.0880557137491546e-05, "loss": 1.897, "step": 11860 }, { "epoch": 2.12, "grad_norm": 2.334949016571045, "learning_rate": 3.0866890595089414e-05, "loss": 2.0026, "step": 11865 }, { "epoch": 2.12, "grad_norm": 0.5920724868774414, "learning_rate": 3.085322219699544e-05, "loss": 1.7887, "step": 11870 }, { "epoch": 2.13, "grad_norm": 0.8088483214378357, "learning_rate": 3.083955194753291e-05, "loss": 1.6292, "step": 11875 }, { "epoch": 2.13, "grad_norm": 1.0029832124710083, "learning_rate": 3.082587985102572e-05, "loss": 1.8685, "step": 11880 }, { "epoch": 2.13, "grad_norm": 0.9885581135749817, "learning_rate": 3.0812205911798336e-05, "loss": 1.812, "step": 11885 }, { "epoch": 2.13, "grad_norm": 1.0413260459899902, "learning_rate": 3.079853013417582e-05, "loss": 2.1284, "step": 11890 }, { "epoch": 2.13, "grad_norm": 1.0664329528808594, "learning_rate": 3.078485252248379e-05, "loss": 2.0554, "step": 11895 }, { "epoch": 2.13, "grad_norm": 0.3859197795391083, "learning_rate": 3.077117308104846e-05, "loss": 2.0608, "step": 11900 }, { "epoch": 2.13, "grad_norm": 0.6689034104347229, "learning_rate": 3.0757491814196665e-05, "loss": 1.7033, "step": 11905 }, { "epoch": 2.13, "grad_norm": 0.5232422351837158, "learning_rate": 3.074380872625573e-05, "loss": 1.7849, "step": 11910 }, { "epoch": 2.13, "grad_norm": 0.7212871313095093, "learning_rate": 3.073012382155362e-05, "loss": 1.9207, "step": 11915 }, { "epoch": 2.13, "grad_norm": 0.5604449510574341, "learning_rate": 3.071643710441886e-05, "loss": 1.8573, "step": 11920 }, { "epoch": 2.13, "grad_norm": 1.3720588684082031, "learning_rate": 3.070274857918054e-05, "loss": 1.696, "step": 11925 }, { "epoch": 2.14, "grad_norm": 0.821916937828064, "learning_rate": 3.068905825016834e-05, "loss": 1.5985, "step": 11930 }, { "epoch": 2.14, "grad_norm": 0.8182123899459839, "learning_rate": 3.0675366121712476e-05, "loss": 2.0291, "step": 11935 }, { "epoch": 2.14, "grad_norm": 1.4312397241592407, "learning_rate": 3.066167219814376e-05, "loss": 1.5368, "step": 11940 }, { "epoch": 2.14, "grad_norm": 0.5408018231391907, "learning_rate": 3.0647976483793584e-05, "loss": 1.828, "step": 11945 }, { "epoch": 2.14, "grad_norm": 0.6024120450019836, "learning_rate": 3.063427898299388e-05, "loss": 1.8548, "step": 11950 }, { "epoch": 2.14, "grad_norm": 0.9336116313934326, "learning_rate": 3.062057970007715e-05, "loss": 1.6518, "step": 11955 }, { "epoch": 2.14, "grad_norm": 2.2404088973999023, "learning_rate": 3.060687863937647e-05, "loss": 2.039, "step": 11960 }, { "epoch": 2.14, "grad_norm": 0.6012473106384277, "learning_rate": 3.059317580522546e-05, "loss": 2.2862, "step": 11965 }, { "epoch": 2.14, "grad_norm": 0.5676471590995789, "learning_rate": 3.057947120195833e-05, "loss": 2.1326, "step": 11970 }, { "epoch": 2.14, "grad_norm": 0.6940319538116455, "learning_rate": 3.056576483390983e-05, "loss": 2.0709, "step": 11975 }, { "epoch": 2.14, "grad_norm": 0.5876992344856262, "learning_rate": 3.055205670541527e-05, "loss": 1.8725, "step": 11980 }, { "epoch": 2.15, "grad_norm": 1.3545783758163452, "learning_rate": 3.05383468208105e-05, "loss": 1.7014, "step": 11985 }, { "epoch": 2.15, "grad_norm": 0.659228503704071, "learning_rate": 3.052463518443196e-05, "loss": 1.6168, "step": 11990 }, { "epoch": 2.15, "grad_norm": 0.3598320782184601, "learning_rate": 3.0510921800616633e-05, "loss": 1.812, "step": 11995 }, { "epoch": 2.15, "grad_norm": 1.2681336402893066, "learning_rate": 3.049720667370204e-05, "loss": 1.6559, "step": 12000 }, { "epoch": 2.15, "grad_norm": 0.691615879535675, "learning_rate": 3.048348980802626e-05, "loss": 1.7639, "step": 12005 }, { "epoch": 2.15, "grad_norm": 0.6840038299560547, "learning_rate": 3.0469771207927932e-05, "loss": 1.7349, "step": 12010 }, { "epoch": 2.15, "grad_norm": 0.38991519808769226, "learning_rate": 3.0456050877746228e-05, "loss": 2.1607, "step": 12015 }, { "epoch": 2.15, "grad_norm": 0.6570853590965271, "learning_rate": 3.044232882182088e-05, "loss": 2.0544, "step": 12020 }, { "epoch": 2.15, "grad_norm": 0.6140663027763367, "learning_rate": 3.042860504449217e-05, "loss": 1.7201, "step": 12025 }, { "epoch": 2.15, "grad_norm": 0.6192611455917358, "learning_rate": 3.0414879550100894e-05, "loss": 1.895, "step": 12030 }, { "epoch": 2.15, "grad_norm": 0.8460562825202942, "learning_rate": 3.0401152342988426e-05, "loss": 1.7679, "step": 12035 }, { "epoch": 2.16, "grad_norm": 0.6160430312156677, "learning_rate": 3.0387423427496674e-05, "loss": 1.7706, "step": 12040 }, { "epoch": 2.16, "grad_norm": 0.532014787197113, "learning_rate": 3.0373692807968074e-05, "loss": 2.0207, "step": 12045 }, { "epoch": 2.16, "grad_norm": 0.6476467847824097, "learning_rate": 3.0359960488745598e-05, "loss": 1.6105, "step": 12050 }, { "epoch": 2.16, "grad_norm": 0.520656406879425, "learning_rate": 3.0346226474172783e-05, "loss": 1.8744, "step": 12055 }, { "epoch": 2.16, "grad_norm": 0.8512612581253052, "learning_rate": 3.0332490768593675e-05, "loss": 1.8777, "step": 12060 }, { "epoch": 2.16, "grad_norm": 0.6043909788131714, "learning_rate": 3.0318753376352866e-05, "loss": 1.6042, "step": 12065 }, { "epoch": 2.16, "grad_norm": 1.555930256843567, "learning_rate": 3.030501430179548e-05, "loss": 1.7851, "step": 12070 }, { "epoch": 2.16, "grad_norm": 0.8706346750259399, "learning_rate": 3.0291273549267173e-05, "loss": 1.9402, "step": 12075 }, { "epoch": 2.16, "grad_norm": 0.7135600447654724, "learning_rate": 3.027753112311413e-05, "loss": 2.1418, "step": 12080 }, { "epoch": 2.16, "grad_norm": 1.1376060247421265, "learning_rate": 3.026378702768307e-05, "loss": 1.8637, "step": 12085 }, { "epoch": 2.16, "grad_norm": 0.6475717425346375, "learning_rate": 3.0250041267321232e-05, "loss": 2.0024, "step": 12090 }, { "epoch": 2.16, "grad_norm": 0.3832349181175232, "learning_rate": 3.02362938463764e-05, "loss": 1.837, "step": 12095 }, { "epoch": 2.17, "grad_norm": 1.567743182182312, "learning_rate": 3.0222544769196858e-05, "loss": 1.7733, "step": 12100 }, { "epoch": 2.17, "grad_norm": 0.7041072249412537, "learning_rate": 3.0208794040131426e-05, "loss": 1.5469, "step": 12105 }, { "epoch": 2.17, "grad_norm": 0.7668848633766174, "learning_rate": 3.0195041663529456e-05, "loss": 1.9095, "step": 12110 }, { "epoch": 2.17, "grad_norm": 0.9507167935371399, "learning_rate": 3.01812876437408e-05, "loss": 1.8337, "step": 12115 }, { "epoch": 2.17, "grad_norm": 0.7399880886077881, "learning_rate": 3.0167531985115842e-05, "loss": 1.6083, "step": 12120 }, { "epoch": 2.17, "grad_norm": 0.739559531211853, "learning_rate": 3.0153774692005492e-05, "loss": 1.6946, "step": 12125 }, { "epoch": 2.17, "grad_norm": 0.6560977697372437, "learning_rate": 3.0140015768761164e-05, "loss": 1.874, "step": 12130 }, { "epoch": 2.17, "grad_norm": 0.5614013075828552, "learning_rate": 3.012625521973479e-05, "loss": 1.8449, "step": 12135 }, { "epoch": 2.17, "grad_norm": 0.746187686920166, "learning_rate": 3.011249304927883e-05, "loss": 1.7156, "step": 12140 }, { "epoch": 2.17, "grad_norm": 0.5409942865371704, "learning_rate": 3.0098729261746228e-05, "loss": 2.0243, "step": 12145 }, { "epoch": 2.17, "grad_norm": 1.222755789756775, "learning_rate": 3.0084963861490468e-05, "loss": 1.7754, "step": 12150 }, { "epoch": 2.18, "grad_norm": 1.2106776237487793, "learning_rate": 3.0071196852865528e-05, "loss": 1.9023, "step": 12155 }, { "epoch": 2.18, "grad_norm": 0.8225529193878174, "learning_rate": 3.0057428240225897e-05, "loss": 2.0112, "step": 12160 }, { "epoch": 2.18, "grad_norm": 0.5646623969078064, "learning_rate": 3.0043658027926585e-05, "loss": 1.5788, "step": 12165 }, { "epoch": 2.18, "grad_norm": 0.7176980972290039, "learning_rate": 3.0029886220323082e-05, "loss": 1.9195, "step": 12170 }, { "epoch": 2.18, "grad_norm": 0.6058060526847839, "learning_rate": 3.0016112821771418e-05, "loss": 1.9483, "step": 12175 }, { "epoch": 2.18, "grad_norm": 0.7406783699989319, "learning_rate": 3.000233783662808e-05, "loss": 1.8359, "step": 12180 }, { "epoch": 2.18, "grad_norm": 1.0844707489013672, "learning_rate": 2.998856126925011e-05, "loss": 1.972, "step": 12185 }, { "epoch": 2.18, "grad_norm": 0.5166330933570862, "learning_rate": 2.9974783123995005e-05, "loss": 1.7504, "step": 12190 }, { "epoch": 2.18, "grad_norm": 1.1674580574035645, "learning_rate": 2.9961003405220774e-05, "loss": 1.6967, "step": 12195 }, { "epoch": 2.18, "grad_norm": 1.0376147031784058, "learning_rate": 2.9947222117285945e-05, "loss": 1.8488, "step": 12200 }, { "epoch": 2.18, "grad_norm": 1.062048077583313, "learning_rate": 2.9933439264549518e-05, "loss": 1.6505, "step": 12205 }, { "epoch": 2.19, "grad_norm": 0.7804186344146729, "learning_rate": 2.9919654851370998e-05, "loss": 1.6969, "step": 12210 }, { "epoch": 2.19, "grad_norm": 0.533622145652771, "learning_rate": 2.9905868882110378e-05, "loss": 1.9222, "step": 12215 }, { "epoch": 2.19, "grad_norm": 0.6791996955871582, "learning_rate": 2.989208136112815e-05, "loss": 1.8727, "step": 12220 }, { "epoch": 2.19, "grad_norm": 1.5420125722885132, "learning_rate": 2.9878292292785293e-05, "loss": 1.4943, "step": 12225 }, { "epoch": 2.19, "grad_norm": 0.7872235774993896, "learning_rate": 2.9864501681443276e-05, "loss": 1.5611, "step": 12230 }, { "epoch": 2.19, "grad_norm": 0.7012057900428772, "learning_rate": 2.985070953146406e-05, "loss": 1.6986, "step": 12235 }, { "epoch": 2.19, "grad_norm": 0.6529300212860107, "learning_rate": 2.983691584721008e-05, "loss": 1.6813, "step": 12240 }, { "epoch": 2.19, "grad_norm": 0.9842402935028076, "learning_rate": 2.982312063304427e-05, "loss": 1.9067, "step": 12245 }, { "epoch": 2.19, "grad_norm": 0.594903826713562, "learning_rate": 2.9809323893330043e-05, "loss": 1.6995, "step": 12250 }, { "epoch": 2.19, "grad_norm": 1.0693496465682983, "learning_rate": 2.9795525632431297e-05, "loss": 1.4401, "step": 12255 }, { "epoch": 2.19, "grad_norm": 0.7465589046478271, "learning_rate": 2.978172585471241e-05, "loss": 1.9998, "step": 12260 }, { "epoch": 2.2, "grad_norm": 0.5380420684814453, "learning_rate": 2.976792456453823e-05, "loss": 1.8497, "step": 12265 }, { "epoch": 2.2, "grad_norm": 0.618114709854126, "learning_rate": 2.975412176627409e-05, "loss": 1.7707, "step": 12270 }, { "epoch": 2.2, "grad_norm": 1.5173262357711792, "learning_rate": 2.9740317464285816e-05, "loss": 1.8605, "step": 12275 }, { "epoch": 2.2, "grad_norm": 0.9060313701629639, "learning_rate": 2.9726511662939695e-05, "loss": 1.8223, "step": 12280 }, { "epoch": 2.2, "grad_norm": 1.1166000366210938, "learning_rate": 2.971270436660247e-05, "loss": 1.7321, "step": 12285 }, { "epoch": 2.2, "grad_norm": 0.9780591726303101, "learning_rate": 2.969889557964139e-05, "loss": 1.6634, "step": 12290 }, { "epoch": 2.2, "grad_norm": 1.0793691873550415, "learning_rate": 2.9685085306424154e-05, "loss": 2.0318, "step": 12295 }, { "epoch": 2.2, "grad_norm": 0.39177799224853516, "learning_rate": 2.967127355131894e-05, "loss": 1.8835, "step": 12300 }, { "epoch": 2.2, "grad_norm": 0.9409979581832886, "learning_rate": 2.9657460318694407e-05, "loss": 1.7953, "step": 12305 }, { "epoch": 2.2, "grad_norm": 1.2627776861190796, "learning_rate": 2.9643645612919646e-05, "loss": 1.9271, "step": 12310 }, { "epoch": 2.2, "grad_norm": 0.5884200930595398, "learning_rate": 2.9629829438364248e-05, "loss": 1.6694, "step": 12315 }, { "epoch": 2.21, "grad_norm": 0.5460672378540039, "learning_rate": 2.9616011799398253e-05, "loss": 1.9784, "step": 12320 }, { "epoch": 2.21, "grad_norm": 1.2458994388580322, "learning_rate": 2.9602192700392168e-05, "loss": 2.0202, "step": 12325 }, { "epoch": 2.21, "grad_norm": 1.5217969417572021, "learning_rate": 2.9588372145716948e-05, "loss": 1.7552, "step": 12330 }, { "epoch": 2.21, "grad_norm": 0.5435410737991333, "learning_rate": 2.957455013974404e-05, "loss": 1.8492, "step": 12335 }, { "epoch": 2.21, "grad_norm": 0.8544949293136597, "learning_rate": 2.956072668684532e-05, "loss": 1.9221, "step": 12340 }, { "epoch": 2.21, "grad_norm": 0.841983437538147, "learning_rate": 2.9546901791393134e-05, "loss": 1.8283, "step": 12345 }, { "epoch": 2.21, "grad_norm": 0.6002446413040161, "learning_rate": 2.9533075457760296e-05, "loss": 1.7687, "step": 12350 }, { "epoch": 2.21, "grad_norm": 0.6456916332244873, "learning_rate": 2.951924769032004e-05, "loss": 2.0222, "step": 12355 }, { "epoch": 2.21, "grad_norm": 0.909529447555542, "learning_rate": 2.9505418493446087e-05, "loss": 1.9912, "step": 12360 }, { "epoch": 2.21, "grad_norm": 0.6185993552207947, "learning_rate": 2.9491587871512598e-05, "loss": 1.5441, "step": 12365 }, { "epoch": 2.21, "grad_norm": 0.7631831169128418, "learning_rate": 2.947775582889419e-05, "loss": 2.0657, "step": 12370 }, { "epoch": 2.22, "grad_norm": 0.7581762075424194, "learning_rate": 2.9463922369965917e-05, "loss": 1.6854, "step": 12375 }, { "epoch": 2.22, "grad_norm": 0.6361576318740845, "learning_rate": 2.9450087499103297e-05, "loss": 2.0618, "step": 12380 }, { "epoch": 2.22, "grad_norm": 0.9206913113594055, "learning_rate": 2.9436251220682275e-05, "loss": 1.8886, "step": 12385 }, { "epoch": 2.22, "grad_norm": 0.7134048342704773, "learning_rate": 2.9422413539079263e-05, "loss": 1.6918, "step": 12390 }, { "epoch": 2.22, "grad_norm": 0.76563960313797, "learning_rate": 2.9408574458671106e-05, "loss": 1.9828, "step": 12395 }, { "epoch": 2.22, "grad_norm": 0.9646211862564087, "learning_rate": 2.9394733983835082e-05, "loss": 1.8371, "step": 12400 }, { "epoch": 2.22, "grad_norm": 0.6462224125862122, "learning_rate": 2.938089211894892e-05, "loss": 1.7227, "step": 12405 }, { "epoch": 2.22, "grad_norm": 0.7270526885986328, "learning_rate": 2.9367048868390812e-05, "loss": 1.8023, "step": 12410 }, { "epoch": 2.22, "grad_norm": 0.6550453305244446, "learning_rate": 2.9353204236539334e-05, "loss": 1.9456, "step": 12415 }, { "epoch": 2.22, "grad_norm": 0.5939522385597229, "learning_rate": 2.933935822777355e-05, "loss": 2.0319, "step": 12420 }, { "epoch": 2.22, "grad_norm": 0.8000824451446533, "learning_rate": 2.932551084647293e-05, "loss": 1.7618, "step": 12425 }, { "epoch": 2.22, "grad_norm": 1.1937888860702515, "learning_rate": 2.931166209701739e-05, "loss": 1.9905, "step": 12430 }, { "epoch": 2.23, "grad_norm": 0.8587405681610107, "learning_rate": 2.929781198378727e-05, "loss": 1.8422, "step": 12435 }, { "epoch": 2.23, "grad_norm": 0.4814565181732178, "learning_rate": 2.928396051116336e-05, "loss": 1.8622, "step": 12440 }, { "epoch": 2.23, "grad_norm": 0.6059118509292603, "learning_rate": 2.9270107683526866e-05, "loss": 1.7572, "step": 12445 }, { "epoch": 2.23, "grad_norm": 0.7698599696159363, "learning_rate": 2.9256253505259406e-05, "loss": 1.8953, "step": 12450 }, { "epoch": 2.23, "grad_norm": 1.1599043607711792, "learning_rate": 2.924239798074307e-05, "loss": 1.6808, "step": 12455 }, { "epoch": 2.23, "grad_norm": 0.7120723128318787, "learning_rate": 2.922854111436032e-05, "loss": 1.7554, "step": 12460 }, { "epoch": 2.23, "grad_norm": 0.5135558247566223, "learning_rate": 2.9214682910494096e-05, "loss": 2.0082, "step": 12465 }, { "epoch": 2.23, "grad_norm": 0.6316121816635132, "learning_rate": 2.9200823373527713e-05, "loss": 2.05, "step": 12470 }, { "epoch": 2.23, "grad_norm": 0.9234532713890076, "learning_rate": 2.918696250784494e-05, "loss": 1.6301, "step": 12475 }, { "epoch": 2.23, "grad_norm": 0.7441158294677734, "learning_rate": 2.917310031782995e-05, "loss": 2.0701, "step": 12480 }, { "epoch": 2.23, "grad_norm": 1.686782717704773, "learning_rate": 2.9159236807867345e-05, "loss": 1.6456, "step": 12485 }, { "epoch": 2.24, "grad_norm": 0.481157511472702, "learning_rate": 2.9145371982342145e-05, "loss": 2.0247, "step": 12490 }, { "epoch": 2.24, "grad_norm": 0.3480195999145508, "learning_rate": 2.913150584563977e-05, "loss": 1.7576, "step": 12495 }, { "epoch": 2.24, "grad_norm": 0.9952110052108765, "learning_rate": 2.911763840214607e-05, "loss": 1.8107, "step": 12500 }, { "epoch": 2.24, "grad_norm": 1.2428638935089111, "learning_rate": 2.91037696562473e-05, "loss": 1.8556, "step": 12505 }, { "epoch": 2.24, "grad_norm": 0.8979426026344299, "learning_rate": 2.908989961233014e-05, "loss": 1.8072, "step": 12510 }, { "epoch": 2.24, "grad_norm": 2.5153095722198486, "learning_rate": 2.9076028274781663e-05, "loss": 2.073, "step": 12515 }, { "epoch": 2.24, "grad_norm": 0.7544877529144287, "learning_rate": 2.9062155647989364e-05, "loss": 1.8057, "step": 12520 }, { "epoch": 2.24, "grad_norm": 1.3223302364349365, "learning_rate": 2.904828173634114e-05, "loss": 1.9224, "step": 12525 }, { "epoch": 2.24, "grad_norm": 0.5328083634376526, "learning_rate": 2.90344065442253e-05, "loss": 1.9428, "step": 12530 }, { "epoch": 2.24, "grad_norm": 0.7167631387710571, "learning_rate": 2.902053007603055e-05, "loss": 2.0441, "step": 12535 }, { "epoch": 2.24, "grad_norm": 1.436819314956665, "learning_rate": 2.9006652336146e-05, "loss": 2.0272, "step": 12540 }, { "epoch": 2.25, "grad_norm": 0.5790032148361206, "learning_rate": 2.8992773328961177e-05, "loss": 1.9511, "step": 12545 }, { "epoch": 2.25, "grad_norm": 0.8261239528656006, "learning_rate": 2.8978893058865987e-05, "loss": 1.8447, "step": 12550 }, { "epoch": 2.25, "grad_norm": 0.655191957950592, "learning_rate": 2.896501153025075e-05, "loss": 1.9432, "step": 12555 }, { "epoch": 2.25, "grad_norm": 0.519561231136322, "learning_rate": 2.8951128747506184e-05, "loss": 2.0886, "step": 12560 }, { "epoch": 2.25, "grad_norm": 0.7673883438110352, "learning_rate": 2.8937244715023386e-05, "loss": 1.8421, "step": 12565 }, { "epoch": 2.25, "grad_norm": 0.6510341167449951, "learning_rate": 2.8923359437193877e-05, "loss": 1.6667, "step": 12570 }, { "epoch": 2.25, "grad_norm": 0.6979449987411499, "learning_rate": 2.8909472918409552e-05, "loss": 1.8703, "step": 12575 }, { "epoch": 2.25, "grad_norm": 0.6348164677619934, "learning_rate": 2.88955851630627e-05, "loss": 1.782, "step": 12580 }, { "epoch": 2.25, "grad_norm": 0.9662460088729858, "learning_rate": 2.8881696175546014e-05, "loss": 1.9121, "step": 12585 }, { "epoch": 2.25, "grad_norm": 0.7864895462989807, "learning_rate": 2.886780596025256e-05, "loss": 1.7273, "step": 12590 }, { "epoch": 2.25, "grad_norm": 1.4080077409744263, "learning_rate": 2.8853914521575797e-05, "loss": 1.6424, "step": 12595 }, { "epoch": 2.26, "grad_norm": 0.8357173800468445, "learning_rate": 2.8840021863909583e-05, "loss": 1.9837, "step": 12600 }, { "epoch": 2.26, "grad_norm": 0.4318985641002655, "learning_rate": 2.882612799164815e-05, "loss": 1.7542, "step": 12605 }, { "epoch": 2.26, "grad_norm": 0.6467979550361633, "learning_rate": 2.8812232909186103e-05, "loss": 1.8621, "step": 12610 }, { "epoch": 2.26, "grad_norm": 0.7634948492050171, "learning_rate": 2.8798336620918464e-05, "loss": 1.719, "step": 12615 }, { "epoch": 2.26, "grad_norm": 0.7211525440216064, "learning_rate": 2.878443913124059e-05, "loss": 1.7789, "step": 12620 }, { "epoch": 2.26, "grad_norm": 0.5822710394859314, "learning_rate": 2.8770540444548272e-05, "loss": 1.9444, "step": 12625 }, { "epoch": 2.26, "grad_norm": 0.549856424331665, "learning_rate": 2.8756640565237637e-05, "loss": 2.1138, "step": 12630 }, { "epoch": 2.26, "grad_norm": 1.3974264860153198, "learning_rate": 2.8742739497705208e-05, "loss": 1.9182, "step": 12635 }, { "epoch": 2.26, "grad_norm": 0.5173325538635254, "learning_rate": 2.8728837246347867e-05, "loss": 1.8516, "step": 12640 }, { "epoch": 2.26, "grad_norm": 0.45196789503097534, "learning_rate": 2.8714933815562894e-05, "loss": 1.7245, "step": 12645 }, { "epoch": 2.26, "grad_norm": 0.7211712598800659, "learning_rate": 2.870102920974793e-05, "loss": 1.924, "step": 12650 }, { "epoch": 2.27, "grad_norm": 0.7739723920822144, "learning_rate": 2.868712343330099e-05, "loss": 1.7423, "step": 12655 }, { "epoch": 2.27, "grad_norm": 0.738148033618927, "learning_rate": 2.8673216490620452e-05, "loss": 1.9875, "step": 12660 }, { "epoch": 2.27, "grad_norm": 0.5198020935058594, "learning_rate": 2.8659308386105065e-05, "loss": 1.8309, "step": 12665 }, { "epoch": 2.27, "grad_norm": 0.786413848400116, "learning_rate": 2.864539912415396e-05, "loss": 2.0195, "step": 12670 }, { "epoch": 2.27, "grad_norm": 0.3828994035720825, "learning_rate": 2.8631488709166625e-05, "loss": 1.9273, "step": 12675 }, { "epoch": 2.27, "grad_norm": 0.7168930172920227, "learning_rate": 2.8617577145542902e-05, "loss": 1.8926, "step": 12680 }, { "epoch": 2.27, "grad_norm": 0.955230176448822, "learning_rate": 2.8603664437682998e-05, "loss": 1.6857, "step": 12685 }, { "epoch": 2.27, "grad_norm": 0.8382362127304077, "learning_rate": 2.8589750589987506e-05, "loss": 2.0365, "step": 12690 }, { "epoch": 2.27, "grad_norm": 0.8835332989692688, "learning_rate": 2.8575835606857356e-05, "loss": 2.0321, "step": 12695 }, { "epoch": 2.27, "grad_norm": 0.4743799865245819, "learning_rate": 2.8561919492693845e-05, "loss": 2.0926, "step": 12700 }, { "epoch": 2.27, "grad_norm": 0.7596274614334106, "learning_rate": 2.854800225189862e-05, "loss": 1.8994, "step": 12705 }, { "epoch": 2.28, "grad_norm": 0.7106197476387024, "learning_rate": 2.8534083888873697e-05, "loss": 1.7113, "step": 12710 }, { "epoch": 2.28, "grad_norm": 0.7384430766105652, "learning_rate": 2.8520164408021432e-05, "loss": 1.742, "step": 12715 }, { "epoch": 2.28, "grad_norm": 0.9046937227249146, "learning_rate": 2.850624381374456e-05, "loss": 1.9561, "step": 12720 }, { "epoch": 2.28, "grad_norm": 0.8590306639671326, "learning_rate": 2.8492322110446146e-05, "loss": 1.8945, "step": 12725 }, { "epoch": 2.28, "grad_norm": 0.8791319727897644, "learning_rate": 2.8478399302529597e-05, "loss": 1.8657, "step": 12730 }, { "epoch": 2.28, "grad_norm": 0.6100798845291138, "learning_rate": 2.84644753943987e-05, "loss": 1.8644, "step": 12735 }, { "epoch": 2.28, "grad_norm": 0.6638292670249939, "learning_rate": 2.8450550390457565e-05, "loss": 1.8004, "step": 12740 }, { "epoch": 2.28, "grad_norm": 0.814220666885376, "learning_rate": 2.8436624295110663e-05, "loss": 1.885, "step": 12745 }, { "epoch": 2.28, "grad_norm": 1.095098614692688, "learning_rate": 2.8422697112762803e-05, "loss": 2.1555, "step": 12750 }, { "epoch": 2.28, "grad_norm": 0.6496490836143494, "learning_rate": 2.8408768847819133e-05, "loss": 1.7966, "step": 12755 }, { "epoch": 2.28, "grad_norm": 0.9682343006134033, "learning_rate": 2.8394839504685162e-05, "loss": 1.8505, "step": 12760 }, { "epoch": 2.28, "grad_norm": 0.9972975850105286, "learning_rate": 2.8380909087766717e-05, "loss": 1.7333, "step": 12765 }, { "epoch": 2.29, "grad_norm": 0.9131410121917725, "learning_rate": 2.8366977601469986e-05, "loss": 1.9954, "step": 12770 }, { "epoch": 2.29, "grad_norm": 0.8180422782897949, "learning_rate": 2.8353045050201465e-05, "loss": 1.7182, "step": 12775 }, { "epoch": 2.29, "grad_norm": 0.5822163820266724, "learning_rate": 2.8339111438368032e-05, "loss": 1.6172, "step": 12780 }, { "epoch": 2.29, "grad_norm": 0.9533834457397461, "learning_rate": 2.832517677037686e-05, "loss": 1.971, "step": 12785 }, { "epoch": 2.29, "grad_norm": 0.9467795491218567, "learning_rate": 2.8311241050635467e-05, "loss": 1.8465, "step": 12790 }, { "epoch": 2.29, "grad_norm": 0.5219370722770691, "learning_rate": 2.8297304283551728e-05, "loss": 1.932, "step": 12795 }, { "epoch": 2.29, "grad_norm": 0.45871034264564514, "learning_rate": 2.8283366473533808e-05, "loss": 2.0782, "step": 12800 }, { "epoch": 2.29, "grad_norm": 0.48181986808776855, "learning_rate": 2.8269427624990224e-05, "loss": 2.0225, "step": 12805 }, { "epoch": 2.29, "grad_norm": 0.6003161668777466, "learning_rate": 2.8255487742329838e-05, "loss": 1.7363, "step": 12810 }, { "epoch": 2.29, "grad_norm": 0.6212576031684875, "learning_rate": 2.8241546829961808e-05, "loss": 1.887, "step": 12815 }, { "epoch": 2.29, "grad_norm": 0.5601562261581421, "learning_rate": 2.8227604892295628e-05, "loss": 1.8317, "step": 12820 }, { "epoch": 2.3, "grad_norm": 0.9833465814590454, "learning_rate": 2.8213661933741132e-05, "loss": 1.8634, "step": 12825 }, { "epoch": 2.3, "grad_norm": 4.311710357666016, "learning_rate": 2.8199717958708455e-05, "loss": 1.6882, "step": 12830 }, { "epoch": 2.3, "grad_norm": 0.8071384429931641, "learning_rate": 2.818577297160806e-05, "loss": 2.0808, "step": 12835 }, { "epoch": 2.3, "grad_norm": 0.8173998594284058, "learning_rate": 2.8171826976850753e-05, "loss": 1.7736, "step": 12840 }, { "epoch": 2.3, "grad_norm": 0.7156624794006348, "learning_rate": 2.815787997884761e-05, "loss": 1.89, "step": 12845 }, { "epoch": 2.3, "grad_norm": 1.1727192401885986, "learning_rate": 2.8143931982010068e-05, "loss": 1.8794, "step": 12850 }, { "epoch": 2.3, "grad_norm": 0.5154191255569458, "learning_rate": 2.8129982990749872e-05, "loss": 1.9237, "step": 12855 }, { "epoch": 2.3, "grad_norm": 1.5282955169677734, "learning_rate": 2.8116033009479058e-05, "loss": 1.9519, "step": 12860 }, { "epoch": 2.3, "grad_norm": 0.8198376297950745, "learning_rate": 2.8102082042610007e-05, "loss": 1.8245, "step": 12865 }, { "epoch": 2.3, "grad_norm": 1.2161682844161987, "learning_rate": 2.8088130094555394e-05, "loss": 1.7885, "step": 12870 }, { "epoch": 2.3, "grad_norm": 4.205202579498291, "learning_rate": 2.8074177169728188e-05, "loss": 1.9236, "step": 12875 }, { "epoch": 2.31, "grad_norm": 0.6405089497566223, "learning_rate": 2.8060223272541706e-05, "loss": 1.863, "step": 12880 }, { "epoch": 2.31, "grad_norm": 1.1483161449432373, "learning_rate": 2.8046268407409555e-05, "loss": 1.8218, "step": 12885 }, { "epoch": 2.31, "grad_norm": 1.4012528657913208, "learning_rate": 2.8032312578745634e-05, "loss": 1.7995, "step": 12890 }, { "epoch": 2.31, "grad_norm": 0.5227799415588379, "learning_rate": 2.8018355790964153e-05, "loss": 2.0666, "step": 12895 }, { "epoch": 2.31, "grad_norm": 0.7263579368591309, "learning_rate": 2.8004398048479646e-05, "loss": 1.5815, "step": 12900 }, { "epoch": 2.31, "grad_norm": 0.8712220191955566, "learning_rate": 2.799043935570692e-05, "loss": 1.6493, "step": 12905 }, { "epoch": 2.31, "grad_norm": 0.4584944248199463, "learning_rate": 2.797647971706111e-05, "loss": 2.0632, "step": 12910 }, { "epoch": 2.31, "grad_norm": 0.9673914909362793, "learning_rate": 2.7962519136957625e-05, "loss": 1.7794, "step": 12915 }, { "epoch": 2.31, "grad_norm": 0.5151163339614868, "learning_rate": 2.7948557619812183e-05, "loss": 1.8515, "step": 12920 }, { "epoch": 2.31, "grad_norm": 0.7004354596138, "learning_rate": 2.7934595170040802e-05, "loss": 1.7434, "step": 12925 }, { "epoch": 2.31, "grad_norm": 0.5903719663619995, "learning_rate": 2.7920631792059797e-05, "loss": 1.9251, "step": 12930 }, { "epoch": 2.32, "grad_norm": 1.2344070672988892, "learning_rate": 2.7906667490285765e-05, "loss": 1.7713, "step": 12935 }, { "epoch": 2.32, "grad_norm": 1.2571988105773926, "learning_rate": 2.7892702269135595e-05, "loss": 1.5384, "step": 12940 }, { "epoch": 2.32, "grad_norm": 0.5835108757019043, "learning_rate": 2.787873613302649e-05, "loss": 1.7925, "step": 12945 }, { "epoch": 2.32, "grad_norm": 0.5990691184997559, "learning_rate": 2.7864769086375903e-05, "loss": 1.7678, "step": 12950 }, { "epoch": 2.32, "grad_norm": 1.5972462892532349, "learning_rate": 2.7850801133601616e-05, "loss": 1.8765, "step": 12955 }, { "epoch": 2.32, "grad_norm": 2.4750313758850098, "learning_rate": 2.7836832279121678e-05, "loss": 2.0151, "step": 12960 }, { "epoch": 2.32, "grad_norm": 1.888633131980896, "learning_rate": 2.782286252735441e-05, "loss": 1.9451, "step": 12965 }, { "epoch": 2.32, "grad_norm": 0.58510422706604, "learning_rate": 2.7808891882718445e-05, "loss": 2.0672, "step": 12970 }, { "epoch": 2.32, "grad_norm": 0.7262510657310486, "learning_rate": 2.7794920349632675e-05, "loss": 1.9223, "step": 12975 }, { "epoch": 2.32, "grad_norm": 0.7543483972549438, "learning_rate": 2.7780947932516294e-05, "loss": 1.9088, "step": 12980 }, { "epoch": 2.32, "grad_norm": 5.534985542297363, "learning_rate": 2.7766974635788743e-05, "loss": 2.0211, "step": 12985 }, { "epoch": 2.33, "grad_norm": 0.6918667554855347, "learning_rate": 2.7753000463869794e-05, "loss": 1.7378, "step": 12990 }, { "epoch": 2.33, "grad_norm": 0.7650964260101318, "learning_rate": 2.7739025421179433e-05, "loss": 1.5172, "step": 12995 }, { "epoch": 2.33, "grad_norm": 0.662441611289978, "learning_rate": 2.7725049512137967e-05, "loss": 1.9162, "step": 13000 }, { "epoch": 2.33, "grad_norm": 0.6917517781257629, "learning_rate": 2.771107274116597e-05, "loss": 1.7873, "step": 13005 }, { "epoch": 2.33, "grad_norm": 0.6874393820762634, "learning_rate": 2.769709511268427e-05, "loss": 1.9959, "step": 13010 }, { "epoch": 2.33, "grad_norm": 0.8905332684516907, "learning_rate": 2.7683116631113976e-05, "loss": 1.9593, "step": 13015 }, { "epoch": 2.33, "grad_norm": 1.5240230560302734, "learning_rate": 2.7669137300876467e-05, "loss": 1.6583, "step": 13020 }, { "epoch": 2.33, "grad_norm": 1.5556925535202026, "learning_rate": 2.7655157126393404e-05, "loss": 1.6085, "step": 13025 }, { "epoch": 2.33, "grad_norm": 0.9076879620552063, "learning_rate": 2.7641176112086698e-05, "loss": 1.6375, "step": 13030 }, { "epoch": 2.33, "grad_norm": 0.4439728260040283, "learning_rate": 2.762719426237853e-05, "loss": 1.9813, "step": 13035 }, { "epoch": 2.33, "grad_norm": 0.7149770259857178, "learning_rate": 2.761321158169134e-05, "loss": 1.9144, "step": 13040 }, { "epoch": 2.33, "grad_norm": 0.49863100051879883, "learning_rate": 2.759922807444784e-05, "loss": 2.1206, "step": 13045 }, { "epoch": 2.34, "grad_norm": 1.2692310810089111, "learning_rate": 2.7585243745071003e-05, "loss": 1.9092, "step": 13050 }, { "epoch": 2.34, "grad_norm": 0.9751031398773193, "learning_rate": 2.757125859798405e-05, "loss": 2.052, "step": 13055 }, { "epoch": 2.34, "grad_norm": 0.8686983585357666, "learning_rate": 2.7557272637610483e-05, "loss": 1.8621, "step": 13060 }, { "epoch": 2.34, "grad_norm": 0.48079320788383484, "learning_rate": 2.7543285868374037e-05, "loss": 1.6149, "step": 13065 }, { "epoch": 2.34, "grad_norm": 0.8371298909187317, "learning_rate": 2.7529298294698712e-05, "loss": 1.9314, "step": 13070 }, { "epoch": 2.34, "grad_norm": 0.3321702182292938, "learning_rate": 2.7515309921008775e-05, "loss": 1.8415, "step": 13075 }, { "epoch": 2.34, "grad_norm": 0.8658364415168762, "learning_rate": 2.7501320751728732e-05, "loss": 1.7368, "step": 13080 }, { "epoch": 2.34, "grad_norm": 0.8022560477256775, "learning_rate": 2.7487330791283335e-05, "loss": 1.6356, "step": 13085 }, { "epoch": 2.34, "grad_norm": 1.0247143507003784, "learning_rate": 2.7473340044097605e-05, "loss": 1.8981, "step": 13090 }, { "epoch": 2.34, "grad_norm": 0.5062198042869568, "learning_rate": 2.7459348514596796e-05, "loss": 2.1258, "step": 13095 }, { "epoch": 2.34, "grad_norm": 6.957434177398682, "learning_rate": 2.7445356207206424e-05, "loss": 1.7353, "step": 13100 }, { "epoch": 2.35, "grad_norm": 0.8973393440246582, "learning_rate": 2.7431363126352232e-05, "loss": 1.5304, "step": 13105 }, { "epoch": 2.35, "grad_norm": 0.7698760628700256, "learning_rate": 2.7417369276460215e-05, "loss": 1.8007, "step": 13110 }, { "epoch": 2.35, "grad_norm": 0.6482462882995605, "learning_rate": 2.7403374661956626e-05, "loss": 1.9363, "step": 13115 }, { "epoch": 2.35, "grad_norm": 0.528232216835022, "learning_rate": 2.7389379287267953e-05, "loss": 1.5902, "step": 13120 }, { "epoch": 2.35, "grad_norm": 0.5823779702186584, "learning_rate": 2.7375383156820895e-05, "loss": 1.8967, "step": 13125 }, { "epoch": 2.35, "grad_norm": 1.153917908668518, "learning_rate": 2.7361386275042434e-05, "loss": 1.8673, "step": 13130 }, { "epoch": 2.35, "grad_norm": 1.1044551134109497, "learning_rate": 2.734738864635977e-05, "loss": 1.9955, "step": 13135 }, { "epoch": 2.35, "grad_norm": 0.7324090600013733, "learning_rate": 2.733339027520032e-05, "loss": 1.8696, "step": 13140 }, { "epoch": 2.35, "grad_norm": 0.6724053621292114, "learning_rate": 2.7319391165991786e-05, "loss": 2.0311, "step": 13145 }, { "epoch": 2.35, "grad_norm": 1.5398845672607422, "learning_rate": 2.7305391323162056e-05, "loss": 1.8971, "step": 13150 }, { "epoch": 2.35, "grad_norm": 0.6307987570762634, "learning_rate": 2.7291390751139263e-05, "loss": 1.998, "step": 13155 }, { "epoch": 2.36, "grad_norm": 0.5545775890350342, "learning_rate": 2.7277389454351777e-05, "loss": 1.9609, "step": 13160 }, { "epoch": 2.36, "grad_norm": 0.9437121748924255, "learning_rate": 2.7263387437228204e-05, "loss": 1.9158, "step": 13165 }, { "epoch": 2.36, "grad_norm": 0.5333415865898132, "learning_rate": 2.724938470419736e-05, "loss": 2.0586, "step": 13170 }, { "epoch": 2.36, "grad_norm": 0.7932983636856079, "learning_rate": 2.72353812596883e-05, "loss": 1.9107, "step": 13175 }, { "epoch": 2.36, "grad_norm": 0.802893877029419, "learning_rate": 2.7221377108130292e-05, "loss": 2.0469, "step": 13180 }, { "epoch": 2.36, "grad_norm": 0.4990454614162445, "learning_rate": 2.720737225395284e-05, "loss": 1.9607, "step": 13185 }, { "epoch": 2.36, "grad_norm": 1.1566194295883179, "learning_rate": 2.7193366701585677e-05, "loss": 1.781, "step": 13190 }, { "epoch": 2.36, "grad_norm": 0.773506224155426, "learning_rate": 2.717936045545873e-05, "loss": 1.8001, "step": 13195 }, { "epoch": 2.36, "grad_norm": 0.6509556174278259, "learning_rate": 2.7165353520002167e-05, "loss": 1.9832, "step": 13200 }, { "epoch": 2.36, "grad_norm": 0.9481406807899475, "learning_rate": 2.7151345899646363e-05, "loss": 1.8591, "step": 13205 }, { "epoch": 2.36, "grad_norm": 0.6072911620140076, "learning_rate": 2.7137337598821932e-05, "loss": 1.8942, "step": 13210 }, { "epoch": 2.37, "grad_norm": 0.9392735362052917, "learning_rate": 2.712332862195967e-05, "loss": 1.8107, "step": 13215 }, { "epoch": 2.37, "grad_norm": 0.8217921257019043, "learning_rate": 2.7109318973490606e-05, "loss": 1.8219, "step": 13220 }, { "epoch": 2.37, "grad_norm": 1.2502552270889282, "learning_rate": 2.709530865784598e-05, "loss": 2.1015, "step": 13225 }, { "epoch": 2.37, "grad_norm": 0.7115715742111206, "learning_rate": 2.7081297679457236e-05, "loss": 1.8984, "step": 13230 }, { "epoch": 2.37, "grad_norm": 0.6438332796096802, "learning_rate": 2.706728604275604e-05, "loss": 1.9235, "step": 13235 }, { "epoch": 2.37, "grad_norm": 0.5191622376441956, "learning_rate": 2.705327375217427e-05, "loss": 1.9651, "step": 13240 }, { "epoch": 2.37, "grad_norm": 1.5185312032699585, "learning_rate": 2.703926081214397e-05, "loss": 1.9163, "step": 13245 }, { "epoch": 2.37, "grad_norm": 0.4574000835418701, "learning_rate": 2.702524722709744e-05, "loss": 1.8515, "step": 13250 }, { "epoch": 2.37, "grad_norm": 0.6382564902305603, "learning_rate": 2.7011233001467166e-05, "loss": 2.0203, "step": 13255 }, { "epoch": 2.37, "grad_norm": 0.9020372629165649, "learning_rate": 2.6997218139685825e-05, "loss": 1.7434, "step": 13260 }, { "epoch": 2.37, "grad_norm": 0.9500882625579834, "learning_rate": 2.6983202646186306e-05, "loss": 1.6759, "step": 13265 }, { "epoch": 2.38, "grad_norm": 0.5408741235733032, "learning_rate": 2.69691865254017e-05, "loss": 1.9249, "step": 13270 }, { "epoch": 2.38, "grad_norm": 0.5403386950492859, "learning_rate": 2.695516978176529e-05, "loss": 1.9793, "step": 13275 }, { "epoch": 2.38, "grad_norm": 1.1900765895843506, "learning_rate": 2.694115241971056e-05, "loss": 1.5909, "step": 13280 }, { "epoch": 2.38, "grad_norm": 0.8298088908195496, "learning_rate": 2.6927134443671187e-05, "loss": 1.6979, "step": 13285 }, { "epoch": 2.38, "grad_norm": 0.30263030529022217, "learning_rate": 2.6913115858081038e-05, "loss": 2.0684, "step": 13290 }, { "epoch": 2.38, "grad_norm": 0.6629304885864258, "learning_rate": 2.6899096667374187e-05, "loss": 1.7992, "step": 13295 }, { "epoch": 2.38, "grad_norm": 0.8511559367179871, "learning_rate": 2.6885076875984888e-05, "loss": 1.8706, "step": 13300 }, { "epoch": 2.38, "grad_norm": 0.7309024333953857, "learning_rate": 2.6871056488347583e-05, "loss": 1.6392, "step": 13305 }, { "epoch": 2.38, "grad_norm": 0.796414315700531, "learning_rate": 2.6857035508896917e-05, "loss": 1.8747, "step": 13310 }, { "epoch": 2.38, "grad_norm": 0.9297599792480469, "learning_rate": 2.6843013942067702e-05, "loss": 1.527, "step": 13315 }, { "epoch": 2.38, "grad_norm": 1.2013111114501953, "learning_rate": 2.6828991792294943e-05, "loss": 1.8731, "step": 13320 }, { "epoch": 2.39, "grad_norm": 0.5262635946273804, "learning_rate": 2.6814969064013846e-05, "loss": 1.898, "step": 13325 }, { "epoch": 2.39, "grad_norm": 0.6759445071220398, "learning_rate": 2.6800945761659784e-05, "loss": 1.9081, "step": 13330 }, { "epoch": 2.39, "grad_norm": 0.5417052507400513, "learning_rate": 2.6786921889668303e-05, "loss": 1.689, "step": 13335 }, { "epoch": 2.39, "grad_norm": 0.7009138464927673, "learning_rate": 2.677289745247515e-05, "loss": 1.9229, "step": 13340 }, { "epoch": 2.39, "grad_norm": 0.689923882484436, "learning_rate": 2.675887245451624e-05, "loss": 1.8571, "step": 13345 }, { "epoch": 2.39, "grad_norm": 1.4503270387649536, "learning_rate": 2.6744846900227654e-05, "loss": 1.8241, "step": 13350 }, { "epoch": 2.39, "grad_norm": 1.711902141571045, "learning_rate": 2.673082079404568e-05, "loss": 1.7829, "step": 13355 }, { "epoch": 2.39, "grad_norm": 0.37441059947013855, "learning_rate": 2.6716794140406758e-05, "loss": 1.8407, "step": 13360 }, { "epoch": 2.39, "grad_norm": 0.5013666749000549, "learning_rate": 2.6702766943747493e-05, "loss": 1.9828, "step": 13365 }, { "epoch": 2.39, "grad_norm": 0.7650493383407593, "learning_rate": 2.6688739208504686e-05, "loss": 1.7357, "step": 13370 }, { "epoch": 2.39, "grad_norm": 0.6591855883598328, "learning_rate": 2.667471093911529e-05, "loss": 2.0887, "step": 13375 }, { "epoch": 2.39, "grad_norm": 0.5767456889152527, "learning_rate": 2.6660682140016435e-05, "loss": 1.9841, "step": 13380 }, { "epoch": 2.4, "grad_norm": 0.6293467283248901, "learning_rate": 2.6646652815645412e-05, "loss": 2.0894, "step": 13385 }, { "epoch": 2.4, "grad_norm": 0.4856579601764679, "learning_rate": 2.6632622970439698e-05, "loss": 1.8616, "step": 13390 }, { "epoch": 2.4, "grad_norm": 0.9088009595870972, "learning_rate": 2.6618592608836896e-05, "loss": 1.8978, "step": 13395 }, { "epoch": 2.4, "grad_norm": 1.0228464603424072, "learning_rate": 2.660456173527482e-05, "loss": 1.7805, "step": 13400 }, { "epoch": 2.4, "grad_norm": 0.8853181004524231, "learning_rate": 2.6590530354191405e-05, "loss": 1.683, "step": 13405 }, { "epoch": 2.4, "grad_norm": 0.7312606573104858, "learning_rate": 2.657649847002477e-05, "loss": 1.9513, "step": 13410 }, { "epoch": 2.4, "grad_norm": 0.7548524737358093, "learning_rate": 2.6562466087213188e-05, "loss": 1.7518, "step": 13415 }, { "epoch": 2.4, "grad_norm": 1.1628096103668213, "learning_rate": 2.654843321019508e-05, "loss": 1.9882, "step": 13420 }, { "epoch": 2.4, "grad_norm": 0.8537325263023376, "learning_rate": 2.653439984340904e-05, "loss": 2.1221, "step": 13425 }, { "epoch": 2.4, "grad_norm": 0.9286057949066162, "learning_rate": 2.65203659912938e-05, "loss": 1.7583, "step": 13430 }, { "epoch": 2.4, "grad_norm": 0.7813141942024231, "learning_rate": 2.650633165828827e-05, "loss": 1.7862, "step": 13435 }, { "epoch": 2.41, "grad_norm": 0.770072877407074, "learning_rate": 2.6492296848831476e-05, "loss": 1.7671, "step": 13440 }, { "epoch": 2.41, "grad_norm": 0.7098547220230103, "learning_rate": 2.6478261567362628e-05, "loss": 2.02, "step": 13445 }, { "epoch": 2.41, "grad_norm": 1.0164227485656738, "learning_rate": 2.646422581832107e-05, "loss": 1.6729, "step": 13450 }, { "epoch": 2.41, "grad_norm": 1.548316478729248, "learning_rate": 2.645018960614629e-05, "loss": 1.694, "step": 13455 }, { "epoch": 2.41, "grad_norm": 0.7778927683830261, "learning_rate": 2.6436152935277936e-05, "loss": 1.775, "step": 13460 }, { "epoch": 2.41, "grad_norm": 0.7858615517616272, "learning_rate": 2.642211581015579e-05, "loss": 1.6574, "step": 13465 }, { "epoch": 2.41, "grad_norm": 0.9521759748458862, "learning_rate": 2.640807823521978e-05, "loss": 1.7402, "step": 13470 }, { "epoch": 2.41, "grad_norm": 1.4299689531326294, "learning_rate": 2.6394040214909976e-05, "loss": 1.7257, "step": 13475 }, { "epoch": 2.41, "grad_norm": 0.9395617246627808, "learning_rate": 2.6380001753666593e-05, "loss": 1.8034, "step": 13480 }, { "epoch": 2.41, "grad_norm": 0.4695361256599426, "learning_rate": 2.636596285592999e-05, "loss": 1.8073, "step": 13485 }, { "epoch": 2.41, "grad_norm": 0.31577566266059875, "learning_rate": 2.635192352614064e-05, "loss": 1.9157, "step": 13490 }, { "epoch": 2.42, "grad_norm": 0.4319940507411957, "learning_rate": 2.6337883768739192e-05, "loss": 1.969, "step": 13495 }, { "epoch": 2.42, "grad_norm": 0.9783098101615906, "learning_rate": 2.632384358816638e-05, "loss": 2.0406, "step": 13500 }, { "epoch": 2.42, "grad_norm": 1.060631513595581, "learning_rate": 2.6309802988863124e-05, "loss": 1.7076, "step": 13505 }, { "epoch": 2.42, "grad_norm": 0.9120562672615051, "learning_rate": 2.6295761975270432e-05, "loss": 1.958, "step": 13510 }, { "epoch": 2.42, "grad_norm": 0.6509984135627747, "learning_rate": 2.628172055182948e-05, "loss": 1.9328, "step": 13515 }, { "epoch": 2.42, "grad_norm": 0.6889690160751343, "learning_rate": 2.626767872298156e-05, "loss": 2.1166, "step": 13520 }, { "epoch": 2.42, "grad_norm": 1.167043924331665, "learning_rate": 2.6253636493168065e-05, "loss": 1.9546, "step": 13525 }, { "epoch": 2.42, "grad_norm": 0.5428257584571838, "learning_rate": 2.623959386683056e-05, "loss": 1.8359, "step": 13530 }, { "epoch": 2.42, "grad_norm": 0.911302387714386, "learning_rate": 2.6225550848410706e-05, "loss": 1.7099, "step": 13535 }, { "epoch": 2.42, "grad_norm": 0.8027520775794983, "learning_rate": 2.6211507442350303e-05, "loss": 1.9308, "step": 13540 }, { "epoch": 2.42, "grad_norm": 0.5085805058479309, "learning_rate": 2.619746365309125e-05, "loss": 1.7125, "step": 13545 }, { "epoch": 2.43, "grad_norm": 0.828308641910553, "learning_rate": 2.6183419485075605e-05, "loss": 2.0749, "step": 13550 }, { "epoch": 2.43, "grad_norm": 1.7583789825439453, "learning_rate": 2.6169374942745505e-05, "loss": 1.8622, "step": 13555 }, { "epoch": 2.43, "grad_norm": 0.5300992131233215, "learning_rate": 2.615533003054324e-05, "loss": 1.4851, "step": 13560 }, { "epoch": 2.43, "grad_norm": 0.7071987986564636, "learning_rate": 2.6141284752911205e-05, "loss": 1.7153, "step": 13565 }, { "epoch": 2.43, "grad_norm": 1.1922173500061035, "learning_rate": 2.612723911429189e-05, "loss": 1.8535, "step": 13570 }, { "epoch": 2.43, "grad_norm": 0.631430447101593, "learning_rate": 2.6113193119127934e-05, "loss": 2.0645, "step": 13575 }, { "epoch": 2.43, "grad_norm": 0.7159907817840576, "learning_rate": 2.6099146771862064e-05, "loss": 1.748, "step": 13580 }, { "epoch": 2.43, "grad_norm": 0.622367262840271, "learning_rate": 2.6085100076937126e-05, "loss": 1.5421, "step": 13585 }, { "epoch": 2.43, "grad_norm": 0.9370682239532471, "learning_rate": 2.6071053038796077e-05, "loss": 1.8222, "step": 13590 }, { "epoch": 2.43, "grad_norm": 0.9740180969238281, "learning_rate": 2.6057005661881982e-05, "loss": 1.6439, "step": 13595 }, { "epoch": 2.43, "grad_norm": 1.1773573160171509, "learning_rate": 2.6042957950638013e-05, "loss": 1.9287, "step": 13600 }, { "epoch": 2.44, "grad_norm": 0.8247451186180115, "learning_rate": 2.602890990950745e-05, "loss": 1.8199, "step": 13605 }, { "epoch": 2.44, "grad_norm": 0.7261055111885071, "learning_rate": 2.6014861542933676e-05, "loss": 1.7212, "step": 13610 }, { "epoch": 2.44, "grad_norm": 0.8083186149597168, "learning_rate": 2.600081285536016e-05, "loss": 1.9574, "step": 13615 }, { "epoch": 2.44, "grad_norm": 0.9959443211555481, "learning_rate": 2.5986763851230506e-05, "loss": 1.982, "step": 13620 }, { "epoch": 2.44, "grad_norm": 0.7056101560592651, "learning_rate": 2.5972714534988403e-05, "loss": 1.7576, "step": 13625 }, { "epoch": 2.44, "grad_norm": 0.7754709720611572, "learning_rate": 2.5958664911077614e-05, "loss": 1.8267, "step": 13630 }, { "epoch": 2.44, "grad_norm": 0.6835538744926453, "learning_rate": 2.5944614983942044e-05, "loss": 1.7875, "step": 13635 }, { "epoch": 2.44, "grad_norm": 0.8215372562408447, "learning_rate": 2.5930564758025665e-05, "loss": 1.462, "step": 13640 }, { "epoch": 2.44, "grad_norm": 0.608349621295929, "learning_rate": 2.5916514237772543e-05, "loss": 1.9472, "step": 13645 }, { "epoch": 2.44, "grad_norm": 0.9655773639678955, "learning_rate": 2.590246342762685e-05, "loss": 1.7016, "step": 13650 }, { "epoch": 2.44, "grad_norm": 0.6545687317848206, "learning_rate": 2.5888412332032847e-05, "loss": 1.7792, "step": 13655 }, { "epoch": 2.45, "grad_norm": 0.8544228672981262, "learning_rate": 2.587436095543488e-05, "loss": 1.7444, "step": 13660 }, { "epoch": 2.45, "grad_norm": 0.4463987946510315, "learning_rate": 2.5860309302277373e-05, "loss": 1.8348, "step": 13665 }, { "epoch": 2.45, "grad_norm": 0.6795549392700195, "learning_rate": 2.5846257377004874e-05, "loss": 1.95, "step": 13670 }, { "epoch": 2.45, "grad_norm": 0.8079219460487366, "learning_rate": 2.583220518406197e-05, "loss": 1.7795, "step": 13675 }, { "epoch": 2.45, "grad_norm": 0.4398018419742584, "learning_rate": 2.581815272789337e-05, "loss": 2.0165, "step": 13680 }, { "epoch": 2.45, "grad_norm": 1.9223191738128662, "learning_rate": 2.5804100012943855e-05, "loss": 1.7688, "step": 13685 }, { "epoch": 2.45, "grad_norm": 0.9821711778640747, "learning_rate": 2.579004704365827e-05, "loss": 1.8864, "step": 13690 }, { "epoch": 2.45, "grad_norm": 0.7216715812683105, "learning_rate": 2.5775993824481565e-05, "loss": 1.8994, "step": 13695 }, { "epoch": 2.45, "grad_norm": 0.4654800295829773, "learning_rate": 2.5761940359858767e-05, "loss": 2.0082, "step": 13700 }, { "epoch": 2.45, "grad_norm": 1.3378876447677612, "learning_rate": 2.5747886654234967e-05, "loss": 1.799, "step": 13705 }, { "epoch": 2.45, "grad_norm": 1.4554535150527954, "learning_rate": 2.5733832712055328e-05, "loss": 1.8686, "step": 13710 }, { "epoch": 2.45, "grad_norm": 0.4863360822200775, "learning_rate": 2.5719778537765115e-05, "loss": 2.0027, "step": 13715 }, { "epoch": 2.46, "grad_norm": 6.204029560089111, "learning_rate": 2.570572413580964e-05, "loss": 2.0457, "step": 13720 }, { "epoch": 2.46, "grad_norm": 1.010169267654419, "learning_rate": 2.5691669510634302e-05, "loss": 1.829, "step": 13725 }, { "epoch": 2.46, "grad_norm": 0.463751882314682, "learning_rate": 2.5677614666684567e-05, "loss": 1.9938, "step": 13730 }, { "epoch": 2.46, "grad_norm": 0.6773427724838257, "learning_rate": 2.566355960840596e-05, "loss": 2.0168, "step": 13735 }, { "epoch": 2.46, "grad_norm": 0.6346594095230103, "learning_rate": 2.5649504340244085e-05, "loss": 1.7716, "step": 13740 }, { "epoch": 2.46, "grad_norm": 0.40613624453544617, "learning_rate": 2.5635448866644613e-05, "loss": 1.7708, "step": 13745 }, { "epoch": 2.46, "grad_norm": 0.7240647077560425, "learning_rate": 2.5621393192053278e-05, "loss": 2.1106, "step": 13750 }, { "epoch": 2.46, "grad_norm": 0.6415520906448364, "learning_rate": 2.560733732091587e-05, "loss": 1.5509, "step": 13755 }, { "epoch": 2.46, "grad_norm": 0.5846617221832275, "learning_rate": 2.559328125767826e-05, "loss": 1.7258, "step": 13760 }, { "epoch": 2.46, "grad_norm": 0.5477493405342102, "learning_rate": 2.557922500678635e-05, "loss": 1.7324, "step": 13765 }, { "epoch": 2.46, "grad_norm": 0.5833098292350769, "learning_rate": 2.556516857268613e-05, "loss": 1.9903, "step": 13770 }, { "epoch": 2.47, "grad_norm": 0.5727924108505249, "learning_rate": 2.555111195982364e-05, "loss": 1.9487, "step": 13775 }, { "epoch": 2.47, "grad_norm": 0.5207077264785767, "learning_rate": 2.553705517264496e-05, "loss": 1.9171, "step": 13780 }, { "epoch": 2.47, "grad_norm": 0.6740149259567261, "learning_rate": 2.5522998215596245e-05, "loss": 1.8721, "step": 13785 }, { "epoch": 2.47, "grad_norm": 2.3125998973846436, "learning_rate": 2.55089410931237e-05, "loss": 1.8894, "step": 13790 }, { "epoch": 2.47, "grad_norm": 0.8759080767631531, "learning_rate": 2.5494883809673576e-05, "loss": 1.7691, "step": 13795 }, { "epoch": 2.47, "grad_norm": 0.6559151411056519, "learning_rate": 2.5480826369692178e-05, "loss": 1.8659, "step": 13800 }, { "epoch": 2.47, "grad_norm": 1.1714446544647217, "learning_rate": 2.5466768777625872e-05, "loss": 1.8289, "step": 13805 }, { "epoch": 2.47, "grad_norm": 1.3722327947616577, "learning_rate": 2.5452711037921033e-05, "loss": 1.5297, "step": 13810 }, { "epoch": 2.47, "grad_norm": 0.6491648554801941, "learning_rate": 2.5438653155024145e-05, "loss": 1.8004, "step": 13815 }, { "epoch": 2.47, "grad_norm": 0.7832010984420776, "learning_rate": 2.542459513338168e-05, "loss": 1.7827, "step": 13820 }, { "epoch": 2.47, "grad_norm": 0.854123055934906, "learning_rate": 2.541053697744018e-05, "loss": 2.0299, "step": 13825 }, { "epoch": 2.48, "grad_norm": 0.6451117992401123, "learning_rate": 2.5396478691646227e-05, "loss": 1.848, "step": 13830 }, { "epoch": 2.48, "grad_norm": 0.6874570846557617, "learning_rate": 2.538242028044645e-05, "loss": 1.7095, "step": 13835 }, { "epoch": 2.48, "grad_norm": 2.2778282165527344, "learning_rate": 2.53683617482875e-05, "loss": 1.7892, "step": 13840 }, { "epoch": 2.48, "grad_norm": 0.7223300337791443, "learning_rate": 2.535430309961609e-05, "loss": 1.8236, "step": 13845 }, { "epoch": 2.48, "grad_norm": 0.7660056948661804, "learning_rate": 2.5340244338878948e-05, "loss": 2.1062, "step": 13850 }, { "epoch": 2.48, "grad_norm": 0.7210371494293213, "learning_rate": 2.5326185470522852e-05, "loss": 1.7827, "step": 13855 }, { "epoch": 2.48, "grad_norm": 0.532279372215271, "learning_rate": 2.5312126498994603e-05, "loss": 1.7299, "step": 13860 }, { "epoch": 2.48, "grad_norm": 1.1334110498428345, "learning_rate": 2.5298067428741034e-05, "loss": 1.8157, "step": 13865 }, { "epoch": 2.48, "grad_norm": 2.2065303325653076, "learning_rate": 2.5284008264209037e-05, "loss": 1.842, "step": 13870 }, { "epoch": 2.48, "grad_norm": 1.4892088174819946, "learning_rate": 2.5269949009845495e-05, "loss": 2.1346, "step": 13875 }, { "epoch": 2.48, "grad_norm": 0.8326558470726013, "learning_rate": 2.525588967009734e-05, "loss": 1.7902, "step": 13880 }, { "epoch": 2.49, "grad_norm": 0.9677508473396301, "learning_rate": 2.5241830249411523e-05, "loss": 1.4476, "step": 13885 }, { "epoch": 2.49, "grad_norm": 0.45370617508888245, "learning_rate": 2.5227770752235043e-05, "loss": 2.0553, "step": 13890 }, { "epoch": 2.49, "grad_norm": 0.6809149980545044, "learning_rate": 2.5213711183014888e-05, "loss": 1.8837, "step": 13895 }, { "epoch": 2.49, "grad_norm": 0.7268977165222168, "learning_rate": 2.5199651546198084e-05, "loss": 1.9655, "step": 13900 }, { "epoch": 2.49, "grad_norm": 0.5356599688529968, "learning_rate": 2.51855918462317e-05, "loss": 1.7184, "step": 13905 }, { "epoch": 2.49, "grad_norm": 0.551061749458313, "learning_rate": 2.5171532087562782e-05, "loss": 1.8176, "step": 13910 }, { "epoch": 2.49, "grad_norm": 0.8018007278442383, "learning_rate": 2.5157472274638437e-05, "loss": 2.0117, "step": 13915 }, { "epoch": 2.49, "grad_norm": 1.2345116138458252, "learning_rate": 2.5143412411905766e-05, "loss": 1.6238, "step": 13920 }, { "epoch": 2.49, "grad_norm": 0.5574769377708435, "learning_rate": 2.5129352503811876e-05, "loss": 1.8545, "step": 13925 }, { "epoch": 2.49, "grad_norm": 0.5945510268211365, "learning_rate": 2.5115292554803915e-05, "loss": 1.8348, "step": 13930 }, { "epoch": 2.49, "grad_norm": 1.0902730226516724, "learning_rate": 2.5101232569329042e-05, "loss": 1.7905, "step": 13935 }, { "epoch": 2.5, "grad_norm": 0.5143458843231201, "learning_rate": 2.50871725518344e-05, "loss": 1.7065, "step": 13940 }, { "epoch": 2.5, "grad_norm": 1.1873408555984497, "learning_rate": 2.5073112506767156e-05, "loss": 1.9102, "step": 13945 }, { "epoch": 2.5, "grad_norm": 0.836678147315979, "learning_rate": 2.5059052438574504e-05, "loss": 1.9174, "step": 13950 }, { "epoch": 2.5, "grad_norm": 0.9396674036979675, "learning_rate": 2.5044992351703617e-05, "loss": 1.8286, "step": 13955 }, { "epoch": 2.5, "grad_norm": 0.3221425712108612, "learning_rate": 2.5030932250601696e-05, "loss": 2.0022, "step": 13960 }, { "epoch": 2.5, "grad_norm": 1.404722809791565, "learning_rate": 2.5016872139715934e-05, "loss": 1.8733, "step": 13965 }, { "epoch": 2.5, "grad_norm": 1.7955238819122314, "learning_rate": 2.5002812023493526e-05, "loss": 1.7287, "step": 13970 }, { "epoch": 2.5, "grad_norm": 0.7894271016120911, "learning_rate": 2.498875190638168e-05, "loss": 1.9474, "step": 13975 }, { "epoch": 2.5, "grad_norm": 0.6397268176078796, "learning_rate": 2.4974691792827598e-05, "loss": 1.7299, "step": 13980 }, { "epoch": 2.5, "grad_norm": 0.6210111975669861, "learning_rate": 2.4960631687278475e-05, "loss": 1.9172, "step": 13985 }, { "epoch": 2.5, "grad_norm": 0.5007354617118835, "learning_rate": 2.494657159418151e-05, "loss": 2.2217, "step": 13990 }, { "epoch": 2.51, "grad_norm": 0.6257606744766235, "learning_rate": 2.4932511517983906e-05, "loss": 1.8129, "step": 13995 }, { "epoch": 2.51, "grad_norm": 0.7970540523529053, "learning_rate": 2.4918451463132843e-05, "loss": 1.7572, "step": 14000 }, { "epoch": 2.51, "grad_norm": 0.6435466408729553, "learning_rate": 2.4904391434075495e-05, "loss": 1.827, "step": 14005 }, { "epoch": 2.51, "grad_norm": 1.1177036762237549, "learning_rate": 2.489033143525905e-05, "loss": 1.8556, "step": 14010 }, { "epoch": 2.51, "grad_norm": 1.462189793586731, "learning_rate": 2.4876271471130664e-05, "loss": 1.8425, "step": 14015 }, { "epoch": 2.51, "grad_norm": 0.4156283438205719, "learning_rate": 2.486221154613749e-05, "loss": 1.9366, "step": 14020 }, { "epoch": 2.51, "grad_norm": 1.2464938163757324, "learning_rate": 2.484815166472668e-05, "loss": 1.6903, "step": 14025 }, { "epoch": 2.51, "grad_norm": 0.604915201663971, "learning_rate": 2.4834091831345343e-05, "loss": 1.6903, "step": 14030 }, { "epoch": 2.51, "grad_norm": 0.5225396156311035, "learning_rate": 2.4820032050440602e-05, "loss": 1.7955, "step": 14035 }, { "epoch": 2.51, "grad_norm": 0.6467166543006897, "learning_rate": 2.4805972326459555e-05, "loss": 1.9819, "step": 14040 }, { "epoch": 2.51, "grad_norm": 1.0003973245620728, "learning_rate": 2.479191266384928e-05, "loss": 1.5891, "step": 14045 }, { "epoch": 2.51, "grad_norm": 1.0571430921554565, "learning_rate": 2.4777853067056824e-05, "loss": 1.7885, "step": 14050 }, { "epoch": 2.52, "grad_norm": 0.7798799872398376, "learning_rate": 2.4763793540529226e-05, "loss": 1.5714, "step": 14055 }, { "epoch": 2.52, "grad_norm": 0.5746772289276123, "learning_rate": 2.4749734088713502e-05, "loss": 1.8001, "step": 14060 }, { "epoch": 2.52, "grad_norm": 1.2530102729797363, "learning_rate": 2.473567471605665e-05, "loss": 1.8787, "step": 14065 }, { "epoch": 2.52, "grad_norm": 0.9434322714805603, "learning_rate": 2.4721615427005628e-05, "loss": 2.0167, "step": 14070 }, { "epoch": 2.52, "grad_norm": 1.5446343421936035, "learning_rate": 2.4707556226007377e-05, "loss": 1.5517, "step": 14075 }, { "epoch": 2.52, "grad_norm": 0.794992983341217, "learning_rate": 2.469349711750881e-05, "loss": 2.0354, "step": 14080 }, { "epoch": 2.52, "grad_norm": 0.5005180239677429, "learning_rate": 2.467943810595682e-05, "loss": 1.7495, "step": 14085 }, { "epoch": 2.52, "grad_norm": 1.5188912153244019, "learning_rate": 2.4665379195798247e-05, "loss": 1.722, "step": 14090 }, { "epoch": 2.52, "grad_norm": 3.6539127826690674, "learning_rate": 2.4651320391479902e-05, "loss": 1.621, "step": 14095 }, { "epoch": 2.52, "grad_norm": 0.638843297958374, "learning_rate": 2.463726169744859e-05, "loss": 1.8223, "step": 14100 }, { "epoch": 2.52, "grad_norm": 0.6514829397201538, "learning_rate": 2.462320311815105e-05, "loss": 1.6494, "step": 14105 }, { "epoch": 2.53, "grad_norm": 1.295366883277893, "learning_rate": 2.4609144658034e-05, "loss": 1.8201, "step": 14110 }, { "epoch": 2.53, "grad_norm": 0.8184346556663513, "learning_rate": 2.4595086321544125e-05, "loss": 2.0595, "step": 14115 }, { "epoch": 2.53, "grad_norm": 1.1989121437072754, "learning_rate": 2.458102811312805e-05, "loss": 1.9181, "step": 14120 }, { "epoch": 2.53, "grad_norm": 1.7824451923370361, "learning_rate": 2.4566970037232385e-05, "loss": 1.7497, "step": 14125 }, { "epoch": 2.53, "grad_norm": 0.6604727506637573, "learning_rate": 2.4552912098303675e-05, "loss": 2.0074, "step": 14130 }, { "epoch": 2.53, "grad_norm": 0.795128583908081, "learning_rate": 2.453885430078845e-05, "loss": 2.2564, "step": 14135 }, { "epoch": 2.53, "grad_norm": Infinity, "learning_rate": 2.4527608167581993e-05, "loss": 2.0231, "step": 14140 }, { "epoch": 2.53, "grad_norm": 0.7836667895317078, "learning_rate": 2.451355063581608e-05, "loss": 1.9791, "step": 14145 }, { "epoch": 2.53, "grad_norm": 1.0901525020599365, "learning_rate": 2.4499493257913634e-05, "loss": 1.497, "step": 14150 }, { "epoch": 2.53, "grad_norm": 2.3279173374176025, "learning_rate": 2.4485436038320983e-05, "loss": 2.0759, "step": 14155 }, { "epoch": 2.53, "grad_norm": 1.8435465097427368, "learning_rate": 2.447137898148443e-05, "loss": 2.0998, "step": 14160 }, { "epoch": 2.54, "grad_norm": 1.8504188060760498, "learning_rate": 2.4457322091850183e-05, "loss": 1.6399, "step": 14165 }, { "epoch": 2.54, "grad_norm": 0.5952614545822144, "learning_rate": 2.444326537386444e-05, "loss": 2.058, "step": 14170 }, { "epoch": 2.54, "grad_norm": 0.9884322285652161, "learning_rate": 2.4429208831973333e-05, "loss": 1.7262, "step": 14175 }, { "epoch": 2.54, "grad_norm": 1.2859538793563843, "learning_rate": 2.441515247062291e-05, "loss": 1.7738, "step": 14180 }, { "epoch": 2.54, "grad_norm": 0.47512462735176086, "learning_rate": 2.4401096294259217e-05, "loss": 1.8085, "step": 14185 }, { "epoch": 2.54, "grad_norm": 0.804999828338623, "learning_rate": 2.438704030732818e-05, "loss": 1.7905, "step": 14190 }, { "epoch": 2.54, "grad_norm": 1.0102812051773071, "learning_rate": 2.43729845142757e-05, "loss": 2.0173, "step": 14195 }, { "epoch": 2.54, "grad_norm": 0.7093918919563293, "learning_rate": 2.4358928919547616e-05, "loss": 1.8977, "step": 14200 }, { "epoch": 2.54, "grad_norm": 1.1343001127243042, "learning_rate": 2.43448735275897e-05, "loss": 1.5511, "step": 14205 }, { "epoch": 2.54, "grad_norm": 0.6553163528442383, "learning_rate": 2.4330818342847655e-05, "loss": 1.5871, "step": 14210 }, { "epoch": 2.54, "grad_norm": 1.924633502960205, "learning_rate": 2.4316763369767127e-05, "loss": 2.0279, "step": 14215 }, { "epoch": 2.55, "grad_norm": 1.7396385669708252, "learning_rate": 2.4302708612793686e-05, "loss": 1.8374, "step": 14220 }, { "epoch": 2.55, "grad_norm": 0.5724390149116516, "learning_rate": 2.4288654076372842e-05, "loss": 2.1751, "step": 14225 }, { "epoch": 2.55, "grad_norm": 0.5967565774917603, "learning_rate": 2.4274599764950034e-05, "loss": 1.8294, "step": 14230 }, { "epoch": 2.55, "grad_norm": 0.9949921369552612, "learning_rate": 2.426054568297062e-05, "loss": 1.8312, "step": 14235 }, { "epoch": 2.55, "grad_norm": 0.9570918083190918, "learning_rate": 2.4246491834879892e-05, "loss": 1.8949, "step": 14240 }, { "epoch": 2.55, "grad_norm": 0.5771768093109131, "learning_rate": 2.423243822512307e-05, "loss": 1.8138, "step": 14245 }, { "epoch": 2.55, "grad_norm": 0.2878669500350952, "learning_rate": 2.4218384858145306e-05, "loss": 1.7788, "step": 14250 }, { "epoch": 2.55, "grad_norm": 1.2565678358078003, "learning_rate": 2.420433173839165e-05, "loss": 1.6618, "step": 14255 }, { "epoch": 2.55, "grad_norm": 2.0233705043792725, "learning_rate": 2.41902788703071e-05, "loss": 1.8718, "step": 14260 }, { "epoch": 2.55, "grad_norm": 1.1918349266052246, "learning_rate": 2.4176226258336574e-05, "loss": 1.7568, "step": 14265 }, { "epoch": 2.55, "grad_norm": 0.5747388601303101, "learning_rate": 2.4162173906924882e-05, "loss": 1.975, "step": 14270 }, { "epoch": 2.56, "grad_norm": 0.6768869161605835, "learning_rate": 2.4148121820516785e-05, "loss": 2.0272, "step": 14275 }, { "epoch": 2.56, "grad_norm": 0.4785836338996887, "learning_rate": 2.413407000355694e-05, "loss": 1.7301, "step": 14280 }, { "epoch": 2.56, "grad_norm": 0.7693573236465454, "learning_rate": 2.412001846048991e-05, "loss": 1.7174, "step": 14285 }, { "epoch": 2.56, "grad_norm": 1.5978009700775146, "learning_rate": 2.4105967195760192e-05, "loss": 1.8307, "step": 14290 }, { "epoch": 2.56, "grad_norm": 0.6870273947715759, "learning_rate": 2.40919162138122e-05, "loss": 1.6285, "step": 14295 }, { "epoch": 2.56, "grad_norm": 0.6152418255805969, "learning_rate": 2.407786551909023e-05, "loss": 2.058, "step": 14300 }, { "epoch": 2.56, "grad_norm": 1.458072543144226, "learning_rate": 2.4063815116038512e-05, "loss": 1.4863, "step": 14305 }, { "epoch": 2.56, "grad_norm": 0.782149612903595, "learning_rate": 2.4049765009101165e-05, "loss": 1.7991, "step": 14310 }, { "epoch": 2.56, "grad_norm": 0.7315062880516052, "learning_rate": 2.4035715202722237e-05, "loss": 1.6951, "step": 14315 }, { "epoch": 2.56, "grad_norm": 2.745988130569458, "learning_rate": 2.4021665701345667e-05, "loss": 1.8848, "step": 14320 }, { "epoch": 2.56, "grad_norm": 0.6406875848770142, "learning_rate": 2.40076165094153e-05, "loss": 2.0148, "step": 14325 }, { "epoch": 2.56, "grad_norm": 0.698235809803009, "learning_rate": 2.3993567631374862e-05, "loss": 2.1415, "step": 14330 }, { "epoch": 2.57, "grad_norm": 0.499514639377594, "learning_rate": 2.397951907166802e-05, "loss": 2.03, "step": 14335 }, { "epoch": 2.57, "grad_norm": 1.9097588062286377, "learning_rate": 2.3965470834738312e-05, "loss": 1.8086, "step": 14340 }, { "epoch": 2.57, "grad_norm": 0.8855141401290894, "learning_rate": 2.3951422925029184e-05, "loss": 1.465, "step": 14345 }, { "epoch": 2.57, "grad_norm": 0.5352545976638794, "learning_rate": 2.393737534698398e-05, "loss": 1.7741, "step": 14350 }, { "epoch": 2.57, "grad_norm": 2.3942019939422607, "learning_rate": 2.3923328105045928e-05, "loss": 2.0296, "step": 14355 }, { "epoch": 2.57, "grad_norm": 1.186553716659546, "learning_rate": 2.390928120365816e-05, "loss": 2.0242, "step": 14360 }, { "epoch": 2.57, "grad_norm": 0.4111693203449249, "learning_rate": 2.3895234647263704e-05, "loss": 1.9497, "step": 14365 }, { "epoch": 2.57, "grad_norm": 0.9013734459877014, "learning_rate": 2.3881188440305468e-05, "loss": 1.9767, "step": 14370 }, { "epoch": 2.57, "grad_norm": 0.960102379322052, "learning_rate": 2.3867142587226247e-05, "loss": 2.0041, "step": 14375 }, { "epoch": 2.57, "grad_norm": 0.4831627905368805, "learning_rate": 2.3853097092468732e-05, "loss": 1.5793, "step": 14380 }, { "epoch": 2.57, "grad_norm": 0.7343013882637024, "learning_rate": 2.383905196047551e-05, "loss": 1.647, "step": 14385 }, { "epoch": 2.58, "grad_norm": 1.5119633674621582, "learning_rate": 2.382500719568903e-05, "loss": 1.7457, "step": 14390 }, { "epoch": 2.58, "grad_norm": 0.567718505859375, "learning_rate": 2.381096280255165e-05, "loss": 1.9885, "step": 14395 }, { "epoch": 2.58, "grad_norm": 1.3946149349212646, "learning_rate": 2.3796918785505587e-05, "loss": 1.9862, "step": 14400 }, { "epoch": 2.58, "grad_norm": 0.8330457210540771, "learning_rate": 2.3782875148992954e-05, "loss": 1.8574, "step": 14405 }, { "epoch": 2.58, "grad_norm": 0.6044062376022339, "learning_rate": 2.3768831897455745e-05, "loss": 1.796, "step": 14410 }, { "epoch": 2.58, "grad_norm": 1.53191077709198, "learning_rate": 2.375478903533582e-05, "loss": 1.9155, "step": 14415 }, { "epoch": 2.58, "grad_norm": 0.6012637615203857, "learning_rate": 2.3740746567074935e-05, "loss": 2.1244, "step": 14420 }, { "epoch": 2.58, "grad_norm": 0.6626573204994202, "learning_rate": 2.3726704497114687e-05, "loss": 1.6726, "step": 14425 }, { "epoch": 2.58, "grad_norm": 1.1490142345428467, "learning_rate": 2.371266282989659e-05, "loss": 1.7387, "step": 14430 }, { "epoch": 2.58, "grad_norm": 0.9652857780456543, "learning_rate": 2.3698621569861996e-05, "loss": 1.6093, "step": 14435 }, { "epoch": 2.58, "grad_norm": 0.8638285398483276, "learning_rate": 2.3684580721452153e-05, "loss": 1.9487, "step": 14440 }, { "epoch": 2.59, "grad_norm": 0.6422768235206604, "learning_rate": 2.3670540289108153e-05, "loss": 1.8297, "step": 14445 }, { "epoch": 2.59, "grad_norm": 1.0574924945831299, "learning_rate": 2.3656500277270983e-05, "loss": 1.9849, "step": 14450 }, { "epoch": 2.59, "grad_norm": 0.574671745300293, "learning_rate": 2.3642460690381495e-05, "loss": 2.0423, "step": 14455 }, { "epoch": 2.59, "grad_norm": 0.7926428914070129, "learning_rate": 2.362842153288037e-05, "loss": 1.5625, "step": 14460 }, { "epoch": 2.59, "grad_norm": 0.752262532711029, "learning_rate": 2.361438280920821e-05, "loss": 2.0275, "step": 14465 }, { "epoch": 2.59, "grad_norm": 0.7062698602676392, "learning_rate": 2.3600344523805424e-05, "loss": 2.1019, "step": 14470 }, { "epoch": 2.59, "grad_norm": 0.3775525689125061, "learning_rate": 2.3586306681112323e-05, "loss": 1.9513, "step": 14475 }, { "epoch": 2.59, "grad_norm": 1.0212444067001343, "learning_rate": 2.3572269285569054e-05, "loss": 1.9915, "step": 14480 }, { "epoch": 2.59, "grad_norm": 0.917746901512146, "learning_rate": 2.3558232341615643e-05, "loss": 1.8467, "step": 14485 }, { "epoch": 2.59, "grad_norm": 0.5298284888267517, "learning_rate": 2.354419585369195e-05, "loss": 1.8151, "step": 14490 }, { "epoch": 2.59, "grad_norm": 0.6283917427062988, "learning_rate": 2.3530159826237707e-05, "loss": 2.2388, "step": 14495 }, { "epoch": 2.6, "grad_norm": 0.9578955769538879, "learning_rate": 2.3516124263692502e-05, "loss": 1.5422, "step": 14500 }, { "epoch": 2.6, "grad_norm": 0.6109092235565186, "learning_rate": 2.3502089170495762e-05, "loss": 1.9209, "step": 14505 }, { "epoch": 2.6, "grad_norm": 0.5404300093650818, "learning_rate": 2.3488054551086785e-05, "loss": 1.9089, "step": 14510 }, { "epoch": 2.6, "grad_norm": 0.5609441995620728, "learning_rate": 2.347402040990469e-05, "loss": 1.8755, "step": 14515 }, { "epoch": 2.6, "grad_norm": 4.9784345626831055, "learning_rate": 2.3459986751388478e-05, "loss": 1.9347, "step": 14520 }, { "epoch": 2.6, "grad_norm": 2.010941505432129, "learning_rate": 2.3445953579976968e-05, "loss": 1.7676, "step": 14525 }, { "epoch": 2.6, "grad_norm": 0.6856008172035217, "learning_rate": 2.343192090010885e-05, "loss": 1.9795, "step": 14530 }, { "epoch": 2.6, "grad_norm": 0.712049126625061, "learning_rate": 2.3417888716222643e-05, "loss": 1.8283, "step": 14535 }, { "epoch": 2.6, "grad_norm": 0.41500189900398254, "learning_rate": 2.3403857032756706e-05, "loss": 1.8836, "step": 14540 }, { "epoch": 2.6, "grad_norm": 0.3549801707267761, "learning_rate": 2.3389825854149263e-05, "loss": 1.8883, "step": 14545 }, { "epoch": 2.6, "grad_norm": 0.8253669142723083, "learning_rate": 2.337579518483835e-05, "loss": 1.522, "step": 14550 }, { "epoch": 2.61, "grad_norm": 0.8156489729881287, "learning_rate": 2.3361765029261863e-05, "loss": 1.7102, "step": 14555 }, { "epoch": 2.61, "grad_norm": 0.8357172608375549, "learning_rate": 2.334773539185752e-05, "loss": 1.7754, "step": 14560 }, { "epoch": 2.61, "grad_norm": 0.7100852727890015, "learning_rate": 2.333370627706288e-05, "loss": 1.4992, "step": 14565 }, { "epoch": 2.61, "grad_norm": 0.7589089870452881, "learning_rate": 2.3319677689315338e-05, "loss": 2.224, "step": 14570 }, { "epoch": 2.61, "grad_norm": 0.49599790573120117, "learning_rate": 2.3305649633052133e-05, "loss": 1.5944, "step": 14575 }, { "epoch": 2.61, "grad_norm": 0.9817653894424438, "learning_rate": 2.3291622112710316e-05, "loss": 1.6229, "step": 14580 }, { "epoch": 2.61, "grad_norm": 0.6684935688972473, "learning_rate": 2.327759513272678e-05, "loss": 1.863, "step": 14585 }, { "epoch": 2.61, "grad_norm": 0.7037563323974609, "learning_rate": 2.326356869753825e-05, "loss": 2.113, "step": 14590 }, { "epoch": 2.61, "grad_norm": 0.6015907526016235, "learning_rate": 2.324954281158127e-05, "loss": 1.7673, "step": 14595 }, { "epoch": 2.61, "grad_norm": 0.7683700323104858, "learning_rate": 2.323551747929222e-05, "loss": 1.9316, "step": 14600 }, { "epoch": 2.61, "grad_norm": 3.4007463455200195, "learning_rate": 2.3221492705107294e-05, "loss": 1.6905, "step": 14605 }, { "epoch": 2.62, "grad_norm": 1.5388293266296387, "learning_rate": 2.320746849346251e-05, "loss": 2.086, "step": 14610 }, { "epoch": 2.62, "grad_norm": 0.5219169855117798, "learning_rate": 2.3193444848793713e-05, "loss": 1.9448, "step": 14615 }, { "epoch": 2.62, "grad_norm": 1.7064671516418457, "learning_rate": 2.317942177553658e-05, "loss": 1.5938, "step": 14620 }, { "epoch": 2.62, "grad_norm": 2.014631986618042, "learning_rate": 2.3165399278126583e-05, "loss": 1.9672, "step": 14625 }, { "epoch": 2.62, "grad_norm": 0.6399771571159363, "learning_rate": 2.3151377360999025e-05, "loss": 1.986, "step": 14630 }, { "epoch": 2.62, "grad_norm": 1.9371715784072876, "learning_rate": 2.3137356028589042e-05, "loss": 1.8568, "step": 14635 }, { "epoch": 2.62, "grad_norm": 1.1944173574447632, "learning_rate": 2.3123335285331547e-05, "loss": 2.001, "step": 14640 }, { "epoch": 2.62, "grad_norm": 0.6138972640037537, "learning_rate": 2.3109315135661302e-05, "loss": 1.8779, "step": 14645 }, { "epoch": 2.62, "grad_norm": 2.2491681575775146, "learning_rate": 2.3095295584012865e-05, "loss": 1.6365, "step": 14650 }, { "epoch": 2.62, "grad_norm": 0.8489646911621094, "learning_rate": 2.3081276634820595e-05, "loss": 1.7617, "step": 14655 }, { "epoch": 2.62, "grad_norm": 0.6606999635696411, "learning_rate": 2.3067258292518675e-05, "loss": 1.9637, "step": 14660 }, { "epoch": 2.62, "grad_norm": 0.7351112961769104, "learning_rate": 2.3053240561541107e-05, "loss": 1.6616, "step": 14665 }, { "epoch": 2.63, "grad_norm": 0.5655643343925476, "learning_rate": 2.303922344632167e-05, "loss": 1.6532, "step": 14670 }, { "epoch": 2.63, "grad_norm": 0.6417161226272583, "learning_rate": 2.3025206951293973e-05, "loss": 1.8476, "step": 14675 }, { "epoch": 2.63, "grad_norm": 0.9483529925346375, "learning_rate": 2.301119108089142e-05, "loss": 1.5566, "step": 14680 }, { "epoch": 2.63, "grad_norm": 0.7348762154579163, "learning_rate": 2.2997175839547215e-05, "loss": 1.8403, "step": 14685 }, { "epoch": 2.63, "grad_norm": 1.085129976272583, "learning_rate": 2.2983161231694364e-05, "loss": 1.5614, "step": 14690 }, { "epoch": 2.63, "grad_norm": 0.830359697341919, "learning_rate": 2.2969147261765675e-05, "loss": 1.5355, "step": 14695 }, { "epoch": 2.63, "grad_norm": 1.0169621706008911, "learning_rate": 2.2955133934193762e-05, "loss": 2.0021, "step": 14700 }, { "epoch": 2.63, "grad_norm": 0.5929272174835205, "learning_rate": 2.2941121253411008e-05, "loss": 1.65, "step": 14705 }, { "epoch": 2.63, "grad_norm": 1.135832667350769, "learning_rate": 2.2927109223849626e-05, "loss": 1.8439, "step": 14710 }, { "epoch": 2.63, "grad_norm": 0.7020158767700195, "learning_rate": 2.2913097849941602e-05, "loss": 1.857, "step": 14715 }, { "epoch": 2.63, "grad_norm": 0.45904645323753357, "learning_rate": 2.2899087136118717e-05, "loss": 1.6038, "step": 14720 }, { "epoch": 2.64, "grad_norm": 0.3881944417953491, "learning_rate": 2.2885077086812555e-05, "loss": 2.0703, "step": 14725 }, { "epoch": 2.64, "grad_norm": 0.6580604910850525, "learning_rate": 2.2871067706454472e-05, "loss": 1.718, "step": 14730 }, { "epoch": 2.64, "grad_norm": 1.6806758642196655, "learning_rate": 2.285705899947563e-05, "loss": 1.5215, "step": 14735 }, { "epoch": 2.64, "grad_norm": 0.8426775932312012, "learning_rate": 2.284305097030696e-05, "loss": 1.8828, "step": 14740 }, { "epoch": 2.64, "grad_norm": 0.8580935597419739, "learning_rate": 2.2829043623379202e-05, "loss": 1.9616, "step": 14745 }, { "epoch": 2.64, "grad_norm": 0.5852141380310059, "learning_rate": 2.281503696312285e-05, "loss": 1.8733, "step": 14750 }, { "epoch": 2.64, "grad_norm": 0.9512518644332886, "learning_rate": 2.2801030993968213e-05, "loss": 2.0427, "step": 14755 }, { "epoch": 2.64, "grad_norm": 0.8054193258285522, "learning_rate": 2.278702572034535e-05, "loss": 1.6561, "step": 14760 }, { "epoch": 2.64, "grad_norm": 0.729395866394043, "learning_rate": 2.277302114668412e-05, "loss": 1.7398, "step": 14765 }, { "epoch": 2.64, "grad_norm": 0.750243067741394, "learning_rate": 2.2759017277414166e-05, "loss": 1.7189, "step": 14770 }, { "epoch": 2.64, "grad_norm": 0.8767192959785461, "learning_rate": 2.2745014116964884e-05, "loss": 1.7503, "step": 14775 }, { "epoch": 2.65, "grad_norm": 1.0850597620010376, "learning_rate": 2.273101166976548e-05, "loss": 1.8113, "step": 14780 }, { "epoch": 2.65, "grad_norm": 1.1997833251953125, "learning_rate": 2.2717009940244892e-05, "loss": 1.9981, "step": 14785 }, { "epoch": 2.65, "grad_norm": 0.7039926648139954, "learning_rate": 2.2703008932831874e-05, "loss": 1.8546, "step": 14790 }, { "epoch": 2.65, "grad_norm": 0.5928205847740173, "learning_rate": 2.268900865195491e-05, "loss": 1.786, "step": 14795 }, { "epoch": 2.65, "grad_norm": 2.150042772293091, "learning_rate": 2.2675009102042297e-05, "loss": 2.0728, "step": 14800 }, { "epoch": 2.65, "grad_norm": 0.6576092839241028, "learning_rate": 2.2661010287522057e-05, "loss": 1.9185, "step": 14805 }, { "epoch": 2.65, "grad_norm": 0.43387851119041443, "learning_rate": 2.2647012212822023e-05, "loss": 2.0831, "step": 14810 }, { "epoch": 2.65, "grad_norm": 2.2747631072998047, "learning_rate": 2.263301488236976e-05, "loss": 1.9771, "step": 14815 }, { "epoch": 2.65, "grad_norm": 0.780261218547821, "learning_rate": 2.2619018300592603e-05, "loss": 1.7917, "step": 14820 }, { "epoch": 2.65, "grad_norm": 0.6798030734062195, "learning_rate": 2.260502247191768e-05, "loss": 1.5835, "step": 14825 }, { "epoch": 2.65, "grad_norm": 1.7987754344940186, "learning_rate": 2.2591027400771838e-05, "loss": 1.5079, "step": 14830 }, { "epoch": 2.66, "grad_norm": 0.6671625971794128, "learning_rate": 2.2577033091581715e-05, "loss": 1.9078, "step": 14835 }, { "epoch": 2.66, "grad_norm": 0.9358892440795898, "learning_rate": 2.25630395487737e-05, "loss": 1.7289, "step": 14840 }, { "epoch": 2.66, "grad_norm": 0.6218108534812927, "learning_rate": 2.2549046776773932e-05, "loss": 1.6662, "step": 14845 }, { "epoch": 2.66, "grad_norm": 1.6321266889572144, "learning_rate": 2.2535054780008302e-05, "loss": 1.9075, "step": 14850 }, { "epoch": 2.66, "grad_norm": 1.0128587484359741, "learning_rate": 2.252106356290248e-05, "loss": 1.9468, "step": 14855 }, { "epoch": 2.66, "grad_norm": 0.7265017628669739, "learning_rate": 2.2507073129881863e-05, "loss": 1.8416, "step": 14860 }, { "epoch": 2.66, "grad_norm": 0.9810836911201477, "learning_rate": 2.249308348537162e-05, "loss": 1.5815, "step": 14865 }, { "epoch": 2.66, "grad_norm": 0.7224981784820557, "learning_rate": 2.247909463379666e-05, "loss": 1.9182, "step": 14870 }, { "epoch": 2.66, "grad_norm": 0.7442790269851685, "learning_rate": 2.246510657958164e-05, "loss": 1.9225, "step": 14875 }, { "epoch": 2.66, "grad_norm": 0.815650224685669, "learning_rate": 2.245111932715097e-05, "loss": 1.8601, "step": 14880 }, { "epoch": 2.66, "grad_norm": 1.262229323387146, "learning_rate": 2.2437132880928817e-05, "loss": 2.0791, "step": 14885 }, { "epoch": 2.67, "grad_norm": 0.8286881446838379, "learning_rate": 2.2423147245339062e-05, "loss": 1.9286, "step": 14890 }, { "epoch": 2.67, "grad_norm": 2.7588956356048584, "learning_rate": 2.240916242480535e-05, "loss": 1.697, "step": 14895 }, { "epoch": 2.67, "grad_norm": 0.852202296257019, "learning_rate": 2.2395178423751076e-05, "loss": 1.7105, "step": 14900 }, { "epoch": 2.67, "grad_norm": 0.785529375076294, "learning_rate": 2.2381195246599356e-05, "loss": 1.7383, "step": 14905 }, { "epoch": 2.67, "grad_norm": 0.9461120963096619, "learning_rate": 2.236721289777306e-05, "loss": 1.8059, "step": 14910 }, { "epoch": 2.67, "grad_norm": 0.5357199311256409, "learning_rate": 2.2353231381694797e-05, "loss": 2.066, "step": 14915 }, { "epoch": 2.67, "grad_norm": 2.159280300140381, "learning_rate": 2.23392507027869e-05, "loss": 1.8313, "step": 14920 }, { "epoch": 2.67, "grad_norm": 0.9953837394714355, "learning_rate": 2.2325270865471444e-05, "loss": 1.8698, "step": 14925 }, { "epoch": 2.67, "grad_norm": 0.6990963220596313, "learning_rate": 2.231129187417025e-05, "loss": 1.7352, "step": 14930 }, { "epoch": 2.67, "grad_norm": 0.6935223937034607, "learning_rate": 2.2297313733304845e-05, "loss": 1.514, "step": 14935 }, { "epoch": 2.67, "grad_norm": 0.5469807386398315, "learning_rate": 2.22833364472965e-05, "loss": 1.8815, "step": 14940 }, { "epoch": 2.68, "grad_norm": 0.5818589925765991, "learning_rate": 2.2269360020566232e-05, "loss": 1.9459, "step": 14945 }, { "epoch": 2.68, "grad_norm": 1.0151652097702026, "learning_rate": 2.2255384457534757e-05, "loss": 1.8557, "step": 14950 }, { "epoch": 2.68, "grad_norm": 1.0191221237182617, "learning_rate": 2.2241409762622534e-05, "loss": 1.5992, "step": 14955 }, { "epoch": 2.68, "grad_norm": 1.0580757856369019, "learning_rate": 2.222743594024976e-05, "loss": 1.8454, "step": 14960 }, { "epoch": 2.68, "grad_norm": 0.6717436909675598, "learning_rate": 2.2213462994836323e-05, "loss": 2.0803, "step": 14965 }, { "epoch": 2.68, "grad_norm": 0.7515537738800049, "learning_rate": 2.2199490930801862e-05, "loss": 1.6758, "step": 14970 }, { "epoch": 2.68, "grad_norm": 1.05194890499115, "learning_rate": 2.2185519752565733e-05, "loss": 1.7331, "step": 14975 }, { "epoch": 2.68, "grad_norm": 1.2085919380187988, "learning_rate": 2.2171549464547e-05, "loss": 1.8599, "step": 14980 }, { "epoch": 2.68, "grad_norm": 0.7703227400779724, "learning_rate": 2.215758007116444e-05, "loss": 1.8927, "step": 14985 }, { "epoch": 2.68, "grad_norm": 0.7714345455169678, "learning_rate": 2.214361157683658e-05, "loss": 1.4521, "step": 14990 }, { "epoch": 2.68, "grad_norm": 0.7530484795570374, "learning_rate": 2.212964398598162e-05, "loss": 1.9209, "step": 14995 }, { "epoch": 2.68, "grad_norm": 0.6522998213768005, "learning_rate": 2.211567730301751e-05, "loss": 1.9337, "step": 15000 }, { "epoch": 2.69, "grad_norm": 0.6403537392616272, "learning_rate": 2.21017115323619e-05, "loss": 1.8753, "step": 15005 }, { "epoch": 2.69, "grad_norm": 0.9187439680099487, "learning_rate": 2.2087746678432135e-05, "loss": 1.7973, "step": 15010 }, { "epoch": 2.69, "grad_norm": 0.840508222579956, "learning_rate": 2.20737827456453e-05, "loss": 2.1179, "step": 15015 }, { "epoch": 2.69, "grad_norm": 1.1166287660598755, "learning_rate": 2.205981973841817e-05, "loss": 1.5053, "step": 15020 }, { "epoch": 2.69, "grad_norm": 1.1791390180587769, "learning_rate": 2.2045857661167235e-05, "loss": 1.7282, "step": 15025 }, { "epoch": 2.69, "grad_norm": 1.0932645797729492, "learning_rate": 2.203189651830867e-05, "loss": 1.9909, "step": 15030 }, { "epoch": 2.69, "grad_norm": 2.5372047424316406, "learning_rate": 2.2017936314258385e-05, "loss": 2.1166, "step": 15035 }, { "epoch": 2.69, "grad_norm": 0.7102367877960205, "learning_rate": 2.2003977053431972e-05, "loss": 2.0205, "step": 15040 }, { "epoch": 2.69, "grad_norm": 3.9497880935668945, "learning_rate": 2.1990018740244733e-05, "loss": 1.8872, "step": 15045 }, { "epoch": 2.69, "grad_norm": 0.714215874671936, "learning_rate": 2.1976061379111677e-05, "loss": 1.9495, "step": 15050 }, { "epoch": 2.69, "grad_norm": 0.8167940974235535, "learning_rate": 2.196210497444749e-05, "loss": 1.8913, "step": 15055 }, { "epoch": 2.7, "grad_norm": 1.2327383756637573, "learning_rate": 2.194814953066659e-05, "loss": 1.9469, "step": 15060 }, { "epoch": 2.7, "grad_norm": 0.683508574962616, "learning_rate": 2.1934195052183047e-05, "loss": 1.9397, "step": 15065 }, { "epoch": 2.7, "grad_norm": 1.7274824380874634, "learning_rate": 2.1920241543410673e-05, "loss": 1.9172, "step": 15070 }, { "epoch": 2.7, "grad_norm": 0.8672675490379333, "learning_rate": 2.1906289008762926e-05, "loss": 1.7328, "step": 15075 }, { "epoch": 2.7, "grad_norm": 1.210166335105896, "learning_rate": 2.1892337452652996e-05, "loss": 1.6836, "step": 15080 }, { "epoch": 2.7, "grad_norm": 0.42006659507751465, "learning_rate": 2.1878386879493732e-05, "loss": 1.7958, "step": 15085 }, { "epoch": 2.7, "grad_norm": 0.6762058734893799, "learning_rate": 2.18644372936977e-05, "loss": 1.9698, "step": 15090 }, { "epoch": 2.7, "grad_norm": 2.0761775970458984, "learning_rate": 2.1850488699677137e-05, "loss": 1.711, "step": 15095 }, { "epoch": 2.7, "grad_norm": 0.4220300614833832, "learning_rate": 2.1836541101843964e-05, "loss": 1.9976, "step": 15100 }, { "epoch": 2.7, "grad_norm": 1.0192917585372925, "learning_rate": 2.1822594504609802e-05, "loss": 1.6178, "step": 15105 }, { "epoch": 2.7, "grad_norm": 0.5573521256446838, "learning_rate": 2.180864891238594e-05, "loss": 1.8285, "step": 15110 }, { "epoch": 2.71, "grad_norm": 0.7691249847412109, "learning_rate": 2.1794704329583353e-05, "loss": 1.8416, "step": 15115 }, { "epoch": 2.71, "grad_norm": 0.6303778290748596, "learning_rate": 2.178076076061272e-05, "loss": 1.9445, "step": 15120 }, { "epoch": 2.71, "grad_norm": 1.794248342514038, "learning_rate": 2.1766818209884355e-05, "loss": 1.772, "step": 15125 }, { "epoch": 2.71, "grad_norm": 0.5555631518363953, "learning_rate": 2.1752876681808272e-05, "loss": 1.9228, "step": 15130 }, { "epoch": 2.71, "grad_norm": 0.8108105063438416, "learning_rate": 2.1738936180794173e-05, "loss": 1.7449, "step": 15135 }, { "epoch": 2.71, "grad_norm": 0.45947834849357605, "learning_rate": 2.1724996711251426e-05, "loss": 2.1185, "step": 15140 }, { "epoch": 2.71, "grad_norm": 0.4830736815929413, "learning_rate": 2.1711058277589066e-05, "loss": 1.8752, "step": 15145 }, { "epoch": 2.71, "grad_norm": 0.6274514198303223, "learning_rate": 2.1697120884215817e-05, "loss": 2.2124, "step": 15150 }, { "epoch": 2.71, "grad_norm": 0.7346020340919495, "learning_rate": 2.1683184535540046e-05, "loss": 1.7558, "step": 15155 }, { "epoch": 2.71, "grad_norm": 1.0452593564987183, "learning_rate": 2.166924923596982e-05, "loss": 1.692, "step": 15160 }, { "epoch": 2.71, "grad_norm": 0.8475756645202637, "learning_rate": 2.165531498991286e-05, "loss": 1.6763, "step": 15165 }, { "epoch": 2.72, "grad_norm": 0.5865801572799683, "learning_rate": 2.1641381801776557e-05, "loss": 1.998, "step": 15170 }, { "epoch": 2.72, "grad_norm": 0.8659747242927551, "learning_rate": 2.1627449675967945e-05, "loss": 1.9164, "step": 15175 }, { "epoch": 2.72, "grad_norm": 0.834214985370636, "learning_rate": 2.161351861689376e-05, "loss": 1.8393, "step": 15180 }, { "epoch": 2.72, "grad_norm": 0.6650885939598083, "learning_rate": 2.1599588628960384e-05, "loss": 1.8873, "step": 15185 }, { "epoch": 2.72, "grad_norm": 0.6627525687217712, "learning_rate": 2.158565971657385e-05, "loss": 1.8317, "step": 15190 }, { "epoch": 2.72, "grad_norm": 2.031599998474121, "learning_rate": 2.157173188413987e-05, "loss": 2.1292, "step": 15195 }, { "epoch": 2.72, "grad_norm": 0.6217500567436218, "learning_rate": 2.1557805136063787e-05, "loss": 1.9539, "step": 15200 }, { "epoch": 2.72, "grad_norm": 0.5643689036369324, "learning_rate": 2.1543879476750632e-05, "loss": 2.012, "step": 15205 }, { "epoch": 2.72, "grad_norm": 0.619275689125061, "learning_rate": 2.1529954910605084e-05, "loss": 1.8118, "step": 15210 }, { "epoch": 2.72, "grad_norm": 0.6966983675956726, "learning_rate": 2.1516031442031452e-05, "loss": 1.9702, "step": 15215 }, { "epoch": 2.72, "grad_norm": 0.824952244758606, "learning_rate": 2.1502109075433725e-05, "loss": 1.8539, "step": 15220 }, { "epoch": 2.73, "grad_norm": 0.7268326878547668, "learning_rate": 2.1488187815215527e-05, "loss": 1.8079, "step": 15225 }, { "epoch": 2.73, "grad_norm": 0.47795695066452026, "learning_rate": 2.1474267665780153e-05, "loss": 1.8452, "step": 15230 }, { "epoch": 2.73, "grad_norm": 1.0499993562698364, "learning_rate": 2.146034863153052e-05, "loss": 1.9181, "step": 15235 }, { "epoch": 2.73, "grad_norm": 0.5682938694953918, "learning_rate": 2.1446430716869215e-05, "loss": 1.7971, "step": 15240 }, { "epoch": 2.73, "grad_norm": 0.4888734519481659, "learning_rate": 2.143251392619845e-05, "loss": 1.979, "step": 15245 }, { "epoch": 2.73, "grad_norm": 1.2355575561523438, "learning_rate": 2.14185982639201e-05, "loss": 1.6015, "step": 15250 }, { "epoch": 2.73, "grad_norm": 2.380397319793701, "learning_rate": 2.140468373443568e-05, "loss": 1.8096, "step": 15255 }, { "epoch": 2.73, "grad_norm": 5.283520698547363, "learning_rate": 2.1390770342146338e-05, "loss": 1.8967, "step": 15260 }, { "epoch": 2.73, "grad_norm": 1.0056599378585815, "learning_rate": 2.137685809145286e-05, "loss": 1.6557, "step": 15265 }, { "epoch": 2.73, "grad_norm": 0.4523605704307556, "learning_rate": 2.1362946986755678e-05, "loss": 1.7145, "step": 15270 }, { "epoch": 2.73, "grad_norm": 0.9183207154273987, "learning_rate": 2.134903703245487e-05, "loss": 2.0983, "step": 15275 }, { "epoch": 2.74, "grad_norm": 0.6543048620223999, "learning_rate": 2.1335128232950135e-05, "loss": 2.0348, "step": 15280 }, { "epoch": 2.74, "grad_norm": 0.8366292119026184, "learning_rate": 2.132122059264082e-05, "loss": 1.6401, "step": 15285 }, { "epoch": 2.74, "grad_norm": 0.6985805034637451, "learning_rate": 2.1307314115925888e-05, "loss": 1.749, "step": 15290 }, { "epoch": 2.74, "grad_norm": 0.4857269823551178, "learning_rate": 2.1293408807203947e-05, "loss": 1.916, "step": 15295 }, { "epoch": 2.74, "grad_norm": 0.9709117412567139, "learning_rate": 2.1279504670873245e-05, "loss": 2.0241, "step": 15300 }, { "epoch": 2.74, "grad_norm": 1.052292823791504, "learning_rate": 2.1265601711331636e-05, "loss": 1.5705, "step": 15305 }, { "epoch": 2.74, "grad_norm": 0.7669774889945984, "learning_rate": 2.1251699932976606e-05, "loss": 1.7281, "step": 15310 }, { "epoch": 2.74, "grad_norm": 1.21905517578125, "learning_rate": 2.1237799340205283e-05, "loss": 2.013, "step": 15315 }, { "epoch": 2.74, "grad_norm": 0.9224725365638733, "learning_rate": 2.1223899937414415e-05, "loss": 1.8188, "step": 15320 }, { "epoch": 2.74, "grad_norm": 0.9945480823516846, "learning_rate": 2.1210001729000357e-05, "loss": 1.7489, "step": 15325 }, { "epoch": 2.74, "grad_norm": 1.0888783931732178, "learning_rate": 2.1196104719359115e-05, "loss": 1.8444, "step": 15330 }, { "epoch": 2.74, "grad_norm": 0.7765944004058838, "learning_rate": 2.1182208912886283e-05, "loss": 1.9981, "step": 15335 }, { "epoch": 2.75, "grad_norm": 0.3819294571876526, "learning_rate": 2.11683143139771e-05, "loss": 1.8742, "step": 15340 }, { "epoch": 2.75, "grad_norm": 1.1613242626190186, "learning_rate": 2.1154420927026424e-05, "loss": 1.526, "step": 15345 }, { "epoch": 2.75, "grad_norm": 0.8339876532554626, "learning_rate": 2.114052875642871e-05, "loss": 2.2053, "step": 15350 }, { "epoch": 2.75, "grad_norm": 0.7060007452964783, "learning_rate": 2.1126637806578027e-05, "loss": 1.9372, "step": 15355 }, { "epoch": 2.75, "grad_norm": 1.6718441247940063, "learning_rate": 2.111274808186809e-05, "loss": 1.7533, "step": 15360 }, { "epoch": 2.75, "grad_norm": 0.6338340044021606, "learning_rate": 2.1098859586692184e-05, "loss": 1.8297, "step": 15365 }, { "epoch": 2.75, "grad_norm": 0.91750568151474, "learning_rate": 2.108497232544324e-05, "loss": 1.8179, "step": 15370 }, { "epoch": 2.75, "grad_norm": 0.8784101009368896, "learning_rate": 2.1071086302513793e-05, "loss": 1.7564, "step": 15375 }, { "epoch": 2.75, "grad_norm": 0.9052116870880127, "learning_rate": 2.105720152229596e-05, "loss": 1.8512, "step": 15380 }, { "epoch": 2.75, "grad_norm": 1.1224867105484009, "learning_rate": 2.1043317989181497e-05, "loss": 1.7728, "step": 15385 }, { "epoch": 2.75, "grad_norm": 0.5035684704780579, "learning_rate": 2.102943570756175e-05, "loss": 1.9403, "step": 15390 }, { "epoch": 2.76, "grad_norm": 1.207096815109253, "learning_rate": 2.1015554681827665e-05, "loss": 1.7796, "step": 15395 }, { "epoch": 2.76, "grad_norm": 0.99371737241745, "learning_rate": 2.1001674916369812e-05, "loss": 2.0566, "step": 15400 }, { "epoch": 2.76, "grad_norm": 1.190355658531189, "learning_rate": 2.0987796415578333e-05, "loss": 1.8686, "step": 15405 }, { "epoch": 2.76, "grad_norm": 0.4896886944770813, "learning_rate": 2.0973919183842983e-05, "loss": 1.9051, "step": 15410 }, { "epoch": 2.76, "grad_norm": 1.8628058433532715, "learning_rate": 2.096004322555312e-05, "loss": 1.5376, "step": 15415 }, { "epoch": 2.76, "grad_norm": 2.157165765762329, "learning_rate": 2.0946168545097708e-05, "loss": 1.5625, "step": 15420 }, { "epoch": 2.76, "grad_norm": 0.9407357573509216, "learning_rate": 2.0932295146865277e-05, "loss": 1.674, "step": 15425 }, { "epoch": 2.76, "grad_norm": 0.7408738136291504, "learning_rate": 2.0918423035243983e-05, "loss": 1.8977, "step": 15430 }, { "epoch": 2.76, "grad_norm": 0.6515112519264221, "learning_rate": 2.090455221462156e-05, "loss": 1.8421, "step": 15435 }, { "epoch": 2.76, "grad_norm": 0.649272620677948, "learning_rate": 2.0890682689385324e-05, "loss": 1.9893, "step": 15440 }, { "epoch": 2.76, "grad_norm": 0.8793091177940369, "learning_rate": 2.0876814463922213e-05, "loss": 1.5602, "step": 15445 }, { "epoch": 2.77, "grad_norm": 0.6581525206565857, "learning_rate": 2.086294754261872e-05, "loss": 1.7371, "step": 15450 }, { "epoch": 2.77, "grad_norm": 0.5119839906692505, "learning_rate": 2.084908192986093e-05, "loss": 2.0622, "step": 15455 }, { "epoch": 2.77, "grad_norm": 0.5535023808479309, "learning_rate": 2.083521763003453e-05, "loss": 2.2007, "step": 15460 }, { "epoch": 2.77, "grad_norm": 1.1636162996292114, "learning_rate": 2.08213546475248e-05, "loss": 1.9082, "step": 15465 }, { "epoch": 2.77, "grad_norm": 0.48526015877723694, "learning_rate": 2.0807492986716566e-05, "loss": 2.0381, "step": 15470 }, { "epoch": 2.77, "grad_norm": 1.322176456451416, "learning_rate": 2.0793632651994268e-05, "loss": 1.7674, "step": 15475 }, { "epoch": 2.77, "grad_norm": 0.6021888256072998, "learning_rate": 2.0779773647741924e-05, "loss": 1.6768, "step": 15480 }, { "epoch": 2.77, "grad_norm": 0.8677191138267517, "learning_rate": 2.076591597834311e-05, "loss": 1.6359, "step": 15485 }, { "epoch": 2.77, "grad_norm": 0.6849371790885925, "learning_rate": 2.0752059648181008e-05, "loss": 1.7706, "step": 15490 }, { "epoch": 2.77, "grad_norm": 1.1195003986358643, "learning_rate": 2.073820466163835e-05, "loss": 1.7344, "step": 15495 }, { "epoch": 2.77, "grad_norm": 0.6078161001205444, "learning_rate": 2.072435102309745e-05, "loss": 1.687, "step": 15500 }, { "epoch": 2.78, "grad_norm": 0.8624263405799866, "learning_rate": 2.0710498736940208e-05, "loss": 1.8988, "step": 15505 }, { "epoch": 2.78, "grad_norm": 1.518776535987854, "learning_rate": 2.06966478075481e-05, "loss": 1.9482, "step": 15510 }, { "epoch": 2.78, "grad_norm": 1.0695035457611084, "learning_rate": 2.068279823930214e-05, "loss": 1.7764, "step": 15515 }, { "epoch": 2.78, "grad_norm": 0.537037193775177, "learning_rate": 2.066895003658295e-05, "loss": 1.8001, "step": 15520 }, { "epoch": 2.78, "grad_norm": 0.7043410539627075, "learning_rate": 2.0655103203770702e-05, "loss": 1.8046, "step": 15525 }, { "epoch": 2.78, "grad_norm": 0.6016290783882141, "learning_rate": 2.0641257745245124e-05, "loss": 1.8151, "step": 15530 }, { "epoch": 2.78, "grad_norm": 1.0047852993011475, "learning_rate": 2.0627413665385533e-05, "loss": 1.9005, "step": 15535 }, { "epoch": 2.78, "grad_norm": 0.5631789565086365, "learning_rate": 2.06135709685708e-05, "loss": 1.7507, "step": 15540 }, { "epoch": 2.78, "grad_norm": 0.7759091258049011, "learning_rate": 2.059972965917934e-05, "loss": 1.7911, "step": 15545 }, { "epoch": 2.78, "grad_norm": 0.9690524339675903, "learning_rate": 2.0585889741589155e-05, "loss": 1.6226, "step": 15550 }, { "epoch": 2.78, "grad_norm": 0.9261754751205444, "learning_rate": 2.0572051220177807e-05, "loss": 1.722, "step": 15555 }, { "epoch": 2.79, "grad_norm": 0.7827115058898926, "learning_rate": 2.0558214099322393e-05, "loss": 1.8318, "step": 15560 }, { "epoch": 2.79, "grad_norm": 0.7148807644844055, "learning_rate": 2.0544378383399593e-05, "loss": 1.6204, "step": 15565 }, { "epoch": 2.79, "grad_norm": 0.6687959432601929, "learning_rate": 2.0530544076785617e-05, "loss": 1.9251, "step": 15570 }, { "epoch": 2.79, "grad_norm": 1.5032340288162231, "learning_rate": 2.051671118385625e-05, "loss": 1.9225, "step": 15575 }, { "epoch": 2.79, "grad_norm": 1.197785496711731, "learning_rate": 2.050287970898683e-05, "loss": 1.9162, "step": 15580 }, { "epoch": 2.79, "grad_norm": 1.1399624347686768, "learning_rate": 2.0489049656552236e-05, "loss": 1.8481, "step": 15585 }, { "epoch": 2.79, "grad_norm": 1.0693291425704956, "learning_rate": 2.0475221030926888e-05, "loss": 1.919, "step": 15590 }, { "epoch": 2.79, "grad_norm": 0.7608375549316406, "learning_rate": 2.0461393836484776e-05, "loss": 1.8845, "step": 15595 }, { "epoch": 2.79, "grad_norm": 0.8283123970031738, "learning_rate": 2.0447568077599432e-05, "loss": 1.6992, "step": 15600 }, { "epoch": 2.79, "grad_norm": 0.8869904279708862, "learning_rate": 2.043374375864392e-05, "loss": 1.91, "step": 15605 }, { "epoch": 2.79, "grad_norm": 0.9428117871284485, "learning_rate": 2.0419920883990874e-05, "loss": 2.0304, "step": 15610 }, { "epoch": 2.8, "grad_norm": 1.6648744344711304, "learning_rate": 2.0406099458012438e-05, "loss": 1.5347, "step": 15615 }, { "epoch": 2.8, "grad_norm": 1.004983901977539, "learning_rate": 2.0392279485080323e-05, "loss": 1.6797, "step": 15620 }, { "epoch": 2.8, "grad_norm": 0.7050873637199402, "learning_rate": 2.0378460969565782e-05, "loss": 1.8073, "step": 15625 }, { "epoch": 2.8, "grad_norm": 0.7096034288406372, "learning_rate": 2.0364643915839587e-05, "loss": 1.694, "step": 15630 }, { "epoch": 2.8, "grad_norm": 0.6012222170829773, "learning_rate": 2.0350828328272076e-05, "loss": 1.8169, "step": 15635 }, { "epoch": 2.8, "grad_norm": 0.9153398275375366, "learning_rate": 2.0337014211233077e-05, "loss": 1.96, "step": 15640 }, { "epoch": 2.8, "grad_norm": 1.016750454902649, "learning_rate": 2.0323201569092003e-05, "loss": 1.9078, "step": 15645 }, { "epoch": 2.8, "grad_norm": 1.7948215007781982, "learning_rate": 2.0309390406217772e-05, "loss": 1.9035, "step": 15650 }, { "epoch": 2.8, "grad_norm": 0.7154805660247803, "learning_rate": 2.029558072697885e-05, "loss": 1.9442, "step": 15655 }, { "epoch": 2.8, "grad_norm": 0.8079678416252136, "learning_rate": 2.028177253574321e-05, "loss": 1.6577, "step": 15660 }, { "epoch": 2.8, "grad_norm": 0.8642460107803345, "learning_rate": 2.026796583687838e-05, "loss": 1.6969, "step": 15665 }, { "epoch": 2.8, "grad_norm": 0.655441164970398, "learning_rate": 2.025416063475141e-05, "loss": 1.854, "step": 15670 }, { "epoch": 2.81, "grad_norm": 0.6256705522537231, "learning_rate": 2.024035693372886e-05, "loss": 1.9581, "step": 15675 }, { "epoch": 2.81, "grad_norm": 0.7426342964172363, "learning_rate": 2.0226554738176845e-05, "loss": 1.8966, "step": 15680 }, { "epoch": 2.81, "grad_norm": 0.7382630109786987, "learning_rate": 2.0212754052460963e-05, "loss": 1.9278, "step": 15685 }, { "epoch": 2.81, "grad_norm": 1.1039477586746216, "learning_rate": 2.0198954880946376e-05, "loss": 1.8093, "step": 15690 }, { "epoch": 2.81, "grad_norm": 0.5927040576934814, "learning_rate": 2.018515722799774e-05, "loss": 1.8377, "step": 15695 }, { "epoch": 2.81, "grad_norm": 0.4983259439468384, "learning_rate": 2.017136109797924e-05, "loss": 1.9169, "step": 15700 }, { "epoch": 2.81, "grad_norm": 0.9285030364990234, "learning_rate": 2.0157566495254583e-05, "loss": 1.9493, "step": 15705 }, { "epoch": 2.81, "grad_norm": 0.9399527311325073, "learning_rate": 2.014377342418698e-05, "loss": 1.8094, "step": 15710 }, { "epoch": 2.81, "grad_norm": 0.7926543951034546, "learning_rate": 2.0129981889139175e-05, "loss": 1.8163, "step": 15715 }, { "epoch": 2.81, "grad_norm": 1.1448842287063599, "learning_rate": 2.0116191894473414e-05, "loss": 1.8834, "step": 15720 }, { "epoch": 2.81, "grad_norm": 1.2125980854034424, "learning_rate": 2.0102403444551467e-05, "loss": 1.9686, "step": 15725 }, { "epoch": 2.82, "grad_norm": 0.6270211935043335, "learning_rate": 2.008861654373459e-05, "loss": 1.6633, "step": 15730 }, { "epoch": 2.82, "grad_norm": 0.845684826374054, "learning_rate": 2.007483119638358e-05, "loss": 1.5904, "step": 15735 }, { "epoch": 2.82, "grad_norm": 0.9219332337379456, "learning_rate": 2.0061047406858725e-05, "loss": 1.7978, "step": 15740 }, { "epoch": 2.82, "grad_norm": 0.5408844351768494, "learning_rate": 2.0047265179519825e-05, "loss": 1.7812, "step": 15745 }, { "epoch": 2.82, "grad_norm": 0.7204087972640991, "learning_rate": 2.0033484518726182e-05, "loss": 2.0534, "step": 15750 }, { "epoch": 2.82, "grad_norm": 0.9531112313270569, "learning_rate": 2.0019705428836604e-05, "loss": 1.6982, "step": 15755 }, { "epoch": 2.82, "grad_norm": 0.7499538064002991, "learning_rate": 2.0005927914209417e-05, "loss": 1.8288, "step": 15760 }, { "epoch": 2.82, "grad_norm": 1.6334372758865356, "learning_rate": 1.9992151979202424e-05, "loss": 1.6551, "step": 15765 }, { "epoch": 2.82, "grad_norm": 0.9548289775848389, "learning_rate": 1.9978377628172938e-05, "loss": 2.0638, "step": 15770 }, { "epoch": 2.82, "grad_norm": 0.6916335225105286, "learning_rate": 1.996460486547779e-05, "loss": 2.0167, "step": 15775 }, { "epoch": 2.82, "grad_norm": 0.789394199848175, "learning_rate": 1.9950833695473277e-05, "loss": 1.9085, "step": 15780 }, { "epoch": 2.83, "grad_norm": 0.42912745475769043, "learning_rate": 1.9937064122515202e-05, "loss": 1.8567, "step": 15785 }, { "epoch": 2.83, "grad_norm": 1.1623276472091675, "learning_rate": 1.992329615095888e-05, "loss": 1.8448, "step": 15790 }, { "epoch": 2.83, "grad_norm": 0.7881998419761658, "learning_rate": 1.9909529785159097e-05, "loss": 1.9888, "step": 15795 }, { "epoch": 2.83, "grad_norm": 0.5520839691162109, "learning_rate": 1.9895765029470148e-05, "loss": 2.0507, "step": 15800 }, { "epoch": 2.83, "grad_norm": 1.4102293252944946, "learning_rate": 1.9882001888245816e-05, "loss": 1.9575, "step": 15805 }, { "epoch": 2.83, "grad_norm": 0.5933654308319092, "learning_rate": 1.9868240365839358e-05, "loss": 2.1449, "step": 15810 }, { "epoch": 2.83, "grad_norm": 0.37906643748283386, "learning_rate": 1.9854480466603542e-05, "loss": 1.9846, "step": 15815 }, { "epoch": 2.83, "grad_norm": 0.675449550151825, "learning_rate": 1.9840722194890604e-05, "loss": 2.0511, "step": 15820 }, { "epoch": 2.83, "grad_norm": 0.8554467558860779, "learning_rate": 1.9826965555052274e-05, "loss": 1.7434, "step": 15825 }, { "epoch": 2.83, "grad_norm": 0.68226557970047, "learning_rate": 1.9813210551439756e-05, "loss": 1.9633, "step": 15830 }, { "epoch": 2.83, "grad_norm": 0.6609245538711548, "learning_rate": 1.9799457188403758e-05, "loss": 2.0091, "step": 15835 }, { "epoch": 2.84, "grad_norm": 0.8956989049911499, "learning_rate": 1.9785705470294447e-05, "loss": 1.775, "step": 15840 }, { "epoch": 2.84, "grad_norm": 0.621740996837616, "learning_rate": 1.9771955401461477e-05, "loss": 1.8911, "step": 15845 }, { "epoch": 2.84, "grad_norm": 1.393864393234253, "learning_rate": 1.975820698625399e-05, "loss": 1.5529, "step": 15850 }, { "epoch": 2.84, "grad_norm": 0.920356273651123, "learning_rate": 1.9744460229020583e-05, "loss": 1.9129, "step": 15855 }, { "epoch": 2.84, "grad_norm": 1.2865760326385498, "learning_rate": 1.973071513410936e-05, "loss": 2.1003, "step": 15860 }, { "epoch": 2.84, "grad_norm": 0.5856181383132935, "learning_rate": 1.971697170586787e-05, "loss": 1.755, "step": 15865 }, { "epoch": 2.84, "grad_norm": 0.4005410969257355, "learning_rate": 1.970322994864315e-05, "loss": 1.992, "step": 15870 }, { "epoch": 2.84, "grad_norm": 0.9305061101913452, "learning_rate": 1.9689489866781695e-05, "loss": 1.9721, "step": 15875 }, { "epoch": 2.84, "grad_norm": 0.9853824377059937, "learning_rate": 1.9675751464629493e-05, "loss": 2.1053, "step": 15880 }, { "epoch": 2.84, "grad_norm": 0.6760913133621216, "learning_rate": 1.9662014746531977e-05, "loss": 1.8466, "step": 15885 }, { "epoch": 2.84, "grad_norm": 0.451269268989563, "learning_rate": 1.9648279716834058e-05, "loss": 2.0028, "step": 15890 }, { "epoch": 2.85, "grad_norm": 0.890579879283905, "learning_rate": 1.9634546379880127e-05, "loss": 1.7298, "step": 15895 }, { "epoch": 2.85, "grad_norm": 1.1749356985092163, "learning_rate": 1.962081474001401e-05, "loss": 2.0342, "step": 15900 }, { "epoch": 2.85, "grad_norm": 0.923694908618927, "learning_rate": 1.9607084801579018e-05, "loss": 2.0843, "step": 15905 }, { "epoch": 2.85, "grad_norm": 0.7525226473808289, "learning_rate": 1.9593356568917913e-05, "loss": 2.129, "step": 15910 }, { "epoch": 2.85, "grad_norm": 0.7968067526817322, "learning_rate": 1.9579630046372936e-05, "loss": 2.1804, "step": 15915 }, { "epoch": 2.85, "grad_norm": 0.6558468341827393, "learning_rate": 1.956590523828575e-05, "loss": 1.6117, "step": 15920 }, { "epoch": 2.85, "grad_norm": 0.8354988098144531, "learning_rate": 1.9552182148997513e-05, "loss": 1.7763, "step": 15925 }, { "epoch": 2.85, "grad_norm": 0.8877277374267578, "learning_rate": 1.9538460782848822e-05, "loss": 1.9353, "step": 15930 }, { "epoch": 2.85, "grad_norm": 0.990190327167511, "learning_rate": 1.9524741144179727e-05, "loss": 1.6819, "step": 15935 }, { "epoch": 2.85, "grad_norm": 1.0683060884475708, "learning_rate": 1.951102323732975e-05, "loss": 1.7892, "step": 15940 }, { "epoch": 2.85, "grad_norm": 0.5219915509223938, "learning_rate": 1.9497307066637837e-05, "loss": 2.1233, "step": 15945 }, { "epoch": 2.85, "grad_norm": 0.791999101638794, "learning_rate": 1.948359263644241e-05, "loss": 1.8079, "step": 15950 }, { "epoch": 2.86, "grad_norm": 1.5373420715332031, "learning_rate": 1.9469879951081323e-05, "loss": 1.8237, "step": 15955 }, { "epoch": 2.86, "grad_norm": 0.6582045555114746, "learning_rate": 1.94561690148919e-05, "loss": 1.8346, "step": 15960 }, { "epoch": 2.86, "grad_norm": 1.0421522855758667, "learning_rate": 1.944245983221087e-05, "loss": 2.09, "step": 15965 }, { "epoch": 2.86, "grad_norm": 0.5510540008544922, "learning_rate": 1.942875240737446e-05, "loss": 2.0754, "step": 15970 }, { "epoch": 2.86, "grad_norm": 0.7140408158302307, "learning_rate": 1.9415046744718295e-05, "loss": 2.1591, "step": 15975 }, { "epoch": 2.86, "grad_norm": 0.533048689365387, "learning_rate": 1.9401342848577476e-05, "loss": 2.0399, "step": 15980 }, { "epoch": 2.86, "grad_norm": 1.1084396839141846, "learning_rate": 1.9387640723286533e-05, "loss": 1.5784, "step": 15985 }, { "epoch": 2.86, "grad_norm": 0.6885467767715454, "learning_rate": 1.9373940373179424e-05, "loss": 1.6761, "step": 15990 }, { "epoch": 2.86, "grad_norm": 0.7324159145355225, "learning_rate": 1.936024180258957e-05, "loss": 1.9015, "step": 15995 }, { "epoch": 2.86, "grad_norm": 1.0003372430801392, "learning_rate": 1.93465450158498e-05, "loss": 1.5935, "step": 16000 }, { "epoch": 2.86, "grad_norm": 0.4618781805038452, "learning_rate": 1.933285001729242e-05, "loss": 2.0161, "step": 16005 }, { "epoch": 2.87, "grad_norm": 1.0251598358154297, "learning_rate": 1.931915681124911e-05, "loss": 1.7638, "step": 16010 }, { "epoch": 2.87, "grad_norm": 1.092031478881836, "learning_rate": 1.9305465402051047e-05, "loss": 1.7572, "step": 16015 }, { "epoch": 2.87, "grad_norm": 0.4748564064502716, "learning_rate": 1.92917757940288e-05, "loss": 1.8665, "step": 16020 }, { "epoch": 2.87, "grad_norm": 1.7850896120071411, "learning_rate": 1.927808799151237e-05, "loss": 1.5633, "step": 16025 }, { "epoch": 2.87, "grad_norm": 0.7904292941093445, "learning_rate": 1.9264401998831213e-05, "loss": 1.7375, "step": 16030 }, { "epoch": 2.87, "grad_norm": 0.9015944600105286, "learning_rate": 1.9250717820314182e-05, "loss": 1.5826, "step": 16035 }, { "epoch": 2.87, "grad_norm": 0.7508746981620789, "learning_rate": 1.923703546028958e-05, "loss": 2.1586, "step": 16040 }, { "epoch": 2.87, "grad_norm": 4.131949424743652, "learning_rate": 1.922335492308511e-05, "loss": 1.7214, "step": 16045 }, { "epoch": 2.87, "grad_norm": 1.2245509624481201, "learning_rate": 1.9209676213027923e-05, "loss": 1.7981, "step": 16050 }, { "epoch": 2.87, "grad_norm": 0.4659061133861542, "learning_rate": 1.919599933444459e-05, "loss": 1.8612, "step": 16055 }, { "epoch": 2.87, "grad_norm": 0.6791583895683289, "learning_rate": 1.9182324291661084e-05, "loss": 1.8788, "step": 16060 }, { "epoch": 2.88, "grad_norm": 1.2033220529556274, "learning_rate": 1.91686510890028e-05, "loss": 1.7272, "step": 16065 }, { "epoch": 2.88, "grad_norm": 0.5756292343139648, "learning_rate": 1.915497973079457e-05, "loss": 1.9688, "step": 16070 }, { "epoch": 2.88, "grad_norm": 0.4528789222240448, "learning_rate": 1.9141310221360632e-05, "loss": 1.9315, "step": 16075 }, { "epoch": 2.88, "grad_norm": 0.4836488664150238, "learning_rate": 1.9127642565024628e-05, "loss": 1.7283, "step": 16080 }, { "epoch": 2.88, "grad_norm": 1.785624623298645, "learning_rate": 1.9113976766109643e-05, "loss": 1.7548, "step": 16085 }, { "epoch": 2.88, "grad_norm": 0.9146924018859863, "learning_rate": 1.9100312828938135e-05, "loss": 1.8917, "step": 16090 }, { "epoch": 2.88, "grad_norm": 0.9764569401741028, "learning_rate": 1.9086650757832007e-05, "loss": 1.6859, "step": 16095 }, { "epoch": 2.88, "grad_norm": 0.8843137621879578, "learning_rate": 1.9072990557112564e-05, "loss": 1.8221, "step": 16100 }, { "epoch": 2.88, "grad_norm": 0.6954315900802612, "learning_rate": 1.905933223110051e-05, "loss": 1.8598, "step": 16105 }, { "epoch": 2.88, "grad_norm": 0.5043737888336182, "learning_rate": 1.904567578411594e-05, "loss": 2.0398, "step": 16110 }, { "epoch": 2.88, "grad_norm": 0.8501962423324585, "learning_rate": 1.9032021220478405e-05, "loss": 1.8814, "step": 16115 }, { "epoch": 2.89, "grad_norm": 0.6526318192481995, "learning_rate": 1.901836854450681e-05, "loss": 1.9833, "step": 16120 }, { "epoch": 2.89, "grad_norm": 0.7955963611602783, "learning_rate": 1.900471776051949e-05, "loss": 1.8694, "step": 16125 }, { "epoch": 2.89, "grad_norm": 1.724168062210083, "learning_rate": 1.8991068872834185e-05, "loss": 2.09, "step": 16130 }, { "epoch": 2.89, "grad_norm": 0.7435161471366882, "learning_rate": 1.8977421885768004e-05, "loss": 1.896, "step": 16135 }, { "epoch": 2.89, "grad_norm": 0.8156474828720093, "learning_rate": 1.896377680363749e-05, "loss": 2.0581, "step": 16140 }, { "epoch": 2.89, "grad_norm": 0.7331948280334473, "learning_rate": 1.8950133630758582e-05, "loss": 1.7547, "step": 16145 }, { "epoch": 2.89, "grad_norm": 0.7191774845123291, "learning_rate": 1.893649237144658e-05, "loss": 1.6202, "step": 16150 }, { "epoch": 2.89, "grad_norm": 1.1362369060516357, "learning_rate": 1.89228530300162e-05, "loss": 1.8106, "step": 16155 }, { "epoch": 2.89, "grad_norm": 0.5437344908714294, "learning_rate": 1.8909215610781566e-05, "loss": 1.9975, "step": 16160 }, { "epoch": 2.89, "grad_norm": 0.9459233283996582, "learning_rate": 1.8895580118056174e-05, "loss": 1.7396, "step": 16165 }, { "epoch": 2.89, "grad_norm": 0.5802138447761536, "learning_rate": 1.8881946556152918e-05, "loss": 1.8485, "step": 16170 }, { "epoch": 2.9, "grad_norm": 0.6844445466995239, "learning_rate": 1.8868314929384084e-05, "loss": 1.5007, "step": 16175 }, { "epoch": 2.9, "grad_norm": 0.7569360136985779, "learning_rate": 1.885468524206134e-05, "loss": 1.5597, "step": 16180 }, { "epoch": 2.9, "grad_norm": 0.7909181714057922, "learning_rate": 1.8841057498495736e-05, "loss": 1.8607, "step": 16185 }, { "epoch": 2.9, "grad_norm": 0.9658223986625671, "learning_rate": 1.8827431702997732e-05, "loss": 1.7466, "step": 16190 }, { "epoch": 2.9, "grad_norm": 0.6813267469406128, "learning_rate": 1.8813807859877147e-05, "loss": 1.925, "step": 16195 }, { "epoch": 2.9, "grad_norm": 0.9319490790367126, "learning_rate": 1.8800185973443174e-05, "loss": 1.9117, "step": 16200 }, { "epoch": 2.9, "grad_norm": 0.8836780190467834, "learning_rate": 1.878656604800442e-05, "loss": 2.0741, "step": 16205 }, { "epoch": 2.9, "grad_norm": 0.99202960729599, "learning_rate": 1.8772948087868846e-05, "loss": 1.7016, "step": 16210 }, { "epoch": 2.9, "grad_norm": 0.7995858788490295, "learning_rate": 1.8759332097343797e-05, "loss": 2.0791, "step": 16215 }, { "epoch": 2.9, "grad_norm": 1.3566383123397827, "learning_rate": 1.874571808073601e-05, "loss": 1.5975, "step": 16220 }, { "epoch": 2.9, "grad_norm": 1.005405306816101, "learning_rate": 1.8732106042351572e-05, "loss": 1.9958, "step": 16225 }, { "epoch": 2.91, "grad_norm": 0.589972198009491, "learning_rate": 1.8718495986495967e-05, "loss": 2.1223, "step": 16230 }, { "epoch": 2.91, "grad_norm": 1.41853928565979, "learning_rate": 1.8704887917474042e-05, "loss": 2.0237, "step": 16235 }, { "epoch": 2.91, "grad_norm": 0.5799002051353455, "learning_rate": 1.8691281839590018e-05, "loss": 1.8955, "step": 16240 }, { "epoch": 2.91, "grad_norm": 0.6206422448158264, "learning_rate": 1.867767775714747e-05, "loss": 1.9139, "step": 16245 }, { "epoch": 2.91, "grad_norm": 1.6891002655029297, "learning_rate": 1.8664075674449376e-05, "loss": 2.0885, "step": 16250 }, { "epoch": 2.91, "grad_norm": 0.5611288547515869, "learning_rate": 1.865047559579804e-05, "loss": 2.031, "step": 16255 }, { "epoch": 2.91, "grad_norm": 1.0590778589248657, "learning_rate": 1.8636877525495173e-05, "loss": 1.6809, "step": 16260 }, { "epoch": 2.91, "grad_norm": 0.43978947401046753, "learning_rate": 1.8623281467841823e-05, "loss": 1.9269, "step": 16265 }, { "epoch": 2.91, "grad_norm": 0.8819534182548523, "learning_rate": 1.8609687427138407e-05, "loss": 1.8985, "step": 16270 }, { "epoch": 2.91, "grad_norm": 1.4660248756408691, "learning_rate": 1.859609540768471e-05, "loss": 1.8683, "step": 16275 }, { "epoch": 2.91, "grad_norm": 0.7070092558860779, "learning_rate": 1.8582505413779885e-05, "loss": 1.7901, "step": 16280 }, { "epoch": 2.91, "grad_norm": 0.8060388565063477, "learning_rate": 1.8568917449722427e-05, "loss": 1.6429, "step": 16285 }, { "epoch": 2.92, "grad_norm": 1.0174509286880493, "learning_rate": 1.8555331519810188e-05, "loss": 1.8506, "step": 16290 }, { "epoch": 2.92, "grad_norm": 1.0679351091384888, "learning_rate": 1.8541747628340388e-05, "loss": 1.7145, "step": 16295 }, { "epoch": 2.92, "grad_norm": 0.9246571660041809, "learning_rate": 1.8528165779609603e-05, "loss": 1.6584, "step": 16300 }, { "epoch": 2.92, "grad_norm": 0.5674405097961426, "learning_rate": 1.851458597791375e-05, "loss": 1.5784, "step": 16305 }, { "epoch": 2.92, "grad_norm": 0.738040566444397, "learning_rate": 1.8501008227548123e-05, "loss": 1.8847, "step": 16310 }, { "epoch": 2.92, "grad_norm": 0.651881754398346, "learning_rate": 1.8487432532807335e-05, "loss": 1.8891, "step": 16315 }, { "epoch": 2.92, "grad_norm": 0.8285610675811768, "learning_rate": 1.847385889798538e-05, "loss": 1.7669, "step": 16320 }, { "epoch": 2.92, "grad_norm": 0.5357155203819275, "learning_rate": 1.8460287327375576e-05, "loss": 1.9032, "step": 16325 }, { "epoch": 2.92, "grad_norm": 0.7268362045288086, "learning_rate": 1.8446717825270595e-05, "loss": 1.841, "step": 16330 }, { "epoch": 2.92, "grad_norm": 0.7002309560775757, "learning_rate": 1.8433150395962473e-05, "loss": 2.0385, "step": 16335 }, { "epoch": 2.92, "grad_norm": 1.3066308498382568, "learning_rate": 1.841958504374256e-05, "loss": 1.8508, "step": 16340 }, { "epoch": 2.93, "grad_norm": 0.6260136961936951, "learning_rate": 1.8406021772901566e-05, "loss": 2.0906, "step": 16345 }, { "epoch": 2.93, "grad_norm": 0.523491382598877, "learning_rate": 1.839246058772954e-05, "loss": 1.9063, "step": 16350 }, { "epoch": 2.93, "grad_norm": 1.4432581663131714, "learning_rate": 1.837890149251588e-05, "loss": 2.143, "step": 16355 }, { "epoch": 2.93, "grad_norm": 1.09247887134552, "learning_rate": 1.83653444915493e-05, "loss": 1.9382, "step": 16360 }, { "epoch": 2.93, "grad_norm": 1.2057552337646484, "learning_rate": 1.835178958911789e-05, "loss": 1.8019, "step": 16365 }, { "epoch": 2.93, "grad_norm": 0.8609539866447449, "learning_rate": 1.8338236789509024e-05, "loss": 1.9375, "step": 16370 }, { "epoch": 2.93, "grad_norm": 1.2171326875686646, "learning_rate": 1.832468609700946e-05, "loss": 1.9367, "step": 16375 }, { "epoch": 2.93, "grad_norm": 1.5746159553527832, "learning_rate": 1.831113751590527e-05, "loss": 1.6848, "step": 16380 }, { "epoch": 2.93, "grad_norm": 0.9621313810348511, "learning_rate": 1.8297591050481843e-05, "loss": 1.7429, "step": 16385 }, { "epoch": 2.93, "grad_norm": 0.3991534113883972, "learning_rate": 1.8284046705023915e-05, "loss": 1.6909, "step": 16390 }, { "epoch": 2.93, "grad_norm": 0.6495162844657898, "learning_rate": 1.8270504483815558e-05, "loss": 1.974, "step": 16395 }, { "epoch": 2.94, "grad_norm": 0.6558966040611267, "learning_rate": 1.8256964391140164e-05, "loss": 1.9515, "step": 16400 }, { "epoch": 2.94, "grad_norm": 0.7836574912071228, "learning_rate": 1.824342643128044e-05, "loss": 1.9209, "step": 16405 }, { "epoch": 2.94, "grad_norm": 0.685867190361023, "learning_rate": 1.822989060851844e-05, "loss": 1.6289, "step": 16410 }, { "epoch": 2.94, "grad_norm": 2.84834361076355, "learning_rate": 1.8216356927135525e-05, "loss": 1.7064, "step": 16415 }, { "epoch": 2.94, "grad_norm": 0.5934162139892578, "learning_rate": 1.8202825391412393e-05, "loss": 1.7927, "step": 16420 }, { "epoch": 2.94, "grad_norm": 0.5504593253135681, "learning_rate": 1.8192001710585122e-05, "loss": 2.1149, "step": 16425 }, { "epoch": 2.94, "grad_norm": 0.6468228697776794, "learning_rate": 1.817847404783477e-05, "loss": 2.1836, "step": 16430 }, { "epoch": 2.94, "grad_norm": 0.6115560531616211, "learning_rate": 1.8164948542726506e-05, "loss": 1.7916, "step": 16435 }, { "epoch": 2.94, "grad_norm": 0.9618609547615051, "learning_rate": 1.8151425199538447e-05, "loss": 1.7555, "step": 16440 }, { "epoch": 2.94, "grad_norm": 0.8971806168556213, "learning_rate": 1.813790402254801e-05, "loss": 1.837, "step": 16445 }, { "epoch": 2.94, "grad_norm": 0.7342554926872253, "learning_rate": 1.812438501603193e-05, "loss": 1.9009, "step": 16450 }, { "epoch": 2.95, "grad_norm": 0.743880033493042, "learning_rate": 1.811086818426625e-05, "loss": 1.7526, "step": 16455 }, { "epoch": 2.95, "grad_norm": 1.2517614364624023, "learning_rate": 1.8097353531526346e-05, "loss": 1.9244, "step": 16460 }, { "epoch": 2.95, "grad_norm": 0.933478057384491, "learning_rate": 1.8083841062086875e-05, "loss": 1.7212, "step": 16465 }, { "epoch": 2.95, "grad_norm": 0.49430811405181885, "learning_rate": 1.807033078022183e-05, "loss": 1.997, "step": 16470 }, { "epoch": 2.95, "grad_norm": 1.0263806581497192, "learning_rate": 1.8056822690204495e-05, "loss": 1.9313, "step": 16475 }, { "epoch": 2.95, "grad_norm": 0.8995507955551147, "learning_rate": 1.804331679630746e-05, "loss": 1.7927, "step": 16480 }, { "epoch": 2.95, "grad_norm": 1.0173105001449585, "learning_rate": 1.8029813102802624e-05, "loss": 1.6534, "step": 16485 }, { "epoch": 2.95, "grad_norm": 1.0232231616973877, "learning_rate": 1.8016311613961206e-05, "loss": 1.9429, "step": 16490 }, { "epoch": 2.95, "grad_norm": 0.8685727715492249, "learning_rate": 1.8002812334053703e-05, "loss": 1.7248, "step": 16495 }, { "epoch": 2.95, "grad_norm": 0.9551222324371338, "learning_rate": 1.7989315267349936e-05, "loss": 1.978, "step": 16500 }, { "epoch": 2.95, "grad_norm": 0.7746654748916626, "learning_rate": 1.7975820418119e-05, "loss": 2.0558, "step": 16505 }, { "epoch": 2.96, "grad_norm": 1.1619644165039062, "learning_rate": 1.7962327790629308e-05, "loss": 1.8762, "step": 16510 }, { "epoch": 2.96, "grad_norm": 0.7726320028305054, "learning_rate": 1.794883738914857e-05, "loss": 1.74, "step": 16515 }, { "epoch": 2.96, "grad_norm": 0.9551023244857788, "learning_rate": 1.793534921794379e-05, "loss": 1.9455, "step": 16520 }, { "epoch": 2.96, "grad_norm": 1.1138999462127686, "learning_rate": 1.792186328128125e-05, "loss": 1.5777, "step": 16525 }, { "epoch": 2.96, "grad_norm": 0.5583162307739258, "learning_rate": 1.7908379583426542e-05, "loss": 1.774, "step": 16530 }, { "epoch": 2.96, "grad_norm": 0.6044538617134094, "learning_rate": 1.7894898128644557e-05, "loss": 1.7541, "step": 16535 }, { "epoch": 2.96, "grad_norm": 0.9106922149658203, "learning_rate": 1.7881418921199454e-05, "loss": 1.956, "step": 16540 }, { "epoch": 2.96, "grad_norm": 0.6646421551704407, "learning_rate": 1.7867941965354706e-05, "loss": 1.8526, "step": 16545 }, { "epoch": 2.96, "grad_norm": 0.8308748006820679, "learning_rate": 1.7854467265373047e-05, "loss": 1.6069, "step": 16550 }, { "epoch": 2.96, "grad_norm": 0.7582127451896667, "learning_rate": 1.7840994825516523e-05, "loss": 1.835, "step": 16555 }, { "epoch": 2.96, "grad_norm": 0.9956608414649963, "learning_rate": 1.7827524650046447e-05, "loss": 1.7829, "step": 16560 }, { "epoch": 2.97, "grad_norm": 1.3976410627365112, "learning_rate": 1.7814056743223424e-05, "loss": 1.8531, "step": 16565 }, { "epoch": 2.97, "grad_norm": 0.6910321712493896, "learning_rate": 1.780059110930735e-05, "loss": 1.8633, "step": 16570 }, { "epoch": 2.97, "grad_norm": 1.205909013748169, "learning_rate": 1.7787127752557378e-05, "loss": 1.9529, "step": 16575 }, { "epoch": 2.97, "grad_norm": 1.323149561882019, "learning_rate": 1.777366667723196e-05, "loss": 1.7019, "step": 16580 }, { "epoch": 2.97, "grad_norm": 0.8329334855079651, "learning_rate": 1.776020788758882e-05, "loss": 1.757, "step": 16585 }, { "epoch": 2.97, "grad_norm": 2.090344190597534, "learning_rate": 1.7746751387884962e-05, "loss": 1.7325, "step": 16590 }, { "epoch": 2.97, "grad_norm": 1.0283842086791992, "learning_rate": 1.7733297182376663e-05, "loss": 1.8203, "step": 16595 }, { "epoch": 2.97, "grad_norm": 2.198268175125122, "learning_rate": 1.7719845275319473e-05, "loss": 1.5753, "step": 16600 }, { "epoch": 2.97, "grad_norm": 0.6753280758857727, "learning_rate": 1.7706395670968225e-05, "loss": 1.7627, "step": 16605 }, { "epoch": 2.97, "grad_norm": 0.7107426524162292, "learning_rate": 1.7692948373577003e-05, "loss": 1.8784, "step": 16610 }, { "epoch": 2.97, "grad_norm": 0.8274734020233154, "learning_rate": 1.7679503387399193e-05, "loss": 2.058, "step": 16615 }, { "epoch": 2.97, "grad_norm": 0.8373821973800659, "learning_rate": 1.766606071668741e-05, "loss": 2.1438, "step": 16620 }, { "epoch": 2.98, "grad_norm": 0.6570589542388916, "learning_rate": 1.765262036569358e-05, "loss": 2.1135, "step": 16625 }, { "epoch": 2.98, "grad_norm": 0.4979616105556488, "learning_rate": 1.7639182338668847e-05, "loss": 1.8345, "step": 16630 }, { "epoch": 2.98, "grad_norm": 1.0736268758773804, "learning_rate": 1.7625746639863673e-05, "loss": 1.9596, "step": 16635 }, { "epoch": 2.98, "grad_norm": 0.7851144075393677, "learning_rate": 1.761231327352773e-05, "loss": 1.8697, "step": 16640 }, { "epoch": 2.98, "grad_norm": 0.7080187201499939, "learning_rate": 1.7598882243909997e-05, "loss": 1.5315, "step": 16645 }, { "epoch": 2.98, "grad_norm": 0.8879645466804504, "learning_rate": 1.7585453555258697e-05, "loss": 2.0436, "step": 16650 }, { "epoch": 2.98, "grad_norm": 0.9378961324691772, "learning_rate": 1.75720272118213e-05, "loss": 2.0527, "step": 16655 }, { "epoch": 2.98, "grad_norm": 0.886584460735321, "learning_rate": 1.755860321784456e-05, "loss": 1.8691, "step": 16660 }, { "epoch": 2.98, "grad_norm": 0.5075838565826416, "learning_rate": 1.7545181577574454e-05, "loss": 1.7712, "step": 16665 }, { "epoch": 2.98, "grad_norm": 0.6004984974861145, "learning_rate": 1.7531762295256254e-05, "loss": 1.8251, "step": 16670 }, { "epoch": 2.98, "grad_norm": 0.7136458158493042, "learning_rate": 1.7518345375134447e-05, "loss": 1.7217, "step": 16675 }, { "epoch": 2.99, "grad_norm": 0.8066684603691101, "learning_rate": 1.7504930821452808e-05, "loss": 2.0242, "step": 16680 }, { "epoch": 2.99, "grad_norm": 0.7061532735824585, "learning_rate": 1.7491518638454336e-05, "loss": 1.7245, "step": 16685 }, { "epoch": 2.99, "grad_norm": 0.7429984211921692, "learning_rate": 1.7478108830381296e-05, "loss": 1.6728, "step": 16690 }, { "epoch": 2.99, "grad_norm": 0.9607851505279541, "learning_rate": 1.74647014014752e-05, "loss": 1.3616, "step": 16695 }, { "epoch": 2.99, "grad_norm": 0.46374931931495667, "learning_rate": 1.74512963559768e-05, "loss": 1.7103, "step": 16700 }, { "epoch": 2.99, "grad_norm": 1.2484031915664673, "learning_rate": 1.7437893698126102e-05, "loss": 1.745, "step": 16705 }, { "epoch": 2.99, "grad_norm": 1.0168793201446533, "learning_rate": 1.7424493432162366e-05, "loss": 1.8125, "step": 16710 }, { "epoch": 2.99, "grad_norm": 0.9517542123794556, "learning_rate": 1.7411095562324062e-05, "loss": 1.7851, "step": 16715 }, { "epoch": 2.99, "grad_norm": 1.2883968353271484, "learning_rate": 1.7397700092848927e-05, "loss": 1.9432, "step": 16720 }, { "epoch": 2.99, "grad_norm": 0.5470624566078186, "learning_rate": 1.7384307027973946e-05, "loss": 1.8344, "step": 16725 }, { "epoch": 2.99, "grad_norm": 1.2388825416564941, "learning_rate": 1.7370916371935322e-05, "loss": 1.8179, "step": 16730 }, { "epoch": 3.0, "grad_norm": 0.9549524784088135, "learning_rate": 1.7357528128968513e-05, "loss": 1.6871, "step": 16735 }, { "epoch": 3.0, "grad_norm": 0.6752102375030518, "learning_rate": 1.7344142303308208e-05, "loss": 1.9497, "step": 16740 }, { "epoch": 3.0, "grad_norm": 0.8145148158073425, "learning_rate": 1.7330758899188322e-05, "loss": 1.8557, "step": 16745 }, { "epoch": 3.0, "grad_norm": 0.9011813402175903, "learning_rate": 1.7317377920842028e-05, "loss": 2.0228, "step": 16750 }, { "epoch": 3.0, "grad_norm": 0.5068954825401306, "learning_rate": 1.7303999372501705e-05, "loss": 1.677, "step": 16755 }, { "epoch": 3.0, "grad_norm": 0.4357677102088928, "learning_rate": 1.729062325839898e-05, "loss": 2.1066, "step": 16760 }, { "epoch": 3.0, "grad_norm": 0.6191133856773376, "learning_rate": 1.7277249582764697e-05, "loss": 1.931, "step": 16765 }, { "epoch": 3.0, "grad_norm": 0.5191856026649475, "learning_rate": 1.726387834982895e-05, "loss": 1.5549, "step": 16770 }, { "epoch": 3.0, "grad_norm": 0.7998222708702087, "learning_rate": 1.7250509563821032e-05, "loss": 1.5933, "step": 16775 }, { "epoch": 3.0, "grad_norm": 1.8126819133758545, "learning_rate": 1.7237143228969488e-05, "loss": 2.0985, "step": 16780 }, { "epoch": 3.0, "grad_norm": 0.6312077641487122, "learning_rate": 1.7223779349502073e-05, "loss": 1.9076, "step": 16785 }, { "epoch": 3.01, "grad_norm": 0.9065338969230652, "learning_rate": 1.721041792964577e-05, "loss": 1.7754, "step": 16790 }, { "epoch": 3.01, "grad_norm": 0.9230442047119141, "learning_rate": 1.7197058973626785e-05, "loss": 2.0562, "step": 16795 }, { "epoch": 3.01, "grad_norm": 0.8840682506561279, "learning_rate": 1.7183702485670543e-05, "loss": 1.9154, "step": 16800 }, { "epoch": 3.01, "grad_norm": 0.8318727612495422, "learning_rate": 1.7170348470001688e-05, "loss": 1.9867, "step": 16805 }, { "epoch": 3.01, "grad_norm": 0.6302641034126282, "learning_rate": 1.7156996930844073e-05, "loss": 1.9369, "step": 16810 }, { "epoch": 3.01, "grad_norm": 0.49491533637046814, "learning_rate": 1.7143647872420793e-05, "loss": 1.569, "step": 16815 }, { "epoch": 3.01, "grad_norm": 1.2738268375396729, "learning_rate": 1.7130301298954126e-05, "loss": 1.817, "step": 16820 }, { "epoch": 3.01, "grad_norm": 0.8552365899085999, "learning_rate": 1.7116957214665587e-05, "loss": 1.6973, "step": 16825 }, { "epoch": 3.01, "grad_norm": 1.1242488622665405, "learning_rate": 1.710361562377591e-05, "loss": 1.721, "step": 16830 }, { "epoch": 3.01, "grad_norm": 0.8571417927742004, "learning_rate": 1.7090276530505004e-05, "loss": 1.7216, "step": 16835 }, { "epoch": 3.01, "grad_norm": 8.810229301452637, "learning_rate": 1.707693993907203e-05, "loss": 2.1472, "step": 16840 }, { "epoch": 3.02, "grad_norm": 0.7794356346130371, "learning_rate": 1.706360585369533e-05, "loss": 1.9728, "step": 16845 }, { "epoch": 3.02, "grad_norm": 0.5598608255386353, "learning_rate": 1.705027427859247e-05, "loss": 1.7381, "step": 16850 }, { "epoch": 3.02, "grad_norm": 0.5534650087356567, "learning_rate": 1.7036945217980205e-05, "loss": 1.9653, "step": 16855 }, { "epoch": 3.02, "grad_norm": 1.1248143911361694, "learning_rate": 1.702361867607451e-05, "loss": 1.8914, "step": 16860 }, { "epoch": 3.02, "grad_norm": 5.001594543457031, "learning_rate": 1.7010294657090548e-05, "loss": 1.7459, "step": 16865 }, { "epoch": 3.02, "grad_norm": 0.5881880521774292, "learning_rate": 1.6996973165242704e-05, "loss": 1.5695, "step": 16870 }, { "epoch": 3.02, "grad_norm": 0.516708493232727, "learning_rate": 1.6983654204744552e-05, "loss": 1.8567, "step": 16875 }, { "epoch": 3.02, "grad_norm": 1.0162904262542725, "learning_rate": 1.6970337779808866e-05, "loss": 1.7251, "step": 16880 }, { "epoch": 3.02, "grad_norm": 0.7917419075965881, "learning_rate": 1.6957023894647614e-05, "loss": 1.7346, "step": 16885 }, { "epoch": 3.02, "grad_norm": 0.6009075045585632, "learning_rate": 1.6943712553471966e-05, "loss": 2.0054, "step": 16890 }, { "epoch": 3.02, "grad_norm": 1.0688451528549194, "learning_rate": 1.6930403760492297e-05, "loss": 1.7544, "step": 16895 }, { "epoch": 3.03, "grad_norm": 0.9357558488845825, "learning_rate": 1.691709751991815e-05, "loss": 1.9594, "step": 16900 }, { "epoch": 3.03, "grad_norm": 0.6222114562988281, "learning_rate": 1.6903793835958283e-05, "loss": 1.9862, "step": 16905 }, { "epoch": 3.03, "grad_norm": 1.3084805011749268, "learning_rate": 1.6890492712820636e-05, "loss": 1.7035, "step": 16910 }, { "epoch": 3.03, "grad_norm": 0.9012706279754639, "learning_rate": 1.6877194154712342e-05, "loss": 2.0331, "step": 16915 }, { "epoch": 3.03, "grad_norm": 2.868881940841675, "learning_rate": 1.686389816583973e-05, "loss": 1.6892, "step": 16920 }, { "epoch": 3.03, "grad_norm": 0.7443296909332275, "learning_rate": 1.6850604750408296e-05, "loss": 1.6757, "step": 16925 }, { "epoch": 3.03, "grad_norm": 1.3445460796356201, "learning_rate": 1.6837313912622748e-05, "loss": 1.9912, "step": 16930 }, { "epoch": 3.03, "grad_norm": 0.43768608570098877, "learning_rate": 1.682402565668695e-05, "loss": 1.843, "step": 16935 }, { "epoch": 3.03, "grad_norm": 1.3455616235733032, "learning_rate": 1.6810739986803987e-05, "loss": 1.9418, "step": 16940 }, { "epoch": 3.03, "grad_norm": 0.627327561378479, "learning_rate": 1.6797456907176074e-05, "loss": 1.3489, "step": 16945 }, { "epoch": 3.03, "grad_norm": 1.0638902187347412, "learning_rate": 1.678417642200466e-05, "loss": 1.8686, "step": 16950 }, { "epoch": 3.03, "grad_norm": 0.8032281398773193, "learning_rate": 1.6770898535490333e-05, "loss": 1.6365, "step": 16955 }, { "epoch": 3.04, "grad_norm": 0.4566420912742615, "learning_rate": 1.6757623251832887e-05, "loss": 2.1101, "step": 16960 }, { "epoch": 3.04, "grad_norm": 0.840287446975708, "learning_rate": 1.674435057523128e-05, "loss": 1.5239, "step": 16965 }, { "epoch": 3.04, "grad_norm": 1.571700930595398, "learning_rate": 1.673108050988364e-05, "loss": 1.9991, "step": 16970 }, { "epoch": 3.04, "grad_norm": 0.6242066621780396, "learning_rate": 1.6717813059987293e-05, "loss": 2.0588, "step": 16975 }, { "epoch": 3.04, "grad_norm": 1.1033291816711426, "learning_rate": 1.6704548229738697e-05, "loss": 1.7984, "step": 16980 }, { "epoch": 3.04, "grad_norm": 0.7740524411201477, "learning_rate": 1.6691286023333517e-05, "loss": 1.5941, "step": 16985 }, { "epoch": 3.04, "grad_norm": 0.677450954914093, "learning_rate": 1.6678026444966587e-05, "loss": 1.9171, "step": 16990 }, { "epoch": 3.04, "grad_norm": 0.6410496234893799, "learning_rate": 1.6664769498831884e-05, "loss": 1.9519, "step": 16995 }, { "epoch": 3.04, "grad_norm": 1.0398961305618286, "learning_rate": 1.6651515189122564e-05, "loss": 1.7254, "step": 17000 }, { "epoch": 3.04, "grad_norm": 0.7212021350860596, "learning_rate": 1.663826352003096e-05, "loss": 1.9161, "step": 17005 }, { "epoch": 3.04, "grad_norm": 0.8684729337692261, "learning_rate": 1.6625014495748566e-05, "loss": 1.8468, "step": 17010 }, { "epoch": 3.05, "grad_norm": 0.5007403492927551, "learning_rate": 1.661176812046603e-05, "loss": 2.1323, "step": 17015 }, { "epoch": 3.05, "grad_norm": 1.1957733631134033, "learning_rate": 1.659852439837317e-05, "loss": 1.7557, "step": 17020 }, { "epoch": 3.05, "grad_norm": 0.935827910900116, "learning_rate": 1.658528333365896e-05, "loss": 1.9512, "step": 17025 }, { "epoch": 3.05, "grad_norm": 0.5936657786369324, "learning_rate": 1.657204493051153e-05, "loss": 1.8086, "step": 17030 }, { "epoch": 3.05, "grad_norm": 0.7128114700317383, "learning_rate": 1.6558809193118202e-05, "loss": 1.7417, "step": 17035 }, { "epoch": 3.05, "grad_norm": 1.5274105072021484, "learning_rate": 1.6545576125665395e-05, "loss": 2.0459, "step": 17040 }, { "epoch": 3.05, "grad_norm": 1.2550373077392578, "learning_rate": 1.6532345732338724e-05, "loss": 1.689, "step": 17045 }, { "epoch": 3.05, "grad_norm": 0.6413518786430359, "learning_rate": 1.651911801732296e-05, "loss": 1.9651, "step": 17050 }, { "epoch": 3.05, "grad_norm": 0.7453106045722961, "learning_rate": 1.6505892984802e-05, "loss": 1.8327, "step": 17055 }, { "epoch": 3.05, "grad_norm": 0.7208839058876038, "learning_rate": 1.6492670638958924e-05, "loss": 1.7667, "step": 17060 }, { "epoch": 3.05, "grad_norm": 4.1532416343688965, "learning_rate": 1.6479450983975946e-05, "loss": 1.7473, "step": 17065 }, { "epoch": 3.06, "grad_norm": 1.0695805549621582, "learning_rate": 1.646623402403442e-05, "loss": 1.7827, "step": 17070 }, { "epoch": 3.06, "grad_norm": 0.7231548428535461, "learning_rate": 1.6453019763314862e-05, "loss": 1.8391, "step": 17075 }, { "epoch": 3.06, "grad_norm": 0.7367892265319824, "learning_rate": 1.6439808205996942e-05, "loss": 1.6613, "step": 17080 }, { "epoch": 3.06, "grad_norm": 0.9162821769714355, "learning_rate": 1.642659935625945e-05, "loss": 2.0619, "step": 17085 }, { "epoch": 3.06, "grad_norm": 0.6432048678398132, "learning_rate": 1.6413393218280328e-05, "loss": 2.0551, "step": 17090 }, { "epoch": 3.06, "grad_norm": 1.1106834411621094, "learning_rate": 1.640018979623668e-05, "loss": 1.7662, "step": 17095 }, { "epoch": 3.06, "grad_norm": 0.7890145778656006, "learning_rate": 1.6386989094304715e-05, "loss": 1.8003, "step": 17100 }, { "epoch": 3.06, "grad_norm": 0.9444851875305176, "learning_rate": 1.6373791116659823e-05, "loss": 1.7911, "step": 17105 }, { "epoch": 3.06, "grad_norm": 1.1127926111221313, "learning_rate": 1.6360595867476504e-05, "loss": 1.7545, "step": 17110 }, { "epoch": 3.06, "grad_norm": 0.997949481010437, "learning_rate": 1.63474033509284e-05, "loss": 2.0693, "step": 17115 }, { "epoch": 3.06, "grad_norm": 0.6662190556526184, "learning_rate": 1.6334213571188288e-05, "loss": 1.7301, "step": 17120 }, { "epoch": 3.07, "grad_norm": 1.0271848440170288, "learning_rate": 1.63210265324281e-05, "loss": 1.9007, "step": 17125 }, { "epoch": 3.07, "grad_norm": 1.0467654466629028, "learning_rate": 1.6307842238818874e-05, "loss": 1.7783, "step": 17130 }, { "epoch": 3.07, "grad_norm": 0.8379384279251099, "learning_rate": 1.6294660694530776e-05, "loss": 1.629, "step": 17135 }, { "epoch": 3.07, "grad_norm": 0.7403329014778137, "learning_rate": 1.628148190373313e-05, "loss": 1.5475, "step": 17140 }, { "epoch": 3.07, "grad_norm": 0.8592426776885986, "learning_rate": 1.626830587059437e-05, "loss": 1.7707, "step": 17145 }, { "epoch": 3.07, "grad_norm": 0.8583236932754517, "learning_rate": 1.6255132599282065e-05, "loss": 1.785, "step": 17150 }, { "epoch": 3.07, "grad_norm": 0.5623230934143066, "learning_rate": 1.624196209396291e-05, "loss": 1.9508, "step": 17155 }, { "epoch": 3.07, "grad_norm": 0.7288662195205688, "learning_rate": 1.622879435880272e-05, "loss": 1.9152, "step": 17160 }, { "epoch": 3.07, "grad_norm": 0.5778056383132935, "learning_rate": 1.621562939796643e-05, "loss": 1.8435, "step": 17165 }, { "epoch": 3.07, "grad_norm": 0.7465753555297852, "learning_rate": 1.620246721561812e-05, "loss": 1.6645, "step": 17170 }, { "epoch": 3.07, "grad_norm": 0.6527841687202454, "learning_rate": 1.6189307815920973e-05, "loss": 1.7769, "step": 17175 }, { "epoch": 3.08, "grad_norm": 0.9524030089378357, "learning_rate": 1.617615120303728e-05, "loss": 1.9306, "step": 17180 }, { "epoch": 3.08, "grad_norm": 0.8811773061752319, "learning_rate": 1.6162997381128478e-05, "loss": 1.9073, "step": 17185 }, { "epoch": 3.08, "grad_norm": 0.5965471863746643, "learning_rate": 1.61498463543551e-05, "loss": 2.1639, "step": 17190 }, { "epoch": 3.08, "grad_norm": 0.8437951803207397, "learning_rate": 1.6136698126876806e-05, "loss": 1.8132, "step": 17195 }, { "epoch": 3.08, "grad_norm": 0.6007370948791504, "learning_rate": 1.6123552702852372e-05, "loss": 1.9358, "step": 17200 }, { "epoch": 3.08, "grad_norm": 0.843265950679779, "learning_rate": 1.611041008643968e-05, "loss": 1.9867, "step": 17205 }, { "epoch": 3.08, "grad_norm": 0.7209492325782776, "learning_rate": 1.6097270281795722e-05, "loss": 2.0699, "step": 17210 }, { "epoch": 3.08, "grad_norm": 1.3360052108764648, "learning_rate": 1.608413329307662e-05, "loss": 1.8391, "step": 17215 }, { "epoch": 3.08, "grad_norm": 0.6539368033409119, "learning_rate": 1.6070999124437586e-05, "loss": 2.1616, "step": 17220 }, { "epoch": 3.08, "grad_norm": 0.6861870884895325, "learning_rate": 1.605786778003293e-05, "loss": 1.6254, "step": 17225 }, { "epoch": 3.08, "grad_norm": 0.804837703704834, "learning_rate": 1.60447392640161e-05, "loss": 1.6783, "step": 17230 }, { "epoch": 3.08, "grad_norm": 0.64955735206604, "learning_rate": 1.6031613580539617e-05, "loss": 1.9525, "step": 17235 }, { "epoch": 3.09, "grad_norm": 1.0166507959365845, "learning_rate": 1.6018490733755137e-05, "loss": 1.7277, "step": 17240 }, { "epoch": 3.09, "grad_norm": 0.7587583661079407, "learning_rate": 1.60053707278134e-05, "loss": 1.7941, "step": 17245 }, { "epoch": 3.09, "grad_norm": 0.623679518699646, "learning_rate": 1.5992253566864244e-05, "loss": 1.8599, "step": 17250 }, { "epoch": 3.09, "grad_norm": 0.8486801385879517, "learning_rate": 1.597913925505663e-05, "loss": 1.7489, "step": 17255 }, { "epoch": 3.09, "grad_norm": 0.41435787081718445, "learning_rate": 1.5966027796538586e-05, "loss": 1.8594, "step": 17260 }, { "epoch": 3.09, "grad_norm": 2.0519394874572754, "learning_rate": 1.595291919545726e-05, "loss": 2.012, "step": 17265 }, { "epoch": 3.09, "grad_norm": 1.2812122106552124, "learning_rate": 1.59398134559589e-05, "loss": 2.0921, "step": 17270 }, { "epoch": 3.09, "grad_norm": 2.542084217071533, "learning_rate": 1.5926710582188826e-05, "loss": 1.8695, "step": 17275 }, { "epoch": 3.09, "grad_norm": 0.564811646938324, "learning_rate": 1.591361057829146e-05, "loss": 2.3541, "step": 17280 }, { "epoch": 3.09, "grad_norm": 0.9718075394630432, "learning_rate": 1.5900513448410332e-05, "loss": 1.6868, "step": 17285 }, { "epoch": 3.09, "grad_norm": 0.9781553149223328, "learning_rate": 1.588741919668805e-05, "loss": 1.9755, "step": 17290 }, { "epoch": 3.1, "grad_norm": 0.5526376366615295, "learning_rate": 1.5874327827266304e-05, "loss": 2.0345, "step": 17295 }, { "epoch": 3.1, "grad_norm": 3.0677220821380615, "learning_rate": 1.5861239344285893e-05, "loss": 1.9055, "step": 17300 }, { "epoch": 3.1, "grad_norm": 1.1272741556167603, "learning_rate": 1.5848153751886685e-05, "loss": 2.0391, "step": 17305 }, { "epoch": 3.1, "grad_norm": 0.8001586198806763, "learning_rate": 1.5835071054207643e-05, "loss": 1.9079, "step": 17310 }, { "epoch": 3.1, "grad_norm": 0.9023197293281555, "learning_rate": 1.582199125538682e-05, "loss": 1.7115, "step": 17315 }, { "epoch": 3.1, "grad_norm": 0.6295259594917297, "learning_rate": 1.5808914359561328e-05, "loss": 1.819, "step": 17320 }, { "epoch": 3.1, "grad_norm": 0.5197964310646057, "learning_rate": 1.579584037086738e-05, "loss": 1.8448, "step": 17325 }, { "epoch": 3.1, "grad_norm": 0.8185873031616211, "learning_rate": 1.578276929344027e-05, "loss": 1.9797, "step": 17330 }, { "epoch": 3.1, "grad_norm": 1.1019208431243896, "learning_rate": 1.5769701131414372e-05, "loss": 1.5105, "step": 17335 }, { "epoch": 3.1, "grad_norm": 0.8213723301887512, "learning_rate": 1.575663588892312e-05, "loss": 1.8677, "step": 17340 }, { "epoch": 3.1, "grad_norm": 0.8821647763252258, "learning_rate": 1.574357357009905e-05, "loss": 1.7404, "step": 17345 }, { "epoch": 3.11, "grad_norm": 1.0914586782455444, "learning_rate": 1.5730514179073756e-05, "loss": 1.6639, "step": 17350 }, { "epoch": 3.11, "grad_norm": 0.6956515908241272, "learning_rate": 1.5717457719977906e-05, "loss": 1.9229, "step": 17355 }, { "epoch": 3.11, "grad_norm": 1.160091757774353, "learning_rate": 1.5704404196941265e-05, "loss": 1.4537, "step": 17360 }, { "epoch": 3.11, "grad_norm": 0.5331469774246216, "learning_rate": 1.5691353614092627e-05, "loss": 1.842, "step": 17365 }, { "epoch": 3.11, "grad_norm": 0.9176687598228455, "learning_rate": 1.5678305975559882e-05, "loss": 1.5422, "step": 17370 }, { "epoch": 3.11, "grad_norm": 0.5114845633506775, "learning_rate": 1.5665261285469992e-05, "loss": 2.0008, "step": 17375 }, { "epoch": 3.11, "grad_norm": 0.7605366706848145, "learning_rate": 1.5652219547948982e-05, "loss": 1.871, "step": 17380 }, { "epoch": 3.11, "grad_norm": 1.129357933998108, "learning_rate": 1.5639180767121938e-05, "loss": 1.7841, "step": 17385 }, { "epoch": 3.11, "grad_norm": 0.8872073292732239, "learning_rate": 1.562614494711301e-05, "loss": 1.8101, "step": 17390 }, { "epoch": 3.11, "grad_norm": 1.4235471487045288, "learning_rate": 1.5613112092045418e-05, "loss": 2.1259, "step": 17395 }, { "epoch": 3.11, "grad_norm": 0.5211644172668457, "learning_rate": 1.5600082206041443e-05, "loss": 2.0976, "step": 17400 }, { "epoch": 3.12, "grad_norm": 1.07545804977417, "learning_rate": 1.558705529322243e-05, "loss": 2.0866, "step": 17405 }, { "epoch": 3.12, "grad_norm": 0.8453513383865356, "learning_rate": 1.5574031357708778e-05, "loss": 1.822, "step": 17410 }, { "epoch": 3.12, "grad_norm": 0.5668812394142151, "learning_rate": 1.556101040361993e-05, "loss": 1.8889, "step": 17415 }, { "epoch": 3.12, "grad_norm": 0.6529809236526489, "learning_rate": 1.554799243507441e-05, "loss": 2.2048, "step": 17420 }, { "epoch": 3.12, "grad_norm": 1.1768380403518677, "learning_rate": 1.5534977456189797e-05, "loss": 2.0006, "step": 17425 }, { "epoch": 3.12, "grad_norm": 0.7664825320243835, "learning_rate": 1.552196547108271e-05, "loss": 1.8376, "step": 17430 }, { "epoch": 3.12, "grad_norm": 0.5944017767906189, "learning_rate": 1.550895648386883e-05, "loss": 1.7861, "step": 17435 }, { "epoch": 3.12, "grad_norm": 1.4491263628005981, "learning_rate": 1.5495950498662877e-05, "loss": 1.8945, "step": 17440 }, { "epoch": 3.12, "grad_norm": 0.961166501045227, "learning_rate": 1.5482947519578645e-05, "loss": 1.9871, "step": 17445 }, { "epoch": 3.12, "grad_norm": 1.2184598445892334, "learning_rate": 1.5469947550728958e-05, "loss": 1.7694, "step": 17450 }, { "epoch": 3.12, "grad_norm": 1.1210654973983765, "learning_rate": 1.5456950596225703e-05, "loss": 1.6446, "step": 17455 }, { "epoch": 3.13, "grad_norm": 0.3188503682613373, "learning_rate": 1.5443956660179783e-05, "loss": 1.8352, "step": 17460 }, { "epoch": 3.13, "grad_norm": 0.4729178249835968, "learning_rate": 1.5430965746701183e-05, "loss": 1.9481, "step": 17465 }, { "epoch": 3.13, "grad_norm": 0.6622750163078308, "learning_rate": 1.5417977859898914e-05, "loss": 1.9198, "step": 17470 }, { "epoch": 3.13, "grad_norm": 2.716395854949951, "learning_rate": 1.5404993003881027e-05, "loss": 1.7338, "step": 17475 }, { "epoch": 3.13, "grad_norm": 0.9567592144012451, "learning_rate": 1.5392011182754627e-05, "loss": 1.5414, "step": 17480 }, { "epoch": 3.13, "grad_norm": 0.6734287738800049, "learning_rate": 1.537903240062585e-05, "loss": 2.1804, "step": 17485 }, { "epoch": 3.13, "grad_norm": 0.9989105463027954, "learning_rate": 1.5366056661599866e-05, "loss": 1.6213, "step": 17490 }, { "epoch": 3.13, "grad_norm": 1.0770398378372192, "learning_rate": 1.5353083969780897e-05, "loss": 1.9834, "step": 17495 }, { "epoch": 3.13, "grad_norm": 1.3205726146697998, "learning_rate": 1.5340114329272194e-05, "loss": 1.8774, "step": 17500 }, { "epoch": 3.13, "grad_norm": 0.6853746771812439, "learning_rate": 1.532714774417603e-05, "loss": 1.8502, "step": 17505 }, { "epoch": 3.13, "grad_norm": 0.6563792824745178, "learning_rate": 1.5314184218593724e-05, "loss": 1.6979, "step": 17510 }, { "epoch": 3.14, "grad_norm": 0.8047040104866028, "learning_rate": 1.5301223756625643e-05, "loss": 1.5599, "step": 17515 }, { "epoch": 3.14, "grad_norm": 1.3527451753616333, "learning_rate": 1.5288266362371155e-05, "loss": 1.8567, "step": 17520 }, { "epoch": 3.14, "grad_norm": 0.8321057558059692, "learning_rate": 1.527531203992868e-05, "loss": 1.9165, "step": 17525 }, { "epoch": 3.14, "grad_norm": 0.5902025699615479, "learning_rate": 1.526236079339565e-05, "loss": 1.8397, "step": 17530 }, { "epoch": 3.14, "grad_norm": 0.8331001400947571, "learning_rate": 1.5249412626868526e-05, "loss": 1.9709, "step": 17535 }, { "epoch": 3.14, "grad_norm": 1.3767504692077637, "learning_rate": 1.5236467544442821e-05, "loss": 1.9877, "step": 17540 }, { "epoch": 3.14, "grad_norm": 0.8942415714263916, "learning_rate": 1.5223525550213034e-05, "loss": 1.8899, "step": 17545 }, { "epoch": 3.14, "grad_norm": 1.9923580884933472, "learning_rate": 1.521058664827272e-05, "loss": 1.6486, "step": 17550 }, { "epoch": 3.14, "grad_norm": 0.7297930121421814, "learning_rate": 1.5197650842714426e-05, "loss": 1.8752, "step": 17555 }, { "epoch": 3.14, "grad_norm": 0.7140510082244873, "learning_rate": 1.5184718137629745e-05, "loss": 1.8954, "step": 17560 }, { "epoch": 3.14, "grad_norm": 0.7358191013336182, "learning_rate": 1.5171788537109272e-05, "loss": 2.0138, "step": 17565 }, { "epoch": 3.14, "grad_norm": 0.8002922534942627, "learning_rate": 1.5158862045242633e-05, "loss": 2.0375, "step": 17570 }, { "epoch": 3.15, "grad_norm": 0.6648169755935669, "learning_rate": 1.5145938666118459e-05, "loss": 1.9708, "step": 17575 }, { "epoch": 3.15, "grad_norm": 1.1531585454940796, "learning_rate": 1.5133018403824411e-05, "loss": 1.72, "step": 17580 }, { "epoch": 3.15, "grad_norm": 0.5088279843330383, "learning_rate": 1.512010126244715e-05, "loss": 1.7109, "step": 17585 }, { "epoch": 3.15, "grad_norm": 0.825671911239624, "learning_rate": 1.5107187246072357e-05, "loss": 1.7681, "step": 17590 }, { "epoch": 3.15, "grad_norm": 1.1927964687347412, "learning_rate": 1.5094276358784728e-05, "loss": 1.897, "step": 17595 }, { "epoch": 3.15, "grad_norm": 0.9928845763206482, "learning_rate": 1.5081368604667956e-05, "loss": 1.8343, "step": 17600 }, { "epoch": 3.15, "grad_norm": 0.9306197166442871, "learning_rate": 1.5068463987804748e-05, "loss": 1.9001, "step": 17605 }, { "epoch": 3.15, "grad_norm": 0.7192040681838989, "learning_rate": 1.5055562512276827e-05, "loss": 1.8987, "step": 17610 }, { "epoch": 3.15, "grad_norm": 2.9316582679748535, "learning_rate": 1.5042664182164926e-05, "loss": 1.9867, "step": 17615 }, { "epoch": 3.15, "grad_norm": 0.7092540860176086, "learning_rate": 1.502976900154876e-05, "loss": 1.9289, "step": 17620 }, { "epoch": 3.15, "grad_norm": 3.189732313156128, "learning_rate": 1.5016876974507065e-05, "loss": 1.6191, "step": 17625 }, { "epoch": 3.16, "grad_norm": 0.6250412464141846, "learning_rate": 1.500398810511759e-05, "loss": 1.7705, "step": 17630 }, { "epoch": 3.16, "grad_norm": 0.9590062499046326, "learning_rate": 1.4991102397457058e-05, "loss": 1.6913, "step": 17635 }, { "epoch": 3.16, "grad_norm": 0.5470601916313171, "learning_rate": 1.4978219855601216e-05, "loss": 1.912, "step": 17640 }, { "epoch": 3.16, "grad_norm": 0.9383836984634399, "learning_rate": 1.496534048362479e-05, "loss": 1.8818, "step": 17645 }, { "epoch": 3.16, "grad_norm": 0.911783754825592, "learning_rate": 1.4952464285601515e-05, "loss": 1.9046, "step": 17650 }, { "epoch": 3.16, "grad_norm": 2.2841453552246094, "learning_rate": 1.493959126560412e-05, "loss": 1.5841, "step": 17655 }, { "epoch": 3.16, "grad_norm": 0.586314857006073, "learning_rate": 1.4926721427704335e-05, "loss": 1.8118, "step": 17660 }, { "epoch": 3.16, "grad_norm": 0.8905709981918335, "learning_rate": 1.4913854775972869e-05, "loss": 1.8675, "step": 17665 }, { "epoch": 3.16, "grad_norm": 1.4882047176361084, "learning_rate": 1.4900991314479434e-05, "loss": 1.5654, "step": 17670 }, { "epoch": 3.16, "grad_norm": 0.8829624652862549, "learning_rate": 1.4888131047292742e-05, "loss": 1.697, "step": 17675 }, { "epoch": 3.16, "grad_norm": 1.1058586835861206, "learning_rate": 1.4875273978480464e-05, "loss": 1.7776, "step": 17680 }, { "epoch": 3.17, "grad_norm": 1.0767732858657837, "learning_rate": 1.4862420112109293e-05, "loss": 1.9039, "step": 17685 }, { "epoch": 3.17, "grad_norm": 0.7137855291366577, "learning_rate": 1.48495694522449e-05, "loss": 1.7499, "step": 17690 }, { "epoch": 3.17, "grad_norm": 0.44608253240585327, "learning_rate": 1.4836722002951908e-05, "loss": 1.8009, "step": 17695 }, { "epoch": 3.17, "grad_norm": 1.0002092123031616, "learning_rate": 1.4823877768293975e-05, "loss": 1.4692, "step": 17700 }, { "epoch": 3.17, "grad_norm": 0.8534446358680725, "learning_rate": 1.4811036752333718e-05, "loss": 1.6299, "step": 17705 }, { "epoch": 3.17, "grad_norm": 0.4848932921886444, "learning_rate": 1.4798198959132733e-05, "loss": 2.0872, "step": 17710 }, { "epoch": 3.17, "grad_norm": 0.5030261874198914, "learning_rate": 1.4785364392751606e-05, "loss": 2.0136, "step": 17715 }, { "epoch": 3.17, "grad_norm": 0.644991397857666, "learning_rate": 1.4772533057249898e-05, "loss": 1.8112, "step": 17720 }, { "epoch": 3.17, "grad_norm": 0.925753653049469, "learning_rate": 1.4759704956686144e-05, "loss": 1.55, "step": 17725 }, { "epoch": 3.17, "grad_norm": 0.7059409618377686, "learning_rate": 1.4746880095117866e-05, "loss": 1.8275, "step": 17730 }, { "epoch": 3.17, "grad_norm": 1.2741386890411377, "learning_rate": 1.4734058476601553e-05, "loss": 1.9445, "step": 17735 }, { "epoch": 3.18, "grad_norm": 0.7439374923706055, "learning_rate": 1.4721240105192663e-05, "loss": 1.7957, "step": 17740 }, { "epoch": 3.18, "grad_norm": 0.9652694463729858, "learning_rate": 1.470842498494564e-05, "loss": 1.8712, "step": 17745 }, { "epoch": 3.18, "grad_norm": 0.7117319107055664, "learning_rate": 1.4695613119913903e-05, "loss": 1.6265, "step": 17750 }, { "epoch": 3.18, "grad_norm": 1.2906383275985718, "learning_rate": 1.4682804514149818e-05, "loss": 1.7699, "step": 17755 }, { "epoch": 3.18, "grad_norm": 1.508944034576416, "learning_rate": 1.466999917170474e-05, "loss": 1.7239, "step": 17760 }, { "epoch": 3.18, "grad_norm": 0.925870954990387, "learning_rate": 1.4657197096628994e-05, "loss": 2.3444, "step": 17765 }, { "epoch": 3.18, "grad_norm": 1.065234899520874, "learning_rate": 1.464439829297185e-05, "loss": 1.5987, "step": 17770 }, { "epoch": 3.18, "grad_norm": 0.7614918351173401, "learning_rate": 1.4631602764781572e-05, "loss": 2.0636, "step": 17775 }, { "epoch": 3.18, "grad_norm": 0.5512299537658691, "learning_rate": 1.4618810516105371e-05, "loss": 1.9394, "step": 17780 }, { "epoch": 3.18, "grad_norm": 0.7986428141593933, "learning_rate": 1.460602155098941e-05, "loss": 1.8252, "step": 17785 }, { "epoch": 3.18, "grad_norm": 0.6706057190895081, "learning_rate": 1.4593235873478839e-05, "loss": 1.8949, "step": 17790 }, { "epoch": 3.19, "grad_norm": 0.6929710507392883, "learning_rate": 1.4580453487617745e-05, "loss": 1.8602, "step": 17795 }, { "epoch": 3.19, "grad_norm": 0.9884799122810364, "learning_rate": 1.456767439744919e-05, "loss": 1.6815, "step": 17800 }, { "epoch": 3.19, "grad_norm": 1.1615020036697388, "learning_rate": 1.455489860701519e-05, "loss": 1.8135, "step": 17805 }, { "epoch": 3.19, "grad_norm": 0.9147149920463562, "learning_rate": 1.4542126120356714e-05, "loss": 1.7469, "step": 17810 }, { "epoch": 3.19, "grad_norm": 0.612957239151001, "learning_rate": 1.4529356941513697e-05, "loss": 1.8893, "step": 17815 }, { "epoch": 3.19, "grad_norm": 0.8941155076026917, "learning_rate": 1.4516591074524999e-05, "loss": 2.1368, "step": 17820 }, { "epoch": 3.19, "grad_norm": 0.798802375793457, "learning_rate": 1.4503828523428458e-05, "loss": 1.7414, "step": 17825 }, { "epoch": 3.19, "grad_norm": 1.113351821899414, "learning_rate": 1.4491069292260868e-05, "loss": 1.9364, "step": 17830 }, { "epoch": 3.19, "grad_norm": 0.9882722496986389, "learning_rate": 1.4478313385057946e-05, "loss": 1.75, "step": 17835 }, { "epoch": 3.19, "grad_norm": 0.7189344167709351, "learning_rate": 1.4465560805854378e-05, "loss": 1.9663, "step": 17840 }, { "epoch": 3.19, "grad_norm": 0.7250414490699768, "learning_rate": 1.4452811558683804e-05, "loss": 1.8795, "step": 17845 }, { "epoch": 3.2, "grad_norm": 0.8930559754371643, "learning_rate": 1.4440065647578777e-05, "loss": 1.9817, "step": 17850 }, { "epoch": 3.2, "grad_norm": 0.7246281504631042, "learning_rate": 1.4427323076570831e-05, "loss": 1.9332, "step": 17855 }, { "epoch": 3.2, "grad_norm": 0.7635440230369568, "learning_rate": 1.4414583849690428e-05, "loss": 1.8095, "step": 17860 }, { "epoch": 3.2, "grad_norm": 0.7749634981155396, "learning_rate": 1.4401847970966966e-05, "loss": 1.7567, "step": 17865 }, { "epoch": 3.2, "grad_norm": 0.7701842784881592, "learning_rate": 1.4389115444428811e-05, "loss": 1.8668, "step": 17870 }, { "epoch": 3.2, "grad_norm": 2.8975706100463867, "learning_rate": 1.4376386274103231e-05, "loss": 1.7546, "step": 17875 }, { "epoch": 3.2, "grad_norm": 1.0762674808502197, "learning_rate": 1.436366046401645e-05, "loss": 1.8692, "step": 17880 }, { "epoch": 3.2, "grad_norm": 0.6440033316612244, "learning_rate": 1.4350938018193626e-05, "loss": 1.6009, "step": 17885 }, { "epoch": 3.2, "grad_norm": 0.7783496975898743, "learning_rate": 1.433821894065887e-05, "loss": 1.7383, "step": 17890 }, { "epoch": 3.2, "grad_norm": 0.7856084704399109, "learning_rate": 1.4325503235435207e-05, "loss": 2.0661, "step": 17895 }, { "epoch": 3.2, "grad_norm": 0.7685439586639404, "learning_rate": 1.4312790906544598e-05, "loss": 1.596, "step": 17900 }, { "epoch": 3.2, "grad_norm": 0.8082401156425476, "learning_rate": 1.430008195800796e-05, "loss": 1.8206, "step": 17905 }, { "epoch": 3.21, "grad_norm": 0.9814727306365967, "learning_rate": 1.4287376393845095e-05, "loss": 1.61, "step": 17910 }, { "epoch": 3.21, "grad_norm": 0.779301106929779, "learning_rate": 1.4274674218074775e-05, "loss": 1.5867, "step": 17915 }, { "epoch": 3.21, "grad_norm": 1.0549635887145996, "learning_rate": 1.4261975434714686e-05, "loss": 1.9271, "step": 17920 }, { "epoch": 3.21, "grad_norm": 0.9940875768661499, "learning_rate": 1.4249280047781435e-05, "loss": 2.0389, "step": 17925 }, { "epoch": 3.21, "grad_norm": 0.7739433646202087, "learning_rate": 1.4236588061290563e-05, "loss": 1.9533, "step": 17930 }, { "epoch": 3.21, "grad_norm": 0.6601684093475342, "learning_rate": 1.4223899479256542e-05, "loss": 1.6776, "step": 17935 }, { "epoch": 3.21, "grad_norm": 0.9212928414344788, "learning_rate": 1.4211214305692744e-05, "loss": 1.8223, "step": 17940 }, { "epoch": 3.21, "grad_norm": 0.4190042018890381, "learning_rate": 1.419853254461148e-05, "loss": 1.9103, "step": 17945 }, { "epoch": 3.21, "grad_norm": 1.1490883827209473, "learning_rate": 1.4185854200023984e-05, "loss": 1.5747, "step": 17950 }, { "epoch": 3.21, "grad_norm": 0.8107929825782776, "learning_rate": 1.4173179275940401e-05, "loss": 1.9681, "step": 17955 }, { "epoch": 3.21, "grad_norm": 1.9732346534729004, "learning_rate": 1.4160507776369813e-05, "loss": 1.5155, "step": 17960 }, { "epoch": 3.22, "grad_norm": 0.7424911856651306, "learning_rate": 1.4147839705320177e-05, "loss": 1.8159, "step": 17965 }, { "epoch": 3.22, "grad_norm": 1.0739436149597168, "learning_rate": 1.4135175066798412e-05, "loss": 1.7462, "step": 17970 }, { "epoch": 3.22, "grad_norm": 0.9837630987167358, "learning_rate": 1.4122513864810317e-05, "loss": 1.5915, "step": 17975 }, { "epoch": 3.22, "grad_norm": 0.7608397006988525, "learning_rate": 1.410985610336062e-05, "loss": 1.9469, "step": 17980 }, { "epoch": 3.22, "grad_norm": 0.3973992168903351, "learning_rate": 1.4097201786452963e-05, "loss": 1.9932, "step": 17985 }, { "epoch": 3.22, "grad_norm": 0.6849148273468018, "learning_rate": 1.4084550918089895e-05, "loss": 1.4885, "step": 17990 }, { "epoch": 3.22, "grad_norm": 0.8641477227210999, "learning_rate": 1.4071903502272882e-05, "loss": 1.9552, "step": 17995 }, { "epoch": 3.22, "grad_norm": 1.1732696294784546, "learning_rate": 1.4059259543002267e-05, "loss": 1.7839, "step": 18000 }, { "epoch": 3.22, "grad_norm": 0.6170725226402283, "learning_rate": 1.4046619044277337e-05, "loss": 2.0093, "step": 18005 }, { "epoch": 3.22, "grad_norm": 0.7883812785148621, "learning_rate": 1.4033982010096269e-05, "loss": 1.5738, "step": 18010 }, { "epoch": 3.22, "grad_norm": 0.8948926329612732, "learning_rate": 1.402134844445615e-05, "loss": 1.7868, "step": 18015 }, { "epoch": 3.23, "grad_norm": 0.8349456787109375, "learning_rate": 1.400871835135295e-05, "loss": 1.7468, "step": 18020 }, { "epoch": 3.23, "grad_norm": 0.6893956661224365, "learning_rate": 1.3996091734781575e-05, "loss": 1.8127, "step": 18025 }, { "epoch": 3.23, "grad_norm": 0.509495198726654, "learning_rate": 1.3983468598735786e-05, "loss": 1.6751, "step": 18030 }, { "epoch": 3.23, "grad_norm": 0.7317885756492615, "learning_rate": 1.3970848947208284e-05, "loss": 2.0498, "step": 18035 }, { "epoch": 3.23, "grad_norm": 0.7369408011436462, "learning_rate": 1.395823278419065e-05, "loss": 1.6929, "step": 18040 }, { "epoch": 3.23, "grad_norm": 1.6330751180648804, "learning_rate": 1.3945620113673369e-05, "loss": 1.5587, "step": 18045 }, { "epoch": 3.23, "grad_norm": 0.796711802482605, "learning_rate": 1.3933010939645821e-05, "loss": 1.7163, "step": 18050 }, { "epoch": 3.23, "grad_norm": 1.373434066772461, "learning_rate": 1.392040526609626e-05, "loss": 1.7293, "step": 18055 }, { "epoch": 3.23, "grad_norm": 0.7607307434082031, "learning_rate": 1.3907803097011862e-05, "loss": 1.6402, "step": 18060 }, { "epoch": 3.23, "grad_norm": 1.729021668434143, "learning_rate": 1.3895204436378667e-05, "loss": 2.0188, "step": 18065 }, { "epoch": 3.23, "grad_norm": 0.6117586493492126, "learning_rate": 1.3882609288181628e-05, "loss": 1.5768, "step": 18070 }, { "epoch": 3.24, "grad_norm": 1.12637460231781, "learning_rate": 1.3870017656404577e-05, "loss": 1.6816, "step": 18075 }, { "epoch": 3.24, "grad_norm": 1.0976755619049072, "learning_rate": 1.3857429545030246e-05, "loss": 1.8734, "step": 18080 }, { "epoch": 3.24, "grad_norm": 0.9964720010757446, "learning_rate": 1.384484495804022e-05, "loss": 1.8126, "step": 18085 }, { "epoch": 3.24, "grad_norm": 0.9928443431854248, "learning_rate": 1.3832263899415006e-05, "loss": 1.8391, "step": 18090 }, { "epoch": 3.24, "grad_norm": 1.7136353254318237, "learning_rate": 1.381968637313398e-05, "loss": 1.5318, "step": 18095 }, { "epoch": 3.24, "grad_norm": 0.8411396741867065, "learning_rate": 1.3807112383175396e-05, "loss": 1.89, "step": 18100 }, { "epoch": 3.24, "grad_norm": 0.641628623008728, "learning_rate": 1.3794541933516402e-05, "loss": 1.9114, "step": 18105 }, { "epoch": 3.24, "grad_norm": 0.8874943852424622, "learning_rate": 1.3781975028133027e-05, "loss": 2.0295, "step": 18110 }, { "epoch": 3.24, "grad_norm": 0.7927294373512268, "learning_rate": 1.376941167100016e-05, "loss": 1.7748, "step": 18115 }, { "epoch": 3.24, "grad_norm": 1.0050328969955444, "learning_rate": 1.3756851866091566e-05, "loss": 2.0182, "step": 18120 }, { "epoch": 3.24, "grad_norm": 0.8478461503982544, "learning_rate": 1.3744295617379916e-05, "loss": 1.7941, "step": 18125 }, { "epoch": 3.25, "grad_norm": 0.9578462243080139, "learning_rate": 1.3731742928836732e-05, "loss": 1.8209, "step": 18130 }, { "epoch": 3.25, "grad_norm": 0.9452106356620789, "learning_rate": 1.3719193804432423e-05, "loss": 1.8019, "step": 18135 }, { "epoch": 3.25, "grad_norm": 0.9227070808410645, "learning_rate": 1.3706648248136272e-05, "loss": 2.0451, "step": 18140 }, { "epoch": 3.25, "grad_norm": 0.7945765256881714, "learning_rate": 1.3694106263916406e-05, "loss": 1.8797, "step": 18145 }, { "epoch": 3.25, "grad_norm": 0.7952302098274231, "learning_rate": 1.3681567855739857e-05, "loss": 1.8476, "step": 18150 }, { "epoch": 3.25, "grad_norm": 0.5315759778022766, "learning_rate": 1.3669033027572518e-05, "loss": 1.8467, "step": 18155 }, { "epoch": 3.25, "grad_norm": 0.5972190499305725, "learning_rate": 1.3656501783379122e-05, "loss": 1.7655, "step": 18160 }, { "epoch": 3.25, "grad_norm": 0.8763524293899536, "learning_rate": 1.3643974127123298e-05, "loss": 1.7197, "step": 18165 }, { "epoch": 3.25, "grad_norm": 0.714412271976471, "learning_rate": 1.363145006276755e-05, "loss": 1.8869, "step": 18170 }, { "epoch": 3.25, "grad_norm": 0.8367454409599304, "learning_rate": 1.36189295942732e-05, "loss": 1.7424, "step": 18175 }, { "epoch": 3.25, "grad_norm": 0.7740775942802429, "learning_rate": 1.3606412725600471e-05, "loss": 1.7942, "step": 18180 }, { "epoch": 3.26, "grad_norm": 0.6157845854759216, "learning_rate": 1.359389946070844e-05, "loss": 2.0431, "step": 18185 }, { "epoch": 3.26, "grad_norm": 0.6158450245857239, "learning_rate": 1.3581389803555036e-05, "loss": 2.0421, "step": 18190 }, { "epoch": 3.26, "grad_norm": 1.7718108892440796, "learning_rate": 1.3568883758097056e-05, "loss": 1.6756, "step": 18195 }, { "epoch": 3.26, "grad_norm": 1.0088618993759155, "learning_rate": 1.3556381328290154e-05, "loss": 1.6387, "step": 18200 }, { "epoch": 3.26, "grad_norm": 2.629340171813965, "learning_rate": 1.3543882518088835e-05, "loss": 1.8785, "step": 18205 }, { "epoch": 3.26, "grad_norm": 2.2919623851776123, "learning_rate": 1.3531387331446444e-05, "loss": 1.9667, "step": 18210 }, { "epoch": 3.26, "grad_norm": 0.5759145617485046, "learning_rate": 1.3518895772315207e-05, "loss": 1.8605, "step": 18215 }, { "epoch": 3.26, "grad_norm": 0.9002760648727417, "learning_rate": 1.3506407844646196e-05, "loss": 1.8257, "step": 18220 }, { "epoch": 3.26, "grad_norm": 0.7235890030860901, "learning_rate": 1.3493923552389326e-05, "loss": 1.8972, "step": 18225 }, { "epoch": 3.26, "grad_norm": 0.6796399354934692, "learning_rate": 1.3481442899493382e-05, "loss": 1.7873, "step": 18230 }, { "epoch": 3.26, "grad_norm": 2.119182586669922, "learning_rate": 1.346896588990596e-05, "loss": 1.9575, "step": 18235 }, { "epoch": 3.26, "grad_norm": 0.8993787169456482, "learning_rate": 1.3456492527573533e-05, "loss": 1.5285, "step": 18240 }, { "epoch": 3.27, "grad_norm": 1.0154492855072021, "learning_rate": 1.3444022816441414e-05, "loss": 1.89, "step": 18245 }, { "epoch": 3.27, "grad_norm": 0.5175526738166809, "learning_rate": 1.3431556760453778e-05, "loss": 1.8722, "step": 18250 }, { "epoch": 3.27, "grad_norm": 0.8413906097412109, "learning_rate": 1.3419094363553597e-05, "loss": 1.6884, "step": 18255 }, { "epoch": 3.27, "grad_norm": 0.7729244232177734, "learning_rate": 1.3406635629682743e-05, "loss": 1.8852, "step": 18260 }, { "epoch": 3.27, "grad_norm": 0.7230870127677917, "learning_rate": 1.3394180562781877e-05, "loss": 1.9346, "step": 18265 }, { "epoch": 3.27, "grad_norm": 0.7597714066505432, "learning_rate": 1.3381729166790535e-05, "loss": 1.7774, "step": 18270 }, { "epoch": 3.27, "grad_norm": 2.0124258995056152, "learning_rate": 1.336928144564708e-05, "loss": 1.8878, "step": 18275 }, { "epoch": 3.27, "grad_norm": 0.7441228628158569, "learning_rate": 1.3356837403288719e-05, "loss": 1.9835, "step": 18280 }, { "epoch": 3.27, "grad_norm": 0.8413273692131042, "learning_rate": 1.3344397043651496e-05, "loss": 1.7068, "step": 18285 }, { "epoch": 3.27, "grad_norm": 0.9278790950775146, "learning_rate": 1.333196037067027e-05, "loss": 1.572, "step": 18290 }, { "epoch": 3.27, "grad_norm": 1.6057875156402588, "learning_rate": 1.3319527388278764e-05, "loss": 1.6434, "step": 18295 }, { "epoch": 3.28, "grad_norm": 0.7727483510971069, "learning_rate": 1.33070981004095e-05, "loss": 1.8625, "step": 18300 }, { "epoch": 3.28, "grad_norm": 0.9005012512207031, "learning_rate": 1.329467251099386e-05, "loss": 1.8556, "step": 18305 }, { "epoch": 3.28, "grad_norm": 0.4766554832458496, "learning_rate": 1.3282250623962045e-05, "loss": 1.7374, "step": 18310 }, { "epoch": 3.28, "grad_norm": 1.3196171522140503, "learning_rate": 1.3269832443243085e-05, "loss": 1.6436, "step": 18315 }, { "epoch": 3.28, "grad_norm": 0.6074075698852539, "learning_rate": 1.3257417972764852e-05, "loss": 1.7342, "step": 18320 }, { "epoch": 3.28, "grad_norm": 1.836259126663208, "learning_rate": 1.3245007216454008e-05, "loss": 1.7356, "step": 18325 }, { "epoch": 3.28, "grad_norm": 1.7985725402832031, "learning_rate": 1.3232600178236077e-05, "loss": 2.0223, "step": 18330 }, { "epoch": 3.28, "grad_norm": 1.469251275062561, "learning_rate": 1.3220196862035388e-05, "loss": 1.8288, "step": 18335 }, { "epoch": 3.28, "grad_norm": 0.7703099250793457, "learning_rate": 1.3207797271775113e-05, "loss": 1.6407, "step": 18340 }, { "epoch": 3.28, "grad_norm": 4.6768717765808105, "learning_rate": 1.319540141137721e-05, "loss": 1.9388, "step": 18345 }, { "epoch": 3.28, "grad_norm": 0.891442596912384, "learning_rate": 1.3183009284762492e-05, "loss": 1.8492, "step": 18350 }, { "epoch": 3.29, "grad_norm": 0.5014522075653076, "learning_rate": 1.3170620895850563e-05, "loss": 1.8215, "step": 18355 }, { "epoch": 3.29, "grad_norm": 2.594623327255249, "learning_rate": 1.3158236248559866e-05, "loss": 1.6727, "step": 18360 }, { "epoch": 3.29, "grad_norm": 2.9839723110198975, "learning_rate": 1.3145855346807651e-05, "loss": 1.3427, "step": 18365 }, { "epoch": 3.29, "grad_norm": 0.809145450592041, "learning_rate": 1.313347819450999e-05, "loss": 2.0794, "step": 18370 }, { "epoch": 3.29, "grad_norm": 1.4444146156311035, "learning_rate": 1.3121104795581774e-05, "loss": 1.8111, "step": 18375 }, { "epoch": 3.29, "grad_norm": 3.0015711784362793, "learning_rate": 1.3108735153936669e-05, "loss": 2.0326, "step": 18380 }, { "epoch": 3.29, "grad_norm": 0.8941876292228699, "learning_rate": 1.3096369273487202e-05, "loss": 1.8635, "step": 18385 }, { "epoch": 3.29, "grad_norm": 0.9546188116073608, "learning_rate": 1.308400715814469e-05, "loss": 1.6841, "step": 18390 }, { "epoch": 3.29, "grad_norm": 2.428107500076294, "learning_rate": 1.3071648811819248e-05, "loss": 1.8043, "step": 18395 }, { "epoch": 3.29, "grad_norm": 1.22321617603302, "learning_rate": 1.3059294238419809e-05, "loss": 1.7878, "step": 18400 }, { "epoch": 3.29, "grad_norm": 0.8854243755340576, "learning_rate": 1.3046943441854115e-05, "loss": 1.8578, "step": 18405 }, { "epoch": 3.3, "grad_norm": 0.663280189037323, "learning_rate": 1.3034596426028727e-05, "loss": 1.9011, "step": 18410 }, { "epoch": 3.3, "grad_norm": 0.9913401007652283, "learning_rate": 1.3022253194848968e-05, "loss": 1.7046, "step": 18415 }, { "epoch": 3.3, "grad_norm": 0.7245838046073914, "learning_rate": 1.3009913752219006e-05, "loss": 2.0733, "step": 18420 }, { "epoch": 3.3, "grad_norm": 0.7399362921714783, "learning_rate": 1.2997578102041785e-05, "loss": 1.7638, "step": 18425 }, { "epoch": 3.3, "grad_norm": 0.6854051351547241, "learning_rate": 1.2985246248219068e-05, "loss": 1.6586, "step": 18430 }, { "epoch": 3.3, "grad_norm": 3.867100715637207, "learning_rate": 1.2972918194651417e-05, "loss": 1.7975, "step": 18435 }, { "epoch": 3.3, "grad_norm": 0.889519214630127, "learning_rate": 1.2960593945238173e-05, "loss": 1.6503, "step": 18440 }, { "epoch": 3.3, "grad_norm": 1.0203913450241089, "learning_rate": 1.2948273503877473e-05, "loss": 1.8388, "step": 18445 }, { "epoch": 3.3, "grad_norm": 0.6218311786651611, "learning_rate": 1.293595687446627e-05, "loss": 1.8764, "step": 18450 }, { "epoch": 3.3, "grad_norm": 0.6152927875518799, "learning_rate": 1.2923644060900305e-05, "loss": 1.722, "step": 18455 }, { "epoch": 3.3, "grad_norm": 1.706099033355713, "learning_rate": 1.2911335067074107e-05, "loss": 1.5927, "step": 18460 }, { "epoch": 3.31, "grad_norm": 0.9180427193641663, "learning_rate": 1.289902989688101e-05, "loss": 1.608, "step": 18465 }, { "epoch": 3.31, "grad_norm": 0.6481319665908813, "learning_rate": 1.2886728554213107e-05, "loss": 1.6192, "step": 18470 }, { "epoch": 3.31, "grad_norm": 0.6985259056091309, "learning_rate": 1.2874431042961311e-05, "loss": 2.0233, "step": 18475 }, { "epoch": 3.31, "grad_norm": 0.6424618363380432, "learning_rate": 1.2862137367015312e-05, "loss": 1.8963, "step": 18480 }, { "epoch": 3.31, "grad_norm": 0.7724943161010742, "learning_rate": 1.2849847530263598e-05, "loss": 1.8944, "step": 18485 }, { "epoch": 3.31, "grad_norm": 0.7090172171592712, "learning_rate": 1.2837561536593414e-05, "loss": 1.8762, "step": 18490 }, { "epoch": 3.31, "grad_norm": 0.8090688586235046, "learning_rate": 1.2825279389890817e-05, "loss": 1.7738, "step": 18495 }, { "epoch": 3.31, "grad_norm": 0.6650318503379822, "learning_rate": 1.2813001094040649e-05, "loss": 1.6909, "step": 18500 }, { "epoch": 3.31, "grad_norm": 0.4980815351009369, "learning_rate": 1.2800726652926503e-05, "loss": 1.7088, "step": 18505 }, { "epoch": 3.31, "grad_norm": 0.8817710280418396, "learning_rate": 1.2788456070430782e-05, "loss": 1.6356, "step": 18510 }, { "epoch": 3.31, "grad_norm": 0.5760951638221741, "learning_rate": 1.2776189350434659e-05, "loss": 1.7639, "step": 18515 }, { "epoch": 3.31, "grad_norm": 3.9502806663513184, "learning_rate": 1.2763926496818088e-05, "loss": 1.4146, "step": 18520 }, { "epoch": 3.32, "grad_norm": 1.6004568338394165, "learning_rate": 1.2751667513459802e-05, "loss": 2.014, "step": 18525 }, { "epoch": 3.32, "grad_norm": 0.7599541544914246, "learning_rate": 1.2739412404237306e-05, "loss": 1.9472, "step": 18530 }, { "epoch": 3.32, "grad_norm": 1.0068447589874268, "learning_rate": 1.2727161173026861e-05, "loss": 2.1016, "step": 18535 }, { "epoch": 3.32, "grad_norm": 0.5630433559417725, "learning_rate": 1.2714913823703534e-05, "loss": 1.8729, "step": 18540 }, { "epoch": 3.32, "grad_norm": 0.7253677845001221, "learning_rate": 1.2702670360141144e-05, "loss": 1.6124, "step": 18545 }, { "epoch": 3.32, "grad_norm": 1.3994909524917603, "learning_rate": 1.2690430786212292e-05, "loss": 1.65, "step": 18550 }, { "epoch": 3.32, "grad_norm": 1.1676279306411743, "learning_rate": 1.2678195105788353e-05, "loss": 1.5819, "step": 18555 }, { "epoch": 3.32, "grad_norm": 1.0580713748931885, "learning_rate": 1.2665963322739439e-05, "loss": 1.9325, "step": 18560 }, { "epoch": 3.32, "grad_norm": 1.0381492376327515, "learning_rate": 1.2653735440934464e-05, "loss": 2.1058, "step": 18565 }, { "epoch": 3.32, "grad_norm": 1.4881700277328491, "learning_rate": 1.264151146424109e-05, "loss": 1.8533, "step": 18570 }, { "epoch": 3.32, "grad_norm": 1.4090508222579956, "learning_rate": 1.2629291396525767e-05, "loss": 1.5407, "step": 18575 }, { "epoch": 3.33, "grad_norm": 1.4334440231323242, "learning_rate": 1.2617075241653665e-05, "loss": 1.7129, "step": 18580 }, { "epoch": 3.33, "grad_norm": 1.431878685951233, "learning_rate": 1.2604863003488752e-05, "loss": 1.6889, "step": 18585 }, { "epoch": 3.33, "grad_norm": 1.138646125793457, "learning_rate": 1.2592654685893757e-05, "loss": 2.1111, "step": 18590 }, { "epoch": 3.33, "grad_norm": 2.590043306350708, "learning_rate": 1.2580450292730142e-05, "loss": 1.9615, "step": 18595 }, { "epoch": 3.33, "grad_norm": 1.139367699623108, "learning_rate": 1.2568249827858153e-05, "loss": 1.7285, "step": 18600 }, { "epoch": 3.33, "grad_norm": 1.8558638095855713, "learning_rate": 1.2556053295136785e-05, "loss": 1.6827, "step": 18605 }, { "epoch": 3.33, "grad_norm": 0.39854827523231506, "learning_rate": 1.2543860698423792e-05, "loss": 1.9213, "step": 18610 }, { "epoch": 3.33, "grad_norm": 1.0664751529693604, "learning_rate": 1.2531672041575688e-05, "loss": 1.8553, "step": 18615 }, { "epoch": 3.33, "grad_norm": 0.6911807656288147, "learning_rate": 1.2519487328447715e-05, "loss": 1.8099, "step": 18620 }, { "epoch": 3.33, "grad_norm": 0.5039658546447754, "learning_rate": 1.2507306562893903e-05, "loss": 1.9478, "step": 18625 }, { "epoch": 3.33, "grad_norm": 0.6167506575584412, "learning_rate": 1.2495129748766998e-05, "loss": 1.7913, "step": 18630 }, { "epoch": 3.34, "grad_norm": 0.8103507161140442, "learning_rate": 1.2482956889918529e-05, "loss": 1.8247, "step": 18635 }, { "epoch": 3.34, "grad_norm": 1.3701515197753906, "learning_rate": 1.2470787990198755e-05, "loss": 1.7334, "step": 18640 }, { "epoch": 3.34, "grad_norm": 0.8373657464981079, "learning_rate": 1.2458623053456697e-05, "loss": 1.7793, "step": 18645 }, { "epoch": 3.34, "grad_norm": 0.7973329424858093, "learning_rate": 1.2446462083540094e-05, "loss": 1.5442, "step": 18650 }, { "epoch": 3.34, "grad_norm": 0.8437389731407166, "learning_rate": 1.2434305084295458e-05, "loss": 1.6464, "step": 18655 }, { "epoch": 3.34, "grad_norm": 0.8967138528823853, "learning_rate": 1.242215205956804e-05, "loss": 1.9497, "step": 18660 }, { "epoch": 3.34, "grad_norm": 0.665662407875061, "learning_rate": 1.2410003013201826e-05, "loss": 1.7312, "step": 18665 }, { "epoch": 3.34, "grad_norm": 0.7674746513366699, "learning_rate": 1.2397857949039557e-05, "loss": 1.9874, "step": 18670 }, { "epoch": 3.34, "grad_norm": 1.0936229228973389, "learning_rate": 1.2385716870922698e-05, "loss": 1.9003, "step": 18675 }, { "epoch": 3.34, "grad_norm": 0.903713047504425, "learning_rate": 1.237357978269145e-05, "loss": 1.6076, "step": 18680 }, { "epoch": 3.34, "grad_norm": 0.5833523273468018, "learning_rate": 1.2361446688184772e-05, "loss": 2.0229, "step": 18685 }, { "epoch": 3.35, "grad_norm": 1.014012098312378, "learning_rate": 1.234931759124035e-05, "loss": 1.9358, "step": 18690 }, { "epoch": 3.35, "grad_norm": 0.6757218241691589, "learning_rate": 1.2337192495694605e-05, "loss": 1.7718, "step": 18695 }, { "epoch": 3.35, "grad_norm": 1.0655688047409058, "learning_rate": 1.2325071405382693e-05, "loss": 1.7798, "step": 18700 }, { "epoch": 3.35, "grad_norm": 0.6979892253875732, "learning_rate": 1.2312954324138514e-05, "loss": 1.7062, "step": 18705 }, { "epoch": 3.35, "grad_norm": 0.6946207880973816, "learning_rate": 1.2300841255794668e-05, "loss": 1.9337, "step": 18710 }, { "epoch": 3.35, "grad_norm": 0.827663242816925, "learning_rate": 1.2288732204182529e-05, "loss": 1.7177, "step": 18715 }, { "epoch": 3.35, "grad_norm": 0.8608758449554443, "learning_rate": 1.2276627173132158e-05, "loss": 1.7821, "step": 18720 }, { "epoch": 3.35, "grad_norm": 1.8402502536773682, "learning_rate": 1.2264526166472373e-05, "loss": 1.7818, "step": 18725 }, { "epoch": 3.35, "grad_norm": 0.789306104183197, "learning_rate": 1.2252429188030709e-05, "loss": 2.2084, "step": 18730 }, { "epoch": 3.35, "grad_norm": 0.7342821359634399, "learning_rate": 1.2240336241633443e-05, "loss": 1.8832, "step": 18735 }, { "epoch": 3.35, "grad_norm": 0.7892515659332275, "learning_rate": 1.2228247331105541e-05, "loss": 1.6436, "step": 18740 }, { "epoch": 3.36, "grad_norm": 1.011290431022644, "learning_rate": 1.2216162460270725e-05, "loss": 1.8126, "step": 18745 }, { "epoch": 3.36, "grad_norm": 0.9589811563491821, "learning_rate": 1.2204081632951425e-05, "loss": 1.8529, "step": 18750 }, { "epoch": 3.36, "grad_norm": 0.4915600121021271, "learning_rate": 1.2192004852968797e-05, "loss": 2.1086, "step": 18755 }, { "epoch": 3.36, "grad_norm": 0.8448190689086914, "learning_rate": 1.217993212414272e-05, "loss": 1.8251, "step": 18760 }, { "epoch": 3.36, "grad_norm": 0.5597540140151978, "learning_rate": 1.2167863450291786e-05, "loss": 1.8408, "step": 18765 }, { "epoch": 3.36, "grad_norm": 0.7109097242355347, "learning_rate": 1.2155798835233286e-05, "loss": 2.3664, "step": 18770 }, { "epoch": 3.36, "grad_norm": 0.5528181195259094, "learning_rate": 1.2143738282783263e-05, "loss": 2.0561, "step": 18775 }, { "epoch": 3.36, "grad_norm": 0.8648270964622498, "learning_rate": 1.2131681796756452e-05, "loss": 1.7651, "step": 18780 }, { "epoch": 3.36, "grad_norm": 1.4095818996429443, "learning_rate": 1.2119629380966313e-05, "loss": 1.7121, "step": 18785 }, { "epoch": 3.36, "grad_norm": 0.9153962135314941, "learning_rate": 1.2107581039225014e-05, "loss": 1.9327, "step": 18790 }, { "epoch": 3.36, "grad_norm": 0.4868113398551941, "learning_rate": 1.2095536775343436e-05, "loss": 1.9, "step": 18795 }, { "epoch": 3.37, "grad_norm": 0.7522435784339905, "learning_rate": 1.2083496593131155e-05, "loss": 1.6748, "step": 18800 }, { "epoch": 3.37, "grad_norm": 0.6917173266410828, "learning_rate": 1.2071460496396481e-05, "loss": 1.7236, "step": 18805 }, { "epoch": 3.37, "grad_norm": 0.5699201822280884, "learning_rate": 1.2059428488946423e-05, "loss": 1.7056, "step": 18810 }, { "epoch": 3.37, "grad_norm": 1.4349066019058228, "learning_rate": 1.2047400574586676e-05, "loss": 1.6144, "step": 18815 }, { "epoch": 3.37, "grad_norm": 0.8018911480903625, "learning_rate": 1.203537675712167e-05, "loss": 2.1065, "step": 18820 }, { "epoch": 3.37, "grad_norm": 0.7882276773452759, "learning_rate": 1.2023357040354535e-05, "loss": 1.9657, "step": 18825 }, { "epoch": 3.37, "grad_norm": 1.0160123109817505, "learning_rate": 1.2011341428087077e-05, "loss": 2.0429, "step": 18830 }, { "epoch": 3.37, "grad_norm": 1.0329535007476807, "learning_rate": 1.1999329924119832e-05, "loss": 1.856, "step": 18835 }, { "epoch": 3.37, "grad_norm": 1.044445514678955, "learning_rate": 1.1987322532252024e-05, "loss": 1.3516, "step": 18840 }, { "epoch": 3.37, "grad_norm": 0.7211053371429443, "learning_rate": 1.1975319256281586e-05, "loss": 1.9731, "step": 18845 }, { "epoch": 3.37, "grad_norm": 6.382941246032715, "learning_rate": 1.1963320100005146e-05, "loss": 1.7734, "step": 18850 }, { "epoch": 3.37, "grad_norm": 0.6544294357299805, "learning_rate": 1.195132506721802e-05, "loss": 1.6961, "step": 18855 }, { "epoch": 3.38, "grad_norm": 1.0940073728561401, "learning_rate": 1.1939334161714216e-05, "loss": 1.8632, "step": 18860 }, { "epoch": 3.38, "grad_norm": 0.816892147064209, "learning_rate": 1.1927347387286452e-05, "loss": 1.9797, "step": 18865 }, { "epoch": 3.38, "grad_norm": 0.7033507227897644, "learning_rate": 1.1915364747726135e-05, "loss": 2.0543, "step": 18870 }, { "epoch": 3.38, "grad_norm": 1.3776378631591797, "learning_rate": 1.1903386246823361e-05, "loss": 1.7737, "step": 18875 }, { "epoch": 3.38, "grad_norm": 1.1460137367248535, "learning_rate": 1.1891411888366933e-05, "loss": 1.607, "step": 18880 }, { "epoch": 3.38, "grad_norm": 0.7334376573562622, "learning_rate": 1.1879441676144303e-05, "loss": 1.8029, "step": 18885 }, { "epoch": 3.38, "grad_norm": 1.3489125967025757, "learning_rate": 1.1867475613941653e-05, "loss": 1.8603, "step": 18890 }, { "epoch": 3.38, "grad_norm": 1.415173053741455, "learning_rate": 1.1855513705543833e-05, "loss": 1.7612, "step": 18895 }, { "epoch": 3.38, "grad_norm": 2.525381088256836, "learning_rate": 1.1843555954734387e-05, "loss": 1.7866, "step": 18900 }, { "epoch": 3.38, "grad_norm": 0.555586040019989, "learning_rate": 1.1831602365295545e-05, "loss": 1.9931, "step": 18905 }, { "epoch": 3.38, "grad_norm": 1.590984582901001, "learning_rate": 1.1819652941008197e-05, "loss": 1.6122, "step": 18910 }, { "epoch": 3.39, "grad_norm": 0.7002542614936829, "learning_rate": 1.1807707685651959e-05, "loss": 1.6119, "step": 18915 }, { "epoch": 3.39, "grad_norm": 0.8759981989860535, "learning_rate": 1.179576660300508e-05, "loss": 1.8901, "step": 18920 }, { "epoch": 3.39, "grad_norm": 0.4278295338153839, "learning_rate": 1.1783829696844522e-05, "loss": 1.7944, "step": 18925 }, { "epoch": 3.39, "grad_norm": 0.7735504508018494, "learning_rate": 1.177189697094592e-05, "loss": 1.6411, "step": 18930 }, { "epoch": 3.39, "grad_norm": 0.8619164824485779, "learning_rate": 1.175996842908358e-05, "loss": 2.0436, "step": 18935 }, { "epoch": 3.39, "grad_norm": 0.7432487607002258, "learning_rate": 1.1748044075030504e-05, "loss": 1.924, "step": 18940 }, { "epoch": 3.39, "grad_norm": 0.8288931846618652, "learning_rate": 1.1736123912558327e-05, "loss": 1.9304, "step": 18945 }, { "epoch": 3.39, "grad_norm": 2.158874273300171, "learning_rate": 1.172420794543741e-05, "loss": 1.8055, "step": 18950 }, { "epoch": 3.39, "grad_norm": 0.8662157654762268, "learning_rate": 1.171229617743674e-05, "loss": 1.7861, "step": 18955 }, { "epoch": 3.39, "grad_norm": 0.9218710064888, "learning_rate": 1.170038861232401e-05, "loss": 2.0901, "step": 18960 }, { "epoch": 3.39, "grad_norm": 0.7493339776992798, "learning_rate": 1.1688485253865567e-05, "loss": 1.9827, "step": 18965 }, { "epoch": 3.4, "grad_norm": 0.8299612402915955, "learning_rate": 1.1676586105826442e-05, "loss": 1.7931, "step": 18970 }, { "epoch": 3.4, "grad_norm": 1.1126405000686646, "learning_rate": 1.166469117197031e-05, "loss": 1.918, "step": 18975 }, { "epoch": 3.4, "grad_norm": 0.9959370493888855, "learning_rate": 1.1652800456059532e-05, "loss": 1.8812, "step": 18980 }, { "epoch": 3.4, "grad_norm": 0.6793087124824524, "learning_rate": 1.1640913961855132e-05, "loss": 2.0555, "step": 18985 }, { "epoch": 3.4, "grad_norm": 0.6176539659500122, "learning_rate": 1.1629031693116796e-05, "loss": 1.706, "step": 18990 }, { "epoch": 3.4, "grad_norm": 0.9862648248672485, "learning_rate": 1.1617153653602883e-05, "loss": 1.55, "step": 18995 }, { "epoch": 3.4, "grad_norm": 0.554785966873169, "learning_rate": 1.1605279847070388e-05, "loss": 2.1433, "step": 19000 }, { "epoch": 3.4, "grad_norm": 0.6524524688720703, "learning_rate": 1.1593410277275003e-05, "loss": 1.7382, "step": 19005 }, { "epoch": 3.4, "grad_norm": 0.8985192179679871, "learning_rate": 1.158154494797104e-05, "loss": 2.1055, "step": 19010 }, { "epoch": 3.4, "grad_norm": 0.636359453201294, "learning_rate": 1.1569683862911507e-05, "loss": 1.927, "step": 19015 }, { "epoch": 3.4, "grad_norm": 0.5719195604324341, "learning_rate": 1.1557827025848047e-05, "loss": 1.6219, "step": 19020 }, { "epoch": 3.41, "grad_norm": 0.8190729022026062, "learning_rate": 1.1545974440530968e-05, "loss": 1.5272, "step": 19025 }, { "epoch": 3.41, "grad_norm": 0.7199324369430542, "learning_rate": 1.1534126110709246e-05, "loss": 1.9192, "step": 19030 }, { "epoch": 3.41, "grad_norm": 0.5589447021484375, "learning_rate": 1.152228204013047e-05, "loss": 1.9119, "step": 19035 }, { "epoch": 3.41, "grad_norm": 0.8806366324424744, "learning_rate": 1.151044223254092e-05, "loss": 1.6215, "step": 19040 }, { "epoch": 3.41, "grad_norm": 0.65036940574646, "learning_rate": 1.1498606691685526e-05, "loss": 1.8842, "step": 19045 }, { "epoch": 3.41, "grad_norm": 0.6128280162811279, "learning_rate": 1.1486775421307842e-05, "loss": 1.9723, "step": 19050 }, { "epoch": 3.41, "grad_norm": 0.8853219747543335, "learning_rate": 1.1474948425150092e-05, "loss": 1.8919, "step": 19055 }, { "epoch": 3.41, "grad_norm": 1.1091639995574951, "learning_rate": 1.1463125706953156e-05, "loss": 2.0356, "step": 19060 }, { "epoch": 3.41, "grad_norm": 0.6628566980361938, "learning_rate": 1.145130727045653e-05, "loss": 1.9267, "step": 19065 }, { "epoch": 3.41, "grad_norm": 1.4559956789016724, "learning_rate": 1.1439493119398381e-05, "loss": 1.7764, "step": 19070 }, { "epoch": 3.41, "grad_norm": 0.5751055479049683, "learning_rate": 1.1427683257515515e-05, "loss": 1.5434, "step": 19075 }, { "epoch": 3.42, "grad_norm": 0.7229131460189819, "learning_rate": 1.1415877688543378e-05, "loss": 1.4448, "step": 19080 }, { "epoch": 3.42, "grad_norm": 0.6522566676139832, "learning_rate": 1.1404076416216072e-05, "loss": 1.9578, "step": 19085 }, { "epoch": 3.42, "grad_norm": 1.1385724544525146, "learning_rate": 1.139227944426631e-05, "loss": 1.794, "step": 19090 }, { "epoch": 3.42, "grad_norm": 1.248424768447876, "learning_rate": 1.1380486776425477e-05, "loss": 1.6419, "step": 19095 }, { "epoch": 3.42, "grad_norm": 0.9762007594108582, "learning_rate": 1.136869841642357e-05, "loss": 1.6544, "step": 19100 }, { "epoch": 3.42, "grad_norm": 0.4511853754520416, "learning_rate": 1.135691436798924e-05, "loss": 1.8262, "step": 19105 }, { "epoch": 3.42, "grad_norm": 0.8737003803253174, "learning_rate": 1.1345134634849775e-05, "loss": 1.9587, "step": 19110 }, { "epoch": 3.42, "grad_norm": 0.5674211382865906, "learning_rate": 1.133335922073109e-05, "loss": 1.9214, "step": 19115 }, { "epoch": 3.42, "grad_norm": 0.7211077213287354, "learning_rate": 1.1321588129357746e-05, "loss": 1.7593, "step": 19120 }, { "epoch": 3.42, "grad_norm": 1.222629427909851, "learning_rate": 1.130982136445291e-05, "loss": 1.971, "step": 19125 }, { "epoch": 3.42, "grad_norm": 1.0511137247085571, "learning_rate": 1.1298058929738411e-05, "loss": 1.7639, "step": 19130 }, { "epoch": 3.43, "grad_norm": 1.5264891386032104, "learning_rate": 1.1286300828934702e-05, "loss": 1.7966, "step": 19135 }, { "epoch": 3.43, "grad_norm": 1.0392403602600098, "learning_rate": 1.1274547065760838e-05, "loss": 1.5846, "step": 19140 }, { "epoch": 3.43, "grad_norm": 1.030380368232727, "learning_rate": 1.1262797643934537e-05, "loss": 1.7171, "step": 19145 }, { "epoch": 3.43, "grad_norm": 1.143563985824585, "learning_rate": 1.1251052567172135e-05, "loss": 2.1491, "step": 19150 }, { "epoch": 3.43, "grad_norm": 0.6801722049713135, "learning_rate": 1.1239311839188571e-05, "loss": 1.9382, "step": 19155 }, { "epoch": 3.43, "grad_norm": 1.0771989822387695, "learning_rate": 1.122757546369744e-05, "loss": 1.6079, "step": 19160 }, { "epoch": 3.43, "grad_norm": 1.1643553972244263, "learning_rate": 1.121584344441094e-05, "loss": 1.8147, "step": 19165 }, { "epoch": 3.43, "grad_norm": 1.202305555343628, "learning_rate": 1.1204115785039901e-05, "loss": 1.631, "step": 19170 }, { "epoch": 3.43, "grad_norm": 0.6472001075744629, "learning_rate": 1.1192392489293777e-05, "loss": 1.9842, "step": 19175 }, { "epoch": 3.43, "grad_norm": 0.7353944182395935, "learning_rate": 1.1180673560880619e-05, "loss": 1.6733, "step": 19180 }, { "epoch": 3.43, "grad_norm": 0.6678537726402283, "learning_rate": 1.1168959003507129e-05, "loss": 1.8173, "step": 19185 }, { "epoch": 3.43, "grad_norm": 0.6647874116897583, "learning_rate": 1.1157248820878596e-05, "loss": 1.7445, "step": 19190 }, { "epoch": 3.44, "grad_norm": 0.5945031046867371, "learning_rate": 1.1145543016698946e-05, "loss": 1.9277, "step": 19195 }, { "epoch": 3.44, "grad_norm": 2.9035160541534424, "learning_rate": 1.1133841594670711e-05, "loss": 2.0099, "step": 19200 }, { "epoch": 3.44, "grad_norm": 0.5868393182754517, "learning_rate": 1.1122144558495043e-05, "loss": 1.6714, "step": 19205 }, { "epoch": 3.44, "grad_norm": 0.993786633014679, "learning_rate": 1.111045191187171e-05, "loss": 1.7378, "step": 19210 }, { "epoch": 3.44, "grad_norm": 0.8114561438560486, "learning_rate": 1.109876365849907e-05, "loss": 2.2337, "step": 19215 }, { "epoch": 3.44, "grad_norm": 0.6973251700401306, "learning_rate": 1.1087079802074112e-05, "loss": 2.1974, "step": 19220 }, { "epoch": 3.44, "grad_norm": 1.055870771408081, "learning_rate": 1.1075400346292433e-05, "loss": 1.596, "step": 19225 }, { "epoch": 3.44, "grad_norm": 1.0499026775360107, "learning_rate": 1.1063725294848238e-05, "loss": 1.7357, "step": 19230 }, { "epoch": 3.44, "grad_norm": 1.200201392173767, "learning_rate": 1.1052054651434318e-05, "loss": 1.8395, "step": 19235 }, { "epoch": 3.44, "grad_norm": 0.6308977007865906, "learning_rate": 1.1040388419742106e-05, "loss": 2.024, "step": 19240 }, { "epoch": 3.44, "grad_norm": 0.704451322555542, "learning_rate": 1.1028726603461597e-05, "loss": 1.8261, "step": 19245 }, { "epoch": 3.45, "grad_norm": 1.1138906478881836, "learning_rate": 1.1017069206281427e-05, "loss": 1.6242, "step": 19250 }, { "epoch": 3.45, "grad_norm": 0.6243778467178345, "learning_rate": 1.1005416231888819e-05, "loss": 1.9646, "step": 19255 }, { "epoch": 3.45, "grad_norm": 0.723716676235199, "learning_rate": 1.0993767683969594e-05, "loss": 1.9183, "step": 19260 }, { "epoch": 3.45, "grad_norm": 0.4877179265022278, "learning_rate": 1.0982123566208185e-05, "loss": 1.7803, "step": 19265 }, { "epoch": 3.45, "grad_norm": 1.7348027229309082, "learning_rate": 1.0970483882287602e-05, "loss": 1.7823, "step": 19270 }, { "epoch": 3.45, "grad_norm": 1.0215563774108887, "learning_rate": 1.0958848635889482e-05, "loss": 1.6437, "step": 19275 }, { "epoch": 3.45, "grad_norm": 0.9767959117889404, "learning_rate": 1.0947217830694023e-05, "loss": 1.6133, "step": 19280 }, { "epoch": 3.45, "grad_norm": 1.0302644968032837, "learning_rate": 1.0935591470380046e-05, "loss": 1.8479, "step": 19285 }, { "epoch": 3.45, "grad_norm": 0.5490931868553162, "learning_rate": 1.0923969558624959e-05, "loss": 1.6583, "step": 19290 }, { "epoch": 3.45, "grad_norm": 1.0158607959747314, "learning_rate": 1.0912352099104762e-05, "loss": 1.7078, "step": 19295 }, { "epoch": 3.45, "grad_norm": 0.7652471661567688, "learning_rate": 1.0900739095494053e-05, "loss": 1.9895, "step": 19300 }, { "epoch": 3.46, "grad_norm": 0.7485408782958984, "learning_rate": 1.0889130551465999e-05, "loss": 1.7783, "step": 19305 }, { "epoch": 3.46, "grad_norm": 0.9016578197479248, "learning_rate": 1.0877526470692378e-05, "loss": 2.0336, "step": 19310 }, { "epoch": 3.46, "grad_norm": 0.8303315043449402, "learning_rate": 1.086592685684355e-05, "loss": 2.0914, "step": 19315 }, { "epoch": 3.46, "grad_norm": 0.6593153476715088, "learning_rate": 1.0854331713588461e-05, "loss": 1.8333, "step": 19320 }, { "epoch": 3.46, "grad_norm": 0.5653439164161682, "learning_rate": 1.0842741044594656e-05, "loss": 1.7719, "step": 19325 }, { "epoch": 3.46, "grad_norm": 0.8699783682823181, "learning_rate": 1.0831154853528238e-05, "loss": 2.05, "step": 19330 }, { "epoch": 3.46, "grad_norm": 2.1618435382843018, "learning_rate": 1.0819573144053904e-05, "loss": 1.7721, "step": 19335 }, { "epoch": 3.46, "grad_norm": 0.6418623924255371, "learning_rate": 1.0807995919834946e-05, "loss": 2.0052, "step": 19340 }, { "epoch": 3.46, "grad_norm": 0.4572185277938843, "learning_rate": 1.0796423184533224e-05, "loss": 2.1008, "step": 19345 }, { "epoch": 3.46, "grad_norm": 2.1507296562194824, "learning_rate": 1.0784854941809192e-05, "loss": 1.8341, "step": 19350 }, { "epoch": 3.46, "grad_norm": 1.2364892959594727, "learning_rate": 1.0773291195321872e-05, "loss": 1.9426, "step": 19355 }, { "epoch": 3.47, "grad_norm": 0.9573560357093811, "learning_rate": 1.0761731948728857e-05, "loss": 1.9652, "step": 19360 }, { "epoch": 3.47, "grad_norm": 1.1250308752059937, "learning_rate": 1.075017720568633e-05, "loss": 1.8002, "step": 19365 }, { "epoch": 3.47, "grad_norm": 3.7866458892822266, "learning_rate": 1.0738626969849056e-05, "loss": 1.7255, "step": 19370 }, { "epoch": 3.47, "grad_norm": 1.0425763130187988, "learning_rate": 1.0727081244870343e-05, "loss": 1.7018, "step": 19375 }, { "epoch": 3.47, "grad_norm": 0.9461259245872498, "learning_rate": 1.0715540034402105e-05, "loss": 1.9527, "step": 19380 }, { "epoch": 3.47, "grad_norm": 1.1015312671661377, "learning_rate": 1.0704003342094823e-05, "loss": 1.7186, "step": 19385 }, { "epoch": 3.47, "grad_norm": 0.7580620646476746, "learning_rate": 1.0692471171597524e-05, "loss": 1.8424, "step": 19390 }, { "epoch": 3.47, "grad_norm": 1.06261146068573, "learning_rate": 1.0680943526557829e-05, "loss": 1.8829, "step": 19395 }, { "epoch": 3.47, "grad_norm": 0.9854909777641296, "learning_rate": 1.0669420410621928e-05, "loss": 1.7523, "step": 19400 }, { "epoch": 3.47, "grad_norm": 0.6900574564933777, "learning_rate": 1.0657901827434563e-05, "loss": 1.9449, "step": 19405 }, { "epoch": 3.47, "grad_norm": 0.6487289667129517, "learning_rate": 1.0646387780639055e-05, "loss": 1.8822, "step": 19410 }, { "epoch": 3.48, "grad_norm": 0.5597279667854309, "learning_rate": 1.0634878273877297e-05, "loss": 2.0047, "step": 19415 }, { "epoch": 3.48, "grad_norm": 0.49827390909194946, "learning_rate": 1.0623373310789722e-05, "loss": 1.9747, "step": 19420 }, { "epoch": 3.48, "grad_norm": 0.751218855381012, "learning_rate": 1.0611872895015328e-05, "loss": 1.7458, "step": 19425 }, { "epoch": 3.48, "grad_norm": 1.029354214668274, "learning_rate": 1.0600377030191701e-05, "loss": 1.8894, "step": 19430 }, { "epoch": 3.48, "grad_norm": 1.2062971591949463, "learning_rate": 1.0588885719954971e-05, "loss": 1.876, "step": 19435 }, { "epoch": 3.48, "grad_norm": 0.9020774960517883, "learning_rate": 1.0577398967939822e-05, "loss": 1.855, "step": 19440 }, { "epoch": 3.48, "grad_norm": 0.6630207300186157, "learning_rate": 1.0565916777779519e-05, "loss": 2.1588, "step": 19445 }, { "epoch": 3.48, "grad_norm": 0.8399782180786133, "learning_rate": 1.0554439153105844e-05, "loss": 1.6518, "step": 19450 }, { "epoch": 3.48, "grad_norm": 0.8747025728225708, "learning_rate": 1.054296609754917e-05, "loss": 1.7769, "step": 19455 }, { "epoch": 3.48, "grad_norm": 0.9904999732971191, "learning_rate": 1.0531497614738414e-05, "loss": 1.6096, "step": 19460 }, { "epoch": 3.48, "grad_norm": 1.0800423622131348, "learning_rate": 1.0520033708301056e-05, "loss": 1.9036, "step": 19465 }, { "epoch": 3.49, "grad_norm": 2.292233467102051, "learning_rate": 1.0508574381863095e-05, "loss": 1.7432, "step": 19470 }, { "epoch": 3.49, "grad_norm": 2.433584451675415, "learning_rate": 1.049711963904913e-05, "loss": 1.6728, "step": 19475 }, { "epoch": 3.49, "grad_norm": 0.8242534399032593, "learning_rate": 1.0485669483482266e-05, "loss": 1.6268, "step": 19480 }, { "epoch": 3.49, "grad_norm": 0.3501177728176117, "learning_rate": 1.0474223918784178e-05, "loss": 1.8424, "step": 19485 }, { "epoch": 3.49, "grad_norm": 0.7704662084579468, "learning_rate": 1.0462782948575098e-05, "loss": 1.5104, "step": 19490 }, { "epoch": 3.49, "grad_norm": 1.05587637424469, "learning_rate": 1.0451346576473787e-05, "loss": 2.0312, "step": 19495 }, { "epoch": 3.49, "grad_norm": 0.6842361688613892, "learning_rate": 1.0439914806097559e-05, "loss": 1.8268, "step": 19500 }, { "epoch": 3.49, "grad_norm": 1.027362585067749, "learning_rate": 1.042848764106228e-05, "loss": 2.1638, "step": 19505 }, { "epoch": 3.49, "grad_norm": 1.474015235900879, "learning_rate": 1.0417065084982346e-05, "loss": 1.6934, "step": 19510 }, { "epoch": 3.49, "grad_norm": 0.5891749858856201, "learning_rate": 1.040564714147069e-05, "loss": 1.8321, "step": 19515 }, { "epoch": 3.49, "grad_norm": 0.8095927238464355, "learning_rate": 1.0394233814138804e-05, "loss": 1.7532, "step": 19520 }, { "epoch": 3.49, "grad_norm": 1.7392970323562622, "learning_rate": 1.0382825106596713e-05, "loss": 2.0015, "step": 19525 }, { "epoch": 3.5, "grad_norm": 1.0401033163070679, "learning_rate": 1.0371421022452981e-05, "loss": 2.0158, "step": 19530 }, { "epoch": 3.5, "grad_norm": 1.081748127937317, "learning_rate": 1.036002156531472e-05, "loss": 2.0205, "step": 19535 }, { "epoch": 3.5, "grad_norm": 0.47808757424354553, "learning_rate": 1.0348626738787548e-05, "loss": 1.4998, "step": 19540 }, { "epoch": 3.5, "grad_norm": 0.770613968372345, "learning_rate": 1.0337236546475642e-05, "loss": 1.8638, "step": 19545 }, { "epoch": 3.5, "grad_norm": 0.6831596493721008, "learning_rate": 1.0325850991981715e-05, "loss": 1.833, "step": 19550 }, { "epoch": 3.5, "grad_norm": 0.8213655352592468, "learning_rate": 1.0314470078907013e-05, "loss": 1.8172, "step": 19555 }, { "epoch": 3.5, "grad_norm": 0.9885990619659424, "learning_rate": 1.0303093810851288e-05, "loss": 1.7939, "step": 19560 }, { "epoch": 3.5, "grad_norm": 1.137285590171814, "learning_rate": 1.0291722191412865e-05, "loss": 1.7968, "step": 19565 }, { "epoch": 3.5, "grad_norm": 1.3749232292175293, "learning_rate": 1.0280355224188556e-05, "loss": 1.8718, "step": 19570 }, { "epoch": 3.5, "grad_norm": 0.9204155802726746, "learning_rate": 1.0268992912773732e-05, "loss": 1.6898, "step": 19575 }, { "epoch": 3.5, "grad_norm": 0.7502628564834595, "learning_rate": 1.025763526076228e-05, "loss": 1.669, "step": 19580 }, { "epoch": 3.51, "grad_norm": 1.0295674800872803, "learning_rate": 1.0246282271746612e-05, "loss": 1.7433, "step": 19585 }, { "epoch": 3.51, "grad_norm": 0.7040165662765503, "learning_rate": 1.0234933949317683e-05, "loss": 1.6466, "step": 19590 }, { "epoch": 3.51, "grad_norm": 0.6841382384300232, "learning_rate": 1.0223590297064933e-05, "loss": 2.0555, "step": 19595 }, { "epoch": 3.51, "grad_norm": 0.6829349994659424, "learning_rate": 1.0212251318576357e-05, "loss": 1.7514, "step": 19600 }, { "epoch": 3.51, "grad_norm": 1.0354279279708862, "learning_rate": 1.0200917017438478e-05, "loss": 1.7173, "step": 19605 }, { "epoch": 3.51, "grad_norm": 0.7070810794830322, "learning_rate": 1.01895873972363e-05, "loss": 2.1088, "step": 19610 }, { "epoch": 3.51, "grad_norm": 1.9190095663070679, "learning_rate": 1.017826246155338e-05, "loss": 2.2198, "step": 19615 }, { "epoch": 3.51, "grad_norm": 0.7562488913536072, "learning_rate": 1.0166942213971792e-05, "loss": 1.7641, "step": 19620 }, { "epoch": 3.51, "grad_norm": 0.9604626893997192, "learning_rate": 1.015562665807212e-05, "loss": 1.9173, "step": 19625 }, { "epoch": 3.51, "grad_norm": 2.0906529426574707, "learning_rate": 1.0144315797433449e-05, "loss": 1.9204, "step": 19630 }, { "epoch": 3.51, "grad_norm": 1.8429858684539795, "learning_rate": 1.0133009635633402e-05, "loss": 1.7915, "step": 19635 }, { "epoch": 3.52, "grad_norm": 1.0076621770858765, "learning_rate": 1.0121708176248107e-05, "loss": 1.6617, "step": 19640 }, { "epoch": 3.52, "grad_norm": 0.9586327075958252, "learning_rate": 1.0110411422852206e-05, "loss": 1.8037, "step": 19645 }, { "epoch": 3.52, "grad_norm": 0.7485795617103577, "learning_rate": 1.0099119379018856e-05, "loss": 2.0374, "step": 19650 }, { "epoch": 3.52, "grad_norm": 0.6466397643089294, "learning_rate": 1.0087832048319716e-05, "loss": 1.795, "step": 19655 }, { "epoch": 3.52, "grad_norm": 1.1926106214523315, "learning_rate": 1.0076549434324944e-05, "loss": 1.8342, "step": 19660 }, { "epoch": 3.52, "grad_norm": 0.621414840221405, "learning_rate": 1.0065271540603236e-05, "loss": 1.7218, "step": 19665 }, { "epoch": 3.52, "grad_norm": 1.1308784484863281, "learning_rate": 1.005399837072177e-05, "loss": 1.7901, "step": 19670 }, { "epoch": 3.52, "grad_norm": 0.8960298299789429, "learning_rate": 1.0042729928246245e-05, "loss": 1.8848, "step": 19675 }, { "epoch": 3.52, "grad_norm": 0.4410213232040405, "learning_rate": 1.0031466216740865e-05, "loss": 1.7415, "step": 19680 }, { "epoch": 3.52, "grad_norm": 0.7294899821281433, "learning_rate": 1.0020207239768314e-05, "loss": 2.1899, "step": 19685 }, { "epoch": 3.52, "grad_norm": 0.6985868215560913, "learning_rate": 1.0008953000889803e-05, "loss": 1.848, "step": 19690 }, { "epoch": 3.53, "grad_norm": 0.5390515327453613, "learning_rate": 9.997703503665051e-06, "loss": 2.0746, "step": 19695 }, { "epoch": 3.53, "grad_norm": 0.9798139929771423, "learning_rate": 9.986458751652238e-06, "loss": 1.8011, "step": 19700 }, { "epoch": 3.53, "grad_norm": 0.7954207062721252, "learning_rate": 9.975218748408083e-06, "loss": 1.9957, "step": 19705 }, { "epoch": 3.53, "grad_norm": 1.3126145601272583, "learning_rate": 9.963983497487783e-06, "loss": 1.8887, "step": 19710 }, { "epoch": 3.53, "grad_norm": 0.8574180006980896, "learning_rate": 9.95275300244505e-06, "loss": 1.6767, "step": 19715 }, { "epoch": 3.53, "grad_norm": 0.6159719228744507, "learning_rate": 9.941527266832062e-06, "loss": 2.1028, "step": 19720 }, { "epoch": 3.53, "grad_norm": 0.8486490249633789, "learning_rate": 9.930306294199512e-06, "loss": 1.8083, "step": 19725 }, { "epoch": 3.53, "grad_norm": 2.6029884815216064, "learning_rate": 9.919090088096589e-06, "loss": 1.7197, "step": 19730 }, { "epoch": 3.53, "grad_norm": 0.9481103420257568, "learning_rate": 9.907878652070962e-06, "loss": 1.7045, "step": 19735 }, { "epoch": 3.53, "grad_norm": 0.3375113606452942, "learning_rate": 9.896671989668813e-06, "loss": 1.8396, "step": 19740 }, { "epoch": 3.53, "grad_norm": 0.7272607684135437, "learning_rate": 9.885470104434783e-06, "loss": 1.7523, "step": 19745 }, { "epoch": 3.54, "grad_norm": 1.5993422269821167, "learning_rate": 9.874272999912013e-06, "loss": 2.0852, "step": 19750 }, { "epoch": 3.54, "grad_norm": 0.6138380169868469, "learning_rate": 9.863080679642143e-06, "loss": 1.8729, "step": 19755 }, { "epoch": 3.54, "grad_norm": 0.8599143028259277, "learning_rate": 9.851893147165295e-06, "loss": 2.0843, "step": 19760 }, { "epoch": 3.54, "grad_norm": 0.7874997854232788, "learning_rate": 9.840710406020072e-06, "loss": 1.6064, "step": 19765 }, { "epoch": 3.54, "grad_norm": 0.9153751730918884, "learning_rate": 9.829532459743579e-06, "loss": 1.6868, "step": 19770 }, { "epoch": 3.54, "grad_norm": 0.6881905794143677, "learning_rate": 9.818359311871367e-06, "loss": 1.6391, "step": 19775 }, { "epoch": 3.54, "grad_norm": 1.0899308919906616, "learning_rate": 9.807190965937499e-06, "loss": 1.9639, "step": 19780 }, { "epoch": 3.54, "grad_norm": 1.196337342262268, "learning_rate": 9.796027425474515e-06, "loss": 1.7011, "step": 19785 }, { "epoch": 3.54, "grad_norm": 1.8400663137435913, "learning_rate": 9.784868694013444e-06, "loss": 1.9638, "step": 19790 }, { "epoch": 3.54, "grad_norm": 0.783830463886261, "learning_rate": 9.773714775083759e-06, "loss": 1.7963, "step": 19795 }, { "epoch": 3.54, "grad_norm": 0.5852648615837097, "learning_rate": 9.762565672213444e-06, "loss": 1.6803, "step": 19800 }, { "epoch": 3.54, "grad_norm": 0.6797320246696472, "learning_rate": 9.751421388928961e-06, "loss": 1.9709, "step": 19805 }, { "epoch": 3.55, "grad_norm": 0.8105748295783997, "learning_rate": 9.740281928755218e-06, "loss": 2.0418, "step": 19810 }, { "epoch": 3.55, "grad_norm": 0.5812113285064697, "learning_rate": 9.72914729521562e-06, "loss": 1.8629, "step": 19815 }, { "epoch": 3.55, "grad_norm": 0.9974383115768433, "learning_rate": 9.718017491832046e-06, "loss": 2.231, "step": 19820 }, { "epoch": 3.55, "grad_norm": 1.628359079360962, "learning_rate": 9.706892522124839e-06, "loss": 1.7025, "step": 19825 }, { "epoch": 3.55, "grad_norm": 2.6423473358154297, "learning_rate": 9.695772389612826e-06, "loss": 1.7726, "step": 19830 }, { "epoch": 3.55, "grad_norm": 0.6964113712310791, "learning_rate": 9.684657097813282e-06, "loss": 1.5745, "step": 19835 }, { "epoch": 3.55, "grad_norm": 1.0182359218597412, "learning_rate": 9.673546650241963e-06, "loss": 2.1207, "step": 19840 }, { "epoch": 3.55, "grad_norm": 0.8013817071914673, "learning_rate": 9.662441050413093e-06, "loss": 1.7262, "step": 19845 }, { "epoch": 3.55, "grad_norm": 0.9642962217330933, "learning_rate": 9.651340301839367e-06, "loss": 1.9043, "step": 19850 }, { "epoch": 3.55, "grad_norm": 0.8014324307441711, "learning_rate": 9.64024440803194e-06, "loss": 1.8698, "step": 19855 }, { "epoch": 3.55, "grad_norm": 0.7242090106010437, "learning_rate": 9.62915337250044e-06, "loss": 1.6975, "step": 19860 }, { "epoch": 3.56, "grad_norm": 0.8667596578598022, "learning_rate": 9.618067198752937e-06, "loss": 1.7597, "step": 19865 }, { "epoch": 3.56, "grad_norm": 0.5412402749061584, "learning_rate": 9.606985890295984e-06, "loss": 1.7658, "step": 19870 }, { "epoch": 3.56, "grad_norm": 1.8349127769470215, "learning_rate": 9.595909450634586e-06, "loss": 1.7718, "step": 19875 }, { "epoch": 3.56, "grad_norm": 1.097348928451538, "learning_rate": 9.58483788327221e-06, "loss": 2.0921, "step": 19880 }, { "epoch": 3.56, "grad_norm": 0.8967131972312927, "learning_rate": 9.573771191710795e-06, "loss": 1.8385, "step": 19885 }, { "epoch": 3.56, "grad_norm": 7.748429298400879, "learning_rate": 9.562709379450705e-06, "loss": 1.4055, "step": 19890 }, { "epoch": 3.56, "grad_norm": 0.7774618268013, "learning_rate": 9.551652449990797e-06, "loss": 2.0261, "step": 19895 }, { "epoch": 3.56, "grad_norm": 0.6186455488204956, "learning_rate": 9.540600406828354e-06, "loss": 1.8112, "step": 19900 }, { "epoch": 3.56, "grad_norm": 0.5887669324874878, "learning_rate": 9.529553253459132e-06, "loss": 1.9148, "step": 19905 }, { "epoch": 3.56, "grad_norm": 0.9519124627113342, "learning_rate": 9.518510993377331e-06, "loss": 1.7988, "step": 19910 }, { "epoch": 3.56, "grad_norm": 0.5737485885620117, "learning_rate": 9.507473630075613e-06, "loss": 1.8965, "step": 19915 }, { "epoch": 3.57, "grad_norm": 1.0009839534759521, "learning_rate": 9.496441167045095e-06, "loss": 1.9366, "step": 19920 }, { "epoch": 3.57, "grad_norm": 0.9404077529907227, "learning_rate": 9.485413607775314e-06, "loss": 1.7763, "step": 19925 }, { "epoch": 3.57, "grad_norm": 0.8102321624755859, "learning_rate": 9.474390955754293e-06, "loss": 1.6678, "step": 19930 }, { "epoch": 3.57, "grad_norm": 1.2429808378219604, "learning_rate": 9.46337321446847e-06, "loss": 1.8401, "step": 19935 }, { "epoch": 3.57, "grad_norm": 0.66902095079422, "learning_rate": 9.452360387402756e-06, "loss": 1.6209, "step": 19940 }, { "epoch": 3.57, "grad_norm": 0.5885403752326965, "learning_rate": 9.441352478040497e-06, "loss": 2.0758, "step": 19945 }, { "epoch": 3.57, "grad_norm": 0.7154830694198608, "learning_rate": 9.430349489863493e-06, "loss": 1.9128, "step": 19950 }, { "epoch": 3.57, "grad_norm": 0.5632511973381042, "learning_rate": 9.419351426351958e-06, "loss": 2.2468, "step": 19955 }, { "epoch": 3.57, "grad_norm": 2.063032388687134, "learning_rate": 9.40835829098458e-06, "loss": 1.6646, "step": 19960 }, { "epoch": 3.57, "grad_norm": 0.9876983165740967, "learning_rate": 9.397370087238483e-06, "loss": 1.8661, "step": 19965 }, { "epoch": 3.57, "grad_norm": 0.797542154788971, "learning_rate": 9.386386818589212e-06, "loss": 2.0163, "step": 19970 }, { "epoch": 3.58, "grad_norm": 0.8260084390640259, "learning_rate": 9.375408488510787e-06, "loss": 1.7415, "step": 19975 }, { "epoch": 3.58, "grad_norm": 0.849568247795105, "learning_rate": 9.364435100475625e-06, "loss": 1.648, "step": 19980 }, { "epoch": 3.58, "grad_norm": 0.8121174573898315, "learning_rate": 9.35346665795459e-06, "loss": 2.2069, "step": 19985 }, { "epoch": 3.58, "grad_norm": 0.9171432256698608, "learning_rate": 9.342503164417005e-06, "loss": 2.1348, "step": 19990 }, { "epoch": 3.58, "grad_norm": 0.9130405783653259, "learning_rate": 9.331544623330606e-06, "loss": 2.0652, "step": 19995 }, { "epoch": 3.58, "grad_norm": 0.814951479434967, "learning_rate": 9.320591038161574e-06, "loss": 1.6998, "step": 20000 }, { "epoch": 3.58, "grad_norm": 1.7941076755523682, "learning_rate": 9.309642412374514e-06, "loss": 1.7472, "step": 20005 }, { "epoch": 3.58, "grad_norm": 0.9372169971466064, "learning_rate": 9.298698749432474e-06, "loss": 1.759, "step": 20010 }, { "epoch": 3.58, "grad_norm": 0.6178033351898193, "learning_rate": 9.287760052796909e-06, "loss": 1.8053, "step": 20015 }, { "epoch": 3.58, "grad_norm": 0.8956480622291565, "learning_rate": 9.27682632592773e-06, "loss": 1.8945, "step": 20020 }, { "epoch": 3.58, "grad_norm": 0.7049922347068787, "learning_rate": 9.265897572283263e-06, "loss": 1.9184, "step": 20025 }, { "epoch": 3.59, "grad_norm": 0.6929253935813904, "learning_rate": 9.254973795320254e-06, "loss": 1.9086, "step": 20030 }, { "epoch": 3.59, "grad_norm": 0.7709065675735474, "learning_rate": 9.244054998493886e-06, "loss": 1.7822, "step": 20035 }, { "epoch": 3.59, "grad_norm": 0.817094624042511, "learning_rate": 9.23314118525778e-06, "loss": 1.7846, "step": 20040 }, { "epoch": 3.59, "grad_norm": 0.9673941135406494, "learning_rate": 9.22223235906394e-06, "loss": 1.8459, "step": 20045 }, { "epoch": 3.59, "grad_norm": 0.639333188533783, "learning_rate": 9.211328523362828e-06, "loss": 1.8805, "step": 20050 }, { "epoch": 3.59, "grad_norm": 1.8863416910171509, "learning_rate": 9.200429681603318e-06, "loss": 1.7241, "step": 20055 }, { "epoch": 3.59, "grad_norm": 1.824625849723816, "learning_rate": 9.189535837232701e-06, "loss": 1.8988, "step": 20060 }, { "epoch": 3.59, "grad_norm": 0.7470659017562866, "learning_rate": 9.178646993696702e-06, "loss": 1.3453, "step": 20065 }, { "epoch": 3.59, "grad_norm": 1.1714149713516235, "learning_rate": 9.167763154439437e-06, "loss": 2.1238, "step": 20070 }, { "epoch": 3.59, "grad_norm": 0.8737668991088867, "learning_rate": 9.156884322903452e-06, "loss": 1.9031, "step": 20075 }, { "epoch": 3.59, "grad_norm": 1.0085196495056152, "learning_rate": 9.146010502529717e-06, "loss": 1.58, "step": 20080 }, { "epoch": 3.6, "grad_norm": 0.6037978529930115, "learning_rate": 9.13514169675761e-06, "loss": 1.8138, "step": 20085 }, { "epoch": 3.6, "grad_norm": 1.0545234680175781, "learning_rate": 9.124277909024923e-06, "loss": 1.7113, "step": 20090 }, { "epoch": 3.6, "grad_norm": 0.7123447060585022, "learning_rate": 9.113419142767864e-06, "loss": 1.7098, "step": 20095 }, { "epoch": 3.6, "grad_norm": 0.9203841090202332, "learning_rate": 9.102565401421056e-06, "loss": 1.6357, "step": 20100 }, { "epoch": 3.6, "grad_norm": 0.8946130871772766, "learning_rate": 9.091716688417515e-06, "loss": 2.1125, "step": 20105 }, { "epoch": 3.6, "grad_norm": 0.6853888630867004, "learning_rate": 9.080873007188681e-06, "loss": 1.9502, "step": 20110 }, { "epoch": 3.6, "grad_norm": 0.7109887003898621, "learning_rate": 9.070034361164412e-06, "loss": 1.7719, "step": 20115 }, { "epoch": 3.6, "grad_norm": 1.7917437553405762, "learning_rate": 9.059200753772942e-06, "loss": 1.8331, "step": 20120 }, { "epoch": 3.6, "grad_norm": 0.8030440211296082, "learning_rate": 9.048372188440942e-06, "loss": 1.8035, "step": 20125 }, { "epoch": 3.6, "grad_norm": 0.9415981769561768, "learning_rate": 9.037548668593482e-06, "loss": 2.0076, "step": 20130 }, { "epoch": 3.6, "grad_norm": 1.300551176071167, "learning_rate": 9.026730197654018e-06, "loss": 1.7933, "step": 20135 }, { "epoch": 3.6, "grad_norm": 0.8916707038879395, "learning_rate": 9.015916779044428e-06, "loss": 1.9135, "step": 20140 }, { "epoch": 3.61, "grad_norm": 0.4446997046470642, "learning_rate": 9.005108416184985e-06, "loss": 1.8592, "step": 20145 }, { "epoch": 3.61, "grad_norm": 0.7679259777069092, "learning_rate": 8.994305112494364e-06, "loss": 1.8635, "step": 20150 }, { "epoch": 3.61, "grad_norm": 1.084495186805725, "learning_rate": 8.983506871389648e-06, "loss": 1.6656, "step": 20155 }, { "epoch": 3.61, "grad_norm": 0.5544742941856384, "learning_rate": 8.972713696286297e-06, "loss": 2.0044, "step": 20160 }, { "epoch": 3.61, "grad_norm": 1.2430366277694702, "learning_rate": 8.961925590598196e-06, "loss": 1.9011, "step": 20165 }, { "epoch": 3.61, "grad_norm": 0.7293367385864258, "learning_rate": 8.951142557737597e-06, "loss": 1.8652, "step": 20170 }, { "epoch": 3.61, "grad_norm": 0.6293432116508484, "learning_rate": 8.940364601115167e-06, "loss": 2.0155, "step": 20175 }, { "epoch": 3.61, "grad_norm": 0.6995351910591125, "learning_rate": 8.929591724139966e-06, "loss": 1.5788, "step": 20180 }, { "epoch": 3.61, "grad_norm": 1.0130141973495483, "learning_rate": 8.918823930219455e-06, "loss": 2.1183, "step": 20185 }, { "epoch": 3.61, "grad_norm": 0.8194257020950317, "learning_rate": 8.908061222759457e-06, "loss": 1.8322, "step": 20190 }, { "epoch": 3.61, "grad_norm": 3.565441608428955, "learning_rate": 8.899454721330733e-06, "loss": 1.7912, "step": 20195 }, { "epoch": 3.62, "grad_norm": 0.643989622592926, "learning_rate": 8.888701178077265e-06, "loss": 1.8781, "step": 20200 }, { "epoch": 3.62, "grad_norm": 0.9238978624343872, "learning_rate": 8.877952730812116e-06, "loss": 1.8105, "step": 20205 }, { "epoch": 3.62, "grad_norm": 0.7382200360298157, "learning_rate": 8.867209382935022e-06, "loss": 1.6843, "step": 20210 }, { "epoch": 3.62, "grad_norm": 1.2155033349990845, "learning_rate": 8.856471137844071e-06, "loss": 1.6075, "step": 20215 }, { "epoch": 3.62, "grad_norm": 12.773721694946289, "learning_rate": 8.845737998935779e-06, "loss": 1.8195, "step": 20220 }, { "epoch": 3.62, "grad_norm": 1.3359897136688232, "learning_rate": 8.835009969605012e-06, "loss": 2.098, "step": 20225 }, { "epoch": 3.62, "grad_norm": 0.8200415968894958, "learning_rate": 8.824287053245046e-06, "loss": 1.639, "step": 20230 }, { "epoch": 3.62, "grad_norm": 1.8653521537780762, "learning_rate": 8.813569253247522e-06, "loss": 1.714, "step": 20235 }, { "epoch": 3.62, "grad_norm": 0.7212638258934021, "learning_rate": 8.802856573002474e-06, "loss": 1.5678, "step": 20240 }, { "epoch": 3.62, "grad_norm": 0.6007484793663025, "learning_rate": 8.792149015898321e-06, "loss": 2.021, "step": 20245 }, { "epoch": 3.62, "grad_norm": 0.8171059489250183, "learning_rate": 8.781446585321836e-06, "loss": 1.644, "step": 20250 }, { "epoch": 3.63, "grad_norm": 0.5657952427864075, "learning_rate": 8.770749284658198e-06, "loss": 1.7338, "step": 20255 }, { "epoch": 3.63, "grad_norm": 0.8464135527610779, "learning_rate": 8.760057117290956e-06, "loss": 1.8144, "step": 20260 }, { "epoch": 3.63, "grad_norm": 0.8611558079719543, "learning_rate": 8.749370086602022e-06, "loss": 1.9741, "step": 20265 }, { "epoch": 3.63, "grad_norm": 0.8753948211669922, "learning_rate": 8.738688195971698e-06, "loss": 2.034, "step": 20270 }, { "epoch": 3.63, "grad_norm": 0.4790874421596527, "learning_rate": 8.728011448778652e-06, "loss": 1.8537, "step": 20275 }, { "epoch": 3.63, "grad_norm": 0.5300187468528748, "learning_rate": 8.717339848399947e-06, "loss": 1.6698, "step": 20280 }, { "epoch": 3.63, "grad_norm": 1.3581863641738892, "learning_rate": 8.706673398210973e-06, "loss": 1.9417, "step": 20285 }, { "epoch": 3.63, "grad_norm": 0.47221723198890686, "learning_rate": 8.696012101585533e-06, "loss": 2.0264, "step": 20290 }, { "epoch": 3.63, "grad_norm": 1.0025262832641602, "learning_rate": 8.685355961895784e-06, "loss": 1.7893, "step": 20295 }, { "epoch": 3.63, "grad_norm": 1.1813714504241943, "learning_rate": 8.67470498251225e-06, "loss": 1.8123, "step": 20300 }, { "epoch": 3.63, "grad_norm": 2.1088335514068604, "learning_rate": 8.664059166803834e-06, "loss": 1.8788, "step": 20305 }, { "epoch": 3.64, "grad_norm": 0.9035912752151489, "learning_rate": 8.653418518137792e-06, "loss": 2.0769, "step": 20310 }, { "epoch": 3.64, "grad_norm": 0.8217169046401978, "learning_rate": 8.642783039879742e-06, "loss": 1.7871, "step": 20315 }, { "epoch": 3.64, "grad_norm": 1.7109413146972656, "learning_rate": 8.632152735393682e-06, "loss": 1.7824, "step": 20320 }, { "epoch": 3.64, "grad_norm": 1.142771601676941, "learning_rate": 8.621527608041969e-06, "loss": 1.7042, "step": 20325 }, { "epoch": 3.64, "grad_norm": 0.5646505355834961, "learning_rate": 8.610907661185316e-06, "loss": 1.8401, "step": 20330 }, { "epoch": 3.64, "grad_norm": 0.9124100208282471, "learning_rate": 8.600292898182815e-06, "loss": 2.1127, "step": 20335 }, { "epoch": 3.64, "grad_norm": 0.7633247971534729, "learning_rate": 8.589683322391889e-06, "loss": 1.9485, "step": 20340 }, { "epoch": 3.64, "grad_norm": 0.6282685399055481, "learning_rate": 8.579078937168344e-06, "loss": 1.8386, "step": 20345 }, { "epoch": 3.64, "grad_norm": 0.593157172203064, "learning_rate": 8.568479745866344e-06, "loss": 1.806, "step": 20350 }, { "epoch": 3.64, "grad_norm": 0.9064671397209167, "learning_rate": 8.557885751838384e-06, "loss": 1.8968, "step": 20355 }, { "epoch": 3.64, "grad_norm": 1.2595146894454956, "learning_rate": 8.54729695843535e-06, "loss": 1.8831, "step": 20360 }, { "epoch": 3.65, "grad_norm": 0.8819836378097534, "learning_rate": 8.53671336900647e-06, "loss": 1.7677, "step": 20365 }, { "epoch": 3.65, "grad_norm": 0.46707066893577576, "learning_rate": 8.526134986899304e-06, "loss": 1.7894, "step": 20370 }, { "epoch": 3.65, "grad_norm": 3.5591564178466797, "learning_rate": 8.5155618154598e-06, "loss": 1.6036, "step": 20375 }, { "epoch": 3.65, "grad_norm": 0.959492564201355, "learning_rate": 8.504993858032234e-06, "loss": 1.9008, "step": 20380 }, { "epoch": 3.65, "grad_norm": 0.6960819959640503, "learning_rate": 8.494431117959247e-06, "loss": 1.6686, "step": 20385 }, { "epoch": 3.65, "grad_norm": 1.4117478132247925, "learning_rate": 8.48387359858182e-06, "loss": 1.6402, "step": 20390 }, { "epoch": 3.65, "grad_norm": 1.1412084102630615, "learning_rate": 8.473321303239296e-06, "loss": 1.7246, "step": 20395 }, { "epoch": 3.65, "grad_norm": 0.6889460682868958, "learning_rate": 8.462774235269348e-06, "loss": 1.9935, "step": 20400 }, { "epoch": 3.65, "grad_norm": 3.105839490890503, "learning_rate": 8.452232398007993e-06, "loss": 1.9629, "step": 20405 }, { "epoch": 3.65, "grad_norm": 1.2569962739944458, "learning_rate": 8.44169579478961e-06, "loss": 1.8072, "step": 20410 }, { "epoch": 3.65, "grad_norm": 0.9013957977294922, "learning_rate": 8.431164428946927e-06, "loss": 2.0294, "step": 20415 }, { "epoch": 3.66, "grad_norm": 1.8932322263717651, "learning_rate": 8.420638303810993e-06, "loss": 1.7084, "step": 20420 }, { "epoch": 3.66, "grad_norm": 0.6387518644332886, "learning_rate": 8.410117422711227e-06, "loss": 1.9067, "step": 20425 }, { "epoch": 3.66, "grad_norm": 0.5416659712791443, "learning_rate": 8.399601788975354e-06, "loss": 1.9721, "step": 20430 }, { "epoch": 3.66, "grad_norm": 1.3782933950424194, "learning_rate": 8.389091405929467e-06, "loss": 1.9061, "step": 20435 }, { "epoch": 3.66, "grad_norm": 2.072787046432495, "learning_rate": 8.37858627689799e-06, "loss": 1.9161, "step": 20440 }, { "epoch": 3.66, "grad_norm": 0.5644092559814453, "learning_rate": 8.368086405203696e-06, "loss": 1.7495, "step": 20445 }, { "epoch": 3.66, "grad_norm": 1.1922764778137207, "learning_rate": 8.357591794167666e-06, "loss": 1.8278, "step": 20450 }, { "epoch": 3.66, "grad_norm": 0.7478966116905212, "learning_rate": 8.34710244710935e-06, "loss": 1.8286, "step": 20455 }, { "epoch": 3.66, "grad_norm": 0.6147327423095703, "learning_rate": 8.336618367346505e-06, "loss": 1.615, "step": 20460 }, { "epoch": 3.66, "grad_norm": 1.372970700263977, "learning_rate": 8.326139558195242e-06, "loss": 1.9454, "step": 20465 }, { "epoch": 3.66, "grad_norm": 0.6343486905097961, "learning_rate": 8.315666022969998e-06, "loss": 1.8179, "step": 20470 }, { "epoch": 3.66, "grad_norm": 0.6342081427574158, "learning_rate": 8.30519776498354e-06, "loss": 1.7307, "step": 20475 }, { "epoch": 3.67, "grad_norm": 1.3767012357711792, "learning_rate": 8.294734787546975e-06, "loss": 1.8573, "step": 20480 }, { "epoch": 3.67, "grad_norm": 0.8521339297294617, "learning_rate": 8.284277093969737e-06, "loss": 1.9258, "step": 20485 }, { "epoch": 3.67, "grad_norm": 0.3690893054008484, "learning_rate": 8.273824687559573e-06, "loss": 1.9283, "step": 20490 }, { "epoch": 3.67, "grad_norm": 0.61278235912323, "learning_rate": 8.263377571622563e-06, "loss": 1.6369, "step": 20495 }, { "epoch": 3.67, "grad_norm": 0.7713987827301025, "learning_rate": 8.252935749463132e-06, "loss": 1.9637, "step": 20500 }, { "epoch": 3.67, "grad_norm": 0.4912928640842438, "learning_rate": 8.242499224384012e-06, "loss": 1.9876, "step": 20505 }, { "epoch": 3.67, "grad_norm": 1.3459941148757935, "learning_rate": 8.232067999686269e-06, "loss": 1.5705, "step": 20510 }, { "epoch": 3.67, "grad_norm": 0.9344834089279175, "learning_rate": 8.221642078669296e-06, "loss": 1.7796, "step": 20515 }, { "epoch": 3.67, "grad_norm": 1.331256628036499, "learning_rate": 8.211221464630788e-06, "loss": 1.85, "step": 20520 }, { "epoch": 3.67, "grad_norm": 0.6549345850944519, "learning_rate": 8.200806160866784e-06, "loss": 1.7886, "step": 20525 }, { "epoch": 3.67, "grad_norm": 0.852634847164154, "learning_rate": 8.19039617067163e-06, "loss": 1.6675, "step": 20530 }, { "epoch": 3.68, "grad_norm": 0.6992741227149963, "learning_rate": 8.179991497337997e-06, "loss": 1.9973, "step": 20535 }, { "epoch": 3.68, "grad_norm": 0.8704904317855835, "learning_rate": 8.169592144156885e-06, "loss": 1.7932, "step": 20540 }, { "epoch": 3.68, "grad_norm": 4.262266635894775, "learning_rate": 8.159198114417591e-06, "loss": 1.679, "step": 20545 }, { "epoch": 3.68, "grad_norm": 0.6595445275306702, "learning_rate": 8.148809411407723e-06, "loss": 1.8552, "step": 20550 }, { "epoch": 3.68, "grad_norm": 0.7737083435058594, "learning_rate": 8.13842603841323e-06, "loss": 1.8383, "step": 20555 }, { "epoch": 3.68, "grad_norm": 0.8114953637123108, "learning_rate": 8.128047998718363e-06, "loss": 1.8305, "step": 20560 }, { "epoch": 3.68, "grad_norm": 0.9654433131217957, "learning_rate": 8.117675295605685e-06, "loss": 1.7202, "step": 20565 }, { "epoch": 3.68, "grad_norm": 1.0531843900680542, "learning_rate": 8.10730793235608e-06, "loss": 1.437, "step": 20570 }, { "epoch": 3.68, "grad_norm": 0.9250448942184448, "learning_rate": 8.096945912248721e-06, "loss": 1.4706, "step": 20575 }, { "epoch": 3.68, "grad_norm": 0.9229045510292053, "learning_rate": 8.08658923856111e-06, "loss": 1.8187, "step": 20580 }, { "epoch": 3.68, "grad_norm": 0.9853529930114746, "learning_rate": 8.076237914569063e-06, "loss": 1.7397, "step": 20585 }, { "epoch": 3.69, "grad_norm": 0.8005128502845764, "learning_rate": 8.065891943546675e-06, "loss": 1.9982, "step": 20590 }, { "epoch": 3.69, "grad_norm": 0.5977509021759033, "learning_rate": 8.055551328766378e-06, "loss": 1.8997, "step": 20595 }, { "epoch": 3.69, "grad_norm": 1.766401767730713, "learning_rate": 8.045216073498896e-06, "loss": 1.6968, "step": 20600 }, { "epoch": 3.69, "grad_norm": 1.244626760482788, "learning_rate": 8.03488618101327e-06, "loss": 1.7645, "step": 20605 }, { "epoch": 3.69, "grad_norm": 0.5766518115997314, "learning_rate": 8.02456165457682e-06, "loss": 1.8566, "step": 20610 }, { "epoch": 3.69, "grad_norm": 0.8886863589286804, "learning_rate": 8.014242497455191e-06, "loss": 1.7796, "step": 20615 }, { "epoch": 3.69, "grad_norm": 0.8238760232925415, "learning_rate": 8.00392871291232e-06, "loss": 2.0147, "step": 20620 }, { "epoch": 3.69, "grad_norm": 0.5203852653503418, "learning_rate": 7.993620304210447e-06, "loss": 2.0413, "step": 20625 }, { "epoch": 3.69, "grad_norm": 0.9186719059944153, "learning_rate": 7.983317274610125e-06, "loss": 1.8603, "step": 20630 }, { "epoch": 3.69, "grad_norm": 1.096434473991394, "learning_rate": 7.973019627370182e-06, "loss": 1.6407, "step": 20635 }, { "epoch": 3.69, "grad_norm": 0.8249112963676453, "learning_rate": 7.96272736574774e-06, "loss": 1.7465, "step": 20640 }, { "epoch": 3.7, "grad_norm": 1.604615330696106, "learning_rate": 7.95244049299825e-06, "loss": 1.7763, "step": 20645 }, { "epoch": 3.7, "grad_norm": 0.730076789855957, "learning_rate": 7.942159012375434e-06, "loss": 1.9924, "step": 20650 }, { "epoch": 3.7, "grad_norm": 1.068292260169983, "learning_rate": 7.93188292713131e-06, "loss": 2.1643, "step": 20655 }, { "epoch": 3.7, "grad_norm": 1.0101828575134277, "learning_rate": 7.92161224051621e-06, "loss": 1.4485, "step": 20660 }, { "epoch": 3.7, "grad_norm": 0.4581923186779022, "learning_rate": 7.911346955778722e-06, "loss": 1.9251, "step": 20665 }, { "epoch": 3.7, "grad_norm": 0.5387589335441589, "learning_rate": 7.901087076165756e-06, "loss": 1.9034, "step": 20670 }, { "epoch": 3.7, "grad_norm": 0.7266547679901123, "learning_rate": 7.8908326049225e-06, "loss": 1.8593, "step": 20675 }, { "epoch": 3.7, "grad_norm": 2.4284040927886963, "learning_rate": 7.880583545292441e-06, "loss": 1.5051, "step": 20680 }, { "epoch": 3.7, "grad_norm": 0.9465983510017395, "learning_rate": 7.870339900517335e-06, "loss": 1.8325, "step": 20685 }, { "epoch": 3.7, "grad_norm": 0.9285613894462585, "learning_rate": 7.86010167383724e-06, "loss": 1.8667, "step": 20690 }, { "epoch": 3.7, "grad_norm": 0.7407609820365906, "learning_rate": 7.849868868490506e-06, "loss": 1.8178, "step": 20695 }, { "epoch": 3.71, "grad_norm": 5.604633331298828, "learning_rate": 7.839641487713745e-06, "loss": 1.994, "step": 20700 }, { "epoch": 3.71, "grad_norm": 0.8995658159255981, "learning_rate": 7.829419534741875e-06, "loss": 1.9699, "step": 20705 }, { "epoch": 3.71, "grad_norm": 1.138853907585144, "learning_rate": 7.819203012808091e-06, "loss": 1.9048, "step": 20710 }, { "epoch": 3.71, "grad_norm": 0.8742569088935852, "learning_rate": 7.808991925143869e-06, "loss": 2.0381, "step": 20715 }, { "epoch": 3.71, "grad_norm": 0.40514862537384033, "learning_rate": 7.798786274978969e-06, "loss": 2.1029, "step": 20720 }, { "epoch": 3.71, "grad_norm": 1.0639851093292236, "learning_rate": 7.788586065541431e-06, "loss": 1.6771, "step": 20725 }, { "epoch": 3.71, "grad_norm": 2.2875404357910156, "learning_rate": 7.778391300057553e-06, "loss": 1.8035, "step": 20730 }, { "epoch": 3.71, "grad_norm": 0.6936883330345154, "learning_rate": 7.768201981751944e-06, "loss": 1.8963, "step": 20735 }, { "epoch": 3.71, "grad_norm": 0.7434184551239014, "learning_rate": 7.758018113847473e-06, "loss": 1.6205, "step": 20740 }, { "epoch": 3.71, "grad_norm": 1.422929048538208, "learning_rate": 7.747839699565288e-06, "loss": 2.0673, "step": 20745 }, { "epoch": 3.71, "grad_norm": 0.9202067852020264, "learning_rate": 7.737666742124816e-06, "loss": 1.6292, "step": 20750 }, { "epoch": 3.72, "grad_norm": 0.48007720708847046, "learning_rate": 7.727499244743747e-06, "loss": 1.8354, "step": 20755 }, { "epoch": 3.72, "grad_norm": 1.398300051689148, "learning_rate": 7.717337210638049e-06, "loss": 1.7383, "step": 20760 }, { "epoch": 3.72, "grad_norm": 1.650846004486084, "learning_rate": 7.707180643021969e-06, "loss": 1.8777, "step": 20765 }, { "epoch": 3.72, "grad_norm": 3.9538509845733643, "learning_rate": 7.697029545108025e-06, "loss": 1.6718, "step": 20770 }, { "epoch": 3.72, "grad_norm": 0.817905843257904, "learning_rate": 7.686883920106986e-06, "loss": 1.6816, "step": 20775 }, { "epoch": 3.72, "grad_norm": 0.4926592707633972, "learning_rate": 7.676743771227906e-06, "loss": 1.7673, "step": 20780 }, { "epoch": 3.72, "grad_norm": 0.5661248564720154, "learning_rate": 7.666609101678121e-06, "loss": 1.7792, "step": 20785 }, { "epoch": 3.72, "grad_norm": 0.8012621402740479, "learning_rate": 7.656479914663195e-06, "loss": 1.7374, "step": 20790 }, { "epoch": 3.72, "grad_norm": 1.1758484840393066, "learning_rate": 7.646356213386988e-06, "loss": 1.5582, "step": 20795 }, { "epoch": 3.72, "grad_norm": 1.1315603256225586, "learning_rate": 7.63623800105162e-06, "loss": 2.1121, "step": 20800 }, { "epoch": 3.72, "grad_norm": 0.9538705348968506, "learning_rate": 7.626125280857471e-06, "loss": 1.5779, "step": 20805 }, { "epoch": 3.72, "grad_norm": 0.8569843769073486, "learning_rate": 7.616018056003191e-06, "loss": 1.8019, "step": 20810 }, { "epoch": 3.73, "grad_norm": 1.2387229204177856, "learning_rate": 7.605916329685672e-06, "loss": 1.6668, "step": 20815 }, { "epoch": 3.73, "grad_norm": 0.8599191308021545, "learning_rate": 7.595820105100094e-06, "loss": 1.7848, "step": 20820 }, { "epoch": 3.73, "grad_norm": 0.7321486473083496, "learning_rate": 7.585729385439869e-06, "loss": 1.7338, "step": 20825 }, { "epoch": 3.73, "grad_norm": 0.7330595254898071, "learning_rate": 7.575644173896687e-06, "loss": 1.9163, "step": 20830 }, { "epoch": 3.73, "grad_norm": 0.7383427619934082, "learning_rate": 7.5655644736604945e-06, "loss": 2.0798, "step": 20835 }, { "epoch": 3.73, "grad_norm": 0.7931711673736572, "learning_rate": 7.555490287919497e-06, "loss": 1.9642, "step": 20840 }, { "epoch": 3.73, "grad_norm": 0.7924222350120544, "learning_rate": 7.545421619860133e-06, "loss": 1.6436, "step": 20845 }, { "epoch": 3.73, "grad_norm": 1.1156904697418213, "learning_rate": 7.53535847266712e-06, "loss": 1.853, "step": 20850 }, { "epoch": 3.73, "grad_norm": 0.7994991540908813, "learning_rate": 7.5253008495234255e-06, "loss": 1.7786, "step": 20855 }, { "epoch": 3.73, "grad_norm": 1.3960700035095215, "learning_rate": 7.515248753610266e-06, "loss": 1.9, "step": 20860 }, { "epoch": 3.73, "grad_norm": 0.9603045582771301, "learning_rate": 7.505202188107113e-06, "loss": 1.8195, "step": 20865 }, { "epoch": 3.74, "grad_norm": 1.181504487991333, "learning_rate": 7.495161156191679e-06, "loss": 1.6414, "step": 20870 }, { "epoch": 3.74, "grad_norm": 0.9459575414657593, "learning_rate": 7.485125661039927e-06, "loss": 1.9267, "step": 20875 }, { "epoch": 3.74, "grad_norm": 1.0895607471466064, "learning_rate": 7.47509570582608e-06, "loss": 1.8476, "step": 20880 }, { "epoch": 3.74, "grad_norm": 0.9939213395118713, "learning_rate": 7.465071293722606e-06, "loss": 1.8669, "step": 20885 }, { "epoch": 3.74, "grad_norm": 0.4708957076072693, "learning_rate": 7.455052427900217e-06, "loss": 1.7555, "step": 20890 }, { "epoch": 3.74, "grad_norm": 0.8547396659851074, "learning_rate": 7.445039111527866e-06, "loss": 1.8426, "step": 20895 }, { "epoch": 3.74, "grad_norm": 0.9265462160110474, "learning_rate": 7.435031347772767e-06, "loss": 2.0446, "step": 20900 }, { "epoch": 3.74, "grad_norm": 0.7409059405326843, "learning_rate": 7.425029139800349e-06, "loss": 2.0207, "step": 20905 }, { "epoch": 3.74, "grad_norm": 0.8008315563201904, "learning_rate": 7.415032490774318e-06, "loss": 1.6952, "step": 20910 }, { "epoch": 3.74, "grad_norm": 0.6471626162528992, "learning_rate": 7.4050414038565856e-06, "loss": 1.807, "step": 20915 }, { "epoch": 3.74, "grad_norm": 0.8940064907073975, "learning_rate": 7.3950558822073354e-06, "loss": 1.6712, "step": 20920 }, { "epoch": 3.75, "grad_norm": 0.8687788844108582, "learning_rate": 7.3850759289849745e-06, "loss": 1.9076, "step": 20925 }, { "epoch": 3.75, "grad_norm": 0.44189202785491943, "learning_rate": 7.3751015473461634e-06, "loss": 1.7675, "step": 20930 }, { "epoch": 3.75, "grad_norm": 0.8087650537490845, "learning_rate": 7.3671260556953974e-06, "loss": 1.9859, "step": 20935 }, { "epoch": 3.75, "grad_norm": 1.3803483247756958, "learning_rate": 7.357161710856081e-06, "loss": 2.1581, "step": 20940 }, { "epoch": 3.75, "grad_norm": 0.845194935798645, "learning_rate": 7.347202946429546e-06, "loss": 1.6597, "step": 20945 }, { "epoch": 3.75, "grad_norm": 1.7201052904129028, "learning_rate": 7.337249765565737e-06, "loss": 1.9686, "step": 20950 }, { "epoch": 3.75, "grad_norm": 0.8273900747299194, "learning_rate": 7.327302171412848e-06, "loss": 1.7416, "step": 20955 }, { "epoch": 3.75, "grad_norm": 0.9852076768875122, "learning_rate": 7.317360167117274e-06, "loss": 1.5727, "step": 20960 }, { "epoch": 3.75, "grad_norm": 0.5173888802528381, "learning_rate": 7.307423755823678e-06, "loss": 1.8316, "step": 20965 }, { "epoch": 3.75, "grad_norm": 0.5297005772590637, "learning_rate": 7.2974929406749246e-06, "loss": 1.5954, "step": 20970 }, { "epoch": 3.75, "grad_norm": 0.7687706351280212, "learning_rate": 7.287567724812125e-06, "loss": 2.2233, "step": 20975 }, { "epoch": 3.76, "grad_norm": 0.9378607869148254, "learning_rate": 7.277648111374616e-06, "loss": 1.8363, "step": 20980 }, { "epoch": 3.76, "grad_norm": 0.6281090378761292, "learning_rate": 7.2677341034999625e-06, "loss": 1.5648, "step": 20985 }, { "epoch": 3.76, "grad_norm": 1.1043148040771484, "learning_rate": 7.257825704323964e-06, "loss": 1.8477, "step": 20990 }, { "epoch": 3.76, "grad_norm": 1.0617643594741821, "learning_rate": 7.247922916980624e-06, "loss": 1.6808, "step": 20995 }, { "epoch": 3.76, "grad_norm": 2.293015480041504, "learning_rate": 7.238025744602186e-06, "loss": 1.8726, "step": 21000 }, { "epoch": 3.76, "grad_norm": 0.5096902847290039, "learning_rate": 7.22813419031913e-06, "loss": 1.7613, "step": 21005 }, { "epoch": 3.76, "grad_norm": 0.8138590455055237, "learning_rate": 7.218248257260127e-06, "loss": 1.6788, "step": 21010 }, { "epoch": 3.76, "grad_norm": 0.5406817197799683, "learning_rate": 7.2083679485520974e-06, "loss": 1.8645, "step": 21015 }, { "epoch": 3.76, "grad_norm": 0.9503873586654663, "learning_rate": 7.198493267320178e-06, "loss": 1.6199, "step": 21020 }, { "epoch": 3.76, "grad_norm": 1.1343449354171753, "learning_rate": 7.1886242166877084e-06, "loss": 1.4947, "step": 21025 }, { "epoch": 3.76, "grad_norm": 0.753250002861023, "learning_rate": 7.1787607997762654e-06, "loss": 2.0704, "step": 21030 }, { "epoch": 3.77, "grad_norm": 0.606606662273407, "learning_rate": 7.168903019705642e-06, "loss": 1.5729, "step": 21035 }, { "epoch": 3.77, "grad_norm": 1.2045537233352661, "learning_rate": 7.159050879593843e-06, "loss": 1.6266, "step": 21040 }, { "epoch": 3.77, "grad_norm": 1.3641705513000488, "learning_rate": 7.149204382557095e-06, "loss": 1.6646, "step": 21045 }, { "epoch": 3.77, "grad_norm": 1.94281804561615, "learning_rate": 7.139363531709825e-06, "loss": 1.7295, "step": 21050 }, { "epoch": 3.77, "grad_norm": 0.4692668318748474, "learning_rate": 7.129528330164695e-06, "loss": 1.7785, "step": 21055 }, { "epoch": 3.77, "grad_norm": 1.879866123199463, "learning_rate": 7.11969878103256e-06, "loss": 1.866, "step": 21060 }, { "epoch": 3.77, "grad_norm": 0.6057711243629456, "learning_rate": 7.109874887422499e-06, "loss": 1.9315, "step": 21065 }, { "epoch": 3.77, "grad_norm": 0.5978086590766907, "learning_rate": 7.1000566524418055e-06, "loss": 1.9369, "step": 21070 }, { "epoch": 3.77, "grad_norm": 1.631386637687683, "learning_rate": 7.090244079195976e-06, "loss": 1.6194, "step": 21075 }, { "epoch": 3.77, "grad_norm": 0.8467076420783997, "learning_rate": 7.080437170788723e-06, "loss": 1.9249, "step": 21080 }, { "epoch": 3.77, "grad_norm": 0.6821494698524475, "learning_rate": 7.070635930321948e-06, "loss": 1.7521, "step": 21085 }, { "epoch": 3.78, "grad_norm": 0.9568500518798828, "learning_rate": 7.060840360895785e-06, "loss": 1.7341, "step": 21090 }, { "epoch": 3.78, "grad_norm": 1.729252576828003, "learning_rate": 7.051050465608558e-06, "loss": 1.6962, "step": 21095 }, { "epoch": 3.78, "grad_norm": 1.2900949716567993, "learning_rate": 7.041266247556813e-06, "loss": 2.2104, "step": 21100 }, { "epoch": 3.78, "grad_norm": 1.797668695449829, "learning_rate": 7.031487709835272e-06, "loss": 1.7064, "step": 21105 }, { "epoch": 3.78, "grad_norm": 0.8635187745094299, "learning_rate": 7.021714855536893e-06, "loss": 1.7889, "step": 21110 }, { "epoch": 3.78, "grad_norm": 0.6783146858215332, "learning_rate": 7.011947687752804e-06, "loss": 1.8269, "step": 21115 }, { "epoch": 3.78, "grad_norm": 1.27726411819458, "learning_rate": 7.002186209572359e-06, "loss": 1.9167, "step": 21120 }, { "epoch": 3.78, "grad_norm": 0.873021125793457, "learning_rate": 6.9924304240831045e-06, "loss": 1.7524, "step": 21125 }, { "epoch": 3.78, "grad_norm": 0.6796855926513672, "learning_rate": 6.982680334370786e-06, "loss": 1.899, "step": 21130 }, { "epoch": 3.78, "grad_norm": 1.1195898056030273, "learning_rate": 6.972935943519354e-06, "loss": 2.2136, "step": 21135 }, { "epoch": 3.78, "grad_norm": 1.5191144943237305, "learning_rate": 6.963197254610937e-06, "loss": 2.0849, "step": 21140 }, { "epoch": 3.78, "grad_norm": 0.978111982345581, "learning_rate": 6.9534642707258875e-06, "loss": 1.9883, "step": 21145 }, { "epoch": 3.79, "grad_norm": 1.8795119524002075, "learning_rate": 6.943736994942721e-06, "loss": 1.6622, "step": 21150 }, { "epoch": 3.79, "grad_norm": 1.2708956003189087, "learning_rate": 6.934015430338176e-06, "loss": 1.9199, "step": 21155 }, { "epoch": 3.79, "grad_norm": 0.7446703910827637, "learning_rate": 6.924299579987173e-06, "loss": 1.885, "step": 21160 }, { "epoch": 3.79, "grad_norm": 1.003434658050537, "learning_rate": 6.914589446962824e-06, "loss": 1.8679, "step": 21165 }, { "epoch": 3.79, "grad_norm": 0.9047329425811768, "learning_rate": 6.904885034336448e-06, "loss": 1.7717, "step": 21170 }, { "epoch": 3.79, "grad_norm": 0.6826780438423157, "learning_rate": 6.895186345177521e-06, "loss": 1.9632, "step": 21175 }, { "epoch": 3.79, "grad_norm": 0.5355373024940491, "learning_rate": 6.885493382553737e-06, "loss": 2.0283, "step": 21180 }, { "epoch": 3.79, "grad_norm": 0.8014163374900818, "learning_rate": 6.875806149530975e-06, "loss": 1.7532, "step": 21185 }, { "epoch": 3.79, "grad_norm": 0.9986274242401123, "learning_rate": 6.866124649173295e-06, "loss": 1.9851, "step": 21190 }, { "epoch": 3.79, "grad_norm": 1.0653488636016846, "learning_rate": 6.8564488845429515e-06, "loss": 2.1604, "step": 21195 }, { "epoch": 3.79, "grad_norm": 1.1097596883773804, "learning_rate": 6.846778858700373e-06, "loss": 1.6555, "step": 21200 }, { "epoch": 3.8, "grad_norm": 15.828474044799805, "learning_rate": 6.837114574704176e-06, "loss": 1.6847, "step": 21205 }, { "epoch": 3.8, "grad_norm": 3.438732385635376, "learning_rate": 6.827456035611166e-06, "loss": 1.7038, "step": 21210 }, { "epoch": 3.8, "grad_norm": 0.7653161287307739, "learning_rate": 6.817803244476331e-06, "loss": 1.8852, "step": 21215 }, { "epoch": 3.8, "grad_norm": 1.15346360206604, "learning_rate": 6.8081562043528445e-06, "loss": 1.6198, "step": 21220 }, { "epoch": 3.8, "grad_norm": 0.6364823579788208, "learning_rate": 6.7985149182920564e-06, "loss": 1.8619, "step": 21225 }, { "epoch": 3.8, "grad_norm": 1.149183988571167, "learning_rate": 6.788879389343486e-06, "loss": 1.9567, "step": 21230 }, { "epoch": 3.8, "grad_norm": 0.8027864694595337, "learning_rate": 6.7792496205548475e-06, "loss": 1.9675, "step": 21235 }, { "epoch": 3.8, "grad_norm": 0.7436734437942505, "learning_rate": 6.769625614972036e-06, "loss": 1.8813, "step": 21240 }, { "epoch": 3.8, "grad_norm": 1.2169238328933716, "learning_rate": 6.760007375639099e-06, "loss": 1.8129, "step": 21245 }, { "epoch": 3.8, "grad_norm": 0.6234614253044128, "learning_rate": 6.750394905598284e-06, "loss": 1.8025, "step": 21250 }, { "epoch": 3.8, "grad_norm": 2.2738096714019775, "learning_rate": 6.740788207890017e-06, "loss": 2.0495, "step": 21255 }, { "epoch": 3.81, "grad_norm": 4.580897808074951, "learning_rate": 6.731187285552865e-06, "loss": 1.4265, "step": 21260 }, { "epoch": 3.81, "grad_norm": 1.344910740852356, "learning_rate": 6.721592141623606e-06, "loss": 1.9222, "step": 21265 }, { "epoch": 3.81, "grad_norm": 1.4023634195327759, "learning_rate": 6.712002779137169e-06, "loss": 1.7676, "step": 21270 }, { "epoch": 3.81, "grad_norm": 2.361953020095825, "learning_rate": 6.702419201126661e-06, "loss": 1.6256, "step": 21275 }, { "epoch": 3.81, "grad_norm": 0.7495207190513611, "learning_rate": 6.69284141062336e-06, "loss": 1.9166, "step": 21280 }, { "epoch": 3.81, "grad_norm": 0.5383853316307068, "learning_rate": 6.68326941065672e-06, "loss": 1.8247, "step": 21285 }, { "epoch": 3.81, "grad_norm": 0.7718889713287354, "learning_rate": 6.67370320425435e-06, "loss": 1.959, "step": 21290 }, { "epoch": 3.81, "grad_norm": 0.666534423828125, "learning_rate": 6.6641427944420185e-06, "loss": 1.9477, "step": 21295 }, { "epoch": 3.81, "grad_norm": 1.1692932844161987, "learning_rate": 6.654588184243682e-06, "loss": 1.7154, "step": 21300 }, { "epoch": 3.81, "grad_norm": 0.8114296793937683, "learning_rate": 6.645039376681459e-06, "loss": 1.6824, "step": 21305 }, { "epoch": 3.81, "grad_norm": 1.2504725456237793, "learning_rate": 6.635496374775626e-06, "loss": 1.7564, "step": 21310 }, { "epoch": 3.82, "grad_norm": 1.4884883165359497, "learning_rate": 6.6259591815446335e-06, "loss": 1.4289, "step": 21315 }, { "epoch": 3.82, "grad_norm": 0.705243706703186, "learning_rate": 6.616427800005068e-06, "loss": 1.8026, "step": 21320 }, { "epoch": 3.82, "grad_norm": 0.7473030090332031, "learning_rate": 6.606902233171711e-06, "loss": 1.9331, "step": 21325 }, { "epoch": 3.82, "grad_norm": 0.8204745054244995, "learning_rate": 6.597382484057482e-06, "loss": 1.798, "step": 21330 }, { "epoch": 3.82, "grad_norm": 0.48442649841308594, "learning_rate": 6.587868555673479e-06, "loss": 1.8777, "step": 21335 }, { "epoch": 3.82, "grad_norm": 1.1391490697860718, "learning_rate": 6.5783604510289365e-06, "loss": 1.8687, "step": 21340 }, { "epoch": 3.82, "grad_norm": 0.8502964973449707, "learning_rate": 6.56885817313127e-06, "loss": 1.8788, "step": 21345 }, { "epoch": 3.82, "grad_norm": 2.4996724128723145, "learning_rate": 6.559361724986029e-06, "loss": 2.0761, "step": 21350 }, { "epoch": 3.82, "grad_norm": 0.40672042965888977, "learning_rate": 6.549871109596939e-06, "loss": 1.8876, "step": 21355 }, { "epoch": 3.82, "grad_norm": 1.889902949333191, "learning_rate": 6.5403863299658665e-06, "loss": 1.8512, "step": 21360 }, { "epoch": 3.82, "grad_norm": 0.7777634263038635, "learning_rate": 6.530907389092842e-06, "loss": 2.0392, "step": 21365 }, { "epoch": 3.83, "grad_norm": 0.6818479299545288, "learning_rate": 6.521434289976047e-06, "loss": 1.6088, "step": 21370 }, { "epoch": 3.83, "grad_norm": 0.5815435647964478, "learning_rate": 6.511967035611816e-06, "loss": 1.9384, "step": 21375 }, { "epoch": 3.83, "grad_norm": 1.1326960325241089, "learning_rate": 6.5025056289946315e-06, "loss": 2.0075, "step": 21380 }, { "epoch": 3.83, "grad_norm": 0.7850456833839417, "learning_rate": 6.493050073117116e-06, "loss": 1.8053, "step": 21385 }, { "epoch": 3.83, "grad_norm": 1.2165858745574951, "learning_rate": 6.483600370970061e-06, "loss": 1.6863, "step": 21390 }, { "epoch": 3.83, "grad_norm": 1.1443709135055542, "learning_rate": 6.474156525542399e-06, "loss": 1.9593, "step": 21395 }, { "epoch": 3.83, "grad_norm": 0.9057660698890686, "learning_rate": 6.4647185398212075e-06, "loss": 1.7776, "step": 21400 }, { "epoch": 3.83, "grad_norm": 1.2972538471221924, "learning_rate": 6.45528641679172e-06, "loss": 1.7882, "step": 21405 }, { "epoch": 3.83, "grad_norm": 1.1421040296554565, "learning_rate": 6.445860159437295e-06, "loss": 1.4755, "step": 21410 }, { "epoch": 3.83, "grad_norm": 1.4928812980651855, "learning_rate": 6.4364397707394555e-06, "loss": 1.8019, "step": 21415 }, { "epoch": 3.83, "grad_norm": 1.1022720336914062, "learning_rate": 6.427025253677857e-06, "loss": 2.0423, "step": 21420 }, { "epoch": 3.83, "grad_norm": 0.6213718056678772, "learning_rate": 6.417616611230318e-06, "loss": 1.8774, "step": 21425 }, { "epoch": 3.84, "grad_norm": 1.03056800365448, "learning_rate": 6.40821384637276e-06, "loss": 1.995, "step": 21430 }, { "epoch": 3.84, "grad_norm": 0.6993535757064819, "learning_rate": 6.398816962079287e-06, "loss": 2.0732, "step": 21435 }, { "epoch": 3.84, "grad_norm": 1.2051020860671997, "learning_rate": 6.3894259613221095e-06, "loss": 1.5418, "step": 21440 }, { "epoch": 3.84, "grad_norm": 0.8141611218452454, "learning_rate": 6.380040847071603e-06, "loss": 1.536, "step": 21445 }, { "epoch": 3.84, "grad_norm": 0.784510612487793, "learning_rate": 6.370661622296261e-06, "loss": 1.8054, "step": 21450 }, { "epoch": 3.84, "grad_norm": 1.0735670328140259, "learning_rate": 6.361288289962733e-06, "loss": 1.9683, "step": 21455 }, { "epoch": 3.84, "grad_norm": 0.6221914887428284, "learning_rate": 6.351920853035795e-06, "loss": 1.829, "step": 21460 }, { "epoch": 3.84, "grad_norm": 0.5578571557998657, "learning_rate": 6.34255931447835e-06, "loss": 1.6748, "step": 21465 }, { "epoch": 3.84, "grad_norm": 0.4949614703655243, "learning_rate": 6.333203677251445e-06, "loss": 1.9189, "step": 21470 }, { "epoch": 3.84, "grad_norm": 1.253214716911316, "learning_rate": 6.323853944314267e-06, "loss": 1.5789, "step": 21475 }, { "epoch": 3.84, "grad_norm": 0.5360049605369568, "learning_rate": 6.3145101186241175e-06, "loss": 1.7498, "step": 21480 }, { "epoch": 3.85, "grad_norm": 0.6865981221199036, "learning_rate": 6.305172203136447e-06, "loss": 1.8506, "step": 21485 }, { "epoch": 3.85, "grad_norm": 0.7552772760391235, "learning_rate": 6.295840200804823e-06, "loss": 1.9033, "step": 21490 }, { "epoch": 3.85, "grad_norm": 0.9978206157684326, "learning_rate": 6.2865141145809604e-06, "loss": 2.001, "step": 21495 }, { "epoch": 3.85, "grad_norm": 0.6220951080322266, "learning_rate": 6.277193947414678e-06, "loss": 1.809, "step": 21500 }, { "epoch": 3.85, "grad_norm": 1.6220732927322388, "learning_rate": 6.267879702253937e-06, "loss": 1.789, "step": 21505 }, { "epoch": 3.85, "grad_norm": 0.6907435059547424, "learning_rate": 6.258571382044831e-06, "loss": 2.0546, "step": 21510 }, { "epoch": 3.85, "grad_norm": 0.738203227519989, "learning_rate": 6.2492689897315685e-06, "loss": 2.0193, "step": 21515 }, { "epoch": 3.85, "grad_norm": 0.5000423192977905, "learning_rate": 6.239972528256497e-06, "loss": 1.666, "step": 21520 }, { "epoch": 3.85, "grad_norm": 0.42941489815711975, "learning_rate": 6.23068200056007e-06, "loss": 1.9748, "step": 21525 }, { "epoch": 3.85, "grad_norm": 0.4890025854110718, "learning_rate": 6.221397409580864e-06, "loss": 2.0923, "step": 21530 }, { "epoch": 3.85, "grad_norm": 0.8558998107910156, "learning_rate": 6.212118758255595e-06, "loss": 1.7637, "step": 21535 }, { "epoch": 3.86, "grad_norm": 0.7938655614852905, "learning_rate": 6.202846049519093e-06, "loss": 1.752, "step": 21540 }, { "epoch": 3.86, "grad_norm": 1.0123392343521118, "learning_rate": 6.193579286304302e-06, "loss": 1.6815, "step": 21545 }, { "epoch": 3.86, "grad_norm": 3.046292781829834, "learning_rate": 6.184318471542303e-06, "loss": 1.7829, "step": 21550 }, { "epoch": 3.86, "grad_norm": 0.652489423751831, "learning_rate": 6.175063608162266e-06, "loss": 1.903, "step": 21555 }, { "epoch": 3.86, "grad_norm": 0.838995099067688, "learning_rate": 6.165814699091499e-06, "loss": 1.789, "step": 21560 }, { "epoch": 3.86, "grad_norm": 0.7666052579879761, "learning_rate": 6.156571747255435e-06, "loss": 1.6717, "step": 21565 }, { "epoch": 3.86, "grad_norm": 0.5006678700447083, "learning_rate": 6.147334755577596e-06, "loss": 1.9561, "step": 21570 }, { "epoch": 3.86, "grad_norm": 0.8589296340942383, "learning_rate": 6.138103726979641e-06, "loss": 1.9194, "step": 21575 }, { "epoch": 3.86, "grad_norm": 0.4786919951438904, "learning_rate": 6.128878664381332e-06, "loss": 1.9861, "step": 21580 }, { "epoch": 3.86, "grad_norm": 1.460368275642395, "learning_rate": 6.119659570700559e-06, "loss": 1.6571, "step": 21585 }, { "epoch": 3.86, "grad_norm": 0.6441243290901184, "learning_rate": 6.110446448853293e-06, "loss": 2.1022, "step": 21590 }, { "epoch": 3.87, "grad_norm": 0.36151692271232605, "learning_rate": 6.10123930175365e-06, "loss": 2.0311, "step": 21595 }, { "epoch": 3.87, "grad_norm": 0.6082046031951904, "learning_rate": 6.092038132313835e-06, "loss": 1.7961, "step": 21600 }, { "epoch": 3.87, "grad_norm": 0.5249208211898804, "learning_rate": 6.082842943444172e-06, "loss": 1.7438, "step": 21605 }, { "epoch": 3.87, "grad_norm": 0.9609794020652771, "learning_rate": 6.073653738053098e-06, "loss": 2.2195, "step": 21610 }, { "epoch": 3.87, "grad_norm": 0.8128685355186462, "learning_rate": 6.064470519047141e-06, "loss": 1.6784, "step": 21615 }, { "epoch": 3.87, "grad_norm": 1.8232717514038086, "learning_rate": 6.0552932893309374e-06, "loss": 1.8214, "step": 21620 }, { "epoch": 3.87, "grad_norm": 1.522959589958191, "learning_rate": 6.046122051807246e-06, "loss": 1.5712, "step": 21625 }, { "epoch": 3.87, "grad_norm": 0.9963710904121399, "learning_rate": 6.036956809376915e-06, "loss": 1.7489, "step": 21630 }, { "epoch": 3.87, "grad_norm": 1.037346601486206, "learning_rate": 6.027797564938906e-06, "loss": 1.5724, "step": 21635 }, { "epoch": 3.87, "grad_norm": 1.6422029733657837, "learning_rate": 6.018644321390288e-06, "loss": 1.7118, "step": 21640 }, { "epoch": 3.87, "grad_norm": 0.9936122298240662, "learning_rate": 6.009497081626203e-06, "loss": 1.8194, "step": 21645 }, { "epoch": 3.88, "grad_norm": 1.9703397750854492, "learning_rate": 6.000355848539926e-06, "loss": 1.6014, "step": 21650 }, { "epoch": 3.88, "grad_norm": 0.7200276851654053, "learning_rate": 5.99122062502282e-06, "loss": 1.9046, "step": 21655 }, { "epoch": 3.88, "grad_norm": 0.6927645802497864, "learning_rate": 5.982091413964352e-06, "loss": 1.6747, "step": 21660 }, { "epoch": 3.88, "grad_norm": 1.2343568801879883, "learning_rate": 5.97296821825207e-06, "loss": 1.6779, "step": 21665 }, { "epoch": 3.88, "grad_norm": 0.5954350829124451, "learning_rate": 5.9638510407716394e-06, "loss": 1.791, "step": 21670 }, { "epoch": 3.88, "grad_norm": 1.0342341661453247, "learning_rate": 5.954739884406821e-06, "loss": 1.7339, "step": 21675 }, { "epoch": 3.88, "grad_norm": 1.0114802122116089, "learning_rate": 5.94563475203945e-06, "loss": 1.9951, "step": 21680 }, { "epoch": 3.88, "grad_norm": 2.355144500732422, "learning_rate": 5.936535646549479e-06, "loss": 1.668, "step": 21685 }, { "epoch": 3.88, "grad_norm": 0.5509099960327148, "learning_rate": 5.927442570814948e-06, "loss": 1.819, "step": 21690 }, { "epoch": 3.88, "grad_norm": 0.8965473175048828, "learning_rate": 5.918355527711983e-06, "loss": 1.8374, "step": 21695 }, { "epoch": 3.88, "grad_norm": 0.8329721093177795, "learning_rate": 5.9092745201148215e-06, "loss": 1.7613, "step": 21700 }, { "epoch": 3.89, "grad_norm": 0.7440474033355713, "learning_rate": 5.900199550895763e-06, "loss": 1.7955, "step": 21705 }, { "epoch": 3.89, "grad_norm": 0.6702372431755066, "learning_rate": 5.891130622925209e-06, "loss": 1.5468, "step": 21710 }, { "epoch": 3.89, "grad_norm": 1.7360087633132935, "learning_rate": 5.882067739071659e-06, "loss": 1.8326, "step": 21715 }, { "epoch": 3.89, "grad_norm": 0.6325078010559082, "learning_rate": 5.873010902201695e-06, "loss": 1.8646, "step": 21720 }, { "epoch": 3.89, "grad_norm": 0.45870426297187805, "learning_rate": 5.863960115179984e-06, "loss": 1.8511, "step": 21725 }, { "epoch": 3.89, "grad_norm": 1.7255089282989502, "learning_rate": 5.854915380869286e-06, "loss": 1.8966, "step": 21730 }, { "epoch": 3.89, "grad_norm": 0.8191224932670593, "learning_rate": 5.845876702130434e-06, "loss": 1.8138, "step": 21735 }, { "epoch": 3.89, "grad_norm": 1.0439651012420654, "learning_rate": 5.836844081822357e-06, "loss": 1.6274, "step": 21740 }, { "epoch": 3.89, "grad_norm": 0.7920172810554504, "learning_rate": 5.827817522802065e-06, "loss": 1.9206, "step": 21745 }, { "epoch": 3.89, "grad_norm": 1.2962287664413452, "learning_rate": 5.818797027924647e-06, "loss": 1.9135, "step": 21750 }, { "epoch": 3.89, "grad_norm": 0.9365116357803345, "learning_rate": 5.809782600043287e-06, "loss": 1.7649, "step": 21755 }, { "epoch": 3.89, "grad_norm": 3.8692009449005127, "learning_rate": 5.800774242009227e-06, "loss": 1.8553, "step": 21760 }, { "epoch": 3.9, "grad_norm": 2.046795606613159, "learning_rate": 5.7917719566718095e-06, "loss": 1.7213, "step": 21765 }, { "epoch": 3.9, "grad_norm": 0.8467017412185669, "learning_rate": 5.782775746878444e-06, "loss": 1.9483, "step": 21770 }, { "epoch": 3.9, "grad_norm": 0.7216947078704834, "learning_rate": 5.773785615474622e-06, "loss": 2.0848, "step": 21775 }, { "epoch": 3.9, "grad_norm": 0.9281513690948486, "learning_rate": 5.764801565303918e-06, "loss": 1.9586, "step": 21780 }, { "epoch": 3.9, "grad_norm": 3.031953811645508, "learning_rate": 5.755823599207977e-06, "loss": 1.7133, "step": 21785 }, { "epoch": 3.9, "grad_norm": 0.6615840196609497, "learning_rate": 5.7468517200265276e-06, "loss": 2.0092, "step": 21790 }, { "epoch": 3.9, "grad_norm": 1.0913735628128052, "learning_rate": 5.737885930597351e-06, "loss": 1.6966, "step": 21795 }, { "epoch": 3.9, "grad_norm": 1.035690188407898, "learning_rate": 5.728926233756338e-06, "loss": 1.7959, "step": 21800 }, { "epoch": 3.9, "grad_norm": 0.7709459662437439, "learning_rate": 5.719972632337409e-06, "loss": 1.9455, "step": 21805 }, { "epoch": 3.9, "grad_norm": 0.6541898250579834, "learning_rate": 5.711025129172595e-06, "loss": 1.6532, "step": 21810 }, { "epoch": 3.9, "grad_norm": 0.6813231110572815, "learning_rate": 5.702083727091978e-06, "loss": 1.9995, "step": 21815 }, { "epoch": 3.91, "grad_norm": 0.7361324429512024, "learning_rate": 5.693148428923722e-06, "loss": 1.9465, "step": 21820 }, { "epoch": 3.91, "grad_norm": 1.0799798965454102, "learning_rate": 5.68421923749404e-06, "loss": 1.8866, "step": 21825 }, { "epoch": 3.91, "grad_norm": 0.9929239749908447, "learning_rate": 5.675296155627236e-06, "loss": 1.7591, "step": 21830 }, { "epoch": 3.91, "grad_norm": 0.9932503700256348, "learning_rate": 5.666379186145668e-06, "loss": 1.7421, "step": 21835 }, { "epoch": 3.91, "grad_norm": 0.8309764862060547, "learning_rate": 5.6574683318697665e-06, "loss": 1.9331, "step": 21840 }, { "epoch": 3.91, "grad_norm": 0.8258433938026428, "learning_rate": 5.6485635956180365e-06, "loss": 1.8122, "step": 21845 }, { "epoch": 3.91, "grad_norm": 1.11664617061615, "learning_rate": 5.639664980207024e-06, "loss": 1.8511, "step": 21850 }, { "epoch": 3.91, "grad_norm": 0.7466225624084473, "learning_rate": 5.63077248845135e-06, "loss": 1.9555, "step": 21855 }, { "epoch": 3.91, "grad_norm": 0.9185487031936646, "learning_rate": 5.621886123163708e-06, "loss": 1.6394, "step": 21860 }, { "epoch": 3.91, "grad_norm": 1.338221788406372, "learning_rate": 5.613005887154845e-06, "loss": 2.0797, "step": 21865 }, { "epoch": 3.91, "grad_norm": 0.5035430192947388, "learning_rate": 5.604131783233574e-06, "loss": 1.6932, "step": 21870 }, { "epoch": 3.92, "grad_norm": 1.3614168167114258, "learning_rate": 5.595263814206761e-06, "loss": 1.7473, "step": 21875 }, { "epoch": 3.92, "grad_norm": 1.0385257005691528, "learning_rate": 5.586401982879347e-06, "loss": 1.7702, "step": 21880 }, { "epoch": 3.92, "grad_norm": 0.7399263381958008, "learning_rate": 5.5775462920543035e-06, "loss": 1.8132, "step": 21885 }, { "epoch": 3.92, "grad_norm": 0.5867356061935425, "learning_rate": 5.568696744532687e-06, "loss": 1.7692, "step": 21890 }, { "epoch": 3.92, "grad_norm": 1.5751367807388306, "learning_rate": 5.5598533431136055e-06, "loss": 2.0297, "step": 21895 }, { "epoch": 3.92, "grad_norm": 0.8599883317947388, "learning_rate": 5.551016090594208e-06, "loss": 1.8222, "step": 21900 }, { "epoch": 3.92, "grad_norm": 0.6561763882637024, "learning_rate": 5.542184989769711e-06, "loss": 1.8176, "step": 21905 }, { "epoch": 3.92, "grad_norm": 1.0254570245742798, "learning_rate": 5.533360043433394e-06, "loss": 1.8785, "step": 21910 }, { "epoch": 3.92, "grad_norm": 1.0083361864089966, "learning_rate": 5.524541254376564e-06, "loss": 1.8599, "step": 21915 }, { "epoch": 3.92, "grad_norm": 0.6952515244483948, "learning_rate": 5.515728625388603e-06, "loss": 1.6481, "step": 21920 }, { "epoch": 3.92, "grad_norm": 0.6888887286186218, "learning_rate": 5.506922159256933e-06, "loss": 1.9352, "step": 21925 }, { "epoch": 3.93, "grad_norm": 1.0823732614517212, "learning_rate": 5.498121858767036e-06, "loss": 1.7055, "step": 21930 }, { "epoch": 3.93, "grad_norm": 1.0809731483459473, "learning_rate": 5.489327726702445e-06, "loss": 2.1196, "step": 21935 }, { "epoch": 3.93, "grad_norm": 2.2783148288726807, "learning_rate": 5.480539765844725e-06, "loss": 1.6809, "step": 21940 }, { "epoch": 3.93, "grad_norm": 1.1743180751800537, "learning_rate": 5.471757978973496e-06, "loss": 1.5346, "step": 21945 }, { "epoch": 3.93, "grad_norm": 0.7503143548965454, "learning_rate": 5.462982368866435e-06, "loss": 1.7871, "step": 21950 }, { "epoch": 3.93, "grad_norm": 0.5664401650428772, "learning_rate": 5.454212938299258e-06, "loss": 1.9305, "step": 21955 }, { "epoch": 3.93, "grad_norm": 0.5282362103462219, "learning_rate": 5.445449690045725e-06, "loss": 2.0377, "step": 21960 }, { "epoch": 3.93, "grad_norm": 0.9779241681098938, "learning_rate": 5.436692626877648e-06, "loss": 1.4596, "step": 21965 }, { "epoch": 3.93, "grad_norm": 1.3879930973052979, "learning_rate": 5.427941751564886e-06, "loss": 1.8408, "step": 21970 }, { "epoch": 3.93, "grad_norm": 0.5937609672546387, "learning_rate": 5.419197066875312e-06, "loss": 1.9729, "step": 21975 }, { "epoch": 3.93, "grad_norm": 1.0135959386825562, "learning_rate": 5.41045857557487e-06, "loss": 1.7513, "step": 21980 }, { "epoch": 3.94, "grad_norm": 0.7260029315948486, "learning_rate": 5.401726280427541e-06, "loss": 1.919, "step": 21985 }, { "epoch": 3.94, "grad_norm": 0.8399494290351868, "learning_rate": 5.393000184195343e-06, "loss": 1.6943, "step": 21990 }, { "epoch": 3.94, "grad_norm": 1.7650916576385498, "learning_rate": 5.384280289638322e-06, "loss": 1.5243, "step": 21995 }, { "epoch": 3.94, "grad_norm": 0.5501816868782043, "learning_rate": 5.375566599514581e-06, "loss": 2.116, "step": 22000 }, { "epoch": 3.94, "grad_norm": 0.9165323376655579, "learning_rate": 5.366859116580244e-06, "loss": 1.7613, "step": 22005 }, { "epoch": 3.94, "grad_norm": 0.5392761826515198, "learning_rate": 5.358157843589484e-06, "loss": 1.7364, "step": 22010 }, { "epoch": 3.94, "grad_norm": 0.6568076014518738, "learning_rate": 5.349462783294504e-06, "loss": 1.7346, "step": 22015 }, { "epoch": 3.94, "grad_norm": 1.0636271238327026, "learning_rate": 5.340773938445545e-06, "loss": 1.8723, "step": 22020 }, { "epoch": 3.94, "grad_norm": 0.8600271344184875, "learning_rate": 5.332091311790885e-06, "loss": 1.5156, "step": 22025 }, { "epoch": 3.94, "grad_norm": 0.602660596370697, "learning_rate": 5.323414906076821e-06, "loss": 1.7696, "step": 22030 }, { "epoch": 3.94, "grad_norm": 0.6825761198997498, "learning_rate": 5.314744724047702e-06, "loss": 1.6636, "step": 22035 }, { "epoch": 3.95, "grad_norm": 0.6560852527618408, "learning_rate": 5.306080768445881e-06, "loss": 1.8196, "step": 22040 }, { "epoch": 3.95, "grad_norm": 0.8872010111808777, "learning_rate": 5.297423042011774e-06, "loss": 1.9585, "step": 22045 }, { "epoch": 3.95, "grad_norm": 0.6532019972801208, "learning_rate": 5.288771547483806e-06, "loss": 1.7353, "step": 22050 }, { "epoch": 3.95, "grad_norm": 1.1290751695632935, "learning_rate": 5.280126287598444e-06, "loss": 1.617, "step": 22055 }, { "epoch": 3.95, "grad_norm": 0.7250970602035522, "learning_rate": 5.271487265090163e-06, "loss": 1.6928, "step": 22060 }, { "epoch": 3.95, "grad_norm": 0.8637431859970093, "learning_rate": 5.2628544826914846e-06, "loss": 1.5799, "step": 22065 }, { "epoch": 3.95, "grad_norm": 3.6587893962860107, "learning_rate": 5.254227943132948e-06, "loss": 1.6546, "step": 22070 }, { "epoch": 3.95, "grad_norm": 0.8665122985839844, "learning_rate": 5.245607649143119e-06, "loss": 1.8547, "step": 22075 }, { "epoch": 3.95, "grad_norm": 0.883367657661438, "learning_rate": 5.236993603448595e-06, "loss": 1.7483, "step": 22080 }, { "epoch": 3.95, "grad_norm": 2.016577959060669, "learning_rate": 5.2283858087739806e-06, "loss": 1.5007, "step": 22085 }, { "epoch": 3.95, "grad_norm": 0.9476630687713623, "learning_rate": 5.2197842678419204e-06, "loss": 1.8659, "step": 22090 }, { "epoch": 3.95, "grad_norm": 1.0685213804244995, "learning_rate": 5.211188983373067e-06, "loss": 1.8435, "step": 22095 }, { "epoch": 3.96, "grad_norm": 1.0245375633239746, "learning_rate": 5.202599958086102e-06, "loss": 1.6491, "step": 22100 }, { "epoch": 3.96, "grad_norm": 1.622461199760437, "learning_rate": 5.194017194697728e-06, "loss": 1.7717, "step": 22105 }, { "epoch": 3.96, "grad_norm": 0.615947961807251, "learning_rate": 5.185440695922664e-06, "loss": 1.9252, "step": 22110 }, { "epoch": 3.96, "grad_norm": 0.5909544229507446, "learning_rate": 5.176870464473657e-06, "loss": 2.1178, "step": 22115 }, { "epoch": 3.96, "grad_norm": 0.8277536034584045, "learning_rate": 5.168306503061446e-06, "loss": 1.8633, "step": 22120 }, { "epoch": 3.96, "grad_norm": 0.8978816270828247, "learning_rate": 5.159748814394813e-06, "loss": 1.622, "step": 22125 }, { "epoch": 3.96, "grad_norm": 0.6580782532691956, "learning_rate": 5.151197401180552e-06, "loss": 1.6254, "step": 22130 }, { "epoch": 3.96, "grad_norm": 0.612777590751648, "learning_rate": 5.142652266123454e-06, "loss": 1.7793, "step": 22135 }, { "epoch": 3.96, "grad_norm": 0.7944098114967346, "learning_rate": 5.1341134119263476e-06, "loss": 1.5077, "step": 22140 }, { "epoch": 3.96, "grad_norm": 0.791584849357605, "learning_rate": 5.125580841290062e-06, "loss": 1.7463, "step": 22145 }, { "epoch": 3.96, "grad_norm": 1.069318175315857, "learning_rate": 5.1170545569134365e-06, "loss": 1.7364, "step": 22150 }, { "epoch": 3.97, "grad_norm": 0.9720897078514099, "learning_rate": 5.108534561493328e-06, "loss": 1.8411, "step": 22155 }, { "epoch": 3.97, "grad_norm": 0.4824497699737549, "learning_rate": 5.100020857724605e-06, "loss": 1.7391, "step": 22160 }, { "epoch": 3.97, "grad_norm": 1.237189531326294, "learning_rate": 5.091513448300144e-06, "loss": 1.7938, "step": 22165 }, { "epoch": 3.97, "grad_norm": 0.8964689373970032, "learning_rate": 5.083012335910828e-06, "loss": 1.8008, "step": 22170 }, { "epoch": 3.97, "grad_norm": 0.6493983864784241, "learning_rate": 5.074517523245561e-06, "loss": 2.0167, "step": 22175 }, { "epoch": 3.97, "grad_norm": 1.8859643936157227, "learning_rate": 5.066029012991236e-06, "loss": 1.8854, "step": 22180 }, { "epoch": 3.97, "grad_norm": 0.8033244609832764, "learning_rate": 5.05754680783275e-06, "loss": 1.9866, "step": 22185 }, { "epoch": 3.97, "grad_norm": 1.5448501110076904, "learning_rate": 5.04907091045303e-06, "loss": 1.7147, "step": 22190 }, { "epoch": 3.97, "grad_norm": 2.1923139095306396, "learning_rate": 5.04060132353299e-06, "loss": 1.752, "step": 22195 }, { "epoch": 3.97, "grad_norm": 0.8507521152496338, "learning_rate": 5.03213804975155e-06, "loss": 1.8768, "step": 22200 }, { "epoch": 3.97, "grad_norm": 0.5235335826873779, "learning_rate": 5.0236810917856485e-06, "loss": 1.9388, "step": 22205 }, { "epoch": 3.98, "grad_norm": 0.9762120842933655, "learning_rate": 5.015230452310193e-06, "loss": 1.7633, "step": 22210 }, { "epoch": 3.98, "grad_norm": 0.9791060090065002, "learning_rate": 5.006786133998126e-06, "loss": 1.5675, "step": 22215 }, { "epoch": 3.98, "grad_norm": 1.307426929473877, "learning_rate": 4.998348139520378e-06, "loss": 1.5626, "step": 22220 }, { "epoch": 3.98, "grad_norm": 2.5995423793792725, "learning_rate": 4.989916471545869e-06, "loss": 1.8907, "step": 22225 }, { "epoch": 3.98, "grad_norm": 1.0652718544006348, "learning_rate": 4.981491132741531e-06, "loss": 2.1655, "step": 22230 }, { "epoch": 3.98, "grad_norm": 0.8676015734672546, "learning_rate": 4.973072125772302e-06, "loss": 1.9071, "step": 22235 }, { "epoch": 3.98, "grad_norm": 0.7337488532066345, "learning_rate": 4.9646594533010875e-06, "loss": 1.6795, "step": 22240 }, { "epoch": 3.98, "grad_norm": 0.6141374111175537, "learning_rate": 4.956253117988816e-06, "loss": 2.0898, "step": 22245 }, { "epoch": 3.98, "grad_norm": 0.5068919062614441, "learning_rate": 4.947853122494406e-06, "loss": 1.964, "step": 22250 }, { "epoch": 3.98, "grad_norm": 1.0111421346664429, "learning_rate": 4.939459469474764e-06, "loss": 1.8229, "step": 22255 }, { "epoch": 3.98, "grad_norm": 0.9964427947998047, "learning_rate": 4.931072161584802e-06, "loss": 1.886, "step": 22260 }, { "epoch": 3.99, "grad_norm": 0.7016617059707642, "learning_rate": 4.922691201477406e-06, "loss": 1.7212, "step": 22265 }, { "epoch": 3.99, "grad_norm": 0.8478853106498718, "learning_rate": 4.914316591803475e-06, "loss": 1.7993, "step": 22270 }, { "epoch": 3.99, "grad_norm": 0.7734459042549133, "learning_rate": 4.905948335211882e-06, "loss": 2.101, "step": 22275 }, { "epoch": 3.99, "grad_norm": 0.6338139176368713, "learning_rate": 4.897586434349502e-06, "loss": 1.8311, "step": 22280 }, { "epoch": 3.99, "grad_norm": 2.914543867111206, "learning_rate": 4.889230891861196e-06, "loss": 1.9766, "step": 22285 }, { "epoch": 3.99, "grad_norm": 0.9734477400779724, "learning_rate": 4.880881710389817e-06, "loss": 1.8301, "step": 22290 }, { "epoch": 3.99, "grad_norm": 1.7465955018997192, "learning_rate": 4.8725388925762064e-06, "loss": 1.8134, "step": 22295 }, { "epoch": 3.99, "grad_norm": 0.780785322189331, "learning_rate": 4.864202441059176e-06, "loss": 1.8292, "step": 22300 }, { "epoch": 3.99, "grad_norm": 0.9263495206832886, "learning_rate": 4.855872358475547e-06, "loss": 1.8219, "step": 22305 }, { "epoch": 3.99, "grad_norm": 0.8709249496459961, "learning_rate": 4.847548647460112e-06, "loss": 1.8204, "step": 22310 }, { "epoch": 3.99, "grad_norm": 0.9024534821510315, "learning_rate": 4.839231310645664e-06, "loss": 1.7256, "step": 22315 }, { "epoch": 4.0, "grad_norm": 1.1584830284118652, "learning_rate": 4.830920350662954e-06, "loss": 1.6609, "step": 22320 }, { "epoch": 4.0, "grad_norm": 0.5752809643745422, "learning_rate": 4.822615770140745e-06, "loss": 1.6129, "step": 22325 }, { "epoch": 4.0, "grad_norm": 1.2050139904022217, "learning_rate": 4.8143175717057505e-06, "loss": 1.5614, "step": 22330 }, { "epoch": 4.0, "grad_norm": 0.9474218487739563, "learning_rate": 4.806025757982693e-06, "loss": 1.9284, "step": 22335 }, { "epoch": 4.0, "grad_norm": 0.6500478982925415, "learning_rate": 4.797740331594264e-06, "loss": 1.9486, "step": 22340 }, { "epoch": 4.0, "grad_norm": 1.0756356716156006, "learning_rate": 4.789461295161138e-06, "loss": 1.9763, "step": 22345 }, { "epoch": 4.0, "grad_norm": 0.6653231382369995, "learning_rate": 4.781188651301969e-06, "loss": 2.0301, "step": 22350 }, { "epoch": 4.0, "grad_norm": 0.7933474779129028, "learning_rate": 4.772922402633373e-06, "loss": 1.9962, "step": 22355 }, { "epoch": 4.0, "grad_norm": 2.7801687717437744, "learning_rate": 4.764662551769975e-06, "loss": 1.9394, "step": 22360 }, { "epoch": 4.0, "grad_norm": 0.8057134747505188, "learning_rate": 4.756409101324338e-06, "loss": 1.7502, "step": 22365 }, { "epoch": 4.0, "grad_norm": 2.9395413398742676, "learning_rate": 4.748162053907035e-06, "loss": 1.8419, "step": 22370 }, { "epoch": 4.01, "grad_norm": 2.3610217571258545, "learning_rate": 4.739921412126591e-06, "loss": 1.9091, "step": 22375 }, { "epoch": 4.01, "grad_norm": 0.6767581105232239, "learning_rate": 4.731687178589517e-06, "loss": 1.7989, "step": 22380 }, { "epoch": 4.01, "grad_norm": 0.8745439052581787, "learning_rate": 4.7234593559003e-06, "loss": 1.8339, "step": 22385 }, { "epoch": 4.01, "grad_norm": 0.7209843993186951, "learning_rate": 4.715237946661377e-06, "loss": 1.8103, "step": 22390 }, { "epoch": 4.01, "grad_norm": 2.5547735691070557, "learning_rate": 4.707022953473184e-06, "loss": 2.2534, "step": 22395 }, { "epoch": 4.01, "grad_norm": 0.7963310480117798, "learning_rate": 4.698814378934108e-06, "loss": 1.888, "step": 22400 }, { "epoch": 4.01, "grad_norm": 1.050917148590088, "learning_rate": 4.690612225640517e-06, "loss": 1.9726, "step": 22405 }, { "epoch": 4.01, "grad_norm": 0.7625211477279663, "learning_rate": 4.68241649618675e-06, "loss": 1.9173, "step": 22410 }, { "epoch": 4.01, "grad_norm": 0.531043291091919, "learning_rate": 4.674227193165107e-06, "loss": 2.0842, "step": 22415 }, { "epoch": 4.01, "grad_norm": 0.671322226524353, "learning_rate": 4.666044319165841e-06, "loss": 1.6739, "step": 22420 }, { "epoch": 4.01, "grad_norm": 0.8251287341117859, "learning_rate": 4.6578678767771975e-06, "loss": 1.9518, "step": 22425 }, { "epoch": 4.01, "grad_norm": 1.6571036577224731, "learning_rate": 4.6496978685853786e-06, "loss": 1.6041, "step": 22430 }, { "epoch": 4.02, "grad_norm": 0.7980236411094666, "learning_rate": 4.641534297174549e-06, "loss": 1.4048, "step": 22435 }, { "epoch": 4.02, "grad_norm": 1.075265884399414, "learning_rate": 4.633377165126848e-06, "loss": 1.9821, "step": 22440 }, { "epoch": 4.02, "grad_norm": 1.2659225463867188, "learning_rate": 4.625226475022351e-06, "loss": 1.6767, "step": 22445 }, { "epoch": 4.02, "grad_norm": 3.332648754119873, "learning_rate": 4.6170822294391205e-06, "loss": 1.6765, "step": 22450 }, { "epoch": 4.02, "grad_norm": 1.1672558784484863, "learning_rate": 4.608944430953185e-06, "loss": 1.6675, "step": 22455 }, { "epoch": 4.02, "grad_norm": 0.6216001510620117, "learning_rate": 4.6008130821385025e-06, "loss": 1.7697, "step": 22460 }, { "epoch": 4.02, "grad_norm": 0.9190794825553894, "learning_rate": 4.592688185567023e-06, "loss": 1.9444, "step": 22465 }, { "epoch": 4.02, "grad_norm": 0.6193316578865051, "learning_rate": 4.58456974380864e-06, "loss": 1.942, "step": 22470 }, { "epoch": 4.02, "grad_norm": 0.9209474921226501, "learning_rate": 4.576457759431216e-06, "loss": 1.99, "step": 22475 }, { "epoch": 4.02, "grad_norm": 0.9471963047981262, "learning_rate": 4.568352235000553e-06, "loss": 1.6965, "step": 22480 }, { "epoch": 4.02, "grad_norm": 1.2862259149551392, "learning_rate": 4.560253173080423e-06, "loss": 1.7552, "step": 22485 }, { "epoch": 4.03, "grad_norm": 0.8248801231384277, "learning_rate": 4.552160576232553e-06, "loss": 1.9429, "step": 22490 }, { "epoch": 4.03, "grad_norm": 0.5932010412216187, "learning_rate": 4.544074447016628e-06, "loss": 1.6949, "step": 22495 }, { "epoch": 4.03, "grad_norm": 1.4545879364013672, "learning_rate": 4.535994787990283e-06, "loss": 1.9368, "step": 22500 }, { "epoch": 4.03, "grad_norm": 0.8596073985099792, "learning_rate": 4.527921601709104e-06, "loss": 1.8986, "step": 22505 }, { "epoch": 4.03, "grad_norm": 1.0523921251296997, "learning_rate": 4.519854890726624e-06, "loss": 1.7516, "step": 22510 }, { "epoch": 4.03, "grad_norm": 0.9680181741714478, "learning_rate": 4.511794657594343e-06, "loss": 1.6559, "step": 22515 }, { "epoch": 4.03, "grad_norm": 1.0432225465774536, "learning_rate": 4.503740904861703e-06, "loss": 1.6328, "step": 22520 }, { "epoch": 4.03, "grad_norm": 1.3763519525527954, "learning_rate": 4.495693635076101e-06, "loss": 1.6849, "step": 22525 }, { "epoch": 4.03, "grad_norm": 0.7782979607582092, "learning_rate": 4.487652850782886e-06, "loss": 1.6029, "step": 22530 }, { "epoch": 4.03, "grad_norm": 0.760043203830719, "learning_rate": 4.479618554525339e-06, "loss": 1.9747, "step": 22535 }, { "epoch": 4.03, "grad_norm": 0.7995366454124451, "learning_rate": 4.471590748844703e-06, "loss": 1.8518, "step": 22540 }, { "epoch": 4.04, "grad_norm": 1.1295702457427979, "learning_rate": 4.4635694362801684e-06, "loss": 2.1671, "step": 22545 }, { "epoch": 4.04, "grad_norm": 0.6298494338989258, "learning_rate": 4.4555546193688735e-06, "loss": 1.894, "step": 22550 }, { "epoch": 4.04, "grad_norm": 0.6293065547943115, "learning_rate": 4.4475463006458855e-06, "loss": 1.6044, "step": 22555 }, { "epoch": 4.04, "grad_norm": 1.1865392923355103, "learning_rate": 4.4395444826442395e-06, "loss": 1.8025, "step": 22560 }, { "epoch": 4.04, "grad_norm": 0.8114657998085022, "learning_rate": 4.431549167894891e-06, "loss": 1.973, "step": 22565 }, { "epoch": 4.04, "grad_norm": 0.7298006415367126, "learning_rate": 4.423560358926756e-06, "loss": 1.809, "step": 22570 }, { "epoch": 4.04, "grad_norm": 1.246186375617981, "learning_rate": 4.415578058266687e-06, "loss": 1.733, "step": 22575 }, { "epoch": 4.04, "grad_norm": 0.7696637511253357, "learning_rate": 4.407602268439479e-06, "loss": 1.481, "step": 22580 }, { "epoch": 4.04, "grad_norm": 0.4412114918231964, "learning_rate": 4.3996329919678666e-06, "loss": 2.2101, "step": 22585 }, { "epoch": 4.04, "grad_norm": 0.5162980556488037, "learning_rate": 4.3916702313725276e-06, "loss": 2.1751, "step": 22590 }, { "epoch": 4.04, "grad_norm": 1.8025277853012085, "learning_rate": 4.3837139891720755e-06, "loss": 1.7814, "step": 22595 }, { "epoch": 4.05, "grad_norm": 0.36622488498687744, "learning_rate": 4.375764267883048e-06, "loss": 2.1449, "step": 22600 }, { "epoch": 4.05, "grad_norm": 1.1410011053085327, "learning_rate": 4.367821070019948e-06, "loss": 1.7696, "step": 22605 }, { "epoch": 4.05, "grad_norm": 0.6294861435890198, "learning_rate": 4.359884398095196e-06, "loss": 1.8894, "step": 22610 }, { "epoch": 4.05, "grad_norm": 1.1744203567504883, "learning_rate": 4.351954254619156e-06, "loss": 1.687, "step": 22615 }, { "epoch": 4.05, "grad_norm": 0.8127630949020386, "learning_rate": 4.344030642100133e-06, "loss": 1.8386, "step": 22620 }, { "epoch": 4.05, "grad_norm": 0.8806772828102112, "learning_rate": 4.336113563044339e-06, "loss": 1.6666, "step": 22625 }, { "epoch": 4.05, "grad_norm": 1.0629003047943115, "learning_rate": 4.328203019955951e-06, "loss": 1.8621, "step": 22630 }, { "epoch": 4.05, "grad_norm": 0.7076244354248047, "learning_rate": 4.320299015337065e-06, "loss": 1.7092, "step": 22635 }, { "epoch": 4.05, "grad_norm": 1.0784873962402344, "learning_rate": 4.312401551687714e-06, "loss": 1.6767, "step": 22640 }, { "epoch": 4.05, "grad_norm": 1.1845802068710327, "learning_rate": 4.304510631505848e-06, "loss": 1.7707, "step": 22645 }, { "epoch": 4.05, "grad_norm": 0.7437544465065002, "learning_rate": 4.296626257287367e-06, "loss": 1.8259, "step": 22650 }, { "epoch": 4.06, "grad_norm": 1.0520071983337402, "learning_rate": 4.288748431526082e-06, "loss": 1.9545, "step": 22655 }, { "epoch": 4.06, "grad_norm": 0.893604040145874, "learning_rate": 4.280877156713748e-06, "loss": 1.7078, "step": 22660 }, { "epoch": 4.06, "grad_norm": 3.4858219623565674, "learning_rate": 4.273012435340038e-06, "loss": 2.1413, "step": 22665 }, { "epoch": 4.06, "grad_norm": 0.628693163394928, "learning_rate": 4.265154269892557e-06, "loss": 1.5482, "step": 22670 }, { "epoch": 4.06, "grad_norm": 1.399335265159607, "learning_rate": 4.257302662856838e-06, "loss": 1.751, "step": 22675 }, { "epoch": 4.06, "grad_norm": 0.8125795722007751, "learning_rate": 4.24945761671634e-06, "loss": 1.9325, "step": 22680 }, { "epoch": 4.06, "grad_norm": 1.0702699422836304, "learning_rate": 4.241619133952432e-06, "loss": 1.8058, "step": 22685 }, { "epoch": 4.06, "grad_norm": 0.5640363097190857, "learning_rate": 4.23378721704443e-06, "loss": 1.8514, "step": 22690 }, { "epoch": 4.06, "grad_norm": 0.6807628273963928, "learning_rate": 4.225961868469553e-06, "loss": 1.6916, "step": 22695 }, { "epoch": 4.06, "grad_norm": 0.7918212413787842, "learning_rate": 4.218143090702953e-06, "loss": 1.8782, "step": 22700 }, { "epoch": 4.06, "grad_norm": 0.6629428863525391, "learning_rate": 4.210330886217706e-06, "loss": 2.0677, "step": 22705 }, { "epoch": 4.06, "grad_norm": 0.6261160373687744, "learning_rate": 4.202525257484807e-06, "loss": 1.8922, "step": 22710 }, { "epoch": 4.07, "grad_norm": 0.9577205181121826, "learning_rate": 4.194726206973157e-06, "loss": 1.6897, "step": 22715 }, { "epoch": 4.07, "grad_norm": 1.4098031520843506, "learning_rate": 4.186933737149598e-06, "loss": 1.9442, "step": 22720 }, { "epoch": 4.07, "grad_norm": 1.430624008178711, "learning_rate": 4.179147850478876e-06, "loss": 1.9234, "step": 22725 }, { "epoch": 4.07, "grad_norm": 1.2811557054519653, "learning_rate": 4.171368549423665e-06, "loss": 1.9003, "step": 22730 }, { "epoch": 4.07, "grad_norm": 0.9562197923660278, "learning_rate": 4.163595836444551e-06, "loss": 1.8844, "step": 22735 }, { "epoch": 4.07, "grad_norm": 0.8576022982597351, "learning_rate": 4.1558297140000314e-06, "loss": 1.6477, "step": 22740 }, { "epoch": 4.07, "grad_norm": 2.603815793991089, "learning_rate": 4.1480701845465205e-06, "loss": 1.988, "step": 22745 }, { "epoch": 4.07, "grad_norm": 0.5836585760116577, "learning_rate": 4.14031725053835e-06, "loss": 2.0429, "step": 22750 }, { "epoch": 4.07, "grad_norm": 1.4115853309631348, "learning_rate": 4.132570914427772e-06, "loss": 2.0125, "step": 22755 }, { "epoch": 4.07, "grad_norm": 0.5206339359283447, "learning_rate": 4.12483117866494e-06, "loss": 1.9237, "step": 22760 }, { "epoch": 4.07, "grad_norm": 1.0422269105911255, "learning_rate": 4.117098045697931e-06, "loss": 1.6165, "step": 22765 }, { "epoch": 4.08, "grad_norm": 1.0876985788345337, "learning_rate": 4.109371517972721e-06, "loss": 1.6538, "step": 22770 }, { "epoch": 4.08, "grad_norm": 0.7427853345870972, "learning_rate": 4.101651597933204e-06, "loss": 2.3134, "step": 22775 }, { "epoch": 4.08, "grad_norm": 1.1198269128799438, "learning_rate": 4.09393828802119e-06, "loss": 1.8009, "step": 22780 }, { "epoch": 4.08, "grad_norm": 0.47877076268196106, "learning_rate": 4.086231590676382e-06, "loss": 1.9703, "step": 22785 }, { "epoch": 4.08, "grad_norm": 0.6987916231155396, "learning_rate": 4.078531508336403e-06, "loss": 1.4546, "step": 22790 }, { "epoch": 4.08, "grad_norm": 0.5703626275062561, "learning_rate": 4.070838043436786e-06, "loss": 1.911, "step": 22795 }, { "epoch": 4.08, "grad_norm": 1.0218112468719482, "learning_rate": 4.063151198410969e-06, "loss": 1.8741, "step": 22800 }, { "epoch": 4.08, "grad_norm": 0.6620162725448608, "learning_rate": 4.055470975690284e-06, "loss": 1.8318, "step": 22805 }, { "epoch": 4.08, "grad_norm": 2.1791741847991943, "learning_rate": 4.047797377703985e-06, "loss": 1.6375, "step": 22810 }, { "epoch": 4.08, "grad_norm": 0.8899803757667542, "learning_rate": 4.040130406879219e-06, "loss": 1.7384, "step": 22815 }, { "epoch": 4.08, "grad_norm": 0.5828153491020203, "learning_rate": 4.032470065641047e-06, "loss": 1.7185, "step": 22820 }, { "epoch": 4.09, "grad_norm": 0.5616529583930969, "learning_rate": 4.024816356412431e-06, "loss": 1.7911, "step": 22825 }, { "epoch": 4.09, "grad_norm": 0.8453607559204102, "learning_rate": 4.017169281614225e-06, "loss": 1.6574, "step": 22830 }, { "epoch": 4.09, "grad_norm": 0.6679532527923584, "learning_rate": 4.009528843665186e-06, "loss": 1.7073, "step": 22835 }, { "epoch": 4.09, "grad_norm": 1.1486124992370605, "learning_rate": 4.001895044981982e-06, "loss": 1.7303, "step": 22840 }, { "epoch": 4.09, "grad_norm": 0.7910205721855164, "learning_rate": 3.99426788797918e-06, "loss": 1.9838, "step": 22845 }, { "epoch": 4.09, "grad_norm": 1.2929234504699707, "learning_rate": 3.986647375069241e-06, "loss": 1.8347, "step": 22850 }, { "epoch": 4.09, "grad_norm": 0.7869713306427002, "learning_rate": 3.9790335086625275e-06, "loss": 1.8313, "step": 22855 }, { "epoch": 4.09, "grad_norm": 1.0830061435699463, "learning_rate": 3.971426291167294e-06, "loss": 1.9421, "step": 22860 }, { "epoch": 4.09, "grad_norm": 1.3566019535064697, "learning_rate": 3.963825724989695e-06, "loss": 1.8847, "step": 22865 }, { "epoch": 4.09, "grad_norm": 0.9219987988471985, "learning_rate": 3.956231812533784e-06, "loss": 2.0614, "step": 22870 }, { "epoch": 4.09, "grad_norm": 1.1585674285888672, "learning_rate": 3.948644556201517e-06, "loss": 2.011, "step": 22875 }, { "epoch": 4.1, "grad_norm": 1.1043351888656616, "learning_rate": 3.941063958392721e-06, "loss": 2.111, "step": 22880 }, { "epoch": 4.1, "grad_norm": 0.9520683884620667, "learning_rate": 3.93349002150514e-06, "loss": 1.7933, "step": 22885 }, { "epoch": 4.1, "grad_norm": 1.2550685405731201, "learning_rate": 3.925922747934407e-06, "loss": 1.7792, "step": 22890 }, { "epoch": 4.1, "grad_norm": 0.6732292175292969, "learning_rate": 3.918362140074031e-06, "loss": 1.9497, "step": 22895 }, { "epoch": 4.1, "grad_norm": 1.17646324634552, "learning_rate": 3.9108082003154325e-06, "loss": 1.7155, "step": 22900 }, { "epoch": 4.1, "grad_norm": 0.677506148815155, "learning_rate": 3.903260931047917e-06, "loss": 1.7703, "step": 22905 }, { "epoch": 4.1, "grad_norm": 1.1620838642120361, "learning_rate": 3.895720334658676e-06, "loss": 1.6102, "step": 22910 }, { "epoch": 4.1, "grad_norm": 0.9558453559875488, "learning_rate": 3.888186413532799e-06, "loss": 1.9861, "step": 22915 }, { "epoch": 4.1, "grad_norm": 0.9614786505699158, "learning_rate": 3.880659170053253e-06, "loss": 1.8458, "step": 22920 }, { "epoch": 4.1, "grad_norm": 0.40708935260772705, "learning_rate": 3.87313860660089e-06, "loss": 1.845, "step": 22925 }, { "epoch": 4.1, "grad_norm": 0.8563295006752014, "learning_rate": 3.865624725554465e-06, "loss": 1.598, "step": 22930 }, { "epoch": 4.11, "grad_norm": 2.2305939197540283, "learning_rate": 3.8581175292906084e-06, "loss": 1.9656, "step": 22935 }, { "epoch": 4.11, "grad_norm": 0.7936108112335205, "learning_rate": 3.850617020183845e-06, "loss": 1.9327, "step": 22940 }, { "epoch": 4.11, "grad_norm": 1.0209077596664429, "learning_rate": 3.843123200606577e-06, "loss": 2.0603, "step": 22945 }, { "epoch": 4.11, "grad_norm": 1.030806541442871, "learning_rate": 3.835636072929088e-06, "loss": 1.7855, "step": 22950 }, { "epoch": 4.11, "grad_norm": 0.9423354864120483, "learning_rate": 3.828155639519551e-06, "loss": 1.9576, "step": 22955 }, { "epoch": 4.11, "grad_norm": 1.0258960723876953, "learning_rate": 3.82068190274402e-06, "loss": 1.9906, "step": 22960 }, { "epoch": 4.11, "grad_norm": 0.5299312472343445, "learning_rate": 3.81321486496643e-06, "loss": 1.8934, "step": 22965 }, { "epoch": 4.11, "grad_norm": 1.3776746988296509, "learning_rate": 3.8057545285486053e-06, "loss": 1.9732, "step": 22970 }, { "epoch": 4.11, "grad_norm": 0.8569226264953613, "learning_rate": 3.798300895850232e-06, "loss": 1.8981, "step": 22975 }, { "epoch": 4.11, "grad_norm": 0.8730267882347107, "learning_rate": 3.790853969228897e-06, "loss": 2.07, "step": 22980 }, { "epoch": 4.11, "grad_norm": 0.7692422270774841, "learning_rate": 3.7834137510400465e-06, "loss": 1.5263, "step": 22985 }, { "epoch": 4.12, "grad_norm": 0.8507816195487976, "learning_rate": 3.7759802436370174e-06, "loss": 1.905, "step": 22990 }, { "epoch": 4.12, "grad_norm": 0.851302444934845, "learning_rate": 3.7685534493710235e-06, "loss": 1.7006, "step": 22995 }, { "epoch": 4.12, "grad_norm": 1.0782297849655151, "learning_rate": 3.7611333705911523e-06, "loss": 1.9236, "step": 23000 }, { "epoch": 4.12, "grad_norm": 0.5770434141159058, "learning_rate": 3.753720009644371e-06, "loss": 1.9372, "step": 23005 }, { "epoch": 4.12, "grad_norm": 0.7802814841270447, "learning_rate": 3.7463133688755066e-06, "loss": 1.8211, "step": 23010 }, { "epoch": 4.12, "grad_norm": 1.3625346422195435, "learning_rate": 3.7389134506272903e-06, "loss": 1.8262, "step": 23015 }, { "epoch": 4.12, "grad_norm": 0.8293248414993286, "learning_rate": 3.7315202572402896e-06, "loss": 1.7531, "step": 23020 }, { "epoch": 4.12, "grad_norm": 0.6201703548431396, "learning_rate": 3.724133791052975e-06, "loss": 1.9715, "step": 23025 }, { "epoch": 4.12, "grad_norm": 1.0974987745285034, "learning_rate": 3.7167540544016775e-06, "loss": 2.0178, "step": 23030 }, { "epoch": 4.12, "grad_norm": 0.7133528590202332, "learning_rate": 3.709381049620608e-06, "loss": 1.6524, "step": 23035 }, { "epoch": 4.12, "grad_norm": 0.9783563017845154, "learning_rate": 3.7020147790418263e-06, "loss": 2.1077, "step": 23040 }, { "epoch": 4.12, "grad_norm": 0.7139909863471985, "learning_rate": 3.6946552449952847e-06, "loss": 1.8681, "step": 23045 }, { "epoch": 4.13, "grad_norm": 2.149613380432129, "learning_rate": 3.6873024498087938e-06, "loss": 1.9458, "step": 23050 }, { "epoch": 4.13, "grad_norm": 1.5502854585647583, "learning_rate": 3.67995639580804e-06, "loss": 2.0566, "step": 23055 }, { "epoch": 4.13, "grad_norm": 0.7524001598358154, "learning_rate": 3.6726170853165763e-06, "loss": 1.4629, "step": 23060 }, { "epoch": 4.13, "grad_norm": 0.4955444633960724, "learning_rate": 3.665284520655807e-06, "loss": 1.5796, "step": 23065 }, { "epoch": 4.13, "grad_norm": 0.7171782851219177, "learning_rate": 3.6579587041450286e-06, "loss": 2.1074, "step": 23070 }, { "epoch": 4.13, "grad_norm": 0.5848670601844788, "learning_rate": 3.6506396381013804e-06, "loss": 1.8271, "step": 23075 }, { "epoch": 4.13, "grad_norm": 1.1107635498046875, "learning_rate": 3.6433273248398763e-06, "loss": 1.6248, "step": 23080 }, { "epoch": 4.13, "grad_norm": 1.556101679801941, "learning_rate": 3.6360217666733975e-06, "loss": 1.8316, "step": 23085 }, { "epoch": 4.13, "grad_norm": 1.174280047416687, "learning_rate": 3.628722965912687e-06, "loss": 1.6224, "step": 23090 }, { "epoch": 4.13, "grad_norm": 0.5544099807739258, "learning_rate": 3.621430924866348e-06, "loss": 2.1943, "step": 23095 }, { "epoch": 4.13, "grad_norm": 0.6023959517478943, "learning_rate": 3.6141456458408383e-06, "loss": 1.9006, "step": 23100 }, { "epoch": 4.14, "grad_norm": 0.9502812027931213, "learning_rate": 3.6068671311404927e-06, "loss": 1.6577, "step": 23105 }, { "epoch": 4.14, "grad_norm": 0.8921631574630737, "learning_rate": 3.5995953830675e-06, "loss": 1.1959, "step": 23110 }, { "epoch": 4.14, "grad_norm": 0.8794203996658325, "learning_rate": 3.592330403921898e-06, "loss": 1.6737, "step": 23115 }, { "epoch": 4.14, "grad_norm": 0.5600360035896301, "learning_rate": 3.585072196001599e-06, "loss": 2.1309, "step": 23120 }, { "epoch": 4.14, "grad_norm": 1.8456734418869019, "learning_rate": 3.5778207616023684e-06, "loss": 2.0503, "step": 23125 }, { "epoch": 4.14, "grad_norm": 0.7940518856048584, "learning_rate": 3.5705761030178224e-06, "loss": 1.8708, "step": 23130 }, { "epoch": 4.14, "grad_norm": 1.4641777276992798, "learning_rate": 3.563338222539442e-06, "loss": 1.7284, "step": 23135 }, { "epoch": 4.14, "grad_norm": 0.6386775374412537, "learning_rate": 3.5561071224565617e-06, "loss": 1.8958, "step": 23140 }, { "epoch": 4.14, "grad_norm": 0.9825034737586975, "learning_rate": 3.548882805056372e-06, "loss": 1.6577, "step": 23145 }, { "epoch": 4.14, "grad_norm": 0.915550172328949, "learning_rate": 3.541665272623923e-06, "loss": 1.6564, "step": 23150 }, { "epoch": 4.14, "grad_norm": 0.926216185092926, "learning_rate": 3.5344545274421077e-06, "loss": 1.9108, "step": 23155 }, { "epoch": 4.15, "grad_norm": 1.014319896697998, "learning_rate": 3.527250571791674e-06, "loss": 1.7595, "step": 23160 }, { "epoch": 4.15, "grad_norm": 0.9334921836853027, "learning_rate": 3.5200534079512295e-06, "loss": 1.7663, "step": 23165 }, { "epoch": 4.15, "grad_norm": 0.9318943619728088, "learning_rate": 3.512863038197231e-06, "loss": 2.109, "step": 23170 }, { "epoch": 4.15, "grad_norm": 0.7537411451339722, "learning_rate": 3.505679464803985e-06, "loss": 1.7133, "step": 23175 }, { "epoch": 4.15, "grad_norm": 0.6408186554908752, "learning_rate": 3.498502690043651e-06, "loss": 1.7261, "step": 23180 }, { "epoch": 4.15, "grad_norm": 0.8619161248207092, "learning_rate": 3.491332716186238e-06, "loss": 1.7955, "step": 23185 }, { "epoch": 4.15, "grad_norm": 0.9069615006446838, "learning_rate": 3.4841695454995936e-06, "loss": 1.9615, "step": 23190 }, { "epoch": 4.15, "grad_norm": 0.5599504113197327, "learning_rate": 3.4770131802494278e-06, "loss": 1.8248, "step": 23195 }, { "epoch": 4.15, "grad_norm": 1.420801043510437, "learning_rate": 3.469863622699293e-06, "loss": 1.7244, "step": 23200 }, { "epoch": 4.15, "grad_norm": 0.7983185052871704, "learning_rate": 3.462720875110584e-06, "loss": 2.053, "step": 23205 }, { "epoch": 4.15, "grad_norm": 0.9097558856010437, "learning_rate": 3.45558493974254e-06, "loss": 1.8446, "step": 23210 }, { "epoch": 4.16, "grad_norm": 0.5858725905418396, "learning_rate": 3.448455818852267e-06, "loss": 2.001, "step": 23215 }, { "epoch": 4.16, "grad_norm": 1.1941709518432617, "learning_rate": 3.4413335146946807e-06, "loss": 1.8848, "step": 23220 }, { "epoch": 4.16, "grad_norm": 0.5961235761642456, "learning_rate": 3.434218029522568e-06, "loss": 1.7481, "step": 23225 }, { "epoch": 4.16, "grad_norm": 1.2655341625213623, "learning_rate": 3.4271093655865476e-06, "loss": 1.5217, "step": 23230 }, { "epoch": 4.16, "grad_norm": 0.8132046461105347, "learning_rate": 3.4200075251350826e-06, "loss": 1.5498, "step": 23235 }, { "epoch": 4.16, "grad_norm": 0.48109278082847595, "learning_rate": 3.41291251041449e-06, "loss": 1.7054, "step": 23240 }, { "epoch": 4.16, "grad_norm": 0.6113245487213135, "learning_rate": 3.4058243236688993e-06, "loss": 1.7679, "step": 23245 }, { "epoch": 4.16, "grad_norm": 1.200701117515564, "learning_rate": 3.3987429671403088e-06, "loss": 1.8985, "step": 23250 }, { "epoch": 4.16, "grad_norm": 0.7015065550804138, "learning_rate": 3.391668443068535e-06, "loss": 1.9091, "step": 23255 }, { "epoch": 4.16, "grad_norm": 1.0896352529525757, "learning_rate": 3.3846007536912473e-06, "loss": 1.5577, "step": 23260 }, { "epoch": 4.16, "grad_norm": 1.282090425491333, "learning_rate": 3.3775399012439506e-06, "loss": 1.7736, "step": 23265 }, { "epoch": 4.17, "grad_norm": 0.48688602447509766, "learning_rate": 3.3704858879599843e-06, "loss": 1.6715, "step": 23270 }, { "epoch": 4.17, "grad_norm": 0.6023693084716797, "learning_rate": 3.3634387160705325e-06, "loss": 1.7387, "step": 23275 }, { "epoch": 4.17, "grad_norm": 1.431891679763794, "learning_rate": 3.356398387804596e-06, "loss": 1.6708, "step": 23280 }, { "epoch": 4.17, "grad_norm": 0.6251328587532043, "learning_rate": 3.3493649053890326e-06, "loss": 1.8878, "step": 23285 }, { "epoch": 4.17, "grad_norm": 1.2962218523025513, "learning_rate": 3.342338271048526e-06, "loss": 1.6701, "step": 23290 }, { "epoch": 4.17, "grad_norm": 0.9591766595840454, "learning_rate": 3.3353184870055954e-06, "loss": 1.78, "step": 23295 }, { "epoch": 4.17, "grad_norm": 1.259443759918213, "learning_rate": 3.3283055554805848e-06, "loss": 1.6922, "step": 23300 }, { "epoch": 4.17, "grad_norm": 1.0876199007034302, "learning_rate": 3.3212994786916836e-06, "loss": 1.9445, "step": 23305 }, { "epoch": 4.17, "grad_norm": 2.2998528480529785, "learning_rate": 3.314300258854902e-06, "loss": 1.7383, "step": 23310 }, { "epoch": 4.17, "grad_norm": 1.3563592433929443, "learning_rate": 3.307307898184089e-06, "loss": 1.8597, "step": 23315 }, { "epoch": 4.17, "grad_norm": 0.7580876350402832, "learning_rate": 3.3003223988909234e-06, "loss": 1.7143, "step": 23320 }, { "epoch": 4.18, "grad_norm": 0.7381443381309509, "learning_rate": 3.2933437631849085e-06, "loss": 1.8132, "step": 23325 }, { "epoch": 4.18, "grad_norm": 0.7825798988342285, "learning_rate": 3.2863719932733904e-06, "loss": 1.8894, "step": 23330 }, { "epoch": 4.18, "grad_norm": 0.96683269739151, "learning_rate": 3.2794070913615165e-06, "loss": 1.8698, "step": 23335 }, { "epoch": 4.18, "grad_norm": 0.9194793701171875, "learning_rate": 3.2724490596522937e-06, "loss": 1.8079, "step": 23340 }, { "epoch": 4.18, "grad_norm": 0.9263105988502502, "learning_rate": 3.2654979003465313e-06, "loss": 1.6324, "step": 23345 }, { "epoch": 4.18, "grad_norm": 0.6331471800804138, "learning_rate": 3.258553615642873e-06, "loss": 1.9431, "step": 23350 }, { "epoch": 4.18, "grad_norm": 0.6416884064674377, "learning_rate": 3.2516162077377953e-06, "loss": 1.865, "step": 23355 }, { "epoch": 4.18, "grad_norm": 0.7806147933006287, "learning_rate": 3.2446856788255958e-06, "loss": 1.8503, "step": 23360 }, { "epoch": 4.18, "grad_norm": 0.5219438076019287, "learning_rate": 3.2377620310983873e-06, "loss": 1.8821, "step": 23365 }, { "epoch": 4.18, "grad_norm": 0.8223024606704712, "learning_rate": 3.230845266746113e-06, "loss": 1.6572, "step": 23370 }, { "epoch": 4.18, "grad_norm": 0.9462054967880249, "learning_rate": 3.223935387956545e-06, "loss": 1.7214, "step": 23375 }, { "epoch": 4.18, "grad_norm": 0.538049042224884, "learning_rate": 3.217032396915265e-06, "loss": 1.751, "step": 23380 }, { "epoch": 4.19, "grad_norm": 0.767585813999176, "learning_rate": 3.210136295805685e-06, "loss": 1.6549, "step": 23385 }, { "epoch": 4.19, "grad_norm": 0.9927555322647095, "learning_rate": 3.2032470868090414e-06, "loss": 1.7279, "step": 23390 }, { "epoch": 4.19, "grad_norm": 0.8647646903991699, "learning_rate": 3.196364772104379e-06, "loss": 1.6213, "step": 23395 }, { "epoch": 4.19, "grad_norm": 0.8548038005828857, "learning_rate": 3.189489353868563e-06, "loss": 2.1221, "step": 23400 }, { "epoch": 4.19, "grad_norm": 0.7027987241744995, "learning_rate": 3.182620834276284e-06, "loss": 2.0151, "step": 23405 }, { "epoch": 4.19, "grad_norm": 0.8726295828819275, "learning_rate": 3.1757592155000486e-06, "loss": 1.8991, "step": 23410 }, { "epoch": 4.19, "grad_norm": 0.5377846360206604, "learning_rate": 3.168904499710182e-06, "loss": 2.0556, "step": 23415 }, { "epoch": 4.19, "grad_norm": 0.492355078458786, "learning_rate": 3.162056689074827e-06, "loss": 1.737, "step": 23420 }, { "epoch": 4.19, "grad_norm": 0.7007409930229187, "learning_rate": 3.1552157857599324e-06, "loss": 1.9326, "step": 23425 }, { "epoch": 4.19, "grad_norm": 0.9527152180671692, "learning_rate": 3.1483817919292726e-06, "loss": 2.0368, "step": 23430 }, { "epoch": 4.19, "grad_norm": 2.5235965251922607, "learning_rate": 3.1415547097444374e-06, "loss": 1.7703, "step": 23435 }, { "epoch": 4.2, "grad_norm": 0.6163821816444397, "learning_rate": 3.134734541364817e-06, "loss": 1.7116, "step": 23440 }, { "epoch": 4.2, "grad_norm": 0.6082310080528259, "learning_rate": 3.12792128894763e-06, "loss": 1.7943, "step": 23445 }, { "epoch": 4.2, "grad_norm": 0.7631220817565918, "learning_rate": 3.121114954647908e-06, "loss": 2.0513, "step": 23450 }, { "epoch": 4.2, "grad_norm": 0.9659873843193054, "learning_rate": 3.114315540618473e-06, "loss": 1.911, "step": 23455 }, { "epoch": 4.2, "grad_norm": 0.7039878368377686, "learning_rate": 3.107523049009983e-06, "loss": 1.9678, "step": 23460 }, { "epoch": 4.2, "grad_norm": 0.5036177039146423, "learning_rate": 3.1007374819708947e-06, "loss": 1.941, "step": 23465 }, { "epoch": 4.2, "grad_norm": 0.522300124168396, "learning_rate": 3.0939588416474762e-06, "loss": 1.882, "step": 23470 }, { "epoch": 4.2, "grad_norm": 1.4440438747406006, "learning_rate": 3.0871871301838053e-06, "loss": 1.4968, "step": 23475 }, { "epoch": 4.2, "grad_norm": 0.7592721581459045, "learning_rate": 3.0804223497217706e-06, "loss": 1.8058, "step": 23480 }, { "epoch": 4.2, "grad_norm": 1.0022876262664795, "learning_rate": 3.0736645024010664e-06, "loss": 1.7875, "step": 23485 }, { "epoch": 4.2, "grad_norm": 0.6901012063026428, "learning_rate": 3.066913590359183e-06, "loss": 1.7997, "step": 23490 }, { "epoch": 4.21, "grad_norm": 0.9948673844337463, "learning_rate": 3.0601696157314315e-06, "loss": 1.9634, "step": 23495 }, { "epoch": 4.21, "grad_norm": 0.49334949254989624, "learning_rate": 3.0534325806509307e-06, "loss": 1.8824, "step": 23500 }, { "epoch": 4.21, "grad_norm": 0.7687468528747559, "learning_rate": 3.046702487248593e-06, "loss": 1.7471, "step": 23505 }, { "epoch": 4.21, "grad_norm": 0.5290055274963379, "learning_rate": 3.0399793376531484e-06, "loss": 1.7987, "step": 23510 }, { "epoch": 4.21, "grad_norm": 0.5497667789459229, "learning_rate": 3.03326313399111e-06, "loss": 1.8059, "step": 23515 }, { "epoch": 4.21, "grad_norm": 1.2975267171859741, "learning_rate": 3.0265538783868117e-06, "loss": 1.8914, "step": 23520 }, { "epoch": 4.21, "grad_norm": 1.4567995071411133, "learning_rate": 3.01985157296239e-06, "loss": 1.7871, "step": 23525 }, { "epoch": 4.21, "grad_norm": 1.1147876977920532, "learning_rate": 3.013156219837776e-06, "loss": 1.9195, "step": 23530 }, { "epoch": 4.21, "grad_norm": 0.48365309834480286, "learning_rate": 3.006467821130696e-06, "loss": 1.8373, "step": 23535 }, { "epoch": 4.21, "grad_norm": 0.6062785983085632, "learning_rate": 2.9997863789566953e-06, "loss": 1.726, "step": 23540 }, { "epoch": 4.21, "grad_norm": 1.1633416414260864, "learning_rate": 2.993111895429093e-06, "loss": 1.7636, "step": 23545 }, { "epoch": 4.22, "grad_norm": 0.9300481677055359, "learning_rate": 2.9864443726590335e-06, "loss": 1.6565, "step": 23550 }, { "epoch": 4.22, "grad_norm": 1.1455543041229248, "learning_rate": 2.9797838127554433e-06, "loss": 2.0796, "step": 23555 }, { "epoch": 4.22, "grad_norm": 0.33199450373649597, "learning_rate": 2.9731302178250543e-06, "loss": 2.0225, "step": 23560 }, { "epoch": 4.22, "grad_norm": 0.6419559717178345, "learning_rate": 2.966483589972394e-06, "loss": 1.9895, "step": 23565 }, { "epoch": 4.22, "grad_norm": 0.8773286938667297, "learning_rate": 2.9598439312997745e-06, "loss": 1.6463, "step": 23570 }, { "epoch": 4.22, "grad_norm": 1.4193726778030396, "learning_rate": 2.953211243907325e-06, "loss": 1.9775, "step": 23575 }, { "epoch": 4.22, "grad_norm": 0.9343559741973877, "learning_rate": 2.946585529892951e-06, "loss": 1.7366, "step": 23580 }, { "epoch": 4.22, "grad_norm": 0.9293035268783569, "learning_rate": 2.9399667913523582e-06, "loss": 1.8817, "step": 23585 }, { "epoch": 4.22, "grad_norm": 0.7766603827476501, "learning_rate": 2.933355030379048e-06, "loss": 1.904, "step": 23590 }, { "epoch": 4.22, "grad_norm": 0.7661356329917908, "learning_rate": 2.9267502490643194e-06, "loss": 1.5144, "step": 23595 }, { "epoch": 4.22, "grad_norm": 1.3593940734863281, "learning_rate": 2.92015244949726e-06, "loss": 1.9262, "step": 23600 }, { "epoch": 4.23, "grad_norm": 1.014451265335083, "learning_rate": 2.9135616337647386e-06, "loss": 1.6614, "step": 23605 }, { "epoch": 4.23, "grad_norm": 1.7844657897949219, "learning_rate": 2.906977803951427e-06, "loss": 1.8037, "step": 23610 }, { "epoch": 4.23, "grad_norm": 1.5160473585128784, "learning_rate": 2.9004009621397847e-06, "loss": 1.6912, "step": 23615 }, { "epoch": 4.23, "grad_norm": 1.6145546436309814, "learning_rate": 2.893831110410067e-06, "loss": 1.9369, "step": 23620 }, { "epoch": 4.23, "grad_norm": 1.6145838499069214, "learning_rate": 2.8872682508403e-06, "loss": 1.9497, "step": 23625 }, { "epoch": 4.23, "grad_norm": 2.104137659072876, "learning_rate": 2.880712385506318e-06, "loss": 1.916, "step": 23630 }, { "epoch": 4.23, "grad_norm": 0.9931949973106384, "learning_rate": 2.8741635164817315e-06, "loss": 1.6568, "step": 23635 }, { "epoch": 4.23, "grad_norm": 1.3967357873916626, "learning_rate": 2.8676216458379396e-06, "loss": 1.5115, "step": 23640 }, { "epoch": 4.23, "grad_norm": 0.5755307078361511, "learning_rate": 2.861086775644134e-06, "loss": 1.8759, "step": 23645 }, { "epoch": 4.23, "grad_norm": 0.9810478091239929, "learning_rate": 2.8545589079672865e-06, "loss": 1.6617, "step": 23650 }, { "epoch": 4.23, "grad_norm": 0.7798149585723877, "learning_rate": 2.848038044872159e-06, "loss": 1.6842, "step": 23655 }, { "epoch": 4.24, "grad_norm": 1.8288719654083252, "learning_rate": 2.8415241884212853e-06, "loss": 1.7013, "step": 23660 }, { "epoch": 4.24, "grad_norm": 1.104956030845642, "learning_rate": 2.8350173406749973e-06, "loss": 1.8724, "step": 23665 }, { "epoch": 4.24, "grad_norm": 1.367289423942566, "learning_rate": 2.8285175036914107e-06, "loss": 1.7069, "step": 23670 }, { "epoch": 4.24, "grad_norm": 1.1730290651321411, "learning_rate": 2.8220246795264057e-06, "loss": 1.851, "step": 23675 }, { "epoch": 4.24, "grad_norm": 1.0620871782302856, "learning_rate": 2.815538870233661e-06, "loss": 1.7259, "step": 23680 }, { "epoch": 4.24, "grad_norm": 1.1758960485458374, "learning_rate": 2.8090600778646362e-06, "loss": 1.7401, "step": 23685 }, { "epoch": 4.24, "grad_norm": 1.1776374578475952, "learning_rate": 2.8025883044685646e-06, "loss": 1.9266, "step": 23690 }, { "epoch": 4.24, "grad_norm": 0.8739143013954163, "learning_rate": 2.7961235520924584e-06, "loss": 1.9129, "step": 23695 }, { "epoch": 4.24, "grad_norm": 0.8561976552009583, "learning_rate": 2.789665822781115e-06, "loss": 2.0287, "step": 23700 }, { "epoch": 4.24, "grad_norm": 0.6630309224128723, "learning_rate": 2.7832151185771093e-06, "loss": 1.8712, "step": 23705 }, { "epoch": 4.24, "grad_norm": 0.8686838150024414, "learning_rate": 2.7767714415207896e-06, "loss": 1.9666, "step": 23710 }, { "epoch": 4.24, "grad_norm": 1.0738399028778076, "learning_rate": 2.770334793650292e-06, "loss": 1.7293, "step": 23715 }, { "epoch": 4.25, "grad_norm": 0.6418025493621826, "learning_rate": 2.763905177001519e-06, "loss": 1.5567, "step": 23720 }, { "epoch": 4.25, "grad_norm": 0.8317776322364807, "learning_rate": 2.757482593608143e-06, "loss": 2.0659, "step": 23725 }, { "epoch": 4.25, "grad_norm": 1.3214927911758423, "learning_rate": 2.751067045501629e-06, "loss": 1.9451, "step": 23730 }, { "epoch": 4.25, "grad_norm": 1.1789897680282593, "learning_rate": 2.7446585347112086e-06, "loss": 1.7314, "step": 23735 }, { "epoch": 4.25, "grad_norm": 0.8994229435920715, "learning_rate": 2.7382570632638854e-06, "loss": 1.8044, "step": 23740 }, { "epoch": 4.25, "grad_norm": 1.0152734518051147, "learning_rate": 2.7318626331844456e-06, "loss": 1.7334, "step": 23745 }, { "epoch": 4.25, "grad_norm": 1.6840885877609253, "learning_rate": 2.725475246495432e-06, "loss": 1.9423, "step": 23750 }, { "epoch": 4.25, "grad_norm": 1.69658362865448, "learning_rate": 2.7190949052171703e-06, "loss": 1.6838, "step": 23755 }, { "epoch": 4.25, "grad_norm": 0.9725009799003601, "learning_rate": 2.7127216113677635e-06, "loss": 1.9143, "step": 23760 }, { "epoch": 4.25, "grad_norm": 0.49594831466674805, "learning_rate": 2.7063553669630702e-06, "loss": 1.8105, "step": 23765 }, { "epoch": 4.25, "grad_norm": 1.0006866455078125, "learning_rate": 2.6999961740167305e-06, "loss": 1.8241, "step": 23770 }, { "epoch": 4.26, "grad_norm": 0.8839683532714844, "learning_rate": 2.6936440345401493e-06, "loss": 1.7172, "step": 23775 }, { "epoch": 4.26, "grad_norm": 0.9161496162414551, "learning_rate": 2.6872989505425105e-06, "loss": 1.8713, "step": 23780 }, { "epoch": 4.26, "grad_norm": 0.7400037050247192, "learning_rate": 2.6809609240307456e-06, "loss": 1.6108, "step": 23785 }, { "epoch": 4.26, "grad_norm": 1.0765010118484497, "learning_rate": 2.6746299570095722e-06, "loss": 2.0275, "step": 23790 }, { "epoch": 4.26, "grad_norm": 1.344102144241333, "learning_rate": 2.66830605148147e-06, "loss": 1.6618, "step": 23795 }, { "epoch": 4.26, "grad_norm": 2.3063645362854004, "learning_rate": 2.661989209446683e-06, "loss": 1.5832, "step": 23800 }, { "epoch": 4.26, "grad_norm": 0.9692860245704651, "learning_rate": 2.655679432903227e-06, "loss": 1.9494, "step": 23805 }, { "epoch": 4.26, "grad_norm": 1.3169384002685547, "learning_rate": 2.649376723846875e-06, "loss": 1.7188, "step": 23810 }, { "epoch": 4.26, "grad_norm": 0.763640820980072, "learning_rate": 2.6430810842711595e-06, "loss": 1.8531, "step": 23815 }, { "epoch": 4.26, "grad_norm": 0.6789270639419556, "learning_rate": 2.636792516167394e-06, "loss": 1.6805, "step": 23820 }, { "epoch": 4.26, "grad_norm": 2.9606266021728516, "learning_rate": 2.6305110215246455e-06, "loss": 1.6875, "step": 23825 }, { "epoch": 4.27, "grad_norm": 0.8681931495666504, "learning_rate": 2.624236602329744e-06, "loss": 1.8095, "step": 23830 }, { "epoch": 4.27, "grad_norm": 0.6462576985359192, "learning_rate": 2.6179692605672877e-06, "loss": 1.8016, "step": 23835 }, { "epoch": 4.27, "grad_norm": 0.38575172424316406, "learning_rate": 2.6117089982196197e-06, "loss": 1.9387, "step": 23840 }, { "epoch": 4.27, "grad_norm": 0.7060482501983643, "learning_rate": 2.6054558172668607e-06, "loss": 1.691, "step": 23845 }, { "epoch": 4.27, "grad_norm": 0.779434859752655, "learning_rate": 2.5992097196868847e-06, "loss": 1.6565, "step": 23850 }, { "epoch": 4.27, "grad_norm": 4.394678115844727, "learning_rate": 2.5929707074553363e-06, "loss": 1.7052, "step": 23855 }, { "epoch": 4.27, "grad_norm": 0.5779292583465576, "learning_rate": 2.5867387825455917e-06, "loss": 1.861, "step": 23860 }, { "epoch": 4.27, "grad_norm": 0.7862660884857178, "learning_rate": 2.580513946928817e-06, "loss": 1.8878, "step": 23865 }, { "epoch": 4.27, "grad_norm": 1.2498642206192017, "learning_rate": 2.5742962025739115e-06, "loss": 1.8267, "step": 23870 }, { "epoch": 4.27, "grad_norm": 0.6954331398010254, "learning_rate": 2.5680855514475466e-06, "loss": 1.8246, "step": 23875 }, { "epoch": 4.27, "grad_norm": 0.8826925754547119, "learning_rate": 2.5618819955141456e-06, "loss": 1.7513, "step": 23880 }, { "epoch": 4.28, "grad_norm": 1.086248517036438, "learning_rate": 2.5556855367358857e-06, "loss": 1.6789, "step": 23885 }, { "epoch": 4.28, "grad_norm": 0.739668071269989, "learning_rate": 2.549496177072702e-06, "loss": 1.6101, "step": 23890 }, { "epoch": 4.28, "grad_norm": 1.1162097454071045, "learning_rate": 2.5433139184822876e-06, "loss": 1.8692, "step": 23895 }, { "epoch": 4.28, "grad_norm": 0.627552330493927, "learning_rate": 2.53713876292008e-06, "loss": 1.9747, "step": 23900 }, { "epoch": 4.28, "grad_norm": 1.044128179550171, "learning_rate": 2.5309707123392727e-06, "loss": 1.7328, "step": 23905 }, { "epoch": 4.28, "grad_norm": 1.0667206048965454, "learning_rate": 2.524809768690814e-06, "loss": 1.8482, "step": 23910 }, { "epoch": 4.28, "grad_norm": 0.584099292755127, "learning_rate": 2.5186559339234085e-06, "loss": 1.7584, "step": 23915 }, { "epoch": 4.28, "grad_norm": 1.1192121505737305, "learning_rate": 2.5125092099835047e-06, "loss": 1.5993, "step": 23920 }, { "epoch": 4.28, "grad_norm": 0.9891035556793213, "learning_rate": 2.506369598815314e-06, "loss": 1.6044, "step": 23925 }, { "epoch": 4.28, "grad_norm": 0.9022369384765625, "learning_rate": 2.500237102360778e-06, "loss": 1.5102, "step": 23930 }, { "epoch": 4.28, "grad_norm": 1.2190988063812256, "learning_rate": 2.4941117225596054e-06, "loss": 1.8696, "step": 23935 }, { "epoch": 4.29, "grad_norm": 1.111484408378601, "learning_rate": 2.4879934613492444e-06, "loss": 1.9025, "step": 23940 }, { "epoch": 4.29, "grad_norm": 1.1540158987045288, "learning_rate": 2.4818823206649024e-06, "loss": 1.9573, "step": 23945 }, { "epoch": 4.29, "grad_norm": 0.7060664892196655, "learning_rate": 2.475778302439524e-06, "loss": 1.7662, "step": 23950 }, { "epoch": 4.29, "grad_norm": 1.312042474746704, "learning_rate": 2.469681408603802e-06, "loss": 1.7862, "step": 23955 }, { "epoch": 4.29, "grad_norm": 0.7066253423690796, "learning_rate": 2.463591641086177e-06, "loss": 1.8912, "step": 23960 }, { "epoch": 4.29, "grad_norm": 0.7877931594848633, "learning_rate": 2.4575090018128383e-06, "loss": 1.8976, "step": 23965 }, { "epoch": 4.29, "grad_norm": 1.185813307762146, "learning_rate": 2.4514334927077167e-06, "loss": 1.9366, "step": 23970 }, { "epoch": 4.29, "grad_norm": 1.511576533317566, "learning_rate": 2.445365115692494e-06, "loss": 1.7834, "step": 23975 }, { "epoch": 4.29, "grad_norm": 0.49797123670578003, "learning_rate": 2.4393038726865907e-06, "loss": 1.9477, "step": 23980 }, { "epoch": 4.29, "grad_norm": 0.5764663815498352, "learning_rate": 2.4332497656071764e-06, "loss": 1.9526, "step": 23985 }, { "epoch": 4.29, "grad_norm": 0.518380880355835, "learning_rate": 2.427202796369152e-06, "loss": 1.6421, "step": 23990 }, { "epoch": 4.29, "grad_norm": 0.8762568831443787, "learning_rate": 2.421162966885171e-06, "loss": 1.8422, "step": 23995 }, { "epoch": 4.3, "grad_norm": 0.7528052926063538, "learning_rate": 2.415130279065625e-06, "loss": 1.9284, "step": 24000 }, { "epoch": 4.3, "grad_norm": 1.1197984218597412, "learning_rate": 2.409104734818646e-06, "loss": 1.8608, "step": 24005 }, { "epoch": 4.3, "grad_norm": 0.6748790144920349, "learning_rate": 2.4030863360501087e-06, "loss": 1.6257, "step": 24010 }, { "epoch": 4.3, "grad_norm": 0.5878778100013733, "learning_rate": 2.397075084663636e-06, "loss": 1.884, "step": 24015 }, { "epoch": 4.3, "grad_norm": 0.6087291240692139, "learning_rate": 2.391070982560564e-06, "loss": 1.8711, "step": 24020 }, { "epoch": 4.3, "grad_norm": 0.7970364689826965, "learning_rate": 2.3850740316399946e-06, "loss": 2.0835, "step": 24025 }, { "epoch": 4.3, "grad_norm": 1.3959071636199951, "learning_rate": 2.3790842337987564e-06, "loss": 1.6191, "step": 24030 }, { "epoch": 4.3, "grad_norm": 0.6767655611038208, "learning_rate": 2.3731015909314176e-06, "loss": 1.6902, "step": 24035 }, { "epoch": 4.3, "grad_norm": 0.8441483378410339, "learning_rate": 2.3671261049302865e-06, "loss": 2.0472, "step": 24040 }, { "epoch": 4.3, "grad_norm": 1.3324369192123413, "learning_rate": 2.3611577776853966e-06, "loss": 1.7657, "step": 24045 }, { "epoch": 4.3, "grad_norm": 1.7804863452911377, "learning_rate": 2.3551966110845203e-06, "loss": 1.7996, "step": 24050 }, { "epoch": 4.31, "grad_norm": 1.5548887252807617, "learning_rate": 2.3492426070131747e-06, "loss": 1.9328, "step": 24055 }, { "epoch": 4.31, "grad_norm": 0.956676185131073, "learning_rate": 2.3432957673546087e-06, "loss": 1.9565, "step": 24060 }, { "epoch": 4.31, "grad_norm": 0.9713985919952393, "learning_rate": 2.3373560939897954e-06, "loss": 1.65, "step": 24065 }, { "epoch": 4.31, "grad_norm": 2.308432102203369, "learning_rate": 2.3314235887974547e-06, "loss": 1.9528, "step": 24070 }, { "epoch": 4.31, "grad_norm": 1.1004524230957031, "learning_rate": 2.3254982536540267e-06, "loss": 1.617, "step": 24075 }, { "epoch": 4.31, "grad_norm": 0.641258180141449, "learning_rate": 2.3195800904336874e-06, "loss": 1.6639, "step": 24080 }, { "epoch": 4.31, "grad_norm": 1.2109898328781128, "learning_rate": 2.3136691010083535e-06, "loss": 1.8781, "step": 24085 }, { "epoch": 4.31, "grad_norm": 0.9537338614463806, "learning_rate": 2.3077652872476624e-06, "loss": 1.925, "step": 24090 }, { "epoch": 4.31, "grad_norm": 0.7373508214950562, "learning_rate": 2.301868651018982e-06, "loss": 1.8247, "step": 24095 }, { "epoch": 4.31, "grad_norm": 1.0468422174453735, "learning_rate": 2.2959791941874143e-06, "loss": 1.6905, "step": 24100 }, { "epoch": 4.31, "grad_norm": 0.8555766940116882, "learning_rate": 2.2900969186157916e-06, "loss": 2.035, "step": 24105 }, { "epoch": 4.32, "grad_norm": 1.271920084953308, "learning_rate": 2.2842218261646677e-06, "loss": 1.887, "step": 24110 }, { "epoch": 4.32, "grad_norm": 1.4928789138793945, "learning_rate": 2.2783539186923274e-06, "loss": 1.9741, "step": 24115 }, { "epoch": 4.32, "grad_norm": 1.0468617677688599, "learning_rate": 2.272493198054787e-06, "loss": 1.6921, "step": 24120 }, { "epoch": 4.32, "grad_norm": 0.6943525075912476, "learning_rate": 2.2666396661057907e-06, "loss": 2.0084, "step": 24125 }, { "epoch": 4.32, "grad_norm": 0.5706409811973572, "learning_rate": 2.2607933246968027e-06, "loss": 2.0996, "step": 24130 }, { "epoch": 4.32, "grad_norm": 1.1674983501434326, "learning_rate": 2.2549541756770156e-06, "loss": 1.8062, "step": 24135 }, { "epoch": 4.32, "grad_norm": 1.248380422592163, "learning_rate": 2.2491222208933376e-06, "loss": 1.8164, "step": 24140 }, { "epoch": 4.32, "grad_norm": 1.2721612453460693, "learning_rate": 2.243297462190419e-06, "loss": 1.6068, "step": 24145 }, { "epoch": 4.32, "grad_norm": 1.1479939222335815, "learning_rate": 2.2374799014106246e-06, "loss": 1.8664, "step": 24150 }, { "epoch": 4.32, "grad_norm": 0.7115007638931274, "learning_rate": 2.2316695403940433e-06, "loss": 2.029, "step": 24155 }, { "epoch": 4.32, "grad_norm": 0.615996778011322, "learning_rate": 2.2258663809784892e-06, "loss": 1.8326, "step": 24160 }, { "epoch": 4.33, "grad_norm": 0.7172175049781799, "learning_rate": 2.2200704249994874e-06, "loss": 1.791, "step": 24165 }, { "epoch": 4.33, "grad_norm": 0.530174732208252, "learning_rate": 2.2142816742903e-06, "loss": 1.8528, "step": 24170 }, { "epoch": 4.33, "grad_norm": 0.49278607964515686, "learning_rate": 2.208500130681901e-06, "loss": 1.9395, "step": 24175 }, { "epoch": 4.33, "grad_norm": 0.946496307849884, "learning_rate": 2.2027257960029914e-06, "loss": 1.7902, "step": 24180 }, { "epoch": 4.33, "grad_norm": 0.7843234539031982, "learning_rate": 2.1969586720799852e-06, "loss": 1.9402, "step": 24185 }, { "epoch": 4.33, "grad_norm": 1.0197824239730835, "learning_rate": 2.1911987607370123e-06, "loss": 1.7697, "step": 24190 }, { "epoch": 4.33, "grad_norm": 0.8750340342521667, "learning_rate": 2.18544606379594e-06, "loss": 1.9873, "step": 24195 }, { "epoch": 4.33, "grad_norm": 1.0324968099594116, "learning_rate": 2.1797005830763246e-06, "loss": 1.8219, "step": 24200 }, { "epoch": 4.33, "grad_norm": 1.028013825416565, "learning_rate": 2.1739623203954674e-06, "loss": 1.7537, "step": 24205 }, { "epoch": 4.33, "grad_norm": 1.2253060340881348, "learning_rate": 2.1682312775683723e-06, "loss": 1.8392, "step": 24210 }, { "epoch": 4.33, "grad_norm": 3.8781888484954834, "learning_rate": 2.1625074564077612e-06, "loss": 1.7971, "step": 24215 }, { "epoch": 4.34, "grad_norm": 0.5331137776374817, "learning_rate": 2.156790858724078e-06, "loss": 1.9863, "step": 24220 }, { "epoch": 4.34, "grad_norm": 0.9256309270858765, "learning_rate": 2.1510814863254737e-06, "loss": 2.0353, "step": 24225 }, { "epoch": 4.34, "grad_norm": 0.932483971118927, "learning_rate": 2.1453793410178196e-06, "loss": 1.7526, "step": 24230 }, { "epoch": 4.34, "grad_norm": 0.6007688641548157, "learning_rate": 2.1396844246046903e-06, "loss": 1.6332, "step": 24235 }, { "epoch": 4.34, "grad_norm": 0.9822659492492676, "learning_rate": 2.1339967388873895e-06, "loss": 1.3974, "step": 24240 }, { "epoch": 4.34, "grad_norm": 0.8610334992408752, "learning_rate": 2.1283162856649273e-06, "loss": 2.0177, "step": 24245 }, { "epoch": 4.34, "grad_norm": 0.5514267086982727, "learning_rate": 2.1226430667340285e-06, "loss": 1.8909, "step": 24250 }, { "epoch": 4.34, "grad_norm": 0.7719696164131165, "learning_rate": 2.1169770838891155e-06, "loss": 1.6996, "step": 24255 }, { "epoch": 4.34, "grad_norm": 0.8033381700515747, "learning_rate": 2.111318338922344e-06, "loss": 1.7499, "step": 24260 }, { "epoch": 4.34, "grad_norm": 1.272058367729187, "learning_rate": 2.1056668336235622e-06, "loss": 1.4928, "step": 24265 }, { "epoch": 4.34, "grad_norm": 1.0982578992843628, "learning_rate": 2.100022569780341e-06, "loss": 1.8814, "step": 24270 }, { "epoch": 4.35, "grad_norm": 0.8227397799491882, "learning_rate": 2.09438554917796e-06, "loss": 1.7183, "step": 24275 }, { "epoch": 4.35, "grad_norm": 1.012026071548462, "learning_rate": 2.0887557735993908e-06, "loss": 1.7103, "step": 24280 }, { "epoch": 4.35, "grad_norm": 1.1738848686218262, "learning_rate": 2.0831332448253386e-06, "loss": 1.6326, "step": 24285 }, { "epoch": 4.35, "grad_norm": 0.9276083707809448, "learning_rate": 2.077517964634193e-06, "loss": 1.7646, "step": 24290 }, { "epoch": 4.35, "grad_norm": 3.0199804306030273, "learning_rate": 2.071909934802066e-06, "loss": 1.9786, "step": 24295 }, { "epoch": 4.35, "grad_norm": 0.6301084756851196, "learning_rate": 2.066309157102772e-06, "loss": 1.6944, "step": 24300 }, { "epoch": 4.35, "grad_norm": 1.1437808275222778, "learning_rate": 2.0607156333078332e-06, "loss": 1.9109, "step": 24305 }, { "epoch": 4.35, "grad_norm": 0.41809070110321045, "learning_rate": 2.0551293651864783e-06, "loss": 2.0125, "step": 24310 }, { "epoch": 4.35, "grad_norm": 3.662910223007202, "learning_rate": 2.0495503545056303e-06, "loss": 1.8321, "step": 24315 }, { "epoch": 4.35, "grad_norm": 1.1465100049972534, "learning_rate": 2.0439786030299306e-06, "loss": 1.7392, "step": 24320 }, { "epoch": 4.35, "grad_norm": 1.2465860843658447, "learning_rate": 2.038414112521722e-06, "loss": 1.7251, "step": 24325 }, { "epoch": 4.35, "grad_norm": 0.9465045928955078, "learning_rate": 2.032856884741041e-06, "loss": 1.8722, "step": 24330 }, { "epoch": 4.36, "grad_norm": 0.7800765633583069, "learning_rate": 2.027306921445632e-06, "loss": 1.719, "step": 24335 }, { "epoch": 4.36, "grad_norm": 0.5892726182937622, "learning_rate": 2.021764224390954e-06, "loss": 1.4895, "step": 24340 }, { "epoch": 4.36, "grad_norm": 1.069069504737854, "learning_rate": 2.016228795330144e-06, "loss": 1.7287, "step": 24345 }, { "epoch": 4.36, "grad_norm": 0.602766215801239, "learning_rate": 2.0107006360140597e-06, "loss": 1.8655, "step": 24350 }, { "epoch": 4.36, "grad_norm": 1.3404914140701294, "learning_rate": 2.005179748191249e-06, "loss": 1.8222, "step": 24355 }, { "epoch": 4.36, "grad_norm": 0.8859615325927734, "learning_rate": 1.9996661336079693e-06, "loss": 2.0234, "step": 24360 }, { "epoch": 4.36, "grad_norm": 0.8665515780448914, "learning_rate": 1.9941597940081687e-06, "loss": 1.6752, "step": 24365 }, { "epoch": 4.36, "grad_norm": 0.8328173160552979, "learning_rate": 1.988660731133499e-06, "loss": 1.7841, "step": 24370 }, { "epoch": 4.36, "grad_norm": 0.7565486431121826, "learning_rate": 1.9831689467233015e-06, "loss": 2.0957, "step": 24375 }, { "epoch": 4.36, "grad_norm": 1.5388751029968262, "learning_rate": 1.977684442514624e-06, "loss": 2.0002, "step": 24380 }, { "epoch": 4.36, "grad_norm": 1.155758261680603, "learning_rate": 1.9722072202422144e-06, "loss": 2.0851, "step": 24385 }, { "epoch": 4.37, "grad_norm": 0.5179358720779419, "learning_rate": 1.9667372816385114e-06, "loss": 2.1737, "step": 24390 }, { "epoch": 4.37, "grad_norm": 0.9456248879432678, "learning_rate": 1.9612746284336513e-06, "loss": 1.7682, "step": 24395 }, { "epoch": 4.37, "grad_norm": 1.9018527269363403, "learning_rate": 1.955819262355471e-06, "loss": 1.6936, "step": 24400 }, { "epoch": 4.37, "grad_norm": 0.984753429889679, "learning_rate": 1.9503711851294846e-06, "loss": 1.6365, "step": 24405 }, { "epoch": 4.37, "grad_norm": 0.8661744594573975, "learning_rate": 1.9449303984789253e-06, "loss": 1.7751, "step": 24410 }, { "epoch": 4.37, "grad_norm": 2.2209432125091553, "learning_rate": 1.9394969041247074e-06, "loss": 1.8097, "step": 24415 }, { "epoch": 4.37, "grad_norm": 0.5390074253082275, "learning_rate": 1.9340707037854316e-06, "loss": 1.8734, "step": 24420 }, { "epoch": 4.37, "grad_norm": 0.6612688899040222, "learning_rate": 1.9286517991774085e-06, "loss": 1.8623, "step": 24425 }, { "epoch": 4.37, "grad_norm": 1.4000182151794434, "learning_rate": 1.9232401920146306e-06, "loss": 1.5901, "step": 24430 }, { "epoch": 4.37, "grad_norm": 0.7332963943481445, "learning_rate": 1.9178358840087814e-06, "loss": 1.6914, "step": 24435 }, { "epoch": 4.37, "grad_norm": 1.1628941297531128, "learning_rate": 1.912438876869238e-06, "loss": 1.7781, "step": 24440 }, { "epoch": 4.38, "grad_norm": 2.224209785461426, "learning_rate": 1.9070491723030715e-06, "loss": 1.4185, "step": 24445 }, { "epoch": 4.38, "grad_norm": 0.7662902474403381, "learning_rate": 1.90166677201504e-06, "loss": 1.845, "step": 24450 }, { "epoch": 4.38, "grad_norm": 1.3199970722198486, "learning_rate": 1.8962916777075928e-06, "loss": 1.8531, "step": 24455 }, { "epoch": 4.38, "grad_norm": 0.7640072107315063, "learning_rate": 1.890923891080862e-06, "loss": 2.048, "step": 24460 }, { "epoch": 4.38, "grad_norm": 0.6150866746902466, "learning_rate": 1.8855634138326806e-06, "loss": 1.8992, "step": 24465 }, { "epoch": 4.38, "grad_norm": 1.9289946556091309, "learning_rate": 1.8802102476585537e-06, "loss": 1.9587, "step": 24470 }, { "epoch": 4.38, "grad_norm": 0.9716728925704956, "learning_rate": 1.874864394251688e-06, "loss": 1.6301, "step": 24475 }, { "epoch": 4.38, "grad_norm": 0.83192378282547, "learning_rate": 1.8695258553029699e-06, "loss": 1.9311, "step": 24480 }, { "epoch": 4.38, "grad_norm": 1.113206148147583, "learning_rate": 1.8641946325009735e-06, "loss": 1.419, "step": 24485 }, { "epoch": 4.38, "grad_norm": 0.6721107363700867, "learning_rate": 1.8588707275319667e-06, "loss": 1.9316, "step": 24490 }, { "epoch": 4.38, "grad_norm": 0.9275158643722534, "learning_rate": 1.8535541420798858e-06, "loss": 2.1295, "step": 24495 }, { "epoch": 4.39, "grad_norm": 0.5885508060455322, "learning_rate": 1.8482448778263662e-06, "loss": 1.882, "step": 24500 }, { "epoch": 4.39, "grad_norm": 0.7526519894599915, "learning_rate": 1.8429429364507228e-06, "loss": 1.9953, "step": 24505 }, { "epoch": 4.39, "grad_norm": 0.7567302584648132, "learning_rate": 1.837648319629956e-06, "loss": 1.769, "step": 24510 }, { "epoch": 4.39, "grad_norm": 0.6800612211227417, "learning_rate": 1.8323610290387454e-06, "loss": 1.8711, "step": 24515 }, { "epoch": 4.39, "grad_norm": 0.9701638221740723, "learning_rate": 1.827081066349459e-06, "loss": 1.8497, "step": 24520 }, { "epoch": 4.39, "grad_norm": 1.5889774560928345, "learning_rate": 1.821808433232139e-06, "loss": 2.0364, "step": 24525 }, { "epoch": 4.39, "grad_norm": 0.7154751420021057, "learning_rate": 1.8165431313545144e-06, "loss": 1.7296, "step": 24530 }, { "epoch": 4.39, "grad_norm": 1.3648035526275635, "learning_rate": 1.811285162382001e-06, "loss": 1.804, "step": 24535 }, { "epoch": 4.39, "grad_norm": 0.6310946941375732, "learning_rate": 1.8060345279776848e-06, "loss": 2.1313, "step": 24540 }, { "epoch": 4.39, "grad_norm": 0.9399888515472412, "learning_rate": 1.80079122980234e-06, "loss": 1.5419, "step": 24545 }, { "epoch": 4.39, "grad_norm": 0.7775284647941589, "learning_rate": 1.7955552695144123e-06, "loss": 2.019, "step": 24550 }, { "epoch": 4.4, "grad_norm": 2.3950979709625244, "learning_rate": 1.7903266487700382e-06, "loss": 2.064, "step": 24555 }, { "epoch": 4.4, "grad_norm": 0.8698521852493286, "learning_rate": 1.785105369223014e-06, "loss": 1.8614, "step": 24560 }, { "epoch": 4.4, "grad_norm": 0.7981832027435303, "learning_rate": 1.7798914325248328e-06, "loss": 1.7595, "step": 24565 }, { "epoch": 4.4, "grad_norm": 1.5428972244262695, "learning_rate": 1.7746848403246559e-06, "loss": 1.869, "step": 24570 }, { "epoch": 4.4, "grad_norm": 0.7430126667022705, "learning_rate": 1.7694855942693239e-06, "loss": 1.8618, "step": 24575 }, { "epoch": 4.4, "grad_norm": 0.7668565511703491, "learning_rate": 1.7642936960033578e-06, "loss": 1.753, "step": 24580 }, { "epoch": 4.4, "grad_norm": 2.907975196838379, "learning_rate": 1.7591091471689414e-06, "loss": 1.9419, "step": 24585 }, { "epoch": 4.4, "grad_norm": 1.0696461200714111, "learning_rate": 1.7539319494059458e-06, "loss": 1.8617, "step": 24590 }, { "epoch": 4.4, "grad_norm": 0.8900358080863953, "learning_rate": 1.748762104351917e-06, "loss": 1.8093, "step": 24595 }, { "epoch": 4.4, "grad_norm": 0.6597603559494019, "learning_rate": 1.743599613642069e-06, "loss": 1.7068, "step": 24600 }, { "epoch": 4.4, "grad_norm": 0.9883655309677124, "learning_rate": 1.7384444789092957e-06, "loss": 1.6025, "step": 24605 }, { "epoch": 4.41, "grad_norm": 4.697280406951904, "learning_rate": 1.7332967017841623e-06, "loss": 1.8796, "step": 24610 }, { "epoch": 4.41, "grad_norm": 1.5211069583892822, "learning_rate": 1.7281562838948966e-06, "loss": 1.7911, "step": 24615 }, { "epoch": 4.41, "grad_norm": 0.9767752885818481, "learning_rate": 1.7230232268674124e-06, "loss": 1.7474, "step": 24620 }, { "epoch": 4.41, "grad_norm": 1.0923303365707397, "learning_rate": 1.7178975323252937e-06, "loss": 1.6909, "step": 24625 }, { "epoch": 4.41, "grad_norm": 0.7364457845687866, "learning_rate": 1.7127792018897914e-06, "loss": 1.9563, "step": 24630 }, { "epoch": 4.41, "grad_norm": 0.9624817371368408, "learning_rate": 1.7076682371798298e-06, "loss": 1.9367, "step": 24635 }, { "epoch": 4.41, "grad_norm": 0.6353481411933899, "learning_rate": 1.7025646398119988e-06, "loss": 1.8782, "step": 24640 }, { "epoch": 4.41, "grad_norm": 1.050934910774231, "learning_rate": 1.6974684114005601e-06, "loss": 1.6196, "step": 24645 }, { "epoch": 4.41, "grad_norm": 1.2270129919052124, "learning_rate": 1.692379553557455e-06, "loss": 2.0497, "step": 24650 }, { "epoch": 4.41, "grad_norm": 0.5485097765922546, "learning_rate": 1.6872980678922734e-06, "loss": 1.781, "step": 24655 }, { "epoch": 4.41, "grad_norm": 0.6211299300193787, "learning_rate": 1.6822239560122882e-06, "loss": 1.8401, "step": 24660 }, { "epoch": 4.41, "grad_norm": 4.1435017585754395, "learning_rate": 1.6771572195224433e-06, "loss": 1.8793, "step": 24665 }, { "epoch": 4.42, "grad_norm": 0.7795052528381348, "learning_rate": 1.672097860025329e-06, "loss": 1.629, "step": 24670 }, { "epoch": 4.42, "grad_norm": 0.4488808512687683, "learning_rate": 1.6670458791212263e-06, "loss": 2.0304, "step": 24675 }, { "epoch": 4.42, "grad_norm": 0.94510817527771, "learning_rate": 1.6620012784080652e-06, "loss": 1.6173, "step": 24680 }, { "epoch": 4.42, "grad_norm": 0.4074093997478485, "learning_rate": 1.6569640594814528e-06, "loss": 2.0128, "step": 24685 }, { "epoch": 4.42, "grad_norm": 0.7733169794082642, "learning_rate": 1.6519342239346564e-06, "loss": 1.9417, "step": 24690 }, { "epoch": 4.42, "grad_norm": 0.7011056542396545, "learning_rate": 1.6469117733586087e-06, "loss": 2.0944, "step": 24695 }, { "epoch": 4.42, "grad_norm": 0.8170765042304993, "learning_rate": 1.6418967093419058e-06, "loss": 1.9794, "step": 24700 }, { "epoch": 4.42, "grad_norm": 0.5875701308250427, "learning_rate": 1.6368890334708037e-06, "loss": 1.9456, "step": 24705 }, { "epoch": 4.42, "grad_norm": 0.7417488694190979, "learning_rate": 1.6318887473292243e-06, "loss": 1.6503, "step": 24710 }, { "epoch": 4.42, "grad_norm": 1.339301347732544, "learning_rate": 1.626895852498761e-06, "loss": 1.7944, "step": 24715 }, { "epoch": 4.42, "grad_norm": 0.6533433198928833, "learning_rate": 1.6219103505586531e-06, "loss": 1.764, "step": 24720 }, { "epoch": 4.43, "grad_norm": 0.9360514283180237, "learning_rate": 1.6169322430858198e-06, "loss": 1.7368, "step": 24725 }, { "epoch": 4.43, "grad_norm": 0.906342625617981, "learning_rate": 1.6119615316548237e-06, "loss": 1.7317, "step": 24730 }, { "epoch": 4.43, "grad_norm": 2.1452555656433105, "learning_rate": 1.606998217837899e-06, "loss": 1.6536, "step": 24735 }, { "epoch": 4.43, "grad_norm": 0.49390387535095215, "learning_rate": 1.6020423032049342e-06, "loss": 1.8927, "step": 24740 }, { "epoch": 4.43, "grad_norm": 0.8980386257171631, "learning_rate": 1.5970937893234894e-06, "loss": 1.5554, "step": 24745 }, { "epoch": 4.43, "grad_norm": 0.8987436890602112, "learning_rate": 1.5921526777587625e-06, "loss": 1.7345, "step": 24750 }, { "epoch": 4.43, "grad_norm": 0.7175293564796448, "learning_rate": 1.5872189700736339e-06, "loss": 2.1268, "step": 24755 }, { "epoch": 4.43, "grad_norm": 1.050519585609436, "learning_rate": 1.5822926678286194e-06, "loss": 1.9497, "step": 24760 }, { "epoch": 4.43, "grad_norm": 0.6734400987625122, "learning_rate": 1.577373772581911e-06, "loss": 1.5229, "step": 24765 }, { "epoch": 4.43, "grad_norm": 1.3259496688842773, "learning_rate": 1.572462285889345e-06, "loss": 1.8825, "step": 24770 }, { "epoch": 4.43, "grad_norm": 1.473742127418518, "learning_rate": 1.5675582093044261e-06, "loss": 1.9713, "step": 24775 }, { "epoch": 4.44, "grad_norm": 1.0151005983352661, "learning_rate": 1.5626615443783077e-06, "loss": 1.524, "step": 24780 }, { "epoch": 4.44, "grad_norm": 0.7851746678352356, "learning_rate": 1.557772292659801e-06, "loss": 1.8783, "step": 24785 }, { "epoch": 4.44, "grad_norm": 3.6031758785247803, "learning_rate": 1.552890455695369e-06, "loss": 1.9658, "step": 24790 }, { "epoch": 4.44, "grad_norm": 1.0066990852355957, "learning_rate": 1.5480160350291317e-06, "loss": 1.7486, "step": 24795 }, { "epoch": 4.44, "grad_norm": 0.8470297455787659, "learning_rate": 1.543149032202862e-06, "loss": 1.7935, "step": 24800 }, { "epoch": 4.44, "grad_norm": 3.9266157150268555, "learning_rate": 1.538289448755989e-06, "loss": 1.6526, "step": 24805 }, { "epoch": 4.44, "grad_norm": 1.250511884689331, "learning_rate": 1.5334372862256002e-06, "loss": 1.6387, "step": 24810 }, { "epoch": 4.44, "grad_norm": 0.848745584487915, "learning_rate": 1.5285925461464263e-06, "loss": 2.0486, "step": 24815 }, { "epoch": 4.44, "grad_norm": 0.8530756831169128, "learning_rate": 1.5237552300508524e-06, "loss": 1.4405, "step": 24820 }, { "epoch": 4.44, "grad_norm": 0.5230215787887573, "learning_rate": 1.5189253394689156e-06, "loss": 1.7863, "step": 24825 }, { "epoch": 4.44, "grad_norm": 1.091163992881775, "learning_rate": 1.514102875928311e-06, "loss": 1.5996, "step": 24830 }, { "epoch": 4.45, "grad_norm": 1.1136764287948608, "learning_rate": 1.5092878409543764e-06, "loss": 1.8101, "step": 24835 }, { "epoch": 4.45, "grad_norm": 1.0806092023849487, "learning_rate": 1.5044802360700994e-06, "loss": 1.949, "step": 24840 }, { "epoch": 4.45, "grad_norm": 0.9689674973487854, "learning_rate": 1.49968006279613e-06, "loss": 1.7404, "step": 24845 }, { "epoch": 4.45, "grad_norm": 1.1383538246154785, "learning_rate": 1.4948873226507481e-06, "loss": 1.6446, "step": 24850 }, { "epoch": 4.45, "grad_norm": 0.7477939128875732, "learning_rate": 1.490102017149894e-06, "loss": 1.7101, "step": 24855 }, { "epoch": 4.45, "grad_norm": 0.6716863512992859, "learning_rate": 1.48532414780716e-06, "loss": 1.7832, "step": 24860 }, { "epoch": 4.45, "grad_norm": 1.1172704696655273, "learning_rate": 1.4805537161337813e-06, "loss": 1.6232, "step": 24865 }, { "epoch": 4.45, "grad_norm": 0.48017409443855286, "learning_rate": 1.4757907236386426e-06, "loss": 1.7984, "step": 24870 }, { "epoch": 4.45, "grad_norm": 0.6822175979614258, "learning_rate": 1.4710351718282667e-06, "loss": 1.745, "step": 24875 }, { "epoch": 4.45, "grad_norm": 0.6264699697494507, "learning_rate": 1.466287062206831e-06, "loss": 1.778, "step": 24880 }, { "epoch": 4.45, "grad_norm": 0.8009613752365112, "learning_rate": 1.4615463962761671e-06, "loss": 1.8883, "step": 24885 }, { "epoch": 4.46, "grad_norm": 26.099721908569336, "learning_rate": 1.4568131755357346e-06, "loss": 1.8559, "step": 24890 }, { "epoch": 4.46, "grad_norm": 1.0402318239212036, "learning_rate": 1.4520874014826463e-06, "loss": 1.7209, "step": 24895 }, { "epoch": 4.46, "grad_norm": 0.6136242151260376, "learning_rate": 1.4473690756116654e-06, "loss": 1.8066, "step": 24900 }, { "epoch": 4.46, "grad_norm": 1.0361982583999634, "learning_rate": 1.442658199415192e-06, "loss": 1.5658, "step": 24905 }, { "epoch": 4.46, "grad_norm": 0.9705806374549866, "learning_rate": 1.4379547743832683e-06, "loss": 1.9806, "step": 24910 }, { "epoch": 4.46, "grad_norm": 0.6670936346054077, "learning_rate": 1.433258802003587e-06, "loss": 1.9256, "step": 24915 }, { "epoch": 4.46, "grad_norm": 0.9053754806518555, "learning_rate": 1.4285702837614768e-06, "loss": 1.8113, "step": 24920 }, { "epoch": 4.46, "grad_norm": 0.8029524087905884, "learning_rate": 1.4238892211399152e-06, "loss": 1.9991, "step": 24925 }, { "epoch": 4.46, "grad_norm": 0.7751874327659607, "learning_rate": 1.4192156156195151e-06, "loss": 1.6467, "step": 24930 }, { "epoch": 4.46, "grad_norm": 0.59059077501297, "learning_rate": 1.4145494686785387e-06, "loss": 2.0298, "step": 24935 }, { "epoch": 4.46, "grad_norm": 0.6298719048500061, "learning_rate": 1.409890781792872e-06, "loss": 1.7394, "step": 24940 }, { "epoch": 4.47, "grad_norm": 1.1347723007202148, "learning_rate": 1.405239556436061e-06, "loss": 1.8038, "step": 24945 }, { "epoch": 4.47, "grad_norm": 1.0450183153152466, "learning_rate": 1.4005957940792846e-06, "loss": 1.9163, "step": 24950 }, { "epoch": 4.47, "grad_norm": 0.9296015501022339, "learning_rate": 1.395959496191357e-06, "loss": 1.8794, "step": 24955 }, { "epoch": 4.47, "grad_norm": 0.8542814254760742, "learning_rate": 1.3913306642387409e-06, "loss": 1.7646, "step": 24960 }, { "epoch": 4.47, "grad_norm": 1.308875560760498, "learning_rate": 1.3867092996855236e-06, "loss": 1.9903, "step": 24965 }, { "epoch": 4.47, "grad_norm": 0.5955923795700073, "learning_rate": 1.3820954039934413e-06, "loss": 1.9698, "step": 24970 }, { "epoch": 4.47, "grad_norm": 0.6191591024398804, "learning_rate": 1.3774889786218687e-06, "loss": 1.7368, "step": 24975 }, { "epoch": 4.47, "grad_norm": 0.9830487966537476, "learning_rate": 1.3728900250278065e-06, "loss": 1.802, "step": 24980 }, { "epoch": 4.47, "grad_norm": 1.0741496086120605, "learning_rate": 1.3682985446659025e-06, "loss": 1.7966, "step": 24985 }, { "epoch": 4.47, "grad_norm": 0.583555281162262, "learning_rate": 1.3637145389884394e-06, "loss": 1.7261, "step": 24990 }, { "epoch": 4.47, "grad_norm": 0.9671819806098938, "learning_rate": 1.359138009445335e-06, "loss": 1.8888, "step": 24995 }, { "epoch": 4.47, "grad_norm": 0.8640641570091248, "learning_rate": 1.3545689574841342e-06, "loss": 1.8415, "step": 25000 }, { "epoch": 4.48, "grad_norm": 0.8222974538803101, "learning_rate": 1.3500073845500284e-06, "loss": 1.9049, "step": 25005 }, { "epoch": 4.48, "grad_norm": 1.0911484956741333, "learning_rate": 1.345453292085841e-06, "loss": 1.9566, "step": 25010 }, { "epoch": 4.48, "grad_norm": 0.6996002793312073, "learning_rate": 1.3409066815320226e-06, "loss": 1.9323, "step": 25015 }, { "epoch": 4.48, "grad_norm": 0.5919438600540161, "learning_rate": 1.3363675543266674e-06, "loss": 1.8602, "step": 25020 }, { "epoch": 4.48, "grad_norm": 0.7225549221038818, "learning_rate": 1.3318359119054935e-06, "loss": 2.2012, "step": 25025 }, { "epoch": 4.48, "grad_norm": 1.0485141277313232, "learning_rate": 1.327311755701849e-06, "loss": 1.6628, "step": 25030 }, { "epoch": 4.48, "grad_norm": 0.6466124057769775, "learning_rate": 1.3227950871467304e-06, "loss": 1.8366, "step": 25035 }, { "epoch": 4.48, "grad_norm": 1.215564489364624, "learning_rate": 1.3182859076687481e-06, "loss": 1.8886, "step": 25040 }, { "epoch": 4.48, "grad_norm": 1.6709582805633545, "learning_rate": 1.313784218694153e-06, "loss": 1.6098, "step": 25045 }, { "epoch": 4.48, "grad_norm": 1.1631311178207397, "learning_rate": 1.3092900216468336e-06, "loss": 1.9456, "step": 25050 }, { "epoch": 4.48, "grad_norm": 0.9077101349830627, "learning_rate": 1.304803317948286e-06, "loss": 1.9834, "step": 25055 }, { "epoch": 4.49, "grad_norm": 1.2171112298965454, "learning_rate": 1.3003241090176587e-06, "loss": 1.7559, "step": 25060 }, { "epoch": 4.49, "grad_norm": 1.7720481157302856, "learning_rate": 1.2958523962717185e-06, "loss": 1.7203, "step": 25065 }, { "epoch": 4.49, "grad_norm": 0.6833968162536621, "learning_rate": 1.2913881811248696e-06, "loss": 1.8261, "step": 25070 }, { "epoch": 4.49, "grad_norm": 0.8987765312194824, "learning_rate": 1.28693146498913e-06, "loss": 1.977, "step": 25075 }, { "epoch": 4.49, "grad_norm": 0.8224658966064453, "learning_rate": 1.2824822492741584e-06, "loss": 1.7903, "step": 25080 }, { "epoch": 4.49, "grad_norm": 0.8352959752082825, "learning_rate": 1.2780405353872426e-06, "loss": 1.6669, "step": 25085 }, { "epoch": 4.49, "grad_norm": 1.408292293548584, "learning_rate": 1.273606324733284e-06, "loss": 1.5876, "step": 25090 }, { "epoch": 4.49, "grad_norm": 1.146064281463623, "learning_rate": 1.2691796187148247e-06, "loss": 1.7544, "step": 25095 }, { "epoch": 4.49, "grad_norm": 1.177251935005188, "learning_rate": 1.2647604187320223e-06, "loss": 1.918, "step": 25100 }, { "epoch": 4.49, "grad_norm": 0.5201444029808044, "learning_rate": 1.2603487261826724e-06, "loss": 2.0014, "step": 25105 }, { "epoch": 4.49, "grad_norm": 2.2395122051239014, "learning_rate": 1.2559445424621868e-06, "loss": 1.7281, "step": 25110 }, { "epoch": 4.5, "grad_norm": 0.5539356470108032, "learning_rate": 1.251547868963604e-06, "loss": 1.9028, "step": 25115 }, { "epoch": 4.5, "grad_norm": 0.8390756249427795, "learning_rate": 1.2471587070775837e-06, "loss": 1.9719, "step": 25120 }, { "epoch": 4.5, "grad_norm": 0.5405310392379761, "learning_rate": 1.2427770581924181e-06, "loss": 1.8543, "step": 25125 }, { "epoch": 4.5, "grad_norm": 2.3233208656311035, "learning_rate": 1.2384029236940181e-06, "loss": 1.5512, "step": 25130 }, { "epoch": 4.5, "grad_norm": 0.636073648929596, "learning_rate": 1.2340363049659159e-06, "loss": 1.9004, "step": 25135 }, { "epoch": 4.5, "grad_norm": 0.8926206827163696, "learning_rate": 1.2296772033892733e-06, "loss": 1.7828, "step": 25140 }, { "epoch": 4.5, "grad_norm": 0.6021214127540588, "learning_rate": 1.2253256203428653e-06, "loss": 2.0191, "step": 25145 }, { "epoch": 4.5, "grad_norm": 0.6334590911865234, "learning_rate": 1.2209815572030937e-06, "loss": 2.1132, "step": 25150 }, { "epoch": 4.5, "grad_norm": 7.690194606781006, "learning_rate": 1.216645015343984e-06, "loss": 1.5651, "step": 25155 }, { "epoch": 4.5, "grad_norm": 0.8695514798164368, "learning_rate": 1.212315996137181e-06, "loss": 1.8521, "step": 25160 }, { "epoch": 4.5, "grad_norm": 1.237356185913086, "learning_rate": 1.2079945009519478e-06, "loss": 1.638, "step": 25165 }, { "epoch": 4.51, "grad_norm": 0.9352409243583679, "learning_rate": 1.2036805311551714e-06, "loss": 1.7746, "step": 25170 }, { "epoch": 4.51, "grad_norm": 0.6192563772201538, "learning_rate": 1.1993740881113492e-06, "loss": 1.8845, "step": 25175 }, { "epoch": 4.51, "grad_norm": 0.8423218131065369, "learning_rate": 1.1950751731826083e-06, "loss": 1.7119, "step": 25180 }, { "epoch": 4.51, "grad_norm": 0.6161177158355713, "learning_rate": 1.1907837877286942e-06, "loss": 2.1457, "step": 25185 }, { "epoch": 4.51, "grad_norm": 0.7731286287307739, "learning_rate": 1.1864999331069687e-06, "loss": 1.849, "step": 25190 }, { "epoch": 4.51, "grad_norm": 1.3494420051574707, "learning_rate": 1.182223610672406e-06, "loss": 1.5065, "step": 25195 }, { "epoch": 4.51, "grad_norm": 0.9249375462532043, "learning_rate": 1.1779548217776077e-06, "loss": 1.7186, "step": 25200 }, { "epoch": 4.51, "grad_norm": 1.1438442468643188, "learning_rate": 1.1736935677727852e-06, "loss": 1.7934, "step": 25205 }, { "epoch": 4.51, "grad_norm": 1.2738542556762695, "learning_rate": 1.1694398500057714e-06, "loss": 1.6784, "step": 25210 }, { "epoch": 4.51, "grad_norm": 0.7526887059211731, "learning_rate": 1.1651936698220068e-06, "loss": 2.0072, "step": 25215 }, { "epoch": 4.51, "grad_norm": 3.0581088066101074, "learning_rate": 1.1609550285645588e-06, "loss": 1.598, "step": 25220 }, { "epoch": 4.52, "grad_norm": 1.558251976966858, "learning_rate": 1.156723927574105e-06, "loss": 1.9926, "step": 25225 }, { "epoch": 4.52, "grad_norm": 1.8919259309768677, "learning_rate": 1.1525003681889413e-06, "loss": 1.9012, "step": 25230 }, { "epoch": 4.52, "grad_norm": 0.8434092998504639, "learning_rate": 1.1482843517449688e-06, "loss": 2.1936, "step": 25235 }, { "epoch": 4.52, "grad_norm": 1.0325753688812256, "learning_rate": 1.144075879575715e-06, "loss": 1.9423, "step": 25240 }, { "epoch": 4.52, "grad_norm": 0.6491246223449707, "learning_rate": 1.1398749530123127e-06, "loss": 1.7421, "step": 25245 }, { "epoch": 4.52, "grad_norm": 1.0722302198410034, "learning_rate": 1.1356815733835153e-06, "loss": 1.8059, "step": 25250 }, { "epoch": 4.52, "grad_norm": 0.7224903702735901, "learning_rate": 1.1314957420156812e-06, "loss": 1.8328, "step": 25255 }, { "epoch": 4.52, "grad_norm": 0.8088222742080688, "learning_rate": 1.127317460232788e-06, "loss": 1.9416, "step": 25260 }, { "epoch": 4.52, "grad_norm": 0.5477873682975769, "learning_rate": 1.123146729356414e-06, "loss": 1.818, "step": 25265 }, { "epoch": 4.52, "grad_norm": 2.747405767440796, "learning_rate": 1.1189835507057655e-06, "loss": 1.9102, "step": 25270 }, { "epoch": 4.52, "grad_norm": 0.7023490071296692, "learning_rate": 1.1148279255976475e-06, "loss": 1.7424, "step": 25275 }, { "epoch": 4.52, "grad_norm": 0.6943578720092773, "learning_rate": 1.1106798553464804e-06, "loss": 1.7916, "step": 25280 }, { "epoch": 4.53, "grad_norm": 1.317313313484192, "learning_rate": 1.1065393412642982e-06, "loss": 1.8638, "step": 25285 }, { "epoch": 4.53, "grad_norm": 1.2347160577774048, "learning_rate": 1.102406384660745e-06, "loss": 1.8903, "step": 25290 }, { "epoch": 4.53, "grad_norm": 0.872135579586029, "learning_rate": 1.0982809868430589e-06, "loss": 1.7901, "step": 25295 }, { "epoch": 4.53, "grad_norm": 0.9331486225128174, "learning_rate": 1.0941631491161093e-06, "loss": 1.8265, "step": 25300 }, { "epoch": 4.53, "grad_norm": 0.7161145806312561, "learning_rate": 1.0900528727823627e-06, "loss": 1.7577, "step": 25305 }, { "epoch": 4.53, "grad_norm": 0.8512054085731506, "learning_rate": 1.0859501591418907e-06, "loss": 1.6612, "step": 25310 }, { "epoch": 4.53, "grad_norm": 0.9483695030212402, "learning_rate": 1.081855009492383e-06, "loss": 1.9453, "step": 25315 }, { "epoch": 4.53, "grad_norm": 1.1600202322006226, "learning_rate": 1.0777674251291308e-06, "loss": 2.1087, "step": 25320 }, { "epoch": 4.53, "grad_norm": 0.7305920124053955, "learning_rate": 1.0736874073450282e-06, "loss": 1.8974, "step": 25325 }, { "epoch": 4.53, "grad_norm": 0.8582291603088379, "learning_rate": 1.0696149574305842e-06, "loss": 1.9827, "step": 25330 }, { "epoch": 4.53, "grad_norm": 0.6721082329750061, "learning_rate": 1.065550076673913e-06, "loss": 1.8574, "step": 25335 }, { "epoch": 4.54, "grad_norm": 0.8562425971031189, "learning_rate": 1.0614927663607278e-06, "loss": 2.1481, "step": 25340 }, { "epoch": 4.54, "grad_norm": 0.5766037702560425, "learning_rate": 1.0574430277743602e-06, "loss": 1.8303, "step": 25345 }, { "epoch": 4.54, "grad_norm": 1.4017845392227173, "learning_rate": 1.0534008621957331e-06, "loss": 1.7077, "step": 25350 }, { "epoch": 4.54, "grad_norm": 1.1896699666976929, "learning_rate": 1.0493662709033764e-06, "loss": 1.6934, "step": 25355 }, { "epoch": 4.54, "grad_norm": 1.20075261592865, "learning_rate": 1.0453392551734276e-06, "loss": 1.705, "step": 25360 }, { "epoch": 4.54, "grad_norm": 0.5349302887916565, "learning_rate": 1.0413198162796346e-06, "loss": 2.1699, "step": 25365 }, { "epoch": 4.54, "grad_norm": 0.656079113483429, "learning_rate": 1.0373079554933357e-06, "loss": 1.6925, "step": 25370 }, { "epoch": 4.54, "grad_norm": 0.7854536771774292, "learning_rate": 1.0333036740834856e-06, "loss": 2.0432, "step": 25375 }, { "epoch": 4.54, "grad_norm": 1.6201163530349731, "learning_rate": 1.0293069733166294e-06, "loss": 1.7864, "step": 25380 }, { "epoch": 4.54, "grad_norm": 0.7943071722984314, "learning_rate": 1.025317854456917e-06, "loss": 1.573, "step": 25385 }, { "epoch": 4.54, "grad_norm": 1.6613608598709106, "learning_rate": 1.0213363187661084e-06, "loss": 1.4672, "step": 25390 }, { "epoch": 4.55, "grad_norm": 1.1010265350341797, "learning_rate": 1.0173623675035604e-06, "loss": 1.9815, "step": 25395 }, { "epoch": 4.55, "grad_norm": 0.885063648223877, "learning_rate": 1.0133960019262256e-06, "loss": 1.6383, "step": 25400 }, { "epoch": 4.55, "grad_norm": 0.5052914619445801, "learning_rate": 1.0094372232886617e-06, "loss": 1.8147, "step": 25405 }, { "epoch": 4.55, "grad_norm": 0.9033421277999878, "learning_rate": 1.005486032843031e-06, "loss": 1.7289, "step": 25410 }, { "epoch": 4.55, "grad_norm": 0.7714688181877136, "learning_rate": 1.0015424318390837e-06, "loss": 1.5649, "step": 25415 }, { "epoch": 4.55, "grad_norm": 0.6860387921333313, "learning_rate": 9.97606421524186e-07, "loss": 1.8035, "step": 25420 }, { "epoch": 4.55, "grad_norm": 0.9091027975082397, "learning_rate": 9.936780031432863e-07, "loss": 1.7979, "step": 25425 }, { "epoch": 4.55, "grad_norm": 1.081512212753296, "learning_rate": 9.897571779389437e-07, "loss": 1.6822, "step": 25430 }, { "epoch": 4.55, "grad_norm": 0.972420871257782, "learning_rate": 9.858439471513131e-07, "loss": 1.8356, "step": 25435 }, { "epoch": 4.55, "grad_norm": 0.838660478591919, "learning_rate": 9.819383120181436e-07, "loss": 1.553, "step": 25440 }, { "epoch": 4.55, "grad_norm": 0.8015768527984619, "learning_rate": 9.780402737747828e-07, "loss": 1.5111, "step": 25445 }, { "epoch": 4.56, "grad_norm": 0.8660455942153931, "learning_rate": 9.741498336541754e-07, "loss": 1.9797, "step": 25450 }, { "epoch": 4.56, "grad_norm": 0.7163037061691284, "learning_rate": 9.702669928868674e-07, "loss": 1.6109, "step": 25455 }, { "epoch": 4.56, "grad_norm": 1.0026417970657349, "learning_rate": 9.66391752700993e-07, "loss": 1.8042, "step": 25460 }, { "epoch": 4.56, "grad_norm": 0.9547916650772095, "learning_rate": 9.62524114322294e-07, "loss": 1.7232, "step": 25465 }, { "epoch": 4.56, "grad_norm": 0.43152350187301636, "learning_rate": 9.586640789740946e-07, "loss": 2.017, "step": 25470 }, { "epoch": 4.56, "grad_norm": 0.8599415421485901, "learning_rate": 9.54811647877321e-07, "loss": 1.6867, "step": 25475 }, { "epoch": 4.56, "grad_norm": 1.4471591711044312, "learning_rate": 9.509668222504958e-07, "loss": 1.8693, "step": 25480 }, { "epoch": 4.56, "grad_norm": 0.8084843158721924, "learning_rate": 9.471296033097321e-07, "loss": 1.8386, "step": 25485 }, { "epoch": 4.56, "grad_norm": 0.5984373688697815, "learning_rate": 9.432999922687396e-07, "loss": 1.7504, "step": 25490 }, { "epoch": 4.56, "grad_norm": 0.80012047290802, "learning_rate": 9.39477990338819e-07, "loss": 2.0373, "step": 25495 }, { "epoch": 4.56, "grad_norm": 1.0529673099517822, "learning_rate": 9.356635987288698e-07, "loss": 1.5368, "step": 25500 }, { "epoch": 4.57, "grad_norm": 0.46101412177085876, "learning_rate": 9.318568186453741e-07, "loss": 1.7577, "step": 25505 }, { "epoch": 4.57, "grad_norm": 0.7907940149307251, "learning_rate": 9.280576512924188e-07, "loss": 1.8975, "step": 25510 }, { "epoch": 4.57, "grad_norm": 1.587792992591858, "learning_rate": 9.242660978716705e-07, "loss": 2.0775, "step": 25515 }, { "epoch": 4.57, "grad_norm": 0.5791236758232117, "learning_rate": 9.204821595823976e-07, "loss": 2.0065, "step": 25520 }, { "epoch": 4.57, "grad_norm": 0.8675762414932251, "learning_rate": 9.167058376214621e-07, "loss": 1.8338, "step": 25525 }, { "epoch": 4.57, "grad_norm": 0.7436877489089966, "learning_rate": 9.129371331832975e-07, "loss": 1.7142, "step": 25530 }, { "epoch": 4.57, "grad_norm": 0.7845025658607483, "learning_rate": 9.091760474599559e-07, "loss": 1.9165, "step": 25535 }, { "epoch": 4.57, "grad_norm": 0.5198835730552673, "learning_rate": 9.054225816410522e-07, "loss": 1.7308, "step": 25540 }, { "epoch": 4.57, "grad_norm": 3.1437716484069824, "learning_rate": 9.016767369138118e-07, "loss": 1.8479, "step": 25545 }, { "epoch": 4.57, "grad_norm": 1.3514385223388672, "learning_rate": 8.979385144630397e-07, "loss": 2.1328, "step": 25550 }, { "epoch": 4.57, "grad_norm": 0.615734875202179, "learning_rate": 8.942079154711347e-07, "loss": 1.9266, "step": 25555 }, { "epoch": 4.58, "grad_norm": 0.8276461958885193, "learning_rate": 8.904849411180749e-07, "loss": 1.844, "step": 25560 }, { "epoch": 4.58, "grad_norm": 0.8791263103485107, "learning_rate": 8.867695925814406e-07, "loss": 2.0249, "step": 25565 }, { "epoch": 4.58, "grad_norm": 1.3986464738845825, "learning_rate": 8.830618710363892e-07, "loss": 1.6506, "step": 25570 }, { "epoch": 4.58, "grad_norm": 1.0997428894042969, "learning_rate": 8.793617776556685e-07, "loss": 1.9504, "step": 25575 }, { "epoch": 4.58, "grad_norm": 0.9867760539054871, "learning_rate": 8.7566931360962e-07, "loss": 1.8399, "step": 25580 }, { "epoch": 4.58, "grad_norm": 0.6265989542007446, "learning_rate": 8.719844800661625e-07, "loss": 2.0406, "step": 25585 }, { "epoch": 4.58, "grad_norm": 1.4594182968139648, "learning_rate": 8.68307278190808e-07, "loss": 2.0839, "step": 25590 }, { "epoch": 4.58, "grad_norm": 1.0297359228134155, "learning_rate": 8.646377091466457e-07, "loss": 1.5879, "step": 25595 }, { "epoch": 4.58, "grad_norm": 0.9616431593894958, "learning_rate": 8.609757740943608e-07, "loss": 1.7454, "step": 25600 }, { "epoch": 4.58, "grad_norm": 0.7618218064308167, "learning_rate": 8.5805172330512e-07, "loss": 1.7113, "step": 25605 }, { "epoch": 4.58, "grad_norm": 0.9779794216156006, "learning_rate": 8.544035323554217e-07, "loss": 1.6699, "step": 25610 }, { "epoch": 4.58, "grad_norm": 0.6378960013389587, "learning_rate": 8.507629786346671e-07, "loss": 1.5722, "step": 25615 }, { "epoch": 4.59, "grad_norm": 1.0023730993270874, "learning_rate": 8.47130063294349e-07, "loss": 1.7534, "step": 25620 }, { "epoch": 4.59, "grad_norm": 0.8808668255805969, "learning_rate": 8.435047874835594e-07, "loss": 1.5374, "step": 25625 }, { "epoch": 4.59, "grad_norm": 6.013002395629883, "learning_rate": 8.398871523489727e-07, "loss": 1.658, "step": 25630 }, { "epoch": 4.59, "grad_norm": 1.1340384483337402, "learning_rate": 8.362771590348373e-07, "loss": 1.7486, "step": 25635 }, { "epoch": 4.59, "grad_norm": 0.4705217480659485, "learning_rate": 8.326748086829956e-07, "loss": 1.6366, "step": 25640 }, { "epoch": 4.59, "grad_norm": 0.8444132804870605, "learning_rate": 8.290801024328693e-07, "loss": 1.8246, "step": 25645 }, { "epoch": 4.59, "grad_norm": 1.68918776512146, "learning_rate": 8.254930414214518e-07, "loss": 1.8512, "step": 25650 }, { "epoch": 4.59, "grad_norm": 0.906139075756073, "learning_rate": 8.219136267833355e-07, "loss": 1.8157, "step": 25655 }, { "epoch": 4.59, "grad_norm": 0.8343780040740967, "learning_rate": 8.183418596506786e-07, "loss": 2.0715, "step": 25660 }, { "epoch": 4.59, "grad_norm": 0.7080813050270081, "learning_rate": 8.147777411532359e-07, "loss": 2.1484, "step": 25665 }, { "epoch": 4.59, "grad_norm": 0.721260130405426, "learning_rate": 8.112212724183277e-07, "loss": 1.6142, "step": 25670 }, { "epoch": 4.6, "grad_norm": 1.5041923522949219, "learning_rate": 8.076724545708681e-07, "loss": 2.0871, "step": 25675 }, { "epoch": 4.6, "grad_norm": 3.3682503700256348, "learning_rate": 8.041312887333397e-07, "loss": 1.5679, "step": 25680 }, { "epoch": 4.6, "grad_norm": 0.7922682166099548, "learning_rate": 8.005977760258105e-07, "loss": 1.696, "step": 25685 }, { "epoch": 4.6, "grad_norm": 1.6431972980499268, "learning_rate": 7.970719175659281e-07, "loss": 1.7266, "step": 25690 }, { "epoch": 4.6, "grad_norm": 0.8077892065048218, "learning_rate": 7.935537144689142e-07, "loss": 1.7022, "step": 25695 }, { "epoch": 4.6, "grad_norm": 1.4477850198745728, "learning_rate": 7.900431678475817e-07, "loss": 1.9362, "step": 25700 }, { "epoch": 4.6, "grad_norm": 1.0458213090896606, "learning_rate": 7.865402788123088e-07, "loss": 1.6532, "step": 25705 }, { "epoch": 4.6, "grad_norm": 1.3129156827926636, "learning_rate": 7.830450484710511e-07, "loss": 1.86, "step": 25710 }, { "epoch": 4.6, "grad_norm": 0.689514696598053, "learning_rate": 7.795574779293519e-07, "loss": 1.9175, "step": 25715 }, { "epoch": 4.6, "grad_norm": 1.1813409328460693, "learning_rate": 7.76077568290326e-07, "loss": 1.8346, "step": 25720 }, { "epoch": 4.6, "grad_norm": 0.8686524033546448, "learning_rate": 7.72605320654668e-07, "loss": 1.9223, "step": 25725 }, { "epoch": 4.61, "grad_norm": 0.858902096748352, "learning_rate": 7.691407361206382e-07, "loss": 1.9596, "step": 25730 }, { "epoch": 4.61, "grad_norm": 0.6625041961669922, "learning_rate": 7.656838157840906e-07, "loss": 1.9814, "step": 25735 }, { "epoch": 4.61, "grad_norm": 0.6935396790504456, "learning_rate": 7.622345607384396e-07, "loss": 1.8825, "step": 25740 }, { "epoch": 4.61, "grad_norm": 1.0851209163665771, "learning_rate": 7.587929720746817e-07, "loss": 1.6372, "step": 25745 }, { "epoch": 4.61, "grad_norm": 0.629657506942749, "learning_rate": 7.553590508813907e-07, "loss": 2.0008, "step": 25750 }, { "epoch": 4.61, "grad_norm": 2.5433287620544434, "learning_rate": 7.519327982447117e-07, "loss": 1.7939, "step": 25755 }, { "epoch": 4.61, "grad_norm": 0.5418505668640137, "learning_rate": 7.485142152483665e-07, "loss": 1.8458, "step": 25760 }, { "epoch": 4.61, "grad_norm": 1.0671522617340088, "learning_rate": 7.451033029736459e-07, "loss": 1.7501, "step": 25765 }, { "epoch": 4.61, "grad_norm": 0.7587215304374695, "learning_rate": 7.417000624994231e-07, "loss": 1.713, "step": 25770 }, { "epoch": 4.61, "grad_norm": 0.7475984692573547, "learning_rate": 7.383044949021339e-07, "loss": 1.6557, "step": 25775 }, { "epoch": 4.61, "grad_norm": 3.0722339153289795, "learning_rate": 7.349166012557973e-07, "loss": 1.6571, "step": 25780 }, { "epoch": 4.62, "grad_norm": 0.7572489976882935, "learning_rate": 7.315363826320005e-07, "loss": 1.9279, "step": 25785 }, { "epoch": 4.62, "grad_norm": 1.7964684963226318, "learning_rate": 7.281638400998992e-07, "loss": 1.8172, "step": 25790 }, { "epoch": 4.62, "grad_norm": 1.0750715732574463, "learning_rate": 7.247989747262346e-07, "loss": 1.7758, "step": 25795 }, { "epoch": 4.62, "grad_norm": 1.502988338470459, "learning_rate": 7.214417875752999e-07, "loss": 1.9997, "step": 25800 }, { "epoch": 4.62, "grad_norm": 1.200313687324524, "learning_rate": 7.180922797089762e-07, "loss": 1.8816, "step": 25805 }, { "epoch": 4.62, "grad_norm": 1.1731305122375488, "learning_rate": 7.147504521867076e-07, "loss": 2.0038, "step": 25810 }, { "epoch": 4.62, "grad_norm": 0.778863787651062, "learning_rate": 7.114163060655126e-07, "loss": 1.4799, "step": 25815 }, { "epoch": 4.62, "grad_norm": 1.777086853981018, "learning_rate": 7.080898423999782e-07, "loss": 1.2548, "step": 25820 }, { "epoch": 4.62, "grad_norm": 0.545483410358429, "learning_rate": 7.047710622422599e-07, "loss": 1.5645, "step": 25825 }, { "epoch": 4.62, "grad_norm": 1.450973629951477, "learning_rate": 7.014599666420846e-07, "loss": 1.5726, "step": 25830 }, { "epoch": 4.62, "grad_norm": 0.8481055498123169, "learning_rate": 6.981565566467507e-07, "loss": 1.7464, "step": 25835 }, { "epoch": 4.63, "grad_norm": 0.9643117785453796, "learning_rate": 6.948608333011197e-07, "loss": 1.5418, "step": 25840 }, { "epoch": 4.63, "grad_norm": 0.6587722301483154, "learning_rate": 6.915727976476299e-07, "loss": 1.7643, "step": 25845 }, { "epoch": 4.63, "grad_norm": 0.37052008509635925, "learning_rate": 6.882924507262855e-07, "loss": 1.6896, "step": 25850 }, { "epoch": 4.63, "grad_norm": 0.8936975002288818, "learning_rate": 6.850197935746511e-07, "loss": 1.7418, "step": 25855 }, { "epoch": 4.63, "grad_norm": 0.858160674571991, "learning_rate": 6.817548272278706e-07, "loss": 1.986, "step": 25860 }, { "epoch": 4.63, "grad_norm": 0.920091450214386, "learning_rate": 6.78497552718646e-07, "loss": 1.8721, "step": 25865 }, { "epoch": 4.63, "grad_norm": 0.7904292941093445, "learning_rate": 6.752479710772502e-07, "loss": 1.7012, "step": 25870 }, { "epoch": 4.63, "grad_norm": 0.7170063853263855, "learning_rate": 6.720060833315223e-07, "loss": 1.9776, "step": 25875 }, { "epoch": 4.63, "grad_norm": 0.6264550685882568, "learning_rate": 6.687718905068697e-07, "loss": 1.953, "step": 25880 }, { "epoch": 4.63, "grad_norm": 1.133542537689209, "learning_rate": 6.655453936262685e-07, "loss": 1.8424, "step": 25885 }, { "epoch": 4.63, "grad_norm": 1.201373815536499, "learning_rate": 6.623265937102496e-07, "loss": 1.7449, "step": 25890 }, { "epoch": 4.64, "grad_norm": 1.5119297504425049, "learning_rate": 6.591154917769183e-07, "loss": 2.0277, "step": 25895 }, { "epoch": 4.64, "grad_norm": 1.423034906387329, "learning_rate": 6.559120888419451e-07, "loss": 1.6196, "step": 25900 }, { "epoch": 4.64, "grad_norm": 0.8784555196762085, "learning_rate": 6.527163859185614e-07, "loss": 1.806, "step": 25905 }, { "epoch": 4.64, "grad_norm": 0.8068978190422058, "learning_rate": 6.495283840175697e-07, "loss": 1.8265, "step": 25910 }, { "epoch": 4.64, "grad_norm": 0.9359546899795532, "learning_rate": 6.463480841473274e-07, "loss": 1.5987, "step": 25915 }, { "epoch": 4.64, "grad_norm": 1.0493196249008179, "learning_rate": 6.431754873137575e-07, "loss": 1.8015, "step": 25920 }, { "epoch": 4.64, "grad_norm": 0.918982207775116, "learning_rate": 6.400105945203544e-07, "loss": 1.9332, "step": 25925 }, { "epoch": 4.64, "grad_norm": 0.6983169913291931, "learning_rate": 6.368534067681675e-07, "loss": 1.8346, "step": 25930 }, { "epoch": 4.64, "grad_norm": 0.8050747513771057, "learning_rate": 6.337039250558119e-07, "loss": 2.021, "step": 25935 }, { "epoch": 4.64, "grad_norm": 2.0071403980255127, "learning_rate": 6.305621503794712e-07, "loss": 1.8157, "step": 25940 }, { "epoch": 4.64, "grad_norm": 1.5265626907348633, "learning_rate": 6.274280837328811e-07, "loss": 1.7936, "step": 25945 }, { "epoch": 4.64, "grad_norm": 0.8027945160865784, "learning_rate": 6.24301726107343e-07, "loss": 1.8368, "step": 25950 }, { "epoch": 4.65, "grad_norm": 0.6967876553535461, "learning_rate": 6.211830784917244e-07, "loss": 1.8268, "step": 25955 }, { "epoch": 4.65, "grad_norm": 0.5953835844993591, "learning_rate": 6.1807214187245e-07, "loss": 1.7233, "step": 25960 }, { "epoch": 4.65, "grad_norm": 1.5460771322250366, "learning_rate": 6.149689172334994e-07, "loss": 1.4829, "step": 25965 }, { "epoch": 4.65, "grad_norm": 0.8539479374885559, "learning_rate": 6.118734055564291e-07, "loss": 1.9009, "step": 25970 }, { "epoch": 4.65, "grad_norm": 0.4527439475059509, "learning_rate": 6.087856078203419e-07, "loss": 1.8119, "step": 25975 }, { "epoch": 4.65, "grad_norm": 0.5889864563941956, "learning_rate": 6.057055250019039e-07, "loss": 1.9578, "step": 25980 }, { "epoch": 4.65, "grad_norm": 1.3849375247955322, "learning_rate": 6.026331580753442e-07, "loss": 1.8432, "step": 25985 }, { "epoch": 4.65, "grad_norm": 0.8188489079475403, "learning_rate": 5.995685080124491e-07, "loss": 1.8869, "step": 25990 }, { "epoch": 4.65, "grad_norm": 0.6627269983291626, "learning_rate": 5.965115757825657e-07, "loss": 1.8587, "step": 25995 }, { "epoch": 4.65, "grad_norm": 1.0890119075775146, "learning_rate": 5.934623623526009e-07, "loss": 1.7339, "step": 26000 }, { "epoch": 4.65, "grad_norm": 1.010817050933838, "learning_rate": 5.90420868687011e-07, "loss": 1.9268, "step": 26005 }, { "epoch": 4.66, "grad_norm": 1.0825525522232056, "learning_rate": 5.873870957478211e-07, "loss": 1.7062, "step": 26010 }, { "epoch": 4.66, "grad_norm": 0.8610982894897461, "learning_rate": 5.843610444946135e-07, "loss": 1.726, "step": 26015 }, { "epoch": 4.66, "grad_norm": 1.058080792427063, "learning_rate": 5.813427158845198e-07, "loss": 1.7532, "step": 26020 }, { "epoch": 4.66, "grad_norm": 2.5200893878936768, "learning_rate": 5.783321108722373e-07, "loss": 1.719, "step": 26025 }, { "epoch": 4.66, "grad_norm": 0.7497760057449341, "learning_rate": 5.753292304100183e-07, "loss": 1.9987, "step": 26030 }, { "epoch": 4.66, "grad_norm": 0.674341082572937, "learning_rate": 5.723340754476697e-07, "loss": 1.7749, "step": 26035 }, { "epoch": 4.66, "grad_norm": 1.0011730194091797, "learning_rate": 5.693466469325587e-07, "loss": 1.8461, "step": 26040 }, { "epoch": 4.66, "grad_norm": 2.4020795822143555, "learning_rate": 5.663669458096015e-07, "loss": 1.8198, "step": 26045 }, { "epoch": 4.66, "grad_norm": 0.7626039981842041, "learning_rate": 5.633949730212806e-07, "loss": 1.7917, "step": 26050 }, { "epoch": 4.66, "grad_norm": 1.1910889148712158, "learning_rate": 5.604307295076188e-07, "loss": 1.9962, "step": 26055 }, { "epoch": 4.66, "grad_norm": 0.8416028618812561, "learning_rate": 5.574742162062163e-07, "loss": 1.8465, "step": 26060 }, { "epoch": 4.67, "grad_norm": 1.4080479145050049, "learning_rate": 5.545254340522027e-07, "loss": 1.948, "step": 26065 }, { "epoch": 4.67, "grad_norm": 0.9326726794242859, "learning_rate": 5.515843839782848e-07, "loss": 1.9587, "step": 26070 }, { "epoch": 4.67, "grad_norm": 0.7399839162826538, "learning_rate": 5.486510669147072e-07, "loss": 1.7994, "step": 26075 }, { "epoch": 4.67, "grad_norm": 0.7813273072242737, "learning_rate": 5.457254837892778e-07, "loss": 1.7135, "step": 26080 }, { "epoch": 4.67, "grad_norm": 0.8069972991943359, "learning_rate": 5.428076355273564e-07, "loss": 1.7997, "step": 26085 }, { "epoch": 4.67, "grad_norm": 0.7533904910087585, "learning_rate": 5.398975230518577e-07, "loss": 1.745, "step": 26090 }, { "epoch": 4.67, "grad_norm": 0.5249228477478027, "learning_rate": 5.369951472832424e-07, "loss": 1.8554, "step": 26095 }, { "epoch": 4.67, "grad_norm": 0.8555483818054199, "learning_rate": 5.341005091395318e-07, "loss": 1.7167, "step": 26100 }, { "epoch": 4.67, "grad_norm": 2.859463691711426, "learning_rate": 5.312136095362963e-07, "loss": 1.8371, "step": 26105 }, { "epoch": 4.67, "grad_norm": 0.6293110847473145, "learning_rate": 5.28334449386661e-07, "loss": 1.8762, "step": 26110 }, { "epoch": 4.67, "grad_norm": 0.6581356525421143, "learning_rate": 5.254630296013002e-07, "loss": 1.9024, "step": 26115 }, { "epoch": 4.68, "grad_norm": 0.9610373377799988, "learning_rate": 5.22599351088443e-07, "loss": 1.9915, "step": 26120 }, { "epoch": 4.68, "grad_norm": 0.6756641864776611, "learning_rate": 5.19743414753865e-07, "loss": 1.9754, "step": 26125 }, { "epoch": 4.68, "grad_norm": 0.6355516910552979, "learning_rate": 5.168952215008988e-07, "loss": 1.6428, "step": 26130 }, { "epoch": 4.68, "grad_norm": 1.207905888557434, "learning_rate": 5.140547722304212e-07, "loss": 1.999, "step": 26135 }, { "epoch": 4.68, "grad_norm": 0.9611450433731079, "learning_rate": 5.112220678408692e-07, "loss": 1.9363, "step": 26140 }, { "epoch": 4.68, "grad_norm": 1.3729864358901978, "learning_rate": 5.083971092282258e-07, "loss": 1.5812, "step": 26145 }, { "epoch": 4.68, "grad_norm": 1.105804204940796, "learning_rate": 5.055798972860154e-07, "loss": 2.1176, "step": 26150 }, { "epoch": 4.68, "grad_norm": 0.9496538043022156, "learning_rate": 5.027704329053223e-07, "loss": 1.7854, "step": 26155 }, { "epoch": 4.68, "grad_norm": 0.8394901156425476, "learning_rate": 4.999687169747775e-07, "loss": 1.6806, "step": 26160 }, { "epoch": 4.68, "grad_norm": 0.6391941905021667, "learning_rate": 4.971747503805608e-07, "loss": 1.415, "step": 26165 }, { "epoch": 4.68, "grad_norm": 1.3702791929244995, "learning_rate": 4.943885340064042e-07, "loss": 1.4161, "step": 26170 }, { "epoch": 4.69, "grad_norm": 2.06182599067688, "learning_rate": 4.916100687335806e-07, "loss": 2.0121, "step": 26175 }, { "epoch": 4.69, "grad_norm": 0.6842718124389648, "learning_rate": 4.888393554409204e-07, "loss": 1.5254, "step": 26180 }, { "epoch": 4.69, "grad_norm": 1.0825421810150146, "learning_rate": 4.860763950047948e-07, "loss": 1.6063, "step": 26185 }, { "epoch": 4.69, "grad_norm": 1.3437987565994263, "learning_rate": 4.833211882991268e-07, "loss": 1.8736, "step": 26190 }, { "epoch": 4.69, "grad_norm": 0.6650128960609436, "learning_rate": 4.805737361953833e-07, "loss": 1.7054, "step": 26195 }, { "epoch": 4.69, "grad_norm": 1.0552135705947876, "learning_rate": 4.77834039562583e-07, "loss": 1.6637, "step": 26200 }, { "epoch": 4.69, "grad_norm": 1.4072233438491821, "learning_rate": 4.7510209926728834e-07, "loss": 1.7958, "step": 26205 }, { "epoch": 4.69, "grad_norm": 0.7076230049133301, "learning_rate": 4.7237791617361085e-07, "loss": 1.9844, "step": 26210 }, { "epoch": 4.69, "grad_norm": 1.0958219766616821, "learning_rate": 4.696614911432057e-07, "loss": 1.8564, "step": 26215 }, { "epoch": 4.69, "grad_norm": 1.130049467086792, "learning_rate": 4.669528250352745e-07, "loss": 1.9523, "step": 26220 }, { "epoch": 4.69, "grad_norm": 0.9261044263839722, "learning_rate": 4.6425191870656525e-07, "loss": 1.6797, "step": 26225 }, { "epoch": 4.7, "grad_norm": 0.8678873181343079, "learning_rate": 4.615587730113752e-07, "loss": 1.8165, "step": 26230 }, { "epoch": 4.7, "grad_norm": 0.6471136212348938, "learning_rate": 4.5887338880154504e-07, "loss": 1.8645, "step": 26235 }, { "epoch": 4.7, "grad_norm": 0.8412356972694397, "learning_rate": 4.561957669264566e-07, "loss": 1.6603, "step": 26240 }, { "epoch": 4.7, "grad_norm": 0.5464197993278503, "learning_rate": 4.535259082330351e-07, "loss": 1.9706, "step": 26245 }, { "epoch": 4.7, "grad_norm": 0.6575042009353638, "learning_rate": 4.508638135657606e-07, "loss": 1.9023, "step": 26250 }, { "epoch": 4.7, "grad_norm": 1.1826757192611694, "learning_rate": 4.482094837666456e-07, "loss": 1.766, "step": 26255 }, { "epoch": 4.7, "grad_norm": 0.831210196018219, "learning_rate": 4.4556291967525755e-07, "loss": 1.8469, "step": 26260 }, { "epoch": 4.7, "grad_norm": 1.009545922279358, "learning_rate": 4.4292412212869895e-07, "loss": 1.803, "step": 26265 }, { "epoch": 4.7, "grad_norm": 0.8731971979141235, "learning_rate": 4.402930919616216e-07, "loss": 1.9441, "step": 26270 }, { "epoch": 4.7, "grad_norm": 0.8651910424232483, "learning_rate": 4.3766983000621266e-07, "loss": 1.6736, "step": 26275 }, { "epoch": 4.7, "grad_norm": 1.4278265237808228, "learning_rate": 4.350543370922111e-07, "loss": 1.6739, "step": 26280 }, { "epoch": 4.7, "grad_norm": 0.8426223993301392, "learning_rate": 4.324466140468969e-07, "loss": 1.9826, "step": 26285 }, { "epoch": 4.71, "grad_norm": 1.1639264822006226, "learning_rate": 4.2984666169508525e-07, "loss": 1.5842, "step": 26290 }, { "epoch": 4.71, "grad_norm": 1.0826095342636108, "learning_rate": 4.2725448085914056e-07, "loss": 1.6018, "step": 26295 }, { "epoch": 4.71, "grad_norm": 0.7595674991607666, "learning_rate": 4.246700723589708e-07, "loss": 2.0188, "step": 26300 }, { "epoch": 4.71, "grad_norm": 1.1753851175308228, "learning_rate": 4.2209343701201666e-07, "loss": 1.6287, "step": 26305 }, { "epoch": 4.71, "grad_norm": 1.429380178451538, "learning_rate": 4.1952457563326773e-07, "loss": 1.9633, "step": 26310 }, { "epoch": 4.71, "grad_norm": 0.7897043228149414, "learning_rate": 4.16963489035252e-07, "loss": 1.6705, "step": 26315 }, { "epoch": 4.71, "grad_norm": 0.7799373865127563, "learning_rate": 4.1441017802803793e-07, "loss": 2.1176, "step": 26320 }, { "epoch": 4.71, "grad_norm": 0.7283823490142822, "learning_rate": 4.1186464341923805e-07, "loss": 1.5486, "step": 26325 }, { "epoch": 4.71, "grad_norm": 1.3124147653579712, "learning_rate": 4.093268860139998e-07, "loss": 1.9634, "step": 26330 }, { "epoch": 4.71, "grad_norm": 1.9731324911117554, "learning_rate": 4.067969066150146e-07, "loss": 1.4846, "step": 26335 }, { "epoch": 4.71, "grad_norm": 0.9217849969863892, "learning_rate": 4.042747060225116e-07, "loss": 2.1779, "step": 26340 }, { "epoch": 4.72, "grad_norm": 2.033876419067383, "learning_rate": 4.0176028503425835e-07, "loss": 1.6705, "step": 26345 }, { "epoch": 4.72, "grad_norm": 0.941889226436615, "learning_rate": 3.9925364444556857e-07, "loss": 1.6641, "step": 26350 }, { "epoch": 4.72, "grad_norm": 1.0339444875717163, "learning_rate": 3.9675478504929144e-07, "loss": 1.7571, "step": 26355 }, { "epoch": 4.72, "grad_norm": 0.7928982973098755, "learning_rate": 3.942637076358058e-07, "loss": 1.8514, "step": 26360 }, { "epoch": 4.72, "grad_norm": 1.492556095123291, "learning_rate": 3.917804129930452e-07, "loss": 1.8306, "step": 26365 }, { "epoch": 4.72, "grad_norm": 0.8601492643356323, "learning_rate": 3.893049019064676e-07, "loss": 1.8331, "step": 26370 }, { "epoch": 4.72, "grad_norm": 1.3913602828979492, "learning_rate": 3.8683717515907714e-07, "loss": 1.7354, "step": 26375 }, { "epoch": 4.72, "grad_norm": 1.4564555883407593, "learning_rate": 3.843772335314133e-07, "loss": 1.5457, "step": 26380 }, { "epoch": 4.72, "grad_norm": 0.7062227725982666, "learning_rate": 3.8192507780155375e-07, "loss": 1.6495, "step": 26385 }, { "epoch": 4.72, "grad_norm": 0.7881594300270081, "learning_rate": 3.794807087451141e-07, "loss": 1.963, "step": 26390 }, { "epoch": 4.72, "grad_norm": 1.6842689514160156, "learning_rate": 3.770441271352426e-07, "loss": 1.5499, "step": 26395 }, { "epoch": 4.73, "grad_norm": 0.4173828065395355, "learning_rate": 3.7461533374262837e-07, "loss": 2.0362, "step": 26400 }, { "epoch": 4.73, "grad_norm": 1.6171375513076782, "learning_rate": 3.7219432933549845e-07, "loss": 1.6515, "step": 26405 }, { "epoch": 4.73, "grad_norm": 0.5490692853927612, "learning_rate": 3.6978111467961263e-07, "loss": 1.8352, "step": 26410 }, { "epoch": 4.73, "grad_norm": 0.5645069479942322, "learning_rate": 3.6737569053827137e-07, "loss": 1.7449, "step": 26415 }, { "epoch": 4.73, "grad_norm": 0.6575131416320801, "learning_rate": 3.6497805767230495e-07, "loss": 1.8291, "step": 26420 }, { "epoch": 4.73, "grad_norm": 2.23494815826416, "learning_rate": 3.625882168400846e-07, "loss": 1.6557, "step": 26425 }, { "epoch": 4.73, "grad_norm": 2.7113113403320312, "learning_rate": 3.6020616879750835e-07, "loss": 1.8655, "step": 26430 }, { "epoch": 4.73, "grad_norm": 0.5787123441696167, "learning_rate": 3.5783191429802354e-07, "loss": 1.7335, "step": 26435 }, { "epoch": 4.73, "grad_norm": 1.4365309476852417, "learning_rate": 3.554654540925989e-07, "loss": 1.8369, "step": 26440 }, { "epoch": 4.73, "grad_norm": 0.9646137356758118, "learning_rate": 3.531067889297496e-07, "loss": 1.6218, "step": 26445 }, { "epoch": 4.73, "grad_norm": 1.048437237739563, "learning_rate": 3.507559195555149e-07, "loss": 1.8628, "step": 26450 }, { "epoch": 4.74, "grad_norm": 1.032910943031311, "learning_rate": 3.484128467134723e-07, "loss": 1.6514, "step": 26455 }, { "epoch": 4.74, "grad_norm": 1.4005285501480103, "learning_rate": 3.4607757114473174e-07, "loss": 1.5883, "step": 26460 }, { "epoch": 4.74, "grad_norm": 0.918569803237915, "learning_rate": 3.4375009358794395e-07, "loss": 1.9173, "step": 26465 }, { "epoch": 4.74, "grad_norm": 1.5146937370300293, "learning_rate": 3.4143041477928686e-07, "loss": 1.736, "step": 26470 }, { "epoch": 4.74, "grad_norm": 1.1808782815933228, "learning_rate": 3.391185354524706e-07, "loss": 1.8146, "step": 26475 }, { "epoch": 4.74, "grad_norm": 1.120993971824646, "learning_rate": 3.36814456338741e-07, "loss": 1.7978, "step": 26480 }, { "epoch": 4.74, "grad_norm": 0.922327995300293, "learning_rate": 3.3451817816687323e-07, "loss": 1.7257, "step": 26485 }, { "epoch": 4.74, "grad_norm": 2.25972056388855, "learning_rate": 3.322297016631809e-07, "loss": 1.6793, "step": 26490 }, { "epoch": 4.74, "grad_norm": 0.4978795349597931, "learning_rate": 3.2994902755150715e-07, "loss": 1.9777, "step": 26495 }, { "epoch": 4.74, "grad_norm": 0.6013870239257812, "learning_rate": 3.276761565532277e-07, "loss": 1.9321, "step": 26500 }, { "epoch": 4.74, "grad_norm": 0.6871728897094727, "learning_rate": 3.254110893872481e-07, "loss": 1.8226, "step": 26505 }, { "epoch": 4.75, "grad_norm": 1.049520492553711, "learning_rate": 3.231538267700063e-07, "loss": 1.6252, "step": 26510 }, { "epoch": 4.75, "grad_norm": 1.513724684715271, "learning_rate": 3.209043694154729e-07, "loss": 1.7386, "step": 26515 }, { "epoch": 4.75, "grad_norm": 3.1947197914123535, "learning_rate": 3.186627180351509e-07, "loss": 1.9751, "step": 26520 }, { "epoch": 4.75, "grad_norm": 0.9903146624565125, "learning_rate": 3.164288733380677e-07, "loss": 1.5443, "step": 26525 }, { "epoch": 4.75, "grad_norm": 0.7547667622566223, "learning_rate": 3.1420283603079135e-07, "loss": 1.9813, "step": 26530 }, { "epoch": 4.75, "grad_norm": 1.0118529796600342, "learning_rate": 3.1198460681741427e-07, "loss": 1.9654, "step": 26535 }, { "epoch": 4.75, "grad_norm": 0.8024758100509644, "learning_rate": 3.097741863995557e-07, "loss": 1.6795, "step": 26540 }, { "epoch": 4.75, "grad_norm": 0.8070296049118042, "learning_rate": 3.0757157547637307e-07, "loss": 1.9996, "step": 26545 }, { "epoch": 4.75, "grad_norm": 0.769829273223877, "learning_rate": 3.053767747445535e-07, "loss": 1.8527, "step": 26550 }, { "epoch": 4.75, "grad_norm": 0.8586844801902771, "learning_rate": 3.0318978489830286e-07, "loss": 1.6287, "step": 26555 }, { "epoch": 4.75, "grad_norm": 0.8367322683334351, "learning_rate": 3.010106066293733e-07, "loss": 2.0245, "step": 26560 }, { "epoch": 4.76, "grad_norm": 0.4964026212692261, "learning_rate": 2.988392406270274e-07, "loss": 1.4126, "step": 26565 }, { "epoch": 4.76, "grad_norm": 0.8849166035652161, "learning_rate": 2.966756875780713e-07, "loss": 1.9657, "step": 26570 }, { "epoch": 4.76, "grad_norm": 0.6733694076538086, "learning_rate": 2.945199481668326e-07, "loss": 1.8609, "step": 26575 }, { "epoch": 4.76, "grad_norm": 2.2621817588806152, "learning_rate": 2.923720230751714e-07, "loss": 1.9039, "step": 26580 }, { "epoch": 4.76, "grad_norm": 0.6737426519393921, "learning_rate": 2.9023191298247487e-07, "loss": 1.6034, "step": 26585 }, { "epoch": 4.76, "grad_norm": 0.8793543577194214, "learning_rate": 2.8809961856565425e-07, "loss": 2.0761, "step": 26590 }, { "epoch": 4.76, "grad_norm": 0.6130470633506775, "learning_rate": 2.8597514049915617e-07, "loss": 2.0985, "step": 26595 }, { "epoch": 4.76, "grad_norm": 0.6239492297172546, "learning_rate": 2.838584794549487e-07, "loss": 1.7084, "step": 26600 }, { "epoch": 4.76, "grad_norm": 1.3211489915847778, "learning_rate": 2.817496361025296e-07, "loss": 1.6366, "step": 26605 }, { "epoch": 4.76, "grad_norm": 0.7192025184631348, "learning_rate": 2.7964861110892636e-07, "loss": 1.8488, "step": 26610 }, { "epoch": 4.76, "grad_norm": 1.1748217344284058, "learning_rate": 2.7755540513868805e-07, "loss": 1.9784, "step": 26615 }, { "epoch": 4.76, "grad_norm": 0.6899864673614502, "learning_rate": 2.754700188538961e-07, "loss": 1.8471, "step": 26620 }, { "epoch": 4.77, "grad_norm": 1.0577079057693481, "learning_rate": 2.733924529141535e-07, "loss": 1.7249, "step": 26625 }, { "epoch": 4.77, "grad_norm": 0.9071574211120605, "learning_rate": 2.7132270797659563e-07, "loss": 1.6626, "step": 26630 }, { "epoch": 4.77, "grad_norm": 0.7166086435317993, "learning_rate": 2.6926078469587944e-07, "loss": 2.038, "step": 26635 }, { "epoch": 4.77, "grad_norm": 0.6789818406105042, "learning_rate": 2.67206683724186e-07, "loss": 1.8971, "step": 26640 }, { "epoch": 4.77, "grad_norm": 1.1377719640731812, "learning_rate": 2.6516040571123166e-07, "loss": 1.5134, "step": 26645 }, { "epoch": 4.77, "grad_norm": 1.127935767173767, "learning_rate": 2.6312195130424875e-07, "loss": 2.0224, "step": 26650 }, { "epoch": 4.77, "grad_norm": 1.656747817993164, "learning_rate": 2.610913211479993e-07, "loss": 1.9184, "step": 26655 }, { "epoch": 4.77, "grad_norm": 1.3809555768966675, "learning_rate": 2.5906851588476945e-07, "loss": 1.5658, "step": 26660 }, { "epoch": 4.77, "grad_norm": 0.8653160333633423, "learning_rate": 2.570535361543669e-07, "loss": 1.7374, "step": 26665 }, { "epoch": 4.77, "grad_norm": 1.040223479270935, "learning_rate": 2.550463825941346e-07, "loss": 1.9036, "step": 26670 }, { "epoch": 4.77, "grad_norm": 1.1421611309051514, "learning_rate": 2.530470558389286e-07, "loss": 1.9153, "step": 26675 }, { "epoch": 4.78, "grad_norm": 1.0162934064865112, "learning_rate": 2.510555565211348e-07, "loss": 1.6926, "step": 26680 }, { "epoch": 4.78, "grad_norm": 0.722922682762146, "learning_rate": 2.4907188527066303e-07, "loss": 1.6469, "step": 26685 }, { "epoch": 4.78, "grad_norm": 0.6988521218299866, "learning_rate": 2.4709604271494766e-07, "loss": 1.4591, "step": 26690 }, { "epoch": 4.78, "grad_norm": 0.9216463565826416, "learning_rate": 2.451280294789443e-07, "loss": 1.6103, "step": 26695 }, { "epoch": 4.78, "grad_norm": 0.6147362589836121, "learning_rate": 2.4316784618513276e-07, "loss": 1.788, "step": 26700 }, { "epoch": 4.78, "grad_norm": 0.583010196685791, "learning_rate": 2.412154934535199e-07, "loss": 1.7635, "step": 26705 }, { "epoch": 4.78, "grad_norm": 1.2794963121414185, "learning_rate": 2.392709719016312e-07, "loss": 1.8667, "step": 26710 }, { "epoch": 4.78, "grad_norm": 1.555069923400879, "learning_rate": 2.3733428214451903e-07, "loss": 1.7455, "step": 26715 }, { "epoch": 4.78, "grad_norm": 0.6522120833396912, "learning_rate": 2.354054247947518e-07, "loss": 1.8311, "step": 26720 }, { "epoch": 4.78, "grad_norm": 0.864811360836029, "learning_rate": 2.3348440046243035e-07, "loss": 1.8641, "step": 26725 }, { "epoch": 4.78, "grad_norm": 0.828033983707428, "learning_rate": 2.3157120975516867e-07, "loss": 1.8355, "step": 26730 }, { "epoch": 4.79, "grad_norm": 0.6203212141990662, "learning_rate": 2.2966585327811041e-07, "loss": 2.0341, "step": 26735 }, { "epoch": 4.79, "grad_norm": 0.5540245771408081, "learning_rate": 2.2776833163391796e-07, "loss": 2.0733, "step": 26740 }, { "epoch": 4.79, "grad_norm": 0.819915235042572, "learning_rate": 2.2587864542277237e-07, "loss": 1.8464, "step": 26745 }, { "epoch": 4.79, "grad_norm": 3.0156285762786865, "learning_rate": 2.239967952423816e-07, "loss": 1.5715, "step": 26750 }, { "epoch": 4.79, "grad_norm": 0.48900169134140015, "learning_rate": 2.2212278168797507e-07, "loss": 1.6596, "step": 26755 }, { "epoch": 4.79, "grad_norm": 0.8913301825523376, "learning_rate": 2.202566053522953e-07, "loss": 1.7543, "step": 26760 }, { "epoch": 4.79, "grad_norm": 0.641745924949646, "learning_rate": 2.1839826682562015e-07, "loss": 2.1231, "step": 26765 }, { "epoch": 4.79, "grad_norm": 0.7494519352912903, "learning_rate": 2.1654776669573496e-07, "loss": 1.89, "step": 26770 }, { "epoch": 4.79, "grad_norm": 1.3078620433807373, "learning_rate": 2.147051055479521e-07, "loss": 1.4582, "step": 26775 }, { "epoch": 4.79, "grad_norm": 3.589418411254883, "learning_rate": 2.1287028396510534e-07, "loss": 1.5499, "step": 26780 }, { "epoch": 4.79, "grad_norm": 1.803573489189148, "learning_rate": 2.1104330252754435e-07, "loss": 1.9718, "step": 26785 }, { "epoch": 4.8, "grad_norm": 0.8699598908424377, "learning_rate": 2.092241618131402e-07, "loss": 1.8986, "step": 26790 }, { "epoch": 4.8, "grad_norm": 0.7145684361457825, "learning_rate": 2.0741286239729097e-07, "loss": 1.6495, "step": 26795 }, { "epoch": 4.8, "grad_norm": 1.2166204452514648, "learning_rate": 2.0560940485290782e-07, "loss": 1.5914, "step": 26800 }, { "epoch": 4.8, "grad_norm": 0.681698739528656, "learning_rate": 2.0381378975041775e-07, "loss": 1.5633, "step": 26805 }, { "epoch": 4.8, "grad_norm": 0.5658136606216431, "learning_rate": 2.0202601765777762e-07, "loss": 1.8271, "step": 26810 }, { "epoch": 4.8, "grad_norm": 0.5894467234611511, "learning_rate": 2.0024608914045173e-07, "loss": 1.8671, "step": 26815 }, { "epoch": 4.8, "grad_norm": 1.0941160917282104, "learning_rate": 1.9847400476143695e-07, "loss": 1.679, "step": 26820 }, { "epoch": 4.8, "grad_norm": 1.179622769355774, "learning_rate": 1.9670976508123495e-07, "loss": 2.0309, "step": 26825 }, { "epoch": 4.8, "grad_norm": 1.4097144603729248, "learning_rate": 1.9495337065787988e-07, "loss": 2.1893, "step": 26830 }, { "epoch": 4.8, "grad_norm": 0.7634019255638123, "learning_rate": 1.9320482204691348e-07, "loss": 1.7503, "step": 26835 }, { "epoch": 4.8, "grad_norm": 0.8397713303565979, "learning_rate": 1.914641198013989e-07, "loss": 2.0055, "step": 26840 }, { "epoch": 4.81, "grad_norm": 0.6048543453216553, "learning_rate": 1.8973126447192345e-07, "loss": 2.0813, "step": 26845 }, { "epoch": 4.81, "grad_norm": 0.7331050634384155, "learning_rate": 1.880062566065821e-07, "loss": 1.8715, "step": 26850 }, { "epoch": 4.81, "grad_norm": 0.8746078014373779, "learning_rate": 1.862890967509967e-07, "loss": 1.719, "step": 26855 }, { "epoch": 4.81, "grad_norm": 1.2344975471496582, "learning_rate": 1.8457978544830224e-07, "loss": 2.062, "step": 26860 }, { "epoch": 4.81, "grad_norm": 0.6543428301811218, "learning_rate": 1.8287832323915233e-07, "loss": 1.6365, "step": 26865 }, { "epoch": 4.81, "grad_norm": 0.6258736252784729, "learning_rate": 1.8118471066171648e-07, "loss": 1.6869, "step": 26870 }, { "epoch": 4.81, "grad_norm": 0.3230958580970764, "learning_rate": 1.7949894825168568e-07, "loss": 1.9677, "step": 26875 }, { "epoch": 4.81, "grad_norm": 0.5818564891815186, "learning_rate": 1.778210365422611e-07, "loss": 1.7614, "step": 26880 }, { "epoch": 4.81, "grad_norm": 0.5392193794250488, "learning_rate": 1.761509760641683e-07, "loss": 1.8393, "step": 26885 }, { "epoch": 4.81, "grad_norm": 0.8611071109771729, "learning_rate": 1.744887673456458e-07, "loss": 1.7183, "step": 26890 }, { "epoch": 4.81, "grad_norm": 1.2561523914337158, "learning_rate": 1.7283441091244523e-07, "loss": 1.9883, "step": 26895 }, { "epoch": 4.81, "grad_norm": 0.6996719837188721, "learning_rate": 1.7118790728783973e-07, "loss": 2.3327, "step": 26900 }, { "epoch": 4.82, "grad_norm": 0.7118642330169678, "learning_rate": 1.6954925699261825e-07, "loss": 1.9309, "step": 26905 }, { "epoch": 4.82, "grad_norm": 1.1299420595169067, "learning_rate": 1.6791846054508008e-07, "loss": 1.8533, "step": 26910 }, { "epoch": 4.82, "grad_norm": 1.176224946975708, "learning_rate": 1.6629551846104874e-07, "loss": 1.7047, "step": 26915 }, { "epoch": 4.82, "grad_norm": 0.4716276228427887, "learning_rate": 1.646804312538608e-07, "loss": 2.2831, "step": 26920 }, { "epoch": 4.82, "grad_norm": 0.8160402774810791, "learning_rate": 1.630731994343604e-07, "loss": 1.8131, "step": 26925 }, { "epoch": 4.82, "grad_norm": 1.1298454999923706, "learning_rate": 1.6147382351091865e-07, "loss": 1.829, "step": 26930 }, { "epoch": 4.82, "grad_norm": 1.2337285280227661, "learning_rate": 1.5988230398941428e-07, "loss": 1.9875, "step": 26935 }, { "epoch": 4.82, "grad_norm": 0.5422369837760925, "learning_rate": 1.5829864137324457e-07, "loss": 2.1579, "step": 26940 }, { "epoch": 4.82, "grad_norm": 0.5247811079025269, "learning_rate": 1.5672283616331717e-07, "loss": 1.7073, "step": 26945 }, { "epoch": 4.82, "grad_norm": 0.9248299598693848, "learning_rate": 1.5515488885806394e-07, "loss": 1.8775, "step": 26950 }, { "epoch": 4.82, "grad_norm": 0.7368882298469543, "learning_rate": 1.535947999534215e-07, "loss": 1.7339, "step": 26955 }, { "epoch": 4.83, "grad_norm": 1.1814441680908203, "learning_rate": 1.5204256994284238e-07, "loss": 1.7054, "step": 26960 }, { "epoch": 4.83, "grad_norm": 0.6178357005119324, "learning_rate": 1.504981993172977e-07, "loss": 2.0362, "step": 26965 }, { "epoch": 4.83, "grad_norm": 0.6023995280265808, "learning_rate": 1.489616885652717e-07, "loss": 2.0766, "step": 26970 }, { "epoch": 4.83, "grad_norm": 0.8028101921081543, "learning_rate": 1.4743303817275899e-07, "loss": 2.2482, "step": 26975 }, { "epoch": 4.83, "grad_norm": 0.9648069143295288, "learning_rate": 1.459122486232728e-07, "loss": 1.8054, "step": 26980 }, { "epoch": 4.83, "grad_norm": 1.3614518642425537, "learning_rate": 1.4439932039783665e-07, "loss": 1.6154, "step": 26985 }, { "epoch": 4.83, "grad_norm": 1.080051302909851, "learning_rate": 1.4289425397498445e-07, "loss": 1.6912, "step": 26990 }, { "epoch": 4.83, "grad_norm": 0.6755841374397278, "learning_rate": 1.413970498307715e-07, "loss": 1.7686, "step": 26995 }, { "epoch": 4.83, "grad_norm": 12.691487312316895, "learning_rate": 1.3990770843876067e-07, "loss": 1.681, "step": 27000 }, { "epoch": 4.83, "grad_norm": 0.9654897451400757, "learning_rate": 1.3842623027002787e-07, "loss": 1.8103, "step": 27005 }, { "epoch": 4.83, "grad_norm": 1.236765742301941, "learning_rate": 1.3695261579316777e-07, "loss": 1.5977, "step": 27010 }, { "epoch": 4.84, "grad_norm": 2.983935594558716, "learning_rate": 1.3548686547427692e-07, "loss": 1.7417, "step": 27015 }, { "epoch": 4.84, "grad_norm": 0.7265644669532776, "learning_rate": 1.3402897977697614e-07, "loss": 1.9217, "step": 27020 }, { "epoch": 4.84, "grad_norm": 1.0526750087738037, "learning_rate": 1.3257895916238828e-07, "loss": 1.8388, "step": 27025 }, { "epoch": 4.84, "grad_norm": 0.9290766716003418, "learning_rate": 1.3113680408915752e-07, "loss": 1.7867, "step": 27030 }, { "epoch": 4.84, "grad_norm": 0.6184346079826355, "learning_rate": 1.2970251501343844e-07, "loss": 1.7741, "step": 27035 }, { "epoch": 4.84, "grad_norm": 1.0642505884170532, "learning_rate": 1.2827609238888762e-07, "loss": 1.7068, "step": 27040 }, { "epoch": 4.84, "grad_norm": 0.8582693338394165, "learning_rate": 1.2685753666668576e-07, "loss": 1.4969, "step": 27045 }, { "epoch": 4.84, "grad_norm": 0.6961191892623901, "learning_rate": 1.2544684829552122e-07, "loss": 1.6328, "step": 27050 }, { "epoch": 4.84, "grad_norm": 1.6947689056396484, "learning_rate": 1.2404402772159262e-07, "loss": 1.6862, "step": 27055 }, { "epoch": 4.84, "grad_norm": 0.803805947303772, "learning_rate": 1.2264907538860893e-07, "loss": 1.5166, "step": 27060 }, { "epoch": 4.84, "grad_norm": 1.7080804109573364, "learning_rate": 1.2126199173779496e-07, "loss": 1.632, "step": 27065 }, { "epoch": 4.85, "grad_norm": 0.6481025815010071, "learning_rate": 1.1988277720788034e-07, "loss": 1.9797, "step": 27070 }, { "epoch": 4.85, "grad_norm": 1.0819138288497925, "learning_rate": 1.1851143223511329e-07, "loss": 1.5857, "step": 27075 }, { "epoch": 4.85, "grad_norm": 0.7787542343139648, "learning_rate": 1.1714795725324967e-07, "loss": 1.5872, "step": 27080 }, { "epoch": 4.85, "grad_norm": 2.253755807876587, "learning_rate": 1.1579235269355005e-07, "loss": 1.7227, "step": 27085 }, { "epoch": 4.85, "grad_norm": 0.5482271313667297, "learning_rate": 1.1444461898479364e-07, "loss": 1.7388, "step": 27090 }, { "epoch": 4.85, "grad_norm": 1.8450013399124146, "learning_rate": 1.1310475655326724e-07, "loss": 1.908, "step": 27095 }, { "epoch": 4.85, "grad_norm": 0.5819000601768494, "learning_rate": 1.1177276582276797e-07, "loss": 1.613, "step": 27100 }, { "epoch": 4.85, "grad_norm": 0.8016330003738403, "learning_rate": 1.1044864721460602e-07, "loss": 1.6827, "step": 27105 }, { "epoch": 4.85, "grad_norm": 1.288928508758545, "learning_rate": 1.091324011475936e-07, "loss": 1.846, "step": 27110 }, { "epoch": 4.85, "grad_norm": 1.4904510974884033, "learning_rate": 1.0782402803806158e-07, "loss": 2.0203, "step": 27115 }, { "epoch": 4.85, "grad_norm": 2.6243813037872314, "learning_rate": 1.0652352829984558e-07, "loss": 1.6311, "step": 27120 }, { "epoch": 4.86, "grad_norm": 2.045247793197632, "learning_rate": 1.0523090234429434e-07, "loss": 2.0991, "step": 27125 }, { "epoch": 4.86, "grad_norm": 0.736707329750061, "learning_rate": 1.0394615058026414e-07, "loss": 1.7908, "step": 27130 }, { "epoch": 4.86, "grad_norm": 1.2909306287765503, "learning_rate": 1.0266927341411602e-07, "loss": 2.2229, "step": 27135 }, { "epoch": 4.86, "grad_norm": 0.4913961887359619, "learning_rate": 1.0140027124973251e-07, "loss": 1.8207, "step": 27140 }, { "epoch": 4.86, "grad_norm": 0.5628072023391724, "learning_rate": 1.0013914448848971e-07, "loss": 2.0149, "step": 27145 }, { "epoch": 4.86, "grad_norm": 0.8567386865615845, "learning_rate": 9.8885893529288e-08, "loss": 2.0128, "step": 27150 }, { "epoch": 4.86, "grad_norm": 0.6592267155647278, "learning_rate": 9.764051876852421e-08, "loss": 2.0507, "step": 27155 }, { "epoch": 4.86, "grad_norm": 1.0533310174942017, "learning_rate": 9.640302060011375e-08, "loss": 1.8298, "step": 27160 }, { "epoch": 4.86, "grad_norm": 1.0910136699676514, "learning_rate": 9.517339941547132e-08, "loss": 1.7826, "step": 27165 }, { "epoch": 4.86, "grad_norm": 1.4886125326156616, "learning_rate": 9.395165560352748e-08, "loss": 1.424, "step": 27170 }, { "epoch": 4.86, "grad_norm": 0.8163747191429138, "learning_rate": 9.273778955072032e-08, "loss": 1.8783, "step": 27175 }, { "epoch": 4.87, "grad_norm": 0.813804566860199, "learning_rate": 9.153180164098995e-08, "loss": 1.9613, "step": 27180 }, { "epoch": 4.87, "grad_norm": 1.107713222503662, "learning_rate": 9.033369225579514e-08, "loss": 1.783, "step": 27185 }, { "epoch": 4.87, "grad_norm": 1.2432942390441895, "learning_rate": 8.914346177409105e-08, "loss": 1.7609, "step": 27190 }, { "epoch": 4.87, "grad_norm": 0.697809636592865, "learning_rate": 8.79611105723488e-08, "loss": 1.8995, "step": 27195 }, { "epoch": 4.87, "grad_norm": 0.8270529508590698, "learning_rate": 8.67866390245442e-08, "loss": 1.8255, "step": 27200 }, { "epoch": 4.87, "grad_norm": 0.9013446569442749, "learning_rate": 8.562004750216346e-08, "loss": 1.7947, "step": 27205 }, { "epoch": 4.87, "grad_norm": 1.4320096969604492, "learning_rate": 8.446133637419751e-08, "loss": 1.6338, "step": 27210 }, { "epoch": 4.87, "grad_norm": 0.9843838810920715, "learning_rate": 8.331050600714485e-08, "loss": 1.8977, "step": 27215 }, { "epoch": 4.87, "grad_norm": 1.381026029586792, "learning_rate": 8.216755676501431e-08, "loss": 1.7343, "step": 27220 }, { "epoch": 4.87, "grad_norm": 1.0219390392303467, "learning_rate": 8.103248900931393e-08, "loss": 1.7915, "step": 27225 }, { "epoch": 4.87, "grad_norm": 1.4810404777526855, "learning_rate": 7.99053030990704e-08, "loss": 1.9256, "step": 27230 }, { "epoch": 4.87, "grad_norm": 1.307970643043518, "learning_rate": 7.878599939080689e-08, "loss": 1.9488, "step": 27235 }, { "epoch": 4.88, "grad_norm": 0.675598680973053, "learning_rate": 7.767457823856239e-08, "loss": 1.6861, "step": 27240 }, { "epoch": 4.88, "grad_norm": 1.2670272588729858, "learning_rate": 7.657103999387794e-08, "loss": 1.7473, "step": 27245 }, { "epoch": 4.88, "grad_norm": 1.0107417106628418, "learning_rate": 7.547538500579932e-08, "loss": 1.5708, "step": 27250 }, { "epoch": 4.88, "grad_norm": 0.8755956888198853, "learning_rate": 7.438761362087987e-08, "loss": 1.728, "step": 27255 }, { "epoch": 4.88, "grad_norm": 1.5213618278503418, "learning_rate": 7.330772618318604e-08, "loss": 1.6688, "step": 27260 }, { "epoch": 4.88, "grad_norm": 0.8350248336791992, "learning_rate": 7.223572303428072e-08, "loss": 1.5138, "step": 27265 }, { "epoch": 4.88, "grad_norm": 0.5836665630340576, "learning_rate": 7.11716045132399e-08, "loss": 1.6876, "step": 27270 }, { "epoch": 4.88, "grad_norm": 0.3720304071903229, "learning_rate": 7.011537095663878e-08, "loss": 2.2579, "step": 27275 }, { "epoch": 4.88, "grad_norm": 0.8098456859588623, "learning_rate": 6.906702269856846e-08, "loss": 1.8464, "step": 27280 }, { "epoch": 4.88, "grad_norm": 1.2636229991912842, "learning_rate": 6.802656007061925e-08, "loss": 1.9191, "step": 27285 }, { "epoch": 4.88, "grad_norm": 0.7597413063049316, "learning_rate": 6.699398340188623e-08, "loss": 1.9295, "step": 27290 }, { "epoch": 4.89, "grad_norm": 0.806469738483429, "learning_rate": 6.596929301897481e-08, "loss": 1.8776, "step": 27295 }, { "epoch": 4.89, "grad_norm": 0.9175061583518982, "learning_rate": 6.49524892459924e-08, "loss": 1.7537, "step": 27300 }, { "epoch": 4.89, "grad_norm": 1.0410821437835693, "learning_rate": 6.394357240455118e-08, "loss": 1.7922, "step": 27305 }, { "epoch": 4.89, "grad_norm": 0.9964597821235657, "learning_rate": 6.294254281377366e-08, "loss": 1.9924, "step": 27310 }, { "epoch": 4.89, "grad_norm": 0.8122162818908691, "learning_rate": 6.194940079028432e-08, "loss": 1.8831, "step": 27315 }, { "epoch": 4.89, "grad_norm": 2.2000720500946045, "learning_rate": 6.096414664821248e-08, "loss": 1.7875, "step": 27320 }, { "epoch": 4.89, "grad_norm": 0.8936071395874023, "learning_rate": 5.998678069919216e-08, "loss": 1.8224, "step": 27325 }, { "epoch": 4.89, "grad_norm": 0.8874375224113464, "learning_rate": 5.901730325236221e-08, "loss": 2.0273, "step": 27330 }, { "epoch": 4.89, "grad_norm": 1.6214640140533447, "learning_rate": 5.805571461437176e-08, "loss": 1.7428, "step": 27335 }, { "epoch": 4.89, "grad_norm": 1.3129786252975464, "learning_rate": 5.7102015089366415e-08, "loss": 1.6939, "step": 27340 }, { "epoch": 4.89, "grad_norm": 1.0049517154693604, "learning_rate": 5.615620497900209e-08, "loss": 1.6544, "step": 27345 }, { "epoch": 4.9, "grad_norm": 0.76762455701828, "learning_rate": 5.521828458243672e-08, "loss": 1.9076, "step": 27350 }, { "epoch": 4.9, "grad_norm": 0.8442772030830383, "learning_rate": 5.428825419633299e-08, "loss": 1.7669, "step": 27355 }, { "epoch": 4.9, "grad_norm": 1.0416758060455322, "learning_rate": 5.336611411486114e-08, "loss": 2.1945, "step": 27360 }, { "epoch": 4.9, "grad_norm": 0.9710814356803894, "learning_rate": 5.245186462969065e-08, "loss": 1.6822, "step": 27365 }, { "epoch": 4.9, "grad_norm": 0.7552046775817871, "learning_rate": 5.15455060300013e-08, "loss": 1.9038, "step": 27370 }, { "epoch": 4.9, "grad_norm": 0.8535296320915222, "learning_rate": 5.064703860246933e-08, "loss": 1.8449, "step": 27375 }, { "epoch": 4.9, "grad_norm": 5.469326019287109, "learning_rate": 4.975646263128131e-08, "loss": 1.8349, "step": 27380 }, { "epoch": 4.9, "grad_norm": 0.8397666215896606, "learning_rate": 4.8873778398123015e-08, "loss": 1.7978, "step": 27385 }, { "epoch": 4.9, "grad_norm": 0.7340749502182007, "learning_rate": 4.799898618219057e-08, "loss": 1.7184, "step": 27390 }, { "epoch": 4.9, "grad_norm": 0.6980647444725037, "learning_rate": 4.713208626017929e-08, "loss": 1.8878, "step": 27395 }, { "epoch": 4.9, "grad_norm": 0.6763983964920044, "learning_rate": 4.627307890628374e-08, "loss": 2.0292, "step": 27400 }, { "epoch": 4.91, "grad_norm": 2.8043484687805176, "learning_rate": 4.5421964392214354e-08, "loss": 1.76, "step": 27405 }, { "epoch": 4.91, "grad_norm": 1.528351068496704, "learning_rate": 4.457874298717246e-08, "loss": 1.8167, "step": 27410 }, { "epoch": 4.91, "grad_norm": 1.0853112936019897, "learning_rate": 4.3743414957872485e-08, "loss": 2.0553, "step": 27415 }, { "epoch": 4.91, "grad_norm": 0.819485068321228, "learning_rate": 4.291598056852253e-08, "loss": 1.9577, "step": 27420 }, { "epoch": 4.91, "grad_norm": 0.8028863072395325, "learning_rate": 4.209644008084379e-08, "loss": 1.927, "step": 27425 }, { "epoch": 4.91, "grad_norm": 0.6567749977111816, "learning_rate": 4.128479375405392e-08, "loss": 1.9202, "step": 27430 }, { "epoch": 4.91, "grad_norm": 1.1371631622314453, "learning_rate": 4.048104184487811e-08, "loss": 2.0074, "step": 27435 }, { "epoch": 4.91, "grad_norm": 0.6734791994094849, "learning_rate": 3.968518460753801e-08, "loss": 1.7147, "step": 27440 }, { "epoch": 4.91, "grad_norm": 1.4970152378082275, "learning_rate": 3.8897222293768356e-08, "loss": 1.8979, "step": 27445 }, { "epoch": 4.91, "grad_norm": 0.4336738586425781, "learning_rate": 3.8117155152797567e-08, "loss": 1.8246, "step": 27450 }, { "epoch": 4.91, "grad_norm": 1.3205974102020264, "learning_rate": 3.734498343135884e-08, "loss": 1.837, "step": 27455 }, { "epoch": 4.92, "grad_norm": 1.5103232860565186, "learning_rate": 3.658070737369013e-08, "loss": 1.7212, "step": 27460 }, { "epoch": 4.92, "grad_norm": 0.7287768721580505, "learning_rate": 3.5824327221534174e-08, "loss": 1.9573, "step": 27465 }, { "epoch": 4.92, "grad_norm": 1.077132225036621, "learning_rate": 3.507584321412738e-08, "loss": 1.9364, "step": 27470 }, { "epoch": 4.92, "grad_norm": 0.7396588921546936, "learning_rate": 3.433525558822204e-08, "loss": 1.8377, "step": 27475 }, { "epoch": 4.92, "grad_norm": 0.9306398034095764, "learning_rate": 3.360256457805855e-08, "loss": 1.8822, "step": 27480 }, { "epoch": 4.92, "grad_norm": 1.1197668313980103, "learning_rate": 3.287777041539042e-08, "loss": 1.9135, "step": 27485 }, { "epoch": 4.92, "grad_norm": 1.111188292503357, "learning_rate": 3.2160873329470374e-08, "loss": 1.8155, "step": 27490 }, { "epoch": 4.92, "grad_norm": 0.7195913791656494, "learning_rate": 3.145187354704759e-08, "loss": 1.7282, "step": 27495 }, { "epoch": 4.92, "grad_norm": 1.0142110586166382, "learning_rate": 3.075077129238158e-08, "loss": 1.9826, "step": 27500 }, { "epoch": 4.92, "grad_norm": 0.8807904720306396, "learning_rate": 3.005756678722826e-08, "loss": 1.9266, "step": 27505 }, { "epoch": 4.92, "grad_norm": 0.3830811381340027, "learning_rate": 2.9372260250848384e-08, "loss": 1.7985, "step": 27510 }, { "epoch": 4.93, "grad_norm": 1.0831133127212524, "learning_rate": 2.869485190000465e-08, "loss": 1.6724, "step": 27515 }, { "epoch": 4.93, "grad_norm": 1.3438478708267212, "learning_rate": 2.8025341948959007e-08, "loss": 1.7674, "step": 27520 }, { "epoch": 4.93, "grad_norm": 0.8136791586875916, "learning_rate": 2.7363730609478166e-08, "loss": 1.8693, "step": 27525 }, { "epoch": 4.93, "grad_norm": 0.6372392177581787, "learning_rate": 2.6710018090828072e-08, "loss": 1.7738, "step": 27530 }, { "epoch": 4.93, "grad_norm": 0.6655257344245911, "learning_rate": 2.606420459977943e-08, "loss": 1.7049, "step": 27535 }, { "epoch": 4.93, "grad_norm": 0.8369601964950562, "learning_rate": 2.555324124416669e-08, "loss": 1.6822, "step": 27540 }, { "epoch": 4.93, "grad_norm": 0.9216079711914062, "learning_rate": 2.4921646515879692e-08, "loss": 1.6634, "step": 27545 }, { "epoch": 4.93, "grad_norm": 0.874138593673706, "learning_rate": 2.429795138085278e-08, "loss": 1.948, "step": 27550 }, { "epoch": 4.93, "grad_norm": 0.4770451784133911, "learning_rate": 2.3682156036358704e-08, "loss": 1.7761, "step": 27555 }, { "epoch": 4.93, "grad_norm": 0.8585416078567505, "learning_rate": 2.3074260677177773e-08, "loss": 1.7923, "step": 27560 }, { "epoch": 4.93, "grad_norm": 0.9223566055297852, "learning_rate": 2.247426549558118e-08, "loss": 1.7544, "step": 27565 }, { "epoch": 4.93, "grad_norm": 3.3756611347198486, "learning_rate": 2.188217068134768e-08, "loss": 1.961, "step": 27570 }, { "epoch": 4.94, "grad_norm": 0.6999983191490173, "learning_rate": 2.129797642176079e-08, "loss": 1.8334, "step": 27575 }, { "epoch": 4.94, "grad_norm": 0.6343751549720764, "learning_rate": 2.0721682901594928e-08, "loss": 2.1177, "step": 27580 }, { "epoch": 4.94, "grad_norm": 1.4471246004104614, "learning_rate": 2.0153290303134843e-08, "loss": 1.4078, "step": 27585 }, { "epoch": 4.94, "grad_norm": 0.7915092706680298, "learning_rate": 1.9592798806161716e-08, "loss": 1.6325, "step": 27590 }, { "epoch": 4.94, "grad_norm": 0.42571014165878296, "learning_rate": 1.9040208587958742e-08, "loss": 2.0373, "step": 27595 }, { "epoch": 4.94, "grad_norm": 1.225578784942627, "learning_rate": 1.8495519823308327e-08, "loss": 1.7639, "step": 27600 }, { "epoch": 4.94, "grad_norm": 3.2549374103546143, "learning_rate": 1.7958732684497657e-08, "loss": 1.7882, "step": 27605 }, { "epoch": 4.94, "grad_norm": 0.732199490070343, "learning_rate": 1.7429847341307594e-08, "loss": 1.9269, "step": 27610 }, { "epoch": 4.94, "grad_norm": 0.9305698871612549, "learning_rate": 1.6908863961026534e-08, "loss": 1.9614, "step": 27615 }, { "epoch": 4.94, "grad_norm": 1.5303813219070435, "learning_rate": 1.6395782708442108e-08, "loss": 1.7238, "step": 27620 }, { "epoch": 4.94, "grad_norm": 1.0078951120376587, "learning_rate": 1.589060374584117e-08, "loss": 1.7562, "step": 27625 }, { "epoch": 4.95, "grad_norm": 0.5599997639656067, "learning_rate": 1.539332723300979e-08, "loss": 1.6895, "step": 27630 }, { "epoch": 4.95, "grad_norm": 1.057673692703247, "learning_rate": 1.4903953327238818e-08, "loss": 1.6365, "step": 27635 }, { "epoch": 4.95, "grad_norm": 1.3415483236312866, "learning_rate": 1.442248218331277e-08, "loss": 1.8398, "step": 27640 }, { "epoch": 4.95, "grad_norm": 2.2374091148376465, "learning_rate": 1.3948913953523713e-08, "loss": 1.6291, "step": 27645 }, { "epoch": 4.95, "grad_norm": 0.6912126541137695, "learning_rate": 1.3483248787657387e-08, "loss": 1.5426, "step": 27650 }, { "epoch": 4.95, "grad_norm": 0.8157421350479126, "learning_rate": 1.3025486833007083e-08, "loss": 1.8275, "step": 27655 }, { "epoch": 4.95, "grad_norm": 0.5781735181808472, "learning_rate": 1.257562823435976e-08, "loss": 1.7979, "step": 27660 }, { "epoch": 4.95, "grad_norm": 0.9726621508598328, "learning_rate": 1.2133673134007152e-08, "loss": 1.719, "step": 27665 }, { "epoch": 4.95, "grad_norm": 0.7362151145935059, "learning_rate": 1.1699621671740212e-08, "loss": 1.5477, "step": 27670 }, { "epoch": 4.95, "grad_norm": 0.6761454343795776, "learning_rate": 1.1273473984843575e-08, "loss": 2.0092, "step": 27675 }, { "epoch": 4.95, "grad_norm": 0.835422158241272, "learning_rate": 1.0855230208114964e-08, "loss": 1.7663, "step": 27680 }, { "epoch": 4.96, "grad_norm": 0.9953777194023132, "learning_rate": 1.0444890473837454e-08, "loss": 1.5584, "step": 27685 }, { "epoch": 4.96, "grad_norm": 0.6336172223091125, "learning_rate": 1.0042454911804444e-08, "loss": 1.968, "step": 27690 }, { "epoch": 4.96, "grad_norm": 1.8674088716506958, "learning_rate": 9.647923649308554e-09, "loss": 1.7018, "step": 27695 }, { "epoch": 4.96, "grad_norm": 0.79061359167099, "learning_rate": 9.261296811133301e-09, "loss": 2.1185, "step": 27700 }, { "epoch": 4.96, "grad_norm": 0.5622634887695312, "learning_rate": 8.882574519572529e-09, "loss": 1.7226, "step": 27705 }, { "epoch": 4.96, "grad_norm": 0.5330947637557983, "learning_rate": 8.511756894416523e-09, "loss": 1.8869, "step": 27710 }, { "epoch": 4.96, "grad_norm": 0.9711994528770447, "learning_rate": 8.14884405295202e-09, "loss": 1.9226, "step": 27715 }, { "epoch": 4.96, "grad_norm": 0.642650842666626, "learning_rate": 7.793836109970532e-09, "loss": 1.8734, "step": 27720 }, { "epoch": 4.96, "grad_norm": 0.5877659916877747, "learning_rate": 7.446733177757237e-09, "loss": 1.7147, "step": 27725 }, { "epoch": 4.96, "grad_norm": 0.6836099028587341, "learning_rate": 7.107535366102092e-09, "loss": 1.6646, "step": 27730 }, { "epoch": 4.96, "grad_norm": 0.694749653339386, "learning_rate": 6.776242782297049e-09, "loss": 1.6795, "step": 27735 }, { "epoch": 4.97, "grad_norm": 1.7434329986572266, "learning_rate": 6.452855531122182e-09, "loss": 1.8301, "step": 27740 }, { "epoch": 4.97, "grad_norm": 1.0804805755615234, "learning_rate": 6.137373714870664e-09, "loss": 1.6298, "step": 27745 }, { "epoch": 4.97, "grad_norm": 0.8923995494842529, "learning_rate": 5.829797433323791e-09, "loss": 1.7, "step": 27750 }, { "epoch": 4.97, "grad_norm": 1.1635453701019287, "learning_rate": 5.530126783773182e-09, "loss": 1.6506, "step": 27755 }, { "epoch": 4.97, "grad_norm": 2.5559163093566895, "learning_rate": 5.238361861001351e-09, "loss": 1.4222, "step": 27760 }, { "epoch": 4.97, "grad_norm": 0.6483443975448608, "learning_rate": 4.954502757295587e-09, "loss": 2.0705, "step": 27765 }, { "epoch": 4.97, "grad_norm": 0.3663721978664398, "learning_rate": 4.678549562434076e-09, "loss": 1.8228, "step": 27770 }, { "epoch": 4.97, "grad_norm": 0.8673094511032104, "learning_rate": 4.410502363708102e-09, "loss": 1.8538, "step": 27775 }, { "epoch": 4.97, "grad_norm": 1.0372596979141235, "learning_rate": 4.150361245897072e-09, "loss": 1.7484, "step": 27780 }, { "epoch": 4.97, "grad_norm": 0.5771445035934448, "learning_rate": 3.898126291285164e-09, "loss": 1.9486, "step": 27785 }, { "epoch": 4.97, "grad_norm": 0.7575253844261169, "learning_rate": 3.653797579650231e-09, "loss": 1.8841, "step": 27790 }, { "epoch": 4.98, "grad_norm": 0.6591800451278687, "learning_rate": 3.417375188274896e-09, "loss": 2.1256, "step": 27795 }, { "epoch": 4.98, "grad_norm": 0.8006610870361328, "learning_rate": 3.1888591919437826e-09, "loss": 2.0, "step": 27800 }, { "epoch": 4.98, "grad_norm": 1.2859355211257935, "learning_rate": 2.968249662929634e-09, "loss": 1.8523, "step": 27805 }, { "epoch": 4.98, "grad_norm": 0.5861554145812988, "learning_rate": 2.755546671015519e-09, "loss": 1.7343, "step": 27810 }, { "epoch": 4.98, "grad_norm": 0.8748874068260193, "learning_rate": 2.550750283478176e-09, "loss": 1.8467, "step": 27815 }, { "epoch": 4.98, "grad_norm": 0.9922271966934204, "learning_rate": 2.353860565090793e-09, "loss": 1.5163, "step": 27820 }, { "epoch": 4.98, "grad_norm": 1.116941213607788, "learning_rate": 2.164877578136881e-09, "loss": 1.9498, "step": 27825 }, { "epoch": 4.98, "grad_norm": 1.195860743522644, "learning_rate": 1.9838013823852974e-09, "loss": 1.593, "step": 27830 }, { "epoch": 4.98, "grad_norm": 0.7228025794029236, "learning_rate": 1.8106320351124472e-09, "loss": 2.0734, "step": 27835 }, { "epoch": 4.98, "grad_norm": 0.9624055624008179, "learning_rate": 1.6453695910911837e-09, "loss": 1.6158, "step": 27840 }, { "epoch": 4.98, "grad_norm": 1.733687162399292, "learning_rate": 1.4880141025935824e-09, "loss": 2.0339, "step": 27845 }, { "epoch": 4.99, "grad_norm": 0.7384397983551025, "learning_rate": 1.3385656193909413e-09, "loss": 1.9362, "step": 27850 }, { "epoch": 4.99, "grad_norm": 1.2739673852920532, "learning_rate": 1.197024188756557e-09, "loss": 1.6635, "step": 27855 }, { "epoch": 4.99, "grad_norm": 0.9207239747047424, "learning_rate": 1.0633898554573972e-09, "loss": 1.7917, "step": 27860 }, { "epoch": 4.99, "grad_norm": 2.776045560836792, "learning_rate": 9.376626617624286e-10, "loss": 1.9558, "step": 27865 }, { "epoch": 4.99, "grad_norm": 0.9185320734977722, "learning_rate": 8.198426474370635e-10, "loss": 1.8871, "step": 27870 }, { "epoch": 4.99, "grad_norm": 0.9698821306228638, "learning_rate": 7.099298497487139e-10, "loss": 1.5789, "step": 27875 }, { "epoch": 4.99, "grad_norm": 1.133810043334961, "learning_rate": 6.079243034667891e-10, "loss": 1.7469, "step": 27880 }, { "epoch": 4.99, "grad_norm": 0.8469708561897278, "learning_rate": 5.138260408488194e-10, "loss": 2.05, "step": 27885 }, { "epoch": 4.99, "grad_norm": 1.0808436870574951, "learning_rate": 4.2763509166265927e-10, "loss": 1.9934, "step": 27890 }, { "epoch": 4.99, "grad_norm": 1.0401432514190674, "learning_rate": 3.4935148316705967e-10, "loss": 1.8931, "step": 27895 }, { "epoch": 4.99, "grad_norm": 0.6832406520843506, "learning_rate": 2.789752401283208e-10, "loss": 2.1373, "step": 27900 }, { "epoch": 4.99, "grad_norm": 0.7210304141044617, "learning_rate": 2.1650638480086305e-10, "loss": 1.9971, "step": 27905 }, { "epoch": 5.0, "grad_norm": 1.0963140726089478, "learning_rate": 1.6194493694665635e-10, "loss": 1.7998, "step": 27910 }, { "epoch": 5.0, "grad_norm": 0.7948571443557739, "learning_rate": 1.1529091382134205e-10, "loss": 1.9675, "step": 27915 }, { "epoch": 5.0, "grad_norm": 0.8925894498825073, "learning_rate": 7.654433018255969e-11, "loss": 1.7117, "step": 27920 }, { "epoch": 5.0, "grad_norm": 0.588504433631897, "learning_rate": 4.5705198287171456e-11, "loss": 1.6579, "step": 27925 }, { "epoch": 5.0, "grad_norm": 2.567326068878174, "learning_rate": 2.2773527888486635e-11, "loss": 1.817, "step": 27930 }, { "epoch": 5.0, "step": 27930, "total_flos": 5.219882421458043e+18, "train_loss": 1.860821307947505, "train_runtime": 129586.2862, "train_samples_per_second": 1.724, "train_steps_per_second": 0.216 } ], "logging_steps": 5, "max_steps": 27930, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "total_flos": 5.219882421458043e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }