{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5275206529312233, "eval_steps": 100, "global_step": 5300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009953219866626853, "grad_norm": 1.912980556488037, "learning_rate": 9.995023390066686e-06, "loss": 1.8703, "step": 10 }, { "epoch": 0.0019906439733253707, "grad_norm": 1.866821050643921, "learning_rate": 9.990046780133374e-06, "loss": 1.8723, "step": 20 }, { "epoch": 0.002985965959988056, "grad_norm": 2.058809280395508, "learning_rate": 9.985070170200061e-06, "loss": 1.8097, "step": 30 }, { "epoch": 0.003981287946650741, "grad_norm": 1.459013819694519, "learning_rate": 9.980093560266747e-06, "loss": 1.7456, "step": 40 }, { "epoch": 0.004976609933313427, "grad_norm": 0.9095586538314819, "learning_rate": 9.975116950333434e-06, "loss": 1.7195, "step": 50 }, { "epoch": 0.005971931919976112, "grad_norm": 1.1065226793289185, "learning_rate": 9.970140340400121e-06, "loss": 1.6502, "step": 60 }, { "epoch": 0.0069672539066387975, "grad_norm": 0.8301252126693726, "learning_rate": 9.965163730466807e-06, "loss": 1.5699, "step": 70 }, { "epoch": 0.007962575893301483, "grad_norm": 1.0762828588485718, "learning_rate": 9.960187120533493e-06, "loss": 1.5072, "step": 80 }, { "epoch": 0.008957897879964169, "grad_norm": 1.0814900398254395, "learning_rate": 9.95521051060018e-06, "loss": 1.4369, "step": 90 }, { "epoch": 0.009953219866626855, "grad_norm": 1.3561326265335083, "learning_rate": 9.950233900666867e-06, "loss": 1.3467, "step": 100 }, { "epoch": 0.009953219866626855, "eval_loss": 1.2846794128417969, "eval_runtime": 147.6242, "eval_samples_per_second": 1.375, "eval_steps_per_second": 0.691, "step": 100 }, { "epoch": 0.010948541853289539, "grad_norm": 1.438547968864441, "learning_rate": 9.945257290733553e-06, "loss": 1.2222, "step": 110 }, { "epoch": 0.011943863839952225, "grad_norm": 1.402588963508606, "learning_rate": 9.94028068080024e-06, "loss": 1.1001, "step": 120 }, { "epoch": 0.012939185826614909, "grad_norm": 1.4357985258102417, "learning_rate": 9.935304070866926e-06, "loss": 0.9657, "step": 130 }, { "epoch": 0.013934507813277595, "grad_norm": 2.137953042984009, "learning_rate": 9.930327460933613e-06, "loss": 0.8211, "step": 140 }, { "epoch": 0.014929829799940281, "grad_norm": 1.374299168586731, "learning_rate": 9.925350851000299e-06, "loss": 0.7142, "step": 150 }, { "epoch": 0.015925151786602965, "grad_norm": 1.1510456800460815, "learning_rate": 9.920374241066986e-06, "loss": 0.656, "step": 160 }, { "epoch": 0.01692047377326565, "grad_norm": 1.0226788520812988, "learning_rate": 9.915397631133673e-06, "loss": 0.6212, "step": 170 }, { "epoch": 0.017915795759928337, "grad_norm": 0.9365411400794983, "learning_rate": 9.910421021200359e-06, "loss": 0.6069, "step": 180 }, { "epoch": 0.018911117746591023, "grad_norm": 0.6880003213882446, "learning_rate": 9.905444411267046e-06, "loss": 0.6128, "step": 190 }, { "epoch": 0.01990643973325371, "grad_norm": 1.1190361976623535, "learning_rate": 9.900467801333732e-06, "loss": 0.5426, "step": 200 }, { "epoch": 0.01990643973325371, "eval_loss": 0.5788590908050537, "eval_runtime": 147.511, "eval_samples_per_second": 1.376, "eval_steps_per_second": 0.691, "step": 200 }, { "epoch": 0.02090176171991639, "grad_norm": 1.184279441833496, "learning_rate": 9.895491191400419e-06, "loss": 0.5887, "step": 210 }, { "epoch": 0.021897083706579078, "grad_norm": 0.7627615928649902, "learning_rate": 9.890514581467106e-06, "loss": 0.5433, "step": 220 }, { "epoch": 0.022892405693241764, "grad_norm": 0.7858164310455322, "learning_rate": 9.885537971533792e-06, "loss": 0.5843, "step": 230 }, { "epoch": 0.02388772767990445, "grad_norm": 0.695697009563446, "learning_rate": 9.880561361600478e-06, "loss": 0.5365, "step": 240 }, { "epoch": 0.024883049666567136, "grad_norm": 0.8994197845458984, "learning_rate": 9.875584751667165e-06, "loss": 0.5662, "step": 250 }, { "epoch": 0.025878371653229818, "grad_norm": 0.8016309142112732, "learning_rate": 9.870608141733852e-06, "loss": 0.5592, "step": 260 }, { "epoch": 0.026873693639892504, "grad_norm": 0.8534384369850159, "learning_rate": 9.865631531800538e-06, "loss": 0.5248, "step": 270 }, { "epoch": 0.02786901562655519, "grad_norm": 0.9857029914855957, "learning_rate": 9.860654921867225e-06, "loss": 0.5294, "step": 280 }, { "epoch": 0.028864337613217876, "grad_norm": 0.7766090631484985, "learning_rate": 9.855678311933912e-06, "loss": 0.5198, "step": 290 }, { "epoch": 0.029859659599880562, "grad_norm": 0.6832401752471924, "learning_rate": 9.850701702000598e-06, "loss": 0.5844, "step": 300 }, { "epoch": 0.029859659599880562, "eval_loss": 0.536589503288269, "eval_runtime": 147.4968, "eval_samples_per_second": 1.376, "eval_steps_per_second": 0.692, "step": 300 }, { "epoch": 0.030854981586543248, "grad_norm": 0.7720848917961121, "learning_rate": 9.845725092067284e-06, "loss": 0.5365, "step": 310 }, { "epoch": 0.03185030357320593, "grad_norm": 0.7022100687026978, "learning_rate": 9.840748482133971e-06, "loss": 0.4841, "step": 320 }, { "epoch": 0.03284562555986862, "grad_norm": 1.0030310153961182, "learning_rate": 9.835771872200658e-06, "loss": 0.4635, "step": 330 }, { "epoch": 0.0338409475465313, "grad_norm": 0.8628882765769958, "learning_rate": 9.830795262267344e-06, "loss": 0.4932, "step": 340 }, { "epoch": 0.034836269533193985, "grad_norm": 0.7178316712379456, "learning_rate": 9.825818652334031e-06, "loss": 0.6057, "step": 350 }, { "epoch": 0.035831591519856675, "grad_norm": 0.9564626216888428, "learning_rate": 9.820842042400718e-06, "loss": 0.5371, "step": 360 }, { "epoch": 0.03682691350651936, "grad_norm": 0.7041760683059692, "learning_rate": 9.815865432467404e-06, "loss": 0.513, "step": 370 }, { "epoch": 0.037822235493182046, "grad_norm": 1.0203750133514404, "learning_rate": 9.81088882253409e-06, "loss": 0.5118, "step": 380 }, { "epoch": 0.03881755747984473, "grad_norm": 0.8765382170677185, "learning_rate": 9.805912212600777e-06, "loss": 0.4529, "step": 390 }, { "epoch": 0.03981287946650742, "grad_norm": 0.9951983690261841, "learning_rate": 9.800935602667464e-06, "loss": 0.5336, "step": 400 }, { "epoch": 0.03981287946650742, "eval_loss": 0.5151349306106567, "eval_runtime": 147.6615, "eval_samples_per_second": 1.375, "eval_steps_per_second": 0.691, "step": 400 }, { "epoch": 0.0408082014531701, "grad_norm": 0.7691435813903809, "learning_rate": 9.79595899273415e-06, "loss": 0.506, "step": 410 }, { "epoch": 0.04180352343983278, "grad_norm": 1.1955533027648926, "learning_rate": 9.790982382800837e-06, "loss": 0.4692, "step": 420 }, { "epoch": 0.04279884542649547, "grad_norm": 1.128085732460022, "learning_rate": 9.786005772867525e-06, "loss": 0.4608, "step": 430 }, { "epoch": 0.043794167413158155, "grad_norm": 0.5518949627876282, "learning_rate": 9.78102916293421e-06, "loss": 0.5006, "step": 440 }, { "epoch": 0.044789489399820845, "grad_norm": 0.7164484858512878, "learning_rate": 9.776052553000896e-06, "loss": 0.4996, "step": 450 }, { "epoch": 0.04578481138648353, "grad_norm": 0.5959630012512207, "learning_rate": 9.771075943067583e-06, "loss": 0.4843, "step": 460 }, { "epoch": 0.04678013337314621, "grad_norm": 0.743648111820221, "learning_rate": 9.76609933313427e-06, "loss": 0.4363, "step": 470 }, { "epoch": 0.0477754553598089, "grad_norm": 0.8757079243659973, "learning_rate": 9.761122723200956e-06, "loss": 0.4665, "step": 480 }, { "epoch": 0.04877077734647158, "grad_norm": 1.0122153759002686, "learning_rate": 9.756146113267643e-06, "loss": 0.492, "step": 490 }, { "epoch": 0.04976609933313427, "grad_norm": 0.6179729700088501, "learning_rate": 9.751169503334329e-06, "loss": 0.5022, "step": 500 }, { "epoch": 0.04976609933313427, "eval_loss": 0.4993921220302582, "eval_runtime": 147.7401, "eval_samples_per_second": 1.374, "eval_steps_per_second": 0.69, "step": 500 }, { "epoch": 0.050761421319796954, "grad_norm": 0.952812671661377, "learning_rate": 9.746192893401016e-06, "loss": 0.4901, "step": 510 }, { "epoch": 0.051756743306459636, "grad_norm": 0.6715916991233826, "learning_rate": 9.741216283467702e-06, "loss": 0.5055, "step": 520 }, { "epoch": 0.052752065293122326, "grad_norm": 0.674640953540802, "learning_rate": 9.736239673534389e-06, "loss": 0.4874, "step": 530 }, { "epoch": 0.05374738727978501, "grad_norm": 0.7867962718009949, "learning_rate": 9.731263063601075e-06, "loss": 0.4956, "step": 540 }, { "epoch": 0.0547427092664477, "grad_norm": 0.9035332202911377, "learning_rate": 9.726286453667762e-06, "loss": 0.499, "step": 550 }, { "epoch": 0.05573803125311038, "grad_norm": 0.7009295225143433, "learning_rate": 9.72130984373445e-06, "loss": 0.5034, "step": 560 }, { "epoch": 0.05673335323977307, "grad_norm": 0.7018862366676331, "learning_rate": 9.716333233801135e-06, "loss": 0.5137, "step": 570 }, { "epoch": 0.05772867522643575, "grad_norm": 0.7812825441360474, "learning_rate": 9.711356623867822e-06, "loss": 0.4724, "step": 580 }, { "epoch": 0.058723997213098435, "grad_norm": 0.6245225071907043, "learning_rate": 9.70638001393451e-06, "loss": 0.4446, "step": 590 }, { "epoch": 0.059719319199761124, "grad_norm": 0.9083976149559021, "learning_rate": 9.701403404001195e-06, "loss": 0.4884, "step": 600 }, { "epoch": 0.059719319199761124, "eval_loss": 0.4891846477985382, "eval_runtime": 147.5284, "eval_samples_per_second": 1.376, "eval_steps_per_second": 0.691, "step": 600 }, { "epoch": 0.06071464118642381, "grad_norm": 0.6195352673530579, "learning_rate": 9.69642679406788e-06, "loss": 0.5121, "step": 610 }, { "epoch": 0.061709963173086496, "grad_norm": 0.8068727254867554, "learning_rate": 9.691450184134568e-06, "loss": 0.4689, "step": 620 }, { "epoch": 0.06270528515974919, "grad_norm": 1.0427749156951904, "learning_rate": 9.686473574201255e-06, "loss": 0.4968, "step": 630 }, { "epoch": 0.06370060714641186, "grad_norm": 0.698349118232727, "learning_rate": 9.681496964267941e-06, "loss": 0.4691, "step": 640 }, { "epoch": 0.06469592913307455, "grad_norm": 0.9104384183883667, "learning_rate": 9.676520354334628e-06, "loss": 0.4775, "step": 650 }, { "epoch": 0.06569125111973724, "grad_norm": 0.8729726076126099, "learning_rate": 9.671543744401316e-06, "loss": 0.5201, "step": 660 }, { "epoch": 0.06668657310639992, "grad_norm": 0.9858236908912659, "learning_rate": 9.666567134468001e-06, "loss": 0.4268, "step": 670 }, { "epoch": 0.0676818950930626, "grad_norm": 2.322754383087158, "learning_rate": 9.661590524534687e-06, "loss": 0.4744, "step": 680 }, { "epoch": 0.0686772170797253, "grad_norm": 0.9327623248100281, "learning_rate": 9.656613914601374e-06, "loss": 0.4355, "step": 690 }, { "epoch": 0.06967253906638797, "grad_norm": 0.6949413418769836, "learning_rate": 9.651637304668062e-06, "loss": 0.465, "step": 700 }, { "epoch": 0.06967253906638797, "eval_loss": 0.4817120432853699, "eval_runtime": 147.5643, "eval_samples_per_second": 1.376, "eval_steps_per_second": 0.691, "step": 700 }, { "epoch": 0.07066786105305066, "grad_norm": 0.5208165049552917, "learning_rate": 9.646660694734747e-06, "loss": 0.4973, "step": 710 }, { "epoch": 0.07166318303971335, "grad_norm": 0.8434884548187256, "learning_rate": 9.641684084801434e-06, "loss": 0.4721, "step": 720 }, { "epoch": 0.07265850502637604, "grad_norm": 0.7161769866943359, "learning_rate": 9.636707474868122e-06, "loss": 0.498, "step": 730 }, { "epoch": 0.07365382701303871, "grad_norm": 0.7036088705062866, "learning_rate": 9.631730864934807e-06, "loss": 0.4672, "step": 740 }, { "epoch": 0.0746491489997014, "grad_norm": 0.9175013899803162, "learning_rate": 9.626754255001493e-06, "loss": 0.4781, "step": 750 }, { "epoch": 0.07564447098636409, "grad_norm": 0.678519606590271, "learning_rate": 9.62177764506818e-06, "loss": 0.4048, "step": 760 }, { "epoch": 0.07663979297302677, "grad_norm": 0.6295528411865234, "learning_rate": 9.616801035134868e-06, "loss": 0.449, "step": 770 }, { "epoch": 0.07763511495968946, "grad_norm": 0.5424385666847229, "learning_rate": 9.611824425201553e-06, "loss": 0.4394, "step": 780 }, { "epoch": 0.07863043694635215, "grad_norm": 0.508836030960083, "learning_rate": 9.60684781526824e-06, "loss": 0.4317, "step": 790 }, { "epoch": 0.07962575893301484, "grad_norm": 0.6004147529602051, "learning_rate": 9.601871205334926e-06, "loss": 0.4308, "step": 800 }, { "epoch": 0.07962575893301484, "eval_loss": 0.47557342052459717, "eval_runtime": 147.5812, "eval_samples_per_second": 1.376, "eval_steps_per_second": 0.691, "step": 800 }, { "epoch": 0.08062108091967751, "grad_norm": 0.5553786754608154, "learning_rate": 9.596894595401613e-06, "loss": 0.4376, "step": 810 }, { "epoch": 0.0816164029063402, "grad_norm": 0.7254445552825928, "learning_rate": 9.591917985468299e-06, "loss": 0.4884, "step": 820 }, { "epoch": 0.08261172489300289, "grad_norm": 0.7175013422966003, "learning_rate": 9.586941375534986e-06, "loss": 0.4167, "step": 830 }, { "epoch": 0.08360704687966557, "grad_norm": 0.6464620232582092, "learning_rate": 9.581964765601674e-06, "loss": 0.4622, "step": 840 }, { "epoch": 0.08460236886632826, "grad_norm": 0.6999176144599915, "learning_rate": 9.57698815566836e-06, "loss": 0.4708, "step": 850 }, { "epoch": 0.08559769085299095, "grad_norm": 0.7939727306365967, "learning_rate": 9.572011545735047e-06, "loss": 0.4633, "step": 860 }, { "epoch": 0.08659301283965362, "grad_norm": 0.473017156124115, "learning_rate": 9.567034935801732e-06, "loss": 0.4585, "step": 870 }, { "epoch": 0.08758833482631631, "grad_norm": 0.7265183329582214, "learning_rate": 9.56205832586842e-06, "loss": 0.4485, "step": 880 }, { "epoch": 0.088583656812979, "grad_norm": 0.539735734462738, "learning_rate": 9.557081715935105e-06, "loss": 0.475, "step": 890 }, { "epoch": 0.08957897879964169, "grad_norm": 0.7587076425552368, "learning_rate": 9.552105106001792e-06, "loss": 0.4347, "step": 900 }, { "epoch": 0.08957897879964169, "eval_loss": 0.4690374732017517, "eval_runtime": 147.5672, "eval_samples_per_second": 1.376, "eval_steps_per_second": 0.691, "step": 900 }, { "epoch": 0.09057430078630437, "grad_norm": 0.7549741864204407, "learning_rate": 9.547128496068478e-06, "loss": 0.4434, "step": 910 }, { "epoch": 0.09156962277296705, "grad_norm": 0.686689555644989, "learning_rate": 9.542151886135165e-06, "loss": 0.4052, "step": 920 }, { "epoch": 0.09256494475962974, "grad_norm": 1.02870512008667, "learning_rate": 9.537175276201853e-06, "loss": 0.4806, "step": 930 }, { "epoch": 0.09356026674629242, "grad_norm": 0.7680675983428955, "learning_rate": 9.532198666268538e-06, "loss": 0.4609, "step": 940 }, { "epoch": 0.09455558873295511, "grad_norm": 0.5478435754776001, "learning_rate": 9.527222056335224e-06, "loss": 0.4171, "step": 950 }, { "epoch": 0.0955509107196178, "grad_norm": 0.5974985361099243, "learning_rate": 9.522245446401913e-06, "loss": 0.4686, "step": 960 }, { "epoch": 0.09654623270628049, "grad_norm": 0.997151792049408, "learning_rate": 9.517268836468598e-06, "loss": 0.4676, "step": 970 }, { "epoch": 0.09754155469294316, "grad_norm": 0.6366075277328491, "learning_rate": 9.512292226535284e-06, "loss": 0.4467, "step": 980 }, { "epoch": 0.09853687667960585, "grad_norm": 0.5682553052902222, "learning_rate": 9.507315616601971e-06, "loss": 0.4772, "step": 990 }, { "epoch": 0.09953219866626854, "grad_norm": 0.5869882106781006, "learning_rate": 9.502339006668659e-06, "loss": 0.3976, "step": 1000 }, { "epoch": 0.09953219866626854, "eval_loss": 0.46156319975852966, "eval_runtime": 147.6656, "eval_samples_per_second": 1.375, "eval_steps_per_second": 0.691, "step": 1000 }, { "epoch": 0.10052752065293122, "grad_norm": 0.5758237838745117, "learning_rate": 9.497362396735344e-06, "loss": 0.4528, "step": 1010 }, { "epoch": 0.10152284263959391, "grad_norm": 0.700281023979187, "learning_rate": 9.492385786802032e-06, "loss": 0.4545, "step": 1020 }, { "epoch": 0.1025181646262566, "grad_norm": 1.1320914030075073, "learning_rate": 9.487409176868719e-06, "loss": 0.4331, "step": 1030 }, { "epoch": 0.10351348661291927, "grad_norm": 0.6469867825508118, "learning_rate": 9.482432566935405e-06, "loss": 0.3759, "step": 1040 }, { "epoch": 0.10450880859958196, "grad_norm": 0.9471383094787598, "learning_rate": 9.47745595700209e-06, "loss": 0.4041, "step": 1050 }, { "epoch": 0.10550413058624465, "grad_norm": 0.5729160904884338, "learning_rate": 9.472479347068777e-06, "loss": 0.4871, "step": 1060 }, { "epoch": 0.10649945257290734, "grad_norm": 0.642436683177948, "learning_rate": 9.467502737135465e-06, "loss": 0.3893, "step": 1070 }, { "epoch": 0.10749477455957002, "grad_norm": 0.95659339427948, "learning_rate": 9.46252612720215e-06, "loss": 0.4486, "step": 1080 }, { "epoch": 0.1084900965462327, "grad_norm": 0.6642667055130005, "learning_rate": 9.457549517268838e-06, "loss": 0.5168, "step": 1090 }, { "epoch": 0.1094854185328954, "grad_norm": 0.5805796980857849, "learning_rate": 9.452572907335525e-06, "loss": 0.4019, "step": 1100 }, { "epoch": 0.1094854185328954, "eval_loss": 0.4559178054332733, "eval_runtime": 147.5891, "eval_samples_per_second": 1.375, "eval_steps_per_second": 0.691, "step": 1100 }, { "epoch": 0.11048074051955807, "grad_norm": 0.7006909251213074, "learning_rate": 9.44759629740221e-06, "loss": 0.457, "step": 1110 }, { "epoch": 0.11147606250622076, "grad_norm": 1.1821540594100952, "learning_rate": 9.442619687468896e-06, "loss": 0.3484, "step": 1120 }, { "epoch": 0.11247138449288345, "grad_norm": 0.7232743501663208, "learning_rate": 9.437643077535584e-06, "loss": 0.417, "step": 1130 }, { "epoch": 0.11346670647954614, "grad_norm": 0.6104183197021484, "learning_rate": 9.43266646760227e-06, "loss": 0.4821, "step": 1140 }, { "epoch": 0.11446202846620881, "grad_norm": 0.5961386561393738, "learning_rate": 9.427689857668956e-06, "loss": 0.4834, "step": 1150 }, { "epoch": 0.1154573504528715, "grad_norm": 0.5530894994735718, "learning_rate": 9.422713247735644e-06, "loss": 0.443, "step": 1160 }, { "epoch": 0.1164526724395342, "grad_norm": 0.5148622393608093, "learning_rate": 9.41773663780233e-06, "loss": 0.4029, "step": 1170 }, { "epoch": 0.11744799442619687, "grad_norm": 0.6148583292961121, "learning_rate": 9.412760027869017e-06, "loss": 0.4308, "step": 1180 }, { "epoch": 0.11844331641285956, "grad_norm": 0.7840449213981628, "learning_rate": 9.407783417935702e-06, "loss": 0.499, "step": 1190 }, { "epoch": 0.11943863839952225, "grad_norm": 0.6757422089576721, "learning_rate": 9.40280680800239e-06, "loss": 0.4263, "step": 1200 }, { "epoch": 0.11943863839952225, "eval_loss": 0.4505193829536438, "eval_runtime": 147.6664, "eval_samples_per_second": 1.375, "eval_steps_per_second": 0.691, "step": 1200 }, { "epoch": 0.12043396038618492, "grad_norm": 0.630874752998352, "learning_rate": 9.397830198069075e-06, "loss": 0.492, "step": 1210 }, { "epoch": 0.12142928237284761, "grad_norm": 0.7458256483078003, "learning_rate": 9.392853588135763e-06, "loss": 0.4612, "step": 1220 }, { "epoch": 0.1224246043595103, "grad_norm": 0.6903111934661865, "learning_rate": 9.38787697820245e-06, "loss": 0.4882, "step": 1230 }, { "epoch": 0.12341992634617299, "grad_norm": 1.0817712545394897, "learning_rate": 9.382900368269135e-06, "loss": 0.4658, "step": 1240 }, { "epoch": 0.12441524833283567, "grad_norm": 0.8182739615440369, "learning_rate": 9.377923758335823e-06, "loss": 0.4281, "step": 1250 }, { "epoch": 0.12541057031949837, "grad_norm": 0.5155394077301025, "learning_rate": 9.372947148402508e-06, "loss": 0.4312, "step": 1260 }, { "epoch": 0.12640589230616103, "grad_norm": 0.6190319657325745, "learning_rate": 9.367970538469196e-06, "loss": 0.4537, "step": 1270 }, { "epoch": 0.12740121429282372, "grad_norm": 0.7704219222068787, "learning_rate": 9.362993928535881e-06, "loss": 0.4873, "step": 1280 }, { "epoch": 0.1283965362794864, "grad_norm": 0.6395025849342346, "learning_rate": 9.358017318602569e-06, "loss": 0.4374, "step": 1290 }, { "epoch": 0.1293918582661491, "grad_norm": 0.9248729944229126, "learning_rate": 9.353040708669256e-06, "loss": 0.4183, "step": 1300 }, { "epoch": 0.1293918582661491, "eval_loss": 0.44450852274894714, "eval_runtime": 147.6747, "eval_samples_per_second": 1.375, "eval_steps_per_second": 0.691, "step": 1300 }, { "epoch": 0.1303871802528118, "grad_norm": 0.6703208088874817, "learning_rate": 9.348064098735942e-06, "loss": 0.4017, "step": 1310 }, { "epoch": 0.13138250223947448, "grad_norm": 0.7091432213783264, "learning_rate": 9.343087488802627e-06, "loss": 0.4858, "step": 1320 }, { "epoch": 0.13237782422613714, "grad_norm": 0.6519076824188232, "learning_rate": 9.338110878869316e-06, "loss": 0.402, "step": 1330 }, { "epoch": 0.13337314621279983, "grad_norm": 0.7192474603652954, "learning_rate": 9.333134268936002e-06, "loss": 0.4275, "step": 1340 }, { "epoch": 0.13436846819946252, "grad_norm": 0.626981794834137, "learning_rate": 9.328157659002687e-06, "loss": 0.4276, "step": 1350 }, { "epoch": 0.1353637901861252, "grad_norm": 0.8239569664001465, "learning_rate": 9.323181049069375e-06, "loss": 0.4384, "step": 1360 }, { "epoch": 0.1363591121727879, "grad_norm": 0.727737307548523, "learning_rate": 9.318204439136062e-06, "loss": 0.3892, "step": 1370 }, { "epoch": 0.1373544341594506, "grad_norm": 0.6430094242095947, "learning_rate": 9.313227829202748e-06, "loss": 0.3579, "step": 1380 }, { "epoch": 0.13834975614611328, "grad_norm": 0.7504476308822632, "learning_rate": 9.308251219269435e-06, "loss": 0.4585, "step": 1390 }, { "epoch": 0.13934507813277594, "grad_norm": 1.0239664316177368, "learning_rate": 9.303274609336122e-06, "loss": 0.4696, "step": 1400 }, { "epoch": 0.13934507813277594, "eval_loss": 0.43923673033714294, "eval_runtime": 147.7239, "eval_samples_per_second": 1.374, "eval_steps_per_second": 0.69, "step": 1400 }, { "epoch": 0.14034040011943863, "grad_norm": 0.6847706437110901, "learning_rate": 9.298297999402808e-06, "loss": 0.4823, "step": 1410 }, { "epoch": 0.14133572210610132, "grad_norm": 0.5733935832977295, "learning_rate": 9.293321389469493e-06, "loss": 0.4088, "step": 1420 }, { "epoch": 0.142331044092764, "grad_norm": 0.8858775496482849, "learning_rate": 9.28834477953618e-06, "loss": 0.3863, "step": 1430 }, { "epoch": 0.1433263660794267, "grad_norm": 0.6404774785041809, "learning_rate": 9.283368169602868e-06, "loss": 0.3951, "step": 1440 }, { "epoch": 0.1443216880660894, "grad_norm": 0.6125516891479492, "learning_rate": 9.278391559669554e-06, "loss": 0.4408, "step": 1450 }, { "epoch": 0.14531701005275208, "grad_norm": 0.5629742741584778, "learning_rate": 9.273414949736241e-06, "loss": 0.4319, "step": 1460 }, { "epoch": 0.14631233203941474, "grad_norm": 0.6768545508384705, "learning_rate": 9.268438339802927e-06, "loss": 0.4002, "step": 1470 }, { "epoch": 0.14730765402607743, "grad_norm": 0.6743785738945007, "learning_rate": 9.263461729869614e-06, "loss": 0.4779, "step": 1480 }, { "epoch": 0.14830297601274012, "grad_norm": 0.5943326354026794, "learning_rate": 9.2584851199363e-06, "loss": 0.4406, "step": 1490 }, { "epoch": 0.1492982979994028, "grad_norm": 0.8586482405662537, "learning_rate": 9.253508510002987e-06, "loss": 0.4326, "step": 1500 }, { "epoch": 0.1492982979994028, "eval_loss": 0.43489304184913635, "eval_runtime": 147.6747, "eval_samples_per_second": 1.375, "eval_steps_per_second": 0.691, "step": 1500 }, { "epoch": 0.1502936199860655, "grad_norm": 0.862763524055481, "learning_rate": 9.248531900069674e-06, "loss": 0.4917, "step": 1510 }, { "epoch": 0.15128894197272819, "grad_norm": 0.6556192636489868, "learning_rate": 9.24355529013636e-06, "loss": 0.4333, "step": 1520 }, { "epoch": 0.15228426395939088, "grad_norm": 0.5479542016983032, "learning_rate": 9.238578680203047e-06, "loss": 0.4176, "step": 1530 }, { "epoch": 0.15327958594605354, "grad_norm": 0.8119767308235168, "learning_rate": 9.233602070269733e-06, "loss": 0.4171, "step": 1540 }, { "epoch": 0.15427490793271623, "grad_norm": 0.9051875472068787, "learning_rate": 9.22862546033642e-06, "loss": 0.4529, "step": 1550 }, { "epoch": 0.15527022991937892, "grad_norm": 0.5972510576248169, "learning_rate": 9.223648850403106e-06, "loss": 0.4752, "step": 1560 }, { "epoch": 0.1562655519060416, "grad_norm": 0.6712588667869568, "learning_rate": 9.218672240469793e-06, "loss": 0.4179, "step": 1570 }, { "epoch": 0.1572608738927043, "grad_norm": 0.637656569480896, "learning_rate": 9.213695630536478e-06, "loss": 0.4624, "step": 1580 }, { "epoch": 0.15825619587936698, "grad_norm": 0.7319675087928772, "learning_rate": 9.208719020603166e-06, "loss": 0.4149, "step": 1590 }, { "epoch": 0.15925151786602967, "grad_norm": 0.6740835905075073, "learning_rate": 9.203742410669853e-06, "loss": 0.4348, "step": 1600 }, { "epoch": 0.15925151786602967, "eval_loss": 0.4290333390235901, "eval_runtime": 147.7478, "eval_samples_per_second": 1.374, "eval_steps_per_second": 0.69, "step": 1600 }, { "epoch": 0.16024683985269234, "grad_norm": 0.7110456824302673, "learning_rate": 9.198765800736539e-06, "loss": 0.3808, "step": 1610 }, { "epoch": 0.16124216183935502, "grad_norm": 0.6934688091278076, "learning_rate": 9.193789190803224e-06, "loss": 0.4279, "step": 1620 }, { "epoch": 0.16223748382601771, "grad_norm": 0.6783742308616638, "learning_rate": 9.188812580869912e-06, "loss": 0.413, "step": 1630 }, { "epoch": 0.1632328058126804, "grad_norm": 0.5934478044509888, "learning_rate": 9.183835970936599e-06, "loss": 0.476, "step": 1640 }, { "epoch": 0.1642281277993431, "grad_norm": 0.9043450951576233, "learning_rate": 9.178859361003285e-06, "loss": 0.392, "step": 1650 }, { "epoch": 0.16522344978600578, "grad_norm": 0.4757988154888153, "learning_rate": 9.173882751069972e-06, "loss": 0.3812, "step": 1660 }, { "epoch": 0.16621877177266844, "grad_norm": 0.7402971982955933, "learning_rate": 9.16890614113666e-06, "loss": 0.4293, "step": 1670 }, { "epoch": 0.16721409375933113, "grad_norm": 0.6279808282852173, "learning_rate": 9.163929531203345e-06, "loss": 0.4453, "step": 1680 }, { "epoch": 0.16820941574599382, "grad_norm": 0.6272904276847839, "learning_rate": 9.15895292127003e-06, "loss": 0.4215, "step": 1690 }, { "epoch": 0.1692047377326565, "grad_norm": 0.806103527545929, "learning_rate": 9.15397631133672e-06, "loss": 0.4236, "step": 1700 }, { "epoch": 0.1692047377326565, "eval_loss": 0.424538791179657, "eval_runtime": 147.6192, "eval_samples_per_second": 1.375, "eval_steps_per_second": 0.691, "step": 1700 }, { "epoch": 0.1702000597193192, "grad_norm": 0.7595136165618896, "learning_rate": 9.148999701403405e-06, "loss": 0.4473, "step": 1710 }, { "epoch": 0.1711953817059819, "grad_norm": 0.5029250979423523, "learning_rate": 9.14402309147009e-06, "loss": 0.4248, "step": 1720 }, { "epoch": 0.17219070369264458, "grad_norm": 0.7487345933914185, "learning_rate": 9.139046481536778e-06, "loss": 0.3795, "step": 1730 }, { "epoch": 0.17318602567930724, "grad_norm": 1.122206211090088, "learning_rate": 9.134069871603465e-06, "loss": 0.4026, "step": 1740 }, { "epoch": 0.17418134766596993, "grad_norm": 0.6429542899131775, "learning_rate": 9.129093261670151e-06, "loss": 0.4142, "step": 1750 }, { "epoch": 0.17517666965263262, "grad_norm": 0.7902116775512695, "learning_rate": 9.124116651736838e-06, "loss": 0.4266, "step": 1760 }, { "epoch": 0.1761719916392953, "grad_norm": 0.6928035020828247, "learning_rate": 9.119140041803524e-06, "loss": 0.4036, "step": 1770 }, { "epoch": 0.177167313625958, "grad_norm": 0.637829601764679, "learning_rate": 9.114163431870211e-06, "loss": 0.4139, "step": 1780 }, { "epoch": 0.1781626356126207, "grad_norm": 0.8418923616409302, "learning_rate": 9.109186821936897e-06, "loss": 0.4538, "step": 1790 }, { "epoch": 0.17915795759928338, "grad_norm": 0.6597120761871338, "learning_rate": 9.104210212003584e-06, "loss": 0.428, "step": 1800 }, { "epoch": 0.17915795759928338, "eval_loss": 0.4206041693687439, "eval_runtime": 147.6714, "eval_samples_per_second": 1.375, "eval_steps_per_second": 0.691, "step": 1800 }, { "epoch": 0.18015327958594604, "grad_norm": 0.9092034101486206, "learning_rate": 9.099233602070271e-06, "loss": 0.3827, "step": 1810 }, { "epoch": 0.18114860157260873, "grad_norm": 0.7151809334754944, "learning_rate": 9.094256992136957e-06, "loss": 0.4096, "step": 1820 }, { "epoch": 0.18214392355927142, "grad_norm": 0.812656819820404, "learning_rate": 9.089280382203644e-06, "loss": 0.398, "step": 1830 }, { "epoch": 0.1831392455459341, "grad_norm": 0.6819058060646057, "learning_rate": 9.08430377227033e-06, "loss": 0.4289, "step": 1840 }, { "epoch": 0.1841345675325968, "grad_norm": 0.6796212792396545, "learning_rate": 9.079327162337017e-06, "loss": 0.4107, "step": 1850 }, { "epoch": 0.1851298895192595, "grad_norm": 0.604881227016449, "learning_rate": 9.074350552403703e-06, "loss": 0.3888, "step": 1860 }, { "epoch": 0.18612521150592218, "grad_norm": 0.5823159217834473, "learning_rate": 9.06937394247039e-06, "loss": 0.4292, "step": 1870 }, { "epoch": 0.18712053349258484, "grad_norm": 0.6591698527336121, "learning_rate": 9.064397332537076e-06, "loss": 0.4559, "step": 1880 }, { "epoch": 0.18811585547924753, "grad_norm": 0.666591465473175, "learning_rate": 9.059420722603763e-06, "loss": 0.4486, "step": 1890 }, { "epoch": 0.18911117746591022, "grad_norm": 0.8700873255729675, "learning_rate": 9.05444411267045e-06, "loss": 0.3934, "step": 1900 }, { "epoch": 0.18911117746591022, "eval_loss": 0.41719409823417664, "eval_runtime": 147.6671, "eval_samples_per_second": 1.375, "eval_steps_per_second": 0.691, "step": 1900 }, { "epoch": 0.1901064994525729, "grad_norm": 0.5683835744857788, "learning_rate": 9.049467502737136e-06, "loss": 0.4148, "step": 1910 }, { "epoch": 0.1911018214392356, "grad_norm": 0.7323755621910095, "learning_rate": 9.044490892803823e-06, "loss": 0.4473, "step": 1920 }, { "epoch": 0.1920971434258983, "grad_norm": 0.8059419393539429, "learning_rate": 9.039514282870509e-06, "loss": 0.4092, "step": 1930 }, { "epoch": 0.19309246541256098, "grad_norm": 0.5238020420074463, "learning_rate": 9.034537672937196e-06, "loss": 0.4161, "step": 1940 }, { "epoch": 0.19408778739922364, "grad_norm": 0.7691717147827148, "learning_rate": 9.029561063003882e-06, "loss": 0.3996, "step": 1950 }, { "epoch": 0.19508310938588633, "grad_norm": 0.5275344848632812, "learning_rate": 9.024584453070569e-06, "loss": 0.3936, "step": 1960 }, { "epoch": 0.19607843137254902, "grad_norm": 0.9201516509056091, "learning_rate": 9.019607843137256e-06, "loss": 0.4327, "step": 1970 }, { "epoch": 0.1970737533592117, "grad_norm": 0.6645549535751343, "learning_rate": 9.014631233203942e-06, "loss": 0.439, "step": 1980 }, { "epoch": 0.1980690753458744, "grad_norm": 0.4919885993003845, "learning_rate": 9.009654623270628e-06, "loss": 0.3584, "step": 1990 }, { "epoch": 0.19906439733253709, "grad_norm": 0.7819716930389404, "learning_rate": 9.004678013337315e-06, "loss": 0.4258, "step": 2000 }, { "epoch": 0.19906439733253709, "eval_loss": 0.4135349690914154, "eval_runtime": 147.6676, "eval_samples_per_second": 1.375, "eval_steps_per_second": 0.691, "step": 2000 }, { "epoch": 0.20005971931919977, "grad_norm": 0.6763346195220947, "learning_rate": 8.999701403404002e-06, "loss": 0.3734, "step": 2010 }, { "epoch": 0.20105504130586244, "grad_norm": 0.974773108959198, "learning_rate": 8.994724793470688e-06, "loss": 0.4128, "step": 2020 }, { "epoch": 0.20205036329252513, "grad_norm": 0.7922454476356506, "learning_rate": 8.989748183537375e-06, "loss": 0.4699, "step": 2030 }, { "epoch": 0.20304568527918782, "grad_norm": 0.7217792272567749, "learning_rate": 8.984771573604062e-06, "loss": 0.4368, "step": 2040 }, { "epoch": 0.2040410072658505, "grad_norm": 0.9531657695770264, "learning_rate": 8.979794963670748e-06, "loss": 0.4124, "step": 2050 }, { "epoch": 0.2050363292525132, "grad_norm": 0.5895671248435974, "learning_rate": 8.974818353737434e-06, "loss": 0.4065, "step": 2060 }, { "epoch": 0.20603165123917588, "grad_norm": 0.6587451100349426, "learning_rate": 8.969841743804123e-06, "loss": 0.4182, "step": 2070 }, { "epoch": 0.20702697322583855, "grad_norm": 0.5056644678115845, "learning_rate": 8.964865133870808e-06, "loss": 0.4146, "step": 2080 }, { "epoch": 0.20802229521250123, "grad_norm": 0.8369359374046326, "learning_rate": 8.959888523937494e-06, "loss": 0.4258, "step": 2090 }, { "epoch": 0.20901761719916392, "grad_norm": 0.8079156279563904, "learning_rate": 8.954911914004181e-06, "loss": 0.4172, "step": 2100 }, { "epoch": 0.20901761719916392, "eval_loss": 0.40956470370292664, "eval_runtime": 147.7554, "eval_samples_per_second": 1.374, "eval_steps_per_second": 0.69, "step": 2100 }, { "epoch": 0.2100129391858266, "grad_norm": 0.5938236117362976, "learning_rate": 8.949935304070869e-06, "loss": 0.4058, "step": 2110 }, { "epoch": 0.2110082611724893, "grad_norm": 0.5103029608726501, "learning_rate": 8.944958694137554e-06, "loss": 0.3338, "step": 2120 }, { "epoch": 0.212003583159152, "grad_norm": 0.8399671316146851, "learning_rate": 8.939982084204241e-06, "loss": 0.4135, "step": 2130 }, { "epoch": 0.21299890514581468, "grad_norm": 0.8162589073181152, "learning_rate": 8.935005474270927e-06, "loss": 0.379, "step": 2140 }, { "epoch": 0.21399422713247734, "grad_norm": 0.5345713496208191, "learning_rate": 8.930028864337614e-06, "loss": 0.4356, "step": 2150 }, { "epoch": 0.21498954911914003, "grad_norm": 0.5709038972854614, "learning_rate": 8.9250522544043e-06, "loss": 0.3961, "step": 2160 }, { "epoch": 0.21598487110580272, "grad_norm": 0.8017010688781738, "learning_rate": 8.920075644470987e-06, "loss": 0.3934, "step": 2170 }, { "epoch": 0.2169801930924654, "grad_norm": 0.7133475542068481, "learning_rate": 8.915099034537673e-06, "loss": 0.386, "step": 2180 }, { "epoch": 0.2179755150791281, "grad_norm": 0.861768901348114, "learning_rate": 8.91012242460436e-06, "loss": 0.3981, "step": 2190 }, { "epoch": 0.2189708370657908, "grad_norm": 0.6387837529182434, "learning_rate": 8.905145814671047e-06, "loss": 0.4277, "step": 2200 }, { "epoch": 0.2189708370657908, "eval_loss": 0.40670302510261536, "eval_runtime": 147.76, "eval_samples_per_second": 1.374, "eval_steps_per_second": 0.69, "step": 2200 }, { "epoch": 0.21996615905245348, "grad_norm": 0.9591347575187683, "learning_rate": 8.900169204737733e-06, "loss": 0.3809, "step": 2210 }, { "epoch": 0.22096148103911614, "grad_norm": 0.6483083963394165, "learning_rate": 8.89519259480442e-06, "loss": 0.4071, "step": 2220 }, { "epoch": 0.22195680302577883, "grad_norm": 1.0261069536209106, "learning_rate": 8.890215984871106e-06, "loss": 0.4145, "step": 2230 }, { "epoch": 0.22295212501244152, "grad_norm": 0.6538086533546448, "learning_rate": 8.885239374937793e-06, "loss": 0.4322, "step": 2240 }, { "epoch": 0.2239474469991042, "grad_norm": 0.4469331204891205, "learning_rate": 8.880262765004479e-06, "loss": 0.4052, "step": 2250 }, { "epoch": 0.2249427689857669, "grad_norm": 0.5114856958389282, "learning_rate": 8.875286155071166e-06, "loss": 0.4143, "step": 2260 }, { "epoch": 0.2259380909724296, "grad_norm": 0.7658188343048096, "learning_rate": 8.870309545137854e-06, "loss": 0.4345, "step": 2270 }, { "epoch": 0.22693341295909228, "grad_norm": 0.6381837725639343, "learning_rate": 8.86533293520454e-06, "loss": 0.3868, "step": 2280 }, { "epoch": 0.22792873494575494, "grad_norm": 0.5213243961334229, "learning_rate": 8.860356325271225e-06, "loss": 0.3849, "step": 2290 }, { "epoch": 0.22892405693241763, "grad_norm": 0.7393907904624939, "learning_rate": 8.855379715337912e-06, "loss": 0.4282, "step": 2300 }, { "epoch": 0.22892405693241763, "eval_loss": 0.4041208326816559, "eval_runtime": 147.7723, "eval_samples_per_second": 1.374, "eval_steps_per_second": 0.69, "step": 2300 }, { "epoch": 0.22991937891908032, "grad_norm": 0.5622240304946899, "learning_rate": 8.8504031054046e-06, "loss": 0.3818, "step": 2310 }, { "epoch": 0.230914700905743, "grad_norm": 0.7211191654205322, "learning_rate": 8.845426495471285e-06, "loss": 0.3596, "step": 2320 }, { "epoch": 0.2319100228924057, "grad_norm": 0.5431678295135498, "learning_rate": 8.840449885537972e-06, "loss": 0.3645, "step": 2330 }, { "epoch": 0.2329053448790684, "grad_norm": 1.0264047384262085, "learning_rate": 8.83547327560466e-06, "loss": 0.4152, "step": 2340 }, { "epoch": 0.23390066686573108, "grad_norm": 0.6439436078071594, "learning_rate": 8.830496665671345e-06, "loss": 0.4169, "step": 2350 }, { "epoch": 0.23489598885239374, "grad_norm": 0.6291099786758423, "learning_rate": 8.825520055738031e-06, "loss": 0.4246, "step": 2360 }, { "epoch": 0.23589131083905643, "grad_norm": 0.5020752549171448, "learning_rate": 8.820543445804718e-06, "loss": 0.3649, "step": 2370 }, { "epoch": 0.23688663282571912, "grad_norm": 0.5813655257225037, "learning_rate": 8.815566835871405e-06, "loss": 0.403, "step": 2380 }, { "epoch": 0.2378819548123818, "grad_norm": 0.7793263792991638, "learning_rate": 8.810590225938091e-06, "loss": 0.4044, "step": 2390 }, { "epoch": 0.2388772767990445, "grad_norm": 1.0214496850967407, "learning_rate": 8.805613616004778e-06, "loss": 0.3804, "step": 2400 }, { "epoch": 0.2388772767990445, "eval_loss": 0.4011123776435852, "eval_runtime": 147.7863, "eval_samples_per_second": 1.374, "eval_steps_per_second": 0.69, "step": 2400 }, { "epoch": 0.2398725987857072, "grad_norm": 0.8854981064796448, "learning_rate": 8.800637006071466e-06, "loss": 0.3915, "step": 2410 }, { "epoch": 0.24086792077236985, "grad_norm": 0.6463388800621033, "learning_rate": 8.795660396138151e-06, "loss": 0.412, "step": 2420 }, { "epoch": 0.24186324275903254, "grad_norm": 1.0134918689727783, "learning_rate": 8.790683786204837e-06, "loss": 0.4514, "step": 2430 }, { "epoch": 0.24285856474569523, "grad_norm": 0.5260724425315857, "learning_rate": 8.785707176271524e-06, "loss": 0.393, "step": 2440 }, { "epoch": 0.24385388673235792, "grad_norm": 0.7072359323501587, "learning_rate": 8.780730566338212e-06, "loss": 0.4061, "step": 2450 }, { "epoch": 0.2448492087190206, "grad_norm": 0.505009114742279, "learning_rate": 8.775753956404897e-06, "loss": 0.4435, "step": 2460 }, { "epoch": 0.2458445307056833, "grad_norm": 0.707790195941925, "learning_rate": 8.770777346471584e-06, "loss": 0.3803, "step": 2470 }, { "epoch": 0.24683985269234598, "grad_norm": 1.0153621435165405, "learning_rate": 8.765800736538272e-06, "loss": 0.3942, "step": 2480 }, { "epoch": 0.24783517467900865, "grad_norm": 0.6652597188949585, "learning_rate": 8.760824126604957e-06, "loss": 0.3481, "step": 2490 }, { "epoch": 0.24883049666567134, "grad_norm": 0.49689826369285583, "learning_rate": 8.755847516671645e-06, "loss": 0.4101, "step": 2500 }, { "epoch": 0.24883049666567134, "eval_loss": 0.39822638034820557, "eval_runtime": 147.9245, "eval_samples_per_second": 1.372, "eval_steps_per_second": 0.69, "step": 2500 }, { "epoch": 0.24982581865233403, "grad_norm": 0.7141602635383606, "learning_rate": 8.75087090673833e-06, "loss": 0.362, "step": 2510 }, { "epoch": 0.25082114063899674, "grad_norm": 0.5883095264434814, "learning_rate": 8.745894296805018e-06, "loss": 0.4115, "step": 2520 }, { "epoch": 0.2518164626256594, "grad_norm": 0.6165831685066223, "learning_rate": 8.740917686871703e-06, "loss": 0.3849, "step": 2530 }, { "epoch": 0.25281178461232207, "grad_norm": 0.5670954585075378, "learning_rate": 8.73594107693839e-06, "loss": 0.3491, "step": 2540 }, { "epoch": 0.25380710659898476, "grad_norm": 1.0700769424438477, "learning_rate": 8.730964467005076e-06, "loss": 0.4068, "step": 2550 }, { "epoch": 0.25480242858564744, "grad_norm": 0.7089443206787109, "learning_rate": 8.725987857071763e-06, "loss": 0.4567, "step": 2560 }, { "epoch": 0.25579775057231013, "grad_norm": 0.5670477747917175, "learning_rate": 8.72101124713845e-06, "loss": 0.4037, "step": 2570 }, { "epoch": 0.2567930725589728, "grad_norm": 0.6892909407615662, "learning_rate": 8.716034637205136e-06, "loss": 0.3714, "step": 2580 }, { "epoch": 0.2577883945456355, "grad_norm": 0.8213964104652405, "learning_rate": 8.711058027271822e-06, "loss": 0.4305, "step": 2590 }, { "epoch": 0.2587837165322982, "grad_norm": 0.7234606146812439, "learning_rate": 8.70608141733851e-06, "loss": 0.4213, "step": 2600 }, { "epoch": 0.2587837165322982, "eval_loss": 0.39483293890953064, "eval_runtime": 147.915, "eval_samples_per_second": 1.372, "eval_steps_per_second": 0.69, "step": 2600 }, { "epoch": 0.2597790385189609, "grad_norm": 0.6947128176689148, "learning_rate": 8.701104807405197e-06, "loss": 0.3851, "step": 2610 }, { "epoch": 0.2607743605056236, "grad_norm": 0.8997359275817871, "learning_rate": 8.696128197471882e-06, "loss": 0.379, "step": 2620 }, { "epoch": 0.26176968249228627, "grad_norm": 0.8184422254562378, "learning_rate": 8.69115158753857e-06, "loss": 0.3615, "step": 2630 }, { "epoch": 0.26276500447894896, "grad_norm": 0.7109666466712952, "learning_rate": 8.686174977605257e-06, "loss": 0.4233, "step": 2640 }, { "epoch": 0.26376032646561165, "grad_norm": 0.6844655275344849, "learning_rate": 8.681198367671942e-06, "loss": 0.4142, "step": 2650 }, { "epoch": 0.2647556484522743, "grad_norm": 0.8344716429710388, "learning_rate": 8.676221757738628e-06, "loss": 0.3611, "step": 2660 }, { "epoch": 0.265750970438937, "grad_norm": 0.7269201278686523, "learning_rate": 8.671245147805315e-06, "loss": 0.4397, "step": 2670 }, { "epoch": 0.26674629242559966, "grad_norm": 0.5457523465156555, "learning_rate": 8.666268537872003e-06, "loss": 0.3724, "step": 2680 }, { "epoch": 0.26774161441226235, "grad_norm": 0.7520753145217896, "learning_rate": 8.661291927938688e-06, "loss": 0.3882, "step": 2690 }, { "epoch": 0.26873693639892504, "grad_norm": 0.49623236060142517, "learning_rate": 8.656315318005376e-06, "loss": 0.4115, "step": 2700 }, { "epoch": 0.26873693639892504, "eval_loss": 0.39236727356910706, "eval_runtime": 147.7377, "eval_samples_per_second": 1.374, "eval_steps_per_second": 0.69, "step": 2700 }, { "epoch": 0.26973225838558773, "grad_norm": 0.6592463254928589, "learning_rate": 8.651338708072063e-06, "loss": 0.3628, "step": 2710 }, { "epoch": 0.2707275803722504, "grad_norm": 0.9473317265510559, "learning_rate": 8.646362098138749e-06, "loss": 0.3842, "step": 2720 }, { "epoch": 0.2717229023589131, "grad_norm": 0.7774178385734558, "learning_rate": 8.641385488205434e-06, "loss": 0.3643, "step": 2730 }, { "epoch": 0.2727182243455758, "grad_norm": 0.6194160580635071, "learning_rate": 8.636408878272121e-06, "loss": 0.4647, "step": 2740 }, { "epoch": 0.2737135463322385, "grad_norm": 0.5518766641616821, "learning_rate": 8.631432268338809e-06, "loss": 0.3755, "step": 2750 }, { "epoch": 0.2747088683189012, "grad_norm": 0.9331585764884949, "learning_rate": 8.626455658405494e-06, "loss": 0.3881, "step": 2760 }, { "epoch": 0.27570419030556387, "grad_norm": 0.6080964207649231, "learning_rate": 8.621479048472182e-06, "loss": 0.3965, "step": 2770 }, { "epoch": 0.27669951229222656, "grad_norm": 0.8619922399520874, "learning_rate": 8.616502438538869e-06, "loss": 0.387, "step": 2780 }, { "epoch": 0.27769483427888925, "grad_norm": 0.7429324984550476, "learning_rate": 8.611525828605555e-06, "loss": 0.3837, "step": 2790 }, { "epoch": 0.2786901562655519, "grad_norm": 0.7918853759765625, "learning_rate": 8.60654921867224e-06, "loss": 0.3921, "step": 2800 }, { "epoch": 0.2786901562655519, "eval_loss": 0.3901057541370392, "eval_runtime": 147.7809, "eval_samples_per_second": 1.374, "eval_steps_per_second": 0.69, "step": 2800 }, { "epoch": 0.27968547825221457, "grad_norm": 0.6200188398361206, "learning_rate": 8.601572608738928e-06, "loss": 0.398, "step": 2810 }, { "epoch": 0.28068080023887726, "grad_norm": 0.6285167336463928, "learning_rate": 8.596595998805615e-06, "loss": 0.3676, "step": 2820 }, { "epoch": 0.28167612222553995, "grad_norm": 0.7586702704429626, "learning_rate": 8.5916193888723e-06, "loss": 0.3658, "step": 2830 }, { "epoch": 0.28267144421220264, "grad_norm": 0.915360152721405, "learning_rate": 8.586642778938988e-06, "loss": 0.3444, "step": 2840 }, { "epoch": 0.2836667661988653, "grad_norm": 0.8675612807273865, "learning_rate": 8.581666169005673e-06, "loss": 0.3939, "step": 2850 }, { "epoch": 0.284662088185528, "grad_norm": 0.8629066944122314, "learning_rate": 8.57668955907236e-06, "loss": 0.4055, "step": 2860 }, { "epoch": 0.2856574101721907, "grad_norm": 0.8615571856498718, "learning_rate": 8.571712949139048e-06, "loss": 0.4392, "step": 2870 }, { "epoch": 0.2866527321588534, "grad_norm": 0.675205409526825, "learning_rate": 8.566736339205734e-06, "loss": 0.3289, "step": 2880 }, { "epoch": 0.2876480541455161, "grad_norm": 0.6187378764152527, "learning_rate": 8.561759729272421e-06, "loss": 0.4067, "step": 2890 }, { "epoch": 0.2886433761321788, "grad_norm": 0.7826117277145386, "learning_rate": 8.556783119339106e-06, "loss": 0.367, "step": 2900 }, { "epoch": 0.2886433761321788, "eval_loss": 0.38809624314308167, "eval_runtime": 147.8617, "eval_samples_per_second": 1.373, "eval_steps_per_second": 0.69, "step": 2900 }, { "epoch": 0.28963869811884146, "grad_norm": 0.6546410322189331, "learning_rate": 8.551806509405794e-06, "loss": 0.3727, "step": 2910 }, { "epoch": 0.29063402010550415, "grad_norm": 0.8760982155799866, "learning_rate": 8.54682989947248e-06, "loss": 0.3967, "step": 2920 }, { "epoch": 0.29162934209216684, "grad_norm": 0.64844810962677, "learning_rate": 8.541853289539167e-06, "loss": 0.4046, "step": 2930 }, { "epoch": 0.2926246640788295, "grad_norm": 0.5126065015792847, "learning_rate": 8.536876679605854e-06, "loss": 0.3783, "step": 2940 }, { "epoch": 0.29361998606549217, "grad_norm": 0.7168049216270447, "learning_rate": 8.53190006967254e-06, "loss": 0.3606, "step": 2950 }, { "epoch": 0.29461530805215486, "grad_norm": 0.4847118854522705, "learning_rate": 8.526923459739225e-06, "loss": 0.3617, "step": 2960 }, { "epoch": 0.29561063003881755, "grad_norm": 0.6937541365623474, "learning_rate": 8.521946849805913e-06, "loss": 0.3878, "step": 2970 }, { "epoch": 0.29660595202548024, "grad_norm": 0.7482075095176697, "learning_rate": 8.5169702398726e-06, "loss": 0.4173, "step": 2980 }, { "epoch": 0.2976012740121429, "grad_norm": 0.7130847573280334, "learning_rate": 8.511993629939285e-06, "loss": 0.3717, "step": 2990 }, { "epoch": 0.2985965959988056, "grad_norm": 0.7087443470954895, "learning_rate": 8.507017020005973e-06, "loss": 0.3945, "step": 3000 }, { "epoch": 0.2985965959988056, "eval_loss": 0.3846234977245331, "eval_runtime": 147.9506, "eval_samples_per_second": 1.372, "eval_steps_per_second": 0.689, "step": 3000 }, { "epoch": 0.2995919179854683, "grad_norm": 0.5839470624923706, "learning_rate": 8.50204041007266e-06, "loss": 0.3672, "step": 3010 }, { "epoch": 0.300587239972131, "grad_norm": 0.5632269978523254, "learning_rate": 8.497063800139346e-06, "loss": 0.4038, "step": 3020 }, { "epoch": 0.3015825619587937, "grad_norm": 0.9807242155075073, "learning_rate": 8.492087190206031e-06, "loss": 0.435, "step": 3030 }, { "epoch": 0.30257788394545637, "grad_norm": 0.6134958267211914, "learning_rate": 8.487110580272719e-06, "loss": 0.3857, "step": 3040 }, { "epoch": 0.30357320593211906, "grad_norm": 0.9714884757995605, "learning_rate": 8.482133970339406e-06, "loss": 0.3375, "step": 3050 }, { "epoch": 0.30456852791878175, "grad_norm": 0.6158900856971741, "learning_rate": 8.477157360406092e-06, "loss": 0.3768, "step": 3060 }, { "epoch": 0.3055638499054444, "grad_norm": 0.5510846376419067, "learning_rate": 8.472180750472779e-06, "loss": 0.3618, "step": 3070 }, { "epoch": 0.3065591718921071, "grad_norm": 0.6374019384384155, "learning_rate": 8.467204140539466e-06, "loss": 0.3444, "step": 3080 }, { "epoch": 0.30755449387876976, "grad_norm": 0.6322264075279236, "learning_rate": 8.462227530606152e-06, "loss": 0.3841, "step": 3090 }, { "epoch": 0.30854981586543245, "grad_norm": 0.6326218843460083, "learning_rate": 8.457250920672837e-06, "loss": 0.3627, "step": 3100 }, { "epoch": 0.30854981586543245, "eval_loss": 0.38287338614463806, "eval_runtime": 147.987, "eval_samples_per_second": 1.372, "eval_steps_per_second": 0.689, "step": 3100 }, { "epoch": 0.30954513785209514, "grad_norm": 0.8483834862709045, "learning_rate": 8.452274310739525e-06, "loss": 0.4364, "step": 3110 }, { "epoch": 0.31054045983875783, "grad_norm": 0.9434365034103394, "learning_rate": 8.447297700806212e-06, "loss": 0.4027, "step": 3120 }, { "epoch": 0.3115357818254205, "grad_norm": 0.7766565680503845, "learning_rate": 8.442321090872898e-06, "loss": 0.3241, "step": 3130 }, { "epoch": 0.3125311038120832, "grad_norm": 0.7761719822883606, "learning_rate": 8.437344480939585e-06, "loss": 0.4041, "step": 3140 }, { "epoch": 0.3135264257987459, "grad_norm": 0.8227534890174866, "learning_rate": 8.432367871006272e-06, "loss": 0.3915, "step": 3150 }, { "epoch": 0.3145217477854086, "grad_norm": 0.6961987614631653, "learning_rate": 8.427391261072958e-06, "loss": 0.4119, "step": 3160 }, { "epoch": 0.3155170697720713, "grad_norm": 0.725043773651123, "learning_rate": 8.422414651139643e-06, "loss": 0.3811, "step": 3170 }, { "epoch": 0.31651239175873397, "grad_norm": 0.6801613569259644, "learning_rate": 8.41743804120633e-06, "loss": 0.3752, "step": 3180 }, { "epoch": 0.31750771374539666, "grad_norm": 0.6735227108001709, "learning_rate": 8.412461431273018e-06, "loss": 0.3538, "step": 3190 }, { "epoch": 0.31850303573205935, "grad_norm": 0.7424077391624451, "learning_rate": 8.407484821339704e-06, "loss": 0.3347, "step": 3200 }, { "epoch": 0.31850303573205935, "eval_loss": 0.38142284750938416, "eval_runtime": 148.3323, "eval_samples_per_second": 1.369, "eval_steps_per_second": 0.688, "step": 3200 }, { "epoch": 0.319498357718722, "grad_norm": 0.6526059508323669, "learning_rate": 8.402508211406391e-06, "loss": 0.4098, "step": 3210 }, { "epoch": 0.32049367970538467, "grad_norm": 0.8221137523651123, "learning_rate": 8.397531601473077e-06, "loss": 0.4044, "step": 3220 }, { "epoch": 0.32148900169204736, "grad_norm": 0.7967231869697571, "learning_rate": 8.392554991539764e-06, "loss": 0.3989, "step": 3230 }, { "epoch": 0.32248432367871005, "grad_norm": 0.8786621689796448, "learning_rate": 8.387578381606451e-06, "loss": 0.3113, "step": 3240 }, { "epoch": 0.32347964566537274, "grad_norm": 1.084957480430603, "learning_rate": 8.382601771673137e-06, "loss": 0.3855, "step": 3250 }, { "epoch": 0.32447496765203543, "grad_norm": 0.6978799104690552, "learning_rate": 8.377625161739822e-06, "loss": 0.3752, "step": 3260 }, { "epoch": 0.3254702896386981, "grad_norm": 0.6280369162559509, "learning_rate": 8.37264855180651e-06, "loss": 0.3831, "step": 3270 }, { "epoch": 0.3264656116253608, "grad_norm": 0.5700563192367554, "learning_rate": 8.367671941873197e-06, "loss": 0.3848, "step": 3280 }, { "epoch": 0.3274609336120235, "grad_norm": 0.6714605093002319, "learning_rate": 8.362695331939883e-06, "loss": 0.3894, "step": 3290 }, { "epoch": 0.3284562555986862, "grad_norm": 0.6634580492973328, "learning_rate": 8.35771872200657e-06, "loss": 0.4055, "step": 3300 }, { "epoch": 0.3284562555986862, "eval_loss": 0.3794529438018799, "eval_runtime": 147.906, "eval_samples_per_second": 1.372, "eval_steps_per_second": 0.69, "step": 3300 }, { "epoch": 0.3294515775853489, "grad_norm": 0.6699293255805969, "learning_rate": 8.352742112073257e-06, "loss": 0.3997, "step": 3310 }, { "epoch": 0.33044689957201157, "grad_norm": 0.5837434530258179, "learning_rate": 8.347765502139943e-06, "loss": 0.3506, "step": 3320 }, { "epoch": 0.33144222155867425, "grad_norm": 0.7900473475456238, "learning_rate": 8.342788892206629e-06, "loss": 0.3712, "step": 3330 }, { "epoch": 0.3324375435453369, "grad_norm": 0.5419691205024719, "learning_rate": 8.337812282273316e-06, "loss": 0.3755, "step": 3340 }, { "epoch": 0.3334328655319996, "grad_norm": 0.635683536529541, "learning_rate": 8.332835672340003e-06, "loss": 0.3995, "step": 3350 }, { "epoch": 0.33442818751866227, "grad_norm": 0.7266948223114014, "learning_rate": 8.327859062406689e-06, "loss": 0.398, "step": 3360 }, { "epoch": 0.33542350950532496, "grad_norm": 0.8439323902130127, "learning_rate": 8.322882452473376e-06, "loss": 0.4093, "step": 3370 }, { "epoch": 0.33641883149198765, "grad_norm": 0.6754797697067261, "learning_rate": 8.317905842540063e-06, "loss": 0.3638, "step": 3380 }, { "epoch": 0.33741415347865034, "grad_norm": 0.7690572142601013, "learning_rate": 8.312929232606749e-06, "loss": 0.3408, "step": 3390 }, { "epoch": 0.338409475465313, "grad_norm": 0.765877902507782, "learning_rate": 8.307952622673435e-06, "loss": 0.3418, "step": 3400 }, { "epoch": 0.338409475465313, "eval_loss": 0.37782156467437744, "eval_runtime": 147.8891, "eval_samples_per_second": 1.373, "eval_steps_per_second": 0.69, "step": 3400 }, { "epoch": 0.3394047974519757, "grad_norm": 0.7344104051589966, "learning_rate": 8.302976012740122e-06, "loss": 0.3443, "step": 3410 }, { "epoch": 0.3404001194386384, "grad_norm": 1.0199452638626099, "learning_rate": 8.29799940280681e-06, "loss": 0.4294, "step": 3420 }, { "epoch": 0.3413954414253011, "grad_norm": 0.5666326880455017, "learning_rate": 8.293022792873495e-06, "loss": 0.3274, "step": 3430 }, { "epoch": 0.3423907634119638, "grad_norm": 0.8385756611824036, "learning_rate": 8.288046182940182e-06, "loss": 0.4122, "step": 3440 }, { "epoch": 0.3433860853986265, "grad_norm": 0.777019739151001, "learning_rate": 8.28306957300687e-06, "loss": 0.4089, "step": 3450 }, { "epoch": 0.34438140738528916, "grad_norm": 0.682658851146698, "learning_rate": 8.278092963073555e-06, "loss": 0.3772, "step": 3460 }, { "epoch": 0.34537672937195185, "grad_norm": 0.6811783313751221, "learning_rate": 8.27311635314024e-06, "loss": 0.3523, "step": 3470 }, { "epoch": 0.3463720513586145, "grad_norm": 0.9056878685951233, "learning_rate": 8.268139743206928e-06, "loss": 0.3292, "step": 3480 }, { "epoch": 0.3473673733452772, "grad_norm": 0.6763057708740234, "learning_rate": 8.263163133273615e-06, "loss": 0.3326, "step": 3490 }, { "epoch": 0.34836269533193986, "grad_norm": 0.8847700953483582, "learning_rate": 8.258186523340301e-06, "loss": 0.4062, "step": 3500 }, { "epoch": 0.34836269533193986, "eval_loss": 0.37572577595710754, "eval_runtime": 147.9751, "eval_samples_per_second": 1.372, "eval_steps_per_second": 0.689, "step": 3500 }, { "epoch": 0.34935801731860255, "grad_norm": 0.7903834581375122, "learning_rate": 8.253209913406988e-06, "loss": 0.3546, "step": 3510 }, { "epoch": 0.35035333930526524, "grad_norm": 0.6501933336257935, "learning_rate": 8.248233303473674e-06, "loss": 0.3909, "step": 3520 }, { "epoch": 0.35134866129192793, "grad_norm": 0.6443967819213867, "learning_rate": 8.243256693540361e-06, "loss": 0.3315, "step": 3530 }, { "epoch": 0.3523439832785906, "grad_norm": 0.7020339965820312, "learning_rate": 8.238280083607047e-06, "loss": 0.383, "step": 3540 }, { "epoch": 0.3533393052652533, "grad_norm": 0.8711917400360107, "learning_rate": 8.233303473673734e-06, "loss": 0.3771, "step": 3550 }, { "epoch": 0.354334627251916, "grad_norm": 0.788311243057251, "learning_rate": 8.228326863740421e-06, "loss": 0.3299, "step": 3560 }, { "epoch": 0.3553299492385787, "grad_norm": 0.43669214844703674, "learning_rate": 8.223350253807107e-06, "loss": 0.3659, "step": 3570 }, { "epoch": 0.3563252712252414, "grad_norm": 0.550014078617096, "learning_rate": 8.218373643873794e-06, "loss": 0.3586, "step": 3580 }, { "epoch": 0.35732059321190407, "grad_norm": 0.9948114156723022, "learning_rate": 8.21339703394048e-06, "loss": 0.3743, "step": 3590 }, { "epoch": 0.35831591519856676, "grad_norm": 0.6710416078567505, "learning_rate": 8.208420424007167e-06, "loss": 0.3724, "step": 3600 }, { "epoch": 0.35831591519856676, "eval_loss": 0.37383729219436646, "eval_runtime": 147.9999, "eval_samples_per_second": 1.372, "eval_steps_per_second": 0.689, "step": 3600 }, { "epoch": 0.35931123718522945, "grad_norm": 0.7629538774490356, "learning_rate": 8.203443814073854e-06, "loss": 0.3942, "step": 3610 }, { "epoch": 0.3603065591718921, "grad_norm": 0.7567903399467468, "learning_rate": 8.19846720414054e-06, "loss": 0.3895, "step": 3620 }, { "epoch": 0.36130188115855477, "grad_norm": 0.5209780335426331, "learning_rate": 8.193490594207226e-06, "loss": 0.3395, "step": 3630 }, { "epoch": 0.36229720314521746, "grad_norm": 0.5655366778373718, "learning_rate": 8.188513984273913e-06, "loss": 0.3435, "step": 3640 }, { "epoch": 0.36329252513188015, "grad_norm": 0.8822707533836365, "learning_rate": 8.1835373743406e-06, "loss": 0.3442, "step": 3650 }, { "epoch": 0.36428784711854284, "grad_norm": 0.6264866590499878, "learning_rate": 8.178560764407286e-06, "loss": 0.3902, "step": 3660 }, { "epoch": 0.36528316910520553, "grad_norm": 0.6163113713264465, "learning_rate": 8.173584154473973e-06, "loss": 0.301, "step": 3670 }, { "epoch": 0.3662784910918682, "grad_norm": 0.7627054452896118, "learning_rate": 8.16860754454066e-06, "loss": 0.3504, "step": 3680 }, { "epoch": 0.3672738130785309, "grad_norm": 0.7021706104278564, "learning_rate": 8.163630934607346e-06, "loss": 0.3761, "step": 3690 }, { "epoch": 0.3682691350651936, "grad_norm": 0.8463016152381897, "learning_rate": 8.158654324674032e-06, "loss": 0.4096, "step": 3700 }, { "epoch": 0.3682691350651936, "eval_loss": 0.3721456229686737, "eval_runtime": 148.0333, "eval_samples_per_second": 1.371, "eval_steps_per_second": 0.689, "step": 3700 }, { "epoch": 0.3692644570518563, "grad_norm": 0.7081176042556763, "learning_rate": 8.153677714740719e-06, "loss": 0.3609, "step": 3710 }, { "epoch": 0.370259779038519, "grad_norm": 0.6312963366508484, "learning_rate": 8.148701104807406e-06, "loss": 0.3964, "step": 3720 }, { "epoch": 0.37125510102518167, "grad_norm": 0.5755221247673035, "learning_rate": 8.143724494874092e-06, "loss": 0.3701, "step": 3730 }, { "epoch": 0.37225042301184436, "grad_norm": 0.584368884563446, "learning_rate": 8.13874788494078e-06, "loss": 0.3748, "step": 3740 }, { "epoch": 0.373245744998507, "grad_norm": 0.588197648525238, "learning_rate": 8.133771275007467e-06, "loss": 0.3775, "step": 3750 }, { "epoch": 0.3742410669851697, "grad_norm": 0.6824856996536255, "learning_rate": 8.128794665074152e-06, "loss": 0.3842, "step": 3760 }, { "epoch": 0.37523638897183237, "grad_norm": 0.4867573082447052, "learning_rate": 8.123818055140838e-06, "loss": 0.3349, "step": 3770 }, { "epoch": 0.37623171095849506, "grad_norm": 1.023980975151062, "learning_rate": 8.118841445207525e-06, "loss": 0.2991, "step": 3780 }, { "epoch": 0.37722703294515775, "grad_norm": 0.8464593291282654, "learning_rate": 8.113864835274212e-06, "loss": 0.3673, "step": 3790 }, { "epoch": 0.37822235493182044, "grad_norm": 0.7149996757507324, "learning_rate": 8.108888225340898e-06, "loss": 0.3913, "step": 3800 }, { "epoch": 0.37822235493182044, "eval_loss": 0.37008264660835266, "eval_runtime": 148.0619, "eval_samples_per_second": 1.371, "eval_steps_per_second": 0.689, "step": 3800 }, { "epoch": 0.3792176769184831, "grad_norm": 0.5620415210723877, "learning_rate": 8.103911615407585e-06, "loss": 0.3409, "step": 3810 }, { "epoch": 0.3802129989051458, "grad_norm": 0.7163406014442444, "learning_rate": 8.098935005474273e-06, "loss": 0.3566, "step": 3820 }, { "epoch": 0.3812083208918085, "grad_norm": 0.6729508638381958, "learning_rate": 8.093958395540958e-06, "loss": 0.3606, "step": 3830 }, { "epoch": 0.3822036428784712, "grad_norm": 0.5905406475067139, "learning_rate": 8.088981785607644e-06, "loss": 0.3948, "step": 3840 }, { "epoch": 0.3831989648651339, "grad_norm": 0.896960437297821, "learning_rate": 8.084005175674331e-06, "loss": 0.3881, "step": 3850 }, { "epoch": 0.3841942868517966, "grad_norm": 0.6188758015632629, "learning_rate": 8.079028565741019e-06, "loss": 0.3632, "step": 3860 }, { "epoch": 0.38518960883845926, "grad_norm": 0.7011315822601318, "learning_rate": 8.074051955807704e-06, "loss": 0.3768, "step": 3870 }, { "epoch": 0.38618493082512195, "grad_norm": 0.546981930732727, "learning_rate": 8.069075345874391e-06, "loss": 0.3556, "step": 3880 }, { "epoch": 0.3871802528117846, "grad_norm": 0.6722966432571411, "learning_rate": 8.064098735941077e-06, "loss": 0.4264, "step": 3890 }, { "epoch": 0.3881755747984473, "grad_norm": 0.6407563090324402, "learning_rate": 8.059122126007764e-06, "loss": 0.3592, "step": 3900 }, { "epoch": 0.3881755747984473, "eval_loss": 0.3688708245754242, "eval_runtime": 148.1311, "eval_samples_per_second": 1.37, "eval_steps_per_second": 0.689, "step": 3900 }, { "epoch": 0.38917089678510997, "grad_norm": 0.45177608728408813, "learning_rate": 8.05414551607445e-06, "loss": 0.3733, "step": 3910 }, { "epoch": 0.39016621877177265, "grad_norm": 1.0299266576766968, "learning_rate": 8.049168906141137e-06, "loss": 0.351, "step": 3920 }, { "epoch": 0.39116154075843534, "grad_norm": 0.6861090660095215, "learning_rate": 8.044192296207823e-06, "loss": 0.3899, "step": 3930 }, { "epoch": 0.39215686274509803, "grad_norm": 0.6434109210968018, "learning_rate": 8.03921568627451e-06, "loss": 0.3285, "step": 3940 }, { "epoch": 0.3931521847317607, "grad_norm": 0.6049661040306091, "learning_rate": 8.034239076341198e-06, "loss": 0.37, "step": 3950 }, { "epoch": 0.3941475067184234, "grad_norm": 0.6799841523170471, "learning_rate": 8.029262466407883e-06, "loss": 0.381, "step": 3960 }, { "epoch": 0.3951428287050861, "grad_norm": 0.7383856177330017, "learning_rate": 8.02428585647457e-06, "loss": 0.3707, "step": 3970 }, { "epoch": 0.3961381506917488, "grad_norm": 0.8234820365905762, "learning_rate": 8.019309246541258e-06, "loss": 0.379, "step": 3980 }, { "epoch": 0.3971334726784115, "grad_norm": 0.743027925491333, "learning_rate": 8.014332636607943e-06, "loss": 0.362, "step": 3990 }, { "epoch": 0.39812879466507417, "grad_norm": 0.48385190963745117, "learning_rate": 8.009356026674629e-06, "loss": 0.3726, "step": 4000 }, { "epoch": 0.39812879466507417, "eval_loss": 0.36677852272987366, "eval_runtime": 148.1274, "eval_samples_per_second": 1.37, "eval_steps_per_second": 0.689, "step": 4000 }, { "epoch": 0.39912411665173686, "grad_norm": 0.776292622089386, "learning_rate": 8.004379416741316e-06, "loss": 0.3258, "step": 4010 }, { "epoch": 0.40011943863839955, "grad_norm": 0.7187590599060059, "learning_rate": 7.999402806808004e-06, "loss": 0.3639, "step": 4020 }, { "epoch": 0.4011147606250622, "grad_norm": 0.6233355402946472, "learning_rate": 7.99442619687469e-06, "loss": 0.3418, "step": 4030 }, { "epoch": 0.4021100826117249, "grad_norm": 0.9605082869529724, "learning_rate": 7.989449586941377e-06, "loss": 0.3686, "step": 4040 }, { "epoch": 0.40310540459838756, "grad_norm": 0.7882612943649292, "learning_rate": 7.984472977008064e-06, "loss": 0.3386, "step": 4050 }, { "epoch": 0.40410072658505025, "grad_norm": 0.8124802708625793, "learning_rate": 7.97949636707475e-06, "loss": 0.3412, "step": 4060 }, { "epoch": 0.40509604857171294, "grad_norm": 0.6348981857299805, "learning_rate": 7.974519757141435e-06, "loss": 0.3624, "step": 4070 }, { "epoch": 0.40609137055837563, "grad_norm": 0.8518906831741333, "learning_rate": 7.969543147208122e-06, "loss": 0.3494, "step": 4080 }, { "epoch": 0.4070866925450383, "grad_norm": 0.979092538356781, "learning_rate": 7.96456653727481e-06, "loss": 0.3677, "step": 4090 }, { "epoch": 0.408082014531701, "grad_norm": 0.6732219457626343, "learning_rate": 7.959589927341495e-06, "loss": 0.3395, "step": 4100 }, { "epoch": 0.408082014531701, "eval_loss": 0.365203857421875, "eval_runtime": 148.0813, "eval_samples_per_second": 1.371, "eval_steps_per_second": 0.689, "step": 4100 }, { "epoch": 0.4090773365183637, "grad_norm": 0.9068031907081604, "learning_rate": 7.954613317408183e-06, "loss": 0.3715, "step": 4110 }, { "epoch": 0.4100726585050264, "grad_norm": 0.8246614336967468, "learning_rate": 7.94963670747487e-06, "loss": 0.3661, "step": 4120 }, { "epoch": 0.4110679804916891, "grad_norm": 0.5856474041938782, "learning_rate": 7.944660097541556e-06, "loss": 0.3567, "step": 4130 }, { "epoch": 0.41206330247835177, "grad_norm": 0.4393113851547241, "learning_rate": 7.939683487608241e-06, "loss": 0.3469, "step": 4140 }, { "epoch": 0.41305862446501446, "grad_norm": 1.0827676057815552, "learning_rate": 7.934706877674928e-06, "loss": 0.3318, "step": 4150 }, { "epoch": 0.4140539464516771, "grad_norm": 0.6830149292945862, "learning_rate": 7.929730267741616e-06, "loss": 0.3726, "step": 4160 }, { "epoch": 0.4150492684383398, "grad_norm": 0.563925564289093, "learning_rate": 7.924753657808301e-06, "loss": 0.3732, "step": 4170 }, { "epoch": 0.41604459042500247, "grad_norm": 0.5630573034286499, "learning_rate": 7.919777047874989e-06, "loss": 0.3626, "step": 4180 }, { "epoch": 0.41703991241166516, "grad_norm": 0.7267017960548401, "learning_rate": 7.914800437941674e-06, "loss": 0.3414, "step": 4190 }, { "epoch": 0.41803523439832785, "grad_norm": 0.7420011758804321, "learning_rate": 7.909823828008362e-06, "loss": 0.379, "step": 4200 }, { "epoch": 0.41803523439832785, "eval_loss": 0.3634182810783386, "eval_runtime": 148.0601, "eval_samples_per_second": 1.371, "eval_steps_per_second": 0.689, "step": 4200 }, { "epoch": 0.41903055638499054, "grad_norm": 0.6270275115966797, "learning_rate": 7.904847218075047e-06, "loss": 0.347, "step": 4210 }, { "epoch": 0.4200258783716532, "grad_norm": 0.6264152526855469, "learning_rate": 7.899870608141735e-06, "loss": 0.3984, "step": 4220 }, { "epoch": 0.4210212003583159, "grad_norm": 0.7452067136764526, "learning_rate": 7.894893998208422e-06, "loss": 0.392, "step": 4230 }, { "epoch": 0.4220165223449786, "grad_norm": 0.5158396363258362, "learning_rate": 7.889917388275107e-06, "loss": 0.3624, "step": 4240 }, { "epoch": 0.4230118443316413, "grad_norm": 0.6692706942558289, "learning_rate": 7.884940778341795e-06, "loss": 0.359, "step": 4250 }, { "epoch": 0.424007166318304, "grad_norm": 1.1387830972671509, "learning_rate": 7.87996416840848e-06, "loss": 0.39, "step": 4260 }, { "epoch": 0.4250024883049667, "grad_norm": 0.76036137342453, "learning_rate": 7.874987558475168e-06, "loss": 0.299, "step": 4270 }, { "epoch": 0.42599781029162936, "grad_norm": 0.45447903871536255, "learning_rate": 7.870010948541853e-06, "loss": 0.3926, "step": 4280 }, { "epoch": 0.42699313227829205, "grad_norm": 0.8221507668495178, "learning_rate": 7.86503433860854e-06, "loss": 0.3743, "step": 4290 }, { "epoch": 0.4279884542649547, "grad_norm": 0.7328831553459167, "learning_rate": 7.860057728675226e-06, "loss": 0.3699, "step": 4300 }, { "epoch": 0.4279884542649547, "eval_loss": 0.36196640133857727, "eval_runtime": 148.0658, "eval_samples_per_second": 1.371, "eval_steps_per_second": 0.689, "step": 4300 }, { "epoch": 0.4289837762516174, "grad_norm": 0.8411442637443542, "learning_rate": 7.855081118741913e-06, "loss": 0.4047, "step": 4310 }, { "epoch": 0.42997909823828007, "grad_norm": 0.7502423524856567, "learning_rate": 7.8501045088086e-06, "loss": 0.3513, "step": 4320 }, { "epoch": 0.43097442022494276, "grad_norm": 0.566929042339325, "learning_rate": 7.845127898875286e-06, "loss": 0.3935, "step": 4330 }, { "epoch": 0.43196974221160545, "grad_norm": 0.7588290572166443, "learning_rate": 7.840151288941972e-06, "loss": 0.3324, "step": 4340 }, { "epoch": 0.43296506419826813, "grad_norm": 0.7947611808776855, "learning_rate": 7.835174679008661e-06, "loss": 0.3506, "step": 4350 }, { "epoch": 0.4339603861849308, "grad_norm": 0.6475954651832581, "learning_rate": 7.830198069075347e-06, "loss": 0.3103, "step": 4360 }, { "epoch": 0.4349557081715935, "grad_norm": 0.5702581405639648, "learning_rate": 7.825221459142032e-06, "loss": 0.3373, "step": 4370 }, { "epoch": 0.4359510301582562, "grad_norm": 0.7424353957176208, "learning_rate": 7.82024484920872e-06, "loss": 0.3593, "step": 4380 }, { "epoch": 0.4369463521449189, "grad_norm": 0.5749756693840027, "learning_rate": 7.815268239275407e-06, "loss": 0.3133, "step": 4390 }, { "epoch": 0.4379416741315816, "grad_norm": 0.5407712459564209, "learning_rate": 7.810291629342092e-06, "loss": 0.3584, "step": 4400 }, { "epoch": 0.4379416741315816, "eval_loss": 0.360762357711792, "eval_runtime": 148.1111, "eval_samples_per_second": 1.371, "eval_steps_per_second": 0.689, "step": 4400 }, { "epoch": 0.43893699611824427, "grad_norm": 0.5194666981697083, "learning_rate": 7.80531501940878e-06, "loss": 0.2957, "step": 4410 }, { "epoch": 0.43993231810490696, "grad_norm": 0.7961593866348267, "learning_rate": 7.800338409475467e-06, "loss": 0.3819, "step": 4420 }, { "epoch": 0.44092764009156965, "grad_norm": 0.6336628198623657, "learning_rate": 7.795361799542153e-06, "loss": 0.3123, "step": 4430 }, { "epoch": 0.4419229620782323, "grad_norm": 0.6935514211654663, "learning_rate": 7.790385189608838e-06, "loss": 0.3519, "step": 4440 }, { "epoch": 0.442918284064895, "grad_norm": 0.6400023698806763, "learning_rate": 7.785408579675526e-06, "loss": 0.3806, "step": 4450 }, { "epoch": 0.44391360605155766, "grad_norm": 0.9406591057777405, "learning_rate": 7.780431969742213e-06, "loss": 0.3282, "step": 4460 }, { "epoch": 0.44490892803822035, "grad_norm": 0.6432562470436096, "learning_rate": 7.775455359808899e-06, "loss": 0.302, "step": 4470 }, { "epoch": 0.44590425002488304, "grad_norm": 0.5700191259384155, "learning_rate": 7.770478749875586e-06, "loss": 0.3608, "step": 4480 }, { "epoch": 0.44689957201154573, "grad_norm": 0.7987110614776611, "learning_rate": 7.765502139942271e-06, "loss": 0.3363, "step": 4490 }, { "epoch": 0.4478948939982084, "grad_norm": 0.6581839323043823, "learning_rate": 7.760525530008959e-06, "loss": 0.3414, "step": 4500 }, { "epoch": 0.4478948939982084, "eval_loss": 0.35966184735298157, "eval_runtime": 148.1465, "eval_samples_per_second": 1.37, "eval_steps_per_second": 0.689, "step": 4500 }, { "epoch": 0.4488902159848711, "grad_norm": 0.6311335563659668, "learning_rate": 7.755548920075644e-06, "loss": 0.3768, "step": 4510 }, { "epoch": 0.4498855379715338, "grad_norm": 0.8850741982460022, "learning_rate": 7.750572310142332e-06, "loss": 0.3763, "step": 4520 }, { "epoch": 0.4508808599581965, "grad_norm": 0.5066502094268799, "learning_rate": 7.745595700209019e-06, "loss": 0.3412, "step": 4530 }, { "epoch": 0.4518761819448592, "grad_norm": 0.545430600643158, "learning_rate": 7.740619090275705e-06, "loss": 0.3737, "step": 4540 }, { "epoch": 0.45287150393152187, "grad_norm": 0.7061020731925964, "learning_rate": 7.735642480342392e-06, "loss": 0.3218, "step": 4550 }, { "epoch": 0.45386682591818456, "grad_norm": 0.5185464024543762, "learning_rate": 7.730665870409078e-06, "loss": 0.3489, "step": 4560 }, { "epoch": 0.4548621479048472, "grad_norm": 0.9102675318717957, "learning_rate": 7.725689260475765e-06, "loss": 0.3515, "step": 4570 }, { "epoch": 0.4558574698915099, "grad_norm": 0.7395256757736206, "learning_rate": 7.72071265054245e-06, "loss": 0.2873, "step": 4580 }, { "epoch": 0.45685279187817257, "grad_norm": 0.9186689853668213, "learning_rate": 7.715736040609138e-06, "loss": 0.3705, "step": 4590 }, { "epoch": 0.45784811386483526, "grad_norm": 0.6102734804153442, "learning_rate": 7.710759430675823e-06, "loss": 0.3389, "step": 4600 }, { "epoch": 0.45784811386483526, "eval_loss": 0.35844776034355164, "eval_runtime": 148.1097, "eval_samples_per_second": 1.371, "eval_steps_per_second": 0.689, "step": 4600 }, { "epoch": 0.45884343585149795, "grad_norm": 0.418071985244751, "learning_rate": 7.70578282074251e-06, "loss": 0.3454, "step": 4610 }, { "epoch": 0.45983875783816064, "grad_norm": 0.504802942276001, "learning_rate": 7.700806210809198e-06, "loss": 0.3419, "step": 4620 }, { "epoch": 0.46083407982482333, "grad_norm": 0.7918646335601807, "learning_rate": 7.695829600875884e-06, "loss": 0.3992, "step": 4630 }, { "epoch": 0.461829401811486, "grad_norm": 0.6944281458854675, "learning_rate": 7.690852990942571e-06, "loss": 0.3945, "step": 4640 }, { "epoch": 0.4628247237981487, "grad_norm": 0.648303210735321, "learning_rate": 7.685876381009257e-06, "loss": 0.3401, "step": 4650 }, { "epoch": 0.4638200457848114, "grad_norm": 0.812044084072113, "learning_rate": 7.680899771075944e-06, "loss": 0.3548, "step": 4660 }, { "epoch": 0.4648153677714741, "grad_norm": 0.7709999680519104, "learning_rate": 7.67592316114263e-06, "loss": 0.3702, "step": 4670 }, { "epoch": 0.4658106897581368, "grad_norm": 0.7904644012451172, "learning_rate": 7.670946551209317e-06, "loss": 0.3763, "step": 4680 }, { "epoch": 0.46680601174479947, "grad_norm": 0.7763231992721558, "learning_rate": 7.665969941276004e-06, "loss": 0.3495, "step": 4690 }, { "epoch": 0.46780133373146215, "grad_norm": 0.5270109176635742, "learning_rate": 7.66099333134269e-06, "loss": 0.3016, "step": 4700 }, { "epoch": 0.46780133373146215, "eval_loss": 0.35714709758758545, "eval_runtime": 148.1115, "eval_samples_per_second": 1.371, "eval_steps_per_second": 0.689, "step": 4700 }, { "epoch": 0.4687966557181248, "grad_norm": 0.6368373036384583, "learning_rate": 7.656016721409375e-06, "loss": 0.3323, "step": 4710 }, { "epoch": 0.4697919777047875, "grad_norm": 0.3973361551761627, "learning_rate": 7.651040111476064e-06, "loss": 0.3405, "step": 4720 }, { "epoch": 0.47078729969145017, "grad_norm": 0.8075085878372192, "learning_rate": 7.64606350154275e-06, "loss": 0.3436, "step": 4730 }, { "epoch": 0.47178262167811286, "grad_norm": 0.892672598361969, "learning_rate": 7.641086891609436e-06, "loss": 0.3662, "step": 4740 }, { "epoch": 0.47277794366477555, "grad_norm": 0.6311262845993042, "learning_rate": 7.636110281676123e-06, "loss": 0.3559, "step": 4750 }, { "epoch": 0.47377326565143824, "grad_norm": 0.7950363159179688, "learning_rate": 7.63113367174281e-06, "loss": 0.2974, "step": 4760 }, { "epoch": 0.4747685876381009, "grad_norm": 0.6539332270622253, "learning_rate": 7.626157061809496e-06, "loss": 0.3312, "step": 4770 }, { "epoch": 0.4757639096247636, "grad_norm": 0.7384660840034485, "learning_rate": 7.621180451876182e-06, "loss": 0.3825, "step": 4780 }, { "epoch": 0.4767592316114263, "grad_norm": 0.43817830085754395, "learning_rate": 7.6162038419428695e-06, "loss": 0.3462, "step": 4790 }, { "epoch": 0.477754553598089, "grad_norm": 0.7346156239509583, "learning_rate": 7.611227232009556e-06, "loss": 0.3377, "step": 4800 }, { "epoch": 0.477754553598089, "eval_loss": 0.355719655752182, "eval_runtime": 148.1914, "eval_samples_per_second": 1.37, "eval_steps_per_second": 0.688, "step": 4800 }, { "epoch": 0.4787498755847517, "grad_norm": 0.8043003082275391, "learning_rate": 7.6062506220762424e-06, "loss": 0.3625, "step": 4810 }, { "epoch": 0.4797451975714144, "grad_norm": 0.6644107103347778, "learning_rate": 7.601274012142929e-06, "loss": 0.3023, "step": 4820 }, { "epoch": 0.48074051955807706, "grad_norm": 0.7794090509414673, "learning_rate": 7.596297402209616e-06, "loss": 0.3552, "step": 4830 }, { "epoch": 0.4817358415447397, "grad_norm": 0.7449871301651001, "learning_rate": 7.591320792276302e-06, "loss": 0.3659, "step": 4840 }, { "epoch": 0.4827311635314024, "grad_norm": 0.881610631942749, "learning_rate": 7.586344182342988e-06, "loss": 0.3184, "step": 4850 }, { "epoch": 0.4837264855180651, "grad_norm": 0.8672296404838562, "learning_rate": 7.581367572409675e-06, "loss": 0.3324, "step": 4860 }, { "epoch": 0.48472180750472776, "grad_norm": 0.4788852334022522, "learning_rate": 7.576390962476362e-06, "loss": 0.3406, "step": 4870 }, { "epoch": 0.48571712949139045, "grad_norm": 0.6023631691932678, "learning_rate": 7.5714143525430485e-06, "loss": 0.3797, "step": 4880 }, { "epoch": 0.48671245147805314, "grad_norm": 0.6595234870910645, "learning_rate": 7.566437742609735e-06, "loss": 0.3199, "step": 4890 }, { "epoch": 0.48770777346471583, "grad_norm": 0.6189759969711304, "learning_rate": 7.561461132676421e-06, "loss": 0.373, "step": 4900 }, { "epoch": 0.48770777346471583, "eval_loss": 0.35428938269615173, "eval_runtime": 148.2777, "eval_samples_per_second": 1.369, "eval_steps_per_second": 0.688, "step": 4900 }, { "epoch": 0.4887030954513785, "grad_norm": 0.71135413646698, "learning_rate": 7.556484522743108e-06, "loss": 0.3232, "step": 4910 }, { "epoch": 0.4896984174380412, "grad_norm": 0.5228835940361023, "learning_rate": 7.551507912809794e-06, "loss": 0.3428, "step": 4920 }, { "epoch": 0.4906937394247039, "grad_norm": 0.9015726447105408, "learning_rate": 7.546531302876481e-06, "loss": 0.3889, "step": 4930 }, { "epoch": 0.4916890614113666, "grad_norm": 0.8351202011108398, "learning_rate": 7.541554692943168e-06, "loss": 0.3367, "step": 4940 }, { "epoch": 0.4926843833980293, "grad_norm": 0.6578547954559326, "learning_rate": 7.536578083009855e-06, "loss": 0.3646, "step": 4950 }, { "epoch": 0.49367970538469197, "grad_norm": 1.1061774492263794, "learning_rate": 7.531601473076541e-06, "loss": 0.351, "step": 4960 }, { "epoch": 0.49467502737135466, "grad_norm": 0.636061429977417, "learning_rate": 7.526624863143227e-06, "loss": 0.3434, "step": 4970 }, { "epoch": 0.4956703493580173, "grad_norm": 0.6666164994239807, "learning_rate": 7.521648253209915e-06, "loss": 0.3462, "step": 4980 }, { "epoch": 0.49666567134468, "grad_norm": 0.8288053274154663, "learning_rate": 7.5166716432766e-06, "loss": 0.3862, "step": 4990 }, { "epoch": 0.49766099333134267, "grad_norm": 0.5653735399246216, "learning_rate": 7.511695033343287e-06, "loss": 0.3559, "step": 5000 }, { "epoch": 0.49766099333134267, "eval_loss": 0.35338979959487915, "eval_runtime": 148.2313, "eval_samples_per_second": 1.369, "eval_steps_per_second": 0.688, "step": 5000 }, { "epoch": 0.49865631531800536, "grad_norm": 1.083835482597351, "learning_rate": 7.506718423409973e-06, "loss": 0.3697, "step": 5010 }, { "epoch": 0.49965163730466805, "grad_norm": 0.7271355986595154, "learning_rate": 7.501741813476661e-06, "loss": 0.2915, "step": 5020 }, { "epoch": 0.5006469592913307, "grad_norm": 0.6525740027427673, "learning_rate": 7.496765203543347e-06, "loss": 0.3571, "step": 5030 }, { "epoch": 0.5016422812779935, "grad_norm": 1.00348961353302, "learning_rate": 7.4917885936100336e-06, "loss": 0.3254, "step": 5040 }, { "epoch": 0.5026376032646561, "grad_norm": 0.7707570195198059, "learning_rate": 7.486811983676721e-06, "loss": 0.3544, "step": 5050 }, { "epoch": 0.5036329252513188, "grad_norm": 0.7804340720176697, "learning_rate": 7.4818353737434065e-06, "loss": 0.3346, "step": 5060 }, { "epoch": 0.5046282472379815, "grad_norm": 1.0899609327316284, "learning_rate": 7.476858763810093e-06, "loss": 0.3296, "step": 5070 }, { "epoch": 0.5056235692246441, "grad_norm": 0.6863502264022827, "learning_rate": 7.471882153876779e-06, "loss": 0.3569, "step": 5080 }, { "epoch": 0.5066188912113069, "grad_norm": 1.15005362033844, "learning_rate": 7.466905543943467e-06, "loss": 0.2829, "step": 5090 }, { "epoch": 0.5076142131979695, "grad_norm": 0.699102520942688, "learning_rate": 7.461928934010153e-06, "loss": 0.3727, "step": 5100 }, { "epoch": 0.5076142131979695, "eval_loss": 0.35266318917274475, "eval_runtime": 148.2339, "eval_samples_per_second": 1.369, "eval_steps_per_second": 0.688, "step": 5100 }, { "epoch": 0.5086095351846323, "grad_norm": 0.9547719359397888, "learning_rate": 7.45695232407684e-06, "loss": 0.4042, "step": 5110 }, { "epoch": 0.5096048571712949, "grad_norm": 0.9959189891815186, "learning_rate": 7.451975714143525e-06, "loss": 0.3115, "step": 5120 }, { "epoch": 0.5106001791579576, "grad_norm": 0.6266285181045532, "learning_rate": 7.446999104210213e-06, "loss": 0.3485, "step": 5130 }, { "epoch": 0.5115955011446203, "grad_norm": 0.711664617061615, "learning_rate": 7.442022494276899e-06, "loss": 0.3699, "step": 5140 }, { "epoch": 0.512590823131283, "grad_norm": 1.0690807104110718, "learning_rate": 7.4370458843435855e-06, "loss": 0.3248, "step": 5150 }, { "epoch": 0.5135861451179456, "grad_norm": 1.2619460821151733, "learning_rate": 7.432069274410272e-06, "loss": 0.3284, "step": 5160 }, { "epoch": 0.5145814671046084, "grad_norm": 0.9510999917984009, "learning_rate": 7.427092664476959e-06, "loss": 0.3491, "step": 5170 }, { "epoch": 0.515576789091271, "grad_norm": 1.012990117073059, "learning_rate": 7.422116054543646e-06, "loss": 0.3659, "step": 5180 }, { "epoch": 0.5165721110779337, "grad_norm": 0.5469540953636169, "learning_rate": 7.417139444610332e-06, "loss": 0.2709, "step": 5190 }, { "epoch": 0.5175674330645964, "grad_norm": 0.6974226236343384, "learning_rate": 7.4121628346770195e-06, "loss": 0.3668, "step": 5200 }, { "epoch": 0.5175674330645964, "eval_loss": 0.35165390372276306, "eval_runtime": 148.2087, "eval_samples_per_second": 1.37, "eval_steps_per_second": 0.688, "step": 5200 }, { "epoch": 0.518562755051259, "grad_norm": 0.8949996829032898, "learning_rate": 7.407186224743705e-06, "loss": 0.3305, "step": 5210 }, { "epoch": 0.5195580770379218, "grad_norm": 0.6786302328109741, "learning_rate": 7.4022096148103915e-06, "loss": 0.3312, "step": 5220 }, { "epoch": 0.5205533990245844, "grad_norm": 0.6699957251548767, "learning_rate": 7.397233004877078e-06, "loss": 0.3429, "step": 5230 }, { "epoch": 0.5215487210112472, "grad_norm": 0.5877237915992737, "learning_rate": 7.392256394943765e-06, "loss": 0.3214, "step": 5240 }, { "epoch": 0.5225440429979098, "grad_norm": 0.7005926966667175, "learning_rate": 7.387279785010452e-06, "loss": 0.3816, "step": 5250 }, { "epoch": 0.5235393649845725, "grad_norm": 0.7223731279373169, "learning_rate": 7.382303175077138e-06, "loss": 0.3773, "step": 5260 }, { "epoch": 0.5245346869712352, "grad_norm": 0.9617743492126465, "learning_rate": 7.377326565143824e-06, "loss": 0.3441, "step": 5270 }, { "epoch": 0.5255300089578979, "grad_norm": 0.6759951114654541, "learning_rate": 7.372349955210511e-06, "loss": 0.3464, "step": 5280 }, { "epoch": 0.5265253309445606, "grad_norm": 0.600290834903717, "learning_rate": 7.367373345277198e-06, "loss": 0.3202, "step": 5290 }, { "epoch": 0.5275206529312233, "grad_norm": 0.6212776899337769, "learning_rate": 7.362396735343884e-06, "loss": 0.3995, "step": 5300 }, { "epoch": 0.5275206529312233, "eval_loss": 0.35058361291885376, "eval_runtime": 148.2235, "eval_samples_per_second": 1.37, "eval_steps_per_second": 0.688, "step": 5300 } ], "logging_steps": 10, "max_steps": 20094, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.109336661739546e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }