{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 36806, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013584741618214422, "grad_norm": 0.336629520848516, "learning_rate": 8.868778280542987e-05, "loss": 0.9342, "step": 50 }, { "epoch": 0.0027169483236428845, "grad_norm": 0.41112101024853426, "learning_rate": 0.00017918552036199096, "loss": 0.5198, "step": 100 }, { "epoch": 0.004075422485464326, "grad_norm": 0.38040867145053625, "learning_rate": 0.0002696832579185521, "loss": 0.4962, "step": 150 }, { "epoch": 0.005433896647285769, "grad_norm": 0.5231995312551354, "learning_rate": 0.00036018099547511313, "loss": 0.5198, "step": 200 }, { "epoch": 0.006792370809107211, "grad_norm": 0.37419434381789457, "learning_rate": 0.00039999942189133334, "loss": 0.5615, "step": 250 }, { "epoch": 0.008150844970928653, "grad_norm": 0.5051525763054373, "learning_rate": 0.00039999551377337605, "loss": 0.5629, "step": 300 }, { "epoch": 0.009509319132750094, "grad_norm": 0.37940069661143105, "learning_rate": 0.0003999879188244911, "loss": 0.5715, "step": 350 }, { "epoch": 0.010867793294571538, "grad_norm": 0.4187127030134641, "learning_rate": 0.0003999766371846881, "loss": 0.5191, "step": 400 }, { "epoch": 0.01222626745639298, "grad_norm": 0.4287344223835902, "learning_rate": 0.00039996166906193926, "loss": 0.5266, "step": 450 }, { "epoch": 0.013584741618214421, "grad_norm": 0.42083341057731205, "learning_rate": 0.00039994301473217543, "loss": 0.4993, "step": 500 }, { "epoch": 0.014943215780035863, "grad_norm": 0.44004507818862004, "learning_rate": 0.00039992067453928115, "loss": 0.5008, "step": 550 }, { "epoch": 0.016301689941857305, "grad_norm": 0.3947637329018398, "learning_rate": 0.0003998946488950882, "loss": 0.5199, "step": 600 }, { "epoch": 0.01766016410367875, "grad_norm": 0.35666971296762606, "learning_rate": 0.0003998649382793681, "loss": 0.5024, "step": 650 }, { "epoch": 0.01901863826550019, "grad_norm": 0.3864531806325715, "learning_rate": 0.0003998315432398232, "loss": 0.5058, "step": 700 }, { "epoch": 0.020377112427321632, "grad_norm": 0.3979644110719079, "learning_rate": 0.00039979446439207663, "loss": 0.4833, "step": 750 }, { "epoch": 0.021735586589143076, "grad_norm": 0.32463557091237133, "learning_rate": 0.000399753702419661, "loss": 0.5181, "step": 800 }, { "epoch": 0.023094060750964516, "grad_norm": 0.36185137659998867, "learning_rate": 0.0003997092580740055, "loss": 0.4731, "step": 850 }, { "epoch": 0.02445253491278596, "grad_norm": 0.36585435226850893, "learning_rate": 0.00039966113217442266, "loss": 0.5028, "step": 900 }, { "epoch": 0.0258110090746074, "grad_norm": 0.41125601490924335, "learning_rate": 0.00039960932560809256, "loss": 0.4907, "step": 950 }, { "epoch": 0.027169483236428843, "grad_norm": 0.3848215921898568, "learning_rate": 0.0003995538393300469, "loss": 0.483, "step": 1000 }, { "epoch": 0.028527957398250287, "grad_norm": 0.33951726776456204, "learning_rate": 0.0003994946743631513, "loss": 0.4567, "step": 1050 }, { "epoch": 0.029886431560071727, "grad_norm": 0.3234265221555652, "learning_rate": 0.0003994318317980862, "loss": 0.4663, "step": 1100 }, { "epoch": 0.03124490572189317, "grad_norm": 0.3997241882392808, "learning_rate": 0.0003993666791947118, "loss": 0.4709, "step": 1150 }, { "epoch": 0.03260337988371461, "grad_norm": 0.45209570419286, "learning_rate": 0.00039929799689193896, "loss": 0.4836, "step": 1200 }, { "epoch": 0.033961854045536054, "grad_norm": 0.31674750892690945, "learning_rate": 0.000399224275685374, "loss": 0.4708, "step": 1250 }, { "epoch": 0.0353203282073575, "grad_norm": 0.3336127152054193, "learning_rate": 0.0003991468818653226, "loss": 0.48, "step": 1300 }, { "epoch": 0.03667880236917894, "grad_norm": 0.30959090922038635, "learning_rate": 0.0003990658168585062, "loss": 0.4568, "step": 1350 }, { "epoch": 0.03803727653100038, "grad_norm": 0.37908103624960793, "learning_rate": 0.0003989810821593234, "loss": 0.4867, "step": 1400 }, { "epoch": 0.03939575069282182, "grad_norm": 0.3746655333602711, "learning_rate": 0.0003988944833236597, "loss": 0.4685, "step": 1450 }, { "epoch": 0.040754224854643264, "grad_norm": 0.3841226468141105, "learning_rate": 0.0003988043631486177, "loss": 0.4788, "step": 1500 }, { "epoch": 0.04211269901646471, "grad_norm": 0.4460643403291581, "learning_rate": 0.0003987087755737732, "loss": 0.4502, "step": 1550 }, { "epoch": 0.04347117317828615, "grad_norm": 0.3671013360773688, "learning_rate": 0.00039860952488846415, "loss": 0.4534, "step": 1600 }, { "epoch": 0.04482964734010759, "grad_norm": 0.34747090480175297, "learning_rate": 0.000398506612922334, "loss": 0.4405, "step": 1650 }, { "epoch": 0.04618812150192903, "grad_norm": 0.36167597296572834, "learning_rate": 0.00039840004157252047, "loss": 0.4778, "step": 1700 }, { "epoch": 0.047546595663750475, "grad_norm": 0.4009770951011953, "learning_rate": 0.0003982898128036203, "loss": 0.479, "step": 1750 }, { "epoch": 0.04890506982557192, "grad_norm": 0.5968187908901367, "learning_rate": 0.00039818055417120837, "loss": 0.5444, "step": 1800 }, { "epoch": 0.05026354398739336, "grad_norm": 1.8105646301472897, "learning_rate": 0.0003980655464341668, "loss": 0.5653, "step": 1850 }, { "epoch": 0.0516220181492148, "grad_norm": 0.789226312089056, "learning_rate": 0.0003979470320337101, "loss": 0.7458, "step": 1900 }, { "epoch": 0.05298049231103624, "grad_norm": 0.9263531743492742, "learning_rate": 0.0003978224864094713, "loss": 0.5415, "step": 1950 }, { "epoch": 0.054338966472857686, "grad_norm": 0.47720194915599734, "learning_rate": 0.00039769429401312583, "loss": 0.5115, "step": 2000 }, { "epoch": 0.05569744063467913, "grad_norm": 0.6259624884089505, "learning_rate": 0.0003975624572078452, "loss": 0.4968, "step": 2050 }, { "epoch": 0.05705591479650057, "grad_norm": 0.6070556198109234, "learning_rate": 0.00039742697842398407, "loss": 0.4721, "step": 2100 }, { "epoch": 0.05841438895832201, "grad_norm": 0.9495402483045952, "learning_rate": 0.00039728786015903527, "loss": 0.5384, "step": 2150 }, { "epoch": 0.05977286312014345, "grad_norm": 0.5094390312707457, "learning_rate": 0.0003971451049775838, "loss": 0.4768, "step": 2200 }, { "epoch": 0.0611313372819649, "grad_norm": 0.4164209148361273, "learning_rate": 0.00039699871551126, "loss": 0.4745, "step": 2250 }, { "epoch": 0.06248981144378634, "grad_norm": 0.4096404988646368, "learning_rate": 0.0003968486944586903, "loss": 0.4571, "step": 2300 }, { "epoch": 0.06384828560560778, "grad_norm": 0.36680803179529065, "learning_rate": 0.00039669504458544815, "loss": 0.4542, "step": 2350 }, { "epoch": 0.06520675976742922, "grad_norm": 0.4436494287681541, "learning_rate": 0.00039653776872400245, "loss": 0.4526, "step": 2400 }, { "epoch": 0.06656523392925066, "grad_norm": 0.632255456628572, "learning_rate": 0.0003963768697736659, "loss": 0.4614, "step": 2450 }, { "epoch": 0.06792370809107211, "grad_norm": 0.27294306772696525, "learning_rate": 0.0003962123507005411, "loss": 0.4247, "step": 2500 }, { "epoch": 0.06928218225289355, "grad_norm": 0.4922506411666624, "learning_rate": 0.00039604421453746615, "loss": 0.4336, "step": 2550 }, { "epoch": 0.070640656414715, "grad_norm": 0.3836605187611845, "learning_rate": 0.00039587246438395866, "loss": 0.4308, "step": 2600 }, { "epoch": 0.07199913057653644, "grad_norm": 0.30031785058018967, "learning_rate": 0.0003956971034061584, "loss": 0.4336, "step": 2650 }, { "epoch": 0.07335760473835788, "grad_norm": 0.30896010198949975, "learning_rate": 0.00039551813483676944, "loss": 0.4282, "step": 2700 }, { "epoch": 0.07471607890017933, "grad_norm": 0.3264468040624618, "learning_rate": 0.0003953355619749999, "loss": 0.4466, "step": 2750 }, { "epoch": 0.07607455306200075, "grad_norm": 0.47417679268203056, "learning_rate": 0.0003951493881865018, "loss": 0.455, "step": 2800 }, { "epoch": 0.0774330272238222, "grad_norm": 0.467248267974456, "learning_rate": 0.0003949596169033084, "loss": 0.436, "step": 2850 }, { "epoch": 0.07879150138564364, "grad_norm": 0.2574312755334094, "learning_rate": 0.0003947662516237714, "loss": 0.4263, "step": 2900 }, { "epoch": 0.08014997554746509, "grad_norm": 0.40465633762609327, "learning_rate": 0.0003945692959124962, "loss": 0.4275, "step": 2950 }, { "epoch": 0.08150844970928653, "grad_norm": 0.3105964962014554, "learning_rate": 0.0003943687534002764, "loss": 0.4063, "step": 3000 }, { "epoch": 0.08286692387110797, "grad_norm": 0.36079130494736616, "learning_rate": 0.00039416462778402644, "loss": 0.4291, "step": 3050 }, { "epoch": 0.08422539803292942, "grad_norm": 0.27991323241801247, "learning_rate": 0.0003939569228267139, "loss": 0.4294, "step": 3100 }, { "epoch": 0.08558387219475086, "grad_norm": 0.28381836513166353, "learning_rate": 0.00039374564235729017, "loss": 0.4198, "step": 3150 }, { "epoch": 0.0869423463565723, "grad_norm": 0.3299204189197327, "learning_rate": 0.00039353079027061935, "loss": 0.4103, "step": 3200 }, { "epoch": 0.08830082051839375, "grad_norm": 0.326311631143589, "learning_rate": 0.0003933123705274068, "loss": 0.4297, "step": 3250 }, { "epoch": 0.08965929468021518, "grad_norm": 0.3302186726703939, "learning_rate": 0.0003930903871541262, "loss": 0.4129, "step": 3300 }, { "epoch": 0.09101776884203662, "grad_norm": 0.24354600292418105, "learning_rate": 0.00039286484424294534, "loss": 0.4178, "step": 3350 }, { "epoch": 0.09237624300385806, "grad_norm": 0.33427871787687957, "learning_rate": 0.00039263574595165007, "loss": 0.4229, "step": 3400 }, { "epoch": 0.09373471716567951, "grad_norm": 0.3922838514063193, "learning_rate": 0.00039240309650356874, "loss": 0.416, "step": 3450 }, { "epoch": 0.09509319132750095, "grad_norm": 0.37066104429189983, "learning_rate": 0.0003921669001874933, "loss": 0.4359, "step": 3500 }, { "epoch": 0.0964516654893224, "grad_norm": 0.29257797493644805, "learning_rate": 0.0003919271613576008, "loss": 0.4286, "step": 3550 }, { "epoch": 0.09781013965114384, "grad_norm": 0.45431531124773644, "learning_rate": 0.0003916838844333732, "loss": 0.4291, "step": 3600 }, { "epoch": 0.09916861381296528, "grad_norm": 0.31092097963579685, "learning_rate": 0.0003914370738995154, "loss": 0.431, "step": 3650 }, { "epoch": 0.10052708797478672, "grad_norm": 0.2895204897106821, "learning_rate": 0.00039118673430587307, "loss": 0.4372, "step": 3700 }, { "epoch": 0.10188556213660817, "grad_norm": 0.2576792476027214, "learning_rate": 0.0003909328702673485, "loss": 0.4527, "step": 3750 }, { "epoch": 0.1032440362984296, "grad_norm": 0.30963702592618436, "learning_rate": 0.0003906754864638156, "loss": 0.4121, "step": 3800 }, { "epoch": 0.10460251046025104, "grad_norm": 0.6475657508703404, "learning_rate": 0.0003904145876400337, "loss": 0.4224, "step": 3850 }, { "epoch": 0.10596098462207248, "grad_norm": 0.3256994801441109, "learning_rate": 0.00039015017860555984, "loss": 0.4363, "step": 3900 }, { "epoch": 0.10731945878389393, "grad_norm": 0.3471706882794347, "learning_rate": 0.0003898822642346604, "loss": 0.4252, "step": 3950 }, { "epoch": 0.10867793294571537, "grad_norm": 0.29742285163537485, "learning_rate": 0.00038961084946622114, "loss": 0.41, "step": 4000 }, { "epoch": 0.11003640710753682, "grad_norm": 0.42367795916225637, "learning_rate": 0.0003893359393036561, "loss": 0.4047, "step": 4050 }, { "epoch": 0.11139488126935826, "grad_norm": 0.3766799857882688, "learning_rate": 0.0003890575388148154, "loss": 0.4142, "step": 4100 }, { "epoch": 0.1127533554311797, "grad_norm": 0.4163460710146468, "learning_rate": 0.00038877565313189184, "loss": 0.4467, "step": 4150 }, { "epoch": 0.11411182959300115, "grad_norm": 0.2826407429662945, "learning_rate": 0.00038849028745132627, "loss": 0.4149, "step": 4200 }, { "epoch": 0.11547030375482259, "grad_norm": 0.3265965959677555, "learning_rate": 0.0003882014470337117, "loss": 0.4358, "step": 4250 }, { "epoch": 0.11682877791664402, "grad_norm": 0.24777830179313484, "learning_rate": 0.00038790913720369657, "loss": 0.4012, "step": 4300 }, { "epoch": 0.11818725207846546, "grad_norm": 0.2915403708081659, "learning_rate": 0.00038761336334988634, "loss": 0.4069, "step": 4350 }, { "epoch": 0.1195457262402869, "grad_norm": 0.3326202807353683, "learning_rate": 0.00038731413092474423, "loss": 0.3902, "step": 4400 }, { "epoch": 0.12090420040210835, "grad_norm": 0.3965527555219645, "learning_rate": 0.00038701144544449085, "loss": 0.3894, "step": 4450 }, { "epoch": 0.1222626745639298, "grad_norm": 0.36617448279834447, "learning_rate": 0.0003867053124890022, "loss": 0.3993, "step": 4500 }, { "epoch": 0.12362114872575124, "grad_norm": 0.2978802526091461, "learning_rate": 0.0003863957377017073, "loss": 0.3934, "step": 4550 }, { "epoch": 0.12497962288757268, "grad_norm": 0.3199935306648141, "learning_rate": 0.0003860827267894834, "loss": 0.4015, "step": 4600 }, { "epoch": 0.1263380970493941, "grad_norm": 0.28870094415921566, "learning_rate": 0.00038576628552255173, "loss": 0.4242, "step": 4650 }, { "epoch": 0.12769657121121555, "grad_norm": 0.3368662682662353, "learning_rate": 0.00038544641973437026, "loss": 0.4078, "step": 4700 }, { "epoch": 0.129055045373037, "grad_norm": 0.34171191785593347, "learning_rate": 0.0003851231353215267, "loss": 0.4184, "step": 4750 }, { "epoch": 0.13041351953485844, "grad_norm": 0.3456350837751831, "learning_rate": 0.00038479643824362956, "loss": 0.4011, "step": 4800 }, { "epoch": 0.13177199369667988, "grad_norm": 0.44062561750629825, "learning_rate": 0.00038446633452319845, "loss": 0.4179, "step": 4850 }, { "epoch": 0.13313046785850133, "grad_norm": 0.30494197906676423, "learning_rate": 0.00038413283024555284, "loss": 0.3987, "step": 4900 }, { "epoch": 0.13448894202032277, "grad_norm": 0.26281770974778434, "learning_rate": 0.00038379593155870006, "loss": 0.3745, "step": 4950 }, { "epoch": 0.13584741618214421, "grad_norm": 0.33576603130586835, "learning_rate": 0.00038345564467322197, "loss": 0.3981, "step": 5000 }, { "epoch": 0.13720589034396566, "grad_norm": 0.3395300651756381, "learning_rate": 0.00038311197586216023, "loss": 0.3908, "step": 5050 }, { "epoch": 0.1385643645057871, "grad_norm": 0.3641141735123601, "learning_rate": 0.0003827649314609011, "loss": 0.4156, "step": 5100 }, { "epoch": 0.13992283866760855, "grad_norm": 0.3880481801495523, "learning_rate": 0.00038241451786705824, "loss": 0.4225, "step": 5150 }, { "epoch": 0.14128131282943, "grad_norm": 0.33587356117829287, "learning_rate": 0.0003820607415403548, "loss": 0.4322, "step": 5200 }, { "epoch": 0.14263978699125143, "grad_norm": 0.2651951238410833, "learning_rate": 0.0003817036090025046, "loss": 0.3882, "step": 5250 }, { "epoch": 0.14399826115307288, "grad_norm": 0.3108835459594419, "learning_rate": 0.0003813431268370919, "loss": 0.3962, "step": 5300 }, { "epoch": 0.14535673531489432, "grad_norm": 0.5822321494392535, "learning_rate": 0.0003809793016894496, "loss": 0.4092, "step": 5350 }, { "epoch": 0.14671520947671576, "grad_norm": 0.37563297659114, "learning_rate": 0.0003806121402665372, "loss": 0.4168, "step": 5400 }, { "epoch": 0.1480736836385372, "grad_norm": 0.3315292653141971, "learning_rate": 0.00038024164933681703, "loss": 0.4094, "step": 5450 }, { "epoch": 0.14943215780035865, "grad_norm": 0.3989400802203129, "learning_rate": 0.00037986783573012935, "loss": 0.4068, "step": 5500 }, { "epoch": 0.15079063196218007, "grad_norm": 0.3194388411256492, "learning_rate": 0.0003794907063375666, "loss": 0.4003, "step": 5550 }, { "epoch": 0.1521491061240015, "grad_norm": 0.30240424166641394, "learning_rate": 0.00037911026811134616, "loss": 0.407, "step": 5600 }, { "epoch": 0.15350758028582295, "grad_norm": 0.351737936530188, "learning_rate": 0.0003787265280646825, "loss": 0.4107, "step": 5650 }, { "epoch": 0.1548660544476444, "grad_norm": 0.3836451635236239, "learning_rate": 0.0003783394932716577, "loss": 0.3999, "step": 5700 }, { "epoch": 0.15622452860946584, "grad_norm": 0.25767700494067813, "learning_rate": 0.0003779491708670909, "loss": 0.388, "step": 5750 }, { "epoch": 0.15758300277128728, "grad_norm": 0.35195866802663683, "learning_rate": 0.00037755556804640723, "loss": 0.3986, "step": 5800 }, { "epoch": 0.15894147693310873, "grad_norm": 0.37789059875509895, "learning_rate": 0.00037715869206550467, "loss": 0.4124, "step": 5850 }, { "epoch": 0.16029995109493017, "grad_norm": 0.29714505650099465, "learning_rate": 0.0003767585502406204, "loss": 0.382, "step": 5900 }, { "epoch": 0.16165842525675161, "grad_norm": 0.36569606019843054, "learning_rate": 0.0003763551499481964, "loss": 0.4091, "step": 5950 }, { "epoch": 0.16301689941857306, "grad_norm": 0.5124251442347727, "learning_rate": 0.0003759484986247426, "loss": 0.3957, "step": 6000 }, { "epoch": 0.1643753735803945, "grad_norm": 0.42061839044447147, "learning_rate": 0.0003755386037667007, "loss": 0.3939, "step": 6050 }, { "epoch": 0.16573384774221595, "grad_norm": 0.278996914346875, "learning_rate": 0.0003751254729303053, "loss": 0.4171, "step": 6100 }, { "epoch": 0.1670923219040374, "grad_norm": 0.22931110142699168, "learning_rate": 0.0003747091137314451, "loss": 0.4037, "step": 6150 }, { "epoch": 0.16845079606585883, "grad_norm": 0.4632003674215028, "learning_rate": 0.00037428953384552197, "loss": 0.3856, "step": 6200 }, { "epoch": 0.16980927022768028, "grad_norm": 0.3456285538182738, "learning_rate": 0.00037386674100730986, "loss": 0.3887, "step": 6250 }, { "epoch": 0.17116774438950172, "grad_norm": 0.4299448792360789, "learning_rate": 0.0003734407430108124, "loss": 0.3802, "step": 6300 }, { "epoch": 0.17252621855132316, "grad_norm": 0.26911515847901823, "learning_rate": 0.0003730115477091185, "loss": 0.3906, "step": 6350 }, { "epoch": 0.1738846927131446, "grad_norm": 0.24894262795042146, "learning_rate": 0.00037257916301425823, "loss": 0.3743, "step": 6400 }, { "epoch": 0.17524316687496605, "grad_norm": 0.36696930956387547, "learning_rate": 0.00037214359689705676, "loss": 0.3977, "step": 6450 }, { "epoch": 0.1766016410367875, "grad_norm": 0.4978113756154764, "learning_rate": 0.0003717048573869873, "loss": 0.3782, "step": 6500 }, { "epoch": 0.1779601151986089, "grad_norm": 0.3180001579838143, "learning_rate": 0.00037126295257202324, "loss": 0.3975, "step": 6550 }, { "epoch": 0.17931858936043035, "grad_norm": 0.408582540741261, "learning_rate": 0.0003708178905984891, "loss": 0.3763, "step": 6600 }, { "epoch": 0.1806770635222518, "grad_norm": 0.27659616140789006, "learning_rate": 0.00037036967967091005, "loss": 0.4013, "step": 6650 }, { "epoch": 0.18203553768407324, "grad_norm": 0.16425635963494883, "learning_rate": 0.00036991832805186107, "loss": 0.3865, "step": 6700 }, { "epoch": 0.18339401184589468, "grad_norm": 0.37646449133193777, "learning_rate": 0.00036946384406181425, "loss": 0.3892, "step": 6750 }, { "epoch": 0.18475248600771613, "grad_norm": 0.3427684491705063, "learning_rate": 0.0003690062360789858, "loss": 0.3969, "step": 6800 }, { "epoch": 0.18611096016953757, "grad_norm": 0.392156716422773, "learning_rate": 0.0003685455125391811, "loss": 0.3709, "step": 6850 }, { "epoch": 0.18746943433135901, "grad_norm": 0.3626113579097081, "learning_rate": 0.0003680816819356398, "loss": 0.3929, "step": 6900 }, { "epoch": 0.18882790849318046, "grad_norm": 0.3652111299862884, "learning_rate": 0.00036761475281887863, "loss": 0.3941, "step": 6950 }, { "epoch": 0.1901863826550019, "grad_norm": 0.3038051956143636, "learning_rate": 0.0003671541644021072, "loss": 0.4019, "step": 7000 }, { "epoch": 0.19154485681682334, "grad_norm": 0.34844140332875567, "learning_rate": 0.00036668112567831633, "loss": 0.3666, "step": 7050 }, { "epoch": 0.1929033309786448, "grad_norm": 0.31453187134210386, "learning_rate": 0.0003662050142599555, "loss": 0.4062, "step": 7100 }, { "epoch": 0.19426180514046623, "grad_norm": 0.29627297234275446, "learning_rate": 0.00036572583892393305, "loss": 0.3807, "step": 7150 }, { "epoch": 0.19562027930228768, "grad_norm": 0.37160716135610367, "learning_rate": 0.0003652436085036393, "loss": 0.3936, "step": 7200 }, { "epoch": 0.19697875346410912, "grad_norm": 0.3178371792735983, "learning_rate": 0.0003647583318887839, "loss": 0.3942, "step": 7250 }, { "epoch": 0.19833722762593056, "grad_norm": 0.36580162938243826, "learning_rate": 0.0003642700180252315, "loss": 0.3932, "step": 7300 }, { "epoch": 0.199695701787752, "grad_norm": 0.2503536243665017, "learning_rate": 0.0003637786759148375, "loss": 0.3835, "step": 7350 }, { "epoch": 0.20105417594957345, "grad_norm": 0.3422541205714727, "learning_rate": 0.0003632942313704729, "loss": 0.3869, "step": 7400 }, { "epoch": 0.2024126501113949, "grad_norm": 0.2635673669764689, "learning_rate": 0.00036279692010693837, "loss": 0.374, "step": 7450 }, { "epoch": 0.20377112427321634, "grad_norm": 0.32133691105349677, "learning_rate": 0.0003622966077524861, "loss": 0.3829, "step": 7500 }, { "epoch": 0.20512959843503775, "grad_norm": 0.2963373700211773, "learning_rate": 0.0003617933035301583, "loss": 0.3784, "step": 7550 }, { "epoch": 0.2064880725968592, "grad_norm": 0.34175259284676457, "learning_rate": 0.000361287016718151, "loss": 0.3634, "step": 7600 }, { "epoch": 0.20784654675868064, "grad_norm": 0.21695916175140184, "learning_rate": 0.0003607777566496428, "loss": 0.3913, "step": 7650 }, { "epoch": 0.20920502092050208, "grad_norm": 0.38835064397473545, "learning_rate": 0.00036027580617629013, "loss": 0.3937, "step": 7700 }, { "epoch": 0.21056349508232353, "grad_norm": 0.33577145967062894, "learning_rate": 0.00035976068680901367, "loss": 0.4041, "step": 7750 }, { "epoch": 0.21192196924414497, "grad_norm": 0.2829507495162838, "learning_rate": 0.0003592426223224691, "loss": 0.3885, "step": 7800 }, { "epoch": 0.2132804434059664, "grad_norm": 0.30282305495180506, "learning_rate": 0.00035872162226695156, "loss": 0.425, "step": 7850 }, { "epoch": 0.21463891756778786, "grad_norm": 0.26572704020691806, "learning_rate": 0.000358197696246872, "loss": 0.3719, "step": 7900 }, { "epoch": 0.2159973917296093, "grad_norm": 0.4362525108778928, "learning_rate": 0.0003576708539205804, "loss": 0.3751, "step": 7950 }, { "epoch": 0.21735586589143074, "grad_norm": 0.25828364696271605, "learning_rate": 0.0003571411050001875, "loss": 0.3863, "step": 8000 }, { "epoch": 0.2187143400532522, "grad_norm": 0.3678501770422065, "learning_rate": 0.00035660845925138585, "loss": 0.3931, "step": 8050 }, { "epoch": 0.22007281421507363, "grad_norm": 0.27819454488467, "learning_rate": 0.00035607292649326983, "loss": 0.3633, "step": 8100 }, { "epoch": 0.22143128837689507, "grad_norm": 0.32104809246529153, "learning_rate": 0.00035553451659815457, "loss": 0.3914, "step": 8150 }, { "epoch": 0.22278976253871652, "grad_norm": 0.32593288782577173, "learning_rate": 0.000354993239491394, "loss": 0.3721, "step": 8200 }, { "epoch": 0.22414823670053796, "grad_norm": 0.2740573766851601, "learning_rate": 0.00035444910515119776, "loss": 0.3725, "step": 8250 }, { "epoch": 0.2255067108623594, "grad_norm": 0.6372146103791719, "learning_rate": 0.00035390212360844744, "loss": 0.3786, "step": 8300 }, { "epoch": 0.22686518502418085, "grad_norm": 0.3847972604563355, "learning_rate": 0.00035335230494651165, "loss": 0.3807, "step": 8350 }, { "epoch": 0.2282236591860023, "grad_norm": 0.3297830679777594, "learning_rate": 0.00035279965930105987, "loss": 0.3757, "step": 8400 }, { "epoch": 0.22958213334782374, "grad_norm": 0.27775568777577214, "learning_rate": 0.00035224419685987593, "loss": 0.3796, "step": 8450 }, { "epoch": 0.23094060750964518, "grad_norm": 0.2985445199508582, "learning_rate": 0.0003516859278626702, "loss": 0.385, "step": 8500 }, { "epoch": 0.2322990816714666, "grad_norm": 0.2822498403343061, "learning_rate": 0.00035112486260089026, "loss": 0.3654, "step": 8550 }, { "epoch": 0.23365755583328804, "grad_norm": 0.29173659614651143, "learning_rate": 0.0003505610114175323, "loss": 0.3693, "step": 8600 }, { "epoch": 0.23501602999510948, "grad_norm": 0.37316571053101977, "learning_rate": 0.00034999438470694903, "loss": 0.3624, "step": 8650 }, { "epoch": 0.23637450415693093, "grad_norm": 0.2991870149215513, "learning_rate": 0.0003494249929146593, "loss": 0.3944, "step": 8700 }, { "epoch": 0.23773297831875237, "grad_norm": 0.2626766731044403, "learning_rate": 0.000348852846537155, "loss": 0.3562, "step": 8750 }, { "epoch": 0.2390914524805738, "grad_norm": 0.3742270372557048, "learning_rate": 0.0003482779561217074, "loss": 0.3737, "step": 8800 }, { "epoch": 0.24044992664239526, "grad_norm": 0.33975911074890713, "learning_rate": 0.000347700332266173, "loss": 0.3673, "step": 8850 }, { "epoch": 0.2418084008042167, "grad_norm": 0.24493733544746576, "learning_rate": 0.00034711998561879823, "loss": 0.3863, "step": 8900 }, { "epoch": 0.24316687496603814, "grad_norm": 0.24367293507823867, "learning_rate": 0.00034653692687802295, "loss": 0.3597, "step": 8950 }, { "epoch": 0.2445253491278596, "grad_norm": 0.4992858322232737, "learning_rate": 0.0003459511667922831, "loss": 0.3759, "step": 9000 }, { "epoch": 0.24588382328968103, "grad_norm": 0.34074243370950325, "learning_rate": 0.000345362716159813, "loss": 0.3704, "step": 9050 }, { "epoch": 0.24724229745150247, "grad_norm": 0.37589710424283923, "learning_rate": 0.0003447715858284458, "loss": 0.3605, "step": 9100 }, { "epoch": 0.24860077161332392, "grad_norm": 0.2954917370820438, "learning_rate": 0.00034417778669541414, "loss": 0.3619, "step": 9150 }, { "epoch": 0.24995924577514536, "grad_norm": 0.2682683326827451, "learning_rate": 0.00034358132970714833, "loss": 0.3548, "step": 9200 }, { "epoch": 0.2513177199369668, "grad_norm": 0.3334186903450503, "learning_rate": 0.00034298222585907556, "loss": 0.3582, "step": 9250 }, { "epoch": 0.2526761940987882, "grad_norm": 0.3037548456565183, "learning_rate": 0.0003423804861954165, "loss": 0.3598, "step": 9300 }, { "epoch": 0.25403466826060966, "grad_norm": 0.3415172861536472, "learning_rate": 0.00034177612180898186, "loss": 0.3596, "step": 9350 }, { "epoch": 0.2553931424224311, "grad_norm": 0.2778208976878839, "learning_rate": 0.0003411691438409683, "loss": 0.3557, "step": 9400 }, { "epoch": 0.25675161658425255, "grad_norm": 0.2722662895108223, "learning_rate": 0.0003405595634807524, "loss": 0.3568, "step": 9450 }, { "epoch": 0.258110090746074, "grad_norm": 0.2470305006609605, "learning_rate": 0.00033994739196568485, "loss": 0.3693, "step": 9500 }, { "epoch": 0.25946856490789544, "grad_norm": 0.3204976945527186, "learning_rate": 0.00033933264058088323, "loss": 0.3744, "step": 9550 }, { "epoch": 0.2608270390697169, "grad_norm": 0.29419054157347835, "learning_rate": 0.0003387153206590238, "loss": 0.3578, "step": 9600 }, { "epoch": 0.2621855132315383, "grad_norm": 0.285418283786098, "learning_rate": 0.0003380954435801327, "loss": 0.3666, "step": 9650 }, { "epoch": 0.26354398739335977, "grad_norm": 0.40228260917678826, "learning_rate": 0.0003374730207713763, "loss": 0.3642, "step": 9700 }, { "epoch": 0.2649024615551812, "grad_norm": 0.30226961110995426, "learning_rate": 0.0003368480637068501, "loss": 0.3955, "step": 9750 }, { "epoch": 0.26626093571700266, "grad_norm": 0.445566495529195, "learning_rate": 0.00033622058390736785, "loss": 0.3756, "step": 9800 }, { "epoch": 0.2676194098788241, "grad_norm": 0.387905313007887, "learning_rate": 0.00033559059294024864, "loss": 0.3657, "step": 9850 }, { "epoch": 0.26897788404064554, "grad_norm": 0.22352584167074716, "learning_rate": 0.00033495810241910385, "loss": 0.3452, "step": 9900 }, { "epoch": 0.270336358202467, "grad_norm": 0.2556859512831143, "learning_rate": 0.00033432312400362305, "loss": 0.3463, "step": 9950 }, { "epoch": 0.27169483236428843, "grad_norm": 0.5717326212718582, "learning_rate": 0.00033368566939935925, "loss": 0.3731, "step": 10000 }, { "epoch": 0.2730533065261099, "grad_norm": 0.32255486027652513, "learning_rate": 0.0003330457503575127, "loss": 0.3698, "step": 10050 }, { "epoch": 0.2744117806879313, "grad_norm": 0.25946616692420554, "learning_rate": 0.0003324033786747145, "loss": 0.3637, "step": 10100 }, { "epoch": 0.27577025484975276, "grad_norm": 0.3530893683862247, "learning_rate": 0.0003317585661928094, "loss": 0.3646, "step": 10150 }, { "epoch": 0.2771287290115742, "grad_norm": 0.2853492397892913, "learning_rate": 0.000331111324798637, "loss": 0.3295, "step": 10200 }, { "epoch": 0.27848720317339565, "grad_norm": 0.19156693197511587, "learning_rate": 0.0003304616664238127, "loss": 0.359, "step": 10250 }, { "epoch": 0.2798456773352171, "grad_norm": 0.26150890027393986, "learning_rate": 0.00032980960304450834, "loss": 0.3665, "step": 10300 }, { "epoch": 0.28120415149703853, "grad_norm": 0.3656668619278649, "learning_rate": 0.00032915514668123056, "loss": 0.3498, "step": 10350 }, { "epoch": 0.28256262565886, "grad_norm": 0.36465329584026973, "learning_rate": 0.00032849830939859977, "loss": 0.3722, "step": 10400 }, { "epoch": 0.2839210998206814, "grad_norm": 0.28300896811439313, "learning_rate": 0.00032783910330512776, "loss": 0.3583, "step": 10450 }, { "epoch": 0.28527957398250287, "grad_norm": 0.2470361242697161, "learning_rate": 0.000327177540552994, "loss": 0.3462, "step": 10500 }, { "epoch": 0.2866380481443243, "grad_norm": 0.36918740507242426, "learning_rate": 0.0003265136333378223, "loss": 0.3699, "step": 10550 }, { "epoch": 0.28799652230614575, "grad_norm": 0.32934977838919777, "learning_rate": 0.0003258473938984554, "loss": 0.3625, "step": 10600 }, { "epoch": 0.2893549964679672, "grad_norm": 0.2087576531023101, "learning_rate": 0.0003251788345167296, "loss": 0.3568, "step": 10650 }, { "epoch": 0.29071347062978864, "grad_norm": 0.39857663501798557, "learning_rate": 0.00032450796751724837, "loss": 0.3591, "step": 10700 }, { "epoch": 0.2920719447916101, "grad_norm": 0.32871619749282505, "learning_rate": 0.00032383480526715526, "loss": 0.3603, "step": 10750 }, { "epoch": 0.2934304189534315, "grad_norm": 0.3062057976969262, "learning_rate": 0.00032315936017590554, "loss": 0.3575, "step": 10800 }, { "epoch": 0.29478889311525297, "grad_norm": 0.35676299616191043, "learning_rate": 0.0003224816446950378, "loss": 0.3406, "step": 10850 }, { "epoch": 0.2961473672770744, "grad_norm": 0.268494161462533, "learning_rate": 0.00032180167131794425, "loss": 0.3356, "step": 10900 }, { "epoch": 0.29750584143889586, "grad_norm": 0.39304660551244835, "learning_rate": 0.0003211194525796404, "loss": 0.3681, "step": 10950 }, { "epoch": 0.2988643156007173, "grad_norm": 0.36242243481768954, "learning_rate": 0.00032043500105653414, "loss": 0.3624, "step": 11000 }, { "epoch": 0.3002227897625387, "grad_norm": 0.32191025299969356, "learning_rate": 0.0003197483293661937, "loss": 0.3639, "step": 11050 }, { "epoch": 0.30158126392436013, "grad_norm": 0.3819533916481645, "learning_rate": 0.0003190594501671151, "loss": 0.3639, "step": 11100 }, { "epoch": 0.3029397380861816, "grad_norm": 0.3051700734949664, "learning_rate": 0.000318368376158489, "loss": 0.3495, "step": 11150 }, { "epoch": 0.304298212248003, "grad_norm": 0.25353257677490404, "learning_rate": 0.00031768900650322744, "loss": 0.3424, "step": 11200 }, { "epoch": 0.30565668640982446, "grad_norm": 0.2971513332547502, "learning_rate": 0.000316993624394983, "loss": 0.3465, "step": 11250 }, { "epoch": 0.3070151605716459, "grad_norm": 0.3393454172568527, "learning_rate": 0.00031629608555979686, "loss": 0.357, "step": 11300 }, { "epoch": 0.30837363473346735, "grad_norm": 0.3079000714041467, "learning_rate": 0.0003155964028564964, "loss": 0.3315, "step": 11350 }, { "epoch": 0.3097321088952888, "grad_norm": 0.236457076827118, "learning_rate": 0.00031489458918342993, "loss": 0.3586, "step": 11400 }, { "epoch": 0.31109058305711024, "grad_norm": 0.37187670656153765, "learning_rate": 0.0003141906574782295, "loss": 0.3479, "step": 11450 }, { "epoch": 0.3124490572189317, "grad_norm": 0.2858744824646288, "learning_rate": 0.0003134846207175722, "loss": 0.359, "step": 11500 }, { "epoch": 0.3138075313807531, "grad_norm": 0.29954433740207526, "learning_rate": 0.00031277649191694063, "loss": 0.3466, "step": 11550 }, { "epoch": 0.31516600554257457, "grad_norm": 0.25530263584194796, "learning_rate": 0.0003120662841303836, "loss": 0.3488, "step": 11600 }, { "epoch": 0.316524479704396, "grad_norm": 0.22413446350946586, "learning_rate": 0.0003113540104502747, "loss": 0.3471, "step": 11650 }, { "epoch": 0.31788295386621745, "grad_norm": 0.40702625283242805, "learning_rate": 0.000310639684007072, "loss": 0.3382, "step": 11700 }, { "epoch": 0.3192414280280389, "grad_norm": 0.2866280004893114, "learning_rate": 0.0003099233179690746, "loss": 0.3779, "step": 11750 }, { "epoch": 0.32059990218986034, "grad_norm": 0.313655190983661, "learning_rate": 0.0003092049255421813, "loss": 0.3646, "step": 11800 }, { "epoch": 0.3219583763516818, "grad_norm": 0.34970496506197146, "learning_rate": 0.00030848451996964615, "loss": 0.3628, "step": 11850 }, { "epoch": 0.32331685051350323, "grad_norm": 0.36130996602692567, "learning_rate": 0.00030776211453183475, "loss": 0.3608, "step": 11900 }, { "epoch": 0.3246753246753247, "grad_norm": 0.22850628919525512, "learning_rate": 0.00030703772254597945, "loss": 0.326, "step": 11950 }, { "epoch": 0.3260337988371461, "grad_norm": 0.3620511895369416, "learning_rate": 0.00030631135736593364, "loss": 0.349, "step": 12000 }, { "epoch": 0.32739227299896756, "grad_norm": 0.2122923442045741, "learning_rate": 0.0003055830323819257, "loss": 0.3734, "step": 12050 }, { "epoch": 0.328750747160789, "grad_norm": 0.24737840068319314, "learning_rate": 0.00030485276102031235, "loss": 0.358, "step": 12100 }, { "epoch": 0.33010922132261045, "grad_norm": 0.3610838024240164, "learning_rate": 0.0003041205567433305, "loss": 0.3513, "step": 12150 }, { "epoch": 0.3314676954844319, "grad_norm": 0.33939684182516894, "learning_rate": 0.0003033864330488499, "loss": 0.3555, "step": 12200 }, { "epoch": 0.33282616964625333, "grad_norm": 0.2666873772787006, "learning_rate": 0.00030265040347012397, "loss": 0.3469, "step": 12250 }, { "epoch": 0.3341846438080748, "grad_norm": 0.21914168729339542, "learning_rate": 0.00030191248157554, "loss": 0.3323, "step": 12300 }, { "epoch": 0.3355431179698962, "grad_norm": 0.3499432909434212, "learning_rate": 0.0003011726809683694, "loss": 0.3321, "step": 12350 }, { "epoch": 0.33690159213171766, "grad_norm": 0.2752315627002723, "learning_rate": 0.0003004310152865169, "loss": 0.366, "step": 12400 }, { "epoch": 0.3382600662935391, "grad_norm": 0.3224312997036977, "learning_rate": 0.0002996874982022692, "loss": 0.3363, "step": 12450 }, { "epoch": 0.33961854045536055, "grad_norm": 0.2614682577027786, "learning_rate": 0.00029894214342204243, "loss": 0.3364, "step": 12500 }, { "epoch": 0.340977014617182, "grad_norm": 0.35386811908507626, "learning_rate": 0.00029819496468613024, "loss": 0.3468, "step": 12550 }, { "epoch": 0.34233548877900344, "grad_norm": 0.3451776698004379, "learning_rate": 0.00029744597576844995, "loss": 0.3457, "step": 12600 }, { "epoch": 0.3436939629408249, "grad_norm": 0.2075635034305044, "learning_rate": 0.00029669519047628874, "loss": 0.3217, "step": 12650 }, { "epoch": 0.3450524371026463, "grad_norm": 0.35938341724916706, "learning_rate": 0.0002959426226500493, "loss": 0.3518, "step": 12700 }, { "epoch": 0.34641091126446777, "grad_norm": 0.2028404175509972, "learning_rate": 0.0002951882861629944, "loss": 0.3464, "step": 12750 }, { "epoch": 0.3477693854262892, "grad_norm": 0.3092038953376563, "learning_rate": 0.00029443219492099153, "loss": 0.3565, "step": 12800 }, { "epoch": 0.34912785958811066, "grad_norm": 0.29068071721416333, "learning_rate": 0.0002936743628622562, "loss": 0.3315, "step": 12850 }, { "epoch": 0.3504863337499321, "grad_norm": 0.20779330405236773, "learning_rate": 0.0002929148039570951, "loss": 0.3174, "step": 12900 }, { "epoch": 0.35184480791175354, "grad_norm": 0.31923873979474604, "learning_rate": 0.00029215353220764863, "loss": 0.3441, "step": 12950 }, { "epoch": 0.353203282073575, "grad_norm": 0.2745041226462606, "learning_rate": 0.00029139056164763274, "loss": 0.3467, "step": 13000 }, { "epoch": 0.35456175623539643, "grad_norm": 0.4368395215278957, "learning_rate": 0.0002906259063420803, "loss": 0.3517, "step": 13050 }, { "epoch": 0.3559202303972178, "grad_norm": 0.30792463599025904, "learning_rate": 0.0002898595803870815, "loss": 0.3442, "step": 13100 }, { "epoch": 0.35727870455903926, "grad_norm": 0.3611952448865168, "learning_rate": 0.0002890915979095244, "loss": 0.3204, "step": 13150 }, { "epoch": 0.3586371787208607, "grad_norm": 0.23056033787481225, "learning_rate": 0.0002883219730668345, "loss": 0.3239, "step": 13200 }, { "epoch": 0.35999565288268215, "grad_norm": 0.2530394826085691, "learning_rate": 0.00028755072004671314, "loss": 0.3473, "step": 13250 }, { "epoch": 0.3613541270445036, "grad_norm": 0.33962698046120804, "learning_rate": 0.000286793326131175, "loss": 0.3416, "step": 13300 }, { "epoch": 0.36271260120632504, "grad_norm": 0.21053853436821962, "learning_rate": 0.0002860188912935213, "loss": 0.3278, "step": 13350 }, { "epoch": 0.3640710753681465, "grad_norm": 0.3129818212559564, "learning_rate": 0.00028524287073475416, "loss": 0.3541, "step": 13400 }, { "epoch": 0.3654295495299679, "grad_norm": 0.2699867150782398, "learning_rate": 0.0002844652787604775, "loss": 0.3403, "step": 13450 }, { "epoch": 0.36678802369178937, "grad_norm": 0.28737860107629143, "learning_rate": 0.00028368612970526357, "loss": 0.3323, "step": 13500 }, { "epoch": 0.3681464978536108, "grad_norm": 0.3515960746260734, "learning_rate": 0.00028290543793238867, "loss": 0.3293, "step": 13550 }, { "epoch": 0.36950497201543225, "grad_norm": 0.20401533840321576, "learning_rate": 0.0002821232178335684, "loss": 0.3316, "step": 13600 }, { "epoch": 0.3708634461772537, "grad_norm": 0.19995437318728543, "learning_rate": 0.0002813551732516669, "loss": 0.3427, "step": 13650 }, { "epoch": 0.37222192033907514, "grad_norm": 0.2545451160615089, "learning_rate": 0.00028056996963593105, "loss": 0.3246, "step": 13700 }, { "epoch": 0.3735803945008966, "grad_norm": 0.29065996361482416, "learning_rate": 0.0002797832807475994, "loss": 0.3377, "step": 13750 }, { "epoch": 0.37493886866271803, "grad_norm": 0.3334762345639782, "learning_rate": 0.00027899512108894186, "loss": 0.3281, "step": 13800 }, { "epoch": 0.37629734282453947, "grad_norm": 0.18363139112462235, "learning_rate": 0.00027820550518934127, "loss": 0.3498, "step": 13850 }, { "epoch": 0.3776558169863609, "grad_norm": 0.303677922590966, "learning_rate": 0.00027741444760502593, "loss": 0.3282, "step": 13900 }, { "epoch": 0.37901429114818236, "grad_norm": 0.33021307742532524, "learning_rate": 0.0002766378265036753, "loss": 0.3612, "step": 13950 }, { "epoch": 0.3803727653100038, "grad_norm": 0.3855197948015209, "learning_rate": 0.00027584395743117087, "loss": 0.326, "step": 14000 }, { "epoch": 0.38173123947182525, "grad_norm": 0.17305752786285836, "learning_rate": 0.0002750486902080647, "loss": 0.3306, "step": 14050 }, { "epoch": 0.3830897136336467, "grad_norm": 0.3557889572340088, "learning_rate": 0.0002742520394947646, "loss": 0.3363, "step": 14100 }, { "epoch": 0.38444818779546813, "grad_norm": 0.269254653829798, "learning_rate": 0.0002734540199771824, "loss": 0.3509, "step": 14150 }, { "epoch": 0.3858066619572896, "grad_norm": 0.46153677475953025, "learning_rate": 0.00027265464636646333, "loss": 0.3423, "step": 14200 }, { "epoch": 0.387165136119111, "grad_norm": 0.25450280604338793, "learning_rate": 0.0002718539333987147, "loss": 0.3344, "step": 14250 }, { "epoch": 0.38852361028093246, "grad_norm": 0.24854855950361845, "learning_rate": 0.00027105189583473416, "loss": 0.317, "step": 14300 }, { "epoch": 0.3898820844427539, "grad_norm": 0.25191512294105933, "learning_rate": 0.00027024854845973797, "loss": 0.3343, "step": 14350 }, { "epoch": 0.39124055860457535, "grad_norm": 0.3399094367009323, "learning_rate": 0.000269443906083088, "loss": 0.3141, "step": 14400 }, { "epoch": 0.3925990327663968, "grad_norm": 0.27297702861099216, "learning_rate": 0.00026863798353801905, "loss": 0.344, "step": 14450 }, { "epoch": 0.39395750692821824, "grad_norm": 0.3089505317673794, "learning_rate": 0.000267830795681365, "loss": 0.3248, "step": 14500 }, { "epoch": 0.3953159810900397, "grad_norm": 0.28407318632921835, "learning_rate": 0.0002670223573932857, "loss": 0.3218, "step": 14550 }, { "epoch": 0.3966744552518611, "grad_norm": 0.27517856010825675, "learning_rate": 0.0002662126835769916, "loss": 0.3207, "step": 14600 }, { "epoch": 0.39803292941368257, "grad_norm": 0.2209431864475645, "learning_rate": 0.00026540178915847003, "loss": 0.3213, "step": 14650 }, { "epoch": 0.399391403575504, "grad_norm": 0.3012179785372981, "learning_rate": 0.0002645896890862093, "loss": 0.3031, "step": 14700 }, { "epoch": 0.40074987773732546, "grad_norm": 0.35758174495742123, "learning_rate": 0.0002637763983309235, "loss": 0.3244, "step": 14750 }, { "epoch": 0.4021083518991469, "grad_norm": 0.20197976836253828, "learning_rate": 0.00026296193188527655, "loss": 0.3211, "step": 14800 }, { "epoch": 0.40346682606096834, "grad_norm": 0.2784592655459722, "learning_rate": 0.0002621463047636057, "loss": 0.3233, "step": 14850 }, { "epoch": 0.4048253002227898, "grad_norm": 0.3125528717241462, "learning_rate": 0.0002613295320016445, "loss": 0.324, "step": 14900 }, { "epoch": 0.40618377438461123, "grad_norm": 0.3316178107391592, "learning_rate": 0.00026051162865624636, "loss": 0.3358, "step": 14950 }, { "epoch": 0.4075422485464327, "grad_norm": 0.18439640337971394, "learning_rate": 0.00025969260980510605, "loss": 0.3031, "step": 15000 }, { "epoch": 0.4089007227082541, "grad_norm": 0.2963162351967641, "learning_rate": 0.00025887249054648245, "loss": 0.3276, "step": 15050 }, { "epoch": 0.4102591968700755, "grad_norm": 0.22466036509634918, "learning_rate": 0.00025805128599891994, "loss": 0.3364, "step": 15100 }, { "epoch": 0.41161767103189695, "grad_norm": 0.2956284294357639, "learning_rate": 0.00025722901130096975, "loss": 0.3314, "step": 15150 }, { "epoch": 0.4129761451937184, "grad_norm": 0.36079018502753485, "learning_rate": 0.00025642215844549676, "loss": 0.3351, "step": 15200 }, { "epoch": 0.41433461935553983, "grad_norm": 0.3005152688044544, "learning_rate": 0.00025559780958847773, "loss": 0.3202, "step": 15250 }, { "epoch": 0.4156930935173613, "grad_norm": 0.33773262295043566, "learning_rate": 0.00025477243580984904, "loss": 0.3089, "step": 15300 }, { "epoch": 0.4170515676791827, "grad_norm": 0.3045253275707874, "learning_rate": 0.00025394605232501987, "loss": 0.32, "step": 15350 }, { "epoch": 0.41841004184100417, "grad_norm": 0.34532028635872886, "learning_rate": 0.0002531186743680128, "loss": 0.3449, "step": 15400 }, { "epoch": 0.4197685160028256, "grad_norm": 0.14134553464927366, "learning_rate": 0.0002522903171911834, "loss": 0.3184, "step": 15450 }, { "epoch": 0.42112699016464705, "grad_norm": 0.2761208223745771, "learning_rate": 0.00025146099606493817, "loss": 0.3133, "step": 15500 }, { "epoch": 0.4224854643264685, "grad_norm": 0.26549068110573254, "learning_rate": 0.0002506307262774542, "loss": 0.3205, "step": 15550 }, { "epoch": 0.42384393848828994, "grad_norm": 0.2773532590377829, "learning_rate": 0.00024979952313439636, "loss": 0.3064, "step": 15600 }, { "epoch": 0.4252024126501114, "grad_norm": 0.3073493622335924, "learning_rate": 0.0002489674019586356, "loss": 0.3188, "step": 15650 }, { "epoch": 0.4265608868119328, "grad_norm": 0.21684387112528378, "learning_rate": 0.0002481343780899665, "loss": 0.3198, "step": 15700 }, { "epoch": 0.42791936097375427, "grad_norm": 0.25977297083107986, "learning_rate": 0.00024730046688482436, "loss": 0.3065, "step": 15750 }, { "epoch": 0.4292778351355757, "grad_norm": 0.3308957326333168, "learning_rate": 0.000246465683716002, "loss": 0.3085, "step": 15800 }, { "epoch": 0.43063630929739716, "grad_norm": 0.25944521132960924, "learning_rate": 0.0002456300439723668, "loss": 0.3136, "step": 15850 }, { "epoch": 0.4319947834592186, "grad_norm": 0.22121128637476822, "learning_rate": 0.0002447935630585764, "loss": 0.322, "step": 15900 }, { "epoch": 0.43335325762104004, "grad_norm": 0.32019002146360315, "learning_rate": 0.0002439562563947953, "loss": 0.3103, "step": 15950 }, { "epoch": 0.4347117317828615, "grad_norm": 0.26761100791647713, "learning_rate": 0.0002431181394164103, "loss": 0.3114, "step": 16000 }, { "epoch": 0.43607020594468293, "grad_norm": 0.22262870758692213, "learning_rate": 0.00024227922757374582, "loss": 0.3069, "step": 16050 }, { "epoch": 0.4374286801065044, "grad_norm": 0.18940890843015826, "learning_rate": 0.00024143953633177937, "loss": 0.327, "step": 16100 }, { "epoch": 0.4387871542683258, "grad_norm": 0.27459192854267717, "learning_rate": 0.00024059908116985654, "loss": 0.3183, "step": 16150 }, { "epoch": 0.44014562843014726, "grad_norm": 0.36514373383887516, "learning_rate": 0.00023975787758140525, "loss": 0.2878, "step": 16200 }, { "epoch": 0.4415041025919687, "grad_norm": 0.30714779342945764, "learning_rate": 0.00023891594107365024, "loss": 0.3173, "step": 16250 }, { "epoch": 0.44286257675379015, "grad_norm": 0.24572160078772548, "learning_rate": 0.0002380732871673276, "loss": 0.3169, "step": 16300 }, { "epoch": 0.4442210509156116, "grad_norm": 0.22451585676228034, "learning_rate": 0.00023722993139639806, "loss": 0.2982, "step": 16350 }, { "epoch": 0.44557952507743304, "grad_norm": 0.4312837719351318, "learning_rate": 0.000236436550903555, "loss": 0.3126, "step": 16400 }, { "epoch": 0.4469379992392545, "grad_norm": 0.23649698073314787, "learning_rate": 0.00023559187786324523, "loss": 0.3229, "step": 16450 }, { "epoch": 0.4482964734010759, "grad_norm": 0.21885398793120167, "learning_rate": 0.00023474654870203753, "loss": 0.3066, "step": 16500 }, { "epoch": 0.44965494756289737, "grad_norm": 0.2522766751448378, "learning_rate": 0.00023390057900320987, "loss": 0.3121, "step": 16550 }, { "epoch": 0.4510134217247188, "grad_norm": 0.2023032182722522, "learning_rate": 0.0002330539843618484, "loss": 0.295, "step": 16600 }, { "epoch": 0.45237189588654025, "grad_norm": 0.3093016288187825, "learning_rate": 0.00023220678038455975, "loss": 0.2962, "step": 16650 }, { "epoch": 0.4537303700483617, "grad_norm": 0.2805332120341892, "learning_rate": 0.00023135898268918323, "loss": 0.313, "step": 16700 }, { "epoch": 0.45508884421018314, "grad_norm": 0.25366173411593823, "learning_rate": 0.00023051060690450337, "loss": 0.308, "step": 16750 }, { "epoch": 0.4564473183720046, "grad_norm": 0.2848859608687515, "learning_rate": 0.00022966166866996134, "loss": 0.2966, "step": 16800 }, { "epoch": 0.45780579253382603, "grad_norm": 0.3400405221454168, "learning_rate": 0.0002288121836353669, "loss": 0.313, "step": 16850 }, { "epoch": 0.45916426669564747, "grad_norm": 0.25178053552917457, "learning_rate": 0.0002279621674606098, "loss": 0.3008, "step": 16900 }, { "epoch": 0.4605227408574689, "grad_norm": 0.2173042023564375, "learning_rate": 0.00022711163581537106, "loss": 0.3062, "step": 16950 }, { "epoch": 0.46188121501929036, "grad_norm": 0.23983156392471572, "learning_rate": 0.00022626060437883435, "loss": 0.3055, "step": 17000 }, { "epoch": 0.4632396891811118, "grad_norm": 0.21031049569805663, "learning_rate": 0.00022540908883939668, "loss": 0.311, "step": 17050 }, { "epoch": 0.4645981633429332, "grad_norm": 0.1989362043985782, "learning_rate": 0.00022455710489437927, "loss": 0.3259, "step": 17100 }, { "epoch": 0.46595663750475463, "grad_norm": 0.1900668237823236, "learning_rate": 0.00022370466824973812, "loss": 0.2797, "step": 17150 }, { "epoch": 0.4673151116665761, "grad_norm": 0.20840494435322787, "learning_rate": 0.00022285179461977483, "loss": 0.3079, "step": 17200 }, { "epoch": 0.4686735858283975, "grad_norm": 0.4022346673956682, "learning_rate": 0.00022199849972684633, "loss": 0.2958, "step": 17250 }, { "epoch": 0.47003205999021896, "grad_norm": 0.3270990206921089, "learning_rate": 0.0002211447993010755, "loss": 0.3313, "step": 17300 }, { "epoch": 0.4713905341520404, "grad_norm": 0.249803246986443, "learning_rate": 0.00022029070908006096, "loss": 0.3104, "step": 17350 }, { "epoch": 0.47274900831386185, "grad_norm": 0.2813145656422356, "learning_rate": 0.0002194362448085872, "loss": 0.3039, "step": 17400 }, { "epoch": 0.4741074824756833, "grad_norm": 0.20904103519051825, "learning_rate": 0.00021858142223833395, "loss": 0.3093, "step": 17450 }, { "epoch": 0.47546595663750474, "grad_norm": 0.2476519540180904, "learning_rate": 0.00021772625712758624, "loss": 0.3133, "step": 17500 }, { "epoch": 0.4768244307993262, "grad_norm": 0.2897735958185, "learning_rate": 0.00021687076524094353, "loss": 0.3184, "step": 17550 }, { "epoch": 0.4781829049611476, "grad_norm": 0.36797022439353905, "learning_rate": 0.0002160149623490293, "loss": 0.2982, "step": 17600 }, { "epoch": 0.47954137912296907, "grad_norm": 0.22151406862910683, "learning_rate": 0.0002151588642282003, "loss": 0.3031, "step": 17650 }, { "epoch": 0.4808998532847905, "grad_norm": 0.24573689529627643, "learning_rate": 0.00021430248666025561, "loss": 0.2927, "step": 17700 }, { "epoch": 0.48225832744661196, "grad_norm": 0.25110843175386494, "learning_rate": 0.0002134458454321459, "loss": 0.2984, "step": 17750 }, { "epoch": 0.4836168016084334, "grad_norm": 0.26920027208505604, "learning_rate": 0.00021258895633568238, "loss": 0.2869, "step": 17800 }, { "epoch": 0.48497527577025484, "grad_norm": 0.3111889899596438, "learning_rate": 0.0002117318351672454, "loss": 0.3215, "step": 17850 }, { "epoch": 0.4863337499320763, "grad_norm": 0.20320042839557148, "learning_rate": 0.00021087449772749347, "loss": 0.3019, "step": 17900 }, { "epoch": 0.48769222409389773, "grad_norm": 0.29026043340389285, "learning_rate": 0.00021001695982107217, "loss": 0.3087, "step": 17950 }, { "epoch": 0.4890506982557192, "grad_norm": 0.26193168931031524, "learning_rate": 0.00020915923725632244, "loss": 0.3036, "step": 18000 }, { "epoch": 0.4904091724175406, "grad_norm": 0.23673083795318206, "learning_rate": 0.0002083013458449893, "loss": 0.3111, "step": 18050 }, { "epoch": 0.49176764657936206, "grad_norm": 0.2259659757224692, "learning_rate": 0.00020744330140193046, "loss": 0.2883, "step": 18100 }, { "epoch": 0.4931261207411835, "grad_norm": 0.2902171908048496, "learning_rate": 0.00020658511974482475, "loss": 0.2898, "step": 18150 }, { "epoch": 0.49448459490300495, "grad_norm": 0.31472212166057917, "learning_rate": 0.0002057268166938803, "loss": 0.3111, "step": 18200 }, { "epoch": 0.4958430690648264, "grad_norm": 0.27417754560735935, "learning_rate": 0.00020486840807154325, "loss": 0.3013, "step": 18250 }, { "epoch": 0.49720154322664784, "grad_norm": 0.24533216444780298, "learning_rate": 0.0002040099097022059, "loss": 0.3073, "step": 18300 }, { "epoch": 0.4985600173884693, "grad_norm": 0.2597365406230817, "learning_rate": 0.0002031513374119148, "loss": 0.2918, "step": 18350 }, { "epoch": 0.4999184915502907, "grad_norm": 0.23849823607914308, "learning_rate": 0.00020229270702807952, "loss": 0.3044, "step": 18400 }, { "epoch": 0.5012769657121121, "grad_norm": 0.40233301575689023, "learning_rate": 0.0002014340343791802, "loss": 0.3086, "step": 18450 }, { "epoch": 0.5026354398739336, "grad_norm": 0.24678497017149986, "learning_rate": 0.00020057533529447647, "loss": 0.2947, "step": 18500 }, { "epoch": 0.503993914035755, "grad_norm": 0.18418790064404403, "learning_rate": 0.000199716625603715, "loss": 0.2802, "step": 18550 }, { "epoch": 0.5053523881975764, "grad_norm": 0.20614362466496808, "learning_rate": 0.00019887509507259376, "loss": 0.3082, "step": 18600 }, { "epoch": 0.5067108623593979, "grad_norm": 0.3176004501620565, "learning_rate": 0.0001980164110832425, "loss": 0.2946, "step": 18650 }, { "epoch": 0.5080693365212193, "grad_norm": 0.24434247355813202, "learning_rate": 0.00019715776366049622, "loss": 0.2852, "step": 18700 }, { "epoch": 0.5094278106830408, "grad_norm": 0.2632819823395696, "learning_rate": 0.00019629916863314945, "loss": 0.3119, "step": 18750 }, { "epoch": 0.5107862848448622, "grad_norm": 0.36866015249871253, "learning_rate": 0.00019544064182903077, "loss": 0.3064, "step": 18800 }, { "epoch": 0.5121447590066837, "grad_norm": 0.28334197775915865, "learning_rate": 0.000194582199074711, "loss": 0.2982, "step": 18850 }, { "epoch": 0.5135032331685051, "grad_norm": 0.29353450964831995, "learning_rate": 0.00019372385619521155, "loss": 0.2997, "step": 18900 }, { "epoch": 0.5148617073303265, "grad_norm": 0.30235983080661416, "learning_rate": 0.00019286562901371282, "loss": 0.2953, "step": 18950 }, { "epoch": 0.516220181492148, "grad_norm": 0.24006103860300088, "learning_rate": 0.0001920075333512621, "loss": 0.312, "step": 19000 }, { "epoch": 0.5175786556539694, "grad_norm": 0.25401074594196943, "learning_rate": 0.00019114958502648258, "loss": 0.2928, "step": 19050 }, { "epoch": 0.5189371298157909, "grad_norm": 0.3126940882002115, "learning_rate": 0.00019029179985528095, "loss": 0.2881, "step": 19100 }, { "epoch": 0.5202956039776123, "grad_norm": 0.244186090338719, "learning_rate": 0.00018945134391851735, "loss": 0.2844, "step": 19150 }, { "epoch": 0.5216540781394338, "grad_norm": 0.2620555496999193, "learning_rate": 0.00018861107474107164, "loss": 0.3033, "step": 19200 }, { "epoch": 0.5230125523012552, "grad_norm": 0.29660068432502984, "learning_rate": 0.00018775386516779982, "loss": 0.2815, "step": 19250 }, { "epoch": 0.5243710264630767, "grad_norm": 0.24636353127452668, "learning_rate": 0.0001868968813467351, "loss": 0.2982, "step": 19300 }, { "epoch": 0.5257295006248981, "grad_norm": 0.3036729051937609, "learning_rate": 0.00018604013907600413, "loss": 0.2697, "step": 19350 }, { "epoch": 0.5270879747867195, "grad_norm": 0.25151244998729483, "learning_rate": 0.00018518365414928073, "loss": 0.3005, "step": 19400 }, { "epoch": 0.528446448948541, "grad_norm": 0.3900757856018299, "learning_rate": 0.00018432744235549457, "loss": 0.3163, "step": 19450 }, { "epoch": 0.5298049231103624, "grad_norm": 0.3209166901430777, "learning_rate": 0.0001834715194785403, "loss": 0.2946, "step": 19500 }, { "epoch": 0.5311633972721839, "grad_norm": 0.20611000381285643, "learning_rate": 0.00018261590129698663, "loss": 0.2877, "step": 19550 }, { "epoch": 0.5325218714340053, "grad_norm": 0.21332069721707292, "learning_rate": 0.00018176060358378503, "loss": 0.2916, "step": 19600 }, { "epoch": 0.5338803455958268, "grad_norm": 0.34732582027624836, "learning_rate": 0.00018090564210597975, "loss": 0.3057, "step": 19650 }, { "epoch": 0.5352388197576482, "grad_norm": 0.23660042062818817, "learning_rate": 0.00018005103262441622, "loss": 0.2746, "step": 19700 }, { "epoch": 0.5365972939194696, "grad_norm": 0.23653513107119012, "learning_rate": 0.00017919679089345122, "loss": 0.295, "step": 19750 }, { "epoch": 0.5379557680812911, "grad_norm": 0.2066174691631555, "learning_rate": 0.00017834293266066222, "loss": 0.2896, "step": 19800 }, { "epoch": 0.5393142422431125, "grad_norm": 0.26332165957058984, "learning_rate": 0.00017748947366655687, "loss": 0.2811, "step": 19850 }, { "epoch": 0.540672716404934, "grad_norm": 0.22960074466120436, "learning_rate": 0.00017663642964428318, "loss": 0.2846, "step": 19900 }, { "epoch": 0.5420311905667554, "grad_norm": 0.3090166915756585, "learning_rate": 0.00017578381631933946, "loss": 0.2924, "step": 19950 }, { "epoch": 0.5433896647285769, "grad_norm": 0.36568571497107416, "learning_rate": 0.00017493164940928402, "loss": 0.2865, "step": 20000 }, { "epoch": 0.5447481388903983, "grad_norm": 0.29059486954556535, "learning_rate": 0.00017407994462344584, "loss": 0.2785, "step": 20050 }, { "epoch": 0.5461066130522197, "grad_norm": 0.27957466708084117, "learning_rate": 0.00017322871766263487, "loss": 0.2935, "step": 20100 }, { "epoch": 0.5474650872140412, "grad_norm": 0.2151461608605068, "learning_rate": 0.00017237798421885253, "loss": 0.2841, "step": 20150 }, { "epoch": 0.5488235613758626, "grad_norm": 0.24819887268532007, "learning_rate": 0.00017152775997500238, "loss": 0.285, "step": 20200 }, { "epoch": 0.5501820355376841, "grad_norm": 0.20284647935207317, "learning_rate": 0.0001706780606046013, "loss": 0.2927, "step": 20250 }, { "epoch": 0.5515405096995055, "grad_norm": 0.19244100345976062, "learning_rate": 0.0001698289017714902, "loss": 0.2645, "step": 20300 }, { "epoch": 0.552898983861327, "grad_norm": 0.22539860380829246, "learning_rate": 0.00016898029912954546, "loss": 0.2939, "step": 20350 }, { "epoch": 0.5542574580231484, "grad_norm": 0.2619800733732195, "learning_rate": 0.00016813226832239025, "loss": 0.2836, "step": 20400 }, { "epoch": 0.5556159321849699, "grad_norm": 0.23393114722266678, "learning_rate": 0.00016728482498310637, "loss": 0.2736, "step": 20450 }, { "epoch": 0.5569744063467913, "grad_norm": 0.30087081995833126, "learning_rate": 0.00016643798473394566, "loss": 0.2794, "step": 20500 }, { "epoch": 0.5583328805086127, "grad_norm": 0.308240444312431, "learning_rate": 0.00016559176318604258, "loss": 0.2671, "step": 20550 }, { "epoch": 0.5596913546704342, "grad_norm": 0.24052215603123736, "learning_rate": 0.00016474617593912583, "loss": 0.2874, "step": 20600 }, { "epoch": 0.5610498288322556, "grad_norm": 0.2750519886277399, "learning_rate": 0.00016390123858123118, "loss": 0.2732, "step": 20650 }, { "epoch": 0.5624083029940771, "grad_norm": 0.2175806661894403, "learning_rate": 0.0001630569666884139, "loss": 0.2885, "step": 20700 }, { "epoch": 0.5637667771558985, "grad_norm": 0.2923956849374819, "learning_rate": 0.00016221337582446172, "loss": 0.2924, "step": 20750 }, { "epoch": 0.56512525131772, "grad_norm": 0.2708091098394788, "learning_rate": 0.00016137048154060785, "loss": 0.2705, "step": 20800 }, { "epoch": 0.5664837254795414, "grad_norm": 0.260062882274282, "learning_rate": 0.0001605282993752446, "loss": 0.2833, "step": 20850 }, { "epoch": 0.5678421996413628, "grad_norm": 0.28046003747194964, "learning_rate": 0.00015968684485363635, "loss": 0.2875, "step": 20900 }, { "epoch": 0.5692006738031843, "grad_norm": 0.18648990278831484, "learning_rate": 0.0001588461334876338, "loss": 0.2788, "step": 20950 }, { "epoch": 0.5705591479650057, "grad_norm": 0.26108175409809964, "learning_rate": 0.000158006180775388, "loss": 0.2809, "step": 21000 }, { "epoch": 0.5719176221268272, "grad_norm": 0.15533902511877934, "learning_rate": 0.0001571670022010644, "loss": 0.2808, "step": 21050 }, { "epoch": 0.5732760962886486, "grad_norm": 0.17785716374013105, "learning_rate": 0.0001563286132345576, "loss": 0.2854, "step": 21100 }, { "epoch": 0.5746345704504701, "grad_norm": 0.2493856351979774, "learning_rate": 0.00015549102933120625, "loss": 0.2672, "step": 21150 }, { "epoch": 0.5759930446122915, "grad_norm": 0.37551758591172574, "learning_rate": 0.00015467099305876942, "loss": 0.2883, "step": 21200 }, { "epoch": 0.577351518774113, "grad_norm": 0.21750010428694388, "learning_rate": 0.00015383504871844582, "loss": 0.2779, "step": 21250 }, { "epoch": 0.5787099929359344, "grad_norm": 0.19042627120914027, "learning_rate": 0.00015299995540906267, "loss": 0.2764, "step": 21300 }, { "epoch": 0.5800684670977558, "grad_norm": 0.2797732165932674, "learning_rate": 0.0001521657285252044, "loss": 0.2922, "step": 21350 }, { "epoch": 0.5814269412595773, "grad_norm": 0.3591848479346681, "learning_rate": 0.00015133238344548327, "loss": 0.2884, "step": 21400 }, { "epoch": 0.5827854154213987, "grad_norm": 0.21764914836042967, "learning_rate": 0.00015049993553225608, "loss": 0.2715, "step": 21450 }, { "epoch": 0.5841438895832202, "grad_norm": 0.26727180336133755, "learning_rate": 0.0001496684001313406, "loss": 0.2753, "step": 21500 }, { "epoch": 0.5855023637450416, "grad_norm": 0.21915535565528904, "learning_rate": 0.00014883779257173285, "loss": 0.265, "step": 21550 }, { "epoch": 0.586860837906863, "grad_norm": 0.25668689734119876, "learning_rate": 0.0001480081281653244, "loss": 0.2762, "step": 21600 }, { "epoch": 0.5882193120686845, "grad_norm": 0.2834782294538094, "learning_rate": 0.00014717942220662038, "loss": 0.28, "step": 21650 }, { "epoch": 0.5895777862305059, "grad_norm": 0.24516802954697497, "learning_rate": 0.00014635168997245712, "loss": 0.2755, "step": 21700 }, { "epoch": 0.5909362603923274, "grad_norm": 0.22053403799293927, "learning_rate": 0.00014552494672172113, "loss": 0.2732, "step": 21750 }, { "epoch": 0.5922947345541488, "grad_norm": 0.297493134455997, "learning_rate": 0.00014469920769506704, "loss": 0.2819, "step": 21800 }, { "epoch": 0.5936532087159703, "grad_norm": 0.26448034669148435, "learning_rate": 0.00014387448811463722, "loss": 0.2947, "step": 21850 }, { "epoch": 0.5950116828777917, "grad_norm": 0.1887478278727578, "learning_rate": 0.00014305080318378105, "loss": 0.2573, "step": 21900 }, { "epoch": 0.5963701570396132, "grad_norm": 0.24486372742215648, "learning_rate": 0.0001422281680867744, "loss": 0.2762, "step": 21950 }, { "epoch": 0.5977286312014346, "grad_norm": 0.22891270758035537, "learning_rate": 0.00014140659798854012, "loss": 0.2816, "step": 22000 }, { "epoch": 0.599087105363256, "grad_norm": 0.25531740500430156, "learning_rate": 0.00014058610803436813, "loss": 0.2544, "step": 22050 }, { "epoch": 0.6004455795250774, "grad_norm": 0.2198360405690994, "learning_rate": 0.00013976671334963648, "loss": 0.27, "step": 22100 }, { "epoch": 0.6018040536868988, "grad_norm": 0.22767226535607382, "learning_rate": 0.0001389484290395323, "loss": 0.2869, "step": 22150 }, { "epoch": 0.6031625278487203, "grad_norm": 0.2694860139304321, "learning_rate": 0.00013813127018877331, "loss": 0.2752, "step": 22200 }, { "epoch": 0.6045210020105417, "grad_norm": 0.19898660564261053, "learning_rate": 0.00013731525186133026, "loss": 0.2624, "step": 22250 }, { "epoch": 0.6058794761723632, "grad_norm": 0.23150351646391246, "learning_rate": 0.00013653296123522198, "loss": 0.2718, "step": 22300 }, { "epoch": 0.6072379503341846, "grad_norm": 0.24064115266253058, "learning_rate": 0.00013571922195028266, "loss": 0.2812, "step": 22350 }, { "epoch": 0.608596424496006, "grad_norm": 0.25687846535740555, "learning_rate": 0.0001349066676537268, "loss": 0.262, "step": 22400 }, { "epoch": 0.6099548986578275, "grad_norm": 0.20024379738006956, "learning_rate": 0.00013409531332464196, "loss": 0.2796, "step": 22450 }, { "epoch": 0.6113133728196489, "grad_norm": 0.30669943060449323, "learning_rate": 0.00013328517391999483, "loss": 0.2748, "step": 22500 }, { "epoch": 0.6126718469814704, "grad_norm": 0.26517225209707274, "learning_rate": 0.00013247626437435539, "loss": 0.2641, "step": 22550 }, { "epoch": 0.6140303211432918, "grad_norm": 0.23089105114814368, "learning_rate": 0.0001316685995996218, "loss": 0.2716, "step": 22600 }, { "epoch": 0.6153887953051133, "grad_norm": 0.3141172219746477, "learning_rate": 0.0001308621944847455, "loss": 0.2601, "step": 22650 }, { "epoch": 0.6167472694669347, "grad_norm": 0.2290976880794265, "learning_rate": 0.0001300570638954565, "loss": 0.2805, "step": 22700 }, { "epoch": 0.6181057436287561, "grad_norm": 0.21218409171582492, "learning_rate": 0.0001292532226739894, "loss": 0.2686, "step": 22750 }, { "epoch": 0.6194642177905776, "grad_norm": 0.22628948026088308, "learning_rate": 0.0001284506856388101, "loss": 0.2688, "step": 22800 }, { "epoch": 0.620822691952399, "grad_norm": 0.2948337400203754, "learning_rate": 0.00012764946758434225, "loss": 0.2655, "step": 22850 }, { "epoch": 0.6221811661142205, "grad_norm": 0.3340188815254344, "learning_rate": 0.00012684958328069453, "loss": 0.2754, "step": 22900 }, { "epoch": 0.6235396402760419, "grad_norm": 0.2767372638913053, "learning_rate": 0.0001260510474733888, "loss": 0.2602, "step": 22950 }, { "epoch": 0.6248981144378634, "grad_norm": 0.270894988791611, "learning_rate": 0.00012525387488308783, "loss": 0.2564, "step": 23000 }, { "epoch": 0.6262565885996848, "grad_norm": 0.20130647702859084, "learning_rate": 0.000124458080205324, "loss": 0.2699, "step": 23050 }, { "epoch": 0.6276150627615062, "grad_norm": 0.2606352685620501, "learning_rate": 0.0001236795524100573, "loss": 0.2777, "step": 23100 }, { "epoch": 0.6289735369233277, "grad_norm": 0.26862575508349007, "learning_rate": 0.00012288652925419885, "loss": 0.27, "step": 23150 }, { "epoch": 0.6303320110851491, "grad_norm": 0.2264767237464518, "learning_rate": 0.00012209492765187177, "loss": 0.2717, "step": 23200 }, { "epoch": 0.6316904852469706, "grad_norm": 0.3116565801871334, "learning_rate": 0.00012130476219590986, "loss": 0.2595, "step": 23250 }, { "epoch": 0.633048959408792, "grad_norm": 0.2778393951264189, "learning_rate": 0.00012051604745267213, "loss": 0.2791, "step": 23300 }, { "epoch": 0.6344074335706135, "grad_norm": 0.1850696129101786, "learning_rate": 0.00011972879796177415, "loss": 0.2717, "step": 23350 }, { "epoch": 0.6357659077324349, "grad_norm": 0.24958891669063782, "learning_rate": 0.00011894302823582031, "loss": 0.2638, "step": 23400 }, { "epoch": 0.6371243818942564, "grad_norm": 0.3700870104750999, "learning_rate": 0.00011815875276013624, "loss": 0.2742, "step": 23450 }, { "epoch": 0.6384828560560778, "grad_norm": 0.33264994031715317, "learning_rate": 0.0001173759859925015, "loss": 0.2774, "step": 23500 }, { "epoch": 0.6398413302178992, "grad_norm": 0.31037389441035956, "learning_rate": 0.00011659474236288361, "loss": 0.2403, "step": 23550 }, { "epoch": 0.6411998043797207, "grad_norm": 0.2731125175831413, "learning_rate": 0.00011581503627317138, "loss": 0.2568, "step": 23600 }, { "epoch": 0.6425582785415421, "grad_norm": 0.31542476581603357, "learning_rate": 0.00011503688209690988, "loss": 0.2405, "step": 23650 }, { "epoch": 0.6439167527033636, "grad_norm": 0.2856271842999882, "learning_rate": 0.00011426029417903521, "loss": 0.2594, "step": 23700 }, { "epoch": 0.645275226865185, "grad_norm": 0.304609790388205, "learning_rate": 0.00011348528683561044, "loss": 0.2617, "step": 23750 }, { "epoch": 0.6466337010270065, "grad_norm": 0.24926409052563817, "learning_rate": 0.00011271187435356107, "loss": 0.2624, "step": 23800 }, { "epoch": 0.6479921751888279, "grad_norm": 0.29444243889916777, "learning_rate": 0.00011194007099041242, "loss": 0.267, "step": 23850 }, { "epoch": 0.6493506493506493, "grad_norm": 0.251174398975187, "learning_rate": 0.00011116989097402601, "loss": 0.2745, "step": 23900 }, { "epoch": 0.6507091235124708, "grad_norm": 0.26364700269491465, "learning_rate": 0.0001104013485023379, "loss": 0.2695, "step": 23950 }, { "epoch": 0.6520675976742922, "grad_norm": 0.1408465902411862, "learning_rate": 0.00010963445774309668, "loss": 0.2423, "step": 24000 }, { "epoch": 0.6534260718361137, "grad_norm": 0.1933859329942763, "learning_rate": 0.00010886923283360217, "loss": 0.2359, "step": 24050 }, { "epoch": 0.6547845459979351, "grad_norm": 0.2614195528425062, "learning_rate": 0.00010810568788044524, "loss": 0.2673, "step": 24100 }, { "epoch": 0.6561430201597566, "grad_norm": 0.24091031620062864, "learning_rate": 0.00010734383695924741, "loss": 0.2493, "step": 24150 }, { "epoch": 0.657501494321578, "grad_norm": 0.2697615824186297, "learning_rate": 0.00010658369411440134, "loss": 0.2729, "step": 24200 }, { "epoch": 0.6588599684833994, "grad_norm": 0.20653067849872642, "learning_rate": 0.00010582527335881209, "loss": 0.274, "step": 24250 }, { "epoch": 0.6602184426452209, "grad_norm": 0.2589626095489949, "learning_rate": 0.0001050685886736388, "loss": 0.2609, "step": 24300 }, { "epoch": 0.6615769168070423, "grad_norm": 0.2672837103760092, "learning_rate": 0.00010431365400803682, "loss": 0.2524, "step": 24350 }, { "epoch": 0.6629353909688638, "grad_norm": 0.1824225008155396, "learning_rate": 0.00010356048327890064, "loss": 0.2702, "step": 24400 }, { "epoch": 0.6642938651306852, "grad_norm": 0.22598161309206102, "learning_rate": 0.00010280909037060747, "loss": 0.2601, "step": 24450 }, { "epoch": 0.6656523392925067, "grad_norm": 0.3087441379489739, "learning_rate": 0.00010205948913476113, "loss": 0.2645, "step": 24500 }, { "epoch": 0.6670108134543281, "grad_norm": 0.26641776561733793, "learning_rate": 0.00010131169338993662, "loss": 0.2572, "step": 24550 }, { "epoch": 0.6683692876161496, "grad_norm": 0.16642889477958095, "learning_rate": 0.00010056571692142558, "loss": 0.2437, "step": 24600 }, { "epoch": 0.669727761777971, "grad_norm": 0.33006631915049106, "learning_rate": 9.982157348098204e-05, "loss": 0.2557, "step": 24650 }, { "epoch": 0.6710862359397924, "grad_norm": 0.25184518346403906, "learning_rate": 9.907927678656888e-05, "loss": 0.2481, "step": 24700 }, { "epoch": 0.6724447101016139, "grad_norm": 0.21781761609625996, "learning_rate": 9.833884052210525e-05, "loss": 0.2474, "step": 24750 }, { "epoch": 0.6738031842634353, "grad_norm": 0.2707646383120265, "learning_rate": 9.760027833721379e-05, "loss": 0.2652, "step": 24800 }, { "epoch": 0.6751616584252568, "grad_norm": 0.2069393478176125, "learning_rate": 9.686360384696958e-05, "loss": 0.2595, "step": 24850 }, { "epoch": 0.6765201325870782, "grad_norm": 0.24428561492811254, "learning_rate": 9.614350737579221e-05, "loss": 0.2501, "step": 24900 }, { "epoch": 0.6778786067488997, "grad_norm": 0.26915065049504966, "learning_rate": 9.541061055170308e-05, "loss": 0.2595, "step": 24950 }, { "epoch": 0.6792370809107211, "grad_norm": 0.32715798463519263, "learning_rate": 9.467964178784106e-05, "loss": 0.249, "step": 25000 }, { "epoch": 0.6805955550725425, "grad_norm": 0.2188565278615699, "learning_rate": 9.395061455929976e-05, "loss": 0.2644, "step": 25050 }, { "epoch": 0.681954029234364, "grad_norm": 0.1464346234966987, "learning_rate": 9.32235423053812e-05, "loss": 0.2489, "step": 25100 }, { "epoch": 0.6833125033961854, "grad_norm": 0.23268193269727472, "learning_rate": 9.249843842934851e-05, "loss": 0.2524, "step": 25150 }, { "epoch": 0.6846709775580069, "grad_norm": 0.2823606594876491, "learning_rate": 9.177531629817841e-05, "loss": 0.2734, "step": 25200 }, { "epoch": 0.6860294517198283, "grad_norm": 0.24467058752217685, "learning_rate": 9.105418924231516e-05, "loss": 0.2579, "step": 25250 }, { "epoch": 0.6873879258816498, "grad_norm": 0.2721349790032047, "learning_rate": 9.034943316134114e-05, "loss": 0.2501, "step": 25300 }, { "epoch": 0.6887464000434712, "grad_norm": 0.2063496873982564, "learning_rate": 8.96322955378789e-05, "loss": 0.2546, "step": 25350 }, { "epoch": 0.6901048742052927, "grad_norm": 0.1388645672356858, "learning_rate": 8.891719249538568e-05, "loss": 0.2481, "step": 25400 }, { "epoch": 0.6914633483671141, "grad_norm": 0.26747348762140405, "learning_rate": 8.820413721647738e-05, "loss": 0.2406, "step": 25450 }, { "epoch": 0.6928218225289355, "grad_norm": 0.24773718412732226, "learning_rate": 8.749314284602002e-05, "loss": 0.2345, "step": 25500 }, { "epoch": 0.694180296690757, "grad_norm": 0.1875176742846847, "learning_rate": 8.67842224908878e-05, "loss": 0.2697, "step": 25550 }, { "epoch": 0.6955387708525784, "grad_norm": 0.34193810804953745, "learning_rate": 8.607738921972125e-05, "loss": 0.2499, "step": 25600 }, { "epoch": 0.6968972450143999, "grad_norm": 0.19405847865847933, "learning_rate": 8.537265606268663e-05, "loss": 0.2469, "step": 25650 }, { "epoch": 0.6982557191762213, "grad_norm": 0.1829187140282853, "learning_rate": 8.467003601123527e-05, "loss": 0.2519, "step": 25700 }, { "epoch": 0.6996141933380428, "grad_norm": 0.20733927044724373, "learning_rate": 8.396954201786429e-05, "loss": 0.2655, "step": 25750 }, { "epoch": 0.7009726674998642, "grad_norm": 0.2261928459658941, "learning_rate": 8.32711869958781e-05, "loss": 0.2593, "step": 25800 }, { "epoch": 0.7023311416616856, "grad_norm": 0.2718188963862619, "learning_rate": 8.25749838191499e-05, "loss": 0.2415, "step": 25850 }, { "epoch": 0.7036896158235071, "grad_norm": 0.3565494856705099, "learning_rate": 8.18809453218845e-05, "loss": 0.2586, "step": 25900 }, { "epoch": 0.7050480899853285, "grad_norm": 0.23853635314623642, "learning_rate": 8.118908429838201e-05, "loss": 0.2495, "step": 25950 }, { "epoch": 0.70640656414715, "grad_norm": 0.12974546184866537, "learning_rate": 8.049941350280157e-05, "loss": 0.241, "step": 26000 }, { "epoch": 0.7077650383089714, "grad_norm": 0.15240966539892364, "learning_rate": 7.981194564892645e-05, "loss": 0.2642, "step": 26050 }, { "epoch": 0.7091235124707929, "grad_norm": 0.18578994346470928, "learning_rate": 7.912669340992957e-05, "loss": 0.2561, "step": 26100 }, { "epoch": 0.7104819866326142, "grad_norm": 0.2354542836489054, "learning_rate": 7.844366941814016e-05, "loss": 0.2433, "step": 26150 }, { "epoch": 0.7118404607944356, "grad_norm": 0.32359876529310133, "learning_rate": 7.776288626481043e-05, "loss": 0.2589, "step": 26200 }, { "epoch": 0.7131989349562571, "grad_norm": 0.21721518409143126, "learning_rate": 7.708435649988394e-05, "loss": 0.248, "step": 26250 }, { "epoch": 0.7145574091180785, "grad_norm": 0.2413841328575766, "learning_rate": 7.640809263176381e-05, "loss": 0.2495, "step": 26300 }, { "epoch": 0.7159158832799, "grad_norm": 0.1937874091125614, "learning_rate": 7.57341071270824e-05, "loss": 0.2379, "step": 26350 }, { "epoch": 0.7172743574417214, "grad_norm": 0.29670499546178025, "learning_rate": 7.507582377492124e-05, "loss": 0.2481, "step": 26400 }, { "epoch": 0.7186328316035429, "grad_norm": 0.2733674523937474, "learning_rate": 7.44063860443e-05, "loss": 0.24, "step": 26450 }, { "epoch": 0.7199913057653643, "grad_norm": 0.24849432830004892, "learning_rate": 7.373926357771387e-05, "loss": 0.2518, "step": 26500 }, { "epoch": 0.7213497799271857, "grad_norm": 0.3217997284475769, "learning_rate": 7.307446867327764e-05, "loss": 0.2558, "step": 26550 }, { "epoch": 0.7227082540890072, "grad_norm": 0.1903670555116767, "learning_rate": 7.241201358619814e-05, "loss": 0.2459, "step": 26600 }, { "epoch": 0.7240667282508286, "grad_norm": 0.1308938028529946, "learning_rate": 7.175191052854886e-05, "loss": 0.2507, "step": 26650 }, { "epoch": 0.7254252024126501, "grad_norm": 0.2795123652476836, "learning_rate": 7.109417166904457e-05, "loss": 0.2518, "step": 26700 }, { "epoch": 0.7267836765744715, "grad_norm": 0.16091370835854293, "learning_rate": 7.043880913281707e-05, "loss": 0.2554, "step": 26750 }, { "epoch": 0.728142150736293, "grad_norm": 0.16950014605111838, "learning_rate": 6.978583500119171e-05, "loss": 0.2451, "step": 26800 }, { "epoch": 0.7295006248981144, "grad_norm": 0.19788089327913239, "learning_rate": 6.913526131146473e-05, "loss": 0.2456, "step": 26850 }, { "epoch": 0.7308590990599358, "grad_norm": 0.23996422423355868, "learning_rate": 6.848710005668106e-05, "loss": 0.2372, "step": 26900 }, { "epoch": 0.7322175732217573, "grad_norm": 0.30447979386999535, "learning_rate": 6.784136318541352e-05, "loss": 0.2507, "step": 26950 }, { "epoch": 0.7335760473835787, "grad_norm": 0.2442932467375467, "learning_rate": 6.719806260154248e-05, "loss": 0.2499, "step": 27000 }, { "epoch": 0.7349345215454002, "grad_norm": 0.2053301139703188, "learning_rate": 6.655721016403638e-05, "loss": 0.2351, "step": 27050 }, { "epoch": 0.7362929957072216, "grad_norm": 0.28412523900572856, "learning_rate": 6.591881768673309e-05, "loss": 0.2463, "step": 27100 }, { "epoch": 0.7376514698690431, "grad_norm": 0.2102789887873736, "learning_rate": 6.52828969381223e-05, "loss": 0.2469, "step": 27150 }, { "epoch": 0.7390099440308645, "grad_norm": 0.37446012395142053, "learning_rate": 6.464945964112845e-05, "loss": 0.2381, "step": 27200 }, { "epoch": 0.740368418192686, "grad_norm": 0.16201575759035203, "learning_rate": 6.401851747289451e-05, "loss": 0.2349, "step": 27250 }, { "epoch": 0.7417268923545074, "grad_norm": 0.2489903791806012, "learning_rate": 6.339008206456684e-05, "loss": 0.2482, "step": 27300 }, { "epoch": 0.7430853665163288, "grad_norm": 0.21608399737617504, "learning_rate": 6.276416500108084e-05, "loss": 0.2446, "step": 27350 }, { "epoch": 0.7444438406781503, "grad_norm": 0.2704960434877356, "learning_rate": 6.215322069728647e-05, "loss": 0.2424, "step": 27400 }, { "epoch": 0.7458023148399717, "grad_norm": 0.2267608806933957, "learning_rate": 6.153232395255646e-05, "loss": 0.2441, "step": 27450 }, { "epoch": 0.7471607890017932, "grad_norm": 0.183167292044454, "learning_rate": 6.0913979799636686e-05, "loss": 0.2445, "step": 27500 }, { "epoch": 0.7485192631636146, "grad_norm": 0.15376761881823003, "learning_rate": 6.0298199637434525e-05, "loss": 0.2253, "step": 27550 }, { "epoch": 0.7498777373254361, "grad_norm": 0.19314678658445544, "learning_rate": 5.9684994817591334e-05, "loss": 0.2383, "step": 27600 }, { "epoch": 0.7512362114872575, "grad_norm": 0.2544241890699629, "learning_rate": 5.907437664427311e-05, "loss": 0.2391, "step": 27650 }, { "epoch": 0.7525946856490789, "grad_norm": 0.21989322641900727, "learning_rate": 5.846635637396216e-05, "loss": 0.2332, "step": 27700 }, { "epoch": 0.7539531598109004, "grad_norm": 0.22618174199432453, "learning_rate": 5.7860945215249696e-05, "loss": 0.2337, "step": 27750 }, { "epoch": 0.7553116339727218, "grad_norm": 0.18764808609541392, "learning_rate": 5.725815432862887e-05, "loss": 0.2482, "step": 27800 }, { "epoch": 0.7566701081345433, "grad_norm": 0.4380376298961902, "learning_rate": 5.6657994826289465e-05, "loss": 0.2262, "step": 27850 }, { "epoch": 0.7580285822963647, "grad_norm": 0.23698867029895784, "learning_rate": 5.606047777191268e-05, "loss": 0.2409, "step": 27900 }, { "epoch": 0.7593870564581862, "grad_norm": 0.20034127068488122, "learning_rate": 5.546561418046736e-05, "loss": 0.2419, "step": 27950 }, { "epoch": 0.7607455306200076, "grad_norm": 0.2949286374600259, "learning_rate": 5.4873415018006867e-05, "loss": 0.2261, "step": 28000 }, { "epoch": 0.762104004781829, "grad_norm": 0.25152518852471184, "learning_rate": 5.428389120146715e-05, "loss": 0.2375, "step": 28050 }, { "epoch": 0.7634624789436505, "grad_norm": 0.1611737332419803, "learning_rate": 5.369705359846511e-05, "loss": 0.2318, "step": 28100 }, { "epoch": 0.7648209531054719, "grad_norm": 0.24055178507097832, "learning_rate": 5.311291302709844e-05, "loss": 0.2373, "step": 28150 }, { "epoch": 0.7661794272672934, "grad_norm": 0.1897183193395996, "learning_rate": 5.2531480255746476e-05, "loss": 0.245, "step": 28200 }, { "epoch": 0.7675379014291148, "grad_norm": 0.17982933996634243, "learning_rate": 5.195276600287118e-05, "loss": 0.2369, "step": 28250 }, { "epoch": 0.7688963755909363, "grad_norm": 0.25848505633412666, "learning_rate": 5.137678093681983e-05, "loss": 0.2319, "step": 28300 }, { "epoch": 0.7702548497527577, "grad_norm": 0.20072845241494364, "learning_rate": 5.0803535675628497e-05, "loss": 0.2306, "step": 28350 }, { "epoch": 0.7716133239145792, "grad_norm": 0.20242303091668362, "learning_rate": 5.0233040786825935e-05, "loss": 0.2422, "step": 28400 }, { "epoch": 0.7729717980764006, "grad_norm": 0.2519217142033256, "learning_rate": 4.9665306787239086e-05, "loss": 0.25, "step": 28450 }, { "epoch": 0.774330272238222, "grad_norm": 0.20962707780239995, "learning_rate": 4.910034414279902e-05, "loss": 0.2253, "step": 28500 }, { "epoch": 0.7756887464000435, "grad_norm": 0.22029442635682775, "learning_rate": 4.853816326834808e-05, "loss": 0.2411, "step": 28550 }, { "epoch": 0.7770472205618649, "grad_norm": 0.2324490253296684, "learning_rate": 4.797877452744792e-05, "loss": 0.2373, "step": 28600 }, { "epoch": 0.7784056947236864, "grad_norm": 0.25563113455518527, "learning_rate": 4.742218823218851e-05, "loss": 0.2363, "step": 28650 }, { "epoch": 0.7797641688855078, "grad_norm": 0.2147681728902342, "learning_rate": 4.686841464299776e-05, "loss": 0.2474, "step": 28700 }, { "epoch": 0.7811226430473293, "grad_norm": 0.25383739922693677, "learning_rate": 4.6317463968452624e-05, "loss": 0.2212, "step": 28750 }, { "epoch": 0.7824811172091507, "grad_norm": 0.2679584758554305, "learning_rate": 4.5769346365090894e-05, "loss": 0.252, "step": 28800 }, { "epoch": 0.7838395913709721, "grad_norm": 0.2088581606519284, "learning_rate": 4.522407193722382e-05, "loss": 0.2277, "step": 28850 }, { "epoch": 0.7851980655327936, "grad_norm": 0.26508761902628303, "learning_rate": 4.4681650736750016e-05, "loss": 0.2277, "step": 28900 }, { "epoch": 0.786556539694615, "grad_norm": 0.377399622435696, "learning_rate": 4.416361998302716e-05, "loss": 0.2278, "step": 28950 }, { "epoch": 0.7879150138564365, "grad_norm": 0.15962057847455954, "learning_rate": 4.3626820065221566e-05, "loss": 0.2242, "step": 29000 }, { "epoch": 0.7892734880182579, "grad_norm": 0.32929537012542076, "learning_rate": 4.309290281945775e-05, "loss": 0.228, "step": 29050 }, { "epoch": 0.7906319621800794, "grad_norm": 0.21846014602740327, "learning_rate": 4.256187808826948e-05, "loss": 0.2446, "step": 29100 }, { "epoch": 0.7919904363419008, "grad_norm": 0.21017757165369907, "learning_rate": 4.203375566086851e-05, "loss": 0.2401, "step": 29150 }, { "epoch": 0.7933489105037222, "grad_norm": 0.24137967337019786, "learning_rate": 4.15085452729636e-05, "loss": 0.2465, "step": 29200 }, { "epoch": 0.7947073846655437, "grad_norm": 0.25590967007232035, "learning_rate": 4.098625660658151e-05, "loss": 0.2375, "step": 29250 }, { "epoch": 0.7960658588273651, "grad_norm": 0.279864351972487, "learning_rate": 4.0466899289888205e-05, "loss": 0.2374, "step": 29300 }, { "epoch": 0.7974243329891866, "grad_norm": 0.2607946144900689, "learning_rate": 3.995048289701155e-05, "loss": 0.222, "step": 29350 }, { "epoch": 0.798782807151008, "grad_norm": 0.3426303124257882, "learning_rate": 3.9437016947864745e-05, "loss": 0.2367, "step": 29400 }, { "epoch": 0.8001412813128295, "grad_norm": 0.27770053634270586, "learning_rate": 3.892651090797075e-05, "loss": 0.2417, "step": 29450 }, { "epoch": 0.8014997554746509, "grad_norm": 0.24470556593709372, "learning_rate": 3.841897418828797e-05, "loss": 0.219, "step": 29500 }, { "epoch": 0.8028582296364724, "grad_norm": 0.24604839411340365, "learning_rate": 3.791441614503675e-05, "loss": 0.2382, "step": 29550 }, { "epoch": 0.8042167037982938, "grad_norm": 0.47026656573464204, "learning_rate": 3.7412846079526644e-05, "loss": 0.2196, "step": 29600 }, { "epoch": 0.8055751779601152, "grad_norm": 0.26091598553410145, "learning_rate": 3.691427323798522e-05, "loss": 0.2268, "step": 29650 }, { "epoch": 0.8069336521219367, "grad_norm": 0.21960674238700215, "learning_rate": 3.6418706811387504e-05, "loss": 0.2356, "step": 29700 }, { "epoch": 0.8082921262837581, "grad_norm": 0.2181680329611913, "learning_rate": 3.592615593528652e-05, "loss": 0.2261, "step": 29750 }, { "epoch": 0.8096506004455796, "grad_norm": 0.1831881150211827, "learning_rate": 3.543662968964496e-05, "loss": 0.2306, "step": 29800 }, { "epoch": 0.811009074607401, "grad_norm": 0.24753134638996258, "learning_rate": 3.4950137098667836e-05, "loss": 0.2459, "step": 29850 }, { "epoch": 0.8123675487692225, "grad_norm": 0.3847831369965376, "learning_rate": 3.4466687130635856e-05, "loss": 0.2201, "step": 29900 }, { "epoch": 0.8137260229310439, "grad_norm": 0.2975898486391868, "learning_rate": 3.39862886977405e-05, "loss": 0.2166, "step": 29950 }, { "epoch": 0.8150844970928653, "grad_norm": 0.20045687385866154, "learning_rate": 3.3508950655919394e-05, "loss": 0.228, "step": 30000 }, { "epoch": 0.8164429712546868, "grad_norm": 0.303426868110847, "learning_rate": 3.3034681804693204e-05, "loss": 0.22, "step": 30050 }, { "epoch": 0.8178014454165082, "grad_norm": 0.20754051862810569, "learning_rate": 3.25634908870033e-05, "loss": 0.2301, "step": 30100 }, { "epoch": 0.8191599195783296, "grad_norm": 0.24422546737008857, "learning_rate": 3.209538658905087e-05, "loss": 0.2367, "step": 30150 }, { "epoch": 0.820518393740151, "grad_norm": 0.3027570941981141, "learning_rate": 3.163037754013647e-05, "loss": 0.2417, "step": 30200 }, { "epoch": 0.8218768679019725, "grad_norm": 0.280254043533181, "learning_rate": 3.116847231250104e-05, "loss": 0.2266, "step": 30250 }, { "epoch": 0.8232353420637939, "grad_norm": 0.28505781897752897, "learning_rate": 3.070967942116807e-05, "loss": 0.2307, "step": 30300 }, { "epoch": 0.8245938162256153, "grad_norm": 0.22370571404265266, "learning_rate": 3.0254007323786338e-05, "loss": 0.2292, "step": 30350 }, { "epoch": 0.8259522903874368, "grad_norm": 0.20314542315669792, "learning_rate": 2.9801464420474135e-05, "loss": 0.2384, "step": 30400 }, { "epoch": 0.8273107645492582, "grad_norm": 0.3091717822159854, "learning_rate": 2.9352059053664515e-05, "loss": 0.2252, "step": 30450 }, { "epoch": 0.8286692387110797, "grad_norm": 0.2850647955523155, "learning_rate": 2.8905799507951314e-05, "loss": 0.2228, "step": 30500 }, { "epoch": 0.8300277128729011, "grad_norm": 0.256986010255855, "learning_rate": 2.846269400993655e-05, "loss": 0.2176, "step": 30550 }, { "epoch": 0.8313861870347226, "grad_norm": 0.19662900160930957, "learning_rate": 2.802275072807865e-05, "loss": 0.2271, "step": 30600 }, { "epoch": 0.832744661196544, "grad_norm": 0.2745095935502404, "learning_rate": 2.7585977772542126e-05, "loss": 0.2254, "step": 30650 }, { "epoch": 0.8341031353583654, "grad_norm": 0.20724780113594732, "learning_rate": 2.715238319504769e-05, "loss": 0.2415, "step": 30700 }, { "epoch": 0.8354616095201869, "grad_norm": 0.21591886814918512, "learning_rate": 2.6721974988724264e-05, "loss": 0.2305, "step": 30750 }, { "epoch": 0.8368200836820083, "grad_norm": 0.4269961594661858, "learning_rate": 2.629476108796114e-05, "loss": 0.2344, "step": 30800 }, { "epoch": 0.8381785578438298, "grad_norm": 0.3444022954836087, "learning_rate": 2.587074936826215e-05, "loss": 0.2355, "step": 30850 }, { "epoch": 0.8395370320056512, "grad_norm": 0.2810532430087154, "learning_rate": 2.5449947646100202e-05, "loss": 0.2333, "step": 30900 }, { "epoch": 0.8408955061674727, "grad_norm": 0.2016521391071412, "learning_rate": 2.5032363678773284e-05, "loss": 0.2345, "step": 30950 }, { "epoch": 0.8422539803292941, "grad_norm": 0.26305838948166177, "learning_rate": 2.4626260675610046e-05, "loss": 0.2229, "step": 31000 }, { "epoch": 0.8436124544911155, "grad_norm": 0.3334236797955835, "learning_rate": 2.4223262587394115e-05, "loss": 0.2414, "step": 31050 }, { "epoch": 0.844970928652937, "grad_norm": 0.1995390748027635, "learning_rate": 2.381524806289641e-05, "loss": 0.229, "step": 31100 }, { "epoch": 0.8463294028147584, "grad_norm": 0.24559427709489323, "learning_rate": 2.3410481428214603e-05, "loss": 0.2139, "step": 31150 }, { "epoch": 0.8476878769765799, "grad_norm": 0.20659570740849767, "learning_rate": 2.300897014504688e-05, "loss": 0.227, "step": 31200 }, { "epoch": 0.8490463511384013, "grad_norm": 0.21220065960007847, "learning_rate": 2.261072161508033e-05, "loss": 0.2374, "step": 31250 }, { "epoch": 0.8504048253002228, "grad_norm": 0.24984661749787465, "learning_rate": 2.2215743179854577e-05, "loss": 0.2266, "step": 31300 }, { "epoch": 0.8517632994620442, "grad_norm": 0.3258474284548005, "learning_rate": 2.1824042120626543e-05, "loss": 0.2231, "step": 31350 }, { "epoch": 0.8531217736238657, "grad_norm": 0.3217882648747335, "learning_rate": 2.143562565823609e-05, "loss": 0.2313, "step": 31400 }, { "epoch": 0.8544802477856871, "grad_norm": 0.2660872932773675, "learning_rate": 2.1050500952972985e-05, "loss": 0.2443, "step": 31450 }, { "epoch": 0.8558387219475085, "grad_norm": 0.2552800785940888, "learning_rate": 2.0668675104444745e-05, "loss": 0.2282, "step": 31500 }, { "epoch": 0.85719719610933, "grad_norm": 0.23195860174218688, "learning_rate": 2.0290155151446145e-05, "loss": 0.2375, "step": 31550 }, { "epoch": 0.8585556702711514, "grad_norm": 0.29024757595999545, "learning_rate": 1.9914948071828922e-05, "loss": 0.2222, "step": 31600 }, { "epoch": 0.8599141444329729, "grad_norm": 0.2658334720707247, "learning_rate": 1.9543060782373667e-05, "loss": 0.2351, "step": 31650 }, { "epoch": 0.8612726185947943, "grad_norm": 0.26827491653104296, "learning_rate": 1.917450013866189e-05, "loss": 0.2397, "step": 31700 }, { "epoch": 0.8626310927566158, "grad_norm": 0.2739389172356759, "learning_rate": 1.880927293494994e-05, "loss": 0.233, "step": 31750 }, { "epoch": 0.8639895669184372, "grad_norm": 0.2784269064173774, "learning_rate": 1.8447385904043534e-05, "loss": 0.2418, "step": 31800 }, { "epoch": 0.8653480410802586, "grad_norm": 0.3614092162288049, "learning_rate": 1.808884571717384e-05, "loss": 0.2257, "step": 31850 }, { "epoch": 0.8667065152420801, "grad_norm": 0.24413648696026682, "learning_rate": 1.7733658983874336e-05, "loss": 0.2389, "step": 31900 }, { "epoch": 0.8680649894039015, "grad_norm": 0.1967545842426806, "learning_rate": 1.7381832251859075e-05, "loss": 0.2191, "step": 31950 }, { "epoch": 0.869423463565723, "grad_norm": 0.23747418190146288, "learning_rate": 1.7033372006901982e-05, "loss": 0.223, "step": 32000 }, { "epoch": 0.8707819377275444, "grad_norm": 0.28815680461600557, "learning_rate": 1.6702023326195593e-05, "loss": 0.2242, "step": 32050 }, { "epoch": 0.8721404118893659, "grad_norm": 0.2868186834763779, "learning_rate": 1.636017997206618e-05, "loss": 0.2155, "step": 32100 }, { "epoch": 0.8734988860511873, "grad_norm": 0.20903132836326485, "learning_rate": 1.6021721938713497e-05, "loss": 0.2258, "step": 32150 }, { "epoch": 0.8748573602130088, "grad_norm": 0.22192290777199325, "learning_rate": 1.568665546546517e-05, "loss": 0.2322, "step": 32200 }, { "epoch": 0.8762158343748302, "grad_norm": 0.27939346399599835, "learning_rate": 1.5354986729126963e-05, "loss": 0.2166, "step": 32250 }, { "epoch": 0.8775743085366516, "grad_norm": 0.23760881910404164, "learning_rate": 1.5026721843868797e-05, "loss": 0.2231, "step": 32300 }, { "epoch": 0.8789327826984731, "grad_norm": 0.2384919572031985, "learning_rate": 1.4701866861112057e-05, "loss": 0.2115, "step": 32350 }, { "epoch": 0.8802912568602945, "grad_norm": 0.18196788330053723, "learning_rate": 1.4380427769418081e-05, "loss": 0.2214, "step": 32400 }, { "epoch": 0.881649731022116, "grad_norm": 0.11092114968179356, "learning_rate": 1.4062410494377642e-05, "loss": 0.2136, "step": 32450 }, { "epoch": 0.8830082051839374, "grad_norm": 0.24840886469424456, "learning_rate": 1.3747820898501929e-05, "loss": 0.228, "step": 32500 }, { "epoch": 0.8843666793457589, "grad_norm": 0.24015308441997552, "learning_rate": 1.3436664781114295e-05, "loss": 0.2225, "step": 32550 }, { "epoch": 0.8857251535075803, "grad_norm": 0.2836665970861565, "learning_rate": 1.3128947878243392e-05, "loss": 0.2203, "step": 32600 }, { "epoch": 0.8870836276694017, "grad_norm": 0.21821870263137624, "learning_rate": 1.2824675862517388e-05, "loss": 0.2236, "step": 32650 }, { "epoch": 0.8884421018312232, "grad_norm": 0.24628179490379828, "learning_rate": 1.2523854343059538e-05, "loss": 0.2224, "step": 32700 }, { "epoch": 0.8898005759930446, "grad_norm": 0.3785043038913189, "learning_rate": 1.2226488865384622e-05, "loss": 0.2328, "step": 32750 }, { "epoch": 0.8911590501548661, "grad_norm": 0.27390465067758646, "learning_rate": 1.1932584911296762e-05, "loss": 0.2409, "step": 32800 }, { "epoch": 0.8925175243166875, "grad_norm": 0.21777850622413425, "learning_rate": 1.164214789878848e-05, "loss": 0.2113, "step": 32850 }, { "epoch": 0.893875998478509, "grad_norm": 0.31282081336865486, "learning_rate": 1.1355183181940688e-05, "loss": 0.2294, "step": 32900 }, { "epoch": 0.8952344726403304, "grad_norm": 0.21847851398276275, "learning_rate": 1.1071696050823988e-05, "loss": 0.2176, "step": 32950 }, { "epoch": 0.8965929468021518, "grad_norm": 0.1797782169135658, "learning_rate": 1.0791691731401221e-05, "loss": 0.2197, "step": 33000 }, { "epoch": 0.8979514209639733, "grad_norm": 0.236661072817734, "learning_rate": 1.0526169006027186e-05, "loss": 0.2287, "step": 33050 }, { "epoch": 0.8993098951257947, "grad_norm": 0.24545168612281812, "learning_rate": 1.0253005911068837e-05, "loss": 0.2248, "step": 33100 }, { "epoch": 0.9006683692876162, "grad_norm": 0.2998814580945627, "learning_rate": 9.98334072000362e-06, "loss": 0.2279, "step": 33150 }, { "epoch": 0.9020268434494376, "grad_norm": 0.25372185228970084, "learning_rate": 9.717178403992866e-06, "loss": 0.2296, "step": 33200 }, { "epoch": 0.9033853176112591, "grad_norm": 0.22808552163017606, "learning_rate": 9.454523869623889e-06, "loss": 0.2142, "step": 33250 }, { "epoch": 0.9047437917730805, "grad_norm": 0.22587780733166465, "learning_rate": 9.195381958819637e-06, "loss": 0.2332, "step": 33300 }, { "epoch": 0.906102265934902, "grad_norm": 0.40072192174734456, "learning_rate": 8.939757448749286e-06, "loss": 0.2294, "step": 33350 }, { "epoch": 0.9074607400967234, "grad_norm": 0.207716936686938, "learning_rate": 8.687655051740318e-06, "loss": 0.2296, "step": 33400 }, { "epoch": 0.9088192142585448, "grad_norm": 0.2779354186342964, "learning_rate": 8.439079415191532e-06, "loss": 0.2219, "step": 33450 }, { "epoch": 0.9101776884203663, "grad_norm": 0.2723935977374799, "learning_rate": 8.194035121487465e-06, "loss": 0.2153, "step": 33500 }, { "epoch": 0.9115361625821877, "grad_norm": 0.19697799899707563, "learning_rate": 7.952526687913842e-06, "loss": 0.2146, "step": 33550 }, { "epoch": 0.9128946367440092, "grad_norm": 0.2085749517551805, "learning_rate": 7.714558566574325e-06, "loss": 0.2136, "step": 33600 }, { "epoch": 0.9142531109058306, "grad_norm": 0.21717861835570473, "learning_rate": 7.480135144308475e-06, "loss": 0.2148, "step": 33650 }, { "epoch": 0.9156115850676521, "grad_norm": 0.19269266158892198, "learning_rate": 7.2492607426108305e-06, "loss": 0.2257, "step": 33700 }, { "epoch": 0.9169700592294735, "grad_norm": 0.24145080749460107, "learning_rate": 7.02193961755131e-06, "loss": 0.2207, "step": 33750 }, { "epoch": 0.9183285333912949, "grad_norm": 0.38368673649347684, "learning_rate": 6.798175959696629e-06, "loss": 0.2277, "step": 33800 }, { "epoch": 0.9196870075531164, "grad_norm": 0.23667938172589895, "learning_rate": 6.577973894033274e-06, "loss": 0.2175, "step": 33850 }, { "epoch": 0.9210454817149378, "grad_norm": 0.3161307046290422, "learning_rate": 6.3613374798911605e-06, "loss": 0.2343, "step": 33900 }, { "epoch": 0.9224039558767593, "grad_norm": 0.21360593050098195, "learning_rate": 6.148270710869053e-06, "loss": 0.2239, "step": 33950 }, { "epoch": 0.9237624300385807, "grad_norm": 0.28937490357759443, "learning_rate": 5.938777514760796e-06, "loss": 0.2124, "step": 34000 }, { "epoch": 0.9251209042004022, "grad_norm": 0.2795740945530124, "learning_rate": 5.732861753483043e-06, "loss": 0.2204, "step": 34050 }, { "epoch": 0.9264793783622236, "grad_norm": 0.2537546094067306, "learning_rate": 5.538551797587777e-06, "loss": 0.2112, "step": 34100 }, { "epoch": 0.927837852524045, "grad_norm": 0.33705155604910225, "learning_rate": 5.339658758640753e-06, "loss": 0.2199, "step": 34150 }, { "epoch": 0.9291963266858664, "grad_norm": 0.3270077837617184, "learning_rate": 5.14435419901973e-06, "loss": 0.2297, "step": 34200 }, { "epoch": 0.9305548008476878, "grad_norm": 0.22117872354331958, "learning_rate": 4.95264171907992e-06, "loss": 0.22, "step": 34250 }, { "epoch": 0.9319132750095093, "grad_norm": 0.2091602624721329, "learning_rate": 4.7645248529581076e-06, "loss": 0.2107, "step": 34300 }, { "epoch": 0.9332717491713307, "grad_norm": 0.19828806364167093, "learning_rate": 4.580007068507497e-06, "loss": 0.2215, "step": 34350 }, { "epoch": 0.9346302233331522, "grad_norm": 0.2922062157376195, "learning_rate": 4.399091767233743e-06, "loss": 0.234, "step": 34400 }, { "epoch": 0.9359886974949736, "grad_norm": 0.2833929720260013, "learning_rate": 4.221782284232312e-06, "loss": 0.2358, "step": 34450 }, { "epoch": 0.937347171656795, "grad_norm": 0.10391453032816735, "learning_rate": 4.048081888126931e-06, "loss": 0.2194, "step": 34500 }, { "epoch": 0.9387056458186165, "grad_norm": 0.31685765392965615, "learning_rate": 3.877993781009415e-06, "loss": 0.2237, "step": 34550 }, { "epoch": 0.9400641199804379, "grad_norm": 0.32028046645415253, "learning_rate": 3.7115210983805326e-06, "loss": 0.2296, "step": 34600 }, { "epoch": 0.9414225941422594, "grad_norm": 0.2275872271102818, "learning_rate": 3.548666909092324e-06, "loss": 0.2237, "step": 34650 }, { "epoch": 0.9427810683040808, "grad_norm": 0.28672637661803746, "learning_rate": 3.3894342152914092e-06, "loss": 0.2129, "step": 34700 }, { "epoch": 0.9441395424659023, "grad_norm": 0.17447544388507297, "learning_rate": 3.233825952363767e-06, "loss": 0.2156, "step": 34750 }, { "epoch": 0.9454980166277237, "grad_norm": 0.27717659551061696, "learning_rate": 3.081844988880511e-06, "loss": 0.2325, "step": 34800 }, { "epoch": 0.9468564907895451, "grad_norm": 0.19384754194181844, "learning_rate": 2.9334941265450666e-06, "loss": 0.2264, "step": 34850 }, { "epoch": 0.9482149649513666, "grad_norm": 0.20190545639653648, "learning_rate": 2.788776100141499e-06, "loss": 0.2162, "step": 34900 }, { "epoch": 0.949573439113188, "grad_norm": 0.13741628116355178, "learning_rate": 2.647693577484156e-06, "loss": 0.2175, "step": 34950 }, { "epoch": 0.9509319132750095, "grad_norm": 0.30120083790962326, "learning_rate": 2.5102491593684164e-06, "loss": 0.2098, "step": 35000 }, { "epoch": 0.9522903874368309, "grad_norm": 0.18521977588022978, "learning_rate": 2.3764453795227737e-06, "loss": 0.2232, "step": 35050 }, { "epoch": 0.9536488615986524, "grad_norm": 0.31542769729636866, "learning_rate": 2.2462847045620737e-06, "loss": 0.2223, "step": 35100 }, { "epoch": 0.9550073357604738, "grad_norm": 0.27121032732352324, "learning_rate": 2.1247601176086262e-06, "loss": 0.2167, "step": 35150 }, { "epoch": 0.9563658099222953, "grad_norm": 0.23513329499397734, "learning_rate": 2.0017468261825268e-06, "loss": 0.2118, "step": 35200 }, { "epoch": 0.9577242840841167, "grad_norm": 0.15376591935080916, "learning_rate": 1.8823835470474395e-06, "loss": 0.2247, "step": 35250 }, { "epoch": 0.9590827582459381, "grad_norm": 0.3234625822847048, "learning_rate": 1.766672480613818e-06, "loss": 0.2229, "step": 35300 }, { "epoch": 0.9604412324077596, "grad_norm": 0.17001324587245673, "learning_rate": 1.6546157599652613e-06, "loss": 0.2148, "step": 35350 }, { "epoch": 0.961799706569581, "grad_norm": 0.11359209160906845, "learning_rate": 1.5462154508190108e-06, "loss": 0.214, "step": 35400 }, { "epoch": 0.9631581807314025, "grad_norm": 0.2409519071516936, "learning_rate": 1.4414735514879373e-06, "loss": 0.2118, "step": 35450 }, { "epoch": 0.9645166548932239, "grad_norm": 0.21439695271092557, "learning_rate": 1.3403919928437036e-06, "loss": 0.2219, "step": 35500 }, { "epoch": 0.9658751290550454, "grad_norm": 0.21370470770092295, "learning_rate": 1.2429726382812368e-06, "loss": 0.2147, "step": 35550 }, { "epoch": 0.9672336032168668, "grad_norm": 0.2311514348226777, "learning_rate": 1.149217283684223e-06, "loss": 0.23, "step": 35600 }, { "epoch": 0.9685920773786882, "grad_norm": 0.29860147103327, "learning_rate": 1.059127657392156e-06, "loss": 0.2313, "step": 35650 }, { "epoch": 0.9699505515405097, "grad_norm": 0.1814371857599918, "learning_rate": 9.72705420168407e-07, "loss": 0.2241, "step": 35700 }, { "epoch": 0.9713090257023311, "grad_norm": 0.31658685339190534, "learning_rate": 8.899521651695831e-07, "loss": 0.2207, "step": 35750 }, { "epoch": 0.9726674998641526, "grad_norm": 0.19909543685451667, "learning_rate": 8.124150923443096e-07, "loss": 0.2351, "step": 35800 }, { "epoch": 0.974025974025974, "grad_norm": 0.24098233089589674, "learning_rate": 7.369308575313927e-07, "loss": 0.2192, "step": 35850 }, { "epoch": 0.9753844481877955, "grad_norm": 0.23547731000564504, "learning_rate": 6.651199513456607e-07, "loss": 0.2268, "step": 35900 }, { "epoch": 0.9767429223496169, "grad_norm": 0.14817468589671154, "learning_rate": 5.969836975901366e-07, "loss": 0.2175, "step": 35950 }, { "epoch": 0.9781013965114383, "grad_norm": 0.140443367773756, "learning_rate": 5.3252335232723e-07, "loss": 0.2214, "step": 36000 }, { "epoch": 0.9794598706732598, "grad_norm": 0.3119367682303709, "learning_rate": 4.71740103855578e-07, "loss": 0.2249, "step": 36050 }, { "epoch": 0.9808183448350812, "grad_norm": 0.311395190548215, "learning_rate": 4.146350726881076e-07, "loss": 0.2222, "step": 36100 }, { "epoch": 0.9821768189969027, "grad_norm": 0.37040140534262844, "learning_rate": 3.6120931153138525e-07, "loss": 0.2361, "step": 36150 }, { "epoch": 0.9835352931587241, "grad_norm": 0.2988864815826086, "learning_rate": 3.114638052662988e-07, "loss": 0.2207, "step": 36200 }, { "epoch": 0.9848937673205456, "grad_norm": 0.21108645711904683, "learning_rate": 2.6539947092976135e-07, "loss": 0.2247, "step": 36250 }, { "epoch": 0.986252241482367, "grad_norm": 0.17869396874533416, "learning_rate": 2.2301715769783572e-07, "loss": 0.2231, "step": 36300 }, { "epoch": 0.9876107156441885, "grad_norm": 0.30165958371764556, "learning_rate": 1.8431764687021347e-07, "loss": 0.2232, "step": 36350 }, { "epoch": 0.9889691898060099, "grad_norm": 0.24564888452021394, "learning_rate": 1.4930165185564894e-07, "loss": 0.2135, "step": 36400 }, { "epoch": 0.9903276639678313, "grad_norm": 0.2710306063596811, "learning_rate": 1.1796981815888064e-07, "loss": 0.2099, "step": 36450 }, { "epoch": 0.9916861381296528, "grad_norm": 0.2109106341757756, "learning_rate": 9.032272336875203e-08, "loss": 0.21, "step": 36500 }, { "epoch": 0.9930446122914742, "grad_norm": 0.27834177280791966, "learning_rate": 6.636087714748662e-08, "loss": 0.2265, "step": 36550 }, { "epoch": 0.9944030864532957, "grad_norm": 0.30415917234664425, "learning_rate": 4.608472122138441e-08, "loss": 0.232, "step": 36600 }, { "epoch": 0.9957615606151171, "grad_norm": 0.24438726040214506, "learning_rate": 2.949462937262837e-08, "loss": 0.221, "step": 36650 }, { "epoch": 0.9971200347769386, "grad_norm": 0.26240483157432476, "learning_rate": 1.6590907432401104e-08, "loss": 0.2276, "step": 36700 }, { "epoch": 0.99847850893876, "grad_norm": 0.24193448916634921, "learning_rate": 7.3737932752226955e-09, "loss": 0.2229, "step": 36750 }, { "epoch": 0.9998369831005814, "grad_norm": 0.308625063910033, "learning_rate": 1.843456814643041e-09, "loss": 0.219, "step": 36800 }, { "epoch": 1.0, "step": 36806, "total_flos": 244684510289920.0, "train_loss": 0.3143004386741001, "train_runtime": 666350.8913, "train_samples_per_second": 1.326, "train_steps_per_second": 0.055 } ], "logging_steps": 50, "max_steps": 36806, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 244684510289920.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }