diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5194 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 36806, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0013584741618214422, + "grad_norm": 0.336629520848516, + "learning_rate": 8.868778280542987e-05, + "loss": 0.9342, + "step": 50 + }, + { + "epoch": 0.0027169483236428845, + "grad_norm": 0.41112101024853426, + "learning_rate": 0.00017918552036199096, + "loss": 0.5198, + "step": 100 + }, + { + "epoch": 0.004075422485464326, + "grad_norm": 0.38040867145053625, + "learning_rate": 0.0002696832579185521, + "loss": 0.4962, + "step": 150 + }, + { + "epoch": 0.005433896647285769, + "grad_norm": 0.5231995312551354, + "learning_rate": 0.00036018099547511313, + "loss": 0.5198, + "step": 200 + }, + { + "epoch": 0.006792370809107211, + "grad_norm": 0.37419434381789457, + "learning_rate": 0.00039999942189133334, + "loss": 0.5615, + "step": 250 + }, + { + "epoch": 0.008150844970928653, + "grad_norm": 0.5051525763054373, + "learning_rate": 0.00039999551377337605, + "loss": 0.5629, + "step": 300 + }, + { + "epoch": 0.009509319132750094, + "grad_norm": 0.37940069661143105, + "learning_rate": 0.0003999879188244911, + "loss": 0.5715, + "step": 350 + }, + { + "epoch": 0.010867793294571538, + "grad_norm": 0.4187127030134641, + "learning_rate": 0.0003999766371846881, + "loss": 0.5191, + "step": 400 + }, + { + "epoch": 0.01222626745639298, + "grad_norm": 0.4287344223835902, + "learning_rate": 0.00039996166906193926, + "loss": 0.5266, + "step": 450 + }, + { + "epoch": 0.013584741618214421, + "grad_norm": 0.42083341057731205, + "learning_rate": 0.00039994301473217543, + "loss": 0.4993, + "step": 500 + }, + { + "epoch": 0.014943215780035863, + "grad_norm": 0.44004507818862004, + "learning_rate": 0.00039992067453928115, + "loss": 0.5008, + "step": 550 + }, + { + "epoch": 0.016301689941857305, + "grad_norm": 0.3947637329018398, + "learning_rate": 0.0003998946488950882, + "loss": 0.5199, + "step": 600 + }, + { + "epoch": 0.01766016410367875, + "grad_norm": 0.35666971296762606, + "learning_rate": 0.0003998649382793681, + "loss": 0.5024, + "step": 650 + }, + { + "epoch": 0.01901863826550019, + "grad_norm": 0.3864531806325715, + "learning_rate": 0.0003998315432398232, + "loss": 0.5058, + "step": 700 + }, + { + "epoch": 0.020377112427321632, + "grad_norm": 0.3979644110719079, + "learning_rate": 0.00039979446439207663, + "loss": 0.4833, + "step": 750 + }, + { + "epoch": 0.021735586589143076, + "grad_norm": 0.32463557091237133, + "learning_rate": 0.000399753702419661, + "loss": 0.5181, + "step": 800 + }, + { + "epoch": 0.023094060750964516, + "grad_norm": 0.36185137659998867, + "learning_rate": 0.0003997092580740055, + "loss": 0.4731, + "step": 850 + }, + { + "epoch": 0.02445253491278596, + "grad_norm": 0.36585435226850893, + "learning_rate": 0.00039966113217442266, + "loss": 0.5028, + "step": 900 + }, + { + "epoch": 0.0258110090746074, + "grad_norm": 0.41125601490924335, + "learning_rate": 0.00039960932560809256, + "loss": 0.4907, + "step": 950 + }, + { + "epoch": 0.027169483236428843, + "grad_norm": 0.3848215921898568, + "learning_rate": 0.0003995538393300469, + "loss": 0.483, + "step": 1000 + }, + { + "epoch": 0.028527957398250287, + "grad_norm": 0.33951726776456204, + "learning_rate": 0.0003994946743631513, + "loss": 0.4567, + "step": 1050 + }, + { + "epoch": 0.029886431560071727, + "grad_norm": 0.3234265221555652, + "learning_rate": 0.0003994318317980862, + "loss": 0.4663, + "step": 1100 + }, + { + "epoch": 0.03124490572189317, + "grad_norm": 0.3997241882392808, + "learning_rate": 0.0003993666791947118, + "loss": 0.4709, + "step": 1150 + }, + { + "epoch": 0.03260337988371461, + "grad_norm": 0.45209570419286, + "learning_rate": 0.00039929799689193896, + "loss": 0.4836, + "step": 1200 + }, + { + "epoch": 0.033961854045536054, + "grad_norm": 0.31674750892690945, + "learning_rate": 0.000399224275685374, + "loss": 0.4708, + "step": 1250 + }, + { + "epoch": 0.0353203282073575, + "grad_norm": 0.3336127152054193, + "learning_rate": 0.0003991468818653226, + "loss": 0.48, + "step": 1300 + }, + { + "epoch": 0.03667880236917894, + "grad_norm": 0.30959090922038635, + "learning_rate": 0.0003990658168585062, + "loss": 0.4568, + "step": 1350 + }, + { + "epoch": 0.03803727653100038, + "grad_norm": 0.37908103624960793, + "learning_rate": 0.0003989810821593234, + "loss": 0.4867, + "step": 1400 + }, + { + "epoch": 0.03939575069282182, + "grad_norm": 0.3746655333602711, + "learning_rate": 0.0003988944833236597, + "loss": 0.4685, + "step": 1450 + }, + { + "epoch": 0.040754224854643264, + "grad_norm": 0.3841226468141105, + "learning_rate": 0.0003988043631486177, + "loss": 0.4788, + "step": 1500 + }, + { + "epoch": 0.04211269901646471, + "grad_norm": 0.4460643403291581, + "learning_rate": 0.0003987087755737732, + "loss": 0.4502, + "step": 1550 + }, + { + "epoch": 0.04347117317828615, + "grad_norm": 0.3671013360773688, + "learning_rate": 0.00039860952488846415, + "loss": 0.4534, + "step": 1600 + }, + { + "epoch": 0.04482964734010759, + "grad_norm": 0.34747090480175297, + "learning_rate": 0.000398506612922334, + "loss": 0.4405, + "step": 1650 + }, + { + "epoch": 0.04618812150192903, + "grad_norm": 0.36167597296572834, + "learning_rate": 0.00039840004157252047, + "loss": 0.4778, + "step": 1700 + }, + { + "epoch": 0.047546595663750475, + "grad_norm": 0.4009770951011953, + "learning_rate": 0.0003982898128036203, + "loss": 0.479, + "step": 1750 + }, + { + "epoch": 0.04890506982557192, + "grad_norm": 0.5968187908901367, + "learning_rate": 0.00039818055417120837, + "loss": 0.5444, + "step": 1800 + }, + { + "epoch": 0.05026354398739336, + "grad_norm": 1.8105646301472897, + "learning_rate": 0.0003980655464341668, + "loss": 0.5653, + "step": 1850 + }, + { + "epoch": 0.0516220181492148, + "grad_norm": 0.789226312089056, + "learning_rate": 0.0003979470320337101, + "loss": 0.7458, + "step": 1900 + }, + { + "epoch": 0.05298049231103624, + "grad_norm": 0.9263531743492742, + "learning_rate": 0.0003978224864094713, + "loss": 0.5415, + "step": 1950 + }, + { + "epoch": 0.054338966472857686, + "grad_norm": 0.47720194915599734, + "learning_rate": 0.00039769429401312583, + "loss": 0.5115, + "step": 2000 + }, + { + "epoch": 0.05569744063467913, + "grad_norm": 0.6259624884089505, + "learning_rate": 0.0003975624572078452, + "loss": 0.4968, + "step": 2050 + }, + { + "epoch": 0.05705591479650057, + "grad_norm": 0.6070556198109234, + "learning_rate": 0.00039742697842398407, + "loss": 0.4721, + "step": 2100 + }, + { + "epoch": 0.05841438895832201, + "grad_norm": 0.9495402483045952, + "learning_rate": 0.00039728786015903527, + "loss": 0.5384, + "step": 2150 + }, + { + "epoch": 0.05977286312014345, + "grad_norm": 0.5094390312707457, + "learning_rate": 0.0003971451049775838, + "loss": 0.4768, + "step": 2200 + }, + { + "epoch": 0.0611313372819649, + "grad_norm": 0.4164209148361273, + "learning_rate": 0.00039699871551126, + "loss": 0.4745, + "step": 2250 + }, + { + "epoch": 0.06248981144378634, + "grad_norm": 0.4096404988646368, + "learning_rate": 0.0003968486944586903, + "loss": 0.4571, + "step": 2300 + }, + { + "epoch": 0.06384828560560778, + "grad_norm": 0.36680803179529065, + "learning_rate": 0.00039669504458544815, + "loss": 0.4542, + "step": 2350 + }, + { + "epoch": 0.06520675976742922, + "grad_norm": 0.4436494287681541, + "learning_rate": 0.00039653776872400245, + "loss": 0.4526, + "step": 2400 + }, + { + "epoch": 0.06656523392925066, + "grad_norm": 0.632255456628572, + "learning_rate": 0.0003963768697736659, + "loss": 0.4614, + "step": 2450 + }, + { + "epoch": 0.06792370809107211, + "grad_norm": 0.27294306772696525, + "learning_rate": 0.0003962123507005411, + "loss": 0.4247, + "step": 2500 + }, + { + "epoch": 0.06928218225289355, + "grad_norm": 0.4922506411666624, + "learning_rate": 0.00039604421453746615, + "loss": 0.4336, + "step": 2550 + }, + { + "epoch": 0.070640656414715, + "grad_norm": 0.3836605187611845, + "learning_rate": 0.00039587246438395866, + "loss": 0.4308, + "step": 2600 + }, + { + "epoch": 0.07199913057653644, + "grad_norm": 0.30031785058018967, + "learning_rate": 0.0003956971034061584, + "loss": 0.4336, + "step": 2650 + }, + { + "epoch": 0.07335760473835788, + "grad_norm": 0.30896010198949975, + "learning_rate": 0.00039551813483676944, + "loss": 0.4282, + "step": 2700 + }, + { + "epoch": 0.07471607890017933, + "grad_norm": 0.3264468040624618, + "learning_rate": 0.0003953355619749999, + "loss": 0.4466, + "step": 2750 + }, + { + "epoch": 0.07607455306200075, + "grad_norm": 0.47417679268203056, + "learning_rate": 0.0003951493881865018, + "loss": 0.455, + "step": 2800 + }, + { + "epoch": 0.0774330272238222, + "grad_norm": 0.467248267974456, + "learning_rate": 0.0003949596169033084, + "loss": 0.436, + "step": 2850 + }, + { + "epoch": 0.07879150138564364, + "grad_norm": 0.2574312755334094, + "learning_rate": 0.0003947662516237714, + "loss": 0.4263, + "step": 2900 + }, + { + "epoch": 0.08014997554746509, + "grad_norm": 0.40465633762609327, + "learning_rate": 0.0003945692959124962, + "loss": 0.4275, + "step": 2950 + }, + { + "epoch": 0.08150844970928653, + "grad_norm": 0.3105964962014554, + "learning_rate": 0.0003943687534002764, + "loss": 0.4063, + "step": 3000 + }, + { + "epoch": 0.08286692387110797, + "grad_norm": 0.36079130494736616, + "learning_rate": 0.00039416462778402644, + "loss": 0.4291, + "step": 3050 + }, + { + "epoch": 0.08422539803292942, + "grad_norm": 0.27991323241801247, + "learning_rate": 0.0003939569228267139, + "loss": 0.4294, + "step": 3100 + }, + { + "epoch": 0.08558387219475086, + "grad_norm": 0.28381836513166353, + "learning_rate": 0.00039374564235729017, + "loss": 0.4198, + "step": 3150 + }, + { + "epoch": 0.0869423463565723, + "grad_norm": 0.3299204189197327, + "learning_rate": 0.00039353079027061935, + "loss": 0.4103, + "step": 3200 + }, + { + "epoch": 0.08830082051839375, + "grad_norm": 0.326311631143589, + "learning_rate": 0.0003933123705274068, + "loss": 0.4297, + "step": 3250 + }, + { + "epoch": 0.08965929468021518, + "grad_norm": 0.3302186726703939, + "learning_rate": 0.0003930903871541262, + "loss": 0.4129, + "step": 3300 + }, + { + "epoch": 0.09101776884203662, + "grad_norm": 0.24354600292418105, + "learning_rate": 0.00039286484424294534, + "loss": 0.4178, + "step": 3350 + }, + { + "epoch": 0.09237624300385806, + "grad_norm": 0.33427871787687957, + "learning_rate": 0.00039263574595165007, + "loss": 0.4229, + "step": 3400 + }, + { + "epoch": 0.09373471716567951, + "grad_norm": 0.3922838514063193, + "learning_rate": 0.00039240309650356874, + "loss": 0.416, + "step": 3450 + }, + { + "epoch": 0.09509319132750095, + "grad_norm": 0.37066104429189983, + "learning_rate": 0.0003921669001874933, + "loss": 0.4359, + "step": 3500 + }, + { + "epoch": 0.0964516654893224, + "grad_norm": 0.29257797493644805, + "learning_rate": 0.0003919271613576008, + "loss": 0.4286, + "step": 3550 + }, + { + "epoch": 0.09781013965114384, + "grad_norm": 0.45431531124773644, + "learning_rate": 0.0003916838844333732, + "loss": 0.4291, + "step": 3600 + }, + { + "epoch": 0.09916861381296528, + "grad_norm": 0.31092097963579685, + "learning_rate": 0.0003914370738995154, + "loss": 0.431, + "step": 3650 + }, + { + "epoch": 0.10052708797478672, + "grad_norm": 0.2895204897106821, + "learning_rate": 0.00039118673430587307, + "loss": 0.4372, + "step": 3700 + }, + { + "epoch": 0.10188556213660817, + "grad_norm": 0.2576792476027214, + "learning_rate": 0.0003909328702673485, + "loss": 0.4527, + "step": 3750 + }, + { + "epoch": 0.1032440362984296, + "grad_norm": 0.30963702592618436, + "learning_rate": 0.0003906754864638156, + "loss": 0.4121, + "step": 3800 + }, + { + "epoch": 0.10460251046025104, + "grad_norm": 0.6475657508703404, + "learning_rate": 0.0003904145876400337, + "loss": 0.4224, + "step": 3850 + }, + { + "epoch": 0.10596098462207248, + "grad_norm": 0.3256994801441109, + "learning_rate": 0.00039015017860555984, + "loss": 0.4363, + "step": 3900 + }, + { + "epoch": 0.10731945878389393, + "grad_norm": 0.3471706882794347, + "learning_rate": 0.0003898822642346604, + "loss": 0.4252, + "step": 3950 + }, + { + "epoch": 0.10867793294571537, + "grad_norm": 0.29742285163537485, + "learning_rate": 0.00038961084946622114, + "loss": 0.41, + "step": 4000 + }, + { + "epoch": 0.11003640710753682, + "grad_norm": 0.42367795916225637, + "learning_rate": 0.0003893359393036561, + "loss": 0.4047, + "step": 4050 + }, + { + "epoch": 0.11139488126935826, + "grad_norm": 0.3766799857882688, + "learning_rate": 0.0003890575388148154, + "loss": 0.4142, + "step": 4100 + }, + { + "epoch": 0.1127533554311797, + "grad_norm": 0.4163460710146468, + "learning_rate": 0.00038877565313189184, + "loss": 0.4467, + "step": 4150 + }, + { + "epoch": 0.11411182959300115, + "grad_norm": 0.2826407429662945, + "learning_rate": 0.00038849028745132627, + "loss": 0.4149, + "step": 4200 + }, + { + "epoch": 0.11547030375482259, + "grad_norm": 0.3265965959677555, + "learning_rate": 0.0003882014470337117, + "loss": 0.4358, + "step": 4250 + }, + { + "epoch": 0.11682877791664402, + "grad_norm": 0.24777830179313484, + "learning_rate": 0.00038790913720369657, + "loss": 0.4012, + "step": 4300 + }, + { + "epoch": 0.11818725207846546, + "grad_norm": 0.2915403708081659, + "learning_rate": 0.00038761336334988634, + "loss": 0.4069, + "step": 4350 + }, + { + "epoch": 0.1195457262402869, + "grad_norm": 0.3326202807353683, + "learning_rate": 0.00038731413092474423, + "loss": 0.3902, + "step": 4400 + }, + { + "epoch": 0.12090420040210835, + "grad_norm": 0.3965527555219645, + "learning_rate": 0.00038701144544449085, + "loss": 0.3894, + "step": 4450 + }, + { + "epoch": 0.1222626745639298, + "grad_norm": 0.36617448279834447, + "learning_rate": 0.0003867053124890022, + "loss": 0.3993, + "step": 4500 + }, + { + "epoch": 0.12362114872575124, + "grad_norm": 0.2978802526091461, + "learning_rate": 0.0003863957377017073, + "loss": 0.3934, + "step": 4550 + }, + { + "epoch": 0.12497962288757268, + "grad_norm": 0.3199935306648141, + "learning_rate": 0.0003860827267894834, + "loss": 0.4015, + "step": 4600 + }, + { + "epoch": 0.1263380970493941, + "grad_norm": 0.28870094415921566, + "learning_rate": 0.00038576628552255173, + "loss": 0.4242, + "step": 4650 + }, + { + "epoch": 0.12769657121121555, + "grad_norm": 0.3368662682662353, + "learning_rate": 0.00038544641973437026, + "loss": 0.4078, + "step": 4700 + }, + { + "epoch": 0.129055045373037, + "grad_norm": 0.34171191785593347, + "learning_rate": 0.0003851231353215267, + "loss": 0.4184, + "step": 4750 + }, + { + "epoch": 0.13041351953485844, + "grad_norm": 0.3456350837751831, + "learning_rate": 0.00038479643824362956, + "loss": 0.4011, + "step": 4800 + }, + { + "epoch": 0.13177199369667988, + "grad_norm": 0.44062561750629825, + "learning_rate": 0.00038446633452319845, + "loss": 0.4179, + "step": 4850 + }, + { + "epoch": 0.13313046785850133, + "grad_norm": 0.30494197906676423, + "learning_rate": 0.00038413283024555284, + "loss": 0.3987, + "step": 4900 + }, + { + "epoch": 0.13448894202032277, + "grad_norm": 0.26281770974778434, + "learning_rate": 0.00038379593155870006, + "loss": 0.3745, + "step": 4950 + }, + { + "epoch": 0.13584741618214421, + "grad_norm": 0.33576603130586835, + "learning_rate": 0.00038345564467322197, + "loss": 0.3981, + "step": 5000 + }, + { + "epoch": 0.13720589034396566, + "grad_norm": 0.3395300651756381, + "learning_rate": 0.00038311197586216023, + "loss": 0.3908, + "step": 5050 + }, + { + "epoch": 0.1385643645057871, + "grad_norm": 0.3641141735123601, + "learning_rate": 0.0003827649314609011, + "loss": 0.4156, + "step": 5100 + }, + { + "epoch": 0.13992283866760855, + "grad_norm": 0.3880481801495523, + "learning_rate": 0.00038241451786705824, + "loss": 0.4225, + "step": 5150 + }, + { + "epoch": 0.14128131282943, + "grad_norm": 0.33587356117829287, + "learning_rate": 0.0003820607415403548, + "loss": 0.4322, + "step": 5200 + }, + { + "epoch": 0.14263978699125143, + "grad_norm": 0.2651951238410833, + "learning_rate": 0.0003817036090025046, + "loss": 0.3882, + "step": 5250 + }, + { + "epoch": 0.14399826115307288, + "grad_norm": 0.3108835459594419, + "learning_rate": 0.0003813431268370919, + "loss": 0.3962, + "step": 5300 + }, + { + "epoch": 0.14535673531489432, + "grad_norm": 0.5822321494392535, + "learning_rate": 0.0003809793016894496, + "loss": 0.4092, + "step": 5350 + }, + { + "epoch": 0.14671520947671576, + "grad_norm": 0.37563297659114, + "learning_rate": 0.0003806121402665372, + "loss": 0.4168, + "step": 5400 + }, + { + "epoch": 0.1480736836385372, + "grad_norm": 0.3315292653141971, + "learning_rate": 0.00038024164933681703, + "loss": 0.4094, + "step": 5450 + }, + { + "epoch": 0.14943215780035865, + "grad_norm": 0.3989400802203129, + "learning_rate": 0.00037986783573012935, + "loss": 0.4068, + "step": 5500 + }, + { + "epoch": 0.15079063196218007, + "grad_norm": 0.3194388411256492, + "learning_rate": 0.0003794907063375666, + "loss": 0.4003, + "step": 5550 + }, + { + "epoch": 0.1521491061240015, + "grad_norm": 0.30240424166641394, + "learning_rate": 0.00037911026811134616, + "loss": 0.407, + "step": 5600 + }, + { + "epoch": 0.15350758028582295, + "grad_norm": 0.351737936530188, + "learning_rate": 0.0003787265280646825, + "loss": 0.4107, + "step": 5650 + }, + { + "epoch": 0.1548660544476444, + "grad_norm": 0.3836451635236239, + "learning_rate": 0.0003783394932716577, + "loss": 0.3999, + "step": 5700 + }, + { + "epoch": 0.15622452860946584, + "grad_norm": 0.25767700494067813, + "learning_rate": 0.0003779491708670909, + "loss": 0.388, + "step": 5750 + }, + { + "epoch": 0.15758300277128728, + "grad_norm": 0.35195866802663683, + "learning_rate": 0.00037755556804640723, + "loss": 0.3986, + "step": 5800 + }, + { + "epoch": 0.15894147693310873, + "grad_norm": 0.37789059875509895, + "learning_rate": 0.00037715869206550467, + "loss": 0.4124, + "step": 5850 + }, + { + "epoch": 0.16029995109493017, + "grad_norm": 0.29714505650099465, + "learning_rate": 0.0003767585502406204, + "loss": 0.382, + "step": 5900 + }, + { + "epoch": 0.16165842525675161, + "grad_norm": 0.36569606019843054, + "learning_rate": 0.0003763551499481964, + "loss": 0.4091, + "step": 5950 + }, + { + "epoch": 0.16301689941857306, + "grad_norm": 0.5124251442347727, + "learning_rate": 0.0003759484986247426, + "loss": 0.3957, + "step": 6000 + }, + { + "epoch": 0.1643753735803945, + "grad_norm": 0.42061839044447147, + "learning_rate": 0.0003755386037667007, + "loss": 0.3939, + "step": 6050 + }, + { + "epoch": 0.16573384774221595, + "grad_norm": 0.278996914346875, + "learning_rate": 0.0003751254729303053, + "loss": 0.4171, + "step": 6100 + }, + { + "epoch": 0.1670923219040374, + "grad_norm": 0.22931110142699168, + "learning_rate": 0.0003747091137314451, + "loss": 0.4037, + "step": 6150 + }, + { + "epoch": 0.16845079606585883, + "grad_norm": 0.4632003674215028, + "learning_rate": 0.00037428953384552197, + "loss": 0.3856, + "step": 6200 + }, + { + "epoch": 0.16980927022768028, + "grad_norm": 0.3456285538182738, + "learning_rate": 0.00037386674100730986, + "loss": 0.3887, + "step": 6250 + }, + { + "epoch": 0.17116774438950172, + "grad_norm": 0.4299448792360789, + "learning_rate": 0.0003734407430108124, + "loss": 0.3802, + "step": 6300 + }, + { + "epoch": 0.17252621855132316, + "grad_norm": 0.26911515847901823, + "learning_rate": 0.0003730115477091185, + "loss": 0.3906, + "step": 6350 + }, + { + "epoch": 0.1738846927131446, + "grad_norm": 0.24894262795042146, + "learning_rate": 0.00037257916301425823, + "loss": 0.3743, + "step": 6400 + }, + { + "epoch": 0.17524316687496605, + "grad_norm": 0.36696930956387547, + "learning_rate": 0.00037214359689705676, + "loss": 0.3977, + "step": 6450 + }, + { + "epoch": 0.1766016410367875, + "grad_norm": 0.4978113756154764, + "learning_rate": 0.0003717048573869873, + "loss": 0.3782, + "step": 6500 + }, + { + "epoch": 0.1779601151986089, + "grad_norm": 0.3180001579838143, + "learning_rate": 0.00037126295257202324, + "loss": 0.3975, + "step": 6550 + }, + { + "epoch": 0.17931858936043035, + "grad_norm": 0.408582540741261, + "learning_rate": 0.0003708178905984891, + "loss": 0.3763, + "step": 6600 + }, + { + "epoch": 0.1806770635222518, + "grad_norm": 0.27659616140789006, + "learning_rate": 0.00037036967967091005, + "loss": 0.4013, + "step": 6650 + }, + { + "epoch": 0.18203553768407324, + "grad_norm": 0.16425635963494883, + "learning_rate": 0.00036991832805186107, + "loss": 0.3865, + "step": 6700 + }, + { + "epoch": 0.18339401184589468, + "grad_norm": 0.37646449133193777, + "learning_rate": 0.00036946384406181425, + "loss": 0.3892, + "step": 6750 + }, + { + "epoch": 0.18475248600771613, + "grad_norm": 0.3427684491705063, + "learning_rate": 0.0003690062360789858, + "loss": 0.3969, + "step": 6800 + }, + { + "epoch": 0.18611096016953757, + "grad_norm": 0.392156716422773, + "learning_rate": 0.0003685455125391811, + "loss": 0.3709, + "step": 6850 + }, + { + "epoch": 0.18746943433135901, + "grad_norm": 0.3626113579097081, + "learning_rate": 0.0003680816819356398, + "loss": 0.3929, + "step": 6900 + }, + { + "epoch": 0.18882790849318046, + "grad_norm": 0.3652111299862884, + "learning_rate": 0.00036761475281887863, + "loss": 0.3941, + "step": 6950 + }, + { + "epoch": 0.1901863826550019, + "grad_norm": 0.3038051956143636, + "learning_rate": 0.0003671541644021072, + "loss": 0.4019, + "step": 7000 + }, + { + "epoch": 0.19154485681682334, + "grad_norm": 0.34844140332875567, + "learning_rate": 0.00036668112567831633, + "loss": 0.3666, + "step": 7050 + }, + { + "epoch": 0.1929033309786448, + "grad_norm": 0.31453187134210386, + "learning_rate": 0.0003662050142599555, + "loss": 0.4062, + "step": 7100 + }, + { + "epoch": 0.19426180514046623, + "grad_norm": 0.29627297234275446, + "learning_rate": 0.00036572583892393305, + "loss": 0.3807, + "step": 7150 + }, + { + "epoch": 0.19562027930228768, + "grad_norm": 0.37160716135610367, + "learning_rate": 0.0003652436085036393, + "loss": 0.3936, + "step": 7200 + }, + { + "epoch": 0.19697875346410912, + "grad_norm": 0.3178371792735983, + "learning_rate": 0.0003647583318887839, + "loss": 0.3942, + "step": 7250 + }, + { + "epoch": 0.19833722762593056, + "grad_norm": 0.36580162938243826, + "learning_rate": 0.0003642700180252315, + "loss": 0.3932, + "step": 7300 + }, + { + "epoch": 0.199695701787752, + "grad_norm": 0.2503536243665017, + "learning_rate": 0.0003637786759148375, + "loss": 0.3835, + "step": 7350 + }, + { + "epoch": 0.20105417594957345, + "grad_norm": 0.3422541205714727, + "learning_rate": 0.0003632942313704729, + "loss": 0.3869, + "step": 7400 + }, + { + "epoch": 0.2024126501113949, + "grad_norm": 0.2635673669764689, + "learning_rate": 0.00036279692010693837, + "loss": 0.374, + "step": 7450 + }, + { + "epoch": 0.20377112427321634, + "grad_norm": 0.32133691105349677, + "learning_rate": 0.0003622966077524861, + "loss": 0.3829, + "step": 7500 + }, + { + "epoch": 0.20512959843503775, + "grad_norm": 0.2963373700211773, + "learning_rate": 0.0003617933035301583, + "loss": 0.3784, + "step": 7550 + }, + { + "epoch": 0.2064880725968592, + "grad_norm": 0.34175259284676457, + "learning_rate": 0.000361287016718151, + "loss": 0.3634, + "step": 7600 + }, + { + "epoch": 0.20784654675868064, + "grad_norm": 0.21695916175140184, + "learning_rate": 0.0003607777566496428, + "loss": 0.3913, + "step": 7650 + }, + { + "epoch": 0.20920502092050208, + "grad_norm": 0.38835064397473545, + "learning_rate": 0.00036027580617629013, + "loss": 0.3937, + "step": 7700 + }, + { + "epoch": 0.21056349508232353, + "grad_norm": 0.33577145967062894, + "learning_rate": 0.00035976068680901367, + "loss": 0.4041, + "step": 7750 + }, + { + "epoch": 0.21192196924414497, + "grad_norm": 0.2829507495162838, + "learning_rate": 0.0003592426223224691, + "loss": 0.3885, + "step": 7800 + }, + { + "epoch": 0.2132804434059664, + "grad_norm": 0.30282305495180506, + "learning_rate": 0.00035872162226695156, + "loss": 0.425, + "step": 7850 + }, + { + "epoch": 0.21463891756778786, + "grad_norm": 0.26572704020691806, + "learning_rate": 0.000358197696246872, + "loss": 0.3719, + "step": 7900 + }, + { + "epoch": 0.2159973917296093, + "grad_norm": 0.4362525108778928, + "learning_rate": 0.0003576708539205804, + "loss": 0.3751, + "step": 7950 + }, + { + "epoch": 0.21735586589143074, + "grad_norm": 0.25828364696271605, + "learning_rate": 0.0003571411050001875, + "loss": 0.3863, + "step": 8000 + }, + { + "epoch": 0.2187143400532522, + "grad_norm": 0.3678501770422065, + "learning_rate": 0.00035660845925138585, + "loss": 0.3931, + "step": 8050 + }, + { + "epoch": 0.22007281421507363, + "grad_norm": 0.27819454488467, + "learning_rate": 0.00035607292649326983, + "loss": 0.3633, + "step": 8100 + }, + { + "epoch": 0.22143128837689507, + "grad_norm": 0.32104809246529153, + "learning_rate": 0.00035553451659815457, + "loss": 0.3914, + "step": 8150 + }, + { + "epoch": 0.22278976253871652, + "grad_norm": 0.32593288782577173, + "learning_rate": 0.000354993239491394, + "loss": 0.3721, + "step": 8200 + }, + { + "epoch": 0.22414823670053796, + "grad_norm": 0.2740573766851601, + "learning_rate": 0.00035444910515119776, + "loss": 0.3725, + "step": 8250 + }, + { + "epoch": 0.2255067108623594, + "grad_norm": 0.6372146103791719, + "learning_rate": 0.00035390212360844744, + "loss": 0.3786, + "step": 8300 + }, + { + "epoch": 0.22686518502418085, + "grad_norm": 0.3847972604563355, + "learning_rate": 0.00035335230494651165, + "loss": 0.3807, + "step": 8350 + }, + { + "epoch": 0.2282236591860023, + "grad_norm": 0.3297830679777594, + "learning_rate": 0.00035279965930105987, + "loss": 0.3757, + "step": 8400 + }, + { + "epoch": 0.22958213334782374, + "grad_norm": 0.27775568777577214, + "learning_rate": 0.00035224419685987593, + "loss": 0.3796, + "step": 8450 + }, + { + "epoch": 0.23094060750964518, + "grad_norm": 0.2985445199508582, + "learning_rate": 0.0003516859278626702, + "loss": 0.385, + "step": 8500 + }, + { + "epoch": 0.2322990816714666, + "grad_norm": 0.2822498403343061, + "learning_rate": 0.00035112486260089026, + "loss": 0.3654, + "step": 8550 + }, + { + "epoch": 0.23365755583328804, + "grad_norm": 0.29173659614651143, + "learning_rate": 0.0003505610114175323, + "loss": 0.3693, + "step": 8600 + }, + { + "epoch": 0.23501602999510948, + "grad_norm": 0.37316571053101977, + "learning_rate": 0.00034999438470694903, + "loss": 0.3624, + "step": 8650 + }, + { + "epoch": 0.23637450415693093, + "grad_norm": 0.2991870149215513, + "learning_rate": 0.0003494249929146593, + "loss": 0.3944, + "step": 8700 + }, + { + "epoch": 0.23773297831875237, + "grad_norm": 0.2626766731044403, + "learning_rate": 0.000348852846537155, + "loss": 0.3562, + "step": 8750 + }, + { + "epoch": 0.2390914524805738, + "grad_norm": 0.3742270372557048, + "learning_rate": 0.0003482779561217074, + "loss": 0.3737, + "step": 8800 + }, + { + "epoch": 0.24044992664239526, + "grad_norm": 0.33975911074890713, + "learning_rate": 0.000347700332266173, + "loss": 0.3673, + "step": 8850 + }, + { + "epoch": 0.2418084008042167, + "grad_norm": 0.24493733544746576, + "learning_rate": 0.00034711998561879823, + "loss": 0.3863, + "step": 8900 + }, + { + "epoch": 0.24316687496603814, + "grad_norm": 0.24367293507823867, + "learning_rate": 0.00034653692687802295, + "loss": 0.3597, + "step": 8950 + }, + { + "epoch": 0.2445253491278596, + "grad_norm": 0.4992858322232737, + "learning_rate": 0.0003459511667922831, + "loss": 0.3759, + "step": 9000 + }, + { + "epoch": 0.24588382328968103, + "grad_norm": 0.34074243370950325, + "learning_rate": 0.000345362716159813, + "loss": 0.3704, + "step": 9050 + }, + { + "epoch": 0.24724229745150247, + "grad_norm": 0.37589710424283923, + "learning_rate": 0.0003447715858284458, + "loss": 0.3605, + "step": 9100 + }, + { + "epoch": 0.24860077161332392, + "grad_norm": 0.2954917370820438, + "learning_rate": 0.00034417778669541414, + "loss": 0.3619, + "step": 9150 + }, + { + "epoch": 0.24995924577514536, + "grad_norm": 0.2682683326827451, + "learning_rate": 0.00034358132970714833, + "loss": 0.3548, + "step": 9200 + }, + { + "epoch": 0.2513177199369668, + "grad_norm": 0.3334186903450503, + "learning_rate": 0.00034298222585907556, + "loss": 0.3582, + "step": 9250 + }, + { + "epoch": 0.2526761940987882, + "grad_norm": 0.3037548456565183, + "learning_rate": 0.0003423804861954165, + "loss": 0.3598, + "step": 9300 + }, + { + "epoch": 0.25403466826060966, + "grad_norm": 0.3415172861536472, + "learning_rate": 0.00034177612180898186, + "loss": 0.3596, + "step": 9350 + }, + { + "epoch": 0.2553931424224311, + "grad_norm": 0.2778208976878839, + "learning_rate": 0.0003411691438409683, + "loss": 0.3557, + "step": 9400 + }, + { + "epoch": 0.25675161658425255, + "grad_norm": 0.2722662895108223, + "learning_rate": 0.0003405595634807524, + "loss": 0.3568, + "step": 9450 + }, + { + "epoch": 0.258110090746074, + "grad_norm": 0.2470305006609605, + "learning_rate": 0.00033994739196568485, + "loss": 0.3693, + "step": 9500 + }, + { + "epoch": 0.25946856490789544, + "grad_norm": 0.3204976945527186, + "learning_rate": 0.00033933264058088323, + "loss": 0.3744, + "step": 9550 + }, + { + "epoch": 0.2608270390697169, + "grad_norm": 0.29419054157347835, + "learning_rate": 0.0003387153206590238, + "loss": 0.3578, + "step": 9600 + }, + { + "epoch": 0.2621855132315383, + "grad_norm": 0.285418283786098, + "learning_rate": 0.0003380954435801327, + "loss": 0.3666, + "step": 9650 + }, + { + "epoch": 0.26354398739335977, + "grad_norm": 0.40228260917678826, + "learning_rate": 0.0003374730207713763, + "loss": 0.3642, + "step": 9700 + }, + { + "epoch": 0.2649024615551812, + "grad_norm": 0.30226961110995426, + "learning_rate": 0.0003368480637068501, + "loss": 0.3955, + "step": 9750 + }, + { + "epoch": 0.26626093571700266, + "grad_norm": 0.445566495529195, + "learning_rate": 0.00033622058390736785, + "loss": 0.3756, + "step": 9800 + }, + { + "epoch": 0.2676194098788241, + "grad_norm": 0.387905313007887, + "learning_rate": 0.00033559059294024864, + "loss": 0.3657, + "step": 9850 + }, + { + "epoch": 0.26897788404064554, + "grad_norm": 0.22352584167074716, + "learning_rate": 0.00033495810241910385, + "loss": 0.3452, + "step": 9900 + }, + { + "epoch": 0.270336358202467, + "grad_norm": 0.2556859512831143, + "learning_rate": 0.00033432312400362305, + "loss": 0.3463, + "step": 9950 + }, + { + "epoch": 0.27169483236428843, + "grad_norm": 0.5717326212718582, + "learning_rate": 0.00033368566939935925, + "loss": 0.3731, + "step": 10000 + }, + { + "epoch": 0.2730533065261099, + "grad_norm": 0.32255486027652513, + "learning_rate": 0.0003330457503575127, + "loss": 0.3698, + "step": 10050 + }, + { + "epoch": 0.2744117806879313, + "grad_norm": 0.25946616692420554, + "learning_rate": 0.0003324033786747145, + "loss": 0.3637, + "step": 10100 + }, + { + "epoch": 0.27577025484975276, + "grad_norm": 0.3530893683862247, + "learning_rate": 0.0003317585661928094, + "loss": 0.3646, + "step": 10150 + }, + { + "epoch": 0.2771287290115742, + "grad_norm": 0.2853492397892913, + "learning_rate": 0.000331111324798637, + "loss": 0.3295, + "step": 10200 + }, + { + "epoch": 0.27848720317339565, + "grad_norm": 0.19156693197511587, + "learning_rate": 0.0003304616664238127, + "loss": 0.359, + "step": 10250 + }, + { + "epoch": 0.2798456773352171, + "grad_norm": 0.26150890027393986, + "learning_rate": 0.00032980960304450834, + "loss": 0.3665, + "step": 10300 + }, + { + "epoch": 0.28120415149703853, + "grad_norm": 0.3656668619278649, + "learning_rate": 0.00032915514668123056, + "loss": 0.3498, + "step": 10350 + }, + { + "epoch": 0.28256262565886, + "grad_norm": 0.36465329584026973, + "learning_rate": 0.00032849830939859977, + "loss": 0.3722, + "step": 10400 + }, + { + "epoch": 0.2839210998206814, + "grad_norm": 0.28300896811439313, + "learning_rate": 0.00032783910330512776, + "loss": 0.3583, + "step": 10450 + }, + { + "epoch": 0.28527957398250287, + "grad_norm": 0.2470361242697161, + "learning_rate": 0.000327177540552994, + "loss": 0.3462, + "step": 10500 + }, + { + "epoch": 0.2866380481443243, + "grad_norm": 0.36918740507242426, + "learning_rate": 0.0003265136333378223, + "loss": 0.3699, + "step": 10550 + }, + { + "epoch": 0.28799652230614575, + "grad_norm": 0.32934977838919777, + "learning_rate": 0.0003258473938984554, + "loss": 0.3625, + "step": 10600 + }, + { + "epoch": 0.2893549964679672, + "grad_norm": 0.2087576531023101, + "learning_rate": 0.0003251788345167296, + "loss": 0.3568, + "step": 10650 + }, + { + "epoch": 0.29071347062978864, + "grad_norm": 0.39857663501798557, + "learning_rate": 0.00032450796751724837, + "loss": 0.3591, + "step": 10700 + }, + { + "epoch": 0.2920719447916101, + "grad_norm": 0.32871619749282505, + "learning_rate": 0.00032383480526715526, + "loss": 0.3603, + "step": 10750 + }, + { + "epoch": 0.2934304189534315, + "grad_norm": 0.3062057976969262, + "learning_rate": 0.00032315936017590554, + "loss": 0.3575, + "step": 10800 + }, + { + "epoch": 0.29478889311525297, + "grad_norm": 0.35676299616191043, + "learning_rate": 0.0003224816446950378, + "loss": 0.3406, + "step": 10850 + }, + { + "epoch": 0.2961473672770744, + "grad_norm": 0.268494161462533, + "learning_rate": 0.00032180167131794425, + "loss": 0.3356, + "step": 10900 + }, + { + "epoch": 0.29750584143889586, + "grad_norm": 0.39304660551244835, + "learning_rate": 0.0003211194525796404, + "loss": 0.3681, + "step": 10950 + }, + { + "epoch": 0.2988643156007173, + "grad_norm": 0.36242243481768954, + "learning_rate": 0.00032043500105653414, + "loss": 0.3624, + "step": 11000 + }, + { + "epoch": 0.3002227897625387, + "grad_norm": 0.32191025299969356, + "learning_rate": 0.0003197483293661937, + "loss": 0.3639, + "step": 11050 + }, + { + "epoch": 0.30158126392436013, + "grad_norm": 0.3819533916481645, + "learning_rate": 0.0003190594501671151, + "loss": 0.3639, + "step": 11100 + }, + { + "epoch": 0.3029397380861816, + "grad_norm": 0.3051700734949664, + "learning_rate": 0.000318368376158489, + "loss": 0.3495, + "step": 11150 + }, + { + "epoch": 0.304298212248003, + "grad_norm": 0.25353257677490404, + "learning_rate": 0.00031768900650322744, + "loss": 0.3424, + "step": 11200 + }, + { + "epoch": 0.30565668640982446, + "grad_norm": 0.2971513332547502, + "learning_rate": 0.000316993624394983, + "loss": 0.3465, + "step": 11250 + }, + { + "epoch": 0.3070151605716459, + "grad_norm": 0.3393454172568527, + "learning_rate": 0.00031629608555979686, + "loss": 0.357, + "step": 11300 + }, + { + "epoch": 0.30837363473346735, + "grad_norm": 0.3079000714041467, + "learning_rate": 0.0003155964028564964, + "loss": 0.3315, + "step": 11350 + }, + { + "epoch": 0.3097321088952888, + "grad_norm": 0.236457076827118, + "learning_rate": 0.00031489458918342993, + "loss": 0.3586, + "step": 11400 + }, + { + "epoch": 0.31109058305711024, + "grad_norm": 0.37187670656153765, + "learning_rate": 0.0003141906574782295, + "loss": 0.3479, + "step": 11450 + }, + { + "epoch": 0.3124490572189317, + "grad_norm": 0.2858744824646288, + "learning_rate": 0.0003134846207175722, + "loss": 0.359, + "step": 11500 + }, + { + "epoch": 0.3138075313807531, + "grad_norm": 0.29954433740207526, + "learning_rate": 0.00031277649191694063, + "loss": 0.3466, + "step": 11550 + }, + { + "epoch": 0.31516600554257457, + "grad_norm": 0.25530263584194796, + "learning_rate": 0.0003120662841303836, + "loss": 0.3488, + "step": 11600 + }, + { + "epoch": 0.316524479704396, + "grad_norm": 0.22413446350946586, + "learning_rate": 0.0003113540104502747, + "loss": 0.3471, + "step": 11650 + }, + { + "epoch": 0.31788295386621745, + "grad_norm": 0.40702625283242805, + "learning_rate": 0.000310639684007072, + "loss": 0.3382, + "step": 11700 + }, + { + "epoch": 0.3192414280280389, + "grad_norm": 0.2866280004893114, + "learning_rate": 0.0003099233179690746, + "loss": 0.3779, + "step": 11750 + }, + { + "epoch": 0.32059990218986034, + "grad_norm": 0.313655190983661, + "learning_rate": 0.0003092049255421813, + "loss": 0.3646, + "step": 11800 + }, + { + "epoch": 0.3219583763516818, + "grad_norm": 0.34970496506197146, + "learning_rate": 0.00030848451996964615, + "loss": 0.3628, + "step": 11850 + }, + { + "epoch": 0.32331685051350323, + "grad_norm": 0.36130996602692567, + "learning_rate": 0.00030776211453183475, + "loss": 0.3608, + "step": 11900 + }, + { + "epoch": 0.3246753246753247, + "grad_norm": 0.22850628919525512, + "learning_rate": 0.00030703772254597945, + "loss": 0.326, + "step": 11950 + }, + { + "epoch": 0.3260337988371461, + "grad_norm": 0.3620511895369416, + "learning_rate": 0.00030631135736593364, + "loss": 0.349, + "step": 12000 + }, + { + "epoch": 0.32739227299896756, + "grad_norm": 0.2122923442045741, + "learning_rate": 0.0003055830323819257, + "loss": 0.3734, + "step": 12050 + }, + { + "epoch": 0.328750747160789, + "grad_norm": 0.24737840068319314, + "learning_rate": 0.00030485276102031235, + "loss": 0.358, + "step": 12100 + }, + { + "epoch": 0.33010922132261045, + "grad_norm": 0.3610838024240164, + "learning_rate": 0.0003041205567433305, + "loss": 0.3513, + "step": 12150 + }, + { + "epoch": 0.3314676954844319, + "grad_norm": 0.33939684182516894, + "learning_rate": 0.0003033864330488499, + "loss": 0.3555, + "step": 12200 + }, + { + "epoch": 0.33282616964625333, + "grad_norm": 0.2666873772787006, + "learning_rate": 0.00030265040347012397, + "loss": 0.3469, + "step": 12250 + }, + { + "epoch": 0.3341846438080748, + "grad_norm": 0.21914168729339542, + "learning_rate": 0.00030191248157554, + "loss": 0.3323, + "step": 12300 + }, + { + "epoch": 0.3355431179698962, + "grad_norm": 0.3499432909434212, + "learning_rate": 0.0003011726809683694, + "loss": 0.3321, + "step": 12350 + }, + { + "epoch": 0.33690159213171766, + "grad_norm": 0.2752315627002723, + "learning_rate": 0.0003004310152865169, + "loss": 0.366, + "step": 12400 + }, + { + "epoch": 0.3382600662935391, + "grad_norm": 0.3224312997036977, + "learning_rate": 0.0002996874982022692, + "loss": 0.3363, + "step": 12450 + }, + { + "epoch": 0.33961854045536055, + "grad_norm": 0.2614682577027786, + "learning_rate": 0.00029894214342204243, + "loss": 0.3364, + "step": 12500 + }, + { + "epoch": 0.340977014617182, + "grad_norm": 0.35386811908507626, + "learning_rate": 0.00029819496468613024, + "loss": 0.3468, + "step": 12550 + }, + { + "epoch": 0.34233548877900344, + "grad_norm": 0.3451776698004379, + "learning_rate": 0.00029744597576844995, + "loss": 0.3457, + "step": 12600 + }, + { + "epoch": 0.3436939629408249, + "grad_norm": 0.2075635034305044, + "learning_rate": 0.00029669519047628874, + "loss": 0.3217, + "step": 12650 + }, + { + "epoch": 0.3450524371026463, + "grad_norm": 0.35938341724916706, + "learning_rate": 0.0002959426226500493, + "loss": 0.3518, + "step": 12700 + }, + { + "epoch": 0.34641091126446777, + "grad_norm": 0.2028404175509972, + "learning_rate": 0.0002951882861629944, + "loss": 0.3464, + "step": 12750 + }, + { + "epoch": 0.3477693854262892, + "grad_norm": 0.3092038953376563, + "learning_rate": 0.00029443219492099153, + "loss": 0.3565, + "step": 12800 + }, + { + "epoch": 0.34912785958811066, + "grad_norm": 0.29068071721416333, + "learning_rate": 0.0002936743628622562, + "loss": 0.3315, + "step": 12850 + }, + { + "epoch": 0.3504863337499321, + "grad_norm": 0.20779330405236773, + "learning_rate": 0.0002929148039570951, + "loss": 0.3174, + "step": 12900 + }, + { + "epoch": 0.35184480791175354, + "grad_norm": 0.31923873979474604, + "learning_rate": 0.00029215353220764863, + "loss": 0.3441, + "step": 12950 + }, + { + "epoch": 0.353203282073575, + "grad_norm": 0.2745041226462606, + "learning_rate": 0.00029139056164763274, + "loss": 0.3467, + "step": 13000 + }, + { + "epoch": 0.35456175623539643, + "grad_norm": 0.4368395215278957, + "learning_rate": 0.0002906259063420803, + "loss": 0.3517, + "step": 13050 + }, + { + "epoch": 0.3559202303972178, + "grad_norm": 0.30792463599025904, + "learning_rate": 0.0002898595803870815, + "loss": 0.3442, + "step": 13100 + }, + { + "epoch": 0.35727870455903926, + "grad_norm": 0.3611952448865168, + "learning_rate": 0.0002890915979095244, + "loss": 0.3204, + "step": 13150 + }, + { + "epoch": 0.3586371787208607, + "grad_norm": 0.23056033787481225, + "learning_rate": 0.0002883219730668345, + "loss": 0.3239, + "step": 13200 + }, + { + "epoch": 0.35999565288268215, + "grad_norm": 0.2530394826085691, + "learning_rate": 0.00028755072004671314, + "loss": 0.3473, + "step": 13250 + }, + { + "epoch": 0.3613541270445036, + "grad_norm": 0.33962698046120804, + "learning_rate": 0.000286793326131175, + "loss": 0.3416, + "step": 13300 + }, + { + "epoch": 0.36271260120632504, + "grad_norm": 0.21053853436821962, + "learning_rate": 0.0002860188912935213, + "loss": 0.3278, + "step": 13350 + }, + { + "epoch": 0.3640710753681465, + "grad_norm": 0.3129818212559564, + "learning_rate": 0.00028524287073475416, + "loss": 0.3541, + "step": 13400 + }, + { + "epoch": 0.3654295495299679, + "grad_norm": 0.2699867150782398, + "learning_rate": 0.0002844652787604775, + "loss": 0.3403, + "step": 13450 + }, + { + "epoch": 0.36678802369178937, + "grad_norm": 0.28737860107629143, + "learning_rate": 0.00028368612970526357, + "loss": 0.3323, + "step": 13500 + }, + { + "epoch": 0.3681464978536108, + "grad_norm": 0.3515960746260734, + "learning_rate": 0.00028290543793238867, + "loss": 0.3293, + "step": 13550 + }, + { + "epoch": 0.36950497201543225, + "grad_norm": 0.20401533840321576, + "learning_rate": 0.0002821232178335684, + "loss": 0.3316, + "step": 13600 + }, + { + "epoch": 0.3708634461772537, + "grad_norm": 0.19995437318728543, + "learning_rate": 0.0002813551732516669, + "loss": 0.3427, + "step": 13650 + }, + { + "epoch": 0.37222192033907514, + "grad_norm": 0.2545451160615089, + "learning_rate": 0.00028056996963593105, + "loss": 0.3246, + "step": 13700 + }, + { + "epoch": 0.3735803945008966, + "grad_norm": 0.29065996361482416, + "learning_rate": 0.0002797832807475994, + "loss": 0.3377, + "step": 13750 + }, + { + "epoch": 0.37493886866271803, + "grad_norm": 0.3334762345639782, + "learning_rate": 0.00027899512108894186, + "loss": 0.3281, + "step": 13800 + }, + { + "epoch": 0.37629734282453947, + "grad_norm": 0.18363139112462235, + "learning_rate": 0.00027820550518934127, + "loss": 0.3498, + "step": 13850 + }, + { + "epoch": 0.3776558169863609, + "grad_norm": 0.303677922590966, + "learning_rate": 0.00027741444760502593, + "loss": 0.3282, + "step": 13900 + }, + { + "epoch": 0.37901429114818236, + "grad_norm": 0.33021307742532524, + "learning_rate": 0.0002766378265036753, + "loss": 0.3612, + "step": 13950 + }, + { + "epoch": 0.3803727653100038, + "grad_norm": 0.3855197948015209, + "learning_rate": 0.00027584395743117087, + "loss": 0.326, + "step": 14000 + }, + { + "epoch": 0.38173123947182525, + "grad_norm": 0.17305752786285836, + "learning_rate": 0.0002750486902080647, + "loss": 0.3306, + "step": 14050 + }, + { + "epoch": 0.3830897136336467, + "grad_norm": 0.3557889572340088, + "learning_rate": 0.0002742520394947646, + "loss": 0.3363, + "step": 14100 + }, + { + "epoch": 0.38444818779546813, + "grad_norm": 0.269254653829798, + "learning_rate": 0.0002734540199771824, + "loss": 0.3509, + "step": 14150 + }, + { + "epoch": 0.3858066619572896, + "grad_norm": 0.46153677475953025, + "learning_rate": 0.00027265464636646333, + "loss": 0.3423, + "step": 14200 + }, + { + "epoch": 0.387165136119111, + "grad_norm": 0.25450280604338793, + "learning_rate": 0.0002718539333987147, + "loss": 0.3344, + "step": 14250 + }, + { + "epoch": 0.38852361028093246, + "grad_norm": 0.24854855950361845, + "learning_rate": 0.00027105189583473416, + "loss": 0.317, + "step": 14300 + }, + { + "epoch": 0.3898820844427539, + "grad_norm": 0.25191512294105933, + "learning_rate": 0.00027024854845973797, + "loss": 0.3343, + "step": 14350 + }, + { + "epoch": 0.39124055860457535, + "grad_norm": 0.3399094367009323, + "learning_rate": 0.000269443906083088, + "loss": 0.3141, + "step": 14400 + }, + { + "epoch": 0.3925990327663968, + "grad_norm": 0.27297702861099216, + "learning_rate": 0.00026863798353801905, + "loss": 0.344, + "step": 14450 + }, + { + "epoch": 0.39395750692821824, + "grad_norm": 0.3089505317673794, + "learning_rate": 0.000267830795681365, + "loss": 0.3248, + "step": 14500 + }, + { + "epoch": 0.3953159810900397, + "grad_norm": 0.28407318632921835, + "learning_rate": 0.0002670223573932857, + "loss": 0.3218, + "step": 14550 + }, + { + "epoch": 0.3966744552518611, + "grad_norm": 0.27517856010825675, + "learning_rate": 0.0002662126835769916, + "loss": 0.3207, + "step": 14600 + }, + { + "epoch": 0.39803292941368257, + "grad_norm": 0.2209431864475645, + "learning_rate": 0.00026540178915847003, + "loss": 0.3213, + "step": 14650 + }, + { + "epoch": 0.399391403575504, + "grad_norm": 0.3012179785372981, + "learning_rate": 0.0002645896890862093, + "loss": 0.3031, + "step": 14700 + }, + { + "epoch": 0.40074987773732546, + "grad_norm": 0.35758174495742123, + "learning_rate": 0.0002637763983309235, + "loss": 0.3244, + "step": 14750 + }, + { + "epoch": 0.4021083518991469, + "grad_norm": 0.20197976836253828, + "learning_rate": 0.00026296193188527655, + "loss": 0.3211, + "step": 14800 + }, + { + "epoch": 0.40346682606096834, + "grad_norm": 0.2784592655459722, + "learning_rate": 0.0002621463047636057, + "loss": 0.3233, + "step": 14850 + }, + { + "epoch": 0.4048253002227898, + "grad_norm": 0.3125528717241462, + "learning_rate": 0.0002613295320016445, + "loss": 0.324, + "step": 14900 + }, + { + "epoch": 0.40618377438461123, + "grad_norm": 0.3316178107391592, + "learning_rate": 0.00026051162865624636, + "loss": 0.3358, + "step": 14950 + }, + { + "epoch": 0.4075422485464327, + "grad_norm": 0.18439640337971394, + "learning_rate": 0.00025969260980510605, + "loss": 0.3031, + "step": 15000 + }, + { + "epoch": 0.4089007227082541, + "grad_norm": 0.2963162351967641, + "learning_rate": 0.00025887249054648245, + "loss": 0.3276, + "step": 15050 + }, + { + "epoch": 0.4102591968700755, + "grad_norm": 0.22466036509634918, + "learning_rate": 0.00025805128599891994, + "loss": 0.3364, + "step": 15100 + }, + { + "epoch": 0.41161767103189695, + "grad_norm": 0.2956284294357639, + "learning_rate": 0.00025722901130096975, + "loss": 0.3314, + "step": 15150 + }, + { + "epoch": 0.4129761451937184, + "grad_norm": 0.36079018502753485, + "learning_rate": 0.00025642215844549676, + "loss": 0.3351, + "step": 15200 + }, + { + "epoch": 0.41433461935553983, + "grad_norm": 0.3005152688044544, + "learning_rate": 0.00025559780958847773, + "loss": 0.3202, + "step": 15250 + }, + { + "epoch": 0.4156930935173613, + "grad_norm": 0.33773262295043566, + "learning_rate": 0.00025477243580984904, + "loss": 0.3089, + "step": 15300 + }, + { + "epoch": 0.4170515676791827, + "grad_norm": 0.3045253275707874, + "learning_rate": 0.00025394605232501987, + "loss": 0.32, + "step": 15350 + }, + { + "epoch": 0.41841004184100417, + "grad_norm": 0.34532028635872886, + "learning_rate": 0.0002531186743680128, + "loss": 0.3449, + "step": 15400 + }, + { + "epoch": 0.4197685160028256, + "grad_norm": 0.14134553464927366, + "learning_rate": 0.0002522903171911834, + "loss": 0.3184, + "step": 15450 + }, + { + "epoch": 0.42112699016464705, + "grad_norm": 0.2761208223745771, + "learning_rate": 0.00025146099606493817, + "loss": 0.3133, + "step": 15500 + }, + { + "epoch": 0.4224854643264685, + "grad_norm": 0.26549068110573254, + "learning_rate": 0.0002506307262774542, + "loss": 0.3205, + "step": 15550 + }, + { + "epoch": 0.42384393848828994, + "grad_norm": 0.2773532590377829, + "learning_rate": 0.00024979952313439636, + "loss": 0.3064, + "step": 15600 + }, + { + "epoch": 0.4252024126501114, + "grad_norm": 0.3073493622335924, + "learning_rate": 0.0002489674019586356, + "loss": 0.3188, + "step": 15650 + }, + { + "epoch": 0.4265608868119328, + "grad_norm": 0.21684387112528378, + "learning_rate": 0.0002481343780899665, + "loss": 0.3198, + "step": 15700 + }, + { + "epoch": 0.42791936097375427, + "grad_norm": 0.25977297083107986, + "learning_rate": 0.00024730046688482436, + "loss": 0.3065, + "step": 15750 + }, + { + "epoch": 0.4292778351355757, + "grad_norm": 0.3308957326333168, + "learning_rate": 0.000246465683716002, + "loss": 0.3085, + "step": 15800 + }, + { + "epoch": 0.43063630929739716, + "grad_norm": 0.25944521132960924, + "learning_rate": 0.0002456300439723668, + "loss": 0.3136, + "step": 15850 + }, + { + "epoch": 0.4319947834592186, + "grad_norm": 0.22121128637476822, + "learning_rate": 0.0002447935630585764, + "loss": 0.322, + "step": 15900 + }, + { + "epoch": 0.43335325762104004, + "grad_norm": 0.32019002146360315, + "learning_rate": 0.0002439562563947953, + "loss": 0.3103, + "step": 15950 + }, + { + "epoch": 0.4347117317828615, + "grad_norm": 0.26761100791647713, + "learning_rate": 0.0002431181394164103, + "loss": 0.3114, + "step": 16000 + }, + { + "epoch": 0.43607020594468293, + "grad_norm": 0.22262870758692213, + "learning_rate": 0.00024227922757374582, + "loss": 0.3069, + "step": 16050 + }, + { + "epoch": 0.4374286801065044, + "grad_norm": 0.18940890843015826, + "learning_rate": 0.00024143953633177937, + "loss": 0.327, + "step": 16100 + }, + { + "epoch": 0.4387871542683258, + "grad_norm": 0.27459192854267717, + "learning_rate": 0.00024059908116985654, + "loss": 0.3183, + "step": 16150 + }, + { + "epoch": 0.44014562843014726, + "grad_norm": 0.36514373383887516, + "learning_rate": 0.00023975787758140525, + "loss": 0.2878, + "step": 16200 + }, + { + "epoch": 0.4415041025919687, + "grad_norm": 0.30714779342945764, + "learning_rate": 0.00023891594107365024, + "loss": 0.3173, + "step": 16250 + }, + { + "epoch": 0.44286257675379015, + "grad_norm": 0.24572160078772548, + "learning_rate": 0.0002380732871673276, + "loss": 0.3169, + "step": 16300 + }, + { + "epoch": 0.4442210509156116, + "grad_norm": 0.22451585676228034, + "learning_rate": 0.00023722993139639806, + "loss": 0.2982, + "step": 16350 + }, + { + "epoch": 0.44557952507743304, + "grad_norm": 0.4312837719351318, + "learning_rate": 0.000236436550903555, + "loss": 0.3126, + "step": 16400 + }, + { + "epoch": 0.4469379992392545, + "grad_norm": 0.23649698073314787, + "learning_rate": 0.00023559187786324523, + "loss": 0.3229, + "step": 16450 + }, + { + "epoch": 0.4482964734010759, + "grad_norm": 0.21885398793120167, + "learning_rate": 0.00023474654870203753, + "loss": 0.3066, + "step": 16500 + }, + { + "epoch": 0.44965494756289737, + "grad_norm": 0.2522766751448378, + "learning_rate": 0.00023390057900320987, + "loss": 0.3121, + "step": 16550 + }, + { + "epoch": 0.4510134217247188, + "grad_norm": 0.2023032182722522, + "learning_rate": 0.0002330539843618484, + "loss": 0.295, + "step": 16600 + }, + { + "epoch": 0.45237189588654025, + "grad_norm": 0.3093016288187825, + "learning_rate": 0.00023220678038455975, + "loss": 0.2962, + "step": 16650 + }, + { + "epoch": 0.4537303700483617, + "grad_norm": 0.2805332120341892, + "learning_rate": 0.00023135898268918323, + "loss": 0.313, + "step": 16700 + }, + { + "epoch": 0.45508884421018314, + "grad_norm": 0.25366173411593823, + "learning_rate": 0.00023051060690450337, + "loss": 0.308, + "step": 16750 + }, + { + "epoch": 0.4564473183720046, + "grad_norm": 0.2848859608687515, + "learning_rate": 0.00022966166866996134, + "loss": 0.2966, + "step": 16800 + }, + { + "epoch": 0.45780579253382603, + "grad_norm": 0.3400405221454168, + "learning_rate": 0.0002288121836353669, + "loss": 0.313, + "step": 16850 + }, + { + "epoch": 0.45916426669564747, + "grad_norm": 0.25178053552917457, + "learning_rate": 0.0002279621674606098, + "loss": 0.3008, + "step": 16900 + }, + { + "epoch": 0.4605227408574689, + "grad_norm": 0.2173042023564375, + "learning_rate": 0.00022711163581537106, + "loss": 0.3062, + "step": 16950 + }, + { + "epoch": 0.46188121501929036, + "grad_norm": 0.23983156392471572, + "learning_rate": 0.00022626060437883435, + "loss": 0.3055, + "step": 17000 + }, + { + "epoch": 0.4632396891811118, + "grad_norm": 0.21031049569805663, + "learning_rate": 0.00022540908883939668, + "loss": 0.311, + "step": 17050 + }, + { + "epoch": 0.4645981633429332, + "grad_norm": 0.1989362043985782, + "learning_rate": 0.00022455710489437927, + "loss": 0.3259, + "step": 17100 + }, + { + "epoch": 0.46595663750475463, + "grad_norm": 0.1900668237823236, + "learning_rate": 0.00022370466824973812, + "loss": 0.2797, + "step": 17150 + }, + { + "epoch": 0.4673151116665761, + "grad_norm": 0.20840494435322787, + "learning_rate": 0.00022285179461977483, + "loss": 0.3079, + "step": 17200 + }, + { + "epoch": 0.4686735858283975, + "grad_norm": 0.4022346673956682, + "learning_rate": 0.00022199849972684633, + "loss": 0.2958, + "step": 17250 + }, + { + "epoch": 0.47003205999021896, + "grad_norm": 0.3270990206921089, + "learning_rate": 0.0002211447993010755, + "loss": 0.3313, + "step": 17300 + }, + { + "epoch": 0.4713905341520404, + "grad_norm": 0.249803246986443, + "learning_rate": 0.00022029070908006096, + "loss": 0.3104, + "step": 17350 + }, + { + "epoch": 0.47274900831386185, + "grad_norm": 0.2813145656422356, + "learning_rate": 0.0002194362448085872, + "loss": 0.3039, + "step": 17400 + }, + { + "epoch": 0.4741074824756833, + "grad_norm": 0.20904103519051825, + "learning_rate": 0.00021858142223833395, + "loss": 0.3093, + "step": 17450 + }, + { + "epoch": 0.47546595663750474, + "grad_norm": 0.2476519540180904, + "learning_rate": 0.00021772625712758624, + "loss": 0.3133, + "step": 17500 + }, + { + "epoch": 0.4768244307993262, + "grad_norm": 0.2897735958185, + "learning_rate": 0.00021687076524094353, + "loss": 0.3184, + "step": 17550 + }, + { + "epoch": 0.4781829049611476, + "grad_norm": 0.36797022439353905, + "learning_rate": 0.0002160149623490293, + "loss": 0.2982, + "step": 17600 + }, + { + "epoch": 0.47954137912296907, + "grad_norm": 0.22151406862910683, + "learning_rate": 0.0002151588642282003, + "loss": 0.3031, + "step": 17650 + }, + { + "epoch": 0.4808998532847905, + "grad_norm": 0.24573689529627643, + "learning_rate": 0.00021430248666025561, + "loss": 0.2927, + "step": 17700 + }, + { + "epoch": 0.48225832744661196, + "grad_norm": 0.25110843175386494, + "learning_rate": 0.0002134458454321459, + "loss": 0.2984, + "step": 17750 + }, + { + "epoch": 0.4836168016084334, + "grad_norm": 0.26920027208505604, + "learning_rate": 0.00021258895633568238, + "loss": 0.2869, + "step": 17800 + }, + { + "epoch": 0.48497527577025484, + "grad_norm": 0.3111889899596438, + "learning_rate": 0.0002117318351672454, + "loss": 0.3215, + "step": 17850 + }, + { + "epoch": 0.4863337499320763, + "grad_norm": 0.20320042839557148, + "learning_rate": 0.00021087449772749347, + "loss": 0.3019, + "step": 17900 + }, + { + "epoch": 0.48769222409389773, + "grad_norm": 0.29026043340389285, + "learning_rate": 0.00021001695982107217, + "loss": 0.3087, + "step": 17950 + }, + { + "epoch": 0.4890506982557192, + "grad_norm": 0.26193168931031524, + "learning_rate": 0.00020915923725632244, + "loss": 0.3036, + "step": 18000 + }, + { + "epoch": 0.4904091724175406, + "grad_norm": 0.23673083795318206, + "learning_rate": 0.0002083013458449893, + "loss": 0.3111, + "step": 18050 + }, + { + "epoch": 0.49176764657936206, + "grad_norm": 0.2259659757224692, + "learning_rate": 0.00020744330140193046, + "loss": 0.2883, + "step": 18100 + }, + { + "epoch": 0.4931261207411835, + "grad_norm": 0.2902171908048496, + "learning_rate": 0.00020658511974482475, + "loss": 0.2898, + "step": 18150 + }, + { + "epoch": 0.49448459490300495, + "grad_norm": 0.31472212166057917, + "learning_rate": 0.0002057268166938803, + "loss": 0.3111, + "step": 18200 + }, + { + "epoch": 0.4958430690648264, + "grad_norm": 0.27417754560735935, + "learning_rate": 0.00020486840807154325, + "loss": 0.3013, + "step": 18250 + }, + { + "epoch": 0.49720154322664784, + "grad_norm": 0.24533216444780298, + "learning_rate": 0.0002040099097022059, + "loss": 0.3073, + "step": 18300 + }, + { + "epoch": 0.4985600173884693, + "grad_norm": 0.2597365406230817, + "learning_rate": 0.0002031513374119148, + "loss": 0.2918, + "step": 18350 + }, + { + "epoch": 0.4999184915502907, + "grad_norm": 0.23849823607914308, + "learning_rate": 0.00020229270702807952, + "loss": 0.3044, + "step": 18400 + }, + { + "epoch": 0.5012769657121121, + "grad_norm": 0.40233301575689023, + "learning_rate": 0.0002014340343791802, + "loss": 0.3086, + "step": 18450 + }, + { + "epoch": 0.5026354398739336, + "grad_norm": 0.24678497017149986, + "learning_rate": 0.00020057533529447647, + "loss": 0.2947, + "step": 18500 + }, + { + "epoch": 0.503993914035755, + "grad_norm": 0.18418790064404403, + "learning_rate": 0.000199716625603715, + "loss": 0.2802, + "step": 18550 + }, + { + "epoch": 0.5053523881975764, + "grad_norm": 0.20614362466496808, + "learning_rate": 0.00019887509507259376, + "loss": 0.3082, + "step": 18600 + }, + { + "epoch": 0.5067108623593979, + "grad_norm": 0.3176004501620565, + "learning_rate": 0.0001980164110832425, + "loss": 0.2946, + "step": 18650 + }, + { + "epoch": 0.5080693365212193, + "grad_norm": 0.24434247355813202, + "learning_rate": 0.00019715776366049622, + "loss": 0.2852, + "step": 18700 + }, + { + "epoch": 0.5094278106830408, + "grad_norm": 0.2632819823395696, + "learning_rate": 0.00019629916863314945, + "loss": 0.3119, + "step": 18750 + }, + { + "epoch": 0.5107862848448622, + "grad_norm": 0.36866015249871253, + "learning_rate": 0.00019544064182903077, + "loss": 0.3064, + "step": 18800 + }, + { + "epoch": 0.5121447590066837, + "grad_norm": 0.28334197775915865, + "learning_rate": 0.000194582199074711, + "loss": 0.2982, + "step": 18850 + }, + { + "epoch": 0.5135032331685051, + "grad_norm": 0.29353450964831995, + "learning_rate": 0.00019372385619521155, + "loss": 0.2997, + "step": 18900 + }, + { + "epoch": 0.5148617073303265, + "grad_norm": 0.30235983080661416, + "learning_rate": 0.00019286562901371282, + "loss": 0.2953, + "step": 18950 + }, + { + "epoch": 0.516220181492148, + "grad_norm": 0.24006103860300088, + "learning_rate": 0.0001920075333512621, + "loss": 0.312, + "step": 19000 + }, + { + "epoch": 0.5175786556539694, + "grad_norm": 0.25401074594196943, + "learning_rate": 0.00019114958502648258, + "loss": 0.2928, + "step": 19050 + }, + { + "epoch": 0.5189371298157909, + "grad_norm": 0.3126940882002115, + "learning_rate": 0.00019029179985528095, + "loss": 0.2881, + "step": 19100 + }, + { + "epoch": 0.5202956039776123, + "grad_norm": 0.244186090338719, + "learning_rate": 0.00018945134391851735, + "loss": 0.2844, + "step": 19150 + }, + { + "epoch": 0.5216540781394338, + "grad_norm": 0.2620555496999193, + "learning_rate": 0.00018861107474107164, + "loss": 0.3033, + "step": 19200 + }, + { + "epoch": 0.5230125523012552, + "grad_norm": 0.29660068432502984, + "learning_rate": 0.00018775386516779982, + "loss": 0.2815, + "step": 19250 + }, + { + "epoch": 0.5243710264630767, + "grad_norm": 0.24636353127452668, + "learning_rate": 0.0001868968813467351, + "loss": 0.2982, + "step": 19300 + }, + { + "epoch": 0.5257295006248981, + "grad_norm": 0.3036729051937609, + "learning_rate": 0.00018604013907600413, + "loss": 0.2697, + "step": 19350 + }, + { + "epoch": 0.5270879747867195, + "grad_norm": 0.25151244998729483, + "learning_rate": 0.00018518365414928073, + "loss": 0.3005, + "step": 19400 + }, + { + "epoch": 0.528446448948541, + "grad_norm": 0.3900757856018299, + "learning_rate": 0.00018432744235549457, + "loss": 0.3163, + "step": 19450 + }, + { + "epoch": 0.5298049231103624, + "grad_norm": 0.3209166901430777, + "learning_rate": 0.0001834715194785403, + "loss": 0.2946, + "step": 19500 + }, + { + "epoch": 0.5311633972721839, + "grad_norm": 0.20611000381285643, + "learning_rate": 0.00018261590129698663, + "loss": 0.2877, + "step": 19550 + }, + { + "epoch": 0.5325218714340053, + "grad_norm": 0.21332069721707292, + "learning_rate": 0.00018176060358378503, + "loss": 0.2916, + "step": 19600 + }, + { + "epoch": 0.5338803455958268, + "grad_norm": 0.34732582027624836, + "learning_rate": 0.00018090564210597975, + "loss": 0.3057, + "step": 19650 + }, + { + "epoch": 0.5352388197576482, + "grad_norm": 0.23660042062818817, + "learning_rate": 0.00018005103262441622, + "loss": 0.2746, + "step": 19700 + }, + { + "epoch": 0.5365972939194696, + "grad_norm": 0.23653513107119012, + "learning_rate": 0.00017919679089345122, + "loss": 0.295, + "step": 19750 + }, + { + "epoch": 0.5379557680812911, + "grad_norm": 0.2066174691631555, + "learning_rate": 0.00017834293266066222, + "loss": 0.2896, + "step": 19800 + }, + { + "epoch": 0.5393142422431125, + "grad_norm": 0.26332165957058984, + "learning_rate": 0.00017748947366655687, + "loss": 0.2811, + "step": 19850 + }, + { + "epoch": 0.540672716404934, + "grad_norm": 0.22960074466120436, + "learning_rate": 0.00017663642964428318, + "loss": 0.2846, + "step": 19900 + }, + { + "epoch": 0.5420311905667554, + "grad_norm": 0.3090166915756585, + "learning_rate": 0.00017578381631933946, + "loss": 0.2924, + "step": 19950 + }, + { + "epoch": 0.5433896647285769, + "grad_norm": 0.36568571497107416, + "learning_rate": 0.00017493164940928402, + "loss": 0.2865, + "step": 20000 + }, + { + "epoch": 0.5447481388903983, + "grad_norm": 0.29059486954556535, + "learning_rate": 0.00017407994462344584, + "loss": 0.2785, + "step": 20050 + }, + { + "epoch": 0.5461066130522197, + "grad_norm": 0.27957466708084117, + "learning_rate": 0.00017322871766263487, + "loss": 0.2935, + "step": 20100 + }, + { + "epoch": 0.5474650872140412, + "grad_norm": 0.2151461608605068, + "learning_rate": 0.00017237798421885253, + "loss": 0.2841, + "step": 20150 + }, + { + "epoch": 0.5488235613758626, + "grad_norm": 0.24819887268532007, + "learning_rate": 0.00017152775997500238, + "loss": 0.285, + "step": 20200 + }, + { + "epoch": 0.5501820355376841, + "grad_norm": 0.20284647935207317, + "learning_rate": 0.0001706780606046013, + "loss": 0.2927, + "step": 20250 + }, + { + "epoch": 0.5515405096995055, + "grad_norm": 0.19244100345976062, + "learning_rate": 0.0001698289017714902, + "loss": 0.2645, + "step": 20300 + }, + { + "epoch": 0.552898983861327, + "grad_norm": 0.22539860380829246, + "learning_rate": 0.00016898029912954546, + "loss": 0.2939, + "step": 20350 + }, + { + "epoch": 0.5542574580231484, + "grad_norm": 0.2619800733732195, + "learning_rate": 0.00016813226832239025, + "loss": 0.2836, + "step": 20400 + }, + { + "epoch": 0.5556159321849699, + "grad_norm": 0.23393114722266678, + "learning_rate": 0.00016728482498310637, + "loss": 0.2736, + "step": 20450 + }, + { + "epoch": 0.5569744063467913, + "grad_norm": 0.30087081995833126, + "learning_rate": 0.00016643798473394566, + "loss": 0.2794, + "step": 20500 + }, + { + "epoch": 0.5583328805086127, + "grad_norm": 0.308240444312431, + "learning_rate": 0.00016559176318604258, + "loss": 0.2671, + "step": 20550 + }, + { + "epoch": 0.5596913546704342, + "grad_norm": 0.24052215603123736, + "learning_rate": 0.00016474617593912583, + "loss": 0.2874, + "step": 20600 + }, + { + "epoch": 0.5610498288322556, + "grad_norm": 0.2750519886277399, + "learning_rate": 0.00016390123858123118, + "loss": 0.2732, + "step": 20650 + }, + { + "epoch": 0.5624083029940771, + "grad_norm": 0.2175806661894403, + "learning_rate": 0.0001630569666884139, + "loss": 0.2885, + "step": 20700 + }, + { + "epoch": 0.5637667771558985, + "grad_norm": 0.2923956849374819, + "learning_rate": 0.00016221337582446172, + "loss": 0.2924, + "step": 20750 + }, + { + "epoch": 0.56512525131772, + "grad_norm": 0.2708091098394788, + "learning_rate": 0.00016137048154060785, + "loss": 0.2705, + "step": 20800 + }, + { + "epoch": 0.5664837254795414, + "grad_norm": 0.260062882274282, + "learning_rate": 0.0001605282993752446, + "loss": 0.2833, + "step": 20850 + }, + { + "epoch": 0.5678421996413628, + "grad_norm": 0.28046003747194964, + "learning_rate": 0.00015968684485363635, + "loss": 0.2875, + "step": 20900 + }, + { + "epoch": 0.5692006738031843, + "grad_norm": 0.18648990278831484, + "learning_rate": 0.0001588461334876338, + "loss": 0.2788, + "step": 20950 + }, + { + "epoch": 0.5705591479650057, + "grad_norm": 0.26108175409809964, + "learning_rate": 0.000158006180775388, + "loss": 0.2809, + "step": 21000 + }, + { + "epoch": 0.5719176221268272, + "grad_norm": 0.15533902511877934, + "learning_rate": 0.0001571670022010644, + "loss": 0.2808, + "step": 21050 + }, + { + "epoch": 0.5732760962886486, + "grad_norm": 0.17785716374013105, + "learning_rate": 0.0001563286132345576, + "loss": 0.2854, + "step": 21100 + }, + { + "epoch": 0.5746345704504701, + "grad_norm": 0.2493856351979774, + "learning_rate": 0.00015549102933120625, + "loss": 0.2672, + "step": 21150 + }, + { + "epoch": 0.5759930446122915, + "grad_norm": 0.37551758591172574, + "learning_rate": 0.00015467099305876942, + "loss": 0.2883, + "step": 21200 + }, + { + "epoch": 0.577351518774113, + "grad_norm": 0.21750010428694388, + "learning_rate": 0.00015383504871844582, + "loss": 0.2779, + "step": 21250 + }, + { + "epoch": 0.5787099929359344, + "grad_norm": 0.19042627120914027, + "learning_rate": 0.00015299995540906267, + "loss": 0.2764, + "step": 21300 + }, + { + "epoch": 0.5800684670977558, + "grad_norm": 0.2797732165932674, + "learning_rate": 0.0001521657285252044, + "loss": 0.2922, + "step": 21350 + }, + { + "epoch": 0.5814269412595773, + "grad_norm": 0.3591848479346681, + "learning_rate": 0.00015133238344548327, + "loss": 0.2884, + "step": 21400 + }, + { + "epoch": 0.5827854154213987, + "grad_norm": 0.21764914836042967, + "learning_rate": 0.00015049993553225608, + "loss": 0.2715, + "step": 21450 + }, + { + "epoch": 0.5841438895832202, + "grad_norm": 0.26727180336133755, + "learning_rate": 0.0001496684001313406, + "loss": 0.2753, + "step": 21500 + }, + { + "epoch": 0.5855023637450416, + "grad_norm": 0.21915535565528904, + "learning_rate": 0.00014883779257173285, + "loss": 0.265, + "step": 21550 + }, + { + "epoch": 0.586860837906863, + "grad_norm": 0.25668689734119876, + "learning_rate": 0.0001480081281653244, + "loss": 0.2762, + "step": 21600 + }, + { + "epoch": 0.5882193120686845, + "grad_norm": 0.2834782294538094, + "learning_rate": 0.00014717942220662038, + "loss": 0.28, + "step": 21650 + }, + { + "epoch": 0.5895777862305059, + "grad_norm": 0.24516802954697497, + "learning_rate": 0.00014635168997245712, + "loss": 0.2755, + "step": 21700 + }, + { + "epoch": 0.5909362603923274, + "grad_norm": 0.22053403799293927, + "learning_rate": 0.00014552494672172113, + "loss": 0.2732, + "step": 21750 + }, + { + "epoch": 0.5922947345541488, + "grad_norm": 0.297493134455997, + "learning_rate": 0.00014469920769506704, + "loss": 0.2819, + "step": 21800 + }, + { + "epoch": 0.5936532087159703, + "grad_norm": 0.26448034669148435, + "learning_rate": 0.00014387448811463722, + "loss": 0.2947, + "step": 21850 + }, + { + "epoch": 0.5950116828777917, + "grad_norm": 0.1887478278727578, + "learning_rate": 0.00014305080318378105, + "loss": 0.2573, + "step": 21900 + }, + { + "epoch": 0.5963701570396132, + "grad_norm": 0.24486372742215648, + "learning_rate": 0.0001422281680867744, + "loss": 0.2762, + "step": 21950 + }, + { + "epoch": 0.5977286312014346, + "grad_norm": 0.22891270758035537, + "learning_rate": 0.00014140659798854012, + "loss": 0.2816, + "step": 22000 + }, + { + "epoch": 0.599087105363256, + "grad_norm": 0.25531740500430156, + "learning_rate": 0.00014058610803436813, + "loss": 0.2544, + "step": 22050 + }, + { + "epoch": 0.6004455795250774, + "grad_norm": 0.2198360405690994, + "learning_rate": 0.00013976671334963648, + "loss": 0.27, + "step": 22100 + }, + { + "epoch": 0.6018040536868988, + "grad_norm": 0.22767226535607382, + "learning_rate": 0.0001389484290395323, + "loss": 0.2869, + "step": 22150 + }, + { + "epoch": 0.6031625278487203, + "grad_norm": 0.2694860139304321, + "learning_rate": 0.00013813127018877331, + "loss": 0.2752, + "step": 22200 + }, + { + "epoch": 0.6045210020105417, + "grad_norm": 0.19898660564261053, + "learning_rate": 0.00013731525186133026, + "loss": 0.2624, + "step": 22250 + }, + { + "epoch": 0.6058794761723632, + "grad_norm": 0.23150351646391246, + "learning_rate": 0.00013653296123522198, + "loss": 0.2718, + "step": 22300 + }, + { + "epoch": 0.6072379503341846, + "grad_norm": 0.24064115266253058, + "learning_rate": 0.00013571922195028266, + "loss": 0.2812, + "step": 22350 + }, + { + "epoch": 0.608596424496006, + "grad_norm": 0.25687846535740555, + "learning_rate": 0.0001349066676537268, + "loss": 0.262, + "step": 22400 + }, + { + "epoch": 0.6099548986578275, + "grad_norm": 0.20024379738006956, + "learning_rate": 0.00013409531332464196, + "loss": 0.2796, + "step": 22450 + }, + { + "epoch": 0.6113133728196489, + "grad_norm": 0.30669943060449323, + "learning_rate": 0.00013328517391999483, + "loss": 0.2748, + "step": 22500 + }, + { + "epoch": 0.6126718469814704, + "grad_norm": 0.26517225209707274, + "learning_rate": 0.00013247626437435539, + "loss": 0.2641, + "step": 22550 + }, + { + "epoch": 0.6140303211432918, + "grad_norm": 0.23089105114814368, + "learning_rate": 0.0001316685995996218, + "loss": 0.2716, + "step": 22600 + }, + { + "epoch": 0.6153887953051133, + "grad_norm": 0.3141172219746477, + "learning_rate": 0.0001308621944847455, + "loss": 0.2601, + "step": 22650 + }, + { + "epoch": 0.6167472694669347, + "grad_norm": 0.2290976880794265, + "learning_rate": 0.0001300570638954565, + "loss": 0.2805, + "step": 22700 + }, + { + "epoch": 0.6181057436287561, + "grad_norm": 0.21218409171582492, + "learning_rate": 0.0001292532226739894, + "loss": 0.2686, + "step": 22750 + }, + { + "epoch": 0.6194642177905776, + "grad_norm": 0.22628948026088308, + "learning_rate": 0.0001284506856388101, + "loss": 0.2688, + "step": 22800 + }, + { + "epoch": 0.620822691952399, + "grad_norm": 0.2948337400203754, + "learning_rate": 0.00012764946758434225, + "loss": 0.2655, + "step": 22850 + }, + { + "epoch": 0.6221811661142205, + "grad_norm": 0.3340188815254344, + "learning_rate": 0.00012684958328069453, + "loss": 0.2754, + "step": 22900 + }, + { + "epoch": 0.6235396402760419, + "grad_norm": 0.2767372638913053, + "learning_rate": 0.0001260510474733888, + "loss": 0.2602, + "step": 22950 + }, + { + "epoch": 0.6248981144378634, + "grad_norm": 0.270894988791611, + "learning_rate": 0.00012525387488308783, + "loss": 0.2564, + "step": 23000 + }, + { + "epoch": 0.6262565885996848, + "grad_norm": 0.20130647702859084, + "learning_rate": 0.000124458080205324, + "loss": 0.2699, + "step": 23050 + }, + { + "epoch": 0.6276150627615062, + "grad_norm": 0.2606352685620501, + "learning_rate": 0.0001236795524100573, + "loss": 0.2777, + "step": 23100 + }, + { + "epoch": 0.6289735369233277, + "grad_norm": 0.26862575508349007, + "learning_rate": 0.00012288652925419885, + "loss": 0.27, + "step": 23150 + }, + { + "epoch": 0.6303320110851491, + "grad_norm": 0.2264767237464518, + "learning_rate": 0.00012209492765187177, + "loss": 0.2717, + "step": 23200 + }, + { + "epoch": 0.6316904852469706, + "grad_norm": 0.3116565801871334, + "learning_rate": 0.00012130476219590986, + "loss": 0.2595, + "step": 23250 + }, + { + "epoch": 0.633048959408792, + "grad_norm": 0.2778393951264189, + "learning_rate": 0.00012051604745267213, + "loss": 0.2791, + "step": 23300 + }, + { + "epoch": 0.6344074335706135, + "grad_norm": 0.1850696129101786, + "learning_rate": 0.00011972879796177415, + "loss": 0.2717, + "step": 23350 + }, + { + "epoch": 0.6357659077324349, + "grad_norm": 0.24958891669063782, + "learning_rate": 0.00011894302823582031, + "loss": 0.2638, + "step": 23400 + }, + { + "epoch": 0.6371243818942564, + "grad_norm": 0.3700870104750999, + "learning_rate": 0.00011815875276013624, + "loss": 0.2742, + "step": 23450 + }, + { + "epoch": 0.6384828560560778, + "grad_norm": 0.33264994031715317, + "learning_rate": 0.0001173759859925015, + "loss": 0.2774, + "step": 23500 + }, + { + "epoch": 0.6398413302178992, + "grad_norm": 0.31037389441035956, + "learning_rate": 0.00011659474236288361, + "loss": 0.2403, + "step": 23550 + }, + { + "epoch": 0.6411998043797207, + "grad_norm": 0.2731125175831413, + "learning_rate": 0.00011581503627317138, + "loss": 0.2568, + "step": 23600 + }, + { + "epoch": 0.6425582785415421, + "grad_norm": 0.31542476581603357, + "learning_rate": 0.00011503688209690988, + "loss": 0.2405, + "step": 23650 + }, + { + "epoch": 0.6439167527033636, + "grad_norm": 0.2856271842999882, + "learning_rate": 0.00011426029417903521, + "loss": 0.2594, + "step": 23700 + }, + { + "epoch": 0.645275226865185, + "grad_norm": 0.304609790388205, + "learning_rate": 0.00011348528683561044, + "loss": 0.2617, + "step": 23750 + }, + { + "epoch": 0.6466337010270065, + "grad_norm": 0.24926409052563817, + "learning_rate": 0.00011271187435356107, + "loss": 0.2624, + "step": 23800 + }, + { + "epoch": 0.6479921751888279, + "grad_norm": 0.29444243889916777, + "learning_rate": 0.00011194007099041242, + "loss": 0.267, + "step": 23850 + }, + { + "epoch": 0.6493506493506493, + "grad_norm": 0.251174398975187, + "learning_rate": 0.00011116989097402601, + "loss": 0.2745, + "step": 23900 + }, + { + "epoch": 0.6507091235124708, + "grad_norm": 0.26364700269491465, + "learning_rate": 0.0001104013485023379, + "loss": 0.2695, + "step": 23950 + }, + { + "epoch": 0.6520675976742922, + "grad_norm": 0.1408465902411862, + "learning_rate": 0.00010963445774309668, + "loss": 0.2423, + "step": 24000 + }, + { + "epoch": 0.6534260718361137, + "grad_norm": 0.1933859329942763, + "learning_rate": 0.00010886923283360217, + "loss": 0.2359, + "step": 24050 + }, + { + "epoch": 0.6547845459979351, + "grad_norm": 0.2614195528425062, + "learning_rate": 0.00010810568788044524, + "loss": 0.2673, + "step": 24100 + }, + { + "epoch": 0.6561430201597566, + "grad_norm": 0.24091031620062864, + "learning_rate": 0.00010734383695924741, + "loss": 0.2493, + "step": 24150 + }, + { + "epoch": 0.657501494321578, + "grad_norm": 0.2697615824186297, + "learning_rate": 0.00010658369411440134, + "loss": 0.2729, + "step": 24200 + }, + { + "epoch": 0.6588599684833994, + "grad_norm": 0.20653067849872642, + "learning_rate": 0.00010582527335881209, + "loss": 0.274, + "step": 24250 + }, + { + "epoch": 0.6602184426452209, + "grad_norm": 0.2589626095489949, + "learning_rate": 0.0001050685886736388, + "loss": 0.2609, + "step": 24300 + }, + { + "epoch": 0.6615769168070423, + "grad_norm": 0.2672837103760092, + "learning_rate": 0.00010431365400803682, + "loss": 0.2524, + "step": 24350 + }, + { + "epoch": 0.6629353909688638, + "grad_norm": 0.1824225008155396, + "learning_rate": 0.00010356048327890064, + "loss": 0.2702, + "step": 24400 + }, + { + "epoch": 0.6642938651306852, + "grad_norm": 0.22598161309206102, + "learning_rate": 0.00010280909037060747, + "loss": 0.2601, + "step": 24450 + }, + { + "epoch": 0.6656523392925067, + "grad_norm": 0.3087441379489739, + "learning_rate": 0.00010205948913476113, + "loss": 0.2645, + "step": 24500 + }, + { + "epoch": 0.6670108134543281, + "grad_norm": 0.26641776561733793, + "learning_rate": 0.00010131169338993662, + "loss": 0.2572, + "step": 24550 + }, + { + "epoch": 0.6683692876161496, + "grad_norm": 0.16642889477958095, + "learning_rate": 0.00010056571692142558, + "loss": 0.2437, + "step": 24600 + }, + { + "epoch": 0.669727761777971, + "grad_norm": 0.33006631915049106, + "learning_rate": 9.982157348098204e-05, + "loss": 0.2557, + "step": 24650 + }, + { + "epoch": 0.6710862359397924, + "grad_norm": 0.25184518346403906, + "learning_rate": 9.907927678656888e-05, + "loss": 0.2481, + "step": 24700 + }, + { + "epoch": 0.6724447101016139, + "grad_norm": 0.21781761609625996, + "learning_rate": 9.833884052210525e-05, + "loss": 0.2474, + "step": 24750 + }, + { + "epoch": 0.6738031842634353, + "grad_norm": 0.2707646383120265, + "learning_rate": 9.760027833721379e-05, + "loss": 0.2652, + "step": 24800 + }, + { + "epoch": 0.6751616584252568, + "grad_norm": 0.2069393478176125, + "learning_rate": 9.686360384696958e-05, + "loss": 0.2595, + "step": 24850 + }, + { + "epoch": 0.6765201325870782, + "grad_norm": 0.24428561492811254, + "learning_rate": 9.614350737579221e-05, + "loss": 0.2501, + "step": 24900 + }, + { + "epoch": 0.6778786067488997, + "grad_norm": 0.26915065049504966, + "learning_rate": 9.541061055170308e-05, + "loss": 0.2595, + "step": 24950 + }, + { + "epoch": 0.6792370809107211, + "grad_norm": 0.32715798463519263, + "learning_rate": 9.467964178784106e-05, + "loss": 0.249, + "step": 25000 + }, + { + "epoch": 0.6805955550725425, + "grad_norm": 0.2188565278615699, + "learning_rate": 9.395061455929976e-05, + "loss": 0.2644, + "step": 25050 + }, + { + "epoch": 0.681954029234364, + "grad_norm": 0.1464346234966987, + "learning_rate": 9.32235423053812e-05, + "loss": 0.2489, + "step": 25100 + }, + { + "epoch": 0.6833125033961854, + "grad_norm": 0.23268193269727472, + "learning_rate": 9.249843842934851e-05, + "loss": 0.2524, + "step": 25150 + }, + { + "epoch": 0.6846709775580069, + "grad_norm": 0.2823606594876491, + "learning_rate": 9.177531629817841e-05, + "loss": 0.2734, + "step": 25200 + }, + { + "epoch": 0.6860294517198283, + "grad_norm": 0.24467058752217685, + "learning_rate": 9.105418924231516e-05, + "loss": 0.2579, + "step": 25250 + }, + { + "epoch": 0.6873879258816498, + "grad_norm": 0.2721349790032047, + "learning_rate": 9.034943316134114e-05, + "loss": 0.2501, + "step": 25300 + }, + { + "epoch": 0.6887464000434712, + "grad_norm": 0.2063496873982564, + "learning_rate": 8.96322955378789e-05, + "loss": 0.2546, + "step": 25350 + }, + { + "epoch": 0.6901048742052927, + "grad_norm": 0.1388645672356858, + "learning_rate": 8.891719249538568e-05, + "loss": 0.2481, + "step": 25400 + }, + { + "epoch": 0.6914633483671141, + "grad_norm": 0.26747348762140405, + "learning_rate": 8.820413721647738e-05, + "loss": 0.2406, + "step": 25450 + }, + { + "epoch": 0.6928218225289355, + "grad_norm": 0.24773718412732226, + "learning_rate": 8.749314284602002e-05, + "loss": 0.2345, + "step": 25500 + }, + { + "epoch": 0.694180296690757, + "grad_norm": 0.1875176742846847, + "learning_rate": 8.67842224908878e-05, + "loss": 0.2697, + "step": 25550 + }, + { + "epoch": 0.6955387708525784, + "grad_norm": 0.34193810804953745, + "learning_rate": 8.607738921972125e-05, + "loss": 0.2499, + "step": 25600 + }, + { + "epoch": 0.6968972450143999, + "grad_norm": 0.19405847865847933, + "learning_rate": 8.537265606268663e-05, + "loss": 0.2469, + "step": 25650 + }, + { + "epoch": 0.6982557191762213, + "grad_norm": 0.1829187140282853, + "learning_rate": 8.467003601123527e-05, + "loss": 0.2519, + "step": 25700 + }, + { + "epoch": 0.6996141933380428, + "grad_norm": 0.20733927044724373, + "learning_rate": 8.396954201786429e-05, + "loss": 0.2655, + "step": 25750 + }, + { + "epoch": 0.7009726674998642, + "grad_norm": 0.2261928459658941, + "learning_rate": 8.32711869958781e-05, + "loss": 0.2593, + "step": 25800 + }, + { + "epoch": 0.7023311416616856, + "grad_norm": 0.2718188963862619, + "learning_rate": 8.25749838191499e-05, + "loss": 0.2415, + "step": 25850 + }, + { + "epoch": 0.7036896158235071, + "grad_norm": 0.3565494856705099, + "learning_rate": 8.18809453218845e-05, + "loss": 0.2586, + "step": 25900 + }, + { + "epoch": 0.7050480899853285, + "grad_norm": 0.23853635314623642, + "learning_rate": 8.118908429838201e-05, + "loss": 0.2495, + "step": 25950 + }, + { + "epoch": 0.70640656414715, + "grad_norm": 0.12974546184866537, + "learning_rate": 8.049941350280157e-05, + "loss": 0.241, + "step": 26000 + }, + { + "epoch": 0.7077650383089714, + "grad_norm": 0.15240966539892364, + "learning_rate": 7.981194564892645e-05, + "loss": 0.2642, + "step": 26050 + }, + { + "epoch": 0.7091235124707929, + "grad_norm": 0.18578994346470928, + "learning_rate": 7.912669340992957e-05, + "loss": 0.2561, + "step": 26100 + }, + { + "epoch": 0.7104819866326142, + "grad_norm": 0.2354542836489054, + "learning_rate": 7.844366941814016e-05, + "loss": 0.2433, + "step": 26150 + }, + { + "epoch": 0.7118404607944356, + "grad_norm": 0.32359876529310133, + "learning_rate": 7.776288626481043e-05, + "loss": 0.2589, + "step": 26200 + }, + { + "epoch": 0.7131989349562571, + "grad_norm": 0.21721518409143126, + "learning_rate": 7.708435649988394e-05, + "loss": 0.248, + "step": 26250 + }, + { + "epoch": 0.7145574091180785, + "grad_norm": 0.2413841328575766, + "learning_rate": 7.640809263176381e-05, + "loss": 0.2495, + "step": 26300 + }, + { + "epoch": 0.7159158832799, + "grad_norm": 0.1937874091125614, + "learning_rate": 7.57341071270824e-05, + "loss": 0.2379, + "step": 26350 + }, + { + "epoch": 0.7172743574417214, + "grad_norm": 0.29670499546178025, + "learning_rate": 7.507582377492124e-05, + "loss": 0.2481, + "step": 26400 + }, + { + "epoch": 0.7186328316035429, + "grad_norm": 0.2733674523937474, + "learning_rate": 7.44063860443e-05, + "loss": 0.24, + "step": 26450 + }, + { + "epoch": 0.7199913057653643, + "grad_norm": 0.24849432830004892, + "learning_rate": 7.373926357771387e-05, + "loss": 0.2518, + "step": 26500 + }, + { + "epoch": 0.7213497799271857, + "grad_norm": 0.3217997284475769, + "learning_rate": 7.307446867327764e-05, + "loss": 0.2558, + "step": 26550 + }, + { + "epoch": 0.7227082540890072, + "grad_norm": 0.1903670555116767, + "learning_rate": 7.241201358619814e-05, + "loss": 0.2459, + "step": 26600 + }, + { + "epoch": 0.7240667282508286, + "grad_norm": 0.1308938028529946, + "learning_rate": 7.175191052854886e-05, + "loss": 0.2507, + "step": 26650 + }, + { + "epoch": 0.7254252024126501, + "grad_norm": 0.2795123652476836, + "learning_rate": 7.109417166904457e-05, + "loss": 0.2518, + "step": 26700 + }, + { + "epoch": 0.7267836765744715, + "grad_norm": 0.16091370835854293, + "learning_rate": 7.043880913281707e-05, + "loss": 0.2554, + "step": 26750 + }, + { + "epoch": 0.728142150736293, + "grad_norm": 0.16950014605111838, + "learning_rate": 6.978583500119171e-05, + "loss": 0.2451, + "step": 26800 + }, + { + "epoch": 0.7295006248981144, + "grad_norm": 0.19788089327913239, + "learning_rate": 6.913526131146473e-05, + "loss": 0.2456, + "step": 26850 + }, + { + "epoch": 0.7308590990599358, + "grad_norm": 0.23996422423355868, + "learning_rate": 6.848710005668106e-05, + "loss": 0.2372, + "step": 26900 + }, + { + "epoch": 0.7322175732217573, + "grad_norm": 0.30447979386999535, + "learning_rate": 6.784136318541352e-05, + "loss": 0.2507, + "step": 26950 + }, + { + "epoch": 0.7335760473835787, + "grad_norm": 0.2442932467375467, + "learning_rate": 6.719806260154248e-05, + "loss": 0.2499, + "step": 27000 + }, + { + "epoch": 0.7349345215454002, + "grad_norm": 0.2053301139703188, + "learning_rate": 6.655721016403638e-05, + "loss": 0.2351, + "step": 27050 + }, + { + "epoch": 0.7362929957072216, + "grad_norm": 0.28412523900572856, + "learning_rate": 6.591881768673309e-05, + "loss": 0.2463, + "step": 27100 + }, + { + "epoch": 0.7376514698690431, + "grad_norm": 0.2102789887873736, + "learning_rate": 6.52828969381223e-05, + "loss": 0.2469, + "step": 27150 + }, + { + "epoch": 0.7390099440308645, + "grad_norm": 0.37446012395142053, + "learning_rate": 6.464945964112845e-05, + "loss": 0.2381, + "step": 27200 + }, + { + "epoch": 0.740368418192686, + "grad_norm": 0.16201575759035203, + "learning_rate": 6.401851747289451e-05, + "loss": 0.2349, + "step": 27250 + }, + { + "epoch": 0.7417268923545074, + "grad_norm": 0.2489903791806012, + "learning_rate": 6.339008206456684e-05, + "loss": 0.2482, + "step": 27300 + }, + { + "epoch": 0.7430853665163288, + "grad_norm": 0.21608399737617504, + "learning_rate": 6.276416500108084e-05, + "loss": 0.2446, + "step": 27350 + }, + { + "epoch": 0.7444438406781503, + "grad_norm": 0.2704960434877356, + "learning_rate": 6.215322069728647e-05, + "loss": 0.2424, + "step": 27400 + }, + { + "epoch": 0.7458023148399717, + "grad_norm": 0.2267608806933957, + "learning_rate": 6.153232395255646e-05, + "loss": 0.2441, + "step": 27450 + }, + { + "epoch": 0.7471607890017932, + "grad_norm": 0.183167292044454, + "learning_rate": 6.0913979799636686e-05, + "loss": 0.2445, + "step": 27500 + }, + { + "epoch": 0.7485192631636146, + "grad_norm": 0.15376761881823003, + "learning_rate": 6.0298199637434525e-05, + "loss": 0.2253, + "step": 27550 + }, + { + "epoch": 0.7498777373254361, + "grad_norm": 0.19314678658445544, + "learning_rate": 5.9684994817591334e-05, + "loss": 0.2383, + "step": 27600 + }, + { + "epoch": 0.7512362114872575, + "grad_norm": 0.2544241890699629, + "learning_rate": 5.907437664427311e-05, + "loss": 0.2391, + "step": 27650 + }, + { + "epoch": 0.7525946856490789, + "grad_norm": 0.21989322641900727, + "learning_rate": 5.846635637396216e-05, + "loss": 0.2332, + "step": 27700 + }, + { + "epoch": 0.7539531598109004, + "grad_norm": 0.22618174199432453, + "learning_rate": 5.7860945215249696e-05, + "loss": 0.2337, + "step": 27750 + }, + { + "epoch": 0.7553116339727218, + "grad_norm": 0.18764808609541392, + "learning_rate": 5.725815432862887e-05, + "loss": 0.2482, + "step": 27800 + }, + { + "epoch": 0.7566701081345433, + "grad_norm": 0.4380376298961902, + "learning_rate": 5.6657994826289465e-05, + "loss": 0.2262, + "step": 27850 + }, + { + "epoch": 0.7580285822963647, + "grad_norm": 0.23698867029895784, + "learning_rate": 5.606047777191268e-05, + "loss": 0.2409, + "step": 27900 + }, + { + "epoch": 0.7593870564581862, + "grad_norm": 0.20034127068488122, + "learning_rate": 5.546561418046736e-05, + "loss": 0.2419, + "step": 27950 + }, + { + "epoch": 0.7607455306200076, + "grad_norm": 0.2949286374600259, + "learning_rate": 5.4873415018006867e-05, + "loss": 0.2261, + "step": 28000 + }, + { + "epoch": 0.762104004781829, + "grad_norm": 0.25152518852471184, + "learning_rate": 5.428389120146715e-05, + "loss": 0.2375, + "step": 28050 + }, + { + "epoch": 0.7634624789436505, + "grad_norm": 0.1611737332419803, + "learning_rate": 5.369705359846511e-05, + "loss": 0.2318, + "step": 28100 + }, + { + "epoch": 0.7648209531054719, + "grad_norm": 0.24055178507097832, + "learning_rate": 5.311291302709844e-05, + "loss": 0.2373, + "step": 28150 + }, + { + "epoch": 0.7661794272672934, + "grad_norm": 0.1897183193395996, + "learning_rate": 5.2531480255746476e-05, + "loss": 0.245, + "step": 28200 + }, + { + "epoch": 0.7675379014291148, + "grad_norm": 0.17982933996634243, + "learning_rate": 5.195276600287118e-05, + "loss": 0.2369, + "step": 28250 + }, + { + "epoch": 0.7688963755909363, + "grad_norm": 0.25848505633412666, + "learning_rate": 5.137678093681983e-05, + "loss": 0.2319, + "step": 28300 + }, + { + "epoch": 0.7702548497527577, + "grad_norm": 0.20072845241494364, + "learning_rate": 5.0803535675628497e-05, + "loss": 0.2306, + "step": 28350 + }, + { + "epoch": 0.7716133239145792, + "grad_norm": 0.20242303091668362, + "learning_rate": 5.0233040786825935e-05, + "loss": 0.2422, + "step": 28400 + }, + { + "epoch": 0.7729717980764006, + "grad_norm": 0.2519217142033256, + "learning_rate": 4.9665306787239086e-05, + "loss": 0.25, + "step": 28450 + }, + { + "epoch": 0.774330272238222, + "grad_norm": 0.20962707780239995, + "learning_rate": 4.910034414279902e-05, + "loss": 0.2253, + "step": 28500 + }, + { + "epoch": 0.7756887464000435, + "grad_norm": 0.22029442635682775, + "learning_rate": 4.853816326834808e-05, + "loss": 0.2411, + "step": 28550 + }, + { + "epoch": 0.7770472205618649, + "grad_norm": 0.2324490253296684, + "learning_rate": 4.797877452744792e-05, + "loss": 0.2373, + "step": 28600 + }, + { + "epoch": 0.7784056947236864, + "grad_norm": 0.25563113455518527, + "learning_rate": 4.742218823218851e-05, + "loss": 0.2363, + "step": 28650 + }, + { + "epoch": 0.7797641688855078, + "grad_norm": 0.2147681728902342, + "learning_rate": 4.686841464299776e-05, + "loss": 0.2474, + "step": 28700 + }, + { + "epoch": 0.7811226430473293, + "grad_norm": 0.25383739922693677, + "learning_rate": 4.6317463968452624e-05, + "loss": 0.2212, + "step": 28750 + }, + { + "epoch": 0.7824811172091507, + "grad_norm": 0.2679584758554305, + "learning_rate": 4.5769346365090894e-05, + "loss": 0.252, + "step": 28800 + }, + { + "epoch": 0.7838395913709721, + "grad_norm": 0.2088581606519284, + "learning_rate": 4.522407193722382e-05, + "loss": 0.2277, + "step": 28850 + }, + { + "epoch": 0.7851980655327936, + "grad_norm": 0.26508761902628303, + "learning_rate": 4.4681650736750016e-05, + "loss": 0.2277, + "step": 28900 + }, + { + "epoch": 0.786556539694615, + "grad_norm": 0.377399622435696, + "learning_rate": 4.416361998302716e-05, + "loss": 0.2278, + "step": 28950 + }, + { + "epoch": 0.7879150138564365, + "grad_norm": 0.15962057847455954, + "learning_rate": 4.3626820065221566e-05, + "loss": 0.2242, + "step": 29000 + }, + { + "epoch": 0.7892734880182579, + "grad_norm": 0.32929537012542076, + "learning_rate": 4.309290281945775e-05, + "loss": 0.228, + "step": 29050 + }, + { + "epoch": 0.7906319621800794, + "grad_norm": 0.21846014602740327, + "learning_rate": 4.256187808826948e-05, + "loss": 0.2446, + "step": 29100 + }, + { + "epoch": 0.7919904363419008, + "grad_norm": 0.21017757165369907, + "learning_rate": 4.203375566086851e-05, + "loss": 0.2401, + "step": 29150 + }, + { + "epoch": 0.7933489105037222, + "grad_norm": 0.24137967337019786, + "learning_rate": 4.15085452729636e-05, + "loss": 0.2465, + "step": 29200 + }, + { + "epoch": 0.7947073846655437, + "grad_norm": 0.25590967007232035, + "learning_rate": 4.098625660658151e-05, + "loss": 0.2375, + "step": 29250 + }, + { + "epoch": 0.7960658588273651, + "grad_norm": 0.279864351972487, + "learning_rate": 4.0466899289888205e-05, + "loss": 0.2374, + "step": 29300 + }, + { + "epoch": 0.7974243329891866, + "grad_norm": 0.2607946144900689, + "learning_rate": 3.995048289701155e-05, + "loss": 0.222, + "step": 29350 + }, + { + "epoch": 0.798782807151008, + "grad_norm": 0.3426303124257882, + "learning_rate": 3.9437016947864745e-05, + "loss": 0.2367, + "step": 29400 + }, + { + "epoch": 0.8001412813128295, + "grad_norm": 0.27770053634270586, + "learning_rate": 3.892651090797075e-05, + "loss": 0.2417, + "step": 29450 + }, + { + "epoch": 0.8014997554746509, + "grad_norm": 0.24470556593709372, + "learning_rate": 3.841897418828797e-05, + "loss": 0.219, + "step": 29500 + }, + { + "epoch": 0.8028582296364724, + "grad_norm": 0.24604839411340365, + "learning_rate": 3.791441614503675e-05, + "loss": 0.2382, + "step": 29550 + }, + { + "epoch": 0.8042167037982938, + "grad_norm": 0.47026656573464204, + "learning_rate": 3.7412846079526644e-05, + "loss": 0.2196, + "step": 29600 + }, + { + "epoch": 0.8055751779601152, + "grad_norm": 0.26091598553410145, + "learning_rate": 3.691427323798522e-05, + "loss": 0.2268, + "step": 29650 + }, + { + "epoch": 0.8069336521219367, + "grad_norm": 0.21960674238700215, + "learning_rate": 3.6418706811387504e-05, + "loss": 0.2356, + "step": 29700 + }, + { + "epoch": 0.8082921262837581, + "grad_norm": 0.2181680329611913, + "learning_rate": 3.592615593528652e-05, + "loss": 0.2261, + "step": 29750 + }, + { + "epoch": 0.8096506004455796, + "grad_norm": 0.1831881150211827, + "learning_rate": 3.543662968964496e-05, + "loss": 0.2306, + "step": 29800 + }, + { + "epoch": 0.811009074607401, + "grad_norm": 0.24753134638996258, + "learning_rate": 3.4950137098667836e-05, + "loss": 0.2459, + "step": 29850 + }, + { + "epoch": 0.8123675487692225, + "grad_norm": 0.3847831369965376, + "learning_rate": 3.4466687130635856e-05, + "loss": 0.2201, + "step": 29900 + }, + { + "epoch": 0.8137260229310439, + "grad_norm": 0.2975898486391868, + "learning_rate": 3.39862886977405e-05, + "loss": 0.2166, + "step": 29950 + }, + { + "epoch": 0.8150844970928653, + "grad_norm": 0.20045687385866154, + "learning_rate": 3.3508950655919394e-05, + "loss": 0.228, + "step": 30000 + }, + { + "epoch": 0.8164429712546868, + "grad_norm": 0.303426868110847, + "learning_rate": 3.3034681804693204e-05, + "loss": 0.22, + "step": 30050 + }, + { + "epoch": 0.8178014454165082, + "grad_norm": 0.20754051862810569, + "learning_rate": 3.25634908870033e-05, + "loss": 0.2301, + "step": 30100 + }, + { + "epoch": 0.8191599195783296, + "grad_norm": 0.24422546737008857, + "learning_rate": 3.209538658905087e-05, + "loss": 0.2367, + "step": 30150 + }, + { + "epoch": 0.820518393740151, + "grad_norm": 0.3027570941981141, + "learning_rate": 3.163037754013647e-05, + "loss": 0.2417, + "step": 30200 + }, + { + "epoch": 0.8218768679019725, + "grad_norm": 0.280254043533181, + "learning_rate": 3.116847231250104e-05, + "loss": 0.2266, + "step": 30250 + }, + { + "epoch": 0.8232353420637939, + "grad_norm": 0.28505781897752897, + "learning_rate": 3.070967942116807e-05, + "loss": 0.2307, + "step": 30300 + }, + { + "epoch": 0.8245938162256153, + "grad_norm": 0.22370571404265266, + "learning_rate": 3.0254007323786338e-05, + "loss": 0.2292, + "step": 30350 + }, + { + "epoch": 0.8259522903874368, + "grad_norm": 0.20314542315669792, + "learning_rate": 2.9801464420474135e-05, + "loss": 0.2384, + "step": 30400 + }, + { + "epoch": 0.8273107645492582, + "grad_norm": 0.3091717822159854, + "learning_rate": 2.9352059053664515e-05, + "loss": 0.2252, + "step": 30450 + }, + { + "epoch": 0.8286692387110797, + "grad_norm": 0.2850647955523155, + "learning_rate": 2.8905799507951314e-05, + "loss": 0.2228, + "step": 30500 + }, + { + "epoch": 0.8300277128729011, + "grad_norm": 0.256986010255855, + "learning_rate": 2.846269400993655e-05, + "loss": 0.2176, + "step": 30550 + }, + { + "epoch": 0.8313861870347226, + "grad_norm": 0.19662900160930957, + "learning_rate": 2.802275072807865e-05, + "loss": 0.2271, + "step": 30600 + }, + { + "epoch": 0.832744661196544, + "grad_norm": 0.2745095935502404, + "learning_rate": 2.7585977772542126e-05, + "loss": 0.2254, + "step": 30650 + }, + { + "epoch": 0.8341031353583654, + "grad_norm": 0.20724780113594732, + "learning_rate": 2.715238319504769e-05, + "loss": 0.2415, + "step": 30700 + }, + { + "epoch": 0.8354616095201869, + "grad_norm": 0.21591886814918512, + "learning_rate": 2.6721974988724264e-05, + "loss": 0.2305, + "step": 30750 + }, + { + "epoch": 0.8368200836820083, + "grad_norm": 0.4269961594661858, + "learning_rate": 2.629476108796114e-05, + "loss": 0.2344, + "step": 30800 + }, + { + "epoch": 0.8381785578438298, + "grad_norm": 0.3444022954836087, + "learning_rate": 2.587074936826215e-05, + "loss": 0.2355, + "step": 30850 + }, + { + "epoch": 0.8395370320056512, + "grad_norm": 0.2810532430087154, + "learning_rate": 2.5449947646100202e-05, + "loss": 0.2333, + "step": 30900 + }, + { + "epoch": 0.8408955061674727, + "grad_norm": 0.2016521391071412, + "learning_rate": 2.5032363678773284e-05, + "loss": 0.2345, + "step": 30950 + }, + { + "epoch": 0.8422539803292941, + "grad_norm": 0.26305838948166177, + "learning_rate": 2.4626260675610046e-05, + "loss": 0.2229, + "step": 31000 + }, + { + "epoch": 0.8436124544911155, + "grad_norm": 0.3334236797955835, + "learning_rate": 2.4223262587394115e-05, + "loss": 0.2414, + "step": 31050 + }, + { + "epoch": 0.844970928652937, + "grad_norm": 0.1995390748027635, + "learning_rate": 2.381524806289641e-05, + "loss": 0.229, + "step": 31100 + }, + { + "epoch": 0.8463294028147584, + "grad_norm": 0.24559427709489323, + "learning_rate": 2.3410481428214603e-05, + "loss": 0.2139, + "step": 31150 + }, + { + "epoch": 0.8476878769765799, + "grad_norm": 0.20659570740849767, + "learning_rate": 2.300897014504688e-05, + "loss": 0.227, + "step": 31200 + }, + { + "epoch": 0.8490463511384013, + "grad_norm": 0.21220065960007847, + "learning_rate": 2.261072161508033e-05, + "loss": 0.2374, + "step": 31250 + }, + { + "epoch": 0.8504048253002228, + "grad_norm": 0.24984661749787465, + "learning_rate": 2.2215743179854577e-05, + "loss": 0.2266, + "step": 31300 + }, + { + "epoch": 0.8517632994620442, + "grad_norm": 0.3258474284548005, + "learning_rate": 2.1824042120626543e-05, + "loss": 0.2231, + "step": 31350 + }, + { + "epoch": 0.8531217736238657, + "grad_norm": 0.3217882648747335, + "learning_rate": 2.143562565823609e-05, + "loss": 0.2313, + "step": 31400 + }, + { + "epoch": 0.8544802477856871, + "grad_norm": 0.2660872932773675, + "learning_rate": 2.1050500952972985e-05, + "loss": 0.2443, + "step": 31450 + }, + { + "epoch": 0.8558387219475085, + "grad_norm": 0.2552800785940888, + "learning_rate": 2.0668675104444745e-05, + "loss": 0.2282, + "step": 31500 + }, + { + "epoch": 0.85719719610933, + "grad_norm": 0.23195860174218688, + "learning_rate": 2.0290155151446145e-05, + "loss": 0.2375, + "step": 31550 + }, + { + "epoch": 0.8585556702711514, + "grad_norm": 0.29024757595999545, + "learning_rate": 1.9914948071828922e-05, + "loss": 0.2222, + "step": 31600 + }, + { + "epoch": 0.8599141444329729, + "grad_norm": 0.2658334720707247, + "learning_rate": 1.9543060782373667e-05, + "loss": 0.2351, + "step": 31650 + }, + { + "epoch": 0.8612726185947943, + "grad_norm": 0.26827491653104296, + "learning_rate": 1.917450013866189e-05, + "loss": 0.2397, + "step": 31700 + }, + { + "epoch": 0.8626310927566158, + "grad_norm": 0.2739389172356759, + "learning_rate": 1.880927293494994e-05, + "loss": 0.233, + "step": 31750 + }, + { + "epoch": 0.8639895669184372, + "grad_norm": 0.2784269064173774, + "learning_rate": 1.8447385904043534e-05, + "loss": 0.2418, + "step": 31800 + }, + { + "epoch": 0.8653480410802586, + "grad_norm": 0.3614092162288049, + "learning_rate": 1.808884571717384e-05, + "loss": 0.2257, + "step": 31850 + }, + { + "epoch": 0.8667065152420801, + "grad_norm": 0.24413648696026682, + "learning_rate": 1.7733658983874336e-05, + "loss": 0.2389, + "step": 31900 + }, + { + "epoch": 0.8680649894039015, + "grad_norm": 0.1967545842426806, + "learning_rate": 1.7381832251859075e-05, + "loss": 0.2191, + "step": 31950 + }, + { + "epoch": 0.869423463565723, + "grad_norm": 0.23747418190146288, + "learning_rate": 1.7033372006901982e-05, + "loss": 0.223, + "step": 32000 + }, + { + "epoch": 0.8707819377275444, + "grad_norm": 0.28815680461600557, + "learning_rate": 1.6702023326195593e-05, + "loss": 0.2242, + "step": 32050 + }, + { + "epoch": 0.8721404118893659, + "grad_norm": 0.2868186834763779, + "learning_rate": 1.636017997206618e-05, + "loss": 0.2155, + "step": 32100 + }, + { + "epoch": 0.8734988860511873, + "grad_norm": 0.20903132836326485, + "learning_rate": 1.6021721938713497e-05, + "loss": 0.2258, + "step": 32150 + }, + { + "epoch": 0.8748573602130088, + "grad_norm": 0.22192290777199325, + "learning_rate": 1.568665546546517e-05, + "loss": 0.2322, + "step": 32200 + }, + { + "epoch": 0.8762158343748302, + "grad_norm": 0.27939346399599835, + "learning_rate": 1.5354986729126963e-05, + "loss": 0.2166, + "step": 32250 + }, + { + "epoch": 0.8775743085366516, + "grad_norm": 0.23760881910404164, + "learning_rate": 1.5026721843868797e-05, + "loss": 0.2231, + "step": 32300 + }, + { + "epoch": 0.8789327826984731, + "grad_norm": 0.2384919572031985, + "learning_rate": 1.4701866861112057e-05, + "loss": 0.2115, + "step": 32350 + }, + { + "epoch": 0.8802912568602945, + "grad_norm": 0.18196788330053723, + "learning_rate": 1.4380427769418081e-05, + "loss": 0.2214, + "step": 32400 + }, + { + "epoch": 0.881649731022116, + "grad_norm": 0.11092114968179356, + "learning_rate": 1.4062410494377642e-05, + "loss": 0.2136, + "step": 32450 + }, + { + "epoch": 0.8830082051839374, + "grad_norm": 0.24840886469424456, + "learning_rate": 1.3747820898501929e-05, + "loss": 0.228, + "step": 32500 + }, + { + "epoch": 0.8843666793457589, + "grad_norm": 0.24015308441997552, + "learning_rate": 1.3436664781114295e-05, + "loss": 0.2225, + "step": 32550 + }, + { + "epoch": 0.8857251535075803, + "grad_norm": 0.2836665970861565, + "learning_rate": 1.3128947878243392e-05, + "loss": 0.2203, + "step": 32600 + }, + { + "epoch": 0.8870836276694017, + "grad_norm": 0.21821870263137624, + "learning_rate": 1.2824675862517388e-05, + "loss": 0.2236, + "step": 32650 + }, + { + "epoch": 0.8884421018312232, + "grad_norm": 0.24628179490379828, + "learning_rate": 1.2523854343059538e-05, + "loss": 0.2224, + "step": 32700 + }, + { + "epoch": 0.8898005759930446, + "grad_norm": 0.3785043038913189, + "learning_rate": 1.2226488865384622e-05, + "loss": 0.2328, + "step": 32750 + }, + { + "epoch": 0.8911590501548661, + "grad_norm": 0.27390465067758646, + "learning_rate": 1.1932584911296762e-05, + "loss": 0.2409, + "step": 32800 + }, + { + "epoch": 0.8925175243166875, + "grad_norm": 0.21777850622413425, + "learning_rate": 1.164214789878848e-05, + "loss": 0.2113, + "step": 32850 + }, + { + "epoch": 0.893875998478509, + "grad_norm": 0.31282081336865486, + "learning_rate": 1.1355183181940688e-05, + "loss": 0.2294, + "step": 32900 + }, + { + "epoch": 0.8952344726403304, + "grad_norm": 0.21847851398276275, + "learning_rate": 1.1071696050823988e-05, + "loss": 0.2176, + "step": 32950 + }, + { + "epoch": 0.8965929468021518, + "grad_norm": 0.1797782169135658, + "learning_rate": 1.0791691731401221e-05, + "loss": 0.2197, + "step": 33000 + }, + { + "epoch": 0.8979514209639733, + "grad_norm": 0.236661072817734, + "learning_rate": 1.0526169006027186e-05, + "loss": 0.2287, + "step": 33050 + }, + { + "epoch": 0.8993098951257947, + "grad_norm": 0.24545168612281812, + "learning_rate": 1.0253005911068837e-05, + "loss": 0.2248, + "step": 33100 + }, + { + "epoch": 0.9006683692876162, + "grad_norm": 0.2998814580945627, + "learning_rate": 9.98334072000362e-06, + "loss": 0.2279, + "step": 33150 + }, + { + "epoch": 0.9020268434494376, + "grad_norm": 0.25372185228970084, + "learning_rate": 9.717178403992866e-06, + "loss": 0.2296, + "step": 33200 + }, + { + "epoch": 0.9033853176112591, + "grad_norm": 0.22808552163017606, + "learning_rate": 9.454523869623889e-06, + "loss": 0.2142, + "step": 33250 + }, + { + "epoch": 0.9047437917730805, + "grad_norm": 0.22587780733166465, + "learning_rate": 9.195381958819637e-06, + "loss": 0.2332, + "step": 33300 + }, + { + "epoch": 0.906102265934902, + "grad_norm": 0.40072192174734456, + "learning_rate": 8.939757448749286e-06, + "loss": 0.2294, + "step": 33350 + }, + { + "epoch": 0.9074607400967234, + "grad_norm": 0.207716936686938, + "learning_rate": 8.687655051740318e-06, + "loss": 0.2296, + "step": 33400 + }, + { + "epoch": 0.9088192142585448, + "grad_norm": 0.2779354186342964, + "learning_rate": 8.439079415191532e-06, + "loss": 0.2219, + "step": 33450 + }, + { + "epoch": 0.9101776884203663, + "grad_norm": 0.2723935977374799, + "learning_rate": 8.194035121487465e-06, + "loss": 0.2153, + "step": 33500 + }, + { + "epoch": 0.9115361625821877, + "grad_norm": 0.19697799899707563, + "learning_rate": 7.952526687913842e-06, + "loss": 0.2146, + "step": 33550 + }, + { + "epoch": 0.9128946367440092, + "grad_norm": 0.2085749517551805, + "learning_rate": 7.714558566574325e-06, + "loss": 0.2136, + "step": 33600 + }, + { + "epoch": 0.9142531109058306, + "grad_norm": 0.21717861835570473, + "learning_rate": 7.480135144308475e-06, + "loss": 0.2148, + "step": 33650 + }, + { + "epoch": 0.9156115850676521, + "grad_norm": 0.19269266158892198, + "learning_rate": 7.2492607426108305e-06, + "loss": 0.2257, + "step": 33700 + }, + { + "epoch": 0.9169700592294735, + "grad_norm": 0.24145080749460107, + "learning_rate": 7.02193961755131e-06, + "loss": 0.2207, + "step": 33750 + }, + { + "epoch": 0.9183285333912949, + "grad_norm": 0.38368673649347684, + "learning_rate": 6.798175959696629e-06, + "loss": 0.2277, + "step": 33800 + }, + { + "epoch": 0.9196870075531164, + "grad_norm": 0.23667938172589895, + "learning_rate": 6.577973894033274e-06, + "loss": 0.2175, + "step": 33850 + }, + { + "epoch": 0.9210454817149378, + "grad_norm": 0.3161307046290422, + "learning_rate": 6.3613374798911605e-06, + "loss": 0.2343, + "step": 33900 + }, + { + "epoch": 0.9224039558767593, + "grad_norm": 0.21360593050098195, + "learning_rate": 6.148270710869053e-06, + "loss": 0.2239, + "step": 33950 + }, + { + "epoch": 0.9237624300385807, + "grad_norm": 0.28937490357759443, + "learning_rate": 5.938777514760796e-06, + "loss": 0.2124, + "step": 34000 + }, + { + "epoch": 0.9251209042004022, + "grad_norm": 0.2795740945530124, + "learning_rate": 5.732861753483043e-06, + "loss": 0.2204, + "step": 34050 + }, + { + "epoch": 0.9264793783622236, + "grad_norm": 0.2537546094067306, + "learning_rate": 5.538551797587777e-06, + "loss": 0.2112, + "step": 34100 + }, + { + "epoch": 0.927837852524045, + "grad_norm": 0.33705155604910225, + "learning_rate": 5.339658758640753e-06, + "loss": 0.2199, + "step": 34150 + }, + { + "epoch": 0.9291963266858664, + "grad_norm": 0.3270077837617184, + "learning_rate": 5.14435419901973e-06, + "loss": 0.2297, + "step": 34200 + }, + { + "epoch": 0.9305548008476878, + "grad_norm": 0.22117872354331958, + "learning_rate": 4.95264171907992e-06, + "loss": 0.22, + "step": 34250 + }, + { + "epoch": 0.9319132750095093, + "grad_norm": 0.2091602624721329, + "learning_rate": 4.7645248529581076e-06, + "loss": 0.2107, + "step": 34300 + }, + { + "epoch": 0.9332717491713307, + "grad_norm": 0.19828806364167093, + "learning_rate": 4.580007068507497e-06, + "loss": 0.2215, + "step": 34350 + }, + { + "epoch": 0.9346302233331522, + "grad_norm": 0.2922062157376195, + "learning_rate": 4.399091767233743e-06, + "loss": 0.234, + "step": 34400 + }, + { + "epoch": 0.9359886974949736, + "grad_norm": 0.2833929720260013, + "learning_rate": 4.221782284232312e-06, + "loss": 0.2358, + "step": 34450 + }, + { + "epoch": 0.937347171656795, + "grad_norm": 0.10391453032816735, + "learning_rate": 4.048081888126931e-06, + "loss": 0.2194, + "step": 34500 + }, + { + "epoch": 0.9387056458186165, + "grad_norm": 0.31685765392965615, + "learning_rate": 3.877993781009415e-06, + "loss": 0.2237, + "step": 34550 + }, + { + "epoch": 0.9400641199804379, + "grad_norm": 0.32028046645415253, + "learning_rate": 3.7115210983805326e-06, + "loss": 0.2296, + "step": 34600 + }, + { + "epoch": 0.9414225941422594, + "grad_norm": 0.2275872271102818, + "learning_rate": 3.548666909092324e-06, + "loss": 0.2237, + "step": 34650 + }, + { + "epoch": 0.9427810683040808, + "grad_norm": 0.28672637661803746, + "learning_rate": 3.3894342152914092e-06, + "loss": 0.2129, + "step": 34700 + }, + { + "epoch": 0.9441395424659023, + "grad_norm": 0.17447544388507297, + "learning_rate": 3.233825952363767e-06, + "loss": 0.2156, + "step": 34750 + }, + { + "epoch": 0.9454980166277237, + "grad_norm": 0.27717659551061696, + "learning_rate": 3.081844988880511e-06, + "loss": 0.2325, + "step": 34800 + }, + { + "epoch": 0.9468564907895451, + "grad_norm": 0.19384754194181844, + "learning_rate": 2.9334941265450666e-06, + "loss": 0.2264, + "step": 34850 + }, + { + "epoch": 0.9482149649513666, + "grad_norm": 0.20190545639653648, + "learning_rate": 2.788776100141499e-06, + "loss": 0.2162, + "step": 34900 + }, + { + "epoch": 0.949573439113188, + "grad_norm": 0.13741628116355178, + "learning_rate": 2.647693577484156e-06, + "loss": 0.2175, + "step": 34950 + }, + { + "epoch": 0.9509319132750095, + "grad_norm": 0.30120083790962326, + "learning_rate": 2.5102491593684164e-06, + "loss": 0.2098, + "step": 35000 + }, + { + "epoch": 0.9522903874368309, + "grad_norm": 0.18521977588022978, + "learning_rate": 2.3764453795227737e-06, + "loss": 0.2232, + "step": 35050 + }, + { + "epoch": 0.9536488615986524, + "grad_norm": 0.31542769729636866, + "learning_rate": 2.2462847045620737e-06, + "loss": 0.2223, + "step": 35100 + }, + { + "epoch": 0.9550073357604738, + "grad_norm": 0.27121032732352324, + "learning_rate": 2.1247601176086262e-06, + "loss": 0.2167, + "step": 35150 + }, + { + "epoch": 0.9563658099222953, + "grad_norm": 0.23513329499397734, + "learning_rate": 2.0017468261825268e-06, + "loss": 0.2118, + "step": 35200 + }, + { + "epoch": 0.9577242840841167, + "grad_norm": 0.15376591935080916, + "learning_rate": 1.8823835470474395e-06, + "loss": 0.2247, + "step": 35250 + }, + { + "epoch": 0.9590827582459381, + "grad_norm": 0.3234625822847048, + "learning_rate": 1.766672480613818e-06, + "loss": 0.2229, + "step": 35300 + }, + { + "epoch": 0.9604412324077596, + "grad_norm": 0.17001324587245673, + "learning_rate": 1.6546157599652613e-06, + "loss": 0.2148, + "step": 35350 + }, + { + "epoch": 0.961799706569581, + "grad_norm": 0.11359209160906845, + "learning_rate": 1.5462154508190108e-06, + "loss": 0.214, + "step": 35400 + }, + { + "epoch": 0.9631581807314025, + "grad_norm": 0.2409519071516936, + "learning_rate": 1.4414735514879373e-06, + "loss": 0.2118, + "step": 35450 + }, + { + "epoch": 0.9645166548932239, + "grad_norm": 0.21439695271092557, + "learning_rate": 1.3403919928437036e-06, + "loss": 0.2219, + "step": 35500 + }, + { + "epoch": 0.9658751290550454, + "grad_norm": 0.21370470770092295, + "learning_rate": 1.2429726382812368e-06, + "loss": 0.2147, + "step": 35550 + }, + { + "epoch": 0.9672336032168668, + "grad_norm": 0.2311514348226777, + "learning_rate": 1.149217283684223e-06, + "loss": 0.23, + "step": 35600 + }, + { + "epoch": 0.9685920773786882, + "grad_norm": 0.29860147103327, + "learning_rate": 1.059127657392156e-06, + "loss": 0.2313, + "step": 35650 + }, + { + "epoch": 0.9699505515405097, + "grad_norm": 0.1814371857599918, + "learning_rate": 9.72705420168407e-07, + "loss": 0.2241, + "step": 35700 + }, + { + "epoch": 0.9713090257023311, + "grad_norm": 0.31658685339190534, + "learning_rate": 8.899521651695831e-07, + "loss": 0.2207, + "step": 35750 + }, + { + "epoch": 0.9726674998641526, + "grad_norm": 0.19909543685451667, + "learning_rate": 8.124150923443096e-07, + "loss": 0.2351, + "step": 35800 + }, + { + "epoch": 0.974025974025974, + "grad_norm": 0.24098233089589674, + "learning_rate": 7.369308575313927e-07, + "loss": 0.2192, + "step": 35850 + }, + { + "epoch": 0.9753844481877955, + "grad_norm": 0.23547731000564504, + "learning_rate": 6.651199513456607e-07, + "loss": 0.2268, + "step": 35900 + }, + { + "epoch": 0.9767429223496169, + "grad_norm": 0.14817468589671154, + "learning_rate": 5.969836975901366e-07, + "loss": 0.2175, + "step": 35950 + }, + { + "epoch": 0.9781013965114383, + "grad_norm": 0.140443367773756, + "learning_rate": 5.3252335232723e-07, + "loss": 0.2214, + "step": 36000 + }, + { + "epoch": 0.9794598706732598, + "grad_norm": 0.3119367682303709, + "learning_rate": 4.71740103855578e-07, + "loss": 0.2249, + "step": 36050 + }, + { + "epoch": 0.9808183448350812, + "grad_norm": 0.311395190548215, + "learning_rate": 4.146350726881076e-07, + "loss": 0.2222, + "step": 36100 + }, + { + "epoch": 0.9821768189969027, + "grad_norm": 0.37040140534262844, + "learning_rate": 3.6120931153138525e-07, + "loss": 0.2361, + "step": 36150 + }, + { + "epoch": 0.9835352931587241, + "grad_norm": 0.2988864815826086, + "learning_rate": 3.114638052662988e-07, + "loss": 0.2207, + "step": 36200 + }, + { + "epoch": 0.9848937673205456, + "grad_norm": 0.21108645711904683, + "learning_rate": 2.6539947092976135e-07, + "loss": 0.2247, + "step": 36250 + }, + { + "epoch": 0.986252241482367, + "grad_norm": 0.17869396874533416, + "learning_rate": 2.2301715769783572e-07, + "loss": 0.2231, + "step": 36300 + }, + { + "epoch": 0.9876107156441885, + "grad_norm": 0.30165958371764556, + "learning_rate": 1.8431764687021347e-07, + "loss": 0.2232, + "step": 36350 + }, + { + "epoch": 0.9889691898060099, + "grad_norm": 0.24564888452021394, + "learning_rate": 1.4930165185564894e-07, + "loss": 0.2135, + "step": 36400 + }, + { + "epoch": 0.9903276639678313, + "grad_norm": 0.2710306063596811, + "learning_rate": 1.1796981815888064e-07, + "loss": 0.2099, + "step": 36450 + }, + { + "epoch": 0.9916861381296528, + "grad_norm": 0.2109106341757756, + "learning_rate": 9.032272336875203e-08, + "loss": 0.21, + "step": 36500 + }, + { + "epoch": 0.9930446122914742, + "grad_norm": 0.27834177280791966, + "learning_rate": 6.636087714748662e-08, + "loss": 0.2265, + "step": 36550 + }, + { + "epoch": 0.9944030864532957, + "grad_norm": 0.30415917234664425, + "learning_rate": 4.608472122138441e-08, + "loss": 0.232, + "step": 36600 + }, + { + "epoch": 0.9957615606151171, + "grad_norm": 0.24438726040214506, + "learning_rate": 2.949462937262837e-08, + "loss": 0.221, + "step": 36650 + }, + { + "epoch": 0.9971200347769386, + "grad_norm": 0.26240483157432476, + "learning_rate": 1.6590907432401104e-08, + "loss": 0.2276, + "step": 36700 + }, + { + "epoch": 0.99847850893876, + "grad_norm": 0.24193448916634921, + "learning_rate": 7.3737932752226955e-09, + "loss": 0.2229, + "step": 36750 + }, + { + "epoch": 0.9998369831005814, + "grad_norm": 0.308625063910033, + "learning_rate": 1.843456814643041e-09, + "loss": 0.219, + "step": 36800 + }, + { + "epoch": 1.0, + "step": 36806, + "total_flos": 244684510289920.0, + "train_loss": 0.3143004386741001, + "train_runtime": 666350.8913, + "train_samples_per_second": 1.326, + "train_steps_per_second": 0.055 + } + ], + "logging_steps": 50, + "max_steps": 36806, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 244684510289920.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}