OpusLlama / trainer_state.json
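trainer_state.json is the training-state file written by the Hugging Face Transformers Trainer: global_step 36806 over 1.0 epoch, with log_history recording loss, grad_norm, and learning_rate every 50 steps. Below is a minimal sketch for inspecting the file, assuming it has been downloaded locally as trainer_state.json (matplotlib is only needed for the plot):

```python
# Sketch: load the Trainer state and plot training loss / learning rate per logged step.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Each training entry in log_history holds "step", "epoch", "loss",
# "grad_norm", and "learning_rate"; keep only entries that report a loss.
logs = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in logs]
losses = [e["loss"] for e in logs]
lrs = [e["learning_rate"] for e in logs]

fig, (ax_loss, ax_lr) = plt.subplots(2, 1, sharex=True)
ax_loss.plot(steps, losses)
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lrs)
ax_lr.set_ylabel("learning rate")
ax_lr.set_xlabel("global step")
plt.tight_layout()
plt.savefig("training_curves.png")
```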
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 36806,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0013584741618214422,
"grad_norm": 0.336629520848516,
"learning_rate": 8.868778280542987e-05,
"loss": 0.9342,
"step": 50
},
{
"epoch": 0.0027169483236428845,
"grad_norm": 0.41112101024853426,
"learning_rate": 0.00017918552036199096,
"loss": 0.5198,
"step": 100
},
{
"epoch": 0.004075422485464326,
"grad_norm": 0.38040867145053625,
"learning_rate": 0.0002696832579185521,
"loss": 0.4962,
"step": 150
},
{
"epoch": 0.005433896647285769,
"grad_norm": 0.5231995312551354,
"learning_rate": 0.00036018099547511313,
"loss": 0.5198,
"step": 200
},
{
"epoch": 0.006792370809107211,
"grad_norm": 0.37419434381789457,
"learning_rate": 0.00039999942189133334,
"loss": 0.5615,
"step": 250
},
{
"epoch": 0.008150844970928653,
"grad_norm": 0.5051525763054373,
"learning_rate": 0.00039999551377337605,
"loss": 0.5629,
"step": 300
},
{
"epoch": 0.009509319132750094,
"grad_norm": 0.37940069661143105,
"learning_rate": 0.0003999879188244911,
"loss": 0.5715,
"step": 350
},
{
"epoch": 0.010867793294571538,
"grad_norm": 0.4187127030134641,
"learning_rate": 0.0003999766371846881,
"loss": 0.5191,
"step": 400
},
{
"epoch": 0.01222626745639298,
"grad_norm": 0.4287344223835902,
"learning_rate": 0.00039996166906193926,
"loss": 0.5266,
"step": 450
},
{
"epoch": 0.013584741618214421,
"grad_norm": 0.42083341057731205,
"learning_rate": 0.00039994301473217543,
"loss": 0.4993,
"step": 500
},
{
"epoch": 0.014943215780035863,
"grad_norm": 0.44004507818862004,
"learning_rate": 0.00039992067453928115,
"loss": 0.5008,
"step": 550
},
{
"epoch": 0.016301689941857305,
"grad_norm": 0.3947637329018398,
"learning_rate": 0.0003998946488950882,
"loss": 0.5199,
"step": 600
},
{
"epoch": 0.01766016410367875,
"grad_norm": 0.35666971296762606,
"learning_rate": 0.0003998649382793681,
"loss": 0.5024,
"step": 650
},
{
"epoch": 0.01901863826550019,
"grad_norm": 0.3864531806325715,
"learning_rate": 0.0003998315432398232,
"loss": 0.5058,
"step": 700
},
{
"epoch": 0.020377112427321632,
"grad_norm": 0.3979644110719079,
"learning_rate": 0.00039979446439207663,
"loss": 0.4833,
"step": 750
},
{
"epoch": 0.021735586589143076,
"grad_norm": 0.32463557091237133,
"learning_rate": 0.000399753702419661,
"loss": 0.5181,
"step": 800
},
{
"epoch": 0.023094060750964516,
"grad_norm": 0.36185137659998867,
"learning_rate": 0.0003997092580740055,
"loss": 0.4731,
"step": 850
},
{
"epoch": 0.02445253491278596,
"grad_norm": 0.36585435226850893,
"learning_rate": 0.00039966113217442266,
"loss": 0.5028,
"step": 900
},
{
"epoch": 0.0258110090746074,
"grad_norm": 0.41125601490924335,
"learning_rate": 0.00039960932560809256,
"loss": 0.4907,
"step": 950
},
{
"epoch": 0.027169483236428843,
"grad_norm": 0.3848215921898568,
"learning_rate": 0.0003995538393300469,
"loss": 0.483,
"step": 1000
},
{
"epoch": 0.028527957398250287,
"grad_norm": 0.33951726776456204,
"learning_rate": 0.0003994946743631513,
"loss": 0.4567,
"step": 1050
},
{
"epoch": 0.029886431560071727,
"grad_norm": 0.3234265221555652,
"learning_rate": 0.0003994318317980862,
"loss": 0.4663,
"step": 1100
},
{
"epoch": 0.03124490572189317,
"grad_norm": 0.3997241882392808,
"learning_rate": 0.0003993666791947118,
"loss": 0.4709,
"step": 1150
},
{
"epoch": 0.03260337988371461,
"grad_norm": 0.45209570419286,
"learning_rate": 0.00039929799689193896,
"loss": 0.4836,
"step": 1200
},
{
"epoch": 0.033961854045536054,
"grad_norm": 0.31674750892690945,
"learning_rate": 0.000399224275685374,
"loss": 0.4708,
"step": 1250
},
{
"epoch": 0.0353203282073575,
"grad_norm": 0.3336127152054193,
"learning_rate": 0.0003991468818653226,
"loss": 0.48,
"step": 1300
},
{
"epoch": 0.03667880236917894,
"grad_norm": 0.30959090922038635,
"learning_rate": 0.0003990658168585062,
"loss": 0.4568,
"step": 1350
},
{
"epoch": 0.03803727653100038,
"grad_norm": 0.37908103624960793,
"learning_rate": 0.0003989810821593234,
"loss": 0.4867,
"step": 1400
},
{
"epoch": 0.03939575069282182,
"grad_norm": 0.3746655333602711,
"learning_rate": 0.0003988944833236597,
"loss": 0.4685,
"step": 1450
},
{
"epoch": 0.040754224854643264,
"grad_norm": 0.3841226468141105,
"learning_rate": 0.0003988043631486177,
"loss": 0.4788,
"step": 1500
},
{
"epoch": 0.04211269901646471,
"grad_norm": 0.4460643403291581,
"learning_rate": 0.0003987087755737732,
"loss": 0.4502,
"step": 1550
},
{
"epoch": 0.04347117317828615,
"grad_norm": 0.3671013360773688,
"learning_rate": 0.00039860952488846415,
"loss": 0.4534,
"step": 1600
},
{
"epoch": 0.04482964734010759,
"grad_norm": 0.34747090480175297,
"learning_rate": 0.000398506612922334,
"loss": 0.4405,
"step": 1650
},
{
"epoch": 0.04618812150192903,
"grad_norm": 0.36167597296572834,
"learning_rate": 0.00039840004157252047,
"loss": 0.4778,
"step": 1700
},
{
"epoch": 0.047546595663750475,
"grad_norm": 0.4009770951011953,
"learning_rate": 0.0003982898128036203,
"loss": 0.479,
"step": 1750
},
{
"epoch": 0.04890506982557192,
"grad_norm": 0.5968187908901367,
"learning_rate": 0.00039818055417120837,
"loss": 0.5444,
"step": 1800
},
{
"epoch": 0.05026354398739336,
"grad_norm": 1.8105646301472897,
"learning_rate": 0.0003980655464341668,
"loss": 0.5653,
"step": 1850
},
{
"epoch": 0.0516220181492148,
"grad_norm": 0.789226312089056,
"learning_rate": 0.0003979470320337101,
"loss": 0.7458,
"step": 1900
},
{
"epoch": 0.05298049231103624,
"grad_norm": 0.9263531743492742,
"learning_rate": 0.0003978224864094713,
"loss": 0.5415,
"step": 1950
},
{
"epoch": 0.054338966472857686,
"grad_norm": 0.47720194915599734,
"learning_rate": 0.00039769429401312583,
"loss": 0.5115,
"step": 2000
},
{
"epoch": 0.05569744063467913,
"grad_norm": 0.6259624884089505,
"learning_rate": 0.0003975624572078452,
"loss": 0.4968,
"step": 2050
},
{
"epoch": 0.05705591479650057,
"grad_norm": 0.6070556198109234,
"learning_rate": 0.00039742697842398407,
"loss": 0.4721,
"step": 2100
},
{
"epoch": 0.05841438895832201,
"grad_norm": 0.9495402483045952,
"learning_rate": 0.00039728786015903527,
"loss": 0.5384,
"step": 2150
},
{
"epoch": 0.05977286312014345,
"grad_norm": 0.5094390312707457,
"learning_rate": 0.0003971451049775838,
"loss": 0.4768,
"step": 2200
},
{
"epoch": 0.0611313372819649,
"grad_norm": 0.4164209148361273,
"learning_rate": 0.00039699871551126,
"loss": 0.4745,
"step": 2250
},
{
"epoch": 0.06248981144378634,
"grad_norm": 0.4096404988646368,
"learning_rate": 0.0003968486944586903,
"loss": 0.4571,
"step": 2300
},
{
"epoch": 0.06384828560560778,
"grad_norm": 0.36680803179529065,
"learning_rate": 0.00039669504458544815,
"loss": 0.4542,
"step": 2350
},
{
"epoch": 0.06520675976742922,
"grad_norm": 0.4436494287681541,
"learning_rate": 0.00039653776872400245,
"loss": 0.4526,
"step": 2400
},
{
"epoch": 0.06656523392925066,
"grad_norm": 0.632255456628572,
"learning_rate": 0.0003963768697736659,
"loss": 0.4614,
"step": 2450
},
{
"epoch": 0.06792370809107211,
"grad_norm": 0.27294306772696525,
"learning_rate": 0.0003962123507005411,
"loss": 0.4247,
"step": 2500
},
{
"epoch": 0.06928218225289355,
"grad_norm": 0.4922506411666624,
"learning_rate": 0.00039604421453746615,
"loss": 0.4336,
"step": 2550
},
{
"epoch": 0.070640656414715,
"grad_norm": 0.3836605187611845,
"learning_rate": 0.00039587246438395866,
"loss": 0.4308,
"step": 2600
},
{
"epoch": 0.07199913057653644,
"grad_norm": 0.30031785058018967,
"learning_rate": 0.0003956971034061584,
"loss": 0.4336,
"step": 2650
},
{
"epoch": 0.07335760473835788,
"grad_norm": 0.30896010198949975,
"learning_rate": 0.00039551813483676944,
"loss": 0.4282,
"step": 2700
},
{
"epoch": 0.07471607890017933,
"grad_norm": 0.3264468040624618,
"learning_rate": 0.0003953355619749999,
"loss": 0.4466,
"step": 2750
},
{
"epoch": 0.07607455306200075,
"grad_norm": 0.47417679268203056,
"learning_rate": 0.0003951493881865018,
"loss": 0.455,
"step": 2800
},
{
"epoch": 0.0774330272238222,
"grad_norm": 0.467248267974456,
"learning_rate": 0.0003949596169033084,
"loss": 0.436,
"step": 2850
},
{
"epoch": 0.07879150138564364,
"grad_norm": 0.2574312755334094,
"learning_rate": 0.0003947662516237714,
"loss": 0.4263,
"step": 2900
},
{
"epoch": 0.08014997554746509,
"grad_norm": 0.40465633762609327,
"learning_rate": 0.0003945692959124962,
"loss": 0.4275,
"step": 2950
},
{
"epoch": 0.08150844970928653,
"grad_norm": 0.3105964962014554,
"learning_rate": 0.0003943687534002764,
"loss": 0.4063,
"step": 3000
},
{
"epoch": 0.08286692387110797,
"grad_norm": 0.36079130494736616,
"learning_rate": 0.00039416462778402644,
"loss": 0.4291,
"step": 3050
},
{
"epoch": 0.08422539803292942,
"grad_norm": 0.27991323241801247,
"learning_rate": 0.0003939569228267139,
"loss": 0.4294,
"step": 3100
},
{
"epoch": 0.08558387219475086,
"grad_norm": 0.28381836513166353,
"learning_rate": 0.00039374564235729017,
"loss": 0.4198,
"step": 3150
},
{
"epoch": 0.0869423463565723,
"grad_norm": 0.3299204189197327,
"learning_rate": 0.00039353079027061935,
"loss": 0.4103,
"step": 3200
},
{
"epoch": 0.08830082051839375,
"grad_norm": 0.326311631143589,
"learning_rate": 0.0003933123705274068,
"loss": 0.4297,
"step": 3250
},
{
"epoch": 0.08965929468021518,
"grad_norm": 0.3302186726703939,
"learning_rate": 0.0003930903871541262,
"loss": 0.4129,
"step": 3300
},
{
"epoch": 0.09101776884203662,
"grad_norm": 0.24354600292418105,
"learning_rate": 0.00039286484424294534,
"loss": 0.4178,
"step": 3350
},
{
"epoch": 0.09237624300385806,
"grad_norm": 0.33427871787687957,
"learning_rate": 0.00039263574595165007,
"loss": 0.4229,
"step": 3400
},
{
"epoch": 0.09373471716567951,
"grad_norm": 0.3922838514063193,
"learning_rate": 0.00039240309650356874,
"loss": 0.416,
"step": 3450
},
{
"epoch": 0.09509319132750095,
"grad_norm": 0.37066104429189983,
"learning_rate": 0.0003921669001874933,
"loss": 0.4359,
"step": 3500
},
{
"epoch": 0.0964516654893224,
"grad_norm": 0.29257797493644805,
"learning_rate": 0.0003919271613576008,
"loss": 0.4286,
"step": 3550
},
{
"epoch": 0.09781013965114384,
"grad_norm": 0.45431531124773644,
"learning_rate": 0.0003916838844333732,
"loss": 0.4291,
"step": 3600
},
{
"epoch": 0.09916861381296528,
"grad_norm": 0.31092097963579685,
"learning_rate": 0.0003914370738995154,
"loss": 0.431,
"step": 3650
},
{
"epoch": 0.10052708797478672,
"grad_norm": 0.2895204897106821,
"learning_rate": 0.00039118673430587307,
"loss": 0.4372,
"step": 3700
},
{
"epoch": 0.10188556213660817,
"grad_norm": 0.2576792476027214,
"learning_rate": 0.0003909328702673485,
"loss": 0.4527,
"step": 3750
},
{
"epoch": 0.1032440362984296,
"grad_norm": 0.30963702592618436,
"learning_rate": 0.0003906754864638156,
"loss": 0.4121,
"step": 3800
},
{
"epoch": 0.10460251046025104,
"grad_norm": 0.6475657508703404,
"learning_rate": 0.0003904145876400337,
"loss": 0.4224,
"step": 3850
},
{
"epoch": 0.10596098462207248,
"grad_norm": 0.3256994801441109,
"learning_rate": 0.00039015017860555984,
"loss": 0.4363,
"step": 3900
},
{
"epoch": 0.10731945878389393,
"grad_norm": 0.3471706882794347,
"learning_rate": 0.0003898822642346604,
"loss": 0.4252,
"step": 3950
},
{
"epoch": 0.10867793294571537,
"grad_norm": 0.29742285163537485,
"learning_rate": 0.00038961084946622114,
"loss": 0.41,
"step": 4000
},
{
"epoch": 0.11003640710753682,
"grad_norm": 0.42367795916225637,
"learning_rate": 0.0003893359393036561,
"loss": 0.4047,
"step": 4050
},
{
"epoch": 0.11139488126935826,
"grad_norm": 0.3766799857882688,
"learning_rate": 0.0003890575388148154,
"loss": 0.4142,
"step": 4100
},
{
"epoch": 0.1127533554311797,
"grad_norm": 0.4163460710146468,
"learning_rate": 0.00038877565313189184,
"loss": 0.4467,
"step": 4150
},
{
"epoch": 0.11411182959300115,
"grad_norm": 0.2826407429662945,
"learning_rate": 0.00038849028745132627,
"loss": 0.4149,
"step": 4200
},
{
"epoch": 0.11547030375482259,
"grad_norm": 0.3265965959677555,
"learning_rate": 0.0003882014470337117,
"loss": 0.4358,
"step": 4250
},
{
"epoch": 0.11682877791664402,
"grad_norm": 0.24777830179313484,
"learning_rate": 0.00038790913720369657,
"loss": 0.4012,
"step": 4300
},
{
"epoch": 0.11818725207846546,
"grad_norm": 0.2915403708081659,
"learning_rate": 0.00038761336334988634,
"loss": 0.4069,
"step": 4350
},
{
"epoch": 0.1195457262402869,
"grad_norm": 0.3326202807353683,
"learning_rate": 0.00038731413092474423,
"loss": 0.3902,
"step": 4400
},
{
"epoch": 0.12090420040210835,
"grad_norm": 0.3965527555219645,
"learning_rate": 0.00038701144544449085,
"loss": 0.3894,
"step": 4450
},
{
"epoch": 0.1222626745639298,
"grad_norm": 0.36617448279834447,
"learning_rate": 0.0003867053124890022,
"loss": 0.3993,
"step": 4500
},
{
"epoch": 0.12362114872575124,
"grad_norm": 0.2978802526091461,
"learning_rate": 0.0003863957377017073,
"loss": 0.3934,
"step": 4550
},
{
"epoch": 0.12497962288757268,
"grad_norm": 0.3199935306648141,
"learning_rate": 0.0003860827267894834,
"loss": 0.4015,
"step": 4600
},
{
"epoch": 0.1263380970493941,
"grad_norm": 0.28870094415921566,
"learning_rate": 0.00038576628552255173,
"loss": 0.4242,
"step": 4650
},
{
"epoch": 0.12769657121121555,
"grad_norm": 0.3368662682662353,
"learning_rate": 0.00038544641973437026,
"loss": 0.4078,
"step": 4700
},
{
"epoch": 0.129055045373037,
"grad_norm": 0.34171191785593347,
"learning_rate": 0.0003851231353215267,
"loss": 0.4184,
"step": 4750
},
{
"epoch": 0.13041351953485844,
"grad_norm": 0.3456350837751831,
"learning_rate": 0.00038479643824362956,
"loss": 0.4011,
"step": 4800
},
{
"epoch": 0.13177199369667988,
"grad_norm": 0.44062561750629825,
"learning_rate": 0.00038446633452319845,
"loss": 0.4179,
"step": 4850
},
{
"epoch": 0.13313046785850133,
"grad_norm": 0.30494197906676423,
"learning_rate": 0.00038413283024555284,
"loss": 0.3987,
"step": 4900
},
{
"epoch": 0.13448894202032277,
"grad_norm": 0.26281770974778434,
"learning_rate": 0.00038379593155870006,
"loss": 0.3745,
"step": 4950
},
{
"epoch": 0.13584741618214421,
"grad_norm": 0.33576603130586835,
"learning_rate": 0.00038345564467322197,
"loss": 0.3981,
"step": 5000
},
{
"epoch": 0.13720589034396566,
"grad_norm": 0.3395300651756381,
"learning_rate": 0.00038311197586216023,
"loss": 0.3908,
"step": 5050
},
{
"epoch": 0.1385643645057871,
"grad_norm": 0.3641141735123601,
"learning_rate": 0.0003827649314609011,
"loss": 0.4156,
"step": 5100
},
{
"epoch": 0.13992283866760855,
"grad_norm": 0.3880481801495523,
"learning_rate": 0.00038241451786705824,
"loss": 0.4225,
"step": 5150
},
{
"epoch": 0.14128131282943,
"grad_norm": 0.33587356117829287,
"learning_rate": 0.0003820607415403548,
"loss": 0.4322,
"step": 5200
},
{
"epoch": 0.14263978699125143,
"grad_norm": 0.2651951238410833,
"learning_rate": 0.0003817036090025046,
"loss": 0.3882,
"step": 5250
},
{
"epoch": 0.14399826115307288,
"grad_norm": 0.3108835459594419,
"learning_rate": 0.0003813431268370919,
"loss": 0.3962,
"step": 5300
},
{
"epoch": 0.14535673531489432,
"grad_norm": 0.5822321494392535,
"learning_rate": 0.0003809793016894496,
"loss": 0.4092,
"step": 5350
},
{
"epoch": 0.14671520947671576,
"grad_norm": 0.37563297659114,
"learning_rate": 0.0003806121402665372,
"loss": 0.4168,
"step": 5400
},
{
"epoch": 0.1480736836385372,
"grad_norm": 0.3315292653141971,
"learning_rate": 0.00038024164933681703,
"loss": 0.4094,
"step": 5450
},
{
"epoch": 0.14943215780035865,
"grad_norm": 0.3989400802203129,
"learning_rate": 0.00037986783573012935,
"loss": 0.4068,
"step": 5500
},
{
"epoch": 0.15079063196218007,
"grad_norm": 0.3194388411256492,
"learning_rate": 0.0003794907063375666,
"loss": 0.4003,
"step": 5550
},
{
"epoch": 0.1521491061240015,
"grad_norm": 0.30240424166641394,
"learning_rate": 0.00037911026811134616,
"loss": 0.407,
"step": 5600
},
{
"epoch": 0.15350758028582295,
"grad_norm": 0.351737936530188,
"learning_rate": 0.0003787265280646825,
"loss": 0.4107,
"step": 5650
},
{
"epoch": 0.1548660544476444,
"grad_norm": 0.3836451635236239,
"learning_rate": 0.0003783394932716577,
"loss": 0.3999,
"step": 5700
},
{
"epoch": 0.15622452860946584,
"grad_norm": 0.25767700494067813,
"learning_rate": 0.0003779491708670909,
"loss": 0.388,
"step": 5750
},
{
"epoch": 0.15758300277128728,
"grad_norm": 0.35195866802663683,
"learning_rate": 0.00037755556804640723,
"loss": 0.3986,
"step": 5800
},
{
"epoch": 0.15894147693310873,
"grad_norm": 0.37789059875509895,
"learning_rate": 0.00037715869206550467,
"loss": 0.4124,
"step": 5850
},
{
"epoch": 0.16029995109493017,
"grad_norm": 0.29714505650099465,
"learning_rate": 0.0003767585502406204,
"loss": 0.382,
"step": 5900
},
{
"epoch": 0.16165842525675161,
"grad_norm": 0.36569606019843054,
"learning_rate": 0.0003763551499481964,
"loss": 0.4091,
"step": 5950
},
{
"epoch": 0.16301689941857306,
"grad_norm": 0.5124251442347727,
"learning_rate": 0.0003759484986247426,
"loss": 0.3957,
"step": 6000
},
{
"epoch": 0.1643753735803945,
"grad_norm": 0.42061839044447147,
"learning_rate": 0.0003755386037667007,
"loss": 0.3939,
"step": 6050
},
{
"epoch": 0.16573384774221595,
"grad_norm": 0.278996914346875,
"learning_rate": 0.0003751254729303053,
"loss": 0.4171,
"step": 6100
},
{
"epoch": 0.1670923219040374,
"grad_norm": 0.22931110142699168,
"learning_rate": 0.0003747091137314451,
"loss": 0.4037,
"step": 6150
},
{
"epoch": 0.16845079606585883,
"grad_norm": 0.4632003674215028,
"learning_rate": 0.00037428953384552197,
"loss": 0.3856,
"step": 6200
},
{
"epoch": 0.16980927022768028,
"grad_norm": 0.3456285538182738,
"learning_rate": 0.00037386674100730986,
"loss": 0.3887,
"step": 6250
},
{
"epoch": 0.17116774438950172,
"grad_norm": 0.4299448792360789,
"learning_rate": 0.0003734407430108124,
"loss": 0.3802,
"step": 6300
},
{
"epoch": 0.17252621855132316,
"grad_norm": 0.26911515847901823,
"learning_rate": 0.0003730115477091185,
"loss": 0.3906,
"step": 6350
},
{
"epoch": 0.1738846927131446,
"grad_norm": 0.24894262795042146,
"learning_rate": 0.00037257916301425823,
"loss": 0.3743,
"step": 6400
},
{
"epoch": 0.17524316687496605,
"grad_norm": 0.36696930956387547,
"learning_rate": 0.00037214359689705676,
"loss": 0.3977,
"step": 6450
},
{
"epoch": 0.1766016410367875,
"grad_norm": 0.4978113756154764,
"learning_rate": 0.0003717048573869873,
"loss": 0.3782,
"step": 6500
},
{
"epoch": 0.1779601151986089,
"grad_norm": 0.3180001579838143,
"learning_rate": 0.00037126295257202324,
"loss": 0.3975,
"step": 6550
},
{
"epoch": 0.17931858936043035,
"grad_norm": 0.408582540741261,
"learning_rate": 0.0003708178905984891,
"loss": 0.3763,
"step": 6600
},
{
"epoch": 0.1806770635222518,
"grad_norm": 0.27659616140789006,
"learning_rate": 0.00037036967967091005,
"loss": 0.4013,
"step": 6650
},
{
"epoch": 0.18203553768407324,
"grad_norm": 0.16425635963494883,
"learning_rate": 0.00036991832805186107,
"loss": 0.3865,
"step": 6700
},
{
"epoch": 0.18339401184589468,
"grad_norm": 0.37646449133193777,
"learning_rate": 0.00036946384406181425,
"loss": 0.3892,
"step": 6750
},
{
"epoch": 0.18475248600771613,
"grad_norm": 0.3427684491705063,
"learning_rate": 0.0003690062360789858,
"loss": 0.3969,
"step": 6800
},
{
"epoch": 0.18611096016953757,
"grad_norm": 0.392156716422773,
"learning_rate": 0.0003685455125391811,
"loss": 0.3709,
"step": 6850
},
{
"epoch": 0.18746943433135901,
"grad_norm": 0.3626113579097081,
"learning_rate": 0.0003680816819356398,
"loss": 0.3929,
"step": 6900
},
{
"epoch": 0.18882790849318046,
"grad_norm": 0.3652111299862884,
"learning_rate": 0.00036761475281887863,
"loss": 0.3941,
"step": 6950
},
{
"epoch": 0.1901863826550019,
"grad_norm": 0.3038051956143636,
"learning_rate": 0.0003671541644021072,
"loss": 0.4019,
"step": 7000
},
{
"epoch": 0.19154485681682334,
"grad_norm": 0.34844140332875567,
"learning_rate": 0.00036668112567831633,
"loss": 0.3666,
"step": 7050
},
{
"epoch": 0.1929033309786448,
"grad_norm": 0.31453187134210386,
"learning_rate": 0.0003662050142599555,
"loss": 0.4062,
"step": 7100
},
{
"epoch": 0.19426180514046623,
"grad_norm": 0.29627297234275446,
"learning_rate": 0.00036572583892393305,
"loss": 0.3807,
"step": 7150
},
{
"epoch": 0.19562027930228768,
"grad_norm": 0.37160716135610367,
"learning_rate": 0.0003652436085036393,
"loss": 0.3936,
"step": 7200
},
{
"epoch": 0.19697875346410912,
"grad_norm": 0.3178371792735983,
"learning_rate": 0.0003647583318887839,
"loss": 0.3942,
"step": 7250
},
{
"epoch": 0.19833722762593056,
"grad_norm": 0.36580162938243826,
"learning_rate": 0.0003642700180252315,
"loss": 0.3932,
"step": 7300
},
{
"epoch": 0.199695701787752,
"grad_norm": 0.2503536243665017,
"learning_rate": 0.0003637786759148375,
"loss": 0.3835,
"step": 7350
},
{
"epoch": 0.20105417594957345,
"grad_norm": 0.3422541205714727,
"learning_rate": 0.0003632942313704729,
"loss": 0.3869,
"step": 7400
},
{
"epoch": 0.2024126501113949,
"grad_norm": 0.2635673669764689,
"learning_rate": 0.00036279692010693837,
"loss": 0.374,
"step": 7450
},
{
"epoch": 0.20377112427321634,
"grad_norm": 0.32133691105349677,
"learning_rate": 0.0003622966077524861,
"loss": 0.3829,
"step": 7500
},
{
"epoch": 0.20512959843503775,
"grad_norm": 0.2963373700211773,
"learning_rate": 0.0003617933035301583,
"loss": 0.3784,
"step": 7550
},
{
"epoch": 0.2064880725968592,
"grad_norm": 0.34175259284676457,
"learning_rate": 0.000361287016718151,
"loss": 0.3634,
"step": 7600
},
{
"epoch": 0.20784654675868064,
"grad_norm": 0.21695916175140184,
"learning_rate": 0.0003607777566496428,
"loss": 0.3913,
"step": 7650
},
{
"epoch": 0.20920502092050208,
"grad_norm": 0.38835064397473545,
"learning_rate": 0.00036027580617629013,
"loss": 0.3937,
"step": 7700
},
{
"epoch": 0.21056349508232353,
"grad_norm": 0.33577145967062894,
"learning_rate": 0.00035976068680901367,
"loss": 0.4041,
"step": 7750
},
{
"epoch": 0.21192196924414497,
"grad_norm": 0.2829507495162838,
"learning_rate": 0.0003592426223224691,
"loss": 0.3885,
"step": 7800
},
{
"epoch": 0.2132804434059664,
"grad_norm": 0.30282305495180506,
"learning_rate": 0.00035872162226695156,
"loss": 0.425,
"step": 7850
},
{
"epoch": 0.21463891756778786,
"grad_norm": 0.26572704020691806,
"learning_rate": 0.000358197696246872,
"loss": 0.3719,
"step": 7900
},
{
"epoch": 0.2159973917296093,
"grad_norm": 0.4362525108778928,
"learning_rate": 0.0003576708539205804,
"loss": 0.3751,
"step": 7950
},
{
"epoch": 0.21735586589143074,
"grad_norm": 0.25828364696271605,
"learning_rate": 0.0003571411050001875,
"loss": 0.3863,
"step": 8000
},
{
"epoch": 0.2187143400532522,
"grad_norm": 0.3678501770422065,
"learning_rate": 0.00035660845925138585,
"loss": 0.3931,
"step": 8050
},
{
"epoch": 0.22007281421507363,
"grad_norm": 0.27819454488467,
"learning_rate": 0.00035607292649326983,
"loss": 0.3633,
"step": 8100
},
{
"epoch": 0.22143128837689507,
"grad_norm": 0.32104809246529153,
"learning_rate": 0.00035553451659815457,
"loss": 0.3914,
"step": 8150
},
{
"epoch": 0.22278976253871652,
"grad_norm": 0.32593288782577173,
"learning_rate": 0.000354993239491394,
"loss": 0.3721,
"step": 8200
},
{
"epoch": 0.22414823670053796,
"grad_norm": 0.2740573766851601,
"learning_rate": 0.00035444910515119776,
"loss": 0.3725,
"step": 8250
},
{
"epoch": 0.2255067108623594,
"grad_norm": 0.6372146103791719,
"learning_rate": 0.00035390212360844744,
"loss": 0.3786,
"step": 8300
},
{
"epoch": 0.22686518502418085,
"grad_norm": 0.3847972604563355,
"learning_rate": 0.00035335230494651165,
"loss": 0.3807,
"step": 8350
},
{
"epoch": 0.2282236591860023,
"grad_norm": 0.3297830679777594,
"learning_rate": 0.00035279965930105987,
"loss": 0.3757,
"step": 8400
},
{
"epoch": 0.22958213334782374,
"grad_norm": 0.27775568777577214,
"learning_rate": 0.00035224419685987593,
"loss": 0.3796,
"step": 8450
},
{
"epoch": 0.23094060750964518,
"grad_norm": 0.2985445199508582,
"learning_rate": 0.0003516859278626702,
"loss": 0.385,
"step": 8500
},
{
"epoch": 0.2322990816714666,
"grad_norm": 0.2822498403343061,
"learning_rate": 0.00035112486260089026,
"loss": 0.3654,
"step": 8550
},
{
"epoch": 0.23365755583328804,
"grad_norm": 0.29173659614651143,
"learning_rate": 0.0003505610114175323,
"loss": 0.3693,
"step": 8600
},
{
"epoch": 0.23501602999510948,
"grad_norm": 0.37316571053101977,
"learning_rate": 0.00034999438470694903,
"loss": 0.3624,
"step": 8650
},
{
"epoch": 0.23637450415693093,
"grad_norm": 0.2991870149215513,
"learning_rate": 0.0003494249929146593,
"loss": 0.3944,
"step": 8700
},
{
"epoch": 0.23773297831875237,
"grad_norm": 0.2626766731044403,
"learning_rate": 0.000348852846537155,
"loss": 0.3562,
"step": 8750
},
{
"epoch": 0.2390914524805738,
"grad_norm": 0.3742270372557048,
"learning_rate": 0.0003482779561217074,
"loss": 0.3737,
"step": 8800
},
{
"epoch": 0.24044992664239526,
"grad_norm": 0.33975911074890713,
"learning_rate": 0.000347700332266173,
"loss": 0.3673,
"step": 8850
},
{
"epoch": 0.2418084008042167,
"grad_norm": 0.24493733544746576,
"learning_rate": 0.00034711998561879823,
"loss": 0.3863,
"step": 8900
},
{
"epoch": 0.24316687496603814,
"grad_norm": 0.24367293507823867,
"learning_rate": 0.00034653692687802295,
"loss": 0.3597,
"step": 8950
},
{
"epoch": 0.2445253491278596,
"grad_norm": 0.4992858322232737,
"learning_rate": 0.0003459511667922831,
"loss": 0.3759,
"step": 9000
},
{
"epoch": 0.24588382328968103,
"grad_norm": 0.34074243370950325,
"learning_rate": 0.000345362716159813,
"loss": 0.3704,
"step": 9050
},
{
"epoch": 0.24724229745150247,
"grad_norm": 0.37589710424283923,
"learning_rate": 0.0003447715858284458,
"loss": 0.3605,
"step": 9100
},
{
"epoch": 0.24860077161332392,
"grad_norm": 0.2954917370820438,
"learning_rate": 0.00034417778669541414,
"loss": 0.3619,
"step": 9150
},
{
"epoch": 0.24995924577514536,
"grad_norm": 0.2682683326827451,
"learning_rate": 0.00034358132970714833,
"loss": 0.3548,
"step": 9200
},
{
"epoch": 0.2513177199369668,
"grad_norm": 0.3334186903450503,
"learning_rate": 0.00034298222585907556,
"loss": 0.3582,
"step": 9250
},
{
"epoch": 0.2526761940987882,
"grad_norm": 0.3037548456565183,
"learning_rate": 0.0003423804861954165,
"loss": 0.3598,
"step": 9300
},
{
"epoch": 0.25403466826060966,
"grad_norm": 0.3415172861536472,
"learning_rate": 0.00034177612180898186,
"loss": 0.3596,
"step": 9350
},
{
"epoch": 0.2553931424224311,
"grad_norm": 0.2778208976878839,
"learning_rate": 0.0003411691438409683,
"loss": 0.3557,
"step": 9400
},
{
"epoch": 0.25675161658425255,
"grad_norm": 0.2722662895108223,
"learning_rate": 0.0003405595634807524,
"loss": 0.3568,
"step": 9450
},
{
"epoch": 0.258110090746074,
"grad_norm": 0.2470305006609605,
"learning_rate": 0.00033994739196568485,
"loss": 0.3693,
"step": 9500
},
{
"epoch": 0.25946856490789544,
"grad_norm": 0.3204976945527186,
"learning_rate": 0.00033933264058088323,
"loss": 0.3744,
"step": 9550
},
{
"epoch": 0.2608270390697169,
"grad_norm": 0.29419054157347835,
"learning_rate": 0.0003387153206590238,
"loss": 0.3578,
"step": 9600
},
{
"epoch": 0.2621855132315383,
"grad_norm": 0.285418283786098,
"learning_rate": 0.0003380954435801327,
"loss": 0.3666,
"step": 9650
},
{
"epoch": 0.26354398739335977,
"grad_norm": 0.40228260917678826,
"learning_rate": 0.0003374730207713763,
"loss": 0.3642,
"step": 9700
},
{
"epoch": 0.2649024615551812,
"grad_norm": 0.30226961110995426,
"learning_rate": 0.0003368480637068501,
"loss": 0.3955,
"step": 9750
},
{
"epoch": 0.26626093571700266,
"grad_norm": 0.445566495529195,
"learning_rate": 0.00033622058390736785,
"loss": 0.3756,
"step": 9800
},
{
"epoch": 0.2676194098788241,
"grad_norm": 0.387905313007887,
"learning_rate": 0.00033559059294024864,
"loss": 0.3657,
"step": 9850
},
{
"epoch": 0.26897788404064554,
"grad_norm": 0.22352584167074716,
"learning_rate": 0.00033495810241910385,
"loss": 0.3452,
"step": 9900
},
{
"epoch": 0.270336358202467,
"grad_norm": 0.2556859512831143,
"learning_rate": 0.00033432312400362305,
"loss": 0.3463,
"step": 9950
},
{
"epoch": 0.27169483236428843,
"grad_norm": 0.5717326212718582,
"learning_rate": 0.00033368566939935925,
"loss": 0.3731,
"step": 10000
},
{
"epoch": 0.2730533065261099,
"grad_norm": 0.32255486027652513,
"learning_rate": 0.0003330457503575127,
"loss": 0.3698,
"step": 10050
},
{
"epoch": 0.2744117806879313,
"grad_norm": 0.25946616692420554,
"learning_rate": 0.0003324033786747145,
"loss": 0.3637,
"step": 10100
},
{
"epoch": 0.27577025484975276,
"grad_norm": 0.3530893683862247,
"learning_rate": 0.0003317585661928094,
"loss": 0.3646,
"step": 10150
},
{
"epoch": 0.2771287290115742,
"grad_norm": 0.2853492397892913,
"learning_rate": 0.000331111324798637,
"loss": 0.3295,
"step": 10200
},
{
"epoch": 0.27848720317339565,
"grad_norm": 0.19156693197511587,
"learning_rate": 0.0003304616664238127,
"loss": 0.359,
"step": 10250
},
{
"epoch": 0.2798456773352171,
"grad_norm": 0.26150890027393986,
"learning_rate": 0.00032980960304450834,
"loss": 0.3665,
"step": 10300
},
{
"epoch": 0.28120415149703853,
"grad_norm": 0.3656668619278649,
"learning_rate": 0.00032915514668123056,
"loss": 0.3498,
"step": 10350
},
{
"epoch": 0.28256262565886,
"grad_norm": 0.36465329584026973,
"learning_rate": 0.00032849830939859977,
"loss": 0.3722,
"step": 10400
},
{
"epoch": 0.2839210998206814,
"grad_norm": 0.28300896811439313,
"learning_rate": 0.00032783910330512776,
"loss": 0.3583,
"step": 10450
},
{
"epoch": 0.28527957398250287,
"grad_norm": 0.2470361242697161,
"learning_rate": 0.000327177540552994,
"loss": 0.3462,
"step": 10500
},
{
"epoch": 0.2866380481443243,
"grad_norm": 0.36918740507242426,
"learning_rate": 0.0003265136333378223,
"loss": 0.3699,
"step": 10550
},
{
"epoch": 0.28799652230614575,
"grad_norm": 0.32934977838919777,
"learning_rate": 0.0003258473938984554,
"loss": 0.3625,
"step": 10600
},
{
"epoch": 0.2893549964679672,
"grad_norm": 0.2087576531023101,
"learning_rate": 0.0003251788345167296,
"loss": 0.3568,
"step": 10650
},
{
"epoch": 0.29071347062978864,
"grad_norm": 0.39857663501798557,
"learning_rate": 0.00032450796751724837,
"loss": 0.3591,
"step": 10700
},
{
"epoch": 0.2920719447916101,
"grad_norm": 0.32871619749282505,
"learning_rate": 0.00032383480526715526,
"loss": 0.3603,
"step": 10750
},
{
"epoch": 0.2934304189534315,
"grad_norm": 0.3062057976969262,
"learning_rate": 0.00032315936017590554,
"loss": 0.3575,
"step": 10800
},
{
"epoch": 0.29478889311525297,
"grad_norm": 0.35676299616191043,
"learning_rate": 0.0003224816446950378,
"loss": 0.3406,
"step": 10850
},
{
"epoch": 0.2961473672770744,
"grad_norm": 0.268494161462533,
"learning_rate": 0.00032180167131794425,
"loss": 0.3356,
"step": 10900
},
{
"epoch": 0.29750584143889586,
"grad_norm": 0.39304660551244835,
"learning_rate": 0.0003211194525796404,
"loss": 0.3681,
"step": 10950
},
{
"epoch": 0.2988643156007173,
"grad_norm": 0.36242243481768954,
"learning_rate": 0.00032043500105653414,
"loss": 0.3624,
"step": 11000
},
{
"epoch": 0.3002227897625387,
"grad_norm": 0.32191025299969356,
"learning_rate": 0.0003197483293661937,
"loss": 0.3639,
"step": 11050
},
{
"epoch": 0.30158126392436013,
"grad_norm": 0.3819533916481645,
"learning_rate": 0.0003190594501671151,
"loss": 0.3639,
"step": 11100
},
{
"epoch": 0.3029397380861816,
"grad_norm": 0.3051700734949664,
"learning_rate": 0.000318368376158489,
"loss": 0.3495,
"step": 11150
},
{
"epoch": 0.304298212248003,
"grad_norm": 0.25353257677490404,
"learning_rate": 0.00031768900650322744,
"loss": 0.3424,
"step": 11200
},
{
"epoch": 0.30565668640982446,
"grad_norm": 0.2971513332547502,
"learning_rate": 0.000316993624394983,
"loss": 0.3465,
"step": 11250
},
{
"epoch": 0.3070151605716459,
"grad_norm": 0.3393454172568527,
"learning_rate": 0.00031629608555979686,
"loss": 0.357,
"step": 11300
},
{
"epoch": 0.30837363473346735,
"grad_norm": 0.3079000714041467,
"learning_rate": 0.0003155964028564964,
"loss": 0.3315,
"step": 11350
},
{
"epoch": 0.3097321088952888,
"grad_norm": 0.236457076827118,
"learning_rate": 0.00031489458918342993,
"loss": 0.3586,
"step": 11400
},
{
"epoch": 0.31109058305711024,
"grad_norm": 0.37187670656153765,
"learning_rate": 0.0003141906574782295,
"loss": 0.3479,
"step": 11450
},
{
"epoch": 0.3124490572189317,
"grad_norm": 0.2858744824646288,
"learning_rate": 0.0003134846207175722,
"loss": 0.359,
"step": 11500
},
{
"epoch": 0.3138075313807531,
"grad_norm": 0.29954433740207526,
"learning_rate": 0.00031277649191694063,
"loss": 0.3466,
"step": 11550
},
{
"epoch": 0.31516600554257457,
"grad_norm": 0.25530263584194796,
"learning_rate": 0.0003120662841303836,
"loss": 0.3488,
"step": 11600
},
{
"epoch": 0.316524479704396,
"grad_norm": 0.22413446350946586,
"learning_rate": 0.0003113540104502747,
"loss": 0.3471,
"step": 11650
},
{
"epoch": 0.31788295386621745,
"grad_norm": 0.40702625283242805,
"learning_rate": 0.000310639684007072,
"loss": 0.3382,
"step": 11700
},
{
"epoch": 0.3192414280280389,
"grad_norm": 0.2866280004893114,
"learning_rate": 0.0003099233179690746,
"loss": 0.3779,
"step": 11750
},
{
"epoch": 0.32059990218986034,
"grad_norm": 0.313655190983661,
"learning_rate": 0.0003092049255421813,
"loss": 0.3646,
"step": 11800
},
{
"epoch": 0.3219583763516818,
"grad_norm": 0.34970496506197146,
"learning_rate": 0.00030848451996964615,
"loss": 0.3628,
"step": 11850
},
{
"epoch": 0.32331685051350323,
"grad_norm": 0.36130996602692567,
"learning_rate": 0.00030776211453183475,
"loss": 0.3608,
"step": 11900
},
{
"epoch": 0.3246753246753247,
"grad_norm": 0.22850628919525512,
"learning_rate": 0.00030703772254597945,
"loss": 0.326,
"step": 11950
},
{
"epoch": 0.3260337988371461,
"grad_norm": 0.3620511895369416,
"learning_rate": 0.00030631135736593364,
"loss": 0.349,
"step": 12000
},
{
"epoch": 0.32739227299896756,
"grad_norm": 0.2122923442045741,
"learning_rate": 0.0003055830323819257,
"loss": 0.3734,
"step": 12050
},
{
"epoch": 0.328750747160789,
"grad_norm": 0.24737840068319314,
"learning_rate": 0.00030485276102031235,
"loss": 0.358,
"step": 12100
},
{
"epoch": 0.33010922132261045,
"grad_norm": 0.3610838024240164,
"learning_rate": 0.0003041205567433305,
"loss": 0.3513,
"step": 12150
},
{
"epoch": 0.3314676954844319,
"grad_norm": 0.33939684182516894,
"learning_rate": 0.0003033864330488499,
"loss": 0.3555,
"step": 12200
},
{
"epoch": 0.33282616964625333,
"grad_norm": 0.2666873772787006,
"learning_rate": 0.00030265040347012397,
"loss": 0.3469,
"step": 12250
},
{
"epoch": 0.3341846438080748,
"grad_norm": 0.21914168729339542,
"learning_rate": 0.00030191248157554,
"loss": 0.3323,
"step": 12300
},
{
"epoch": 0.3355431179698962,
"grad_norm": 0.3499432909434212,
"learning_rate": 0.0003011726809683694,
"loss": 0.3321,
"step": 12350
},
{
"epoch": 0.33690159213171766,
"grad_norm": 0.2752315627002723,
"learning_rate": 0.0003004310152865169,
"loss": 0.366,
"step": 12400
},
{
"epoch": 0.3382600662935391,
"grad_norm": 0.3224312997036977,
"learning_rate": 0.0002996874982022692,
"loss": 0.3363,
"step": 12450
},
{
"epoch": 0.33961854045536055,
"grad_norm": 0.2614682577027786,
"learning_rate": 0.00029894214342204243,
"loss": 0.3364,
"step": 12500
},
{
"epoch": 0.340977014617182,
"grad_norm": 0.35386811908507626,
"learning_rate": 0.00029819496468613024,
"loss": 0.3468,
"step": 12550
},
{
"epoch": 0.34233548877900344,
"grad_norm": 0.3451776698004379,
"learning_rate": 0.00029744597576844995,
"loss": 0.3457,
"step": 12600
},
{
"epoch": 0.3436939629408249,
"grad_norm": 0.2075635034305044,
"learning_rate": 0.00029669519047628874,
"loss": 0.3217,
"step": 12650
},
{
"epoch": 0.3450524371026463,
"grad_norm": 0.35938341724916706,
"learning_rate": 0.0002959426226500493,
"loss": 0.3518,
"step": 12700
},
{
"epoch": 0.34641091126446777,
"grad_norm": 0.2028404175509972,
"learning_rate": 0.0002951882861629944,
"loss": 0.3464,
"step": 12750
},
{
"epoch": 0.3477693854262892,
"grad_norm": 0.3092038953376563,
"learning_rate": 0.00029443219492099153,
"loss": 0.3565,
"step": 12800
},
{
"epoch": 0.34912785958811066,
"grad_norm": 0.29068071721416333,
"learning_rate": 0.0002936743628622562,
"loss": 0.3315,
"step": 12850
},
{
"epoch": 0.3504863337499321,
"grad_norm": 0.20779330405236773,
"learning_rate": 0.0002929148039570951,
"loss": 0.3174,
"step": 12900
},
{
"epoch": 0.35184480791175354,
"grad_norm": 0.31923873979474604,
"learning_rate": 0.00029215353220764863,
"loss": 0.3441,
"step": 12950
},
{
"epoch": 0.353203282073575,
"grad_norm": 0.2745041226462606,
"learning_rate": 0.00029139056164763274,
"loss": 0.3467,
"step": 13000
},
{
"epoch": 0.35456175623539643,
"grad_norm": 0.4368395215278957,
"learning_rate": 0.0002906259063420803,
"loss": 0.3517,
"step": 13050
},
{
"epoch": 0.3559202303972178,
"grad_norm": 0.30792463599025904,
"learning_rate": 0.0002898595803870815,
"loss": 0.3442,
"step": 13100
},
{
"epoch": 0.35727870455903926,
"grad_norm": 0.3611952448865168,
"learning_rate": 0.0002890915979095244,
"loss": 0.3204,
"step": 13150
},
{
"epoch": 0.3586371787208607,
"grad_norm": 0.23056033787481225,
"learning_rate": 0.0002883219730668345,
"loss": 0.3239,
"step": 13200
},
{
"epoch": 0.35999565288268215,
"grad_norm": 0.2530394826085691,
"learning_rate": 0.00028755072004671314,
"loss": 0.3473,
"step": 13250
},
{
"epoch": 0.3613541270445036,
"grad_norm": 0.33962698046120804,
"learning_rate": 0.000286793326131175,
"loss": 0.3416,
"step": 13300
},
{
"epoch": 0.36271260120632504,
"grad_norm": 0.21053853436821962,
"learning_rate": 0.0002860188912935213,
"loss": 0.3278,
"step": 13350
},
{
"epoch": 0.3640710753681465,
"grad_norm": 0.3129818212559564,
"learning_rate": 0.00028524287073475416,
"loss": 0.3541,
"step": 13400
},
{
"epoch": 0.3654295495299679,
"grad_norm": 0.2699867150782398,
"learning_rate": 0.0002844652787604775,
"loss": 0.3403,
"step": 13450
},
{
"epoch": 0.36678802369178937,
"grad_norm": 0.28737860107629143,
"learning_rate": 0.00028368612970526357,
"loss": 0.3323,
"step": 13500
},
{
"epoch": 0.3681464978536108,
"grad_norm": 0.3515960746260734,
"learning_rate": 0.00028290543793238867,
"loss": 0.3293,
"step": 13550
},
{
"epoch": 0.36950497201543225,
"grad_norm": 0.20401533840321576,
"learning_rate": 0.0002821232178335684,
"loss": 0.3316,
"step": 13600
},
{
"epoch": 0.3708634461772537,
"grad_norm": 0.19995437318728543,
"learning_rate": 0.0002813551732516669,
"loss": 0.3427,
"step": 13650
},
{
"epoch": 0.37222192033907514,
"grad_norm": 0.2545451160615089,
"learning_rate": 0.00028056996963593105,
"loss": 0.3246,
"step": 13700
},
{
"epoch": 0.3735803945008966,
"grad_norm": 0.29065996361482416,
"learning_rate": 0.0002797832807475994,
"loss": 0.3377,
"step": 13750
},
{
"epoch": 0.37493886866271803,
"grad_norm": 0.3334762345639782,
"learning_rate": 0.00027899512108894186,
"loss": 0.3281,
"step": 13800
},
{
"epoch": 0.37629734282453947,
"grad_norm": 0.18363139112462235,
"learning_rate": 0.00027820550518934127,
"loss": 0.3498,
"step": 13850
},
{
"epoch": 0.3776558169863609,
"grad_norm": 0.303677922590966,
"learning_rate": 0.00027741444760502593,
"loss": 0.3282,
"step": 13900
},
{
"epoch": 0.37901429114818236,
"grad_norm": 0.33021307742532524,
"learning_rate": 0.0002766378265036753,
"loss": 0.3612,
"step": 13950
},
{
"epoch": 0.3803727653100038,
"grad_norm": 0.3855197948015209,
"learning_rate": 0.00027584395743117087,
"loss": 0.326,
"step": 14000
},
{
"epoch": 0.38173123947182525,
"grad_norm": 0.17305752786285836,
"learning_rate": 0.0002750486902080647,
"loss": 0.3306,
"step": 14050
},
{
"epoch": 0.3830897136336467,
"grad_norm": 0.3557889572340088,
"learning_rate": 0.0002742520394947646,
"loss": 0.3363,
"step": 14100
},
{
"epoch": 0.38444818779546813,
"grad_norm": 0.269254653829798,
"learning_rate": 0.0002734540199771824,
"loss": 0.3509,
"step": 14150
},
{
"epoch": 0.3858066619572896,
"grad_norm": 0.46153677475953025,
"learning_rate": 0.00027265464636646333,
"loss": 0.3423,
"step": 14200
},
{
"epoch": 0.387165136119111,
"grad_norm": 0.25450280604338793,
"learning_rate": 0.0002718539333987147,
"loss": 0.3344,
"step": 14250
},
{
"epoch": 0.38852361028093246,
"grad_norm": 0.24854855950361845,
"learning_rate": 0.00027105189583473416,
"loss": 0.317,
"step": 14300
},
{
"epoch": 0.3898820844427539,
"grad_norm": 0.25191512294105933,
"learning_rate": 0.00027024854845973797,
"loss": 0.3343,
"step": 14350
},
{
"epoch": 0.39124055860457535,
"grad_norm": 0.3399094367009323,
"learning_rate": 0.000269443906083088,
"loss": 0.3141,
"step": 14400
},
{
"epoch": 0.3925990327663968,
"grad_norm": 0.27297702861099216,
"learning_rate": 0.00026863798353801905,
"loss": 0.344,
"step": 14450
},
{
"epoch": 0.39395750692821824,
"grad_norm": 0.3089505317673794,
"learning_rate": 0.000267830795681365,
"loss": 0.3248,
"step": 14500
},
{
"epoch": 0.3953159810900397,
"grad_norm": 0.28407318632921835,
"learning_rate": 0.0002670223573932857,
"loss": 0.3218,
"step": 14550
},
{
"epoch": 0.3966744552518611,
"grad_norm": 0.27517856010825675,
"learning_rate": 0.0002662126835769916,
"loss": 0.3207,
"step": 14600
},
{
"epoch": 0.39803292941368257,
"grad_norm": 0.2209431864475645,
"learning_rate": 0.00026540178915847003,
"loss": 0.3213,
"step": 14650
},
{
"epoch": 0.399391403575504,
"grad_norm": 0.3012179785372981,
"learning_rate": 0.0002645896890862093,
"loss": 0.3031,
"step": 14700
},
{
"epoch": 0.40074987773732546,
"grad_norm": 0.35758174495742123,
"learning_rate": 0.0002637763983309235,
"loss": 0.3244,
"step": 14750
},
{
"epoch": 0.4021083518991469,
"grad_norm": 0.20197976836253828,
"learning_rate": 0.00026296193188527655,
"loss": 0.3211,
"step": 14800
},
{
"epoch": 0.40346682606096834,
"grad_norm": 0.2784592655459722,
"learning_rate": 0.0002621463047636057,
"loss": 0.3233,
"step": 14850
},
{
"epoch": 0.4048253002227898,
"grad_norm": 0.3125528717241462,
"learning_rate": 0.0002613295320016445,
"loss": 0.324,
"step": 14900
},
{
"epoch": 0.40618377438461123,
"grad_norm": 0.3316178107391592,
"learning_rate": 0.00026051162865624636,
"loss": 0.3358,
"step": 14950
},
{
"epoch": 0.4075422485464327,
"grad_norm": 0.18439640337971394,
"learning_rate": 0.00025969260980510605,
"loss": 0.3031,
"step": 15000
},
{
"epoch": 0.4089007227082541,
"grad_norm": 0.2963162351967641,
"learning_rate": 0.00025887249054648245,
"loss": 0.3276,
"step": 15050
},
{
"epoch": 0.4102591968700755,
"grad_norm": 0.22466036509634918,
"learning_rate": 0.00025805128599891994,
"loss": 0.3364,
"step": 15100
},
{
"epoch": 0.41161767103189695,
"grad_norm": 0.2956284294357639,
"learning_rate": 0.00025722901130096975,
"loss": 0.3314,
"step": 15150
},
{
"epoch": 0.4129761451937184,
"grad_norm": 0.36079018502753485,
"learning_rate": 0.00025642215844549676,
"loss": 0.3351,
"step": 15200
},
{
"epoch": 0.41433461935553983,
"grad_norm": 0.3005152688044544,
"learning_rate": 0.00025559780958847773,
"loss": 0.3202,
"step": 15250
},
{
"epoch": 0.4156930935173613,
"grad_norm": 0.33773262295043566,
"learning_rate": 0.00025477243580984904,
"loss": 0.3089,
"step": 15300
},
{
"epoch": 0.4170515676791827,
"grad_norm": 0.3045253275707874,
"learning_rate": 0.00025394605232501987,
"loss": 0.32,
"step": 15350
},
{
"epoch": 0.41841004184100417,
"grad_norm": 0.34532028635872886,
"learning_rate": 0.0002531186743680128,
"loss": 0.3449,
"step": 15400
},
{
"epoch": 0.4197685160028256,
"grad_norm": 0.14134553464927366,
"learning_rate": 0.0002522903171911834,
"loss": 0.3184,
"step": 15450
},
{
"epoch": 0.42112699016464705,
"grad_norm": 0.2761208223745771,
"learning_rate": 0.00025146099606493817,
"loss": 0.3133,
"step": 15500
},
{
"epoch": 0.4224854643264685,
"grad_norm": 0.26549068110573254,
"learning_rate": 0.0002506307262774542,
"loss": 0.3205,
"step": 15550
},
{
"epoch": 0.42384393848828994,
"grad_norm": 0.2773532590377829,
"learning_rate": 0.00024979952313439636,
"loss": 0.3064,
"step": 15600
},
{
"epoch": 0.4252024126501114,
"grad_norm": 0.3073493622335924,
"learning_rate": 0.0002489674019586356,
"loss": 0.3188,
"step": 15650
},
{
"epoch": 0.4265608868119328,
"grad_norm": 0.21684387112528378,
"learning_rate": 0.0002481343780899665,
"loss": 0.3198,
"step": 15700
},
{
"epoch": 0.42791936097375427,
"grad_norm": 0.25977297083107986,
"learning_rate": 0.00024730046688482436,
"loss": 0.3065,
"step": 15750
},
{
"epoch": 0.4292778351355757,
"grad_norm": 0.3308957326333168,
"learning_rate": 0.000246465683716002,
"loss": 0.3085,
"step": 15800
},
{
"epoch": 0.43063630929739716,
"grad_norm": 0.25944521132960924,
"learning_rate": 0.0002456300439723668,
"loss": 0.3136,
"step": 15850
},
{
"epoch": 0.4319947834592186,
"grad_norm": 0.22121128637476822,
"learning_rate": 0.0002447935630585764,
"loss": 0.322,
"step": 15900
},
{
"epoch": 0.43335325762104004,
"grad_norm": 0.32019002146360315,
"learning_rate": 0.0002439562563947953,
"loss": 0.3103,
"step": 15950
},
{
"epoch": 0.4347117317828615,
"grad_norm": 0.26761100791647713,
"learning_rate": 0.0002431181394164103,
"loss": 0.3114,
"step": 16000
},
{
"epoch": 0.43607020594468293,
"grad_norm": 0.22262870758692213,
"learning_rate": 0.00024227922757374582,
"loss": 0.3069,
"step": 16050
},
{
"epoch": 0.4374286801065044,
"grad_norm": 0.18940890843015826,
"learning_rate": 0.00024143953633177937,
"loss": 0.327,
"step": 16100
},
{
"epoch": 0.4387871542683258,
"grad_norm": 0.27459192854267717,
"learning_rate": 0.00024059908116985654,
"loss": 0.3183,
"step": 16150
},
{
"epoch": 0.44014562843014726,
"grad_norm": 0.36514373383887516,
"learning_rate": 0.00023975787758140525,
"loss": 0.2878,
"step": 16200
},
{
"epoch": 0.4415041025919687,
"grad_norm": 0.30714779342945764,
"learning_rate": 0.00023891594107365024,
"loss": 0.3173,
"step": 16250
},
{
"epoch": 0.44286257675379015,
"grad_norm": 0.24572160078772548,
"learning_rate": 0.0002380732871673276,
"loss": 0.3169,
"step": 16300
},
{
"epoch": 0.4442210509156116,
"grad_norm": 0.22451585676228034,
"learning_rate": 0.00023722993139639806,
"loss": 0.2982,
"step": 16350
},
{
"epoch": 0.44557952507743304,
"grad_norm": 0.4312837719351318,
"learning_rate": 0.000236436550903555,
"loss": 0.3126,
"step": 16400
},
{
"epoch": 0.4469379992392545,
"grad_norm": 0.23649698073314787,
"learning_rate": 0.00023559187786324523,
"loss": 0.3229,
"step": 16450
},
{
"epoch": 0.4482964734010759,
"grad_norm": 0.21885398793120167,
"learning_rate": 0.00023474654870203753,
"loss": 0.3066,
"step": 16500
},
{
"epoch": 0.44965494756289737,
"grad_norm": 0.2522766751448378,
"learning_rate": 0.00023390057900320987,
"loss": 0.3121,
"step": 16550
},
{
"epoch": 0.4510134217247188,
"grad_norm": 0.2023032182722522,
"learning_rate": 0.0002330539843618484,
"loss": 0.295,
"step": 16600
},
{
"epoch": 0.45237189588654025,
"grad_norm": 0.3093016288187825,
"learning_rate": 0.00023220678038455975,
"loss": 0.2962,
"step": 16650
},
{
"epoch": 0.4537303700483617,
"grad_norm": 0.2805332120341892,
"learning_rate": 0.00023135898268918323,
"loss": 0.313,
"step": 16700
},
{
"epoch": 0.45508884421018314,
"grad_norm": 0.25366173411593823,
"learning_rate": 0.00023051060690450337,
"loss": 0.308,
"step": 16750
},
{
"epoch": 0.4564473183720046,
"grad_norm": 0.2848859608687515,
"learning_rate": 0.00022966166866996134,
"loss": 0.2966,
"step": 16800
},
{
"epoch": 0.45780579253382603,
"grad_norm": 0.3400405221454168,
"learning_rate": 0.0002288121836353669,
"loss": 0.313,
"step": 16850
},
{
"epoch": 0.45916426669564747,
"grad_norm": 0.25178053552917457,
"learning_rate": 0.0002279621674606098,
"loss": 0.3008,
"step": 16900
},
{
"epoch": 0.4605227408574689,
"grad_norm": 0.2173042023564375,
"learning_rate": 0.00022711163581537106,
"loss": 0.3062,
"step": 16950
},
{
"epoch": 0.46188121501929036,
"grad_norm": 0.23983156392471572,
"learning_rate": 0.00022626060437883435,
"loss": 0.3055,
"step": 17000
},
{
"epoch": 0.4632396891811118,
"grad_norm": 0.21031049569805663,
"learning_rate": 0.00022540908883939668,
"loss": 0.311,
"step": 17050
},
{
"epoch": 0.4645981633429332,
"grad_norm": 0.1989362043985782,
"learning_rate": 0.00022455710489437927,
"loss": 0.3259,
"step": 17100
},
{
"epoch": 0.46595663750475463,
"grad_norm": 0.1900668237823236,
"learning_rate": 0.00022370466824973812,
"loss": 0.2797,
"step": 17150
},
{
"epoch": 0.4673151116665761,
"grad_norm": 0.20840494435322787,
"learning_rate": 0.00022285179461977483,
"loss": 0.3079,
"step": 17200
},
{
"epoch": 0.4686735858283975,
"grad_norm": 0.4022346673956682,
"learning_rate": 0.00022199849972684633,
"loss": 0.2958,
"step": 17250
},
{
"epoch": 0.47003205999021896,
"grad_norm": 0.3270990206921089,
"learning_rate": 0.0002211447993010755,
"loss": 0.3313,
"step": 17300
},
{
"epoch": 0.4713905341520404,
"grad_norm": 0.249803246986443,
"learning_rate": 0.00022029070908006096,
"loss": 0.3104,
"step": 17350
},
{
"epoch": 0.47274900831386185,
"grad_norm": 0.2813145656422356,
"learning_rate": 0.0002194362448085872,
"loss": 0.3039,
"step": 17400
},
{
"epoch": 0.4741074824756833,
"grad_norm": 0.20904103519051825,
"learning_rate": 0.00021858142223833395,
"loss": 0.3093,
"step": 17450
},
{
"epoch": 0.47546595663750474,
"grad_norm": 0.2476519540180904,
"learning_rate": 0.00021772625712758624,
"loss": 0.3133,
"step": 17500
},
{
"epoch": 0.4768244307993262,
"grad_norm": 0.2897735958185,
"learning_rate": 0.00021687076524094353,
"loss": 0.3184,
"step": 17550
},
{
"epoch": 0.4781829049611476,
"grad_norm": 0.36797022439353905,
"learning_rate": 0.0002160149623490293,
"loss": 0.2982,
"step": 17600
},
{
"epoch": 0.47954137912296907,
"grad_norm": 0.22151406862910683,
"learning_rate": 0.0002151588642282003,
"loss": 0.3031,
"step": 17650
},
{
"epoch": 0.4808998532847905,
"grad_norm": 0.24573689529627643,
"learning_rate": 0.00021430248666025561,
"loss": 0.2927,
"step": 17700
},
{
"epoch": 0.48225832744661196,
"grad_norm": 0.25110843175386494,
"learning_rate": 0.0002134458454321459,
"loss": 0.2984,
"step": 17750
},
{
"epoch": 0.4836168016084334,
"grad_norm": 0.26920027208505604,
"learning_rate": 0.00021258895633568238,
"loss": 0.2869,
"step": 17800
},
{
"epoch": 0.48497527577025484,
"grad_norm": 0.3111889899596438,
"learning_rate": 0.0002117318351672454,
"loss": 0.3215,
"step": 17850
},
{
"epoch": 0.4863337499320763,
"grad_norm": 0.20320042839557148,
"learning_rate": 0.00021087449772749347,
"loss": 0.3019,
"step": 17900
},
{
"epoch": 0.48769222409389773,
"grad_norm": 0.29026043340389285,
"learning_rate": 0.00021001695982107217,
"loss": 0.3087,
"step": 17950
},
{
"epoch": 0.4890506982557192,
"grad_norm": 0.26193168931031524,
"learning_rate": 0.00020915923725632244,
"loss": 0.3036,
"step": 18000
},
{
"epoch": 0.4904091724175406,
"grad_norm": 0.23673083795318206,
"learning_rate": 0.0002083013458449893,
"loss": 0.3111,
"step": 18050
},
{
"epoch": 0.49176764657936206,
"grad_norm": 0.2259659757224692,
"learning_rate": 0.00020744330140193046,
"loss": 0.2883,
"step": 18100
},
{
"epoch": 0.4931261207411835,
"grad_norm": 0.2902171908048496,
"learning_rate": 0.00020658511974482475,
"loss": 0.2898,
"step": 18150
},
{
"epoch": 0.49448459490300495,
"grad_norm": 0.31472212166057917,
"learning_rate": 0.0002057268166938803,
"loss": 0.3111,
"step": 18200
},
{
"epoch": 0.4958430690648264,
"grad_norm": 0.27417754560735935,
"learning_rate": 0.00020486840807154325,
"loss": 0.3013,
"step": 18250
},
{
"epoch": 0.49720154322664784,
"grad_norm": 0.24533216444780298,
"learning_rate": 0.0002040099097022059,
"loss": 0.3073,
"step": 18300
},
{
"epoch": 0.4985600173884693,
"grad_norm": 0.2597365406230817,
"learning_rate": 0.0002031513374119148,
"loss": 0.2918,
"step": 18350
},
{
"epoch": 0.4999184915502907,
"grad_norm": 0.23849823607914308,
"learning_rate": 0.00020229270702807952,
"loss": 0.3044,
"step": 18400
},
{
"epoch": 0.5012769657121121,
"grad_norm": 0.40233301575689023,
"learning_rate": 0.0002014340343791802,
"loss": 0.3086,
"step": 18450
},
{
"epoch": 0.5026354398739336,
"grad_norm": 0.24678497017149986,
"learning_rate": 0.00020057533529447647,
"loss": 0.2947,
"step": 18500
},
{
"epoch": 0.503993914035755,
"grad_norm": 0.18418790064404403,
"learning_rate": 0.000199716625603715,
"loss": 0.2802,
"step": 18550
},
{
"epoch": 0.5053523881975764,
"grad_norm": 0.20614362466496808,
"learning_rate": 0.00019887509507259376,
"loss": 0.3082,
"step": 18600
},
{
"epoch": 0.5067108623593979,
"grad_norm": 0.3176004501620565,
"learning_rate": 0.0001980164110832425,
"loss": 0.2946,
"step": 18650
},
{
"epoch": 0.5080693365212193,
"grad_norm": 0.24434247355813202,
"learning_rate": 0.00019715776366049622,
"loss": 0.2852,
"step": 18700
},
{
"epoch": 0.5094278106830408,
"grad_norm": 0.2632819823395696,
"learning_rate": 0.00019629916863314945,
"loss": 0.3119,
"step": 18750
},
{
"epoch": 0.5107862848448622,
"grad_norm": 0.36866015249871253,
"learning_rate": 0.00019544064182903077,
"loss": 0.3064,
"step": 18800
},
{
"epoch": 0.5121447590066837,
"grad_norm": 0.28334197775915865,
"learning_rate": 0.000194582199074711,
"loss": 0.2982,
"step": 18850
},
{
"epoch": 0.5135032331685051,
"grad_norm": 0.29353450964831995,
"learning_rate": 0.00019372385619521155,
"loss": 0.2997,
"step": 18900
},
{
"epoch": 0.5148617073303265,
"grad_norm": 0.30235983080661416,
"learning_rate": 0.00019286562901371282,
"loss": 0.2953,
"step": 18950
},
{
"epoch": 0.516220181492148,
"grad_norm": 0.24006103860300088,
"learning_rate": 0.0001920075333512621,
"loss": 0.312,
"step": 19000
},
{
"epoch": 0.5175786556539694,
"grad_norm": 0.25401074594196943,
"learning_rate": 0.00019114958502648258,
"loss": 0.2928,
"step": 19050
},
{
"epoch": 0.5189371298157909,
"grad_norm": 0.3126940882002115,
"learning_rate": 0.00019029179985528095,
"loss": 0.2881,
"step": 19100
},
{
"epoch": 0.5202956039776123,
"grad_norm": 0.244186090338719,
"learning_rate": 0.00018945134391851735,
"loss": 0.2844,
"step": 19150
},
{
"epoch": 0.5216540781394338,
"grad_norm": 0.2620555496999193,
"learning_rate": 0.00018861107474107164,
"loss": 0.3033,
"step": 19200
},
{
"epoch": 0.5230125523012552,
"grad_norm": 0.29660068432502984,
"learning_rate": 0.00018775386516779982,
"loss": 0.2815,
"step": 19250
},
{
"epoch": 0.5243710264630767,
"grad_norm": 0.24636353127452668,
"learning_rate": 0.0001868968813467351,
"loss": 0.2982,
"step": 19300
},
{
"epoch": 0.5257295006248981,
"grad_norm": 0.3036729051937609,
"learning_rate": 0.00018604013907600413,
"loss": 0.2697,
"step": 19350
},
{
"epoch": 0.5270879747867195,
"grad_norm": 0.25151244998729483,
"learning_rate": 0.00018518365414928073,
"loss": 0.3005,
"step": 19400
},
{
"epoch": 0.528446448948541,
"grad_norm": 0.3900757856018299,
"learning_rate": 0.00018432744235549457,
"loss": 0.3163,
"step": 19450
},
{
"epoch": 0.5298049231103624,
"grad_norm": 0.3209166901430777,
"learning_rate": 0.0001834715194785403,
"loss": 0.2946,
"step": 19500
},
{
"epoch": 0.5311633972721839,
"grad_norm": 0.20611000381285643,
"learning_rate": 0.00018261590129698663,
"loss": 0.2877,
"step": 19550
},
{
"epoch": 0.5325218714340053,
"grad_norm": 0.21332069721707292,
"learning_rate": 0.00018176060358378503,
"loss": 0.2916,
"step": 19600
},
{
"epoch": 0.5338803455958268,
"grad_norm": 0.34732582027624836,
"learning_rate": 0.00018090564210597975,
"loss": 0.3057,
"step": 19650
},
{
"epoch": 0.5352388197576482,
"grad_norm": 0.23660042062818817,
"learning_rate": 0.00018005103262441622,
"loss": 0.2746,
"step": 19700
},
{
"epoch": 0.5365972939194696,
"grad_norm": 0.23653513107119012,
"learning_rate": 0.00017919679089345122,
"loss": 0.295,
"step": 19750
},
{
"epoch": 0.5379557680812911,
"grad_norm": 0.2066174691631555,
"learning_rate": 0.00017834293266066222,
"loss": 0.2896,
"step": 19800
},
{
"epoch": 0.5393142422431125,
"grad_norm": 0.26332165957058984,
"learning_rate": 0.00017748947366655687,
"loss": 0.2811,
"step": 19850
},
{
"epoch": 0.540672716404934,
"grad_norm": 0.22960074466120436,
"learning_rate": 0.00017663642964428318,
"loss": 0.2846,
"step": 19900
},
{
"epoch": 0.5420311905667554,
"grad_norm": 0.3090166915756585,
"learning_rate": 0.00017578381631933946,
"loss": 0.2924,
"step": 19950
},
{
"epoch": 0.5433896647285769,
"grad_norm": 0.36568571497107416,
"learning_rate": 0.00017493164940928402,
"loss": 0.2865,
"step": 20000
},
{
"epoch": 0.5447481388903983,
"grad_norm": 0.29059486954556535,
"learning_rate": 0.00017407994462344584,
"loss": 0.2785,
"step": 20050
},
{
"epoch": 0.5461066130522197,
"grad_norm": 0.27957466708084117,
"learning_rate": 0.00017322871766263487,
"loss": 0.2935,
"step": 20100
},
{
"epoch": 0.5474650872140412,
"grad_norm": 0.2151461608605068,
"learning_rate": 0.00017237798421885253,
"loss": 0.2841,
"step": 20150
},
{
"epoch": 0.5488235613758626,
"grad_norm": 0.24819887268532007,
"learning_rate": 0.00017152775997500238,
"loss": 0.285,
"step": 20200
},
{
"epoch": 0.5501820355376841,
"grad_norm": 0.20284647935207317,
"learning_rate": 0.0001706780606046013,
"loss": 0.2927,
"step": 20250
},
{
"epoch": 0.5515405096995055,
"grad_norm": 0.19244100345976062,
"learning_rate": 0.0001698289017714902,
"loss": 0.2645,
"step": 20300
},
{
"epoch": 0.552898983861327,
"grad_norm": 0.22539860380829246,
"learning_rate": 0.00016898029912954546,
"loss": 0.2939,
"step": 20350
},
{
"epoch": 0.5542574580231484,
"grad_norm": 0.2619800733732195,
"learning_rate": 0.00016813226832239025,
"loss": 0.2836,
"step": 20400
},
{
"epoch": 0.5556159321849699,
"grad_norm": 0.23393114722266678,
"learning_rate": 0.00016728482498310637,
"loss": 0.2736,
"step": 20450
},
{
"epoch": 0.5569744063467913,
"grad_norm": 0.30087081995833126,
"learning_rate": 0.00016643798473394566,
"loss": 0.2794,
"step": 20500
},
{
"epoch": 0.5583328805086127,
"grad_norm": 0.308240444312431,
"learning_rate": 0.00016559176318604258,
"loss": 0.2671,
"step": 20550
},
{
"epoch": 0.5596913546704342,
"grad_norm": 0.24052215603123736,
"learning_rate": 0.00016474617593912583,
"loss": 0.2874,
"step": 20600
},
{
"epoch": 0.5610498288322556,
"grad_norm": 0.2750519886277399,
"learning_rate": 0.00016390123858123118,
"loss": 0.2732,
"step": 20650
},
{
"epoch": 0.5624083029940771,
"grad_norm": 0.2175806661894403,
"learning_rate": 0.0001630569666884139,
"loss": 0.2885,
"step": 20700
},
{
"epoch": 0.5637667771558985,
"grad_norm": 0.2923956849374819,
"learning_rate": 0.00016221337582446172,
"loss": 0.2924,
"step": 20750
},
{
"epoch": 0.56512525131772,
"grad_norm": 0.2708091098394788,
"learning_rate": 0.00016137048154060785,
"loss": 0.2705,
"step": 20800
},
{
"epoch": 0.5664837254795414,
"grad_norm": 0.260062882274282,
"learning_rate": 0.0001605282993752446,
"loss": 0.2833,
"step": 20850
},
{
"epoch": 0.5678421996413628,
"grad_norm": 0.28046003747194964,
"learning_rate": 0.00015968684485363635,
"loss": 0.2875,
"step": 20900
},
{
"epoch": 0.5692006738031843,
"grad_norm": 0.18648990278831484,
"learning_rate": 0.0001588461334876338,
"loss": 0.2788,
"step": 20950
},
{
"epoch": 0.5705591479650057,
"grad_norm": 0.26108175409809964,
"learning_rate": 0.000158006180775388,
"loss": 0.2809,
"step": 21000
},
{
"epoch": 0.5719176221268272,
"grad_norm": 0.15533902511877934,
"learning_rate": 0.0001571670022010644,
"loss": 0.2808,
"step": 21050
},
{
"epoch": 0.5732760962886486,
"grad_norm": 0.17785716374013105,
"learning_rate": 0.0001563286132345576,
"loss": 0.2854,
"step": 21100
},
{
"epoch": 0.5746345704504701,
"grad_norm": 0.2493856351979774,
"learning_rate": 0.00015549102933120625,
"loss": 0.2672,
"step": 21150
},
{
"epoch": 0.5759930446122915,
"grad_norm": 0.37551758591172574,
"learning_rate": 0.00015467099305876942,
"loss": 0.2883,
"step": 21200
},
{
"epoch": 0.577351518774113,
"grad_norm": 0.21750010428694388,
"learning_rate": 0.00015383504871844582,
"loss": 0.2779,
"step": 21250
},
{
"epoch": 0.5787099929359344,
"grad_norm": 0.19042627120914027,
"learning_rate": 0.00015299995540906267,
"loss": 0.2764,
"step": 21300
},
{
"epoch": 0.5800684670977558,
"grad_norm": 0.2797732165932674,
"learning_rate": 0.0001521657285252044,
"loss": 0.2922,
"step": 21350
},
{
"epoch": 0.5814269412595773,
"grad_norm": 0.3591848479346681,
"learning_rate": 0.00015133238344548327,
"loss": 0.2884,
"step": 21400
},
{
"epoch": 0.5827854154213987,
"grad_norm": 0.21764914836042967,
"learning_rate": 0.00015049993553225608,
"loss": 0.2715,
"step": 21450
},
{
"epoch": 0.5841438895832202,
"grad_norm": 0.26727180336133755,
"learning_rate": 0.0001496684001313406,
"loss": 0.2753,
"step": 21500
},
{
"epoch": 0.5855023637450416,
"grad_norm": 0.21915535565528904,
"learning_rate": 0.00014883779257173285,
"loss": 0.265,
"step": 21550
},
{
"epoch": 0.586860837906863,
"grad_norm": 0.25668689734119876,
"learning_rate": 0.0001480081281653244,
"loss": 0.2762,
"step": 21600
},
{
"epoch": 0.5882193120686845,
"grad_norm": 0.2834782294538094,
"learning_rate": 0.00014717942220662038,
"loss": 0.28,
"step": 21650
},
{
"epoch": 0.5895777862305059,
"grad_norm": 0.24516802954697497,
"learning_rate": 0.00014635168997245712,
"loss": 0.2755,
"step": 21700
},
{
"epoch": 0.5909362603923274,
"grad_norm": 0.22053403799293927,
"learning_rate": 0.00014552494672172113,
"loss": 0.2732,
"step": 21750
},
{
"epoch": 0.5922947345541488,
"grad_norm": 0.297493134455997,
"learning_rate": 0.00014469920769506704,
"loss": 0.2819,
"step": 21800
},
{
"epoch": 0.5936532087159703,
"grad_norm": 0.26448034669148435,
"learning_rate": 0.00014387448811463722,
"loss": 0.2947,
"step": 21850
},
{
"epoch": 0.5950116828777917,
"grad_norm": 0.1887478278727578,
"learning_rate": 0.00014305080318378105,
"loss": 0.2573,
"step": 21900
},
{
"epoch": 0.5963701570396132,
"grad_norm": 0.24486372742215648,
"learning_rate": 0.0001422281680867744,
"loss": 0.2762,
"step": 21950
},
{
"epoch": 0.5977286312014346,
"grad_norm": 0.22891270758035537,
"learning_rate": 0.00014140659798854012,
"loss": 0.2816,
"step": 22000
},
{
"epoch": 0.599087105363256,
"grad_norm": 0.25531740500430156,
"learning_rate": 0.00014058610803436813,
"loss": 0.2544,
"step": 22050
},
{
"epoch": 0.6004455795250774,
"grad_norm": 0.2198360405690994,
"learning_rate": 0.00013976671334963648,
"loss": 0.27,
"step": 22100
},
{
"epoch": 0.6018040536868988,
"grad_norm": 0.22767226535607382,
"learning_rate": 0.0001389484290395323,
"loss": 0.2869,
"step": 22150
},
{
"epoch": 0.6031625278487203,
"grad_norm": 0.2694860139304321,
"learning_rate": 0.00013813127018877331,
"loss": 0.2752,
"step": 22200
},
{
"epoch": 0.6045210020105417,
"grad_norm": 0.19898660564261053,
"learning_rate": 0.00013731525186133026,
"loss": 0.2624,
"step": 22250
},
{
"epoch": 0.6058794761723632,
"grad_norm": 0.23150351646391246,
"learning_rate": 0.00013653296123522198,
"loss": 0.2718,
"step": 22300
},
{
"epoch": 0.6072379503341846,
"grad_norm": 0.24064115266253058,
"learning_rate": 0.00013571922195028266,
"loss": 0.2812,
"step": 22350
},
{
"epoch": 0.608596424496006,
"grad_norm": 0.25687846535740555,
"learning_rate": 0.0001349066676537268,
"loss": 0.262,
"step": 22400
},
{
"epoch": 0.6099548986578275,
"grad_norm": 0.20024379738006956,
"learning_rate": 0.00013409531332464196,
"loss": 0.2796,
"step": 22450
},
{
"epoch": 0.6113133728196489,
"grad_norm": 0.30669943060449323,
"learning_rate": 0.00013328517391999483,
"loss": 0.2748,
"step": 22500
},
{
"epoch": 0.6126718469814704,
"grad_norm": 0.26517225209707274,
"learning_rate": 0.00013247626437435539,
"loss": 0.2641,
"step": 22550
},
{
"epoch": 0.6140303211432918,
"grad_norm": 0.23089105114814368,
"learning_rate": 0.0001316685995996218,
"loss": 0.2716,
"step": 22600
},
{
"epoch": 0.6153887953051133,
"grad_norm": 0.3141172219746477,
"learning_rate": 0.0001308621944847455,
"loss": 0.2601,
"step": 22650
},
{
"epoch": 0.6167472694669347,
"grad_norm": 0.2290976880794265,
"learning_rate": 0.0001300570638954565,
"loss": 0.2805,
"step": 22700
},
{
"epoch": 0.6181057436287561,
"grad_norm": 0.21218409171582492,
"learning_rate": 0.0001292532226739894,
"loss": 0.2686,
"step": 22750
},
{
"epoch": 0.6194642177905776,
"grad_norm": 0.22628948026088308,
"learning_rate": 0.0001284506856388101,
"loss": 0.2688,
"step": 22800
},
{
"epoch": 0.620822691952399,
"grad_norm": 0.2948337400203754,
"learning_rate": 0.00012764946758434225,
"loss": 0.2655,
"step": 22850
},
{
"epoch": 0.6221811661142205,
"grad_norm": 0.3340188815254344,
"learning_rate": 0.00012684958328069453,
"loss": 0.2754,
"step": 22900
},
{
"epoch": 0.6235396402760419,
"grad_norm": 0.2767372638913053,
"learning_rate": 0.0001260510474733888,
"loss": 0.2602,
"step": 22950
},
{
"epoch": 0.6248981144378634,
"grad_norm": 0.270894988791611,
"learning_rate": 0.00012525387488308783,
"loss": 0.2564,
"step": 23000
},
{
"epoch": 0.6262565885996848,
"grad_norm": 0.20130647702859084,
"learning_rate": 0.000124458080205324,
"loss": 0.2699,
"step": 23050
},
{
"epoch": 0.6276150627615062,
"grad_norm": 0.2606352685620501,
"learning_rate": 0.0001236795524100573,
"loss": 0.2777,
"step": 23100
},
{
"epoch": 0.6289735369233277,
"grad_norm": 0.26862575508349007,
"learning_rate": 0.00012288652925419885,
"loss": 0.27,
"step": 23150
},
{
"epoch": 0.6303320110851491,
"grad_norm": 0.2264767237464518,
"learning_rate": 0.00012209492765187177,
"loss": 0.2717,
"step": 23200
},
{
"epoch": 0.6316904852469706,
"grad_norm": 0.3116565801871334,
"learning_rate": 0.00012130476219590986,
"loss": 0.2595,
"step": 23250
},
{
"epoch": 0.633048959408792,
"grad_norm": 0.2778393951264189,
"learning_rate": 0.00012051604745267213,
"loss": 0.2791,
"step": 23300
},
{
"epoch": 0.6344074335706135,
"grad_norm": 0.1850696129101786,
"learning_rate": 0.00011972879796177415,
"loss": 0.2717,
"step": 23350
},
{
"epoch": 0.6357659077324349,
"grad_norm": 0.24958891669063782,
"learning_rate": 0.00011894302823582031,
"loss": 0.2638,
"step": 23400
},
{
"epoch": 0.6371243818942564,
"grad_norm": 0.3700870104750999,
"learning_rate": 0.00011815875276013624,
"loss": 0.2742,
"step": 23450
},
{
"epoch": 0.6384828560560778,
"grad_norm": 0.33264994031715317,
"learning_rate": 0.0001173759859925015,
"loss": 0.2774,
"step": 23500
},
{
"epoch": 0.6398413302178992,
"grad_norm": 0.31037389441035956,
"learning_rate": 0.00011659474236288361,
"loss": 0.2403,
"step": 23550
},
{
"epoch": 0.6411998043797207,
"grad_norm": 0.2731125175831413,
"learning_rate": 0.00011581503627317138,
"loss": 0.2568,
"step": 23600
},
{
"epoch": 0.6425582785415421,
"grad_norm": 0.31542476581603357,
"learning_rate": 0.00011503688209690988,
"loss": 0.2405,
"step": 23650
},
{
"epoch": 0.6439167527033636,
"grad_norm": 0.2856271842999882,
"learning_rate": 0.00011426029417903521,
"loss": 0.2594,
"step": 23700
},
{
"epoch": 0.645275226865185,
"grad_norm": 0.304609790388205,
"learning_rate": 0.00011348528683561044,
"loss": 0.2617,
"step": 23750
},
{
"epoch": 0.6466337010270065,
"grad_norm": 0.24926409052563817,
"learning_rate": 0.00011271187435356107,
"loss": 0.2624,
"step": 23800
},
{
"epoch": 0.6479921751888279,
"grad_norm": 0.29444243889916777,
"learning_rate": 0.00011194007099041242,
"loss": 0.267,
"step": 23850
},
{
"epoch": 0.6493506493506493,
"grad_norm": 0.251174398975187,
"learning_rate": 0.00011116989097402601,
"loss": 0.2745,
"step": 23900
},
{
"epoch": 0.6507091235124708,
"grad_norm": 0.26364700269491465,
"learning_rate": 0.0001104013485023379,
"loss": 0.2695,
"step": 23950
},
{
"epoch": 0.6520675976742922,
"grad_norm": 0.1408465902411862,
"learning_rate": 0.00010963445774309668,
"loss": 0.2423,
"step": 24000
},
{
"epoch": 0.6534260718361137,
"grad_norm": 0.1933859329942763,
"learning_rate": 0.00010886923283360217,
"loss": 0.2359,
"step": 24050
},
{
"epoch": 0.6547845459979351,
"grad_norm": 0.2614195528425062,
"learning_rate": 0.00010810568788044524,
"loss": 0.2673,
"step": 24100
},
{
"epoch": 0.6561430201597566,
"grad_norm": 0.24091031620062864,
"learning_rate": 0.00010734383695924741,
"loss": 0.2493,
"step": 24150
},
{
"epoch": 0.657501494321578,
"grad_norm": 0.2697615824186297,
"learning_rate": 0.00010658369411440134,
"loss": 0.2729,
"step": 24200
},
{
"epoch": 0.6588599684833994,
"grad_norm": 0.20653067849872642,
"learning_rate": 0.00010582527335881209,
"loss": 0.274,
"step": 24250
},
{
"epoch": 0.6602184426452209,
"grad_norm": 0.2589626095489949,
"learning_rate": 0.0001050685886736388,
"loss": 0.2609,
"step": 24300
},
{
"epoch": 0.6615769168070423,
"grad_norm": 0.2672837103760092,
"learning_rate": 0.00010431365400803682,
"loss": 0.2524,
"step": 24350
},
{
"epoch": 0.6629353909688638,
"grad_norm": 0.1824225008155396,
"learning_rate": 0.00010356048327890064,
"loss": 0.2702,
"step": 24400
},
{
"epoch": 0.6642938651306852,
"grad_norm": 0.22598161309206102,
"learning_rate": 0.00010280909037060747,
"loss": 0.2601,
"step": 24450
},
{
"epoch": 0.6656523392925067,
"grad_norm": 0.3087441379489739,
"learning_rate": 0.00010205948913476113,
"loss": 0.2645,
"step": 24500
},
{
"epoch": 0.6670108134543281,
"grad_norm": 0.26641776561733793,
"learning_rate": 0.00010131169338993662,
"loss": 0.2572,
"step": 24550
},
{
"epoch": 0.6683692876161496,
"grad_norm": 0.16642889477958095,
"learning_rate": 0.00010056571692142558,
"loss": 0.2437,
"step": 24600
},
{
"epoch": 0.669727761777971,
"grad_norm": 0.33006631915049106,
"learning_rate": 9.982157348098204e-05,
"loss": 0.2557,
"step": 24650
},
{
"epoch": 0.6710862359397924,
"grad_norm": 0.25184518346403906,
"learning_rate": 9.907927678656888e-05,
"loss": 0.2481,
"step": 24700
},
{
"epoch": 0.6724447101016139,
"grad_norm": 0.21781761609625996,
"learning_rate": 9.833884052210525e-05,
"loss": 0.2474,
"step": 24750
},
{
"epoch": 0.6738031842634353,
"grad_norm": 0.2707646383120265,
"learning_rate": 9.760027833721379e-05,
"loss": 0.2652,
"step": 24800
},
{
"epoch": 0.6751616584252568,
"grad_norm": 0.2069393478176125,
"learning_rate": 9.686360384696958e-05,
"loss": 0.2595,
"step": 24850
},
{
"epoch": 0.6765201325870782,
"grad_norm": 0.24428561492811254,
"learning_rate": 9.614350737579221e-05,
"loss": 0.2501,
"step": 24900
},
{
"epoch": 0.6778786067488997,
"grad_norm": 0.26915065049504966,
"learning_rate": 9.541061055170308e-05,
"loss": 0.2595,
"step": 24950
},
{
"epoch": 0.6792370809107211,
"grad_norm": 0.32715798463519263,
"learning_rate": 9.467964178784106e-05,
"loss": 0.249,
"step": 25000
},
{
"epoch": 0.6805955550725425,
"grad_norm": 0.2188565278615699,
"learning_rate": 9.395061455929976e-05,
"loss": 0.2644,
"step": 25050
},
{
"epoch": 0.681954029234364,
"grad_norm": 0.1464346234966987,
"learning_rate": 9.32235423053812e-05,
"loss": 0.2489,
"step": 25100
},
{
"epoch": 0.6833125033961854,
"grad_norm": 0.23268193269727472,
"learning_rate": 9.249843842934851e-05,
"loss": 0.2524,
"step": 25150
},
{
"epoch": 0.6846709775580069,
"grad_norm": 0.2823606594876491,
"learning_rate": 9.177531629817841e-05,
"loss": 0.2734,
"step": 25200
},
{
"epoch": 0.6860294517198283,
"grad_norm": 0.24467058752217685,
"learning_rate": 9.105418924231516e-05,
"loss": 0.2579,
"step": 25250
},
{
"epoch": 0.6873879258816498,
"grad_norm": 0.2721349790032047,
"learning_rate": 9.034943316134114e-05,
"loss": 0.2501,
"step": 25300
},
{
"epoch": 0.6887464000434712,
"grad_norm": 0.2063496873982564,
"learning_rate": 8.96322955378789e-05,
"loss": 0.2546,
"step": 25350
},
{
"epoch": 0.6901048742052927,
"grad_norm": 0.1388645672356858,
"learning_rate": 8.891719249538568e-05,
"loss": 0.2481,
"step": 25400
},
{
"epoch": 0.6914633483671141,
"grad_norm": 0.26747348762140405,
"learning_rate": 8.820413721647738e-05,
"loss": 0.2406,
"step": 25450
},
{
"epoch": 0.6928218225289355,
"grad_norm": 0.24773718412732226,
"learning_rate": 8.749314284602002e-05,
"loss": 0.2345,
"step": 25500
},
{
"epoch": 0.694180296690757,
"grad_norm": 0.1875176742846847,
"learning_rate": 8.67842224908878e-05,
"loss": 0.2697,
"step": 25550
},
{
"epoch": 0.6955387708525784,
"grad_norm": 0.34193810804953745,
"learning_rate": 8.607738921972125e-05,
"loss": 0.2499,
"step": 25600
},
{
"epoch": 0.6968972450143999,
"grad_norm": 0.19405847865847933,
"learning_rate": 8.537265606268663e-05,
"loss": 0.2469,
"step": 25650
},
{
"epoch": 0.6982557191762213,
"grad_norm": 0.1829187140282853,
"learning_rate": 8.467003601123527e-05,
"loss": 0.2519,
"step": 25700
},
{
"epoch": 0.6996141933380428,
"grad_norm": 0.20733927044724373,
"learning_rate": 8.396954201786429e-05,
"loss": 0.2655,
"step": 25750
},
{
"epoch": 0.7009726674998642,
"grad_norm": 0.2261928459658941,
"learning_rate": 8.32711869958781e-05,
"loss": 0.2593,
"step": 25800
},
{
"epoch": 0.7023311416616856,
"grad_norm": 0.2718188963862619,
"learning_rate": 8.25749838191499e-05,
"loss": 0.2415,
"step": 25850
},
{
"epoch": 0.7036896158235071,
"grad_norm": 0.3565494856705099,
"learning_rate": 8.18809453218845e-05,
"loss": 0.2586,
"step": 25900
},
{
"epoch": 0.7050480899853285,
"grad_norm": 0.23853635314623642,
"learning_rate": 8.118908429838201e-05,
"loss": 0.2495,
"step": 25950
},
{
"epoch": 0.70640656414715,
"grad_norm": 0.12974546184866537,
"learning_rate": 8.049941350280157e-05,
"loss": 0.241,
"step": 26000
},
{
"epoch": 0.7077650383089714,
"grad_norm": 0.15240966539892364,
"learning_rate": 7.981194564892645e-05,
"loss": 0.2642,
"step": 26050
},
{
"epoch": 0.7091235124707929,
"grad_norm": 0.18578994346470928,
"learning_rate": 7.912669340992957e-05,
"loss": 0.2561,
"step": 26100
},
{
"epoch": 0.7104819866326142,
"grad_norm": 0.2354542836489054,
"learning_rate": 7.844366941814016e-05,
"loss": 0.2433,
"step": 26150
},
{
"epoch": 0.7118404607944356,
"grad_norm": 0.32359876529310133,
"learning_rate": 7.776288626481043e-05,
"loss": 0.2589,
"step": 26200
},
{
"epoch": 0.7131989349562571,
"grad_norm": 0.21721518409143126,
"learning_rate": 7.708435649988394e-05,
"loss": 0.248,
"step": 26250
},
{
"epoch": 0.7145574091180785,
"grad_norm": 0.2413841328575766,
"learning_rate": 7.640809263176381e-05,
"loss": 0.2495,
"step": 26300
},
{
"epoch": 0.7159158832799,
"grad_norm": 0.1937874091125614,
"learning_rate": 7.57341071270824e-05,
"loss": 0.2379,
"step": 26350
},
{
"epoch": 0.7172743574417214,
"grad_norm": 0.29670499546178025,
"learning_rate": 7.507582377492124e-05,
"loss": 0.2481,
"step": 26400
},
{
"epoch": 0.7186328316035429,
"grad_norm": 0.2733674523937474,
"learning_rate": 7.44063860443e-05,
"loss": 0.24,
"step": 26450
},
{
"epoch": 0.7199913057653643,
"grad_norm": 0.24849432830004892,
"learning_rate": 7.373926357771387e-05,
"loss": 0.2518,
"step": 26500
},
{
"epoch": 0.7213497799271857,
"grad_norm": 0.3217997284475769,
"learning_rate": 7.307446867327764e-05,
"loss": 0.2558,
"step": 26550
},
{
"epoch": 0.7227082540890072,
"grad_norm": 0.1903670555116767,
"learning_rate": 7.241201358619814e-05,
"loss": 0.2459,
"step": 26600
},
{
"epoch": 0.7240667282508286,
"grad_norm": 0.1308938028529946,
"learning_rate": 7.175191052854886e-05,
"loss": 0.2507,
"step": 26650
},
{
"epoch": 0.7254252024126501,
"grad_norm": 0.2795123652476836,
"learning_rate": 7.109417166904457e-05,
"loss": 0.2518,
"step": 26700
},
{
"epoch": 0.7267836765744715,
"grad_norm": 0.16091370835854293,
"learning_rate": 7.043880913281707e-05,
"loss": 0.2554,
"step": 26750
},
{
"epoch": 0.728142150736293,
"grad_norm": 0.16950014605111838,
"learning_rate": 6.978583500119171e-05,
"loss": 0.2451,
"step": 26800
},
{
"epoch": 0.7295006248981144,
"grad_norm": 0.19788089327913239,
"learning_rate": 6.913526131146473e-05,
"loss": 0.2456,
"step": 26850
},
{
"epoch": 0.7308590990599358,
"grad_norm": 0.23996422423355868,
"learning_rate": 6.848710005668106e-05,
"loss": 0.2372,
"step": 26900
},
{
"epoch": 0.7322175732217573,
"grad_norm": 0.30447979386999535,
"learning_rate": 6.784136318541352e-05,
"loss": 0.2507,
"step": 26950
},
{
"epoch": 0.7335760473835787,
"grad_norm": 0.2442932467375467,
"learning_rate": 6.719806260154248e-05,
"loss": 0.2499,
"step": 27000
},
{
"epoch": 0.7349345215454002,
"grad_norm": 0.2053301139703188,
"learning_rate": 6.655721016403638e-05,
"loss": 0.2351,
"step": 27050
},
{
"epoch": 0.7362929957072216,
"grad_norm": 0.28412523900572856,
"learning_rate": 6.591881768673309e-05,
"loss": 0.2463,
"step": 27100
},
{
"epoch": 0.7376514698690431,
"grad_norm": 0.2102789887873736,
"learning_rate": 6.52828969381223e-05,
"loss": 0.2469,
"step": 27150
},
{
"epoch": 0.7390099440308645,
"grad_norm": 0.37446012395142053,
"learning_rate": 6.464945964112845e-05,
"loss": 0.2381,
"step": 27200
},
{
"epoch": 0.740368418192686,
"grad_norm": 0.16201575759035203,
"learning_rate": 6.401851747289451e-05,
"loss": 0.2349,
"step": 27250
},
{
"epoch": 0.7417268923545074,
"grad_norm": 0.2489903791806012,
"learning_rate": 6.339008206456684e-05,
"loss": 0.2482,
"step": 27300
},
{
"epoch": 0.7430853665163288,
"grad_norm": 0.21608399737617504,
"learning_rate": 6.276416500108084e-05,
"loss": 0.2446,
"step": 27350
},
{
"epoch": 0.7444438406781503,
"grad_norm": 0.2704960434877356,
"learning_rate": 6.215322069728647e-05,
"loss": 0.2424,
"step": 27400
},
{
"epoch": 0.7458023148399717,
"grad_norm": 0.2267608806933957,
"learning_rate": 6.153232395255646e-05,
"loss": 0.2441,
"step": 27450
},
{
"epoch": 0.7471607890017932,
"grad_norm": 0.183167292044454,
"learning_rate": 6.0913979799636686e-05,
"loss": 0.2445,
"step": 27500
},
{
"epoch": 0.7485192631636146,
"grad_norm": 0.15376761881823003,
"learning_rate": 6.0298199637434525e-05,
"loss": 0.2253,
"step": 27550
},
{
"epoch": 0.7498777373254361,
"grad_norm": 0.19314678658445544,
"learning_rate": 5.9684994817591334e-05,
"loss": 0.2383,
"step": 27600
},
{
"epoch": 0.7512362114872575,
"grad_norm": 0.2544241890699629,
"learning_rate": 5.907437664427311e-05,
"loss": 0.2391,
"step": 27650
},
{
"epoch": 0.7525946856490789,
"grad_norm": 0.21989322641900727,
"learning_rate": 5.846635637396216e-05,
"loss": 0.2332,
"step": 27700
},
{
"epoch": 0.7539531598109004,
"grad_norm": 0.22618174199432453,
"learning_rate": 5.7860945215249696e-05,
"loss": 0.2337,
"step": 27750
},
{
"epoch": 0.7553116339727218,
"grad_norm": 0.18764808609541392,
"learning_rate": 5.725815432862887e-05,
"loss": 0.2482,
"step": 27800
},
{
"epoch": 0.7566701081345433,
"grad_norm": 0.4380376298961902,
"learning_rate": 5.6657994826289465e-05,
"loss": 0.2262,
"step": 27850
},
{
"epoch": 0.7580285822963647,
"grad_norm": 0.23698867029895784,
"learning_rate": 5.606047777191268e-05,
"loss": 0.2409,
"step": 27900
},
{
"epoch": 0.7593870564581862,
"grad_norm": 0.20034127068488122,
"learning_rate": 5.546561418046736e-05,
"loss": 0.2419,
"step": 27950
},
{
"epoch": 0.7607455306200076,
"grad_norm": 0.2949286374600259,
"learning_rate": 5.4873415018006867e-05,
"loss": 0.2261,
"step": 28000
},
{
"epoch": 0.762104004781829,
"grad_norm": 0.25152518852471184,
"learning_rate": 5.428389120146715e-05,
"loss": 0.2375,
"step": 28050
},
{
"epoch": 0.7634624789436505,
"grad_norm": 0.1611737332419803,
"learning_rate": 5.369705359846511e-05,
"loss": 0.2318,
"step": 28100
},
{
"epoch": 0.7648209531054719,
"grad_norm": 0.24055178507097832,
"learning_rate": 5.311291302709844e-05,
"loss": 0.2373,
"step": 28150
},
{
"epoch": 0.7661794272672934,
"grad_norm": 0.1897183193395996,
"learning_rate": 5.2531480255746476e-05,
"loss": 0.245,
"step": 28200
},
{
"epoch": 0.7675379014291148,
"grad_norm": 0.17982933996634243,
"learning_rate": 5.195276600287118e-05,
"loss": 0.2369,
"step": 28250
},
{
"epoch": 0.7688963755909363,
"grad_norm": 0.25848505633412666,
"learning_rate": 5.137678093681983e-05,
"loss": 0.2319,
"step": 28300
},
{
"epoch": 0.7702548497527577,
"grad_norm": 0.20072845241494364,
"learning_rate": 5.0803535675628497e-05,
"loss": 0.2306,
"step": 28350
},
{
"epoch": 0.7716133239145792,
"grad_norm": 0.20242303091668362,
"learning_rate": 5.0233040786825935e-05,
"loss": 0.2422,
"step": 28400
},
{
"epoch": 0.7729717980764006,
"grad_norm": 0.2519217142033256,
"learning_rate": 4.9665306787239086e-05,
"loss": 0.25,
"step": 28450
},
{
"epoch": 0.774330272238222,
"grad_norm": 0.20962707780239995,
"learning_rate": 4.910034414279902e-05,
"loss": 0.2253,
"step": 28500
},
{
"epoch": 0.7756887464000435,
"grad_norm": 0.22029442635682775,
"learning_rate": 4.853816326834808e-05,
"loss": 0.2411,
"step": 28550
},
{
"epoch": 0.7770472205618649,
"grad_norm": 0.2324490253296684,
"learning_rate": 4.797877452744792e-05,
"loss": 0.2373,
"step": 28600
},
{
"epoch": 0.7784056947236864,
"grad_norm": 0.25563113455518527,
"learning_rate": 4.742218823218851e-05,
"loss": 0.2363,
"step": 28650
},
{
"epoch": 0.7797641688855078,
"grad_norm": 0.2147681728902342,
"learning_rate": 4.686841464299776e-05,
"loss": 0.2474,
"step": 28700
},
{
"epoch": 0.7811226430473293,
"grad_norm": 0.25383739922693677,
"learning_rate": 4.6317463968452624e-05,
"loss": 0.2212,
"step": 28750
},
{
"epoch": 0.7824811172091507,
"grad_norm": 0.2679584758554305,
"learning_rate": 4.5769346365090894e-05,
"loss": 0.252,
"step": 28800
},
{
"epoch": 0.7838395913709721,
"grad_norm": 0.2088581606519284,
"learning_rate": 4.522407193722382e-05,
"loss": 0.2277,
"step": 28850
},
{
"epoch": 0.7851980655327936,
"grad_norm": 0.26508761902628303,
"learning_rate": 4.4681650736750016e-05,
"loss": 0.2277,
"step": 28900
},
{
"epoch": 0.786556539694615,
"grad_norm": 0.377399622435696,
"learning_rate": 4.416361998302716e-05,
"loss": 0.2278,
"step": 28950
},
{
"epoch": 0.7879150138564365,
"grad_norm": 0.15962057847455954,
"learning_rate": 4.3626820065221566e-05,
"loss": 0.2242,
"step": 29000
},
{
"epoch": 0.7892734880182579,
"grad_norm": 0.32929537012542076,
"learning_rate": 4.309290281945775e-05,
"loss": 0.228,
"step": 29050
},
{
"epoch": 0.7906319621800794,
"grad_norm": 0.21846014602740327,
"learning_rate": 4.256187808826948e-05,
"loss": 0.2446,
"step": 29100
},
{
"epoch": 0.7919904363419008,
"grad_norm": 0.21017757165369907,
"learning_rate": 4.203375566086851e-05,
"loss": 0.2401,
"step": 29150
},
{
"epoch": 0.7933489105037222,
"grad_norm": 0.24137967337019786,
"learning_rate": 4.15085452729636e-05,
"loss": 0.2465,
"step": 29200
},
{
"epoch": 0.7947073846655437,
"grad_norm": 0.25590967007232035,
"learning_rate": 4.098625660658151e-05,
"loss": 0.2375,
"step": 29250
},
{
"epoch": 0.7960658588273651,
"grad_norm": 0.279864351972487,
"learning_rate": 4.0466899289888205e-05,
"loss": 0.2374,
"step": 29300
},
{
"epoch": 0.7974243329891866,
"grad_norm": 0.2607946144900689,
"learning_rate": 3.995048289701155e-05,
"loss": 0.222,
"step": 29350
},
{
"epoch": 0.798782807151008,
"grad_norm": 0.3426303124257882,
"learning_rate": 3.9437016947864745e-05,
"loss": 0.2367,
"step": 29400
},
{
"epoch": 0.8001412813128295,
"grad_norm": 0.27770053634270586,
"learning_rate": 3.892651090797075e-05,
"loss": 0.2417,
"step": 29450
},
{
"epoch": 0.8014997554746509,
"grad_norm": 0.24470556593709372,
"learning_rate": 3.841897418828797e-05,
"loss": 0.219,
"step": 29500
},
{
"epoch": 0.8028582296364724,
"grad_norm": 0.24604839411340365,
"learning_rate": 3.791441614503675e-05,
"loss": 0.2382,
"step": 29550
},
{
"epoch": 0.8042167037982938,
"grad_norm": 0.47026656573464204,
"learning_rate": 3.7412846079526644e-05,
"loss": 0.2196,
"step": 29600
},
{
"epoch": 0.8055751779601152,
"grad_norm": 0.26091598553410145,
"learning_rate": 3.691427323798522e-05,
"loss": 0.2268,
"step": 29650
},
{
"epoch": 0.8069336521219367,
"grad_norm": 0.21960674238700215,
"learning_rate": 3.6418706811387504e-05,
"loss": 0.2356,
"step": 29700
},
{
"epoch": 0.8082921262837581,
"grad_norm": 0.2181680329611913,
"learning_rate": 3.592615593528652e-05,
"loss": 0.2261,
"step": 29750
},
{
"epoch": 0.8096506004455796,
"grad_norm": 0.1831881150211827,
"learning_rate": 3.543662968964496e-05,
"loss": 0.2306,
"step": 29800
},
{
"epoch": 0.811009074607401,
"grad_norm": 0.24753134638996258,
"learning_rate": 3.4950137098667836e-05,
"loss": 0.2459,
"step": 29850
},
{
"epoch": 0.8123675487692225,
"grad_norm": 0.3847831369965376,
"learning_rate": 3.4466687130635856e-05,
"loss": 0.2201,
"step": 29900
},
{
"epoch": 0.8137260229310439,
"grad_norm": 0.2975898486391868,
"learning_rate": 3.39862886977405e-05,
"loss": 0.2166,
"step": 29950
},
{
"epoch": 0.8150844970928653,
"grad_norm": 0.20045687385866154,
"learning_rate": 3.3508950655919394e-05,
"loss": 0.228,
"step": 30000
},
{
"epoch": 0.8164429712546868,
"grad_norm": 0.303426868110847,
"learning_rate": 3.3034681804693204e-05,
"loss": 0.22,
"step": 30050
},
{
"epoch": 0.8178014454165082,
"grad_norm": 0.20754051862810569,
"learning_rate": 3.25634908870033e-05,
"loss": 0.2301,
"step": 30100
},
{
"epoch": 0.8191599195783296,
"grad_norm": 0.24422546737008857,
"learning_rate": 3.209538658905087e-05,
"loss": 0.2367,
"step": 30150
},
{
"epoch": 0.820518393740151,
"grad_norm": 0.3027570941981141,
"learning_rate": 3.163037754013647e-05,
"loss": 0.2417,
"step": 30200
},
{
"epoch": 0.8218768679019725,
"grad_norm": 0.280254043533181,
"learning_rate": 3.116847231250104e-05,
"loss": 0.2266,
"step": 30250
},
{
"epoch": 0.8232353420637939,
"grad_norm": 0.28505781897752897,
"learning_rate": 3.070967942116807e-05,
"loss": 0.2307,
"step": 30300
},
{
"epoch": 0.8245938162256153,
"grad_norm": 0.22370571404265266,
"learning_rate": 3.0254007323786338e-05,
"loss": 0.2292,
"step": 30350
},
{
"epoch": 0.8259522903874368,
"grad_norm": 0.20314542315669792,
"learning_rate": 2.9801464420474135e-05,
"loss": 0.2384,
"step": 30400
},
{
"epoch": 0.8273107645492582,
"grad_norm": 0.3091717822159854,
"learning_rate": 2.9352059053664515e-05,
"loss": 0.2252,
"step": 30450
},
{
"epoch": 0.8286692387110797,
"grad_norm": 0.2850647955523155,
"learning_rate": 2.8905799507951314e-05,
"loss": 0.2228,
"step": 30500
},
{
"epoch": 0.8300277128729011,
"grad_norm": 0.256986010255855,
"learning_rate": 2.846269400993655e-05,
"loss": 0.2176,
"step": 30550
},
{
"epoch": 0.8313861870347226,
"grad_norm": 0.19662900160930957,
"learning_rate": 2.802275072807865e-05,
"loss": 0.2271,
"step": 30600
},
{
"epoch": 0.832744661196544,
"grad_norm": 0.2745095935502404,
"learning_rate": 2.7585977772542126e-05,
"loss": 0.2254,
"step": 30650
},
{
"epoch": 0.8341031353583654,
"grad_norm": 0.20724780113594732,
"learning_rate": 2.715238319504769e-05,
"loss": 0.2415,
"step": 30700
},
{
"epoch": 0.8354616095201869,
"grad_norm": 0.21591886814918512,
"learning_rate": 2.6721974988724264e-05,
"loss": 0.2305,
"step": 30750
},
{
"epoch": 0.8368200836820083,
"grad_norm": 0.4269961594661858,
"learning_rate": 2.629476108796114e-05,
"loss": 0.2344,
"step": 30800
},
{
"epoch": 0.8381785578438298,
"grad_norm": 0.3444022954836087,
"learning_rate": 2.587074936826215e-05,
"loss": 0.2355,
"step": 30850
},
{
"epoch": 0.8395370320056512,
"grad_norm": 0.2810532430087154,
"learning_rate": 2.5449947646100202e-05,
"loss": 0.2333,
"step": 30900
},
{
"epoch": 0.8408955061674727,
"grad_norm": 0.2016521391071412,
"learning_rate": 2.5032363678773284e-05,
"loss": 0.2345,
"step": 30950
},
{
"epoch": 0.8422539803292941,
"grad_norm": 0.26305838948166177,
"learning_rate": 2.4626260675610046e-05,
"loss": 0.2229,
"step": 31000
},
{
"epoch": 0.8436124544911155,
"grad_norm": 0.3334236797955835,
"learning_rate": 2.4223262587394115e-05,
"loss": 0.2414,
"step": 31050
},
{
"epoch": 0.844970928652937,
"grad_norm": 0.1995390748027635,
"learning_rate": 2.381524806289641e-05,
"loss": 0.229,
"step": 31100
},
{
"epoch": 0.8463294028147584,
"grad_norm": 0.24559427709489323,
"learning_rate": 2.3410481428214603e-05,
"loss": 0.2139,
"step": 31150
},
{
"epoch": 0.8476878769765799,
"grad_norm": 0.20659570740849767,
"learning_rate": 2.300897014504688e-05,
"loss": 0.227,
"step": 31200
},
{
"epoch": 0.8490463511384013,
"grad_norm": 0.21220065960007847,
"learning_rate": 2.261072161508033e-05,
"loss": 0.2374,
"step": 31250
},
{
"epoch": 0.8504048253002228,
"grad_norm": 0.24984661749787465,
"learning_rate": 2.2215743179854577e-05,
"loss": 0.2266,
"step": 31300
},
{
"epoch": 0.8517632994620442,
"grad_norm": 0.3258474284548005,
"learning_rate": 2.1824042120626543e-05,
"loss": 0.2231,
"step": 31350
},
{
"epoch": 0.8531217736238657,
"grad_norm": 0.3217882648747335,
"learning_rate": 2.143562565823609e-05,
"loss": 0.2313,
"step": 31400
},
{
"epoch": 0.8544802477856871,
"grad_norm": 0.2660872932773675,
"learning_rate": 2.1050500952972985e-05,
"loss": 0.2443,
"step": 31450
},
{
"epoch": 0.8558387219475085,
"grad_norm": 0.2552800785940888,
"learning_rate": 2.0668675104444745e-05,
"loss": 0.2282,
"step": 31500
},
{
"epoch": 0.85719719610933,
"grad_norm": 0.23195860174218688,
"learning_rate": 2.0290155151446145e-05,
"loss": 0.2375,
"step": 31550
},
{
"epoch": 0.8585556702711514,
"grad_norm": 0.29024757595999545,
"learning_rate": 1.9914948071828922e-05,
"loss": 0.2222,
"step": 31600
},
{
"epoch": 0.8599141444329729,
"grad_norm": 0.2658334720707247,
"learning_rate": 1.9543060782373667e-05,
"loss": 0.2351,
"step": 31650
},
{
"epoch": 0.8612726185947943,
"grad_norm": 0.26827491653104296,
"learning_rate": 1.917450013866189e-05,
"loss": 0.2397,
"step": 31700
},
{
"epoch": 0.8626310927566158,
"grad_norm": 0.2739389172356759,
"learning_rate": 1.880927293494994e-05,
"loss": 0.233,
"step": 31750
},
{
"epoch": 0.8639895669184372,
"grad_norm": 0.2784269064173774,
"learning_rate": 1.8447385904043534e-05,
"loss": 0.2418,
"step": 31800
},
{
"epoch": 0.8653480410802586,
"grad_norm": 0.3614092162288049,
"learning_rate": 1.808884571717384e-05,
"loss": 0.2257,
"step": 31850
},
{
"epoch": 0.8667065152420801,
"grad_norm": 0.24413648696026682,
"learning_rate": 1.7733658983874336e-05,
"loss": 0.2389,
"step": 31900
},
{
"epoch": 0.8680649894039015,
"grad_norm": 0.1967545842426806,
"learning_rate": 1.7381832251859075e-05,
"loss": 0.2191,
"step": 31950
},
{
"epoch": 0.869423463565723,
"grad_norm": 0.23747418190146288,
"learning_rate": 1.7033372006901982e-05,
"loss": 0.223,
"step": 32000
},
{
"epoch": 0.8707819377275444,
"grad_norm": 0.28815680461600557,
"learning_rate": 1.6702023326195593e-05,
"loss": 0.2242,
"step": 32050
},
{
"epoch": 0.8721404118893659,
"grad_norm": 0.2868186834763779,
"learning_rate": 1.636017997206618e-05,
"loss": 0.2155,
"step": 32100
},
{
"epoch": 0.8734988860511873,
"grad_norm": 0.20903132836326485,
"learning_rate": 1.6021721938713497e-05,
"loss": 0.2258,
"step": 32150
},
{
"epoch": 0.8748573602130088,
"grad_norm": 0.22192290777199325,
"learning_rate": 1.568665546546517e-05,
"loss": 0.2322,
"step": 32200
},
{
"epoch": 0.8762158343748302,
"grad_norm": 0.27939346399599835,
"learning_rate": 1.5354986729126963e-05,
"loss": 0.2166,
"step": 32250
},
{
"epoch": 0.8775743085366516,
"grad_norm": 0.23760881910404164,
"learning_rate": 1.5026721843868797e-05,
"loss": 0.2231,
"step": 32300
},
{
"epoch": 0.8789327826984731,
"grad_norm": 0.2384919572031985,
"learning_rate": 1.4701866861112057e-05,
"loss": 0.2115,
"step": 32350
},
{
"epoch": 0.8802912568602945,
"grad_norm": 0.18196788330053723,
"learning_rate": 1.4380427769418081e-05,
"loss": 0.2214,
"step": 32400
},
{
"epoch": 0.881649731022116,
"grad_norm": 0.11092114968179356,
"learning_rate": 1.4062410494377642e-05,
"loss": 0.2136,
"step": 32450
},
{
"epoch": 0.8830082051839374,
"grad_norm": 0.24840886469424456,
"learning_rate": 1.3747820898501929e-05,
"loss": 0.228,
"step": 32500
},
{
"epoch": 0.8843666793457589,
"grad_norm": 0.24015308441997552,
"learning_rate": 1.3436664781114295e-05,
"loss": 0.2225,
"step": 32550
},
{
"epoch": 0.8857251535075803,
"grad_norm": 0.2836665970861565,
"learning_rate": 1.3128947878243392e-05,
"loss": 0.2203,
"step": 32600
},
{
"epoch": 0.8870836276694017,
"grad_norm": 0.21821870263137624,
"learning_rate": 1.2824675862517388e-05,
"loss": 0.2236,
"step": 32650
},
{
"epoch": 0.8884421018312232,
"grad_norm": 0.24628179490379828,
"learning_rate": 1.2523854343059538e-05,
"loss": 0.2224,
"step": 32700
},
{
"epoch": 0.8898005759930446,
"grad_norm": 0.3785043038913189,
"learning_rate": 1.2226488865384622e-05,
"loss": 0.2328,
"step": 32750
},
{
"epoch": 0.8911590501548661,
"grad_norm": 0.27390465067758646,
"learning_rate": 1.1932584911296762e-05,
"loss": 0.2409,
"step": 32800
},
{
"epoch": 0.8925175243166875,
"grad_norm": 0.21777850622413425,
"learning_rate": 1.164214789878848e-05,
"loss": 0.2113,
"step": 32850
},
{
"epoch": 0.893875998478509,
"grad_norm": 0.31282081336865486,
"learning_rate": 1.1355183181940688e-05,
"loss": 0.2294,
"step": 32900
},
{
"epoch": 0.8952344726403304,
"grad_norm": 0.21847851398276275,
"learning_rate": 1.1071696050823988e-05,
"loss": 0.2176,
"step": 32950
},
{
"epoch": 0.8965929468021518,
"grad_norm": 0.1797782169135658,
"learning_rate": 1.0791691731401221e-05,
"loss": 0.2197,
"step": 33000
},
{
"epoch": 0.8979514209639733,
"grad_norm": 0.236661072817734,
"learning_rate": 1.0526169006027186e-05,
"loss": 0.2287,
"step": 33050
},
{
"epoch": 0.8993098951257947,
"grad_norm": 0.24545168612281812,
"learning_rate": 1.0253005911068837e-05,
"loss": 0.2248,
"step": 33100
},
{
"epoch": 0.9006683692876162,
"grad_norm": 0.2998814580945627,
"learning_rate": 9.98334072000362e-06,
"loss": 0.2279,
"step": 33150
},
{
"epoch": 0.9020268434494376,
"grad_norm": 0.25372185228970084,
"learning_rate": 9.717178403992866e-06,
"loss": 0.2296,
"step": 33200
},
{
"epoch": 0.9033853176112591,
"grad_norm": 0.22808552163017606,
"learning_rate": 9.454523869623889e-06,
"loss": 0.2142,
"step": 33250
},
{
"epoch": 0.9047437917730805,
"grad_norm": 0.22587780733166465,
"learning_rate": 9.195381958819637e-06,
"loss": 0.2332,
"step": 33300
},
{
"epoch": 0.906102265934902,
"grad_norm": 0.40072192174734456,
"learning_rate": 8.939757448749286e-06,
"loss": 0.2294,
"step": 33350
},
{
"epoch": 0.9074607400967234,
"grad_norm": 0.207716936686938,
"learning_rate": 8.687655051740318e-06,
"loss": 0.2296,
"step": 33400
},
{
"epoch": 0.9088192142585448,
"grad_norm": 0.2779354186342964,
"learning_rate": 8.439079415191532e-06,
"loss": 0.2219,
"step": 33450
},
{
"epoch": 0.9101776884203663,
"grad_norm": 0.2723935977374799,
"learning_rate": 8.194035121487465e-06,
"loss": 0.2153,
"step": 33500
},
{
"epoch": 0.9115361625821877,
"grad_norm": 0.19697799899707563,
"learning_rate": 7.952526687913842e-06,
"loss": 0.2146,
"step": 33550
},
{
"epoch": 0.9128946367440092,
"grad_norm": 0.2085749517551805,
"learning_rate": 7.714558566574325e-06,
"loss": 0.2136,
"step": 33600
},
{
"epoch": 0.9142531109058306,
"grad_norm": 0.21717861835570473,
"learning_rate": 7.480135144308475e-06,
"loss": 0.2148,
"step": 33650
},
{
"epoch": 0.9156115850676521,
"grad_norm": 0.19269266158892198,
"learning_rate": 7.2492607426108305e-06,
"loss": 0.2257,
"step": 33700
},
{
"epoch": 0.9169700592294735,
"grad_norm": 0.24145080749460107,
"learning_rate": 7.02193961755131e-06,
"loss": 0.2207,
"step": 33750
},
{
"epoch": 0.9183285333912949,
"grad_norm": 0.38368673649347684,
"learning_rate": 6.798175959696629e-06,
"loss": 0.2277,
"step": 33800
},
{
"epoch": 0.9196870075531164,
"grad_norm": 0.23667938172589895,
"learning_rate": 6.577973894033274e-06,
"loss": 0.2175,
"step": 33850
},
{
"epoch": 0.9210454817149378,
"grad_norm": 0.3161307046290422,
"learning_rate": 6.3613374798911605e-06,
"loss": 0.2343,
"step": 33900
},
{
"epoch": 0.9224039558767593,
"grad_norm": 0.21360593050098195,
"learning_rate": 6.148270710869053e-06,
"loss": 0.2239,
"step": 33950
},
{
"epoch": 0.9237624300385807,
"grad_norm": 0.28937490357759443,
"learning_rate": 5.938777514760796e-06,
"loss": 0.2124,
"step": 34000
},
{
"epoch": 0.9251209042004022,
"grad_norm": 0.2795740945530124,
"learning_rate": 5.732861753483043e-06,
"loss": 0.2204,
"step": 34050
},
{
"epoch": 0.9264793783622236,
"grad_norm": 0.2537546094067306,
"learning_rate": 5.538551797587777e-06,
"loss": 0.2112,
"step": 34100
},
{
"epoch": 0.927837852524045,
"grad_norm": 0.33705155604910225,
"learning_rate": 5.339658758640753e-06,
"loss": 0.2199,
"step": 34150
},
{
"epoch": 0.9291963266858664,
"grad_norm": 0.3270077837617184,
"learning_rate": 5.14435419901973e-06,
"loss": 0.2297,
"step": 34200
},
{
"epoch": 0.9305548008476878,
"grad_norm": 0.22117872354331958,
"learning_rate": 4.95264171907992e-06,
"loss": 0.22,
"step": 34250
},
{
"epoch": 0.9319132750095093,
"grad_norm": 0.2091602624721329,
"learning_rate": 4.7645248529581076e-06,
"loss": 0.2107,
"step": 34300
},
{
"epoch": 0.9332717491713307,
"grad_norm": 0.19828806364167093,
"learning_rate": 4.580007068507497e-06,
"loss": 0.2215,
"step": 34350
},
{
"epoch": 0.9346302233331522,
"grad_norm": 0.2922062157376195,
"learning_rate": 4.399091767233743e-06,
"loss": 0.234,
"step": 34400
},
{
"epoch": 0.9359886974949736,
"grad_norm": 0.2833929720260013,
"learning_rate": 4.221782284232312e-06,
"loss": 0.2358,
"step": 34450
},
{
"epoch": 0.937347171656795,
"grad_norm": 0.10391453032816735,
"learning_rate": 4.048081888126931e-06,
"loss": 0.2194,
"step": 34500
},
{
"epoch": 0.9387056458186165,
"grad_norm": 0.31685765392965615,
"learning_rate": 3.877993781009415e-06,
"loss": 0.2237,
"step": 34550
},
{
"epoch": 0.9400641199804379,
"grad_norm": 0.32028046645415253,
"learning_rate": 3.7115210983805326e-06,
"loss": 0.2296,
"step": 34600
},
{
"epoch": 0.9414225941422594,
"grad_norm": 0.2275872271102818,
"learning_rate": 3.548666909092324e-06,
"loss": 0.2237,
"step": 34650
},
{
"epoch": 0.9427810683040808,
"grad_norm": 0.28672637661803746,
"learning_rate": 3.3894342152914092e-06,
"loss": 0.2129,
"step": 34700
},
{
"epoch": 0.9441395424659023,
"grad_norm": 0.17447544388507297,
"learning_rate": 3.233825952363767e-06,
"loss": 0.2156,
"step": 34750
},
{
"epoch": 0.9454980166277237,
"grad_norm": 0.27717659551061696,
"learning_rate": 3.081844988880511e-06,
"loss": 0.2325,
"step": 34800
},
{
"epoch": 0.9468564907895451,
"grad_norm": 0.19384754194181844,
"learning_rate": 2.9334941265450666e-06,
"loss": 0.2264,
"step": 34850
},
{
"epoch": 0.9482149649513666,
"grad_norm": 0.20190545639653648,
"learning_rate": 2.788776100141499e-06,
"loss": 0.2162,
"step": 34900
},
{
"epoch": 0.949573439113188,
"grad_norm": 0.13741628116355178,
"learning_rate": 2.647693577484156e-06,
"loss": 0.2175,
"step": 34950
},
{
"epoch": 0.9509319132750095,
"grad_norm": 0.30120083790962326,
"learning_rate": 2.5102491593684164e-06,
"loss": 0.2098,
"step": 35000
},
{
"epoch": 0.9522903874368309,
"grad_norm": 0.18521977588022978,
"learning_rate": 2.3764453795227737e-06,
"loss": 0.2232,
"step": 35050
},
{
"epoch": 0.9536488615986524,
"grad_norm": 0.31542769729636866,
"learning_rate": 2.2462847045620737e-06,
"loss": 0.2223,
"step": 35100
},
{
"epoch": 0.9550073357604738,
"grad_norm": 0.27121032732352324,
"learning_rate": 2.1247601176086262e-06,
"loss": 0.2167,
"step": 35150
},
{
"epoch": 0.9563658099222953,
"grad_norm": 0.23513329499397734,
"learning_rate": 2.0017468261825268e-06,
"loss": 0.2118,
"step": 35200
},
{
"epoch": 0.9577242840841167,
"grad_norm": 0.15376591935080916,
"learning_rate": 1.8823835470474395e-06,
"loss": 0.2247,
"step": 35250
},
{
"epoch": 0.9590827582459381,
"grad_norm": 0.3234625822847048,
"learning_rate": 1.766672480613818e-06,
"loss": 0.2229,
"step": 35300
},
{
"epoch": 0.9604412324077596,
"grad_norm": 0.17001324587245673,
"learning_rate": 1.6546157599652613e-06,
"loss": 0.2148,
"step": 35350
},
{
"epoch": 0.961799706569581,
"grad_norm": 0.11359209160906845,
"learning_rate": 1.5462154508190108e-06,
"loss": 0.214,
"step": 35400
},
{
"epoch": 0.9631581807314025,
"grad_norm": 0.2409519071516936,
"learning_rate": 1.4414735514879373e-06,
"loss": 0.2118,
"step": 35450
},
{
"epoch": 0.9645166548932239,
"grad_norm": 0.21439695271092557,
"learning_rate": 1.3403919928437036e-06,
"loss": 0.2219,
"step": 35500
},
{
"epoch": 0.9658751290550454,
"grad_norm": 0.21370470770092295,
"learning_rate": 1.2429726382812368e-06,
"loss": 0.2147,
"step": 35550
},
{
"epoch": 0.9672336032168668,
"grad_norm": 0.2311514348226777,
"learning_rate": 1.149217283684223e-06,
"loss": 0.23,
"step": 35600
},
{
"epoch": 0.9685920773786882,
"grad_norm": 0.29860147103327,
"learning_rate": 1.059127657392156e-06,
"loss": 0.2313,
"step": 35650
},
{
"epoch": 0.9699505515405097,
"grad_norm": 0.1814371857599918,
"learning_rate": 9.72705420168407e-07,
"loss": 0.2241,
"step": 35700
},
{
"epoch": 0.9713090257023311,
"grad_norm": 0.31658685339190534,
"learning_rate": 8.899521651695831e-07,
"loss": 0.2207,
"step": 35750
},
{
"epoch": 0.9726674998641526,
"grad_norm": 0.19909543685451667,
"learning_rate": 8.124150923443096e-07,
"loss": 0.2351,
"step": 35800
},
{
"epoch": 0.974025974025974,
"grad_norm": 0.24098233089589674,
"learning_rate": 7.369308575313927e-07,
"loss": 0.2192,
"step": 35850
},
{
"epoch": 0.9753844481877955,
"grad_norm": 0.23547731000564504,
"learning_rate": 6.651199513456607e-07,
"loss": 0.2268,
"step": 35900
},
{
"epoch": 0.9767429223496169,
"grad_norm": 0.14817468589671154,
"learning_rate": 5.969836975901366e-07,
"loss": 0.2175,
"step": 35950
},
{
"epoch": 0.9781013965114383,
"grad_norm": 0.140443367773756,
"learning_rate": 5.3252335232723e-07,
"loss": 0.2214,
"step": 36000
},
{
"epoch": 0.9794598706732598,
"grad_norm": 0.3119367682303709,
"learning_rate": 4.71740103855578e-07,
"loss": 0.2249,
"step": 36050
},
{
"epoch": 0.9808183448350812,
"grad_norm": 0.311395190548215,
"learning_rate": 4.146350726881076e-07,
"loss": 0.2222,
"step": 36100
},
{
"epoch": 0.9821768189969027,
"grad_norm": 0.37040140534262844,
"learning_rate": 3.6120931153138525e-07,
"loss": 0.2361,
"step": 36150
},
{
"epoch": 0.9835352931587241,
"grad_norm": 0.2988864815826086,
"learning_rate": 3.114638052662988e-07,
"loss": 0.2207,
"step": 36200
},
{
"epoch": 0.9848937673205456,
"grad_norm": 0.21108645711904683,
"learning_rate": 2.6539947092976135e-07,
"loss": 0.2247,
"step": 36250
},
{
"epoch": 0.986252241482367,
"grad_norm": 0.17869396874533416,
"learning_rate": 2.2301715769783572e-07,
"loss": 0.2231,
"step": 36300
},
{
"epoch": 0.9876107156441885,
"grad_norm": 0.30165958371764556,
"learning_rate": 1.8431764687021347e-07,
"loss": 0.2232,
"step": 36350
},
{
"epoch": 0.9889691898060099,
"grad_norm": 0.24564888452021394,
"learning_rate": 1.4930165185564894e-07,
"loss": 0.2135,
"step": 36400
},
{
"epoch": 0.9903276639678313,
"grad_norm": 0.2710306063596811,
"learning_rate": 1.1796981815888064e-07,
"loss": 0.2099,
"step": 36450
},
{
"epoch": 0.9916861381296528,
"grad_norm": 0.2109106341757756,
"learning_rate": 9.032272336875203e-08,
"loss": 0.21,
"step": 36500
},
{
"epoch": 0.9930446122914742,
"grad_norm": 0.27834177280791966,
"learning_rate": 6.636087714748662e-08,
"loss": 0.2265,
"step": 36550
},
{
"epoch": 0.9944030864532957,
"grad_norm": 0.30415917234664425,
"learning_rate": 4.608472122138441e-08,
"loss": 0.232,
"step": 36600
},
{
"epoch": 0.9957615606151171,
"grad_norm": 0.24438726040214506,
"learning_rate": 2.949462937262837e-08,
"loss": 0.221,
"step": 36650
},
{
"epoch": 0.9971200347769386,
"grad_norm": 0.26240483157432476,
"learning_rate": 1.6590907432401104e-08,
"loss": 0.2276,
"step": 36700
},
{
"epoch": 0.99847850893876,
"grad_norm": 0.24193448916634921,
"learning_rate": 7.3737932752226955e-09,
"loss": 0.2229,
"step": 36750
},
{
"epoch": 0.9998369831005814,
"grad_norm": 0.308625063910033,
"learning_rate": 1.843456814643041e-09,
"loss": 0.219,
"step": 36800
},
{
"epoch": 1.0,
"step": 36806,
"total_flos": 244684510289920.0,
"train_loss": 0.3143004386741001,
"train_runtime": 666350.8913,
"train_samples_per_second": 1.326,
"train_steps_per_second": 0.055
}
],
"logging_steps": 50,
"max_steps": 36806,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 244684510289920.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}