OpusBioLLM / checkpoint-9000 / trainer_state.json
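The JSON below is the Hugging Face Trainer state saved with this checkpoint: global_step 9000 (about 1.67 epochs), with log_history holding one entry per logging step (epoch, grad_norm, learning_rate, loss, step). As a minimal sketch, not part of the checkpoint itself, the file could be inspected like this; the local path "checkpoint-9000/trainer_state.json" is an assumption for illustration:

    # Read the Trainer state and summarize the logged loss curve (illustrative sketch).
    import json

    with open("checkpoint-9000/trainer_state.json") as f:  # assumed local path
        state = json.load(f)

    print("global_step:", state["global_step"], "epoch:", state["epoch"])

    # Each training log entry carries epoch, grad_norm, learning_rate, loss, step.
    steps = [e["step"] for e in state["log_history"] if "loss" in e]
    losses = [e["loss"] for e in state["log_history"] if "loss" in e]

    print(f"{len(steps)} logged steps; last loss {losses[-1]:.4f} at step {steps[-1]}")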
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.666975365808483,
"eval_steps": 500,
"global_step": 9000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0018521948508983144,
"grad_norm": 0.11988232185850008,
"learning_rate": 7.692307692307693e-05,
"loss": 0.5464,
"step": 10
},
{
"epoch": 0.003704389701796629,
"grad_norm": 0.1595727597726012,
"learning_rate": 0.00015384615384615385,
"loss": 0.3714,
"step": 20
},
{
"epoch": 0.0055565845526949435,
"grad_norm": 0.09668305340799446,
"learning_rate": 0.0002307692307692308,
"loss": 0.3157,
"step": 30
},
{
"epoch": 0.007408779403593258,
"grad_norm": 0.16023475106511212,
"learning_rate": 0.0003076923076923077,
"loss": 0.3987,
"step": 40
},
{
"epoch": 0.009260974254491572,
"grad_norm": 0.169453827168658,
"learning_rate": 0.00038461538461538467,
"loss": 0.3192,
"step": 50
},
{
"epoch": 0.011113169105389887,
"grad_norm": 0.3012465198223198,
"learning_rate": 0.0004615384615384616,
"loss": 0.3793,
"step": 60
},
{
"epoch": 0.012965363956288202,
"grad_norm": 0.3048336774601151,
"learning_rate": 0.0004999997322635931,
"loss": 0.3902,
"step": 70
},
{
"epoch": 0.014817558807186515,
"grad_norm": 0.3627728524819501,
"learning_rate": 0.0004999979009491321,
"loss": 0.3932,
"step": 80
},
{
"epoch": 0.016669753658084832,
"grad_norm": 0.4114314981691849,
"learning_rate": 0.0004999938313774507,
"loss": 0.4758,
"step": 90
},
{
"epoch": 0.018521948508983144,
"grad_norm": 0.44568863103845946,
"learning_rate": 0.0004999876199685106,
"loss": 0.4944,
"step": 100
},
{
"epoch": 0.02037414335988146,
"grad_norm": 0.3944831575252504,
"learning_rate": 0.0004999792667755284,
"loss": 0.4604,
"step": 110
},
{
"epoch": 0.022226338210779774,
"grad_norm": 0.4390978457678222,
"learning_rate": 0.0004999687718700706,
"loss": 0.5137,
"step": 120
},
{
"epoch": 0.02407853306167809,
"grad_norm": 0.43510394816525627,
"learning_rate": 0.000499956135342053,
"loss": 0.4757,
"step": 130
},
{
"epoch": 0.025930727912576404,
"grad_norm": 0.44811558740355373,
"learning_rate": 0.0004999413572997397,
"loss": 0.5541,
"step": 140
},
{
"epoch": 0.02778292276347472,
"grad_norm": 0.3691107778538866,
"learning_rate": 0.0004999262261712005,
"loss": 0.465,
"step": 150
},
{
"epoch": 0.02963511761437303,
"grad_norm": 0.38877053127394406,
"learning_rate": 0.0004999073796157043,
"loss": 0.4877,
"step": 160
},
{
"epoch": 0.031487312465271346,
"grad_norm": 0.43039217557165316,
"learning_rate": 0.00049988639196363,
"loss": 0.4371,
"step": 170
},
{
"epoch": 0.033339507316169664,
"grad_norm": 0.31896475378695344,
"learning_rate": 0.0004998632633947908,
"loss": 0.5814,
"step": 180
},
{
"epoch": 0.035191702167067976,
"grad_norm": 0.4333729012961258,
"learning_rate": 0.000499837994107342,
"loss": 0.5196,
"step": 190
},
{
"epoch": 0.03704389701796629,
"grad_norm": 0.34656949570118684,
"learning_rate": 0.0004998105843177797,
"loss": 0.4593,
"step": 200
},
{
"epoch": 0.038896091868864606,
"grad_norm": 0.3679842761186855,
"learning_rate": 0.000499781034260939,
"loss": 0.4526,
"step": 210
},
{
"epoch": 0.04074828671976292,
"grad_norm": 0.3877757158501542,
"learning_rate": 0.0004997493441899917,
"loss": 0.4261,
"step": 220
},
{
"epoch": 0.042600481570661236,
"grad_norm": 0.3289617422897924,
"learning_rate": 0.0004997155143764444,
"loss": 0.4934,
"step": 230
},
{
"epoch": 0.04445267642155955,
"grad_norm": 0.3826208484059836,
"learning_rate": 0.0004996795451101361,
"loss": 0.5347,
"step": 240
},
{
"epoch": 0.04630487127245786,
"grad_norm": 0.38157790109545875,
"learning_rate": 0.0004996414366992357,
"loss": 0.4789,
"step": 250
},
{
"epoch": 0.04815706612335618,
"grad_norm": 0.3154636026466987,
"learning_rate": 0.0004996011894702393,
"loss": 0.5096,
"step": 260
},
{
"epoch": 0.05000926097425449,
"grad_norm": 0.45302017894233715,
"learning_rate": 0.0004995588037679675,
"loss": 0.4752,
"step": 270
},
{
"epoch": 0.05186145582515281,
"grad_norm": 0.3690345364974773,
"learning_rate": 0.0004995142799555624,
"loss": 0.4454,
"step": 280
},
{
"epoch": 0.05371365067605112,
"grad_norm": 0.2843117768216058,
"learning_rate": 0.0004994676184144843,
"loss": 0.5058,
"step": 290
},
{
"epoch": 0.05556584552694944,
"grad_norm": 0.3278296958742726,
"learning_rate": 0.0004994188195445089,
"loss": 0.514,
"step": 300
},
{
"epoch": 0.05741804037784775,
"grad_norm": 0.3702152478433116,
"learning_rate": 0.0004993678837637235,
"loss": 0.4938,
"step": 310
},
{
"epoch": 0.05927023522874606,
"grad_norm": 0.3157950493567883,
"learning_rate": 0.0004993148115085233,
"loss": 0.4744,
"step": 320
},
{
"epoch": 0.06112243007964438,
"grad_norm": 0.2864560430174496,
"learning_rate": 0.0004992596032336082,
"loss": 0.4614,
"step": 330
},
{
"epoch": 0.06297462493054269,
"grad_norm": 0.27306121969283015,
"learning_rate": 0.0004992022594119784,
"loss": 0.5571,
"step": 340
},
{
"epoch": 0.064826819781441,
"grad_norm": 0.28239603992719803,
"learning_rate": 0.0004991427805349305,
"loss": 0.5352,
"step": 350
},
{
"epoch": 0.06667901463233933,
"grad_norm": 0.3067959635308188,
"learning_rate": 0.0004990811671120534,
"loss": 0.4366,
"step": 360
},
{
"epoch": 0.06853120948323764,
"grad_norm": 0.28026509474367334,
"learning_rate": 0.0004990174196712239,
"loss": 0.4413,
"step": 370
},
{
"epoch": 0.07038340433413595,
"grad_norm": 0.3553036031847406,
"learning_rate": 0.0004989515387586022,
"loss": 0.4771,
"step": 380
},
{
"epoch": 0.07223559918503426,
"grad_norm": 0.3078977949859338,
"learning_rate": 0.0004988904222849908,
"loss": 0.4456,
"step": 390
},
{
"epoch": 0.07408779403593257,
"grad_norm": 0.3110301739864855,
"learning_rate": 0.0004988204893460954,
"loss": 0.4383,
"step": 400
},
{
"epoch": 0.0759399888868309,
"grad_norm": 0.36180927604524143,
"learning_rate": 0.0004987484246226201,
"loss": 0.4467,
"step": 410
},
{
"epoch": 0.07779218373772921,
"grad_norm": 0.2902432894644559,
"learning_rate": 0.0004986742287319836,
"loss": 0.5027,
"step": 420
},
{
"epoch": 0.07964437858862752,
"grad_norm": 0.3038323599185379,
"learning_rate": 0.0004985979023098639,
"loss": 0.4896,
"step": 430
},
{
"epoch": 0.08149657343952584,
"grad_norm": 0.37728379802123757,
"learning_rate": 0.0004985194460101922,
"loss": 0.446,
"step": 440
},
{
"epoch": 0.08334876829042415,
"grad_norm": 0.4090633364935015,
"learning_rate": 0.0004984388605051474,
"loss": 0.4457,
"step": 450
},
{
"epoch": 0.08520096314132247,
"grad_norm": 0.2905031001353468,
"learning_rate": 0.000498356146485151,
"loss": 0.4807,
"step": 460
},
{
"epoch": 0.08705315799222078,
"grad_norm": 0.33050891942743665,
"learning_rate": 0.00049827130465886,
"loss": 0.4457,
"step": 470
},
{
"epoch": 0.0889053528431191,
"grad_norm": 0.3454971392885025,
"learning_rate": 0.0004981843357531622,
"loss": 0.4441,
"step": 480
},
{
"epoch": 0.09075754769401741,
"grad_norm": 0.2560611824951078,
"learning_rate": 0.0004980952405131687,
"loss": 0.4601,
"step": 490
},
{
"epoch": 0.09260974254491572,
"grad_norm": 0.3071403176866605,
"learning_rate": 0.0004980040197022085,
"loss": 0.422,
"step": 500
},
{
"epoch": 0.09446193739581404,
"grad_norm": 0.3103572812280149,
"learning_rate": 0.0004979106741018214,
"loss": 0.4556,
"step": 510
},
{
"epoch": 0.09631413224671236,
"grad_norm": 0.22158829723284448,
"learning_rate": 0.0004978152045117515,
"loss": 0.5279,
"step": 520
},
{
"epoch": 0.09816632709761067,
"grad_norm": 0.2643825421503944,
"learning_rate": 0.0004977176117499402,
"loss": 0.4332,
"step": 530
},
{
"epoch": 0.10001852194850898,
"grad_norm": 0.4119060845568467,
"learning_rate": 0.0004976178966525194,
"loss": 0.4748,
"step": 540
},
{
"epoch": 0.1018707167994073,
"grad_norm": 0.23946695979831795,
"learning_rate": 0.0004975160600738043,
"loss": 0.4564,
"step": 550
},
{
"epoch": 0.10372291165030562,
"grad_norm": 0.32293396447938405,
"learning_rate": 0.0004974121028862858,
"loss": 0.4037,
"step": 560
},
{
"epoch": 0.10557510650120393,
"grad_norm": 0.2737410631384409,
"learning_rate": 0.0004973060259806235,
"loss": 0.4471,
"step": 570
},
{
"epoch": 0.10742730135210224,
"grad_norm": 0.2639287107206222,
"learning_rate": 0.0004971978302656376,
"loss": 0.492,
"step": 580
},
{
"epoch": 0.10927949620300055,
"grad_norm": 0.3304530971496624,
"learning_rate": 0.0004970875166683017,
"loss": 0.4433,
"step": 590
},
{
"epoch": 0.11113169105389888,
"grad_norm": 0.3383662002406531,
"learning_rate": 0.0004969750861337338,
"loss": 0.5059,
"step": 600
},
{
"epoch": 0.11298388590479719,
"grad_norm": 0.3718630666323684,
"learning_rate": 0.0004968605396251896,
"loss": 0.4944,
"step": 610
},
{
"epoch": 0.1148360807556955,
"grad_norm": 0.3089667090694828,
"learning_rate": 0.0004967438781240532,
"loss": 0.5117,
"step": 620
},
{
"epoch": 0.11668827560659381,
"grad_norm": 0.3055449117119714,
"learning_rate": 0.000496625102629829,
"loss": 0.4504,
"step": 630
},
{
"epoch": 0.11854047045749212,
"grad_norm": 0.3104727563565212,
"learning_rate": 0.0004965042141601331,
"loss": 0.4279,
"step": 640
},
{
"epoch": 0.12039266530839045,
"grad_norm": 0.3356499915029813,
"learning_rate": 0.000496381213750685,
"loss": 0.4227,
"step": 650
},
{
"epoch": 0.12224486015928876,
"grad_norm": 0.27125317345626304,
"learning_rate": 0.0004962561024552981,
"loss": 0.4373,
"step": 660
},
{
"epoch": 0.12409705501018707,
"grad_norm": 0.30382841565038987,
"learning_rate": 0.0004961288813458708,
"loss": 0.4621,
"step": 670
},
{
"epoch": 0.12594924986108538,
"grad_norm": 0.24989596364522906,
"learning_rate": 0.0004959995515123779,
"loss": 0.4213,
"step": 680
},
{
"epoch": 0.1278014447119837,
"grad_norm": 0.38205278522841757,
"learning_rate": 0.0004958681140628603,
"loss": 0.4367,
"step": 690
},
{
"epoch": 0.129653639562882,
"grad_norm": 0.36439745638468385,
"learning_rate": 0.0004957345701234165,
"loss": 0.4427,
"step": 700
},
{
"epoch": 0.13150583441378033,
"grad_norm": 0.35080175318468465,
"learning_rate": 0.0004955989208381922,
"loss": 0.4133,
"step": 710
},
{
"epoch": 0.13335802926467866,
"grad_norm": 0.3137679102742871,
"learning_rate": 0.0004954611673693708,
"loss": 0.4044,
"step": 720
},
{
"epoch": 0.13521022411557695,
"grad_norm": 0.329188444759587,
"learning_rate": 0.0004953213108971637,
"loss": 0.4922,
"step": 730
},
{
"epoch": 0.13706241896647528,
"grad_norm": 0.21687503014556075,
"learning_rate": 0.0004951793526197992,
"loss": 0.4667,
"step": 740
},
{
"epoch": 0.13891461381737358,
"grad_norm": 0.35060249170961755,
"learning_rate": 0.0004950352937535139,
"loss": 0.4678,
"step": 750
},
{
"epoch": 0.1407668086682719,
"grad_norm": 0.2424350783919833,
"learning_rate": 0.0004948891355325407,
"loss": 0.5452,
"step": 760
},
{
"epoch": 0.14261900351917023,
"grad_norm": 0.29988592081373705,
"learning_rate": 0.0004947408792090989,
"loss": 0.4472,
"step": 770
},
{
"epoch": 0.14447119837006853,
"grad_norm": 0.25092463767440515,
"learning_rate": 0.0004945905260533836,
"loss": 0.4379,
"step": 780
},
{
"epoch": 0.14632339322096685,
"grad_norm": 0.2707811618812939,
"learning_rate": 0.0004944380773535545,
"loss": 0.4489,
"step": 790
},
{
"epoch": 0.14817558807186515,
"grad_norm": 0.29945644041990244,
"learning_rate": 0.000494283534415725,
"loss": 0.4627,
"step": 800
},
{
"epoch": 0.15002778292276348,
"grad_norm": 0.3269089383236662,
"learning_rate": 0.0004941268985639511,
"loss": 0.4559,
"step": 810
},
{
"epoch": 0.1518799777736618,
"grad_norm": 0.34167018575418623,
"learning_rate": 0.0004939681711402201,
"loss": 0.4502,
"step": 820
},
{
"epoch": 0.1537321726245601,
"grad_norm": 0.23233347955254757,
"learning_rate": 0.0004938073535044385,
"loss": 0.4848,
"step": 830
},
{
"epoch": 0.15558436747545842,
"grad_norm": 0.28810200476716363,
"learning_rate": 0.0004936444470344212,
"loss": 0.4334,
"step": 840
},
{
"epoch": 0.15743656232635672,
"grad_norm": 0.2502390156217485,
"learning_rate": 0.0004934794531258794,
"loss": 0.4756,
"step": 850
},
{
"epoch": 0.15928875717725505,
"grad_norm": 0.306502262394428,
"learning_rate": 0.0004933123731924083,
"loss": 0.4009,
"step": 860
},
{
"epoch": 0.16114095202815337,
"grad_norm": 0.2866551052549121,
"learning_rate": 0.0004931432086654751,
"loss": 0.411,
"step": 870
},
{
"epoch": 0.16299314687905167,
"grad_norm": 0.2975618645144025,
"learning_rate": 0.0004929719609944075,
"loss": 0.4386,
"step": 880
},
{
"epoch": 0.16484534172995,
"grad_norm": 0.3269989409370364,
"learning_rate": 0.00049279863164638,
"loss": 0.4811,
"step": 890
},
{
"epoch": 0.1666975365808483,
"grad_norm": 0.2764720769175588,
"learning_rate": 0.0004926232221064024,
"loss": 0.4319,
"step": 900
},
{
"epoch": 0.16854973143174662,
"grad_norm": 0.31817437091747597,
"learning_rate": 0.0004924457338773062,
"loss": 0.5039,
"step": 910
},
{
"epoch": 0.17040192628264494,
"grad_norm": 0.27931435921536724,
"learning_rate": 0.0004922661684797332,
"loss": 0.447,
"step": 920
},
{
"epoch": 0.17225412113354324,
"grad_norm": 0.31719086644687416,
"learning_rate": 0.0004920845274521201,
"loss": 0.4486,
"step": 930
},
{
"epoch": 0.17410631598444157,
"grad_norm": 0.2554455359026809,
"learning_rate": 0.0004919008123506878,
"loss": 0.4683,
"step": 940
},
{
"epoch": 0.17595851083533987,
"grad_norm": 0.33286076816889937,
"learning_rate": 0.0004917150247494265,
"loss": 0.4438,
"step": 950
},
{
"epoch": 0.1778107056862382,
"grad_norm": 0.2611238399418209,
"learning_rate": 0.0004915271662400824,
"loss": 0.3582,
"step": 960
},
{
"epoch": 0.17966290053713652,
"grad_norm": 0.2652458587080694,
"learning_rate": 0.0004913372384321449,
"loss": 0.4845,
"step": 970
},
{
"epoch": 0.18151509538803481,
"grad_norm": 0.2794832294188891,
"learning_rate": 0.000491145242952832,
"loss": 0.4398,
"step": 980
},
{
"epoch": 0.18336729023893314,
"grad_norm": 0.21029714010049572,
"learning_rate": 0.0004909511814470764,
"loss": 0.4408,
"step": 990
},
{
"epoch": 0.18521948508983144,
"grad_norm": 0.2781493608292439,
"learning_rate": 0.0004907550555775119,
"loss": 0.4999,
"step": 1000
},
{
"epoch": 0.18707167994072976,
"grad_norm": 0.3287877830017298,
"learning_rate": 0.0004905568670244588,
"loss": 0.4389,
"step": 1010
},
{
"epoch": 0.1889238747916281,
"grad_norm": 0.34207107261927205,
"learning_rate": 0.0004903566174859094,
"loss": 0.4537,
"step": 1020
},
{
"epoch": 0.19077606964252639,
"grad_norm": 0.24403509336935494,
"learning_rate": 0.0004901543086775137,
"loss": 0.3921,
"step": 1030
},
{
"epoch": 0.1926282644934247,
"grad_norm": 0.2671352359873941,
"learning_rate": 0.0004899499423325647,
"loss": 0.4023,
"step": 1040
},
{
"epoch": 0.194480459344323,
"grad_norm": 0.36145293617111,
"learning_rate": 0.0004897435202019832,
"loss": 0.4346,
"step": 1050
},
{
"epoch": 0.19633265419522133,
"grad_norm": 0.3104045357811312,
"learning_rate": 0.0004895350440543036,
"loss": 0.4299,
"step": 1060
},
{
"epoch": 0.19818484904611966,
"grad_norm": 0.2530391260727553,
"learning_rate": 0.0004893245156756578,
"loss": 0.4477,
"step": 1070
},
{
"epoch": 0.20003704389701796,
"grad_norm": 0.26339622262916945,
"learning_rate": 0.0004891119368697605,
"loss": 0.4907,
"step": 1080
},
{
"epoch": 0.20188923874791628,
"grad_norm": 0.24758807862533388,
"learning_rate": 0.0004888973094578931,
"loss": 0.4215,
"step": 1090
},
{
"epoch": 0.2037414335988146,
"grad_norm": 0.24646474329045825,
"learning_rate": 0.0004886806352788893,
"loss": 0.4727,
"step": 1100
},
{
"epoch": 0.2055936284497129,
"grad_norm": 0.30101780230375413,
"learning_rate": 0.0004884619161891181,
"loss": 0.4835,
"step": 1110
},
{
"epoch": 0.20744582330061123,
"grad_norm": 0.38338135072675056,
"learning_rate": 0.0004882411540624684,
"loss": 0.4713,
"step": 1120
},
{
"epoch": 0.20929801815150953,
"grad_norm": 0.30051618582402373,
"learning_rate": 0.00048801835079033325,
"loss": 0.4318,
"step": 1130
},
{
"epoch": 0.21115021300240785,
"grad_norm": 0.3169294143209614,
"learning_rate": 0.00048779350828159307,
"loss": 0.4414,
"step": 1140
},
{
"epoch": 0.21300240785330618,
"grad_norm": 0.2243691219456984,
"learning_rate": 0.0004875666284625996,
"loss": 0.4732,
"step": 1150
},
{
"epoch": 0.21485460270420448,
"grad_norm": 0.32093479593839086,
"learning_rate": 0.0004873377132771594,
"loss": 0.4477,
"step": 1160
},
{
"epoch": 0.2167067975551028,
"grad_norm": 0.30480291068654214,
"learning_rate": 0.00048710676468651724,
"loss": 0.4159,
"step": 1170
},
{
"epoch": 0.2185589924060011,
"grad_norm": 0.31550505987353533,
"learning_rate": 0.00048687378466933913,
"loss": 0.4121,
"step": 1180
},
{
"epoch": 0.22041118725689943,
"grad_norm": 0.2825917386970882,
"learning_rate": 0.0004866387752216953,
"loss": 0.4531,
"step": 1190
},
{
"epoch": 0.22226338210779775,
"grad_norm": 0.2507091074214277,
"learning_rate": 0.0004864017383570436,
"loss": 0.373,
"step": 1200
},
{
"epoch": 0.22411557695869605,
"grad_norm": 0.2533897084759911,
"learning_rate": 0.00048616267610621154,
"loss": 0.466,
"step": 1210
},
{
"epoch": 0.22596777180959438,
"grad_norm": 0.30135005574304485,
"learning_rate": 0.00048592159051737946,
"loss": 0.4678,
"step": 1220
},
{
"epoch": 0.22781996666049267,
"grad_norm": 0.2900534769133878,
"learning_rate": 0.0004856784836560627,
"loss": 0.4412,
"step": 1230
},
{
"epoch": 0.229672161511391,
"grad_norm": 0.3356512247856666,
"learning_rate": 0.000485433357605094,
"loss": 0.4381,
"step": 1240
},
{
"epoch": 0.23152435636228932,
"grad_norm": 0.28373492782986676,
"learning_rate": 0.00048518621446460555,
"loss": 0.4332,
"step": 1250
},
{
"epoch": 0.23337655121318762,
"grad_norm": 0.27681961152835116,
"learning_rate": 0.00048493705635201123,
"loss": 0.3954,
"step": 1260
},
{
"epoch": 0.23522874606408595,
"grad_norm": 0.3183042306103447,
"learning_rate": 0.0004846858854019882,
"loss": 0.4898,
"step": 1270
},
{
"epoch": 0.23708094091498425,
"grad_norm": 0.2806922738056069,
"learning_rate": 0.00048443270376645876,
"loss": 0.4621,
"step": 1280
},
{
"epoch": 0.23893313576588257,
"grad_norm": 0.32027034011519323,
"learning_rate": 0.00048417751361457185,
"loss": 0.4264,
"step": 1290
},
{
"epoch": 0.2407853306167809,
"grad_norm": 0.25756897907173815,
"learning_rate": 0.00048392031713268447,
"loss": 0.4213,
"step": 1300
},
{
"epoch": 0.2426375254676792,
"grad_norm": 0.29761680785972183,
"learning_rate": 0.0004836611165243432,
"loss": 0.41,
"step": 1310
},
{
"epoch": 0.24448972031857752,
"grad_norm": 0.28775863303393384,
"learning_rate": 0.00048339991401026474,
"loss": 0.4237,
"step": 1320
},
{
"epoch": 0.24634191516947582,
"grad_norm": 0.20527409355092,
"learning_rate": 0.00048313671182831743,
"loss": 0.4227,
"step": 1330
},
{
"epoch": 0.24819411002037414,
"grad_norm": 0.3049894888864481,
"learning_rate": 0.00048287151223350193,
"loss": 0.4188,
"step": 1340
},
{
"epoch": 0.25004630487127244,
"grad_norm": 0.28816158479568416,
"learning_rate": 0.00048260431749793184,
"loss": 0.4193,
"step": 1350
},
{
"epoch": 0.25189849972217077,
"grad_norm": 0.2810466941829626,
"learning_rate": 0.00048233512991081406,
"loss": 0.431,
"step": 1360
},
{
"epoch": 0.2537506945730691,
"grad_norm": 0.34419272070908224,
"learning_rate": 0.0004820639517784297,
"loss": 0.4802,
"step": 1370
},
{
"epoch": 0.2556028894239674,
"grad_norm": 0.2614191417571005,
"learning_rate": 0.00048179078542411367,
"loss": 0.4218,
"step": 1380
},
{
"epoch": 0.25745508427486574,
"grad_norm": 0.3620169455808058,
"learning_rate": 0.0004815156331882352,
"loss": 0.4259,
"step": 1390
},
{
"epoch": 0.259307279125764,
"grad_norm": 0.3495069978116607,
"learning_rate": 0.0004812384974281778,
"loss": 0.414,
"step": 1400
},
{
"epoch": 0.26115947397666234,
"grad_norm": 0.23822327577745042,
"learning_rate": 0.0004809593805183187,
"loss": 0.4885,
"step": 1410
},
{
"epoch": 0.26301166882756066,
"grad_norm": 0.31188479403470154,
"learning_rate": 0.00048067828485000904,
"loss": 0.438,
"step": 1420
},
{
"epoch": 0.264863863678459,
"grad_norm": 0.30908266150851776,
"learning_rate": 0.00048039521283155283,
"loss": 0.4224,
"step": 1430
},
{
"epoch": 0.2667160585293573,
"grad_norm": 0.3926396606462005,
"learning_rate": 0.0004801101668881869,
"loss": 0.4481,
"step": 1440
},
{
"epoch": 0.2685682533802556,
"grad_norm": 0.2937266710438928,
"learning_rate": 0.0004798231494620593,
"loss": 0.4785,
"step": 1450
},
{
"epoch": 0.2704204482311539,
"grad_norm": 0.29097772272918393,
"learning_rate": 0.00047953416301220936,
"loss": 0.5,
"step": 1460
},
{
"epoch": 0.27227264308205223,
"grad_norm": 0.2552279327553987,
"learning_rate": 0.000479243210014546,
"loss": 0.32,
"step": 1470
},
{
"epoch": 0.27412483793295056,
"grad_norm": 0.2699430209822517,
"learning_rate": 0.00047895029296182636,
"loss": 0.3985,
"step": 1480
},
{
"epoch": 0.2759770327838489,
"grad_norm": 0.31833186888024984,
"learning_rate": 0.0004786554143636353,
"loss": 0.4375,
"step": 1490
},
{
"epoch": 0.27782922763474716,
"grad_norm": 0.2751779388841223,
"learning_rate": 0.00047835857674636287,
"loss": 0.4001,
"step": 1500
},
{
"epoch": 0.2796814224856455,
"grad_norm": 0.2940862163328187,
"learning_rate": 0.0004780597826531833,
"loss": 0.4308,
"step": 1510
},
{
"epoch": 0.2815336173365438,
"grad_norm": 0.3386550227204627,
"learning_rate": 0.00047775903464403305,
"loss": 0.5353,
"step": 1520
},
{
"epoch": 0.28338581218744213,
"grad_norm": 0.31240154547554955,
"learning_rate": 0.00047745633529558884,
"loss": 0.3715,
"step": 1530
},
{
"epoch": 0.28523800703834046,
"grad_norm": 0.32759929614793354,
"learning_rate": 0.0004771516872012457,
"loss": 0.3929,
"step": 1540
},
{
"epoch": 0.2870902018892387,
"grad_norm": 0.29742817791928194,
"learning_rate": 0.0004768450929710945,
"loss": 0.4812,
"step": 1550
},
{
"epoch": 0.28894239674013705,
"grad_norm": 0.32461600905212035,
"learning_rate": 0.00047653655523189996,
"loss": 0.4181,
"step": 1560
},
{
"epoch": 0.2907945915910354,
"grad_norm": 0.26208477940948965,
"learning_rate": 0.00047622607662707773,
"loss": 0.3872,
"step": 1570
},
{
"epoch": 0.2926467864419337,
"grad_norm": 0.315046477208,
"learning_rate": 0.000475913659816672,
"loss": 0.4267,
"step": 1580
},
{
"epoch": 0.29449898129283203,
"grad_norm": 0.2451451562089501,
"learning_rate": 0.0004755993074773327,
"loss": 0.4525,
"step": 1590
},
{
"epoch": 0.2963511761437303,
"grad_norm": 0.2936495362556869,
"learning_rate": 0.00047528302230229246,
"loss": 0.4167,
"step": 1600
},
{
"epoch": 0.2982033709946286,
"grad_norm": 0.3551639863299712,
"learning_rate": 0.00047496480700134376,
"loss": 0.4214,
"step": 1610
},
{
"epoch": 0.30005556584552695,
"grad_norm": 0.21422448887216472,
"learning_rate": 0.0004746446643008153,
"loss": 0.4111,
"step": 1620
},
{
"epoch": 0.3019077606964253,
"grad_norm": 0.2593924521965729,
"learning_rate": 0.00047432259694354896,
"loss": 0.5274,
"step": 1630
},
{
"epoch": 0.3037599555473236,
"grad_norm": 0.30074263766274656,
"learning_rate": 0.0004739986076888765,
"loss": 0.4424,
"step": 1640
},
{
"epoch": 0.30561215039822187,
"grad_norm": 0.291226317138353,
"learning_rate": 0.0004736726993125952,
"loss": 0.4802,
"step": 1650
},
{
"epoch": 0.3074643452491202,
"grad_norm": 0.23749441719859632,
"learning_rate": 0.0004733448746069449,
"loss": 0.4288,
"step": 1660
},
{
"epoch": 0.3093165401000185,
"grad_norm": 0.2740636498957509,
"learning_rate": 0.00047301513638058355,
"loss": 0.4742,
"step": 1670
},
{
"epoch": 0.31116873495091685,
"grad_norm": 0.3263090001341323,
"learning_rate": 0.0004726834874585634,
"loss": 0.4945,
"step": 1680
},
{
"epoch": 0.3130209298018152,
"grad_norm": 0.23702905590165377,
"learning_rate": 0.00047234993068230656,
"loss": 0.3995,
"step": 1690
},
{
"epoch": 0.31487312465271344,
"grad_norm": 0.35028858247208006,
"learning_rate": 0.0004720144689095809,
"loss": 0.3937,
"step": 1700
},
{
"epoch": 0.31672531950361177,
"grad_norm": 0.35160376937763926,
"learning_rate": 0.00047167710501447535,
"loss": 0.4388,
"step": 1710
},
{
"epoch": 0.3185775143545101,
"grad_norm": 0.2769519878263511,
"learning_rate": 0.0004713378418873756,
"loss": 0.43,
"step": 1720
},
{
"epoch": 0.3204297092054084,
"grad_norm": 0.2723567337414344,
"learning_rate": 0.00047099668243493886,
"loss": 0.4546,
"step": 1730
},
{
"epoch": 0.32228190405630674,
"grad_norm": 0.4145209498456788,
"learning_rate": 0.0004706536295800695,
"loss": 0.4331,
"step": 1740
},
{
"epoch": 0.324134098907205,
"grad_norm": 0.3793519870853873,
"learning_rate": 0.0004703086862618935,
"loss": 0.3716,
"step": 1750
},
{
"epoch": 0.32598629375810334,
"grad_norm": 0.2962260082256936,
"learning_rate": 0.00046996185543573356,
"loss": 0.4161,
"step": 1760
},
{
"epoch": 0.32783848860900167,
"grad_norm": 0.24861664813452802,
"learning_rate": 0.00046961314007308374,
"loss": 0.4772,
"step": 1770
},
{
"epoch": 0.3296906834599,
"grad_norm": 0.30394710320503215,
"learning_rate": 0.00046926254316158414,
"loss": 0.4521,
"step": 1780
},
{
"epoch": 0.3315428783107983,
"grad_norm": 0.2835284077342044,
"learning_rate": 0.0004689100677049948,
"loss": 0.439,
"step": 1790
},
{
"epoch": 0.3333950731616966,
"grad_norm": 0.2936297703950855,
"learning_rate": 0.00046855571672317056,
"loss": 0.4539,
"step": 1800
},
{
"epoch": 0.3352472680125949,
"grad_norm": 0.31076414372805394,
"learning_rate": 0.00046819949325203485,
"loss": 0.5226,
"step": 1810
},
{
"epoch": 0.33709946286349324,
"grad_norm": 0.3151990506296693,
"learning_rate": 0.00046784140034355386,
"loss": 0.4502,
"step": 1820
},
{
"epoch": 0.33895165771439156,
"grad_norm": 0.2999740764164084,
"learning_rate": 0.0004674814410657102,
"loss": 0.405,
"step": 1830
},
{
"epoch": 0.3408038525652899,
"grad_norm": 0.2848528621693946,
"learning_rate": 0.00046711961850247677,
"loss": 0.4686,
"step": 1840
},
{
"epoch": 0.34265604741618816,
"grad_norm": 0.3304960024436658,
"learning_rate": 0.0004667559357537901,
"loss": 0.3961,
"step": 1850
},
{
"epoch": 0.3445082422670865,
"grad_norm": 0.29714447800492894,
"learning_rate": 0.00046639039593552423,
"loss": 0.4121,
"step": 1860
},
{
"epoch": 0.3463604371179848,
"grad_norm": 0.3737053983821796,
"learning_rate": 0.0004660230021794637,
"loss": 0.4899,
"step": 1870
},
{
"epoch": 0.34821263196888314,
"grad_norm": 0.2715803166164925,
"learning_rate": 0.00046565375763327655,
"loss": 0.418,
"step": 1880
},
{
"epoch": 0.35006482681978146,
"grad_norm": 0.2962801885853028,
"learning_rate": 0.0004652826654604879,
"loss": 0.4675,
"step": 1890
},
{
"epoch": 0.35191702167067973,
"grad_norm": 0.30660107375890056,
"learning_rate": 0.0004649097288404523,
"loss": 0.4536,
"step": 1900
},
{
"epoch": 0.35376921652157806,
"grad_norm": 0.28266003520813626,
"learning_rate": 0.00046453495096832677,
"loss": 0.44,
"step": 1910
},
{
"epoch": 0.3556214113724764,
"grad_norm": 0.3422119367179134,
"learning_rate": 0.00046415833505504344,
"loss": 0.4584,
"step": 1920
},
{
"epoch": 0.3574736062233747,
"grad_norm": 0.2749096084932521,
"learning_rate": 0.0004637798843272819,
"loss": 0.3907,
"step": 1930
},
{
"epoch": 0.35932580107427303,
"grad_norm": 0.26388805864831494,
"learning_rate": 0.00046339960202744154,
"loss": 0.5757,
"step": 1940
},
{
"epoch": 0.3611779959251713,
"grad_norm": 0.2738001016444935,
"learning_rate": 0.000463017491413614,
"loss": 0.4938,
"step": 1950
},
{
"epoch": 0.36303019077606963,
"grad_norm": 0.27217682271594046,
"learning_rate": 0.00046263355575955513,
"loss": 0.4063,
"step": 1960
},
{
"epoch": 0.36488238562696795,
"grad_norm": 0.23291262129921603,
"learning_rate": 0.0004622477983546567,
"loss": 0.419,
"step": 1970
},
{
"epoch": 0.3667345804778663,
"grad_norm": 0.304942976924537,
"learning_rate": 0.0004618602225039187,
"loss": 0.4168,
"step": 1980
},
{
"epoch": 0.3685867753287646,
"grad_norm": 0.24084297499524615,
"learning_rate": 0.00046147083152792064,
"loss": 0.3846,
"step": 1990
},
{
"epoch": 0.3704389701796629,
"grad_norm": 0.27930179036055947,
"learning_rate": 0.00046107962876279317,
"loss": 0.4226,
"step": 2000
},
{
"epoch": 0.3722911650305612,
"grad_norm": 0.22286791279607676,
"learning_rate": 0.00046068661756018975,
"loss": 0.3928,
"step": 2010
},
{
"epoch": 0.3741433598814595,
"grad_norm": 0.22400156451080455,
"learning_rate": 0.00046029180128725756,
"loss": 0.4584,
"step": 2020
},
{
"epoch": 0.37599555473235785,
"grad_norm": 0.3152682221415501,
"learning_rate": 0.0004598951833266087,
"loss": 0.4314,
"step": 2030
},
{
"epoch": 0.3778477495832562,
"grad_norm": 0.31019682799358195,
"learning_rate": 0.00045949676707629186,
"loss": 0.4237,
"step": 2040
},
{
"epoch": 0.37969994443415445,
"grad_norm": 0.32258613660465024,
"learning_rate": 0.00045909655594976207,
"loss": 0.3827,
"step": 2050
},
{
"epoch": 0.38155213928505277,
"grad_norm": 0.2506911135234745,
"learning_rate": 0.00045869455337585246,
"loss": 0.4037,
"step": 2060
},
{
"epoch": 0.3834043341359511,
"grad_norm": 0.35915658848471477,
"learning_rate": 0.0004582907627987444,
"loss": 0.4242,
"step": 2070
},
{
"epoch": 0.3852565289868494,
"grad_norm": 0.28180517097875335,
"learning_rate": 0.00045788518767793786,
"loss": 0.4342,
"step": 2080
},
{
"epoch": 0.38710872383774775,
"grad_norm": 0.22401926241944572,
"learning_rate": 0.0004574778314882225,
"loss": 0.4546,
"step": 2090
},
{
"epoch": 0.388960918688646,
"grad_norm": 0.3007971129642205,
"learning_rate": 0.0004570686977196468,
"loss": 0.4364,
"step": 2100
},
{
"epoch": 0.39081311353954434,
"grad_norm": 0.24088799894015317,
"learning_rate": 0.0004566577898774893,
"loss": 0.4313,
"step": 2110
},
{
"epoch": 0.39266530839044267,
"grad_norm": 0.30698196088504776,
"learning_rate": 0.0004562451114822276,
"loss": 0.3996,
"step": 2120
},
{
"epoch": 0.394517503241341,
"grad_norm": 0.2516817084212753,
"learning_rate": 0.0004558306660695089,
"loss": 0.4434,
"step": 2130
},
{
"epoch": 0.3963696980922393,
"grad_norm": 0.24923810797995163,
"learning_rate": 0.00045541445719011933,
"loss": 0.3827,
"step": 2140
},
{
"epoch": 0.39822189294313765,
"grad_norm": 0.2838748265882661,
"learning_rate": 0.0004549964884099534,
"loss": 0.4097,
"step": 2150
},
{
"epoch": 0.4000740877940359,
"grad_norm": 0.2520366270233344,
"learning_rate": 0.0004545767633099842,
"loss": 0.4257,
"step": 2160
},
{
"epoch": 0.40192628264493424,
"grad_norm": 0.29635595927178765,
"learning_rate": 0.0004541552854862317,
"loss": 0.4305,
"step": 2170
},
{
"epoch": 0.40377847749583257,
"grad_norm": 0.3136173166936259,
"learning_rate": 0.00045373205854973265,
"loss": 0.4592,
"step": 2180
},
{
"epoch": 0.4056306723467309,
"grad_norm": 0.2849443744452371,
"learning_rate": 0.0004533070861265094,
"loss": 0.4604,
"step": 2190
},
{
"epoch": 0.4074828671976292,
"grad_norm": 0.27436502832510207,
"learning_rate": 0.000452880371857539,
"loss": 0.3709,
"step": 2200
},
{
"epoch": 0.4093350620485275,
"grad_norm": 0.31084213819654966,
"learning_rate": 0.0004524519193987215,
"loss": 0.4707,
"step": 2210
},
{
"epoch": 0.4111872568994258,
"grad_norm": 0.27171948513912497,
"learning_rate": 0.00045202173242084954,
"loss": 0.4131,
"step": 2220
},
{
"epoch": 0.41303945175032414,
"grad_norm": 0.2720258760965373,
"learning_rate": 0.0004515898146095758,
"loss": 0.3954,
"step": 2230
},
{
"epoch": 0.41489164660122246,
"grad_norm": 0.21964829380379464,
"learning_rate": 0.0004511561696653823,
"loss": 0.432,
"step": 2240
},
{
"epoch": 0.4167438414521208,
"grad_norm": 0.22147147407497397,
"learning_rate": 0.0004507208013035483,
"loss": 0.406,
"step": 2250
},
{
"epoch": 0.41859603630301906,
"grad_norm": 0.2592943907855162,
"learning_rate": 0.0004502837132541186,
"loss": 0.4092,
"step": 2260
},
{
"epoch": 0.4204482311539174,
"grad_norm": 0.2697288980975384,
"learning_rate": 0.0004498449092618715,
"loss": 0.3643,
"step": 2270
},
{
"epoch": 0.4223004260048157,
"grad_norm": 0.2502930773158984,
"learning_rate": 0.00044940439308628654,
"loss": 0.344,
"step": 2280
},
{
"epoch": 0.42415262085571404,
"grad_norm": 0.28445457893318615,
"learning_rate": 0.00044896216850151294,
"loss": 0.4511,
"step": 2290
},
{
"epoch": 0.42600481570661236,
"grad_norm": 0.3361734430502526,
"learning_rate": 0.0004485182392963364,
"loss": 0.3547,
"step": 2300
},
{
"epoch": 0.42785701055751063,
"grad_norm": 0.2326479256523765,
"learning_rate": 0.0004480726092741472,
"loss": 0.3731,
"step": 2310
},
{
"epoch": 0.42970920540840896,
"grad_norm": 0.2646729222942232,
"learning_rate": 0.00044762528225290757,
"loss": 0.4015,
"step": 2320
},
{
"epoch": 0.4315614002593073,
"grad_norm": 0.33778964570201236,
"learning_rate": 0.0004471762620651187,
"loss": 0.4,
"step": 2330
},
{
"epoch": 0.4334135951102056,
"grad_norm": 0.31289509233278756,
"learning_rate": 0.00044672555255778824,
"loss": 0.4377,
"step": 2340
},
{
"epoch": 0.43526578996110393,
"grad_norm": 0.27440247092572545,
"learning_rate": 0.00044627315759239715,
"loss": 0.3972,
"step": 2350
},
{
"epoch": 0.4371179848120022,
"grad_norm": 0.2641845623874125,
"learning_rate": 0.0004458190810448667,
"loss": 0.3864,
"step": 2360
},
{
"epoch": 0.43897017966290053,
"grad_norm": 0.3042810996664228,
"learning_rate": 0.0004453633268055249,
"loss": 0.4277,
"step": 2370
},
{
"epoch": 0.44082237451379885,
"grad_norm": 0.2497842382086681,
"learning_rate": 0.00044490589877907406,
"loss": 0.3926,
"step": 2380
},
{
"epoch": 0.4426745693646972,
"grad_norm": 0.2259561601883072,
"learning_rate": 0.00044444680088455624,
"loss": 0.4567,
"step": 2390
},
{
"epoch": 0.4445267642155955,
"grad_norm": 0.2644522169590116,
"learning_rate": 0.00044398603705532046,
"loss": 0.4257,
"step": 2400
},
{
"epoch": 0.4463789590664938,
"grad_norm": 0.24862008909243488,
"learning_rate": 0.0004435236112389887,
"loss": 0.3187,
"step": 2410
},
{
"epoch": 0.4482311539173921,
"grad_norm": 0.2838495721029593,
"learning_rate": 0.000443059527397422,
"loss": 0.4659,
"step": 2420
},
{
"epoch": 0.4500833487682904,
"grad_norm": 0.219358259027201,
"learning_rate": 0.00044259378950668683,
"loss": 0.3919,
"step": 2430
},
{
"epoch": 0.45193554361918875,
"grad_norm": 0.31146983163040265,
"learning_rate": 0.00044212640155702053,
"loss": 0.4584,
"step": 2440
},
{
"epoch": 0.4537877384700871,
"grad_norm": 0.26979102938650734,
"learning_rate": 0.00044165736755279785,
"loss": 0.3086,
"step": 2450
},
{
"epoch": 0.45563993332098535,
"grad_norm": 0.29314640181084967,
"learning_rate": 0.00044118669151249585,
"loss": 0.4357,
"step": 2460
},
{
"epoch": 0.45749212817188367,
"grad_norm": 0.2523855052206998,
"learning_rate": 0.00044071437746865994,
"loss": 0.4024,
"step": 2470
},
{
"epoch": 0.459344323022782,
"grad_norm": 0.24148640334233432,
"learning_rate": 0.0004402404294678692,
"loss": 0.396,
"step": 2480
},
{
"epoch": 0.4611965178736803,
"grad_norm": 0.22896761800287638,
"learning_rate": 0.00043976485157070185,
"loss": 0.4293,
"step": 2490
},
{
"epoch": 0.46304871272457865,
"grad_norm": 0.24737906716097793,
"learning_rate": 0.0004392876478517002,
"loss": 0.4756,
"step": 2500
},
{
"epoch": 0.4649009075754769,
"grad_norm": 0.305490554690619,
"learning_rate": 0.000438808822399336,
"loss": 0.405,
"step": 2510
},
{
"epoch": 0.46675310242637524,
"grad_norm": 0.2802043380804828,
"learning_rate": 0.00043832837931597526,
"loss": 0.3876,
"step": 2520
},
{
"epoch": 0.46860529727727357,
"grad_norm": 0.2860415378563156,
"learning_rate": 0.00043784632271784304,
"loss": 0.4161,
"step": 2530
},
{
"epoch": 0.4704574921281719,
"grad_norm": 0.28267000501834966,
"learning_rate": 0.0004373626567349885,
"loss": 0.4143,
"step": 2540
},
{
"epoch": 0.4723096869790702,
"grad_norm": 0.2525367504836072,
"learning_rate": 0.00043687738551124913,
"loss": 0.3757,
"step": 2550
},
{
"epoch": 0.4741618818299685,
"grad_norm": 0.3925357847215651,
"learning_rate": 0.0004363905132042154,
"loss": 0.3826,
"step": 2560
},
{
"epoch": 0.4760140766808668,
"grad_norm": 0.3263265495863413,
"learning_rate": 0.00043590204398519526,
"loss": 0.4263,
"step": 2570
},
{
"epoch": 0.47786627153176514,
"grad_norm": 0.30208444736193557,
"learning_rate": 0.0004354119820391784,
"loss": 0.3817,
"step": 2580
},
{
"epoch": 0.47971846638266347,
"grad_norm": 0.2561058320675499,
"learning_rate": 0.00043492033156479997,
"loss": 0.4278,
"step": 2590
},
{
"epoch": 0.4815706612335618,
"grad_norm": 0.30589399146654594,
"learning_rate": 0.0004344270967743052,
"loss": 0.4058,
"step": 2600
},
{
"epoch": 0.48342285608446006,
"grad_norm": 0.2978445001042373,
"learning_rate": 0.00043393228189351297,
"loss": 0.4212,
"step": 2610
},
{
"epoch": 0.4852750509353584,
"grad_norm": 0.29323906443796505,
"learning_rate": 0.0004334358911617797,
"loss": 0.4304,
"step": 2620
},
{
"epoch": 0.4871272457862567,
"grad_norm": 0.25775394604491453,
"learning_rate": 0.000432937928831963,
"loss": 0.4291,
"step": 2630
},
{
"epoch": 0.48897944063715504,
"grad_norm": 0.2860673624388678,
"learning_rate": 0.00043243839917038506,
"loss": 0.4452,
"step": 2640
},
{
"epoch": 0.49083163548805336,
"grad_norm": 0.2451402557512562,
"learning_rate": 0.00043193730645679665,
"loss": 0.349,
"step": 2650
},
{
"epoch": 0.49268383033895163,
"grad_norm": 0.23951029660105672,
"learning_rate": 0.0004314346549843398,
"loss": 0.3986,
"step": 2660
},
{
"epoch": 0.49453602518984996,
"grad_norm": 0.24086380299145352,
"learning_rate": 0.0004309304490595113,
"loss": 0.4069,
"step": 2670
},
{
"epoch": 0.4963882200407483,
"grad_norm": 0.19690525958834837,
"learning_rate": 0.00043042469300212595,
"loss": 0.3658,
"step": 2680
},
{
"epoch": 0.4982404148916466,
"grad_norm": 0.2873547855172915,
"learning_rate": 0.0004299173911452794,
"loss": 0.4045,
"step": 2690
},
{
"epoch": 0.5000926097425449,
"grad_norm": 0.3445660214713212,
"learning_rate": 0.0004294085478353109,
"loss": 0.3342,
"step": 2700
},
{
"epoch": 0.5019448045934433,
"grad_norm": 0.26259627047719875,
"learning_rate": 0.00042889816743176625,
"loss": 0.4115,
"step": 2710
},
{
"epoch": 0.5037969994443415,
"grad_norm": 0.27090069459316,
"learning_rate": 0.0004283862543073604,
"loss": 0.4178,
"step": 2720
},
{
"epoch": 0.5056491942952399,
"grad_norm": 0.3203148075266908,
"learning_rate": 0.00042787281284794,
"loss": 0.4177,
"step": 2730
},
{
"epoch": 0.5075013891461382,
"grad_norm": 0.2044466650316563,
"learning_rate": 0.00042735784745244585,
"loss": 0.415,
"step": 2740
},
{
"epoch": 0.5093535839970365,
"grad_norm": 0.2673811085531597,
"learning_rate": 0.000426841362532875,
"loss": 0.3923,
"step": 2750
},
{
"epoch": 0.5112057788479348,
"grad_norm": 0.23323940410282512,
"learning_rate": 0.00042632336251424317,
"loss": 0.3643,
"step": 2760
},
{
"epoch": 0.5130579736988331,
"grad_norm": 0.19502502356966445,
"learning_rate": 0.00042580385183454695,
"loss": 0.4509,
"step": 2770
},
{
"epoch": 0.5149101685497315,
"grad_norm": 0.3081825384344212,
"learning_rate": 0.0004252828349447254,
"loss": 0.3374,
"step": 2780
},
{
"epoch": 0.5167623634006298,
"grad_norm": 0.19926889616728075,
"learning_rate": 0.00042476031630862235,
"loss": 0.3751,
"step": 2790
},
{
"epoch": 0.518614558251528,
"grad_norm": 0.2980672545203656,
"learning_rate": 0.00042423630040294756,
"loss": 0.3737,
"step": 2800
},
{
"epoch": 0.5204667531024264,
"grad_norm": 0.2805956385580894,
"learning_rate": 0.0004237107917172391,
"loss": 0.3498,
"step": 2810
},
{
"epoch": 0.5223189479533247,
"grad_norm": 0.24883952133869866,
"learning_rate": 0.00042318379475382454,
"loss": 0.369,
"step": 2820
},
{
"epoch": 0.5241711428042231,
"grad_norm": 0.26010129083226985,
"learning_rate": 0.0004226553140277819,
"loss": 0.3763,
"step": 2830
},
{
"epoch": 0.5260233376551213,
"grad_norm": 0.3407509896784033,
"learning_rate": 0.000422125354066902,
"loss": 0.3339,
"step": 2840
},
{
"epoch": 0.5278755325060196,
"grad_norm": 0.2022248872951544,
"learning_rate": 0.0004215939194116487,
"loss": 0.415,
"step": 2850
},
{
"epoch": 0.529727727356918,
"grad_norm": 0.3427987857911665,
"learning_rate": 0.0004210610146151206,
"loss": 0.4224,
"step": 2860
},
{
"epoch": 0.5315799222078162,
"grad_norm": 0.23594824415533,
"learning_rate": 0.0004205266442430117,
"loss": 0.4051,
"step": 2870
},
{
"epoch": 0.5334321170587146,
"grad_norm": 0.29315061402915377,
"learning_rate": 0.00041999081287357246,
"loss": 0.3898,
"step": 2880
},
{
"epoch": 0.5352843119096129,
"grad_norm": 0.25391786215048595,
"learning_rate": 0.0004194535250975705,
"loss": 0.4163,
"step": 2890
},
{
"epoch": 0.5371365067605112,
"grad_norm": 0.30989709227816453,
"learning_rate": 0.00041891478551825135,
"loss": 0.4528,
"step": 2900
},
{
"epoch": 0.5389887016114095,
"grad_norm": 0.30084068834422883,
"learning_rate": 0.000418374598751299,
"loss": 0.4187,
"step": 2910
},
{
"epoch": 0.5408408964623078,
"grad_norm": 0.2707819885874306,
"learning_rate": 0.000417832969424796,
"loss": 0.4203,
"step": 2920
},
{
"epoch": 0.5426930913132062,
"grad_norm": 0.27765562870418,
"learning_rate": 0.00041728990217918454,
"loss": 0.4354,
"step": 2930
},
{
"epoch": 0.5445452861641045,
"grad_norm": 0.2957077208859336,
"learning_rate": 0.00041674540166722595,
"loss": 0.4214,
"step": 2940
},
{
"epoch": 0.5463974810150027,
"grad_norm": 0.3687676577456054,
"learning_rate": 0.0004161994725539614,
"loss": 0.3915,
"step": 2950
},
{
"epoch": 0.5482496758659011,
"grad_norm": 0.26016346169725796,
"learning_rate": 0.00041565211951667143,
"loss": 0.4265,
"step": 2960
},
{
"epoch": 0.5501018707167994,
"grad_norm": 0.29400682034550746,
"learning_rate": 0.0004151033472448363,
"loss": 0.3754,
"step": 2970
},
{
"epoch": 0.5519540655676978,
"grad_norm": 0.24729614759661173,
"learning_rate": 0.00041455316044009563,
"loss": 0.3678,
"step": 2980
},
{
"epoch": 0.553806260418596,
"grad_norm": 0.30448617928085525,
"learning_rate": 0.0004140015638162081,
"loss": 0.3521,
"step": 2990
},
{
"epoch": 0.5556584552694943,
"grad_norm": 0.326331806127286,
"learning_rate": 0.0004134485620990113,
"loss": 0.3829,
"step": 3000
},
{
"epoch": 0.5575106501203927,
"grad_norm": 0.2831079722418925,
"learning_rate": 0.0004128941600263805,
"loss": 0.3499,
"step": 3010
},
{
"epoch": 0.559362844971291,
"grad_norm": 0.2544434887846111,
"learning_rate": 0.00041233836234818926,
"loss": 0.4621,
"step": 3020
},
{
"epoch": 0.5612150398221893,
"grad_norm": 0.272652788679403,
"learning_rate": 0.0004117811738262677,
"loss": 0.413,
"step": 3030
},
{
"epoch": 0.5630672346730876,
"grad_norm": 0.25142412831266564,
"learning_rate": 0.0004112225992343621,
"loss": 0.4163,
"step": 3040
},
{
"epoch": 0.5649194295239859,
"grad_norm": 0.2647884767561391,
"learning_rate": 0.00041066264335809413,
"loss": 0.3914,
"step": 3050
},
{
"epoch": 0.5667716243748843,
"grad_norm": 0.23801633376774256,
"learning_rate": 0.00041010131099491944,
"loss": 0.3754,
"step": 3060
},
{
"epoch": 0.5686238192257825,
"grad_norm": 0.2731341421028539,
"learning_rate": 0.0004095386069540872,
"loss": 0.4227,
"step": 3070
},
{
"epoch": 0.5704760140766809,
"grad_norm": 0.2011024370599634,
"learning_rate": 0.0004089745360565981,
"loss": 0.3834,
"step": 3080
},
{
"epoch": 0.5723282089275792,
"grad_norm": 0.23740640073183247,
"learning_rate": 0.00040840910313516364,
"loss": 0.4279,
"step": 3090
},
{
"epoch": 0.5741804037784775,
"grad_norm": 0.2525764151086583,
"learning_rate": 0.00040784231303416473,
"loss": 0.3782,
"step": 3100
},
{
"epoch": 0.5760325986293758,
"grad_norm": 0.29277924659862603,
"learning_rate": 0.00040727417060960967,
"loss": 0.3743,
"step": 3110
},
{
"epoch": 0.5778847934802741,
"grad_norm": 0.24242972284715095,
"learning_rate": 0.0004067046807290931,
"loss": 0.3832,
"step": 3120
},
{
"epoch": 0.5797369883311725,
"grad_norm": 0.25071856580407875,
"learning_rate": 0.0004061338482717538,
"loss": 0.3867,
"step": 3130
},
{
"epoch": 0.5815891831820708,
"grad_norm": 0.2837990600721797,
"learning_rate": 0.0004055616781282335,
"loss": 0.4151,
"step": 3140
},
{
"epoch": 0.583441378032969,
"grad_norm": 0.22534951219394125,
"learning_rate": 0.0004049881752006346,
"loss": 0.3788,
"step": 3150
},
{
"epoch": 0.5852935728838674,
"grad_norm": 0.2817669494395476,
"learning_rate": 0.0004044133444024779,
"loss": 0.437,
"step": 3160
},
{
"epoch": 0.5871457677347657,
"grad_norm": 0.20817420244233692,
"learning_rate": 0.00040383719065866105,
"loss": 0.3918,
"step": 3170
},
{
"epoch": 0.5889979625856641,
"grad_norm": 0.2734267113676852,
"learning_rate": 0.0004032597189054161,
"loss": 0.4261,
"step": 3180
},
{
"epoch": 0.5908501574365623,
"grad_norm": 0.27859862469331026,
"learning_rate": 0.0004026809340902672,
"loss": 0.4035,
"step": 3190
},
{
"epoch": 0.5927023522874606,
"grad_norm": 0.2545952221508602,
"learning_rate": 0.0004021008411719881,
"loss": 0.3432,
"step": 3200
},
{
"epoch": 0.594554547138359,
"grad_norm": 0.270005891201155,
"learning_rate": 0.0004015194451205601,
"loss": 0.354,
"step": 3210
},
{
"epoch": 0.5964067419892572,
"grad_norm": 0.24352901007536132,
"learning_rate": 0.000400936750917129,
"loss": 0.3729,
"step": 3220
},
{
"epoch": 0.5982589368401556,
"grad_norm": 0.2556498791861634,
"learning_rate": 0.0004003527635539625,
"loss": 0.4015,
"step": 3230
},
{
"epoch": 0.6001111316910539,
"grad_norm": 0.2752351613083482,
"learning_rate": 0.00039976748803440774,
"loss": 0.3672,
"step": 3240
},
{
"epoch": 0.6019633265419522,
"grad_norm": 0.2609226477539244,
"learning_rate": 0.000399180929372848,
"loss": 0.4015,
"step": 3250
},
{
"epoch": 0.6038155213928506,
"grad_norm": 0.30960657643957806,
"learning_rate": 0.00039859309259466017,
"loss": 0.3641,
"step": 3260
},
{
"epoch": 0.6056677162437488,
"grad_norm": 0.3035485490629689,
"learning_rate": 0.0003980039827361712,
"loss": 0.4543,
"step": 3270
},
{
"epoch": 0.6075199110946472,
"grad_norm": 0.2184916474124068,
"learning_rate": 0.0003974136048446155,
"loss": 0.337,
"step": 3280
},
{
"epoch": 0.6093721059455455,
"grad_norm": 0.2843568329769092,
"learning_rate": 0.0003968219639780915,
"loss": 0.4351,
"step": 3290
},
{
"epoch": 0.6112243007964437,
"grad_norm": 0.269831900653445,
"learning_rate": 0.00039622906520551786,
"loss": 0.3777,
"step": 3300
},
{
"epoch": 0.6130764956473421,
"grad_norm": 0.2834037960599415,
"learning_rate": 0.0003956349136065908,
"loss": 0.3924,
"step": 3310
},
{
"epoch": 0.6149286904982404,
"grad_norm": 0.24761657160080242,
"learning_rate": 0.00039503951427173985,
"loss": 0.4168,
"step": 3320
},
{
"epoch": 0.6167808853491388,
"grad_norm": 0.30901172205688504,
"learning_rate": 0.00039444287230208495,
"loss": 0.3873,
"step": 3330
},
{
"epoch": 0.618633080200037,
"grad_norm": 0.29747872909981493,
"learning_rate": 0.0003938449928093922,
"loss": 0.4341,
"step": 3340
},
{
"epoch": 0.6204852750509353,
"grad_norm": 0.2543886903531346,
"learning_rate": 0.0003932458809160303,
"loss": 0.3683,
"step": 3350
},
{
"epoch": 0.6223374699018337,
"grad_norm": 0.33337732842586854,
"learning_rate": 0.0003926455417549266,
"loss": 0.3755,
"step": 3360
},
{
"epoch": 0.624189664752732,
"grad_norm": 0.2464332085515913,
"learning_rate": 0.00039204398046952313,
"loss": 0.3602,
"step": 3370
},
{
"epoch": 0.6260418596036303,
"grad_norm": 0.2946927475643436,
"learning_rate": 0.00039144120221373254,
"loss": 0.4474,
"step": 3380
},
{
"epoch": 0.6278940544545286,
"grad_norm": 0.3017003197321625,
"learning_rate": 0.0003908372121518939,
"loss": 0.4334,
"step": 3390
},
{
"epoch": 0.6297462493054269,
"grad_norm": 0.32871078632996376,
"learning_rate": 0.0003902320154587288,
"loss": 0.3826,
"step": 3400
},
{
"epoch": 0.6315984441563253,
"grad_norm": 0.3041703577665594,
"learning_rate": 0.0003896256173192963,
"loss": 0.4301,
"step": 3410
},
{
"epoch": 0.6334506390072235,
"grad_norm": 0.27657730284049636,
"learning_rate": 0.0003890180229289492,
"loss": 0.3637,
"step": 3420
},
{
"epoch": 0.6353028338581219,
"grad_norm": 0.2894023841563432,
"learning_rate": 0.0003884701694853233,
"loss": 0.4083,
"step": 3430
},
{
"epoch": 0.6371550287090202,
"grad_norm": 0.3313798136401644,
"learning_rate": 0.00038786031656810573,
"loss": 0.3613,
"step": 3440
},
{
"epoch": 0.6390072235599185,
"grad_norm": 0.31419538828574267,
"learning_rate": 0.0003872492825242943,
"loss": 0.3517,
"step": 3450
},
{
"epoch": 0.6408594184108168,
"grad_norm": 0.2856367570197956,
"learning_rate": 0.0003866370725889602,
"loss": 0.3311,
"step": 3460
},
{
"epoch": 0.6427116132617151,
"grad_norm": 0.32378046135112004,
"learning_rate": 0.00038602369200724907,
"loss": 0.3808,
"step": 3470
},
{
"epoch": 0.6445638081126135,
"grad_norm": 0.2809834575253639,
"learning_rate": 0.00038540914603433596,
"loss": 0.3874,
"step": 3480
},
{
"epoch": 0.6464160029635118,
"grad_norm": 0.23009208535401943,
"learning_rate": 0.00038479343993538085,
"loss": 0.415,
"step": 3490
},
{
"epoch": 0.64826819781441,
"grad_norm": 0.22641660111122883,
"learning_rate": 0.00038417657898548284,
"loss": 0.3278,
"step": 3500
},
{
"epoch": 0.6501203926653084,
"grad_norm": 0.2981220824138414,
"learning_rate": 0.00038355856846963545,
"loss": 0.4047,
"step": 3510
},
{
"epoch": 0.6519725875162067,
"grad_norm": 0.2555163199749857,
"learning_rate": 0.00038293941368268105,
"loss": 0.4132,
"step": 3520
},
{
"epoch": 0.6538247823671051,
"grad_norm": 0.2291679316803199,
"learning_rate": 0.00038231911992926573,
"loss": 0.4501,
"step": 3530
},
{
"epoch": 0.6556769772180033,
"grad_norm": 0.22327007435525262,
"learning_rate": 0.0003816976925237936,
"loss": 0.4047,
"step": 3540
},
{
"epoch": 0.6575291720689016,
"grad_norm": 0.26270477479908155,
"learning_rate": 0.00038113744298654294,
"loss": 0.3669,
"step": 3550
},
{
"epoch": 0.6593813669198,
"grad_norm": 0.20304050646048286,
"learning_rate": 0.00038051387631809585,
"loss": 0.4247,
"step": 3560
},
{
"epoch": 0.6612335617706983,
"grad_norm": 0.2626214779683425,
"learning_rate": 0.0003798891914641258,
"loss": 0.3397,
"step": 3570
},
{
"epoch": 0.6630857566215966,
"grad_norm": 0.2927783575344774,
"learning_rate": 0.00037926339377665805,
"loss": 0.3352,
"step": 3580
},
{
"epoch": 0.6649379514724949,
"grad_norm": 0.2868661472365901,
"learning_rate": 0.0003786364886172521,
"loss": 0.4321,
"step": 3590
},
{
"epoch": 0.6667901463233932,
"grad_norm": 0.1980588697868199,
"learning_rate": 0.00037800848135695564,
"loss": 0.355,
"step": 3600
},
{
"epoch": 0.6686423411742916,
"grad_norm": 0.27964064214829887,
"learning_rate": 0.00037737937737625905,
"loss": 0.3953,
"step": 3610
},
{
"epoch": 0.6704945360251898,
"grad_norm": 0.30140561884162703,
"learning_rate": 0.0003767491820650486,
"loss": 0.3802,
"step": 3620
},
{
"epoch": 0.6723467308760882,
"grad_norm": 0.26216353668713616,
"learning_rate": 0.00037611790082256073,
"loss": 0.3701,
"step": 3630
},
{
"epoch": 0.6741989257269865,
"grad_norm": 0.2667607207767126,
"learning_rate": 0.00037548553905733566,
"loss": 0.4217,
"step": 3640
},
{
"epoch": 0.6760511205778847,
"grad_norm": 0.2888052260287578,
"learning_rate": 0.00037485210218717095,
"loss": 0.3861,
"step": 3650
},
{
"epoch": 0.6779033154287831,
"grad_norm": 0.322681691929484,
"learning_rate": 0.0003742175956390754,
"loss": 0.3769,
"step": 3660
},
{
"epoch": 0.6797555102796814,
"grad_norm": 0.2809039196576165,
"learning_rate": 0.0003735820248492221,
"loss": 0.37,
"step": 3670
},
{
"epoch": 0.6816077051305798,
"grad_norm": 0.3168194333373297,
"learning_rate": 0.0003729453952629022,
"loss": 0.3813,
"step": 3680
},
{
"epoch": 0.683459899981478,
"grad_norm": 0.2743408298239755,
"learning_rate": 0.00037230771233447813,
"loss": 0.3762,
"step": 3690
},
{
"epoch": 0.6853120948323763,
"grad_norm": 0.2997039201183461,
"learning_rate": 0.000371668981527337,
"loss": 0.4346,
"step": 3700
},
{
"epoch": 0.6871642896832747,
"grad_norm": 0.18532771548719357,
"learning_rate": 0.0003710292083138436,
"loss": 0.344,
"step": 3710
},
{
"epoch": 0.689016484534173,
"grad_norm": 0.3521954419398032,
"learning_rate": 0.0003703883981752935,
"loss": 0.378,
"step": 3720
},
{
"epoch": 0.6908686793850713,
"grad_norm": 0.3037259752726694,
"learning_rate": 0.00036974655660186644,
"loss": 0.4339,
"step": 3730
},
{
"epoch": 0.6927208742359696,
"grad_norm": 0.24733145996258551,
"learning_rate": 0.0003691036890925788,
"loss": 0.4195,
"step": 3740
},
{
"epoch": 0.6945730690868679,
"grad_norm": 0.19584340465708208,
"learning_rate": 0.0003684598011552368,
"loss": 0.3404,
"step": 3750
},
{
"epoch": 0.6964252639377663,
"grad_norm": 0.2530305551321265,
"learning_rate": 0.00036781489830638923,
"loss": 0.3163,
"step": 3760
},
{
"epoch": 0.6982774587886645,
"grad_norm": 0.26939789666432756,
"learning_rate": 0.0003671689860712804,
"loss": 0.3419,
"step": 3770
},
{
"epoch": 0.7001296536395629,
"grad_norm": 0.24191294552249204,
"learning_rate": 0.0003665220699838022,
"loss": 0.4176,
"step": 3780
},
{
"epoch": 0.7019818484904612,
"grad_norm": 0.2777592117015156,
"learning_rate": 0.00036587415558644756,
"loss": 0.3215,
"step": 3790
},
{
"epoch": 0.7038340433413595,
"grad_norm": 0.30078087923699953,
"learning_rate": 0.00036522524843026193,
"loss": 0.3564,
"step": 3800
},
{
"epoch": 0.7056862381922578,
"grad_norm": 0.29338660781666925,
"learning_rate": 0.00036457535407479673,
"loss": 0.3725,
"step": 3810
},
{
"epoch": 0.7075384330431561,
"grad_norm": 0.2296766539983086,
"learning_rate": 0.00036392447808806117,
"loss": 0.3688,
"step": 3820
},
{
"epoch": 0.7093906278940545,
"grad_norm": 0.30321062833889273,
"learning_rate": 0.0003632726260464746,
"loss": 0.3948,
"step": 3830
},
{
"epoch": 0.7112428227449528,
"grad_norm": 0.29399675372420425,
"learning_rate": 0.0003626198035348187,
"loss": 0.4013,
"step": 3840
},
{
"epoch": 0.713095017595851,
"grad_norm": 0.2105362387910143,
"learning_rate": 0.0003619660161461898,
"loss": 0.366,
"step": 3850
},
{
"epoch": 0.7149472124467494,
"grad_norm": 0.23037128345764354,
"learning_rate": 0.00036131126948195103,
"loss": 0.4221,
"step": 3860
},
{
"epoch": 0.7167994072976477,
"grad_norm": 0.2768953340591145,
"learning_rate": 0.00036065556915168377,
"loss": 0.2986,
"step": 3870
},
{
"epoch": 0.7186516021485461,
"grad_norm": 0.23581750422601885,
"learning_rate": 0.0003599989207731404,
"loss": 0.3691,
"step": 3880
},
{
"epoch": 0.7205037969994443,
"grad_norm": 0.23261721710497926,
"learning_rate": 0.0003593413299721955,
"loss": 0.4161,
"step": 3890
},
{
"epoch": 0.7223559918503426,
"grad_norm": 0.26947390848344027,
"learning_rate": 0.00035868280238279804,
"loss": 0.4034,
"step": 3900
},
{
"epoch": 0.724208186701241,
"grad_norm": 0.2604323518406546,
"learning_rate": 0.00035802334364692283,
"loss": 0.3652,
"step": 3910
},
{
"epoch": 0.7260603815521393,
"grad_norm": 0.19811786937816656,
"learning_rate": 0.00035736295941452256,
"loss": 0.3411,
"step": 3920
},
{
"epoch": 0.7279125764030376,
"grad_norm": 0.2942447611839833,
"learning_rate": 0.0003567016553434791,
"loss": 0.3932,
"step": 3930
},
{
"epoch": 0.7297647712539359,
"grad_norm": 0.20647945881304144,
"learning_rate": 0.00035603943709955495,
"loss": 0.3481,
"step": 3940
},
{
"epoch": 0.7316169661048342,
"grad_norm": 0.29098401038664423,
"learning_rate": 0.0003553763103563449,
"loss": 0.3205,
"step": 3950
},
{
"epoch": 0.7334691609557326,
"grad_norm": 0.24827960683081182,
"learning_rate": 0.00035471228079522754,
"loss": 0.3653,
"step": 3960
},
{
"epoch": 0.7353213558066308,
"grad_norm": 0.21532456030161418,
"learning_rate": 0.0003540473541053161,
"loss": 0.3299,
"step": 3970
},
{
"epoch": 0.7371735506575292,
"grad_norm": 0.28516797949078204,
"learning_rate": 0.0003533815359834103,
"loss": 0.3718,
"step": 3980
},
{
"epoch": 0.7390257455084275,
"grad_norm": 0.2617620703053819,
"learning_rate": 0.00035271483213394715,
"loss": 0.3505,
"step": 3990
},
{
"epoch": 0.7408779403593257,
"grad_norm": 0.27198805201563014,
"learning_rate": 0.000352047248268952,
"loss": 0.3968,
"step": 4000
},
{
"epoch": 0.7427301352102241,
"grad_norm": 0.1957730557770133,
"learning_rate": 0.0003513787901079902,
"loss": 0.3647,
"step": 4010
},
{
"epoch": 0.7445823300611224,
"grad_norm": 0.2424016899157965,
"learning_rate": 0.0003507094633781173,
"loss": 0.4071,
"step": 4020
},
{
"epoch": 0.7464345249120208,
"grad_norm": 0.2513574669580144,
"learning_rate": 0.00035003927381383046,
"loss": 0.3348,
"step": 4030
},
{
"epoch": 0.748286719762919,
"grad_norm": 0.2524624117498673,
"learning_rate": 0.00034936822715701945,
"loss": 0.3805,
"step": 4040
},
{
"epoch": 0.7501389146138173,
"grad_norm": 0.23903538948524897,
"learning_rate": 0.00034869632915691685,
"loss": 0.335,
"step": 4050
},
{
"epoch": 0.7519911094647157,
"grad_norm": 0.18376558979991064,
"learning_rate": 0.0003480235855700495,
"loss": 0.3251,
"step": 4060
},
{
"epoch": 0.753843304315614,
"grad_norm": 0.23255076073481523,
"learning_rate": 0.0003473500021601888,
"loss": 0.3706,
"step": 4070
},
{
"epoch": 0.7556954991665124,
"grad_norm": 0.26504941120664904,
"learning_rate": 0.0003466755846983012,
"loss": 0.3388,
"step": 4080
},
{
"epoch": 0.7575476940174106,
"grad_norm": 0.21513866870033804,
"learning_rate": 0.00034600033896249903,
"loss": 0.3493,
"step": 4090
},
{
"epoch": 0.7593998888683089,
"grad_norm": 0.2588933457999632,
"learning_rate": 0.00034532427073799115,
"loss": 0.3335,
"step": 4100
},
{
"epoch": 0.7612520837192073,
"grad_norm": 0.22932856457029652,
"learning_rate": 0.0003446473858170328,
"loss": 0.3573,
"step": 4110
},
{
"epoch": 0.7631042785701055,
"grad_norm": 0.25882003589945557,
"learning_rate": 0.00034396968999887635,
"loss": 0.3448,
"step": 4120
},
{
"epoch": 0.7649564734210039,
"grad_norm": 0.18186372017813182,
"learning_rate": 0.00034329118908972187,
"loss": 0.3451,
"step": 4130
},
{
"epoch": 0.7668086682719022,
"grad_norm": 0.2905270964806583,
"learning_rate": 0.00034261188890266674,
"loss": 0.3388,
"step": 4140
},
{
"epoch": 0.7686608631228005,
"grad_norm": 0.27875971252061826,
"learning_rate": 0.00034193179525765646,
"loss": 0.3131,
"step": 4150
},
{
"epoch": 0.7705130579736988,
"grad_norm": 0.24842087853864708,
"learning_rate": 0.00034125091398143445,
"loss": 0.4291,
"step": 4160
},
{
"epoch": 0.7723652528245971,
"grad_norm": 0.2684559843295528,
"learning_rate": 0.00034056925090749214,
"loss": 0.3715,
"step": 4170
},
{
"epoch": 0.7742174476754955,
"grad_norm": 0.22463589836430295,
"learning_rate": 0.00033988681187601907,
"loss": 0.4228,
"step": 4180
},
{
"epoch": 0.7760696425263938,
"grad_norm": 0.27828743228315045,
"learning_rate": 0.00033920360273385295,
"loss": 0.2931,
"step": 4190
},
{
"epoch": 0.777921837377292,
"grad_norm": 0.24380996785281236,
"learning_rate": 0.0003385196293344295,
"loss": 0.4017,
"step": 4200
},
{
"epoch": 0.7797740322281904,
"grad_norm": 0.2909979077113848,
"learning_rate": 0.0003378348975377319,
"loss": 0.3481,
"step": 4210
},
{
"epoch": 0.7816262270790887,
"grad_norm": 0.23332383664304898,
"learning_rate": 0.0003371494132102414,
"loss": 0.3445,
"step": 4220
},
{
"epoch": 0.7834784219299871,
"grad_norm": 0.21450077928300515,
"learning_rate": 0.0003364631822248863,
"loss": 0.3472,
"step": 4230
},
{
"epoch": 0.7853306167808853,
"grad_norm": 0.21521239472704395,
"learning_rate": 0.00033577621046099214,
"loss": 0.3326,
"step": 4240
},
{
"epoch": 0.7871828116317837,
"grad_norm": 0.21746868050833518,
"learning_rate": 0.00033508850380423107,
"loss": 0.317,
"step": 4250
},
{
"epoch": 0.789035006482682,
"grad_norm": 0.25145609268154195,
"learning_rate": 0.00033440006814657123,
"loss": 0.3903,
"step": 4260
},
{
"epoch": 0.7908872013335803,
"grad_norm": 0.2493850757271924,
"learning_rate": 0.00033371090938622683,
"loss": 0.376,
"step": 4270
},
{
"epoch": 0.7927393961844786,
"grad_norm": 0.27042518686478084,
"learning_rate": 0.00033302103342760717,
"loss": 0.3324,
"step": 4280
},
{
"epoch": 0.7945915910353769,
"grad_norm": 0.36372007737066575,
"learning_rate": 0.0003323304461812663,
"loss": 0.2962,
"step": 4290
},
{
"epoch": 0.7964437858862753,
"grad_norm": 0.2789450982129661,
"learning_rate": 0.0003316391535638521,
"loss": 0.4018,
"step": 4300
},
{
"epoch": 0.7982959807371736,
"grad_norm": 0.30183962763634775,
"learning_rate": 0.00033094716149805587,
"loss": 0.3866,
"step": 4310
},
{
"epoch": 0.8001481755880718,
"grad_norm": 0.21612720841935062,
"learning_rate": 0.0003302544759125615,
"loss": 0.4077,
"step": 4320
},
{
"epoch": 0.8020003704389702,
"grad_norm": 0.23394333144621351,
"learning_rate": 0.00032956110274199457,
"loss": 0.386,
"step": 4330
},
{
"epoch": 0.8038525652898685,
"grad_norm": 0.23944805976592476,
"learning_rate": 0.00032886704792687156,
"loss": 0.2975,
"step": 4340
},
{
"epoch": 0.8057047601407669,
"grad_norm": 0.30206829611790686,
"learning_rate": 0.0003281723174135491,
"loss": 0.3464,
"step": 4350
},
{
"epoch": 0.8075569549916651,
"grad_norm": 0.25395526533782503,
"learning_rate": 0.00032747691715417297,
"loss": 0.3839,
"step": 4360
},
{
"epoch": 0.8094091498425634,
"grad_norm": 0.2701846283890953,
"learning_rate": 0.0003267808531066268,
"loss": 0.3718,
"step": 4370
},
{
"epoch": 0.8112613446934618,
"grad_norm": 0.3284423662284243,
"learning_rate": 0.00032608413123448127,
"loss": 0.3123,
"step": 4380
},
{
"epoch": 0.81311353954436,
"grad_norm": 0.19093953526607452,
"learning_rate": 0.00032538675750694323,
"loss": 0.3178,
"step": 4390
},
{
"epoch": 0.8149657343952584,
"grad_norm": 0.2588745305552011,
"learning_rate": 0.0003246887378988044,
"loss": 0.3364,
"step": 4400
},
{
"epoch": 0.8168179292461567,
"grad_norm": 0.2944248033604882,
"learning_rate": 0.00032399007839038974,
"loss": 0.3851,
"step": 4410
},
{
"epoch": 0.818670124097055,
"grad_norm": 0.35233338424624305,
"learning_rate": 0.00032329078496750685,
"loss": 0.3935,
"step": 4420
},
{
"epoch": 0.8205223189479534,
"grad_norm": 0.2529989683445966,
"learning_rate": 0.00032259086362139444,
"loss": 0.3545,
"step": 4430
},
{
"epoch": 0.8223745137988516,
"grad_norm": 0.21890769609197974,
"learning_rate": 0.00032189032034867095,
"loss": 0.3322,
"step": 4440
},
{
"epoch": 0.82422670864975,
"grad_norm": 0.2966639221943858,
"learning_rate": 0.00032118916115128317,
"loss": 0.3413,
"step": 4450
},
{
"epoch": 0.8260789035006483,
"grad_norm": 0.28138389738354624,
"learning_rate": 0.00032048739203645484,
"loss": 0.3594,
"step": 4460
},
{
"epoch": 0.8279310983515465,
"grad_norm": 0.26012433275701663,
"learning_rate": 0.00031978501901663544,
"loss": 0.354,
"step": 4470
},
{
"epoch": 0.8297832932024449,
"grad_norm": 0.22288136348571755,
"learning_rate": 0.00031908204810944806,
"loss": 0.3345,
"step": 4480
},
{
"epoch": 0.8316354880533432,
"grad_norm": 0.2563012485418534,
"learning_rate": 0.0003183784853376386,
"loss": 0.377,
"step": 4490
},
{
"epoch": 0.8334876829042416,
"grad_norm": 0.19175987210580075,
"learning_rate": 0.00031767433672902357,
"loss": 0.378,
"step": 4500
},
{
"epoch": 0.8353398777551398,
"grad_norm": 0.27929483171815755,
"learning_rate": 0.0003169696083164387,
"loss": 0.4083,
"step": 4510
},
{
"epoch": 0.8371920726060381,
"grad_norm": 0.22806754292261686,
"learning_rate": 0.00031626430613768727,
"loss": 0.2805,
"step": 4520
},
{
"epoch": 0.8390442674569365,
"grad_norm": 0.2098902858669142,
"learning_rate": 0.0003155584362354883,
"loss": 0.3046,
"step": 4530
},
{
"epoch": 0.8408964623078348,
"grad_norm": 0.22326173310010555,
"learning_rate": 0.0003148520046574248,
"loss": 0.3618,
"step": 4540
},
{
"epoch": 0.8427486571587331,
"grad_norm": 0.28432435874722173,
"learning_rate": 0.00031414501745589214,
"loss": 0.3047,
"step": 4550
},
{
"epoch": 0.8446008520096314,
"grad_norm": 0.22658460752200546,
"learning_rate": 0.0003134374806880458,
"loss": 0.3075,
"step": 4560
},
{
"epoch": 0.8464530468605297,
"grad_norm": 0.2326511532797664,
"learning_rate": 0.00031272940041574985,
"loss": 0.3253,
"step": 4570
},
{
"epoch": 0.8483052417114281,
"grad_norm": 0.26196194032003345,
"learning_rate": 0.00031202078270552483,
"loss": 0.3672,
"step": 4580
},
{
"epoch": 0.8501574365623263,
"grad_norm": 0.2216415083774707,
"learning_rate": 0.00031131163362849563,
"loss": 0.361,
"step": 4590
},
{
"epoch": 0.8520096314132247,
"grad_norm": 0.31309200526058145,
"learning_rate": 0.0003106019592603401,
"loss": 0.4028,
"step": 4600
},
{
"epoch": 0.853861826264123,
"grad_norm": 0.30199878040880657,
"learning_rate": 0.000309891765681236,
"loss": 0.3254,
"step": 4610
},
{
"epoch": 0.8557140211150213,
"grad_norm": 0.2657478340310185,
"learning_rate": 0.0003091810589758099,
"loss": 0.3965,
"step": 4620
},
{
"epoch": 0.8575662159659196,
"grad_norm": 0.26801220601237896,
"learning_rate": 0.0003084698452330844,
"loss": 0.2717,
"step": 4630
},
{
"epoch": 0.8594184108168179,
"grad_norm": 0.2691236527559968,
"learning_rate": 0.0003077581305464263,
"loss": 0.3449,
"step": 4640
},
{
"epoch": 0.8612706056677163,
"grad_norm": 0.250751208793887,
"learning_rate": 0.0003070459210134941,
"loss": 0.3398,
"step": 4650
},
{
"epoch": 0.8631228005186146,
"grad_norm": 0.2598136376324884,
"learning_rate": 0.0003063332227361861,
"loss": 0.379,
"step": 4660
},
{
"epoch": 0.8649749953695128,
"grad_norm": 0.2320138289175307,
"learning_rate": 0.00030569138145676144,
"loss": 0.4172,
"step": 4670
},
{
"epoch": 0.8668271902204112,
"grad_norm": 0.2544457573722289,
"learning_rate": 0.0003049777713908237,
"loss": 0.3363,
"step": 4680
},
{
"epoch": 0.8686793850713095,
"grad_norm": 0.21755454053442072,
"learning_rate": 0.000304263690299507,
"loss": 0.3903,
"step": 4690
},
{
"epoch": 0.8705315799222079,
"grad_norm": 0.1876698563670142,
"learning_rate": 0.0003035491443007442,
"loss": 0.3813,
"step": 4700
},
{
"epoch": 0.8723837747731061,
"grad_norm": 0.23125086361592628,
"learning_rate": 0.0003028341395164513,
"loss": 0.326,
"step": 4710
},
{
"epoch": 0.8742359696240044,
"grad_norm": 0.24526039999109062,
"learning_rate": 0.0003021186820724752,
"loss": 0.3818,
"step": 4720
},
{
"epoch": 0.8760881644749028,
"grad_norm": 0.23276472003991475,
"learning_rate": 0.0003014027780985406,
"loss": 0.3286,
"step": 4730
},
{
"epoch": 0.8779403593258011,
"grad_norm": 0.2879683324317072,
"learning_rate": 0.00030068643372819804,
"loss": 0.3563,
"step": 4740
},
{
"epoch": 0.8797925541766994,
"grad_norm": 0.19871362889489913,
"learning_rate": 0.0002999696550987713,
"loss": 0.3271,
"step": 4750
},
{
"epoch": 0.8816447490275977,
"grad_norm": 0.2749990294223314,
"learning_rate": 0.00029925244835130466,
"loss": 0.36,
"step": 4760
},
{
"epoch": 0.883496943878496,
"grad_norm": 0.19581874215709116,
"learning_rate": 0.00029853481963051015,
"loss": 0.3869,
"step": 4770
},
{
"epoch": 0.8853491387293944,
"grad_norm": 0.25690630291268424,
"learning_rate": 0.0002978167750847153,
"loss": 0.3291,
"step": 4780
},
{
"epoch": 0.8872013335802926,
"grad_norm": 0.23380636858065187,
"learning_rate": 0.0002970983208658101,
"loss": 0.3148,
"step": 4790
},
{
"epoch": 0.889053528431191,
"grad_norm": 0.27392706669357925,
"learning_rate": 0.00029637946312919443,
"loss": 0.3471,
"step": 4800
},
{
"epoch": 0.8909057232820893,
"grad_norm": 0.262683330886347,
"learning_rate": 0.00029566020803372544,
"loss": 0.3581,
"step": 4810
},
{
"epoch": 0.8927579181329875,
"grad_norm": 0.1967433279025824,
"learning_rate": 0.0002949405617416647,
"loss": 0.3244,
"step": 4820
},
{
"epoch": 0.8946101129838859,
"grad_norm": 0.21893101415992228,
"learning_rate": 0.00029422053041862524,
"loss": 0.2418,
"step": 4830
},
{
"epoch": 0.8964623078347842,
"grad_norm": 0.3050479264269311,
"learning_rate": 0.000293500120233519,
"loss": 0.3154,
"step": 4840
},
{
"epoch": 0.8983145026856826,
"grad_norm": 0.22098931345400527,
"learning_rate": 0.00029277933735850366,
"loss": 0.3875,
"step": 4850
},
{
"epoch": 0.9001666975365809,
"grad_norm": 0.18665489074313069,
"learning_rate": 0.0002920581879689302,
"loss": 0.3203,
"step": 4860
},
{
"epoch": 0.9020188923874791,
"grad_norm": 0.22546452927540434,
"learning_rate": 0.00029133667824328944,
"loss": 0.3174,
"step": 4870
},
{
"epoch": 0.9038710872383775,
"grad_norm": 0.273911749633942,
"learning_rate": 0.0002906148143631597,
"loss": 0.4109,
"step": 4880
},
{
"epoch": 0.9057232820892758,
"grad_norm": 0.2862382822755954,
"learning_rate": 0.0002898926025131534,
"loss": 0.3438,
"step": 4890
},
{
"epoch": 0.9075754769401742,
"grad_norm": 0.2256784413424552,
"learning_rate": 0.0002891700488808641,
"loss": 0.4231,
"step": 4900
},
{
"epoch": 0.9094276717910724,
"grad_norm": 0.25475613390595164,
"learning_rate": 0.0002884471596568138,
"loss": 0.311,
"step": 4910
},
{
"epoch": 0.9112798666419707,
"grad_norm": 0.22040988223176197,
"learning_rate": 0.0002877239410343995,
"loss": 0.3609,
"step": 4920
},
{
"epoch": 0.9131320614928691,
"grad_norm": 0.21405974357001087,
"learning_rate": 0.0002870003992098406,
"loss": 0.3199,
"step": 4930
},
{
"epoch": 0.9149842563437673,
"grad_norm": 0.22165830710412393,
"learning_rate": 0.00028627654038212535,
"loss": 0.2932,
"step": 4940
},
{
"epoch": 0.9168364511946657,
"grad_norm": 0.2539298146212295,
"learning_rate": 0.000285552370752958,
"loss": 0.3203,
"step": 4950
},
{
"epoch": 0.918688646045564,
"grad_norm": 0.2519284526672049,
"learning_rate": 0.0002848278965267057,
"loss": 0.299,
"step": 4960
},
{
"epoch": 0.9205408408964623,
"grad_norm": 0.21558726442907455,
"learning_rate": 0.000284103123910345,
"loss": 0.3227,
"step": 4970
},
{
"epoch": 0.9223930357473606,
"grad_norm": 0.2314909389156984,
"learning_rate": 0.00028337805911340914,
"loss": 0.3018,
"step": 4980
},
{
"epoch": 0.9242452305982589,
"grad_norm": 0.278811225532839,
"learning_rate": 0.00028265270834793466,
"loss": 0.3002,
"step": 4990
},
{
"epoch": 0.9260974254491573,
"grad_norm": 0.21464467115282912,
"learning_rate": 0.0002819270778284081,
"loss": 0.2984,
"step": 5000
},
{
"epoch": 0.9279496203000556,
"grad_norm": 0.21949485740442687,
"learning_rate": 0.0002812011737717127,
"loss": 0.3034,
"step": 5010
},
{
"epoch": 0.9298018151509538,
"grad_norm": 0.22922734336855702,
"learning_rate": 0.0002804750023970753,
"loss": 0.3648,
"step": 5020
},
{
"epoch": 0.9316540100018522,
"grad_norm": 0.2807666058464406,
"learning_rate": 0.00027974856992601314,
"loss": 0.347,
"step": 5030
},
{
"epoch": 0.9335062048527505,
"grad_norm": 0.21380147064458355,
"learning_rate": 0.00027902188258228033,
"loss": 0.2868,
"step": 5040
},
{
"epoch": 0.9353583997036489,
"grad_norm": 0.23226632039182726,
"learning_rate": 0.00027829494659181454,
"loss": 0.3373,
"step": 5050
},
{
"epoch": 0.9372105945545471,
"grad_norm": 0.16664382791007723,
"learning_rate": 0.0002775677681826838,
"loss": 0.3425,
"step": 5060
},
{
"epoch": 0.9390627894054454,
"grad_norm": 0.2131603970341897,
"learning_rate": 0.00027684035358503315,
"loss": 0.356,
"step": 5070
},
{
"epoch": 0.9409149842563438,
"grad_norm": 0.2943760673928641,
"learning_rate": 0.00027611270903103095,
"loss": 0.3573,
"step": 5080
},
{
"epoch": 0.9427671791072421,
"grad_norm": 0.2862566121817152,
"learning_rate": 0.00027538484075481613,
"loss": 0.4255,
"step": 5090
},
{
"epoch": 0.9446193739581404,
"grad_norm": 0.231901510250299,
"learning_rate": 0.00027465675499244396,
"loss": 0.3407,
"step": 5100
},
{
"epoch": 0.9464715688090387,
"grad_norm": 0.2476530639942114,
"learning_rate": 0.0002739284579818333,
"loss": 0.2723,
"step": 5110
},
{
"epoch": 0.948323763659937,
"grad_norm": 0.21350073532203115,
"learning_rate": 0.0002731999559627127,
"loss": 0.3461,
"step": 5120
},
{
"epoch": 0.9501759585108354,
"grad_norm": 0.2002031483905575,
"learning_rate": 0.0002724712551765673,
"loss": 0.3514,
"step": 5130
},
{
"epoch": 0.9520281533617336,
"grad_norm": 0.2370797517823577,
"learning_rate": 0.00027174236186658515,
"loss": 0.3378,
"step": 5140
},
{
"epoch": 0.953880348212632,
"grad_norm": 0.21585863872901473,
"learning_rate": 0.0002710132822776037,
"loss": 0.3321,
"step": 5150
},
{
"epoch": 0.9557325430635303,
"grad_norm": 0.26386608394124156,
"learning_rate": 0.0002702840226560564,
"loss": 0.3436,
"step": 5160
},
{
"epoch": 0.9575847379144286,
"grad_norm": 0.2890408109766508,
"learning_rate": 0.00026955458924991923,
"loss": 0.401,
"step": 5170
},
{
"epoch": 0.9594369327653269,
"grad_norm": 0.25751071532225056,
"learning_rate": 0.00026882498830865673,
"loss": 0.3359,
"step": 5180
},
{
"epoch": 0.9612891276162252,
"grad_norm": 0.1908489549011557,
"learning_rate": 0.00026809522608316926,
"loss": 0.3446,
"step": 5190
},
{
"epoch": 0.9631413224671236,
"grad_norm": 0.2654943827624779,
"learning_rate": 0.0002673653088257388,
"loss": 0.3226,
"step": 5200
},
{
"epoch": 0.9649935173180219,
"grad_norm": 0.2090532023246876,
"learning_rate": 0.00026663524278997534,
"loss": 0.3627,
"step": 5210
},
{
"epoch": 0.9668457121689201,
"grad_norm": 0.1928560578254249,
"learning_rate": 0.00026590503423076404,
"loss": 0.3829,
"step": 5220
},
{
"epoch": 0.9686979070198185,
"grad_norm": 0.2669070196379663,
"learning_rate": 0.0002651746894042108,
"loss": 0.3034,
"step": 5230
},
{
"epoch": 0.9705501018707168,
"grad_norm": 0.30560885950305455,
"learning_rate": 0.00026444421456758887,
"loss": 0.3662,
"step": 5240
},
{
"epoch": 0.9724022967216152,
"grad_norm": 0.26179376779317864,
"learning_rate": 0.00026371361597928586,
"loss": 0.3277,
"step": 5250
},
{
"epoch": 0.9742544915725134,
"grad_norm": 0.22773579499385666,
"learning_rate": 0.0002629828998987491,
"loss": 0.3227,
"step": 5260
},
{
"epoch": 0.9761066864234117,
"grad_norm": 0.22913911318822955,
"learning_rate": 0.0002622520725864328,
"loss": 0.4155,
"step": 5270
},
{
"epoch": 0.9779588812743101,
"grad_norm": 0.26745430474124415,
"learning_rate": 0.0002615211403037441,
"loss": 0.3134,
"step": 5280
},
{
"epoch": 0.9798110761252083,
"grad_norm": 0.18747224024104983,
"learning_rate": 0.00026079010931298965,
"loss": 0.3352,
"step": 5290
},
{
"epoch": 0.9816632709761067,
"grad_norm": 0.2507770069072283,
"learning_rate": 0.0002600589858773216,
"loss": 0.2841,
"step": 5300
},
{
"epoch": 0.983515465827005,
"grad_norm": 0.2320843718590129,
"learning_rate": 0.00025932777626068405,
"loss": 0.2901,
"step": 5310
},
{
"epoch": 0.9853676606779033,
"grad_norm": 0.25694442462488337,
"learning_rate": 0.0002585964867277597,
"loss": 0.3655,
"step": 5320
},
{
"epoch": 0.9872198555288016,
"grad_norm": 0.1946752572256077,
"learning_rate": 0.00025786512354391585,
"loss": 0.3399,
"step": 5330
},
{
"epoch": 0.9890720503796999,
"grad_norm": 0.1531862751587864,
"learning_rate": 0.00025713369297515056,
"loss": 0.3309,
"step": 5340
},
{
"epoch": 0.9909242452305983,
"grad_norm": 0.23979500779092153,
"learning_rate": 0.00025640220128803965,
"loss": 0.3476,
"step": 5350
},
{
"epoch": 0.9927764400814966,
"grad_norm": 0.22955793113305528,
"learning_rate": 0.00025567065474968226,
"loss": 0.34,
"step": 5360
},
{
"epoch": 0.9946286349323948,
"grad_norm": 0.26774128565687644,
"learning_rate": 0.00025501222114748204,
"loss": 0.3265,
"step": 5370
},
{
"epoch": 0.9964808297832932,
"grad_norm": 0.2331087333203837,
"learning_rate": 0.00025428058765925466,
"loss": 0.2761,
"step": 5380
},
{
"epoch": 0.9983330246341915,
"grad_norm": 0.24526043917044132,
"learning_rate": 0.00025354891749683386,
"loss": 0.3495,
"step": 5390
},
{
"epoch": 1.0001852194850898,
"grad_norm": 0.2031173709527516,
"learning_rate": 0.0002528172169288478,
"loss": 0.3272,
"step": 5400
},
{
"epoch": 1.0020374143359883,
"grad_norm": 0.2229851857312578,
"learning_rate": 0.0002520854922241855,
"loss": 0.2226,
"step": 5410
},
{
"epoch": 1.0038896091868865,
"grad_norm": 0.23237399050753563,
"learning_rate": 0.0002513537496519425,
"loss": 0.2502,
"step": 5420
},
{
"epoch": 1.0057418040377848,
"grad_norm": 0.22482059046916258,
"learning_rate": 0.00025062199548136767,
"loss": 0.2567,
"step": 5430
},
{
"epoch": 1.007593998888683,
"grad_norm": 0.19384034239788644,
"learning_rate": 0.00024989023598180886,
"loss": 0.231,
"step": 5440
},
{
"epoch": 1.0094461937395813,
"grad_norm": 0.18371330112888887,
"learning_rate": 0.0002491584774226599,
"loss": 0.2927,
"step": 5450
},
{
"epoch": 1.0112983885904798,
"grad_norm": 0.21546778676484551,
"learning_rate": 0.0002484267260733065,
"loss": 0.265,
"step": 5460
},
{
"epoch": 1.013150583441378,
"grad_norm": 0.14298891444963896,
"learning_rate": 0.0002476949882030726,
"loss": 0.2211,
"step": 5470
},
{
"epoch": 1.0150027782922764,
"grad_norm": 0.25187217178584165,
"learning_rate": 0.0002469632700811665,
"loss": 0.2581,
"step": 5480
},
{
"epoch": 1.0168549731431746,
"grad_norm": 0.31946252092124755,
"learning_rate": 0.00024623157797662757,
"loss": 0.2171,
"step": 5490
},
{
"epoch": 1.018707167994073,
"grad_norm": 0.20257626106772428,
"learning_rate": 0.000245499918158272,
"loss": 0.21,
"step": 5500
},
{
"epoch": 1.0205593628449714,
"grad_norm": 0.30792020448282925,
"learning_rate": 0.00024476829689463965,
"loss": 0.2199,
"step": 5510
},
{
"epoch": 1.0224115576958697,
"grad_norm": 0.2359106076314458,
"learning_rate": 0.0002440367204539398,
"loss": 0.2221,
"step": 5520
},
{
"epoch": 1.024263752546768,
"grad_norm": 0.2642461112213505,
"learning_rate": 0.00024330519510399774,
"loss": 0.287,
"step": 5530
},
{
"epoch": 1.0261159473976662,
"grad_norm": 0.25013845200803386,
"learning_rate": 0.00024257372711220134,
"loss": 0.2578,
"step": 5540
},
{
"epoch": 1.0279681422485645,
"grad_norm": 0.26551429905341034,
"learning_rate": 0.00024184232274544672,
"loss": 0.2509,
"step": 5550
},
{
"epoch": 1.029820337099463,
"grad_norm": 0.2070332092773878,
"learning_rate": 0.00024111098827008494,
"loss": 0.2202,
"step": 5560
},
{
"epoch": 1.0316725319503612,
"grad_norm": 0.21040587853785286,
"learning_rate": 0.00024037972995186838,
"loss": 0.2858,
"step": 5570
},
{
"epoch": 1.0335247268012595,
"grad_norm": 0.21864583485000008,
"learning_rate": 0.00023964855405589689,
"loss": 0.2114,
"step": 5580
},
{
"epoch": 1.0353769216521578,
"grad_norm": 0.21646010024279735,
"learning_rate": 0.00023891746684656412,
"loss": 0.2519,
"step": 5590
},
{
"epoch": 1.037229116503056,
"grad_norm": 0.31512168932825474,
"learning_rate": 0.00023818647458750388,
"loss": 0.2967,
"step": 5600
},
{
"epoch": 1.0390813113539545,
"grad_norm": 0.20525167225456686,
"learning_rate": 0.00023745558354153654,
"loss": 0.2591,
"step": 5610
},
{
"epoch": 1.0409335062048528,
"grad_norm": 0.23384175420672978,
"learning_rate": 0.0002367247999706154,
"loss": 0.2236,
"step": 5620
},
{
"epoch": 1.042785701055751,
"grad_norm": 0.24586451573414675,
"learning_rate": 0.00023599413013577277,
"loss": 0.2807,
"step": 5630
},
{
"epoch": 1.0446378959066493,
"grad_norm": 0.31412889304572406,
"learning_rate": 0.00023526358029706665,
"loss": 0.2676,
"step": 5640
},
{
"epoch": 1.0464900907575476,
"grad_norm": 0.157853905207218,
"learning_rate": 0.00023453315671352693,
"loss": 0.2769,
"step": 5650
},
{
"epoch": 1.0483422856084461,
"grad_norm": 0.2229105615382073,
"learning_rate": 0.00023380286564310176,
"loss": 0.2735,
"step": 5660
},
{
"epoch": 1.0501944804593444,
"grad_norm": 0.26127473765870846,
"learning_rate": 0.0002330727133426041,
"loss": 0.3007,
"step": 5670
},
{
"epoch": 1.0520466753102427,
"grad_norm": 0.3906751493250249,
"learning_rate": 0.00023234270606765778,
"loss": 0.2809,
"step": 5680
},
{
"epoch": 1.053898870161141,
"grad_norm": 0.2398049248934978,
"learning_rate": 0.00023161285007264446,
"loss": 0.2144,
"step": 5690
},
{
"epoch": 1.0557510650120392,
"grad_norm": 0.24411940105501112,
"learning_rate": 0.0002308831516106494,
"loss": 0.223,
"step": 5700
},
{
"epoch": 1.0576032598629377,
"grad_norm": 0.2547297157594742,
"learning_rate": 0.0002301536169334082,
"loss": 0.2458,
"step": 5710
},
{
"epoch": 1.059455454713836,
"grad_norm": 0.18393906015457895,
"learning_rate": 0.00022942425229125328,
"loss": 0.248,
"step": 5720
},
{
"epoch": 1.0613076495647342,
"grad_norm": 0.24279551434371524,
"learning_rate": 0.0002286950639330604,
"loss": 0.2709,
"step": 5730
},
{
"epoch": 1.0631598444156325,
"grad_norm": 0.23381376758753333,
"learning_rate": 0.00022796605810619487,
"loss": 0.2361,
"step": 5740
},
{
"epoch": 1.0650120392665308,
"grad_norm": 0.24452694586413046,
"learning_rate": 0.00022723724105645814,
"loss": 0.2076,
"step": 5750
},
{
"epoch": 1.0668642341174293,
"grad_norm": 0.30441717560616044,
"learning_rate": 0.00022650861902803426,
"loss": 0.2922,
"step": 5760
},
{
"epoch": 1.0687164289683275,
"grad_norm": 0.2588550928583629,
"learning_rate": 0.00022578019826343656,
"loss": 0.2687,
"step": 5770
},
{
"epoch": 1.0705686238192258,
"grad_norm": 0.17900093913620954,
"learning_rate": 0.00022505198500345403,
"loss": 0.2467,
"step": 5780
},
{
"epoch": 1.072420818670124,
"grad_norm": 0.2492431472220246,
"learning_rate": 0.00022432398548709767,
"loss": 0.2938,
"step": 5790
},
{
"epoch": 1.0742730135210223,
"grad_norm": 0.21358503411722063,
"learning_rate": 0.00022359620595154743,
"loss": 0.2038,
"step": 5800
},
{
"epoch": 1.0761252083719208,
"grad_norm": 0.28309019763963955,
"learning_rate": 0.00022286865263209833,
"loss": 0.2905,
"step": 5810
},
{
"epoch": 1.077977403222819,
"grad_norm": 0.21729388154855128,
"learning_rate": 0.00022214133176210756,
"loss": 0.226,
"step": 5820
},
{
"epoch": 1.0798295980737174,
"grad_norm": 0.18775475682209616,
"learning_rate": 0.0002214142495729405,
"loss": 0.2762,
"step": 5830
},
{
"epoch": 1.0816817929246156,
"grad_norm": 0.19069211253783463,
"learning_rate": 0.00022068741229391777,
"loss": 0.2256,
"step": 5840
},
{
"epoch": 1.083533987775514,
"grad_norm": 0.25813186890444373,
"learning_rate": 0.00021996082615226176,
"loss": 0.2409,
"step": 5850
},
{
"epoch": 1.0853861826264124,
"grad_norm": 0.19945938160620094,
"learning_rate": 0.00021923449737304312,
"loss": 0.2536,
"step": 5860
},
{
"epoch": 1.0872383774773107,
"grad_norm": 0.25882839571818395,
"learning_rate": 0.00021850843217912757,
"loss": 0.277,
"step": 5870
},
{
"epoch": 1.089090572328209,
"grad_norm": 0.3164832568487736,
"learning_rate": 0.0002177826367911225,
"loss": 0.2705,
"step": 5880
},
{
"epoch": 1.0909427671791072,
"grad_norm": 0.26233993949922385,
"learning_rate": 0.0002170571174273238,
"loss": 0.2524,
"step": 5890
},
{
"epoch": 1.0927949620300055,
"grad_norm": 0.21974259388964484,
"learning_rate": 0.0002163318803036624,
"loss": 0.2304,
"step": 5900
},
{
"epoch": 1.094647156880904,
"grad_norm": 0.2423119808479642,
"learning_rate": 0.00021560693163365127,
"loss": 0.2864,
"step": 5910
},
{
"epoch": 1.0964993517318022,
"grad_norm": 0.23788077135736266,
"learning_rate": 0.00021488227762833187,
"loss": 0.223,
"step": 5920
},
{
"epoch": 1.0983515465827005,
"grad_norm": 0.2626939992945942,
"learning_rate": 0.00021415792449622128,
"loss": 0.2174,
"step": 5930
},
{
"epoch": 1.1002037414335988,
"grad_norm": 0.15991056421689562,
"learning_rate": 0.0002134338784432587,
"loss": 0.2381,
"step": 5940
},
{
"epoch": 1.102055936284497,
"grad_norm": 0.20700833727267778,
"learning_rate": 0.00021271014567275239,
"loss": 0.2646,
"step": 5950
},
{
"epoch": 1.1039081311353955,
"grad_norm": 0.3351339504582773,
"learning_rate": 0.00021198673238532665,
"loss": 0.2484,
"step": 5960
},
{
"epoch": 1.1057603259862938,
"grad_norm": 0.25621425870572345,
"learning_rate": 0.00021126364477886848,
"loss": 0.2078,
"step": 5970
},
{
"epoch": 1.107612520837192,
"grad_norm": 0.23131050803651781,
"learning_rate": 0.00021054088904847476,
"loss": 0.2254,
"step": 5980
},
{
"epoch": 1.1094647156880904,
"grad_norm": 0.18439721493846953,
"learning_rate": 0.0002098184713863987,
"loss": 0.2095,
"step": 5990
},
{
"epoch": 1.1113169105389886,
"grad_norm": 0.2388500241914586,
"learning_rate": 0.00020909639798199754,
"loss": 0.2091,
"step": 6000
},
{
"epoch": 1.1131691053898871,
"grad_norm": 0.21529124736985356,
"learning_rate": 0.00020837467502167868,
"loss": 0.2167,
"step": 6010
},
{
"epoch": 1.1150213002407854,
"grad_norm": 0.16618163554721885,
"learning_rate": 0.0002076533086888472,
"loss": 0.2104,
"step": 6020
},
{
"epoch": 1.1168734950916837,
"grad_norm": 0.33925928207566014,
"learning_rate": 0.00020693230516385266,
"loss": 0.2119,
"step": 6030
},
{
"epoch": 1.118725689942582,
"grad_norm": 0.1826830206402772,
"learning_rate": 0.0002062116706239365,
"loss": 0.2462,
"step": 6040
},
{
"epoch": 1.1205778847934802,
"grad_norm": 0.19046785383617137,
"learning_rate": 0.00020549141124317865,
"loss": 0.2117,
"step": 6050
},
{
"epoch": 1.1224300796443787,
"grad_norm": 0.24622926500228018,
"learning_rate": 0.00020477153319244478,
"loss": 0.227,
"step": 6060
},
{
"epoch": 1.124282274495277,
"grad_norm": 0.2165508639382145,
"learning_rate": 0.00020405204263933375,
"loss": 0.2638,
"step": 6070
},
{
"epoch": 1.1261344693461752,
"grad_norm": 0.23498687913366198,
"learning_rate": 0.00020333294574812415,
"loss": 0.2281,
"step": 6080
},
{
"epoch": 1.1279866641970735,
"grad_norm": 0.19311160739289338,
"learning_rate": 0.00020261424867972226,
"loss": 0.2159,
"step": 6090
},
{
"epoch": 1.1298388590479718,
"grad_norm": 0.20569897318234276,
"learning_rate": 0.00020189595759160855,
"loss": 0.2557,
"step": 6100
},
{
"epoch": 1.1316910538988703,
"grad_norm": 0.1637570670386419,
"learning_rate": 0.00020117807863778537,
"loss": 0.2231,
"step": 6110
},
{
"epoch": 1.1335432487497685,
"grad_norm": 0.26014467806402464,
"learning_rate": 0.000200460617968724,
"loss": 0.286,
"step": 6120
},
{
"epoch": 1.1353954436006668,
"grad_norm": 0.2505673154655342,
"learning_rate": 0.00019974358173131202,
"loss": 0.2853,
"step": 6130
},
{
"epoch": 1.137247638451565,
"grad_norm": 0.22347929448158552,
"learning_rate": 0.00019902697606880089,
"loss": 0.2677,
"step": 6140
},
{
"epoch": 1.1390998333024633,
"grad_norm": 0.20920726669707854,
"learning_rate": 0.00019831080712075268,
"loss": 0.244,
"step": 6150
},
{
"epoch": 1.1409520281533618,
"grad_norm": 0.20688915094296348,
"learning_rate": 0.00019759508102298846,
"loss": 0.2327,
"step": 6160
},
{
"epoch": 1.14280422300426,
"grad_norm": 0.25157909739969075,
"learning_rate": 0.00019687980390753465,
"loss": 0.2485,
"step": 6170
},
{
"epoch": 1.1446564178551584,
"grad_norm": 0.23866241222091628,
"learning_rate": 0.00019616498190257121,
"loss": 0.2492,
"step": 6180
},
{
"epoch": 1.1465086127060566,
"grad_norm": 0.264337208089594,
"learning_rate": 0.00019545062113237875,
"loss": 0.2758,
"step": 6190
},
{
"epoch": 1.148360807556955,
"grad_norm": 0.25587094035952673,
"learning_rate": 0.00019473672771728648,
"loss": 0.2129,
"step": 6200
},
{
"epoch": 1.1502130024078534,
"grad_norm": 0.16128043145453166,
"learning_rate": 0.00019402330777361934,
"loss": 0.2231,
"step": 6210
},
{
"epoch": 1.1520651972587517,
"grad_norm": 0.233999456400375,
"learning_rate": 0.0001933103674136458,
"loss": 0.2443,
"step": 6220
},
{
"epoch": 1.15391739210965,
"grad_norm": 0.23923089697365066,
"learning_rate": 0.00019259791274552548,
"loss": 0.2532,
"step": 6230
},
{
"epoch": 1.1557695869605482,
"grad_norm": 0.18310940478929233,
"learning_rate": 0.00019188594987325675,
"loss": 0.2084,
"step": 6240
},
{
"epoch": 1.1576217818114465,
"grad_norm": 0.20715212646569164,
"learning_rate": 0.00019117448489662468,
"loss": 0.2315,
"step": 6250
},
{
"epoch": 1.159473976662345,
"grad_norm": 0.16666508872746613,
"learning_rate": 0.00019046352391114836,
"loss": 0.2214,
"step": 6260
},
{
"epoch": 1.1613261715132432,
"grad_norm": 0.19036221587749683,
"learning_rate": 0.000189753073008029,
"loss": 0.2011,
"step": 6270
},
{
"epoch": 1.1631783663641415,
"grad_norm": 0.18630573209584733,
"learning_rate": 0.00018904313827409764,
"loss": 0.2081,
"step": 6280
},
{
"epoch": 1.1650305612150398,
"grad_norm": 0.20378341723916718,
"learning_rate": 0.0001883337257917631,
"loss": 0.2573,
"step": 6290
},
{
"epoch": 1.166882756065938,
"grad_norm": 0.24764507328618723,
"learning_rate": 0.00018762484163895962,
"loss": 0.2245,
"step": 6300
},
{
"epoch": 1.1687349509168365,
"grad_norm": 0.2536985360849042,
"learning_rate": 0.00018691649188909494,
"loss": 0.2427,
"step": 6310
},
{
"epoch": 1.1705871457677348,
"grad_norm": 0.22553827575055346,
"learning_rate": 0.00018620868261099856,
"loss": 0.2556,
"step": 6320
},
{
"epoch": 1.172439340618633,
"grad_norm": 0.238267227934858,
"learning_rate": 0.00018550141986886914,
"loss": 0.2079,
"step": 6330
},
{
"epoch": 1.1742915354695314,
"grad_norm": 0.24364164673526545,
"learning_rate": 0.00018479470972222295,
"loss": 0.2377,
"step": 6340
},
{
"epoch": 1.1761437303204296,
"grad_norm": 0.23684110576656128,
"learning_rate": 0.00018408855822584186,
"loss": 0.2106,
"step": 6350
},
{
"epoch": 1.1779959251713281,
"grad_norm": 0.24133180260347029,
"learning_rate": 0.0001833829714297216,
"loss": 0.2325,
"step": 6360
},
{
"epoch": 1.1798481200222264,
"grad_norm": 0.27161152313481657,
"learning_rate": 0.0001826779553790196,
"loss": 0.2816,
"step": 6370
},
{
"epoch": 1.1817003148731247,
"grad_norm": 0.2549979606684111,
"learning_rate": 0.0001819735161140035,
"loss": 0.2716,
"step": 6380
},
{
"epoch": 1.183552509724023,
"grad_norm": 0.2171602609914945,
"learning_rate": 0.0001812696596699992,
"loss": 0.1919,
"step": 6390
},
{
"epoch": 1.1854047045749212,
"grad_norm": 0.2426365201904578,
"learning_rate": 0.00018056639207733943,
"loss": 0.1937,
"step": 6400
},
{
"epoch": 1.1872568994258197,
"grad_norm": 0.23103167647591963,
"learning_rate": 0.0001798637193613118,
"loss": 0.2212,
"step": 6410
},
{
"epoch": 1.189109094276718,
"grad_norm": 0.18152043318271277,
"learning_rate": 0.00017916164754210723,
"loss": 0.2525,
"step": 6420
},
{
"epoch": 1.1909612891276162,
"grad_norm": 0.2404169525253988,
"learning_rate": 0.00017846018263476844,
"loss": 0.2365,
"step": 6430
},
{
"epoch": 1.1928134839785145,
"grad_norm": 0.2527427714001698,
"learning_rate": 0.00017775933064913838,
"loss": 0.2382,
"step": 6440
},
{
"epoch": 1.1946656788294128,
"grad_norm": 0.2504119633783523,
"learning_rate": 0.0001770590975898089,
"loss": 0.2435,
"step": 6450
},
{
"epoch": 1.1965178736803113,
"grad_norm": 0.21122876356534948,
"learning_rate": 0.0001763594894560689,
"loss": 0.2182,
"step": 6460
},
{
"epoch": 1.1983700685312095,
"grad_norm": 0.17197814060082,
"learning_rate": 0.00017566051224185357,
"loss": 0.2316,
"step": 6470
},
{
"epoch": 1.2002222633821078,
"grad_norm": 0.2261749683499797,
"learning_rate": 0.0001749621719356923,
"loss": 0.2834,
"step": 6480
},
{
"epoch": 1.202074458233006,
"grad_norm": 0.18709901189179085,
"learning_rate": 0.00017426447452065786,
"loss": 0.2329,
"step": 6490
},
{
"epoch": 1.2039266530839043,
"grad_norm": 0.22261464085835025,
"learning_rate": 0.00017356742597431503,
"loss": 0.2294,
"step": 6500
},
{
"epoch": 1.2057788479348028,
"grad_norm": 0.1562966068716981,
"learning_rate": 0.0001728710322686694,
"loss": 0.2676,
"step": 6510
},
{
"epoch": 1.207631042785701,
"grad_norm": 0.20080366502853164,
"learning_rate": 0.00017217529937011612,
"loss": 0.2034,
"step": 6520
},
{
"epoch": 1.2094832376365994,
"grad_norm": 0.2488017093046758,
"learning_rate": 0.00017148023323938877,
"loss": 0.2576,
"step": 6530
},
{
"epoch": 1.2113354324874976,
"grad_norm": 0.3018899089016778,
"learning_rate": 0.00017078583983150852,
"loss": 0.2521,
"step": 6540
},
{
"epoch": 1.213187627338396,
"grad_norm": 0.21650035591018305,
"learning_rate": 0.00017009212509573273,
"loss": 0.1992,
"step": 6550
},
{
"epoch": 1.2150398221892944,
"grad_norm": 0.18604059543117943,
"learning_rate": 0.00016939909497550455,
"loss": 0.2145,
"step": 6560
},
{
"epoch": 1.2168920170401927,
"grad_norm": 0.13425561299908903,
"learning_rate": 0.0001687067554084012,
"loss": 0.2121,
"step": 6570
},
{
"epoch": 1.218744211891091,
"grad_norm": 0.15061326471247105,
"learning_rate": 0.00016801511232608388,
"loss": 0.2093,
"step": 6580
},
{
"epoch": 1.2205964067419892,
"grad_norm": 0.18586921295904735,
"learning_rate": 0.00016732417165424645,
"loss": 0.2442,
"step": 6590
},
{
"epoch": 1.2224486015928875,
"grad_norm": 0.1947265751683096,
"learning_rate": 0.00016663393931256484,
"loss": 0.1964,
"step": 6600
},
{
"epoch": 1.224300796443786,
"grad_norm": 0.3014541141949089,
"learning_rate": 0.00016594442121464648,
"loss": 0.2539,
"step": 6610
},
{
"epoch": 1.2261529912946842,
"grad_norm": 0.2665331923593494,
"learning_rate": 0.00016525562326797911,
"loss": 0.2052,
"step": 6620
},
{
"epoch": 1.2280051861455825,
"grad_norm": 0.23248425733346062,
"learning_rate": 0.00016456755137388105,
"loss": 0.2206,
"step": 6630
},
{
"epoch": 1.2298573809964808,
"grad_norm": 0.21597100541187533,
"learning_rate": 0.0001638802114274497,
"loss": 0.2399,
"step": 6640
},
{
"epoch": 1.231709575847379,
"grad_norm": 0.22311107620019674,
"learning_rate": 0.0001631936093175116,
"loss": 0.2344,
"step": 6650
},
{
"epoch": 1.2335617706982775,
"grad_norm": 0.23595231727324342,
"learning_rate": 0.0001625077509265717,
"loss": 0.2302,
"step": 6660
},
{
"epoch": 1.2354139655491758,
"grad_norm": 0.18416586445656416,
"learning_rate": 0.0001618226421307635,
"loss": 0.2438,
"step": 6670
},
{
"epoch": 1.237266160400074,
"grad_norm": 0.2397024652142972,
"learning_rate": 0.00016113828879979776,
"loss": 0.2174,
"step": 6680
},
{
"epoch": 1.2391183552509724,
"grad_norm": 0.2458273041744814,
"learning_rate": 0.00016045469679691306,
"loss": 0.2649,
"step": 6690
},
{
"epoch": 1.2409705501018706,
"grad_norm": 0.24261819790944433,
"learning_rate": 0.00015977187197882529,
"loss": 0.2353,
"step": 6700
},
{
"epoch": 1.2428227449527691,
"grad_norm": 0.21058758451619233,
"learning_rate": 0.0001590898201956772,
"loss": 0.2517,
"step": 6710
},
{
"epoch": 1.2446749398036674,
"grad_norm": 0.2260538599044833,
"learning_rate": 0.0001584085472909888,
"loss": 0.2425,
"step": 6720
},
{
"epoch": 1.2465271346545657,
"grad_norm": 0.2973826520271178,
"learning_rate": 0.0001577280591016068,
"loss": 0.2344,
"step": 6730
},
{
"epoch": 1.248379329505464,
"grad_norm": 0.17773144739281946,
"learning_rate": 0.0001570483614576549,
"loss": 0.237,
"step": 6740
},
{
"epoch": 1.2502315243563622,
"grad_norm": 0.24361822775457953,
"learning_rate": 0.0001563694601824837,
"loss": 0.2208,
"step": 6750
},
{
"epoch": 1.2520837192072607,
"grad_norm": 0.19831921681917936,
"learning_rate": 0.000155691361092621,
"loss": 0.2447,
"step": 6760
},
{
"epoch": 1.253935914058159,
"grad_norm": 0.2429000368973823,
"learning_rate": 0.00015501406999772154,
"loss": 0.2525,
"step": 6770
},
{
"epoch": 1.2557881089090572,
"grad_norm": 0.2833773062005256,
"learning_rate": 0.000154337592700518,
"loss": 0.2699,
"step": 6780
},
{
"epoch": 1.2576403037599555,
"grad_norm": 0.28456822568540374,
"learning_rate": 0.00015366193499677036,
"loss": 0.2871,
"step": 6790
},
{
"epoch": 1.2594924986108538,
"grad_norm": 0.22620507444223148,
"learning_rate": 0.00015298710267521682,
"loss": 0.2287,
"step": 6800
},
{
"epoch": 1.2613446934617523,
"grad_norm": 0.28690671723743605,
"learning_rate": 0.00015231310151752407,
"loss": 0.2882,
"step": 6810
},
{
"epoch": 1.2631968883126505,
"grad_norm": 0.3475884413325309,
"learning_rate": 0.0001516399372982377,
"loss": 0.2293,
"step": 6820
},
{
"epoch": 1.2650490831635488,
"grad_norm": 0.2072556191346626,
"learning_rate": 0.000150967615784733,
"loss": 0.2185,
"step": 6830
},
{
"epoch": 1.266901278014447,
"grad_norm": 0.21644887901267165,
"learning_rate": 0.00015029614273716506,
"loss": 0.2664,
"step": 6840
},
{
"epoch": 1.2687534728653453,
"grad_norm": 0.17990296855165974,
"learning_rate": 0.0001496255239084199,
"loss": 0.2087,
"step": 6850
},
{
"epoch": 1.2706056677162438,
"grad_norm": 0.27058636297908395,
"learning_rate": 0.00014895576504406465,
"loss": 0.1908,
"step": 6860
},
{
"epoch": 1.272457862567142,
"grad_norm": 0.18569390040885966,
"learning_rate": 0.00014828687188229905,
"loss": 0.2416,
"step": 6870
},
{
"epoch": 1.2743100574180404,
"grad_norm": 0.29190142926898804,
"learning_rate": 0.00014761885015390568,
"loss": 0.2463,
"step": 6880
},
{
"epoch": 1.2761622522689386,
"grad_norm": 0.17606951118976896,
"learning_rate": 0.000146951705582201,
"loss": 0.2208,
"step": 6890
},
{
"epoch": 1.278014447119837,
"grad_norm": 0.17608746275541837,
"learning_rate": 0.00014628544388298642,
"loss": 0.219,
"step": 6900
},
{
"epoch": 1.2798666419707354,
"grad_norm": 0.16242847709515437,
"learning_rate": 0.00014562007076449944,
"loss": 0.2331,
"step": 6910
},
{
"epoch": 1.2817188368216337,
"grad_norm": 0.2755204876160437,
"learning_rate": 0.00014495559192736435,
"loss": 0.2291,
"step": 6920
},
{
"epoch": 1.283571031672532,
"grad_norm": 0.20200318254837507,
"learning_rate": 0.00014429201306454364,
"loss": 0.235,
"step": 6930
},
{
"epoch": 1.2854232265234302,
"grad_norm": 0.17156079642065042,
"learning_rate": 0.00014362933986128963,
"loss": 0.2182,
"step": 6940
},
{
"epoch": 1.2872754213743285,
"grad_norm": 0.21604115340537886,
"learning_rate": 0.0001429675779950947,
"loss": 0.2471,
"step": 6950
},
{
"epoch": 1.289127616225227,
"grad_norm": 0.187996583890282,
"learning_rate": 0.00014230673313564397,
"loss": 0.2151,
"step": 6960
},
{
"epoch": 1.2909798110761252,
"grad_norm": 0.19730532837034964,
"learning_rate": 0.00014164681094476551,
"loss": 0.2106,
"step": 6970
},
{
"epoch": 1.2928320059270235,
"grad_norm": 0.18610760518567895,
"learning_rate": 0.0001409878170763826,
"loss": 0.1997,
"step": 6980
},
{
"epoch": 1.2946842007779218,
"grad_norm": 0.26588737789650624,
"learning_rate": 0.00014032975717646505,
"loss": 0.2779,
"step": 6990
},
{
"epoch": 1.29653639562882,
"grad_norm": 0.2023558780876639,
"learning_rate": 0.0001396726368829808,
"loss": 0.1862,
"step": 7000
},
{
"epoch": 1.2983885904797186,
"grad_norm": 0.1911627012671031,
"learning_rate": 0.0001390164618258477,
"loss": 0.2309,
"step": 7010
},
{
"epoch": 1.3002407853306168,
"grad_norm": 0.11786773578619021,
"learning_rate": 0.0001383612376268852,
"loss": 0.2342,
"step": 7020
},
{
"epoch": 1.302092980181515,
"grad_norm": 0.28174803457783004,
"learning_rate": 0.00013770696989976616,
"loss": 0.2286,
"step": 7030
},
{
"epoch": 1.3039451750324134,
"grad_norm": 0.17826542771264642,
"learning_rate": 0.0001370536642499689,
"loss": 0.1801,
"step": 7040
},
{
"epoch": 1.3057973698833116,
"grad_norm": 0.2244828460772529,
"learning_rate": 0.00013640132627472918,
"loss": 0.2266,
"step": 7050
},
{
"epoch": 1.3076495647342101,
"grad_norm": 0.17076031236762176,
"learning_rate": 0.0001357499615629919,
"loss": 0.2064,
"step": 7060
},
{
"epoch": 1.3095017595851084,
"grad_norm": 0.21153152349490145,
"learning_rate": 0.00013509957569536368,
"loss": 0.2259,
"step": 7070
},
{
"epoch": 1.3113539544360067,
"grad_norm": 0.21657797572838655,
"learning_rate": 0.00013445017424406459,
"loss": 0.2174,
"step": 7080
},
{
"epoch": 1.313206149286905,
"grad_norm": 0.19916951980627734,
"learning_rate": 0.00013380176277288098,
"loss": 0.2524,
"step": 7090
},
{
"epoch": 1.3150583441378032,
"grad_norm": 0.15608777576271463,
"learning_rate": 0.00013315434683711731,
"loss": 0.2252,
"step": 7100
},
{
"epoch": 1.3169105389887017,
"grad_norm": 0.21137373945091645,
"learning_rate": 0.0001325079319835486,
"loss": 0.2512,
"step": 7110
},
{
"epoch": 1.3187627338396,
"grad_norm": 0.28789005617840957,
"learning_rate": 0.00013186252375037332,
"loss": 0.2269,
"step": 7120
},
{
"epoch": 1.3206149286904982,
"grad_norm": 0.20697477426134353,
"learning_rate": 0.0001312181276671654,
"loss": 0.1923,
"step": 7130
},
{
"epoch": 1.3224671235413965,
"grad_norm": 0.20780168330103488,
"learning_rate": 0.00013057474925482732,
"loss": 0.2,
"step": 7140
},
{
"epoch": 1.3243193183922948,
"grad_norm": 0.2619781587243672,
"learning_rate": 0.00012993239402554237,
"loss": 0.2418,
"step": 7150
},
{
"epoch": 1.3261715132431933,
"grad_norm": 0.21912577308112016,
"learning_rate": 0.00012929106748272792,
"loss": 0.2187,
"step": 7160
},
{
"epoch": 1.3280237080940915,
"grad_norm": 0.2268912171128973,
"learning_rate": 0.00012865077512098789,
"loss": 0.2028,
"step": 7170
},
{
"epoch": 1.3298759029449898,
"grad_norm": 0.21743955397611459,
"learning_rate": 0.0001280115224260658,
"loss": 0.2427,
"step": 7180
},
{
"epoch": 1.331728097795888,
"grad_norm": 0.2738954036709458,
"learning_rate": 0.00012737331487479764,
"loss": 0.2614,
"step": 7190
},
{
"epoch": 1.3335802926467863,
"grad_norm": 0.19258917852110208,
"learning_rate": 0.00012673615793506524,
"loss": 0.2099,
"step": 7200
},
{
"epoch": 1.3354324874976848,
"grad_norm": 0.2502839601700166,
"learning_rate": 0.00012610005706574918,
"loss": 0.212,
"step": 7210
},
{
"epoch": 1.337284682348583,
"grad_norm": 0.2599916951105217,
"learning_rate": 0.0001254650177166821,
"loss": 0.2124,
"step": 7220
},
{
"epoch": 1.3391368771994814,
"grad_norm": 0.177484083446667,
"learning_rate": 0.00012483104532860204,
"loss": 0.1797,
"step": 7230
},
{
"epoch": 1.3409890720503796,
"grad_norm": 0.2826696479487746,
"learning_rate": 0.00012419814533310558,
"loss": 0.2466,
"step": 7240
},
{
"epoch": 1.342841266901278,
"grad_norm": 0.25661668196827314,
"learning_rate": 0.0001235663231526019,
"loss": 0.2332,
"step": 7250
},
{
"epoch": 1.3446934617521764,
"grad_norm": 0.2568941368041713,
"learning_rate": 0.00012293558420026557,
"loss": 0.2523,
"step": 7260
},
{
"epoch": 1.3465456566030747,
"grad_norm": 0.20215212528107282,
"learning_rate": 0.00012230593387999082,
"loss": 0.2352,
"step": 7270
},
{
"epoch": 1.348397851453973,
"grad_norm": 0.24815860875352733,
"learning_rate": 0.00012167737758634473,
"loss": 0.2188,
"step": 7280
},
{
"epoch": 1.3502500463048712,
"grad_norm": 0.22038982892081588,
"learning_rate": 0.00012104992070452137,
"loss": 0.2685,
"step": 7290
},
{
"epoch": 1.3521022411557695,
"grad_norm": 0.2083445910203971,
"learning_rate": 0.00012042356861029547,
"loss": 0.2328,
"step": 7300
},
{
"epoch": 1.353954436006668,
"grad_norm": 0.20267314146087212,
"learning_rate": 0.00011979832666997642,
"loss": 0.2264,
"step": 7310
},
{
"epoch": 1.3558066308575663,
"grad_norm": 0.29234235079551857,
"learning_rate": 0.00011917420024036241,
"loss": 0.24,
"step": 7320
},
{
"epoch": 1.3576588257084645,
"grad_norm": 0.19217333964822353,
"learning_rate": 0.00011855119466869426,
"loss": 0.2551,
"step": 7330
},
{
"epoch": 1.3595110205593628,
"grad_norm": 0.18622316897174804,
"learning_rate": 0.00011792931529260992,
"loss": 0.2383,
"step": 7340
},
{
"epoch": 1.361363215410261,
"grad_norm": 0.2639171890597442,
"learning_rate": 0.00011730856744009846,
"loss": 0.2447,
"step": 7350
},
{
"epoch": 1.3632154102611596,
"grad_norm": 0.24703406547971726,
"learning_rate": 0.0001166889564294546,
"loss": 0.1885,
"step": 7360
},
{
"epoch": 1.3650676051120578,
"grad_norm": 0.2395087018493502,
"learning_rate": 0.00011607048756923327,
"loss": 0.2408,
"step": 7370
},
{
"epoch": 1.366919799962956,
"grad_norm": 0.1715869085136323,
"learning_rate": 0.00011551484651328101,
"loss": 0.2231,
"step": 7380
},
{
"epoch": 1.3687719948138544,
"grad_norm": 0.24875690978651382,
"learning_rate": 0.0001148985623288476,
"loss": 0.2107,
"step": 7390
},
{
"epoch": 1.3706241896647526,
"grad_norm": 0.21621060634153644,
"learning_rate": 0.00011428343563414629,
"loss": 0.2827,
"step": 7400
},
{
"epoch": 1.3724763845156511,
"grad_norm": 0.17411298598721778,
"learning_rate": 0.00011366947169931222,
"loss": 0.1956,
"step": 7410
},
{
"epoch": 1.3743285793665494,
"grad_norm": 0.21075418595890044,
"learning_rate": 0.00011305667578451847,
"loss": 0.2384,
"step": 7420
},
{
"epoch": 1.3761807742174477,
"grad_norm": 0.1762011368192225,
"learning_rate": 0.00011244505313993115,
"loss": 0.2248,
"step": 7430
},
{
"epoch": 1.378032969068346,
"grad_norm": 0.2713344050149392,
"learning_rate": 0.00011183460900566405,
"loss": 0.2253,
"step": 7440
},
{
"epoch": 1.3798851639192442,
"grad_norm": 0.13308645120441578,
"learning_rate": 0.00011122534861173444,
"loss": 0.2188,
"step": 7450
},
{
"epoch": 1.3817373587701427,
"grad_norm": 0.26214160905875167,
"learning_rate": 0.00011061727717801745,
"loss": 0.2509,
"step": 7460
},
{
"epoch": 1.383589553621041,
"grad_norm": 0.16725861800168582,
"learning_rate": 0.00011001039991420181,
"loss": 0.2395,
"step": 7470
},
{
"epoch": 1.3854417484719392,
"grad_norm": 0.17751505759886393,
"learning_rate": 0.00010940472201974508,
"loss": 0.1914,
"step": 7480
},
{
"epoch": 1.3872939433228375,
"grad_norm": 0.21463454020196815,
"learning_rate": 0.00010880024868382943,
"loss": 0.2086,
"step": 7490
},
{
"epoch": 1.3891461381737358,
"grad_norm": 0.2026092509755857,
"learning_rate": 0.00010819698508531659,
"loss": 0.2149,
"step": 7500
},
{
"epoch": 1.3909983330246343,
"grad_norm": 0.16323623074986704,
"learning_rate": 0.00010759493639270387,
"loss": 0.27,
"step": 7510
},
{
"epoch": 1.3928505278755325,
"grad_norm": 0.22139846358468115,
"learning_rate": 0.00010705413557727304,
"loss": 0.2054,
"step": 7520
},
{
"epoch": 1.3947027227264308,
"grad_norm": 0.25885865603646047,
"learning_rate": 0.0001064544094077661,
"loss": 0.2037,
"step": 7530
},
{
"epoch": 1.396554917577329,
"grad_norm": 0.18312190666440223,
"learning_rate": 0.00010585591307378175,
"loss": 0.2177,
"step": 7540
},
{
"epoch": 1.3984071124282274,
"grad_norm": 0.2452824521308415,
"learning_rate": 0.00010525865170297353,
"loss": 0.2443,
"step": 7550
},
{
"epoch": 1.4002593072791258,
"grad_norm": 0.22491815184492542,
"learning_rate": 0.00010466263041241426,
"loss": 0.2028,
"step": 7560
},
{
"epoch": 1.4021115021300241,
"grad_norm": 0.21626081653727397,
"learning_rate": 0.00010406785430855237,
"loss": 0.1719,
"step": 7570
},
{
"epoch": 1.4039636969809224,
"grad_norm": 0.24105946067666537,
"learning_rate": 0.00010347432848716812,
"loss": 0.225,
"step": 7580
},
{
"epoch": 1.4058158918318207,
"grad_norm": 0.23078802018114886,
"learning_rate": 0.00010288205803332975,
"loss": 0.2278,
"step": 7590
},
{
"epoch": 1.407668086682719,
"grad_norm": 0.2574880724739788,
"learning_rate": 0.00010229104802135034,
"loss": 0.244,
"step": 7600
},
{
"epoch": 1.4095202815336174,
"grad_norm": 0.24593167284827877,
"learning_rate": 0.00010170130351474377,
"loss": 0.2159,
"step": 7610
},
{
"epoch": 1.4113724763845157,
"grad_norm": 0.261530928817991,
"learning_rate": 0.00010111282956618181,
"loss": 0.1827,
"step": 7620
},
{
"epoch": 1.413224671235414,
"grad_norm": 0.19005464332149496,
"learning_rate": 0.0001005256312174505,
"loss": 0.1942,
"step": 7630
},
{
"epoch": 1.4150768660863122,
"grad_norm": 0.22377467210489174,
"learning_rate": 9.993971349940717e-05,
"loss": 0.2553,
"step": 7640
},
{
"epoch": 1.4169290609372105,
"grad_norm": 0.21440875435999618,
"learning_rate": 9.935508143193739e-05,
"loss": 0.2169,
"step": 7650
},
{
"epoch": 1.418781255788109,
"grad_norm": 0.22734623733013004,
"learning_rate": 9.877174002391165e-05,
"loss": 0.1859,
"step": 7660
},
{
"epoch": 1.4206334506390073,
"grad_norm": 0.20257954902342695,
"learning_rate": 9.818969427314275e-05,
"loss": 0.208,
"step": 7670
},
{
"epoch": 1.4224856454899055,
"grad_norm": 0.23157903079657188,
"learning_rate": 9.760894916634283e-05,
"loss": 0.2136,
"step": 7680
},
{
"epoch": 1.4243378403408038,
"grad_norm": 0.23047953760740483,
"learning_rate": 9.702950967908067e-05,
"loss": 0.2244,
"step": 7690
},
{
"epoch": 1.426190035191702,
"grad_norm": 0.1893981494941497,
"learning_rate": 9.645138077573904e-05,
"loss": 0.202,
"step": 7700
},
{
"epoch": 1.4280422300426006,
"grad_norm": 0.1944059258719957,
"learning_rate": 9.587456740947236e-05,
"loss": 0.2395,
"step": 7710
},
{
"epoch": 1.4298944248934988,
"grad_norm": 0.19154551462212566,
"learning_rate": 9.529907452216402e-05,
"loss": 0.1877,
"step": 7720
},
{
"epoch": 1.431746619744397,
"grad_norm": 0.25705195721078017,
"learning_rate": 9.472490704438403e-05,
"loss": 0.2439,
"step": 7730
},
{
"epoch": 1.4335988145952954,
"grad_norm": 0.27237298997689074,
"learning_rate": 9.4152069895347e-05,
"loss": 0.2269,
"step": 7740
},
{
"epoch": 1.4354510094461936,
"grad_norm": 0.22572015857646327,
"learning_rate": 9.358056798286982e-05,
"loss": 0.1761,
"step": 7750
},
{
"epoch": 1.4373032042970921,
"grad_norm": 0.1681521243481353,
"learning_rate": 9.301040620332962e-05,
"loss": 0.2453,
"step": 7760
},
{
"epoch": 1.4391553991479904,
"grad_norm": 0.20322718308914284,
"learning_rate": 9.244158944162198e-05,
"loss": 0.1995,
"step": 7770
},
{
"epoch": 1.4410075939988887,
"grad_norm": 0.17221136952935692,
"learning_rate": 9.187412257111882e-05,
"loss": 0.1991,
"step": 7780
},
{
"epoch": 1.442859788849787,
"grad_norm": 0.23211721231411886,
"learning_rate": 9.130801045362678e-05,
"loss": 0.225,
"step": 7790
},
{
"epoch": 1.4447119837006852,
"grad_norm": 0.2557003480049842,
"learning_rate": 9.074325793934582e-05,
"loss": 0.2396,
"step": 7800
},
{
"epoch": 1.4465641785515837,
"grad_norm": 0.2743087049471899,
"learning_rate": 9.017986986682705e-05,
"loss": 0.2622,
"step": 7810
},
{
"epoch": 1.448416373402482,
"grad_norm": 0.22044857915056804,
"learning_rate": 8.961785106293202e-05,
"loss": 0.208,
"step": 7820
},
{
"epoch": 1.4502685682533802,
"grad_norm": 0.295975717325647,
"learning_rate": 8.905720634279068e-05,
"loss": 0.2406,
"step": 7830
},
{
"epoch": 1.4521207631042785,
"grad_norm": 0.2119255826308734,
"learning_rate": 8.849794050976062e-05,
"loss": 0.1863,
"step": 7840
},
{
"epoch": 1.4539729579551768,
"grad_norm": 0.19120118368025074,
"learning_rate": 8.794005835538558e-05,
"loss": 0.1899,
"step": 7850
},
{
"epoch": 1.4558251528060753,
"grad_norm": 0.20269011463788664,
"learning_rate": 8.738356465935467e-05,
"loss": 0.1887,
"step": 7860
},
{
"epoch": 1.4576773476569735,
"grad_norm": 0.2933956506003441,
"learning_rate": 8.68284641894613e-05,
"loss": 0.1969,
"step": 7870
},
{
"epoch": 1.4595295425078718,
"grad_norm": 0.17871898787286603,
"learning_rate": 8.627476170156224e-05,
"loss": 0.2315,
"step": 7880
},
{
"epoch": 1.46138173735877,
"grad_norm": 0.2552476396822797,
"learning_rate": 8.572246193953703e-05,
"loss": 0.2485,
"step": 7890
},
{
"epoch": 1.4632339322096684,
"grad_norm": 0.31173163044095015,
"learning_rate": 8.517156963524719e-05,
"loss": 0.1816,
"step": 7900
},
{
"epoch": 1.4650861270605668,
"grad_norm": 0.2158798667093176,
"learning_rate": 8.462208950849598e-05,
"loss": 0.2469,
"step": 7910
},
{
"epoch": 1.4669383219114651,
"grad_norm": 0.24218457777393995,
"learning_rate": 8.407402626698751e-05,
"loss": 0.2161,
"step": 7920
},
{
"epoch": 1.4687905167623634,
"grad_norm": 0.1979730263341676,
"learning_rate": 8.352738460628675e-05,
"loss": 0.2037,
"step": 7930
},
{
"epoch": 1.4706427116132617,
"grad_norm": 0.2696373926575332,
"learning_rate": 8.298216920977914e-05,
"loss": 0.1691,
"step": 7940
},
{
"epoch": 1.47249490646416,
"grad_norm": 0.25798986555999925,
"learning_rate": 8.243838474863047e-05,
"loss": 0.2285,
"step": 7950
},
{
"epoch": 1.4743471013150584,
"grad_norm": 0.20862952822180633,
"learning_rate": 8.189603588174712e-05,
"loss": 0.2118,
"step": 7960
},
{
"epoch": 1.4761992961659567,
"grad_norm": 0.1750842888641512,
"learning_rate": 8.135512725573574e-05,
"loss": 0.2116,
"step": 7970
},
{
"epoch": 1.478051491016855,
"grad_norm": 0.23773871116567313,
"learning_rate": 8.081566350486363e-05,
"loss": 0.1949,
"step": 7980
},
{
"epoch": 1.4799036858677532,
"grad_norm": 0.164420670542161,
"learning_rate": 8.027764925101911e-05,
"loss": 0.209,
"step": 7990
},
{
"epoch": 1.4817558807186515,
"grad_norm": 0.21216576721258398,
"learning_rate": 7.974108910367178e-05,
"loss": 0.1966,
"step": 8000
},
{
"epoch": 1.48360807556955,
"grad_norm": 0.2790248976449928,
"learning_rate": 7.920598765983308e-05,
"loss": 0.2063,
"step": 8010
},
{
"epoch": 1.4854602704204483,
"grad_norm": 0.29784954052004964,
"learning_rate": 7.867234950401714e-05,
"loss": 0.1589,
"step": 8020
},
{
"epoch": 1.4873124652713465,
"grad_norm": 0.15966925896267653,
"learning_rate": 7.8140179208201e-05,
"loss": 0.2203,
"step": 8030
},
{
"epoch": 1.4891646601222448,
"grad_norm": 0.21411813554248801,
"learning_rate": 7.76094813317858e-05,
"loss": 0.191,
"step": 8040
},
{
"epoch": 1.491016854973143,
"grad_norm": 0.16778546998214966,
"learning_rate": 7.708026042155775e-05,
"loss": 0.1972,
"step": 8050
},
{
"epoch": 1.4928690498240416,
"grad_norm": 0.23986270787568656,
"learning_rate": 7.655252101164894e-05,
"loss": 0.2115,
"step": 8060
},
{
"epoch": 1.4947212446749398,
"grad_norm": 0.250339172193944,
"learning_rate": 7.602626762349865e-05,
"loss": 0.2112,
"step": 8070
},
{
"epoch": 1.496573439525838,
"grad_norm": 0.18288675343831115,
"learning_rate": 7.55015047658146e-05,
"loss": 0.2316,
"step": 8080
},
{
"epoch": 1.4984256343767364,
"grad_norm": 0.23542544018483225,
"learning_rate": 7.497823693453429e-05,
"loss": 0.2278,
"step": 8090
},
{
"epoch": 1.5002778292276346,
"grad_norm": 0.21853735172760996,
"learning_rate": 7.44564686127865e-05,
"loss": 0.2435,
"step": 8100
},
{
"epoch": 1.5021300240785331,
"grad_norm": 0.230876996211439,
"learning_rate": 7.39362042708527e-05,
"loss": 0.2132,
"step": 8110
},
{
"epoch": 1.5039822189294314,
"grad_norm": 0.23449285027681627,
"learning_rate": 7.341744836612929e-05,
"loss": 0.2205,
"step": 8120
},
{
"epoch": 1.5058344137803297,
"grad_norm": 0.1770364349318145,
"learning_rate": 7.290020534308883e-05,
"loss": 0.1771,
"step": 8130
},
{
"epoch": 1.5076866086312282,
"grad_norm": 0.24440773842340074,
"learning_rate": 7.23844796332421e-05,
"loss": 0.2009,
"step": 8140
},
{
"epoch": 1.5095388034821262,
"grad_norm": 0.19125723562538224,
"learning_rate": 7.187027565510032e-05,
"loss": 0.2214,
"step": 8150
},
{
"epoch": 1.5113909983330247,
"grad_norm": 0.24413160941991816,
"learning_rate": 7.135759781413714e-05,
"loss": 0.2483,
"step": 8160
},
{
"epoch": 1.513243193183923,
"grad_norm": 0.18714126123807273,
"learning_rate": 7.084645050275093e-05,
"loss": 0.1754,
"step": 8170
},
{
"epoch": 1.5150953880348212,
"grad_norm": 0.24068172003031482,
"learning_rate": 7.033683810022717e-05,
"loss": 0.2208,
"step": 8180
},
{
"epoch": 1.5169475828857197,
"grad_norm": 0.21118944152545294,
"learning_rate": 6.982876497270093e-05,
"loss": 0.2354,
"step": 8190
},
{
"epoch": 1.5187997777366178,
"grad_norm": 0.16304142225648927,
"learning_rate": 6.932223547311948e-05,
"loss": 0.191,
"step": 8200
},
{
"epoch": 1.5206519725875163,
"grad_norm": 0.22402630540204685,
"learning_rate": 6.881725394120483e-05,
"loss": 0.2235,
"step": 8210
},
{
"epoch": 1.5225041674384145,
"grad_norm": 0.14686671761669617,
"learning_rate": 6.831382470341674e-05,
"loss": 0.2374,
"step": 8220
},
{
"epoch": 1.5243563622893128,
"grad_norm": 0.1910492658359761,
"learning_rate": 6.781195207291579e-05,
"loss": 0.1912,
"step": 8230
},
{
"epoch": 1.5262085571402113,
"grad_norm": 0.285797167185037,
"learning_rate": 6.7311640349526e-05,
"loss": 0.1946,
"step": 8240
},
{
"epoch": 1.5280607519911094,
"grad_norm": 0.24899927517169534,
"learning_rate": 6.681289381969827e-05,
"loss": 0.2437,
"step": 8250
},
{
"epoch": 1.5299129468420078,
"grad_norm": 0.27104957130230045,
"learning_rate": 6.631571675647358e-05,
"loss": 0.2007,
"step": 8260
},
{
"epoch": 1.5317651416929061,
"grad_norm": 0.1836787768149552,
"learning_rate": 6.582011341944661e-05,
"loss": 0.1992,
"step": 8270
},
{
"epoch": 1.5336173365438044,
"grad_norm": 0.16592192801262687,
"learning_rate": 6.532608805472884e-05,
"loss": 0.2243,
"step": 8280
},
{
"epoch": 1.5354695313947029,
"grad_norm": 0.19477759718427087,
"learning_rate": 6.483364489491242e-05,
"loss": 0.1866,
"step": 8290
},
{
"epoch": 1.537321726245601,
"grad_norm": 0.2612938997397552,
"learning_rate": 6.434278815903392e-05,
"loss": 0.1884,
"step": 8300
},
{
"epoch": 1.5391739210964994,
"grad_norm": 0.22106523393294486,
"learning_rate": 6.3853522052538e-05,
"loss": 0.2464,
"step": 8310
},
{
"epoch": 1.5410261159473977,
"grad_norm": 0.11918922044507506,
"learning_rate": 6.336585076724169e-05,
"loss": 0.2205,
"step": 8320
},
{
"epoch": 1.542878310798296,
"grad_norm": 0.27735599029951385,
"learning_rate": 6.287977848129811e-05,
"loss": 0.2125,
"step": 8330
},
{
"epoch": 1.5447305056491945,
"grad_norm": 0.11824966641617995,
"learning_rate": 6.239530935916105e-05,
"loss": 0.1886,
"step": 8340
},
{
"epoch": 1.5465827005000925,
"grad_norm": 0.14239263856247222,
"learning_rate": 6.191244755154896e-05,
"loss": 0.2283,
"step": 8350
},
{
"epoch": 1.548434895350991,
"grad_norm": 0.2614832702732058,
"learning_rate": 6.143119719540951e-05,
"loss": 0.2419,
"step": 8360
},
{
"epoch": 1.5502870902018893,
"grad_norm": 0.1719421648495295,
"learning_rate": 6.0951562413884276e-05,
"loss": 0.1813,
"step": 8370
},
{
"epoch": 1.5521392850527875,
"grad_norm": 0.1339861662540805,
"learning_rate": 6.047354731627319e-05,
"loss": 0.1732,
"step": 8380
},
{
"epoch": 1.553991479903686,
"grad_norm": 0.2649420028984007,
"learning_rate": 5.9997155997999486e-05,
"loss": 0.2312,
"step": 8390
},
{
"epoch": 1.555843674754584,
"grad_norm": 0.2986635713988608,
"learning_rate": 5.952239254057462e-05,
"loss": 0.2537,
"step": 8400
},
{
"epoch": 1.5576958696054826,
"grad_norm": 0.20847627410802858,
"learning_rate": 5.904926101156316e-05,
"loss": 0.2198,
"step": 8410
},
{
"epoch": 1.5595480644563808,
"grad_norm": 0.20583750133284387,
"learning_rate": 5.8577765464548014e-05,
"loss": 0.2194,
"step": 8420
},
{
"epoch": 1.561400259307279,
"grad_norm": 0.24884919423637333,
"learning_rate": 5.810790993909595e-05,
"loss": 0.2201,
"step": 8430
},
{
"epoch": 1.5632524541581776,
"grad_norm": 0.2292784136541862,
"learning_rate": 5.7639698460722366e-05,
"loss": 0.2139,
"step": 8440
},
{
"epoch": 1.5651046490090756,
"grad_norm": 0.20773042455822294,
"learning_rate": 5.717313504085761e-05,
"loss": 0.1876,
"step": 8450
},
{
"epoch": 1.5669568438599741,
"grad_norm": 0.218184017555461,
"learning_rate": 5.670822367681189e-05,
"loss": 0.1821,
"step": 8460
},
{
"epoch": 1.5688090387108724,
"grad_norm": 0.17843712174744172,
"learning_rate": 5.6244968351741396e-05,
"loss": 0.2006,
"step": 8470
},
{
"epoch": 1.5706612335617707,
"grad_norm": 0.21436245431091455,
"learning_rate": 5.578337303461414e-05,
"loss": 0.1928,
"step": 8480
},
{
"epoch": 1.5725134284126692,
"grad_norm": 0.2084740928506598,
"learning_rate": 5.532344168017589e-05,
"loss": 0.2444,
"step": 8490
},
{
"epoch": 1.5743656232635672,
"grad_norm": 0.20902509315653023,
"learning_rate": 5.4865178228916317e-05,
"loss": 0.2288,
"step": 8500
},
{
"epoch": 1.5762178181144657,
"grad_norm": 0.191128809958979,
"learning_rate": 5.4408586607035236e-05,
"loss": 0.2307,
"step": 8510
},
{
"epoch": 1.578070012965364,
"grad_norm": 0.2804233173323839,
"learning_rate": 5.3953670726408973e-05,
"loss": 0.2049,
"step": 8520
},
{
"epoch": 1.5799222078162622,
"grad_norm": 0.2523996334467096,
"learning_rate": 5.3500434484556744e-05,
"loss": 0.2309,
"step": 8530
},
{
"epoch": 1.5817744026671607,
"grad_norm": 0.22808681153892332,
"learning_rate": 5.304888176460759e-05,
"loss": 0.2224,
"step": 8540
},
{
"epoch": 1.5836265975180588,
"grad_norm": 0.17496689187022768,
"learning_rate": 5.2599016435266656e-05,
"loss": 0.212,
"step": 8550
},
{
"epoch": 1.5854787923689573,
"grad_norm": 0.16684956568038284,
"learning_rate": 5.215084235078232e-05,
"loss": 0.1599,
"step": 8560
},
{
"epoch": 1.5873309872198555,
"grad_norm": 0.2524704034190916,
"learning_rate": 5.170436335091319e-05,
"loss": 0.2239,
"step": 8570
},
{
"epoch": 1.5891831820707538,
"grad_norm": 0.20276889978373874,
"learning_rate": 5.130398471023492e-05,
"loss": 0.1991,
"step": 8580
},
{
"epoch": 1.5910353769216523,
"grad_norm": 0.19401086487052652,
"learning_rate": 5.086073689762982e-05,
"loss": 0.2054,
"step": 8590
},
{
"epoch": 1.5928875717725504,
"grad_norm": 0.24314231015564167,
"learning_rate": 5.0419195222696305e-05,
"loss": 0.216,
"step": 8600
},
{
"epoch": 1.5947397666234489,
"grad_norm": 0.1962559761069099,
"learning_rate": 4.9979363468369426e-05,
"loss": 0.2028,
"step": 8610
},
{
"epoch": 1.5965919614743471,
"grad_norm": 0.21450451616005048,
"learning_rate": 4.95412454029342e-05,
"loss": 0.1485,
"step": 8620
},
{
"epoch": 1.5984441563252454,
"grad_norm": 0.2262800799406614,
"learning_rate": 4.9104844779993744e-05,
"loss": 0.2205,
"step": 8630
},
{
"epoch": 1.6002963511761439,
"grad_norm": 0.15673015952559616,
"learning_rate": 4.867016533843677e-05,
"loss": 0.1878,
"step": 8640
},
{
"epoch": 1.602148546027042,
"grad_norm": 0.22772029995019283,
"learning_rate": 4.823721080240562e-05,
"loss": 0.2144,
"step": 8650
},
{
"epoch": 1.6040007408779404,
"grad_norm": 0.16737363953611054,
"learning_rate": 4.7805984881264366e-05,
"loss": 0.219,
"step": 8660
},
{
"epoch": 1.6058529357288387,
"grad_norm": 0.15059728369872777,
"learning_rate": 4.7376491269567305e-05,
"loss": 0.1827,
"step": 8670
},
{
"epoch": 1.607705130579737,
"grad_norm": 0.2174362092107457,
"learning_rate": 4.694873364702687e-05,
"loss": 0.2427,
"step": 8680
},
{
"epoch": 1.6095573254306355,
"grad_norm": 0.2536534486510469,
"learning_rate": 4.652271567848229e-05,
"loss": 0.2458,
"step": 8690
},
{
"epoch": 1.6114095202815335,
"grad_norm": 0.20306793867476478,
"learning_rate": 4.6098441013868285e-05,
"loss": 0.221,
"step": 8700
},
{
"epoch": 1.613261715132432,
"grad_norm": 0.29865060955062883,
"learning_rate": 4.567591328818371e-05,
"loss": 0.2621,
"step": 8710
},
{
"epoch": 1.6151139099833303,
"grad_norm": 0.20862574024207642,
"learning_rate": 4.529713496011825e-05,
"loss": 0.207,
"step": 8720
},
{
"epoch": 1.6169661048342285,
"grad_norm": 0.21837675462224324,
"learning_rate": 4.487793637919196e-05,
"loss": 0.1828,
"step": 8730
},
{
"epoch": 1.618818299685127,
"grad_norm": 0.23283771674120501,
"learning_rate": 4.446049519394233e-05,
"loss": 0.2166,
"step": 8740
},
{
"epoch": 1.620670494536025,
"grad_norm": 0.1948474369408113,
"learning_rate": 4.4044814980821856e-05,
"loss": 0.2154,
"step": 8750
},
{
"epoch": 1.6225226893869236,
"grad_norm": 0.2821939610991762,
"learning_rate": 4.3630899301195904e-05,
"loss": 0.2428,
"step": 8760
},
{
"epoch": 1.6243748842378218,
"grad_norm": 0.18991376076496028,
"learning_rate": 4.321875170131218e-05,
"loss": 0.1933,
"step": 8770
},
{
"epoch": 1.62622707908872,
"grad_norm": 0.17477269695823847,
"learning_rate": 4.280837571227006e-05,
"loss": 0.1945,
"step": 8780
},
{
"epoch": 1.6280792739396186,
"grad_norm": 0.22671892134617525,
"learning_rate": 4.239977484999063e-05,
"loss": 0.1973,
"step": 8790
},
{
"epoch": 1.6299314687905166,
"grad_norm": 0.2061718775432731,
"learning_rate": 4.1992952615186516e-05,
"loss": 0.2122,
"step": 8800
},
{
"epoch": 1.6317836636414151,
"grad_norm": 0.25086071759237627,
"learning_rate": 4.158791249333177e-05,
"loss": 0.226,
"step": 8810
},
{
"epoch": 1.6336358584923134,
"grad_norm": 0.242794082456384,
"learning_rate": 4.118465795463214e-05,
"loss": 0.2267,
"step": 8820
},
{
"epoch": 1.6354880533432117,
"grad_norm": 0.1935934917483956,
"learning_rate": 4.078319245399514e-05,
"loss": 0.2011,
"step": 8830
},
{
"epoch": 1.6373402481941102,
"grad_norm": 0.2628523170855809,
"learning_rate": 4.038351943100088e-05,
"loss": 0.1934,
"step": 8840
},
{
"epoch": 1.6391924430450082,
"grad_norm": 0.19568463922046236,
"learning_rate": 3.998564230987209e-05,
"loss": 0.1997,
"step": 8850
},
{
"epoch": 1.6410446378959067,
"grad_norm": 0.2481046435287445,
"learning_rate": 3.958956449944501e-05,
"loss": 0.2151,
"step": 8860
},
{
"epoch": 1.642896832746805,
"grad_norm": 0.22476767911377235,
"learning_rate": 3.9195289393140155e-05,
"loss": 0.1621,
"step": 8870
},
{
"epoch": 1.6447490275977033,
"grad_norm": 0.1945122394139851,
"learning_rate": 3.880282036893348e-05,
"loss": 0.1753,
"step": 8880
},
{
"epoch": 1.6466012224486017,
"grad_norm": 0.27437177077690705,
"learning_rate": 3.841216078932702e-05,
"loss": 0.226,
"step": 8890
},
{
"epoch": 1.6484534172994998,
"grad_norm": 0.18562250131807664,
"learning_rate": 3.802331400132028e-05,
"loss": 0.1717,
"step": 8900
},
{
"epoch": 1.6503056121503983,
"grad_norm": 0.21622010412383683,
"learning_rate": 3.7636283336381636e-05,
"loss": 0.155,
"step": 8910
},
{
"epoch": 1.6521578070012966,
"grad_norm": 0.22634728029439885,
"learning_rate": 3.7251072110419727e-05,
"loss": 0.2022,
"step": 8920
},
{
"epoch": 1.6540100018521948,
"grad_norm": 0.2671242474144964,
"learning_rate": 3.686768362375498e-05,
"loss": 0.2234,
"step": 8930
},
{
"epoch": 1.6558621967030933,
"grad_norm": 0.16832839316697204,
"learning_rate": 3.648612116109146e-05,
"loss": 0.1805,
"step": 8940
},
{
"epoch": 1.6577143915539914,
"grad_norm": 0.2688098808357188,
"learning_rate": 3.610638799148858e-05,
"loss": 0.1909,
"step": 8950
},
{
"epoch": 1.6595665864048899,
"grad_norm": 0.172871399134501,
"learning_rate": 3.572848736833326e-05,
"loss": 0.2112,
"step": 8960
},
{
"epoch": 1.6614187812557881,
"grad_norm": 0.23426972546449246,
"learning_rate": 3.5352422529311814e-05,
"loss": 0.2276,
"step": 8970
},
{
"epoch": 1.6632709761066864,
"grad_norm": 0.2682786605548356,
"learning_rate": 3.497819669638266e-05,
"loss": 0.2521,
"step": 8980
},
{
"epoch": 1.6651231709575849,
"grad_norm": 0.2122644465486904,
"learning_rate": 3.4605813075748085e-05,
"loss": 0.2003,
"step": 8990
},
{
"epoch": 1.666975365808483,
"grad_norm": 0.24717950759123916,
"learning_rate": 3.42352748578274e-05,
"loss": 0.1813,
"step": 9000
}
],
"logging_steps": 10,
"max_steps": 10798,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 3000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 196010447634432.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}