diff --git "a/checkpoint-9000/trainer_state.json" "b/checkpoint-9000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-9000/trainer_state.json" @@ -0,0 +1,6333 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.666975365808483, + "eval_steps": 500, + "global_step": 9000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0018521948508983144, + "grad_norm": 0.11988232185850008, + "learning_rate": 7.692307692307693e-05, + "loss": 0.5464, + "step": 10 + }, + { + "epoch": 0.003704389701796629, + "grad_norm": 0.1595727597726012, + "learning_rate": 0.00015384615384615385, + "loss": 0.3714, + "step": 20 + }, + { + "epoch": 0.0055565845526949435, + "grad_norm": 0.09668305340799446, + "learning_rate": 0.0002307692307692308, + "loss": 0.3157, + "step": 30 + }, + { + "epoch": 0.007408779403593258, + "grad_norm": 0.16023475106511212, + "learning_rate": 0.0003076923076923077, + "loss": 0.3987, + "step": 40 + }, + { + "epoch": 0.009260974254491572, + "grad_norm": 0.169453827168658, + "learning_rate": 0.00038461538461538467, + "loss": 0.3192, + "step": 50 + }, + { + "epoch": 0.011113169105389887, + "grad_norm": 0.3012465198223198, + "learning_rate": 0.0004615384615384616, + "loss": 0.3793, + "step": 60 + }, + { + "epoch": 0.012965363956288202, + "grad_norm": 0.3048336774601151, + "learning_rate": 0.0004999997322635931, + "loss": 0.3902, + "step": 70 + }, + { + "epoch": 0.014817558807186515, + "grad_norm": 0.3627728524819501, + "learning_rate": 0.0004999979009491321, + "loss": 0.3932, + "step": 80 + }, + { + "epoch": 0.016669753658084832, + "grad_norm": 0.4114314981691849, + "learning_rate": 0.0004999938313774507, + "loss": 0.4758, + "step": 90 + }, + { + "epoch": 0.018521948508983144, + "grad_norm": 0.44568863103845946, + "learning_rate": 0.0004999876199685106, + "loss": 0.4944, + "step": 100 + }, + { + "epoch": 0.02037414335988146, + "grad_norm": 0.3944831575252504, + "learning_rate": 0.0004999792667755284, + "loss": 0.4604, + "step": 110 + }, + { + "epoch": 0.022226338210779774, + "grad_norm": 0.4390978457678222, + "learning_rate": 0.0004999687718700706, + "loss": 0.5137, + "step": 120 + }, + { + "epoch": 0.02407853306167809, + "grad_norm": 0.43510394816525627, + "learning_rate": 0.000499956135342053, + "loss": 0.4757, + "step": 130 + }, + { + "epoch": 0.025930727912576404, + "grad_norm": 0.44811558740355373, + "learning_rate": 0.0004999413572997397, + "loss": 0.5541, + "step": 140 + }, + { + "epoch": 0.02778292276347472, + "grad_norm": 0.3691107778538866, + "learning_rate": 0.0004999262261712005, + "loss": 0.465, + "step": 150 + }, + { + "epoch": 0.02963511761437303, + "grad_norm": 0.38877053127394406, + "learning_rate": 0.0004999073796157043, + "loss": 0.4877, + "step": 160 + }, + { + "epoch": 0.031487312465271346, + "grad_norm": 0.43039217557165316, + "learning_rate": 0.00049988639196363, + "loss": 0.4371, + "step": 170 + }, + { + "epoch": 0.033339507316169664, + "grad_norm": 0.31896475378695344, + "learning_rate": 0.0004998632633947908, + "loss": 0.5814, + "step": 180 + }, + { + "epoch": 0.035191702167067976, + "grad_norm": 0.4333729012961258, + "learning_rate": 0.000499837994107342, + "loss": 0.5196, + "step": 190 + }, + { + "epoch": 0.03704389701796629, + "grad_norm": 0.34656949570118684, + "learning_rate": 0.0004998105843177797, + "loss": 0.4593, + "step": 200 + }, + { + "epoch": 0.038896091868864606, + "grad_norm": 0.3679842761186855, + "learning_rate": 0.000499781034260939, + "loss": 0.4526, + "step": 210 + }, + { + "epoch": 0.04074828671976292, + "grad_norm": 0.3877757158501542, + "learning_rate": 0.0004997493441899917, + "loss": 0.4261, + "step": 220 + }, + { + "epoch": 0.042600481570661236, + "grad_norm": 0.3289617422897924, + "learning_rate": 0.0004997155143764444, + "loss": 0.4934, + "step": 230 + }, + { + "epoch": 0.04445267642155955, + "grad_norm": 0.3826208484059836, + "learning_rate": 0.0004996795451101361, + "loss": 0.5347, + "step": 240 + }, + { + "epoch": 0.04630487127245786, + "grad_norm": 0.38157790109545875, + "learning_rate": 0.0004996414366992357, + "loss": 0.4789, + "step": 250 + }, + { + "epoch": 0.04815706612335618, + "grad_norm": 0.3154636026466987, + "learning_rate": 0.0004996011894702393, + "loss": 0.5096, + "step": 260 + }, + { + "epoch": 0.05000926097425449, + "grad_norm": 0.45302017894233715, + "learning_rate": 0.0004995588037679675, + "loss": 0.4752, + "step": 270 + }, + { + "epoch": 0.05186145582515281, + "grad_norm": 0.3690345364974773, + "learning_rate": 0.0004995142799555624, + "loss": 0.4454, + "step": 280 + }, + { + "epoch": 0.05371365067605112, + "grad_norm": 0.2843117768216058, + "learning_rate": 0.0004994676184144843, + "loss": 0.5058, + "step": 290 + }, + { + "epoch": 0.05556584552694944, + "grad_norm": 0.3278296958742726, + "learning_rate": 0.0004994188195445089, + "loss": 0.514, + "step": 300 + }, + { + "epoch": 0.05741804037784775, + "grad_norm": 0.3702152478433116, + "learning_rate": 0.0004993678837637235, + "loss": 0.4938, + "step": 310 + }, + { + "epoch": 0.05927023522874606, + "grad_norm": 0.3157950493567883, + "learning_rate": 0.0004993148115085233, + "loss": 0.4744, + "step": 320 + }, + { + "epoch": 0.06112243007964438, + "grad_norm": 0.2864560430174496, + "learning_rate": 0.0004992596032336082, + "loss": 0.4614, + "step": 330 + }, + { + "epoch": 0.06297462493054269, + "grad_norm": 0.27306121969283015, + "learning_rate": 0.0004992022594119784, + "loss": 0.5571, + "step": 340 + }, + { + "epoch": 0.064826819781441, + "grad_norm": 0.28239603992719803, + "learning_rate": 0.0004991427805349305, + "loss": 0.5352, + "step": 350 + }, + { + "epoch": 0.06667901463233933, + "grad_norm": 0.3067959635308188, + "learning_rate": 0.0004990811671120534, + "loss": 0.4366, + "step": 360 + }, + { + "epoch": 0.06853120948323764, + "grad_norm": 0.28026509474367334, + "learning_rate": 0.0004990174196712239, + "loss": 0.4413, + "step": 370 + }, + { + "epoch": 0.07038340433413595, + "grad_norm": 0.3553036031847406, + "learning_rate": 0.0004989515387586022, + "loss": 0.4771, + "step": 380 + }, + { + "epoch": 0.07223559918503426, + "grad_norm": 0.3078977949859338, + "learning_rate": 0.0004988904222849908, + "loss": 0.4456, + "step": 390 + }, + { + "epoch": 0.07408779403593257, + "grad_norm": 0.3110301739864855, + "learning_rate": 0.0004988204893460954, + "loss": 0.4383, + "step": 400 + }, + { + "epoch": 0.0759399888868309, + "grad_norm": 0.36180927604524143, + "learning_rate": 0.0004987484246226201, + "loss": 0.4467, + "step": 410 + }, + { + "epoch": 0.07779218373772921, + "grad_norm": 0.2902432894644559, + "learning_rate": 0.0004986742287319836, + "loss": 0.5027, + "step": 420 + }, + { + "epoch": 0.07964437858862752, + "grad_norm": 0.3038323599185379, + "learning_rate": 0.0004985979023098639, + "loss": 0.4896, + "step": 430 + }, + { + "epoch": 0.08149657343952584, + "grad_norm": 0.37728379802123757, + "learning_rate": 0.0004985194460101922, + "loss": 0.446, + "step": 440 + }, + { + "epoch": 0.08334876829042415, + "grad_norm": 0.4090633364935015, + "learning_rate": 0.0004984388605051474, + "loss": 0.4457, + "step": 450 + }, + { + "epoch": 0.08520096314132247, + "grad_norm": 0.2905031001353468, + "learning_rate": 0.000498356146485151, + "loss": 0.4807, + "step": 460 + }, + { + "epoch": 0.08705315799222078, + "grad_norm": 0.33050891942743665, + "learning_rate": 0.00049827130465886, + "loss": 0.4457, + "step": 470 + }, + { + "epoch": 0.0889053528431191, + "grad_norm": 0.3454971392885025, + "learning_rate": 0.0004981843357531622, + "loss": 0.4441, + "step": 480 + }, + { + "epoch": 0.09075754769401741, + "grad_norm": 0.2560611824951078, + "learning_rate": 0.0004980952405131687, + "loss": 0.4601, + "step": 490 + }, + { + "epoch": 0.09260974254491572, + "grad_norm": 0.3071403176866605, + "learning_rate": 0.0004980040197022085, + "loss": 0.422, + "step": 500 + }, + { + "epoch": 0.09446193739581404, + "grad_norm": 0.3103572812280149, + "learning_rate": 0.0004979106741018214, + "loss": 0.4556, + "step": 510 + }, + { + "epoch": 0.09631413224671236, + "grad_norm": 0.22158829723284448, + "learning_rate": 0.0004978152045117515, + "loss": 0.5279, + "step": 520 + }, + { + "epoch": 0.09816632709761067, + "grad_norm": 0.2643825421503944, + "learning_rate": 0.0004977176117499402, + "loss": 0.4332, + "step": 530 + }, + { + "epoch": 0.10001852194850898, + "grad_norm": 0.4119060845568467, + "learning_rate": 0.0004976178966525194, + "loss": 0.4748, + "step": 540 + }, + { + "epoch": 0.1018707167994073, + "grad_norm": 0.23946695979831795, + "learning_rate": 0.0004975160600738043, + "loss": 0.4564, + "step": 550 + }, + { + "epoch": 0.10372291165030562, + "grad_norm": 0.32293396447938405, + "learning_rate": 0.0004974121028862858, + "loss": 0.4037, + "step": 560 + }, + { + "epoch": 0.10557510650120393, + "grad_norm": 0.2737410631384409, + "learning_rate": 0.0004973060259806235, + "loss": 0.4471, + "step": 570 + }, + { + "epoch": 0.10742730135210224, + "grad_norm": 0.2639287107206222, + "learning_rate": 0.0004971978302656376, + "loss": 0.492, + "step": 580 + }, + { + "epoch": 0.10927949620300055, + "grad_norm": 0.3304530971496624, + "learning_rate": 0.0004970875166683017, + "loss": 0.4433, + "step": 590 + }, + { + "epoch": 0.11113169105389888, + "grad_norm": 0.3383662002406531, + "learning_rate": 0.0004969750861337338, + "loss": 0.5059, + "step": 600 + }, + { + "epoch": 0.11298388590479719, + "grad_norm": 0.3718630666323684, + "learning_rate": 0.0004968605396251896, + "loss": 0.4944, + "step": 610 + }, + { + "epoch": 0.1148360807556955, + "grad_norm": 0.3089667090694828, + "learning_rate": 0.0004967438781240532, + "loss": 0.5117, + "step": 620 + }, + { + "epoch": 0.11668827560659381, + "grad_norm": 0.3055449117119714, + "learning_rate": 0.000496625102629829, + "loss": 0.4504, + "step": 630 + }, + { + "epoch": 0.11854047045749212, + "grad_norm": 0.3104727563565212, + "learning_rate": 0.0004965042141601331, + "loss": 0.4279, + "step": 640 + }, + { + "epoch": 0.12039266530839045, + "grad_norm": 0.3356499915029813, + "learning_rate": 0.000496381213750685, + "loss": 0.4227, + "step": 650 + }, + { + "epoch": 0.12224486015928876, + "grad_norm": 0.27125317345626304, + "learning_rate": 0.0004962561024552981, + "loss": 0.4373, + "step": 660 + }, + { + "epoch": 0.12409705501018707, + "grad_norm": 0.30382841565038987, + "learning_rate": 0.0004961288813458708, + "loss": 0.4621, + "step": 670 + }, + { + "epoch": 0.12594924986108538, + "grad_norm": 0.24989596364522906, + "learning_rate": 0.0004959995515123779, + "loss": 0.4213, + "step": 680 + }, + { + "epoch": 0.1278014447119837, + "grad_norm": 0.38205278522841757, + "learning_rate": 0.0004958681140628603, + "loss": 0.4367, + "step": 690 + }, + { + "epoch": 0.129653639562882, + "grad_norm": 0.36439745638468385, + "learning_rate": 0.0004957345701234165, + "loss": 0.4427, + "step": 700 + }, + { + "epoch": 0.13150583441378033, + "grad_norm": 0.35080175318468465, + "learning_rate": 0.0004955989208381922, + "loss": 0.4133, + "step": 710 + }, + { + "epoch": 0.13335802926467866, + "grad_norm": 0.3137679102742871, + "learning_rate": 0.0004954611673693708, + "loss": 0.4044, + "step": 720 + }, + { + "epoch": 0.13521022411557695, + "grad_norm": 0.329188444759587, + "learning_rate": 0.0004953213108971637, + "loss": 0.4922, + "step": 730 + }, + { + "epoch": 0.13706241896647528, + "grad_norm": 0.21687503014556075, + "learning_rate": 0.0004951793526197992, + "loss": 0.4667, + "step": 740 + }, + { + "epoch": 0.13891461381737358, + "grad_norm": 0.35060249170961755, + "learning_rate": 0.0004950352937535139, + "loss": 0.4678, + "step": 750 + }, + { + "epoch": 0.1407668086682719, + "grad_norm": 0.2424350783919833, + "learning_rate": 0.0004948891355325407, + "loss": 0.5452, + "step": 760 + }, + { + "epoch": 0.14261900351917023, + "grad_norm": 0.29988592081373705, + "learning_rate": 0.0004947408792090989, + "loss": 0.4472, + "step": 770 + }, + { + "epoch": 0.14447119837006853, + "grad_norm": 0.25092463767440515, + "learning_rate": 0.0004945905260533836, + "loss": 0.4379, + "step": 780 + }, + { + "epoch": 0.14632339322096685, + "grad_norm": 0.2707811618812939, + "learning_rate": 0.0004944380773535545, + "loss": 0.4489, + "step": 790 + }, + { + "epoch": 0.14817558807186515, + "grad_norm": 0.29945644041990244, + "learning_rate": 0.000494283534415725, + "loss": 0.4627, + "step": 800 + }, + { + "epoch": 0.15002778292276348, + "grad_norm": 0.3269089383236662, + "learning_rate": 0.0004941268985639511, + "loss": 0.4559, + "step": 810 + }, + { + "epoch": 0.1518799777736618, + "grad_norm": 0.34167018575418623, + "learning_rate": 0.0004939681711402201, + "loss": 0.4502, + "step": 820 + }, + { + "epoch": 0.1537321726245601, + "grad_norm": 0.23233347955254757, + "learning_rate": 0.0004938073535044385, + "loss": 0.4848, + "step": 830 + }, + { + "epoch": 0.15558436747545842, + "grad_norm": 0.28810200476716363, + "learning_rate": 0.0004936444470344212, + "loss": 0.4334, + "step": 840 + }, + { + "epoch": 0.15743656232635672, + "grad_norm": 0.2502390156217485, + "learning_rate": 0.0004934794531258794, + "loss": 0.4756, + "step": 850 + }, + { + "epoch": 0.15928875717725505, + "grad_norm": 0.306502262394428, + "learning_rate": 0.0004933123731924083, + "loss": 0.4009, + "step": 860 + }, + { + "epoch": 0.16114095202815337, + "grad_norm": 0.2866551052549121, + "learning_rate": 0.0004931432086654751, + "loss": 0.411, + "step": 870 + }, + { + "epoch": 0.16299314687905167, + "grad_norm": 0.2975618645144025, + "learning_rate": 0.0004929719609944075, + "loss": 0.4386, + "step": 880 + }, + { + "epoch": 0.16484534172995, + "grad_norm": 0.3269989409370364, + "learning_rate": 0.00049279863164638, + "loss": 0.4811, + "step": 890 + }, + { + "epoch": 0.1666975365808483, + "grad_norm": 0.2764720769175588, + "learning_rate": 0.0004926232221064024, + "loss": 0.4319, + "step": 900 + }, + { + "epoch": 0.16854973143174662, + "grad_norm": 0.31817437091747597, + "learning_rate": 0.0004924457338773062, + "loss": 0.5039, + "step": 910 + }, + { + "epoch": 0.17040192628264494, + "grad_norm": 0.27931435921536724, + "learning_rate": 0.0004922661684797332, + "loss": 0.447, + "step": 920 + }, + { + "epoch": 0.17225412113354324, + "grad_norm": 0.31719086644687416, + "learning_rate": 0.0004920845274521201, + "loss": 0.4486, + "step": 930 + }, + { + "epoch": 0.17410631598444157, + "grad_norm": 0.2554455359026809, + "learning_rate": 0.0004919008123506878, + "loss": 0.4683, + "step": 940 + }, + { + "epoch": 0.17595851083533987, + "grad_norm": 0.33286076816889937, + "learning_rate": 0.0004917150247494265, + "loss": 0.4438, + "step": 950 + }, + { + "epoch": 0.1778107056862382, + "grad_norm": 0.2611238399418209, + "learning_rate": 0.0004915271662400824, + "loss": 0.3582, + "step": 960 + }, + { + "epoch": 0.17966290053713652, + "grad_norm": 0.2652458587080694, + "learning_rate": 0.0004913372384321449, + "loss": 0.4845, + "step": 970 + }, + { + "epoch": 0.18151509538803481, + "grad_norm": 0.2794832294188891, + "learning_rate": 0.000491145242952832, + "loss": 0.4398, + "step": 980 + }, + { + "epoch": 0.18336729023893314, + "grad_norm": 0.21029714010049572, + "learning_rate": 0.0004909511814470764, + "loss": 0.4408, + "step": 990 + }, + { + "epoch": 0.18521948508983144, + "grad_norm": 0.2781493608292439, + "learning_rate": 0.0004907550555775119, + "loss": 0.4999, + "step": 1000 + }, + { + "epoch": 0.18707167994072976, + "grad_norm": 0.3287877830017298, + "learning_rate": 0.0004905568670244588, + "loss": 0.4389, + "step": 1010 + }, + { + "epoch": 0.1889238747916281, + "grad_norm": 0.34207107261927205, + "learning_rate": 0.0004903566174859094, + "loss": 0.4537, + "step": 1020 + }, + { + "epoch": 0.19077606964252639, + "grad_norm": 0.24403509336935494, + "learning_rate": 0.0004901543086775137, + "loss": 0.3921, + "step": 1030 + }, + { + "epoch": 0.1926282644934247, + "grad_norm": 0.2671352359873941, + "learning_rate": 0.0004899499423325647, + "loss": 0.4023, + "step": 1040 + }, + { + "epoch": 0.194480459344323, + "grad_norm": 0.36145293617111, + "learning_rate": 0.0004897435202019832, + "loss": 0.4346, + "step": 1050 + }, + { + "epoch": 0.19633265419522133, + "grad_norm": 0.3104045357811312, + "learning_rate": 0.0004895350440543036, + "loss": 0.4299, + "step": 1060 + }, + { + "epoch": 0.19818484904611966, + "grad_norm": 0.2530391260727553, + "learning_rate": 0.0004893245156756578, + "loss": 0.4477, + "step": 1070 + }, + { + "epoch": 0.20003704389701796, + "grad_norm": 0.26339622262916945, + "learning_rate": 0.0004891119368697605, + "loss": 0.4907, + "step": 1080 + }, + { + "epoch": 0.20188923874791628, + "grad_norm": 0.24758807862533388, + "learning_rate": 0.0004888973094578931, + "loss": 0.4215, + "step": 1090 + }, + { + "epoch": 0.2037414335988146, + "grad_norm": 0.24646474329045825, + "learning_rate": 0.0004886806352788893, + "loss": 0.4727, + "step": 1100 + }, + { + "epoch": 0.2055936284497129, + "grad_norm": 0.30101780230375413, + "learning_rate": 0.0004884619161891181, + "loss": 0.4835, + "step": 1110 + }, + { + "epoch": 0.20744582330061123, + "grad_norm": 0.38338135072675056, + "learning_rate": 0.0004882411540624684, + "loss": 0.4713, + "step": 1120 + }, + { + "epoch": 0.20929801815150953, + "grad_norm": 0.30051618582402373, + "learning_rate": 0.00048801835079033325, + "loss": 0.4318, + "step": 1130 + }, + { + "epoch": 0.21115021300240785, + "grad_norm": 0.3169294143209614, + "learning_rate": 0.00048779350828159307, + "loss": 0.4414, + "step": 1140 + }, + { + "epoch": 0.21300240785330618, + "grad_norm": 0.2243691219456984, + "learning_rate": 0.0004875666284625996, + "loss": 0.4732, + "step": 1150 + }, + { + "epoch": 0.21485460270420448, + "grad_norm": 0.32093479593839086, + "learning_rate": 0.0004873377132771594, + "loss": 0.4477, + "step": 1160 + }, + { + "epoch": 0.2167067975551028, + "grad_norm": 0.30480291068654214, + "learning_rate": 0.00048710676468651724, + "loss": 0.4159, + "step": 1170 + }, + { + "epoch": 0.2185589924060011, + "grad_norm": 0.31550505987353533, + "learning_rate": 0.00048687378466933913, + "loss": 0.4121, + "step": 1180 + }, + { + "epoch": 0.22041118725689943, + "grad_norm": 0.2825917386970882, + "learning_rate": 0.0004866387752216953, + "loss": 0.4531, + "step": 1190 + }, + { + "epoch": 0.22226338210779775, + "grad_norm": 0.2507091074214277, + "learning_rate": 0.0004864017383570436, + "loss": 0.373, + "step": 1200 + }, + { + "epoch": 0.22411557695869605, + "grad_norm": 0.2533897084759911, + "learning_rate": 0.00048616267610621154, + "loss": 0.466, + "step": 1210 + }, + { + "epoch": 0.22596777180959438, + "grad_norm": 0.30135005574304485, + "learning_rate": 0.00048592159051737946, + "loss": 0.4678, + "step": 1220 + }, + { + "epoch": 0.22781996666049267, + "grad_norm": 0.2900534769133878, + "learning_rate": 0.0004856784836560627, + "loss": 0.4412, + "step": 1230 + }, + { + "epoch": 0.229672161511391, + "grad_norm": 0.3356512247856666, + "learning_rate": 0.000485433357605094, + "loss": 0.4381, + "step": 1240 + }, + { + "epoch": 0.23152435636228932, + "grad_norm": 0.28373492782986676, + "learning_rate": 0.00048518621446460555, + "loss": 0.4332, + "step": 1250 + }, + { + "epoch": 0.23337655121318762, + "grad_norm": 0.27681961152835116, + "learning_rate": 0.00048493705635201123, + "loss": 0.3954, + "step": 1260 + }, + { + "epoch": 0.23522874606408595, + "grad_norm": 0.3183042306103447, + "learning_rate": 0.0004846858854019882, + "loss": 0.4898, + "step": 1270 + }, + { + "epoch": 0.23708094091498425, + "grad_norm": 0.2806922738056069, + "learning_rate": 0.00048443270376645876, + "loss": 0.4621, + "step": 1280 + }, + { + "epoch": 0.23893313576588257, + "grad_norm": 0.32027034011519323, + "learning_rate": 0.00048417751361457185, + "loss": 0.4264, + "step": 1290 + }, + { + "epoch": 0.2407853306167809, + "grad_norm": 0.25756897907173815, + "learning_rate": 0.00048392031713268447, + "loss": 0.4213, + "step": 1300 + }, + { + "epoch": 0.2426375254676792, + "grad_norm": 0.29761680785972183, + "learning_rate": 0.0004836611165243432, + "loss": 0.41, + "step": 1310 + }, + { + "epoch": 0.24448972031857752, + "grad_norm": 0.28775863303393384, + "learning_rate": 0.00048339991401026474, + "loss": 0.4237, + "step": 1320 + }, + { + "epoch": 0.24634191516947582, + "grad_norm": 0.20527409355092, + "learning_rate": 0.00048313671182831743, + "loss": 0.4227, + "step": 1330 + }, + { + "epoch": 0.24819411002037414, + "grad_norm": 0.3049894888864481, + "learning_rate": 0.00048287151223350193, + "loss": 0.4188, + "step": 1340 + }, + { + "epoch": 0.25004630487127244, + "grad_norm": 0.28816158479568416, + "learning_rate": 0.00048260431749793184, + "loss": 0.4193, + "step": 1350 + }, + { + "epoch": 0.25189849972217077, + "grad_norm": 0.2810466941829626, + "learning_rate": 0.00048233512991081406, + "loss": 0.431, + "step": 1360 + }, + { + "epoch": 0.2537506945730691, + "grad_norm": 0.34419272070908224, + "learning_rate": 0.0004820639517784297, + "loss": 0.4802, + "step": 1370 + }, + { + "epoch": 0.2556028894239674, + "grad_norm": 0.2614191417571005, + "learning_rate": 0.00048179078542411367, + "loss": 0.4218, + "step": 1380 + }, + { + "epoch": 0.25745508427486574, + "grad_norm": 0.3620169455808058, + "learning_rate": 0.0004815156331882352, + "loss": 0.4259, + "step": 1390 + }, + { + "epoch": 0.259307279125764, + "grad_norm": 0.3495069978116607, + "learning_rate": 0.0004812384974281778, + "loss": 0.414, + "step": 1400 + }, + { + "epoch": 0.26115947397666234, + "grad_norm": 0.23822327577745042, + "learning_rate": 0.0004809593805183187, + "loss": 0.4885, + "step": 1410 + }, + { + "epoch": 0.26301166882756066, + "grad_norm": 0.31188479403470154, + "learning_rate": 0.00048067828485000904, + "loss": 0.438, + "step": 1420 + }, + { + "epoch": 0.264863863678459, + "grad_norm": 0.30908266150851776, + "learning_rate": 0.00048039521283155283, + "loss": 0.4224, + "step": 1430 + }, + { + "epoch": 0.2667160585293573, + "grad_norm": 0.3926396606462005, + "learning_rate": 0.0004801101668881869, + "loss": 0.4481, + "step": 1440 + }, + { + "epoch": 0.2685682533802556, + "grad_norm": 0.2937266710438928, + "learning_rate": 0.0004798231494620593, + "loss": 0.4785, + "step": 1450 + }, + { + "epoch": 0.2704204482311539, + "grad_norm": 0.29097772272918393, + "learning_rate": 0.00047953416301220936, + "loss": 0.5, + "step": 1460 + }, + { + "epoch": 0.27227264308205223, + "grad_norm": 0.2552279327553987, + "learning_rate": 0.000479243210014546, + "loss": 0.32, + "step": 1470 + }, + { + "epoch": 0.27412483793295056, + "grad_norm": 0.2699430209822517, + "learning_rate": 0.00047895029296182636, + "loss": 0.3985, + "step": 1480 + }, + { + "epoch": 0.2759770327838489, + "grad_norm": 0.31833186888024984, + "learning_rate": 0.0004786554143636353, + "loss": 0.4375, + "step": 1490 + }, + { + "epoch": 0.27782922763474716, + "grad_norm": 0.2751779388841223, + "learning_rate": 0.00047835857674636287, + "loss": 0.4001, + "step": 1500 + }, + { + "epoch": 0.2796814224856455, + "grad_norm": 0.2940862163328187, + "learning_rate": 0.0004780597826531833, + "loss": 0.4308, + "step": 1510 + }, + { + "epoch": 0.2815336173365438, + "grad_norm": 0.3386550227204627, + "learning_rate": 0.00047775903464403305, + "loss": 0.5353, + "step": 1520 + }, + { + "epoch": 0.28338581218744213, + "grad_norm": 0.31240154547554955, + "learning_rate": 0.00047745633529558884, + "loss": 0.3715, + "step": 1530 + }, + { + "epoch": 0.28523800703834046, + "grad_norm": 0.32759929614793354, + "learning_rate": 0.0004771516872012457, + "loss": 0.3929, + "step": 1540 + }, + { + "epoch": 0.2870902018892387, + "grad_norm": 0.29742817791928194, + "learning_rate": 0.0004768450929710945, + "loss": 0.4812, + "step": 1550 + }, + { + "epoch": 0.28894239674013705, + "grad_norm": 0.32461600905212035, + "learning_rate": 0.00047653655523189996, + "loss": 0.4181, + "step": 1560 + }, + { + "epoch": 0.2907945915910354, + "grad_norm": 0.26208477940948965, + "learning_rate": 0.00047622607662707773, + "loss": 0.3872, + "step": 1570 + }, + { + "epoch": 0.2926467864419337, + "grad_norm": 0.315046477208, + "learning_rate": 0.000475913659816672, + "loss": 0.4267, + "step": 1580 + }, + { + "epoch": 0.29449898129283203, + "grad_norm": 0.2451451562089501, + "learning_rate": 0.0004755993074773327, + "loss": 0.4525, + "step": 1590 + }, + { + "epoch": 0.2963511761437303, + "grad_norm": 0.2936495362556869, + "learning_rate": 0.00047528302230229246, + "loss": 0.4167, + "step": 1600 + }, + { + "epoch": 0.2982033709946286, + "grad_norm": 0.3551639863299712, + "learning_rate": 0.00047496480700134376, + "loss": 0.4214, + "step": 1610 + }, + { + "epoch": 0.30005556584552695, + "grad_norm": 0.21422448887216472, + "learning_rate": 0.0004746446643008153, + "loss": 0.4111, + "step": 1620 + }, + { + "epoch": 0.3019077606964253, + "grad_norm": 0.2593924521965729, + "learning_rate": 0.00047432259694354896, + "loss": 0.5274, + "step": 1630 + }, + { + "epoch": 0.3037599555473236, + "grad_norm": 0.30074263766274656, + "learning_rate": 0.0004739986076888765, + "loss": 0.4424, + "step": 1640 + }, + { + "epoch": 0.30561215039822187, + "grad_norm": 0.291226317138353, + "learning_rate": 0.0004736726993125952, + "loss": 0.4802, + "step": 1650 + }, + { + "epoch": 0.3074643452491202, + "grad_norm": 0.23749441719859632, + "learning_rate": 0.0004733448746069449, + "loss": 0.4288, + "step": 1660 + }, + { + "epoch": 0.3093165401000185, + "grad_norm": 0.2740636498957509, + "learning_rate": 0.00047301513638058355, + "loss": 0.4742, + "step": 1670 + }, + { + "epoch": 0.31116873495091685, + "grad_norm": 0.3263090001341323, + "learning_rate": 0.0004726834874585634, + "loss": 0.4945, + "step": 1680 + }, + { + "epoch": 0.3130209298018152, + "grad_norm": 0.23702905590165377, + "learning_rate": 0.00047234993068230656, + "loss": 0.3995, + "step": 1690 + }, + { + "epoch": 0.31487312465271344, + "grad_norm": 0.35028858247208006, + "learning_rate": 0.0004720144689095809, + "loss": 0.3937, + "step": 1700 + }, + { + "epoch": 0.31672531950361177, + "grad_norm": 0.35160376937763926, + "learning_rate": 0.00047167710501447535, + "loss": 0.4388, + "step": 1710 + }, + { + "epoch": 0.3185775143545101, + "grad_norm": 0.2769519878263511, + "learning_rate": 0.0004713378418873756, + "loss": 0.43, + "step": 1720 + }, + { + "epoch": 0.3204297092054084, + "grad_norm": 0.2723567337414344, + "learning_rate": 0.00047099668243493886, + "loss": 0.4546, + "step": 1730 + }, + { + "epoch": 0.32228190405630674, + "grad_norm": 0.4145209498456788, + "learning_rate": 0.0004706536295800695, + "loss": 0.4331, + "step": 1740 + }, + { + "epoch": 0.324134098907205, + "grad_norm": 0.3793519870853873, + "learning_rate": 0.0004703086862618935, + "loss": 0.3716, + "step": 1750 + }, + { + "epoch": 0.32598629375810334, + "grad_norm": 0.2962260082256936, + "learning_rate": 0.00046996185543573356, + "loss": 0.4161, + "step": 1760 + }, + { + "epoch": 0.32783848860900167, + "grad_norm": 0.24861664813452802, + "learning_rate": 0.00046961314007308374, + "loss": 0.4772, + "step": 1770 + }, + { + "epoch": 0.3296906834599, + "grad_norm": 0.30394710320503215, + "learning_rate": 0.00046926254316158414, + "loss": 0.4521, + "step": 1780 + }, + { + "epoch": 0.3315428783107983, + "grad_norm": 0.2835284077342044, + "learning_rate": 0.0004689100677049948, + "loss": 0.439, + "step": 1790 + }, + { + "epoch": 0.3333950731616966, + "grad_norm": 0.2936297703950855, + "learning_rate": 0.00046855571672317056, + "loss": 0.4539, + "step": 1800 + }, + { + "epoch": 0.3352472680125949, + "grad_norm": 0.31076414372805394, + "learning_rate": 0.00046819949325203485, + "loss": 0.5226, + "step": 1810 + }, + { + "epoch": 0.33709946286349324, + "grad_norm": 0.3151990506296693, + "learning_rate": 0.00046784140034355386, + "loss": 0.4502, + "step": 1820 + }, + { + "epoch": 0.33895165771439156, + "grad_norm": 0.2999740764164084, + "learning_rate": 0.0004674814410657102, + "loss": 0.405, + "step": 1830 + }, + { + "epoch": 0.3408038525652899, + "grad_norm": 0.2848528621693946, + "learning_rate": 0.00046711961850247677, + "loss": 0.4686, + "step": 1840 + }, + { + "epoch": 0.34265604741618816, + "grad_norm": 0.3304960024436658, + "learning_rate": 0.0004667559357537901, + "loss": 0.3961, + "step": 1850 + }, + { + "epoch": 0.3445082422670865, + "grad_norm": 0.29714447800492894, + "learning_rate": 0.00046639039593552423, + "loss": 0.4121, + "step": 1860 + }, + { + "epoch": 0.3463604371179848, + "grad_norm": 0.3737053983821796, + "learning_rate": 0.0004660230021794637, + "loss": 0.4899, + "step": 1870 + }, + { + "epoch": 0.34821263196888314, + "grad_norm": 0.2715803166164925, + "learning_rate": 0.00046565375763327655, + "loss": 0.418, + "step": 1880 + }, + { + "epoch": 0.35006482681978146, + "grad_norm": 0.2962801885853028, + "learning_rate": 0.0004652826654604879, + "loss": 0.4675, + "step": 1890 + }, + { + "epoch": 0.35191702167067973, + "grad_norm": 0.30660107375890056, + "learning_rate": 0.0004649097288404523, + "loss": 0.4536, + "step": 1900 + }, + { + "epoch": 0.35376921652157806, + "grad_norm": 0.28266003520813626, + "learning_rate": 0.00046453495096832677, + "loss": 0.44, + "step": 1910 + }, + { + "epoch": 0.3556214113724764, + "grad_norm": 0.3422119367179134, + "learning_rate": 0.00046415833505504344, + "loss": 0.4584, + "step": 1920 + }, + { + "epoch": 0.3574736062233747, + "grad_norm": 0.2749096084932521, + "learning_rate": 0.0004637798843272819, + "loss": 0.3907, + "step": 1930 + }, + { + "epoch": 0.35932580107427303, + "grad_norm": 0.26388805864831494, + "learning_rate": 0.00046339960202744154, + "loss": 0.5757, + "step": 1940 + }, + { + "epoch": 0.3611779959251713, + "grad_norm": 0.2738001016444935, + "learning_rate": 0.000463017491413614, + "loss": 0.4938, + "step": 1950 + }, + { + "epoch": 0.36303019077606963, + "grad_norm": 0.27217682271594046, + "learning_rate": 0.00046263355575955513, + "loss": 0.4063, + "step": 1960 + }, + { + "epoch": 0.36488238562696795, + "grad_norm": 0.23291262129921603, + "learning_rate": 0.0004622477983546567, + "loss": 0.419, + "step": 1970 + }, + { + "epoch": 0.3667345804778663, + "grad_norm": 0.304942976924537, + "learning_rate": 0.0004618602225039187, + "loss": 0.4168, + "step": 1980 + }, + { + "epoch": 0.3685867753287646, + "grad_norm": 0.24084297499524615, + "learning_rate": 0.00046147083152792064, + "loss": 0.3846, + "step": 1990 + }, + { + "epoch": 0.3704389701796629, + "grad_norm": 0.27930179036055947, + "learning_rate": 0.00046107962876279317, + "loss": 0.4226, + "step": 2000 + }, + { + "epoch": 0.3722911650305612, + "grad_norm": 0.22286791279607676, + "learning_rate": 0.00046068661756018975, + "loss": 0.3928, + "step": 2010 + }, + { + "epoch": 0.3741433598814595, + "grad_norm": 0.22400156451080455, + "learning_rate": 0.00046029180128725756, + "loss": 0.4584, + "step": 2020 + }, + { + "epoch": 0.37599555473235785, + "grad_norm": 0.3152682221415501, + "learning_rate": 0.0004598951833266087, + "loss": 0.4314, + "step": 2030 + }, + { + "epoch": 0.3778477495832562, + "grad_norm": 0.31019682799358195, + "learning_rate": 0.00045949676707629186, + "loss": 0.4237, + "step": 2040 + }, + { + "epoch": 0.37969994443415445, + "grad_norm": 0.32258613660465024, + "learning_rate": 0.00045909655594976207, + "loss": 0.3827, + "step": 2050 + }, + { + "epoch": 0.38155213928505277, + "grad_norm": 0.2506911135234745, + "learning_rate": 0.00045869455337585246, + "loss": 0.4037, + "step": 2060 + }, + { + "epoch": 0.3834043341359511, + "grad_norm": 0.35915658848471477, + "learning_rate": 0.0004582907627987444, + "loss": 0.4242, + "step": 2070 + }, + { + "epoch": 0.3852565289868494, + "grad_norm": 0.28180517097875335, + "learning_rate": 0.00045788518767793786, + "loss": 0.4342, + "step": 2080 + }, + { + "epoch": 0.38710872383774775, + "grad_norm": 0.22401926241944572, + "learning_rate": 0.0004574778314882225, + "loss": 0.4546, + "step": 2090 + }, + { + "epoch": 0.388960918688646, + "grad_norm": 0.3007971129642205, + "learning_rate": 0.0004570686977196468, + "loss": 0.4364, + "step": 2100 + }, + { + "epoch": 0.39081311353954434, + "grad_norm": 0.24088799894015317, + "learning_rate": 0.0004566577898774893, + "loss": 0.4313, + "step": 2110 + }, + { + "epoch": 0.39266530839044267, + "grad_norm": 0.30698196088504776, + "learning_rate": 0.0004562451114822276, + "loss": 0.3996, + "step": 2120 + }, + { + "epoch": 0.394517503241341, + "grad_norm": 0.2516817084212753, + "learning_rate": 0.0004558306660695089, + "loss": 0.4434, + "step": 2130 + }, + { + "epoch": 0.3963696980922393, + "grad_norm": 0.24923810797995163, + "learning_rate": 0.00045541445719011933, + "loss": 0.3827, + "step": 2140 + }, + { + "epoch": 0.39822189294313765, + "grad_norm": 0.2838748265882661, + "learning_rate": 0.0004549964884099534, + "loss": 0.4097, + "step": 2150 + }, + { + "epoch": 0.4000740877940359, + "grad_norm": 0.2520366270233344, + "learning_rate": 0.0004545767633099842, + "loss": 0.4257, + "step": 2160 + }, + { + "epoch": 0.40192628264493424, + "grad_norm": 0.29635595927178765, + "learning_rate": 0.0004541552854862317, + "loss": 0.4305, + "step": 2170 + }, + { + "epoch": 0.40377847749583257, + "grad_norm": 0.3136173166936259, + "learning_rate": 0.00045373205854973265, + "loss": 0.4592, + "step": 2180 + }, + { + "epoch": 0.4056306723467309, + "grad_norm": 0.2849443744452371, + "learning_rate": 0.0004533070861265094, + "loss": 0.4604, + "step": 2190 + }, + { + "epoch": 0.4074828671976292, + "grad_norm": 0.27436502832510207, + "learning_rate": 0.000452880371857539, + "loss": 0.3709, + "step": 2200 + }, + { + "epoch": 0.4093350620485275, + "grad_norm": 0.31084213819654966, + "learning_rate": 0.0004524519193987215, + "loss": 0.4707, + "step": 2210 + }, + { + "epoch": 0.4111872568994258, + "grad_norm": 0.27171948513912497, + "learning_rate": 0.00045202173242084954, + "loss": 0.4131, + "step": 2220 + }, + { + "epoch": 0.41303945175032414, + "grad_norm": 0.2720258760965373, + "learning_rate": 0.0004515898146095758, + "loss": 0.3954, + "step": 2230 + }, + { + "epoch": 0.41489164660122246, + "grad_norm": 0.21964829380379464, + "learning_rate": 0.0004511561696653823, + "loss": 0.432, + "step": 2240 + }, + { + "epoch": 0.4167438414521208, + "grad_norm": 0.22147147407497397, + "learning_rate": 0.0004507208013035483, + "loss": 0.406, + "step": 2250 + }, + { + "epoch": 0.41859603630301906, + "grad_norm": 0.2592943907855162, + "learning_rate": 0.0004502837132541186, + "loss": 0.4092, + "step": 2260 + }, + { + "epoch": 0.4204482311539174, + "grad_norm": 0.2697288980975384, + "learning_rate": 0.0004498449092618715, + "loss": 0.3643, + "step": 2270 + }, + { + "epoch": 0.4223004260048157, + "grad_norm": 0.2502930773158984, + "learning_rate": 0.00044940439308628654, + "loss": 0.344, + "step": 2280 + }, + { + "epoch": 0.42415262085571404, + "grad_norm": 0.28445457893318615, + "learning_rate": 0.00044896216850151294, + "loss": 0.4511, + "step": 2290 + }, + { + "epoch": 0.42600481570661236, + "grad_norm": 0.3361734430502526, + "learning_rate": 0.0004485182392963364, + "loss": 0.3547, + "step": 2300 + }, + { + "epoch": 0.42785701055751063, + "grad_norm": 0.2326479256523765, + "learning_rate": 0.0004480726092741472, + "loss": 0.3731, + "step": 2310 + }, + { + "epoch": 0.42970920540840896, + "grad_norm": 0.2646729222942232, + "learning_rate": 0.00044762528225290757, + "loss": 0.4015, + "step": 2320 + }, + { + "epoch": 0.4315614002593073, + "grad_norm": 0.33778964570201236, + "learning_rate": 0.0004471762620651187, + "loss": 0.4, + "step": 2330 + }, + { + "epoch": 0.4334135951102056, + "grad_norm": 0.31289509233278756, + "learning_rate": 0.00044672555255778824, + "loss": 0.4377, + "step": 2340 + }, + { + "epoch": 0.43526578996110393, + "grad_norm": 0.27440247092572545, + "learning_rate": 0.00044627315759239715, + "loss": 0.3972, + "step": 2350 + }, + { + "epoch": 0.4371179848120022, + "grad_norm": 0.2641845623874125, + "learning_rate": 0.0004458190810448667, + "loss": 0.3864, + "step": 2360 + }, + { + "epoch": 0.43897017966290053, + "grad_norm": 0.3042810996664228, + "learning_rate": 0.0004453633268055249, + "loss": 0.4277, + "step": 2370 + }, + { + "epoch": 0.44082237451379885, + "grad_norm": 0.2497842382086681, + "learning_rate": 0.00044490589877907406, + "loss": 0.3926, + "step": 2380 + }, + { + "epoch": 0.4426745693646972, + "grad_norm": 0.2259561601883072, + "learning_rate": 0.00044444680088455624, + "loss": 0.4567, + "step": 2390 + }, + { + "epoch": 0.4445267642155955, + "grad_norm": 0.2644522169590116, + "learning_rate": 0.00044398603705532046, + "loss": 0.4257, + "step": 2400 + }, + { + "epoch": 0.4463789590664938, + "grad_norm": 0.24862008909243488, + "learning_rate": 0.0004435236112389887, + "loss": 0.3187, + "step": 2410 + }, + { + "epoch": 0.4482311539173921, + "grad_norm": 0.2838495721029593, + "learning_rate": 0.000443059527397422, + "loss": 0.4659, + "step": 2420 + }, + { + "epoch": 0.4500833487682904, + "grad_norm": 0.219358259027201, + "learning_rate": 0.00044259378950668683, + "loss": 0.3919, + "step": 2430 + }, + { + "epoch": 0.45193554361918875, + "grad_norm": 0.31146983163040265, + "learning_rate": 0.00044212640155702053, + "loss": 0.4584, + "step": 2440 + }, + { + "epoch": 0.4537877384700871, + "grad_norm": 0.26979102938650734, + "learning_rate": 0.00044165736755279785, + "loss": 0.3086, + "step": 2450 + }, + { + "epoch": 0.45563993332098535, + "grad_norm": 0.29314640181084967, + "learning_rate": 0.00044118669151249585, + "loss": 0.4357, + "step": 2460 + }, + { + "epoch": 0.45749212817188367, + "grad_norm": 0.2523855052206998, + "learning_rate": 0.00044071437746865994, + "loss": 0.4024, + "step": 2470 + }, + { + "epoch": 0.459344323022782, + "grad_norm": 0.24148640334233432, + "learning_rate": 0.0004402404294678692, + "loss": 0.396, + "step": 2480 + }, + { + "epoch": 0.4611965178736803, + "grad_norm": 0.22896761800287638, + "learning_rate": 0.00043976485157070185, + "loss": 0.4293, + "step": 2490 + }, + { + "epoch": 0.46304871272457865, + "grad_norm": 0.24737906716097793, + "learning_rate": 0.0004392876478517002, + "loss": 0.4756, + "step": 2500 + }, + { + "epoch": 0.4649009075754769, + "grad_norm": 0.305490554690619, + "learning_rate": 0.000438808822399336, + "loss": 0.405, + "step": 2510 + }, + { + "epoch": 0.46675310242637524, + "grad_norm": 0.2802043380804828, + "learning_rate": 0.00043832837931597526, + "loss": 0.3876, + "step": 2520 + }, + { + "epoch": 0.46860529727727357, + "grad_norm": 0.2860415378563156, + "learning_rate": 0.00043784632271784304, + "loss": 0.4161, + "step": 2530 + }, + { + "epoch": 0.4704574921281719, + "grad_norm": 0.28267000501834966, + "learning_rate": 0.0004373626567349885, + "loss": 0.4143, + "step": 2540 + }, + { + "epoch": 0.4723096869790702, + "grad_norm": 0.2525367504836072, + "learning_rate": 0.00043687738551124913, + "loss": 0.3757, + "step": 2550 + }, + { + "epoch": 0.4741618818299685, + "grad_norm": 0.3925357847215651, + "learning_rate": 0.0004363905132042154, + "loss": 0.3826, + "step": 2560 + }, + { + "epoch": 0.4760140766808668, + "grad_norm": 0.3263265495863413, + "learning_rate": 0.00043590204398519526, + "loss": 0.4263, + "step": 2570 + }, + { + "epoch": 0.47786627153176514, + "grad_norm": 0.30208444736193557, + "learning_rate": 0.0004354119820391784, + "loss": 0.3817, + "step": 2580 + }, + { + "epoch": 0.47971846638266347, + "grad_norm": 0.2561058320675499, + "learning_rate": 0.00043492033156479997, + "loss": 0.4278, + "step": 2590 + }, + { + "epoch": 0.4815706612335618, + "grad_norm": 0.30589399146654594, + "learning_rate": 0.0004344270967743052, + "loss": 0.4058, + "step": 2600 + }, + { + "epoch": 0.48342285608446006, + "grad_norm": 0.2978445001042373, + "learning_rate": 0.00043393228189351297, + "loss": 0.4212, + "step": 2610 + }, + { + "epoch": 0.4852750509353584, + "grad_norm": 0.29323906443796505, + "learning_rate": 0.0004334358911617797, + "loss": 0.4304, + "step": 2620 + }, + { + "epoch": 0.4871272457862567, + "grad_norm": 0.25775394604491453, + "learning_rate": 0.000432937928831963, + "loss": 0.4291, + "step": 2630 + }, + { + "epoch": 0.48897944063715504, + "grad_norm": 0.2860673624388678, + "learning_rate": 0.00043243839917038506, + "loss": 0.4452, + "step": 2640 + }, + { + "epoch": 0.49083163548805336, + "grad_norm": 0.2451402557512562, + "learning_rate": 0.00043193730645679665, + "loss": 0.349, + "step": 2650 + }, + { + "epoch": 0.49268383033895163, + "grad_norm": 0.23951029660105672, + "learning_rate": 0.0004314346549843398, + "loss": 0.3986, + "step": 2660 + }, + { + "epoch": 0.49453602518984996, + "grad_norm": 0.24086380299145352, + "learning_rate": 0.0004309304490595113, + "loss": 0.4069, + "step": 2670 + }, + { + "epoch": 0.4963882200407483, + "grad_norm": 0.19690525958834837, + "learning_rate": 0.00043042469300212595, + "loss": 0.3658, + "step": 2680 + }, + { + "epoch": 0.4982404148916466, + "grad_norm": 0.2873547855172915, + "learning_rate": 0.0004299173911452794, + "loss": 0.4045, + "step": 2690 + }, + { + "epoch": 0.5000926097425449, + "grad_norm": 0.3445660214713212, + "learning_rate": 0.0004294085478353109, + "loss": 0.3342, + "step": 2700 + }, + { + "epoch": 0.5019448045934433, + "grad_norm": 0.26259627047719875, + "learning_rate": 0.00042889816743176625, + "loss": 0.4115, + "step": 2710 + }, + { + "epoch": 0.5037969994443415, + "grad_norm": 0.27090069459316, + "learning_rate": 0.0004283862543073604, + "loss": 0.4178, + "step": 2720 + }, + { + "epoch": 0.5056491942952399, + "grad_norm": 0.3203148075266908, + "learning_rate": 0.00042787281284794, + "loss": 0.4177, + "step": 2730 + }, + { + "epoch": 0.5075013891461382, + "grad_norm": 0.2044466650316563, + "learning_rate": 0.00042735784745244585, + "loss": 0.415, + "step": 2740 + }, + { + "epoch": 0.5093535839970365, + "grad_norm": 0.2673811085531597, + "learning_rate": 0.000426841362532875, + "loss": 0.3923, + "step": 2750 + }, + { + "epoch": 0.5112057788479348, + "grad_norm": 0.23323940410282512, + "learning_rate": 0.00042632336251424317, + "loss": 0.3643, + "step": 2760 + }, + { + "epoch": 0.5130579736988331, + "grad_norm": 0.19502502356966445, + "learning_rate": 0.00042580385183454695, + "loss": 0.4509, + "step": 2770 + }, + { + "epoch": 0.5149101685497315, + "grad_norm": 0.3081825384344212, + "learning_rate": 0.0004252828349447254, + "loss": 0.3374, + "step": 2780 + }, + { + "epoch": 0.5167623634006298, + "grad_norm": 0.19926889616728075, + "learning_rate": 0.00042476031630862235, + "loss": 0.3751, + "step": 2790 + }, + { + "epoch": 0.518614558251528, + "grad_norm": 0.2980672545203656, + "learning_rate": 0.00042423630040294756, + "loss": 0.3737, + "step": 2800 + }, + { + "epoch": 0.5204667531024264, + "grad_norm": 0.2805956385580894, + "learning_rate": 0.0004237107917172391, + "loss": 0.3498, + "step": 2810 + }, + { + "epoch": 0.5223189479533247, + "grad_norm": 0.24883952133869866, + "learning_rate": 0.00042318379475382454, + "loss": 0.369, + "step": 2820 + }, + { + "epoch": 0.5241711428042231, + "grad_norm": 0.26010129083226985, + "learning_rate": 0.0004226553140277819, + "loss": 0.3763, + "step": 2830 + }, + { + "epoch": 0.5260233376551213, + "grad_norm": 0.3407509896784033, + "learning_rate": 0.000422125354066902, + "loss": 0.3339, + "step": 2840 + }, + { + "epoch": 0.5278755325060196, + "grad_norm": 0.2022248872951544, + "learning_rate": 0.0004215939194116487, + "loss": 0.415, + "step": 2850 + }, + { + "epoch": 0.529727727356918, + "grad_norm": 0.3427987857911665, + "learning_rate": 0.0004210610146151206, + "loss": 0.4224, + "step": 2860 + }, + { + "epoch": 0.5315799222078162, + "grad_norm": 0.23594824415533, + "learning_rate": 0.0004205266442430117, + "loss": 0.4051, + "step": 2870 + }, + { + "epoch": 0.5334321170587146, + "grad_norm": 0.29315061402915377, + "learning_rate": 0.00041999081287357246, + "loss": 0.3898, + "step": 2880 + }, + { + "epoch": 0.5352843119096129, + "grad_norm": 0.25391786215048595, + "learning_rate": 0.0004194535250975705, + "loss": 0.4163, + "step": 2890 + }, + { + "epoch": 0.5371365067605112, + "grad_norm": 0.30989709227816453, + "learning_rate": 0.00041891478551825135, + "loss": 0.4528, + "step": 2900 + }, + { + "epoch": 0.5389887016114095, + "grad_norm": 0.30084068834422883, + "learning_rate": 0.000418374598751299, + "loss": 0.4187, + "step": 2910 + }, + { + "epoch": 0.5408408964623078, + "grad_norm": 0.2707819885874306, + "learning_rate": 0.000417832969424796, + "loss": 0.4203, + "step": 2920 + }, + { + "epoch": 0.5426930913132062, + "grad_norm": 0.27765562870418, + "learning_rate": 0.00041728990217918454, + "loss": 0.4354, + "step": 2930 + }, + { + "epoch": 0.5445452861641045, + "grad_norm": 0.2957077208859336, + "learning_rate": 0.00041674540166722595, + "loss": 0.4214, + "step": 2940 + }, + { + "epoch": 0.5463974810150027, + "grad_norm": 0.3687676577456054, + "learning_rate": 0.0004161994725539614, + "loss": 0.3915, + "step": 2950 + }, + { + "epoch": 0.5482496758659011, + "grad_norm": 0.26016346169725796, + "learning_rate": 0.00041565211951667143, + "loss": 0.4265, + "step": 2960 + }, + { + "epoch": 0.5501018707167994, + "grad_norm": 0.29400682034550746, + "learning_rate": 0.0004151033472448363, + "loss": 0.3754, + "step": 2970 + }, + { + "epoch": 0.5519540655676978, + "grad_norm": 0.24729614759661173, + "learning_rate": 0.00041455316044009563, + "loss": 0.3678, + "step": 2980 + }, + { + "epoch": 0.553806260418596, + "grad_norm": 0.30448617928085525, + "learning_rate": 0.0004140015638162081, + "loss": 0.3521, + "step": 2990 + }, + { + "epoch": 0.5556584552694943, + "grad_norm": 0.326331806127286, + "learning_rate": 0.0004134485620990113, + "loss": 0.3829, + "step": 3000 + }, + { + "epoch": 0.5575106501203927, + "grad_norm": 0.2831079722418925, + "learning_rate": 0.0004128941600263805, + "loss": 0.3499, + "step": 3010 + }, + { + "epoch": 0.559362844971291, + "grad_norm": 0.2544434887846111, + "learning_rate": 0.00041233836234818926, + "loss": 0.4621, + "step": 3020 + }, + { + "epoch": 0.5612150398221893, + "grad_norm": 0.272652788679403, + "learning_rate": 0.0004117811738262677, + "loss": 0.413, + "step": 3030 + }, + { + "epoch": 0.5630672346730876, + "grad_norm": 0.25142412831266564, + "learning_rate": 0.0004112225992343621, + "loss": 0.4163, + "step": 3040 + }, + { + "epoch": 0.5649194295239859, + "grad_norm": 0.2647884767561391, + "learning_rate": 0.00041066264335809413, + "loss": 0.3914, + "step": 3050 + }, + { + "epoch": 0.5667716243748843, + "grad_norm": 0.23801633376774256, + "learning_rate": 0.00041010131099491944, + "loss": 0.3754, + "step": 3060 + }, + { + "epoch": 0.5686238192257825, + "grad_norm": 0.2731341421028539, + "learning_rate": 0.0004095386069540872, + "loss": 0.4227, + "step": 3070 + }, + { + "epoch": 0.5704760140766809, + "grad_norm": 0.2011024370599634, + "learning_rate": 0.0004089745360565981, + "loss": 0.3834, + "step": 3080 + }, + { + "epoch": 0.5723282089275792, + "grad_norm": 0.23740640073183247, + "learning_rate": 0.00040840910313516364, + "loss": 0.4279, + "step": 3090 + }, + { + "epoch": 0.5741804037784775, + "grad_norm": 0.2525764151086583, + "learning_rate": 0.00040784231303416473, + "loss": 0.3782, + "step": 3100 + }, + { + "epoch": 0.5760325986293758, + "grad_norm": 0.29277924659862603, + "learning_rate": 0.00040727417060960967, + "loss": 0.3743, + "step": 3110 + }, + { + "epoch": 0.5778847934802741, + "grad_norm": 0.24242972284715095, + "learning_rate": 0.0004067046807290931, + "loss": 0.3832, + "step": 3120 + }, + { + "epoch": 0.5797369883311725, + "grad_norm": 0.25071856580407875, + "learning_rate": 0.0004061338482717538, + "loss": 0.3867, + "step": 3130 + }, + { + "epoch": 0.5815891831820708, + "grad_norm": 0.2837990600721797, + "learning_rate": 0.0004055616781282335, + "loss": 0.4151, + "step": 3140 + }, + { + "epoch": 0.583441378032969, + "grad_norm": 0.22534951219394125, + "learning_rate": 0.0004049881752006346, + "loss": 0.3788, + "step": 3150 + }, + { + "epoch": 0.5852935728838674, + "grad_norm": 0.2817669494395476, + "learning_rate": 0.0004044133444024779, + "loss": 0.437, + "step": 3160 + }, + { + "epoch": 0.5871457677347657, + "grad_norm": 0.20817420244233692, + "learning_rate": 0.00040383719065866105, + "loss": 0.3918, + "step": 3170 + }, + { + "epoch": 0.5889979625856641, + "grad_norm": 0.2734267113676852, + "learning_rate": 0.0004032597189054161, + "loss": 0.4261, + "step": 3180 + }, + { + "epoch": 0.5908501574365623, + "grad_norm": 0.27859862469331026, + "learning_rate": 0.0004026809340902672, + "loss": 0.4035, + "step": 3190 + }, + { + "epoch": 0.5927023522874606, + "grad_norm": 0.2545952221508602, + "learning_rate": 0.0004021008411719881, + "loss": 0.3432, + "step": 3200 + }, + { + "epoch": 0.594554547138359, + "grad_norm": 0.270005891201155, + "learning_rate": 0.0004015194451205601, + "loss": 0.354, + "step": 3210 + }, + { + "epoch": 0.5964067419892572, + "grad_norm": 0.24352901007536132, + "learning_rate": 0.000400936750917129, + "loss": 0.3729, + "step": 3220 + }, + { + "epoch": 0.5982589368401556, + "grad_norm": 0.2556498791861634, + "learning_rate": 0.0004003527635539625, + "loss": 0.4015, + "step": 3230 + }, + { + "epoch": 0.6001111316910539, + "grad_norm": 0.2752351613083482, + "learning_rate": 0.00039976748803440774, + "loss": 0.3672, + "step": 3240 + }, + { + "epoch": 0.6019633265419522, + "grad_norm": 0.2609226477539244, + "learning_rate": 0.000399180929372848, + "loss": 0.4015, + "step": 3250 + }, + { + "epoch": 0.6038155213928506, + "grad_norm": 0.30960657643957806, + "learning_rate": 0.00039859309259466017, + "loss": 0.3641, + "step": 3260 + }, + { + "epoch": 0.6056677162437488, + "grad_norm": 0.3035485490629689, + "learning_rate": 0.0003980039827361712, + "loss": 0.4543, + "step": 3270 + }, + { + "epoch": 0.6075199110946472, + "grad_norm": 0.2184916474124068, + "learning_rate": 0.0003974136048446155, + "loss": 0.337, + "step": 3280 + }, + { + "epoch": 0.6093721059455455, + "grad_norm": 0.2843568329769092, + "learning_rate": 0.0003968219639780915, + "loss": 0.4351, + "step": 3290 + }, + { + "epoch": 0.6112243007964437, + "grad_norm": 0.269831900653445, + "learning_rate": 0.00039622906520551786, + "loss": 0.3777, + "step": 3300 + }, + { + "epoch": 0.6130764956473421, + "grad_norm": 0.2834037960599415, + "learning_rate": 0.0003956349136065908, + "loss": 0.3924, + "step": 3310 + }, + { + "epoch": 0.6149286904982404, + "grad_norm": 0.24761657160080242, + "learning_rate": 0.00039503951427173985, + "loss": 0.4168, + "step": 3320 + }, + { + "epoch": 0.6167808853491388, + "grad_norm": 0.30901172205688504, + "learning_rate": 0.00039444287230208495, + "loss": 0.3873, + "step": 3330 + }, + { + "epoch": 0.618633080200037, + "grad_norm": 0.29747872909981493, + "learning_rate": 0.0003938449928093922, + "loss": 0.4341, + "step": 3340 + }, + { + "epoch": 0.6204852750509353, + "grad_norm": 0.2543886903531346, + "learning_rate": 0.0003932458809160303, + "loss": 0.3683, + "step": 3350 + }, + { + "epoch": 0.6223374699018337, + "grad_norm": 0.33337732842586854, + "learning_rate": 0.0003926455417549266, + "loss": 0.3755, + "step": 3360 + }, + { + "epoch": 0.624189664752732, + "grad_norm": 0.2464332085515913, + "learning_rate": 0.00039204398046952313, + "loss": 0.3602, + "step": 3370 + }, + { + "epoch": 0.6260418596036303, + "grad_norm": 0.2946927475643436, + "learning_rate": 0.00039144120221373254, + "loss": 0.4474, + "step": 3380 + }, + { + "epoch": 0.6278940544545286, + "grad_norm": 0.3017003197321625, + "learning_rate": 0.0003908372121518939, + "loss": 0.4334, + "step": 3390 + }, + { + "epoch": 0.6297462493054269, + "grad_norm": 0.32871078632996376, + "learning_rate": 0.0003902320154587288, + "loss": 0.3826, + "step": 3400 + }, + { + "epoch": 0.6315984441563253, + "grad_norm": 0.3041703577665594, + "learning_rate": 0.0003896256173192963, + "loss": 0.4301, + "step": 3410 + }, + { + "epoch": 0.6334506390072235, + "grad_norm": 0.27657730284049636, + "learning_rate": 0.0003890180229289492, + "loss": 0.3637, + "step": 3420 + }, + { + "epoch": 0.6353028338581219, + "grad_norm": 0.2894023841563432, + "learning_rate": 0.0003884701694853233, + "loss": 0.4083, + "step": 3430 + }, + { + "epoch": 0.6371550287090202, + "grad_norm": 0.3313798136401644, + "learning_rate": 0.00038786031656810573, + "loss": 0.3613, + "step": 3440 + }, + { + "epoch": 0.6390072235599185, + "grad_norm": 0.31419538828574267, + "learning_rate": 0.0003872492825242943, + "loss": 0.3517, + "step": 3450 + }, + { + "epoch": 0.6408594184108168, + "grad_norm": 0.2856367570197956, + "learning_rate": 0.0003866370725889602, + "loss": 0.3311, + "step": 3460 + }, + { + "epoch": 0.6427116132617151, + "grad_norm": 0.32378046135112004, + "learning_rate": 0.00038602369200724907, + "loss": 0.3808, + "step": 3470 + }, + { + "epoch": 0.6445638081126135, + "grad_norm": 0.2809834575253639, + "learning_rate": 0.00038540914603433596, + "loss": 0.3874, + "step": 3480 + }, + { + "epoch": 0.6464160029635118, + "grad_norm": 0.23009208535401943, + "learning_rate": 0.00038479343993538085, + "loss": 0.415, + "step": 3490 + }, + { + "epoch": 0.64826819781441, + "grad_norm": 0.22641660111122883, + "learning_rate": 0.00038417657898548284, + "loss": 0.3278, + "step": 3500 + }, + { + "epoch": 0.6501203926653084, + "grad_norm": 0.2981220824138414, + "learning_rate": 0.00038355856846963545, + "loss": 0.4047, + "step": 3510 + }, + { + "epoch": 0.6519725875162067, + "grad_norm": 0.2555163199749857, + "learning_rate": 0.00038293941368268105, + "loss": 0.4132, + "step": 3520 + }, + { + "epoch": 0.6538247823671051, + "grad_norm": 0.2291679316803199, + "learning_rate": 0.00038231911992926573, + "loss": 0.4501, + "step": 3530 + }, + { + "epoch": 0.6556769772180033, + "grad_norm": 0.22327007435525262, + "learning_rate": 0.0003816976925237936, + "loss": 0.4047, + "step": 3540 + }, + { + "epoch": 0.6575291720689016, + "grad_norm": 0.26270477479908155, + "learning_rate": 0.00038113744298654294, + "loss": 0.3669, + "step": 3550 + }, + { + "epoch": 0.6593813669198, + "grad_norm": 0.20304050646048286, + "learning_rate": 0.00038051387631809585, + "loss": 0.4247, + "step": 3560 + }, + { + "epoch": 0.6612335617706983, + "grad_norm": 0.2626214779683425, + "learning_rate": 0.0003798891914641258, + "loss": 0.3397, + "step": 3570 + }, + { + "epoch": 0.6630857566215966, + "grad_norm": 0.2927783575344774, + "learning_rate": 0.00037926339377665805, + "loss": 0.3352, + "step": 3580 + }, + { + "epoch": 0.6649379514724949, + "grad_norm": 0.2868661472365901, + "learning_rate": 0.0003786364886172521, + "loss": 0.4321, + "step": 3590 + }, + { + "epoch": 0.6667901463233932, + "grad_norm": 0.1980588697868199, + "learning_rate": 0.00037800848135695564, + "loss": 0.355, + "step": 3600 + }, + { + "epoch": 0.6686423411742916, + "grad_norm": 0.27964064214829887, + "learning_rate": 0.00037737937737625905, + "loss": 0.3953, + "step": 3610 + }, + { + "epoch": 0.6704945360251898, + "grad_norm": 0.30140561884162703, + "learning_rate": 0.0003767491820650486, + "loss": 0.3802, + "step": 3620 + }, + { + "epoch": 0.6723467308760882, + "grad_norm": 0.26216353668713616, + "learning_rate": 0.00037611790082256073, + "loss": 0.3701, + "step": 3630 + }, + { + "epoch": 0.6741989257269865, + "grad_norm": 0.2667607207767126, + "learning_rate": 0.00037548553905733566, + "loss": 0.4217, + "step": 3640 + }, + { + "epoch": 0.6760511205778847, + "grad_norm": 0.2888052260287578, + "learning_rate": 0.00037485210218717095, + "loss": 0.3861, + "step": 3650 + }, + { + "epoch": 0.6779033154287831, + "grad_norm": 0.322681691929484, + "learning_rate": 0.0003742175956390754, + "loss": 0.3769, + "step": 3660 + }, + { + "epoch": 0.6797555102796814, + "grad_norm": 0.2809039196576165, + "learning_rate": 0.0003735820248492221, + "loss": 0.37, + "step": 3670 + }, + { + "epoch": 0.6816077051305798, + "grad_norm": 0.3168194333373297, + "learning_rate": 0.0003729453952629022, + "loss": 0.3813, + "step": 3680 + }, + { + "epoch": 0.683459899981478, + "grad_norm": 0.2743408298239755, + "learning_rate": 0.00037230771233447813, + "loss": 0.3762, + "step": 3690 + }, + { + "epoch": 0.6853120948323763, + "grad_norm": 0.2997039201183461, + "learning_rate": 0.000371668981527337, + "loss": 0.4346, + "step": 3700 + }, + { + "epoch": 0.6871642896832747, + "grad_norm": 0.18532771548719357, + "learning_rate": 0.0003710292083138436, + "loss": 0.344, + "step": 3710 + }, + { + "epoch": 0.689016484534173, + "grad_norm": 0.3521954419398032, + "learning_rate": 0.0003703883981752935, + "loss": 0.378, + "step": 3720 + }, + { + "epoch": 0.6908686793850713, + "grad_norm": 0.3037259752726694, + "learning_rate": 0.00036974655660186644, + "loss": 0.4339, + "step": 3730 + }, + { + "epoch": 0.6927208742359696, + "grad_norm": 0.24733145996258551, + "learning_rate": 0.0003691036890925788, + "loss": 0.4195, + "step": 3740 + }, + { + "epoch": 0.6945730690868679, + "grad_norm": 0.19584340465708208, + "learning_rate": 0.0003684598011552368, + "loss": 0.3404, + "step": 3750 + }, + { + "epoch": 0.6964252639377663, + "grad_norm": 0.2530305551321265, + "learning_rate": 0.00036781489830638923, + "loss": 0.3163, + "step": 3760 + }, + { + "epoch": 0.6982774587886645, + "grad_norm": 0.26939789666432756, + "learning_rate": 0.0003671689860712804, + "loss": 0.3419, + "step": 3770 + }, + { + "epoch": 0.7001296536395629, + "grad_norm": 0.24191294552249204, + "learning_rate": 0.0003665220699838022, + "loss": 0.4176, + "step": 3780 + }, + { + "epoch": 0.7019818484904612, + "grad_norm": 0.2777592117015156, + "learning_rate": 0.00036587415558644756, + "loss": 0.3215, + "step": 3790 + }, + { + "epoch": 0.7038340433413595, + "grad_norm": 0.30078087923699953, + "learning_rate": 0.00036522524843026193, + "loss": 0.3564, + "step": 3800 + }, + { + "epoch": 0.7056862381922578, + "grad_norm": 0.29338660781666925, + "learning_rate": 0.00036457535407479673, + "loss": 0.3725, + "step": 3810 + }, + { + "epoch": 0.7075384330431561, + "grad_norm": 0.2296766539983086, + "learning_rate": 0.00036392447808806117, + "loss": 0.3688, + "step": 3820 + }, + { + "epoch": 0.7093906278940545, + "grad_norm": 0.30321062833889273, + "learning_rate": 0.0003632726260464746, + "loss": 0.3948, + "step": 3830 + }, + { + "epoch": 0.7112428227449528, + "grad_norm": 0.29399675372420425, + "learning_rate": 0.0003626198035348187, + "loss": 0.4013, + "step": 3840 + }, + { + "epoch": 0.713095017595851, + "grad_norm": 0.2105362387910143, + "learning_rate": 0.0003619660161461898, + "loss": 0.366, + "step": 3850 + }, + { + "epoch": 0.7149472124467494, + "grad_norm": 0.23037128345764354, + "learning_rate": 0.00036131126948195103, + "loss": 0.4221, + "step": 3860 + }, + { + "epoch": 0.7167994072976477, + "grad_norm": 0.2768953340591145, + "learning_rate": 0.00036065556915168377, + "loss": 0.2986, + "step": 3870 + }, + { + "epoch": 0.7186516021485461, + "grad_norm": 0.23581750422601885, + "learning_rate": 0.0003599989207731404, + "loss": 0.3691, + "step": 3880 + }, + { + "epoch": 0.7205037969994443, + "grad_norm": 0.23261721710497926, + "learning_rate": 0.0003593413299721955, + "loss": 0.4161, + "step": 3890 + }, + { + "epoch": 0.7223559918503426, + "grad_norm": 0.26947390848344027, + "learning_rate": 0.00035868280238279804, + "loss": 0.4034, + "step": 3900 + }, + { + "epoch": 0.724208186701241, + "grad_norm": 0.2604323518406546, + "learning_rate": 0.00035802334364692283, + "loss": 0.3652, + "step": 3910 + }, + { + "epoch": 0.7260603815521393, + "grad_norm": 0.19811786937816656, + "learning_rate": 0.00035736295941452256, + "loss": 0.3411, + "step": 3920 + }, + { + "epoch": 0.7279125764030376, + "grad_norm": 0.2942447611839833, + "learning_rate": 0.0003567016553434791, + "loss": 0.3932, + "step": 3930 + }, + { + "epoch": 0.7297647712539359, + "grad_norm": 0.20647945881304144, + "learning_rate": 0.00035603943709955495, + "loss": 0.3481, + "step": 3940 + }, + { + "epoch": 0.7316169661048342, + "grad_norm": 0.29098401038664423, + "learning_rate": 0.0003553763103563449, + "loss": 0.3205, + "step": 3950 + }, + { + "epoch": 0.7334691609557326, + "grad_norm": 0.24827960683081182, + "learning_rate": 0.00035471228079522754, + "loss": 0.3653, + "step": 3960 + }, + { + "epoch": 0.7353213558066308, + "grad_norm": 0.21532456030161418, + "learning_rate": 0.0003540473541053161, + "loss": 0.3299, + "step": 3970 + }, + { + "epoch": 0.7371735506575292, + "grad_norm": 0.28516797949078204, + "learning_rate": 0.0003533815359834103, + "loss": 0.3718, + "step": 3980 + }, + { + "epoch": 0.7390257455084275, + "grad_norm": 0.2617620703053819, + "learning_rate": 0.00035271483213394715, + "loss": 0.3505, + "step": 3990 + }, + { + "epoch": 0.7408779403593257, + "grad_norm": 0.27198805201563014, + "learning_rate": 0.000352047248268952, + "loss": 0.3968, + "step": 4000 + }, + { + "epoch": 0.7427301352102241, + "grad_norm": 0.1957730557770133, + "learning_rate": 0.0003513787901079902, + "loss": 0.3647, + "step": 4010 + }, + { + "epoch": 0.7445823300611224, + "grad_norm": 0.2424016899157965, + "learning_rate": 0.0003507094633781173, + "loss": 0.4071, + "step": 4020 + }, + { + "epoch": 0.7464345249120208, + "grad_norm": 0.2513574669580144, + "learning_rate": 0.00035003927381383046, + "loss": 0.3348, + "step": 4030 + }, + { + "epoch": 0.748286719762919, + "grad_norm": 0.2524624117498673, + "learning_rate": 0.00034936822715701945, + "loss": 0.3805, + "step": 4040 + }, + { + "epoch": 0.7501389146138173, + "grad_norm": 0.23903538948524897, + "learning_rate": 0.00034869632915691685, + "loss": 0.335, + "step": 4050 + }, + { + "epoch": 0.7519911094647157, + "grad_norm": 0.18376558979991064, + "learning_rate": 0.0003480235855700495, + "loss": 0.3251, + "step": 4060 + }, + { + "epoch": 0.753843304315614, + "grad_norm": 0.23255076073481523, + "learning_rate": 0.0003473500021601888, + "loss": 0.3706, + "step": 4070 + }, + { + "epoch": 0.7556954991665124, + "grad_norm": 0.26504941120664904, + "learning_rate": 0.0003466755846983012, + "loss": 0.3388, + "step": 4080 + }, + { + "epoch": 0.7575476940174106, + "grad_norm": 0.21513866870033804, + "learning_rate": 0.00034600033896249903, + "loss": 0.3493, + "step": 4090 + }, + { + "epoch": 0.7593998888683089, + "grad_norm": 0.2588933457999632, + "learning_rate": 0.00034532427073799115, + "loss": 0.3335, + "step": 4100 + }, + { + "epoch": 0.7612520837192073, + "grad_norm": 0.22932856457029652, + "learning_rate": 0.0003446473858170328, + "loss": 0.3573, + "step": 4110 + }, + { + "epoch": 0.7631042785701055, + "grad_norm": 0.25882003589945557, + "learning_rate": 0.00034396968999887635, + "loss": 0.3448, + "step": 4120 + }, + { + "epoch": 0.7649564734210039, + "grad_norm": 0.18186372017813182, + "learning_rate": 0.00034329118908972187, + "loss": 0.3451, + "step": 4130 + }, + { + "epoch": 0.7668086682719022, + "grad_norm": 0.2905270964806583, + "learning_rate": 0.00034261188890266674, + "loss": 0.3388, + "step": 4140 + }, + { + "epoch": 0.7686608631228005, + "grad_norm": 0.27875971252061826, + "learning_rate": 0.00034193179525765646, + "loss": 0.3131, + "step": 4150 + }, + { + "epoch": 0.7705130579736988, + "grad_norm": 0.24842087853864708, + "learning_rate": 0.00034125091398143445, + "loss": 0.4291, + "step": 4160 + }, + { + "epoch": 0.7723652528245971, + "grad_norm": 0.2684559843295528, + "learning_rate": 0.00034056925090749214, + "loss": 0.3715, + "step": 4170 + }, + { + "epoch": 0.7742174476754955, + "grad_norm": 0.22463589836430295, + "learning_rate": 0.00033988681187601907, + "loss": 0.4228, + "step": 4180 + }, + { + "epoch": 0.7760696425263938, + "grad_norm": 0.27828743228315045, + "learning_rate": 0.00033920360273385295, + "loss": 0.2931, + "step": 4190 + }, + { + "epoch": 0.777921837377292, + "grad_norm": 0.24380996785281236, + "learning_rate": 0.0003385196293344295, + "loss": 0.4017, + "step": 4200 + }, + { + "epoch": 0.7797740322281904, + "grad_norm": 0.2909979077113848, + "learning_rate": 0.0003378348975377319, + "loss": 0.3481, + "step": 4210 + }, + { + "epoch": 0.7816262270790887, + "grad_norm": 0.23332383664304898, + "learning_rate": 0.0003371494132102414, + "loss": 0.3445, + "step": 4220 + }, + { + "epoch": 0.7834784219299871, + "grad_norm": 0.21450077928300515, + "learning_rate": 0.0003364631822248863, + "loss": 0.3472, + "step": 4230 + }, + { + "epoch": 0.7853306167808853, + "grad_norm": 0.21521239472704395, + "learning_rate": 0.00033577621046099214, + "loss": 0.3326, + "step": 4240 + }, + { + "epoch": 0.7871828116317837, + "grad_norm": 0.21746868050833518, + "learning_rate": 0.00033508850380423107, + "loss": 0.317, + "step": 4250 + }, + { + "epoch": 0.789035006482682, + "grad_norm": 0.25145609268154195, + "learning_rate": 0.00033440006814657123, + "loss": 0.3903, + "step": 4260 + }, + { + "epoch": 0.7908872013335803, + "grad_norm": 0.2493850757271924, + "learning_rate": 0.00033371090938622683, + "loss": 0.376, + "step": 4270 + }, + { + "epoch": 0.7927393961844786, + "grad_norm": 0.27042518686478084, + "learning_rate": 0.00033302103342760717, + "loss": 0.3324, + "step": 4280 + }, + { + "epoch": 0.7945915910353769, + "grad_norm": 0.36372007737066575, + "learning_rate": 0.0003323304461812663, + "loss": 0.2962, + "step": 4290 + }, + { + "epoch": 0.7964437858862753, + "grad_norm": 0.2789450982129661, + "learning_rate": 0.0003316391535638521, + "loss": 0.4018, + "step": 4300 + }, + { + "epoch": 0.7982959807371736, + "grad_norm": 0.30183962763634775, + "learning_rate": 0.00033094716149805587, + "loss": 0.3866, + "step": 4310 + }, + { + "epoch": 0.8001481755880718, + "grad_norm": 0.21612720841935062, + "learning_rate": 0.0003302544759125615, + "loss": 0.4077, + "step": 4320 + }, + { + "epoch": 0.8020003704389702, + "grad_norm": 0.23394333144621351, + "learning_rate": 0.00032956110274199457, + "loss": 0.386, + "step": 4330 + }, + { + "epoch": 0.8038525652898685, + "grad_norm": 0.23944805976592476, + "learning_rate": 0.00032886704792687156, + "loss": 0.2975, + "step": 4340 + }, + { + "epoch": 0.8057047601407669, + "grad_norm": 0.30206829611790686, + "learning_rate": 0.0003281723174135491, + "loss": 0.3464, + "step": 4350 + }, + { + "epoch": 0.8075569549916651, + "grad_norm": 0.25395526533782503, + "learning_rate": 0.00032747691715417297, + "loss": 0.3839, + "step": 4360 + }, + { + "epoch": 0.8094091498425634, + "grad_norm": 0.2701846283890953, + "learning_rate": 0.0003267808531066268, + "loss": 0.3718, + "step": 4370 + }, + { + "epoch": 0.8112613446934618, + "grad_norm": 0.3284423662284243, + "learning_rate": 0.00032608413123448127, + "loss": 0.3123, + "step": 4380 + }, + { + "epoch": 0.81311353954436, + "grad_norm": 0.19093953526607452, + "learning_rate": 0.00032538675750694323, + "loss": 0.3178, + "step": 4390 + }, + { + "epoch": 0.8149657343952584, + "grad_norm": 0.2588745305552011, + "learning_rate": 0.0003246887378988044, + "loss": 0.3364, + "step": 4400 + }, + { + "epoch": 0.8168179292461567, + "grad_norm": 0.2944248033604882, + "learning_rate": 0.00032399007839038974, + "loss": 0.3851, + "step": 4410 + }, + { + "epoch": 0.818670124097055, + "grad_norm": 0.35233338424624305, + "learning_rate": 0.00032329078496750685, + "loss": 0.3935, + "step": 4420 + }, + { + "epoch": 0.8205223189479534, + "grad_norm": 0.2529989683445966, + "learning_rate": 0.00032259086362139444, + "loss": 0.3545, + "step": 4430 + }, + { + "epoch": 0.8223745137988516, + "grad_norm": 0.21890769609197974, + "learning_rate": 0.00032189032034867095, + "loss": 0.3322, + "step": 4440 + }, + { + "epoch": 0.82422670864975, + "grad_norm": 0.2966639221943858, + "learning_rate": 0.00032118916115128317, + "loss": 0.3413, + "step": 4450 + }, + { + "epoch": 0.8260789035006483, + "grad_norm": 0.28138389738354624, + "learning_rate": 0.00032048739203645484, + "loss": 0.3594, + "step": 4460 + }, + { + "epoch": 0.8279310983515465, + "grad_norm": 0.26012433275701663, + "learning_rate": 0.00031978501901663544, + "loss": 0.354, + "step": 4470 + }, + { + "epoch": 0.8297832932024449, + "grad_norm": 0.22288136348571755, + "learning_rate": 0.00031908204810944806, + "loss": 0.3345, + "step": 4480 + }, + { + "epoch": 0.8316354880533432, + "grad_norm": 0.2563012485418534, + "learning_rate": 0.0003183784853376386, + "loss": 0.377, + "step": 4490 + }, + { + "epoch": 0.8334876829042416, + "grad_norm": 0.19175987210580075, + "learning_rate": 0.00031767433672902357, + "loss": 0.378, + "step": 4500 + }, + { + "epoch": 0.8353398777551398, + "grad_norm": 0.27929483171815755, + "learning_rate": 0.0003169696083164387, + "loss": 0.4083, + "step": 4510 + }, + { + "epoch": 0.8371920726060381, + "grad_norm": 0.22806754292261686, + "learning_rate": 0.00031626430613768727, + "loss": 0.2805, + "step": 4520 + }, + { + "epoch": 0.8390442674569365, + "grad_norm": 0.2098902858669142, + "learning_rate": 0.0003155584362354883, + "loss": 0.3046, + "step": 4530 + }, + { + "epoch": 0.8408964623078348, + "grad_norm": 0.22326173310010555, + "learning_rate": 0.0003148520046574248, + "loss": 0.3618, + "step": 4540 + }, + { + "epoch": 0.8427486571587331, + "grad_norm": 0.28432435874722173, + "learning_rate": 0.00031414501745589214, + "loss": 0.3047, + "step": 4550 + }, + { + "epoch": 0.8446008520096314, + "grad_norm": 0.22658460752200546, + "learning_rate": 0.0003134374806880458, + "loss": 0.3075, + "step": 4560 + }, + { + "epoch": 0.8464530468605297, + "grad_norm": 0.2326511532797664, + "learning_rate": 0.00031272940041574985, + "loss": 0.3253, + "step": 4570 + }, + { + "epoch": 0.8483052417114281, + "grad_norm": 0.26196194032003345, + "learning_rate": 0.00031202078270552483, + "loss": 0.3672, + "step": 4580 + }, + { + "epoch": 0.8501574365623263, + "grad_norm": 0.2216415083774707, + "learning_rate": 0.00031131163362849563, + "loss": 0.361, + "step": 4590 + }, + { + "epoch": 0.8520096314132247, + "grad_norm": 0.31309200526058145, + "learning_rate": 0.0003106019592603401, + "loss": 0.4028, + "step": 4600 + }, + { + "epoch": 0.853861826264123, + "grad_norm": 0.30199878040880657, + "learning_rate": 0.000309891765681236, + "loss": 0.3254, + "step": 4610 + }, + { + "epoch": 0.8557140211150213, + "grad_norm": 0.2657478340310185, + "learning_rate": 0.0003091810589758099, + "loss": 0.3965, + "step": 4620 + }, + { + "epoch": 0.8575662159659196, + "grad_norm": 0.26801220601237896, + "learning_rate": 0.0003084698452330844, + "loss": 0.2717, + "step": 4630 + }, + { + "epoch": 0.8594184108168179, + "grad_norm": 0.2691236527559968, + "learning_rate": 0.0003077581305464263, + "loss": 0.3449, + "step": 4640 + }, + { + "epoch": 0.8612706056677163, + "grad_norm": 0.250751208793887, + "learning_rate": 0.0003070459210134941, + "loss": 0.3398, + "step": 4650 + }, + { + "epoch": 0.8631228005186146, + "grad_norm": 0.2598136376324884, + "learning_rate": 0.0003063332227361861, + "loss": 0.379, + "step": 4660 + }, + { + "epoch": 0.8649749953695128, + "grad_norm": 0.2320138289175307, + "learning_rate": 0.00030569138145676144, + "loss": 0.4172, + "step": 4670 + }, + { + "epoch": 0.8668271902204112, + "grad_norm": 0.2544457573722289, + "learning_rate": 0.0003049777713908237, + "loss": 0.3363, + "step": 4680 + }, + { + "epoch": 0.8686793850713095, + "grad_norm": 0.21755454053442072, + "learning_rate": 0.000304263690299507, + "loss": 0.3903, + "step": 4690 + }, + { + "epoch": 0.8705315799222079, + "grad_norm": 0.1876698563670142, + "learning_rate": 0.0003035491443007442, + "loss": 0.3813, + "step": 4700 + }, + { + "epoch": 0.8723837747731061, + "grad_norm": 0.23125086361592628, + "learning_rate": 0.0003028341395164513, + "loss": 0.326, + "step": 4710 + }, + { + "epoch": 0.8742359696240044, + "grad_norm": 0.24526039999109062, + "learning_rate": 0.0003021186820724752, + "loss": 0.3818, + "step": 4720 + }, + { + "epoch": 0.8760881644749028, + "grad_norm": 0.23276472003991475, + "learning_rate": 0.0003014027780985406, + "loss": 0.3286, + "step": 4730 + }, + { + "epoch": 0.8779403593258011, + "grad_norm": 0.2879683324317072, + "learning_rate": 0.00030068643372819804, + "loss": 0.3563, + "step": 4740 + }, + { + "epoch": 0.8797925541766994, + "grad_norm": 0.19871362889489913, + "learning_rate": 0.0002999696550987713, + "loss": 0.3271, + "step": 4750 + }, + { + "epoch": 0.8816447490275977, + "grad_norm": 0.2749990294223314, + "learning_rate": 0.00029925244835130466, + "loss": 0.36, + "step": 4760 + }, + { + "epoch": 0.883496943878496, + "grad_norm": 0.19581874215709116, + "learning_rate": 0.00029853481963051015, + "loss": 0.3869, + "step": 4770 + }, + { + "epoch": 0.8853491387293944, + "grad_norm": 0.25690630291268424, + "learning_rate": 0.0002978167750847153, + "loss": 0.3291, + "step": 4780 + }, + { + "epoch": 0.8872013335802926, + "grad_norm": 0.23380636858065187, + "learning_rate": 0.0002970983208658101, + "loss": 0.3148, + "step": 4790 + }, + { + "epoch": 0.889053528431191, + "grad_norm": 0.27392706669357925, + "learning_rate": 0.00029637946312919443, + "loss": 0.3471, + "step": 4800 + }, + { + "epoch": 0.8909057232820893, + "grad_norm": 0.262683330886347, + "learning_rate": 0.00029566020803372544, + "loss": 0.3581, + "step": 4810 + }, + { + "epoch": 0.8927579181329875, + "grad_norm": 0.1967433279025824, + "learning_rate": 0.0002949405617416647, + "loss": 0.3244, + "step": 4820 + }, + { + "epoch": 0.8946101129838859, + "grad_norm": 0.21893101415992228, + "learning_rate": 0.00029422053041862524, + "loss": 0.2418, + "step": 4830 + }, + { + "epoch": 0.8964623078347842, + "grad_norm": 0.3050479264269311, + "learning_rate": 0.000293500120233519, + "loss": 0.3154, + "step": 4840 + }, + { + "epoch": 0.8983145026856826, + "grad_norm": 0.22098931345400527, + "learning_rate": 0.00029277933735850366, + "loss": 0.3875, + "step": 4850 + }, + { + "epoch": 0.9001666975365809, + "grad_norm": 0.18665489074313069, + "learning_rate": 0.0002920581879689302, + "loss": 0.3203, + "step": 4860 + }, + { + "epoch": 0.9020188923874791, + "grad_norm": 0.22546452927540434, + "learning_rate": 0.00029133667824328944, + "loss": 0.3174, + "step": 4870 + }, + { + "epoch": 0.9038710872383775, + "grad_norm": 0.273911749633942, + "learning_rate": 0.0002906148143631597, + "loss": 0.4109, + "step": 4880 + }, + { + "epoch": 0.9057232820892758, + "grad_norm": 0.2862382822755954, + "learning_rate": 0.0002898926025131534, + "loss": 0.3438, + "step": 4890 + }, + { + "epoch": 0.9075754769401742, + "grad_norm": 0.2256784413424552, + "learning_rate": 0.0002891700488808641, + "loss": 0.4231, + "step": 4900 + }, + { + "epoch": 0.9094276717910724, + "grad_norm": 0.25475613390595164, + "learning_rate": 0.0002884471596568138, + "loss": 0.311, + "step": 4910 + }, + { + "epoch": 0.9112798666419707, + "grad_norm": 0.22040988223176197, + "learning_rate": 0.0002877239410343995, + "loss": 0.3609, + "step": 4920 + }, + { + "epoch": 0.9131320614928691, + "grad_norm": 0.21405974357001087, + "learning_rate": 0.0002870003992098406, + "loss": 0.3199, + "step": 4930 + }, + { + "epoch": 0.9149842563437673, + "grad_norm": 0.22165830710412393, + "learning_rate": 0.00028627654038212535, + "loss": 0.2932, + "step": 4940 + }, + { + "epoch": 0.9168364511946657, + "grad_norm": 0.2539298146212295, + "learning_rate": 0.000285552370752958, + "loss": 0.3203, + "step": 4950 + }, + { + "epoch": 0.918688646045564, + "grad_norm": 0.2519284526672049, + "learning_rate": 0.0002848278965267057, + "loss": 0.299, + "step": 4960 + }, + { + "epoch": 0.9205408408964623, + "grad_norm": 0.21558726442907455, + "learning_rate": 0.000284103123910345, + "loss": 0.3227, + "step": 4970 + }, + { + "epoch": 0.9223930357473606, + "grad_norm": 0.2314909389156984, + "learning_rate": 0.00028337805911340914, + "loss": 0.3018, + "step": 4980 + }, + { + "epoch": 0.9242452305982589, + "grad_norm": 0.278811225532839, + "learning_rate": 0.00028265270834793466, + "loss": 0.3002, + "step": 4990 + }, + { + "epoch": 0.9260974254491573, + "grad_norm": 0.21464467115282912, + "learning_rate": 0.0002819270778284081, + "loss": 0.2984, + "step": 5000 + }, + { + "epoch": 0.9279496203000556, + "grad_norm": 0.21949485740442687, + "learning_rate": 0.0002812011737717127, + "loss": 0.3034, + "step": 5010 + }, + { + "epoch": 0.9298018151509538, + "grad_norm": 0.22922734336855702, + "learning_rate": 0.0002804750023970753, + "loss": 0.3648, + "step": 5020 + }, + { + "epoch": 0.9316540100018522, + "grad_norm": 0.2807666058464406, + "learning_rate": 0.00027974856992601314, + "loss": 0.347, + "step": 5030 + }, + { + "epoch": 0.9335062048527505, + "grad_norm": 0.21380147064458355, + "learning_rate": 0.00027902188258228033, + "loss": 0.2868, + "step": 5040 + }, + { + "epoch": 0.9353583997036489, + "grad_norm": 0.23226632039182726, + "learning_rate": 0.00027829494659181454, + "loss": 0.3373, + "step": 5050 + }, + { + "epoch": 0.9372105945545471, + "grad_norm": 0.16664382791007723, + "learning_rate": 0.0002775677681826838, + "loss": 0.3425, + "step": 5060 + }, + { + "epoch": 0.9390627894054454, + "grad_norm": 0.2131603970341897, + "learning_rate": 0.00027684035358503315, + "loss": 0.356, + "step": 5070 + }, + { + "epoch": 0.9409149842563438, + "grad_norm": 0.2943760673928641, + "learning_rate": 0.00027611270903103095, + "loss": 0.3573, + "step": 5080 + }, + { + "epoch": 0.9427671791072421, + "grad_norm": 0.2862566121817152, + "learning_rate": 0.00027538484075481613, + "loss": 0.4255, + "step": 5090 + }, + { + "epoch": 0.9446193739581404, + "grad_norm": 0.231901510250299, + "learning_rate": 0.00027465675499244396, + "loss": 0.3407, + "step": 5100 + }, + { + "epoch": 0.9464715688090387, + "grad_norm": 0.2476530639942114, + "learning_rate": 0.0002739284579818333, + "loss": 0.2723, + "step": 5110 + }, + { + "epoch": 0.948323763659937, + "grad_norm": 0.21350073532203115, + "learning_rate": 0.0002731999559627127, + "loss": 0.3461, + "step": 5120 + }, + { + "epoch": 0.9501759585108354, + "grad_norm": 0.2002031483905575, + "learning_rate": 0.0002724712551765673, + "loss": 0.3514, + "step": 5130 + }, + { + "epoch": 0.9520281533617336, + "grad_norm": 0.2370797517823577, + "learning_rate": 0.00027174236186658515, + "loss": 0.3378, + "step": 5140 + }, + { + "epoch": 0.953880348212632, + "grad_norm": 0.21585863872901473, + "learning_rate": 0.0002710132822776037, + "loss": 0.3321, + "step": 5150 + }, + { + "epoch": 0.9557325430635303, + "grad_norm": 0.26386608394124156, + "learning_rate": 0.0002702840226560564, + "loss": 0.3436, + "step": 5160 + }, + { + "epoch": 0.9575847379144286, + "grad_norm": 0.2890408109766508, + "learning_rate": 0.00026955458924991923, + "loss": 0.401, + "step": 5170 + }, + { + "epoch": 0.9594369327653269, + "grad_norm": 0.25751071532225056, + "learning_rate": 0.00026882498830865673, + "loss": 0.3359, + "step": 5180 + }, + { + "epoch": 0.9612891276162252, + "grad_norm": 0.1908489549011557, + "learning_rate": 0.00026809522608316926, + "loss": 0.3446, + "step": 5190 + }, + { + "epoch": 0.9631413224671236, + "grad_norm": 0.2654943827624779, + "learning_rate": 0.0002673653088257388, + "loss": 0.3226, + "step": 5200 + }, + { + "epoch": 0.9649935173180219, + "grad_norm": 0.2090532023246876, + "learning_rate": 0.00026663524278997534, + "loss": 0.3627, + "step": 5210 + }, + { + "epoch": 0.9668457121689201, + "grad_norm": 0.1928560578254249, + "learning_rate": 0.00026590503423076404, + "loss": 0.3829, + "step": 5220 + }, + { + "epoch": 0.9686979070198185, + "grad_norm": 0.2669070196379663, + "learning_rate": 0.0002651746894042108, + "loss": 0.3034, + "step": 5230 + }, + { + "epoch": 0.9705501018707168, + "grad_norm": 0.30560885950305455, + "learning_rate": 0.00026444421456758887, + "loss": 0.3662, + "step": 5240 + }, + { + "epoch": 0.9724022967216152, + "grad_norm": 0.26179376779317864, + "learning_rate": 0.00026371361597928586, + "loss": 0.3277, + "step": 5250 + }, + { + "epoch": 0.9742544915725134, + "grad_norm": 0.22773579499385666, + "learning_rate": 0.0002629828998987491, + "loss": 0.3227, + "step": 5260 + }, + { + "epoch": 0.9761066864234117, + "grad_norm": 0.22913911318822955, + "learning_rate": 0.0002622520725864328, + "loss": 0.4155, + "step": 5270 + }, + { + "epoch": 0.9779588812743101, + "grad_norm": 0.26745430474124415, + "learning_rate": 0.0002615211403037441, + "loss": 0.3134, + "step": 5280 + }, + { + "epoch": 0.9798110761252083, + "grad_norm": 0.18747224024104983, + "learning_rate": 0.00026079010931298965, + "loss": 0.3352, + "step": 5290 + }, + { + "epoch": 0.9816632709761067, + "grad_norm": 0.2507770069072283, + "learning_rate": 0.0002600589858773216, + "loss": 0.2841, + "step": 5300 + }, + { + "epoch": 0.983515465827005, + "grad_norm": 0.2320843718590129, + "learning_rate": 0.00025932777626068405, + "loss": 0.2901, + "step": 5310 + }, + { + "epoch": 0.9853676606779033, + "grad_norm": 0.25694442462488337, + "learning_rate": 0.0002585964867277597, + "loss": 0.3655, + "step": 5320 + }, + { + "epoch": 0.9872198555288016, + "grad_norm": 0.1946752572256077, + "learning_rate": 0.00025786512354391585, + "loss": 0.3399, + "step": 5330 + }, + { + "epoch": 0.9890720503796999, + "grad_norm": 0.1531862751587864, + "learning_rate": 0.00025713369297515056, + "loss": 0.3309, + "step": 5340 + }, + { + "epoch": 0.9909242452305983, + "grad_norm": 0.23979500779092153, + "learning_rate": 0.00025640220128803965, + "loss": 0.3476, + "step": 5350 + }, + { + "epoch": 0.9927764400814966, + "grad_norm": 0.22955793113305528, + "learning_rate": 0.00025567065474968226, + "loss": 0.34, + "step": 5360 + }, + { + "epoch": 0.9946286349323948, + "grad_norm": 0.26774128565687644, + "learning_rate": 0.00025501222114748204, + "loss": 0.3265, + "step": 5370 + }, + { + "epoch": 0.9964808297832932, + "grad_norm": 0.2331087333203837, + "learning_rate": 0.00025428058765925466, + "loss": 0.2761, + "step": 5380 + }, + { + "epoch": 0.9983330246341915, + "grad_norm": 0.24526043917044132, + "learning_rate": 0.00025354891749683386, + "loss": 0.3495, + "step": 5390 + }, + { + "epoch": 1.0001852194850898, + "grad_norm": 0.2031173709527516, + "learning_rate": 0.0002528172169288478, + "loss": 0.3272, + "step": 5400 + }, + { + "epoch": 1.0020374143359883, + "grad_norm": 0.2229851857312578, + "learning_rate": 0.0002520854922241855, + "loss": 0.2226, + "step": 5410 + }, + { + "epoch": 1.0038896091868865, + "grad_norm": 0.23237399050753563, + "learning_rate": 0.0002513537496519425, + "loss": 0.2502, + "step": 5420 + }, + { + "epoch": 1.0057418040377848, + "grad_norm": 0.22482059046916258, + "learning_rate": 0.00025062199548136767, + "loss": 0.2567, + "step": 5430 + }, + { + "epoch": 1.007593998888683, + "grad_norm": 0.19384034239788644, + "learning_rate": 0.00024989023598180886, + "loss": 0.231, + "step": 5440 + }, + { + "epoch": 1.0094461937395813, + "grad_norm": 0.18371330112888887, + "learning_rate": 0.0002491584774226599, + "loss": 0.2927, + "step": 5450 + }, + { + "epoch": 1.0112983885904798, + "grad_norm": 0.21546778676484551, + "learning_rate": 0.0002484267260733065, + "loss": 0.265, + "step": 5460 + }, + { + "epoch": 1.013150583441378, + "grad_norm": 0.14298891444963896, + "learning_rate": 0.0002476949882030726, + "loss": 0.2211, + "step": 5470 + }, + { + "epoch": 1.0150027782922764, + "grad_norm": 0.25187217178584165, + "learning_rate": 0.0002469632700811665, + "loss": 0.2581, + "step": 5480 + }, + { + "epoch": 1.0168549731431746, + "grad_norm": 0.31946252092124755, + "learning_rate": 0.00024623157797662757, + "loss": 0.2171, + "step": 5490 + }, + { + "epoch": 1.018707167994073, + "grad_norm": 0.20257626106772428, + "learning_rate": 0.000245499918158272, + "loss": 0.21, + "step": 5500 + }, + { + "epoch": 1.0205593628449714, + "grad_norm": 0.30792020448282925, + "learning_rate": 0.00024476829689463965, + "loss": 0.2199, + "step": 5510 + }, + { + "epoch": 1.0224115576958697, + "grad_norm": 0.2359106076314458, + "learning_rate": 0.0002440367204539398, + "loss": 0.2221, + "step": 5520 + }, + { + "epoch": 1.024263752546768, + "grad_norm": 0.2642461112213505, + "learning_rate": 0.00024330519510399774, + "loss": 0.287, + "step": 5530 + }, + { + "epoch": 1.0261159473976662, + "grad_norm": 0.25013845200803386, + "learning_rate": 0.00024257372711220134, + "loss": 0.2578, + "step": 5540 + }, + { + "epoch": 1.0279681422485645, + "grad_norm": 0.26551429905341034, + "learning_rate": 0.00024184232274544672, + "loss": 0.2509, + "step": 5550 + }, + { + "epoch": 1.029820337099463, + "grad_norm": 0.2070332092773878, + "learning_rate": 0.00024111098827008494, + "loss": 0.2202, + "step": 5560 + }, + { + "epoch": 1.0316725319503612, + "grad_norm": 0.21040587853785286, + "learning_rate": 0.00024037972995186838, + "loss": 0.2858, + "step": 5570 + }, + { + "epoch": 1.0335247268012595, + "grad_norm": 0.21864583485000008, + "learning_rate": 0.00023964855405589689, + "loss": 0.2114, + "step": 5580 + }, + { + "epoch": 1.0353769216521578, + "grad_norm": 0.21646010024279735, + "learning_rate": 0.00023891746684656412, + "loss": 0.2519, + "step": 5590 + }, + { + "epoch": 1.037229116503056, + "grad_norm": 0.31512168932825474, + "learning_rate": 0.00023818647458750388, + "loss": 0.2967, + "step": 5600 + }, + { + "epoch": 1.0390813113539545, + "grad_norm": 0.20525167225456686, + "learning_rate": 0.00023745558354153654, + "loss": 0.2591, + "step": 5610 + }, + { + "epoch": 1.0409335062048528, + "grad_norm": 0.23384175420672978, + "learning_rate": 0.0002367247999706154, + "loss": 0.2236, + "step": 5620 + }, + { + "epoch": 1.042785701055751, + "grad_norm": 0.24586451573414675, + "learning_rate": 0.00023599413013577277, + "loss": 0.2807, + "step": 5630 + }, + { + "epoch": 1.0446378959066493, + "grad_norm": 0.31412889304572406, + "learning_rate": 0.00023526358029706665, + "loss": 0.2676, + "step": 5640 + }, + { + "epoch": 1.0464900907575476, + "grad_norm": 0.157853905207218, + "learning_rate": 0.00023453315671352693, + "loss": 0.2769, + "step": 5650 + }, + { + "epoch": 1.0483422856084461, + "grad_norm": 0.2229105615382073, + "learning_rate": 0.00023380286564310176, + "loss": 0.2735, + "step": 5660 + }, + { + "epoch": 1.0501944804593444, + "grad_norm": 0.26127473765870846, + "learning_rate": 0.0002330727133426041, + "loss": 0.3007, + "step": 5670 + }, + { + "epoch": 1.0520466753102427, + "grad_norm": 0.3906751493250249, + "learning_rate": 0.00023234270606765778, + "loss": 0.2809, + "step": 5680 + }, + { + "epoch": 1.053898870161141, + "grad_norm": 0.2398049248934978, + "learning_rate": 0.00023161285007264446, + "loss": 0.2144, + "step": 5690 + }, + { + "epoch": 1.0557510650120392, + "grad_norm": 0.24411940105501112, + "learning_rate": 0.0002308831516106494, + "loss": 0.223, + "step": 5700 + }, + { + "epoch": 1.0576032598629377, + "grad_norm": 0.2547297157594742, + "learning_rate": 0.0002301536169334082, + "loss": 0.2458, + "step": 5710 + }, + { + "epoch": 1.059455454713836, + "grad_norm": 0.18393906015457895, + "learning_rate": 0.00022942425229125328, + "loss": 0.248, + "step": 5720 + }, + { + "epoch": 1.0613076495647342, + "grad_norm": 0.24279551434371524, + "learning_rate": 0.0002286950639330604, + "loss": 0.2709, + "step": 5730 + }, + { + "epoch": 1.0631598444156325, + "grad_norm": 0.23381376758753333, + "learning_rate": 0.00022796605810619487, + "loss": 0.2361, + "step": 5740 + }, + { + "epoch": 1.0650120392665308, + "grad_norm": 0.24452694586413046, + "learning_rate": 0.00022723724105645814, + "loss": 0.2076, + "step": 5750 + }, + { + "epoch": 1.0668642341174293, + "grad_norm": 0.30441717560616044, + "learning_rate": 0.00022650861902803426, + "loss": 0.2922, + "step": 5760 + }, + { + "epoch": 1.0687164289683275, + "grad_norm": 0.2588550928583629, + "learning_rate": 0.00022578019826343656, + "loss": 0.2687, + "step": 5770 + }, + { + "epoch": 1.0705686238192258, + "grad_norm": 0.17900093913620954, + "learning_rate": 0.00022505198500345403, + "loss": 0.2467, + "step": 5780 + }, + { + "epoch": 1.072420818670124, + "grad_norm": 0.2492431472220246, + "learning_rate": 0.00022432398548709767, + "loss": 0.2938, + "step": 5790 + }, + { + "epoch": 1.0742730135210223, + "grad_norm": 0.21358503411722063, + "learning_rate": 0.00022359620595154743, + "loss": 0.2038, + "step": 5800 + }, + { + "epoch": 1.0761252083719208, + "grad_norm": 0.28309019763963955, + "learning_rate": 0.00022286865263209833, + "loss": 0.2905, + "step": 5810 + }, + { + "epoch": 1.077977403222819, + "grad_norm": 0.21729388154855128, + "learning_rate": 0.00022214133176210756, + "loss": 0.226, + "step": 5820 + }, + { + "epoch": 1.0798295980737174, + "grad_norm": 0.18775475682209616, + "learning_rate": 0.0002214142495729405, + "loss": 0.2762, + "step": 5830 + }, + { + "epoch": 1.0816817929246156, + "grad_norm": 0.19069211253783463, + "learning_rate": 0.00022068741229391777, + "loss": 0.2256, + "step": 5840 + }, + { + "epoch": 1.083533987775514, + "grad_norm": 0.25813186890444373, + "learning_rate": 0.00021996082615226176, + "loss": 0.2409, + "step": 5850 + }, + { + "epoch": 1.0853861826264124, + "grad_norm": 0.19945938160620094, + "learning_rate": 0.00021923449737304312, + "loss": 0.2536, + "step": 5860 + }, + { + "epoch": 1.0872383774773107, + "grad_norm": 0.25882839571818395, + "learning_rate": 0.00021850843217912757, + "loss": 0.277, + "step": 5870 + }, + { + "epoch": 1.089090572328209, + "grad_norm": 0.3164832568487736, + "learning_rate": 0.0002177826367911225, + "loss": 0.2705, + "step": 5880 + }, + { + "epoch": 1.0909427671791072, + "grad_norm": 0.26233993949922385, + "learning_rate": 0.0002170571174273238, + "loss": 0.2524, + "step": 5890 + }, + { + "epoch": 1.0927949620300055, + "grad_norm": 0.21974259388964484, + "learning_rate": 0.0002163318803036624, + "loss": 0.2304, + "step": 5900 + }, + { + "epoch": 1.094647156880904, + "grad_norm": 0.2423119808479642, + "learning_rate": 0.00021560693163365127, + "loss": 0.2864, + "step": 5910 + }, + { + "epoch": 1.0964993517318022, + "grad_norm": 0.23788077135736266, + "learning_rate": 0.00021488227762833187, + "loss": 0.223, + "step": 5920 + }, + { + "epoch": 1.0983515465827005, + "grad_norm": 0.2626939992945942, + "learning_rate": 0.00021415792449622128, + "loss": 0.2174, + "step": 5930 + }, + { + "epoch": 1.1002037414335988, + "grad_norm": 0.15991056421689562, + "learning_rate": 0.0002134338784432587, + "loss": 0.2381, + "step": 5940 + }, + { + "epoch": 1.102055936284497, + "grad_norm": 0.20700833727267778, + "learning_rate": 0.00021271014567275239, + "loss": 0.2646, + "step": 5950 + }, + { + "epoch": 1.1039081311353955, + "grad_norm": 0.3351339504582773, + "learning_rate": 0.00021198673238532665, + "loss": 0.2484, + "step": 5960 + }, + { + "epoch": 1.1057603259862938, + "grad_norm": 0.25621425870572345, + "learning_rate": 0.00021126364477886848, + "loss": 0.2078, + "step": 5970 + }, + { + "epoch": 1.107612520837192, + "grad_norm": 0.23131050803651781, + "learning_rate": 0.00021054088904847476, + "loss": 0.2254, + "step": 5980 + }, + { + "epoch": 1.1094647156880904, + "grad_norm": 0.18439721493846953, + "learning_rate": 0.0002098184713863987, + "loss": 0.2095, + "step": 5990 + }, + { + "epoch": 1.1113169105389886, + "grad_norm": 0.2388500241914586, + "learning_rate": 0.00020909639798199754, + "loss": 0.2091, + "step": 6000 + }, + { + "epoch": 1.1131691053898871, + "grad_norm": 0.21529124736985356, + "learning_rate": 0.00020837467502167868, + "loss": 0.2167, + "step": 6010 + }, + { + "epoch": 1.1150213002407854, + "grad_norm": 0.16618163554721885, + "learning_rate": 0.0002076533086888472, + "loss": 0.2104, + "step": 6020 + }, + { + "epoch": 1.1168734950916837, + "grad_norm": 0.33925928207566014, + "learning_rate": 0.00020693230516385266, + "loss": 0.2119, + "step": 6030 + }, + { + "epoch": 1.118725689942582, + "grad_norm": 0.1826830206402772, + "learning_rate": 0.0002062116706239365, + "loss": 0.2462, + "step": 6040 + }, + { + "epoch": 1.1205778847934802, + "grad_norm": 0.19046785383617137, + "learning_rate": 0.00020549141124317865, + "loss": 0.2117, + "step": 6050 + }, + { + "epoch": 1.1224300796443787, + "grad_norm": 0.24622926500228018, + "learning_rate": 0.00020477153319244478, + "loss": 0.227, + "step": 6060 + }, + { + "epoch": 1.124282274495277, + "grad_norm": 0.2165508639382145, + "learning_rate": 0.00020405204263933375, + "loss": 0.2638, + "step": 6070 + }, + { + "epoch": 1.1261344693461752, + "grad_norm": 0.23498687913366198, + "learning_rate": 0.00020333294574812415, + "loss": 0.2281, + "step": 6080 + }, + { + "epoch": 1.1279866641970735, + "grad_norm": 0.19311160739289338, + "learning_rate": 0.00020261424867972226, + "loss": 0.2159, + "step": 6090 + }, + { + "epoch": 1.1298388590479718, + "grad_norm": 0.20569897318234276, + "learning_rate": 0.00020189595759160855, + "loss": 0.2557, + "step": 6100 + }, + { + "epoch": 1.1316910538988703, + "grad_norm": 0.1637570670386419, + "learning_rate": 0.00020117807863778537, + "loss": 0.2231, + "step": 6110 + }, + { + "epoch": 1.1335432487497685, + "grad_norm": 0.26014467806402464, + "learning_rate": 0.000200460617968724, + "loss": 0.286, + "step": 6120 + }, + { + "epoch": 1.1353954436006668, + "grad_norm": 0.2505673154655342, + "learning_rate": 0.00019974358173131202, + "loss": 0.2853, + "step": 6130 + }, + { + "epoch": 1.137247638451565, + "grad_norm": 0.22347929448158552, + "learning_rate": 0.00019902697606880089, + "loss": 0.2677, + "step": 6140 + }, + { + "epoch": 1.1390998333024633, + "grad_norm": 0.20920726669707854, + "learning_rate": 0.00019831080712075268, + "loss": 0.244, + "step": 6150 + }, + { + "epoch": 1.1409520281533618, + "grad_norm": 0.20688915094296348, + "learning_rate": 0.00019759508102298846, + "loss": 0.2327, + "step": 6160 + }, + { + "epoch": 1.14280422300426, + "grad_norm": 0.25157909739969075, + "learning_rate": 0.00019687980390753465, + "loss": 0.2485, + "step": 6170 + }, + { + "epoch": 1.1446564178551584, + "grad_norm": 0.23866241222091628, + "learning_rate": 0.00019616498190257121, + "loss": 0.2492, + "step": 6180 + }, + { + "epoch": 1.1465086127060566, + "grad_norm": 0.264337208089594, + "learning_rate": 0.00019545062113237875, + "loss": 0.2758, + "step": 6190 + }, + { + "epoch": 1.148360807556955, + "grad_norm": 0.25587094035952673, + "learning_rate": 0.00019473672771728648, + "loss": 0.2129, + "step": 6200 + }, + { + "epoch": 1.1502130024078534, + "grad_norm": 0.16128043145453166, + "learning_rate": 0.00019402330777361934, + "loss": 0.2231, + "step": 6210 + }, + { + "epoch": 1.1520651972587517, + "grad_norm": 0.233999456400375, + "learning_rate": 0.0001933103674136458, + "loss": 0.2443, + "step": 6220 + }, + { + "epoch": 1.15391739210965, + "grad_norm": 0.23923089697365066, + "learning_rate": 0.00019259791274552548, + "loss": 0.2532, + "step": 6230 + }, + { + "epoch": 1.1557695869605482, + "grad_norm": 0.18310940478929233, + "learning_rate": 0.00019188594987325675, + "loss": 0.2084, + "step": 6240 + }, + { + "epoch": 1.1576217818114465, + "grad_norm": 0.20715212646569164, + "learning_rate": 0.00019117448489662468, + "loss": 0.2315, + "step": 6250 + }, + { + "epoch": 1.159473976662345, + "grad_norm": 0.16666508872746613, + "learning_rate": 0.00019046352391114836, + "loss": 0.2214, + "step": 6260 + }, + { + "epoch": 1.1613261715132432, + "grad_norm": 0.19036221587749683, + "learning_rate": 0.000189753073008029, + "loss": 0.2011, + "step": 6270 + }, + { + "epoch": 1.1631783663641415, + "grad_norm": 0.18630573209584733, + "learning_rate": 0.00018904313827409764, + "loss": 0.2081, + "step": 6280 + }, + { + "epoch": 1.1650305612150398, + "grad_norm": 0.20378341723916718, + "learning_rate": 0.0001883337257917631, + "loss": 0.2573, + "step": 6290 + }, + { + "epoch": 1.166882756065938, + "grad_norm": 0.24764507328618723, + "learning_rate": 0.00018762484163895962, + "loss": 0.2245, + "step": 6300 + }, + { + "epoch": 1.1687349509168365, + "grad_norm": 0.2536985360849042, + "learning_rate": 0.00018691649188909494, + "loss": 0.2427, + "step": 6310 + }, + { + "epoch": 1.1705871457677348, + "grad_norm": 0.22553827575055346, + "learning_rate": 0.00018620868261099856, + "loss": 0.2556, + "step": 6320 + }, + { + "epoch": 1.172439340618633, + "grad_norm": 0.238267227934858, + "learning_rate": 0.00018550141986886914, + "loss": 0.2079, + "step": 6330 + }, + { + "epoch": 1.1742915354695314, + "grad_norm": 0.24364164673526545, + "learning_rate": 0.00018479470972222295, + "loss": 0.2377, + "step": 6340 + }, + { + "epoch": 1.1761437303204296, + "grad_norm": 0.23684110576656128, + "learning_rate": 0.00018408855822584186, + "loss": 0.2106, + "step": 6350 + }, + { + "epoch": 1.1779959251713281, + "grad_norm": 0.24133180260347029, + "learning_rate": 0.0001833829714297216, + "loss": 0.2325, + "step": 6360 + }, + { + "epoch": 1.1798481200222264, + "grad_norm": 0.27161152313481657, + "learning_rate": 0.0001826779553790196, + "loss": 0.2816, + "step": 6370 + }, + { + "epoch": 1.1817003148731247, + "grad_norm": 0.2549979606684111, + "learning_rate": 0.0001819735161140035, + "loss": 0.2716, + "step": 6380 + }, + { + "epoch": 1.183552509724023, + "grad_norm": 0.2171602609914945, + "learning_rate": 0.0001812696596699992, + "loss": 0.1919, + "step": 6390 + }, + { + "epoch": 1.1854047045749212, + "grad_norm": 0.2426365201904578, + "learning_rate": 0.00018056639207733943, + "loss": 0.1937, + "step": 6400 + }, + { + "epoch": 1.1872568994258197, + "grad_norm": 0.23103167647591963, + "learning_rate": 0.0001798637193613118, + "loss": 0.2212, + "step": 6410 + }, + { + "epoch": 1.189109094276718, + "grad_norm": 0.18152043318271277, + "learning_rate": 0.00017916164754210723, + "loss": 0.2525, + "step": 6420 + }, + { + "epoch": 1.1909612891276162, + "grad_norm": 0.2404169525253988, + "learning_rate": 0.00017846018263476844, + "loss": 0.2365, + "step": 6430 + }, + { + "epoch": 1.1928134839785145, + "grad_norm": 0.2527427714001698, + "learning_rate": 0.00017775933064913838, + "loss": 0.2382, + "step": 6440 + }, + { + "epoch": 1.1946656788294128, + "grad_norm": 0.2504119633783523, + "learning_rate": 0.0001770590975898089, + "loss": 0.2435, + "step": 6450 + }, + { + "epoch": 1.1965178736803113, + "grad_norm": 0.21122876356534948, + "learning_rate": 0.0001763594894560689, + "loss": 0.2182, + "step": 6460 + }, + { + "epoch": 1.1983700685312095, + "grad_norm": 0.17197814060082, + "learning_rate": 0.00017566051224185357, + "loss": 0.2316, + "step": 6470 + }, + { + "epoch": 1.2002222633821078, + "grad_norm": 0.2261749683499797, + "learning_rate": 0.0001749621719356923, + "loss": 0.2834, + "step": 6480 + }, + { + "epoch": 1.202074458233006, + "grad_norm": 0.18709901189179085, + "learning_rate": 0.00017426447452065786, + "loss": 0.2329, + "step": 6490 + }, + { + "epoch": 1.2039266530839043, + "grad_norm": 0.22261464085835025, + "learning_rate": 0.00017356742597431503, + "loss": 0.2294, + "step": 6500 + }, + { + "epoch": 1.2057788479348028, + "grad_norm": 0.1562966068716981, + "learning_rate": 0.0001728710322686694, + "loss": 0.2676, + "step": 6510 + }, + { + "epoch": 1.207631042785701, + "grad_norm": 0.20080366502853164, + "learning_rate": 0.00017217529937011612, + "loss": 0.2034, + "step": 6520 + }, + { + "epoch": 1.2094832376365994, + "grad_norm": 0.2488017093046758, + "learning_rate": 0.00017148023323938877, + "loss": 0.2576, + "step": 6530 + }, + { + "epoch": 1.2113354324874976, + "grad_norm": 0.3018899089016778, + "learning_rate": 0.00017078583983150852, + "loss": 0.2521, + "step": 6540 + }, + { + "epoch": 1.213187627338396, + "grad_norm": 0.21650035591018305, + "learning_rate": 0.00017009212509573273, + "loss": 0.1992, + "step": 6550 + }, + { + "epoch": 1.2150398221892944, + "grad_norm": 0.18604059543117943, + "learning_rate": 0.00016939909497550455, + "loss": 0.2145, + "step": 6560 + }, + { + "epoch": 1.2168920170401927, + "grad_norm": 0.13425561299908903, + "learning_rate": 0.0001687067554084012, + "loss": 0.2121, + "step": 6570 + }, + { + "epoch": 1.218744211891091, + "grad_norm": 0.15061326471247105, + "learning_rate": 0.00016801511232608388, + "loss": 0.2093, + "step": 6580 + }, + { + "epoch": 1.2205964067419892, + "grad_norm": 0.18586921295904735, + "learning_rate": 0.00016732417165424645, + "loss": 0.2442, + "step": 6590 + }, + { + "epoch": 1.2224486015928875, + "grad_norm": 0.1947265751683096, + "learning_rate": 0.00016663393931256484, + "loss": 0.1964, + "step": 6600 + }, + { + "epoch": 1.224300796443786, + "grad_norm": 0.3014541141949089, + "learning_rate": 0.00016594442121464648, + "loss": 0.2539, + "step": 6610 + }, + { + "epoch": 1.2261529912946842, + "grad_norm": 0.2665331923593494, + "learning_rate": 0.00016525562326797911, + "loss": 0.2052, + "step": 6620 + }, + { + "epoch": 1.2280051861455825, + "grad_norm": 0.23248425733346062, + "learning_rate": 0.00016456755137388105, + "loss": 0.2206, + "step": 6630 + }, + { + "epoch": 1.2298573809964808, + "grad_norm": 0.21597100541187533, + "learning_rate": 0.0001638802114274497, + "loss": 0.2399, + "step": 6640 + }, + { + "epoch": 1.231709575847379, + "grad_norm": 0.22311107620019674, + "learning_rate": 0.0001631936093175116, + "loss": 0.2344, + "step": 6650 + }, + { + "epoch": 1.2335617706982775, + "grad_norm": 0.23595231727324342, + "learning_rate": 0.0001625077509265717, + "loss": 0.2302, + "step": 6660 + }, + { + "epoch": 1.2354139655491758, + "grad_norm": 0.18416586445656416, + "learning_rate": 0.0001618226421307635, + "loss": 0.2438, + "step": 6670 + }, + { + "epoch": 1.237266160400074, + "grad_norm": 0.2397024652142972, + "learning_rate": 0.00016113828879979776, + "loss": 0.2174, + "step": 6680 + }, + { + "epoch": 1.2391183552509724, + "grad_norm": 0.2458273041744814, + "learning_rate": 0.00016045469679691306, + "loss": 0.2649, + "step": 6690 + }, + { + "epoch": 1.2409705501018706, + "grad_norm": 0.24261819790944433, + "learning_rate": 0.00015977187197882529, + "loss": 0.2353, + "step": 6700 + }, + { + "epoch": 1.2428227449527691, + "grad_norm": 0.21058758451619233, + "learning_rate": 0.0001590898201956772, + "loss": 0.2517, + "step": 6710 + }, + { + "epoch": 1.2446749398036674, + "grad_norm": 0.2260538599044833, + "learning_rate": 0.0001584085472909888, + "loss": 0.2425, + "step": 6720 + }, + { + "epoch": 1.2465271346545657, + "grad_norm": 0.2973826520271178, + "learning_rate": 0.0001577280591016068, + "loss": 0.2344, + "step": 6730 + }, + { + "epoch": 1.248379329505464, + "grad_norm": 0.17773144739281946, + "learning_rate": 0.0001570483614576549, + "loss": 0.237, + "step": 6740 + }, + { + "epoch": 1.2502315243563622, + "grad_norm": 0.24361822775457953, + "learning_rate": 0.0001563694601824837, + "loss": 0.2208, + "step": 6750 + }, + { + "epoch": 1.2520837192072607, + "grad_norm": 0.19831921681917936, + "learning_rate": 0.000155691361092621, + "loss": 0.2447, + "step": 6760 + }, + { + "epoch": 1.253935914058159, + "grad_norm": 0.2429000368973823, + "learning_rate": 0.00015501406999772154, + "loss": 0.2525, + "step": 6770 + }, + { + "epoch": 1.2557881089090572, + "grad_norm": 0.2833773062005256, + "learning_rate": 0.000154337592700518, + "loss": 0.2699, + "step": 6780 + }, + { + "epoch": 1.2576403037599555, + "grad_norm": 0.28456822568540374, + "learning_rate": 0.00015366193499677036, + "loss": 0.2871, + "step": 6790 + }, + { + "epoch": 1.2594924986108538, + "grad_norm": 0.22620507444223148, + "learning_rate": 0.00015298710267521682, + "loss": 0.2287, + "step": 6800 + }, + { + "epoch": 1.2613446934617523, + "grad_norm": 0.28690671723743605, + "learning_rate": 0.00015231310151752407, + "loss": 0.2882, + "step": 6810 + }, + { + "epoch": 1.2631968883126505, + "grad_norm": 0.3475884413325309, + "learning_rate": 0.0001516399372982377, + "loss": 0.2293, + "step": 6820 + }, + { + "epoch": 1.2650490831635488, + "grad_norm": 0.2072556191346626, + "learning_rate": 0.000150967615784733, + "loss": 0.2185, + "step": 6830 + }, + { + "epoch": 1.266901278014447, + "grad_norm": 0.21644887901267165, + "learning_rate": 0.00015029614273716506, + "loss": 0.2664, + "step": 6840 + }, + { + "epoch": 1.2687534728653453, + "grad_norm": 0.17990296855165974, + "learning_rate": 0.0001496255239084199, + "loss": 0.2087, + "step": 6850 + }, + { + "epoch": 1.2706056677162438, + "grad_norm": 0.27058636297908395, + "learning_rate": 0.00014895576504406465, + "loss": 0.1908, + "step": 6860 + }, + { + "epoch": 1.272457862567142, + "grad_norm": 0.18569390040885966, + "learning_rate": 0.00014828687188229905, + "loss": 0.2416, + "step": 6870 + }, + { + "epoch": 1.2743100574180404, + "grad_norm": 0.29190142926898804, + "learning_rate": 0.00014761885015390568, + "loss": 0.2463, + "step": 6880 + }, + { + "epoch": 1.2761622522689386, + "grad_norm": 0.17606951118976896, + "learning_rate": 0.000146951705582201, + "loss": 0.2208, + "step": 6890 + }, + { + "epoch": 1.278014447119837, + "grad_norm": 0.17608746275541837, + "learning_rate": 0.00014628544388298642, + "loss": 0.219, + "step": 6900 + }, + { + "epoch": 1.2798666419707354, + "grad_norm": 0.16242847709515437, + "learning_rate": 0.00014562007076449944, + "loss": 0.2331, + "step": 6910 + }, + { + "epoch": 1.2817188368216337, + "grad_norm": 0.2755204876160437, + "learning_rate": 0.00014495559192736435, + "loss": 0.2291, + "step": 6920 + }, + { + "epoch": 1.283571031672532, + "grad_norm": 0.20200318254837507, + "learning_rate": 0.00014429201306454364, + "loss": 0.235, + "step": 6930 + }, + { + "epoch": 1.2854232265234302, + "grad_norm": 0.17156079642065042, + "learning_rate": 0.00014362933986128963, + "loss": 0.2182, + "step": 6940 + }, + { + "epoch": 1.2872754213743285, + "grad_norm": 0.21604115340537886, + "learning_rate": 0.0001429675779950947, + "loss": 0.2471, + "step": 6950 + }, + { + "epoch": 1.289127616225227, + "grad_norm": 0.187996583890282, + "learning_rate": 0.00014230673313564397, + "loss": 0.2151, + "step": 6960 + }, + { + "epoch": 1.2909798110761252, + "grad_norm": 0.19730532837034964, + "learning_rate": 0.00014164681094476551, + "loss": 0.2106, + "step": 6970 + }, + { + "epoch": 1.2928320059270235, + "grad_norm": 0.18610760518567895, + "learning_rate": 0.0001409878170763826, + "loss": 0.1997, + "step": 6980 + }, + { + "epoch": 1.2946842007779218, + "grad_norm": 0.26588737789650624, + "learning_rate": 0.00014032975717646505, + "loss": 0.2779, + "step": 6990 + }, + { + "epoch": 1.29653639562882, + "grad_norm": 0.2023558780876639, + "learning_rate": 0.0001396726368829808, + "loss": 0.1862, + "step": 7000 + }, + { + "epoch": 1.2983885904797186, + "grad_norm": 0.1911627012671031, + "learning_rate": 0.0001390164618258477, + "loss": 0.2309, + "step": 7010 + }, + { + "epoch": 1.3002407853306168, + "grad_norm": 0.11786773578619021, + "learning_rate": 0.0001383612376268852, + "loss": 0.2342, + "step": 7020 + }, + { + "epoch": 1.302092980181515, + "grad_norm": 0.28174803457783004, + "learning_rate": 0.00013770696989976616, + "loss": 0.2286, + "step": 7030 + }, + { + "epoch": 1.3039451750324134, + "grad_norm": 0.17826542771264642, + "learning_rate": 0.0001370536642499689, + "loss": 0.1801, + "step": 7040 + }, + { + "epoch": 1.3057973698833116, + "grad_norm": 0.2244828460772529, + "learning_rate": 0.00013640132627472918, + "loss": 0.2266, + "step": 7050 + }, + { + "epoch": 1.3076495647342101, + "grad_norm": 0.17076031236762176, + "learning_rate": 0.0001357499615629919, + "loss": 0.2064, + "step": 7060 + }, + { + "epoch": 1.3095017595851084, + "grad_norm": 0.21153152349490145, + "learning_rate": 0.00013509957569536368, + "loss": 0.2259, + "step": 7070 + }, + { + "epoch": 1.3113539544360067, + "grad_norm": 0.21657797572838655, + "learning_rate": 0.00013445017424406459, + "loss": 0.2174, + "step": 7080 + }, + { + "epoch": 1.313206149286905, + "grad_norm": 0.19916951980627734, + "learning_rate": 0.00013380176277288098, + "loss": 0.2524, + "step": 7090 + }, + { + "epoch": 1.3150583441378032, + "grad_norm": 0.15608777576271463, + "learning_rate": 0.00013315434683711731, + "loss": 0.2252, + "step": 7100 + }, + { + "epoch": 1.3169105389887017, + "grad_norm": 0.21137373945091645, + "learning_rate": 0.0001325079319835486, + "loss": 0.2512, + "step": 7110 + }, + { + "epoch": 1.3187627338396, + "grad_norm": 0.28789005617840957, + "learning_rate": 0.00013186252375037332, + "loss": 0.2269, + "step": 7120 + }, + { + "epoch": 1.3206149286904982, + "grad_norm": 0.20697477426134353, + "learning_rate": 0.0001312181276671654, + "loss": 0.1923, + "step": 7130 + }, + { + "epoch": 1.3224671235413965, + "grad_norm": 0.20780168330103488, + "learning_rate": 0.00013057474925482732, + "loss": 0.2, + "step": 7140 + }, + { + "epoch": 1.3243193183922948, + "grad_norm": 0.2619781587243672, + "learning_rate": 0.00012993239402554237, + "loss": 0.2418, + "step": 7150 + }, + { + "epoch": 1.3261715132431933, + "grad_norm": 0.21912577308112016, + "learning_rate": 0.00012929106748272792, + "loss": 0.2187, + "step": 7160 + }, + { + "epoch": 1.3280237080940915, + "grad_norm": 0.2268912171128973, + "learning_rate": 0.00012865077512098789, + "loss": 0.2028, + "step": 7170 + }, + { + "epoch": 1.3298759029449898, + "grad_norm": 0.21743955397611459, + "learning_rate": 0.0001280115224260658, + "loss": 0.2427, + "step": 7180 + }, + { + "epoch": 1.331728097795888, + "grad_norm": 0.2738954036709458, + "learning_rate": 0.00012737331487479764, + "loss": 0.2614, + "step": 7190 + }, + { + "epoch": 1.3335802926467863, + "grad_norm": 0.19258917852110208, + "learning_rate": 0.00012673615793506524, + "loss": 0.2099, + "step": 7200 + }, + { + "epoch": 1.3354324874976848, + "grad_norm": 0.2502839601700166, + "learning_rate": 0.00012610005706574918, + "loss": 0.212, + "step": 7210 + }, + { + "epoch": 1.337284682348583, + "grad_norm": 0.2599916951105217, + "learning_rate": 0.0001254650177166821, + "loss": 0.2124, + "step": 7220 + }, + { + "epoch": 1.3391368771994814, + "grad_norm": 0.177484083446667, + "learning_rate": 0.00012483104532860204, + "loss": 0.1797, + "step": 7230 + }, + { + "epoch": 1.3409890720503796, + "grad_norm": 0.2826696479487746, + "learning_rate": 0.00012419814533310558, + "loss": 0.2466, + "step": 7240 + }, + { + "epoch": 1.342841266901278, + "grad_norm": 0.25661668196827314, + "learning_rate": 0.0001235663231526019, + "loss": 0.2332, + "step": 7250 + }, + { + "epoch": 1.3446934617521764, + "grad_norm": 0.2568941368041713, + "learning_rate": 0.00012293558420026557, + "loss": 0.2523, + "step": 7260 + }, + { + "epoch": 1.3465456566030747, + "grad_norm": 0.20215212528107282, + "learning_rate": 0.00012230593387999082, + "loss": 0.2352, + "step": 7270 + }, + { + "epoch": 1.348397851453973, + "grad_norm": 0.24815860875352733, + "learning_rate": 0.00012167737758634473, + "loss": 0.2188, + "step": 7280 + }, + { + "epoch": 1.3502500463048712, + "grad_norm": 0.22038982892081588, + "learning_rate": 0.00012104992070452137, + "loss": 0.2685, + "step": 7290 + }, + { + "epoch": 1.3521022411557695, + "grad_norm": 0.2083445910203971, + "learning_rate": 0.00012042356861029547, + "loss": 0.2328, + "step": 7300 + }, + { + "epoch": 1.353954436006668, + "grad_norm": 0.20267314146087212, + "learning_rate": 0.00011979832666997642, + "loss": 0.2264, + "step": 7310 + }, + { + "epoch": 1.3558066308575663, + "grad_norm": 0.29234235079551857, + "learning_rate": 0.00011917420024036241, + "loss": 0.24, + "step": 7320 + }, + { + "epoch": 1.3576588257084645, + "grad_norm": 0.19217333964822353, + "learning_rate": 0.00011855119466869426, + "loss": 0.2551, + "step": 7330 + }, + { + "epoch": 1.3595110205593628, + "grad_norm": 0.18622316897174804, + "learning_rate": 0.00011792931529260992, + "loss": 0.2383, + "step": 7340 + }, + { + "epoch": 1.361363215410261, + "grad_norm": 0.2639171890597442, + "learning_rate": 0.00011730856744009846, + "loss": 0.2447, + "step": 7350 + }, + { + "epoch": 1.3632154102611596, + "grad_norm": 0.24703406547971726, + "learning_rate": 0.0001166889564294546, + "loss": 0.1885, + "step": 7360 + }, + { + "epoch": 1.3650676051120578, + "grad_norm": 0.2395087018493502, + "learning_rate": 0.00011607048756923327, + "loss": 0.2408, + "step": 7370 + }, + { + "epoch": 1.366919799962956, + "grad_norm": 0.1715869085136323, + "learning_rate": 0.00011551484651328101, + "loss": 0.2231, + "step": 7380 + }, + { + "epoch": 1.3687719948138544, + "grad_norm": 0.24875690978651382, + "learning_rate": 0.0001148985623288476, + "loss": 0.2107, + "step": 7390 + }, + { + "epoch": 1.3706241896647526, + "grad_norm": 0.21621060634153644, + "learning_rate": 0.00011428343563414629, + "loss": 0.2827, + "step": 7400 + }, + { + "epoch": 1.3724763845156511, + "grad_norm": 0.17411298598721778, + "learning_rate": 0.00011366947169931222, + "loss": 0.1956, + "step": 7410 + }, + { + "epoch": 1.3743285793665494, + "grad_norm": 0.21075418595890044, + "learning_rate": 0.00011305667578451847, + "loss": 0.2384, + "step": 7420 + }, + { + "epoch": 1.3761807742174477, + "grad_norm": 0.1762011368192225, + "learning_rate": 0.00011244505313993115, + "loss": 0.2248, + "step": 7430 + }, + { + "epoch": 1.378032969068346, + "grad_norm": 0.2713344050149392, + "learning_rate": 0.00011183460900566405, + "loss": 0.2253, + "step": 7440 + }, + { + "epoch": 1.3798851639192442, + "grad_norm": 0.13308645120441578, + "learning_rate": 0.00011122534861173444, + "loss": 0.2188, + "step": 7450 + }, + { + "epoch": 1.3817373587701427, + "grad_norm": 0.26214160905875167, + "learning_rate": 0.00011061727717801745, + "loss": 0.2509, + "step": 7460 + }, + { + "epoch": 1.383589553621041, + "grad_norm": 0.16725861800168582, + "learning_rate": 0.00011001039991420181, + "loss": 0.2395, + "step": 7470 + }, + { + "epoch": 1.3854417484719392, + "grad_norm": 0.17751505759886393, + "learning_rate": 0.00010940472201974508, + "loss": 0.1914, + "step": 7480 + }, + { + "epoch": 1.3872939433228375, + "grad_norm": 0.21463454020196815, + "learning_rate": 0.00010880024868382943, + "loss": 0.2086, + "step": 7490 + }, + { + "epoch": 1.3891461381737358, + "grad_norm": 0.2026092509755857, + "learning_rate": 0.00010819698508531659, + "loss": 0.2149, + "step": 7500 + }, + { + "epoch": 1.3909983330246343, + "grad_norm": 0.16323623074986704, + "learning_rate": 0.00010759493639270387, + "loss": 0.27, + "step": 7510 + }, + { + "epoch": 1.3928505278755325, + "grad_norm": 0.22139846358468115, + "learning_rate": 0.00010705413557727304, + "loss": 0.2054, + "step": 7520 + }, + { + "epoch": 1.3947027227264308, + "grad_norm": 0.25885865603646047, + "learning_rate": 0.0001064544094077661, + "loss": 0.2037, + "step": 7530 + }, + { + "epoch": 1.396554917577329, + "grad_norm": 0.18312190666440223, + "learning_rate": 0.00010585591307378175, + "loss": 0.2177, + "step": 7540 + }, + { + "epoch": 1.3984071124282274, + "grad_norm": 0.2452824521308415, + "learning_rate": 0.00010525865170297353, + "loss": 0.2443, + "step": 7550 + }, + { + "epoch": 1.4002593072791258, + "grad_norm": 0.22491815184492542, + "learning_rate": 0.00010466263041241426, + "loss": 0.2028, + "step": 7560 + }, + { + "epoch": 1.4021115021300241, + "grad_norm": 0.21626081653727397, + "learning_rate": 0.00010406785430855237, + "loss": 0.1719, + "step": 7570 + }, + { + "epoch": 1.4039636969809224, + "grad_norm": 0.24105946067666537, + "learning_rate": 0.00010347432848716812, + "loss": 0.225, + "step": 7580 + }, + { + "epoch": 1.4058158918318207, + "grad_norm": 0.23078802018114886, + "learning_rate": 0.00010288205803332975, + "loss": 0.2278, + "step": 7590 + }, + { + "epoch": 1.407668086682719, + "grad_norm": 0.2574880724739788, + "learning_rate": 0.00010229104802135034, + "loss": 0.244, + "step": 7600 + }, + { + "epoch": 1.4095202815336174, + "grad_norm": 0.24593167284827877, + "learning_rate": 0.00010170130351474377, + "loss": 0.2159, + "step": 7610 + }, + { + "epoch": 1.4113724763845157, + "grad_norm": 0.261530928817991, + "learning_rate": 0.00010111282956618181, + "loss": 0.1827, + "step": 7620 + }, + { + "epoch": 1.413224671235414, + "grad_norm": 0.19005464332149496, + "learning_rate": 0.0001005256312174505, + "loss": 0.1942, + "step": 7630 + }, + { + "epoch": 1.4150768660863122, + "grad_norm": 0.22377467210489174, + "learning_rate": 9.993971349940717e-05, + "loss": 0.2553, + "step": 7640 + }, + { + "epoch": 1.4169290609372105, + "grad_norm": 0.21440875435999618, + "learning_rate": 9.935508143193739e-05, + "loss": 0.2169, + "step": 7650 + }, + { + "epoch": 1.418781255788109, + "grad_norm": 0.22734623733013004, + "learning_rate": 9.877174002391165e-05, + "loss": 0.1859, + "step": 7660 + }, + { + "epoch": 1.4206334506390073, + "grad_norm": 0.20257954902342695, + "learning_rate": 9.818969427314275e-05, + "loss": 0.208, + "step": 7670 + }, + { + "epoch": 1.4224856454899055, + "grad_norm": 0.23157903079657188, + "learning_rate": 9.760894916634283e-05, + "loss": 0.2136, + "step": 7680 + }, + { + "epoch": 1.4243378403408038, + "grad_norm": 0.23047953760740483, + "learning_rate": 9.702950967908067e-05, + "loss": 0.2244, + "step": 7690 + }, + { + "epoch": 1.426190035191702, + "grad_norm": 0.1893981494941497, + "learning_rate": 9.645138077573904e-05, + "loss": 0.202, + "step": 7700 + }, + { + "epoch": 1.4280422300426006, + "grad_norm": 0.1944059258719957, + "learning_rate": 9.587456740947236e-05, + "loss": 0.2395, + "step": 7710 + }, + { + "epoch": 1.4298944248934988, + "grad_norm": 0.19154551462212566, + "learning_rate": 9.529907452216402e-05, + "loss": 0.1877, + "step": 7720 + }, + { + "epoch": 1.431746619744397, + "grad_norm": 0.25705195721078017, + "learning_rate": 9.472490704438403e-05, + "loss": 0.2439, + "step": 7730 + }, + { + "epoch": 1.4335988145952954, + "grad_norm": 0.27237298997689074, + "learning_rate": 9.4152069895347e-05, + "loss": 0.2269, + "step": 7740 + }, + { + "epoch": 1.4354510094461936, + "grad_norm": 0.22572015857646327, + "learning_rate": 9.358056798286982e-05, + "loss": 0.1761, + "step": 7750 + }, + { + "epoch": 1.4373032042970921, + "grad_norm": 0.1681521243481353, + "learning_rate": 9.301040620332962e-05, + "loss": 0.2453, + "step": 7760 + }, + { + "epoch": 1.4391553991479904, + "grad_norm": 0.20322718308914284, + "learning_rate": 9.244158944162198e-05, + "loss": 0.1995, + "step": 7770 + }, + { + "epoch": 1.4410075939988887, + "grad_norm": 0.17221136952935692, + "learning_rate": 9.187412257111882e-05, + "loss": 0.1991, + "step": 7780 + }, + { + "epoch": 1.442859788849787, + "grad_norm": 0.23211721231411886, + "learning_rate": 9.130801045362678e-05, + "loss": 0.225, + "step": 7790 + }, + { + "epoch": 1.4447119837006852, + "grad_norm": 0.2557003480049842, + "learning_rate": 9.074325793934582e-05, + "loss": 0.2396, + "step": 7800 + }, + { + "epoch": 1.4465641785515837, + "grad_norm": 0.2743087049471899, + "learning_rate": 9.017986986682705e-05, + "loss": 0.2622, + "step": 7810 + }, + { + "epoch": 1.448416373402482, + "grad_norm": 0.22044857915056804, + "learning_rate": 8.961785106293202e-05, + "loss": 0.208, + "step": 7820 + }, + { + "epoch": 1.4502685682533802, + "grad_norm": 0.295975717325647, + "learning_rate": 8.905720634279068e-05, + "loss": 0.2406, + "step": 7830 + }, + { + "epoch": 1.4521207631042785, + "grad_norm": 0.2119255826308734, + "learning_rate": 8.849794050976062e-05, + "loss": 0.1863, + "step": 7840 + }, + { + "epoch": 1.4539729579551768, + "grad_norm": 0.19120118368025074, + "learning_rate": 8.794005835538558e-05, + "loss": 0.1899, + "step": 7850 + }, + { + "epoch": 1.4558251528060753, + "grad_norm": 0.20269011463788664, + "learning_rate": 8.738356465935467e-05, + "loss": 0.1887, + "step": 7860 + }, + { + "epoch": 1.4576773476569735, + "grad_norm": 0.2933956506003441, + "learning_rate": 8.68284641894613e-05, + "loss": 0.1969, + "step": 7870 + }, + { + "epoch": 1.4595295425078718, + "grad_norm": 0.17871898787286603, + "learning_rate": 8.627476170156224e-05, + "loss": 0.2315, + "step": 7880 + }, + { + "epoch": 1.46138173735877, + "grad_norm": 0.2552476396822797, + "learning_rate": 8.572246193953703e-05, + "loss": 0.2485, + "step": 7890 + }, + { + "epoch": 1.4632339322096684, + "grad_norm": 0.31173163044095015, + "learning_rate": 8.517156963524719e-05, + "loss": 0.1816, + "step": 7900 + }, + { + "epoch": 1.4650861270605668, + "grad_norm": 0.2158798667093176, + "learning_rate": 8.462208950849598e-05, + "loss": 0.2469, + "step": 7910 + }, + { + "epoch": 1.4669383219114651, + "grad_norm": 0.24218457777393995, + "learning_rate": 8.407402626698751e-05, + "loss": 0.2161, + "step": 7920 + }, + { + "epoch": 1.4687905167623634, + "grad_norm": 0.1979730263341676, + "learning_rate": 8.352738460628675e-05, + "loss": 0.2037, + "step": 7930 + }, + { + "epoch": 1.4706427116132617, + "grad_norm": 0.2696373926575332, + "learning_rate": 8.298216920977914e-05, + "loss": 0.1691, + "step": 7940 + }, + { + "epoch": 1.47249490646416, + "grad_norm": 0.25798986555999925, + "learning_rate": 8.243838474863047e-05, + "loss": 0.2285, + "step": 7950 + }, + { + "epoch": 1.4743471013150584, + "grad_norm": 0.20862952822180633, + "learning_rate": 8.189603588174712e-05, + "loss": 0.2118, + "step": 7960 + }, + { + "epoch": 1.4761992961659567, + "grad_norm": 0.1750842888641512, + "learning_rate": 8.135512725573574e-05, + "loss": 0.2116, + "step": 7970 + }, + { + "epoch": 1.478051491016855, + "grad_norm": 0.23773871116567313, + "learning_rate": 8.081566350486363e-05, + "loss": 0.1949, + "step": 7980 + }, + { + "epoch": 1.4799036858677532, + "grad_norm": 0.164420670542161, + "learning_rate": 8.027764925101911e-05, + "loss": 0.209, + "step": 7990 + }, + { + "epoch": 1.4817558807186515, + "grad_norm": 0.21216576721258398, + "learning_rate": 7.974108910367178e-05, + "loss": 0.1966, + "step": 8000 + }, + { + "epoch": 1.48360807556955, + "grad_norm": 0.2790248976449928, + "learning_rate": 7.920598765983308e-05, + "loss": 0.2063, + "step": 8010 + }, + { + "epoch": 1.4854602704204483, + "grad_norm": 0.29784954052004964, + "learning_rate": 7.867234950401714e-05, + "loss": 0.1589, + "step": 8020 + }, + { + "epoch": 1.4873124652713465, + "grad_norm": 0.15966925896267653, + "learning_rate": 7.8140179208201e-05, + "loss": 0.2203, + "step": 8030 + }, + { + "epoch": 1.4891646601222448, + "grad_norm": 0.21411813554248801, + "learning_rate": 7.76094813317858e-05, + "loss": 0.191, + "step": 8040 + }, + { + "epoch": 1.491016854973143, + "grad_norm": 0.16778546998214966, + "learning_rate": 7.708026042155775e-05, + "loss": 0.1972, + "step": 8050 + }, + { + "epoch": 1.4928690498240416, + "grad_norm": 0.23986270787568656, + "learning_rate": 7.655252101164894e-05, + "loss": 0.2115, + "step": 8060 + }, + { + "epoch": 1.4947212446749398, + "grad_norm": 0.250339172193944, + "learning_rate": 7.602626762349865e-05, + "loss": 0.2112, + "step": 8070 + }, + { + "epoch": 1.496573439525838, + "grad_norm": 0.18288675343831115, + "learning_rate": 7.55015047658146e-05, + "loss": 0.2316, + "step": 8080 + }, + { + "epoch": 1.4984256343767364, + "grad_norm": 0.23542544018483225, + "learning_rate": 7.497823693453429e-05, + "loss": 0.2278, + "step": 8090 + }, + { + "epoch": 1.5002778292276346, + "grad_norm": 0.21853735172760996, + "learning_rate": 7.44564686127865e-05, + "loss": 0.2435, + "step": 8100 + }, + { + "epoch": 1.5021300240785331, + "grad_norm": 0.230876996211439, + "learning_rate": 7.39362042708527e-05, + "loss": 0.2132, + "step": 8110 + }, + { + "epoch": 1.5039822189294314, + "grad_norm": 0.23449285027681627, + "learning_rate": 7.341744836612929e-05, + "loss": 0.2205, + "step": 8120 + }, + { + "epoch": 1.5058344137803297, + "grad_norm": 0.1770364349318145, + "learning_rate": 7.290020534308883e-05, + "loss": 0.1771, + "step": 8130 + }, + { + "epoch": 1.5076866086312282, + "grad_norm": 0.24440773842340074, + "learning_rate": 7.23844796332421e-05, + "loss": 0.2009, + "step": 8140 + }, + { + "epoch": 1.5095388034821262, + "grad_norm": 0.19125723562538224, + "learning_rate": 7.187027565510032e-05, + "loss": 0.2214, + "step": 8150 + }, + { + "epoch": 1.5113909983330247, + "grad_norm": 0.24413160941991816, + "learning_rate": 7.135759781413714e-05, + "loss": 0.2483, + "step": 8160 + }, + { + "epoch": 1.513243193183923, + "grad_norm": 0.18714126123807273, + "learning_rate": 7.084645050275093e-05, + "loss": 0.1754, + "step": 8170 + }, + { + "epoch": 1.5150953880348212, + "grad_norm": 0.24068172003031482, + "learning_rate": 7.033683810022717e-05, + "loss": 0.2208, + "step": 8180 + }, + { + "epoch": 1.5169475828857197, + "grad_norm": 0.21118944152545294, + "learning_rate": 6.982876497270093e-05, + "loss": 0.2354, + "step": 8190 + }, + { + "epoch": 1.5187997777366178, + "grad_norm": 0.16304142225648927, + "learning_rate": 6.932223547311948e-05, + "loss": 0.191, + "step": 8200 + }, + { + "epoch": 1.5206519725875163, + "grad_norm": 0.22402630540204685, + "learning_rate": 6.881725394120483e-05, + "loss": 0.2235, + "step": 8210 + }, + { + "epoch": 1.5225041674384145, + "grad_norm": 0.14686671761669617, + "learning_rate": 6.831382470341674e-05, + "loss": 0.2374, + "step": 8220 + }, + { + "epoch": 1.5243563622893128, + "grad_norm": 0.1910492658359761, + "learning_rate": 6.781195207291579e-05, + "loss": 0.1912, + "step": 8230 + }, + { + "epoch": 1.5262085571402113, + "grad_norm": 0.285797167185037, + "learning_rate": 6.7311640349526e-05, + "loss": 0.1946, + "step": 8240 + }, + { + "epoch": 1.5280607519911094, + "grad_norm": 0.24899927517169534, + "learning_rate": 6.681289381969827e-05, + "loss": 0.2437, + "step": 8250 + }, + { + "epoch": 1.5299129468420078, + "grad_norm": 0.27104957130230045, + "learning_rate": 6.631571675647358e-05, + "loss": 0.2007, + "step": 8260 + }, + { + "epoch": 1.5317651416929061, + "grad_norm": 0.1836787768149552, + "learning_rate": 6.582011341944661e-05, + "loss": 0.1992, + "step": 8270 + }, + { + "epoch": 1.5336173365438044, + "grad_norm": 0.16592192801262687, + "learning_rate": 6.532608805472884e-05, + "loss": 0.2243, + "step": 8280 + }, + { + "epoch": 1.5354695313947029, + "grad_norm": 0.19477759718427087, + "learning_rate": 6.483364489491242e-05, + "loss": 0.1866, + "step": 8290 + }, + { + "epoch": 1.537321726245601, + "grad_norm": 0.2612938997397552, + "learning_rate": 6.434278815903392e-05, + "loss": 0.1884, + "step": 8300 + }, + { + "epoch": 1.5391739210964994, + "grad_norm": 0.22106523393294486, + "learning_rate": 6.3853522052538e-05, + "loss": 0.2464, + "step": 8310 + }, + { + "epoch": 1.5410261159473977, + "grad_norm": 0.11918922044507506, + "learning_rate": 6.336585076724169e-05, + "loss": 0.2205, + "step": 8320 + }, + { + "epoch": 1.542878310798296, + "grad_norm": 0.27735599029951385, + "learning_rate": 6.287977848129811e-05, + "loss": 0.2125, + "step": 8330 + }, + { + "epoch": 1.5447305056491945, + "grad_norm": 0.11824966641617995, + "learning_rate": 6.239530935916105e-05, + "loss": 0.1886, + "step": 8340 + }, + { + "epoch": 1.5465827005000925, + "grad_norm": 0.14239263856247222, + "learning_rate": 6.191244755154896e-05, + "loss": 0.2283, + "step": 8350 + }, + { + "epoch": 1.548434895350991, + "grad_norm": 0.2614832702732058, + "learning_rate": 6.143119719540951e-05, + "loss": 0.2419, + "step": 8360 + }, + { + "epoch": 1.5502870902018893, + "grad_norm": 0.1719421648495295, + "learning_rate": 6.0951562413884276e-05, + "loss": 0.1813, + "step": 8370 + }, + { + "epoch": 1.5521392850527875, + "grad_norm": 0.1339861662540805, + "learning_rate": 6.047354731627319e-05, + "loss": 0.1732, + "step": 8380 + }, + { + "epoch": 1.553991479903686, + "grad_norm": 0.2649420028984007, + "learning_rate": 5.9997155997999486e-05, + "loss": 0.2312, + "step": 8390 + }, + { + "epoch": 1.555843674754584, + "grad_norm": 0.2986635713988608, + "learning_rate": 5.952239254057462e-05, + "loss": 0.2537, + "step": 8400 + }, + { + "epoch": 1.5576958696054826, + "grad_norm": 0.20847627410802858, + "learning_rate": 5.904926101156316e-05, + "loss": 0.2198, + "step": 8410 + }, + { + "epoch": 1.5595480644563808, + "grad_norm": 0.20583750133284387, + "learning_rate": 5.8577765464548014e-05, + "loss": 0.2194, + "step": 8420 + }, + { + "epoch": 1.561400259307279, + "grad_norm": 0.24884919423637333, + "learning_rate": 5.810790993909595e-05, + "loss": 0.2201, + "step": 8430 + }, + { + "epoch": 1.5632524541581776, + "grad_norm": 0.2292784136541862, + "learning_rate": 5.7639698460722366e-05, + "loss": 0.2139, + "step": 8440 + }, + { + "epoch": 1.5651046490090756, + "grad_norm": 0.20773042455822294, + "learning_rate": 5.717313504085761e-05, + "loss": 0.1876, + "step": 8450 + }, + { + "epoch": 1.5669568438599741, + "grad_norm": 0.218184017555461, + "learning_rate": 5.670822367681189e-05, + "loss": 0.1821, + "step": 8460 + }, + { + "epoch": 1.5688090387108724, + "grad_norm": 0.17843712174744172, + "learning_rate": 5.6244968351741396e-05, + "loss": 0.2006, + "step": 8470 + }, + { + "epoch": 1.5706612335617707, + "grad_norm": 0.21436245431091455, + "learning_rate": 5.578337303461414e-05, + "loss": 0.1928, + "step": 8480 + }, + { + "epoch": 1.5725134284126692, + "grad_norm": 0.2084740928506598, + "learning_rate": 5.532344168017589e-05, + "loss": 0.2444, + "step": 8490 + }, + { + "epoch": 1.5743656232635672, + "grad_norm": 0.20902509315653023, + "learning_rate": 5.4865178228916317e-05, + "loss": 0.2288, + "step": 8500 + }, + { + "epoch": 1.5762178181144657, + "grad_norm": 0.191128809958979, + "learning_rate": 5.4408586607035236e-05, + "loss": 0.2307, + "step": 8510 + }, + { + "epoch": 1.578070012965364, + "grad_norm": 0.2804233173323839, + "learning_rate": 5.3953670726408973e-05, + "loss": 0.2049, + "step": 8520 + }, + { + "epoch": 1.5799222078162622, + "grad_norm": 0.2523996334467096, + "learning_rate": 5.3500434484556744e-05, + "loss": 0.2309, + "step": 8530 + }, + { + "epoch": 1.5817744026671607, + "grad_norm": 0.22808681153892332, + "learning_rate": 5.304888176460759e-05, + "loss": 0.2224, + "step": 8540 + }, + { + "epoch": 1.5836265975180588, + "grad_norm": 0.17496689187022768, + "learning_rate": 5.2599016435266656e-05, + "loss": 0.212, + "step": 8550 + }, + { + "epoch": 1.5854787923689573, + "grad_norm": 0.16684956568038284, + "learning_rate": 5.215084235078232e-05, + "loss": 0.1599, + "step": 8560 + }, + { + "epoch": 1.5873309872198555, + "grad_norm": 0.2524704034190916, + "learning_rate": 5.170436335091319e-05, + "loss": 0.2239, + "step": 8570 + }, + { + "epoch": 1.5891831820707538, + "grad_norm": 0.20276889978373874, + "learning_rate": 5.130398471023492e-05, + "loss": 0.1991, + "step": 8580 + }, + { + "epoch": 1.5910353769216523, + "grad_norm": 0.19401086487052652, + "learning_rate": 5.086073689762982e-05, + "loss": 0.2054, + "step": 8590 + }, + { + "epoch": 1.5928875717725504, + "grad_norm": 0.24314231015564167, + "learning_rate": 5.0419195222696305e-05, + "loss": 0.216, + "step": 8600 + }, + { + "epoch": 1.5947397666234489, + "grad_norm": 0.1962559761069099, + "learning_rate": 4.9979363468369426e-05, + "loss": 0.2028, + "step": 8610 + }, + { + "epoch": 1.5965919614743471, + "grad_norm": 0.21450451616005048, + "learning_rate": 4.95412454029342e-05, + "loss": 0.1485, + "step": 8620 + }, + { + "epoch": 1.5984441563252454, + "grad_norm": 0.2262800799406614, + "learning_rate": 4.9104844779993744e-05, + "loss": 0.2205, + "step": 8630 + }, + { + "epoch": 1.6002963511761439, + "grad_norm": 0.15673015952559616, + "learning_rate": 4.867016533843677e-05, + "loss": 0.1878, + "step": 8640 + }, + { + "epoch": 1.602148546027042, + "grad_norm": 0.22772029995019283, + "learning_rate": 4.823721080240562e-05, + "loss": 0.2144, + "step": 8650 + }, + { + "epoch": 1.6040007408779404, + "grad_norm": 0.16737363953611054, + "learning_rate": 4.7805984881264366e-05, + "loss": 0.219, + "step": 8660 + }, + { + "epoch": 1.6058529357288387, + "grad_norm": 0.15059728369872777, + "learning_rate": 4.7376491269567305e-05, + "loss": 0.1827, + "step": 8670 + }, + { + "epoch": 1.607705130579737, + "grad_norm": 0.2174362092107457, + "learning_rate": 4.694873364702687e-05, + "loss": 0.2427, + "step": 8680 + }, + { + "epoch": 1.6095573254306355, + "grad_norm": 0.2536534486510469, + "learning_rate": 4.652271567848229e-05, + "loss": 0.2458, + "step": 8690 + }, + { + "epoch": 1.6114095202815335, + "grad_norm": 0.20306793867476478, + "learning_rate": 4.6098441013868285e-05, + "loss": 0.221, + "step": 8700 + }, + { + "epoch": 1.613261715132432, + "grad_norm": 0.29865060955062883, + "learning_rate": 4.567591328818371e-05, + "loss": 0.2621, + "step": 8710 + }, + { + "epoch": 1.6151139099833303, + "grad_norm": 0.20862574024207642, + "learning_rate": 4.529713496011825e-05, + "loss": 0.207, + "step": 8720 + }, + { + "epoch": 1.6169661048342285, + "grad_norm": 0.21837675462224324, + "learning_rate": 4.487793637919196e-05, + "loss": 0.1828, + "step": 8730 + }, + { + "epoch": 1.618818299685127, + "grad_norm": 0.23283771674120501, + "learning_rate": 4.446049519394233e-05, + "loss": 0.2166, + "step": 8740 + }, + { + "epoch": 1.620670494536025, + "grad_norm": 0.1948474369408113, + "learning_rate": 4.4044814980821856e-05, + "loss": 0.2154, + "step": 8750 + }, + { + "epoch": 1.6225226893869236, + "grad_norm": 0.2821939610991762, + "learning_rate": 4.3630899301195904e-05, + "loss": 0.2428, + "step": 8760 + }, + { + "epoch": 1.6243748842378218, + "grad_norm": 0.18991376076496028, + "learning_rate": 4.321875170131218e-05, + "loss": 0.1933, + "step": 8770 + }, + { + "epoch": 1.62622707908872, + "grad_norm": 0.17477269695823847, + "learning_rate": 4.280837571227006e-05, + "loss": 0.1945, + "step": 8780 + }, + { + "epoch": 1.6280792739396186, + "grad_norm": 0.22671892134617525, + "learning_rate": 4.239977484999063e-05, + "loss": 0.1973, + "step": 8790 + }, + { + "epoch": 1.6299314687905166, + "grad_norm": 0.2061718775432731, + "learning_rate": 4.1992952615186516e-05, + "loss": 0.2122, + "step": 8800 + }, + { + "epoch": 1.6317836636414151, + "grad_norm": 0.25086071759237627, + "learning_rate": 4.158791249333177e-05, + "loss": 0.226, + "step": 8810 + }, + { + "epoch": 1.6336358584923134, + "grad_norm": 0.242794082456384, + "learning_rate": 4.118465795463214e-05, + "loss": 0.2267, + "step": 8820 + }, + { + "epoch": 1.6354880533432117, + "grad_norm": 0.1935934917483956, + "learning_rate": 4.078319245399514e-05, + "loss": 0.2011, + "step": 8830 + }, + { + "epoch": 1.6373402481941102, + "grad_norm": 0.2628523170855809, + "learning_rate": 4.038351943100088e-05, + "loss": 0.1934, + "step": 8840 + }, + { + "epoch": 1.6391924430450082, + "grad_norm": 0.19568463922046236, + "learning_rate": 3.998564230987209e-05, + "loss": 0.1997, + "step": 8850 + }, + { + "epoch": 1.6410446378959067, + "grad_norm": 0.2481046435287445, + "learning_rate": 3.958956449944501e-05, + "loss": 0.2151, + "step": 8860 + }, + { + "epoch": 1.642896832746805, + "grad_norm": 0.22476767911377235, + "learning_rate": 3.9195289393140155e-05, + "loss": 0.1621, + "step": 8870 + }, + { + "epoch": 1.6447490275977033, + "grad_norm": 0.1945122394139851, + "learning_rate": 3.880282036893348e-05, + "loss": 0.1753, + "step": 8880 + }, + { + "epoch": 1.6466012224486017, + "grad_norm": 0.27437177077690705, + "learning_rate": 3.841216078932702e-05, + "loss": 0.226, + "step": 8890 + }, + { + "epoch": 1.6484534172994998, + "grad_norm": 0.18562250131807664, + "learning_rate": 3.802331400132028e-05, + "loss": 0.1717, + "step": 8900 + }, + { + "epoch": 1.6503056121503983, + "grad_norm": 0.21622010412383683, + "learning_rate": 3.7636283336381636e-05, + "loss": 0.155, + "step": 8910 + }, + { + "epoch": 1.6521578070012966, + "grad_norm": 0.22634728029439885, + "learning_rate": 3.7251072110419727e-05, + "loss": 0.2022, + "step": 8920 + }, + { + "epoch": 1.6540100018521948, + "grad_norm": 0.2671242474144964, + "learning_rate": 3.686768362375498e-05, + "loss": 0.2234, + "step": 8930 + }, + { + "epoch": 1.6558621967030933, + "grad_norm": 0.16832839316697204, + "learning_rate": 3.648612116109146e-05, + "loss": 0.1805, + "step": 8940 + }, + { + "epoch": 1.6577143915539914, + "grad_norm": 0.2688098808357188, + "learning_rate": 3.610638799148858e-05, + "loss": 0.1909, + "step": 8950 + }, + { + "epoch": 1.6595665864048899, + "grad_norm": 0.172871399134501, + "learning_rate": 3.572848736833326e-05, + "loss": 0.2112, + "step": 8960 + }, + { + "epoch": 1.6614187812557881, + "grad_norm": 0.23426972546449246, + "learning_rate": 3.5352422529311814e-05, + "loss": 0.2276, + "step": 8970 + }, + { + "epoch": 1.6632709761066864, + "grad_norm": 0.2682786605548356, + "learning_rate": 3.497819669638266e-05, + "loss": 0.2521, + "step": 8980 + }, + { + "epoch": 1.6651231709575849, + "grad_norm": 0.2122644465486904, + "learning_rate": 3.4605813075748085e-05, + "loss": 0.2003, + "step": 8990 + }, + { + "epoch": 1.666975365808483, + "grad_norm": 0.24717950759123916, + "learning_rate": 3.42352748578274e-05, + "loss": 0.1813, + "step": 9000 + } + ], + "logging_steps": 10, + "max_steps": 10798, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 3000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 196010447634432.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}