{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999558635300348, "eval_steps": 2000, "global_step": 11328, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 8.827293993026438e-05, "eval_accuracy": 0.31068875219818615, "eval_loss": 5.8817362785339355, "eval_runtime": 7.0122, "eval_samples_per_second": 45.349, "eval_steps_per_second": 0.428, "step": 1 }, { "epoch": 0.0008827293993026437, "grad_norm": 8.6875, "learning_rate": 2.9999999999999997e-05, "loss": 6.1256, "step": 10 }, { "epoch": 0.0017654587986052875, "grad_norm": 3.84375, "learning_rate": 5.9999999999999995e-05, "loss": 5.4981, "step": 20 }, { "epoch": 0.0026481881979079315, "grad_norm": 2.484375, "learning_rate": 8.999999999999999e-05, "loss": 4.8806, "step": 30 }, { "epoch": 0.003530917597210575, "grad_norm": 6.5, "learning_rate": 0.00011999999999999999, "loss": 4.3476, "step": 40 }, { "epoch": 0.0044136469965132185, "grad_norm": 0.66796875, "learning_rate": 0.00015, "loss": 3.9568, "step": 50 }, { "epoch": 0.005296376395815863, "grad_norm": 0.41015625, "learning_rate": 0.00017999999999999998, "loss": 3.6239, "step": 60 }, { "epoch": 0.0061791057951185065, "grad_norm": 0.28515625, "learning_rate": 0.00020999999999999998, "loss": 3.4071, "step": 70 }, { "epoch": 0.00706183519442115, "grad_norm": 0.255859375, "learning_rate": 0.00023999999999999998, "loss": 3.2199, "step": 80 }, { "epoch": 0.007944564593723794, "grad_norm": 0.298828125, "learning_rate": 0.00027, "loss": 3.1084, "step": 90 }, { "epoch": 0.008827293993026437, "grad_norm": 0.279296875, "learning_rate": 0.0003, "loss": 3.015, "step": 100 }, { "epoch": 0.009710023392329082, "grad_norm": 0.287109375, "learning_rate": 0.00029999941284073584, "loss": 2.9547, "step": 110 }, { "epoch": 0.010592752791631726, "grad_norm": 0.388671875, "learning_rate": 0.0002999976513675402, "loss": 2.9219, "step": 120 }, { "epoch": 0.01147548219093437, "grad_norm": 0.296875, "learning_rate": 0.00029999471559420324, "loss": 2.8948, "step": 130 }, { "epoch": 0.012358211590237013, "grad_norm": 0.357421875, "learning_rate": 0.00029999060554370847, "loss": 2.8636, "step": 140 }, { "epoch": 0.013240940989539656, "grad_norm": 0.3046875, "learning_rate": 0.00029998532124823267, "loss": 2.8381, "step": 150 }, { "epoch": 0.0141236703888423, "grad_norm": 0.255859375, "learning_rate": 0.00029997886274914557, "loss": 2.8316, "step": 160 }, { "epoch": 0.015006399788144944, "grad_norm": 0.337890625, "learning_rate": 0.0002999712300970092, "loss": 2.8113, "step": 170 }, { "epoch": 0.015889129187447587, "grad_norm": 0.259765625, "learning_rate": 0.0002999624233515781, "loss": 2.7975, "step": 180 }, { "epoch": 0.016771858586750232, "grad_norm": 0.255859375, "learning_rate": 0.00029995244258179844, "loss": 2.7885, "step": 190 }, { "epoch": 0.017654587986052874, "grad_norm": 0.25390625, "learning_rate": 0.00029994128786580765, "loss": 2.7802, "step": 200 }, { "epoch": 0.01853731738535552, "grad_norm": 0.236328125, "learning_rate": 0.0002999289592909335, "loss": 2.7829, "step": 210 }, { "epoch": 0.019420046784658165, "grad_norm": 0.234375, "learning_rate": 0.0002999154569536939, "loss": 2.7516, "step": 220 }, { "epoch": 0.020302776183960806, "grad_norm": 0.369140625, "learning_rate": 0.00029990078095979577, "loss": 2.7503, "step": 230 }, { "epoch": 0.02118550558326345, "grad_norm": 0.267578125, "learning_rate": 0.0002998849314241344, "loss": 2.7469, "step": 240 }, { "epoch": 0.022068234982566094, "grad_norm": 0.2412109375, "learning_rate": 0.0002998679084707926, "loss": 2.7378, "step": 250 }, { "epoch": 0.02295096438186874, "grad_norm": 0.255859375, "learning_rate": 0.00029984971223303935, "loss": 2.7339, "step": 260 }, { "epoch": 0.02383369378117138, "grad_norm": 0.234375, "learning_rate": 0.00029983034285332923, "loss": 2.7259, "step": 270 }, { "epoch": 0.024716423180474026, "grad_norm": 0.27734375, "learning_rate": 0.000299809800483301, "loss": 2.711, "step": 280 }, { "epoch": 0.02559915257977667, "grad_norm": 0.224609375, "learning_rate": 0.00029978808528377673, "loss": 2.7184, "step": 290 }, { "epoch": 0.026481881979079313, "grad_norm": 0.2197265625, "learning_rate": 0.00029976519742476, "loss": 2.7073, "step": 300 }, { "epoch": 0.027364611378381958, "grad_norm": 0.2578125, "learning_rate": 0.00029974113708543503, "loss": 2.6999, "step": 310 }, { "epoch": 0.0282473407776846, "grad_norm": 0.2412109375, "learning_rate": 0.00029971590445416525, "loss": 2.7097, "step": 320 }, { "epoch": 0.029130070176987245, "grad_norm": 0.255859375, "learning_rate": 0.0002996894997284917, "loss": 2.702, "step": 330 }, { "epoch": 0.030012799576289887, "grad_norm": 0.2431640625, "learning_rate": 0.0002996619231151313, "loss": 2.7177, "step": 340 }, { "epoch": 0.030895528975592532, "grad_norm": 0.27734375, "learning_rate": 0.0002996331748299757, "loss": 2.7185, "step": 350 }, { "epoch": 0.031778258374895174, "grad_norm": 0.244140625, "learning_rate": 0.00029960325509808904, "loss": 2.6929, "step": 360 }, { "epoch": 0.03266098777419782, "grad_norm": 0.296875, "learning_rate": 0.00029957216415370675, "loss": 2.7073, "step": 370 }, { "epoch": 0.033543717173500465, "grad_norm": 0.2451171875, "learning_rate": 0.0002995399022402333, "loss": 2.6878, "step": 380 }, { "epoch": 0.03442644657280311, "grad_norm": 0.2333984375, "learning_rate": 0.00029950646961024034, "loss": 2.6956, "step": 390 }, { "epoch": 0.03530917597210575, "grad_norm": 0.2470703125, "learning_rate": 0.00029947186652546504, "loss": 2.6676, "step": 400 }, { "epoch": 0.036191905371408394, "grad_norm": 0.2578125, "learning_rate": 0.00029943609325680764, "loss": 2.6881, "step": 410 }, { "epoch": 0.03707463477071104, "grad_norm": 0.220703125, "learning_rate": 0.00029939915008432946, "loss": 2.6751, "step": 420 }, { "epoch": 0.037957364170013684, "grad_norm": 0.20703125, "learning_rate": 0.00029936103729725106, "loss": 2.6753, "step": 430 }, { "epoch": 0.03884009356931633, "grad_norm": 0.2060546875, "learning_rate": 0.0002993217551939492, "loss": 2.6891, "step": 440 }, { "epoch": 0.03972282296861897, "grad_norm": 0.23046875, "learning_rate": 0.00029928130408195545, "loss": 2.6698, "step": 450 }, { "epoch": 0.04060555236792161, "grad_norm": 0.263671875, "learning_rate": 0.000299239684277953, "loss": 2.6608, "step": 460 }, { "epoch": 0.04148828176722426, "grad_norm": 0.302734375, "learning_rate": 0.0002991968961077745, "loss": 2.677, "step": 470 }, { "epoch": 0.0423710111665269, "grad_norm": 0.21484375, "learning_rate": 0.0002991529399063997, "loss": 2.6556, "step": 480 }, { "epoch": 0.04325374056582954, "grad_norm": 0.236328125, "learning_rate": 0.0002991078160179523, "loss": 2.6501, "step": 490 }, { "epoch": 0.04413646996513219, "grad_norm": 0.263671875, "learning_rate": 0.00029906152479569777, "loss": 2.6447, "step": 500 }, { "epoch": 0.04501919936443483, "grad_norm": 0.248046875, "learning_rate": 0.0002990140666020405, "loss": 2.659, "step": 510 }, { "epoch": 0.04590192876373748, "grad_norm": 0.31640625, "learning_rate": 0.0002989654418085207, "loss": 2.65, "step": 520 }, { "epoch": 0.04678465816304012, "grad_norm": 0.265625, "learning_rate": 0.0002989156507958117, "loss": 2.6618, "step": 530 }, { "epoch": 0.04766738756234276, "grad_norm": 0.271484375, "learning_rate": 0.00029886469395371673, "loss": 2.6524, "step": 540 }, { "epoch": 0.048550116961645406, "grad_norm": 0.265625, "learning_rate": 0.0002988125716811664, "loss": 2.6525, "step": 550 }, { "epoch": 0.04943284636094805, "grad_norm": 0.2236328125, "learning_rate": 0.00029875928438621494, "loss": 2.6404, "step": 560 }, { "epoch": 0.0503155757602507, "grad_norm": 0.2373046875, "learning_rate": 0.00029870483248603747, "loss": 2.6545, "step": 570 }, { "epoch": 0.05119830515955334, "grad_norm": 0.2578125, "learning_rate": 0.0002986492164069264, "loss": 2.6577, "step": 580 }, { "epoch": 0.05208103455885598, "grad_norm": 0.23828125, "learning_rate": 0.0002985924365842885, "loss": 2.6468, "step": 590 }, { "epoch": 0.052963763958158626, "grad_norm": 0.21484375, "learning_rate": 0.00029853449346264096, "loss": 2.6538, "step": 600 }, { "epoch": 0.05384649335746127, "grad_norm": 0.220703125, "learning_rate": 0.00029847538749560833, "loss": 2.6368, "step": 610 }, { "epoch": 0.054729222756763916, "grad_norm": 0.2392578125, "learning_rate": 0.00029841511914591883, "loss": 2.6418, "step": 620 }, { "epoch": 0.055611952156066555, "grad_norm": 0.328125, "learning_rate": 0.00029835368888540077, "loss": 2.6521, "step": 630 }, { "epoch": 0.0564946815553692, "grad_norm": 0.31640625, "learning_rate": 0.00029829109719497874, "loss": 2.6458, "step": 640 }, { "epoch": 0.057377410954671845, "grad_norm": 0.24609375, "learning_rate": 0.00029822734456466996, "loss": 2.6334, "step": 650 }, { "epoch": 0.05826014035397449, "grad_norm": 0.3203125, "learning_rate": 0.0002981624314935804, "loss": 2.6384, "step": 660 }, { "epoch": 0.059142869753277136, "grad_norm": 0.26953125, "learning_rate": 0.0002980963584899009, "loss": 2.6361, "step": 670 }, { "epoch": 0.060025599152579774, "grad_norm": 0.2451171875, "learning_rate": 0.000298029126070903, "loss": 2.6363, "step": 680 }, { "epoch": 0.06090832855188242, "grad_norm": 0.26171875, "learning_rate": 0.00029796073476293534, "loss": 2.6334, "step": 690 }, { "epoch": 0.061791057951185065, "grad_norm": 0.2265625, "learning_rate": 0.00029789118510141916, "loss": 2.6242, "step": 700 }, { "epoch": 0.0626737873504877, "grad_norm": 0.19140625, "learning_rate": 0.0002978204776308441, "loss": 2.6384, "step": 710 }, { "epoch": 0.06355651674979035, "grad_norm": 0.2392578125, "learning_rate": 0.0002977486129047641, "loss": 2.6218, "step": 720 }, { "epoch": 0.064439246149093, "grad_norm": 0.2119140625, "learning_rate": 0.000297675591485793, "loss": 2.6137, "step": 730 }, { "epoch": 0.06532197554839564, "grad_norm": 0.255859375, "learning_rate": 0.00029760141394560017, "loss": 2.6269, "step": 740 }, { "epoch": 0.06620470494769828, "grad_norm": 0.26171875, "learning_rate": 0.00029752608086490613, "loss": 2.6224, "step": 750 }, { "epoch": 0.06708743434700093, "grad_norm": 0.25390625, "learning_rate": 0.0002974495928334776, "loss": 2.6409, "step": 760 }, { "epoch": 0.06797016374630357, "grad_norm": 0.20703125, "learning_rate": 0.00029737195045012344, "loss": 2.6228, "step": 770 }, { "epoch": 0.06885289314560622, "grad_norm": 0.2265625, "learning_rate": 0.00029729315432268957, "loss": 2.6072, "step": 780 }, { "epoch": 0.06973562254490885, "grad_norm": 0.32421875, "learning_rate": 0.0002972132050680543, "loss": 2.6177, "step": 790 }, { "epoch": 0.0706183519442115, "grad_norm": 0.21484375, "learning_rate": 0.0002971321033121236, "loss": 2.6201, "step": 800 }, { "epoch": 0.07150108134351414, "grad_norm": 0.28515625, "learning_rate": 0.000297049849689826, "loss": 2.6251, "step": 810 }, { "epoch": 0.07238381074281679, "grad_norm": 0.216796875, "learning_rate": 0.000296966444845108, "loss": 2.6258, "step": 820 }, { "epoch": 0.07326654014211943, "grad_norm": 0.25, "learning_rate": 0.0002968818894309285, "loss": 2.6196, "step": 830 }, { "epoch": 0.07414926954142208, "grad_norm": 0.216796875, "learning_rate": 0.00029679618410925426, "loss": 2.6125, "step": 840 }, { "epoch": 0.07503199894072472, "grad_norm": 0.185546875, "learning_rate": 0.0002967093295510541, "loss": 2.6144, "step": 850 }, { "epoch": 0.07591472834002737, "grad_norm": 0.2080078125, "learning_rate": 0.00029662132643629423, "loss": 2.622, "step": 860 }, { "epoch": 0.07679745773933001, "grad_norm": 0.2177734375, "learning_rate": 0.00029653217545393254, "loss": 2.6145, "step": 870 }, { "epoch": 0.07768018713863266, "grad_norm": 0.26171875, "learning_rate": 0.00029644187730191334, "loss": 2.6166, "step": 880 }, { "epoch": 0.07856291653793529, "grad_norm": 0.251953125, "learning_rate": 0.00029635043268716194, "loss": 2.6096, "step": 890 }, { "epoch": 0.07944564593723794, "grad_norm": 0.2431640625, "learning_rate": 0.00029625784232557904, "loss": 2.6026, "step": 900 }, { "epoch": 0.08032837533654058, "grad_norm": 0.197265625, "learning_rate": 0.00029616410694203516, "loss": 2.6149, "step": 910 }, { "epoch": 0.08121110473584323, "grad_norm": 0.2119140625, "learning_rate": 0.000296069227270365, "loss": 2.6141, "step": 920 }, { "epoch": 0.08209383413514587, "grad_norm": 0.259765625, "learning_rate": 0.0002959732040533615, "loss": 2.5931, "step": 930 }, { "epoch": 0.08297656353444852, "grad_norm": 0.2451171875, "learning_rate": 0.00029587603804277036, "loss": 2.601, "step": 940 }, { "epoch": 0.08385929293375116, "grad_norm": 0.2265625, "learning_rate": 0.00029577772999928384, "loss": 2.6131, "step": 950 }, { "epoch": 0.0847420223330538, "grad_norm": 0.19921875, "learning_rate": 0.00029567828069253504, "loss": 2.6099, "step": 960 }, { "epoch": 0.08562475173235645, "grad_norm": 0.2265625, "learning_rate": 0.00029557769090109165, "loss": 2.6078, "step": 970 }, { "epoch": 0.08650748113165908, "grad_norm": 0.2021484375, "learning_rate": 0.0002954759614124501, "loss": 2.6069, "step": 980 }, { "epoch": 0.08739021053096173, "grad_norm": 0.2216796875, "learning_rate": 0.0002953730930230292, "loss": 2.5915, "step": 990 }, { "epoch": 0.08827293993026437, "grad_norm": 0.189453125, "learning_rate": 0.0002952690865381639, "loss": 2.6075, "step": 1000 }, { "epoch": 0.08915566932956702, "grad_norm": 0.201171875, "learning_rate": 0.0002951639427720993, "loss": 2.602, "step": 1010 }, { "epoch": 0.09003839872886966, "grad_norm": 0.259765625, "learning_rate": 0.00029505766254798375, "loss": 2.5977, "step": 1020 }, { "epoch": 0.09092112812817231, "grad_norm": 0.232421875, "learning_rate": 0.0002949502466978629, "loss": 2.6156, "step": 1030 }, { "epoch": 0.09180385752747496, "grad_norm": 0.2275390625, "learning_rate": 0.000294841696062673, "loss": 2.5994, "step": 1040 }, { "epoch": 0.0926865869267776, "grad_norm": 0.2021484375, "learning_rate": 0.00029473201149223395, "loss": 2.6145, "step": 1050 }, { "epoch": 0.09356931632608025, "grad_norm": 0.2734375, "learning_rate": 0.0002946211938452434, "loss": 2.6008, "step": 1060 }, { "epoch": 0.09445204572538288, "grad_norm": 0.19140625, "learning_rate": 0.00029450924398926947, "loss": 2.6069, "step": 1070 }, { "epoch": 0.09533477512468552, "grad_norm": 0.2197265625, "learning_rate": 0.00029439616280074407, "loss": 2.6012, "step": 1080 }, { "epoch": 0.09621750452398817, "grad_norm": 0.20703125, "learning_rate": 0.00029428195116495607, "loss": 2.595, "step": 1090 }, { "epoch": 0.09710023392329081, "grad_norm": 0.197265625, "learning_rate": 0.0002941666099760444, "loss": 2.5995, "step": 1100 }, { "epoch": 0.09798296332259346, "grad_norm": 0.2431640625, "learning_rate": 0.00029405014013699106, "loss": 2.5903, "step": 1110 }, { "epoch": 0.0988656927218961, "grad_norm": 0.201171875, "learning_rate": 0.00029393254255961394, "loss": 2.5939, "step": 1120 }, { "epoch": 0.09974842212119875, "grad_norm": 0.224609375, "learning_rate": 0.0002938138181645599, "loss": 2.5966, "step": 1130 }, { "epoch": 0.1006311515205014, "grad_norm": 0.23046875, "learning_rate": 0.00029369396788129717, "loss": 2.6042, "step": 1140 }, { "epoch": 0.10151388091980404, "grad_norm": 0.240234375, "learning_rate": 0.0002935729926481086, "loss": 2.6009, "step": 1150 }, { "epoch": 0.10239661031910668, "grad_norm": 0.1982421875, "learning_rate": 0.0002934508934120839, "loss": 2.6177, "step": 1160 }, { "epoch": 0.10327933971840932, "grad_norm": 0.2060546875, "learning_rate": 0.00029332767112911226, "loss": 2.584, "step": 1170 }, { "epoch": 0.10416206911771196, "grad_norm": 0.2177734375, "learning_rate": 0.00029320332676387515, "loss": 2.5933, "step": 1180 }, { "epoch": 0.1050447985170146, "grad_norm": 0.2216796875, "learning_rate": 0.0002930778612898386, "loss": 2.5986, "step": 1190 }, { "epoch": 0.10592752791631725, "grad_norm": 0.2109375, "learning_rate": 0.0002929512756892453, "loss": 2.6084, "step": 1200 }, { "epoch": 0.1068102573156199, "grad_norm": 0.3046875, "learning_rate": 0.0002928235709531075, "loss": 2.6091, "step": 1210 }, { "epoch": 0.10769298671492254, "grad_norm": 0.197265625, "learning_rate": 0.00029269474808119874, "loss": 2.5956, "step": 1220 }, { "epoch": 0.10857571611422519, "grad_norm": 0.2001953125, "learning_rate": 0.00029256480808204625, "loss": 2.5863, "step": 1230 }, { "epoch": 0.10945844551352783, "grad_norm": 0.201171875, "learning_rate": 0.00029243375197292304, "loss": 2.5991, "step": 1240 }, { "epoch": 0.11034117491283048, "grad_norm": 0.251953125, "learning_rate": 0.00029230158077983993, "loss": 2.5909, "step": 1250 }, { "epoch": 0.11122390431213311, "grad_norm": 0.1865234375, "learning_rate": 0.00029216829553753735, "loss": 2.6009, "step": 1260 }, { "epoch": 0.11210663371143575, "grad_norm": 0.2021484375, "learning_rate": 0.00029203389728947756, "loss": 2.5945, "step": 1270 }, { "epoch": 0.1129893631107384, "grad_norm": 0.21484375, "learning_rate": 0.00029189838708783625, "loss": 2.5818, "step": 1280 }, { "epoch": 0.11387209251004105, "grad_norm": 0.2197265625, "learning_rate": 0.00029176176599349436, "loss": 2.5944, "step": 1290 }, { "epoch": 0.11475482190934369, "grad_norm": 0.2021484375, "learning_rate": 0.00029162403507602974, "loss": 2.6008, "step": 1300 }, { "epoch": 0.11563755130864634, "grad_norm": 0.2138671875, "learning_rate": 0.0002914851954137089, "loss": 2.5845, "step": 1310 }, { "epoch": 0.11652028070794898, "grad_norm": 0.189453125, "learning_rate": 0.0002913452480934784, "loss": 2.5952, "step": 1320 }, { "epoch": 0.11740301010725163, "grad_norm": 0.224609375, "learning_rate": 0.00029120419421095644, "loss": 2.5878, "step": 1330 }, { "epoch": 0.11828573950655427, "grad_norm": 0.248046875, "learning_rate": 0.00029106203487042423, "loss": 2.5944, "step": 1340 }, { "epoch": 0.1191684689058569, "grad_norm": 0.212890625, "learning_rate": 0.0002909187711848175, "loss": 2.594, "step": 1350 }, { "epoch": 0.12005119830515955, "grad_norm": 0.1953125, "learning_rate": 0.0002907744042757175, "loss": 2.5729, "step": 1360 }, { "epoch": 0.1209339277044622, "grad_norm": 0.2099609375, "learning_rate": 0.0002906289352733426, "loss": 2.5743, "step": 1370 }, { "epoch": 0.12181665710376484, "grad_norm": 0.1884765625, "learning_rate": 0.000290482365316539, "loss": 2.5864, "step": 1380 }, { "epoch": 0.12269938650306748, "grad_norm": 0.2099609375, "learning_rate": 0.0002903346955527721, "loss": 2.5768, "step": 1390 }, { "epoch": 0.12358211590237013, "grad_norm": 0.205078125, "learning_rate": 0.0002901859271381176, "loss": 2.5854, "step": 1400 }, { "epoch": 0.12446484530167277, "grad_norm": 0.2333984375, "learning_rate": 0.00029003606123725217, "loss": 2.5755, "step": 1410 }, { "epoch": 0.1253475747009754, "grad_norm": 0.2255859375, "learning_rate": 0.0002898850990234445, "loss": 2.5743, "step": 1420 }, { "epoch": 0.12623030410027805, "grad_norm": 0.2021484375, "learning_rate": 0.00028973304167854606, "loss": 2.5667, "step": 1430 }, { "epoch": 0.1271130334995807, "grad_norm": 0.1943359375, "learning_rate": 0.000289579890392982, "loss": 2.584, "step": 1440 }, { "epoch": 0.12799576289888334, "grad_norm": 0.2294921875, "learning_rate": 0.0002894256463657414, "loss": 2.577, "step": 1450 }, { "epoch": 0.128878492298186, "grad_norm": 0.216796875, "learning_rate": 0.0002892703108043686, "loss": 2.5847, "step": 1460 }, { "epoch": 0.12976122169748863, "grad_norm": 0.224609375, "learning_rate": 0.00028911388492495305, "loss": 2.5716, "step": 1470 }, { "epoch": 0.13064395109679128, "grad_norm": 0.2119140625, "learning_rate": 0.00028895636995212003, "loss": 2.5861, "step": 1480 }, { "epoch": 0.13152668049609392, "grad_norm": 0.236328125, "learning_rate": 0.0002887977671190214, "loss": 2.592, "step": 1490 }, { "epoch": 0.13240940989539657, "grad_norm": 0.2109375, "learning_rate": 0.00028863807766732534, "loss": 2.5767, "step": 1500 }, { "epoch": 0.1332921392946992, "grad_norm": 0.2451171875, "learning_rate": 0.0002884773028472071, "loss": 2.5837, "step": 1510 }, { "epoch": 0.13417486869400186, "grad_norm": 0.2158203125, "learning_rate": 0.00028831544391733896, "loss": 2.5806, "step": 1520 }, { "epoch": 0.1350575980933045, "grad_norm": 0.2353515625, "learning_rate": 0.00028815250214488065, "loss": 2.579, "step": 1530 }, { "epoch": 0.13594032749260715, "grad_norm": 0.228515625, "learning_rate": 0.00028798847880546894, "loss": 2.5785, "step": 1540 }, { "epoch": 0.1368230568919098, "grad_norm": 0.1923828125, "learning_rate": 0.0002878233751832083, "loss": 2.5841, "step": 1550 }, { "epoch": 0.13770578629121244, "grad_norm": 0.201171875, "learning_rate": 0.0002876571925706603, "loss": 2.5767, "step": 1560 }, { "epoch": 0.13858851569051509, "grad_norm": 0.1904296875, "learning_rate": 0.0002874899322688337, "loss": 2.5816, "step": 1570 }, { "epoch": 0.1394712450898177, "grad_norm": 0.23046875, "learning_rate": 0.00028732159558717436, "loss": 2.5691, "step": 1580 }, { "epoch": 0.14035397448912035, "grad_norm": 0.2060546875, "learning_rate": 0.0002871521838435548, "loss": 2.5808, "step": 1590 }, { "epoch": 0.141236703888423, "grad_norm": 0.1943359375, "learning_rate": 0.0002869816983642641, "loss": 2.5759, "step": 1600 }, { "epoch": 0.14211943328772564, "grad_norm": 0.2119140625, "learning_rate": 0.0002868101404839972, "loss": 2.5673, "step": 1610 }, { "epoch": 0.14300216268702828, "grad_norm": 0.197265625, "learning_rate": 0.00028663751154584476, "loss": 2.5736, "step": 1620 }, { "epoch": 0.14388489208633093, "grad_norm": 0.1953125, "learning_rate": 0.0002864638129012826, "loss": 2.5819, "step": 1630 }, { "epoch": 0.14476762148563357, "grad_norm": 0.21484375, "learning_rate": 0.00028628904591016094, "loss": 2.5765, "step": 1640 }, { "epoch": 0.14565035088493622, "grad_norm": 0.2314453125, "learning_rate": 0.0002861132119406939, "loss": 2.567, "step": 1650 }, { "epoch": 0.14653308028423886, "grad_norm": 0.171875, "learning_rate": 0.00028593631236944865, "loss": 2.5735, "step": 1660 }, { "epoch": 0.1474158096835415, "grad_norm": 0.20703125, "learning_rate": 0.00028575834858133486, "loss": 2.5832, "step": 1670 }, { "epoch": 0.14829853908284416, "grad_norm": 0.2119140625, "learning_rate": 0.0002855793219695937, "loss": 2.5597, "step": 1680 }, { "epoch": 0.1491812684821468, "grad_norm": 0.20703125, "learning_rate": 0.000285399233935787, "loss": 2.5769, "step": 1690 }, { "epoch": 0.15006399788144945, "grad_norm": 0.185546875, "learning_rate": 0.00028521808588978607, "loss": 2.5776, "step": 1700 }, { "epoch": 0.1509467272807521, "grad_norm": 0.2734375, "learning_rate": 0.00028503587924976105, "loss": 2.5782, "step": 1710 }, { "epoch": 0.15182945668005474, "grad_norm": 0.22265625, "learning_rate": 0.0002848526154421695, "loss": 2.5749, "step": 1720 }, { "epoch": 0.15271218607935738, "grad_norm": 0.189453125, "learning_rate": 0.00028466829590174524, "loss": 2.5722, "step": 1730 }, { "epoch": 0.15359491547866003, "grad_norm": 0.2080078125, "learning_rate": 0.0002844829220714873, "loss": 2.5749, "step": 1740 }, { "epoch": 0.15447764487796267, "grad_norm": 0.1865234375, "learning_rate": 0.00028429649540264866, "loss": 2.5647, "step": 1750 }, { "epoch": 0.15536037427726532, "grad_norm": 0.1953125, "learning_rate": 0.0002841090173547244, "loss": 2.5661, "step": 1760 }, { "epoch": 0.15624310367656793, "grad_norm": 0.220703125, "learning_rate": 0.00028392048939544084, "loss": 2.5821, "step": 1770 }, { "epoch": 0.15712583307587058, "grad_norm": 0.21484375, "learning_rate": 0.0002837309130007439, "loss": 2.5808, "step": 1780 }, { "epoch": 0.15800856247517323, "grad_norm": 0.193359375, "learning_rate": 0.0002835402896547874, "loss": 2.5691, "step": 1790 }, { "epoch": 0.15889129187447587, "grad_norm": 0.1884765625, "learning_rate": 0.0002833486208499215, "loss": 2.5636, "step": 1800 }, { "epoch": 0.15977402127377852, "grad_norm": 0.21484375, "learning_rate": 0.000283155908086681, "loss": 2.5706, "step": 1810 }, { "epoch": 0.16065675067308116, "grad_norm": 0.1875, "learning_rate": 0.0002829621528737738, "loss": 2.5569, "step": 1820 }, { "epoch": 0.1615394800723838, "grad_norm": 0.212890625, "learning_rate": 0.00028276735672806865, "loss": 2.5797, "step": 1830 }, { "epoch": 0.16242220947168645, "grad_norm": 0.185546875, "learning_rate": 0.00028257152117458387, "loss": 2.5718, "step": 1840 }, { "epoch": 0.1633049388709891, "grad_norm": 0.189453125, "learning_rate": 0.0002823746477464748, "loss": 2.5654, "step": 1850 }, { "epoch": 0.16418766827029174, "grad_norm": 0.208984375, "learning_rate": 0.0002821767379850223, "loss": 2.5679, "step": 1860 }, { "epoch": 0.1650703976695944, "grad_norm": 0.1875, "learning_rate": 0.0002819777934396203, "loss": 2.5694, "step": 1870 }, { "epoch": 0.16595312706889703, "grad_norm": 0.1904296875, "learning_rate": 0.00028177781566776395, "loss": 2.5782, "step": 1880 }, { "epoch": 0.16683585646819968, "grad_norm": 0.2265625, "learning_rate": 0.00028157680623503724, "loss": 2.5762, "step": 1890 }, { "epoch": 0.16771858586750232, "grad_norm": 0.212890625, "learning_rate": 0.00028137476671510084, "loss": 2.5743, "step": 1900 }, { "epoch": 0.16860131526680497, "grad_norm": 0.2177734375, "learning_rate": 0.00028117169868967977, "loss": 2.5807, "step": 1910 }, { "epoch": 0.1694840446661076, "grad_norm": 0.1943359375, "learning_rate": 0.000280967603748551, "loss": 2.5736, "step": 1920 }, { "epoch": 0.17036677406541026, "grad_norm": 0.18359375, "learning_rate": 0.000280762483489531, "loss": 2.5675, "step": 1930 }, { "epoch": 0.1712495034647129, "grad_norm": 0.21875, "learning_rate": 0.00028055633951846326, "loss": 2.5704, "step": 1940 }, { "epoch": 0.17213223286401555, "grad_norm": 0.2216796875, "learning_rate": 0.0002803491734492056, "loss": 2.571, "step": 1950 }, { "epoch": 0.17301496226331817, "grad_norm": 0.1845703125, "learning_rate": 0.0002801409869036178, "loss": 2.5622, "step": 1960 }, { "epoch": 0.1738976916626208, "grad_norm": 0.1845703125, "learning_rate": 0.00027993178151154865, "loss": 2.5657, "step": 1970 }, { "epoch": 0.17478042106192346, "grad_norm": 0.203125, "learning_rate": 0.0002797215589108232, "loss": 2.5666, "step": 1980 }, { "epoch": 0.1756631504612261, "grad_norm": 0.1748046875, "learning_rate": 0.0002795103207472301, "loss": 2.5691, "step": 1990 }, { "epoch": 0.17654587986052875, "grad_norm": 0.19140625, "learning_rate": 0.00027929806867450866, "loss": 2.5884, "step": 2000 }, { "epoch": 0.17654587986052875, "eval_accuracy": 0.5014283410509826, "eval_loss": 2.450904607772827, "eval_runtime": 7.0015, "eval_samples_per_second": 45.419, "eval_steps_per_second": 0.428, "step": 2000 }, { "epoch": 0.1774286092598314, "grad_norm": 0.2109375, "learning_rate": 0.00027908480435433577, "loss": 2.5587, "step": 2010 }, { "epoch": 0.17831133865913404, "grad_norm": 0.1806640625, "learning_rate": 0.0002788705294563131, "loss": 2.5719, "step": 2020 }, { "epoch": 0.17919406805843668, "grad_norm": 0.2041015625, "learning_rate": 0.0002786552456579539, "loss": 2.5651, "step": 2030 }, { "epoch": 0.18007679745773933, "grad_norm": 0.1748046875, "learning_rate": 0.00027843895464466973, "loss": 2.5538, "step": 2040 }, { "epoch": 0.18095952685704197, "grad_norm": 0.185546875, "learning_rate": 0.0002782216581097576, "loss": 2.5673, "step": 2050 }, { "epoch": 0.18184225625634462, "grad_norm": 0.2314453125, "learning_rate": 0.00027800335775438653, "loss": 2.568, "step": 2060 }, { "epoch": 0.18272498565564727, "grad_norm": 0.263671875, "learning_rate": 0.00027778405528758424, "loss": 2.5629, "step": 2070 }, { "epoch": 0.1836077150549499, "grad_norm": 0.18359375, "learning_rate": 0.0002775637524262236, "loss": 2.5767, "step": 2080 }, { "epoch": 0.18449044445425256, "grad_norm": 0.203125, "learning_rate": 0.0002773424508950096, "loss": 2.5524, "step": 2090 }, { "epoch": 0.1853731738535552, "grad_norm": 0.18359375, "learning_rate": 0.0002771201524264655, "loss": 2.5509, "step": 2100 }, { "epoch": 0.18625590325285785, "grad_norm": 0.185546875, "learning_rate": 0.00027689685876091925, "loss": 2.5622, "step": 2110 }, { "epoch": 0.1871386326521605, "grad_norm": 0.1806640625, "learning_rate": 0.00027667257164649023, "loss": 2.5658, "step": 2120 }, { "epoch": 0.18802136205146314, "grad_norm": 0.1806640625, "learning_rate": 0.0002764472928390751, "loss": 2.5648, "step": 2130 }, { "epoch": 0.18890409145076575, "grad_norm": 0.193359375, "learning_rate": 0.00027622102410233455, "loss": 2.5679, "step": 2140 }, { "epoch": 0.1897868208500684, "grad_norm": 0.201171875, "learning_rate": 0.0002759937672076789, "loss": 2.5514, "step": 2150 }, { "epoch": 0.19066955024937104, "grad_norm": 0.212890625, "learning_rate": 0.0002757655239342547, "loss": 2.5534, "step": 2160 }, { "epoch": 0.1915522796486737, "grad_norm": 0.1953125, "learning_rate": 0.0002755362960689306, "loss": 2.5418, "step": 2170 }, { "epoch": 0.19243500904797634, "grad_norm": 0.185546875, "learning_rate": 0.00027530608540628367, "loss": 2.5542, "step": 2180 }, { "epoch": 0.19331773844727898, "grad_norm": 0.21484375, "learning_rate": 0.0002750748937485847, "loss": 2.5784, "step": 2190 }, { "epoch": 0.19420046784658163, "grad_norm": 0.212890625, "learning_rate": 0.00027484272290578474, "loss": 2.5653, "step": 2200 }, { "epoch": 0.19508319724588427, "grad_norm": 0.1796875, "learning_rate": 0.0002746095746955006, "loss": 2.5491, "step": 2210 }, { "epoch": 0.19596592664518692, "grad_norm": 0.169921875, "learning_rate": 0.0002743754509430007, "loss": 2.5606, "step": 2220 }, { "epoch": 0.19684865604448956, "grad_norm": 0.1943359375, "learning_rate": 0.00027414035348119074, "loss": 2.5639, "step": 2230 }, { "epoch": 0.1977313854437922, "grad_norm": 0.2060546875, "learning_rate": 0.0002739042841505995, "loss": 2.552, "step": 2240 }, { "epoch": 0.19861411484309485, "grad_norm": 0.21484375, "learning_rate": 0.00027366724479936416, "loss": 2.5671, "step": 2250 }, { "epoch": 0.1994968442423975, "grad_norm": 0.1796875, "learning_rate": 0.00027342923728321613, "loss": 2.5547, "step": 2260 }, { "epoch": 0.20037957364170014, "grad_norm": 0.1796875, "learning_rate": 0.00027319026346546627, "loss": 2.5668, "step": 2270 }, { "epoch": 0.2012623030410028, "grad_norm": 0.2060546875, "learning_rate": 0.0002729503252169905, "loss": 2.5504, "step": 2280 }, { "epoch": 0.20214503244030543, "grad_norm": 0.20703125, "learning_rate": 0.000272709424416215, "loss": 2.5664, "step": 2290 }, { "epoch": 0.20302776183960808, "grad_norm": 0.177734375, "learning_rate": 0.0002724675629491016, "loss": 2.5774, "step": 2300 }, { "epoch": 0.20391049123891072, "grad_norm": 0.171875, "learning_rate": 0.00027222474270913297, "loss": 2.5482, "step": 2310 }, { "epoch": 0.20479322063821337, "grad_norm": 0.1904296875, "learning_rate": 0.0002719809655972979, "loss": 2.5424, "step": 2320 }, { "epoch": 0.205675950037516, "grad_norm": 0.2373046875, "learning_rate": 0.00027173623352207604, "loss": 2.5627, "step": 2330 }, { "epoch": 0.20655867943681863, "grad_norm": 0.2412109375, "learning_rate": 0.00027149054839942374, "loss": 2.5661, "step": 2340 }, { "epoch": 0.20744140883612128, "grad_norm": 0.19140625, "learning_rate": 0.000271243912152758, "loss": 2.5523, "step": 2350 }, { "epoch": 0.20832413823542392, "grad_norm": 0.1748046875, "learning_rate": 0.0002709963267129425, "loss": 2.5527, "step": 2360 }, { "epoch": 0.20920686763472657, "grad_norm": 0.216796875, "learning_rate": 0.0002707477940182716, "loss": 2.5511, "step": 2370 }, { "epoch": 0.2100895970340292, "grad_norm": 0.19921875, "learning_rate": 0.0002704983160144556, "loss": 2.5587, "step": 2380 }, { "epoch": 0.21097232643333186, "grad_norm": 0.2138671875, "learning_rate": 0.00027024789465460543, "loss": 2.5609, "step": 2390 }, { "epoch": 0.2118550558326345, "grad_norm": 0.2021484375, "learning_rate": 0.0002699965318992174, "loss": 2.5588, "step": 2400 }, { "epoch": 0.21273778523193715, "grad_norm": 0.220703125, "learning_rate": 0.0002697442297161578, "loss": 2.5684, "step": 2410 }, { "epoch": 0.2136205146312398, "grad_norm": 0.1728515625, "learning_rate": 0.0002694909900806475, "loss": 2.5606, "step": 2420 }, { "epoch": 0.21450324403054244, "grad_norm": 0.1796875, "learning_rate": 0.0002692368149752464, "loss": 2.5661, "step": 2430 }, { "epoch": 0.21538597342984508, "grad_norm": 0.19921875, "learning_rate": 0.00026898170638983823, "loss": 2.5629, "step": 2440 }, { "epoch": 0.21626870282914773, "grad_norm": 0.181640625, "learning_rate": 0.00026872566632161435, "loss": 2.5618, "step": 2450 }, { "epoch": 0.21715143222845038, "grad_norm": 0.201171875, "learning_rate": 0.00026846869677505893, "loss": 2.5612, "step": 2460 }, { "epoch": 0.21803416162775302, "grad_norm": 0.2021484375, "learning_rate": 0.00026821079976193247, "loss": 2.5578, "step": 2470 }, { "epoch": 0.21891689102705567, "grad_norm": 0.1796875, "learning_rate": 0.0002679519773012568, "loss": 2.56, "step": 2480 }, { "epoch": 0.2197996204263583, "grad_norm": 0.169921875, "learning_rate": 0.0002676922314192984, "loss": 2.5413, "step": 2490 }, { "epoch": 0.22068234982566096, "grad_norm": 0.169921875, "learning_rate": 0.00026743156414955346, "loss": 2.5612, "step": 2500 }, { "epoch": 0.22156507922496357, "grad_norm": 0.2021484375, "learning_rate": 0.0002671699775327313, "loss": 2.5619, "step": 2510 }, { "epoch": 0.22244780862426622, "grad_norm": 0.1845703125, "learning_rate": 0.00026690747361673864, "loss": 2.5591, "step": 2520 }, { "epoch": 0.22333053802356886, "grad_norm": 0.1826171875, "learning_rate": 0.0002666440544566636, "loss": 2.5541, "step": 2530 }, { "epoch": 0.2242132674228715, "grad_norm": 0.21484375, "learning_rate": 0.00026637972211475945, "loss": 2.5504, "step": 2540 }, { "epoch": 0.22509599682217415, "grad_norm": 0.205078125, "learning_rate": 0.0002661144786604287, "loss": 2.5427, "step": 2550 }, { "epoch": 0.2259787262214768, "grad_norm": 0.18359375, "learning_rate": 0.00026584832617020663, "loss": 2.5589, "step": 2560 }, { "epoch": 0.22686145562077945, "grad_norm": 0.1904296875, "learning_rate": 0.00026558126672774534, "loss": 2.5529, "step": 2570 }, { "epoch": 0.2277441850200821, "grad_norm": 0.1796875, "learning_rate": 0.00026531330242379714, "loss": 2.548, "step": 2580 }, { "epoch": 0.22862691441938474, "grad_norm": 0.1845703125, "learning_rate": 0.0002650444353561983, "loss": 2.5455, "step": 2590 }, { "epoch": 0.22950964381868738, "grad_norm": 0.193359375, "learning_rate": 0.00026477466762985267, "loss": 2.5613, "step": 2600 }, { "epoch": 0.23039237321799003, "grad_norm": 0.2041015625, "learning_rate": 0.00026450400135671524, "loss": 2.5431, "step": 2610 }, { "epoch": 0.23127510261729267, "grad_norm": 0.1728515625, "learning_rate": 0.0002642324386557755, "loss": 2.5413, "step": 2620 }, { "epoch": 0.23215783201659532, "grad_norm": 0.2216796875, "learning_rate": 0.0002639599816530407, "loss": 2.5392, "step": 2630 }, { "epoch": 0.23304056141589796, "grad_norm": 0.2138671875, "learning_rate": 0.0002636866324815198, "loss": 2.5554, "step": 2640 }, { "epoch": 0.2339232908152006, "grad_norm": 0.2060546875, "learning_rate": 0.00026341239328120586, "loss": 2.5491, "step": 2650 }, { "epoch": 0.23480602021450325, "grad_norm": 0.1875, "learning_rate": 0.00026313726619906023, "loss": 2.5433, "step": 2660 }, { "epoch": 0.2356887496138059, "grad_norm": 0.1943359375, "learning_rate": 0.000262861253388995, "loss": 2.5371, "step": 2670 }, { "epoch": 0.23657147901310854, "grad_norm": 0.1875, "learning_rate": 0.00026258435701185655, "loss": 2.5585, "step": 2680 }, { "epoch": 0.2374542084124112, "grad_norm": 0.1962890625, "learning_rate": 0.00026230657923540857, "loss": 2.5527, "step": 2690 }, { "epoch": 0.2383369378117138, "grad_norm": 0.1943359375, "learning_rate": 0.000262027922234315, "loss": 2.5459, "step": 2700 }, { "epoch": 0.23921966721101645, "grad_norm": 0.19140625, "learning_rate": 0.000261748388190123, "loss": 2.5508, "step": 2710 }, { "epoch": 0.2401023966103191, "grad_norm": 0.1875, "learning_rate": 0.00026146797929124595, "loss": 2.5471, "step": 2720 }, { "epoch": 0.24098512600962174, "grad_norm": 0.23828125, "learning_rate": 0.0002611866977329463, "loss": 2.5546, "step": 2730 }, { "epoch": 0.2418678554089244, "grad_norm": 0.173828125, "learning_rate": 0.0002609045457173184, "loss": 2.5562, "step": 2740 }, { "epoch": 0.24275058480822703, "grad_norm": 0.205078125, "learning_rate": 0.00026062152545327106, "loss": 2.5354, "step": 2750 }, { "epoch": 0.24363331420752968, "grad_norm": 0.1708984375, "learning_rate": 0.0002603376391565106, "loss": 2.5458, "step": 2760 }, { "epoch": 0.24451604360683232, "grad_norm": 0.1826171875, "learning_rate": 0.0002600528890495234, "loss": 2.5283, "step": 2770 }, { "epoch": 0.24539877300613497, "grad_norm": 0.2001953125, "learning_rate": 0.00025976727736155814, "loss": 2.5406, "step": 2780 }, { "epoch": 0.2462815024054376, "grad_norm": 0.1767578125, "learning_rate": 0.00025948080632860884, "loss": 2.5512, "step": 2790 }, { "epoch": 0.24716423180474026, "grad_norm": 0.2021484375, "learning_rate": 0.00025919347819339716, "loss": 2.5382, "step": 2800 }, { "epoch": 0.2480469612040429, "grad_norm": 0.1806640625, "learning_rate": 0.00025890529520535477, "loss": 2.5547, "step": 2810 }, { "epoch": 0.24892969060334555, "grad_norm": 0.1884765625, "learning_rate": 0.0002586162596206058, "loss": 2.5542, "step": 2820 }, { "epoch": 0.2498124200026482, "grad_norm": 0.1826171875, "learning_rate": 0.0002583263737019492, "loss": 2.5453, "step": 2830 }, { "epoch": 0.2506951494019508, "grad_norm": 0.2060546875, "learning_rate": 0.000258035639718841, "loss": 2.5432, "step": 2840 }, { "epoch": 0.2515778788012535, "grad_norm": 0.2138671875, "learning_rate": 0.0002577440599473766, "loss": 2.5252, "step": 2850 }, { "epoch": 0.2524606082005561, "grad_norm": 0.1748046875, "learning_rate": 0.0002574516366702728, "loss": 2.5437, "step": 2860 }, { "epoch": 0.2533433375998588, "grad_norm": 0.1796875, "learning_rate": 0.0002571583721768501, "loss": 2.5481, "step": 2870 }, { "epoch": 0.2542260669991614, "grad_norm": 0.193359375, "learning_rate": 0.00025686426876301465, "loss": 2.5392, "step": 2880 }, { "epoch": 0.25510879639846407, "grad_norm": 0.2177734375, "learning_rate": 0.0002565693287312405, "loss": 2.5692, "step": 2890 }, { "epoch": 0.2559915257977667, "grad_norm": 0.181640625, "learning_rate": 0.0002562735543905511, "loss": 2.5467, "step": 2900 }, { "epoch": 0.25687425519706936, "grad_norm": 0.1982421875, "learning_rate": 0.0002559769480565019, "loss": 2.5405, "step": 2910 }, { "epoch": 0.257756984596372, "grad_norm": 0.2216796875, "learning_rate": 0.0002556795120511615, "loss": 2.5607, "step": 2920 }, { "epoch": 0.25863971399567465, "grad_norm": 0.2275390625, "learning_rate": 0.000255381248703094, "loss": 2.5327, "step": 2930 }, { "epoch": 0.25952244339497726, "grad_norm": 0.19140625, "learning_rate": 0.0002550821603473406, "loss": 2.5582, "step": 2940 }, { "epoch": 0.26040517279427994, "grad_norm": 0.2265625, "learning_rate": 0.0002547822493254013, "loss": 2.5611, "step": 2950 }, { "epoch": 0.26128790219358256, "grad_norm": 0.197265625, "learning_rate": 0.00025448151798521665, "loss": 2.5681, "step": 2960 }, { "epoch": 0.26217063159288523, "grad_norm": 0.1796875, "learning_rate": 0.00025417996868114907, "loss": 2.5525, "step": 2970 }, { "epoch": 0.26305336099218785, "grad_norm": 0.173828125, "learning_rate": 0.0002538776037739648, "loss": 2.5575, "step": 2980 }, { "epoch": 0.26393609039149046, "grad_norm": 0.177734375, "learning_rate": 0.0002535744256308153, "loss": 2.5481, "step": 2990 }, { "epoch": 0.26481881979079314, "grad_norm": 0.2138671875, "learning_rate": 0.0002532704366252187, "loss": 2.5546, "step": 3000 }, { "epoch": 0.26570154919009575, "grad_norm": 0.1904296875, "learning_rate": 0.0002529656391370411, "loss": 2.5452, "step": 3010 }, { "epoch": 0.2665842785893984, "grad_norm": 0.1591796875, "learning_rate": 0.00025266003555247805, "loss": 2.548, "step": 3020 }, { "epoch": 0.26746700798870104, "grad_norm": 0.173828125, "learning_rate": 0.00025235362826403593, "loss": 2.5447, "step": 3030 }, { "epoch": 0.2683497373880037, "grad_norm": 0.216796875, "learning_rate": 0.0002520464196705131, "loss": 2.5373, "step": 3040 }, { "epoch": 0.26923246678730633, "grad_norm": 0.2177734375, "learning_rate": 0.0002517384121769812, "loss": 2.5332, "step": 3050 }, { "epoch": 0.270115196186609, "grad_norm": 0.189453125, "learning_rate": 0.00025142960819476626, "loss": 2.5469, "step": 3060 }, { "epoch": 0.2709979255859116, "grad_norm": 0.185546875, "learning_rate": 0.0002511200101414298, "loss": 2.5334, "step": 3070 }, { "epoch": 0.2718806549852143, "grad_norm": 0.1943359375, "learning_rate": 0.00025080962044075016, "loss": 2.5389, "step": 3080 }, { "epoch": 0.2727633843845169, "grad_norm": 0.162109375, "learning_rate": 0.0002504984415227031, "loss": 2.5429, "step": 3090 }, { "epoch": 0.2736461137838196, "grad_norm": 0.181640625, "learning_rate": 0.0002501864758234431, "loss": 2.5542, "step": 3100 }, { "epoch": 0.2745288431831222, "grad_norm": 0.22265625, "learning_rate": 0.00024987372578528415, "loss": 2.5463, "step": 3110 }, { "epoch": 0.2754115725824249, "grad_norm": 0.1962890625, "learning_rate": 0.0002495601938566807, "loss": 2.5675, "step": 3120 }, { "epoch": 0.2762943019817275, "grad_norm": 0.2001953125, "learning_rate": 0.00024924588249220837, "loss": 2.557, "step": 3130 }, { "epoch": 0.27717703138103017, "grad_norm": 0.1767578125, "learning_rate": 0.000248930794152545, "loss": 2.5594, "step": 3140 }, { "epoch": 0.2780597607803328, "grad_norm": 0.189453125, "learning_rate": 0.000248614931304451, "loss": 2.5471, "step": 3150 }, { "epoch": 0.2789424901796354, "grad_norm": 0.1865234375, "learning_rate": 0.0002482982964207504, "loss": 2.5515, "step": 3160 }, { "epoch": 0.2798252195789381, "grad_norm": 0.1953125, "learning_rate": 0.00024798089198031125, "loss": 2.5543, "step": 3170 }, { "epoch": 0.2807079489782407, "grad_norm": 0.203125, "learning_rate": 0.0002476627204680263, "loss": 2.5321, "step": 3180 }, { "epoch": 0.28159067837754337, "grad_norm": 0.2138671875, "learning_rate": 0.00024734378437479354, "loss": 2.5528, "step": 3190 }, { "epoch": 0.282473407776846, "grad_norm": 0.201171875, "learning_rate": 0.00024702408619749677, "loss": 2.5484, "step": 3200 }, { "epoch": 0.28335613717614866, "grad_norm": 0.1650390625, "learning_rate": 0.00024670362843898594, "loss": 2.5348, "step": 3210 }, { "epoch": 0.2842388665754513, "grad_norm": 0.1728515625, "learning_rate": 0.0002463824136080576, "loss": 2.5435, "step": 3220 }, { "epoch": 0.28512159597475395, "grad_norm": 0.25, "learning_rate": 0.00024606044421943526, "loss": 2.5475, "step": 3230 }, { "epoch": 0.28600432537405657, "grad_norm": 0.1728515625, "learning_rate": 0.0002457377227937497, "loss": 2.5331, "step": 3240 }, { "epoch": 0.28688705477335924, "grad_norm": 0.1953125, "learning_rate": 0.00024541425185751933, "loss": 2.5478, "step": 3250 }, { "epoch": 0.28776978417266186, "grad_norm": 0.1904296875, "learning_rate": 0.00024509003394313015, "loss": 2.5469, "step": 3260 }, { "epoch": 0.28865251357196453, "grad_norm": 0.19921875, "learning_rate": 0.0002447650715888162, "loss": 2.5386, "step": 3270 }, { "epoch": 0.28953524297126715, "grad_norm": 0.1904296875, "learning_rate": 0.0002444393673386396, "loss": 2.5583, "step": 3280 }, { "epoch": 0.2904179723705698, "grad_norm": 0.18359375, "learning_rate": 0.00024411292374247056, "loss": 2.5491, "step": 3290 }, { "epoch": 0.29130070176987244, "grad_norm": 0.205078125, "learning_rate": 0.00024378574335596752, "loss": 2.5411, "step": 3300 }, { "epoch": 0.2921834311691751, "grad_norm": 0.2119140625, "learning_rate": 0.0002434578287405571, "loss": 2.539, "step": 3310 }, { "epoch": 0.29306616056847773, "grad_norm": 0.2099609375, "learning_rate": 0.00024312918246341403, "loss": 2.5616, "step": 3320 }, { "epoch": 0.2939488899677804, "grad_norm": 0.232421875, "learning_rate": 0.00024279980709744097, "loss": 2.5462, "step": 3330 }, { "epoch": 0.294831619367083, "grad_norm": 0.158203125, "learning_rate": 0.00024246970522124865, "loss": 2.5321, "step": 3340 }, { "epoch": 0.29571434876638564, "grad_norm": 0.1748046875, "learning_rate": 0.00024213887941913536, "loss": 2.5558, "step": 3350 }, { "epoch": 0.2965970781656883, "grad_norm": 0.2236328125, "learning_rate": 0.0002418073322810669, "loss": 2.5422, "step": 3360 }, { "epoch": 0.29747980756499093, "grad_norm": 0.171875, "learning_rate": 0.00024147506640265624, "loss": 2.5369, "step": 3370 }, { "epoch": 0.2983625369642936, "grad_norm": 0.2099609375, "learning_rate": 0.00024114208438514323, "loss": 2.5426, "step": 3380 }, { "epoch": 0.2992452663635962, "grad_norm": 0.234375, "learning_rate": 0.00024080838883537426, "loss": 2.5395, "step": 3390 }, { "epoch": 0.3001279957628989, "grad_norm": 0.203125, "learning_rate": 0.00024047398236578174, "loss": 2.5378, "step": 3400 }, { "epoch": 0.3010107251622015, "grad_norm": 0.21875, "learning_rate": 0.0002401388675943637, "loss": 2.5268, "step": 3410 }, { "epoch": 0.3018934545615042, "grad_norm": 0.201171875, "learning_rate": 0.00023980304714466355, "loss": 2.559, "step": 3420 }, { "epoch": 0.3027761839608068, "grad_norm": 0.177734375, "learning_rate": 0.00023946652364574894, "loss": 2.5379, "step": 3430 }, { "epoch": 0.3036589133601095, "grad_norm": 0.1748046875, "learning_rate": 0.00023912929973219187, "loss": 2.5556, "step": 3440 }, { "epoch": 0.3045416427594121, "grad_norm": 0.193359375, "learning_rate": 0.00023879137804404753, "loss": 2.5366, "step": 3450 }, { "epoch": 0.30542437215871476, "grad_norm": 0.283203125, "learning_rate": 0.00023845276122683395, "loss": 2.5434, "step": 3460 }, { "epoch": 0.3063071015580174, "grad_norm": 0.1708984375, "learning_rate": 0.0002381134519315111, "loss": 2.5513, "step": 3470 }, { "epoch": 0.30718983095732005, "grad_norm": 0.1875, "learning_rate": 0.00023777345281446031, "loss": 2.5486, "step": 3480 }, { "epoch": 0.30807256035662267, "grad_norm": 0.181640625, "learning_rate": 0.0002374327665374634, "loss": 2.5416, "step": 3490 }, { "epoch": 0.30895528975592534, "grad_norm": 0.181640625, "learning_rate": 0.00023709139576768155, "loss": 2.5415, "step": 3500 }, { "epoch": 0.30983801915522796, "grad_norm": 0.1845703125, "learning_rate": 0.0002367493431776351, "loss": 2.5516, "step": 3510 }, { "epoch": 0.31072074855453063, "grad_norm": 0.2197265625, "learning_rate": 0.0002364066114451819, "loss": 2.5443, "step": 3520 }, { "epoch": 0.31160347795383325, "grad_norm": 0.193359375, "learning_rate": 0.0002360632032534968, "loss": 2.5425, "step": 3530 }, { "epoch": 0.31248620735313587, "grad_norm": 0.18359375, "learning_rate": 0.00023571912129105048, "loss": 2.5264, "step": 3540 }, { "epoch": 0.31336893675243854, "grad_norm": 0.189453125, "learning_rate": 0.00023537436825158843, "loss": 2.5505, "step": 3550 }, { "epoch": 0.31425166615174116, "grad_norm": 0.162109375, "learning_rate": 0.00023502894683410992, "loss": 2.5457, "step": 3560 }, { "epoch": 0.31513439555104383, "grad_norm": 0.1806640625, "learning_rate": 0.00023468285974284663, "loss": 2.5616, "step": 3570 }, { "epoch": 0.31601712495034645, "grad_norm": 0.220703125, "learning_rate": 0.00023433610968724187, "loss": 2.5352, "step": 3580 }, { "epoch": 0.3168998543496491, "grad_norm": 0.197265625, "learning_rate": 0.00023398869938192908, "loss": 2.5473, "step": 3590 }, { "epoch": 0.31778258374895174, "grad_norm": 0.1787109375, "learning_rate": 0.00023364063154671062, "loss": 2.5546, "step": 3600 }, { "epoch": 0.3186653131482544, "grad_norm": 0.17578125, "learning_rate": 0.00023329190890653657, "loss": 2.5485, "step": 3610 }, { "epoch": 0.31954804254755703, "grad_norm": 0.1982421875, "learning_rate": 0.00023294253419148324, "loss": 2.5376, "step": 3620 }, { "epoch": 0.3204307719468597, "grad_norm": 0.201171875, "learning_rate": 0.0002325925101367321, "loss": 2.5541, "step": 3630 }, { "epoch": 0.3213135013461623, "grad_norm": 0.1943359375, "learning_rate": 0.0002322418394825479, "loss": 2.5515, "step": 3640 }, { "epoch": 0.322196230745465, "grad_norm": 0.1923828125, "learning_rate": 0.00023189052497425767, "loss": 2.5502, "step": 3650 }, { "epoch": 0.3230789601447676, "grad_norm": 0.181640625, "learning_rate": 0.00023153856936222907, "loss": 2.5482, "step": 3660 }, { "epoch": 0.3239616895440703, "grad_norm": 0.1875, "learning_rate": 0.0002311859754018486, "loss": 2.5283, "step": 3670 }, { "epoch": 0.3248444189433729, "grad_norm": 0.17578125, "learning_rate": 0.0002308327458535005, "loss": 2.5387, "step": 3680 }, { "epoch": 0.3257271483426756, "grad_norm": 0.177734375, "learning_rate": 0.00023047888348254474, "loss": 2.5549, "step": 3690 }, { "epoch": 0.3266098777419782, "grad_norm": 0.1845703125, "learning_rate": 0.00023012439105929563, "loss": 2.5566, "step": 3700 }, { "epoch": 0.32749260714128087, "grad_norm": 0.181640625, "learning_rate": 0.0002297692713589999, "loss": 2.5382, "step": 3710 }, { "epoch": 0.3283753365405835, "grad_norm": 0.2099609375, "learning_rate": 0.00022941352716181524, "loss": 2.5433, "step": 3720 }, { "epoch": 0.3292580659398861, "grad_norm": 0.197265625, "learning_rate": 0.0002290571612527883, "loss": 2.5474, "step": 3730 }, { "epoch": 0.3301407953391888, "grad_norm": 0.16796875, "learning_rate": 0.00022870017642183298, "loss": 2.5526, "step": 3740 }, { "epoch": 0.3310235247384914, "grad_norm": 0.177734375, "learning_rate": 0.0002283425754637087, "loss": 2.5395, "step": 3750 }, { "epoch": 0.33190625413779407, "grad_norm": 0.1845703125, "learning_rate": 0.00022798436117799832, "loss": 2.5495, "step": 3760 }, { "epoch": 0.3327889835370967, "grad_norm": 0.2021484375, "learning_rate": 0.00022762553636908626, "loss": 2.5303, "step": 3770 }, { "epoch": 0.33367171293639936, "grad_norm": 0.1748046875, "learning_rate": 0.00022726610384613666, "loss": 2.541, "step": 3780 }, { "epoch": 0.334554442335702, "grad_norm": 0.17578125, "learning_rate": 0.0002269060664230714, "loss": 2.5394, "step": 3790 }, { "epoch": 0.33543717173500465, "grad_norm": 0.1630859375, "learning_rate": 0.0002265454269185479, "loss": 2.5437, "step": 3800 }, { "epoch": 0.33631990113430726, "grad_norm": 0.1650390625, "learning_rate": 0.00022618418815593722, "loss": 2.5423, "step": 3810 }, { "epoch": 0.33720263053360994, "grad_norm": 0.177734375, "learning_rate": 0.00022582235296330177, "loss": 2.5298, "step": 3820 }, { "epoch": 0.33808535993291255, "grad_norm": 0.171875, "learning_rate": 0.00022545992417337328, "loss": 2.5458, "step": 3830 }, { "epoch": 0.3389680893322152, "grad_norm": 0.1650390625, "learning_rate": 0.00022509690462353088, "loss": 2.5317, "step": 3840 }, { "epoch": 0.33985081873151785, "grad_norm": 0.1640625, "learning_rate": 0.00022473329715577838, "loss": 2.538, "step": 3850 }, { "epoch": 0.3407335481308205, "grad_norm": 0.197265625, "learning_rate": 0.00022436910461672236, "loss": 2.5374, "step": 3860 }, { "epoch": 0.34161627753012314, "grad_norm": 0.173828125, "learning_rate": 0.00022400432985754978, "loss": 2.5162, "step": 3870 }, { "epoch": 0.3424990069294258, "grad_norm": 0.203125, "learning_rate": 0.0002236389757340057, "loss": 2.5432, "step": 3880 }, { "epoch": 0.3433817363287284, "grad_norm": 0.1728515625, "learning_rate": 0.00022327304510637094, "loss": 2.5295, "step": 3890 }, { "epoch": 0.3442644657280311, "grad_norm": 0.1748046875, "learning_rate": 0.00022290654083943951, "loss": 2.5381, "step": 3900 }, { "epoch": 0.3451471951273337, "grad_norm": 0.1796875, "learning_rate": 0.00022253946580249654, "loss": 2.5432, "step": 3910 }, { "epoch": 0.34602992452663633, "grad_norm": 0.17578125, "learning_rate": 0.0002221718228692954, "loss": 2.5492, "step": 3920 }, { "epoch": 0.346912653925939, "grad_norm": 0.1875, "learning_rate": 0.0002218036149180355, "loss": 2.5457, "step": 3930 }, { "epoch": 0.3477953833252416, "grad_norm": 0.19140625, "learning_rate": 0.00022143484483133966, "loss": 2.5473, "step": 3940 }, { "epoch": 0.3486781127245443, "grad_norm": 0.189453125, "learning_rate": 0.00022106551549623145, "loss": 2.5255, "step": 3950 }, { "epoch": 0.3495608421238469, "grad_norm": 0.1962890625, "learning_rate": 0.0002206956298041128, "loss": 2.5427, "step": 3960 }, { "epoch": 0.3504435715231496, "grad_norm": 0.181640625, "learning_rate": 0.00022032519065074118, "loss": 2.5373, "step": 3970 }, { "epoch": 0.3513263009224522, "grad_norm": 0.1533203125, "learning_rate": 0.000219954200936207, "loss": 2.5375, "step": 3980 }, { "epoch": 0.3522090303217549, "grad_norm": 0.1962890625, "learning_rate": 0.00021958266356491088, "loss": 2.5426, "step": 3990 }, { "epoch": 0.3530917597210575, "grad_norm": 0.162109375, "learning_rate": 0.00021921058144554092, "loss": 2.5525, "step": 4000 }, { "epoch": 0.3530917597210575, "eval_accuracy": 0.5046183027315103, "eval_loss": 2.4229321479797363, "eval_runtime": 6.9255, "eval_samples_per_second": 45.917, "eval_steps_per_second": 0.433, "step": 4000 }, { "epoch": 0.35397448912036017, "grad_norm": 0.185546875, "learning_rate": 0.0002188379574910501, "loss": 2.5311, "step": 4010 }, { "epoch": 0.3548572185196628, "grad_norm": 0.1787109375, "learning_rate": 0.000218464794618633, "loss": 2.5558, "step": 4020 }, { "epoch": 0.35573994791896546, "grad_norm": 0.1875, "learning_rate": 0.00021809109574970356, "loss": 2.5385, "step": 4030 }, { "epoch": 0.3566226773182681, "grad_norm": 0.1728515625, "learning_rate": 0.00021771686380987176, "loss": 2.5378, "step": 4040 }, { "epoch": 0.35750540671757075, "grad_norm": 0.1806640625, "learning_rate": 0.00021734210172892106, "loss": 2.5417, "step": 4050 }, { "epoch": 0.35838813611687337, "grad_norm": 0.1865234375, "learning_rate": 0.000216966812440785, "loss": 2.5301, "step": 4060 }, { "epoch": 0.35927086551617604, "grad_norm": 0.216796875, "learning_rate": 0.00021659099888352473, "loss": 2.5325, "step": 4070 }, { "epoch": 0.36015359491547866, "grad_norm": 0.1796875, "learning_rate": 0.00021621466399930582, "loss": 2.5278, "step": 4080 }, { "epoch": 0.3610363243147813, "grad_norm": 0.1689453125, "learning_rate": 0.000215837810734375, "loss": 2.5298, "step": 4090 }, { "epoch": 0.36191905371408395, "grad_norm": 0.16015625, "learning_rate": 0.00021546044203903756, "loss": 2.5342, "step": 4100 }, { "epoch": 0.36280178311338657, "grad_norm": 0.1865234375, "learning_rate": 0.00021508256086763368, "loss": 2.5279, "step": 4110 }, { "epoch": 0.36368451251268924, "grad_norm": 0.1923828125, "learning_rate": 0.0002147041701785159, "loss": 2.5285, "step": 4120 }, { "epoch": 0.36456724191199186, "grad_norm": 0.1962890625, "learning_rate": 0.00021432527293402544, "loss": 2.5222, "step": 4130 }, { "epoch": 0.36544997131129453, "grad_norm": 0.2021484375, "learning_rate": 0.00021394587210046937, "loss": 2.5563, "step": 4140 }, { "epoch": 0.36633270071059715, "grad_norm": 0.1982421875, "learning_rate": 0.00021356597064809725, "loss": 2.5445, "step": 4150 }, { "epoch": 0.3672154301098998, "grad_norm": 0.1748046875, "learning_rate": 0.00021318557155107772, "loss": 2.5354, "step": 4160 }, { "epoch": 0.36809815950920244, "grad_norm": 0.177734375, "learning_rate": 0.00021280467778747562, "loss": 2.5291, "step": 4170 }, { "epoch": 0.3689808889085051, "grad_norm": 0.1953125, "learning_rate": 0.00021242329233922823, "loss": 2.5239, "step": 4180 }, { "epoch": 0.36986361830780773, "grad_norm": 0.1826171875, "learning_rate": 0.00021204141819212231, "loss": 2.5343, "step": 4190 }, { "epoch": 0.3707463477071104, "grad_norm": 0.1630859375, "learning_rate": 0.00021165905833577035, "loss": 2.5446, "step": 4200 }, { "epoch": 0.371629077106413, "grad_norm": 0.1748046875, "learning_rate": 0.00021127621576358743, "loss": 2.5395, "step": 4210 }, { "epoch": 0.3725118065057157, "grad_norm": 0.181640625, "learning_rate": 0.0002108928934727678, "loss": 2.5307, "step": 4220 }, { "epoch": 0.3733945359050183, "grad_norm": 0.17578125, "learning_rate": 0.00021050909446426117, "loss": 2.5278, "step": 4230 }, { "epoch": 0.374277265304321, "grad_norm": 0.1884765625, "learning_rate": 0.00021012482174274954, "loss": 2.5436, "step": 4240 }, { "epoch": 0.3751599947036236, "grad_norm": 0.1708984375, "learning_rate": 0.00020974007831662332, "loss": 2.5537, "step": 4250 }, { "epoch": 0.3760427241029263, "grad_norm": 0.171875, "learning_rate": 0.0002093548671979581, "loss": 2.5315, "step": 4260 }, { "epoch": 0.3769254535022289, "grad_norm": 0.181640625, "learning_rate": 0.00020896919140249093, "loss": 2.526, "step": 4270 }, { "epoch": 0.3778081829015315, "grad_norm": 0.2216796875, "learning_rate": 0.00020858305394959672, "loss": 2.5516, "step": 4280 }, { "epoch": 0.3786909123008342, "grad_norm": 0.24609375, "learning_rate": 0.00020819645786226458, "loss": 2.5443, "step": 4290 }, { "epoch": 0.3795736417001368, "grad_norm": 0.1884765625, "learning_rate": 0.00020780940616707406, "loss": 2.5357, "step": 4300 }, { "epoch": 0.38045637109943947, "grad_norm": 0.2578125, "learning_rate": 0.00020742190189417174, "loss": 2.5324, "step": 4310 }, { "epoch": 0.3813391004987421, "grad_norm": 0.193359375, "learning_rate": 0.00020703394807724726, "loss": 2.5454, "step": 4320 }, { "epoch": 0.38222182989804476, "grad_norm": 0.171875, "learning_rate": 0.0002066455477535097, "loss": 2.534, "step": 4330 }, { "epoch": 0.3831045592973474, "grad_norm": 0.171875, "learning_rate": 0.00020625670396366366, "loss": 2.527, "step": 4340 }, { "epoch": 0.38398728869665005, "grad_norm": 0.1708984375, "learning_rate": 0.00020586741975188558, "loss": 2.5386, "step": 4350 }, { "epoch": 0.38487001809595267, "grad_norm": 0.1728515625, "learning_rate": 0.00020547769816579988, "loss": 2.5253, "step": 4360 }, { "epoch": 0.38575274749525534, "grad_norm": 0.1826171875, "learning_rate": 0.00020508754225645513, "loss": 2.5281, "step": 4370 }, { "epoch": 0.38663547689455796, "grad_norm": 0.1669921875, "learning_rate": 0.00020469695507830004, "loss": 2.5577, "step": 4380 }, { "epoch": 0.38751820629386063, "grad_norm": 0.1787109375, "learning_rate": 0.0002043059396891597, "loss": 2.5449, "step": 4390 }, { "epoch": 0.38840093569316325, "grad_norm": 0.181640625, "learning_rate": 0.0002039144991502116, "loss": 2.5293, "step": 4400 }, { "epoch": 0.3892836650924659, "grad_norm": 0.18359375, "learning_rate": 0.00020352263652596148, "loss": 2.5442, "step": 4410 }, { "epoch": 0.39016639449176854, "grad_norm": 0.173828125, "learning_rate": 0.00020313035488421963, "loss": 2.5386, "step": 4420 }, { "epoch": 0.3910491238910712, "grad_norm": 0.1884765625, "learning_rate": 0.00020273765729607684, "loss": 2.531, "step": 4430 }, { "epoch": 0.39193185329037383, "grad_norm": 0.17578125, "learning_rate": 0.00020234454683588002, "loss": 2.5262, "step": 4440 }, { "epoch": 0.3928145826896765, "grad_norm": 0.177734375, "learning_rate": 0.0002019510265812085, "loss": 2.5253, "step": 4450 }, { "epoch": 0.3936973120889791, "grad_norm": 0.1796875, "learning_rate": 0.00020155709961284978, "loss": 2.542, "step": 4460 }, { "epoch": 0.39458004148828174, "grad_norm": 0.177734375, "learning_rate": 0.00020116276901477548, "loss": 2.5432, "step": 4470 }, { "epoch": 0.3954627708875844, "grad_norm": 0.1591796875, "learning_rate": 0.00020076803787411716, "loss": 2.5355, "step": 4480 }, { "epoch": 0.39634550028688703, "grad_norm": 0.177734375, "learning_rate": 0.00020037290928114196, "loss": 2.5305, "step": 4490 }, { "epoch": 0.3972282296861897, "grad_norm": 0.1640625, "learning_rate": 0.00019997738632922886, "loss": 2.5239, "step": 4500 }, { "epoch": 0.3981109590854923, "grad_norm": 0.1689453125, "learning_rate": 0.00019958147211484402, "loss": 2.5346, "step": 4510 }, { "epoch": 0.398993688484795, "grad_norm": 0.1787109375, "learning_rate": 0.00019918516973751674, "loss": 2.5404, "step": 4520 }, { "epoch": 0.3998764178840976, "grad_norm": 0.189453125, "learning_rate": 0.00019878848229981522, "loss": 2.548, "step": 4530 }, { "epoch": 0.4007591472834003, "grad_norm": 0.1923828125, "learning_rate": 0.00019839141290732215, "loss": 2.5577, "step": 4540 }, { "epoch": 0.4016418766827029, "grad_norm": 0.18359375, "learning_rate": 0.0001979939646686105, "loss": 2.5369, "step": 4550 }, { "epoch": 0.4025246060820056, "grad_norm": 0.1884765625, "learning_rate": 0.00019759614069521914, "loss": 2.5296, "step": 4560 }, { "epoch": 0.4034073354813082, "grad_norm": 0.17578125, "learning_rate": 0.00019719794410162853, "loss": 2.5171, "step": 4570 }, { "epoch": 0.40429006488061087, "grad_norm": 0.1865234375, "learning_rate": 0.00019679937800523624, "loss": 2.5355, "step": 4580 }, { "epoch": 0.4051727942799135, "grad_norm": 0.1669921875, "learning_rate": 0.00019640044552633257, "loss": 2.5519, "step": 4590 }, { "epoch": 0.40605552367921616, "grad_norm": 0.166015625, "learning_rate": 0.00019600114978807618, "loss": 2.5492, "step": 4600 }, { "epoch": 0.4069382530785188, "grad_norm": 0.1884765625, "learning_rate": 0.00019560149391646976, "loss": 2.5349, "step": 4610 }, { "epoch": 0.40782098247782145, "grad_norm": 0.1962890625, "learning_rate": 0.00019520148104033513, "loss": 2.5222, "step": 4620 }, { "epoch": 0.40870371187712407, "grad_norm": 0.1650390625, "learning_rate": 0.00019480111429128922, "loss": 2.533, "step": 4630 }, { "epoch": 0.40958644127642674, "grad_norm": 0.1865234375, "learning_rate": 0.00019440039680371934, "loss": 2.5432, "step": 4640 }, { "epoch": 0.41046917067572936, "grad_norm": 0.1806640625, "learning_rate": 0.0001939993317147586, "loss": 2.5401, "step": 4650 }, { "epoch": 0.411351900075032, "grad_norm": 0.1953125, "learning_rate": 0.0001935979221642614, "loss": 2.5376, "step": 4660 }, { "epoch": 0.41223462947433465, "grad_norm": 0.1787109375, "learning_rate": 0.00019319617129477898, "loss": 2.5401, "step": 4670 }, { "epoch": 0.41311735887363726, "grad_norm": 0.1767578125, "learning_rate": 0.00019279408225153453, "loss": 2.5348, "step": 4680 }, { "epoch": 0.41400008827293994, "grad_norm": 0.189453125, "learning_rate": 0.0001923916581823988, "loss": 2.5317, "step": 4690 }, { "epoch": 0.41488281767224255, "grad_norm": 0.1796875, "learning_rate": 0.00019198890223786542, "loss": 2.5391, "step": 4700 }, { "epoch": 0.4157655470715452, "grad_norm": 0.1748046875, "learning_rate": 0.00019158581757102618, "loss": 2.5272, "step": 4710 }, { "epoch": 0.41664827647084784, "grad_norm": 0.2158203125, "learning_rate": 0.00019118240733754642, "loss": 2.5249, "step": 4720 }, { "epoch": 0.4175310058701505, "grad_norm": 0.203125, "learning_rate": 0.0001907786746956401, "loss": 2.5279, "step": 4730 }, { "epoch": 0.41841373526945314, "grad_norm": 0.18359375, "learning_rate": 0.00019037462280604544, "loss": 2.5277, "step": 4740 }, { "epoch": 0.4192964646687558, "grad_norm": 0.19921875, "learning_rate": 0.00018997025483199987, "loss": 2.5395, "step": 4750 }, { "epoch": 0.4201791940680584, "grad_norm": 0.169921875, "learning_rate": 0.00018956557393921545, "loss": 2.532, "step": 4760 }, { "epoch": 0.4210619234673611, "grad_norm": 0.1640625, "learning_rate": 0.000189160583295854, "loss": 2.5303, "step": 4770 }, { "epoch": 0.4219446528666637, "grad_norm": 0.166015625, "learning_rate": 0.00018875528607250225, "loss": 2.5089, "step": 4780 }, { "epoch": 0.4228273822659664, "grad_norm": 0.181640625, "learning_rate": 0.00018834968544214717, "loss": 2.5267, "step": 4790 }, { "epoch": 0.423710111665269, "grad_norm": 0.173828125, "learning_rate": 0.00018794378458015094, "loss": 2.538, "step": 4800 }, { "epoch": 0.4245928410645717, "grad_norm": 0.1630859375, "learning_rate": 0.0001875375866642263, "loss": 2.5192, "step": 4810 }, { "epoch": 0.4254755704638743, "grad_norm": 0.1708984375, "learning_rate": 0.0001871310948744115, "loss": 2.5364, "step": 4820 }, { "epoch": 0.42635829986317697, "grad_norm": 0.1767578125, "learning_rate": 0.0001867243123930454, "loss": 2.531, "step": 4830 }, { "epoch": 0.4272410292624796, "grad_norm": 0.1796875, "learning_rate": 0.00018631724240474276, "loss": 2.506, "step": 4840 }, { "epoch": 0.4281237586617822, "grad_norm": 0.1748046875, "learning_rate": 0.00018590988809636918, "loss": 2.5338, "step": 4850 }, { "epoch": 0.4290064880610849, "grad_norm": 0.1689453125, "learning_rate": 0.00018550225265701596, "loss": 2.5341, "step": 4860 }, { "epoch": 0.4298892174603875, "grad_norm": 0.1728515625, "learning_rate": 0.0001850943392779755, "loss": 2.5228, "step": 4870 }, { "epoch": 0.43077194685969017, "grad_norm": 0.166015625, "learning_rate": 0.00018468615115271598, "loss": 2.5103, "step": 4880 }, { "epoch": 0.4316546762589928, "grad_norm": 0.1796875, "learning_rate": 0.0001842776914768567, "loss": 2.5357, "step": 4890 }, { "epoch": 0.43253740565829546, "grad_norm": 0.1748046875, "learning_rate": 0.00018386896344814265, "loss": 2.548, "step": 4900 }, { "epoch": 0.4334201350575981, "grad_norm": 0.1640625, "learning_rate": 0.0001834599702664199, "loss": 2.5254, "step": 4910 }, { "epoch": 0.43430286445690075, "grad_norm": 0.177734375, "learning_rate": 0.0001830507151336102, "loss": 2.544, "step": 4920 }, { "epoch": 0.43518559385620337, "grad_norm": 0.1708984375, "learning_rate": 0.00018264120125368615, "loss": 2.5329, "step": 4930 }, { "epoch": 0.43606832325550604, "grad_norm": 0.169921875, "learning_rate": 0.000182231431832646, "loss": 2.5335, "step": 4940 }, { "epoch": 0.43695105265480866, "grad_norm": 0.1787109375, "learning_rate": 0.0001818214100784885, "loss": 2.5235, "step": 4950 }, { "epoch": 0.43783378205411133, "grad_norm": 0.197265625, "learning_rate": 0.0001814111392011881, "loss": 2.5306, "step": 4960 }, { "epoch": 0.43871651145341395, "grad_norm": 0.162109375, "learning_rate": 0.00018100062241266924, "loss": 2.5374, "step": 4970 }, { "epoch": 0.4395992408527166, "grad_norm": 0.193359375, "learning_rate": 0.00018058986292678178, "loss": 2.5261, "step": 4980 }, { "epoch": 0.44048197025201924, "grad_norm": 0.17578125, "learning_rate": 0.00018017886395927564, "loss": 2.5266, "step": 4990 }, { "epoch": 0.4413646996513219, "grad_norm": 0.1708984375, "learning_rate": 0.00017976762872777538, "loss": 2.5378, "step": 5000 }, { "epoch": 0.44224742905062453, "grad_norm": 0.1611328125, "learning_rate": 0.00017935616045175543, "loss": 2.5254, "step": 5010 }, { "epoch": 0.44313015844992715, "grad_norm": 0.173828125, "learning_rate": 0.00017894446235251456, "loss": 2.536, "step": 5020 }, { "epoch": 0.4440128878492298, "grad_norm": 0.2099609375, "learning_rate": 0.00017853253765315083, "loss": 2.5239, "step": 5030 }, { "epoch": 0.44489561724853244, "grad_norm": 0.197265625, "learning_rate": 0.00017812038957853624, "loss": 2.5375, "step": 5040 }, { "epoch": 0.4457783466478351, "grad_norm": 0.16796875, "learning_rate": 0.00017770802135529164, "loss": 2.5422, "step": 5050 }, { "epoch": 0.44666107604713773, "grad_norm": 0.1669921875, "learning_rate": 0.00017729543621176132, "loss": 2.5285, "step": 5060 }, { "epoch": 0.4475438054464404, "grad_norm": 0.173828125, "learning_rate": 0.00017688263737798777, "loss": 2.539, "step": 5070 }, { "epoch": 0.448426534845743, "grad_norm": 0.1865234375, "learning_rate": 0.00017646962808568652, "loss": 2.5253, "step": 5080 }, { "epoch": 0.4493092642450457, "grad_norm": 0.17578125, "learning_rate": 0.00017605641156822057, "loss": 2.5269, "step": 5090 }, { "epoch": 0.4501919936443483, "grad_norm": 0.1806640625, "learning_rate": 0.0001756429910605754, "loss": 2.5228, "step": 5100 }, { "epoch": 0.451074723043651, "grad_norm": 0.169921875, "learning_rate": 0.00017522936979933346, "loss": 2.5403, "step": 5110 }, { "epoch": 0.4519574524429536, "grad_norm": 0.1865234375, "learning_rate": 0.00017481555102264872, "loss": 2.5417, "step": 5120 }, { "epoch": 0.4528401818422563, "grad_norm": 0.1689453125, "learning_rate": 0.00017440153797022165, "loss": 2.5392, "step": 5130 }, { "epoch": 0.4537229112415589, "grad_norm": 0.158203125, "learning_rate": 0.00017398733388327347, "loss": 2.5263, "step": 5140 }, { "epoch": 0.45460564064086156, "grad_norm": 0.1611328125, "learning_rate": 0.00017357294200452122, "loss": 2.5223, "step": 5150 }, { "epoch": 0.4554883700401642, "grad_norm": 0.1748046875, "learning_rate": 0.00017315836557815184, "loss": 2.5349, "step": 5160 }, { "epoch": 0.45637109943946685, "grad_norm": 0.1630859375, "learning_rate": 0.00017274360784979727, "loss": 2.5298, "step": 5170 }, { "epoch": 0.45725382883876947, "grad_norm": 0.17578125, "learning_rate": 0.0001723286720665087, "loss": 2.5291, "step": 5180 }, { "epoch": 0.45813655823807214, "grad_norm": 0.1923828125, "learning_rate": 0.00017191356147673134, "loss": 2.5413, "step": 5190 }, { "epoch": 0.45901928763737476, "grad_norm": 0.169921875, "learning_rate": 0.00017149827933027892, "loss": 2.5209, "step": 5200 }, { "epoch": 0.4599020170366774, "grad_norm": 0.20703125, "learning_rate": 0.00017108282887830817, "loss": 2.5262, "step": 5210 }, { "epoch": 0.46078474643598005, "grad_norm": 0.1982421875, "learning_rate": 0.00017066721337329356, "loss": 2.5253, "step": 5220 }, { "epoch": 0.46166747583528267, "grad_norm": 0.162109375, "learning_rate": 0.00017025143606900166, "loss": 2.5258, "step": 5230 }, { "epoch": 0.46255020523458534, "grad_norm": 0.185546875, "learning_rate": 0.00016983550022046581, "loss": 2.5387, "step": 5240 }, { "epoch": 0.46343293463388796, "grad_norm": 0.248046875, "learning_rate": 0.00016941940908396042, "loss": 2.5208, "step": 5250 }, { "epoch": 0.46431566403319063, "grad_norm": 0.1796875, "learning_rate": 0.00016900316591697572, "loss": 2.5355, "step": 5260 }, { "epoch": 0.46519839343249325, "grad_norm": 0.1728515625, "learning_rate": 0.00016858677397819214, "loss": 2.5429, "step": 5270 }, { "epoch": 0.4660811228317959, "grad_norm": 0.185546875, "learning_rate": 0.0001681702365274548, "loss": 2.5402, "step": 5280 }, { "epoch": 0.46696385223109854, "grad_norm": 0.1650390625, "learning_rate": 0.00016775355682574803, "loss": 2.525, "step": 5290 }, { "epoch": 0.4678465816304012, "grad_norm": 0.171875, "learning_rate": 0.0001673367381351698, "loss": 2.5408, "step": 5300 }, { "epoch": 0.46872931102970383, "grad_norm": 0.1640625, "learning_rate": 0.00016691978371890612, "loss": 2.5219, "step": 5310 }, { "epoch": 0.4696120404290065, "grad_norm": 0.1591796875, "learning_rate": 0.00016650269684120566, "loss": 2.5402, "step": 5320 }, { "epoch": 0.4704947698283091, "grad_norm": 0.171875, "learning_rate": 0.0001660854807673541, "loss": 2.5348, "step": 5330 }, { "epoch": 0.4713774992276118, "grad_norm": 0.1689453125, "learning_rate": 0.00016566813876364858, "loss": 2.5265, "step": 5340 }, { "epoch": 0.4722602286269144, "grad_norm": 0.1826171875, "learning_rate": 0.00016525067409737195, "loss": 2.5164, "step": 5350 }, { "epoch": 0.4731429580262171, "grad_norm": 0.1640625, "learning_rate": 0.00016483309003676765, "loss": 2.5358, "step": 5360 }, { "epoch": 0.4740256874255197, "grad_norm": 0.1591796875, "learning_rate": 0.00016441538985101354, "loss": 2.5194, "step": 5370 }, { "epoch": 0.4749084168248224, "grad_norm": 0.16015625, "learning_rate": 0.00016399757681019686, "loss": 2.5226, "step": 5380 }, { "epoch": 0.475791146224125, "grad_norm": 0.1767578125, "learning_rate": 0.0001635796541852882, "loss": 2.5317, "step": 5390 }, { "epoch": 0.4766738756234276, "grad_norm": 0.1845703125, "learning_rate": 0.00016316162524811605, "loss": 2.5398, "step": 5400 }, { "epoch": 0.4775566050227303, "grad_norm": 0.169921875, "learning_rate": 0.00016274349327134128, "loss": 2.5291, "step": 5410 }, { "epoch": 0.4784393344220329, "grad_norm": 0.2041015625, "learning_rate": 0.0001623252615284314, "loss": 2.5319, "step": 5420 }, { "epoch": 0.4793220638213356, "grad_norm": 0.1669921875, "learning_rate": 0.00016190693329363505, "loss": 2.5225, "step": 5430 }, { "epoch": 0.4802047932206382, "grad_norm": 0.17578125, "learning_rate": 0.00016148851184195616, "loss": 2.5129, "step": 5440 }, { "epoch": 0.48108752261994087, "grad_norm": 0.1669921875, "learning_rate": 0.0001610700004491285, "loss": 2.5394, "step": 5450 }, { "epoch": 0.4819702520192435, "grad_norm": 0.1689453125, "learning_rate": 0.00016065140239158987, "loss": 2.5318, "step": 5460 }, { "epoch": 0.48285298141854616, "grad_norm": 0.158203125, "learning_rate": 0.00016023272094645673, "loss": 2.5254, "step": 5470 }, { "epoch": 0.4837357108178488, "grad_norm": 0.1708984375, "learning_rate": 0.00015981395939149823, "loss": 2.5249, "step": 5480 }, { "epoch": 0.48461844021715145, "grad_norm": 0.1630859375, "learning_rate": 0.00015939512100511077, "loss": 2.5411, "step": 5490 }, { "epoch": 0.48550116961645406, "grad_norm": 0.1611328125, "learning_rate": 0.00015897620906629218, "loss": 2.5302, "step": 5500 }, { "epoch": 0.48638389901575674, "grad_norm": 0.1640625, "learning_rate": 0.00015855722685461607, "loss": 2.5185, "step": 5510 }, { "epoch": 0.48726662841505936, "grad_norm": 0.171875, "learning_rate": 0.00015813817765020636, "loss": 2.5357, "step": 5520 }, { "epoch": 0.48814935781436203, "grad_norm": 0.1650390625, "learning_rate": 0.0001577190647337113, "loss": 2.533, "step": 5530 }, { "epoch": 0.48903208721366465, "grad_norm": 0.1630859375, "learning_rate": 0.000157299891386278, "loss": 2.5261, "step": 5540 }, { "epoch": 0.4899148166129673, "grad_norm": 0.162109375, "learning_rate": 0.0001568806608895266, "loss": 2.5369, "step": 5550 }, { "epoch": 0.49079754601226994, "grad_norm": 0.166015625, "learning_rate": 0.00015646137652552473, "loss": 2.5495, "step": 5560 }, { "epoch": 0.4916802754115726, "grad_norm": 0.1611328125, "learning_rate": 0.00015604204157676176, "loss": 2.5054, "step": 5570 }, { "epoch": 0.4925630048108752, "grad_norm": 0.154296875, "learning_rate": 0.00015562265932612298, "loss": 2.5314, "step": 5580 }, { "epoch": 0.49344573421017784, "grad_norm": 0.1796875, "learning_rate": 0.000155203233056864, "loss": 2.5251, "step": 5590 }, { "epoch": 0.4943284636094805, "grad_norm": 0.1748046875, "learning_rate": 0.0001547837660525851, "loss": 2.5311, "step": 5600 }, { "epoch": 0.49521119300878313, "grad_norm": 0.1865234375, "learning_rate": 0.00015436426159720553, "loss": 2.5355, "step": 5610 }, { "epoch": 0.4960939224080858, "grad_norm": 0.197265625, "learning_rate": 0.00015394472297493753, "loss": 2.5291, "step": 5620 }, { "epoch": 0.4969766518073884, "grad_norm": 0.1806640625, "learning_rate": 0.00015352515347026103, "loss": 2.5253, "step": 5630 }, { "epoch": 0.4978593812066911, "grad_norm": 0.171875, "learning_rate": 0.00015310555636789767, "loss": 2.5222, "step": 5640 }, { "epoch": 0.4987421106059937, "grad_norm": 0.177734375, "learning_rate": 0.000152685934952785, "loss": 2.5461, "step": 5650 }, { "epoch": 0.4996248400052964, "grad_norm": 0.166015625, "learning_rate": 0.0001522662925100512, "loss": 2.5219, "step": 5660 }, { "epoch": 0.5005075694045991, "grad_norm": 0.1630859375, "learning_rate": 0.00015184663232498878, "loss": 2.5391, "step": 5670 }, { "epoch": 0.5013902988039016, "grad_norm": 0.1611328125, "learning_rate": 0.0001514269576830294, "loss": 2.5235, "step": 5680 }, { "epoch": 0.5022730282032043, "grad_norm": 0.1650390625, "learning_rate": 0.00015100727186971762, "loss": 2.5345, "step": 5690 }, { "epoch": 0.503155757602507, "grad_norm": 0.162109375, "learning_rate": 0.00015058757817068577, "loss": 2.5258, "step": 5700 }, { "epoch": 0.5040384870018096, "grad_norm": 0.166015625, "learning_rate": 0.00015016787987162767, "loss": 2.5259, "step": 5710 }, { "epoch": 0.5049212164011122, "grad_norm": 0.177734375, "learning_rate": 0.00014974818025827332, "loss": 2.522, "step": 5720 }, { "epoch": 0.5058039458004149, "grad_norm": 0.1611328125, "learning_rate": 0.00014932848261636287, "loss": 2.5316, "step": 5730 }, { "epoch": 0.5066866751997176, "grad_norm": 0.1689453125, "learning_rate": 0.0001489087902316211, "loss": 2.534, "step": 5740 }, { "epoch": 0.5075694045990202, "grad_norm": 0.16796875, "learning_rate": 0.0001484891063897317, "loss": 2.5264, "step": 5750 }, { "epoch": 0.5084521339983228, "grad_norm": 0.1689453125, "learning_rate": 0.00014806943437631134, "loss": 2.5312, "step": 5760 }, { "epoch": 0.5093348633976255, "grad_norm": 0.16015625, "learning_rate": 0.00014764977747688422, "loss": 2.5312, "step": 5770 }, { "epoch": 0.5102175927969281, "grad_norm": 0.1630859375, "learning_rate": 0.00014723013897685613, "loss": 2.5229, "step": 5780 }, { "epoch": 0.5111003221962307, "grad_norm": 0.15234375, "learning_rate": 0.00014681052216148886, "loss": 2.5323, "step": 5790 }, { "epoch": 0.5119830515955334, "grad_norm": 0.1630859375, "learning_rate": 0.00014639093031587432, "loss": 2.526, "step": 5800 }, { "epoch": 0.512865780994836, "grad_norm": 0.162109375, "learning_rate": 0.00014597136672490915, "loss": 2.5338, "step": 5810 }, { "epoch": 0.5137485103941387, "grad_norm": 0.2109375, "learning_rate": 0.0001455518346732687, "loss": 2.5427, "step": 5820 }, { "epoch": 0.5146312397934413, "grad_norm": 0.169921875, "learning_rate": 0.0001451323374453812, "loss": 2.5253, "step": 5830 }, { "epoch": 0.515513969192744, "grad_norm": 0.171875, "learning_rate": 0.00014471287832540264, "loss": 2.5202, "step": 5840 }, { "epoch": 0.5163966985920466, "grad_norm": 0.1689453125, "learning_rate": 0.00014429346059719033, "loss": 2.5364, "step": 5850 }, { "epoch": 0.5172794279913493, "grad_norm": 0.1630859375, "learning_rate": 0.00014387408754427776, "loss": 2.5163, "step": 5860 }, { "epoch": 0.5181621573906519, "grad_norm": 0.177734375, "learning_rate": 0.00014345476244984845, "loss": 2.5309, "step": 5870 }, { "epoch": 0.5190448867899545, "grad_norm": 0.169921875, "learning_rate": 0.00014303548859671069, "loss": 2.5254, "step": 5880 }, { "epoch": 0.5199276161892572, "grad_norm": 0.1572265625, "learning_rate": 0.00014261626926727146, "loss": 2.5119, "step": 5890 }, { "epoch": 0.5208103455885599, "grad_norm": 0.1748046875, "learning_rate": 0.00014219710774351094, "loss": 2.5193, "step": 5900 }, { "epoch": 0.5216930749878624, "grad_norm": 0.1630859375, "learning_rate": 0.00014177800730695678, "loss": 2.5287, "step": 5910 }, { "epoch": 0.5225758043871651, "grad_norm": 0.1865234375, "learning_rate": 0.00014135897123865833, "loss": 2.5246, "step": 5920 }, { "epoch": 0.5234585337864678, "grad_norm": 0.1884765625, "learning_rate": 0.0001409400028191611, "loss": 2.5242, "step": 5930 }, { "epoch": 0.5243412631857705, "grad_norm": 0.158203125, "learning_rate": 0.0001405211053284808, "loss": 2.5209, "step": 5940 }, { "epoch": 0.525223992585073, "grad_norm": 0.166015625, "learning_rate": 0.0001401022820460782, "loss": 2.5387, "step": 5950 }, { "epoch": 0.5261067219843757, "grad_norm": 0.171875, "learning_rate": 0.00013968353625083279, "loss": 2.5408, "step": 5960 }, { "epoch": 0.5269894513836784, "grad_norm": 0.1572265625, "learning_rate": 0.00013926487122101753, "loss": 2.5262, "step": 5970 }, { "epoch": 0.5278721807829809, "grad_norm": 0.173828125, "learning_rate": 0.00013884629023427314, "loss": 2.5284, "step": 5980 }, { "epoch": 0.5287549101822836, "grad_norm": 0.173828125, "learning_rate": 0.00013842779656758234, "loss": 2.5259, "step": 5990 }, { "epoch": 0.5296376395815863, "grad_norm": 0.1611328125, "learning_rate": 0.00013800939349724426, "loss": 2.5286, "step": 6000 }, { "epoch": 0.5296376395815863, "eval_accuracy": 0.5060543230354551, "eval_loss": 2.412339210510254, "eval_runtime": 6.9399, "eval_samples_per_second": 45.822, "eval_steps_per_second": 0.432, "step": 6000 }, { "epoch": 0.530520368980889, "grad_norm": 0.1767578125, "learning_rate": 0.00013759108429884867, "loss": 2.5166, "step": 6010 }, { "epoch": 0.5314030983801915, "grad_norm": 0.1591796875, "learning_rate": 0.0001371728722472506, "loss": 2.5259, "step": 6020 }, { "epoch": 0.5322858277794942, "grad_norm": 0.169921875, "learning_rate": 0.0001367547606165444, "loss": 2.5271, "step": 6030 }, { "epoch": 0.5331685571787969, "grad_norm": 0.1826171875, "learning_rate": 0.0001363367526800383, "loss": 2.5275, "step": 6040 }, { "epoch": 0.5340512865780995, "grad_norm": 0.1728515625, "learning_rate": 0.00013591885171022886, "loss": 2.5387, "step": 6050 }, { "epoch": 0.5349340159774021, "grad_norm": 0.1796875, "learning_rate": 0.00013550106097877496, "loss": 2.5132, "step": 6060 }, { "epoch": 0.5358167453767048, "grad_norm": 0.1591796875, "learning_rate": 0.0001350833837564726, "loss": 2.5429, "step": 6070 }, { "epoch": 0.5366994747760074, "grad_norm": 0.166015625, "learning_rate": 0.00013466582331322905, "loss": 2.5327, "step": 6080 }, { "epoch": 0.5375822041753101, "grad_norm": 0.1640625, "learning_rate": 0.0001342483829180376, "loss": 2.5268, "step": 6090 }, { "epoch": 0.5384649335746127, "grad_norm": 0.1650390625, "learning_rate": 0.00013383106583895137, "loss": 2.5381, "step": 6100 }, { "epoch": 0.5393476629739153, "grad_norm": 0.162109375, "learning_rate": 0.00013341387534305827, "loss": 2.5317, "step": 6110 }, { "epoch": 0.540230392373218, "grad_norm": 0.1630859375, "learning_rate": 0.00013299681469645513, "loss": 2.523, "step": 6120 }, { "epoch": 0.5411131217725207, "grad_norm": 0.154296875, "learning_rate": 0.0001325798871642223, "loss": 2.5234, "step": 6130 }, { "epoch": 0.5419958511718233, "grad_norm": 0.1865234375, "learning_rate": 0.0001321630960103979, "loss": 2.5245, "step": 6140 }, { "epoch": 0.5428785805711259, "grad_norm": 0.1669921875, "learning_rate": 0.00013174644449795244, "loss": 2.5232, "step": 6150 }, { "epoch": 0.5437613099704286, "grad_norm": 0.16015625, "learning_rate": 0.00013132993588876323, "loss": 2.5361, "step": 6160 }, { "epoch": 0.5446440393697312, "grad_norm": 0.15625, "learning_rate": 0.00013091357344358873, "loss": 2.5315, "step": 6170 }, { "epoch": 0.5455267687690338, "grad_norm": 0.1796875, "learning_rate": 0.00013049736042204318, "loss": 2.5251, "step": 6180 }, { "epoch": 0.5464094981683365, "grad_norm": 0.1630859375, "learning_rate": 0.00013008130008257098, "loss": 2.5242, "step": 6190 }, { "epoch": 0.5472922275676392, "grad_norm": 0.1865234375, "learning_rate": 0.00012966539568242132, "loss": 2.5325, "step": 6200 }, { "epoch": 0.5481749569669417, "grad_norm": 0.16015625, "learning_rate": 0.00012924965047762243, "loss": 2.517, "step": 6210 }, { "epoch": 0.5490576863662444, "grad_norm": 0.1640625, "learning_rate": 0.00012883406772295618, "loss": 2.5368, "step": 6220 }, { "epoch": 0.5499404157655471, "grad_norm": 0.166015625, "learning_rate": 0.000128418650671933, "loss": 2.5331, "step": 6230 }, { "epoch": 0.5508231451648498, "grad_norm": 0.166015625, "learning_rate": 0.0001280034025767656, "loss": 2.5209, "step": 6240 }, { "epoch": 0.5517058745641523, "grad_norm": 0.1552734375, "learning_rate": 0.00012758832668834438, "loss": 2.525, "step": 6250 }, { "epoch": 0.552588603963455, "grad_norm": 0.171875, "learning_rate": 0.0001271734262562112, "loss": 2.5347, "step": 6260 }, { "epoch": 0.5534713333627577, "grad_norm": 0.1669921875, "learning_rate": 0.00012675870452853464, "loss": 2.5221, "step": 6270 }, { "epoch": 0.5543540627620603, "grad_norm": 0.16796875, "learning_rate": 0.00012634416475208401, "loss": 2.5207, "step": 6280 }, { "epoch": 0.5552367921613629, "grad_norm": 0.15625, "learning_rate": 0.00012592981017220425, "loss": 2.521, "step": 6290 }, { "epoch": 0.5561195215606656, "grad_norm": 0.169921875, "learning_rate": 0.00012551564403279048, "loss": 2.5254, "step": 6300 }, { "epoch": 0.5570022509599682, "grad_norm": 0.1591796875, "learning_rate": 0.00012510166957626248, "loss": 2.5301, "step": 6310 }, { "epoch": 0.5578849803592708, "grad_norm": 0.16015625, "learning_rate": 0.00012468789004353942, "loss": 2.529, "step": 6320 }, { "epoch": 0.5587677097585735, "grad_norm": 0.2001953125, "learning_rate": 0.00012427430867401434, "loss": 2.5251, "step": 6330 }, { "epoch": 0.5596504391578762, "grad_norm": 0.1533203125, "learning_rate": 0.00012386092870552917, "loss": 2.5239, "step": 6340 }, { "epoch": 0.5605331685571788, "grad_norm": 0.1572265625, "learning_rate": 0.00012344775337434896, "loss": 2.5275, "step": 6350 }, { "epoch": 0.5614158979564814, "grad_norm": 0.1552734375, "learning_rate": 0.0001230347859151365, "loss": 2.528, "step": 6360 }, { "epoch": 0.5622986273557841, "grad_norm": 0.1591796875, "learning_rate": 0.00012262202956092754, "loss": 2.5306, "step": 6370 }, { "epoch": 0.5631813567550867, "grad_norm": 0.1689453125, "learning_rate": 0.00012220948754310492, "loss": 2.528, "step": 6380 }, { "epoch": 0.5640640861543894, "grad_norm": 0.1669921875, "learning_rate": 0.0001217971630913736, "loss": 2.5143, "step": 6390 }, { "epoch": 0.564946815553692, "grad_norm": 0.173828125, "learning_rate": 0.0001213850594337351, "loss": 2.517, "step": 6400 }, { "epoch": 0.5658295449529946, "grad_norm": 0.1572265625, "learning_rate": 0.0001209731797964626, "loss": 2.5336, "step": 6410 }, { "epoch": 0.5667122743522973, "grad_norm": 0.1689453125, "learning_rate": 0.00012056152740407529, "loss": 2.5209, "step": 6420 }, { "epoch": 0.5675950037516, "grad_norm": 0.1494140625, "learning_rate": 0.00012015010547931336, "loss": 2.5375, "step": 6430 }, { "epoch": 0.5684777331509026, "grad_norm": 0.185546875, "learning_rate": 0.00011973891724311284, "loss": 2.5259, "step": 6440 }, { "epoch": 0.5693604625502052, "grad_norm": 0.158203125, "learning_rate": 0.00011932796591458007, "loss": 2.5348, "step": 6450 }, { "epoch": 0.5702431919495079, "grad_norm": 0.169921875, "learning_rate": 0.00011891725471096684, "loss": 2.5283, "step": 6460 }, { "epoch": 0.5711259213488106, "grad_norm": 0.16015625, "learning_rate": 0.00011850678684764488, "loss": 2.5219, "step": 6470 }, { "epoch": 0.5720086507481131, "grad_norm": 0.1650390625, "learning_rate": 0.00011809656553808112, "loss": 2.5234, "step": 6480 }, { "epoch": 0.5728913801474158, "grad_norm": 0.166015625, "learning_rate": 0.00011768659399381203, "loss": 2.5219, "step": 6490 }, { "epoch": 0.5737741095467185, "grad_norm": 0.158203125, "learning_rate": 0.00011727687542441882, "loss": 2.5239, "step": 6500 }, { "epoch": 0.574656838946021, "grad_norm": 0.1630859375, "learning_rate": 0.00011686741303750225, "loss": 2.5224, "step": 6510 }, { "epoch": 0.5755395683453237, "grad_norm": 0.1650390625, "learning_rate": 0.00011645821003865741, "loss": 2.5401, "step": 6520 }, { "epoch": 0.5764222977446264, "grad_norm": 0.166015625, "learning_rate": 0.00011604926963144873, "loss": 2.5256, "step": 6530 }, { "epoch": 0.5773050271439291, "grad_norm": 0.1611328125, "learning_rate": 0.00011564059501738481, "loss": 2.5409, "step": 6540 }, { "epoch": 0.5781877565432316, "grad_norm": 0.166015625, "learning_rate": 0.00011523218939589354, "loss": 2.5139, "step": 6550 }, { "epoch": 0.5790704859425343, "grad_norm": 0.1650390625, "learning_rate": 0.00011482405596429679, "loss": 2.5274, "step": 6560 }, { "epoch": 0.579953215341837, "grad_norm": 0.1953125, "learning_rate": 0.00011441619791778552, "loss": 2.5268, "step": 6570 }, { "epoch": 0.5808359447411396, "grad_norm": 0.197265625, "learning_rate": 0.00011400861844939495, "loss": 2.5222, "step": 6580 }, { "epoch": 0.5817186741404422, "grad_norm": 0.1630859375, "learning_rate": 0.00011360132074997912, "loss": 2.5227, "step": 6590 }, { "epoch": 0.5826014035397449, "grad_norm": 0.1533203125, "learning_rate": 0.00011319430800818624, "loss": 2.5156, "step": 6600 }, { "epoch": 0.5834841329390476, "grad_norm": 0.1533203125, "learning_rate": 0.00011278758341043367, "loss": 2.5242, "step": 6610 }, { "epoch": 0.5843668623383502, "grad_norm": 0.1748046875, "learning_rate": 0.00011238115014088304, "loss": 2.5203, "step": 6620 }, { "epoch": 0.5852495917376528, "grad_norm": 0.2119140625, "learning_rate": 0.0001119750113814151, "loss": 2.5074, "step": 6630 }, { "epoch": 0.5861323211369555, "grad_norm": 0.16015625, "learning_rate": 0.00011156917031160491, "loss": 2.5132, "step": 6640 }, { "epoch": 0.5870150505362581, "grad_norm": 0.1748046875, "learning_rate": 0.00011116363010869717, "loss": 2.5244, "step": 6650 }, { "epoch": 0.5878977799355608, "grad_norm": 0.16015625, "learning_rate": 0.00011075839394758099, "loss": 2.5282, "step": 6660 }, { "epoch": 0.5887805093348634, "grad_norm": 0.15234375, "learning_rate": 0.00011035346500076523, "loss": 2.5416, "step": 6670 }, { "epoch": 0.589663238734166, "grad_norm": 0.162109375, "learning_rate": 0.00010994884643835366, "loss": 2.5205, "step": 6680 }, { "epoch": 0.5905459681334687, "grad_norm": 0.1982421875, "learning_rate": 0.0001095445414280202, "loss": 2.5347, "step": 6690 }, { "epoch": 0.5914286975327713, "grad_norm": 0.1669921875, "learning_rate": 0.00010914055313498382, "loss": 2.5517, "step": 6700 }, { "epoch": 0.592311426932074, "grad_norm": 0.1552734375, "learning_rate": 0.00010873688472198412, "loss": 2.5164, "step": 6710 }, { "epoch": 0.5931941563313766, "grad_norm": 0.150390625, "learning_rate": 0.00010833353934925652, "loss": 2.5253, "step": 6720 }, { "epoch": 0.5940768857306793, "grad_norm": 0.173828125, "learning_rate": 0.0001079305201745073, "loss": 2.5173, "step": 6730 }, { "epoch": 0.5949596151299819, "grad_norm": 0.1689453125, "learning_rate": 0.0001075278303528889, "loss": 2.5278, "step": 6740 }, { "epoch": 0.5958423445292845, "grad_norm": 0.1591796875, "learning_rate": 0.00010712547303697548, "loss": 2.5174, "step": 6750 }, { "epoch": 0.5967250739285872, "grad_norm": 0.154296875, "learning_rate": 0.00010672345137673814, "loss": 2.5309, "step": 6760 }, { "epoch": 0.5976078033278899, "grad_norm": 0.1591796875, "learning_rate": 0.00010632176851952005, "loss": 2.5312, "step": 6770 }, { "epoch": 0.5984905327271924, "grad_norm": 0.173828125, "learning_rate": 0.00010592042761001198, "loss": 2.5161, "step": 6780 }, { "epoch": 0.5993732621264951, "grad_norm": 0.15625, "learning_rate": 0.00010551943179022779, "loss": 2.5301, "step": 6790 }, { "epoch": 0.6002559915257978, "grad_norm": 0.17578125, "learning_rate": 0.00010511878419947958, "loss": 2.5188, "step": 6800 }, { "epoch": 0.6011387209251005, "grad_norm": 0.177734375, "learning_rate": 0.00010471848797435328, "loss": 2.5187, "step": 6810 }, { "epoch": 0.602021450324403, "grad_norm": 0.16015625, "learning_rate": 0.000104318546248684, "loss": 2.515, "step": 6820 }, { "epoch": 0.6029041797237057, "grad_norm": 0.162109375, "learning_rate": 0.00010391896215353167, "loss": 2.4969, "step": 6830 }, { "epoch": 0.6037869091230084, "grad_norm": 0.1611328125, "learning_rate": 0.00010351973881715632, "loss": 2.5262, "step": 6840 }, { "epoch": 0.604669638522311, "grad_norm": 0.1630859375, "learning_rate": 0.00010312087936499361, "loss": 2.524, "step": 6850 }, { "epoch": 0.6055523679216136, "grad_norm": 0.1513671875, "learning_rate": 0.00010272238691963064, "loss": 2.5197, "step": 6860 }, { "epoch": 0.6064350973209163, "grad_norm": 0.1650390625, "learning_rate": 0.00010232426460078106, "loss": 2.5194, "step": 6870 }, { "epoch": 0.607317826720219, "grad_norm": 0.1767578125, "learning_rate": 0.00010192651552526104, "loss": 2.5243, "step": 6880 }, { "epoch": 0.6082005561195215, "grad_norm": 0.154296875, "learning_rate": 0.00010152914280696453, "loss": 2.5284, "step": 6890 }, { "epoch": 0.6090832855188242, "grad_norm": 0.2041015625, "learning_rate": 0.00010113214955683929, "loss": 2.5218, "step": 6900 }, { "epoch": 0.6099660149181269, "grad_norm": 0.1591796875, "learning_rate": 0.00010073553888286212, "loss": 2.5138, "step": 6910 }, { "epoch": 0.6108487443174295, "grad_norm": 0.17578125, "learning_rate": 0.00010033931389001476, "loss": 2.5151, "step": 6920 }, { "epoch": 0.6117314737167321, "grad_norm": 0.1474609375, "learning_rate": 9.99434776802596e-05, "loss": 2.5328, "step": 6930 }, { "epoch": 0.6126142031160348, "grad_norm": 0.1630859375, "learning_rate": 9.954803335251524e-05, "loss": 2.527, "step": 6940 }, { "epoch": 0.6134969325153374, "grad_norm": 0.1513671875, "learning_rate": 9.915298400263235e-05, "loss": 2.5191, "step": 6950 }, { "epoch": 0.6143796619146401, "grad_norm": 0.1728515625, "learning_rate": 9.87583327233694e-05, "loss": 2.5061, "step": 6960 }, { "epoch": 0.6152623913139427, "grad_norm": 0.15234375, "learning_rate": 9.836408260436849e-05, "loss": 2.5363, "step": 6970 }, { "epoch": 0.6161451207132453, "grad_norm": 0.154296875, "learning_rate": 9.797023673213106e-05, "loss": 2.5118, "step": 6980 }, { "epoch": 0.617027850112548, "grad_norm": 0.169921875, "learning_rate": 9.757679818999374e-05, "loss": 2.5118, "step": 6990 }, { "epoch": 0.6179105795118507, "grad_norm": 0.16796875, "learning_rate": 9.718377005810445e-05, "loss": 2.516, "step": 7000 }, { "epoch": 0.6187933089111533, "grad_norm": 0.15234375, "learning_rate": 9.679115541339793e-05, "loss": 2.5148, "step": 7010 }, { "epoch": 0.6196760383104559, "grad_norm": 0.15234375, "learning_rate": 9.639895732957188e-05, "loss": 2.5346, "step": 7020 }, { "epoch": 0.6205587677097586, "grad_norm": 0.158203125, "learning_rate": 9.600717887706269e-05, "loss": 2.5338, "step": 7030 }, { "epoch": 0.6214414971090613, "grad_norm": 0.162109375, "learning_rate": 9.561582312302181e-05, "loss": 2.5287, "step": 7040 }, { "epoch": 0.6223242265083638, "grad_norm": 0.1630859375, "learning_rate": 9.522489313129128e-05, "loss": 2.5239, "step": 7050 }, { "epoch": 0.6232069559076665, "grad_norm": 0.150390625, "learning_rate": 9.483439196237993e-05, "loss": 2.5216, "step": 7060 }, { "epoch": 0.6240896853069692, "grad_norm": 0.1630859375, "learning_rate": 9.444432267343956e-05, "loss": 2.5228, "step": 7070 }, { "epoch": 0.6249724147062717, "grad_norm": 0.171875, "learning_rate": 9.405468831824076e-05, "loss": 2.5214, "step": 7080 }, { "epoch": 0.6258551441055744, "grad_norm": 0.1669921875, "learning_rate": 9.366549194714915e-05, "loss": 2.5312, "step": 7090 }, { "epoch": 0.6267378735048771, "grad_norm": 0.1591796875, "learning_rate": 9.327673660710138e-05, "loss": 2.5276, "step": 7100 }, { "epoch": 0.6276206029041798, "grad_norm": 0.1572265625, "learning_rate": 9.288842534158163e-05, "loss": 2.5329, "step": 7110 }, { "epoch": 0.6285033323034823, "grad_norm": 0.1572265625, "learning_rate": 9.250056119059715e-05, "loss": 2.5233, "step": 7120 }, { "epoch": 0.629386061702785, "grad_norm": 0.1552734375, "learning_rate": 9.211314719065503e-05, "loss": 2.5109, "step": 7130 }, { "epoch": 0.6302687911020877, "grad_norm": 0.150390625, "learning_rate": 9.172618637473827e-05, "loss": 2.5195, "step": 7140 }, { "epoch": 0.6311515205013903, "grad_norm": 0.16015625, "learning_rate": 9.133968177228186e-05, "loss": 2.5285, "step": 7150 }, { "epoch": 0.6320342499006929, "grad_norm": 0.158203125, "learning_rate": 9.095363640914925e-05, "loss": 2.5302, "step": 7160 }, { "epoch": 0.6329169792999956, "grad_norm": 0.1826171875, "learning_rate": 9.056805330760856e-05, "loss": 2.5262, "step": 7170 }, { "epoch": 0.6337997086992982, "grad_norm": 0.1630859375, "learning_rate": 9.018293548630903e-05, "loss": 2.5155, "step": 7180 }, { "epoch": 0.6346824380986009, "grad_norm": 0.173828125, "learning_rate": 8.979828596025729e-05, "loss": 2.5258, "step": 7190 }, { "epoch": 0.6355651674979035, "grad_norm": 0.1767578125, "learning_rate": 8.941410774079368e-05, "loss": 2.5254, "step": 7200 }, { "epoch": 0.6364478968972062, "grad_norm": 0.1640625, "learning_rate": 8.903040383556901e-05, "loss": 2.5369, "step": 7210 }, { "epoch": 0.6373306262965088, "grad_norm": 0.1572265625, "learning_rate": 8.86471772485206e-05, "loss": 2.5286, "step": 7220 }, { "epoch": 0.6382133556958115, "grad_norm": 0.154296875, "learning_rate": 8.826443097984894e-05, "loss": 2.5026, "step": 7230 }, { "epoch": 0.6390960850951141, "grad_norm": 0.166015625, "learning_rate": 8.788216802599426e-05, "loss": 2.5062, "step": 7240 }, { "epoch": 0.6399788144944167, "grad_norm": 0.162109375, "learning_rate": 8.750039137961314e-05, "loss": 2.5259, "step": 7250 }, { "epoch": 0.6408615438937194, "grad_norm": 0.15625, "learning_rate": 8.711910402955479e-05, "loss": 2.5327, "step": 7260 }, { "epoch": 0.641744273293022, "grad_norm": 0.1708984375, "learning_rate": 8.67383089608378e-05, "loss": 2.5383, "step": 7270 }, { "epoch": 0.6426270026923246, "grad_norm": 0.1572265625, "learning_rate": 8.63580091546269e-05, "loss": 2.5241, "step": 7280 }, { "epoch": 0.6435097320916273, "grad_norm": 0.16015625, "learning_rate": 8.597820758820957e-05, "loss": 2.5233, "step": 7290 }, { "epoch": 0.64439246149093, "grad_norm": 0.1611328125, "learning_rate": 8.559890723497248e-05, "loss": 2.5193, "step": 7300 }, { "epoch": 0.6452751908902326, "grad_norm": 0.15234375, "learning_rate": 8.522011106437853e-05, "loss": 2.5106, "step": 7310 }, { "epoch": 0.6461579202895352, "grad_norm": 0.1630859375, "learning_rate": 8.484182204194349e-05, "loss": 2.5181, "step": 7320 }, { "epoch": 0.6470406496888379, "grad_norm": 0.18359375, "learning_rate": 8.446404312921283e-05, "loss": 2.529, "step": 7330 }, { "epoch": 0.6479233790881406, "grad_norm": 0.1708984375, "learning_rate": 8.408677728373824e-05, "loss": 2.5141, "step": 7340 }, { "epoch": 0.6488061084874431, "grad_norm": 0.150390625, "learning_rate": 8.371002745905492e-05, "loss": 2.5188, "step": 7350 }, { "epoch": 0.6496888378867458, "grad_norm": 0.1513671875, "learning_rate": 8.333379660465829e-05, "loss": 2.5245, "step": 7360 }, { "epoch": 0.6505715672860485, "grad_norm": 0.169921875, "learning_rate": 8.295808766598068e-05, "loss": 2.5033, "step": 7370 }, { "epoch": 0.6514542966853512, "grad_norm": 0.1552734375, "learning_rate": 8.258290358436853e-05, "loss": 2.5333, "step": 7380 }, { "epoch": 0.6523370260846537, "grad_norm": 0.171875, "learning_rate": 8.220824729705939e-05, "loss": 2.5281, "step": 7390 }, { "epoch": 0.6532197554839564, "grad_norm": 0.1669921875, "learning_rate": 8.183412173715862e-05, "loss": 2.5363, "step": 7400 }, { "epoch": 0.6541024848832591, "grad_norm": 0.1474609375, "learning_rate": 8.146052983361675e-05, "loss": 2.5311, "step": 7410 }, { "epoch": 0.6549852142825617, "grad_norm": 0.1533203125, "learning_rate": 8.108747451120652e-05, "loss": 2.5191, "step": 7420 }, { "epoch": 0.6558679436818643, "grad_norm": 0.1611328125, "learning_rate": 8.071495869049968e-05, "loss": 2.5223, "step": 7430 }, { "epoch": 0.656750673081167, "grad_norm": 0.1630859375, "learning_rate": 8.034298528784446e-05, "loss": 2.5225, "step": 7440 }, { "epoch": 0.6576334024804696, "grad_norm": 0.150390625, "learning_rate": 7.997155721534247e-05, "loss": 2.5225, "step": 7450 }, { "epoch": 0.6585161318797722, "grad_norm": 0.158203125, "learning_rate": 7.960067738082638e-05, "loss": 2.5186, "step": 7460 }, { "epoch": 0.6593988612790749, "grad_norm": 0.16015625, "learning_rate": 7.923034868783643e-05, "loss": 2.526, "step": 7470 }, { "epoch": 0.6602815906783776, "grad_norm": 0.166015625, "learning_rate": 7.886057403559831e-05, "loss": 2.5233, "step": 7480 }, { "epoch": 0.6611643200776802, "grad_norm": 0.1552734375, "learning_rate": 7.849135631900026e-05, "loss": 2.5082, "step": 7490 }, { "epoch": 0.6620470494769828, "grad_norm": 0.169921875, "learning_rate": 7.812269842857017e-05, "loss": 2.5237, "step": 7500 }, { "epoch": 0.6629297788762855, "grad_norm": 0.166015625, "learning_rate": 7.775460325045347e-05, "loss": 2.5293, "step": 7510 }, { "epoch": 0.6638125082755881, "grad_norm": 0.173828125, "learning_rate": 7.738707366638993e-05, "loss": 2.5267, "step": 7520 }, { "epoch": 0.6646952376748908, "grad_norm": 0.171875, "learning_rate": 7.702011255369163e-05, "loss": 2.5379, "step": 7530 }, { "epoch": 0.6655779670741934, "grad_norm": 0.150390625, "learning_rate": 7.665372278522018e-05, "loss": 2.5214, "step": 7540 }, { "epoch": 0.666460696473496, "grad_norm": 0.1494140625, "learning_rate": 7.628790722936404e-05, "loss": 2.5139, "step": 7550 }, { "epoch": 0.6673434258727987, "grad_norm": 0.1552734375, "learning_rate": 7.592266875001666e-05, "loss": 2.5222, "step": 7560 }, { "epoch": 0.6682261552721014, "grad_norm": 0.1494140625, "learning_rate": 7.555801020655339e-05, "loss": 2.5214, "step": 7570 }, { "epoch": 0.669108884671404, "grad_norm": 0.15234375, "learning_rate": 7.519393445380932e-05, "loss": 2.5177, "step": 7580 }, { "epoch": 0.6699916140707066, "grad_norm": 0.173828125, "learning_rate": 7.483044434205726e-05, "loss": 2.5251, "step": 7590 }, { "epoch": 0.6708743434700093, "grad_norm": 0.1728515625, "learning_rate": 7.4467542716985e-05, "loss": 2.5217, "step": 7600 }, { "epoch": 0.671757072869312, "grad_norm": 0.1640625, "learning_rate": 7.410523241967328e-05, "loss": 2.5229, "step": 7610 }, { "epoch": 0.6726398022686145, "grad_norm": 0.1435546875, "learning_rate": 7.374351628657328e-05, "loss": 2.5209, "step": 7620 }, { "epoch": 0.6735225316679172, "grad_norm": 0.146484375, "learning_rate": 7.33823971494848e-05, "loss": 2.5122, "step": 7630 }, { "epoch": 0.6744052610672199, "grad_norm": 0.15234375, "learning_rate": 7.302187783553383e-05, "loss": 2.5096, "step": 7640 }, { "epoch": 0.6752879904665224, "grad_norm": 0.146484375, "learning_rate": 7.266196116715033e-05, "loss": 2.5156, "step": 7650 }, { "epoch": 0.6761707198658251, "grad_norm": 0.16015625, "learning_rate": 7.230264996204644e-05, "loss": 2.5213, "step": 7660 }, { "epoch": 0.6770534492651278, "grad_norm": 0.1630859375, "learning_rate": 7.194394703319423e-05, "loss": 2.5149, "step": 7670 }, { "epoch": 0.6779361786644305, "grad_norm": 0.1533203125, "learning_rate": 7.158585518880362e-05, "loss": 2.5178, "step": 7680 }, { "epoch": 0.678818908063733, "grad_norm": 0.15234375, "learning_rate": 7.122837723230051e-05, "loss": 2.5318, "step": 7690 }, { "epoch": 0.6797016374630357, "grad_norm": 0.158203125, "learning_rate": 7.087151596230486e-05, "loss": 2.5249, "step": 7700 }, { "epoch": 0.6805843668623384, "grad_norm": 0.1552734375, "learning_rate": 7.051527417260873e-05, "loss": 2.5316, "step": 7710 }, { "epoch": 0.681467096261641, "grad_norm": 0.173828125, "learning_rate": 7.01596546521543e-05, "loss": 2.5396, "step": 7720 }, { "epoch": 0.6823498256609436, "grad_norm": 0.154296875, "learning_rate": 6.980466018501203e-05, "loss": 2.5148, "step": 7730 }, { "epoch": 0.6832325550602463, "grad_norm": 0.1728515625, "learning_rate": 6.945029355035939e-05, "loss": 2.5173, "step": 7740 }, { "epoch": 0.684115284459549, "grad_norm": 0.158203125, "learning_rate": 6.909655752245823e-05, "loss": 2.5268, "step": 7750 }, { "epoch": 0.6849980138588516, "grad_norm": 0.1484375, "learning_rate": 6.874345487063377e-05, "loss": 2.5174, "step": 7760 }, { "epoch": 0.6858807432581542, "grad_norm": 0.1513671875, "learning_rate": 6.83909883592526e-05, "loss": 2.5214, "step": 7770 }, { "epoch": 0.6867634726574569, "grad_norm": 0.16796875, "learning_rate": 6.803916074770099e-05, "loss": 2.5022, "step": 7780 }, { "epoch": 0.6876462020567595, "grad_norm": 0.150390625, "learning_rate": 6.768797479036363e-05, "loss": 2.5183, "step": 7790 }, { "epoch": 0.6885289314560622, "grad_norm": 0.1552734375, "learning_rate": 6.733743323660155e-05, "loss": 2.5142, "step": 7800 }, { "epoch": 0.6894116608553648, "grad_norm": 0.1494140625, "learning_rate": 6.69875388307311e-05, "loss": 2.5147, "step": 7810 }, { "epoch": 0.6902943902546674, "grad_norm": 0.1513671875, "learning_rate": 6.663829431200224e-05, "loss": 2.5039, "step": 7820 }, { "epoch": 0.6911771196539701, "grad_norm": 0.1884765625, "learning_rate": 6.628970241457684e-05, "loss": 2.5328, "step": 7830 }, { "epoch": 0.6920598490532727, "grad_norm": 0.1494140625, "learning_rate": 6.594176586750796e-05, "loss": 2.5248, "step": 7840 }, { "epoch": 0.6929425784525753, "grad_norm": 0.1484375, "learning_rate": 6.559448739471771e-05, "loss": 2.5236, "step": 7850 }, { "epoch": 0.693825307851878, "grad_norm": 0.1552734375, "learning_rate": 6.524786971497635e-05, "loss": 2.5302, "step": 7860 }, { "epoch": 0.6947080372511807, "grad_norm": 0.1455078125, "learning_rate": 6.490191554188102e-05, "loss": 2.5207, "step": 7870 }, { "epoch": 0.6955907666504832, "grad_norm": 0.1630859375, "learning_rate": 6.455662758383433e-05, "loss": 2.5177, "step": 7880 }, { "epoch": 0.6964734960497859, "grad_norm": 0.16015625, "learning_rate": 6.421200854402337e-05, "loss": 2.5125, "step": 7890 }, { "epoch": 0.6973562254490886, "grad_norm": 0.1513671875, "learning_rate": 6.386806112039812e-05, "loss": 2.5066, "step": 7900 }, { "epoch": 0.6982389548483913, "grad_norm": 0.1611328125, "learning_rate": 6.35247880056509e-05, "loss": 2.5335, "step": 7910 }, { "epoch": 0.6991216842476938, "grad_norm": 0.154296875, "learning_rate": 6.318219188719493e-05, "loss": 2.5365, "step": 7920 }, { "epoch": 0.7000044136469965, "grad_norm": 0.169921875, "learning_rate": 6.284027544714325e-05, "loss": 2.5272, "step": 7930 }, { "epoch": 0.7008871430462992, "grad_norm": 0.1552734375, "learning_rate": 6.249904136228796e-05, "loss": 2.5205, "step": 7940 }, { "epoch": 0.7017698724456018, "grad_norm": 0.1591796875, "learning_rate": 6.21584923040792e-05, "loss": 2.5148, "step": 7950 }, { "epoch": 0.7026526018449044, "grad_norm": 0.16015625, "learning_rate": 6.181863093860399e-05, "loss": 2.5323, "step": 7960 }, { "epoch": 0.7035353312442071, "grad_norm": 0.154296875, "learning_rate": 6.147945992656566e-05, "loss": 2.5282, "step": 7970 }, { "epoch": 0.7044180606435098, "grad_norm": 0.15234375, "learning_rate": 6.114098192326296e-05, "loss": 2.5166, "step": 7980 }, { "epoch": 0.7053007900428124, "grad_norm": 0.154296875, "learning_rate": 6.0803199578569166e-05, "loss": 2.5225, "step": 7990 }, { "epoch": 0.706183519442115, "grad_norm": 0.15234375, "learning_rate": 6.046611553691136e-05, "loss": 2.53, "step": 8000 }, { "epoch": 0.706183519442115, "eval_accuracy": 0.506807657751054, "eval_loss": 2.4078192710876465, "eval_runtime": 6.9779, "eval_samples_per_second": 45.572, "eval_steps_per_second": 0.43, "step": 8000 }, { "epoch": 0.7070662488414177, "grad_norm": 0.17578125, "learning_rate": 6.012973243724968e-05, "loss": 2.5274, "step": 8010 }, { "epoch": 0.7079489782407203, "grad_norm": 0.1630859375, "learning_rate": 5.979405291305702e-05, "loss": 2.5217, "step": 8020 }, { "epoch": 0.7088317076400229, "grad_norm": 0.15625, "learning_rate": 5.94590795922978e-05, "loss": 2.5194, "step": 8030 }, { "epoch": 0.7097144370393256, "grad_norm": 0.1474609375, "learning_rate": 5.912481509740792e-05, "loss": 2.5232, "step": 8040 }, { "epoch": 0.7105971664386282, "grad_norm": 0.1572265625, "learning_rate": 5.8791262045274036e-05, "loss": 2.5207, "step": 8050 }, { "epoch": 0.7114798958379309, "grad_norm": 0.150390625, "learning_rate": 5.84584230472129e-05, "loss": 2.5254, "step": 8060 }, { "epoch": 0.7123626252372335, "grad_norm": 0.1533203125, "learning_rate": 5.81263007089513e-05, "loss": 2.5259, "step": 8070 }, { "epoch": 0.7132453546365362, "grad_norm": 0.1650390625, "learning_rate": 5.7794897630605183e-05, "loss": 2.5175, "step": 8080 }, { "epoch": 0.7141280840358388, "grad_norm": 0.1640625, "learning_rate": 5.7464216406659916e-05, "loss": 2.524, "step": 8090 }, { "epoch": 0.7150108134351415, "grad_norm": 0.1484375, "learning_rate": 5.713425962594933e-05, "loss": 2.5276, "step": 8100 }, { "epoch": 0.7158935428344441, "grad_norm": 0.1611328125, "learning_rate": 5.680502987163573e-05, "loss": 2.5266, "step": 8110 }, { "epoch": 0.7167762722337467, "grad_norm": 0.1474609375, "learning_rate": 5.6476529721189974e-05, "loss": 2.5187, "step": 8120 }, { "epoch": 0.7176590016330494, "grad_norm": 0.1455078125, "learning_rate": 5.614876174637069e-05, "loss": 2.5131, "step": 8130 }, { "epoch": 0.7185417310323521, "grad_norm": 0.15234375, "learning_rate": 5.5821728513204663e-05, "loss": 2.5218, "step": 8140 }, { "epoch": 0.7194244604316546, "grad_norm": 0.1572265625, "learning_rate": 5.549543258196636e-05, "loss": 2.5284, "step": 8150 }, { "epoch": 0.7203071898309573, "grad_norm": 0.1474609375, "learning_rate": 5.516987650715823e-05, "loss": 2.5099, "step": 8160 }, { "epoch": 0.72118991923026, "grad_norm": 0.1494140625, "learning_rate": 5.484506283749051e-05, "loss": 2.5111, "step": 8170 }, { "epoch": 0.7220726486295626, "grad_norm": 0.1474609375, "learning_rate": 5.45209941158612e-05, "loss": 2.5219, "step": 8180 }, { "epoch": 0.7229553780288652, "grad_norm": 0.1484375, "learning_rate": 5.419767287933633e-05, "loss": 2.5207, "step": 8190 }, { "epoch": 0.7238381074281679, "grad_norm": 0.146484375, "learning_rate": 5.387510165913014e-05, "loss": 2.5318, "step": 8200 }, { "epoch": 0.7247208368274706, "grad_norm": 0.1455078125, "learning_rate": 5.355328298058486e-05, "loss": 2.5164, "step": 8210 }, { "epoch": 0.7256035662267731, "grad_norm": 0.15625, "learning_rate": 5.323221936315152e-05, "loss": 2.5063, "step": 8220 }, { "epoch": 0.7264862956260758, "grad_norm": 0.1552734375, "learning_rate": 5.291191332036982e-05, "loss": 2.5058, "step": 8230 }, { "epoch": 0.7273690250253785, "grad_norm": 0.1513671875, "learning_rate": 5.259236735984849e-05, "loss": 2.5232, "step": 8240 }, { "epoch": 0.7282517544246812, "grad_norm": 0.1474609375, "learning_rate": 5.2273583983245834e-05, "loss": 2.5293, "step": 8250 }, { "epoch": 0.7291344838239837, "grad_norm": 0.1513671875, "learning_rate": 5.195556568625006e-05, "loss": 2.5306, "step": 8260 }, { "epoch": 0.7300172132232864, "grad_norm": 0.1494140625, "learning_rate": 5.1638314958559724e-05, "loss": 2.5239, "step": 8270 }, { "epoch": 0.7308999426225891, "grad_norm": 0.1552734375, "learning_rate": 5.1321834283864105e-05, "loss": 2.5286, "step": 8280 }, { "epoch": 0.7317826720218917, "grad_norm": 0.1484375, "learning_rate": 5.100612613982405e-05, "loss": 2.5295, "step": 8290 }, { "epoch": 0.7326654014211943, "grad_norm": 0.1572265625, "learning_rate": 5.0691192998052455e-05, "loss": 2.5086, "step": 8300 }, { "epoch": 0.733548130820497, "grad_norm": 0.1416015625, "learning_rate": 5.037703732409469e-05, "loss": 2.5208, "step": 8310 }, { "epoch": 0.7344308602197996, "grad_norm": 0.1435546875, "learning_rate": 5.0063661577409654e-05, "loss": 2.5269, "step": 8320 }, { "epoch": 0.7353135896191023, "grad_norm": 0.158203125, "learning_rate": 4.9751068211350385e-05, "loss": 2.5154, "step": 8330 }, { "epoch": 0.7361963190184049, "grad_norm": 0.142578125, "learning_rate": 4.943925967314466e-05, "loss": 2.5165, "step": 8340 }, { "epoch": 0.7370790484177075, "grad_norm": 0.1474609375, "learning_rate": 4.912823840387619e-05, "loss": 2.5284, "step": 8350 }, { "epoch": 0.7379617778170102, "grad_norm": 0.146484375, "learning_rate": 4.881800683846511e-05, "loss": 2.5272, "step": 8360 }, { "epoch": 0.7388445072163128, "grad_norm": 0.1513671875, "learning_rate": 4.850856740564943e-05, "loss": 2.505, "step": 8370 }, { "epoch": 0.7397272366156155, "grad_norm": 0.1640625, "learning_rate": 4.819992252796546e-05, "loss": 2.5149, "step": 8380 }, { "epoch": 0.7406099660149181, "grad_norm": 0.154296875, "learning_rate": 4.789207462172909e-05, "loss": 2.5042, "step": 8390 }, { "epoch": 0.7414926954142208, "grad_norm": 0.1591796875, "learning_rate": 4.7585026097017165e-05, "loss": 2.5235, "step": 8400 }, { "epoch": 0.7423754248135234, "grad_norm": 0.15234375, "learning_rate": 4.7278779357648056e-05, "loss": 2.5246, "step": 8410 }, { "epoch": 0.743258154212826, "grad_norm": 0.1513671875, "learning_rate": 4.69733368011633e-05, "loss": 2.5223, "step": 8420 }, { "epoch": 0.7441408836121287, "grad_norm": 0.150390625, "learning_rate": 4.666870081880849e-05, "loss": 2.5305, "step": 8430 }, { "epoch": 0.7450236130114314, "grad_norm": 0.1435546875, "learning_rate": 4.636487379551486e-05, "loss": 2.5245, "step": 8440 }, { "epoch": 0.745906342410734, "grad_norm": 0.1572265625, "learning_rate": 4.6061858109880494e-05, "loss": 2.5166, "step": 8450 }, { "epoch": 0.7467890718100366, "grad_norm": 0.1640625, "learning_rate": 4.575965613415152e-05, "loss": 2.5201, "step": 8460 }, { "epoch": 0.7476718012093393, "grad_norm": 0.146484375, "learning_rate": 4.5458270234203833e-05, "loss": 2.5167, "step": 8470 }, { "epoch": 0.748554530608642, "grad_norm": 0.146484375, "learning_rate": 4.515770276952449e-05, "loss": 2.5169, "step": 8480 }, { "epoch": 0.7494372600079445, "grad_norm": 0.1474609375, "learning_rate": 4.4857956093192966e-05, "loss": 2.5185, "step": 8490 }, { "epoch": 0.7503199894072472, "grad_norm": 0.150390625, "learning_rate": 4.455903255186318e-05, "loss": 2.5327, "step": 8500 }, { "epoch": 0.7512027188065499, "grad_norm": 0.150390625, "learning_rate": 4.426093448574482e-05, "loss": 2.524, "step": 8510 }, { "epoch": 0.7520854482058525, "grad_norm": 0.1630859375, "learning_rate": 4.396366422858514e-05, "loss": 2.5217, "step": 8520 }, { "epoch": 0.7529681776051551, "grad_norm": 0.1474609375, "learning_rate": 4.3667224107650515e-05, "loss": 2.528, "step": 8530 }, { "epoch": 0.7538509070044578, "grad_norm": 0.14453125, "learning_rate": 4.33716164437085e-05, "loss": 2.5256, "step": 8540 }, { "epoch": 0.7547336364037605, "grad_norm": 0.1494140625, "learning_rate": 4.307684355100953e-05, "loss": 2.5234, "step": 8550 }, { "epoch": 0.755616365803063, "grad_norm": 0.1513671875, "learning_rate": 4.2782907737268644e-05, "loss": 2.5342, "step": 8560 }, { "epoch": 0.7564990952023657, "grad_norm": 0.1494140625, "learning_rate": 4.24898113036477e-05, "loss": 2.5258, "step": 8570 }, { "epoch": 0.7573818246016684, "grad_norm": 0.15625, "learning_rate": 4.219755654473722e-05, "loss": 2.5198, "step": 8580 }, { "epoch": 0.758264554000971, "grad_norm": 0.1484375, "learning_rate": 4.190614574853832e-05, "loss": 2.5198, "step": 8590 }, { "epoch": 0.7591472834002736, "grad_norm": 0.1484375, "learning_rate": 4.1615581196444996e-05, "loss": 2.5231, "step": 8600 }, { "epoch": 0.7600300127995763, "grad_norm": 0.1474609375, "learning_rate": 4.132586516322625e-05, "loss": 2.5191, "step": 8610 }, { "epoch": 0.7609127421988789, "grad_norm": 0.1435546875, "learning_rate": 4.103699991700799e-05, "loss": 2.5385, "step": 8620 }, { "epoch": 0.7617954715981816, "grad_norm": 0.142578125, "learning_rate": 4.074898771925573e-05, "loss": 2.512, "step": 8630 }, { "epoch": 0.7626782009974842, "grad_norm": 0.1435546875, "learning_rate": 4.04618308247564e-05, "loss": 2.5103, "step": 8640 }, { "epoch": 0.7635609303967869, "grad_norm": 0.1455078125, "learning_rate": 4.017553148160126e-05, "loss": 2.5212, "step": 8650 }, { "epoch": 0.7644436597960895, "grad_norm": 0.146484375, "learning_rate": 3.989009193116768e-05, "loss": 2.5259, "step": 8660 }, { "epoch": 0.7653263891953922, "grad_norm": 0.14453125, "learning_rate": 3.960551440810203e-05, "loss": 2.5122, "step": 8670 }, { "epoch": 0.7662091185946948, "grad_norm": 0.146484375, "learning_rate": 3.9321801140302086e-05, "loss": 2.5234, "step": 8680 }, { "epoch": 0.7670918479939974, "grad_norm": 0.146484375, "learning_rate": 3.903895434889939e-05, "loss": 2.5138, "step": 8690 }, { "epoch": 0.7679745773933001, "grad_norm": 0.1416015625, "learning_rate": 3.8756976248242246e-05, "loss": 2.5174, "step": 8700 }, { "epoch": 0.7688573067926028, "grad_norm": 0.1435546875, "learning_rate": 3.847586904587793e-05, "loss": 2.5276, "step": 8710 }, { "epoch": 0.7697400361919053, "grad_norm": 0.1494140625, "learning_rate": 3.819563494253582e-05, "loss": 2.5169, "step": 8720 }, { "epoch": 0.770622765591208, "grad_norm": 0.1455078125, "learning_rate": 3.7916276132109915e-05, "loss": 2.5045, "step": 8730 }, { "epoch": 0.7715054949905107, "grad_norm": 0.150390625, "learning_rate": 3.763779480164165e-05, "loss": 2.5122, "step": 8740 }, { "epoch": 0.7723882243898132, "grad_norm": 0.1494140625, "learning_rate": 3.736019313130306e-05, "loss": 2.5251, "step": 8750 }, { "epoch": 0.7732709537891159, "grad_norm": 0.162109375, "learning_rate": 3.708347329437933e-05, "loss": 2.5265, "step": 8760 }, { "epoch": 0.7741536831884186, "grad_norm": 0.1484375, "learning_rate": 3.680763745725194e-05, "loss": 2.514, "step": 8770 }, { "epoch": 0.7750364125877213, "grad_norm": 0.14453125, "learning_rate": 3.6532687779381816e-05, "loss": 2.5269, "step": 8780 }, { "epoch": 0.7759191419870238, "grad_norm": 0.16015625, "learning_rate": 3.625862641329232e-05, "loss": 2.5342, "step": 8790 }, { "epoch": 0.7768018713863265, "grad_norm": 0.1513671875, "learning_rate": 3.598545550455244e-05, "loss": 2.5265, "step": 8800 }, { "epoch": 0.7776846007856292, "grad_norm": 0.15625, "learning_rate": 3.5713177191759795e-05, "loss": 2.54, "step": 8810 }, { "epoch": 0.7785673301849318, "grad_norm": 0.15234375, "learning_rate": 3.544179360652421e-05, "loss": 2.5284, "step": 8820 }, { "epoch": 0.7794500595842344, "grad_norm": 0.14453125, "learning_rate": 3.517130687345089e-05, "loss": 2.5166, "step": 8830 }, { "epoch": 0.7803327889835371, "grad_norm": 0.14453125, "learning_rate": 3.490171911012364e-05, "loss": 2.5229, "step": 8840 }, { "epoch": 0.7812155183828398, "grad_norm": 0.15234375, "learning_rate": 3.463303242708851e-05, "loss": 2.5221, "step": 8850 }, { "epoch": 0.7820982477821424, "grad_norm": 0.14453125, "learning_rate": 3.436524892783725e-05, "loss": 2.5132, "step": 8860 }, { "epoch": 0.782980977181445, "grad_norm": 0.1494140625, "learning_rate": 3.4098370708790574e-05, "loss": 2.5148, "step": 8870 }, { "epoch": 0.7838637065807477, "grad_norm": 0.1513671875, "learning_rate": 3.3832399859282116e-05, "loss": 2.5252, "step": 8880 }, { "epoch": 0.7847464359800503, "grad_norm": 0.15625, "learning_rate": 3.356733846154185e-05, "loss": 2.5277, "step": 8890 }, { "epoch": 0.785629165379353, "grad_norm": 0.146484375, "learning_rate": 3.33031885906799e-05, "loss": 2.5121, "step": 8900 }, { "epoch": 0.7865118947786556, "grad_norm": 0.1572265625, "learning_rate": 3.3039952314670135e-05, "loss": 2.5228, "step": 8910 }, { "epoch": 0.7873946241779582, "grad_norm": 0.1494140625, "learning_rate": 3.2777631694334056e-05, "loss": 2.5075, "step": 8920 }, { "epoch": 0.7882773535772609, "grad_norm": 0.1533203125, "learning_rate": 3.2516228783324904e-05, "loss": 2.5291, "step": 8930 }, { "epoch": 0.7891600829765635, "grad_norm": 0.1533203125, "learning_rate": 3.2255745628111116e-05, "loss": 2.5008, "step": 8940 }, { "epoch": 0.7900428123758662, "grad_norm": 0.146484375, "learning_rate": 3.19961842679607e-05, "loss": 2.5274, "step": 8950 }, { "epoch": 0.7909255417751688, "grad_norm": 0.146484375, "learning_rate": 3.1737546734925126e-05, "loss": 2.525, "step": 8960 }, { "epoch": 0.7918082711744715, "grad_norm": 0.16015625, "learning_rate": 3.147983505382329e-05, "loss": 2.5302, "step": 8970 }, { "epoch": 0.7926910005737741, "grad_norm": 0.154296875, "learning_rate": 3.122305124222597e-05, "loss": 2.5282, "step": 8980 }, { "epoch": 0.7935737299730767, "grad_norm": 0.14453125, "learning_rate": 3.096719731043965e-05, "loss": 2.4993, "step": 8990 }, { "epoch": 0.7944564593723794, "grad_norm": 0.146484375, "learning_rate": 3.0712275261491145e-05, "loss": 2.5319, "step": 9000 }, { "epoch": 0.7953391887716821, "grad_norm": 0.15234375, "learning_rate": 3.0458287091111692e-05, "loss": 2.5233, "step": 9010 }, { "epoch": 0.7962219181709846, "grad_norm": 0.1435546875, "learning_rate": 3.02052347877213e-05, "loss": 2.5286, "step": 9020 }, { "epoch": 0.7971046475702873, "grad_norm": 0.1455078125, "learning_rate": 2.9953120332413517e-05, "loss": 2.526, "step": 9030 }, { "epoch": 0.79798737696959, "grad_norm": 0.1435546875, "learning_rate": 2.970194569893937e-05, "loss": 2.5272, "step": 9040 }, { "epoch": 0.7988701063688927, "grad_norm": 0.14453125, "learning_rate": 2.9451712853692484e-05, "loss": 2.5134, "step": 9050 }, { "epoch": 0.7997528357681952, "grad_norm": 0.14453125, "learning_rate": 2.9202423755693193e-05, "loss": 2.538, "step": 9060 }, { "epoch": 0.8006355651674979, "grad_norm": 0.150390625, "learning_rate": 2.8954080356573596e-05, "loss": 2.5125, "step": 9070 }, { "epoch": 0.8015182945668006, "grad_norm": 0.14453125, "learning_rate": 2.8706684600562098e-05, "loss": 2.5203, "step": 9080 }, { "epoch": 0.8024010239661032, "grad_norm": 0.15234375, "learning_rate": 2.846023842446808e-05, "loss": 2.524, "step": 9090 }, { "epoch": 0.8032837533654058, "grad_norm": 0.1513671875, "learning_rate": 2.8214743757666984e-05, "loss": 2.5204, "step": 9100 }, { "epoch": 0.8041664827647085, "grad_norm": 0.150390625, "learning_rate": 2.79702025220851e-05, "loss": 2.5151, "step": 9110 }, { "epoch": 0.8050492121640112, "grad_norm": 0.1533203125, "learning_rate": 2.772661663218438e-05, "loss": 2.5228, "step": 9120 }, { "epoch": 0.8059319415633137, "grad_norm": 0.1455078125, "learning_rate": 2.748398799494766e-05, "loss": 2.5322, "step": 9130 }, { "epoch": 0.8068146709626164, "grad_norm": 0.1474609375, "learning_rate": 2.7242318509863708e-05, "loss": 2.5293, "step": 9140 }, { "epoch": 0.8076974003619191, "grad_norm": 0.1552734375, "learning_rate": 2.7001610068912117e-05, "loss": 2.537, "step": 9150 }, { "epoch": 0.8085801297612217, "grad_norm": 0.1474609375, "learning_rate": 2.6761864556548805e-05, "loss": 2.5206, "step": 9160 }, { "epoch": 0.8094628591605243, "grad_norm": 0.154296875, "learning_rate": 2.65230838496911e-05, "loss": 2.5154, "step": 9170 }, { "epoch": 0.810345588559827, "grad_norm": 0.146484375, "learning_rate": 2.62852698177031e-05, "loss": 2.5136, "step": 9180 }, { "epoch": 0.8112283179591296, "grad_norm": 0.1513671875, "learning_rate": 2.6048424322380877e-05, "loss": 2.5264, "step": 9190 }, { "epoch": 0.8121110473584323, "grad_norm": 0.1484375, "learning_rate": 2.581254921793816e-05, "loss": 2.5233, "step": 9200 }, { "epoch": 0.8129937767577349, "grad_norm": 0.14453125, "learning_rate": 2.5577646350991686e-05, "loss": 2.5135, "step": 9210 }, { "epoch": 0.8138765061570375, "grad_norm": 0.1494140625, "learning_rate": 2.534371756054664e-05, "loss": 2.5211, "step": 9220 }, { "epoch": 0.8147592355563402, "grad_norm": 0.1455078125, "learning_rate": 2.5110764677982496e-05, "loss": 2.5256, "step": 9230 }, { "epoch": 0.8156419649556429, "grad_norm": 0.1455078125, "learning_rate": 2.4878789527038523e-05, "loss": 2.5161, "step": 9240 }, { "epoch": 0.8165246943549455, "grad_norm": 0.1435546875, "learning_rate": 2.4647793923799413e-05, "loss": 2.5268, "step": 9250 }, { "epoch": 0.8174074237542481, "grad_norm": 0.14453125, "learning_rate": 2.441777967668139e-05, "loss": 2.5147, "step": 9260 }, { "epoch": 0.8182901531535508, "grad_norm": 0.1474609375, "learning_rate": 2.4188748586417588e-05, "loss": 2.5222, "step": 9270 }, { "epoch": 0.8191728825528535, "grad_norm": 0.1474609375, "learning_rate": 2.396070244604454e-05, "loss": 2.5201, "step": 9280 }, { "epoch": 0.820055611952156, "grad_norm": 0.142578125, "learning_rate": 2.3733643040887524e-05, "loss": 2.5223, "step": 9290 }, { "epoch": 0.8209383413514587, "grad_norm": 0.1455078125, "learning_rate": 2.3507572148546928e-05, "loss": 2.5173, "step": 9300 }, { "epoch": 0.8218210707507614, "grad_norm": 0.1474609375, "learning_rate": 2.328249153888449e-05, "loss": 2.5311, "step": 9310 }, { "epoch": 0.822703800150064, "grad_norm": 0.1435546875, "learning_rate": 2.3058402974008954e-05, "loss": 2.5002, "step": 9320 }, { "epoch": 0.8235865295493666, "grad_norm": 0.1474609375, "learning_rate": 2.283530820826276e-05, "loss": 2.5165, "step": 9330 }, { "epoch": 0.8244692589486693, "grad_norm": 0.1494140625, "learning_rate": 2.261320898820795e-05, "loss": 2.5198, "step": 9340 }, { "epoch": 0.825351988347972, "grad_norm": 0.1533203125, "learning_rate": 2.239210705261275e-05, "loss": 2.5456, "step": 9350 }, { "epoch": 0.8262347177472745, "grad_norm": 0.14453125, "learning_rate": 2.2172004132437858e-05, "loss": 2.5197, "step": 9360 }, { "epoch": 0.8271174471465772, "grad_norm": 0.1474609375, "learning_rate": 2.195290195082279e-05, "loss": 2.5263, "step": 9370 }, { "epoch": 0.8280001765458799, "grad_norm": 0.14453125, "learning_rate": 2.1734802223072584e-05, "loss": 2.519, "step": 9380 }, { "epoch": 0.8288829059451825, "grad_norm": 0.1494140625, "learning_rate": 2.151770665664429e-05, "loss": 2.5051, "step": 9390 }, { "epoch": 0.8297656353444851, "grad_norm": 0.140625, "learning_rate": 2.1301616951133473e-05, "loss": 2.5176, "step": 9400 }, { "epoch": 0.8306483647437878, "grad_norm": 0.142578125, "learning_rate": 2.108653479826114e-05, "loss": 2.5257, "step": 9410 }, { "epoch": 0.8315310941430905, "grad_norm": 0.14453125, "learning_rate": 2.0872461881860354e-05, "loss": 2.5256, "step": 9420 }, { "epoch": 0.8324138235423931, "grad_norm": 0.1455078125, "learning_rate": 2.0659399877863054e-05, "loss": 2.5229, "step": 9430 }, { "epoch": 0.8332965529416957, "grad_norm": 0.1416015625, "learning_rate": 2.044735045428693e-05, "loss": 2.5084, "step": 9440 }, { "epoch": 0.8341792823409984, "grad_norm": 0.150390625, "learning_rate": 2.023631527122247e-05, "loss": 2.5064, "step": 9450 }, { "epoch": 0.835062011740301, "grad_norm": 0.1435546875, "learning_rate": 2.002629598081985e-05, "loss": 2.533, "step": 9460 }, { "epoch": 0.8359447411396037, "grad_norm": 0.158203125, "learning_rate": 1.9817294227275987e-05, "loss": 2.521, "step": 9470 }, { "epoch": 0.8368274705389063, "grad_norm": 0.146484375, "learning_rate": 1.960931164682178e-05, "loss": 2.5291, "step": 9480 }, { "epoch": 0.8377101999382089, "grad_norm": 0.1416015625, "learning_rate": 1.940234986770926e-05, "loss": 2.5257, "step": 9490 }, { "epoch": 0.8385929293375116, "grad_norm": 0.14453125, "learning_rate": 1.919641051019869e-05, "loss": 2.5157, "step": 9500 }, { "epoch": 0.8394756587368142, "grad_norm": 0.154296875, "learning_rate": 1.899149518654613e-05, "loss": 2.5225, "step": 9510 }, { "epoch": 0.8403583881361169, "grad_norm": 0.1513671875, "learning_rate": 1.8787605500990688e-05, "loss": 2.5271, "step": 9520 }, { "epoch": 0.8412411175354195, "grad_norm": 0.1455078125, "learning_rate": 1.8584743049741864e-05, "loss": 2.5308, "step": 9530 }, { "epoch": 0.8421238469347222, "grad_norm": 0.1474609375, "learning_rate": 1.8382909420967302e-05, "loss": 2.5248, "step": 9540 }, { "epoch": 0.8430065763340248, "grad_norm": 0.142578125, "learning_rate": 1.8182106194780044e-05, "loss": 2.5321, "step": 9550 }, { "epoch": 0.8438893057333274, "grad_norm": 0.1435546875, "learning_rate": 1.7982334943226517e-05, "loss": 2.5276, "step": 9560 }, { "epoch": 0.8447720351326301, "grad_norm": 0.1435546875, "learning_rate": 1.778359723027386e-05, "loss": 2.5119, "step": 9570 }, { "epoch": 0.8456547645319328, "grad_norm": 0.1455078125, "learning_rate": 1.7585894611797967e-05, "loss": 2.5195, "step": 9580 }, { "epoch": 0.8465374939312353, "grad_norm": 0.146484375, "learning_rate": 1.7389228635571168e-05, "loss": 2.5207, "step": 9590 }, { "epoch": 0.847420223330538, "grad_norm": 0.1494140625, "learning_rate": 1.7193600841250095e-05, "loss": 2.5032, "step": 9600 }, { "epoch": 0.8483029527298407, "grad_norm": 0.1513671875, "learning_rate": 1.6999012760363734e-05, "loss": 2.5265, "step": 9610 }, { "epoch": 0.8491856821291434, "grad_norm": 0.150390625, "learning_rate": 1.6805465916301304e-05, "loss": 2.5189, "step": 9620 }, { "epoch": 0.8500684115284459, "grad_norm": 0.1435546875, "learning_rate": 1.661296182430047e-05, "loss": 2.5247, "step": 9630 }, { "epoch": 0.8509511409277486, "grad_norm": 0.1533203125, "learning_rate": 1.64215019914354e-05, "loss": 2.5286, "step": 9640 }, { "epoch": 0.8518338703270513, "grad_norm": 0.138671875, "learning_rate": 1.623108791660486e-05, "loss": 2.5286, "step": 9650 }, { "epoch": 0.8527165997263539, "grad_norm": 0.1435546875, "learning_rate": 1.6041721090520815e-05, "loss": 2.522, "step": 9660 }, { "epoch": 0.8535993291256565, "grad_norm": 0.1484375, "learning_rate": 1.585340299569637e-05, "loss": 2.5245, "step": 9670 }, { "epoch": 0.8544820585249592, "grad_norm": 0.142578125, "learning_rate": 1.566613510643432e-05, "loss": 2.5262, "step": 9680 }, { "epoch": 0.8553647879242618, "grad_norm": 0.15625, "learning_rate": 1.547991888881574e-05, "loss": 2.5202, "step": 9690 }, { "epoch": 0.8562475173235644, "grad_norm": 0.142578125, "learning_rate": 1.5294755800688307e-05, "loss": 2.5165, "step": 9700 }, { "epoch": 0.8571302467228671, "grad_norm": 0.1455078125, "learning_rate": 1.5110647291655025e-05, "loss": 2.517, "step": 9710 }, { "epoch": 0.8580129761221698, "grad_norm": 0.1484375, "learning_rate": 1.492759480306272e-05, "loss": 2.5204, "step": 9720 }, { "epoch": 0.8588957055214724, "grad_norm": 0.1474609375, "learning_rate": 1.4745599767990945e-05, "loss": 2.5291, "step": 9730 }, { "epoch": 0.859778434920775, "grad_norm": 0.150390625, "learning_rate": 1.4564663611240696e-05, "loss": 2.5206, "step": 9740 }, { "epoch": 0.8606611643200777, "grad_norm": 0.1455078125, "learning_rate": 1.4384787749323095e-05, "loss": 2.5217, "step": 9750 }, { "epoch": 0.8615438937193803, "grad_norm": 0.142578125, "learning_rate": 1.420597359044856e-05, "loss": 2.516, "step": 9760 }, { "epoch": 0.862426623118683, "grad_norm": 0.1435546875, "learning_rate": 1.402822253451567e-05, "loss": 2.5208, "step": 9770 }, { "epoch": 0.8633093525179856, "grad_norm": 0.1455078125, "learning_rate": 1.3851535973100081e-05, "loss": 2.5198, "step": 9780 }, { "epoch": 0.8641920819172882, "grad_norm": 0.1494140625, "learning_rate": 1.3675915289443828e-05, "loss": 2.5015, "step": 9790 }, { "epoch": 0.8650748113165909, "grad_norm": 0.1396484375, "learning_rate": 1.350136185844441e-05, "loss": 2.5008, "step": 9800 }, { "epoch": 0.8659575407158936, "grad_norm": 0.150390625, "learning_rate": 1.3327877046644036e-05, "loss": 2.5321, "step": 9810 }, { "epoch": 0.8668402701151962, "grad_norm": 0.1435546875, "learning_rate": 1.3155462212218881e-05, "loss": 2.5277, "step": 9820 }, { "epoch": 0.8677229995144988, "grad_norm": 0.14453125, "learning_rate": 1.298411870496845e-05, "loss": 2.5212, "step": 9830 }, { "epoch": 0.8686057289138015, "grad_norm": 0.142578125, "learning_rate": 1.2813847866305204e-05, "loss": 2.5146, "step": 9840 }, { "epoch": 0.8694884583131042, "grad_norm": 0.1435546875, "learning_rate": 1.2644651029243747e-05, "loss": 2.5204, "step": 9850 }, { "epoch": 0.8703711877124067, "grad_norm": 0.1474609375, "learning_rate": 1.247652951839065e-05, "loss": 2.511, "step": 9860 }, { "epoch": 0.8712539171117094, "grad_norm": 0.1533203125, "learning_rate": 1.230948464993397e-05, "loss": 2.5177, "step": 9870 }, { "epoch": 0.8721366465110121, "grad_norm": 0.146484375, "learning_rate": 1.2143517731632878e-05, "loss": 2.5317, "step": 9880 }, { "epoch": 0.8730193759103146, "grad_norm": 0.1455078125, "learning_rate": 1.197863006280761e-05, "loss": 2.5347, "step": 9890 }, { "epoch": 0.8739021053096173, "grad_norm": 0.142578125, "learning_rate": 1.1814822934329055e-05, "loss": 2.5204, "step": 9900 }, { "epoch": 0.87478483470892, "grad_norm": 0.142578125, "learning_rate": 1.1652097628608925e-05, "loss": 2.5262, "step": 9910 }, { "epoch": 0.8756675641082227, "grad_norm": 0.1474609375, "learning_rate": 1.149045541958949e-05, "loss": 2.5196, "step": 9920 }, { "epoch": 0.8765502935075252, "grad_norm": 0.14453125, "learning_rate": 1.1329897572733614e-05, "loss": 2.5366, "step": 9930 }, { "epoch": 0.8774330229068279, "grad_norm": 0.1474609375, "learning_rate": 1.117042534501511e-05, "loss": 2.5339, "step": 9940 }, { "epoch": 0.8783157523061306, "grad_norm": 0.1455078125, "learning_rate": 1.1012039984908516e-05, "loss": 2.5345, "step": 9950 }, { "epoch": 0.8791984817054332, "grad_norm": 0.1494140625, "learning_rate": 1.0854742732379612e-05, "loss": 2.5219, "step": 9960 }, { "epoch": 0.8800812111047358, "grad_norm": 0.14453125, "learning_rate": 1.0698534818875543e-05, "loss": 2.5286, "step": 9970 }, { "epoch": 0.8809639405040385, "grad_norm": 0.15234375, "learning_rate": 1.054341746731534e-05, "loss": 2.5282, "step": 9980 }, { "epoch": 0.8818466699033412, "grad_norm": 0.1435546875, "learning_rate": 1.0389391892080185e-05, "loss": 2.5203, "step": 9990 }, { "epoch": 0.8827293993026438, "grad_norm": 0.146484375, "learning_rate": 1.0236459299003974e-05, "loss": 2.528, "step": 10000 }, { "epoch": 0.8827293993026438, "eval_accuracy": 0.5067324010720237, "eval_loss": 2.4069998264312744, "eval_runtime": 6.9831, "eval_samples_per_second": 45.539, "eval_steps_per_second": 0.43, "step": 10000 }, { "epoch": 0.8836121287019464, "grad_norm": 0.1416015625, "learning_rate": 1.008462088536392e-05, "loss": 2.5319, "step": 10010 }, { "epoch": 0.8844948581012491, "grad_norm": 0.14453125, "learning_rate": 9.933877839871101e-06, "loss": 2.518, "step": 10020 }, { "epoch": 0.8853775875005517, "grad_norm": 0.1484375, "learning_rate": 9.784231342661158e-06, "loss": 2.5313, "step": 10030 }, { "epoch": 0.8862603168998543, "grad_norm": 0.1435546875, "learning_rate": 9.635682565285146e-06, "loss": 2.516, "step": 10040 }, { "epoch": 0.887143046299157, "grad_norm": 0.1435546875, "learning_rate": 9.488232670700296e-06, "loss": 2.532, "step": 10050 }, { "epoch": 0.8880257756984596, "grad_norm": 0.1416015625, "learning_rate": 9.341882813260854e-06, "loss": 2.5059, "step": 10060 }, { "epoch": 0.8889085050977623, "grad_norm": 0.1591796875, "learning_rate": 9.196634138709186e-06, "loss": 2.5171, "step": 10070 }, { "epoch": 0.8897912344970649, "grad_norm": 0.1484375, "learning_rate": 9.052487784166656e-06, "loss": 2.5213, "step": 10080 }, { "epoch": 0.8906739638963675, "grad_norm": 0.140625, "learning_rate": 8.909444878124866e-06, "loss": 2.5345, "step": 10090 }, { "epoch": 0.8915566932956702, "grad_norm": 0.14453125, "learning_rate": 8.767506540436675e-06, "loss": 2.5191, "step": 10100 }, { "epoch": 0.8924394226949729, "grad_norm": 0.1484375, "learning_rate": 8.626673882307578e-06, "loss": 2.5239, "step": 10110 }, { "epoch": 0.8933221520942755, "grad_norm": 0.1416015625, "learning_rate": 8.486948006286926e-06, "loss": 2.5161, "step": 10120 }, { "epoch": 0.8942048814935781, "grad_norm": 0.140625, "learning_rate": 8.348330006259234e-06, "loss": 2.5199, "step": 10130 }, { "epoch": 0.8950876108928808, "grad_norm": 0.14453125, "learning_rate": 8.210820967435778e-06, "loss": 2.5185, "step": 10140 }, { "epoch": 0.8959703402921835, "grad_norm": 0.14453125, "learning_rate": 8.074421966345952e-06, "loss": 2.5102, "step": 10150 }, { "epoch": 0.896853069691486, "grad_norm": 0.1435546875, "learning_rate": 7.939134070828907e-06, "loss": 2.5234, "step": 10160 }, { "epoch": 0.8977357990907887, "grad_norm": 0.138671875, "learning_rate": 7.804958340025252e-06, "loss": 2.5315, "step": 10170 }, { "epoch": 0.8986185284900914, "grad_norm": 0.14453125, "learning_rate": 7.671895824368546e-06, "loss": 2.5383, "step": 10180 }, { "epoch": 0.8995012578893941, "grad_norm": 0.14453125, "learning_rate": 7.539947565577381e-06, "loss": 2.5136, "step": 10190 }, { "epoch": 0.9003839872886966, "grad_norm": 0.1455078125, "learning_rate": 7.4091145966469645e-06, "loss": 2.5122, "step": 10200 }, { "epoch": 0.9012667166879993, "grad_norm": 0.1416015625, "learning_rate": 7.279397941841136e-06, "loss": 2.5191, "step": 10210 }, { "epoch": 0.902149446087302, "grad_norm": 0.1455078125, "learning_rate": 7.1507986166843944e-06, "loss": 2.5238, "step": 10220 }, { "epoch": 0.9030321754866045, "grad_norm": 0.1484375, "learning_rate": 7.02331762795385e-06, "loss": 2.5087, "step": 10230 }, { "epoch": 0.9039149048859072, "grad_norm": 0.1474609375, "learning_rate": 6.896955973671487e-06, "loss": 2.5184, "step": 10240 }, { "epoch": 0.9047976342852099, "grad_norm": 0.1435546875, "learning_rate": 6.771714643096093e-06, "loss": 2.5134, "step": 10250 }, { "epoch": 0.9056803636845125, "grad_norm": 0.150390625, "learning_rate": 6.647594616715812e-06, "loss": 2.5425, "step": 10260 }, { "epoch": 0.9065630930838151, "grad_norm": 0.14453125, "learning_rate": 6.524596866240306e-06, "loss": 2.5255, "step": 10270 }, { "epoch": 0.9074458224831178, "grad_norm": 0.14453125, "learning_rate": 6.4027223545931315e-06, "loss": 2.5265, "step": 10280 }, { "epoch": 0.9083285518824205, "grad_norm": 0.1416015625, "learning_rate": 6.281972035904265e-06, "loss": 2.515, "step": 10290 }, { "epoch": 0.9092112812817231, "grad_norm": 0.146484375, "learning_rate": 6.162346855502637e-06, "loss": 2.5176, "step": 10300 }, { "epoch": 0.9100940106810257, "grad_norm": 0.14453125, "learning_rate": 6.043847749908676e-06, "loss": 2.5215, "step": 10310 }, { "epoch": 0.9109767400803284, "grad_norm": 0.1455078125, "learning_rate": 5.9264756468269946e-06, "loss": 2.527, "step": 10320 }, { "epoch": 0.911859469479631, "grad_norm": 0.1552734375, "learning_rate": 5.810231465139198e-06, "loss": 2.5213, "step": 10330 }, { "epoch": 0.9127421988789337, "grad_norm": 0.1416015625, "learning_rate": 5.695116114896603e-06, "loss": 2.5299, "step": 10340 }, { "epoch": 0.9136249282782363, "grad_norm": 0.14453125, "learning_rate": 5.581130497313096e-06, "loss": 2.5215, "step": 10350 }, { "epoch": 0.9145076576775389, "grad_norm": 0.1455078125, "learning_rate": 5.468275504758157e-06, "loss": 2.522, "step": 10360 }, { "epoch": 0.9153903870768416, "grad_norm": 0.1494140625, "learning_rate": 5.35655202074991e-06, "loss": 2.5183, "step": 10370 }, { "epoch": 0.9162731164761443, "grad_norm": 0.146484375, "learning_rate": 5.245960919947984e-06, "loss": 2.5088, "step": 10380 }, { "epoch": 0.9171558458754469, "grad_norm": 0.1435546875, "learning_rate": 5.136503068146963e-06, "loss": 2.5145, "step": 10390 }, { "epoch": 0.9180385752747495, "grad_norm": 0.1435546875, "learning_rate": 5.0281793222693986e-06, "loss": 2.5054, "step": 10400 }, { "epoch": 0.9189213046740522, "grad_norm": 0.146484375, "learning_rate": 4.920990530359142e-06, "loss": 2.5221, "step": 10410 }, { "epoch": 0.9198040340733548, "grad_norm": 0.14453125, "learning_rate": 4.8149375315747865e-06, "loss": 2.5074, "step": 10420 }, { "epoch": 0.9206867634726574, "grad_norm": 0.1455078125, "learning_rate": 4.710021156182969e-06, "loss": 2.517, "step": 10430 }, { "epoch": 0.9215694928719601, "grad_norm": 0.1435546875, "learning_rate": 4.606242225552015e-06, "loss": 2.5157, "step": 10440 }, { "epoch": 0.9224522222712628, "grad_norm": 0.1416015625, "learning_rate": 4.503601552145403e-06, "loss": 2.5327, "step": 10450 }, { "epoch": 0.9233349516705653, "grad_norm": 0.1630859375, "learning_rate": 4.402099939515374e-06, "loss": 2.5172, "step": 10460 }, { "epoch": 0.924217681069868, "grad_norm": 0.14453125, "learning_rate": 4.301738182296838e-06, "loss": 2.5241, "step": 10470 }, { "epoch": 0.9251004104691707, "grad_norm": 0.1435546875, "learning_rate": 4.202517066200872e-06, "loss": 2.5145, "step": 10480 }, { "epoch": 0.9259831398684734, "grad_norm": 0.14453125, "learning_rate": 4.104437368008817e-06, "loss": 2.5354, "step": 10490 }, { "epoch": 0.9268658692677759, "grad_norm": 0.1435546875, "learning_rate": 4.00749985556601e-06, "loss": 2.5148, "step": 10500 }, { "epoch": 0.9277485986670786, "grad_norm": 0.14453125, "learning_rate": 3.911705287775923e-06, "loss": 2.5215, "step": 10510 }, { "epoch": 0.9286313280663813, "grad_norm": 0.146484375, "learning_rate": 3.817054414594156e-06, "loss": 2.5205, "step": 10520 }, { "epoch": 0.9295140574656839, "grad_norm": 0.1455078125, "learning_rate": 3.723547977022484e-06, "loss": 2.5138, "step": 10530 }, { "epoch": 0.9303967868649865, "grad_norm": 0.142578125, "learning_rate": 3.6311867071032006e-06, "loss": 2.5331, "step": 10540 }, { "epoch": 0.9312795162642892, "grad_norm": 0.1611328125, "learning_rate": 3.539971327913338e-06, "loss": 2.5093, "step": 10550 }, { "epoch": 0.9321622456635918, "grad_norm": 0.1455078125, "learning_rate": 3.4499025535589363e-06, "loss": 2.5228, "step": 10560 }, { "epoch": 0.9330449750628945, "grad_norm": 0.14453125, "learning_rate": 3.3609810891695345e-06, "loss": 2.5138, "step": 10570 }, { "epoch": 0.9339277044621971, "grad_norm": 0.1455078125, "learning_rate": 3.273207630892638e-06, "loss": 2.5321, "step": 10580 }, { "epoch": 0.9348104338614998, "grad_norm": 0.14453125, "learning_rate": 3.18658286588821e-06, "loss": 2.5357, "step": 10590 }, { "epoch": 0.9356931632608024, "grad_norm": 0.1396484375, "learning_rate": 3.101107472323372e-06, "loss": 2.5202, "step": 10600 }, { "epoch": 0.936575892660105, "grad_norm": 0.1474609375, "learning_rate": 3.016782119367045e-06, "loss": 2.5207, "step": 10610 }, { "epoch": 0.9374586220594077, "grad_norm": 0.1435546875, "learning_rate": 2.933607467184751e-06, "loss": 2.5176, "step": 10620 }, { "epoch": 0.9383413514587103, "grad_norm": 0.146484375, "learning_rate": 2.851584166933352e-06, "loss": 2.527, "step": 10630 }, { "epoch": 0.939224080858013, "grad_norm": 0.142578125, "learning_rate": 2.7707128607560693e-06, "loss": 2.5126, "step": 10640 }, { "epoch": 0.9401068102573156, "grad_norm": 0.146484375, "learning_rate": 2.690994181777406e-06, "loss": 2.5385, "step": 10650 }, { "epoch": 0.9409895396566182, "grad_norm": 0.1455078125, "learning_rate": 2.6124287540981505e-06, "loss": 2.5173, "step": 10660 }, { "epoch": 0.9418722690559209, "grad_norm": 0.140625, "learning_rate": 2.5350171927905617e-06, "loss": 2.5116, "step": 10670 }, { "epoch": 0.9427549984552236, "grad_norm": 0.1435546875, "learning_rate": 2.4587601038935256e-06, "loss": 2.5182, "step": 10680 }, { "epoch": 0.9436377278545262, "grad_norm": 0.140625, "learning_rate": 2.3836580844077745e-06, "loss": 2.5277, "step": 10690 }, { "epoch": 0.9445204572538288, "grad_norm": 0.142578125, "learning_rate": 2.3097117222912744e-06, "loss": 2.5276, "step": 10700 }, { "epoch": 0.9454031866531315, "grad_norm": 0.1416015625, "learning_rate": 2.2369215964545608e-06, "loss": 2.5215, "step": 10710 }, { "epoch": 0.9462859160524342, "grad_norm": 0.146484375, "learning_rate": 2.1652882767562773e-06, "loss": 2.5281, "step": 10720 }, { "epoch": 0.9471686454517367, "grad_norm": 0.146484375, "learning_rate": 2.0948123239986457e-06, "loss": 2.5192, "step": 10730 }, { "epoch": 0.9480513748510394, "grad_norm": 0.14453125, "learning_rate": 2.0254942899230853e-06, "loss": 2.5156, "step": 10740 }, { "epoch": 0.9489341042503421, "grad_norm": 0.1435546875, "learning_rate": 1.9573347172059996e-06, "loss": 2.505, "step": 10750 }, { "epoch": 0.9498168336496448, "grad_norm": 0.146484375, "learning_rate": 1.890334139454347e-06, "loss": 2.5285, "step": 10760 }, { "epoch": 0.9506995630489473, "grad_norm": 0.140625, "learning_rate": 1.8244930812015945e-06, "loss": 2.5088, "step": 10770 }, { "epoch": 0.95158229244825, "grad_norm": 0.1416015625, "learning_rate": 1.7598120579035702e-06, "loss": 2.5102, "step": 10780 }, { "epoch": 0.9524650218475527, "grad_norm": 0.1435546875, "learning_rate": 1.6962915759344332e-06, "loss": 2.5286, "step": 10790 }, { "epoch": 0.9533477512468552, "grad_norm": 0.1416015625, "learning_rate": 1.6339321325827437e-06, "loss": 2.5105, "step": 10800 }, { "epoch": 0.9542304806461579, "grad_norm": 0.1435546875, "learning_rate": 1.57273421604745e-06, "loss": 2.5235, "step": 10810 }, { "epoch": 0.9551132100454606, "grad_norm": 0.146484375, "learning_rate": 1.5126983054342234e-06, "loss": 2.5269, "step": 10820 }, { "epoch": 0.9559959394447632, "grad_norm": 0.1474609375, "learning_rate": 1.45382487075163e-06, "loss": 2.5232, "step": 10830 }, { "epoch": 0.9568786688440658, "grad_norm": 0.140625, "learning_rate": 1.3961143729073975e-06, "loss": 2.5185, "step": 10840 }, { "epoch": 0.9577613982433685, "grad_norm": 0.1533203125, "learning_rate": 1.339567263704938e-06, "loss": 2.5206, "step": 10850 }, { "epoch": 0.9586441276426712, "grad_norm": 0.1484375, "learning_rate": 1.2841839858396984e-06, "loss": 2.5257, "step": 10860 }, { "epoch": 0.9595268570419738, "grad_norm": 0.146484375, "learning_rate": 1.2299649728957306e-06, "loss": 2.5294, "step": 10870 }, { "epoch": 0.9604095864412764, "grad_norm": 0.14453125, "learning_rate": 1.1769106493423274e-06, "loss": 2.5095, "step": 10880 }, { "epoch": 0.9612923158405791, "grad_norm": 0.1455078125, "learning_rate": 1.125021430530626e-06, "loss": 2.5197, "step": 10890 }, { "epoch": 0.9621750452398817, "grad_norm": 0.142578125, "learning_rate": 1.0742977226904759e-06, "loss": 2.5346, "step": 10900 }, { "epoch": 0.9630577746391844, "grad_norm": 0.142578125, "learning_rate": 1.0247399229271257e-06, "loss": 2.5145, "step": 10910 }, { "epoch": 0.963940504038487, "grad_norm": 0.1435546875, "learning_rate": 9.763484192181758e-07, "loss": 2.5183, "step": 10920 }, { "epoch": 0.9648232334377896, "grad_norm": 0.14453125, "learning_rate": 9.291235904105632e-07, "loss": 2.5116, "step": 10930 }, { "epoch": 0.9657059628370923, "grad_norm": 0.1435546875, "learning_rate": 8.830658062175811e-07, "loss": 2.5246, "step": 10940 }, { "epoch": 0.966588692236395, "grad_norm": 0.1416015625, "learning_rate": 8.381754272159313e-07, "loss": 2.5085, "step": 10950 }, { "epoch": 0.9674714216356975, "grad_norm": 0.1494140625, "learning_rate": 7.944528048430098e-07, "loss": 2.5145, "step": 10960 }, { "epoch": 0.9683541510350002, "grad_norm": 0.142578125, "learning_rate": 7.518982813940255e-07, "loss": 2.5137, "step": 10970 }, { "epoch": 0.9692368804343029, "grad_norm": 0.1435546875, "learning_rate": 7.105121900194188e-07, "loss": 2.5439, "step": 10980 }, { "epoch": 0.9701196098336055, "grad_norm": 0.1416015625, "learning_rate": 6.702948547221976e-07, "loss": 2.5048, "step": 10990 }, { "epoch": 0.9710023392329081, "grad_norm": 0.1435546875, "learning_rate": 6.312465903554886e-07, "loss": 2.5225, "step": 11000 }, { "epoch": 0.9718850686322108, "grad_norm": 0.1513671875, "learning_rate": 5.933677026199235e-07, "loss": 2.5273, "step": 11010 }, { "epoch": 0.9727677980315135, "grad_norm": 0.1435546875, "learning_rate": 5.566584880613567e-07, "loss": 2.5182, "step": 11020 }, { "epoch": 0.973650527430816, "grad_norm": 0.1484375, "learning_rate": 5.211192340685843e-07, "loss": 2.519, "step": 11030 }, { "epoch": 0.9745332568301187, "grad_norm": 0.146484375, "learning_rate": 4.867502188709127e-07, "loss": 2.5026, "step": 11040 }, { "epoch": 0.9754159862294214, "grad_norm": 0.140625, "learning_rate": 4.535517115361764e-07, "loss": 2.5197, "step": 11050 }, { "epoch": 0.9762987156287241, "grad_norm": 0.146484375, "learning_rate": 4.215239719685071e-07, "loss": 2.5235, "step": 11060 }, { "epoch": 0.9771814450280266, "grad_norm": 0.146484375, "learning_rate": 3.9066725090636794e-07, "loss": 2.531, "step": 11070 }, { "epoch": 0.9780641744273293, "grad_norm": 0.1474609375, "learning_rate": 3.609817899205725e-07, "loss": 2.5139, "step": 11080 }, { "epoch": 0.978946903826632, "grad_norm": 0.14453125, "learning_rate": 3.3246782141233553e-07, "loss": 2.5171, "step": 11090 }, { "epoch": 0.9798296332259346, "grad_norm": 0.1455078125, "learning_rate": 3.051255686115417e-07, "loss": 2.5078, "step": 11100 }, { "epoch": 0.9807123626252372, "grad_norm": 0.1416015625, "learning_rate": 2.7895524557497995e-07, "loss": 2.5229, "step": 11110 }, { "epoch": 0.9815950920245399, "grad_norm": 0.15625, "learning_rate": 2.539570571845784e-07, "loss": 2.5278, "step": 11120 }, { "epoch": 0.9824778214238425, "grad_norm": 0.1484375, "learning_rate": 2.3013119914592204e-07, "loss": 2.5069, "step": 11130 }, { "epoch": 0.9833605508231452, "grad_norm": 0.1533203125, "learning_rate": 2.0747785798667095e-07, "loss": 2.5199, "step": 11140 }, { "epoch": 0.9842432802224478, "grad_norm": 0.1533203125, "learning_rate": 1.8599721105506116e-07, "loss": 2.5158, "step": 11150 }, { "epoch": 0.9851260096217505, "grad_norm": 0.1484375, "learning_rate": 1.6568942651857263e-07, "loss": 2.5209, "step": 11160 }, { "epoch": 0.9860087390210531, "grad_norm": 0.14453125, "learning_rate": 1.4655466336261356e-07, "loss": 2.5236, "step": 11170 }, { "epoch": 0.9868914684203557, "grad_norm": 0.1455078125, "learning_rate": 1.2859307138920472e-07, "loss": 2.526, "step": 11180 }, { "epoch": 0.9877741978196584, "grad_norm": 0.1455078125, "learning_rate": 1.1180479121588037e-07, "loss": 2.5154, "step": 11190 }, { "epoch": 0.988656927218961, "grad_norm": 0.14453125, "learning_rate": 9.618995427455589e-08, "loss": 2.5115, "step": 11200 }, { "epoch": 0.9895396566182637, "grad_norm": 0.1416015625, "learning_rate": 8.174868281054513e-08, "loss": 2.5229, "step": 11210 }, { "epoch": 0.9904223860175663, "grad_norm": 0.1484375, "learning_rate": 6.8481089881528e-08, "loss": 2.5213, "step": 11220 }, { "epoch": 0.9913051154168689, "grad_norm": 0.1455078125, "learning_rate": 5.638727935666776e-08, "loss": 2.5222, "step": 11230 }, { "epoch": 0.9921878448161716, "grad_norm": 0.1484375, "learning_rate": 4.5467345915911615e-08, "loss": 2.5187, "step": 11240 }, { "epoch": 0.9930705742154743, "grad_norm": 0.1474609375, "learning_rate": 3.57213750491081e-08, "loss": 2.5222, "step": 11250 }, { "epoch": 0.9939533036147769, "grad_norm": 0.1435546875, "learning_rate": 2.7149443055424214e-08, "loss": 2.5178, "step": 11260 }, { "epoch": 0.9948360330140795, "grad_norm": 0.14453125, "learning_rate": 1.9751617042712554e-08, "loss": 2.5302, "step": 11270 }, { "epoch": 0.9957187624133822, "grad_norm": 0.14453125, "learning_rate": 1.35279549269951e-08, "loss": 2.509, "step": 11280 }, { "epoch": 0.9966014918126849, "grad_norm": 0.1435546875, "learning_rate": 8.478505432030214e-09, "loss": 2.5229, "step": 11290 }, { "epoch": 0.9974842212119874, "grad_norm": 0.1435546875, "learning_rate": 4.603308088879654e-09, "loss": 2.5128, "step": 11300 }, { "epoch": 0.9983669506112901, "grad_norm": 0.150390625, "learning_rate": 1.9023932356754258e-09, "loss": 2.5175, "step": 11310 }, { "epoch": 0.9992496800105928, "grad_norm": 0.142578125, "learning_rate": 3.757820173033721e-10, "loss": 2.5188, "step": 11320 }, { "epoch": 0.9999558635300348, "step": 11328, "total_flos": 2.202782600225295e+20, "train_loss": 2.5603223798301933, "train_runtime": 24610.1089, "train_samples_per_second": 117.842, "train_steps_per_second": 0.46 } ], "logging_steps": 10, "max_steps": 11328, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.202782600225295e+20, "train_batch_size": 2, "trial_name": null, "trial_params": null }