diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,7 +1,7 @@ { - "best_metric": 0.22200879454612732, + "best_metric": 0.279557466506958, "best_model_checkpoint": "output/output_minicpmv26_upsampled_new/checkpoint-100", - "epoch": 2.821869488536155, + "epoch": 0.9720534629404617, "eval_steps": 100, "global_step": 400, "is_hyper_param_search": false, @@ -9,2854 +9,2854 @@ "is_world_process_zero": true, "log_history": [ { - "epoch": 0.007054673721340388, - "grad_norm": 1.6801574230194092, + "epoch": 0.002430133657351154, + "grad_norm": 3.186406373977661, "learning_rate": 0.0, - "loss": 0.302, + "loss": 0.2724, "step": 1 }, { - "epoch": 0.014109347442680775, - "grad_norm": 1.6801574230194092, + "epoch": 0.004860267314702308, + "grad_norm": 3.186406373977661, "learning_rate": 0.0, - "loss": 0.6048, + "loss": 0.475, "step": 2 }, { - "epoch": 0.021164021164021163, - "grad_norm": 1.6801574230194092, + "epoch": 0.007290400972053463, + "grad_norm": 3.186406373977661, "learning_rate": 0.0, - "loss": 0.6328, + "loss": 0.6319, "step": 3 }, { - "epoch": 0.02821869488536155, - "grad_norm": 5.175243377685547, - "learning_rate": 5.017166594399687e-06, - "loss": 0.6277, + "epoch": 0.009720534629404616, + "grad_norm": 3.186406373977661, + "learning_rate": 0.0, + "loss": 0.5354, "step": 4 }, { - "epoch": 0.03527336860670194, - "grad_norm": 5.2468085289001465, - "learning_rate": 7.952020911994375e-06, - "loss": 0.569, + "epoch": 0.012150668286755772, + "grad_norm": 9.746641159057617, + "learning_rate": 5.017166594399687e-06, + "loss": 0.6389, "step": 5 }, { - "epoch": 0.042328042328042326, - "grad_norm": 3.1532468795776367, - "learning_rate": 1.0034333188799373e-05, - "loss": 0.4483, + "epoch": 0.014580801944106925, + "grad_norm": 10.395784378051758, + "learning_rate": 7.952020911994375e-06, + "loss": 0.6399, "step": 6 }, { - "epoch": 0.04938271604938271, - "grad_norm": 2.239643096923828, - "learning_rate": 1.164950007226698e-05, - "loss": 0.2535, + "epoch": 0.01701093560145808, + "grad_norm": 3.7885968685150146, + "learning_rate": 1.0034333188799373e-05, + "loss": 0.3931, "step": 7 }, { - "epoch": 0.0564373897707231, - "grad_norm": 1.6188478469848633, - "learning_rate": 1.2969187506394062e-05, - "loss": 0.3263, + "epoch": 0.019441069258809233, + "grad_norm": 3.202871799468994, + "learning_rate": 1.164950007226698e-05, + "loss": 0.4881, "step": 8 }, { - "epoch": 0.06349206349206349, - "grad_norm": 1.5941171646118164, - "learning_rate": 1.4084967333570947e-05, - "loss": 0.2719, + "epoch": 0.02187120291616039, + "grad_norm": 4.895542144775391, + "learning_rate": 1.2969187506394062e-05, + "loss": 0.3696, "step": 9 }, { - "epoch": 0.07054673721340388, - "grad_norm": 2.198554039001465, - "learning_rate": 1.505149978319906e-05, - "loss": 0.2925, + "epoch": 0.024301336573511544, + "grad_norm": 2.880216360092163, + "learning_rate": 1.4084967333570947e-05, + "loss": 0.4236, "step": 10 }, { - "epoch": 0.07760141093474426, - "grad_norm": 1.4477227926254272, - "learning_rate": 1.590404182398875e-05, - "loss": 0.2046, + "epoch": 0.026731470230862697, + "grad_norm": 1.7968358993530273, + "learning_rate": 1.505149978319906e-05, + "loss": 0.2569, "step": 11 }, { - "epoch": 0.08465608465608465, - "grad_norm": 1.196413278579712, - "learning_rate": 1.666666666666667e-05, - "loss": 0.1979, + "epoch": 0.02916160388821385, + "grad_norm": 2.6668541431427, + "learning_rate": 1.590404182398875e-05, + "loss": 0.5213, "step": 12 }, { - "epoch": 0.09171075837742504, - "grad_norm": 1.5520116090774536, - "learning_rate": 1.7356544752637084e-05, - "loss": 0.2322, + "epoch": 0.031591737545565005, + "grad_norm": 2.0502774715423584, + "learning_rate": 1.666666666666667e-05, + "loss": 0.3246, "step": 13 }, { - "epoch": 0.09876543209876543, - "grad_norm": 1.1696501970291138, - "learning_rate": 1.7986354100793748e-05, - "loss": 0.2485, + "epoch": 0.03402187120291616, + "grad_norm": 1.6508930921554565, + "learning_rate": 1.7356544752637084e-05, + "loss": 0.1883, "step": 14 }, { - "epoch": 0.10582010582010581, - "grad_norm": 1.172625184059143, - "learning_rate": 1.8565722538447282e-05, - "loss": 0.1389, + "epoch": 0.03645200486026731, + "grad_norm": 1.4152283668518066, + "learning_rate": 1.7986354100793748e-05, + "loss": 0.2409, "step": 15 }, { - "epoch": 0.1128747795414462, - "grad_norm": 1.0148265361785889, - "learning_rate": 1.9102133927970633e-05, - "loss": 0.1329, + "epoch": 0.038882138517618466, + "grad_norm": 2.176948308944702, + "learning_rate": 1.8565722538447282e-05, + "loss": 0.2042, "step": 16 }, { - "epoch": 0.11992945326278659, - "grad_norm": 1.0606085062026978, - "learning_rate": 1.9601520984261358e-05, - "loss": 0.2202, + "epoch": 0.041312272174969626, + "grad_norm": 2.2497684955596924, + "learning_rate": 1.9102133927970633e-05, + "loss": 0.2568, "step": 17 }, { - "epoch": 0.12698412698412698, - "grad_norm": 0.9954428672790527, - "learning_rate": 2.0068666377598747e-05, - "loss": 0.1762, + "epoch": 0.04374240583232078, + "grad_norm": 2.5578770637512207, + "learning_rate": 1.9601520984261358e-05, + "loss": 0.1914, "step": 18 }, { - "epoch": 0.13403880070546736, - "grad_norm": 0.918416440486908, - "learning_rate": 2.0507482022971233e-05, - "loss": 0.1175, + "epoch": 0.046172539489671933, + "grad_norm": 1.6818372011184692, + "learning_rate": 2.0068666377598747e-05, + "loss": 0.2355, "step": 19 }, { - "epoch": 0.14109347442680775, - "grad_norm": 0.8776475191116333, - "learning_rate": 2.0921208418388435e-05, - "loss": 0.1791, + "epoch": 0.04860267314702309, + "grad_norm": 1.2788841724395752, + "learning_rate": 2.0507482022971233e-05, + "loss": 0.2149, "step": 20 }, { - "epoch": 0.14814814814814814, - "grad_norm": 1.228663682937622, - "learning_rate": 2.1312560015880482e-05, - "loss": 0.195, + "epoch": 0.05103280680437424, + "grad_norm": 1.4694865942001343, + "learning_rate": 2.0921208418388435e-05, + "loss": 0.1945, "step": 21 }, { - "epoch": 0.15520282186948853, - "grad_norm": 0.999839723110199, - "learning_rate": 2.1683833261066357e-05, - "loss": 0.2305, + "epoch": 0.053462940461725394, + "grad_norm": 1.3222826719284058, + "learning_rate": 2.1312560015880482e-05, + "loss": 0.271, "step": 22 }, { - "epoch": 0.16225749559082892, - "grad_norm": 1.4857021570205688, - "learning_rate": 2.2036988245565324e-05, - "loss": 0.2297, + "epoch": 0.05589307411907655, + "grad_norm": 1.3958441019058228, + "learning_rate": 2.1683833261066357e-05, + "loss": 0.2217, "step": 23 }, { - "epoch": 0.1693121693121693, - "grad_norm": 0.9725483655929565, - "learning_rate": 2.2373711347036773e-05, - "loss": 0.1334, + "epoch": 0.0583232077764277, + "grad_norm": 1.6378260850906372, + "learning_rate": 2.2036988245565324e-05, + "loss": 0.235, "step": 24 }, { - "epoch": 0.1763668430335097, - "grad_norm": 1.1450296640396118, - "learning_rate": 2.269546393362655e-05, - "loss": 0.1456, + "epoch": 0.060753341433778855, + "grad_norm": 1.7126753330230713, + "learning_rate": 2.2373711347036773e-05, + "loss": 0.1788, "step": 25 }, { - "epoch": 0.18342151675485008, - "grad_norm": 0.9552989602088928, - "learning_rate": 2.3003520695193437e-05, - "loss": 0.1589, + "epoch": 0.06318347509113001, + "grad_norm": 1.4920830726623535, + "learning_rate": 2.269546393362655e-05, + "loss": 0.2211, "step": 26 }, { - "epoch": 0.19047619047619047, - "grad_norm": 0.8792590498924255, - "learning_rate": 2.329900014453396e-05, - "loss": 0.1554, + "epoch": 0.06561360874848117, + "grad_norm": 1.6582273244857788, + "learning_rate": 2.3003520695193437e-05, + "loss": 0.1942, "step": 27 }, { - "epoch": 0.19753086419753085, - "grad_norm": 0.9049666523933411, - "learning_rate": 2.3582889132846968e-05, - "loss": 0.1716, + "epoch": 0.06804374240583232, + "grad_norm": 1.3107630014419556, + "learning_rate": 2.329900014453396e-05, + "loss": 0.1218, "step": 28 }, { - "epoch": 0.20458553791887124, - "grad_norm": 0.9107133746147156, - "learning_rate": 2.3856062735983123e-05, - "loss": 0.1428, + "epoch": 0.07047387606318348, + "grad_norm": 1.492126226425171, + "learning_rate": 2.3582889132846968e-05, + "loss": 0.1473, "step": 29 }, { - "epoch": 0.21164021164021163, - "grad_norm": 1.2052390575408936, - "learning_rate": 2.4119300522370322e-05, - "loss": 0.0895, + "epoch": 0.07290400972053462, + "grad_norm": 2.5211129188537598, + "learning_rate": 2.3856062735983123e-05, + "loss": 0.3478, "step": 30 }, { - "epoch": 0.21869488536155202, - "grad_norm": 0.8711329698562622, - "learning_rate": 2.4373299964982603e-05, - "loss": 0.1179, + "epoch": 0.07533414337788578, + "grad_norm": 1.6695655584335327, + "learning_rate": 2.4119300522370322e-05, + "loss": 0.1309, "step": 31 }, { - "epoch": 0.2257495590828924, - "grad_norm": 0.9174087047576904, - "learning_rate": 2.4618687578661044e-05, - "loss": 0.1514, + "epoch": 0.07776427703523693, + "grad_norm": 1.708465576171875, + "learning_rate": 2.4373299964982603e-05, + "loss": 0.2028, "step": 32 }, { - "epoch": 0.2328042328042328, - "grad_norm": 0.8083134293556213, - "learning_rate": 2.4856028230571212e-05, - "loss": 0.125, + "epoch": 0.08019441069258809, + "grad_norm": 1.2873278856277466, + "learning_rate": 2.4618687578661044e-05, + "loss": 0.1629, "step": 33 }, { - "epoch": 0.23985890652557318, - "grad_norm": 1.2985553741455078, - "learning_rate": 2.5085832971998436e-05, - "loss": 0.1992, + "epoch": 0.08262454434993925, + "grad_norm": 1.6095136404037476, + "learning_rate": 2.4856028230571212e-05, + "loss": 0.2027, "step": 34 }, { - "epoch": 0.24691358024691357, - "grad_norm": 1.0799403190612793, - "learning_rate": 2.530856566463146e-05, - "loss": 0.1456, + "epoch": 0.0850546780072904, + "grad_norm": 2.230327844619751, + "learning_rate": 2.5085832971998436e-05, + "loss": 0.2505, "step": 35 }, { - "epoch": 0.25396825396825395, - "grad_norm": 1.2140672206878662, - "learning_rate": 2.552464861737092e-05, - "loss": 0.0722, + "epoch": 0.08748481166464156, + "grad_norm": 0.9581132531166077, + "learning_rate": 2.530856566463146e-05, + "loss": 0.1652, "step": 36 }, { - "epoch": 0.26102292768959434, - "grad_norm": 0.8151078820228577, - "learning_rate": 2.5734467405837933e-05, - "loss": 0.0777, + "epoch": 0.0899149453219927, + "grad_norm": 2.2543814182281494, + "learning_rate": 2.552464861737092e-05, + "loss": 0.2478, "step": 37 }, { - "epoch": 0.26807760141093473, - "grad_norm": 0.8580430150032043, - "learning_rate": 2.5938375012788124e-05, - "loss": 0.0929, + "epoch": 0.09234507897934387, + "grad_norm": 1.2664082050323486, + "learning_rate": 2.5734467405837933e-05, + "loss": 0.1083, "step": 38 }, { - "epoch": 0.2751322751322751, - "grad_norm": 0.6314868330955505, - "learning_rate": 2.6136695401116585e-05, - "loss": 0.0749, + "epoch": 0.09477521263669501, + "grad_norm": 1.69247567653656, + "learning_rate": 2.5938375012788124e-05, + "loss": 0.3731, "step": 39 }, { - "epoch": 0.2821869488536155, - "grad_norm": 0.7801815271377563, - "learning_rate": 2.6329726610280168e-05, - "loss": 0.0835, + "epoch": 0.09720534629404617, + "grad_norm": 1.6350871324539185, + "learning_rate": 2.6136695401116585e-05, + "loss": 0.3767, "step": 40 }, { - "epoch": 0.2892416225749559, - "grad_norm": 0.6293598413467407, - "learning_rate": 2.651774345044166e-05, - "loss": 0.0946, + "epoch": 0.09963547995139732, + "grad_norm": 1.7677048444747925, + "learning_rate": 2.6329726610280168e-05, + "loss": 0.1207, "step": 41 }, { - "epoch": 0.2962962962962963, - "grad_norm": 1.1139250993728638, - "learning_rate": 2.6700999855466042e-05, - "loss": 0.0642, + "epoch": 0.10206561360874848, + "grad_norm": 1.1837761402130127, + "learning_rate": 2.651774345044166e-05, + "loss": 0.119, "step": 42 }, { - "epoch": 0.30335097001763667, - "grad_norm": 0.7656267285346985, - "learning_rate": 2.687973094532893e-05, - "loss": 0.0575, + "epoch": 0.10449574726609964, + "grad_norm": 1.1551034450531006, + "learning_rate": 2.6700999855466042e-05, + "loss": 0.1566, "step": 43 }, { - "epoch": 0.31040564373897706, - "grad_norm": 1.263577938079834, - "learning_rate": 2.7054154839965013e-05, - "loss": 0.1556, + "epoch": 0.10692588092345079, + "grad_norm": 1.4947562217712402, + "learning_rate": 2.687973094532893e-05, + "loss": 0.2224, "step": 44 }, { - "epoch": 0.31746031746031744, - "grad_norm": 0.8415663838386536, - "learning_rate": 2.722447425965978e-05, - "loss": 0.1244, + "epoch": 0.10935601458080195, + "grad_norm": 1.3775482177734375, + "learning_rate": 2.7054154839965013e-05, + "loss": 0.158, "step": 45 }, { - "epoch": 0.32451499118165783, - "grad_norm": 0.8434031009674072, - "learning_rate": 2.739087794143646e-05, - "loss": 0.0585, + "epoch": 0.1117861482381531, + "grad_norm": 1.6749316453933716, + "learning_rate": 2.722447425965978e-05, + "loss": 0.1058, "step": 46 }, { - "epoch": 0.3315696649029982, - "grad_norm": 0.8752081394195557, - "learning_rate": 2.755354189625573e-05, - "loss": 0.0888, + "epoch": 0.11421628189550426, + "grad_norm": 0.8986448049545288, + "learning_rate": 2.739087794143646e-05, + "loss": 0.1299, "step": 47 }, { - "epoch": 0.3386243386243386, - "grad_norm": 0.7323986291885376, - "learning_rate": 2.771263052802624e-05, - "loss": 0.0658, + "epoch": 0.1166464155528554, + "grad_norm": 1.2980709075927734, + "learning_rate": 2.755354189625573e-05, + "loss": 0.1029, "step": 48 }, { - "epoch": 0.345679012345679, - "grad_norm": 0.7197067141532898, - "learning_rate": 2.7868297632261957e-05, - "loss": 0.0708, + "epoch": 0.11907654921020656, + "grad_norm": 1.271391749382019, + "learning_rate": 2.771263052802624e-05, + "loss": 0.1576, "step": 49 }, { - "epoch": 0.3527336860670194, - "grad_norm": 0.7956255078315735, - "learning_rate": 2.8020687289593123e-05, - "loss": 0.0858, + "epoch": 0.12150668286755771, + "grad_norm": 1.4220929145812988, + "learning_rate": 2.7868297632261957e-05, + "loss": 0.1338, "step": 50 }, { - "epoch": 0.35978835978835977, - "grad_norm": 0.7490425109863281, - "learning_rate": 2.8169934667141895e-05, - "loss": 0.0764, + "epoch": 0.12393681652490887, + "grad_norm": 0.962883472442627, + "learning_rate": 2.8020687289593123e-05, + "loss": 0.1546, "step": 51 }, { - "epoch": 0.36684303350970016, - "grad_norm": 0.5713589191436768, - "learning_rate": 2.8316166738933646e-05, - "loss": 0.0706, + "epoch": 0.12636695018226002, + "grad_norm": 0.8322131633758545, + "learning_rate": 2.8169934667141895e-05, + "loss": 0.1877, "step": 52 }, { - "epoch": 0.37389770723104054, - "grad_norm": 0.6174349188804626, - "learning_rate": 2.845950293496561e-05, - "loss": 0.0372, + "epoch": 0.12879708383961117, + "grad_norm": 1.5774791240692139, + "learning_rate": 2.8316166738933646e-05, + "loss": 0.1473, "step": 53 }, { - "epoch": 0.38095238095238093, - "grad_norm": 0.9179003834724426, - "learning_rate": 2.8600055727246657e-05, - "loss": 0.0812, + "epoch": 0.13122721749696234, + "grad_norm": 1.2331901788711548, + "learning_rate": 2.845950293496561e-05, + "loss": 0.226, "step": 54 }, { - "epoch": 0.3880070546737213, - "grad_norm": 0.5890799760818481, - "learning_rate": 2.8737931160013153e-05, - "loss": 0.0524, + "epoch": 0.1336573511543135, + "grad_norm": 2.1382243633270264, + "learning_rate": 2.8600055727246657e-05, + "loss": 0.2057, "step": 55 }, { - "epoch": 0.3950617283950617, - "grad_norm": 0.8446977734565735, - "learning_rate": 2.8873229330382812e-05, - "loss": 0.14, + "epoch": 0.13608748481166463, + "grad_norm": 0.8981903195381165, + "learning_rate": 2.8737931160013153e-05, + "loss": 0.1151, "step": 56 }, { - "epoch": 0.4021164021164021, - "grad_norm": 0.7590024471282959, - "learning_rate": 2.9006044824904066e-05, - "loss": 0.0908, + "epoch": 0.1385176184690158, + "grad_norm": 2.6175801753997803, + "learning_rate": 2.8873229330382812e-05, + "loss": 0.187, "step": 57 }, { - "epoch": 0.4091710758377425, - "grad_norm": 0.7498160600662231, - "learning_rate": 2.913646711677001e-05, - "loss": 0.0903, + "epoch": 0.14094775212636695, + "grad_norm": 2.0045104026794434, + "learning_rate": 2.9006044824904066e-05, + "loss": 0.1901, "step": 58 }, { - "epoch": 0.41622574955908287, - "grad_norm": 0.6221739649772644, - "learning_rate": 2.926458092787486e-05, - "loss": 0.0813, + "epoch": 0.1433778857837181, + "grad_norm": 1.3968229293823242, + "learning_rate": 2.913646711677001e-05, + "loss": 0.1513, "step": 59 }, { - "epoch": 0.42328042328042326, - "grad_norm": 0.6228092908859253, - "learning_rate": 2.939046655938229e-05, - "loss": 0.0656, + "epoch": 0.14580801944106925, + "grad_norm": 8.191553115844727, + "learning_rate": 2.926458092787486e-05, + "loss": 0.1286, "step": 60 }, { - "epoch": 0.43033509700176364, - "grad_norm": 0.8543663024902344, - "learning_rate": 2.951420019403574e-05, - "loss": 0.1006, + "epoch": 0.14823815309842042, + "grad_norm": 2.5880401134490967, + "learning_rate": 2.939046655938229e-05, + "loss": 0.1799, "step": 61 }, { - "epoch": 0.43738977072310403, - "grad_norm": 0.7968020439147949, - "learning_rate": 2.963585417306073e-05, - "loss": 0.0621, + "epoch": 0.15066828675577157, + "grad_norm": 1.9319133758544922, + "learning_rate": 2.951420019403574e-05, + "loss": 0.2835, "step": 62 }, { - "epoch": 0.4444444444444444, - "grad_norm": 1.0612839460372925, - "learning_rate": 2.9755497250179453e-05, - "loss": 0.0834, + "epoch": 0.15309842041312272, + "grad_norm": 2.653153419494629, + "learning_rate": 2.963585417306073e-05, + "loss": 0.1936, "step": 63 }, { - "epoch": 0.4514991181657848, - "grad_norm": 1.0608887672424316, - "learning_rate": 2.98731948249709e-05, - "loss": 0.1091, + "epoch": 0.15552855407047386, + "grad_norm": 1.4456685781478882, + "learning_rate": 2.9755497250179453e-05, + "loss": 0.2256, "step": 64 }, { - "epoch": 0.4585537918871252, - "grad_norm": 0.8623563051223755, - "learning_rate": 2.9989009157559694e-05, - "loss": 0.1079, + "epoch": 0.15795868772782504, + "grad_norm": 1.168137788772583, + "learning_rate": 2.98731948249709e-05, + "loss": 0.1318, "step": 65 }, { - "epoch": 0.4656084656084656, - "grad_norm": 0.6711964011192322, - "learning_rate": 3.010299956639812e-05, - "loss": 0.1042, + "epoch": 0.16038882138517618, + "grad_norm": 1.0261298418045044, + "learning_rate": 2.9989009157559694e-05, + "loss": 0.1738, "step": 66 }, { - "epoch": 0.47266313932980597, - "grad_norm": 0.6842482089996338, - "learning_rate": 3.021522261071426e-05, - "loss": 0.0417, + "epoch": 0.16281895504252733, + "grad_norm": 2.1802849769592285, + "learning_rate": 3.010299956639812e-05, + "loss": 0.1304, "step": 67 }, { - "epoch": 0.47971781305114636, - "grad_norm": 0.5719308257102966, - "learning_rate": 3.0325732259031143e-05, - "loss": 0.054, + "epoch": 0.1652490886998785, + "grad_norm": 0.9934577941894531, + "learning_rate": 3.021522261071426e-05, + "loss": 0.1472, "step": 68 }, { - "epoch": 0.48677248677248675, - "grad_norm": 0.8470659255981445, - "learning_rate": 3.043458004501377e-05, - "loss": 0.0397, + "epoch": 0.16767922235722965, + "grad_norm": 1.7603391408920288, + "learning_rate": 3.0325732259031143e-05, + "loss": 0.0965, "step": 69 }, { - "epoch": 0.49382716049382713, - "grad_norm": 0.6628620028495789, - "learning_rate": 3.054181521177061e-05, - "loss": 0.0622, + "epoch": 0.1701093560145808, + "grad_norm": 1.1827404499053955, + "learning_rate": 3.043458004501377e-05, + "loss": 0.0957, "step": 70 }, { - "epoch": 0.5008818342151675, - "grad_norm": 0.42102503776550293, - "learning_rate": 3.064748484562093e-05, - "loss": 0.0233, + "epoch": 0.17253948967193194, + "grad_norm": 1.950810432434082, + "learning_rate": 3.054181521177061e-05, + "loss": 0.1916, "step": 71 }, { - "epoch": 0.5079365079365079, - "grad_norm": 0.8417813181877136, - "learning_rate": 3.0751634000237615e-05, - "loss": 0.1027, + "epoch": 0.17496962332928312, + "grad_norm": 1.1083086729049683, + "learning_rate": 3.064748484562093e-05, + "loss": 0.1189, "step": 72 }, { - "epoch": 0.5149911816578483, - "grad_norm": 0.6327218413352966, - "learning_rate": 3.085430581198459e-05, - "loss": 0.0588, + "epoch": 0.17739975698663427, + "grad_norm": 1.8297152519226074, + "learning_rate": 3.0751634000237615e-05, + "loss": 0.1802, "step": 73 }, { - "epoch": 0.5220458553791887, - "grad_norm": 0.5988758206367493, - "learning_rate": 3.095554160718781e-05, - "loss": 0.0294, + "epoch": 0.1798298906439854, + "grad_norm": 1.1761589050292969, + "learning_rate": 3.085430581198459e-05, + "loss": 0.1516, "step": 74 }, { - "epoch": 0.5291005291005291, - "grad_norm": 0.6376883387565613, - "learning_rate": 3.10553810020076e-05, - "loss": 0.061, + "epoch": 0.1822600243013366, + "grad_norm": 1.0804104804992676, + "learning_rate": 3.095554160718781e-05, + "loss": 0.1177, "step": 75 }, { - "epoch": 0.5361552028218695, - "grad_norm": 0.5831305384635925, - "learning_rate": 3.115386199551628e-05, - "loss": 0.0579, + "epoch": 0.18469015795868773, + "grad_norm": 1.3176584243774414, + "learning_rate": 3.10553810020076e-05, + "loss": 0.2232, "step": 76 }, { - "epoch": 0.5432098765432098, - "grad_norm": 0.9941970109939575, - "learning_rate": 3.1251021056528336e-05, - "loss": 0.0773, + "epoch": 0.18712029161603888, + "grad_norm": 0.8258953094482422, + "learning_rate": 3.115386199551628e-05, + "loss": 0.0939, "step": 77 }, { - "epoch": 0.5502645502645502, - "grad_norm": 0.712217390537262, - "learning_rate": 3.134689320467986e-05, - "loss": 0.0675, + "epoch": 0.18955042527339003, + "grad_norm": 1.2120673656463623, + "learning_rate": 3.1251021056528336e-05, + "loss": 0.0949, "step": 78 }, { - "epoch": 0.5573192239858906, - "grad_norm": 0.6384962201118469, - "learning_rate": 3.144151208620804e-05, - "loss": 0.0637, + "epoch": 0.1919805589307412, + "grad_norm": 2.156952142715454, + "learning_rate": 3.134689320467986e-05, + "loss": 0.1487, "step": 79 }, { - "epoch": 0.564373897707231, - "grad_norm": 0.804009199142456, - "learning_rate": 3.1534910044841344e-05, - "loss": 0.0574, + "epoch": 0.19441069258809235, + "grad_norm": 1.1182252168655396, + "learning_rate": 3.144151208620804e-05, + "loss": 0.1251, "step": 80 }, { - "epoch": 0.5714285714285714, - "grad_norm": 0.606472909450531, - "learning_rate": 3.1627118188174024e-05, - "loss": 0.0599, + "epoch": 0.1968408262454435, + "grad_norm": 0.9124034643173218, + "learning_rate": 3.1534910044841344e-05, + "loss": 0.0881, "step": 81 }, { - "epoch": 0.5784832451499118, - "grad_norm": 0.6898069977760315, - "learning_rate": 3.171816644986573e-05, - "loss": 0.0744, + "epoch": 0.19927095990279464, + "grad_norm": 1.1577218770980835, + "learning_rate": 3.1627118188174024e-05, + "loss": 0.1401, "step": 82 }, { - "epoch": 0.5855379188712522, - "grad_norm": 0.5865432024002075, - "learning_rate": 3.18080836479775e-05, - "loss": 0.0458, + "epoch": 0.20170109356014582, + "grad_norm": 0.8857666254043579, + "learning_rate": 3.171816644986573e-05, + "loss": 0.1202, "step": 83 }, { - "epoch": 0.5925925925925926, - "grad_norm": 0.7742403149604797, - "learning_rate": 3.1896897539728616e-05, - "loss": 0.0401, + "epoch": 0.20413122721749696, + "grad_norm": 0.8261206746101379, + "learning_rate": 3.18080836479775e-05, + "loss": 0.1156, "step": 84 }, { - "epoch": 0.599647266313933, - "grad_norm": 0.9450638890266418, - "learning_rate": 3.198463487293457e-05, - "loss": 0.0591, + "epoch": 0.2065613608748481, + "grad_norm": 2.775029182434082, + "learning_rate": 3.1896897539728616e-05, + "loss": 0.1777, "step": 85 }, { - "epoch": 0.6067019400352733, - "grad_norm": 0.6740131378173828, - "learning_rate": 3.207132143436469e-05, - "loss": 0.0606, + "epoch": 0.20899149453219928, + "grad_norm": 0.9188308715820312, + "learning_rate": 3.198463487293457e-05, + "loss": 0.0676, "step": 86 }, { - "epoch": 0.6137566137566137, - "grad_norm": 0.6818442344665527, - "learning_rate": 3.215698209523821e-05, - "loss": 0.0582, + "epoch": 0.21142162818955043, + "grad_norm": 1.3610761165618896, + "learning_rate": 3.207132143436469e-05, + "loss": 0.1576, "step": 87 }, { - "epoch": 0.6208112874779541, - "grad_norm": 0.6413015723228455, - "learning_rate": 3.224164085405946e-05, - "loss": 0.0654, + "epoch": 0.21385176184690158, + "grad_norm": 1.8237483501434326, + "learning_rate": 3.215698209523821e-05, + "loss": 0.0858, "step": 88 }, { - "epoch": 0.6278659611992945, - "grad_norm": 0.6277798414230347, - "learning_rate": 3.232532087697698e-05, - "loss": 0.0359, + "epoch": 0.21628189550425272, + "grad_norm": 1.4661478996276855, + "learning_rate": 3.224164085405946e-05, + "loss": 0.1596, "step": 89 }, { - "epoch": 0.6349206349206349, - "grad_norm": 0.9701242446899414, - "learning_rate": 3.240804453583615e-05, - "loss": 0.0346, + "epoch": 0.2187120291616039, + "grad_norm": 0.8811361193656921, + "learning_rate": 3.232532087697698e-05, + "loss": 0.1661, "step": 90 }, { - "epoch": 0.6419753086419753, - "grad_norm": 0.5861459374427795, - "learning_rate": 3.248983344408188e-05, - "loss": 0.045, + "epoch": 0.22114216281895505, + "grad_norm": 1.274592638015747, + "learning_rate": 3.240804453583615e-05, + "loss": 0.0793, "step": 91 }, { - "epoch": 0.6490299823633157, - "grad_norm": 0.6812423467636108, - "learning_rate": 3.2570708490655414e-05, - "loss": 0.0741, + "epoch": 0.2235722964763062, + "grad_norm": 1.125353455543518, + "learning_rate": 3.248983344408188e-05, + "loss": 0.2034, "step": 92 }, { - "epoch": 0.656084656084656, - "grad_norm": 0.6713656187057495, - "learning_rate": 3.265068987201822e-05, - "loss": 0.0762, + "epoch": 0.22600243013365734, + "grad_norm": 1.1274268627166748, + "learning_rate": 3.2570708490655414e-05, + "loss": 0.117, "step": 93 }, { - "epoch": 0.6631393298059964, - "grad_norm": 3.140953779220581, - "learning_rate": 3.2729797122425925e-05, - "loss": 0.0323, + "epoch": 0.2284325637910085, + "grad_norm": 0.8626168370246887, + "learning_rate": 3.265068987201822e-05, + "loss": 0.0469, "step": 94 }, { - "epoch": 0.6701940035273368, - "grad_norm": 0.5071882605552673, - "learning_rate": 3.280804914256559e-05, - "loss": 0.0289, + "epoch": 0.23086269744835966, + "grad_norm": 0.9799898862838745, + "learning_rate": 3.2729797122425925e-05, + "loss": 0.1822, "step": 95 }, { - "epoch": 0.6772486772486772, - "grad_norm": 0.779321551322937, - "learning_rate": 3.288546422666164e-05, - "loss": 0.1021, + "epoch": 0.2332928311057108, + "grad_norm": 1.093001127243042, + "learning_rate": 3.280804914256559e-05, + "loss": 0.1182, "step": 96 }, { - "epoch": 0.6843033509700176, - "grad_norm": 0.6520841717720032, - "learning_rate": 3.2962060088147464e-05, - "loss": 0.0206, + "epoch": 0.23572296476306198, + "grad_norm": 0.8328416347503662, + "learning_rate": 3.288546422666164e-05, + "loss": 0.1668, "step": 97 }, { - "epoch": 0.691358024691358, - "grad_norm": 0.46021828055381775, - "learning_rate": 3.3037853883992805e-05, - "loss": 0.026, + "epoch": 0.23815309842041313, + "grad_norm": 1.4609471559524536, + "learning_rate": 3.2962060088147464e-05, + "loss": 0.1545, "step": 98 }, { - "epoch": 0.6984126984126984, - "grad_norm": 0.5235745310783386, - "learning_rate": 3.3112862237770756e-05, - "loss": 0.0388, + "epoch": 0.24058323207776428, + "grad_norm": 1.207360029220581, + "learning_rate": 3.3037853883992805e-05, + "loss": 0.2099, "step": 99 }, { - "epoch": 0.7054673721340388, - "grad_norm": 0.6948632001876831, - "learning_rate": 3.3187101261541584e-05, - "loss": 0.0349, + "epoch": 0.24301336573511542, + "grad_norm": 0.9304331541061401, + "learning_rate": 3.3112862237770756e-05, + "loss": 0.0543, "step": 100 }, { - "epoch": 0.7054673721340388, - "eval_loss": 0.22200879454612732, - "eval_runtime": 349.6846, - "eval_samples_per_second": 6.486, - "eval_steps_per_second": 0.406, + "epoch": 0.24301336573511542, + "eval_loss": 0.279557466506958, + "eval_runtime": 507.19, + "eval_samples_per_second": 5.418, + "eval_steps_per_second": 0.678, "step": 100 }, { - "epoch": 0.7125220458553791, - "grad_norm": 0.6536000967025757, - "learning_rate": 3.326058657662584e-05, - "loss": 0.0705, + "epoch": 0.2454434993924666, + "grad_norm": 0.8421845436096191, + "learning_rate": 3.3187101261541584e-05, + "loss": 0.117, "step": 101 }, { - "epoch": 0.7195767195767195, - "grad_norm": 0.5648466944694519, - "learning_rate": 3.333333333333334e-05, - "loss": 0.0335, + "epoch": 0.24787363304981774, + "grad_norm": 1.1542550325393677, + "learning_rate": 3.326058657662584e-05, + "loss": 0.172, "step": 102 }, { - "epoch": 0.7266313932980599, - "grad_norm": 1.2128064632415771, - "learning_rate": 3.340535622971072e-05, - "loss": 0.0279, + "epoch": 0.2503037667071689, + "grad_norm": 1.1323540210723877, + "learning_rate": 3.333333333333334e-05, + "loss": 0.1541, "step": 103 }, { - "epoch": 0.7336860670194003, - "grad_norm": 0.44416865706443787, - "learning_rate": 3.3476669529365295e-05, - "loss": 0.0428, + "epoch": 0.25273390036452004, + "grad_norm": 0.9894705414772034, + "learning_rate": 3.340535622971072e-05, + "loss": 0.0689, "step": 104 }, { - "epoch": 0.7407407407407407, - "grad_norm": 0.7246749997138977, - "learning_rate": 3.3547287078419544e-05, - "loss": 0.0443, + "epoch": 0.2551640340218712, + "grad_norm": 1.0771222114562988, + "learning_rate": 3.3476669529365295e-05, + "loss": 0.1796, "step": 105 }, { - "epoch": 0.7477954144620811, - "grad_norm": 0.6053627729415894, - "learning_rate": 3.361722232164634e-05, - "loss": 0.0368, + "epoch": 0.25759416767922233, + "grad_norm": 0.9119389653205872, + "learning_rate": 3.3547287078419544e-05, + "loss": 0.1092, "step": 106 }, { - "epoch": 0.7548500881834215, - "grad_norm": 0.5114538073539734, - "learning_rate": 3.3686488317832306e-05, - "loss": 0.0288, + "epoch": 0.2600243013365735, + "grad_norm": 1.194972038269043, + "learning_rate": 3.361722232164634e-05, + "loss": 0.1077, "step": 107 }, { - "epoch": 0.7619047619047619, - "grad_norm": 0.9469549059867859, - "learning_rate": 3.375509775441284e-05, - "loss": 0.1024, + "epoch": 0.2624544349939247, + "grad_norm": 1.0005968809127808, + "learning_rate": 3.3686488317832306e-05, + "loss": 0.2024, "step": 108 }, { - "epoch": 0.7689594356261023, - "grad_norm": 0.7715326547622681, - "learning_rate": 3.382306296142016e-05, - "loss": 0.0303, + "epoch": 0.2648845686512758, + "grad_norm": 1.0404926538467407, + "learning_rate": 3.375509775441284e-05, + "loss": 0.0672, "step": 109 }, { - "epoch": 0.7760141093474426, - "grad_norm": 0.6786747574806213, - "learning_rate": 3.38903959247825e-05, - "loss": 0.0514, + "epoch": 0.267314702308627, + "grad_norm": 0.9089542031288147, + "learning_rate": 3.382306296142016e-05, + "loss": 0.0831, "step": 110 }, { - "epoch": 0.783068783068783, - "grad_norm": 0.8495022058486938, - "learning_rate": 3.395710829901039e-05, - "loss": 0.0337, + "epoch": 0.26974483596597815, + "grad_norm": 1.2017402648925781, + "learning_rate": 3.38903959247825e-05, + "loss": 0.1336, "step": 111 }, { - "epoch": 0.7901234567901234, - "grad_norm": 0.5682428479194641, - "learning_rate": 3.402321141930376e-05, - "loss": 0.0351, + "epoch": 0.27217496962332927, + "grad_norm": 0.9206855893135071, + "learning_rate": 3.395710829901039e-05, + "loss": 0.0721, "step": 112 }, { - "epoch": 0.7971781305114638, - "grad_norm": 0.989920437335968, - "learning_rate": 3.4088716313110955e-05, - "loss": 0.0424, + "epoch": 0.27460510328068044, + "grad_norm": 1.4966216087341309, + "learning_rate": 3.402321141930376e-05, + "loss": 0.124, "step": 113 }, { - "epoch": 0.8042328042328042, - "grad_norm": 0.6969882845878601, - "learning_rate": 3.415363371116969e-05, - "loss": 0.0413, + "epoch": 0.2770352369380316, + "grad_norm": 0.7973242998123169, + "learning_rate": 3.4088716313110955e-05, + "loss": 0.0597, "step": 114 }, { - "epoch": 0.8112874779541446, - "grad_norm": 0.5075703859329224, - "learning_rate": 3.4217974058057e-05, - "loss": 0.0219, + "epoch": 0.27946537059538273, + "grad_norm": 0.941777765750885, + "learning_rate": 3.415363371116969e-05, + "loss": 0.0737, "step": 115 }, { - "epoch": 0.818342151675485, - "grad_norm": 0.33993738889694214, - "learning_rate": 3.428174752227455e-05, - "loss": 0.015, + "epoch": 0.2818955042527339, + "grad_norm": 0.9339935183525085, + "learning_rate": 3.4217974058057e-05, + "loss": 0.1308, "step": 116 }, { - "epoch": 0.8253968253968254, - "grad_norm": 0.6452199220657349, - "learning_rate": 3.434496400589353e-05, - "loss": 0.0318, + "epoch": 0.284325637910085, + "grad_norm": 1.0190166234970093, + "learning_rate": 3.428174752227455e-05, + "loss": 0.117, "step": 117 }, { - "epoch": 0.8324514991181657, - "grad_norm": 0.43750643730163574, - "learning_rate": 3.440763315378198e-05, - "loss": 0.0292, + "epoch": 0.2867557715674362, + "grad_norm": 1.7908596992492676, + "learning_rate": 3.434496400589353e-05, + "loss": 0.1853, "step": 118 }, { - "epoch": 0.8395061728395061, - "grad_norm": 0.5939832925796509, - "learning_rate": 3.446976436243603e-05, - "loss": 0.049, + "epoch": 0.2891859052247874, + "grad_norm": 1.1210516691207886, + "learning_rate": 3.440763315378198e-05, + "loss": 0.1775, "step": 119 }, { - "epoch": 0.8465608465608465, - "grad_norm": 0.5244298577308655, - "learning_rate": 3.4531366788435425e-05, - "loss": 0.0298, + "epoch": 0.2916160388821385, + "grad_norm": 0.916373610496521, + "learning_rate": 3.446976436243603e-05, + "loss": 0.0902, "step": 120 }, { - "epoch": 0.8536155202821869, - "grad_norm": 1.2761309146881104, - "learning_rate": 3.459244935654219e-05, - "loss": 0.0387, + "epoch": 0.29404617253948967, + "grad_norm": 1.1219305992126465, + "learning_rate": 3.4531366788435425e-05, + "loss": 0.1742, "step": 121 }, { - "epoch": 0.8606701940035273, - "grad_norm": 0.5901013612747192, - "learning_rate": 3.465302076746041e-05, - "loss": 0.0344, + "epoch": 0.29647630619684084, + "grad_norm": 0.6167263388633728, + "learning_rate": 3.459244935654219e-05, + "loss": 0.0674, "step": 122 }, { - "epoch": 0.8677248677248677, - "grad_norm": 0.7754112482070923, - "learning_rate": 3.471308950527417e-05, - "loss": 0.058, + "epoch": 0.29890643985419196, + "grad_norm": 1.4466544389724731, + "learning_rate": 3.465302076746041e-05, + "loss": 0.1252, "step": 123 }, { - "epoch": 0.8747795414462081, - "grad_norm": 0.9062497615814209, - "learning_rate": 3.477266384457914e-05, - "loss": 0.0547, + "epoch": 0.30133657351154314, + "grad_norm": 1.1142324209213257, + "learning_rate": 3.471308950527417e-05, + "loss": 0.2289, "step": 124 }, { - "epoch": 0.8818342151675485, - "grad_norm": 0.6741510629653931, - "learning_rate": 3.48317518573233e-05, - "loss": 0.0183, + "epoch": 0.3037667071688943, + "grad_norm": 1.2791327238082886, + "learning_rate": 3.477266384457914e-05, + "loss": 0.0885, "step": 125 }, { - "epoch": 0.8888888888888888, - "grad_norm": 0.56861412525177, - "learning_rate": 3.489036141937059e-05, - "loss": 0.0394, + "epoch": 0.30619684082624543, + "grad_norm": 0.87410968542099, + "learning_rate": 3.48317518573233e-05, + "loss": 0.1188, "step": 126 }, { - "epoch": 0.8959435626102292, - "grad_norm": 0.6512686014175415, - "learning_rate": 3.494850021680094e-05, - "loss": 0.0444, + "epoch": 0.3086269744835966, + "grad_norm": 1.3250837326049805, + "learning_rate": 3.489036141937059e-05, + "loss": 0.1572, "step": 127 }, { - "epoch": 0.9029982363315696, - "grad_norm": 0.6218048334121704, - "learning_rate": 3.500617575195938e-05, - "loss": 0.041, + "epoch": 0.3110571081409477, + "grad_norm": 1.2806202173233032, + "learning_rate": 3.494850021680094e-05, + "loss": 0.1576, "step": 128 }, { - "epoch": 0.91005291005291, - "grad_norm": 0.4835250675678253, - "learning_rate": 3.5063395349265945e-05, - "loss": 0.04, + "epoch": 0.3134872417982989, + "grad_norm": 1.1694215536117554, + "learning_rate": 3.500617575195938e-05, + "loss": 0.0828, "step": 129 }, { - "epoch": 0.9171075837742504, - "grad_norm": 0.5520550608634949, - "learning_rate": 3.5120166160797804e-05, - "loss": 0.0293, + "epoch": 0.3159173754556501, + "grad_norm": 1.1224102973937988, + "learning_rate": 3.5063395349265945e-05, + "loss": 0.197, "step": 130 }, { - "epoch": 0.9241622574955908, - "grad_norm": 0.6331014037132263, - "learning_rate": 3.517649517165415e-05, - "loss": 0.0502, + "epoch": 0.3183475091130012, + "grad_norm": 5.477957248687744, + "learning_rate": 3.5120166160797804e-05, + "loss": 0.1175, "step": 131 }, { - "epoch": 0.9312169312169312, - "grad_norm": 0.4391537308692932, - "learning_rate": 3.523238920511395e-05, - "loss": 0.0233, + "epoch": 0.32077764277035237, + "grad_norm": 0.8001928925514221, + "learning_rate": 3.517649517165415e-05, + "loss": 0.0827, "step": 132 }, { - "epoch": 0.9382716049382716, - "grad_norm": 0.5481865406036377, - "learning_rate": 3.528785492759607e-05, - "loss": 0.0261, + "epoch": 0.32320777642770354, + "grad_norm": 0.8747657537460327, + "learning_rate": 3.523238920511395e-05, + "loss": 0.0611, "step": 133 }, { - "epoch": 0.9453262786596119, - "grad_norm": 0.5946213006973267, - "learning_rate": 3.5342898853430836e-05, - "loss": 0.0182, + "epoch": 0.32563791008505466, + "grad_norm": 0.7952237725257874, + "learning_rate": 3.528785492759607e-05, + "loss": 0.0875, "step": 134 }, { - "epoch": 0.9523809523809523, - "grad_norm": 0.5419771671295166, - "learning_rate": 3.539752734945143e-05, - "loss": 0.0423, + "epoch": 0.32806804374240583, + "grad_norm": 1.1550030708312988, + "learning_rate": 3.5342898853430836e-05, + "loss": 0.1051, "step": 135 }, { - "epoch": 0.9594356261022927, - "grad_norm": 0.4333636164665222, - "learning_rate": 3.5451746639413466e-05, - "loss": 0.0307, + "epoch": 0.330498177399757, + "grad_norm": 1.0629292726516724, + "learning_rate": 3.539752734945143e-05, + "loss": 0.0615, "step": 136 }, { - "epoch": 0.9664902998236331, - "grad_norm": 1.6216697692871094, - "learning_rate": 3.550556280825011e-05, - "loss": 0.0561, + "epoch": 0.33292831105710813, + "grad_norm": 1.4681527614593506, + "learning_rate": 3.5451746639413466e-05, + "loss": 0.1419, "step": 137 }, { - "epoch": 0.9735449735449735, - "grad_norm": 0.6340786218643188, - "learning_rate": 3.55589818061703e-05, - "loss": 0.0331, + "epoch": 0.3353584447144593, + "grad_norm": 1.924991488456726, + "learning_rate": 3.550556280825011e-05, + "loss": 0.1506, "step": 138 }, { - "epoch": 0.9805996472663139, - "grad_norm": 0.4583222568035126, - "learning_rate": 3.561200945260678e-05, - "loss": 0.0291, + "epoch": 0.3377885783718105, + "grad_norm": 1.485156774520874, + "learning_rate": 3.55589818061703e-05, + "loss": 0.1149, "step": 139 }, { - "epoch": 0.9876543209876543, - "grad_norm": 0.4486044943332672, - "learning_rate": 3.5664651440020616e-05, - "loss": 0.0272, + "epoch": 0.3402187120291616, + "grad_norm": 1.5923975706100464, + "learning_rate": 3.561200945260678e-05, + "loss": 0.1039, "step": 140 }, { - "epoch": 0.9947089947089947, - "grad_norm": 0.8528138399124146, - "learning_rate": 3.571691333756825e-05, - "loss": 0.049, + "epoch": 0.34264884568651277, + "grad_norm": 1.22085440158844, + "learning_rate": 3.5664651440020616e-05, + "loss": 0.1192, "step": 141 }, { - "epoch": 1.001763668430335, - "grad_norm": 0.4883541166782379, - "learning_rate": 3.5768800594637304e-05, - "loss": 0.032, + "epoch": 0.3450789793438639, + "grad_norm": 1.2707493305206299, + "learning_rate": 3.571691333756825e-05, + "loss": 0.1605, "step": 142 }, { - "epoch": 1.0088183421516754, - "grad_norm": 0.4896632134914398, - "learning_rate": 3.582031854425634e-05, - "loss": 0.0152, + "epoch": 0.34750911300121506, + "grad_norm": 1.3047083616256714, + "learning_rate": 3.5768800594637304e-05, + "loss": 0.085, "step": 143 }, { - "epoch": 1.0158730158730158, - "grad_norm": 1.3984421491622925, - "learning_rate": 3.587147240638428e-05, - "loss": 0.031, + "epoch": 0.34993924665856624, + "grad_norm": 1.43068265914917, + "learning_rate": 3.582031854425634e-05, + "loss": 0.1063, "step": 144 }, { - "epoch": 1.0229276895943562, - "grad_norm": 0.5898779630661011, - "learning_rate": 3.5922267291084366e-05, - "loss": 0.0375, + "epoch": 0.35236938031591736, + "grad_norm": 1.2630374431610107, + "learning_rate": 3.587147240638428e-05, + "loss": 0.0877, "step": 145 }, { - "epoch": 1.0299823633156966, - "grad_norm": 0.5027557611465454, - "learning_rate": 3.5972708201587496e-05, - "loss": 0.0349, + "epoch": 0.35479951397326853, + "grad_norm": 0.7847033143043518, + "learning_rate": 3.5922267291084366e-05, + "loss": 0.0309, "step": 146 }, { - "epoch": 1.037037037037037, - "grad_norm": 0.7262934446334839, - "learning_rate": 3.6022800037249585e-05, - "loss": 0.0265, + "epoch": 0.3572296476306197, + "grad_norm": 1.0574724674224854, + "learning_rate": 3.5972708201587496e-05, + "loss": 0.1964, "step": 147 }, { - "epoch": 1.0440917107583774, - "grad_norm": 0.5979143977165222, - "learning_rate": 3.607254759640729e-05, - "loss": 0.0215, + "epoch": 0.3596597812879708, + "grad_norm": 1.662292718887329, + "learning_rate": 3.6022800037249585e-05, + "loss": 0.0853, "step": 148 }, { - "epoch": 1.0511463844797178, - "grad_norm": 0.7840447425842285, - "learning_rate": 3.612195557913627e-05, - "loss": 0.0275, + "epoch": 0.362089914945322, + "grad_norm": 2.2652666568756104, + "learning_rate": 3.607254759640729e-05, + "loss": 0.1134, "step": 149 }, { - "epoch": 1.0582010582010581, - "grad_norm": 0.7244076728820801, - "learning_rate": 3.6171028589915954e-05, - "loss": 0.0298, + "epoch": 0.3645200486026732, + "grad_norm": 2.5058281421661377, + "learning_rate": 3.612195557913627e-05, + "loss": 0.0678, "step": 150 }, { - "epoch": 1.0652557319223985, - "grad_norm": 0.35125666856765747, - "learning_rate": 3.6219771140204575e-05, - "loss": 0.0107, + "epoch": 0.3669501822600243, + "grad_norm": 1.5377355813980103, + "learning_rate": 3.6171028589915954e-05, + "loss": 0.2186, "step": 151 }, { - "epoch": 1.072310405643739, - "grad_norm": 0.5769489407539368, - "learning_rate": 3.626818765092802e-05, - "loss": 0.0336, + "epoch": 0.36938031591737547, + "grad_norm": 1.1862763166427612, + "learning_rate": 3.6219771140204575e-05, + "loss": 0.1957, "step": 152 }, { - "epoch": 1.0793650793650793, - "grad_norm": 0.6769897937774658, - "learning_rate": 3.6316282454886157e-05, - "loss": 0.032, + "epoch": 0.3718104495747266, + "grad_norm": 0.6389171481132507, + "learning_rate": 3.626818765092802e-05, + "loss": 0.1293, "step": 153 }, { - "epoch": 1.0864197530864197, - "grad_norm": 0.48454728722572327, - "learning_rate": 3.636405979907955e-05, - "loss": 0.026, + "epoch": 0.37424058323207776, + "grad_norm": 0.7620474696159363, + "learning_rate": 3.6316282454886157e-05, + "loss": 0.1237, "step": 154 }, { - "epoch": 1.09347442680776, - "grad_norm": 0.6515180468559265, - "learning_rate": 3.6411523846959985e-05, - "loss": 0.0472, + "epoch": 0.37667071688942894, + "grad_norm": 2.63171124458313, + "learning_rate": 3.636405979907955e-05, + "loss": 0.1276, "step": 155 }, { - "epoch": 1.1005291005291005, - "grad_norm": 0.39670661091804504, - "learning_rate": 3.645867868060772e-05, - "loss": 0.0223, + "epoch": 0.37910085054678005, + "grad_norm": 1.1460998058319092, + "learning_rate": 3.6411523846959985e-05, + "loss": 0.0713, "step": 156 }, { - "epoch": 1.1075837742504409, - "grad_norm": 0.5824379920959473, - "learning_rate": 3.6505528302838193e-05, - "loss": 0.0408, + "epoch": 0.38153098420413123, + "grad_norm": 0.7793575525283813, + "learning_rate": 3.645867868060772e-05, + "loss": 0.0684, "step": 157 }, { - "epoch": 1.1146384479717812, - "grad_norm": 0.4789552688598633, - "learning_rate": 3.6552076639241027e-05, - "loss": 0.0344, + "epoch": 0.3839611178614824, + "grad_norm": 0.973029613494873, + "learning_rate": 3.6505528302838193e-05, + "loss": 0.0667, "step": 158 }, { - "epoch": 1.1216931216931216, - "grad_norm": 0.3795282542705536, - "learning_rate": 3.65983275401539e-05, - "loss": 0.0164, + "epoch": 0.3863912515188335, + "grad_norm": 2.509524345397949, + "learning_rate": 3.6552076639241027e-05, + "loss": 0.2404, "step": 159 }, { - "epoch": 1.128747795414462, - "grad_norm": 0.6095085144042969, - "learning_rate": 3.664428478257371e-05, - "loss": 0.017, + "epoch": 0.3888213851761847, + "grad_norm": 1.7816277742385864, + "learning_rate": 3.65983275401539e-05, + "loss": 0.0928, "step": 160 }, { - "epoch": 1.1358024691358024, - "grad_norm": 0.7060067057609558, - "learning_rate": 3.668995207200753e-05, - "loss": 0.0296, + "epoch": 0.39125151883353587, + "grad_norm": 0.7536255121231079, + "learning_rate": 3.664428478257371e-05, + "loss": 0.1489, "step": 161 }, { - "epoch": 1.1428571428571428, - "grad_norm": 0.8230766654014587, - "learning_rate": 3.673533304426541e-05, - "loss": 0.0424, + "epoch": 0.393681652490887, + "grad_norm": 0.8865494132041931, + "learning_rate": 3.668995207200753e-05, + "loss": 0.1395, "step": 162 }, { - "epoch": 1.1499118165784832, - "grad_norm": 0.4897718131542206, - "learning_rate": 3.67804312671975e-05, - "loss": 0.0217, + "epoch": 0.39611178614823817, + "grad_norm": 1.5424951314926147, + "learning_rate": 3.673533304426541e-05, + "loss": 0.2781, "step": 163 }, { - "epoch": 1.1569664902998236, - "grad_norm": 0.4311521351337433, - "learning_rate": 3.682525024237719e-05, - "loss": 0.0278, + "epoch": 0.3985419198055893, + "grad_norm": 1.0128049850463867, + "learning_rate": 3.67804312671975e-05, + "loss": 0.161, "step": 164 }, { - "epoch": 1.164021164021164, - "grad_norm": 0.37872302532196045, - "learning_rate": 3.6869793406732636e-05, - "loss": 0.015, + "epoch": 0.40097205346294046, + "grad_norm": 1.1372522115707397, + "learning_rate": 3.682525024237719e-05, + "loss": 0.1299, "step": 165 }, { - "epoch": 1.1710758377425043, - "grad_norm": 0.3771253526210785, - "learning_rate": 3.69140641341283e-05, - "loss": 0.0278, + "epoch": 0.40340218712029163, + "grad_norm": 1.0910232067108154, + "learning_rate": 3.6869793406732636e-05, + "loss": 0.0693, "step": 166 }, { - "epoch": 1.1781305114638447, - "grad_norm": 0.6320034265518188, - "learning_rate": 3.695806573689844e-05, - "loss": 0.0144, + "epoch": 0.40583232077764275, + "grad_norm": 0.8842087388038635, + "learning_rate": 3.69140641341283e-05, + "loss": 0.136, "step": 167 }, { - "epoch": 1.1851851851851851, - "grad_norm": 0.40642789006233215, - "learning_rate": 3.700180146733426e-05, - "loss": 0.0394, + "epoch": 0.4082624544349939, + "grad_norm": 1.3073561191558838, + "learning_rate": 3.695806573689844e-05, + "loss": 0.0366, "step": 168 }, { - "epoch": 1.1922398589065255, - "grad_norm": 0.490627259016037, - "learning_rate": 3.704527451912639e-05, - "loss": 0.0285, + "epoch": 0.4106925880923451, + "grad_norm": 1.3095803260803223, + "learning_rate": 3.700180146733426e-05, + "loss": 0.1218, "step": 169 }, { - "epoch": 1.199294532627866, - "grad_norm": 0.5026156306266785, - "learning_rate": 3.708848802876438e-05, - "loss": 0.0255, + "epoch": 0.4131227217496962, + "grad_norm": 1.3316633701324463, + "learning_rate": 3.704527451912639e-05, + "loss": 0.14, "step": 170 }, { - "epoch": 1.2063492063492063, - "grad_norm": 0.3265487551689148, - "learning_rate": 3.7131445076894564e-05, - "loss": 0.0126, + "epoch": 0.4155528554070474, + "grad_norm": 0.7762789130210876, + "learning_rate": 3.708848802876438e-05, + "loss": 0.1055, "step": 171 }, { - "epoch": 1.2134038800705467, - "grad_norm": 0.49885180592536926, - "learning_rate": 3.717414868963791e-05, - "loss": 0.0203, + "epoch": 0.41798298906439857, + "grad_norm": 1.0042897462844849, + "learning_rate": 3.7131445076894564e-05, + "loss": 0.0627, "step": 172 }, { - "epoch": 1.220458553791887, - "grad_norm": 0.661622941493988, - "learning_rate": 3.721660183986924e-05, - "loss": 0.0332, + "epoch": 0.4204131227217497, + "grad_norm": 2.6669344902038574, + "learning_rate": 3.717414868963791e-05, + "loss": 0.1243, "step": 173 }, { - "epoch": 1.2275132275132274, - "grad_norm": 0.3157746195793152, - "learning_rate": 3.725880744845915e-05, - "loss": 0.0144, + "epoch": 0.42284325637910086, + "grad_norm": 0.6236822605133057, + "learning_rate": 3.721660183986924e-05, + "loss": 0.0497, "step": 174 }, { - "epoch": 1.2345679012345678, - "grad_norm": 1.141743779182434, - "learning_rate": 3.730076838547993e-05, - "loss": 0.0245, + "epoch": 0.425273390036452, + "grad_norm": 1.0069197416305542, + "learning_rate": 3.725880744845915e-05, + "loss": 0.1257, "step": 175 }, { - "epoch": 1.2416225749559082, - "grad_norm": 0.4302665591239929, - "learning_rate": 3.734248747137666e-05, - "loss": 0.0129, + "epoch": 0.42770352369380316, + "grad_norm": 0.8991190195083618, + "learning_rate": 3.730076838547993e-05, + "loss": 0.1139, "step": 176 }, { - "epoch": 1.2486772486772486, - "grad_norm": 1.7228344678878784, - "learning_rate": 3.738396747810492e-05, - "loss": 0.0412, + "epoch": 0.43013365735115433, + "grad_norm": 1.1606578826904297, + "learning_rate": 3.734248747137666e-05, + "loss": 0.0454, "step": 177 }, { - "epoch": 1.255731922398589, - "grad_norm": 0.5942803025245667, - "learning_rate": 3.7425211130235834e-05, - "loss": 0.0311, + "epoch": 0.43256379100850545, + "grad_norm": 1.6333225965499878, + "learning_rate": 3.738396747810492e-05, + "loss": 0.0885, "step": 178 }, { - "epoch": 1.2627865961199294, - "grad_norm": 2.3315789699554443, - "learning_rate": 3.7466221106030115e-05, - "loss": 0.0433, + "epoch": 0.4349939246658566, + "grad_norm": 1.1033371686935425, + "learning_rate": 3.7425211130235834e-05, + "loss": 0.139, "step": 179 }, { - "epoch": 1.2698412698412698, - "grad_norm": 0.44636741280555725, - "learning_rate": 3.750700003848157e-05, - "loss": 0.0214, + "epoch": 0.4374240583232078, + "grad_norm": 0.5453643202781677, + "learning_rate": 3.7466221106030115e-05, + "loss": 0.0543, "step": 180 }, { - "epoch": 1.2768959435626102, - "grad_norm": 0.43351835012435913, - "learning_rate": 3.7547550516331555e-05, - "loss": 0.043, + "epoch": 0.4398541919805589, + "grad_norm": 0.8695247769355774, + "learning_rate": 3.750700003848157e-05, + "loss": 0.1419, "step": 181 }, { - "epoch": 1.2839506172839505, - "grad_norm": 0.8504909873008728, - "learning_rate": 3.75878750850551e-05, - "loss": 0.0207, + "epoch": 0.4422843256379101, + "grad_norm": 1.7627193927764893, + "learning_rate": 3.7547550516331555e-05, + "loss": 0.1566, "step": 182 }, { - "epoch": 1.291005291005291, - "grad_norm": 0.9498050808906555, - "learning_rate": 3.7627976247819744e-05, - "loss": 0.0501, + "epoch": 0.44471445929526127, + "grad_norm": 0.7778682708740234, + "learning_rate": 3.75878750850551e-05, + "loss": 0.0559, "step": 183 }, { - "epoch": 1.2980599647266313, - "grad_norm": 0.426117867231369, - "learning_rate": 3.766785646641792e-05, - "loss": 0.0195, + "epoch": 0.4471445929526124, + "grad_norm": 0.8081510663032532, + "learning_rate": 3.7627976247819744e-05, + "loss": 0.0729, "step": 184 }, { - "epoch": 1.3051146384479717, - "grad_norm": 0.3924143314361572, - "learning_rate": 3.770751816217383e-05, - "loss": 0.0195, + "epoch": 0.44957472660996356, + "grad_norm": 0.6429978013038635, + "learning_rate": 3.766785646641792e-05, + "loss": 0.0474, "step": 185 }, { - "epoch": 1.312169312169312, - "grad_norm": 0.5952396988868713, - "learning_rate": 3.7746963716825615e-05, - "loss": 0.042, + "epoch": 0.4520048602673147, + "grad_norm": 1.088581919670105, + "learning_rate": 3.770751816217383e-05, + "loss": 0.0644, "step": 186 }, { - "epoch": 1.3192239858906525, - "grad_norm": 0.44761860370635986, - "learning_rate": 3.778619547338356e-05, - "loss": 0.0299, + "epoch": 0.45443499392466585, + "grad_norm": 1.4708858728408813, + "learning_rate": 3.7746963716825615e-05, + "loss": 0.1651, "step": 187 }, { - "epoch": 1.3262786596119929, - "grad_norm": 0.3791397213935852, - "learning_rate": 3.782521573696528e-05, - "loss": 0.014, + "epoch": 0.456865127582017, + "grad_norm": 0.8583030700683594, + "learning_rate": 3.778619547338356e-05, + "loss": 0.0822, "step": 188 }, { - "epoch": 1.3333333333333333, - "grad_norm": 0.3473624587059021, - "learning_rate": 3.786402677560832e-05, - "loss": 0.0149, + "epoch": 0.45929526123936815, + "grad_norm": 1.0450993776321411, + "learning_rate": 3.782521573696528e-05, + "loss": 0.0731, "step": 189 }, { - "epoch": 1.3403880070546736, - "grad_norm": 0.5772216320037842, - "learning_rate": 3.790263082106134e-05, - "loss": 0.0375, + "epoch": 0.4617253948967193, + "grad_norm": 1.1490970849990845, + "learning_rate": 3.786402677560832e-05, + "loss": 0.1354, "step": 190 }, { - "epoch": 1.347442680776014, - "grad_norm": 0.39003878831863403, - "learning_rate": 3.794103006955407e-05, - "loss": 0.0155, + "epoch": 0.4641555285540705, + "grad_norm": 0.5680958032608032, + "learning_rate": 3.790263082106134e-05, + "loss": 0.0836, "step": 191 }, { - "epoch": 1.3544973544973544, - "grad_norm": 0.6276049017906189, - "learning_rate": 3.797922668254715e-05, - "loss": 0.0224, + "epoch": 0.4665856622114216, + "grad_norm": 0.7936691641807556, + "learning_rate": 3.794103006955407e-05, + "loss": 0.0526, "step": 192 }, { - "epoch": 1.3615520282186948, - "grad_norm": 0.920841634273529, - "learning_rate": 3.801722278746213e-05, - "loss": 0.0331, + "epoch": 0.4690157958687728, + "grad_norm": 1.0569026470184326, + "learning_rate": 3.797922668254715e-05, + "loss": 0.1512, "step": 193 }, { - "epoch": 1.3686067019400352, - "grad_norm": 0.4426538944244385, - "learning_rate": 3.8055020478392495e-05, - "loss": 0.0188, + "epoch": 0.47144592952612396, + "grad_norm": 1.2363556623458862, + "learning_rate": 3.801722278746213e-05, + "loss": 0.1316, "step": 194 }, { - "epoch": 1.3756613756613756, - "grad_norm": 0.6603868007659912, - "learning_rate": 3.809262181679623e-05, - "loss": 0.0183, + "epoch": 0.4738760631834751, + "grad_norm": 1.1622111797332764, + "learning_rate": 3.8055020478392495e-05, + "loss": 0.1432, "step": 195 }, { - "epoch": 1.382716049382716, - "grad_norm": 0.27960118651390076, - "learning_rate": 3.813002883217044e-05, - "loss": 0.013, + "epoch": 0.47630619684082626, + "grad_norm": 2.1137237548828125, + "learning_rate": 3.809262181679623e-05, + "loss": 0.1273, "step": 196 }, { - "epoch": 1.3897707231040564, - "grad_norm": 0.4121691584587097, - "learning_rate": 3.816724352270863e-05, - "loss": 0.0195, + "epoch": 0.4787363304981774, + "grad_norm": 1.0623483657836914, + "learning_rate": 3.813002883217044e-05, + "loss": 0.1066, "step": 197 }, { - "epoch": 1.3968253968253967, - "grad_norm": 0.3956167995929718, - "learning_rate": 3.8204267855941266e-05, - "loss": 0.018, + "epoch": 0.48116646415552855, + "grad_norm": 1.0300410985946655, + "learning_rate": 3.816724352270863e-05, + "loss": 0.1443, "step": 198 }, { - "epoch": 1.4038800705467371, - "grad_norm": 0.3960922360420227, - "learning_rate": 3.824110376935989e-05, - "loss": 0.0302, + "epoch": 0.4835965978128797, + "grad_norm": 0.6886430382728577, + "learning_rate": 3.8204267855941266e-05, + "loss": 0.0969, "step": 199 }, { - "epoch": 1.4109347442680775, - "grad_norm": 0.67223060131073, - "learning_rate": 3.827775317102552e-05, - "loss": 0.0435, + "epoch": 0.48602673147023084, + "grad_norm": 0.6511848568916321, + "learning_rate": 3.824110376935989e-05, + "loss": 0.0791, "step": 200 }, { - "epoch": 1.4109347442680775, - "eval_loss": 0.26601219177246094, - "eval_runtime": 350.5408, - "eval_samples_per_second": 6.47, - "eval_steps_per_second": 0.405, + "epoch": 0.48602673147023084, + "eval_loss": 0.311313271522522, + "eval_runtime": 505.4888, + "eval_samples_per_second": 5.436, + "eval_steps_per_second": 0.681, "step": 200 }, { - "epoch": 1.417989417989418, - "grad_norm": 0.34897902607917786, - "learning_rate": 3.831421794016178e-05, - "loss": 0.0113, + "epoch": 0.488456865127582, + "grad_norm": 0.9628679752349854, + "learning_rate": 3.827775317102552e-05, + "loss": 0.0952, "step": 201 }, { - "epoch": 1.4250440917107583, - "grad_norm": 0.4447449743747711, - "learning_rate": 3.835049992773302e-05, - "loss": 0.0197, + "epoch": 0.4908869987849332, + "grad_norm": 1.1625686883926392, + "learning_rate": 3.831421794016178e-05, + "loss": 0.1665, "step": 202 }, { - "epoch": 1.4320987654320987, - "grad_norm": 0.6095953583717346, - "learning_rate": 3.838660095700815e-05, - "loss": 0.0263, + "epoch": 0.4933171324422843, + "grad_norm": 1.7337137460708618, + "learning_rate": 3.835049992773302e-05, + "loss": 0.141, "step": 203 }, { - "epoch": 1.439153439153439, - "grad_norm": 0.7258620858192444, - "learning_rate": 3.84225228241104e-05, - "loss": 0.0121, + "epoch": 0.4957472660996355, + "grad_norm": 1.1475183963775635, + "learning_rate": 3.838660095700815e-05, + "loss": 0.0996, "step": 204 }, { - "epoch": 1.4462081128747795, - "grad_norm": 0.4723944365978241, - "learning_rate": 3.8458267298553554e-05, - "loss": 0.025, + "epoch": 0.49817739975698666, + "grad_norm": 0.6650044918060303, + "learning_rate": 3.84225228241104e-05, + "loss": 0.0611, "step": 205 }, { - "epoch": 1.4532627865961198, - "grad_norm": 0.37399163842201233, - "learning_rate": 3.8493836123764984e-05, - "loss": 0.0326, + "epoch": 0.5006075334143378, + "grad_norm": 0.6770364046096802, + "learning_rate": 3.8458267298553554e-05, + "loss": 0.051, "step": 206 }, { - "epoch": 1.4603174603174602, - "grad_norm": 0.5367653369903564, - "learning_rate": 3.852923101759591e-05, - "loss": 0.033, + "epoch": 0.503037667071689, + "grad_norm": 1.22215735912323, + "learning_rate": 3.8493836123764984e-05, + "loss": 0.0724, "step": 207 }, { - "epoch": 1.4673721340388006, - "grad_norm": 0.36451810598373413, - "learning_rate": 3.856445367281923e-05, - "loss": 0.012, + "epoch": 0.5054678007290401, + "grad_norm": 1.1168265342712402, + "learning_rate": 3.852923101759591e-05, + "loss": 0.116, "step": 208 }, { - "epoch": 1.474426807760141, - "grad_norm": 0.5865375995635986, - "learning_rate": 3.859950575761529e-05, - "loss": 0.0461, + "epoch": 0.5078979343863913, + "grad_norm": 0.7812952995300293, + "learning_rate": 3.856445367281923e-05, + "loss": 0.0726, "step": 209 }, { - "epoch": 1.4814814814814814, - "grad_norm": 0.4337036609649658, - "learning_rate": 3.8634388916046025e-05, - "loss": 0.0281, + "epoch": 0.5103280680437424, + "grad_norm": 0.7324075698852539, + "learning_rate": 3.859950575761529e-05, + "loss": 0.0359, "step": 210 }, { - "epoch": 1.4885361552028218, - "grad_norm": 0.3850497603416443, - "learning_rate": 3.866910476851757e-05, - "loss": 0.0141, + "epoch": 0.5127582017010935, + "grad_norm": 0.8063955903053284, + "learning_rate": 3.8634388916046025e-05, + "loss": 0.0912, "step": 211 }, { - "epoch": 1.4955908289241622, - "grad_norm": 0.31644490361213684, - "learning_rate": 3.870365491223199e-05, - "loss": 0.0098, + "epoch": 0.5151883353584447, + "grad_norm": 4.057889938354492, + "learning_rate": 3.866910476851757e-05, + "loss": 0.2407, "step": 212 }, { - "epoch": 1.5026455026455028, - "grad_norm": 0.5170356631278992, - "learning_rate": 3.8738040921628215e-05, - "loss": 0.0531, + "epoch": 0.5176184690157959, + "grad_norm": 0.7670718431472778, + "learning_rate": 3.870365491223199e-05, + "loss": 0.058, "step": 213 }, { - "epoch": 1.509700176366843, - "grad_norm": 0.34572136402130127, - "learning_rate": 3.877226434881253e-05, - "loss": 0.022, + "epoch": 0.520048602673147, + "grad_norm": 0.717004120349884, + "learning_rate": 3.8738040921628215e-05, + "loss": 0.0308, "step": 214 }, { - "epoch": 1.5167548500881836, - "grad_norm": 0.3803236782550812, - "learning_rate": 3.880632672397897e-05, - "loss": 0.0217, + "epoch": 0.5224787363304981, + "grad_norm": 1.2053970098495483, + "learning_rate": 3.877226434881253e-05, + "loss": 0.091, "step": 215 }, { - "epoch": 1.5238095238095237, - "grad_norm": 0.7378140091896057, - "learning_rate": 3.884022955581985e-05, - "loss": 0.0149, + "epoch": 0.5249088699878494, + "grad_norm": 1.3508349657058716, + "learning_rate": 3.880632672397897e-05, + "loss": 0.1201, "step": 216 }, { - "epoch": 1.5308641975308643, - "grad_norm": 0.3270248472690582, - "learning_rate": 3.887397433192676e-05, - "loss": 0.0176, + "epoch": 0.5273390036452005, + "grad_norm": 0.6895163059234619, + "learning_rate": 3.884022955581985e-05, + "loss": 0.0403, "step": 217 }, { - "epoch": 1.5379188712522045, - "grad_norm": 0.6880928874015808, - "learning_rate": 3.890756251918219e-05, - "loss": 0.0319, + "epoch": 0.5297691373025516, + "grad_norm": 0.7722972631454468, + "learning_rate": 3.887397433192676e-05, + "loss": 0.0984, "step": 218 }, { - "epoch": 1.544973544973545, - "grad_norm": 0.3759993612766266, - "learning_rate": 3.894099556414216e-05, - "loss": 0.0171, + "epoch": 0.5321992709599028, + "grad_norm": 0.897186279296875, + "learning_rate": 3.890756251918219e-05, + "loss": 0.1564, "step": 219 }, { - "epoch": 1.5520282186948853, - "grad_norm": 0.4239048361778259, - "learning_rate": 3.897427489341009e-05, - "loss": 0.0175, + "epoch": 0.534629404617254, + "grad_norm": 0.5847256183624268, + "learning_rate": 3.894099556414216e-05, + "loss": 0.0905, "step": 220 }, { - "epoch": 1.5590828924162259, - "grad_norm": 0.3262343108654022, - "learning_rate": 3.900740191400198e-05, - "loss": 0.0213, + "epoch": 0.5370595382746051, + "grad_norm": 0.7907549142837524, + "learning_rate": 3.897427489341009e-05, + "loss": 0.075, "step": 221 }, { - "epoch": 1.566137566137566, - "grad_norm": 0.5575190782546997, - "learning_rate": 3.904037801370344e-05, - "loss": 0.0185, + "epoch": 0.5394896719319563, + "grad_norm": 0.8477361798286438, + "learning_rate": 3.900740191400198e-05, + "loss": 0.028, "step": 222 }, { - "epoch": 1.5731922398589067, - "grad_norm": 0.5186108350753784, - "learning_rate": 3.9073204561418514e-05, - "loss": 0.0358, + "epoch": 0.5419198055893074, + "grad_norm": 1.1732168197631836, + "learning_rate": 3.904037801370344e-05, + "loss": 0.0921, "step": 223 }, { - "epoch": 1.5802469135802468, - "grad_norm": 0.6314687728881836, - "learning_rate": 3.9105882907510644e-05, - "loss": 0.0358, + "epoch": 0.5443499392466585, + "grad_norm": 0.9721484184265137, + "learning_rate": 3.9073204561418514e-05, + "loss": 0.1273, "step": 224 }, { - "epoch": 1.5873015873015874, - "grad_norm": 0.5658460855484009, - "learning_rate": 3.913841438413601e-05, - "loss": 0.022, + "epoch": 0.5467800729040098, + "grad_norm": 0.6401745676994324, + "learning_rate": 3.9105882907510644e-05, + "loss": 0.0433, "step": 225 }, { - "epoch": 1.5943562610229276, - "grad_norm": 0.2356942743062973, - "learning_rate": 3.917080030556938e-05, - "loss": 0.0094, + "epoch": 0.5492102065613609, + "grad_norm": 1.2300125360488892, + "learning_rate": 3.913841438413601e-05, + "loss": 0.0574, "step": 226 }, { - "epoch": 1.6014109347442682, - "grad_norm": 0.2790418565273285, - "learning_rate": 3.9203041968522716e-05, - "loss": 0.0138, + "epoch": 0.551640340218712, + "grad_norm": 0.9645660519599915, + "learning_rate": 3.917080030556938e-05, + "loss": 0.0491, "step": 227 }, { - "epoch": 1.6084656084656084, - "grad_norm": 0.3582196533679962, - "learning_rate": 3.923514065245669e-05, - "loss": 0.0483, + "epoch": 0.5540704738760632, + "grad_norm": 1.5600403547286987, + "learning_rate": 3.9203041968522716e-05, + "loss": 0.1232, "step": 228 }, { - "epoch": 1.615520282186949, - "grad_norm": 0.4799334406852722, - "learning_rate": 3.926709761988538e-05, - "loss": 0.026, + "epoch": 0.5565006075334143, + "grad_norm": 1.07868230342865, + "learning_rate": 3.923514065245669e-05, + "loss": 0.0674, "step": 229 }, { - "epoch": 1.6225749559082892, - "grad_norm": 0.5170966386795044, - "learning_rate": 3.929891411667424e-05, - "loss": 0.0269, + "epoch": 0.5589307411907655, + "grad_norm": 0.933269739151001, + "learning_rate": 3.926709761988538e-05, + "loss": 0.0753, "step": 230 }, { - "epoch": 1.6296296296296298, - "grad_norm": 0.3738221824169159, - "learning_rate": 3.933059137233147e-05, - "loss": 0.0252, + "epoch": 0.5613608748481167, + "grad_norm": 2.12733793258667, + "learning_rate": 3.929891411667424e-05, + "loss": 0.1144, "step": 231 }, { - "epoch": 1.63668430335097, - "grad_norm": 0.28906935453414917, - "learning_rate": 3.9362130600293214e-05, - "loss": 0.0161, + "epoch": 0.5637910085054678, + "grad_norm": 1.380554437637329, + "learning_rate": 3.933059137233147e-05, + "loss": 0.0843, "step": 232 }, { - "epoch": 1.6437389770723105, - "grad_norm": 0.20773281157016754, - "learning_rate": 3.9393532998202405e-05, - "loss": 0.0117, + "epoch": 0.5662211421628189, + "grad_norm": 1.0420360565185547, + "learning_rate": 3.9362130600293214e-05, + "loss": 0.111, "step": 233 }, { - "epoch": 1.6507936507936507, - "grad_norm": 0.509770929813385, - "learning_rate": 3.942479974818166e-05, - "loss": 0.0208, + "epoch": 0.56865127582017, + "grad_norm": 0.6409225463867188, + "learning_rate": 3.9393532998202405e-05, + "loss": 0.0425, "step": 234 }, { - "epoch": 1.6578483245149913, - "grad_norm": 0.9972848296165466, - "learning_rate": 3.945593201710032e-05, - "loss": 0.0218, + "epoch": 0.5710814094775213, + "grad_norm": 0.8021889328956604, + "learning_rate": 3.942479974818166e-05, + "loss": 0.0386, "step": 235 }, { - "epoch": 1.6649029982363315, - "grad_norm": 0.5581172704696655, - "learning_rate": 3.9486930956835724e-05, - "loss": 0.0148, + "epoch": 0.5735115431348724, + "grad_norm": 1.5499992370605469, + "learning_rate": 3.945593201710032e-05, + "loss": 0.0404, "step": 236 }, { - "epoch": 1.671957671957672, - "grad_norm": 0.37124931812286377, - "learning_rate": 3.951779770452894e-05, - "loss": 0.0265, + "epoch": 0.5759416767922235, + "grad_norm": 0.7606313824653625, + "learning_rate": 3.9486930956835724e-05, + "loss": 0.0789, "step": 237 }, { - "epoch": 1.6790123456790123, - "grad_norm": 2.602940320968628, - "learning_rate": 3.954853338283512e-05, - "loss": 0.0194, + "epoch": 0.5783718104495748, + "grad_norm": 0.9025908708572388, + "learning_rate": 3.951779770452894e-05, + "loss": 0.0617, "step": 238 }, { - "epoch": 1.6860670194003529, - "grad_norm": 0.5400798916816711, - "learning_rate": 3.9579139100168404e-05, - "loss": 0.0261, + "epoch": 0.5808019441069259, + "grad_norm": 1.1557445526123047, + "learning_rate": 3.954853338283512e-05, + "loss": 0.0751, "step": 239 }, { - "epoch": 1.693121693121693, - "grad_norm": 0.45997512340545654, - "learning_rate": 3.960961595094187e-05, - "loss": 0.0258, + "epoch": 0.583232077764277, + "grad_norm": 1.5104789733886719, + "learning_rate": 3.9579139100168404e-05, + "loss": 0.0732, "step": 240 }, { - "epoch": 1.7001763668430336, - "grad_norm": 0.46961885690689087, - "learning_rate": 3.96399650158023e-05, - "loss": 0.0342, + "epoch": 0.5856622114216282, + "grad_norm": 0.9768268465995789, + "learning_rate": 3.960961595094187e-05, + "loss": 0.0797, "step": 241 }, { - "epoch": 1.7072310405643738, - "grad_norm": 0.3677431643009186, - "learning_rate": 3.96701873618601e-05, - "loss": 0.0246, + "epoch": 0.5880923450789793, + "grad_norm": 0.7394477725028992, + "learning_rate": 3.96399650158023e-05, + "loss": 0.0397, "step": 242 }, { - "epoch": 1.7142857142857144, - "grad_norm": 0.5030479431152344, - "learning_rate": 3.970028404291448e-05, - "loss": 0.0337, + "epoch": 0.5905224787363305, + "grad_norm": 0.742852509021759, + "learning_rate": 3.96701873618601e-05, + "loss": 0.0742, "step": 243 }, { - "epoch": 1.7213403880070546, - "grad_norm": 0.3739985525608063, - "learning_rate": 3.9730256099673865e-05, - "loss": 0.0123, + "epoch": 0.5929526123936817, + "grad_norm": 0.7512255311012268, + "learning_rate": 3.970028404291448e-05, + "loss": 0.0281, "step": 244 }, { - "epoch": 1.7283950617283952, - "grad_norm": 0.28314831852912903, - "learning_rate": 3.976010455997187e-05, - "loss": 0.0123, + "epoch": 0.5953827460510328, + "grad_norm": 0.6248149871826172, + "learning_rate": 3.9730256099673865e-05, + "loss": 0.0375, "step": 245 }, { - "epoch": 1.7354497354497354, - "grad_norm": 0.3528687059879303, - "learning_rate": 3.978983043897883e-05, - "loss": 0.0273, + "epoch": 0.5978128797083839, + "grad_norm": 0.8596628904342651, + "learning_rate": 3.976010455997187e-05, + "loss": 0.1213, "step": 246 }, { - "epoch": 1.742504409171076, - "grad_norm": 0.394379585981369, - "learning_rate": 3.981943473940888e-05, - "loss": 0.0154, + "epoch": 0.6002430133657352, + "grad_norm": 0.7119196057319641, + "learning_rate": 3.978983043897883e-05, + "loss": 0.0849, "step": 247 }, { - "epoch": 1.7495590828924161, - "grad_norm": 0.4291190207004547, - "learning_rate": 3.984891845172299e-05, - "loss": 0.0306, + "epoch": 0.6026731470230863, + "grad_norm": 0.8290873765945435, + "learning_rate": 3.981943473940888e-05, + "loss": 0.0531, "step": 248 }, { - "epoch": 1.7566137566137567, - "grad_norm": 0.5018404722213745, - "learning_rate": 3.987828255432777e-05, - "loss": 0.0234, + "epoch": 0.6051032806804374, + "grad_norm": 0.4561799168586731, + "learning_rate": 3.984891845172299e-05, + "loss": 0.0327, "step": 249 }, { - "epoch": 1.763668430335097, - "grad_norm": 0.4209292232990265, - "learning_rate": 3.9907528013770276e-05, - "loss": 0.0204, + "epoch": 0.6075334143377886, + "grad_norm": 0.8212061524391174, + "learning_rate": 3.987828255432777e-05, + "loss": 0.0769, "step": 250 }, { - "epoch": 1.7707231040564375, - "grad_norm": 0.2811947166919708, - "learning_rate": 3.993665578492894e-05, - "loss": 0.0088, + "epoch": 0.6099635479951397, + "grad_norm": 0.6895563006401062, + "learning_rate": 3.9907528013770276e-05, + "loss": 0.0405, "step": 251 }, { - "epoch": 1.7777777777777777, - "grad_norm": 0.5545956492424011, - "learning_rate": 3.9965666811200624e-05, - "loss": 0.0251, + "epoch": 0.6123936816524909, + "grad_norm": 1.4649648666381836, + "learning_rate": 3.993665578492894e-05, + "loss": 0.1539, "step": 252 }, { - "epoch": 1.7848324514991183, - "grad_norm": 0.49265754222869873, - "learning_rate": 3.999456202468397e-05, - "loss": 0.0384, + "epoch": 0.6148238153098421, + "grad_norm": 1.6070215702056885, + "learning_rate": 3.9965666811200624e-05, + "loss": 0.2167, "step": 253 }, { - "epoch": 1.7918871252204585, - "grad_norm": 0.4486110508441925, - "learning_rate": 4.002334234635907e-05, - "loss": 0.0195, + "epoch": 0.6172539489671932, + "grad_norm": 0.9879380464553833, + "learning_rate": 3.999456202468397e-05, + "loss": 0.1, "step": 254 }, { - "epoch": 1.798941798941799, - "grad_norm": 0.3127782940864563, - "learning_rate": 4.005200868626364e-05, - "loss": 0.0155, + "epoch": 0.6196840826245443, + "grad_norm": 0.8622350096702576, + "learning_rate": 4.002334234635907e-05, + "loss": 0.1184, "step": 255 }, { - "epoch": 1.8059964726631392, - "grad_norm": 0.6288918256759644, - "learning_rate": 4.008056194366564e-05, - "loss": 0.0306, + "epoch": 0.6221142162818954, + "grad_norm": 1.95242440700531, + "learning_rate": 4.005200868626364e-05, + "loss": 0.1317, "step": 256 }, { - "epoch": 1.8130511463844798, - "grad_norm": 0.3001275062561035, - "learning_rate": 4.010900300723259e-05, - "loss": 0.018, + "epoch": 0.6245443499392467, + "grad_norm": 1.3259199857711792, + "learning_rate": 4.008056194366564e-05, + "loss": 0.1522, "step": 257 }, { - "epoch": 1.82010582010582, - "grad_norm": 0.33980268239974976, - "learning_rate": 4.013733275519749e-05, - "loss": 0.0153, + "epoch": 0.6269744835965978, + "grad_norm": 0.9809350371360779, + "learning_rate": 4.010900300723259e-05, + "loss": 0.0581, "step": 258 }, { - "epoch": 1.8271604938271606, - "grad_norm": 0.42874833941459656, - "learning_rate": 4.016555205552158e-05, - "loss": 0.0261, + "epoch": 0.6294046172539489, + "grad_norm": 1.1465263366699219, + "learning_rate": 4.013733275519749e-05, + "loss": 0.1303, "step": 259 }, { - "epoch": 1.8342151675485008, - "grad_norm": 0.2633664011955261, - "learning_rate": 4.0193661766053834e-05, - "loss": 0.01, + "epoch": 0.6318347509113001, + "grad_norm": 1.4853742122650146, + "learning_rate": 4.016555205552158e-05, + "loss": 0.0983, "step": 260 }, { - "epoch": 1.8412698412698414, - "grad_norm": 0.39592593908309937, - "learning_rate": 4.022166273468753e-05, - "loss": 0.0358, + "epoch": 0.6342648845686513, + "grad_norm": 0.4413992166519165, + "learning_rate": 4.0193661766053834e-05, + "loss": 0.0473, "step": 261 }, { - "epoch": 1.8483245149911816, - "grad_norm": 0.2792883813381195, - "learning_rate": 4.024955579951363e-05, - "loss": 0.0151, + "epoch": 0.6366950182260024, + "grad_norm": 0.9577488899230957, + "learning_rate": 4.022166273468753e-05, + "loss": 0.15, "step": 262 }, { - "epoch": 1.8553791887125222, - "grad_norm": 0.4893220067024231, - "learning_rate": 4.027734178897136e-05, - "loss": 0.0307, + "epoch": 0.6391251518833536, + "grad_norm": 0.7132108211517334, + "learning_rate": 4.024955579951363e-05, + "loss": 0.1153, "step": 263 }, { - "epoch": 1.8624338624338623, - "grad_norm": 0.3258034288883209, - "learning_rate": 4.030502152199576e-05, - "loss": 0.0228, + "epoch": 0.6415552855407047, + "grad_norm": 0.7191299796104431, + "learning_rate": 4.027734178897136e-05, + "loss": 0.0538, "step": 264 }, { - "epoch": 1.869488536155203, - "grad_norm": 0.33434444665908813, - "learning_rate": 4.033259580816264e-05, - "loss": 0.0165, + "epoch": 0.6439854191980559, + "grad_norm": 0.6709555983543396, + "learning_rate": 4.030502152199576e-05, + "loss": 0.1552, "step": 265 }, { - "epoch": 1.876543209876543, - "grad_norm": 0.3734183609485626, - "learning_rate": 4.036006544783052e-05, - "loss": 0.018, + "epoch": 0.6464155528554071, + "grad_norm": 0.8649526834487915, + "learning_rate": 4.033259580816264e-05, + "loss": 0.1492, "step": 266 }, { - "epoch": 1.8835978835978837, - "grad_norm": 0.23990300297737122, - "learning_rate": 4.0387431232280135e-05, - "loss": 0.0099, + "epoch": 0.6488456865127582, + "grad_norm": 0.6234789490699768, + "learning_rate": 4.036006544783052e-05, + "loss": 0.0777, "step": 267 }, { - "epoch": 1.8906525573192239, - "grad_norm": 0.4307016432285309, - "learning_rate": 4.041469394385112e-05, - "loss": 0.0276, + "epoch": 0.6512758201701093, + "grad_norm": 1.0768671035766602, + "learning_rate": 4.0387431232280135e-05, + "loss": 0.177, "step": 268 }, { - "epoch": 1.8977072310405645, - "grad_norm": 0.5069787502288818, - "learning_rate": 4.0441854356076257e-05, - "loss": 0.0398, + "epoch": 0.6537059538274606, + "grad_norm": 0.7391580939292908, + "learning_rate": 4.041469394385112e-05, + "loss": 0.1217, "step": 269 }, { - "epoch": 1.9047619047619047, - "grad_norm": 0.24547268450260162, - "learning_rate": 4.046891323381315e-05, - "loss": 0.0135, + "epoch": 0.6561360874848117, + "grad_norm": 0.5944250226020813, + "learning_rate": 4.0441854356076257e-05, + "loss": 0.0678, "step": 270 }, { - "epoch": 1.9118165784832453, - "grad_norm": 0.49004027247428894, - "learning_rate": 4.049587133337347e-05, - "loss": 0.0356, + "epoch": 0.6585662211421628, + "grad_norm": 0.5040566325187683, + "learning_rate": 4.046891323381315e-05, + "loss": 0.0865, "step": 271 }, { - "epoch": 1.9188712522045854, - "grad_norm": 0.3411717116832733, - "learning_rate": 4.0522729402649793e-05, - "loss": 0.0206, + "epoch": 0.660996354799514, + "grad_norm": 1.0286433696746826, + "learning_rate": 4.049587133337347e-05, + "loss": 0.0643, "step": 272 }, { - "epoch": 1.925925925925926, - "grad_norm": 0.6035274267196655, - "learning_rate": 4.0549488181240096e-05, - "loss": 0.0416, + "epoch": 0.6634264884568651, + "grad_norm": 1.6537009477615356, + "learning_rate": 4.0522729402649793e-05, + "loss": 0.1008, "step": 273 }, { - "epoch": 1.9329805996472662, - "grad_norm": 0.5692528486251831, - "learning_rate": 4.057614840056998e-05, - "loss": 0.024, + "epoch": 0.6658566221142163, + "grad_norm": 0.7121666669845581, + "learning_rate": 4.0549488181240096e-05, + "loss": 0.0865, "step": 274 }, { - "epoch": 1.9400352733686068, - "grad_norm": 0.35133275389671326, - "learning_rate": 4.06027107840126e-05, - "loss": 0.0129, + "epoch": 0.6682867557715675, + "grad_norm": 0.8037539720535278, + "learning_rate": 4.057614840056998e-05, + "loss": 0.0976, "step": 275 }, { - "epoch": 1.947089947089947, - "grad_norm": 0.4275812804698944, - "learning_rate": 4.0629176047006474e-05, - "loss": 0.0211, + "epoch": 0.6707168894289186, + "grad_norm": 0.6083033680915833, + "learning_rate": 4.06027107840126e-05, + "loss": 0.1118, "step": 276 }, { - "epoch": 1.9541446208112876, - "grad_norm": 0.33566832542419434, - "learning_rate": 4.065554489717105e-05, - "loss": 0.0149, + "epoch": 0.6731470230862697, + "grad_norm": 1.8657127618789673, + "learning_rate": 4.0629176047006474e-05, + "loss": 0.0523, "step": 277 }, { - "epoch": 1.9611992945326278, - "grad_norm": 0.2758539319038391, - "learning_rate": 4.068181803442029e-05, - "loss": 0.0155, + "epoch": 0.675577156743621, + "grad_norm": 0.6102950572967529, + "learning_rate": 4.065554489717105e-05, + "loss": 0.0926, "step": 278 }, { - "epoch": 1.9682539682539684, - "grad_norm": 0.5552444458007812, - "learning_rate": 4.0707996151074147e-05, - "loss": 0.0338, + "epoch": 0.6780072904009721, + "grad_norm": 0.7026309370994568, + "learning_rate": 4.068181803442029e-05, + "loss": 0.0968, "step": 279 }, { - "epoch": 1.9753086419753085, - "grad_norm": 0.30761590600013733, - "learning_rate": 4.073407993196794e-05, - "loss": 0.0124, + "epoch": 0.6804374240583232, + "grad_norm": 0.6937738656997681, + "learning_rate": 4.0707996151074147e-05, + "loss": 0.0554, "step": 280 }, { - "epoch": 1.9823633156966491, - "grad_norm": 0.483833909034729, - "learning_rate": 4.076007005455996e-05, - "loss": 0.0184, + "epoch": 0.6828675577156743, + "grad_norm": 1.4066294431686401, + "learning_rate": 4.073407993196794e-05, + "loss": 0.0964, "step": 281 }, { - "epoch": 1.9894179894179893, - "grad_norm": 0.4557156264781952, - "learning_rate": 4.0785967189036986e-05, - "loss": 0.0281, + "epoch": 0.6852976913730255, + "grad_norm": 0.6314956545829773, + "learning_rate": 4.076007005455996e-05, + "loss": 0.1161, "step": 282 }, { - "epoch": 1.99647266313933, - "grad_norm": 0.3032929301261902, - "learning_rate": 4.0811771998418e-05, - "loss": 0.0127, + "epoch": 0.6877278250303767, + "grad_norm": 0.9460674524307251, + "learning_rate": 4.0785967189036986e-05, + "loss": 0.044, "step": 283 }, { - "epoch": 2.00352733686067, - "grad_norm": 0.306891530752182, - "learning_rate": 4.083748513865602e-05, - "loss": 0.0245, + "epoch": 0.6901579586877278, + "grad_norm": 0.7385574579238892, + "learning_rate": 4.0811771998418e-05, + "loss": 0.118, "step": 284 }, { - "epoch": 2.0105820105820107, - "grad_norm": 0.30066776275634766, - "learning_rate": 4.086310725873818e-05, - "loss": 0.0149, + "epoch": 0.692588092345079, + "grad_norm": 0.7021672129631042, + "learning_rate": 4.083748513865602e-05, + "loss": 0.085, "step": 285 }, { - "epoch": 2.017636684303351, - "grad_norm": 0.346320778131485, - "learning_rate": 4.0888639000783966e-05, - "loss": 0.0133, + "epoch": 0.6950182260024301, + "grad_norm": 0.6057882308959961, + "learning_rate": 4.086310725873818e-05, + "loss": 0.0389, "step": 286 }, { - "epoch": 2.0246913580246915, - "grad_norm": 0.23275919258594513, - "learning_rate": 4.0914081000141844e-05, - "loss": 0.011, + "epoch": 0.6974483596597812, + "grad_norm": 0.7348142862319946, + "learning_rate": 4.0888639000783966e-05, + "loss": 0.0888, "step": 287 }, { - "epoch": 2.0317460317460316, - "grad_norm": 0.3610820174217224, - "learning_rate": 4.0939433885484055e-05, - "loss": 0.0197, + "epoch": 0.6998784933171325, + "grad_norm": 0.8555133938789368, + "learning_rate": 4.0914081000141844e-05, + "loss": 0.0917, "step": 288 }, { - "epoch": 2.0388007054673722, - "grad_norm": 0.453847736120224, - "learning_rate": 4.0964698278899874e-05, - "loss": 0.0268, + "epoch": 0.7023086269744836, + "grad_norm": 0.8100624084472656, + "learning_rate": 4.0939433885484055e-05, + "loss": 0.0811, "step": 289 }, { - "epoch": 2.0458553791887124, - "grad_norm": 0.4168277680873871, - "learning_rate": 4.0989874795987185e-05, - "loss": 0.0173, + "epoch": 0.7047387606318347, + "grad_norm": 0.5672865509986877, + "learning_rate": 4.0964698278899874e-05, + "loss": 0.0389, "step": 290 }, { - "epoch": 2.052910052910053, - "grad_norm": 0.2898412346839905, - "learning_rate": 4.1014964045942465e-05, - "loss": 0.0126, + "epoch": 0.707168894289186, + "grad_norm": 0.842689573764801, + "learning_rate": 4.0989874795987185e-05, + "loss": 0.0887, "step": 291 }, { - "epoch": 2.059964726631393, - "grad_norm": 0.4505179226398468, - "learning_rate": 4.103996663164927e-05, - "loss": 0.0161, + "epoch": 0.7095990279465371, + "grad_norm": 1.099148154258728, + "learning_rate": 4.1014964045942465e-05, + "loss": 0.0592, "step": 292 }, { - "epoch": 2.067019400352734, - "grad_norm": 0.5993428230285645, - "learning_rate": 4.106488314976513e-05, - "loss": 0.0253, + "epoch": 0.7120291616038882, + "grad_norm": 1.0394737720489502, + "learning_rate": 4.103996663164927e-05, + "loss": 0.0568, "step": 293 }, { - "epoch": 2.074074074074074, - "grad_norm": 0.41079506278038025, - "learning_rate": 4.108971419080698e-05, - "loss": 0.0186, + "epoch": 0.7144592952612394, + "grad_norm": 0.5482613444328308, + "learning_rate": 4.106488314976513e-05, + "loss": 0.0724, "step": 294 }, { - "epoch": 2.0811287477954146, - "grad_norm": 0.3127981126308441, - "learning_rate": 4.111446033923516e-05, - "loss": 0.0114, + "epoch": 0.7168894289185905, + "grad_norm": 0.6032484173774719, + "learning_rate": 4.108971419080698e-05, + "loss": 0.0448, "step": 295 }, { - "epoch": 2.0881834215167547, - "grad_norm": 0.43792659044265747, - "learning_rate": 4.113912217353596e-05, - "loss": 0.0238, + "epoch": 0.7193195625759417, + "grad_norm": 0.7295458316802979, + "learning_rate": 4.111446033923516e-05, + "loss": 0.093, "step": 296 }, { - "epoch": 2.0952380952380953, - "grad_norm": 0.2403118908405304, - "learning_rate": 4.116370026630272e-05, - "loss": 0.0072, + "epoch": 0.7217496962332929, + "grad_norm": 0.5472877621650696, + "learning_rate": 4.113912217353596e-05, + "loss": 0.0799, "step": 297 }, { - "epoch": 2.1022927689594355, - "grad_norm": 0.2544070780277252, - "learning_rate": 4.118819518431564e-05, - "loss": 0.0154, + "epoch": 0.724179829890644, + "grad_norm": 0.682966411113739, + "learning_rate": 4.116370026630272e-05, + "loss": 0.0575, "step": 298 }, { - "epoch": 2.109347442680776, - "grad_norm": 0.40166372060775757, - "learning_rate": 4.121260748862021e-05, - "loss": 0.0259, + "epoch": 0.7266099635479951, + "grad_norm": 0.4737589657306671, + "learning_rate": 4.118819518431564e-05, + "loss": 0.0949, "step": 299 }, { - "epoch": 2.1164021164021163, - "grad_norm": 0.4582963287830353, - "learning_rate": 4.123693773460426e-05, - "loss": 0.02, + "epoch": 0.7290400972053463, + "grad_norm": 0.6645620465278625, + "learning_rate": 4.121260748862021e-05, + "loss": 0.0233, "step": 300 }, { - "epoch": 2.1164021164021163, - "eval_loss": 0.2626274526119232, - "eval_runtime": 350.1926, - "eval_samples_per_second": 6.476, - "eval_steps_per_second": 0.405, + "epoch": 0.7290400972053463, + "eval_loss": 0.3244495689868927, + "eval_runtime": 503.9011, + "eval_samples_per_second": 5.453, + "eval_steps_per_second": 0.683, "step": 300 }, { - "epoch": 2.123456790123457, - "grad_norm": 0.2750282287597656, - "learning_rate": 4.126118647207383e-05, - "loss": 0.0133, + "epoch": 0.7314702308626975, + "grad_norm": 0.6423314809799194, + "learning_rate": 4.123693773460426e-05, + "loss": 0.0774, "step": 301 }, { - "epoch": 2.130511463844797, - "grad_norm": 0.3560563027858734, - "learning_rate": 4.1285354245327715e-05, - "loss": 0.0115, + "epoch": 0.7339003645200486, + "grad_norm": 0.6238884329795837, + "learning_rate": 4.126118647207383e-05, + "loss": 0.0638, "step": 302 }, { - "epoch": 2.1375661375661377, - "grad_norm": 0.41415050625801086, - "learning_rate": 4.1309441593230726e-05, - "loss": 0.0231, + "epoch": 0.7363304981773997, + "grad_norm": 2.3978679180145264, + "learning_rate": 4.1285354245327715e-05, + "loss": 0.1022, "step": 303 }, { - "epoch": 2.144620811287478, - "grad_norm": 0.48563840985298157, - "learning_rate": 4.133344904928585e-05, - "loss": 0.043, + "epoch": 0.7387606318347509, + "grad_norm": 0.49952298402786255, + "learning_rate": 4.1309441593230726e-05, + "loss": 0.0477, "step": 304 }, { - "epoch": 2.1516754850088184, - "grad_norm": 0.41206222772598267, - "learning_rate": 4.1357377141705084e-05, - "loss": 0.0177, + "epoch": 0.741190765492102, + "grad_norm": 1.5169883966445923, + "learning_rate": 4.133344904928585e-05, + "loss": 0.1601, "step": 305 }, { - "epoch": 2.1587301587301586, - "grad_norm": 0.24333642423152924, - "learning_rate": 4.1381226393479236e-05, - "loss": 0.0085, + "epoch": 0.7436208991494532, + "grad_norm": 0.5875476002693176, + "learning_rate": 4.1357377141705084e-05, + "loss": 0.0866, "step": 306 }, { - "epoch": 2.165784832451499, - "grad_norm": 0.45870691537857056, - "learning_rate": 4.1404997322446435e-05, - "loss": 0.0123, + "epoch": 0.7460510328068044, + "grad_norm": 0.9874062538146973, + "learning_rate": 4.1381226393479236e-05, + "loss": 0.0901, "step": 307 }, { - "epoch": 2.1728395061728394, - "grad_norm": 0.5222560167312622, - "learning_rate": 4.142869044135967e-05, - "loss": 0.0235, + "epoch": 0.7484811664641555, + "grad_norm": 1.2504879236221313, + "learning_rate": 4.1404997322446435e-05, + "loss": 0.1227, "step": 308 }, { - "epoch": 2.17989417989418, - "grad_norm": 0.4070928394794464, - "learning_rate": 4.145230625795311e-05, - "loss": 0.0477, + "epoch": 0.7509113001215066, + "grad_norm": 0.6713179349899292, + "learning_rate": 4.142869044135967e-05, + "loss": 0.0347, "step": 309 }, { - "epoch": 2.18694885361552, - "grad_norm": 0.43290865421295166, - "learning_rate": 4.14758452750074e-05, - "loss": 0.0144, + "epoch": 0.7533414337788579, + "grad_norm": 0.8156313896179199, + "learning_rate": 4.145230625795311e-05, + "loss": 0.0548, "step": 310 }, { - "epoch": 2.1940035273368608, - "grad_norm": 0.35074254870414734, - "learning_rate": 4.149930799041392e-05, - "loss": 0.0261, + "epoch": 0.755771567436209, + "grad_norm": 0.9566905498504639, + "learning_rate": 4.14758452750074e-05, + "loss": 0.1255, "step": 311 }, { - "epoch": 2.201058201058201, - "grad_norm": 0.5638989806175232, - "learning_rate": 4.152269489723788e-05, - "loss": 0.016, + "epoch": 0.7582017010935601, + "grad_norm": 0.8393445611000061, + "learning_rate": 4.149930799041392e-05, + "loss": 0.0587, "step": 312 }, { - "epoch": 2.2081128747795415, - "grad_norm": 0.15761926770210266, - "learning_rate": 4.1546006483780626e-05, - "loss": 0.0062, + "epoch": 0.7606318347509113, + "grad_norm": 0.637996256351471, + "learning_rate": 4.152269489723788e-05, + "loss": 0.0881, "step": 313 }, { - "epoch": 2.2151675485008817, - "grad_norm": 0.20456282794475555, - "learning_rate": 4.156924323364072e-05, - "loss": 0.0105, + "epoch": 0.7630619684082625, + "grad_norm": 0.8390913605690002, + "learning_rate": 4.1546006483780626e-05, + "loss": 0.0881, "step": 314 }, { - "epoch": 2.2222222222222223, - "grad_norm": 0.23817496001720428, - "learning_rate": 4.1592405625774144e-05, - "loss": 0.0111, + "epoch": 0.7654921020656136, + "grad_norm": 0.7430179715156555, + "learning_rate": 4.156924323364072e-05, + "loss": 0.0409, "step": 315 }, { - "epoch": 2.2292768959435625, - "grad_norm": 0.4282923936843872, - "learning_rate": 4.161549413455358e-05, - "loss": 0.0135, + "epoch": 0.7679222357229648, + "grad_norm": 0.7785168886184692, + "learning_rate": 4.1592405625774144e-05, + "loss": 0.0511, "step": 316 }, { - "epoch": 2.236331569664903, - "grad_norm": 0.1764887571334839, - "learning_rate": 4.163850922982668e-05, - "loss": 0.0096, + "epoch": 0.7703523693803159, + "grad_norm": 0.8135663866996765, + "learning_rate": 4.161549413455358e-05, + "loss": 0.0703, "step": 317 }, { - "epoch": 2.2433862433862433, - "grad_norm": 0.8953129649162292, - "learning_rate": 4.16614513769734e-05, - "loss": 0.0128, + "epoch": 0.772782503037667, + "grad_norm": 0.8496614694595337, + "learning_rate": 4.163850922982668e-05, + "loss": 0.1502, "step": 318 }, { - "epoch": 2.250440917107584, - "grad_norm": 0.4081833064556122, - "learning_rate": 4.1684321036962526e-05, - "loss": 0.0156, + "epoch": 0.7752126366950183, + "grad_norm": 0.8001265525817871, + "learning_rate": 4.16614513769734e-05, + "loss": 0.0917, "step": 319 }, { - "epoch": 2.257495590828924, - "grad_norm": 0.38541778922080994, - "learning_rate": 4.170711866640721e-05, - "loss": 0.016, + "epoch": 0.7776427703523694, + "grad_norm": 0.5384124517440796, + "learning_rate": 4.1684321036962526e-05, + "loss": 0.0574, "step": 320 }, { - "epoch": 2.2645502645502646, - "grad_norm": 0.3514292240142822, - "learning_rate": 4.1729844717619684e-05, - "loss": 0.0174, + "epoch": 0.7800729040097205, + "grad_norm": 0.6082786917686462, + "learning_rate": 4.170711866640721e-05, + "loss": 0.0285, "step": 321 }, { - "epoch": 2.271604938271605, - "grad_norm": 0.28806954622268677, - "learning_rate": 4.17524996386651e-05, - "loss": 0.0249, + "epoch": 0.7825030376670717, + "grad_norm": 0.6169834136962891, + "learning_rate": 4.1729844717619684e-05, + "loss": 0.0529, "step": 322 }, { - "epoch": 2.2786596119929454, - "grad_norm": 0.29771488904953003, - "learning_rate": 4.177508387341454e-05, - "loss": 0.0175, + "epoch": 0.7849331713244229, + "grad_norm": 1.1811317205429077, + "learning_rate": 4.17524996386651e-05, + "loss": 0.0449, "step": 323 }, { - "epoch": 2.2857142857142856, - "grad_norm": 0.5112621188163757, - "learning_rate": 4.179759786159719e-05, - "loss": 0.0465, + "epoch": 0.787363304981774, + "grad_norm": 0.7238284945487976, + "learning_rate": 4.177508387341454e-05, + "loss": 0.046, "step": 324 }, { - "epoch": 2.292768959435626, - "grad_norm": 0.2908097207546234, - "learning_rate": 4.182004203885172e-05, - "loss": 0.0165, + "epoch": 0.7897934386391251, + "grad_norm": 1.2236160039901733, + "learning_rate": 4.179759786159719e-05, + "loss": 0.0871, "step": 325 }, { - "epoch": 2.2998236331569664, - "grad_norm": 0.3347030282020569, - "learning_rate": 4.184241683677687e-05, - "loss": 0.0176, + "epoch": 0.7922235722964763, + "grad_norm": 0.8143868446350098, + "learning_rate": 4.182004203885172e-05, + "loss": 0.0629, "step": 326 }, { - "epoch": 2.306878306878307, - "grad_norm": 0.39874500036239624, - "learning_rate": 4.1864722682981245e-05, - "loss": 0.0177, + "epoch": 0.7946537059538274, + "grad_norm": 0.33017951250076294, + "learning_rate": 4.184241683677687e-05, + "loss": 0.0117, "step": 327 }, { - "epoch": 2.313932980599647, - "grad_norm": 0.3885672688484192, - "learning_rate": 4.188696000113232e-05, - "loss": 0.0172, + "epoch": 0.7970838396111786, + "grad_norm": 0.3505575358867645, + "learning_rate": 4.1864722682981245e-05, + "loss": 0.0158, "step": 328 }, { - "epoch": 2.3209876543209877, - "grad_norm": 0.37026986479759216, - "learning_rate": 4.190912921100477e-05, - "loss": 0.0258, + "epoch": 0.7995139732685298, + "grad_norm": 1.4309784173965454, + "learning_rate": 4.188696000113232e-05, + "loss": 0.0492, "step": 329 }, { - "epoch": 2.328042328042328, - "grad_norm": 0.43948009610176086, - "learning_rate": 4.1931230728527994e-05, - "loss": 0.0189, + "epoch": 0.8019441069258809, + "grad_norm": 1.023452639579773, + "learning_rate": 4.190912921100477e-05, + "loss": 0.0623, "step": 330 }, { - "epoch": 2.3350970017636685, - "grad_norm": 0.371096134185791, - "learning_rate": 4.195326496583291e-05, - "loss": 0.0166, + "epoch": 0.804374240583232, + "grad_norm": 0.7057633996009827, + "learning_rate": 4.1931230728527994e-05, + "loss": 0.0468, "step": 331 }, { - "epoch": 2.3421516754850087, - "grad_norm": 0.33448880910873413, - "learning_rate": 4.1975232331298125e-05, - "loss": 0.0254, + "epoch": 0.8068043742405833, + "grad_norm": 0.8735628724098206, + "learning_rate": 4.195326496583291e-05, + "loss": 0.0709, "step": 332 }, { - "epoch": 2.3492063492063493, - "grad_norm": 0.38793328404426575, - "learning_rate": 4.1997133229595316e-05, - "loss": 0.0184, + "epoch": 0.8092345078979344, + "grad_norm": 3.259680986404419, + "learning_rate": 4.1975232331298125e-05, + "loss": 0.0491, "step": 333 }, { - "epoch": 2.3562610229276895, - "grad_norm": 0.22659848630428314, - "learning_rate": 4.201896806173394e-05, - "loss": 0.0083, + "epoch": 0.8116646415552855, + "grad_norm": 1.0592741966247559, + "learning_rate": 4.1997133229595316e-05, + "loss": 0.044, "step": 334 }, { - "epoch": 2.36331569664903, - "grad_norm": 0.3306357264518738, - "learning_rate": 4.2040737225105335e-05, - "loss": 0.0152, + "epoch": 0.8140947752126367, + "grad_norm": 0.5978744029998779, + "learning_rate": 4.201896806173394e-05, + "loss": 0.0563, "step": 335 }, { - "epoch": 2.3703703703703702, - "grad_norm": 0.2892155647277832, - "learning_rate": 4.206244111352608e-05, - "loss": 0.0148, + "epoch": 0.8165249088699879, + "grad_norm": 0.8989129662513733, + "learning_rate": 4.2040737225105335e-05, + "loss": 0.0582, "step": 336 }, { - "epoch": 2.377425044091711, - "grad_norm": 0.26743531227111816, - "learning_rate": 4.2084080117280756e-05, - "loss": 0.0132, + "epoch": 0.818955042527339, + "grad_norm": 0.9518970251083374, + "learning_rate": 4.206244111352608e-05, + "loss": 0.0764, "step": 337 }, { - "epoch": 2.384479717813051, - "grad_norm": 0.30841487646102905, - "learning_rate": 4.210565462316407e-05, - "loss": 0.0153, + "epoch": 0.8213851761846902, + "grad_norm": 0.48601099848747253, + "learning_rate": 4.2084080117280756e-05, + "loss": 0.0495, "step": 338 }, { - "epoch": 2.3915343915343916, - "grad_norm": 0.35935595631599426, - "learning_rate": 4.2127165014522315e-05, - "loss": 0.0188, + "epoch": 0.8238153098420413, + "grad_norm": 0.6095461249351501, + "learning_rate": 4.210565462316407e-05, + "loss": 0.0234, "step": 339 }, { - "epoch": 2.398589065255732, - "grad_norm": 0.3620130121707916, - "learning_rate": 4.214861167129425e-05, - "loss": 0.0186, + "epoch": 0.8262454434993924, + "grad_norm": 0.6459489464759827, + "learning_rate": 4.2127165014522315e-05, + "loss": 0.0236, "step": 340 }, { - "epoch": 2.4056437389770724, - "grad_norm": 0.40925925970077515, - "learning_rate": 4.2169994970051365e-05, - "loss": 0.0254, + "epoch": 0.8286755771567437, + "grad_norm": 0.8500393629074097, + "learning_rate": 4.214861167129425e-05, + "loss": 0.1073, "step": 341 }, { - "epoch": 2.4126984126984126, - "grad_norm": 0.3751520812511444, - "learning_rate": 4.219131528403759e-05, - "loss": 0.0202, + "epoch": 0.8311057108140948, + "grad_norm": 0.7379801869392395, + "learning_rate": 4.2169994970051365e-05, + "loss": 0.0779, "step": 342 }, { - "epoch": 2.419753086419753, - "grad_norm": 0.44678637385368347, - "learning_rate": 4.22125729832083e-05, - "loss": 0.0252, + "epoch": 0.8335358444714459, + "grad_norm": 0.7911482453346252, + "learning_rate": 4.219131528403759e-05, + "loss": 0.083, "step": 343 }, { - "epoch": 2.4268077601410933, - "grad_norm": 0.3179228901863098, - "learning_rate": 4.2233768434268914e-05, - "loss": 0.0107, + "epoch": 0.8359659781287971, + "grad_norm": 1.379714012145996, + "learning_rate": 4.22125729832083e-05, + "loss": 0.0527, "step": 344 }, { - "epoch": 2.433862433862434, - "grad_norm": 0.24804812669754028, - "learning_rate": 4.225490200071284e-05, - "loss": 0.0122, + "epoch": 0.8383961117861483, + "grad_norm": 0.7738386988639832, + "learning_rate": 4.2233768434268914e-05, + "loss": 0.0439, "step": 345 }, { - "epoch": 2.440917107583774, - "grad_norm": 0.4041373133659363, - "learning_rate": 4.227597404285883e-05, - "loss": 0.0234, + "epoch": 0.8408262454434994, + "grad_norm": 0.9524173140525818, + "learning_rate": 4.225490200071284e-05, + "loss": 0.0635, "step": 346 }, { - "epoch": 2.4479717813051147, - "grad_norm": 0.20905975997447968, - "learning_rate": 4.229698491788791e-05, - "loss": 0.0091, + "epoch": 0.8432563791008505, + "grad_norm": 0.843377947807312, + "learning_rate": 4.227597404285883e-05, + "loss": 0.0531, "step": 347 }, { - "epoch": 2.455026455026455, - "grad_norm": 0.28704166412353516, - "learning_rate": 4.231793497987961e-05, - "loss": 0.0121, + "epoch": 0.8456865127582017, + "grad_norm": 2.120123863220215, + "learning_rate": 4.229698491788791e-05, + "loss": 0.068, "step": 348 }, { - "epoch": 2.4620811287477955, - "grad_norm": 0.35453909635543823, - "learning_rate": 4.2338824579847904e-05, - "loss": 0.0185, + "epoch": 0.8481166464155528, + "grad_norm": 0.804137647151947, + "learning_rate": 4.231793497987961e-05, + "loss": 0.0577, "step": 349 }, { - "epoch": 2.4691358024691357, - "grad_norm": 0.40766483545303345, - "learning_rate": 4.235965406577636e-05, - "loss": 0.0073, + "epoch": 0.850546780072904, + "grad_norm": 0.8609597086906433, + "learning_rate": 4.2338824579847904e-05, + "loss": 0.0204, "step": 350 }, { - "epoch": 2.4761904761904763, - "grad_norm": 0.302290141582489, - "learning_rate": 4.2380423782653e-05, - "loss": 0.0089, + "epoch": 0.8529769137302552, + "grad_norm": 0.7206110954284668, + "learning_rate": 4.235965406577636e-05, + "loss": 0.0682, "step": 351 }, { - "epoch": 2.4832451499118164, - "grad_norm": 0.438491553068161, - "learning_rate": 4.240113407250459e-05, - "loss": 0.0218, + "epoch": 0.8554070473876063, + "grad_norm": 0.8393117785453796, + "learning_rate": 4.2380423782653e-05, + "loss": 0.0442, "step": 352 }, { - "epoch": 2.490299823633157, - "grad_norm": 0.3508157432079315, - "learning_rate": 4.24217852744304e-05, - "loss": 0.0129, + "epoch": 0.8578371810449574, + "grad_norm": 0.5761812329292297, + "learning_rate": 4.240113407250459e-05, + "loss": 0.0985, "step": 353 }, { - "epoch": 2.497354497354497, - "grad_norm": 0.2605820894241333, - "learning_rate": 4.244237772463552e-05, - "loss": 0.0098, + "epoch": 0.8602673147023087, + "grad_norm": 0.6769473552703857, + "learning_rate": 4.24217852744304e-05, + "loss": 0.0347, "step": 354 }, { - "epoch": 2.504409171075838, - "grad_norm": 0.23794864118099213, - "learning_rate": 4.246291175646371e-05, - "loss": 0.0136, + "epoch": 0.8626974483596598, + "grad_norm": 0.6801010370254517, + "learning_rate": 4.244237772463552e-05, + "loss": 0.0541, "step": 355 }, { - "epoch": 2.511463844797178, - "grad_norm": 0.3486771881580353, - "learning_rate": 4.24833877004298e-05, - "loss": 0.0172, + "epoch": 0.8651275820170109, + "grad_norm": 0.9833145141601562, + "learning_rate": 4.246291175646371e-05, + "loss": 0.0909, "step": 356 }, { - "epoch": 2.5185185185185186, - "grad_norm": 0.4761708676815033, - "learning_rate": 4.250380588425157e-05, - "loss": 0.0258, + "epoch": 0.8675577156743621, + "grad_norm": 0.8934769034385681, + "learning_rate": 4.24833877004298e-05, + "loss": 0.0769, "step": 357 }, { - "epoch": 2.5255731922398588, - "grad_norm": 0.2902567386627197, - "learning_rate": 4.2524166632881255e-05, - "loss": 0.0172, + "epoch": 0.8699878493317132, + "grad_norm": 0.523007869720459, + "learning_rate": 4.250380588425157e-05, + "loss": 0.0413, "step": 358 }, { - "epoch": 2.5326278659611994, - "grad_norm": 0.25851529836654663, - "learning_rate": 4.254447026853656e-05, - "loss": 0.0075, + "epoch": 0.8724179829890644, + "grad_norm": 2.014488935470581, + "learning_rate": 4.2524166632881255e-05, + "loss": 0.0651, "step": 359 }, { - "epoch": 2.5396825396825395, - "grad_norm": 0.23005281388759613, - "learning_rate": 4.2564717110731244e-05, - "loss": 0.0107, + "epoch": 0.8748481166464156, + "grad_norm": 0.8834489583969116, + "learning_rate": 4.254447026853656e-05, + "loss": 0.0288, "step": 360 }, { - "epoch": 2.54673721340388, - "grad_norm": 0.2701972723007202, - "learning_rate": 4.258490747630532e-05, - "loss": 0.0119, + "epoch": 0.8772782503037667, + "grad_norm": 0.8947015404701233, + "learning_rate": 4.2564717110731244e-05, + "loss": 0.1693, "step": 361 }, { - "epoch": 2.5537918871252203, - "grad_norm": 0.3264407515525818, - "learning_rate": 4.260504167945479e-05, - "loss": 0.0204, + "epoch": 0.8797083839611178, + "grad_norm": 1.9122964143753052, + "learning_rate": 4.258490747630532e-05, + "loss": 0.0738, "step": 362 }, { - "epoch": 2.560846560846561, - "grad_norm": 0.3354279100894928, - "learning_rate": 4.2625120031760965e-05, - "loss": 0.0205, + "epoch": 0.8821385176184691, + "grad_norm": 0.8690800070762634, + "learning_rate": 4.260504167945479e-05, + "loss": 0.0472, "step": 363 }, { - "epoch": 2.567901234567901, - "grad_norm": 0.38325926661491394, - "learning_rate": 4.264514284221944e-05, - "loss": 0.0185, + "epoch": 0.8845686512758202, + "grad_norm": 0.7206094861030579, + "learning_rate": 4.2625120031760965e-05, + "loss": 0.0881, "step": 364 }, { - "epoch": 2.5749559082892417, - "grad_norm": 0.4619055390357971, - "learning_rate": 4.266511041726854e-05, - "loss": 0.0202, + "epoch": 0.8869987849331713, + "grad_norm": 0.6016607284545898, + "learning_rate": 4.264514284221944e-05, + "loss": 0.0403, "step": 365 }, { - "epoch": 2.582010582010582, - "grad_norm": 0.37551018595695496, - "learning_rate": 4.26850230608176e-05, - "loss": 0.0231, + "epoch": 0.8894289185905225, + "grad_norm": 0.8117083311080933, + "learning_rate": 4.266511041726854e-05, + "loss": 0.0533, "step": 366 }, { - "epoch": 2.5890652557319225, - "grad_norm": 0.21586036682128906, - "learning_rate": 4.2704881074274584e-05, - "loss": 0.0109, + "epoch": 0.8918590522478737, + "grad_norm": 0.7658905982971191, + "learning_rate": 4.26850230608176e-05, + "loss": 0.067, "step": 367 }, { - "epoch": 2.5961199294532626, - "grad_norm": 0.19090792536735535, - "learning_rate": 4.272468475657351e-05, - "loss": 0.0086, + "epoch": 0.8942891859052248, + "grad_norm": 0.8042786717414856, + "learning_rate": 4.2704881074274584e-05, + "loss": 0.08, "step": 368 }, { - "epoch": 2.6031746031746033, - "grad_norm": 1.218457579612732, - "learning_rate": 4.2744434404201497e-05, - "loss": 0.0169, + "epoch": 0.8967193195625759, + "grad_norm": 0.5545147657394409, + "learning_rate": 4.272468475657351e-05, + "loss": 0.0534, "step": 369 }, { - "epoch": 2.6102292768959434, - "grad_norm": 0.3905993700027466, - "learning_rate": 4.27641303112253e-05, - "loss": 0.0149, + "epoch": 0.8991494532199271, + "grad_norm": 0.7363461256027222, + "learning_rate": 4.2744434404201497e-05, + "loss": 0.0394, "step": 370 }, { - "epoch": 2.617283950617284, - "grad_norm": 0.3336049020290375, - "learning_rate": 4.278377276931767e-05, - "loss": 0.0181, + "epoch": 0.9015795868772782, + "grad_norm": 0.5137555003166199, + "learning_rate": 4.27641303112253e-05, + "loss": 0.0602, "step": 371 }, { - "epoch": 2.624338624338624, - "grad_norm": 0.4581795930862427, - "learning_rate": 4.2803362067783256e-05, - "loss": 0.0267, + "epoch": 0.9040097205346294, + "grad_norm": 0.7932950258255005, + "learning_rate": 4.278377276931767e-05, + "loss": 0.0711, "step": 372 }, { - "epoch": 2.631393298059965, - "grad_norm": 0.2101754993200302, - "learning_rate": 4.2822898493584104e-05, - "loss": 0.0116, + "epoch": 0.9064398541919806, + "grad_norm": 2.459850788116455, + "learning_rate": 4.2803362067783256e-05, + "loss": 0.1969, "step": 373 }, { - "epoch": 2.638447971781305, - "grad_norm": 0.42322415113449097, - "learning_rate": 4.284238233136496e-05, - "loss": 0.0175, + "epoch": 0.9088699878493317, + "grad_norm": 0.6758642792701721, + "learning_rate": 4.2822898493584104e-05, + "loss": 0.1131, "step": 374 }, { - "epoch": 2.6455026455026456, - "grad_norm": 0.329816997051239, - "learning_rate": 4.286181386347813e-05, - "loss": 0.0133, + "epoch": 0.9113001215066828, + "grad_norm": 0.6686858534812927, + "learning_rate": 4.284238233136496e-05, + "loss": 0.0951, "step": 375 }, { - "epoch": 2.6525573192239857, - "grad_norm": 0.2963216304779053, - "learning_rate": 4.288119337000801e-05, - "loss": 0.0172, + "epoch": 0.913730255164034, + "grad_norm": 0.8299110531806946, + "learning_rate": 4.286181386347813e-05, + "loss": 0.046, "step": 376 }, { - "epoch": 2.6596119929453264, - "grad_norm": 0.3237278461456299, - "learning_rate": 4.2900521128795315e-05, - "loss": 0.0375, + "epoch": 0.9161603888213852, + "grad_norm": 2.4468648433685303, + "learning_rate": 4.288119337000801e-05, + "loss": 0.222, "step": 377 }, { - "epoch": 2.6666666666666665, - "grad_norm": 0.2482115775346756, - "learning_rate": 4.291979741546102e-05, - "loss": 0.012, + "epoch": 0.9185905224787363, + "grad_norm": 1.187517523765564, + "learning_rate": 4.2900521128795315e-05, + "loss": 0.1061, "step": 378 }, { - "epoch": 2.673721340388007, - "grad_norm": 0.46383151412010193, - "learning_rate": 4.293902250342989e-05, - "loss": 0.0269, + "epoch": 0.9210206561360875, + "grad_norm": 0.6776664853096008, + "learning_rate": 4.291979741546102e-05, + "loss": 0.0287, "step": 379 }, { - "epoch": 2.6807760141093473, - "grad_norm": 0.40127432346343994, - "learning_rate": 4.295819666395376e-05, - "loss": 0.0253, + "epoch": 0.9234507897934386, + "grad_norm": 0.6103045344352722, + "learning_rate": 4.293902250342989e-05, + "loss": 0.0439, "step": 380 }, { - "epoch": 2.687830687830688, - "grad_norm": 0.25148409605026245, - "learning_rate": 4.297732016613454e-05, - "loss": 0.0117, + "epoch": 0.9258809234507898, + "grad_norm": 0.44634753465652466, + "learning_rate": 4.295819666395376e-05, + "loss": 0.03, "step": 381 }, { - "epoch": 2.694885361552028, - "grad_norm": 0.3709105849266052, - "learning_rate": 4.299639327694684e-05, - "loss": 0.024, + "epoch": 0.928311057108141, + "grad_norm": 0.8639276027679443, + "learning_rate": 4.297732016613454e-05, + "loss": 0.1479, "step": 382 }, { - "epoch": 2.7019400352733687, - "grad_norm": 0.3348330855369568, - "learning_rate": 4.3015416261260325e-05, - "loss": 0.0167, + "epoch": 0.9307411907654921, + "grad_norm": 0.7611154913902283, + "learning_rate": 4.299639327694684e-05, + "loss": 0.0403, "step": 383 }, { - "epoch": 2.708994708994709, - "grad_norm": 0.4439306855201721, - "learning_rate": 4.303438938186182e-05, - "loss": 0.0153, + "epoch": 0.9331713244228432, + "grad_norm": 0.8710222244262695, + "learning_rate": 4.3015416261260325e-05, + "loss": 0.0522, "step": 384 }, { - "epoch": 2.7160493827160495, - "grad_norm": 0.4493269622325897, - "learning_rate": 4.305331289947705e-05, - "loss": 0.0159, + "epoch": 0.9356014580801945, + "grad_norm": 0.7666921615600586, + "learning_rate": 4.303438938186182e-05, + "loss": 0.0303, "step": 385 }, { - "epoch": 2.7231040564373896, - "grad_norm": 0.3308960199356079, - "learning_rate": 4.3072187072792184e-05, - "loss": 0.0157, + "epoch": 0.9380315917375456, + "grad_norm": 0.5829209089279175, + "learning_rate": 4.305331289947705e-05, + "loss": 0.0315, "step": 386 }, { - "epoch": 2.7301587301587302, - "grad_norm": 0.1948912888765335, - "learning_rate": 4.309101215847502e-05, - "loss": 0.0107, + "epoch": 0.9404617253948967, + "grad_norm": 0.9368737936019897, + "learning_rate": 4.3072187072792184e-05, + "loss": 0.0661, "step": 387 }, { - "epoch": 2.7372134038800704, - "grad_norm": 0.22952932119369507, - "learning_rate": 4.3109788411195924e-05, - "loss": 0.0147, + "epoch": 0.9428918590522479, + "grad_norm": 0.43866387009620667, + "learning_rate": 4.309101215847502e-05, + "loss": 0.0335, "step": 388 }, { - "epoch": 2.744268077601411, - "grad_norm": 0.5562791228294373, - "learning_rate": 4.312851608364853e-05, - "loss": 0.0158, + "epoch": 0.945321992709599, + "grad_norm": 0.6128959059715271, + "learning_rate": 4.3109788411195924e-05, + "loss": 0.108, "step": 389 }, { - "epoch": 2.751322751322751, - "grad_norm": 0.29296743869781494, - "learning_rate": 4.314719542657013e-05, - "loss": 0.0096, + "epoch": 0.9477521263669502, + "grad_norm": 0.5031237602233887, + "learning_rate": 4.312851608364853e-05, + "loss": 0.0332, "step": 390 }, { - "epoch": 2.758377425044092, - "grad_norm": 0.2562165856361389, - "learning_rate": 4.3165826688761796e-05, - "loss": 0.0086, + "epoch": 0.9501822600243013, + "grad_norm": 0.6804950833320618, + "learning_rate": 4.314719542657013e-05, + "loss": 0.0259, "step": 391 }, { - "epoch": 2.765432098765432, - "grad_norm": 0.21070915460586548, - "learning_rate": 4.318441011710833e-05, - "loss": 0.0105, + "epoch": 0.9526123936816525, + "grad_norm": 0.9872898459434509, + "learning_rate": 4.3165826688761796e-05, + "loss": 0.0281, "step": 392 }, { - "epoch": 2.7724867724867726, - "grad_norm": 0.3632257580757141, - "learning_rate": 4.3202945956597786e-05, - "loss": 0.0149, + "epoch": 0.9550425273390036, + "grad_norm": 1.9561671018600464, + "learning_rate": 4.318441011710833e-05, + "loss": 0.0342, "step": 393 }, { - "epoch": 2.7795414462081127, - "grad_norm": 0.2885260581970215, - "learning_rate": 4.3221434450340956e-05, - "loss": 0.0098, + "epoch": 0.9574726609963548, + "grad_norm": 0.6444841027259827, + "learning_rate": 4.3202945956597786e-05, + "loss": 0.035, "step": 394 }, { - "epoch": 2.7865961199294533, - "grad_norm": 0.3417116701602936, - "learning_rate": 4.323987583959045e-05, - "loss": 0.02, + "epoch": 0.959902794653706, + "grad_norm": 0.49825266003608704, + "learning_rate": 4.3221434450340956e-05, + "loss": 0.0248, "step": 395 }, { - "epoch": 2.7936507936507935, - "grad_norm": 0.26477646827697754, - "learning_rate": 4.325827036375957e-05, - "loss": 0.0121, + "epoch": 0.9623329283110571, + "grad_norm": 0.548538863658905, + "learning_rate": 4.323987583959045e-05, + "loss": 0.076, "step": 396 }, { - "epoch": 2.800705467372134, - "grad_norm": 0.18265554308891296, - "learning_rate": 4.327661826044101e-05, - "loss": 0.0077, + "epoch": 0.9647630619684082, + "grad_norm": 0.6437348127365112, + "learning_rate": 4.325827036375957e-05, + "loss": 0.0749, "step": 397 }, { - "epoch": 2.8077601410934743, - "grad_norm": 0.33462291955947876, - "learning_rate": 4.329491976542521e-05, - "loss": 0.0124, + "epoch": 0.9671931956257594, + "grad_norm": 0.5854353308677673, + "learning_rate": 4.327661826044101e-05, + "loss": 0.0413, "step": 398 }, { - "epoch": 2.814814814814815, - "grad_norm": 0.3718152940273285, - "learning_rate": 4.331317511271859e-05, - "loss": 0.016, + "epoch": 0.9696233292831106, + "grad_norm": 1.0665239095687866, + "learning_rate": 4.329491976542521e-05, + "loss": 0.0574, "step": 399 }, { - "epoch": 2.821869488536155, - "grad_norm": 0.353544145822525, - "learning_rate": 4.333138453456147e-05, - "loss": 0.0204, + "epoch": 0.9720534629404617, + "grad_norm": 0.7473851442337036, + "learning_rate": 4.331317511271859e-05, + "loss": 0.1447, "step": 400 }, { - "epoch": 2.821869488536155, - "eval_loss": 0.3034096956253052, - "eval_runtime": 352.3366, - "eval_samples_per_second": 6.437, - "eval_steps_per_second": 0.403, + "epoch": 0.9720534629404617, + "eval_loss": 0.31331515312194824, + "eval_runtime": 503.3031, + "eval_samples_per_second": 5.46, + "eval_steps_per_second": 0.683, "step": 400 }, { - "epoch": 2.821869488536155, + "epoch": 0.9720534629404617, "step": 400, - "total_flos": 2.583955738959282e+18, - "train_loss": 0.050165310025913644, - "train_runtime": 16554.5706, - "train_samples_per_second": 38.66, - "train_steps_per_second": 0.604 + "total_flos": 1.1378390115664527e+18, + "train_loss": 0.11899106367724016, + "train_runtime": 12503.2939, + "train_samples_per_second": 25.593, + "train_steps_per_second": 0.8 } ], "logging_steps": 1.0, "max_steps": 10000, "num_input_tokens_seen": 0, - "num_train_epochs": 71, + "num_train_epochs": 25, "save_steps": 100, - "total_flos": 2.583955738959282e+18, - "train_batch_size": 4, + "total_flos": 1.1378390115664527e+18, + "train_batch_size": 2, "trial_name": null, "trial_params": null }