{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 5460, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "grad_norm": 0.0019010665128007531, "learning_rate": 0.0005, "loss": 0.0009, "step": 25 }, { "epoch": 0.09, "grad_norm": 0.33505979180336, "learning_rate": 0.001, "loss": 0.0059, "step": 50 }, { "epoch": 0.14, "grad_norm": 2.521899700164795, "learning_rate": 0.0009953789279112755, "loss": 0.2569, "step": 75 }, { "epoch": 0.18, "grad_norm": 1.3200310468673706, "learning_rate": 0.000990757855822551, "loss": 2.1666, "step": 100 }, { "epoch": 0.23, "grad_norm": 1.1342906951904297, "learning_rate": 0.0009861367837338264, "loss": 0.3677, "step": 125 }, { "epoch": 0.27, "grad_norm": 0.6839067935943604, "learning_rate": 0.0009815157116451016, "loss": 0.2573, "step": 150 }, { "epoch": 0.32, "grad_norm": 1.120778203010559, "learning_rate": 0.0009768946395563771, "loss": 0.1784, "step": 175 }, { "epoch": 0.37, "grad_norm": 0.39630964398384094, "learning_rate": 0.0009722735674676525, "loss": 0.1007, "step": 200 }, { "epoch": 0.41, "grad_norm": 0.8115565776824951, "learning_rate": 0.0009676524953789279, "loss": 0.1009, "step": 225 }, { "epoch": 0.46, "grad_norm": 0.4256753921508789, "learning_rate": 0.0009630314232902033, "loss": 0.0613, "step": 250 }, { "epoch": 0.5, "grad_norm": 1.3296879529953003, "learning_rate": 0.0009584103512014787, "loss": 0.0656, "step": 275 }, { "epoch": 0.55, "grad_norm": 0.2332513928413391, "learning_rate": 0.0009537892791127542, "loss": 0.1056, "step": 300 }, { "epoch": 0.6, "grad_norm": 0.361102819442749, "learning_rate": 0.0009491682070240297, "loss": 0.0714, "step": 325 }, { "epoch": 0.64, "grad_norm": 0.3176443576812744, "learning_rate": 0.000944547134935305, "loss": 0.051, "step": 350 }, { "epoch": 0.69, "grad_norm": 0.5561681389808655, "learning_rate": 0.0009399260628465805, "loss": 0.0987, "step": 375 }, { "epoch": 0.73, "grad_norm": 0.17937970161437988, "learning_rate": 0.0009353049907578558, "loss": 0.0705, "step": 400 }, { "epoch": 0.78, "grad_norm": 0.11134567111730576, "learning_rate": 0.0009306839186691313, "loss": 0.0449, "step": 425 }, { "epoch": 0.82, "grad_norm": 0.1100844293832779, "learning_rate": 0.0009260628465804066, "loss": 0.0605, "step": 450 }, { "epoch": 0.87, "grad_norm": 0.5044831037521362, "learning_rate": 0.0009214417744916821, "loss": 0.0682, "step": 475 }, { "epoch": 0.92, "grad_norm": 0.1506507396697998, "learning_rate": 0.0009168207024029575, "loss": 0.0382, "step": 500 }, { "epoch": 0.96, "grad_norm": 0.358964741230011, "learning_rate": 0.0009121996303142329, "loss": 0.0416, "step": 525 }, { "epoch": 1.0, "eval_loss": 0.22685140371322632, "eval_runtime": 467.2357, "eval_samples_per_second": 1.74, "eval_steps_per_second": 0.291, "step": 546 }, { "epoch": 1.01, "grad_norm": 0.21843843162059784, "learning_rate": 0.0009075785582255084, "loss": 0.0283, "step": 550 }, { "epoch": 1.05, "grad_norm": 0.3907661437988281, "learning_rate": 0.0009029574861367837, "loss": 0.0322, "step": 575 }, { "epoch": 1.1, "grad_norm": 0.026330502703785896, "learning_rate": 0.0008983364140480592, "loss": 0.021, "step": 600 }, { "epoch": 1.14, "grad_norm": 0.015783503651618958, "learning_rate": 0.0008937153419593346, "loss": 0.0237, "step": 625 }, { "epoch": 1.19, "grad_norm": 0.07014349848031998, "learning_rate": 0.00088909426987061, "loss": 0.0307, "step": 650 }, { "epoch": 1.24, "grad_norm": 0.05321989953517914, "learning_rate": 0.0008844731977818854, "loss": 0.0217, "step": 675 }, { "epoch": 1.28, "grad_norm": 0.07509706914424896, "learning_rate": 0.0008798521256931608, "loss": 0.0319, "step": 700 }, { "epoch": 1.33, "grad_norm": 0.239598348736763, "learning_rate": 0.0008752310536044362, "loss": 0.0373, "step": 725 }, { "epoch": 1.37, "grad_norm": 0.1276847869157791, "learning_rate": 0.0008706099815157116, "loss": 0.0308, "step": 750 }, { "epoch": 1.42, "grad_norm": 0.17856040596961975, "learning_rate": 0.000865988909426987, "loss": 0.0326, "step": 775 }, { "epoch": 1.47, "grad_norm": 0.5299984216690063, "learning_rate": 0.0008613678373382626, "loss": 0.034, "step": 800 }, { "epoch": 1.51, "grad_norm": 0.18570055067539215, "learning_rate": 0.0008567467652495379, "loss": 0.0255, "step": 825 }, { "epoch": 1.56, "grad_norm": 0.012400169856846333, "learning_rate": 0.0008521256931608134, "loss": 0.0133, "step": 850 }, { "epoch": 1.6, "grad_norm": 0.18435439467430115, "learning_rate": 0.0008475046210720887, "loss": 0.0392, "step": 875 }, { "epoch": 1.65, "grad_norm": 0.20227985084056854, "learning_rate": 0.0008428835489833642, "loss": 0.0339, "step": 900 }, { "epoch": 1.69, "grad_norm": 0.09818145632743835, "learning_rate": 0.0008382624768946395, "loss": 0.039, "step": 925 }, { "epoch": 1.74, "grad_norm": 0.17143449187278748, "learning_rate": 0.000833641404805915, "loss": 0.0256, "step": 950 }, { "epoch": 1.79, "grad_norm": 0.3052353858947754, "learning_rate": 0.0008290203327171904, "loss": 0.0279, "step": 975 }, { "epoch": 1.83, "grad_norm": 0.09069275110960007, "learning_rate": 0.0008243992606284658, "loss": 0.0253, "step": 1000 }, { "epoch": 1.88, "grad_norm": 0.7721070647239685, "learning_rate": 0.0008197781885397413, "loss": 0.0222, "step": 1025 }, { "epoch": 1.92, "grad_norm": 0.2378959357738495, "learning_rate": 0.0008151571164510166, "loss": 0.0386, "step": 1050 }, { "epoch": 1.97, "grad_norm": 0.004904525820165873, "learning_rate": 0.0008105360443622921, "loss": 0.0243, "step": 1075 }, { "epoch": 2.0, "eval_loss": 0.20544852316379547, "eval_runtime": 463.5714, "eval_samples_per_second": 1.754, "eval_steps_per_second": 0.293, "step": 1092 }, { "epoch": 2.01, "grad_norm": 0.13851934671401978, "learning_rate": 0.0008059149722735675, "loss": 0.0267, "step": 1100 }, { "epoch": 2.06, "grad_norm": 0.01875193975865841, "learning_rate": 0.0008012939001848429, "loss": 0.0117, "step": 1125 }, { "epoch": 2.11, "grad_norm": 0.0016854548593983054, "learning_rate": 0.0007966728280961183, "loss": 0.0317, "step": 1150 }, { "epoch": 2.15, "grad_norm": 0.10259977728128433, "learning_rate": 0.0007920517560073937, "loss": 0.0253, "step": 1175 }, { "epoch": 2.2, "grad_norm": 0.054936520755290985, "learning_rate": 0.0007874306839186691, "loss": 0.0228, "step": 1200 }, { "epoch": 2.24, "grad_norm": 0.08183781057596207, "learning_rate": 0.0007828096118299445, "loss": 0.0163, "step": 1225 }, { "epoch": 2.29, "grad_norm": 0.03448671102523804, "learning_rate": 0.0007781885397412199, "loss": 0.0252, "step": 1250 }, { "epoch": 2.34, "grad_norm": 0.026254719123244286, "learning_rate": 0.0007735674676524955, "loss": 0.0263, "step": 1275 }, { "epoch": 2.38, "grad_norm": 0.07833431661128998, "learning_rate": 0.0007689463955637708, "loss": 0.0268, "step": 1300 }, { "epoch": 2.43, "grad_norm": 0.3215916156768799, "learning_rate": 0.0007643253234750463, "loss": 0.0184, "step": 1325 }, { "epoch": 2.47, "grad_norm": 0.1949949562549591, "learning_rate": 0.0007597042513863216, "loss": 0.0234, "step": 1350 }, { "epoch": 2.52, "grad_norm": 0.10761301964521408, "learning_rate": 0.0007550831792975971, "loss": 0.0214, "step": 1375 }, { "epoch": 2.56, "grad_norm": 0.23488566279411316, "learning_rate": 0.0007504621072088724, "loss": 0.0321, "step": 1400 }, { "epoch": 2.61, "grad_norm": 0.16079489886760712, "learning_rate": 0.0007458410351201479, "loss": 0.041, "step": 1425 }, { "epoch": 2.66, "grad_norm": 0.3500367999076843, "learning_rate": 0.0007412199630314234, "loss": 0.0218, "step": 1450 }, { "epoch": 2.7, "grad_norm": 0.01966880075633526, "learning_rate": 0.0007365988909426987, "loss": 0.0185, "step": 1475 }, { "epoch": 2.75, "grad_norm": 0.10084854066371918, "learning_rate": 0.0007319778188539742, "loss": 0.0158, "step": 1500 }, { "epoch": 2.79, "grad_norm": 0.045843809843063354, "learning_rate": 0.0007273567467652495, "loss": 0.0193, "step": 1525 }, { "epoch": 2.84, "grad_norm": 0.19230197370052338, "learning_rate": 0.000722735674676525, "loss": 0.0115, "step": 1550 }, { "epoch": 2.88, "grad_norm": 0.10168833285570145, "learning_rate": 0.0007181146025878004, "loss": 0.0173, "step": 1575 }, { "epoch": 2.93, "grad_norm": 0.24770613014698029, "learning_rate": 0.0007134935304990758, "loss": 0.019, "step": 1600 }, { "epoch": 2.98, "grad_norm": 0.04277370125055313, "learning_rate": 0.0007088724584103512, "loss": 0.0262, "step": 1625 }, { "epoch": 3.0, "eval_loss": 0.18655328452587128, "eval_runtime": 463.8687, "eval_samples_per_second": 1.753, "eval_steps_per_second": 0.293, "step": 1638 }, { "epoch": 3.02, "grad_norm": 0.03143952414393425, "learning_rate": 0.0007042513863216266, "loss": 0.0185, "step": 1650 }, { "epoch": 3.07, "grad_norm": 0.061480745673179626, "learning_rate": 0.000699630314232902, "loss": 0.0158, "step": 1675 }, { "epoch": 3.11, "grad_norm": 0.05645143985748291, "learning_rate": 0.0006950092421441774, "loss": 0.0163, "step": 1700 }, { "epoch": 3.16, "grad_norm": 0.3927539885044098, "learning_rate": 0.0006903881700554528, "loss": 0.0257, "step": 1725 }, { "epoch": 3.21, "grad_norm": 0.1579461544752121, "learning_rate": 0.0006857670979667284, "loss": 0.0203, "step": 1750 }, { "epoch": 3.25, "grad_norm": 0.003284105099737644, "learning_rate": 0.0006811460258780037, "loss": 0.012, "step": 1775 }, { "epoch": 3.3, "grad_norm": 0.0939943715929985, "learning_rate": 0.0006765249537892792, "loss": 0.0139, "step": 1800 }, { "epoch": 3.34, "grad_norm": 0.08114974200725555, "learning_rate": 0.0006719038817005545, "loss": 0.0134, "step": 1825 }, { "epoch": 3.39, "grad_norm": 0.008277042768895626, "learning_rate": 0.00066728280961183, "loss": 0.0203, "step": 1850 }, { "epoch": 3.43, "grad_norm": 0.014137201942503452, "learning_rate": 0.0006626617375231053, "loss": 0.018, "step": 1875 }, { "epoch": 3.48, "grad_norm": 0.04209378361701965, "learning_rate": 0.0006580406654343808, "loss": 0.0107, "step": 1900 }, { "epoch": 3.53, "grad_norm": 0.015557551756501198, "learning_rate": 0.0006534195933456563, "loss": 0.0104, "step": 1925 }, { "epoch": 3.57, "grad_norm": 0.021405475214123726, "learning_rate": 0.0006487985212569316, "loss": 0.0117, "step": 1950 }, { "epoch": 3.62, "grad_norm": 0.0015239958884194493, "learning_rate": 0.0006441774491682071, "loss": 0.0176, "step": 1975 }, { "epoch": 3.66, "grad_norm": 0.0997876301407814, "learning_rate": 0.0006395563770794824, "loss": 0.0183, "step": 2000 }, { "epoch": 3.71, "grad_norm": 0.004715020768344402, "learning_rate": 0.0006349353049907579, "loss": 0.0199, "step": 2025 }, { "epoch": 3.75, "grad_norm": 0.1075858548283577, "learning_rate": 0.0006303142329020333, "loss": 0.0201, "step": 2050 }, { "epoch": 3.8, "grad_norm": 0.020496558398008347, "learning_rate": 0.0006256931608133087, "loss": 0.0145, "step": 2075 }, { "epoch": 3.85, "grad_norm": 0.11063025891780853, "learning_rate": 0.0006210720887245841, "loss": 0.0201, "step": 2100 }, { "epoch": 3.89, "grad_norm": 0.1012192815542221, "learning_rate": 0.0006164510166358595, "loss": 0.0223, "step": 2125 }, { "epoch": 3.94, "grad_norm": 0.04694315418601036, "learning_rate": 0.0006118299445471349, "loss": 0.0163, "step": 2150 }, { "epoch": 3.98, "grad_norm": 0.05395512282848358, "learning_rate": 0.0006072088724584103, "loss": 0.009, "step": 2175 }, { "epoch": 4.0, "eval_loss": 0.20004291832447052, "eval_runtime": 464.3668, "eval_samples_per_second": 1.751, "eval_steps_per_second": 0.293, "step": 2184 }, { "epoch": 4.03, "grad_norm": 0.08517912030220032, "learning_rate": 0.0006025878003696857, "loss": 0.0152, "step": 2200 }, { "epoch": 4.08, "grad_norm": 0.23693686723709106, "learning_rate": 0.0005979667282809613, "loss": 0.0123, "step": 2225 }, { "epoch": 4.12, "grad_norm": 0.04390133172273636, "learning_rate": 0.0005933456561922366, "loss": 0.0104, "step": 2250 }, { "epoch": 4.17, "grad_norm": 0.048480305820703506, "learning_rate": 0.0005887245841035121, "loss": 0.0191, "step": 2275 }, { "epoch": 4.21, "grad_norm": 0.07334431260824203, "learning_rate": 0.0005841035120147874, "loss": 0.0079, "step": 2300 }, { "epoch": 4.26, "grad_norm": 0.26686009764671326, "learning_rate": 0.0005794824399260629, "loss": 0.0134, "step": 2325 }, { "epoch": 4.3, "grad_norm": 0.18834412097930908, "learning_rate": 0.0005748613678373382, "loss": 0.0108, "step": 2350 }, { "epoch": 4.35, "grad_norm": 0.11365604400634766, "learning_rate": 0.0005702402957486137, "loss": 0.0116, "step": 2375 }, { "epoch": 4.4, "grad_norm": 0.21077445149421692, "learning_rate": 0.0005656192236598892, "loss": 0.017, "step": 2400 }, { "epoch": 4.44, "grad_norm": 0.14450936019420624, "learning_rate": 0.0005609981515711645, "loss": 0.0056, "step": 2425 }, { "epoch": 4.49, "grad_norm": 0.07659462839365005, "learning_rate": 0.00055637707948244, "loss": 0.0128, "step": 2450 }, { "epoch": 4.53, "grad_norm": 0.07819797843694687, "learning_rate": 0.0005517560073937153, "loss": 0.0085, "step": 2475 }, { "epoch": 4.58, "grad_norm": 0.10529200732707977, "learning_rate": 0.0005471349353049908, "loss": 0.0156, "step": 2500 }, { "epoch": 4.62, "grad_norm": 0.034541305154561996, "learning_rate": 0.0005425138632162662, "loss": 0.0114, "step": 2525 }, { "epoch": 4.67, "grad_norm": 0.0043388293124735355, "learning_rate": 0.0005378927911275416, "loss": 0.0114, "step": 2550 }, { "epoch": 4.72, "grad_norm": 0.09843795001506805, "learning_rate": 0.000533271719038817, "loss": 0.0097, "step": 2575 }, { "epoch": 4.76, "grad_norm": 0.1924191564321518, "learning_rate": 0.0005286506469500924, "loss": 0.0138, "step": 2600 }, { "epoch": 4.81, "grad_norm": 0.0032940045930445194, "learning_rate": 0.0005240295748613678, "loss": 0.009, "step": 2625 }, { "epoch": 4.85, "grad_norm": 0.17411276698112488, "learning_rate": 0.0005194085027726432, "loss": 0.005, "step": 2650 }, { "epoch": 4.9, "grad_norm": 0.0008068850729614496, "learning_rate": 0.0005147874306839186, "loss": 0.0091, "step": 2675 }, { "epoch": 4.95, "grad_norm": 0.013785873539745808, "learning_rate": 0.0005101663585951941, "loss": 0.0174, "step": 2700 }, { "epoch": 4.99, "grad_norm": 0.06957102566957474, "learning_rate": 0.0005055452865064695, "loss": 0.0196, "step": 2725 }, { "epoch": 5.0, "eval_loss": 0.1927657425403595, "eval_runtime": 464.5332, "eval_samples_per_second": 1.75, "eval_steps_per_second": 0.293, "step": 2730 }, { "epoch": 5.04, "grad_norm": 0.1873362511396408, "learning_rate": 0.000500924214417745, "loss": 0.0114, "step": 2750 }, { "epoch": 5.08, "grad_norm": 0.013944294303655624, "learning_rate": 0.0004963031423290203, "loss": 0.0047, "step": 2775 }, { "epoch": 5.13, "grad_norm": 0.14739681780338287, "learning_rate": 0.0004916820702402958, "loss": 0.0064, "step": 2800 }, { "epoch": 5.17, "grad_norm": 0.039295587688684464, "learning_rate": 0.00048706099815157115, "loss": 0.0061, "step": 2825 }, { "epoch": 5.22, "grad_norm": 0.009731476195156574, "learning_rate": 0.0004824399260628466, "loss": 0.0064, "step": 2850 }, { "epoch": 5.27, "grad_norm": 0.009130421094596386, "learning_rate": 0.000477818853974122, "loss": 0.0056, "step": 2875 }, { "epoch": 5.31, "grad_norm": 0.10517439246177673, "learning_rate": 0.0004731977818853974, "loss": 0.0095, "step": 2900 }, { "epoch": 5.36, "grad_norm": 0.03147244080901146, "learning_rate": 0.00046857670979667283, "loss": 0.0069, "step": 2925 }, { "epoch": 5.4, "grad_norm": 0.07550155371427536, "learning_rate": 0.00046395563770794824, "loss": 0.0084, "step": 2950 }, { "epoch": 5.45, "grad_norm": 0.09899873286485672, "learning_rate": 0.00045933456561922365, "loss": 0.0087, "step": 2975 }, { "epoch": 5.49, "grad_norm": 0.062454238533973694, "learning_rate": 0.00045471349353049906, "loss": 0.0114, "step": 3000 }, { "epoch": 5.54, "grad_norm": 0.14996998012065887, "learning_rate": 0.00045009242144177446, "loss": 0.0091, "step": 3025 }, { "epoch": 5.59, "grad_norm": 0.19108814001083374, "learning_rate": 0.00044547134935304987, "loss": 0.0147, "step": 3050 }, { "epoch": 5.63, "grad_norm": 0.14450325071811676, "learning_rate": 0.00044085027726432533, "loss": 0.0152, "step": 3075 }, { "epoch": 5.68, "grad_norm": 0.04423892870545387, "learning_rate": 0.0004362292051756008, "loss": 0.006, "step": 3100 }, { "epoch": 5.72, "grad_norm": 0.13844439387321472, "learning_rate": 0.0004316081330868762, "loss": 0.009, "step": 3125 }, { "epoch": 5.77, "grad_norm": 0.0006735218339599669, "learning_rate": 0.0004269870609981516, "loss": 0.0058, "step": 3150 }, { "epoch": 5.82, "grad_norm": 0.011760660447180271, "learning_rate": 0.000422365988909427, "loss": 0.0049, "step": 3175 }, { "epoch": 5.86, "grad_norm": 0.08969856053590775, "learning_rate": 0.0004177449168207024, "loss": 0.0065, "step": 3200 }, { "epoch": 5.91, "grad_norm": 0.12556907534599304, "learning_rate": 0.00041312384473197783, "loss": 0.0089, "step": 3225 }, { "epoch": 5.95, "grad_norm": 0.017725255340337753, "learning_rate": 0.00040850277264325324, "loss": 0.0088, "step": 3250 }, { "epoch": 6.0, "grad_norm": 0.009897828102111816, "learning_rate": 0.00040388170055452864, "loss": 0.0071, "step": 3275 }, { "epoch": 6.0, "eval_loss": 0.20994354784488678, "eval_runtime": 463.0078, "eval_samples_per_second": 1.756, "eval_steps_per_second": 0.294, "step": 3276 }, { "epoch": 6.04, "grad_norm": 0.0028004159685224295, "learning_rate": 0.00039926062846580405, "loss": 0.0093, "step": 3300 }, { "epoch": 6.09, "grad_norm": 0.10490375012159348, "learning_rate": 0.0003946395563770795, "loss": 0.0053, "step": 3325 }, { "epoch": 6.14, "grad_norm": 0.019779745489358902, "learning_rate": 0.0003900184842883549, "loss": 0.0084, "step": 3350 }, { "epoch": 6.18, "grad_norm": 0.00020589173072949052, "learning_rate": 0.00038539741219963033, "loss": 0.0029, "step": 3375 }, { "epoch": 6.23, "grad_norm": 0.003221085062250495, "learning_rate": 0.00038077634011090574, "loss": 0.0051, "step": 3400 }, { "epoch": 6.27, "grad_norm": 0.00455264188349247, "learning_rate": 0.00037615526802218114, "loss": 0.0063, "step": 3425 }, { "epoch": 6.32, "grad_norm": 0.00967650581151247, "learning_rate": 0.00037153419593345655, "loss": 0.0035, "step": 3450 }, { "epoch": 6.36, "grad_norm": 0.009352604858577251, "learning_rate": 0.00036691312384473196, "loss": 0.0065, "step": 3475 }, { "epoch": 6.41, "grad_norm": 0.002876508515328169, "learning_rate": 0.00036229205175600736, "loss": 0.0049, "step": 3500 }, { "epoch": 6.46, "grad_norm": 0.002914861775934696, "learning_rate": 0.00035767097966728277, "loss": 0.0043, "step": 3525 }, { "epoch": 6.5, "grad_norm": 0.021481545642018318, "learning_rate": 0.0003530499075785583, "loss": 0.0072, "step": 3550 }, { "epoch": 6.55, "grad_norm": 0.08110266923904419, "learning_rate": 0.0003484288354898337, "loss": 0.0044, "step": 3575 }, { "epoch": 6.59, "grad_norm": 0.020943278446793556, "learning_rate": 0.0003438077634011091, "loss": 0.0072, "step": 3600 }, { "epoch": 6.64, "grad_norm": 0.005692564882338047, "learning_rate": 0.0003391866913123845, "loss": 0.0078, "step": 3625 }, { "epoch": 6.68, "grad_norm": 0.11609622091054916, "learning_rate": 0.0003345656192236599, "loss": 0.008, "step": 3650 }, { "epoch": 6.73, "grad_norm": 0.05904560536146164, "learning_rate": 0.0003299445471349353, "loss": 0.0061, "step": 3675 }, { "epoch": 6.78, "grad_norm": 0.03346557542681694, "learning_rate": 0.00032532347504621073, "loss": 0.0069, "step": 3700 }, { "epoch": 6.82, "grad_norm": 0.04848520830273628, "learning_rate": 0.00032070240295748614, "loss": 0.0078, "step": 3725 }, { "epoch": 6.87, "grad_norm": 0.11064545810222626, "learning_rate": 0.00031608133086876155, "loss": 0.0083, "step": 3750 }, { "epoch": 6.91, "grad_norm": 0.001821186626330018, "learning_rate": 0.00031146025878003695, "loss": 0.0084, "step": 3775 }, { "epoch": 6.96, "grad_norm": 0.03919747844338417, "learning_rate": 0.0003068391866913124, "loss": 0.0054, "step": 3800 }, { "epoch": 7.0, "eval_loss": 0.20703129470348358, "eval_runtime": 463.5868, "eval_samples_per_second": 1.754, "eval_steps_per_second": 0.293, "step": 3822 }, { "epoch": 7.01, "grad_norm": 0.008144889958202839, "learning_rate": 0.0003022181146025878, "loss": 0.0057, "step": 3825 }, { "epoch": 7.05, "grad_norm": 0.005378293804824352, "learning_rate": 0.00029759704251386323, "loss": 0.004, "step": 3850 }, { "epoch": 7.1, "grad_norm": 0.03501349315047264, "learning_rate": 0.00029297597042513864, "loss": 0.003, "step": 3875 }, { "epoch": 7.14, "grad_norm": 0.07073014974594116, "learning_rate": 0.00028835489833641404, "loss": 0.0029, "step": 3900 }, { "epoch": 7.19, "grad_norm": 0.09017951786518097, "learning_rate": 0.00028373382624768945, "loss": 0.0027, "step": 3925 }, { "epoch": 7.23, "grad_norm": 0.009881277568638325, "learning_rate": 0.00027911275415896486, "loss": 0.0044, "step": 3950 }, { "epoch": 7.28, "grad_norm": 0.0018990118987858295, "learning_rate": 0.00027449168207024027, "loss": 0.0031, "step": 3975 }, { "epoch": 7.33, "grad_norm": 0.004116680007427931, "learning_rate": 0.00026987060998151567, "loss": 0.0026, "step": 4000 }, { "epoch": 7.37, "grad_norm": 0.03917807340621948, "learning_rate": 0.00026524953789279113, "loss": 0.0038, "step": 4025 }, { "epoch": 7.42, "grad_norm": 0.0030583201441913843, "learning_rate": 0.0002606284658040666, "loss": 0.0032, "step": 4050 }, { "epoch": 7.46, "grad_norm": 0.0014874553307890892, "learning_rate": 0.000256007393715342, "loss": 0.0054, "step": 4075 }, { "epoch": 7.51, "grad_norm": 0.0008628646028228104, "learning_rate": 0.0002513863216266174, "loss": 0.0019, "step": 4100 }, { "epoch": 7.55, "grad_norm": 0.02715575322508812, "learning_rate": 0.00024676524953789276, "loss": 0.0037, "step": 4125 }, { "epoch": 7.6, "grad_norm": 0.0031906655058264732, "learning_rate": 0.00024214417744916822, "loss": 0.0058, "step": 4150 }, { "epoch": 7.65, "grad_norm": 0.011863148771226406, "learning_rate": 0.00023752310536044363, "loss": 0.0022, "step": 4175 }, { "epoch": 7.69, "grad_norm": 0.0015202141366899014, "learning_rate": 0.00023290203327171904, "loss": 0.0045, "step": 4200 }, { "epoch": 7.74, "grad_norm": 0.02240474335849285, "learning_rate": 0.00022828096118299447, "loss": 0.0039, "step": 4225 }, { "epoch": 7.78, "grad_norm": 0.00918908603489399, "learning_rate": 0.00022365988909426988, "loss": 0.0014, "step": 4250 }, { "epoch": 7.83, "grad_norm": 0.005950120277702808, "learning_rate": 0.0002190388170055453, "loss": 0.0031, "step": 4275 }, { "epoch": 7.88, "grad_norm": 0.07433830946683884, "learning_rate": 0.0002144177449168207, "loss": 0.002, "step": 4300 }, { "epoch": 7.92, "grad_norm": 0.09878811240196228, "learning_rate": 0.0002097966728280961, "loss": 0.0039, "step": 4325 }, { "epoch": 7.97, "grad_norm": 0.004627088084816933, "learning_rate": 0.00020517560073937154, "loss": 0.0066, "step": 4350 }, { "epoch": 8.0, "eval_loss": 0.21887589991092682, "eval_runtime": 464.2458, "eval_samples_per_second": 1.751, "eval_steps_per_second": 0.293, "step": 4368 }, { "epoch": 8.01, "grad_norm": 0.016953645274043083, "learning_rate": 0.00020055452865064697, "loss": 0.002, "step": 4375 }, { "epoch": 8.06, "grad_norm": 0.00016232863708864897, "learning_rate": 0.00019593345656192238, "loss": 0.0022, "step": 4400 }, { "epoch": 8.1, "grad_norm": 0.00045125139877200127, "learning_rate": 0.00019131238447319779, "loss": 0.0016, "step": 4425 }, { "epoch": 8.15, "grad_norm": 0.02065761759877205, "learning_rate": 0.0001866913123844732, "loss": 0.0017, "step": 4450 }, { "epoch": 8.2, "grad_norm": 0.042185261845588684, "learning_rate": 0.00018207024029574863, "loss": 0.0027, "step": 4475 }, { "epoch": 8.24, "grad_norm": 0.003087196499109268, "learning_rate": 0.00017744916820702404, "loss": 0.0018, "step": 4500 }, { "epoch": 8.29, "grad_norm": 0.02859407104551792, "learning_rate": 0.00017282809611829944, "loss": 0.0015, "step": 4525 }, { "epoch": 8.33, "grad_norm": 0.00041793755372054875, "learning_rate": 0.00016820702402957485, "loss": 0.0035, "step": 4550 }, { "epoch": 8.38, "grad_norm": 0.0037734461948275566, "learning_rate": 0.00016358595194085026, "loss": 0.002, "step": 4575 }, { "epoch": 8.42, "grad_norm": 0.0030207443051040173, "learning_rate": 0.00015896487985212572, "loss": 0.0022, "step": 4600 }, { "epoch": 8.47, "grad_norm": 0.0026946039870381355, "learning_rate": 0.00015434380776340113, "loss": 0.0028, "step": 4625 }, { "epoch": 8.52, "grad_norm": 0.041892848908901215, "learning_rate": 0.00014972273567467653, "loss": 0.001, "step": 4650 }, { "epoch": 8.56, "grad_norm": 0.06906843930482864, "learning_rate": 0.00014510166358595194, "loss": 0.0013, "step": 4675 }, { "epoch": 8.61, "grad_norm": 0.00067297019995749, "learning_rate": 0.00014048059149722737, "loss": 0.0029, "step": 4700 }, { "epoch": 8.65, "grad_norm": 0.011746911332011223, "learning_rate": 0.00013585951940850278, "loss": 0.0012, "step": 4725 }, { "epoch": 8.7, "grad_norm": 0.0013995037879794836, "learning_rate": 0.0001312384473197782, "loss": 0.0017, "step": 4750 }, { "epoch": 8.75, "grad_norm": 0.009580204263329506, "learning_rate": 0.0001266173752310536, "loss": 0.001, "step": 4775 }, { "epoch": 8.79, "grad_norm": 0.0008843488758429885, "learning_rate": 0.00012199630314232903, "loss": 0.0015, "step": 4800 }, { "epoch": 8.84, "grad_norm": 0.0013571062590926886, "learning_rate": 0.00011737523105360444, "loss": 0.0024, "step": 4825 }, { "epoch": 8.88, "grad_norm": 0.01475840900093317, "learning_rate": 0.00011275415896487985, "loss": 0.0009, "step": 4850 }, { "epoch": 8.93, "grad_norm": 0.008486463688313961, "learning_rate": 0.00010813308687615527, "loss": 0.0022, "step": 4875 }, { "epoch": 8.97, "grad_norm": 0.0005981879075989127, "learning_rate": 0.00010351201478743069, "loss": 0.0006, "step": 4900 }, { "epoch": 9.0, "eval_loss": 0.23254649341106415, "eval_runtime": 463.7576, "eval_samples_per_second": 1.753, "eval_steps_per_second": 0.293, "step": 4914 }, { "epoch": 9.02, "grad_norm": 0.011784604750573635, "learning_rate": 9.889094269870611e-05, "loss": 0.0028, "step": 4925 }, { "epoch": 9.07, "grad_norm": 0.0005488657625392079, "learning_rate": 9.426987060998152e-05, "loss": 0.0016, "step": 4950 }, { "epoch": 9.11, "grad_norm": 0.0024228901602327824, "learning_rate": 8.964879852125694e-05, "loss": 0.001, "step": 4975 }, { "epoch": 9.16, "grad_norm": 0.0021140037570148706, "learning_rate": 8.502772643253234e-05, "loss": 0.0014, "step": 5000 }, { "epoch": 9.2, "grad_norm": 0.0011844311375170946, "learning_rate": 8.040665434380776e-05, "loss": 0.001, "step": 5025 }, { "epoch": 9.25, "grad_norm": 0.011841055937111378, "learning_rate": 7.578558225508319e-05, "loss": 0.0009, "step": 5050 }, { "epoch": 9.29, "grad_norm": 0.013395372778177261, "learning_rate": 7.116451016635859e-05, "loss": 0.0009, "step": 5075 }, { "epoch": 9.34, "grad_norm": 0.05545121058821678, "learning_rate": 6.654343807763401e-05, "loss": 0.0012, "step": 5100 }, { "epoch": 9.39, "grad_norm": 0.01891588233411312, "learning_rate": 6.192236598890943e-05, "loss": 0.0006, "step": 5125 }, { "epoch": 9.43, "grad_norm": 0.0025335114914923906, "learning_rate": 5.730129390018484e-05, "loss": 0.0006, "step": 5150 }, { "epoch": 9.48, "grad_norm": 0.0021167423110455275, "learning_rate": 5.268022181146026e-05, "loss": 0.0007, "step": 5175 }, { "epoch": 9.52, "grad_norm": 0.0011415353510528803, "learning_rate": 4.8059149722735676e-05, "loss": 0.0014, "step": 5200 }, { "epoch": 9.57, "grad_norm": 0.00026013093884103, "learning_rate": 4.343807763401109e-05, "loss": 0.0007, "step": 5225 }, { "epoch": 9.62, "grad_norm": 0.03879648819565773, "learning_rate": 3.8817005545286504e-05, "loss": 0.0007, "step": 5250 }, { "epoch": 9.66, "grad_norm": 0.006720875855535269, "learning_rate": 3.4195933456561925e-05, "loss": 0.0009, "step": 5275 }, { "epoch": 9.71, "grad_norm": 0.006371485069394112, "learning_rate": 2.957486136783734e-05, "loss": 0.0009, "step": 5300 }, { "epoch": 9.75, "grad_norm": 0.012291524559259415, "learning_rate": 2.4953789279112753e-05, "loss": 0.0012, "step": 5325 }, { "epoch": 9.8, "grad_norm": 0.012388636358082294, "learning_rate": 2.033271719038817e-05, "loss": 0.0006, "step": 5350 }, { "epoch": 9.84, "grad_norm": 0.0905984491109848, "learning_rate": 1.5711645101663588e-05, "loss": 0.0011, "step": 5375 }, { "epoch": 9.89, "grad_norm": 0.0024207117967307568, "learning_rate": 1.1090573012939002e-05, "loss": 0.001, "step": 5400 }, { "epoch": 9.94, "grad_norm": 0.003070174716413021, "learning_rate": 6.469500924214418e-06, "loss": 0.0008, "step": 5425 }, { "epoch": 9.98, "grad_norm": 0.012533812783658504, "learning_rate": 1.8484288354898337e-06, "loss": 0.001, "step": 5450 }, { "epoch": 10.0, "eval_loss": 0.23037216067314148, "eval_runtime": 463.1996, "eval_samples_per_second": 1.755, "eval_steps_per_second": 0.294, "step": 5460 }, { "epoch": 10.0, "step": 5460, "total_flos": 1.135723105419264e+20, "train_loss": 0.029495533068592733, "train_runtime": 29108.855, "train_samples_per_second": 1.124, "train_steps_per_second": 0.188 } ], "logging_steps": 25, "max_steps": 5460, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 1.135723105419264e+20, "train_batch_size": 6, "trial_name": null, "trial_params": null }