{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.1402364451691804, "eval_steps": 500, "global_step": 31500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 0.8877573609352112, "learning_rate": 9.98641119717353e-06, "loss": 2.7747, "step": 100 }, { "epoch": 0.01, "grad_norm": 0.891560435295105, "learning_rate": 9.97282239434706e-06, "loss": 2.3805, "step": 200 }, { "epoch": 0.02, "grad_norm": 1.140250563621521, "learning_rate": 9.959233591520588e-06, "loss": 2.2018, "step": 300 }, { "epoch": 0.03, "grad_norm": 1.5957276821136475, "learning_rate": 9.945644788694116e-06, "loss": 2.1426, "step": 400 }, { "epoch": 0.03, "grad_norm": 1.0516589879989624, "learning_rate": 9.932055985867646e-06, "loss": 2.0379, "step": 500 }, { "epoch": 0.03, "eval_codebleu": 0.02236215080832533, "eval_dataflow_match_score": 0.020328127538115238, "eval_loss": 2.005807876586914, "eval_ngram_match_score": 7.828601095920155e-05, "eval_runtime": 211.5866, "eval_samples_per_second": 12.019, "eval_steps_per_second": 1.503, "eval_syntax_match_score": 0.06800422172156935, "eval_weighted_ngram_match_score": 0.0010379679626575443, "step": 500 }, { "epoch": 0.04, "grad_norm": 1.4408249855041504, "learning_rate": 9.918467183041175e-06, "loss": 2.1005, "step": 600 }, { "epoch": 0.05, "grad_norm": 1.042541742324829, "learning_rate": 9.904878380214705e-06, "loss": 2.0475, "step": 700 }, { "epoch": 0.05, "grad_norm": 0.9206609129905701, "learning_rate": 9.891289577388233e-06, "loss": 2.0145, "step": 800 }, { "epoch": 0.06, "grad_norm": 1.2876330614089966, "learning_rate": 9.87770077456176e-06, "loss": 1.9729, "step": 900 }, { "epoch": 0.07, "grad_norm": 1.5496817827224731, "learning_rate": 9.86411197173529e-06, "loss": 1.9755, "step": 1000 }, { "epoch": 0.07, "eval_codebleu": 0.026625421991617458, "eval_dataflow_match_score": 0.017334601907502377, "eval_loss": 1.8824141025543213, "eval_ngram_match_score": 0.0001360979918249884, "eval_runtime": 210.1093, "eval_samples_per_second": 12.103, "eval_steps_per_second": 1.513, "eval_syntax_match_score": 0.085743571008139, "eval_weighted_ngram_match_score": 0.0032874170590034683, "step": 1000 }, { "epoch": 0.07, "grad_norm": 1.479344129562378, "learning_rate": 9.85052316890882e-06, "loss": 1.9526, "step": 1100 }, { "epoch": 0.08, "grad_norm": 1.1165575981140137, "learning_rate": 9.83693436608235e-06, "loss": 1.9212, "step": 1200 }, { "epoch": 0.09, "grad_norm": 1.871772289276123, "learning_rate": 9.823345563255877e-06, "loss": 1.8674, "step": 1300 }, { "epoch": 0.1, "grad_norm": 1.1310107707977295, "learning_rate": 9.809756760429407e-06, "loss": 1.9083, "step": 1400 }, { "epoch": 0.1, "grad_norm": 1.2779932022094727, "learning_rate": 9.796167957602935e-06, "loss": 1.8672, "step": 1500 }, { "epoch": 0.1, "eval_codebleu": 0.03411263800674984, "eval_dataflow_match_score": 0.035922307567354324, "eval_loss": 1.8209562301635742, "eval_ngram_match_score": 0.00019790870128407514, "eval_runtime": 209.7736, "eval_samples_per_second": 12.123, "eval_steps_per_second": 1.516, "eval_syntax_match_score": 0.09630802330065559, "eval_weighted_ngram_match_score": 0.004022312457705368, "step": 1500 }, { "epoch": 0.11, "grad_norm": 1.2445776462554932, "learning_rate": 9.782579154776465e-06, "loss": 1.8163, "step": 1600 }, { "epoch": 0.12, "grad_norm": 1.673412561416626, "learning_rate": 9.768990351949994e-06, "loss": 1.8664, "step": 1700 }, { "epoch": 0.12, "grad_norm": 1.3831863403320312, 
"learning_rate": 9.755401549123524e-06, "loss": 1.8336, "step": 1800 }, { "epoch": 0.13, "grad_norm": 1.5055395364761353, "learning_rate": 9.741812746297052e-06, "loss": 1.879, "step": 1900 }, { "epoch": 0.14, "grad_norm": 1.3736721277236938, "learning_rate": 9.72822394347058e-06, "loss": 1.8522, "step": 2000 }, { "epoch": 0.14, "eval_codebleu": 0.04137599574546961, "eval_dataflow_match_score": 0.06186619636599912, "eval_loss": 1.7707544565200806, "eval_ngram_match_score": 0.00026554637185762897, "eval_runtime": 210.486, "eval_samples_per_second": 12.082, "eval_steps_per_second": 1.511, "eval_syntax_match_score": 0.09913941829547991, "eval_weighted_ngram_match_score": 0.00423282194854177, "step": 2000 }, { "epoch": 0.14, "grad_norm": 1.5695489645004272, "learning_rate": 9.71463514064411e-06, "loss": 1.7906, "step": 2100 }, { "epoch": 0.15, "grad_norm": 1.4151173830032349, "learning_rate": 9.701046337817639e-06, "loss": 1.8182, "step": 2200 }, { "epoch": 0.16, "grad_norm": 1.7115099430084229, "learning_rate": 9.687457534991169e-06, "loss": 1.8232, "step": 2300 }, { "epoch": 0.16, "grad_norm": 1.1658021211624146, "learning_rate": 9.673868732164697e-06, "loss": 1.7381, "step": 2400 }, { "epoch": 0.17, "grad_norm": 1.5483379364013672, "learning_rate": 9.660279929338226e-06, "loss": 1.7707, "step": 2500 }, { "epoch": 0.17, "eval_codebleu": 0.041373696094825684, "eval_dataflow_match_score": 0.06151811199034646, "eval_loss": 1.7324126958847046, "eval_ngram_match_score": 0.00031767245966084565, "eval_runtime": 210.1473, "eval_samples_per_second": 12.101, "eval_steps_per_second": 1.513, "eval_syntax_match_score": 0.09862185146846902, "eval_weighted_ngram_match_score": 0.0050371484608264, "step": 2500 }, { "epoch": 0.18, "grad_norm": 1.138310194015503, "learning_rate": 9.646691126511754e-06, "loss": 1.702, "step": 2600 }, { "epoch": 0.18, "grad_norm": 1.5409811735153198, "learning_rate": 9.633102323685284e-06, "loss": 1.7403, "step": 2700 }, { "epoch": 0.19, "grad_norm": 1.2863227128982544, "learning_rate": 9.619513520858813e-06, "loss": 1.7514, "step": 2800 }, { "epoch": 0.2, "grad_norm": 1.523219347000122, "learning_rate": 9.605924718032343e-06, "loss": 1.7735, "step": 2900 }, { "epoch": 0.2, "grad_norm": 1.6315879821777344, "learning_rate": 9.592335915205871e-06, "loss": 1.7289, "step": 3000 }, { "epoch": 0.2, "eval_codebleu": 0.03736357788677419, "eval_dataflow_match_score": 0.04515814633467152, "eval_loss": 1.6959866285324097, "eval_ngram_match_score": 0.00026766267346779954, "eval_runtime": 209.9032, "eval_samples_per_second": 12.115, "eval_steps_per_second": 1.515, "eval_syntax_match_score": 0.09879437374413931, "eval_weighted_ngram_match_score": 0.0052341287948181195, "step": 3000 }, { "epoch": 0.21, "grad_norm": 1.5990880727767944, "learning_rate": 9.5787471123794e-06, "loss": 1.7275, "step": 3100 }, { "epoch": 0.22, "grad_norm": 1.787031650543213, "learning_rate": 9.565158309552928e-06, "loss": 1.7112, "step": 3200 }, { "epoch": 0.22, "grad_norm": 1.6794238090515137, "learning_rate": 9.551569506726458e-06, "loss": 1.683, "step": 3300 }, { "epoch": 0.23, "grad_norm": 2.1808717250823975, "learning_rate": 9.537980703899988e-06, "loss": 1.8087, "step": 3400 }, { "epoch": 0.24, "grad_norm": 1.737121343612671, "learning_rate": 9.524391901073516e-06, "loss": 1.7739, "step": 3500 }, { "epoch": 0.24, "eval_codebleu": 0.03697189900391692, "eval_dataflow_match_score": 0.03780196319587868, "eval_loss": 1.668063998222351, "eval_ngram_match_score": 0.0003274646611745997, "eval_runtime": 208.872, 
"eval_samples_per_second": 12.175, "eval_steps_per_second": 1.522, "eval_syntax_match_score": 0.1035438105096511, "eval_weighted_ngram_match_score": 0.006214357648963285, "step": 3500 }, { "epoch": 0.24, "grad_norm": 1.6818939447402954, "learning_rate": 9.510803098247045e-06, "loss": 1.7366, "step": 3600 }, { "epoch": 0.25, "grad_norm": 1.4390071630477905, "learning_rate": 9.497214295420573e-06, "loss": 1.6693, "step": 3700 }, { "epoch": 0.26, "grad_norm": 1.443108081817627, "learning_rate": 9.483625492594103e-06, "loss": 1.6856, "step": 3800 }, { "epoch": 0.26, "grad_norm": 1.6540530920028687, "learning_rate": 9.470036689767632e-06, "loss": 1.6965, "step": 3900 }, { "epoch": 0.27, "grad_norm": 1.178989291191101, "learning_rate": 9.456447886941162e-06, "loss": 1.6797, "step": 4000 }, { "epoch": 0.27, "eval_codebleu": 0.033575238882649495, "eval_dataflow_match_score": 0.025967094423688303, "eval_loss": 1.6396487951278687, "eval_ngram_match_score": 0.00023558729997467283, "eval_runtime": 209.0765, "eval_samples_per_second": 12.163, "eval_steps_per_second": 1.521, "eval_syntax_match_score": 0.1018591812295764, "eval_weighted_ngram_match_score": 0.006239092577358598, "step": 4000 }, { "epoch": 0.28, "grad_norm": 1.888967752456665, "learning_rate": 9.44285908411469e-06, "loss": 1.6815, "step": 4100 }, { "epoch": 0.29, "grad_norm": 1.4612641334533691, "learning_rate": 9.42927028128822e-06, "loss": 1.6089, "step": 4200 }, { "epoch": 0.29, "grad_norm": 1.5609626770019531, "learning_rate": 9.415681478461748e-06, "loss": 1.6824, "step": 4300 }, { "epoch": 0.3, "grad_norm": 1.38056218624115, "learning_rate": 9.402092675635277e-06, "loss": 1.679, "step": 4400 }, { "epoch": 0.31, "grad_norm": 1.928477168083191, "learning_rate": 9.388503872808807e-06, "loss": 1.6737, "step": 4500 }, { "epoch": 0.31, "eval_codebleu": 0.03436485059379589, "eval_dataflow_match_score": 0.025433365047687558, "eval_loss": 1.619603157043457, "eval_ngram_match_score": 0.00025042578946432695, "eval_runtime": 210.4293, "eval_samples_per_second": 12.085, "eval_steps_per_second": 1.511, "eval_syntax_match_score": 0.10487324686922811, "eval_weighted_ngram_match_score": 0.006902364668803559, "step": 4500 }, { "epoch": 0.31, "grad_norm": 1.467225432395935, "learning_rate": 9.374915069982335e-06, "loss": 1.642, "step": 4600 }, { "epoch": 0.32, "grad_norm": 1.4231284856796265, "learning_rate": 9.361326267155864e-06, "loss": 1.6238, "step": 4700 }, { "epoch": 0.33, "grad_norm": 1.61458158493042, "learning_rate": 9.347737464329394e-06, "loss": 1.6226, "step": 4800 }, { "epoch": 0.33, "grad_norm": 1.8465476036071777, "learning_rate": 9.334148661502922e-06, "loss": 1.6427, "step": 4900 }, { "epoch": 0.34, "grad_norm": 2.1492788791656494, "learning_rate": 9.320559858676452e-06, "loss": 1.6783, "step": 5000 }, { "epoch": 0.34, "eval_codebleu": 0.03197730480981573, "eval_dataflow_match_score": 0.018842967535330563, "eval_loss": 1.596398115158081, "eval_ngram_match_score": 0.00021050334214284166, "eval_runtime": 209.9696, "eval_samples_per_second": 12.111, "eval_steps_per_second": 1.515, "eval_syntax_match_score": 0.10182873612210518, "eval_weighted_ngram_match_score": 0.0070270122396843505, "step": 5000 }, { "epoch": 0.35, "grad_norm": 1.5661336183547974, "learning_rate": 9.306971055849981e-06, "loss": 1.6227, "step": 5100 }, { "epoch": 0.35, "grad_norm": 1.5441151857376099, "learning_rate": 9.293382253023509e-06, "loss": 1.6525, "step": 5200 }, { "epoch": 0.36, "grad_norm": 1.507628083229065, "learning_rate": 9.279793450197039e-06, "loss": 
1.649, "step": 5300 }, { "epoch": 0.37, "grad_norm": 1.5360280275344849, "learning_rate": 9.266204647370567e-06, "loss": 1.6738, "step": 5400 }, { "epoch": 0.37, "grad_norm": 1.6375775337219238, "learning_rate": 9.252615844544096e-06, "loss": 1.5854, "step": 5500 }, { "epoch": 0.37, "eval_loss": 1.5711854696273804, "eval_runtime": 70.8487, "eval_samples_per_second": 35.893, "eval_steps_per_second": 4.488, "step": 5500 }, { "epoch": 0.38, "grad_norm": 1.5210243463516235, "learning_rate": 9.239027041717626e-06, "loss": 1.621, "step": 5600 }, { "epoch": 0.39, "grad_norm": 1.2972959280014038, "learning_rate": 9.225438238891154e-06, "loss": 1.6124, "step": 5700 }, { "epoch": 0.39, "grad_norm": 1.902424693107605, "learning_rate": 9.211849436064683e-06, "loss": 1.6012, "step": 5800 }, { "epoch": 0.4, "grad_norm": 1.865466594696045, "learning_rate": 9.198260633238213e-06, "loss": 1.5745, "step": 5900 }, { "epoch": 0.41, "grad_norm": 1.6546335220336914, "learning_rate": 9.184671830411741e-06, "loss": 1.5874, "step": 6000 }, { "epoch": 0.41, "eval_loss": 1.553696870803833, "eval_runtime": 70.799, "eval_samples_per_second": 35.919, "eval_steps_per_second": 4.492, "step": 6000 }, { "epoch": 0.41, "grad_norm": 1.7478700876235962, "learning_rate": 9.17108302758527e-06, "loss": 1.5666, "step": 6100 }, { "epoch": 0.42, "grad_norm": 1.5560758113861084, "learning_rate": 9.1574942247588e-06, "loss": 1.5382, "step": 6200 }, { "epoch": 0.43, "grad_norm": 1.4764758348464966, "learning_rate": 9.143905421932328e-06, "loss": 1.5408, "step": 6300 }, { "epoch": 0.43, "grad_norm": 1.6634376049041748, "learning_rate": 9.130316619105858e-06, "loss": 1.5688, "step": 6400 }, { "epoch": 0.44, "grad_norm": 1.673550009727478, "learning_rate": 9.116727816279387e-06, "loss": 1.5445, "step": 6500 }, { "epoch": 0.44, "eval_loss": 1.5321872234344482, "eval_runtime": 70.8079, "eval_samples_per_second": 35.914, "eval_steps_per_second": 4.491, "step": 6500 }, { "epoch": 0.45, "grad_norm": 1.572766900062561, "learning_rate": 9.103139013452915e-06, "loss": 1.5596, "step": 6600 }, { "epoch": 0.46, "grad_norm": 1.4650744199752808, "learning_rate": 9.089550210626445e-06, "loss": 1.5327, "step": 6700 }, { "epoch": 0.46, "grad_norm": 1.8133198022842407, "learning_rate": 9.075961407799973e-06, "loss": 1.5789, "step": 6800 }, { "epoch": 0.47, "grad_norm": 1.6164945363998413, "learning_rate": 9.062372604973503e-06, "loss": 1.4918, "step": 6900 }, { "epoch": 0.48, "grad_norm": 1.7517341375350952, "learning_rate": 9.048783802147032e-06, "loss": 1.4947, "step": 7000 }, { "epoch": 0.48, "eval_loss": 1.5142408609390259, "eval_runtime": 70.7829, "eval_samples_per_second": 35.927, "eval_steps_per_second": 4.493, "step": 7000 }, { "epoch": 0.48, "grad_norm": 2.3255422115325928, "learning_rate": 9.03519499932056e-06, "loss": 1.5434, "step": 7100 }, { "epoch": 0.49, "grad_norm": 2.118253469467163, "learning_rate": 9.02160619649409e-06, "loss": 1.5191, "step": 7200 }, { "epoch": 0.5, "grad_norm": 1.5948559045791626, "learning_rate": 9.008017393667618e-06, "loss": 1.5437, "step": 7300 }, { "epoch": 0.5, "grad_norm": 1.4735013246536255, "learning_rate": 8.994428590841147e-06, "loss": 1.5217, "step": 7400 }, { "epoch": 0.51, "grad_norm": 1.5940884351730347, "learning_rate": 8.980839788014677e-06, "loss": 1.5113, "step": 7500 }, { "epoch": 0.51, "eval_loss": 1.4991660118103027, "eval_runtime": 70.7857, "eval_samples_per_second": 35.925, "eval_steps_per_second": 4.492, "step": 7500 }, { "epoch": 0.52, "grad_norm": 1.7738714218139648, "learning_rate": 
8.967250985188207e-06, "loss": 1.5253, "step": 7600 }, { "epoch": 0.52, "grad_norm": 1.8445935249328613, "learning_rate": 8.953662182361734e-06, "loss": 1.5359, "step": 7700 }, { "epoch": 0.53, "grad_norm": 1.5108485221862793, "learning_rate": 8.940073379535264e-06, "loss": 1.5228, "step": 7800 }, { "epoch": 0.54, "grad_norm": 1.7147557735443115, "learning_rate": 8.926484576708792e-06, "loss": 1.5228, "step": 7900 }, { "epoch": 0.54, "grad_norm": 1.5899161100387573, "learning_rate": 8.912895773882322e-06, "loss": 1.4889, "step": 8000 }, { "epoch": 0.54, "eval_loss": 1.4857795238494873, "eval_runtime": 70.8945, "eval_samples_per_second": 35.87, "eval_steps_per_second": 4.486, "step": 8000 }, { "epoch": 0.55, "grad_norm": 1.6886754035949707, "learning_rate": 8.899306971055851e-06, "loss": 1.4898, "step": 8100 }, { "epoch": 0.56, "grad_norm": 1.5048600435256958, "learning_rate": 8.885718168229381e-06, "loss": 1.5239, "step": 8200 }, { "epoch": 0.56, "grad_norm": 1.8773216009140015, "learning_rate": 8.872129365402909e-06, "loss": 1.4889, "step": 8300 }, { "epoch": 0.57, "grad_norm": 1.7063783407211304, "learning_rate": 8.858540562576437e-06, "loss": 1.4472, "step": 8400 }, { "epoch": 0.58, "grad_norm": 1.5942984819412231, "learning_rate": 8.844951759749966e-06, "loss": 1.4998, "step": 8500 }, { "epoch": 0.58, "eval_loss": 1.4711333513259888, "eval_runtime": 70.9176, "eval_samples_per_second": 35.858, "eval_steps_per_second": 4.484, "step": 8500 }, { "epoch": 0.58, "grad_norm": 1.818272352218628, "learning_rate": 8.831362956923496e-06, "loss": 1.5504, "step": 8600 }, { "epoch": 0.59, "grad_norm": 1.8230923414230347, "learning_rate": 8.817774154097026e-06, "loss": 1.5182, "step": 8700 }, { "epoch": 0.6, "grad_norm": 1.8054814338684082, "learning_rate": 8.804185351270554e-06, "loss": 1.5136, "step": 8800 }, { "epoch": 0.6, "grad_norm": 1.7768468856811523, "learning_rate": 8.790596548444083e-06, "loss": 1.4643, "step": 8900 }, { "epoch": 0.61, "grad_norm": 1.9298087358474731, "learning_rate": 8.777007745617611e-06, "loss": 1.4449, "step": 9000 }, { "epoch": 0.61, "eval_loss": 1.4553042650222778, "eval_runtime": 71.0741, "eval_samples_per_second": 35.78, "eval_steps_per_second": 4.474, "step": 9000 }, { "epoch": 0.62, "grad_norm": 1.979022741317749, "learning_rate": 8.76341894279114e-06, "loss": 1.5057, "step": 9100 }, { "epoch": 0.63, "grad_norm": 1.6144100427627563, "learning_rate": 8.74983013996467e-06, "loss": 1.4893, "step": 9200 }, { "epoch": 0.63, "grad_norm": 1.7385257482528687, "learning_rate": 8.7362413371382e-06, "loss": 1.4427, "step": 9300 }, { "epoch": 0.64, "grad_norm": 2.218280792236328, "learning_rate": 8.722652534311728e-06, "loss": 1.445, "step": 9400 }, { "epoch": 0.65, "grad_norm": 1.760903000831604, "learning_rate": 8.709063731485256e-06, "loss": 1.4364, "step": 9500 }, { "epoch": 0.65, "eval_loss": 1.4402815103530884, "eval_runtime": 70.7715, "eval_samples_per_second": 35.933, "eval_steps_per_second": 4.493, "step": 9500 }, { "epoch": 0.65, "grad_norm": 1.4513877630233765, "learning_rate": 8.695474928658785e-06, "loss": 1.437, "step": 9600 }, { "epoch": 0.66, "grad_norm": 1.703837513923645, "learning_rate": 8.681886125832315e-06, "loss": 1.4565, "step": 9700 }, { "epoch": 0.67, "grad_norm": 2.192049980163574, "learning_rate": 8.668297323005845e-06, "loss": 1.4654, "step": 9800 }, { "epoch": 0.67, "grad_norm": 2.012014150619507, "learning_rate": 8.654708520179373e-06, "loss": 1.4975, "step": 9900 }, { "epoch": 0.68, "grad_norm": 2.117527484893799, "learning_rate": 
8.641119717352902e-06, "loss": 1.4446, "step": 10000 }, { "epoch": 0.68, "eval_loss": 1.4303960800170898, "eval_runtime": 70.9752, "eval_samples_per_second": 35.829, "eval_steps_per_second": 4.48, "step": 10000 }, { "epoch": 0.69, "grad_norm": 2.3108479976654053, "learning_rate": 8.62753091452643e-06, "loss": 1.4801, "step": 10100 }, { "epoch": 0.69, "grad_norm": 1.4589275121688843, "learning_rate": 8.61394211169996e-06, "loss": 1.4579, "step": 10200 }, { "epoch": 0.7, "grad_norm": 1.7688006162643433, "learning_rate": 8.60035330887349e-06, "loss": 1.471, "step": 10300 }, { "epoch": 0.71, "grad_norm": 1.6766855716705322, "learning_rate": 8.586764506047019e-06, "loss": 1.4532, "step": 10400 }, { "epoch": 0.71, "grad_norm": 2.0386102199554443, "learning_rate": 8.573175703220547e-06, "loss": 1.3998, "step": 10500 }, { "epoch": 0.71, "eval_loss": 1.4204617738723755, "eval_runtime": 70.7476, "eval_samples_per_second": 35.945, "eval_steps_per_second": 4.495, "step": 10500 }, { "epoch": 0.72, "grad_norm": 1.9797570705413818, "learning_rate": 8.559586900394075e-06, "loss": 1.3922, "step": 10600 }, { "epoch": 0.73, "grad_norm": 1.7562373876571655, "learning_rate": 8.545998097567605e-06, "loss": 1.4378, "step": 10700 }, { "epoch": 0.73, "grad_norm": 1.6127831935882568, "learning_rate": 8.532409294741134e-06, "loss": 1.4483, "step": 10800 }, { "epoch": 0.74, "grad_norm": 1.6120541095733643, "learning_rate": 8.518820491914664e-06, "loss": 1.3961, "step": 10900 }, { "epoch": 0.75, "grad_norm": 1.5521306991577148, "learning_rate": 8.505231689088192e-06, "loss": 1.4101, "step": 11000 }, { "epoch": 0.75, "eval_loss": 1.4052175283432007, "eval_runtime": 70.9316, "eval_samples_per_second": 35.851, "eval_steps_per_second": 4.483, "step": 11000 }, { "epoch": 0.75, "grad_norm": 2.4100379943847656, "learning_rate": 8.491642886261721e-06, "loss": 1.4224, "step": 11100 }, { "epoch": 0.76, "grad_norm": 1.7542225122451782, "learning_rate": 8.47805408343525e-06, "loss": 1.4238, "step": 11200 }, { "epoch": 0.77, "grad_norm": 2.3809213638305664, "learning_rate": 8.464465280608779e-06, "loss": 1.3968, "step": 11300 }, { "epoch": 0.77, "grad_norm": 1.490343451499939, "learning_rate": 8.450876477782309e-06, "loss": 1.4512, "step": 11400 }, { "epoch": 0.78, "grad_norm": 1.663609504699707, "learning_rate": 8.437287674955838e-06, "loss": 1.4772, "step": 11500 }, { "epoch": 0.78, "eval_loss": 1.3936774730682373, "eval_runtime": 70.914, "eval_samples_per_second": 35.86, "eval_steps_per_second": 4.484, "step": 11500 }, { "epoch": 0.79, "grad_norm": 1.5684208869934082, "learning_rate": 8.423698872129366e-06, "loss": 1.4276, "step": 11600 }, { "epoch": 0.79, "grad_norm": 1.6131608486175537, "learning_rate": 8.410110069302894e-06, "loss": 1.4067, "step": 11700 }, { "epoch": 0.8, "grad_norm": 2.017564058303833, "learning_rate": 8.396521266476424e-06, "loss": 1.4028, "step": 11800 }, { "epoch": 0.81, "grad_norm": 2.383514165878296, "learning_rate": 8.382932463649953e-06, "loss": 1.4028, "step": 11900 }, { "epoch": 0.82, "grad_norm": 2.202026605606079, "learning_rate": 8.369343660823483e-06, "loss": 1.3671, "step": 12000 }, { "epoch": 0.82, "eval_loss": 1.3839792013168335, "eval_runtime": 70.7236, "eval_samples_per_second": 35.957, "eval_steps_per_second": 4.496, "step": 12000 }, { "epoch": 0.82, "grad_norm": 1.499125361442566, "learning_rate": 8.355754857997011e-06, "loss": 1.4193, "step": 12100 }, { "epoch": 0.83, "grad_norm": 1.3109521865844727, "learning_rate": 8.34216605517054e-06, "loss": 1.3969, "step": 12200 }, { "epoch": 
0.84, "grad_norm": 2.689412832260132, "learning_rate": 8.328577252344068e-06, "loss": 1.4141, "step": 12300 }, { "epoch": 0.84, "grad_norm": 1.6615593433380127, "learning_rate": 8.314988449517598e-06, "loss": 1.3512, "step": 12400 }, { "epoch": 0.85, "grad_norm": 1.994040846824646, "learning_rate": 8.301399646691128e-06, "loss": 1.4268, "step": 12500 }, { "epoch": 0.85, "eval_loss": 1.3757482767105103, "eval_runtime": 70.9944, "eval_samples_per_second": 35.82, "eval_steps_per_second": 4.479, "step": 12500 }, { "epoch": 0.86, "grad_norm": 2.2422096729278564, "learning_rate": 8.287810843864657e-06, "loss": 1.3549, "step": 12600 }, { "epoch": 0.86, "grad_norm": 1.4407843351364136, "learning_rate": 8.274222041038185e-06, "loss": 1.3934, "step": 12700 }, { "epoch": 0.87, "grad_norm": 3.7289652824401855, "learning_rate": 8.260633238211713e-06, "loss": 1.3916, "step": 12800 }, { "epoch": 0.88, "grad_norm": 1.819023847579956, "learning_rate": 8.247044435385243e-06, "loss": 1.3878, "step": 12900 }, { "epoch": 0.88, "grad_norm": 1.6075499057769775, "learning_rate": 8.233455632558772e-06, "loss": 1.3469, "step": 13000 }, { "epoch": 0.88, "eval_loss": 1.365922212600708, "eval_runtime": 71.0057, "eval_samples_per_second": 35.814, "eval_steps_per_second": 4.479, "step": 13000 }, { "epoch": 0.89, "grad_norm": 1.9204126596450806, "learning_rate": 8.219866829732302e-06, "loss": 1.3456, "step": 13100 }, { "epoch": 0.9, "grad_norm": 2.0110292434692383, "learning_rate": 8.20627802690583e-06, "loss": 1.3921, "step": 13200 }, { "epoch": 0.9, "grad_norm": 1.4502629041671753, "learning_rate": 8.192689224079358e-06, "loss": 1.383, "step": 13300 }, { "epoch": 0.91, "grad_norm": 2.5011653900146484, "learning_rate": 8.179100421252888e-06, "loss": 1.3413, "step": 13400 }, { "epoch": 0.92, "grad_norm": 1.4338220357894897, "learning_rate": 8.165511618426417e-06, "loss": 1.3531, "step": 13500 }, { "epoch": 0.92, "eval_loss": 1.3567384481430054, "eval_runtime": 70.8084, "eval_samples_per_second": 35.914, "eval_steps_per_second": 4.491, "step": 13500 }, { "epoch": 0.92, "grad_norm": 1.8867733478546143, "learning_rate": 8.151922815599947e-06, "loss": 1.4115, "step": 13600 }, { "epoch": 0.93, "grad_norm": 1.897558331489563, "learning_rate": 8.138334012773476e-06, "loss": 1.3473, "step": 13700 }, { "epoch": 0.94, "grad_norm": 2.6677191257476807, "learning_rate": 8.124745209947004e-06, "loss": 1.3982, "step": 13800 }, { "epoch": 0.94, "grad_norm": 1.6690632104873657, "learning_rate": 8.111156407120532e-06, "loss": 1.3371, "step": 13900 }, { "epoch": 0.95, "grad_norm": 1.668286919593811, "learning_rate": 8.097567604294062e-06, "loss": 1.3463, "step": 14000 }, { "epoch": 0.95, "eval_loss": 1.3470206260681152, "eval_runtime": 70.8694, "eval_samples_per_second": 35.883, "eval_steps_per_second": 4.487, "step": 14000 }, { "epoch": 0.96, "grad_norm": 1.3303313255310059, "learning_rate": 8.083978801467592e-06, "loss": 1.3283, "step": 14100 }, { "epoch": 0.96, "grad_norm": 1.8314011096954346, "learning_rate": 8.070389998641121e-06, "loss": 1.3382, "step": 14200 }, { "epoch": 0.97, "grad_norm": 1.6911287307739258, "learning_rate": 8.056801195814649e-06, "loss": 1.3249, "step": 14300 }, { "epoch": 0.98, "grad_norm": 2.0255990028381348, "learning_rate": 8.043212392988179e-06, "loss": 1.4345, "step": 14400 }, { "epoch": 0.99, "grad_norm": 1.6872771978378296, "learning_rate": 8.029623590161707e-06, "loss": 1.3662, "step": 14500 }, { "epoch": 0.99, "eval_loss": 1.3394687175750732, "eval_runtime": 70.8165, "eval_samples_per_second": 35.91, 
"eval_steps_per_second": 4.49, "step": 14500 }, { "epoch": 0.99, "grad_norm": 1.2456538677215576, "learning_rate": 8.016034787335236e-06, "loss": 1.3152, "step": 14600 }, { "epoch": 1.0, "grad_norm": 1.9343585968017578, "learning_rate": 8.002445984508766e-06, "loss": 1.3179, "step": 14700 }, { "epoch": 1.01, "grad_norm": 1.6026442050933838, "learning_rate": 7.988857181682294e-06, "loss": 1.3445, "step": 14800 }, { "epoch": 1.01, "grad_norm": 1.8159044981002808, "learning_rate": 7.975268378855823e-06, "loss": 1.3259, "step": 14900 }, { "epoch": 1.02, "grad_norm": 1.6430504322052002, "learning_rate": 7.961679576029351e-06, "loss": 1.337, "step": 15000 }, { "epoch": 1.02, "eval_loss": 1.3323568105697632, "eval_runtime": 70.9018, "eval_samples_per_second": 35.867, "eval_steps_per_second": 4.485, "step": 15000 }, { "epoch": 1.03, "grad_norm": 2.036970853805542, "learning_rate": 7.948090773202881e-06, "loss": 1.3217, "step": 15100 }, { "epoch": 1.03, "grad_norm": 1.6756584644317627, "learning_rate": 7.93450197037641e-06, "loss": 1.3008, "step": 15200 }, { "epoch": 1.04, "grad_norm": 1.5923326015472412, "learning_rate": 7.92091316754994e-06, "loss": 1.3347, "step": 15300 }, { "epoch": 1.05, "grad_norm": 1.809383749961853, "learning_rate": 7.907460252751734e-06, "loss": 1.3192, "step": 15400 }, { "epoch": 1.05, "grad_norm": 2.035680055618286, "learning_rate": 7.893871449925262e-06, "loss": 1.3627, "step": 15500 }, { "epoch": 1.05, "eval_loss": 1.3219527006149292, "eval_runtime": 70.9632, "eval_samples_per_second": 35.835, "eval_steps_per_second": 4.481, "step": 15500 }, { "epoch": 1.06, "grad_norm": 1.7485100030899048, "learning_rate": 7.880282647098791e-06, "loss": 1.3015, "step": 15600 }, { "epoch": 1.07, "grad_norm": 2.0771241188049316, "learning_rate": 7.86669384427232e-06, "loss": 1.3261, "step": 15700 }, { "epoch": 1.07, "grad_norm": 1.8625783920288086, "learning_rate": 7.853105041445849e-06, "loss": 1.3308, "step": 15800 }, { "epoch": 1.08, "grad_norm": 1.8347725868225098, "learning_rate": 7.839516238619378e-06, "loss": 1.3637, "step": 15900 }, { "epoch": 1.09, "grad_norm": 1.9449338912963867, "learning_rate": 7.825927435792908e-06, "loss": 1.2906, "step": 16000 }, { "epoch": 1.09, "eval_loss": 1.3169829845428467, "eval_runtime": 70.9704, "eval_samples_per_second": 35.832, "eval_steps_per_second": 4.481, "step": 16000 }, { "epoch": 1.09, "grad_norm": 1.6746830940246582, "learning_rate": 7.812338632966436e-06, "loss": 1.3326, "step": 16100 }, { "epoch": 1.1, "grad_norm": 1.5581905841827393, "learning_rate": 7.798749830139966e-06, "loss": 1.2964, "step": 16200 }, { "epoch": 1.11, "grad_norm": 2.1636734008789062, "learning_rate": 7.785161027313495e-06, "loss": 1.2867, "step": 16300 }, { "epoch": 1.11, "grad_norm": 1.760335922241211, "learning_rate": 7.771572224487023e-06, "loss": 1.3017, "step": 16400 }, { "epoch": 1.12, "grad_norm": 1.9209500551223755, "learning_rate": 7.757983421660553e-06, "loss": 1.331, "step": 16500 }, { "epoch": 1.12, "eval_loss": 1.3085339069366455, "eval_runtime": 70.9762, "eval_samples_per_second": 35.829, "eval_steps_per_second": 4.48, "step": 16500 }, { "epoch": 1.13, "grad_norm": 1.8936728239059448, "learning_rate": 7.74439461883408e-06, "loss": 1.3494, "step": 16600 }, { "epoch": 1.13, "grad_norm": 1.6603670120239258, "learning_rate": 7.73080581600761e-06, "loss": 1.3443, "step": 16700 }, { "epoch": 1.14, "grad_norm": 1.9962695837020874, "learning_rate": 7.71721701318114e-06, "loss": 1.3016, "step": 16800 }, { "epoch": 1.15, "grad_norm": 1.7902451753616333, 
"learning_rate": 7.70362821035467e-06, "loss": 1.3107, "step": 16900 }, { "epoch": 1.16, "grad_norm": 1.7962889671325684, "learning_rate": 7.690039407528197e-06, "loss": 1.3082, "step": 17000 }, { "epoch": 1.16, "eval_loss": 1.3007583618164062, "eval_runtime": 70.8534, "eval_samples_per_second": 35.891, "eval_steps_per_second": 4.488, "step": 17000 }, { "epoch": 1.16, "grad_norm": 1.8448220491409302, "learning_rate": 7.676450604701725e-06, "loss": 1.3148, "step": 17100 }, { "epoch": 1.17, "grad_norm": 2.124708652496338, "learning_rate": 7.662861801875255e-06, "loss": 1.3348, "step": 17200 }, { "epoch": 1.18, "grad_norm": 1.5953021049499512, "learning_rate": 7.649272999048785e-06, "loss": 1.2575, "step": 17300 }, { "epoch": 1.18, "grad_norm": 1.7431753873825073, "learning_rate": 7.635684196222314e-06, "loss": 1.3239, "step": 17400 }, { "epoch": 1.19, "grad_norm": 2.0628209114074707, "learning_rate": 7.622095393395842e-06, "loss": 1.2904, "step": 17500 }, { "epoch": 1.19, "eval_loss": 1.2971383333206177, "eval_runtime": 70.8465, "eval_samples_per_second": 35.894, "eval_steps_per_second": 4.489, "step": 17500 }, { "epoch": 1.2, "grad_norm": 2.154141902923584, "learning_rate": 7.608506590569371e-06, "loss": 1.2824, "step": 17600 }, { "epoch": 1.2, "grad_norm": 1.7325314283370972, "learning_rate": 7.595053675771166e-06, "loss": 1.2587, "step": 17700 }, { "epoch": 1.21, "grad_norm": 1.7533044815063477, "learning_rate": 7.581464872944694e-06, "loss": 1.2697, "step": 17800 }, { "epoch": 1.22, "grad_norm": 1.642408847808838, "learning_rate": 7.567876070118223e-06, "loss": 1.2587, "step": 17900 }, { "epoch": 1.22, "grad_norm": 2.0030946731567383, "learning_rate": 7.554287267291752e-06, "loss": 1.2825, "step": 18000 }, { "epoch": 1.22, "eval_loss": 1.2882283926010132, "eval_runtime": 70.9711, "eval_samples_per_second": 35.831, "eval_steps_per_second": 4.481, "step": 18000 }, { "epoch": 1.23, "grad_norm": 1.828447699546814, "learning_rate": 7.540698464465281e-06, "loss": 1.2162, "step": 18100 }, { "epoch": 1.24, "grad_norm": 1.9078677892684937, "learning_rate": 7.527109661638811e-06, "loss": 1.2618, "step": 18200 }, { "epoch": 1.24, "grad_norm": 1.7438205480575562, "learning_rate": 7.513520858812339e-06, "loss": 1.2972, "step": 18300 }, { "epoch": 1.25, "grad_norm": 1.5308886766433716, "learning_rate": 7.499932055985868e-06, "loss": 1.2635, "step": 18400 }, { "epoch": 1.26, "grad_norm": 1.7570804357528687, "learning_rate": 7.486343253159397e-06, "loss": 1.3104, "step": 18500 }, { "epoch": 1.26, "eval_loss": 1.2821784019470215, "eval_runtime": 70.9616, "eval_samples_per_second": 35.836, "eval_steps_per_second": 4.481, "step": 18500 }, { "epoch": 1.26, "grad_norm": 1.820691466331482, "learning_rate": 7.472754450332927e-06, "loss": 1.2978, "step": 18600 }, { "epoch": 1.27, "grad_norm": 2.0996739864349365, "learning_rate": 7.4591656475064555e-06, "loss": 1.2925, "step": 18700 }, { "epoch": 1.28, "grad_norm": 1.6602342128753662, "learning_rate": 7.445576844679985e-06, "loss": 1.243, "step": 18800 }, { "epoch": 1.28, "grad_norm": 2.034649133682251, "learning_rate": 7.431988041853513e-06, "loss": 1.2775, "step": 18900 }, { "epoch": 1.29, "grad_norm": 1.898582100868225, "learning_rate": 7.418399239027042e-06, "loss": 1.2786, "step": 19000 }, { "epoch": 1.29, "eval_loss": 1.2745426893234253, "eval_runtime": 70.9759, "eval_samples_per_second": 35.829, "eval_steps_per_second": 4.48, "step": 19000 }, { "epoch": 1.3, "grad_norm": 1.4128785133361816, "learning_rate": 7.4048104362005715e-06, "loss": 1.2656, "step": 
19100 }, { "epoch": 1.3, "grad_norm": 2.2971534729003906, "learning_rate": 7.3912216333741e-06, "loss": 1.2426, "step": 19200 }, { "epoch": 1.31, "grad_norm": 1.783996820449829, "learning_rate": 7.37763283054763e-06, "loss": 1.272, "step": 19300 }, { "epoch": 1.32, "grad_norm": 1.8958848714828491, "learning_rate": 7.364044027721159e-06, "loss": 1.2168, "step": 19400 }, { "epoch": 1.32, "grad_norm": 2.1363043785095215, "learning_rate": 7.350455224894687e-06, "loss": 1.2734, "step": 19500 }, { "epoch": 1.32, "eval_loss": 1.2699941396713257, "eval_runtime": 71.0334, "eval_samples_per_second": 35.8, "eval_steps_per_second": 4.477, "step": 19500 }, { "epoch": 1.33, "grad_norm": 2.642695903778076, "learning_rate": 7.336866422068216e-06, "loss": 1.3135, "step": 19600 }, { "epoch": 1.34, "grad_norm": 1.8240422010421753, "learning_rate": 7.323277619241746e-06, "loss": 1.2924, "step": 19700 }, { "epoch": 1.35, "grad_norm": 1.8623692989349365, "learning_rate": 7.309688816415275e-06, "loss": 1.2907, "step": 19800 }, { "epoch": 1.35, "grad_norm": 2.2778708934783936, "learning_rate": 7.296100013588804e-06, "loss": 1.2727, "step": 19900 }, { "epoch": 1.36, "grad_norm": 1.8957061767578125, "learning_rate": 7.282511210762332e-06, "loss": 1.2656, "step": 20000 }, { "epoch": 1.36, "eval_loss": 1.2644336223602295, "eval_runtime": 70.9373, "eval_samples_per_second": 35.849, "eval_steps_per_second": 4.483, "step": 20000 }, { "epoch": 1.37, "grad_norm": 1.7855497598648071, "learning_rate": 7.268922407935861e-06, "loss": 1.2158, "step": 20100 }, { "epoch": 1.37, "grad_norm": 1.8924943208694458, "learning_rate": 7.2553336051093905e-06, "loss": 1.2753, "step": 20200 }, { "epoch": 1.38, "grad_norm": 2.0177762508392334, "learning_rate": 7.241880690311184e-06, "loss": 1.2406, "step": 20300 }, { "epoch": 1.39, "grad_norm": 2.4161458015441895, "learning_rate": 7.228291887484713e-06, "loss": 1.2453, "step": 20400 }, { "epoch": 1.39, "grad_norm": 1.7210235595703125, "learning_rate": 7.214703084658242e-06, "loss": 1.2107, "step": 20500 }, { "epoch": 1.39, "eval_loss": 1.258453607559204, "eval_runtime": 71.0477, "eval_samples_per_second": 35.793, "eval_steps_per_second": 4.476, "step": 20500 }, { "epoch": 1.4, "grad_norm": 1.5457173585891724, "learning_rate": 7.201114281831771e-06, "loss": 1.2377, "step": 20600 }, { "epoch": 1.41, "grad_norm": 2.403831720352173, "learning_rate": 7.187525479005301e-06, "loss": 1.238, "step": 20700 }, { "epoch": 1.41, "grad_norm": 2.01042103767395, "learning_rate": 7.173936676178829e-06, "loss": 1.2528, "step": 20800 }, { "epoch": 1.42, "grad_norm": 2.197006940841675, "learning_rate": 7.160347873352358e-06, "loss": 1.2395, "step": 20900 }, { "epoch": 1.43, "grad_norm": 2.503634214401245, "learning_rate": 7.146759070525887e-06, "loss": 1.2822, "step": 21000 }, { "epoch": 1.43, "eval_loss": 1.2508896589279175, "eval_runtime": 71.0512, "eval_samples_per_second": 35.791, "eval_steps_per_second": 4.476, "step": 21000 }, { "epoch": 1.43, "grad_norm": 1.4275486469268799, "learning_rate": 7.133170267699417e-06, "loss": 1.2337, "step": 21100 }, { "epoch": 1.44, "grad_norm": 2.1461949348449707, "learning_rate": 7.11971735290121e-06, "loss": 1.2576, "step": 21200 }, { "epoch": 1.45, "grad_norm": 1.705665111541748, "learning_rate": 7.106128550074739e-06, "loss": 1.2311, "step": 21300 }, { "epoch": 1.45, "grad_norm": 2.058223247528076, "learning_rate": 7.0925397472482685e-06, "loss": 1.2619, "step": 21400 }, { "epoch": 1.46, "grad_norm": 1.4659618139266968, "learning_rate": 7.078950944421797e-06, "loss": 
1.2188, "step": 21500 }, { "epoch": 1.46, "eval_loss": 1.2478315830230713, "eval_runtime": 70.8088, "eval_samples_per_second": 35.914, "eval_steps_per_second": 4.491, "step": 21500 }, { "epoch": 1.47, "grad_norm": 1.7102398872375488, "learning_rate": 7.065362141595325e-06, "loss": 1.2553, "step": 21600 }, { "epoch": 1.47, "grad_norm": 2.445326089859009, "learning_rate": 7.051773338768855e-06, "loss": 1.2687, "step": 21700 }, { "epoch": 1.48, "grad_norm": 2.055088758468628, "learning_rate": 7.038184535942384e-06, "loss": 1.254, "step": 21800 }, { "epoch": 1.49, "grad_norm": 2.5781538486480713, "learning_rate": 7.024595733115913e-06, "loss": 1.2319, "step": 21900 }, { "epoch": 1.49, "grad_norm": 1.9507685899734497, "learning_rate": 7.011006930289442e-06, "loss": 1.2185, "step": 22000 }, { "epoch": 1.49, "eval_loss": 1.245086908340454, "eval_runtime": 71.042, "eval_samples_per_second": 35.796, "eval_steps_per_second": 4.476, "step": 22000 }, { "epoch": 1.5, "grad_norm": 2.347245216369629, "learning_rate": 6.997418127462972e-06, "loss": 1.2024, "step": 22100 }, { "epoch": 1.51, "grad_norm": 1.6178568601608276, "learning_rate": 6.9838293246364995e-06, "loss": 1.2379, "step": 22200 }, { "epoch": 1.52, "grad_norm": 1.7551498413085938, "learning_rate": 6.970240521810029e-06, "loss": 1.2344, "step": 22300 }, { "epoch": 1.52, "grad_norm": 1.9250737428665161, "learning_rate": 6.956651718983558e-06, "loss": 1.2942, "step": 22400 }, { "epoch": 1.53, "grad_norm": 1.640271782875061, "learning_rate": 6.9430629161570876e-06, "loss": 1.2441, "step": 22500 }, { "epoch": 1.53, "eval_loss": 1.2352341413497925, "eval_runtime": 70.9437, "eval_samples_per_second": 35.845, "eval_steps_per_second": 4.482, "step": 22500 }, { "epoch": 1.54, "grad_norm": 2.083061456680298, "learning_rate": 6.929474113330616e-06, "loss": 1.1819, "step": 22600 }, { "epoch": 1.54, "grad_norm": 1.8274168968200684, "learning_rate": 6.915885310504146e-06, "loss": 1.2243, "step": 22700 }, { "epoch": 1.55, "grad_norm": 1.7711529731750488, "learning_rate": 6.902296507677674e-06, "loss": 1.2318, "step": 22800 }, { "epoch": 1.56, "grad_norm": 2.0537028312683105, "learning_rate": 6.888707704851203e-06, "loss": 1.1958, "step": 22900 }, { "epoch": 1.56, "grad_norm": 1.8466728925704956, "learning_rate": 6.875118902024732e-06, "loss": 1.2564, "step": 23000 }, { "epoch": 1.56, "eval_loss": 1.2326687574386597, "eval_runtime": 70.9545, "eval_samples_per_second": 35.84, "eval_steps_per_second": 4.482, "step": 23000 }, { "epoch": 1.57, "grad_norm": 1.6301406621932983, "learning_rate": 6.861530099198261e-06, "loss": 1.1904, "step": 23100 }, { "epoch": 1.58, "grad_norm": 2.1303887367248535, "learning_rate": 6.847941296371791e-06, "loss": 1.2, "step": 23200 }, { "epoch": 1.58, "grad_norm": 2.042210340499878, "learning_rate": 6.834352493545319e-06, "loss": 1.2432, "step": 23300 }, { "epoch": 1.59, "grad_norm": 1.8403574228286743, "learning_rate": 6.820763690718848e-06, "loss": 1.2195, "step": 23400 }, { "epoch": 1.6, "grad_norm": 1.8628817796707153, "learning_rate": 6.807174887892377e-06, "loss": 1.2032, "step": 23500 }, { "epoch": 1.6, "eval_loss": 1.2271267175674438, "eval_runtime": 70.9472, "eval_samples_per_second": 35.844, "eval_steps_per_second": 4.482, "step": 23500 }, { "epoch": 1.6, "grad_norm": 2.2309892177581787, "learning_rate": 6.793586085065907e-06, "loss": 1.1931, "step": 23600 }, { "epoch": 1.61, "grad_norm": 1.4337612390518188, "learning_rate": 6.7799972822394354e-06, "loss": 1.2401, "step": 23700 }, { "epoch": 1.62, "grad_norm": 
1.7968145608901978, "learning_rate": 6.766408479412965e-06, "loss": 1.233, "step": 23800 }, { "epoch": 1.62, "grad_norm": 1.7918980121612549, "learning_rate": 6.7529555646147584e-06, "loss": 1.1874, "step": 23900 }, { "epoch": 1.63, "grad_norm": 1.9370090961456299, "learning_rate": 6.739366761788287e-06, "loss": 1.2031, "step": 24000 }, { "epoch": 1.63, "eval_loss": 1.2228479385375977, "eval_runtime": 70.9391, "eval_samples_per_second": 35.848, "eval_steps_per_second": 4.483, "step": 24000 }, { "epoch": 1.64, "grad_norm": 2.238128900527954, "learning_rate": 6.725777958961815e-06, "loss": 1.2207, "step": 24100 }, { "epoch": 1.64, "grad_norm": 1.9183790683746338, "learning_rate": 6.712189156135345e-06, "loss": 1.2263, "step": 24200 }, { "epoch": 1.65, "grad_norm": 2.407428026199341, "learning_rate": 6.6986003533088736e-06, "loss": 1.2424, "step": 24300 }, { "epoch": 1.66, "grad_norm": 1.837365746498108, "learning_rate": 6.685011550482403e-06, "loss": 1.1832, "step": 24400 }, { "epoch": 1.66, "grad_norm": 1.9926724433898926, "learning_rate": 6.671422747655932e-06, "loss": 1.2088, "step": 24500 }, { "epoch": 1.66, "eval_loss": 1.2178888320922852, "eval_runtime": 71.0012, "eval_samples_per_second": 35.816, "eval_steps_per_second": 4.479, "step": 24500 }, { "epoch": 1.67, "grad_norm": 1.6491261720657349, "learning_rate": 6.657833944829462e-06, "loss": 1.2103, "step": 24600 }, { "epoch": 1.68, "grad_norm": 1.825826644897461, "learning_rate": 6.6442451420029895e-06, "loss": 1.1911, "step": 24700 }, { "epoch": 1.69, "grad_norm": 1.788091778755188, "learning_rate": 6.630656339176519e-06, "loss": 1.2246, "step": 24800 }, { "epoch": 1.69, "grad_norm": 1.8941233158111572, "learning_rate": 6.617067536350048e-06, "loss": 1.197, "step": 24900 }, { "epoch": 1.7, "grad_norm": 2.360272169113159, "learning_rate": 6.6034787335235775e-06, "loss": 1.1925, "step": 25000 }, { "epoch": 1.7, "eval_loss": 1.2120610475540161, "eval_runtime": 70.9625, "eval_samples_per_second": 35.836, "eval_steps_per_second": 4.481, "step": 25000 }, { "epoch": 1.71, "grad_norm": 2.0026679039001465, "learning_rate": 6.589889930697106e-06, "loss": 1.2063, "step": 25100 }, { "epoch": 1.71, "grad_norm": 1.9979290962219238, "learning_rate": 6.576301127870636e-06, "loss": 1.1784, "step": 25200 }, { "epoch": 1.72, "grad_norm": 1.682900071144104, "learning_rate": 6.562712325044164e-06, "loss": 1.1587, "step": 25300 }, { "epoch": 1.73, "grad_norm": 2.0586678981781006, "learning_rate": 6.549123522217693e-06, "loss": 1.2031, "step": 25400 }, { "epoch": 1.73, "grad_norm": 2.5424463748931885, "learning_rate": 6.535534719391222e-06, "loss": 1.2061, "step": 25500 }, { "epoch": 1.73, "eval_loss": 1.209425449371338, "eval_runtime": 70.8814, "eval_samples_per_second": 35.877, "eval_steps_per_second": 4.486, "step": 25500 }, { "epoch": 1.74, "grad_norm": 2.0070347785949707, "learning_rate": 6.521945916564751e-06, "loss": 1.2175, "step": 25600 }, { "epoch": 1.75, "grad_norm": 1.7913732528686523, "learning_rate": 6.508357113738281e-06, "loss": 1.2005, "step": 25700 }, { "epoch": 1.75, "grad_norm": 1.552306890487671, "learning_rate": 6.494768310911809e-06, "loss": 1.2011, "step": 25800 }, { "epoch": 1.76, "grad_norm": 3.1545894145965576, "learning_rate": 6.481179508085338e-06, "loss": 1.17, "step": 25900 }, { "epoch": 1.77, "grad_norm": 1.7653687000274658, "learning_rate": 6.467590705258867e-06, "loss": 1.1984, "step": 26000 }, { "epoch": 1.77, "eval_loss": 1.2038514614105225, "eval_runtime": 70.8478, "eval_samples_per_second": 35.894, 
"eval_steps_per_second": 4.488, "step": 26000 }, { "epoch": 1.77, "grad_norm": 1.9187010526657104, "learning_rate": 6.454001902432397e-06, "loss": 1.2051, "step": 26100 }, { "epoch": 1.78, "grad_norm": 1.6188615560531616, "learning_rate": 6.440413099605925e-06, "loss": 1.1712, "step": 26200 }, { "epoch": 1.79, "grad_norm": 1.9360331296920776, "learning_rate": 6.426824296779455e-06, "loss": 1.1531, "step": 26300 }, { "epoch": 1.79, "grad_norm": 2.710357189178467, "learning_rate": 6.413235493952983e-06, "loss": 1.1707, "step": 26400 }, { "epoch": 1.8, "grad_norm": 2.3331565856933594, "learning_rate": 6.399646691126512e-06, "loss": 1.1929, "step": 26500 }, { "epoch": 1.8, "eval_loss": 1.2011253833770752, "eval_runtime": 70.8706, "eval_samples_per_second": 35.882, "eval_steps_per_second": 4.487, "step": 26500 }, { "epoch": 1.81, "grad_norm": 2.030912399291992, "learning_rate": 6.386057888300041e-06, "loss": 1.1986, "step": 26600 }, { "epoch": 1.81, "grad_norm": 2.1584174633026123, "learning_rate": 6.37246908547357e-06, "loss": 1.2564, "step": 26700 }, { "epoch": 1.82, "grad_norm": 2.3068361282348633, "learning_rate": 6.3588802826471e-06, "loss": 1.1933, "step": 26800 }, { "epoch": 1.83, "grad_norm": 1.5643947124481201, "learning_rate": 6.3452914798206285e-06, "loss": 1.1691, "step": 26900 }, { "epoch": 1.83, "grad_norm": 2.531083822250366, "learning_rate": 6.331702676994157e-06, "loss": 1.1387, "step": 27000 }, { "epoch": 1.83, "eval_loss": 1.1969281435012817, "eval_runtime": 70.8053, "eval_samples_per_second": 35.915, "eval_steps_per_second": 4.491, "step": 27000 }, { "epoch": 1.84, "grad_norm": 1.800430417060852, "learning_rate": 6.318113874167686e-06, "loss": 1.1646, "step": 27100 }, { "epoch": 1.85, "grad_norm": 1.8300161361694336, "learning_rate": 6.304525071341216e-06, "loss": 1.2104, "step": 27200 }, { "epoch": 1.85, "grad_norm": 1.7128742933273315, "learning_rate": 6.2909362685147445e-06, "loss": 1.1932, "step": 27300 }, { "epoch": 1.86, "grad_norm": 1.9414857625961304, "learning_rate": 6.277347465688274e-06, "loss": 1.1818, "step": 27400 }, { "epoch": 1.87, "grad_norm": 1.9951707124710083, "learning_rate": 6.263758662861802e-06, "loss": 1.2024, "step": 27500 }, { "epoch": 1.87, "eval_loss": 1.1932079792022705, "eval_runtime": 70.8009, "eval_samples_per_second": 35.918, "eval_steps_per_second": 4.491, "step": 27500 }, { "epoch": 1.88, "grad_norm": 1.9903321266174316, "learning_rate": 6.250169860035331e-06, "loss": 1.1824, "step": 27600 }, { "epoch": 1.88, "grad_norm": 2.8851983547210693, "learning_rate": 6.2365810572088605e-06, "loss": 1.1851, "step": 27700 }, { "epoch": 1.89, "grad_norm": 2.0669078826904297, "learning_rate": 6.222992254382389e-06, "loss": 1.1646, "step": 27800 }, { "epoch": 1.9, "grad_norm": 1.9089607000350952, "learning_rate": 6.209403451555919e-06, "loss": 1.1776, "step": 27900 }, { "epoch": 1.9, "grad_norm": 2.2551538944244385, "learning_rate": 6.195814648729448e-06, "loss": 1.1909, "step": 28000 }, { "epoch": 1.9, "eval_loss": 1.1877614259719849, "eval_runtime": 70.8691, "eval_samples_per_second": 35.883, "eval_steps_per_second": 4.487, "step": 28000 }, { "epoch": 1.91, "grad_norm": 1.5612294673919678, "learning_rate": 6.182225845902976e-06, "loss": 1.1559, "step": 28100 }, { "epoch": 1.92, "grad_norm": 2.8218579292297363, "learning_rate": 6.168637043076505e-06, "loss": 1.1652, "step": 28200 }, { "epoch": 1.92, "grad_norm": 1.9702138900756836, "learning_rate": 6.155048240250035e-06, "loss": 1.1917, "step": 28300 }, { "epoch": 1.93, "grad_norm": 2.3673105239868164, 
"learning_rate": 6.141459437423564e-06, "loss": 1.1336, "step": 28400 }, { "epoch": 1.94, "grad_norm": 1.8467798233032227, "learning_rate": 6.127870634597093e-06, "loss": 1.1786, "step": 28500 }, { "epoch": 1.94, "eval_loss": 1.1837141513824463, "eval_runtime": 70.7666, "eval_samples_per_second": 35.935, "eval_steps_per_second": 4.494, "step": 28500 }, { "epoch": 1.94, "grad_norm": 1.830837368965149, "learning_rate": 6.114281831770621e-06, "loss": 1.1908, "step": 28600 }, { "epoch": 1.95, "grad_norm": 1.7194899320602417, "learning_rate": 6.10069302894415e-06, "loss": 1.1541, "step": 28700 }, { "epoch": 1.96, "grad_norm": 1.798368215560913, "learning_rate": 6.0871042261176796e-06, "loss": 1.1487, "step": 28800 }, { "epoch": 1.96, "grad_norm": 1.9699339866638184, "learning_rate": 6.073515423291208e-06, "loss": 1.166, "step": 28900 }, { "epoch": 1.97, "grad_norm": 2.1379947662353516, "learning_rate": 6.059926620464738e-06, "loss": 1.1724, "step": 29000 }, { "epoch": 1.97, "eval_loss": 1.181123971939087, "eval_runtime": 92.5562, "eval_samples_per_second": 27.475, "eval_steps_per_second": 3.436, "step": 29000 }, { "epoch": 1.98, "grad_norm": 2.405134439468384, "learning_rate": 6.046337817638267e-06, "loss": 1.1651, "step": 29100 }, { "epoch": 1.98, "grad_norm": 1.8744902610778809, "learning_rate": 6.032749014811795e-06, "loss": 1.201, "step": 29200 }, { "epoch": 1.99, "grad_norm": 3.014401435852051, "learning_rate": 6.019160211985324e-06, "loss": 1.1783, "step": 29300 }, { "epoch": 2.0, "grad_norm": 2.104191780090332, "learning_rate": 6.005571409158854e-06, "loss": 1.1479, "step": 29400 }, { "epoch": 2.0, "grad_norm": 1.9670746326446533, "learning_rate": 5.991982606332383e-06, "loss": 1.1372, "step": 29500 }, { "epoch": 2.0, "eval_loss": 1.176620602607727, "eval_runtime": 92.464, "eval_samples_per_second": 27.503, "eval_steps_per_second": 3.439, "step": 29500 }, { "epoch": 2.01, "grad_norm": 2.3720388412475586, "learning_rate": 5.978393803505912e-06, "loss": 1.1476, "step": 29600 }, { "epoch": 2.02, "grad_norm": 2.047060251235962, "learning_rate": 5.964805000679441e-06, "loss": 1.1303, "step": 29700 }, { "epoch": 2.02, "grad_norm": 1.9792667627334595, "learning_rate": 5.951216197852969e-06, "loss": 1.1462, "step": 29800 }, { "epoch": 2.03, "grad_norm": 2.241187572479248, "learning_rate": 5.937627395026499e-06, "loss": 1.1031, "step": 29900 }, { "epoch": 2.04, "grad_norm": 2.0233969688415527, "learning_rate": 5.9240385922000274e-06, "loss": 1.1396, "step": 30000 }, { "epoch": 2.04, "eval_loss": 1.1728562116622925, "eval_runtime": 92.5707, "eval_samples_per_second": 27.471, "eval_steps_per_second": 3.435, "step": 30000 }, { "epoch": 2.05, "grad_norm": 2.0683882236480713, "learning_rate": 5.910449789373557e-06, "loss": 1.1431, "step": 30100 }, { "epoch": 2.05, "grad_norm": 1.9208968877792358, "learning_rate": 5.896860986547086e-06, "loss": 1.1391, "step": 30200 }, { "epoch": 2.06, "grad_norm": 1.6621592044830322, "learning_rate": 5.883272183720614e-06, "loss": 1.1361, "step": 30300 }, { "epoch": 2.07, "grad_norm": 1.9728045463562012, "learning_rate": 5.869683380894143e-06, "loss": 1.1606, "step": 30400 }, { "epoch": 2.07, "grad_norm": 2.3189892768859863, "learning_rate": 5.856094578067672e-06, "loss": 1.1565, "step": 30500 }, { "epoch": 2.07, "eval_loss": 1.1692627668380737, "eval_runtime": 92.5367, "eval_samples_per_second": 27.481, "eval_steps_per_second": 3.436, "step": 30500 }, { "epoch": 2.08, "grad_norm": 1.936689853668213, "learning_rate": 5.842505775241202e-06, "loss": 1.1294, "step": 
30600 }, { "epoch": 2.09, "grad_norm": 1.4617129564285278, "learning_rate": 5.8289169724147306e-06, "loss": 1.1591, "step": 30700 }, { "epoch": 2.09, "grad_norm": 1.5474071502685547, "learning_rate": 5.81532816958826e-06, "loss": 1.1377, "step": 30800 }, { "epoch": 2.1, "grad_norm": 1.7175779342651367, "learning_rate": 5.801739366761788e-06, "loss": 1.1532, "step": 30900 }, { "epoch": 2.11, "grad_norm": 1.8924795389175415, "learning_rate": 5.788150563935318e-06, "loss": 1.1002, "step": 31000 }, { "epoch": 2.11, "eval_loss": 1.1667861938476562, "eval_runtime": 92.475, "eval_samples_per_second": 27.499, "eval_steps_per_second": 3.439, "step": 31000 }, { "epoch": 2.11, "grad_norm": 2.3616528511047363, "learning_rate": 5.7745617611088465e-06, "loss": 1.1616, "step": 31100 }, { "epoch": 2.12, "grad_norm": 1.7967276573181152, "learning_rate": 5.760972958282376e-06, "loss": 1.1817, "step": 31200 }, { "epoch": 2.13, "grad_norm": 2.9053776264190674, "learning_rate": 5.747384155455905e-06, "loss": 1.1611, "step": 31300 }, { "epoch": 2.13, "grad_norm": 2.2042810916900635, "learning_rate": 5.7337953526294346e-06, "loss": 1.1394, "step": 31400 }, { "epoch": 2.14, "grad_norm": 1.8876034021377563, "learning_rate": 5.7202065498029625e-06, "loss": 1.1171, "step": 31500 }, { "epoch": 2.14, "eval_loss": 1.1626156568527222, "eval_runtime": 92.4663, "eval_samples_per_second": 27.502, "eval_steps_per_second": 3.439, "step": 31500 } ], "logging_steps": 100, "max_steps": 73590, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "total_flos": 1.55848975386624e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }