{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9871244635193133, "eval_steps": 500, "global_step": 522, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02861230329041488, "grad_norm": 2827.3015022702225, "learning_rate": 9.766490138119515e-06, "loss": 6.1313, "step": 5 }, { "epoch": 0.05722460658082976, "grad_norm": 516.297994115725, "learning_rate": 1.3972688495262568e-05, "loss": 3.18, "step": 10 }, { "epoch": 0.08583690987124463, "grad_norm": 10.64334537255521, "learning_rate": 1.6433156804786183e-05, "loss": 0.6962, "step": 15 }, { "epoch": 0.11444921316165951, "grad_norm": 21.79989086330797, "learning_rate": 1.8178886852405614e-05, "loss": 0.4631, "step": 20 }, { "epoch": 0.1430615164520744, "grad_norm": 8.407923664935195, "learning_rate": 1.953298027623903e-05, "loss": 0.3384, "step": 25 }, { "epoch": 0.17167381974248927, "grad_norm": 4.980763069359495, "learning_rate": 2e-05, "loss": 0.2112, "step": 30 }, { "epoch": 0.20028612303290416, "grad_norm": 3.3050323983887084, "learning_rate": 2e-05, "loss": 0.1508, "step": 35 }, { "epoch": 0.22889842632331903, "grad_norm": 2.747487969978562, "learning_rate": 2e-05, "loss": 0.1331, "step": 40 }, { "epoch": 0.2575107296137339, "grad_norm": 3.8930367082100776, "learning_rate": 2e-05, "loss": 0.1279, "step": 45 }, { "epoch": 0.2861230329041488, "grad_norm": 2.262054738771732, "learning_rate": 2e-05, "loss": 0.1245, "step": 50 }, { "epoch": 0.3147353361945637, "grad_norm": 2.1250650940368487, "learning_rate": 2e-05, "loss": 0.1133, "step": 55 }, { "epoch": 0.34334763948497854, "grad_norm": 1.770036809289755, "learning_rate": 2e-05, "loss": 0.0945, "step": 60 }, { "epoch": 0.3719599427753934, "grad_norm": 1.7938876086769961, "learning_rate": 2e-05, "loss": 0.0866, "step": 65 }, { "epoch": 0.4005722460658083, "grad_norm": 1.4764321868179242, "learning_rate": 2e-05, "loss": 0.0827, "step": 70 }, { "epoch": 0.4291845493562232, "grad_norm": 1.9128984486999376, "learning_rate": 2e-05, "loss": 0.0798, "step": 75 }, { "epoch": 0.45779685264663805, "grad_norm": 1.46483598523261, "learning_rate": 2e-05, "loss": 0.0804, "step": 80 }, { "epoch": 0.4864091559370529, "grad_norm": 1.6523142945768603, "learning_rate": 2e-05, "loss": 0.0758, "step": 85 }, { "epoch": 0.5150214592274678, "grad_norm": 1.5333568395438437, "learning_rate": 2e-05, "loss": 0.0709, "step": 90 }, { "epoch": 0.5436337625178826, "grad_norm": 1.5031573664657636, "learning_rate": 2e-05, "loss": 0.0687, "step": 95 }, { "epoch": 0.5722460658082976, "grad_norm": 1.4132321325139292, "learning_rate": 2e-05, "loss": 0.0574, "step": 100 }, { "epoch": 0.6008583690987125, "grad_norm": 1.3745064149711035, "learning_rate": 2e-05, "loss": 0.0572, "step": 105 }, { "epoch": 0.6294706723891274, "grad_norm": 1.3470725333478024, "learning_rate": 2e-05, "loss": 0.0611, "step": 110 }, { "epoch": 0.6580829756795422, "grad_norm": 1.1315562008981583, "learning_rate": 2e-05, "loss": 0.05, "step": 115 }, { "epoch": 0.6866952789699571, "grad_norm": 1.2282177741629088, "learning_rate": 2e-05, "loss": 0.0525, "step": 120 }, { "epoch": 0.7153075822603719, "grad_norm": 1.3933198446492454, "learning_rate": 2e-05, "loss": 0.0519, "step": 125 }, { "epoch": 0.7439198855507868, "grad_norm": 1.7829406158054193, "learning_rate": 2e-05, "loss": 0.051, "step": 130 }, { "epoch": 0.7725321888412017, "grad_norm": 1.1948798936363785, "learning_rate": 2e-05, "loss": 0.0514, "step": 135 }, { "epoch": 0.8011444921316166, "grad_norm": 1.0816375150345683, "learning_rate": 2e-05, "loss": 0.0512, "step": 140 }, { "epoch": 0.8297567954220315, "grad_norm": 1.469354846951377, "learning_rate": 2e-05, "loss": 0.0465, "step": 145 }, { "epoch": 0.8583690987124464, "grad_norm": 1.2522970466753844, "learning_rate": 2e-05, "loss": 0.0493, "step": 150 }, { "epoch": 0.8869814020028612, "grad_norm": 1.162286189716735, "learning_rate": 2e-05, "loss": 0.0474, "step": 155 }, { "epoch": 0.9155937052932761, "grad_norm": 1.0718851830181713, "learning_rate": 2e-05, "loss": 0.0415, "step": 160 }, { "epoch": 0.944206008583691, "grad_norm": 1.0733174430217316, "learning_rate": 2e-05, "loss": 0.0528, "step": 165 }, { "epoch": 0.9728183118741058, "grad_norm": 1.0644789712973826, "learning_rate": 2e-05, "loss": 0.0456, "step": 170 }, { "epoch": 1.0014306151645207, "grad_norm": 1.1496891631410193, "learning_rate": 2e-05, "loss": 0.0432, "step": 175 }, { "epoch": 1.0300429184549356, "grad_norm": 1.0200369998966563, "learning_rate": 2e-05, "loss": 0.0241, "step": 180 }, { "epoch": 1.0586552217453504, "grad_norm": 1.3931472581826994, "learning_rate": 2e-05, "loss": 0.0225, "step": 185 }, { "epoch": 1.0872675250357653, "grad_norm": 1.0114502707229882, "learning_rate": 2e-05, "loss": 0.0239, "step": 190 }, { "epoch": 1.1158798283261802, "grad_norm": 0.7510204564859241, "learning_rate": 2e-05, "loss": 0.027, "step": 195 }, { "epoch": 1.144492131616595, "grad_norm": 0.7295063245482636, "learning_rate": 2e-05, "loss": 0.0235, "step": 200 }, { "epoch": 1.17310443490701, "grad_norm": 0.856671906186716, "learning_rate": 2e-05, "loss": 0.0247, "step": 205 }, { "epoch": 1.201716738197425, "grad_norm": 0.7255694844652782, "learning_rate": 2e-05, "loss": 0.0258, "step": 210 }, { "epoch": 1.2303290414878398, "grad_norm": 0.9184490713297652, "learning_rate": 2e-05, "loss": 0.0276, "step": 215 }, { "epoch": 1.2589413447782547, "grad_norm": 0.818205379161712, "learning_rate": 2e-05, "loss": 0.0219, "step": 220 }, { "epoch": 1.2875536480686696, "grad_norm": 0.6654369429209799, "learning_rate": 2e-05, "loss": 0.0261, "step": 225 }, { "epoch": 1.3161659513590844, "grad_norm": 0.5829250860946364, "learning_rate": 2e-05, "loss": 0.017, "step": 230 }, { "epoch": 1.3447782546494993, "grad_norm": 0.6661015288674467, "learning_rate": 2e-05, "loss": 0.0228, "step": 235 }, { "epoch": 1.3733905579399142, "grad_norm": 0.7177595155125043, "learning_rate": 2e-05, "loss": 0.0178, "step": 240 }, { "epoch": 1.402002861230329, "grad_norm": 0.8848450057764279, "learning_rate": 2e-05, "loss": 0.0147, "step": 245 }, { "epoch": 1.4306151645207439, "grad_norm": 0.6571566830429797, "learning_rate": 2e-05, "loss": 0.0216, "step": 250 }, { "epoch": 1.4592274678111588, "grad_norm": 0.812436234834659, "learning_rate": 2e-05, "loss": 0.023, "step": 255 }, { "epoch": 1.4878397711015736, "grad_norm": 0.7840598860015469, "learning_rate": 2e-05, "loss": 0.0209, "step": 260 }, { "epoch": 1.5164520743919887, "grad_norm": 0.7844249253805873, "learning_rate": 2e-05, "loss": 0.0205, "step": 265 }, { "epoch": 1.5450643776824036, "grad_norm": 0.7640044613122257, "learning_rate": 2e-05, "loss": 0.0238, "step": 270 }, { "epoch": 1.5736766809728184, "grad_norm": 1.0261564863265702, "learning_rate": 2e-05, "loss": 0.0271, "step": 275 }, { "epoch": 1.6022889842632333, "grad_norm": 0.6603554019675723, "learning_rate": 2e-05, "loss": 0.0224, "step": 280 }, { "epoch": 1.6309012875536482, "grad_norm": 0.6112434008888443, "learning_rate": 2e-05, "loss": 0.0201, "step": 285 }, { "epoch": 1.659513590844063, "grad_norm": 0.6941562759172227, "learning_rate": 2e-05, "loss": 0.0209, "step": 290 }, { "epoch": 1.688125894134478, "grad_norm": 0.920122539331784, "learning_rate": 2e-05, "loss": 0.0224, "step": 295 }, { "epoch": 1.7167381974248928, "grad_norm": 0.784663743006703, "learning_rate": 2e-05, "loss": 0.0204, "step": 300 }, { "epoch": 1.7453505007153076, "grad_norm": 0.5371735092585386, "learning_rate": 2e-05, "loss": 0.0175, "step": 305 }, { "epoch": 1.7739628040057225, "grad_norm": 0.4569754495971157, "learning_rate": 2e-05, "loss": 0.0191, "step": 310 }, { "epoch": 1.8025751072961373, "grad_norm": 0.5809346149070659, "learning_rate": 2e-05, "loss": 0.0163, "step": 315 }, { "epoch": 1.8311874105865522, "grad_norm": 0.9416876917379606, "learning_rate": 2e-05, "loss": 0.0186, "step": 320 }, { "epoch": 1.859799713876967, "grad_norm": 0.9128407546360238, "learning_rate": 2e-05, "loss": 0.0177, "step": 325 }, { "epoch": 1.888412017167382, "grad_norm": 0.6404265787090032, "learning_rate": 2e-05, "loss": 0.0142, "step": 330 }, { "epoch": 1.9170243204577968, "grad_norm": 0.8659777729110113, "learning_rate": 2e-05, "loss": 0.0188, "step": 335 }, { "epoch": 1.9456366237482117, "grad_norm": 0.7389926990941214, "learning_rate": 2e-05, "loss": 0.0205, "step": 340 }, { "epoch": 1.9742489270386265, "grad_norm": 0.5010753341589564, "learning_rate": 2e-05, "loss": 0.0173, "step": 345 }, { "epoch": 2.0028612303290414, "grad_norm": 0.44585995494505515, "learning_rate": 2e-05, "loss": 0.0176, "step": 350 }, { "epoch": 2.0314735336194563, "grad_norm": 0.3372647662374521, "learning_rate": 2e-05, "loss": 0.0114, "step": 355 }, { "epoch": 2.060085836909871, "grad_norm": 0.5724492338755197, "learning_rate": 2e-05, "loss": 0.0144, "step": 360 }, { "epoch": 2.088698140200286, "grad_norm": 0.5187760378661088, "learning_rate": 2e-05, "loss": 0.0137, "step": 365 }, { "epoch": 2.117310443490701, "grad_norm": 0.5106154674345083, "learning_rate": 2e-05, "loss": 0.0116, "step": 370 }, { "epoch": 2.1459227467811157, "grad_norm": 0.5354245824814097, "learning_rate": 2e-05, "loss": 0.0122, "step": 375 }, { "epoch": 2.1745350500715306, "grad_norm": 0.5465913593959528, "learning_rate": 2e-05, "loss": 0.0135, "step": 380 }, { "epoch": 2.2031473533619454, "grad_norm": 0.7553346807762494, "learning_rate": 2e-05, "loss": 0.0139, "step": 385 }, { "epoch": 2.2317596566523603, "grad_norm": 0.47986647806349203, "learning_rate": 2e-05, "loss": 0.0125, "step": 390 }, { "epoch": 2.260371959942775, "grad_norm": 0.6926460223156353, "learning_rate": 2e-05, "loss": 0.012, "step": 395 }, { "epoch": 2.28898426323319, "grad_norm": 0.7721328467064986, "learning_rate": 2e-05, "loss": 0.0104, "step": 400 }, { "epoch": 2.317596566523605, "grad_norm": 0.3938351782885143, "learning_rate": 2e-05, "loss": 0.0094, "step": 405 }, { "epoch": 2.34620886981402, "grad_norm": 0.7956446454256582, "learning_rate": 2e-05, "loss": 0.0127, "step": 410 }, { "epoch": 2.374821173104435, "grad_norm": 0.7163390976077646, "learning_rate": 2e-05, "loss": 0.0117, "step": 415 }, { "epoch": 2.40343347639485, "grad_norm": 0.40417151839328475, "learning_rate": 2e-05, "loss": 0.0116, "step": 420 }, { "epoch": 2.432045779685265, "grad_norm": 0.7406033829214401, "learning_rate": 2e-05, "loss": 0.0115, "step": 425 }, { "epoch": 2.4606580829756797, "grad_norm": 1.0372948520488305, "learning_rate": 2e-05, "loss": 0.013, "step": 430 }, { "epoch": 2.4892703862660945, "grad_norm": 0.48303247551117084, "learning_rate": 2e-05, "loss": 0.0121, "step": 435 }, { "epoch": 2.5178826895565094, "grad_norm": 0.822531770752665, "learning_rate": 2e-05, "loss": 0.0091, "step": 440 }, { "epoch": 2.5464949928469243, "grad_norm": 0.5751055353850153, "learning_rate": 2e-05, "loss": 0.0117, "step": 445 }, { "epoch": 2.575107296137339, "grad_norm": 0.8111046011909318, "learning_rate": 2e-05, "loss": 0.0138, "step": 450 }, { "epoch": 2.603719599427754, "grad_norm": 0.5529988693729204, "learning_rate": 2e-05, "loss": 0.0137, "step": 455 }, { "epoch": 2.632331902718169, "grad_norm": 0.6426046706622803, "learning_rate": 2e-05, "loss": 0.0128, "step": 460 }, { "epoch": 2.6609442060085837, "grad_norm": 0.5842059243112792, "learning_rate": 2e-05, "loss": 0.0097, "step": 465 }, { "epoch": 2.6895565092989986, "grad_norm": 0.9462035289351468, "learning_rate": 2e-05, "loss": 0.0111, "step": 470 }, { "epoch": 2.7181688125894135, "grad_norm": 0.47730280851213186, "learning_rate": 2e-05, "loss": 0.01, "step": 475 }, { "epoch": 2.7467811158798283, "grad_norm": 0.7829145546901836, "learning_rate": 2e-05, "loss": 0.0147, "step": 480 }, { "epoch": 2.775393419170243, "grad_norm": 0.41532702346006606, "learning_rate": 2e-05, "loss": 0.0106, "step": 485 }, { "epoch": 2.804005722460658, "grad_norm": 0.42916154288878555, "learning_rate": 2e-05, "loss": 0.0135, "step": 490 }, { "epoch": 2.832618025751073, "grad_norm": 0.5117471137135019, "learning_rate": 2e-05, "loss": 0.0095, "step": 495 }, { "epoch": 2.8612303290414878, "grad_norm": 0.42607884594383777, "learning_rate": 2e-05, "loss": 0.0105, "step": 500 }, { "epoch": 2.8898426323319026, "grad_norm": 0.5078330866142711, "learning_rate": 2e-05, "loss": 0.0111, "step": 505 }, { "epoch": 2.9184549356223175, "grad_norm": 0.872560256021644, "learning_rate": 2e-05, "loss": 0.0096, "step": 510 }, { "epoch": 2.9470672389127324, "grad_norm": 0.5473131169980906, "learning_rate": 2e-05, "loss": 0.0119, "step": 515 }, { "epoch": 2.9756795422031472, "grad_norm": 0.33829761962434335, "learning_rate": 2e-05, "loss": 0.0094, "step": 520 } ], "logging_steps": 5, "max_steps": 522, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 10457554665472.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }