{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 37815, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "learning_rate": 4.9338886685177844e-05, "loss": 3.2991, "step": 500 }, { "epoch": 0.08, "learning_rate": 4.8677773370355686e-05, "loss": 2.7662, "step": 1000 }, { "epoch": 0.12, "learning_rate": 4.801666005553352e-05, "loss": 2.662, "step": 1500 }, { "epoch": 0.16, "learning_rate": 4.735554674071136e-05, "loss": 2.5305, "step": 2000 }, { "epoch": 0.2, "learning_rate": 4.66944334258892e-05, "loss": 2.4368, "step": 2500 }, { "epoch": 0.24, "learning_rate": 4.603332011106704e-05, "loss": 2.4286, "step": 3000 }, { "epoch": 0.28, "learning_rate": 4.5372206796244874e-05, "loss": 2.3404, "step": 3500 }, { "epoch": 0.32, "learning_rate": 4.4711093481422716e-05, "loss": 2.2398, "step": 4000 }, { "epoch": 0.36, "learning_rate": 4.404998016660056e-05, "loss": 2.2966, "step": 4500 }, { "epoch": 0.4, "learning_rate": 4.33888668517784e-05, "loss": 2.2255, "step": 5000 }, { "epoch": 0.44, "learning_rate": 4.272775353695624e-05, "loss": 2.2713, "step": 5500 }, { "epoch": 0.48, "learning_rate": 4.2066640222134076e-05, "loss": 2.2018, "step": 6000 }, { "epoch": 0.52, "learning_rate": 4.140552690731192e-05, "loss": 2.1203, "step": 6500 }, { "epoch": 0.56, "learning_rate": 4.074441359248975e-05, "loss": 2.1148, "step": 7000 }, { "epoch": 0.6, "learning_rate": 4.0083300277667595e-05, "loss": 2.1149, "step": 7500 }, { "epoch": 0.63, "learning_rate": 3.942218696284543e-05, "loss": 2.0219, "step": 8000 }, { "epoch": 0.67, "learning_rate": 3.876107364802327e-05, "loss": 2.0354, "step": 8500 }, { "epoch": 0.71, "learning_rate": 3.809996033320111e-05, "loss": 2.0341, "step": 9000 }, { "epoch": 0.75, "learning_rate": 3.7438847018378955e-05, "loss": 1.9972, "step": 9500 }, { "epoch": 0.79, "learning_rate": 3.6777733703556796e-05, "loss": 1.9654, "step": 10000 }, { "epoch": 0.83, "learning_rate": 3.611662038873463e-05, "loss": 1.9853, "step": 10500 }, { "epoch": 0.87, "learning_rate": 3.5455507073912466e-05, "loss": 1.9487, "step": 11000 }, { "epoch": 0.91, "learning_rate": 3.479439375909031e-05, "loss": 1.9498, "step": 11500 }, { "epoch": 0.95, "learning_rate": 3.413328044426815e-05, "loss": 1.8963, "step": 12000 }, { "epoch": 0.99, "learning_rate": 3.3472167129445985e-05, "loss": 1.9259, "step": 12500 }, { "epoch": 1.03, "learning_rate": 3.2811053814623827e-05, "loss": 1.5878, "step": 13000 }, { "epoch": 1.07, "learning_rate": 3.214994049980167e-05, "loss": 1.4017, "step": 13500 }, { "epoch": 1.11, "learning_rate": 3.148882718497951e-05, "loss": 1.4809, "step": 14000 }, { "epoch": 1.15, "learning_rate": 3.082771387015735e-05, "loss": 1.4646, "step": 14500 }, { "epoch": 1.19, "learning_rate": 3.0166600555335183e-05, "loss": 1.5017, "step": 15000 }, { "epoch": 1.23, "learning_rate": 2.9505487240513025e-05, "loss": 1.4745, "step": 15500 }, { "epoch": 1.27, "learning_rate": 2.8844373925690867e-05, "loss": 1.4496, "step": 16000 }, { "epoch": 1.31, "learning_rate": 2.8183260610868705e-05, "loss": 1.4599, "step": 16500 }, { "epoch": 1.35, "learning_rate": 2.752214729604654e-05, "loss": 1.3974, "step": 17000 }, { "epoch": 1.39, "learning_rate": 2.6861033981224382e-05, "loss": 1.397, "step": 17500 }, { "epoch": 1.43, "learning_rate": 2.6199920666402224e-05, "loss": 1.436, "step": 18000 }, { "epoch": 1.47, "learning_rate": 2.5538807351580062e-05, "loss": 1.4359, "step": 18500 }, { "epoch": 1.51, "learning_rate": 2.48776940367579e-05, "loss": 1.4215, "step": 19000 }, { "epoch": 1.55, "learning_rate": 2.4216580721935742e-05, "loss": 1.3611, "step": 19500 }, { "epoch": 1.59, "learning_rate": 2.355546740711358e-05, "loss": 1.4515, "step": 20000 }, { "epoch": 1.63, "learning_rate": 2.289435409229142e-05, "loss": 1.3923, "step": 20500 }, { "epoch": 1.67, "learning_rate": 2.2233240777469257e-05, "loss": 1.3968, "step": 21000 }, { "epoch": 1.71, "learning_rate": 2.15721274626471e-05, "loss": 1.4511, "step": 21500 }, { "epoch": 1.75, "learning_rate": 2.091101414782494e-05, "loss": 1.3736, "step": 22000 }, { "epoch": 1.79, "learning_rate": 2.024990083300278e-05, "loss": 1.4196, "step": 22500 }, { "epoch": 1.82, "learning_rate": 1.9588787518180617e-05, "loss": 1.4012, "step": 23000 }, { "epoch": 1.86, "learning_rate": 1.8927674203358456e-05, "loss": 1.3995, "step": 23500 }, { "epoch": 1.9, "learning_rate": 1.8266560888536297e-05, "loss": 1.3706, "step": 24000 }, { "epoch": 1.94, "learning_rate": 1.7605447573714136e-05, "loss": 1.357, "step": 24500 }, { "epoch": 1.98, "learning_rate": 1.6944334258891974e-05, "loss": 1.3894, "step": 25000 }, { "epoch": 2.02, "learning_rate": 1.6283220944069812e-05, "loss": 1.1266, "step": 25500 }, { "epoch": 2.06, "learning_rate": 1.5622107629247654e-05, "loss": 0.9034, "step": 26000 }, { "epoch": 2.1, "learning_rate": 1.4960994314425494e-05, "loss": 0.88, "step": 26500 }, { "epoch": 2.14, "learning_rate": 1.4299880999603333e-05, "loss": 0.9379, "step": 27000 }, { "epoch": 2.18, "learning_rate": 1.3638767684781173e-05, "loss": 0.9224, "step": 27500 }, { "epoch": 2.22, "learning_rate": 1.2977654369959011e-05, "loss": 0.9108, "step": 28000 }, { "epoch": 2.26, "learning_rate": 1.2316541055136851e-05, "loss": 0.9068, "step": 28500 }, { "epoch": 2.3, "learning_rate": 1.1655427740314691e-05, "loss": 0.9158, "step": 29000 }, { "epoch": 2.34, "learning_rate": 1.099431442549253e-05, "loss": 0.901, "step": 29500 }, { "epoch": 2.38, "learning_rate": 1.033320111067037e-05, "loss": 0.8898, "step": 30000 }, { "epoch": 2.42, "learning_rate": 9.67208779584821e-06, "loss": 0.9295, "step": 30500 }, { "epoch": 2.46, "learning_rate": 9.010974481026048e-06, "loss": 0.9325, "step": 31000 }, { "epoch": 2.5, "learning_rate": 8.349861166203888e-06, "loss": 0.9357, "step": 31500 }, { "epoch": 2.54, "learning_rate": 7.688747851381726e-06, "loss": 0.8832, "step": 32000 }, { "epoch": 2.58, "learning_rate": 7.027634536559567e-06, "loss": 0.9101, "step": 32500 }, { "epoch": 2.62, "learning_rate": 6.366521221737406e-06, "loss": 0.9018, "step": 33000 }, { "epoch": 2.66, "learning_rate": 5.7054079069152455e-06, "loss": 0.8886, "step": 33500 }, { "epoch": 2.7, "learning_rate": 5.044294592093085e-06, "loss": 0.8771, "step": 34000 }, { "epoch": 2.74, "learning_rate": 4.383181277270925e-06, "loss": 0.8956, "step": 34500 }, { "epoch": 2.78, "learning_rate": 3.7220679624487635e-06, "loss": 0.8586, "step": 35000 }, { "epoch": 2.82, "learning_rate": 3.060954647626603e-06, "loss": 0.9039, "step": 35500 }, { "epoch": 2.86, "learning_rate": 2.3998413328044427e-06, "loss": 0.8817, "step": 36000 }, { "epoch": 2.9, "learning_rate": 1.7387280179822822e-06, "loss": 0.8601, "step": 36500 }, { "epoch": 2.94, "learning_rate": 1.0776147031601218e-06, "loss": 0.8837, "step": 37000 }, { "epoch": 2.98, "learning_rate": 4.165013883379611e-07, "loss": 0.8894, "step": 37500 }, { "epoch": 3.0, "step": 37815, "total_flos": 9288563680542720.0, "train_loss": 1.5174339204652274, "train_runtime": 9956.4506, "train_samples_per_second": 15.192, "train_steps_per_second": 3.798 } ], "logging_steps": 500, "max_steps": 37815, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 9288563680542720.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }