{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5460336720764447, "eval_steps": 100, "global_step": 1800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "grad_norm": 0.3099820017814636, "learning_rate": 0.0001, "loss": 1.8122, "step": 100 }, { "epoch": 0.03, "eval_loss": 1.7837988138198853, "eval_runtime": 208.895, "eval_samples_per_second": 6.3, "eval_steps_per_second": 1.575, "step": 100 }, { "epoch": 0.06, "grad_norm": 0.17340590059757233, "learning_rate": 9.655172413793105e-05, "loss": 1.6719, "step": 200 }, { "epoch": 0.06, "eval_loss": 1.7731986045837402, "eval_runtime": 208.8797, "eval_samples_per_second": 6.3, "eval_steps_per_second": 1.575, "step": 200 }, { "epoch": 0.09, "grad_norm": 0.16290397942066193, "learning_rate": 9.310344827586207e-05, "loss": 1.6564, "step": 300 }, { "epoch": 0.09, "eval_loss": 1.76987886428833, "eval_runtime": 208.9321, "eval_samples_per_second": 6.299, "eval_steps_per_second": 1.575, "step": 300 }, { "epoch": 0.12, "grad_norm": 0.1616089642047882, "learning_rate": 8.96551724137931e-05, "loss": 1.6781, "step": 400 }, { "epoch": 0.12, "eval_loss": 1.7673217058181763, "eval_runtime": 208.8655, "eval_samples_per_second": 6.301, "eval_steps_per_second": 1.575, "step": 400 }, { "epoch": 0.15, "grad_norm": 0.21069930493831635, "learning_rate": 8.620689655172413e-05, "loss": 1.6603, "step": 500 }, { "epoch": 0.15, "eval_loss": 1.7704393863677979, "eval_runtime": 208.9211, "eval_samples_per_second": 6.299, "eval_steps_per_second": 1.575, "step": 500 }, { "epoch": 0.18, "grad_norm": 0.19658587872982025, "learning_rate": 8.275862068965517e-05, "loss": 1.6579, "step": 600 }, { "epoch": 0.18, "eval_loss": 1.7655415534973145, "eval_runtime": 208.8518, "eval_samples_per_second": 6.301, "eval_steps_per_second": 1.575, "step": 600 }, { "epoch": 0.21, "grad_norm": 0.17741167545318604, "learning_rate": 7.931034482758621e-05, "loss": 1.6573, "step": 700 }, { "epoch": 0.21, "eval_loss": 1.7669495344161987, "eval_runtime": 208.8515, "eval_samples_per_second": 6.301, "eval_steps_per_second": 1.575, "step": 700 }, { "epoch": 0.24, "grad_norm": 0.17904286086559296, "learning_rate": 7.586206896551724e-05, "loss": 1.6398, "step": 800 }, { "epoch": 0.24, "eval_loss": 1.7682162523269653, "eval_runtime": 208.8599, "eval_samples_per_second": 6.301, "eval_steps_per_second": 1.575, "step": 800 }, { "epoch": 0.27, "grad_norm": 0.17510022222995758, "learning_rate": 7.241379310344828e-05, "loss": 1.6464, "step": 900 }, { "epoch": 0.27, "eval_loss": 1.7680007219314575, "eval_runtime": 208.9236, "eval_samples_per_second": 6.299, "eval_steps_per_second": 1.575, "step": 900 }, { "epoch": 0.3, "grad_norm": 0.16618536412715912, "learning_rate": 6.896551724137931e-05, "loss": 1.6402, "step": 1000 }, { "epoch": 0.3, "eval_loss": 1.7689732313156128, "eval_runtime": 208.9404, "eval_samples_per_second": 6.298, "eval_steps_per_second": 1.575, "step": 1000 }, { "epoch": 0.33, "grad_norm": 0.1838076114654541, "learning_rate": 6.551724137931034e-05, "loss": 1.6602, "step": 1100 }, { "epoch": 0.33, "eval_loss": 1.7677079439163208, "eval_runtime": 208.8621, "eval_samples_per_second": 6.301, "eval_steps_per_second": 1.575, "step": 1100 }, { "epoch": 0.36, "grad_norm": 0.1798967868089676, "learning_rate": 6.206896551724138e-05, "loss": 1.6399, "step": 1200 }, { "epoch": 0.36, "eval_loss": 1.7679466009140015, "eval_runtime": 208.7864, "eval_samples_per_second": 6.303, "eval_steps_per_second": 1.576, "step": 1200 }, { "epoch": 0.39, "grad_norm": 0.19117586314678192, "learning_rate": 5.862068965517241e-05, "loss": 1.6428, "step": 1300 }, { "epoch": 0.39, "eval_loss": 1.7693601846694946, "eval_runtime": 208.7845, "eval_samples_per_second": 6.303, "eval_steps_per_second": 1.576, "step": 1300 }, { "epoch": 0.42, "grad_norm": 0.18876388669013977, "learning_rate": 5.517241379310345e-05, "loss": 1.6403, "step": 1400 }, { "epoch": 0.42, "eval_loss": 1.7661293745040894, "eval_runtime": 208.7761, "eval_samples_per_second": 6.303, "eval_steps_per_second": 1.576, "step": 1400 }, { "epoch": 0.46, "grad_norm": 0.17072780430316925, "learning_rate": 5.172413793103449e-05, "loss": 1.6403, "step": 1500 }, { "epoch": 0.46, "eval_loss": 1.7662379741668701, "eval_runtime": 208.8393, "eval_samples_per_second": 6.301, "eval_steps_per_second": 1.575, "step": 1500 }, { "epoch": 0.49, "grad_norm": 0.20399431884288788, "learning_rate": 4.827586206896552e-05, "loss": 1.637, "step": 1600 }, { "epoch": 0.49, "eval_loss": 1.7682374715805054, "eval_runtime": 208.7744, "eval_samples_per_second": 6.303, "eval_steps_per_second": 1.576, "step": 1600 }, { "epoch": 0.52, "grad_norm": 0.2173861563205719, "learning_rate": 4.482758620689655e-05, "loss": 1.6366, "step": 1700 }, { "epoch": 0.52, "eval_loss": 1.7696117162704468, "eval_runtime": 208.7744, "eval_samples_per_second": 6.303, "eval_steps_per_second": 1.576, "step": 1700 }, { "epoch": 0.55, "grad_norm": 0.19538724422454834, "learning_rate": 4.1379310344827587e-05, "loss": 1.6377, "step": 1800 }, { "epoch": 0.55, "eval_loss": 1.7682461738586426, "eval_runtime": 208.7775, "eval_samples_per_second": 6.303, "eval_steps_per_second": 1.576, "step": 1800 } ], "logging_steps": 100, "max_steps": 3000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "total_flos": 4.823467998404936e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }