{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 230, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.043478260869565216, "grad_norm": 175.07723361554483, "learning_rate": 8.695652173913044e-07, "loss": 1.9054, "step": 1 }, { "epoch": 0.21739130434782608, "grad_norm": 62.962929820729734, "learning_rate": 4.347826086956522e-06, "loss": 1.4576, "step": 5 }, { "epoch": 0.43478260869565216, "grad_norm": 8.648281883873826, "learning_rate": 8.695652173913044e-06, "loss": 0.6132, "step": 10 }, { "epoch": 0.6521739130434783, "grad_norm": 16.239431113601718, "learning_rate": 1.3043478260869566e-05, "loss": 0.3115, "step": 15 }, { "epoch": 0.8695652173913043, "grad_norm": 3.450808311744288, "learning_rate": 1.739130434782609e-05, "loss": 0.203, "step": 20 }, { "epoch": 1.0, "eval_loss": 0.10863249748945236, "eval_runtime": 67.8349, "eval_samples_per_second": 21.169, "eval_steps_per_second": 0.663, "step": 23 }, { "epoch": 1.0869565217391304, "grad_norm": 34.3749757818615, "learning_rate": 1.9995393663024054e-05, "loss": 0.1339, "step": 25 }, { "epoch": 1.3043478260869565, "grad_norm": 2.3841429135265555, "learning_rate": 1.9943621095573588e-05, "loss": 0.1249, "step": 30 }, { "epoch": 1.5217391304347827, "grad_norm": 1.2759644316371075, "learning_rate": 1.9834617016337424e-05, "loss": 0.088, "step": 35 }, { "epoch": 1.7391304347826086, "grad_norm": 34.92728113964099, "learning_rate": 1.9669008809262064e-05, "loss": 0.0829, "step": 40 }, { "epoch": 1.9565217391304348, "grad_norm": 2.0028289380489945, "learning_rate": 1.944774964904754e-05, "loss": 0.0942, "step": 45 }, { "epoch": 2.0, "eval_loss": 0.07020141929388046, "eval_runtime": 67.1251, "eval_samples_per_second": 21.393, "eval_steps_per_second": 0.67, "step": 46 }, { "epoch": 2.1739130434782608, "grad_norm": 1.3438276529586066, "learning_rate": 1.917211301505453e-05, "loss": 0.0674, "step": 50 }, { "epoch": 2.391304347826087, "grad_norm": 1142.7739612838905, "learning_rate": 1.8843685361665724e-05, "loss": 0.0668, "step": 55 }, { "epoch": 2.608695652173913, "grad_norm": 2.660687233406025, "learning_rate": 1.8464356987288012e-05, "loss": 0.065, "step": 60 }, { "epoch": 2.8260869565217392, "grad_norm": 6.723586538182285, "learning_rate": 1.8036311154549783e-05, "loss": 0.064, "step": 65 }, { "epoch": 3.0, "eval_loss": 0.057872917503118515, "eval_runtime": 67.6569, "eval_samples_per_second": 21.225, "eval_steps_per_second": 0.665, "step": 69 }, { "epoch": 3.0434782608695654, "grad_norm": 0.7827050845032839, "learning_rate": 1.7562011524313187e-05, "loss": 0.0604, "step": 70 }, { "epoch": 3.260869565217391, "grad_norm": 1.2639368123822572, "learning_rate": 1.7044187975826126e-05, "loss": 0.0755, "step": 75 }, { "epoch": 3.4782608695652173, "grad_norm": 0.9047997047917037, "learning_rate": 1.648582089462756e-05, "loss": 0.0569, "step": 80 }, { "epoch": 3.6956521739130435, "grad_norm": 0.5428178712369748, "learning_rate": 1.589012401863864e-05, "loss": 0.0577, "step": 85 }, { "epoch": 3.9130434782608696, "grad_norm": 0.6450527453899985, "learning_rate": 1.526052594117071e-05, "loss": 0.055, "step": 90 }, { "epoch": 4.0, "eval_loss": 0.05387277901172638, "eval_runtime": 67.296, "eval_samples_per_second": 21.339, "eval_steps_per_second": 0.669, "step": 92 }, { "epoch": 4.130434782608695, "grad_norm": 0.6167289429330038, "learning_rate": 1.4600650377311523e-05, "loss": 0.0549, "step": 95 }, { "epoch": 4.3478260869565215, 
"grad_norm": 0.5274268025844417, "learning_rate": 1.3914295307268396e-05, "loss": 0.0535, "step": 100 }, { "epoch": 4.565217391304348, "grad_norm": 4.645394192055986, "learning_rate": 1.3205411116710973e-05, "loss": 0.0558, "step": 105 }, { "epoch": 4.782608695652174, "grad_norm": 0.44016634132559596, "learning_rate": 1.2478077859929e-05, "loss": 0.0557, "step": 110 }, { "epoch": 5.0, "grad_norm": 0.37871781752696215, "learning_rate": 1.1736481776669307e-05, "loss": 0.0533, "step": 115 }, { "epoch": 5.0, "eval_loss": 0.05291759595274925, "eval_runtime": 67.622, "eval_samples_per_second": 21.236, "eval_steps_per_second": 0.665, "step": 115 }, { "epoch": 5.217391304347826, "grad_norm": 0.4605215085179153, "learning_rate": 1.0984891197811686e-05, "loss": 0.0526, "step": 120 }, { "epoch": 5.434782608695652, "grad_norm": 0.5874492251340868, "learning_rate": 1.0227631978561057e-05, "loss": 0.0531, "step": 125 }, { "epoch": 5.6521739130434785, "grad_norm": 0.29510087532653284, "learning_rate": 9.469062600552509e-06, "loss": 0.0525, "step": 130 }, { "epoch": 5.869565217391305, "grad_norm": 0.344906161076209, "learning_rate": 8.71354908617169e-06, "loss": 0.0525, "step": 135 }, { "epoch": 6.0, "eval_loss": 0.0515441857278347, "eval_runtime": 67.5553, "eval_samples_per_second": 21.257, "eval_steps_per_second": 0.666, "step": 138 }, { "epoch": 6.086956521739131, "grad_norm": 0.36354029610859134, "learning_rate": 7.965439869473664e-06, "loss": 0.0514, "step": 140 }, { "epoch": 6.304347826086957, "grad_norm": 0.26428382375759113, "learning_rate": 7.2290407683331154e-06, "loss": 0.0515, "step": 145 }, { "epoch": 6.521739130434782, "grad_norm": 0.22822648732081055, "learning_rate": 6.508590201876317e-06, "loss": 0.0512, "step": 150 }, { "epoch": 6.739130434782608, "grad_norm": 0.25828737593751994, "learning_rate": 5.8082347958333625e-06, "loss": 0.0513, "step": 155 }, { "epoch": 6.956521739130435, "grad_norm": 0.2285799063924952, "learning_rate": 5.132005516216512e-06, "loss": 0.0519, "step": 160 }, { "epoch": 7.0, "eval_loss": 0.05051277205348015, "eval_runtime": 67.5123, "eval_samples_per_second": 21.27, "eval_steps_per_second": 0.667, "step": 161 }, { "epoch": 7.173913043478261, "grad_norm": 0.24475626828923558, "learning_rate": 4.483794468689728e-06, "loss": 0.0508, "step": 165 }, { "epoch": 7.391304347826087, "grad_norm": 0.2520072015362384, "learning_rate": 3.867332497162836e-06, "loss": 0.0504, "step": 170 }, { "epoch": 7.608695652173913, "grad_norm": 0.27382520169651525, "learning_rate": 3.2861677105440335e-06, "loss": 0.0504, "step": 175 }, { "epoch": 7.826086956521739, "grad_norm": 0.2716287514584957, "learning_rate": 2.7436450612420098e-06, "loss": 0.0505, "step": 180 }, { "epoch": 8.0, "eval_loss": 0.04957514628767967, "eval_runtime": 67.276, "eval_samples_per_second": 21.345, "eval_steps_per_second": 0.669, "step": 184 }, { "epoch": 8.043478260869565, "grad_norm": 0.21787711573464694, "learning_rate": 2.2428870929558012e-06, "loss": 0.0496, "step": 185 }, { "epoch": 8.26086956521739, "grad_norm": 0.21445383159071862, "learning_rate": 1.7867759685603115e-06, "loss": 0.0494, "step": 190 }, { "epoch": 8.478260869565217, "grad_norm": 0.24405861737397047, "learning_rate": 1.3779368815278648e-06, "loss": 0.0491, "step": 195 }, { "epoch": 8.695652173913043, "grad_norm": 0.2697463930100617, "learning_rate": 1.01872294636304e-06, "loss": 0.0495, "step": 200 }, { "epoch": 8.91304347826087, "grad_norm": 0.21112878071345867, "learning_rate": 7.1120165501533e-07, "loss": 0.0494, "step": 205 }, { "epoch": 
9.0, "eval_loss": 0.048730239272117615, "eval_runtime": 66.913, "eval_samples_per_second": 21.461, "eval_steps_per_second": 0.673, "step": 207 }, { "epoch": 9.130434782608695, "grad_norm": 0.19010353057716406, "learning_rate": 4.5714297722121105e-07, "loss": 0.049, "step": 210 }, { "epoch": 9.347826086956522, "grad_norm": 0.17948234694281104, "learning_rate": 2.5800917326521013e-07, "loss": 0.0491, "step": 215 }, { "epoch": 9.565217391304348, "grad_norm": 0.19343675188279383, "learning_rate": 1.1494637779369766e-07, "loss": 0.0489, "step": 220 }, { "epoch": 9.782608695652174, "grad_norm": 0.21072621537133224, "learning_rate": 2.8778003121607834e-08, "loss": 0.0483, "step": 225 }, { "epoch": 10.0, "grad_norm": 0.18309652115048786, "learning_rate": 0.0, "loss": 0.0484, "step": 230 }, { "epoch": 10.0, "eval_loss": 0.0484623983502388, "eval_runtime": 67.5159, "eval_samples_per_second": 21.269, "eval_steps_per_second": 0.667, "step": 230 }, { "epoch": 10.0, "step": 230, "total_flos": 48157320806400.0, "train_loss": 0.1130801611620447, "train_runtime": 3471.4646, "train_samples_per_second": 4.137, "train_steps_per_second": 0.066 } ], "logging_steps": 5, "max_steps": 230, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "total_flos": 48157320806400.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }
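A minimal sketch of how this log might be inspected, assuming it is saved as `trainer_state.json` (the filename the Hugging Face Trainer normally uses in its output directory); the file path and the choice to print only the end-of-epoch evaluation losses are assumptions, while the keys used (`log_history`, `epoch`, `loss`, `eval_loss`) all appear in the state above.

```python
import json

# Load the trainer state shown above (path is an assumption).
with open("trainer_state.json") as f:
    state = json.load(f)

# Split the flat log_history into training-loss entries and
# end-of-epoch evaluation entries, which use different keys.
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

# Print the evaluation loss after each epoch
# (here it falls from ~0.1086 at epoch 1 to ~0.0485 at epoch 10).
for e in eval_logs:
    print(f"epoch {e['epoch']}: eval_loss={e['eval_loss']:.4f}")
```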