{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 8.592910848549947,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.34,
      "grad_norm": 0.6707318425178528,
      "learning_rate": 1.97816091954023e-05,
      "loss": 2.2888,
      "step": 20
    },
    {
      "epoch": 0.69,
      "grad_norm": null,
      "learning_rate": 1.9574712643678162e-05,
      "loss": 2.0848,
      "step": 40
    },
    {
      "epoch": 1.03,
      "grad_norm": 0.7721680402755737,
      "learning_rate": 1.9344827586206897e-05,
      "loss": 2.0044,
      "step": 60
    },
    {
      "epoch": 1.37,
      "grad_norm": 1.1140433549880981,
      "learning_rate": 1.9126436781609195e-05,
      "loss": 1.8016,
      "step": 80
    },
    {
      "epoch": 1.72,
      "grad_norm": 0.7205075621604919,
      "learning_rate": 1.8896551724137934e-05,
      "loss": 1.7217,
      "step": 100
    },
    {
      "epoch": 2.06,
      "grad_norm": 0.8933233618736267,
      "learning_rate": 1.866666666666667e-05,
      "loss": 1.5705,
      "step": 120
    },
    {
      "epoch": 2.41,
      "grad_norm": 0.7114273905754089,
      "learning_rate": 1.8436781609195404e-05,
      "loss": 1.4006,
      "step": 140
    },
    {
      "epoch": 2.75,
      "grad_norm": 0.7229479551315308,
      "learning_rate": 1.820689655172414e-05,
      "loss": 1.3137,
      "step": 160
    },
    {
      "epoch": 3.09,
      "grad_norm": 0.9370490908622742,
      "learning_rate": 1.7977011494252874e-05,
      "loss": 1.1898,
      "step": 180
    },
    {
      "epoch": 3.44,
      "grad_norm": 0.6051978468894958,
      "learning_rate": 1.774712643678161e-05,
      "loss": 1.1229,
      "step": 200
    },
    {
      "epoch": 3.78,
      "grad_norm": 0.6857028007507324,
      "learning_rate": 1.7517241379310347e-05,
      "loss": 1.051,
      "step": 220
    },
    {
      "epoch": 4.12,
      "grad_norm": 0.6715748310089111,
      "learning_rate": 1.7287356321839082e-05,
      "loss": 0.9894,
      "step": 240
    },
    {
      "epoch": 4.47,
      "grad_norm": 0.5918118953704834,
      "learning_rate": 1.7057471264367817e-05,
      "loss": 0.9687,
      "step": 260
    },
    {
      "epoch": 4.81,
      "grad_norm": 0.6621690392494202,
      "learning_rate": 1.6827586206896552e-05,
      "loss": 0.9199,
      "step": 280
    },
    {
      "epoch": 5.16,
      "grad_norm": 0.6697206497192383,
      "learning_rate": 1.659770114942529e-05,
      "loss": 0.9303,
      "step": 300
    },
    {
      "epoch": 5.5,
      "grad_norm": 0.8184316158294678,
      "learning_rate": 1.6367816091954025e-05,
      "loss": 0.8898,
      "step": 320
    },
    {
      "epoch": 5.84,
      "grad_norm": 0.6429987549781799,
      "learning_rate": 1.613793103448276e-05,
      "loss": 0.8623,
      "step": 340
    },
    {
      "epoch": 6.19,
      "grad_norm": 0.7518043518066406,
      "learning_rate": 1.5908045977011495e-05,
      "loss": 0.8239,
      "step": 360
    },
    {
      "epoch": 6.53,
      "grad_norm": 0.6667824983596802,
      "learning_rate": 1.567816091954023e-05,
      "loss": 0.8119,
      "step": 380
    },
    {
      "epoch": 6.87,
      "grad_norm": 0.8569457530975342,
      "learning_rate": 1.5448275862068965e-05,
      "loss": 0.8139,
      "step": 400
    },
    {
      "epoch": 7.22,
      "grad_norm": 0.7754850387573242,
      "learning_rate": 1.5218390804597702e-05,
      "loss": 0.7835,
      "step": 420
    },
    {
      "epoch": 7.56,
      "grad_norm": 1.159196138381958,
      "learning_rate": 1.4988505747126439e-05,
      "loss": 0.7546,
      "step": 440
    },
    {
      "epoch": 7.91,
      "grad_norm": 1.119764804840088,
      "learning_rate": 1.4758620689655174e-05,
      "loss": 0.7571,
      "step": 460
    },
    {
      "epoch": 8.25,
      "grad_norm": 1.3600786924362183,
      "learning_rate": 1.452873563218391e-05,
      "loss": 0.7451,
      "step": 480
    },
    {
      "epoch": 8.59,
      "grad_norm": 0.7608994245529175,
      "learning_rate": 1.4298850574712644e-05,
      "loss": 0.7109,
      "step": 500
    }
  ],
  "logging_steps": 20,
  "max_steps": 1740,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 30,
  "save_steps": 500,
  "total_flos": 1.2995638935552e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}