{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.6,
  "eval_steps": 500,
  "global_step": 25,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02,
      "grad_norm": 0.4254041016101837,
      "learning_rate": 0.0002,
      "loss": 2.9554,
      "step": 1
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.36921417713165283,
      "learning_rate": 0.0002,
      "loss": 3.2312,
      "step": 2
    },
    {
      "epoch": 0.07,
      "grad_norm": 23399.443359375,
      "learning_rate": 0.0002,
      "loss": 3.2781,
      "step": 3
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.4666428864002228,
      "learning_rate": 0.0002,
      "loss": 2.6009,
      "step": 4
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.5176392197608948,
      "learning_rate": 0.0002,
      "loss": 2.3716,
      "step": 5
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.4933389723300934,
      "learning_rate": 0.0002,
      "loss": 2.1643,
      "step": 6
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.4874391257762909,
      "learning_rate": 0.0002,
      "loss": 1.9646,
      "step": 7
    },
    {
      "epoch": 0.19,
      "grad_norm": 51451.42578125,
      "learning_rate": 0.0002,
      "loss": 1.6756,
      "step": 8
    },
    {
      "epoch": 0.22,
      "grad_norm": 35186.67578125,
      "learning_rate": 0.0002,
      "loss": 1.531,
      "step": 9
    },
    {
      "epoch": 0.24,
      "grad_norm": 36239.921875,
      "learning_rate": 0.0002,
      "loss": 1.3601,
      "step": 10
    },
    {
      "epoch": 0.26,
      "grad_norm": 4.011983871459961,
      "learning_rate": 0.0002,
      "loss": 4.4568,
      "step": 11
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.5809924006462097,
      "learning_rate": 0.0002,
      "loss": 1.3738,
      "step": 12
    },
    {
      "epoch": 0.31,
      "grad_norm": 47272.734375,
      "learning_rate": 0.0002,
      "loss": 1.2202,
      "step": 13
    },
    {
      "epoch": 0.34,
      "grad_norm": 47726.953125,
      "learning_rate": 0.0002,
      "loss": 1.141,
      "step": 14
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.37444865703582764,
      "learning_rate": 0.0002,
      "loss": 1.0439,
      "step": 15
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.3057764768600464,
      "learning_rate": 0.0002,
      "loss": 0.9621,
      "step": 16
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.26601067185401917,
      "learning_rate": 0.0002,
      "loss": 0.8753,
      "step": 17
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.4599006175994873,
      "learning_rate": 0.0002,
      "loss": 0.8101,
      "step": 18
    },
    {
      "epoch": 0.46,
      "grad_norm": 45638.9921875,
      "learning_rate": 0.0002,
      "loss": 0.7067,
      "step": 19
    },
    {
      "epoch": 0.48,
      "grad_norm": 45151.078125,
      "learning_rate": 0.0002,
      "loss": 0.7156,
      "step": 20
    },
    {
      "epoch": 0.5,
      "grad_norm": 1.2297028303146362,
      "learning_rate": 0.0002,
      "loss": 3.7072,
      "step": 21
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.25821492075920105,
      "learning_rate": 0.0002,
      "loss": 0.9328,
      "step": 22
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.15906491875648499,
      "learning_rate": 0.0002,
      "loss": 0.8799,
      "step": 23
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.18819023668766022,
      "learning_rate": 0.0002,
      "loss": 0.8159,
      "step": 24
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.16911110281944275,
      "learning_rate": 0.0002,
      "loss": 0.7989,
      "step": 25
    }
  ],
  "logging_steps": 1,
  "max_steps": 41,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 25,
  "total_flos": 3223684174970880.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}