{
  "best_metric": 1.7247449159622192,
  "best_model_checkpoint": "lora_lr/google/gemma-1.1-7b-it/unaligned/checkpoint-1000",
  "epoch": 1.310300703774792,
  "eval_steps": 50,
  "global_step": 1024,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "grad_norm": 1.0390625,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 4.7488,
      "step": 1
    },
    {
      "epoch": 0.0,
      "grad_norm": 0.97265625,
      "learning_rate": 4.000000000000001e-06,
      "loss": 4.5149,
      "step": 2
    },
    {
      "epoch": 0.0,
      "grad_norm": 0.9296875,
      "learning_rate": 6e-06,
      "loss": 4.3836,
      "step": 3
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.0234375,
      "learning_rate": 8.000000000000001e-06,
      "loss": 4.6777,
      "step": 4
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.03125,
      "learning_rate": 1e-05,
      "loss": 4.6688,
      "step": 5
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.94921875,
      "learning_rate": 1.2e-05,
      "loss": 4.5565,
      "step": 6
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.0078125,
      "learning_rate": 1.4000000000000001e-05,
      "loss": 4.5658,
      "step": 7
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.015625,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 4.6618,
      "step": 8
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.0546875,
      "learning_rate": 1.8e-05,
      "loss": 4.6095,
      "step": 9
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.0703125,
      "learning_rate": 2e-05,
      "loss": 4.7608,
      "step": 10
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.0546875,
      "learning_rate": 2.2000000000000003e-05,
      "loss": 4.5421,
      "step": 11
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.9609375,
      "learning_rate": 2.4e-05,
      "loss": 4.2895,
      "step": 12
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.125,
      "learning_rate": 2.6000000000000002e-05,
      "loss": 4.8112,
      "step": 13
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.1328125,
      "learning_rate": 2.8000000000000003e-05,
      "loss": 4.6709,
      "step": 14
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.1484375,
      "learning_rate": 3e-05,
      "loss": 4.5234,
      "step": 15
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.1328125,
      "learning_rate": 3.2000000000000005e-05,
      "loss": 4.3751,
      "step": 16
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.3046875,
      "learning_rate": 3.4000000000000007e-05,
      "loss": 4.5247,
      "step": 17
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.265625,
      "learning_rate": 3.6e-05,
      "loss": 4.3906,
      "step": 18
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.40625,
      "learning_rate": 3.8e-05,
      "loss": 4.3193,
      "step": 19
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.4296875,
      "learning_rate": 4e-05,
      "loss": 4.2987,
      "step": 20
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.4375,
      "learning_rate": 4.2e-05,
      "loss": 4.2167,
      "step": 21
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.6171875,
      "learning_rate": 4.4000000000000006e-05,
      "loss": 4.3088,
      "step": 22
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.6640625,
      "learning_rate": 4.600000000000001e-05,
      "loss": 4.2658,
      "step": 23
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.734375,
      "learning_rate": 4.8e-05,
      "loss": 4.1625,
      "step": 24
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.8203125,
      "learning_rate": 5e-05,
      "loss": 4.0392,
      "step": 25
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.8828125,
      "learning_rate": 5.2000000000000004e-05,
      "loss": 3.9772,
      "step": 26
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.8828125,
      "learning_rate": 5.4000000000000005e-05,
      "loss": 3.9029,
      "step": 27
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.875,
      "learning_rate": 5.6000000000000006e-05,
      "loss": 3.8107,
      "step": 28
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.96875,
      "learning_rate": 5.8e-05,
      "loss": 3.8199,
      "step": 29
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.953125,
      "learning_rate": 6e-05,
      "loss": 3.5035,
      "step": 30
    },
    {
      "epoch": 0.04,
      "grad_norm": 2.03125,
      "learning_rate": 6.2e-05,
      "loss": 3.6236,
      "step": 31
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.796875,
      "learning_rate": 6.400000000000001e-05,
      "loss": 3.3359,
      "step": 32
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.7578125,
      "learning_rate": 6.6e-05,
      "loss": 3.3019,
      "step": 33
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.7109375,
      "learning_rate": 6.800000000000001e-05,
      "loss": 3.2003,
      "step": 34
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.6640625,
      "learning_rate": 7e-05,
      "loss": 3.1585,
      "step": 35
    },
    {
      "epoch": 0.05,
      "grad_norm": 1.53125,
      "learning_rate": 7.2e-05,
      "loss": 2.9377,
      "step": 36
    },
    {
      "epoch": 0.05,
      "grad_norm": 1.5625,
      "learning_rate": 7.4e-05,
      "loss": 3.13,
      "step": 37
    },
    {
      "epoch": 0.05,
      "grad_norm": 1.453125,
      "learning_rate": 7.6e-05,
      "loss": 2.9445,
      "step": 38
    },
    {
      "epoch": 0.05,
      "grad_norm": 1.21875,
      "learning_rate": 7.800000000000001e-05,
      "loss": 2.7348,
      "step": 39
    },
    {
      "epoch": 0.05,
      "grad_norm": 1.1171875,
      "learning_rate": 8e-05,
      "loss": 2.6842,
      "step": 40
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.90625,
      "learning_rate": 8.2e-05,
      "loss": 2.6921,
      "step": 41
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.9453125,
      "learning_rate": 8.4e-05,
      "loss": 2.6504,
      "step": 42
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.125,
      "learning_rate": 8.6e-05,
      "loss": 2.4995,
      "step": 43
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.34375,
      "learning_rate": 8.800000000000001e-05,
      "loss": 2.4545,
      "step": 44
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.28125,
      "learning_rate": 9e-05,
      "loss": 2.4311,
      "step": 45
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.640625,
      "learning_rate": 9.200000000000001e-05,
      "loss": 2.4045,
      "step": 46
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.54296875,
      "learning_rate": 9.4e-05,
      "loss": 2.413,
      "step": 47
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.578125,
      "learning_rate": 9.6e-05,
      "loss": 2.3691,
      "step": 48
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.65234375,
      "learning_rate": 9.8e-05,
      "loss": 2.2936,
      "step": 49
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.62109375,
      "learning_rate": 0.0001,
      "loss": 2.3459,
      "step": 50
    },
    {
      "epoch": 0.06,
      "eval_loss": 2.298555374145508,
      "eval_runtime": 125.5458,
      "eval_samples_per_second": 39.826,
      "eval_steps_per_second": 1.251,
      "step": 50
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.5859375,
      "learning_rate": 0.00010200000000000001,
      "loss": 2.458,
      "step": 51
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.61328125,
      "learning_rate": 0.00010400000000000001,
      "loss": 2.2395,
      "step": 52
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.55859375,
      "learning_rate": 0.00010600000000000002,
      "loss": 2.2084,
      "step": 53
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.83203125,
      "learning_rate": 0.00010800000000000001,
      "loss": 2.2806,
      "step": 54
    },
    {
      "epoch": 0.07,
      "grad_norm": 1.15625,
      "learning_rate": 0.00011000000000000002,
      "loss": 2.1546,
      "step": 55
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.91015625,
      "learning_rate": 0.00011200000000000001,
      "loss": 2.2027,
      "step": 56
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.6640625,
      "learning_rate": 0.00011399999999999999,
      "loss": 2.1795,
      "step": 57
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.5,
      "learning_rate": 0.000116,
      "loss": 2.1918,
      "step": 58
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.39453125,
      "learning_rate": 0.000118,
      "loss": 2.143,
      "step": 59
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.4375,
      "learning_rate": 0.00012,
      "loss": 2.1451,
      "step": 60
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.388671875,
      "learning_rate": 0.000122,
      "loss": 2.1542,
      "step": 61
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.3984375,
      "learning_rate": 0.000124,
      "loss": 2.1125,
      "step": 62
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.6953125,
      "learning_rate": 0.000126,
      "loss": 2.1123,
      "step": 63
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.451171875,
      "learning_rate": 0.00012800000000000002,
      "loss": 2.0354,
      "step": 64
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.451171875,
      "learning_rate": 0.00013000000000000002,
      "loss": 2.1228,
      "step": 65
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.77734375,
      "learning_rate": 0.000132,
      "loss": 2.1666,
      "step": 66
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.455078125,
      "learning_rate": 0.000134,
      "loss": 2.0429,
      "step": 67
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.384765625,
      "learning_rate": 0.00013600000000000003,
      "loss": 2.0367,
      "step": 68
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.365234375,
      "learning_rate": 0.000138,
      "loss": 2.0139,
      "step": 69
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.3671875,
      "learning_rate": 0.00014,
      "loss": 2.0284,
      "step": 70
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.404296875,
      "learning_rate": 0.000142,
      "loss": 2.0303,
      "step": 71
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.3515625,
      "learning_rate": 0.000144,
      "loss": 2.0645,
      "step": 72
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.365234375,
      "learning_rate": 0.000146,
      "loss": 2.001,
      "step": 73
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.3046875,
      "learning_rate": 0.000148,
      "loss": 2.0946,
      "step": 74
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.314453125,
      "learning_rate": 0.00015000000000000001,
      "loss": 1.9969,
      "step": 75
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.419921875,
      "learning_rate": 0.000152,
      "loss": 1.9911,
      "step": 76
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.328125,
      "learning_rate": 0.000154,
      "loss": 2.0011,
      "step": 77
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.234375,
      "learning_rate": 0.00015600000000000002,
      "loss": 1.9893,
      "step": 78
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.380859375,
      "learning_rate": 0.00015800000000000002,
      "loss": 1.9336,
      "step": 79
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.2890625,
      "learning_rate": 0.00016,
      "loss": 1.9876,
      "step": 80
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.255859375,
      "learning_rate": 0.000162,
      "loss": 1.9679,
      "step": 81
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.23828125,
      "learning_rate": 0.000164,
      "loss": 1.9157,
      "step": 82
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.373046875,
      "learning_rate": 0.000166,
      "loss": 1.9939,
      "step": 83
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.255859375,
      "learning_rate": 0.000168,
      "loss": 1.9457,
      "step": 84
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.26171875,
      "learning_rate": 0.00017,
      "loss": 1.9924,
      "step": 85
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.2275390625,
      "learning_rate": 0.000172,
      "loss": 1.8708,
      "step": 86
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.26953125,
      "learning_rate": 0.000174,
      "loss": 1.946,
      "step": 87
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.255859375,
      "learning_rate": 0.00017600000000000002,
      "loss": 1.9743,
      "step": 88
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.2490234375,
      "learning_rate": 0.00017800000000000002,
      "loss": 1.9257,
      "step": 89
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.26953125,
      "learning_rate": 0.00018,
      "loss": 1.9041,
      "step": 90
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.265625,
      "learning_rate": 0.000182,
      "loss": 1.9502,
      "step": 91
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.240234375,
      "learning_rate": 0.00018400000000000003,
      "loss": 1.8514,
      "step": 92
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.255859375,
      "learning_rate": 0.00018600000000000002,
      "loss": 2.0176,
      "step": 93
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.298828125,
      "learning_rate": 0.000188,
      "loss": 1.9056,
      "step": 94
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.26171875,
      "learning_rate": 0.00019,
      "loss": 1.9149,
      "step": 95
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.212890625,
      "learning_rate": 0.000192,
      "loss": 1.9587,
      "step": 96
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.26953125,
      "learning_rate": 0.000194,
      "loss": 1.9103,
      "step": 97
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.28515625,
      "learning_rate": 0.000196,
      "loss": 1.8814,
      "step": 98
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.25,
      "learning_rate": 0.00019800000000000002,
      "loss": 1.842,
      "step": 99
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.205078125,
      "learning_rate": 0.0002,
      "loss": 1.8251,
      "step": 100
    },
    {
      "epoch": 0.13,
      "eval_loss": 1.8968226909637451,
      "eval_runtime": 125.222,
      "eval_samples_per_second": 39.929,
      "eval_steps_per_second": 1.254,
      "step": 100
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.275390625,
      "learning_rate": 0.00019978354978354978,
      "loss": 1.84,
      "step": 101
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.2236328125,
      "learning_rate": 0.00019956709956709957,
      "loss": 1.9066,
      "step": 102
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.244140625,
      "learning_rate": 0.00019935064935064936,
      "loss": 1.9553,
      "step": 103
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.2451171875,
      "learning_rate": 0.00019913419913419916,
      "loss": 1.922,
      "step": 104
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.255859375,
      "learning_rate": 0.00019891774891774892,
      "loss": 1.8927,
      "step": 105
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.21484375,
      "learning_rate": 0.00019870129870129872,
      "loss": 1.9244,
      "step": 106
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.265625,
      "learning_rate": 0.0001984848484848485,
      "loss": 1.8875,
      "step": 107
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.388671875,
      "learning_rate": 0.00019826839826839827,
      "loss": 1.837,
      "step": 108
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.228515625,
      "learning_rate": 0.00019805194805194807,
      "loss": 1.8613,
      "step": 109
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.2216796875,
      "learning_rate": 0.00019783549783549783,
      "loss": 1.9369,
      "step": 110
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.234375,
      "learning_rate": 0.00019761904761904763,
      "loss": 1.8546,
      "step": 111
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.3125,
      "learning_rate": 0.00019740259740259742,
      "loss": 1.8774,
      "step": 112
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.28125,
      "learning_rate": 0.0001971861471861472,
      "loss": 1.8799,
      "step": 113
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.224609375,
      "learning_rate": 0.00019696969696969698,
      "loss": 1.8776,
      "step": 114
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.2412109375,
      "learning_rate": 0.00019675324675324675,
      "loss": 1.8823,
      "step": 115
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.234375,
      "learning_rate": 0.00019653679653679654,
      "loss": 1.8877,
      "step": 116
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.259765625,
      "learning_rate": 0.00019632034632034633,
      "loss": 1.9692,
      "step": 117
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.208984375,
      "learning_rate": 0.00019610389610389613,
      "loss": 1.8985,
      "step": 118
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.248046875,
      "learning_rate": 0.0001958874458874459,
      "loss": 1.8564,
      "step": 119
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.2109375,
      "learning_rate": 0.00019567099567099566,
      "loss": 1.9001,
      "step": 120
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.2080078125,
      "learning_rate": 0.00019545454545454548,
      "loss": 1.879,
      "step": 121
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.326171875,
      "learning_rate": 0.00019523809523809525,
      "loss": 1.8385,
      "step": 122
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.24609375,
      "learning_rate": 0.00019502164502164504,
      "loss": 1.8581,
      "step": 123
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.2294921875,
      "learning_rate": 0.0001948051948051948,
      "loss": 1.8517,
      "step": 124
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.21875,
      "learning_rate": 0.0001945887445887446,
      "loss": 1.8695,
      "step": 125
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.3125,
      "learning_rate": 0.0001943722943722944,
      "loss": 1.8675,
      "step": 126
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.220703125,
      "learning_rate": 0.00019415584415584416,
      "loss": 1.9089,
      "step": 127
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.2314453125,
      "learning_rate": 0.00019393939393939395,
      "loss": 1.831,
      "step": 128
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.2734375,
      "learning_rate": 0.00019372294372294372,
      "loss": 1.932,
      "step": 129
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.2216796875,
      "learning_rate": 0.00019350649350649354,
      "loss": 1.917,
      "step": 130
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.28125,
      "learning_rate": 0.0001932900432900433,
      "loss": 1.8179,
      "step": 131
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.294921875,
      "learning_rate": 0.00019307359307359307,
      "loss": 1.8764,
      "step": 132
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.26171875,
      "learning_rate": 0.00019285714285714286,
      "loss": 1.8926,
      "step": 133
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.24609375,
      "learning_rate": 0.00019264069264069266,
      "loss": 1.9443,
      "step": 134
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.251953125,
      "learning_rate": 0.00019242424242424245,
      "loss": 1.8702,
      "step": 135
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.32421875,
      "learning_rate": 0.00019220779220779222,
      "loss": 1.8728,
      "step": 136
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.287109375,
      "learning_rate": 0.000191991341991342,
      "loss": 1.7953,
      "step": 137
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.283203125,
      "learning_rate": 0.00019177489177489178,
      "loss": 1.7883,
      "step": 138
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.302734375,
      "learning_rate": 0.00019155844155844157,
      "loss": 1.8741,
      "step": 139
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.25390625,
      "learning_rate": 0.00019134199134199136,
      "loss": 1.8094,
      "step": 140
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.302734375,
      "learning_rate": 0.00019112554112554113,
      "loss": 1.8629,
      "step": 141
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.2333984375,
      "learning_rate": 0.00019090909090909092,
      "loss": 1.855,
      "step": 142
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.236328125,
      "learning_rate": 0.0001906926406926407,
      "loss": 1.848,
      "step": 143
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.265625,
      "learning_rate": 0.00019047619047619048,
      "loss": 1.8853,
      "step": 144
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.47265625,
      "learning_rate": 0.00019025974025974027,
      "loss": 1.8435,
      "step": 145
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.263671875,
      "learning_rate": 0.00019004329004329004,
      "loss": 1.8619,
      "step": 146
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.37890625,
      "learning_rate": 0.00018982683982683983,
      "loss": 1.8298,
      "step": 147
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.236328125,
      "learning_rate": 0.00018961038961038963,
      "loss": 1.7478,
      "step": 148
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.275390625,
      "learning_rate": 0.00018939393939393942,
      "loss": 1.906,
      "step": 149
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.3046875,
      "learning_rate": 0.0001891774891774892,
      "loss": 1.9013,
      "step": 150
    },
    {
      "epoch": 0.19,
      "eval_loss": 1.8299968242645264,
      "eval_runtime": 125.2613,
      "eval_samples_per_second": 39.917,
      "eval_steps_per_second": 1.253,
      "step": 150
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.29296875,
      "learning_rate": 0.00018896103896103895,
      "loss": 1.7772,
      "step": 151
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.2490234375,
      "learning_rate": 0.00018874458874458875,
      "loss": 1.9428,
      "step": 152
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.2314453125,
      "learning_rate": 0.00018852813852813854,
      "loss": 1.8121,
      "step": 153
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.29296875,
      "learning_rate": 0.00018831168831168833,
      "loss": 1.7921,
      "step": 154
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.357421875,
      "learning_rate": 0.0001880952380952381,
      "loss": 1.846,
      "step": 155
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.2236328125,
      "learning_rate": 0.0001878787878787879,
      "loss": 1.7703,
      "step": 156
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.2119140625,
      "learning_rate": 0.00018766233766233769,
      "loss": 1.8048,
      "step": 157
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.33203125,
      "learning_rate": 0.00018744588744588745,
      "loss": 1.8353,
      "step": 158
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.26953125,
      "learning_rate": 0.00018722943722943725,
      "loss": 1.8538,
      "step": 159
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.259765625,
      "learning_rate": 0.000187012987012987,
      "loss": 1.828,
      "step": 160
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.2412109375,
      "learning_rate": 0.0001867965367965368,
      "loss": 1.8555,
      "step": 161
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.3125,
      "learning_rate": 0.0001865800865800866,
      "loss": 1.8,
      "step": 162
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.236328125,
      "learning_rate": 0.00018636363636363636,
      "loss": 1.8241,
      "step": 163
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.25,
      "learning_rate": 0.00018614718614718616,
      "loss": 1.7903,
      "step": 164
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.2431640625,
      "learning_rate": 0.00018593073593073592,
      "loss": 1.7279,
      "step": 165
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.33984375,
      "learning_rate": 0.00018571428571428572,
      "loss": 1.8231,
      "step": 166
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.2451171875,
      "learning_rate": 0.0001854978354978355,
      "loss": 1.8534,
      "step": 167
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.2470703125,
      "learning_rate": 0.0001852813852813853,
      "loss": 1.9175,
      "step": 168
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.341796875,
      "learning_rate": 0.00018506493506493507,
      "loss": 1.7986,
      "step": 169
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.294921875,
      "learning_rate": 0.00018484848484848484,
      "loss": 1.9097,
      "step": 170
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.3046875,
      "learning_rate": 0.00018463203463203466,
      "loss": 1.8232,
      "step": 171
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.38671875,
      "learning_rate": 0.00018441558441558442,
      "loss": 1.7505,
      "step": 172
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.283203125,
      "learning_rate": 0.00018419913419913422,
      "loss": 1.8454,
      "step": 173
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.306640625,
      "learning_rate": 0.00018398268398268398,
      "loss": 1.8028,
      "step": 174
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.28125,
      "learning_rate": 0.00018376623376623378,
      "loss": 1.9089,
      "step": 175
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.26953125,
      "learning_rate": 0.00018354978354978357,
      "loss": 1.8681,
      "step": 176
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.45703125,
      "learning_rate": 0.00018333333333333334,
      "loss": 1.8012,
      "step": 177
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.376953125,
      "learning_rate": 0.00018311688311688313,
      "loss": 1.7399,
      "step": 178
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.3125,
      "learning_rate": 0.0001829004329004329,
      "loss": 1.8339,
      "step": 179
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.470703125,
      "learning_rate": 0.00018268398268398272,
      "loss": 1.8236,
      "step": 180
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.466796875,
      "learning_rate": 0.00018246753246753248,
      "loss": 1.8088,
      "step": 181
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.2890625,
      "learning_rate": 0.00018225108225108225,
      "loss": 1.77,
      "step": 182
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.453125,
      "learning_rate": 0.00018203463203463204,
      "loss": 1.7954,
      "step": 183
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.53125,
      "learning_rate": 0.00018181818181818183,
      "loss": 1.7682,
      "step": 184
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.248046875,
      "learning_rate": 0.00018160173160173163,
      "loss": 1.9032,
      "step": 185
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.2431640625,
      "learning_rate": 0.0001813852813852814,
      "loss": 1.8145,
      "step": 186
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.482421875,
      "learning_rate": 0.0001811688311688312,
      "loss": 1.8433,
      "step": 187
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.3515625,
      "learning_rate": 0.00018095238095238095,
      "loss": 1.7845,
      "step": 188
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.306640625,
      "learning_rate": 0.00018073593073593075,
      "loss": 1.8512,
      "step": 189
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.328125,
      "learning_rate": 0.00018051948051948054,
      "loss": 1.8208,
      "step": 190
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.28515625,
      "learning_rate": 0.0001803030303030303,
      "loss": 1.8353,
      "step": 191
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.251953125,
      "learning_rate": 0.0001800865800865801,
      "loss": 1.8627,
      "step": 192
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.275390625,
      "learning_rate": 0.00017987012987012987,
      "loss": 1.7228,
      "step": 193
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.279296875,
      "learning_rate": 0.00017965367965367966,
      "loss": 1.878,
      "step": 194
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.255859375,
      "learning_rate": 0.00017943722943722945,
      "loss": 1.7724,
      "step": 195
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.251953125,
      "learning_rate": 0.00017922077922077922,
      "loss": 1.7845,
      "step": 196
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.326171875,
      "learning_rate": 0.000179004329004329,
      "loss": 1.7848,
      "step": 197
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.265625,
      "learning_rate": 0.0001787878787878788,
      "loss": 1.808,
      "step": 198
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.310546875,
      "learning_rate": 0.0001785714285714286,
      "loss": 1.834,
      "step": 199
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.275390625,
      "learning_rate": 0.00017835497835497836,
      "loss": 1.7679,
      "step": 200
    },
    {
      "epoch": 0.26,
      "eval_loss": 1.8024407625198364,
      "eval_runtime": 125.3989,
      "eval_samples_per_second": 39.873,
      "eval_steps_per_second": 1.252,
      "step": 200
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.283203125,
      "learning_rate": 0.00017813852813852813,
      "loss": 1.8003,
      "step": 201
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.267578125,
      "learning_rate": 0.00017792207792207792,
      "loss": 1.8544,
      "step": 202
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.259765625,
      "learning_rate": 0.00017770562770562772,
      "loss": 1.8894,
      "step": 203
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.28125,
      "learning_rate": 0.0001774891774891775,
      "loss": 1.8496,
      "step": 204
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.265625,
      "learning_rate": 0.00017727272727272728,
      "loss": 1.7319,
      "step": 205
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.29296875,
      "learning_rate": 0.00017705627705627707,
      "loss": 1.8156,
      "step": 206
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.294921875,
      "learning_rate": 0.00017683982683982684,
      "loss": 1.6916,
      "step": 207
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.267578125,
      "learning_rate": 0.00017662337662337663,
      "loss": 1.826,
      "step": 208
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.3046875,
      "learning_rate": 0.00017640692640692642,
      "loss": 1.8066,
      "step": 209
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.32421875,
      "learning_rate": 0.0001761904761904762,
      "loss": 1.8503,
      "step": 210
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.279296875,
      "learning_rate": 0.00017597402597402598,
      "loss": 1.8168,
      "step": 211
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.26953125,
      "learning_rate": 0.00017575757575757578,
      "loss": 1.7903,
      "step": 212
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.287109375,
      "learning_rate": 0.00017554112554112554,
      "loss": 1.8139,
      "step": 213
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.26171875,
      "learning_rate": 0.00017532467532467534,
      "loss": 1.7982,
      "step": 214
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.291015625,
      "learning_rate": 0.0001751082251082251,
      "loss": 1.7828,
      "step": 215
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.26953125,
      "learning_rate": 0.0001748917748917749,
      "loss": 1.7416,
      "step": 216
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.34375,
      "learning_rate": 0.0001746753246753247,
      "loss": 1.806,
      "step": 217
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.265625,
      "learning_rate": 0.00017445887445887448,
      "loss": 1.8822,
      "step": 218
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.345703125,
      "learning_rate": 0.00017424242424242425,
      "loss": 1.7818,
      "step": 219
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.275390625,
      "learning_rate": 0.00017402597402597401,
      "loss": 1.843,
      "step": 220
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.33984375,
      "learning_rate": 0.00017380952380952383,
      "loss": 1.8087,
      "step": 221
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.27734375,
      "learning_rate": 0.0001735930735930736,
      "loss": 1.8346,
      "step": 222
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.296875,
      "learning_rate": 0.0001733766233766234,
      "loss": 1.7843,
      "step": 223
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.330078125,
      "learning_rate": 0.00017316017316017316,
      "loss": 1.7974,
      "step": 224
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.26953125,
      "learning_rate": 0.00017294372294372295,
      "loss": 1.7798,
      "step": 225
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.287109375,
      "learning_rate": 0.00017272727272727275,
      "loss": 1.7318,
      "step": 226
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.27734375,
      "learning_rate": 0.0001725108225108225,
      "loss": 1.8244,
      "step": 227
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.345703125,
      "learning_rate": 0.0001722943722943723,
      "loss": 1.8291,
      "step": 228
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.275390625,
      "learning_rate": 0.00017207792207792207,
      "loss": 1.7127,
      "step": 229
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.27734375,
      "learning_rate": 0.00017186147186147187,
      "loss": 1.8416,
      "step": 230
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.283203125,
      "learning_rate": 0.00017164502164502166,
      "loss": 1.8327,
      "step": 231
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.29296875,
      "learning_rate": 0.00017142857142857143,
      "loss": 1.7058,
      "step": 232
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.3046875,
      "learning_rate": 0.00017121212121212122,
      "loss": 1.7438,
      "step": 233
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.298828125,
      "learning_rate": 0.00017099567099567098,
      "loss": 1.822,
      "step": 234
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.275390625,
      "learning_rate": 0.0001707792207792208,
      "loss": 1.8308,
      "step": 235
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.283203125,
      "learning_rate": 0.00017056277056277057,
      "loss": 1.8354,
      "step": 236
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.265625,
      "learning_rate": 0.00017034632034632036,
      "loss": 1.7753,
      "step": 237
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.337890625,
      "learning_rate": 0.00017012987012987013,
      "loss": 1.8091,
      "step": 238
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.3046875,
      "learning_rate": 0.00016991341991341992,
      "loss": 1.8526,
      "step": 239
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.33984375,
      "learning_rate": 0.00016969696969696972,
      "loss": 1.8296,
      "step": 240
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.322265625,
      "learning_rate": 0.00016948051948051948,
      "loss": 1.8309,
      "step": 241
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.310546875,
      "learning_rate": 0.00016926406926406928,
      "loss": 1.7423,
      "step": 242
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.310546875,
      "learning_rate": 0.00016904761904761904,
      "loss": 1.7644,
      "step": 243
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.30859375,
      "learning_rate": 0.00016883116883116884,
      "loss": 1.7392,
      "step": 244
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.349609375,
      "learning_rate": 0.00016861471861471863,
      "loss": 1.8515,
      "step": 245
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.30859375,
      "learning_rate": 0.0001683982683982684,
      "loss": 1.751,
      "step": 246
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.3046875,
      "learning_rate": 0.0001681818181818182,
      "loss": 1.8196,
      "step": 247
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.28515625,
      "learning_rate": 0.00016796536796536798,
      "loss": 1.8556,
      "step": 248
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.390625,
      "learning_rate": 0.00016774891774891778,
      "loss": 1.7934,
      "step": 249
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.29296875,
      "learning_rate": 0.00016753246753246754,
      "loss": 1.753,
      "step": 250
    },
    {
      "epoch": 0.32,
      "eval_loss": 1.7896223068237305,
      "eval_runtime": 125.3191,
      "eval_samples_per_second": 39.898,
      "eval_steps_per_second": 1.253,
      "step": 250
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.2890625,
      "learning_rate": 0.0001673160173160173,
      "loss": 1.8485,
      "step": 251
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.33984375,
      "learning_rate": 0.0001670995670995671,
      "loss": 1.7847,
      "step": 252
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.294921875,
      "learning_rate": 0.0001668831168831169,
      "loss": 1.688,
      "step": 253
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.271484375,
      "learning_rate": 0.0001666666666666667,
      "loss": 1.7079,
      "step": 254
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.33203125,
      "learning_rate": 0.00016645021645021645,
      "loss": 1.7858,
      "step": 255
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.310546875,
      "learning_rate": 0.00016623376623376625,
      "loss": 1.8125,
      "step": 256
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.32421875,
      "learning_rate": 0.00016601731601731601,
      "loss": 1.7826,
      "step": 257
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.3515625,
      "learning_rate": 0.0001658008658008658,
      "loss": 1.7352,
      "step": 258
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.357421875,
      "learning_rate": 0.0001655844155844156,
      "loss": 1.7863,
      "step": 259
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.283203125,
      "learning_rate": 0.00016536796536796537,
      "loss": 1.7868,
      "step": 260
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.279296875,
      "learning_rate": 0.00016515151515151516,
      "loss": 1.7958,
      "step": 261
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.29296875,
      "learning_rate": 0.00016493506493506495,
      "loss": 1.7685,
      "step": 262
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.279296875,
      "learning_rate": 0.00016471861471861472,
      "loss": 1.7561,
      "step": 263
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.2890625,
      "learning_rate": 0.0001645021645021645,
      "loss": 1.7634,
      "step": 264
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.296875,
      "learning_rate": 0.00016428571428571428,
      "loss": 1.8092,
      "step": 265
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.31640625,
      "learning_rate": 0.00016406926406926407,
      "loss": 1.8324,
      "step": 266
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.28125,
      "learning_rate": 0.00016385281385281387,
      "loss": 1.7221,
      "step": 267
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.298828125,
      "learning_rate": 0.00016363636363636366,
      "loss": 1.7796,
      "step": 268
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.302734375,
      "learning_rate": 0.00016341991341991343,
      "loss": 1.7713,
      "step": 269
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.265625,
      "learning_rate": 0.0001632034632034632,
      "loss": 1.8178,
      "step": 270
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.404296875,
      "learning_rate": 0.000162987012987013,
      "loss": 1.8104,
      "step": 271
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.3125,
      "learning_rate": 0.00016277056277056278,
      "loss": 1.8387,
      "step": 272
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.310546875,
      "learning_rate": 0.00016255411255411257,
      "loss": 1.8897,
      "step": 273
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.28125,
      "learning_rate": 0.00016233766233766234,
      "loss": 1.8162,
      "step": 274
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.40625,
      "learning_rate": 0.00016212121212121213,
      "loss": 1.8086,
      "step": 275
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.283203125,
      "learning_rate": 0.00016190476190476192,
      "loss": 1.9186,
      "step": 276
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.345703125,
      "learning_rate": 0.0001616883116883117,
      "loss": 1.8123,
      "step": 277
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.27734375,
      "learning_rate": 0.00016147186147186148,
      "loss": 1.8115,
      "step": 278
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.33203125,
      "learning_rate": 0.00016125541125541125,
      "loss": 1.8785,
      "step": 279
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.3125,
      "learning_rate": 0.00016103896103896104,
      "loss": 1.805,
      "step": 280
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.3046875,
      "learning_rate": 0.00016082251082251084,
      "loss": 1.8656,
      "step": 281
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.3515625,
      "learning_rate": 0.0001606060606060606,
      "loss": 1.7848,
      "step": 282
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.3125,
      "learning_rate": 0.0001603896103896104,
      "loss": 1.7817,
      "step": 283
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.30078125,
      "learning_rate": 0.00016017316017316016,
      "loss": 1.8217,
      "step": 284
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.318359375,
      "learning_rate": 0.00015995670995670998,
      "loss": 1.8771,
      "step": 285
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.31640625,
      "learning_rate": 0.00015974025974025975,
      "loss": 1.8028,
      "step": 286
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.291015625,
      "learning_rate": 0.00015952380952380954,
      "loss": 1.8331,
      "step": 287
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.357421875,
      "learning_rate": 0.0001593073593073593,
      "loss": 1.7528,
      "step": 288
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.267578125,
      "learning_rate": 0.0001590909090909091,
      "loss": 1.8012,
      "step": 289
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.2734375,
      "learning_rate": 0.0001588744588744589,
      "loss": 1.7787,
      "step": 290
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.3125,
      "learning_rate": 0.00015865800865800866,
      "loss": 1.803,
      "step": 291
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.4375,
      "learning_rate": 0.00015844155844155845,
      "loss": 1.7466,
      "step": 292
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.30078125,
      "learning_rate": 0.00015822510822510822,
      "loss": 1.8551,
      "step": 293
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.279296875,
      "learning_rate": 0.00015800865800865801,
      "loss": 1.7932,
      "step": 294
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.359375,
      "learning_rate": 0.0001577922077922078,
      "loss": 1.7295,
      "step": 295
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.318359375,
      "learning_rate": 0.00015757575757575757,
      "loss": 1.7984,
      "step": 296
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.40234375,
      "learning_rate": 0.00015735930735930737,
      "loss": 1.8666,
      "step": 297
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.287109375,
      "learning_rate": 0.00015714285714285716,
      "loss": 1.7149,
      "step": 298
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.462890625,
      "learning_rate": 0.00015692640692640695,
      "loss": 1.7447,
      "step": 299
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.4296875,
      "learning_rate": 0.00015670995670995672,
      "loss": 1.8199,
      "step": 300
    },
    {
      "epoch": 0.38,
      "eval_loss": 1.7805144786834717,
      "eval_runtime": 125.5041,
      "eval_samples_per_second": 39.839,
      "eval_steps_per_second": 1.251,
      "step": 300
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.29296875,
      "learning_rate": 0.00015649350649350649,
      "loss": 1.7937,
      "step": 301
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.400390625,
      "learning_rate": 0.00015627705627705628,
      "loss": 1.7617,
      "step": 302
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.435546875,
      "learning_rate": 0.00015606060606060607,
      "loss": 1.7755,
      "step": 303
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.326171875,
      "learning_rate": 0.00015584415584415587,
      "loss": 1.862,
      "step": 304
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.365234375,
      "learning_rate": 0.00015562770562770563,
      "loss": 1.8572,
      "step": 305
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.330078125,
      "learning_rate": 0.00015541125541125543,
      "loss": 1.7678,
      "step": 306
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.359375,
      "learning_rate": 0.0001551948051948052,
      "loss": 1.7435,
      "step": 307
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.326171875,
      "learning_rate": 0.00015497835497835498,
      "loss": 1.7535,
      "step": 308
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.39453125,
      "learning_rate": 0.00015476190476190478,
      "loss": 1.7495,
      "step": 309
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.28515625,
      "learning_rate": 0.00015454545454545454,
      "loss": 1.7107,
      "step": 310
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.298828125,
      "learning_rate": 0.00015432900432900434,
      "loss": 1.7419,
      "step": 311
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.296875,
      "learning_rate": 0.00015411255411255413,
      "loss": 1.7466,
      "step": 312
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.3046875,
      "learning_rate": 0.0001538961038961039,
      "loss": 1.8045,
      "step": 313
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.369140625,
      "learning_rate": 0.0001536796536796537,
      "loss": 1.7756,
      "step": 314
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.35546875,
      "learning_rate": 0.00015346320346320346,
      "loss": 1.8018,
      "step": 315
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.287109375,
      "learning_rate": 0.00015324675324675325,
      "loss": 1.8246,
      "step": 316
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.28515625,
      "learning_rate": 0.00015303030303030304,
      "loss": 1.761,
      "step": 317
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.29296875,
      "learning_rate": 0.00015281385281385284,
      "loss": 1.773,
      "step": 318
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.345703125,
      "learning_rate": 0.0001525974025974026,
      "loss": 1.7889,
      "step": 319
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.38671875,
      "learning_rate": 0.00015238095238095237,
      "loss": 1.827,
      "step": 320
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.333984375,
      "learning_rate": 0.0001521645021645022,
      "loss": 1.7928,
      "step": 321
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.33203125,
      "learning_rate": 0.00015194805194805196,
      "loss": 1.7618,
      "step": 322
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.36328125,
      "learning_rate": 0.00015173160173160175,
      "loss": 1.7576,
      "step": 323
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.2890625,
      "learning_rate": 0.00015151515151515152,
      "loss": 1.6683,
      "step": 324
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.27734375,
      "learning_rate": 0.0001512987012987013,
      "loss": 1.7967,
      "step": 325
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.28125,
      "learning_rate": 0.0001510822510822511,
      "loss": 1.8209,
      "step": 326
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.39453125,
      "learning_rate": 0.00015086580086580087,
      "loss": 1.8743,
      "step": 327
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.27734375,
      "learning_rate": 0.00015064935064935066,
      "loss": 1.8559,
      "step": 328
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.337890625,
      "learning_rate": 0.00015043290043290043,
      "loss": 1.7791,
      "step": 329
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.353515625,
      "learning_rate": 0.00015021645021645022,
      "loss": 1.8626,
      "step": 330
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.4140625,
      "learning_rate": 0.00015000000000000001,
      "loss": 1.8169,
      "step": 331
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.373046875,
      "learning_rate": 0.00014978354978354978,
      "loss": 1.8006,
      "step": 332
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.302734375,
      "learning_rate": 0.00014956709956709957,
      "loss": 1.8818,
      "step": 333
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.423828125,
      "learning_rate": 0.00014935064935064934,
      "loss": 1.7898,
      "step": 334
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.314453125,
      "learning_rate": 0.00014913419913419916,
      "loss": 1.7639,
      "step": 335
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.375,
      "learning_rate": 0.00014891774891774893,
      "loss": 1.8276,
      "step": 336
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.34375,
      "learning_rate": 0.00014870129870129872,
      "loss": 1.8718,
      "step": 337
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.310546875,
      "learning_rate": 0.00014848484848484849,
      "loss": 1.7142,
      "step": 338
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.330078125,
      "learning_rate": 0.00014826839826839828,
      "loss": 1.8261,
      "step": 339
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.328125,
      "learning_rate": 0.00014805194805194807,
      "loss": 1.8177,
      "step": 340
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.357421875,
      "learning_rate": 0.00014783549783549784,
      "loss": 1.739,
      "step": 341
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.291015625,
      "learning_rate": 0.00014761904761904763,
      "loss": 1.7867,
      "step": 342
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.380859375,
      "learning_rate": 0.0001474025974025974,
      "loss": 1.7825,
      "step": 343
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.29296875,
      "learning_rate": 0.0001471861471861472,
      "loss": 1.7784,
      "step": 344
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.2890625,
      "learning_rate": 0.00014696969696969698,
      "loss": 1.8077,
      "step": 345
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.296875,
      "learning_rate": 0.00014675324675324675,
      "loss": 1.7616,
      "step": 346
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.2890625,
      "learning_rate": 0.00014653679653679654,
      "loss": 1.7738,
      "step": 347
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.345703125,
      "learning_rate": 0.00014632034632034634,
      "loss": 1.7019,
      "step": 348
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.33984375,
      "learning_rate": 0.00014610389610389613,
      "loss": 1.6911,
      "step": 349
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.3359375,
      "learning_rate": 0.0001458874458874459,
      "loss": 1.7708,
      "step": 350
    },
    {
      "epoch": 0.45,
      "eval_loss": 1.7709890604019165,
      "eval_runtime": 125.3163,
      "eval_samples_per_second": 39.899,
      "eval_steps_per_second": 1.253,
      "step": 350
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.349609375,
      "learning_rate": 0.00014567099567099566,
      "loss": 1.76,
      "step": 351
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.38671875,
      "learning_rate": 0.00014545454545454546,
      "loss": 1.7913,
      "step": 352
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.421875,
      "learning_rate": 0.00014523809523809525,
      "loss": 1.8083,
      "step": 353
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.388671875,
      "learning_rate": 0.00014502164502164504,
      "loss": 1.7886,
      "step": 354
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.265625,
      "learning_rate": 0.0001448051948051948,
      "loss": 1.8125,
      "step": 355
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.353515625,
      "learning_rate": 0.00014458874458874458,
      "loss": 1.6922,
      "step": 356
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.41796875,
      "learning_rate": 0.00014437229437229437,
      "loss": 1.6483,
      "step": 357
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.38671875,
      "learning_rate": 0.00014415584415584416,
      "loss": 1.805,
      "step": 358
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.29296875,
      "learning_rate": 0.00014393939393939396,
      "loss": 1.7458,
      "step": 359
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.416015625,
      "learning_rate": 0.00014372294372294372,
      "loss": 1.7989,
      "step": 360
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.404296875,
      "learning_rate": 0.00014350649350649352,
      "loss": 1.7271,
      "step": 361
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.296875,
      "learning_rate": 0.0001432900432900433,
      "loss": 1.7804,
      "step": 362
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.296875,
      "learning_rate": 0.00014307359307359307,
      "loss": 1.8581,
      "step": 363
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.353515625,
      "learning_rate": 0.00014285714285714287,
      "loss": 1.757,
      "step": 364
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.3203125,
      "learning_rate": 0.00014264069264069263,
      "loss": 1.7536,
      "step": 365
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.3203125,
      "learning_rate": 0.00014242424242424243,
      "loss": 1.7191,
      "step": 366
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.384765625,
      "learning_rate": 0.00014220779220779222,
      "loss": 1.7741,
      "step": 367
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.3359375,
      "learning_rate": 0.00014199134199134201,
      "loss": 1.7626,
      "step": 368
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.4140625,
      "learning_rate": 0.00014177489177489178,
      "loss": 1.8351,
      "step": 369
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.4921875,
      "learning_rate": 0.00014155844155844155,
      "loss": 1.7943,
      "step": 370
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.28515625,
      "learning_rate": 0.00014134199134199137,
      "loss": 1.7729,
      "step": 371
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.36328125,
      "learning_rate": 0.00014112554112554113,
      "loss": 1.853,
      "step": 372
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.373046875,
      "learning_rate": 0.00014090909090909093,
      "loss": 1.7805,
      "step": 373
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.375,
      "learning_rate": 0.0001406926406926407,
      "loss": 1.805,
      "step": 374
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.365234375,
      "learning_rate": 0.00014047619047619049,
      "loss": 1.7717,
      "step": 375
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.375,
      "learning_rate": 0.00014025974025974028,
      "loss": 1.793,
      "step": 376
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.353515625,
      "learning_rate": 0.00014004329004329005,
      "loss": 1.8017,
      "step": 377
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.359375,
      "learning_rate": 0.00013982683982683984,
      "loss": 1.758,
      "step": 378
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.35546875,
      "learning_rate": 0.0001396103896103896,
      "loss": 1.7761,
      "step": 379
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.33203125,
      "learning_rate": 0.0001393939393939394,
      "loss": 1.7626,
      "step": 380
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.361328125,
      "learning_rate": 0.0001391774891774892,
      "loss": 1.8463,
      "step": 381
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.4140625,
      "learning_rate": 0.00013896103896103896,
      "loss": 1.7563,
      "step": 382
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.318359375,
      "learning_rate": 0.00013874458874458875,
      "loss": 1.7085,
      "step": 383
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.337890625,
      "learning_rate": 0.00013852813852813852,
      "loss": 1.8202,
      "step": 384
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.30078125,
      "learning_rate": 0.00013831168831168834,
      "loss": 1.7618,
      "step": 385
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.3046875,
      "learning_rate": 0.0001380952380952381,
      "loss": 1.8142,
      "step": 386
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.318359375,
      "learning_rate": 0.0001378787878787879,
      "loss": 1.6778,
      "step": 387
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.287109375,
      "learning_rate": 0.00013766233766233766,
      "loss": 1.749,
      "step": 388
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.310546875,
      "learning_rate": 0.00013744588744588746,
      "loss": 1.7236,
      "step": 389
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.322265625,
      "learning_rate": 0.00013722943722943725,
      "loss": 1.7329,
      "step": 390
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.384765625,
      "learning_rate": 0.00013701298701298702,
      "loss": 1.8287,
      "step": 391
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.318359375,
      "learning_rate": 0.0001367965367965368,
      "loss": 1.8173,
      "step": 392
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.5234375,
      "learning_rate": 0.00013658008658008658,
      "loss": 1.8881,
      "step": 393
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.4453125,
      "learning_rate": 0.00013636363636363637,
      "loss": 1.7895,
      "step": 394
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.310546875,
      "learning_rate": 0.00013614718614718616,
      "loss": 1.7531,
      "step": 395
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.337890625,
      "learning_rate": 0.00013593073593073593,
      "loss": 1.7601,
      "step": 396
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.353515625,
      "learning_rate": 0.00013571428571428572,
      "loss": 1.6574,
      "step": 397
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.345703125,
      "learning_rate": 0.0001354978354978355,
      "loss": 1.7577,
      "step": 398
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.455078125,
      "learning_rate": 0.0001352813852813853,
      "loss": 1.6886,
      "step": 399
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.337890625,
      "learning_rate": 0.00013506493506493507,
      "loss": 1.7786,
      "step": 400
    },
    {
      "epoch": 0.51,
      "eval_loss": 1.7625454664230347,
      "eval_runtime": 125.3037,
      "eval_samples_per_second": 39.903,
      "eval_steps_per_second": 1.253,
      "step": 400
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.392578125,
      "learning_rate": 0.00013484848484848484,
      "loss": 1.7421,
      "step": 401
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.53125,
      "learning_rate": 0.00013463203463203463,
      "loss": 1.7626,
      "step": 402
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.34765625,
      "learning_rate": 0.00013441558441558443,
      "loss": 1.7943,
      "step": 403
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.435546875,
      "learning_rate": 0.00013419913419913422,
      "loss": 1.8116,
      "step": 404
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.419921875,
      "learning_rate": 0.000133982683982684,
      "loss": 1.7532,
      "step": 405
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.345703125,
      "learning_rate": 0.00013376623376623375,
      "loss": 1.7478,
      "step": 406
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.40234375,
      "learning_rate": 0.00013354978354978355,
      "loss": 1.8208,
      "step": 407
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.314453125,
      "learning_rate": 0.00013333333333333334,
      "loss": 1.7518,
      "step": 408
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.470703125,
      "learning_rate": 0.00013311688311688313,
      "loss": 1.8063,
      "step": 409
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.419921875,
      "learning_rate": 0.0001329004329004329,
      "loss": 1.7366,
      "step": 410
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.326171875,
      "learning_rate": 0.0001326839826839827,
      "loss": 1.7605,
      "step": 411
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.287109375,
      "learning_rate": 0.00013246753246753249,
      "loss": 1.8454,
      "step": 412
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.376953125,
      "learning_rate": 0.00013225108225108225,
      "loss": 1.7342,
      "step": 413
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.416015625,
      "learning_rate": 0.00013203463203463205,
      "loss": 1.7031,
      "step": 414
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.3984375,
      "learning_rate": 0.0001318181818181818,
      "loss": 1.8397,
      "step": 415
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.369140625,
      "learning_rate": 0.0001316017316017316,
      "loss": 1.8038,
      "step": 416
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.369140625,
      "learning_rate": 0.0001313852813852814,
      "loss": 1.7529,
      "step": 417
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.396484375,
      "learning_rate": 0.0001311688311688312,
      "loss": 1.7746,
      "step": 418
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.31640625,
      "learning_rate": 0.00013095238095238096,
      "loss": 1.8441,
      "step": 419
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.287109375,
      "learning_rate": 0.00013073593073593072,
      "loss": 1.7302,
      "step": 420
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.470703125,
      "learning_rate": 0.00013051948051948052,
      "loss": 1.7149,
      "step": 421
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.330078125,
      "learning_rate": 0.0001303030303030303,
      "loss": 1.6881,
      "step": 422
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.28515625,
      "learning_rate": 0.0001300865800865801,
      "loss": 1.7627,
      "step": 423
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.310546875,
      "learning_rate": 0.00012987012987012987,
      "loss": 1.7386,
      "step": 424
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.310546875,
      "learning_rate": 0.00012965367965367964,
      "loss": 1.7331,
      "step": 425
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.384765625,
      "learning_rate": 0.00012943722943722946,
      "loss": 1.766,
      "step": 426
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.3828125,
      "learning_rate": 0.00012922077922077922,
      "loss": 1.7111,
      "step": 427
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.37890625,
      "learning_rate": 0.00012900432900432902,
      "loss": 1.7874,
      "step": 428
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.380859375,
      "learning_rate": 0.00012878787878787878,
      "loss": 1.7565,
      "step": 429
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.37109375,
      "learning_rate": 0.00012857142857142858,
      "loss": 1.8451,
      "step": 430
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.419921875,
      "learning_rate": 0.00012835497835497837,
      "loss": 1.7024,
      "step": 431
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.294921875,
      "learning_rate": 0.00012813852813852814,
      "loss": 1.7037,
      "step": 432
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.310546875,
      "learning_rate": 0.00012792207792207793,
      "loss": 1.7166,
      "step": 433
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.404296875,
      "learning_rate": 0.0001277056277056277,
      "loss": 1.7534,
      "step": 434
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.3359375,
      "learning_rate": 0.00012748917748917752,
      "loss": 1.7315,
      "step": 435
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.349609375,
      "learning_rate": 0.00012727272727272728,
      "loss": 1.7707,
      "step": 436
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.328125,
      "learning_rate": 0.00012705627705627707,
      "loss": 1.811,
      "step": 437
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.2890625,
      "learning_rate": 0.00012683982683982684,
      "loss": 1.8211,
      "step": 438
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.330078125,
      "learning_rate": 0.00012662337662337663,
      "loss": 1.6992,
      "step": 439
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.33203125,
      "learning_rate": 0.00012640692640692643,
      "loss": 1.8294,
      "step": 440
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.326171875,
      "learning_rate": 0.0001261904761904762,
      "loss": 1.7351,
      "step": 441
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.384765625,
      "learning_rate": 0.000125974025974026,
      "loss": 1.732,
      "step": 442
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.41796875,
      "learning_rate": 0.00012575757575757575,
      "loss": 1.7839,
      "step": 443
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.34375,
      "learning_rate": 0.00012554112554112555,
      "loss": 1.7892,
      "step": 444
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.37890625,
      "learning_rate": 0.00012532467532467534,
      "loss": 1.7428,
      "step": 445
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.3359375,
      "learning_rate": 0.0001251082251082251,
      "loss": 1.8055,
      "step": 446
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.384765625,
      "learning_rate": 0.0001248917748917749,
      "loss": 1.8227,
      "step": 447
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.349609375,
      "learning_rate": 0.00012467532467532467,
      "loss": 1.7273,
      "step": 448
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.328125,
      "learning_rate": 0.00012445887445887449,
      "loss": 1.8338,
      "step": 449
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.4140625,
      "learning_rate": 0.00012424242424242425,
      "loss": 1.71,
      "step": 450
    },
    {
      "epoch": 0.58,
      "eval_loss": 1.7571938037872314,
      "eval_runtime": 125.7305,
      "eval_samples_per_second": 39.768,
      "eval_steps_per_second": 1.249,
      "step": 450
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.408203125,
      "learning_rate": 0.00012402597402597402,
      "loss": 1.6852,
      "step": 451
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.3125,
      "learning_rate": 0.0001238095238095238,
      "loss": 1.7876,
      "step": 452
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.314453125,
      "learning_rate": 0.0001235930735930736,
      "loss": 1.7676,
      "step": 453
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.439453125,
      "learning_rate": 0.0001233766233766234,
      "loss": 1.7567,
      "step": 454
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.294921875,
      "learning_rate": 0.00012316017316017316,
      "loss": 1.7835,
      "step": 455
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.3046875,
      "learning_rate": 0.00012294372294372293,
      "loss": 1.8254,
      "step": 456
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.30078125,
      "learning_rate": 0.00012272727272727272,
      "loss": 1.7415,
      "step": 457
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.3359375,
      "learning_rate": 0.00012251082251082252,
      "loss": 1.7718,
      "step": 458
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.361328125,
      "learning_rate": 0.0001222943722943723,
      "loss": 1.7734,
      "step": 459
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.3125,
      "learning_rate": 0.00012207792207792208,
      "loss": 1.7413,
      "step": 460
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.283203125,
      "learning_rate": 0.00012186147186147187,
      "loss": 1.8832,
      "step": 461
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.328125,
      "learning_rate": 0.00012164502164502165,
      "loss": 1.71,
      "step": 462
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.3203125,
      "learning_rate": 0.00012142857142857143,
      "loss": 1.7611,
      "step": 463
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.373046875,
      "learning_rate": 0.00012121212121212122,
      "loss": 1.809,
      "step": 464
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.29296875,
      "learning_rate": 0.00012099567099567099,
      "loss": 1.6685,
      "step": 465
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.310546875,
      "learning_rate": 0.0001207792207792208,
      "loss": 1.7246,
      "step": 466
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.30859375,
      "learning_rate": 0.00012056277056277056,
      "loss": 1.7187,
      "step": 467
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.296875,
      "learning_rate": 0.00012034632034632037,
      "loss": 1.7165,
      "step": 468
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.353515625,
      "learning_rate": 0.00012012987012987014,
      "loss": 1.7451,
      "step": 469
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.33203125,
      "learning_rate": 0.00011991341991341991,
      "loss": 1.7889,
      "step": 470
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.34375,
      "learning_rate": 0.00011969696969696971,
      "loss": 1.8299,
      "step": 471
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.314453125,
      "learning_rate": 0.00011948051948051949,
      "loss": 1.7567,
      "step": 472
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.3359375,
      "learning_rate": 0.00011926406926406928,
      "loss": 1.7335,
      "step": 473
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.30859375,
      "learning_rate": 0.00011904761904761905,
      "loss": 1.7394,
      "step": 474
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.34765625,
      "learning_rate": 0.00011883116883116883,
      "loss": 1.7834,
      "step": 475
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.322265625,
      "learning_rate": 0.00011861471861471862,
      "loss": 1.7819,
      "step": 476
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.271484375,
      "learning_rate": 0.0001183982683982684,
      "loss": 1.7574,
      "step": 477
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.330078125,
      "learning_rate": 0.0001181818181818182,
      "loss": 1.7031,
      "step": 478
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.30859375,
      "learning_rate": 0.00011796536796536797,
      "loss": 1.7361,
      "step": 479
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.333984375,
      "learning_rate": 0.00011774891774891777,
      "loss": 1.7638,
      "step": 480
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.34375,
      "learning_rate": 0.00011753246753246753,
      "loss": 1.7476,
      "step": 481
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.333984375,
      "learning_rate": 0.00011731601731601731,
      "loss": 1.6884,
      "step": 482
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.296875,
      "learning_rate": 0.0001170995670995671,
      "loss": 1.6966,
      "step": 483
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.306640625,
      "learning_rate": 0.00011688311688311689,
      "loss": 1.7041,
      "step": 484
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.33203125,
      "learning_rate": 0.00011666666666666668,
      "loss": 1.7788,
      "step": 485
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.328125,
      "learning_rate": 0.00011645021645021646,
      "loss": 1.7744,
      "step": 486
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.423828125,
      "learning_rate": 0.00011623376623376625,
      "loss": 1.7492,
      "step": 487
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.318359375,
      "learning_rate": 0.00011601731601731602,
      "loss": 1.7537,
      "step": 488
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.337890625,
      "learning_rate": 0.0001158008658008658,
      "loss": 1.7363,
      "step": 489
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.349609375,
      "learning_rate": 0.00011558441558441559,
      "loss": 1.7846,
      "step": 490
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.310546875,
      "learning_rate": 0.00011536796536796537,
      "loss": 1.7081,
      "step": 491
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.33203125,
      "learning_rate": 0.00011515151515151516,
      "loss": 1.7275,
      "step": 492
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.33203125,
      "learning_rate": 0.00011493506493506494,
      "loss": 1.7196,
      "step": 493
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.30078125,
      "learning_rate": 0.00011471861471861471,
      "loss": 1.6845,
      "step": 494
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.337890625,
      "learning_rate": 0.00011450216450216452,
      "loss": 1.7443,
      "step": 495
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.3359375,
      "learning_rate": 0.00011428571428571428,
      "loss": 1.7104,
      "step": 496
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.380859375,
      "learning_rate": 0.00011406926406926408,
      "loss": 1.7307,
      "step": 497
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.361328125,
      "learning_rate": 0.00011385281385281386,
      "loss": 1.8338,
      "step": 498
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.296875,
      "learning_rate": 0.00011363636363636365,
      "loss": 1.7235,
      "step": 499
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.291015625,
      "learning_rate": 0.00011341991341991343,
      "loss": 1.8493,
      "step": 500
    },
    {
      "epoch": 0.64,
      "eval_loss": 1.7474199533462524,
      "eval_runtime": 125.2746,
      "eval_samples_per_second": 39.912,
      "eval_steps_per_second": 1.253,
      "step": 500
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.33203125,
      "learning_rate": 0.0001132034632034632,
      "loss": 1.7369,
      "step": 501
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.322265625,
      "learning_rate": 0.000112987012987013,
      "loss": 1.7215,
      "step": 502
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.36328125,
      "learning_rate": 0.00011277056277056277,
      "loss": 1.7981,
      "step": 503
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.318359375,
      "learning_rate": 0.00011255411255411256,
      "loss": 1.6949,
      "step": 504
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.38671875,
      "learning_rate": 0.00011233766233766234,
      "loss": 1.7577,
      "step": 505
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.52734375,
      "learning_rate": 0.00011212121212121212,
      "loss": 1.7396,
      "step": 506
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.3359375,
      "learning_rate": 0.00011190476190476191,
      "loss": 1.7094,
      "step": 507
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.314453125,
      "learning_rate": 0.00011168831168831168,
      "loss": 1.7354,
      "step": 508
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.37109375,
      "learning_rate": 0.00011147186147186149,
      "loss": 1.7109,
      "step": 509
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.4140625,
      "learning_rate": 0.00011125541125541125,
      "loss": 1.7809,
      "step": 510
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.455078125,
      "learning_rate": 0.00011103896103896105,
      "loss": 1.7939,
      "step": 511
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.291015625,
      "learning_rate": 0.00011082251082251083,
      "loss": 1.7942,
      "step": 512
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.609375,
      "learning_rate": 0.00011060606060606061,
      "loss": 1.7189,
      "step": 513
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.67578125,
      "learning_rate": 0.0001103896103896104,
      "loss": 1.7662,
      "step": 514
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.353515625,
      "learning_rate": 0.00011017316017316017,
      "loss": 1.7288,
      "step": 515
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.322265625,
      "learning_rate": 0.00010995670995670997,
      "loss": 1.715,
      "step": 516
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.40625,
      "learning_rate": 0.00010974025974025974,
      "loss": 1.6725,
      "step": 517
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.52734375,
      "learning_rate": 0.00010952380952380953,
      "loss": 1.7752,
      "step": 518
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.48828125,
      "learning_rate": 0.00010930735930735931,
      "loss": 1.7345,
      "step": 519
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.3203125,
      "learning_rate": 0.00010909090909090909,
      "loss": 1.7054,
      "step": 520
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.45703125,
      "learning_rate": 0.00010887445887445889,
      "loss": 1.8508,
      "step": 521
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.50390625,
      "learning_rate": 0.00010865800865800865,
      "loss": 1.723,
      "step": 522
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.361328125,
      "learning_rate": 0.00010844155844155846,
      "loss": 1.7008,
      "step": 523
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.330078125,
      "learning_rate": 0.00010822510822510823,
      "loss": 1.6947,
      "step": 524
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.439453125,
      "learning_rate": 0.000108008658008658,
      "loss": 1.8002,
      "step": 525
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.419921875,
      "learning_rate": 0.0001077922077922078,
      "loss": 1.7645,
      "step": 526
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.392578125,
      "learning_rate": 0.00010757575757575758,
      "loss": 1.7568,
      "step": 527
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.34375,
      "learning_rate": 0.00010735930735930737,
      "loss": 1.7684,
      "step": 528
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.337890625,
      "learning_rate": 0.00010714285714285715,
      "loss": 1.7795,
      "step": 529
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.375,
      "learning_rate": 0.00010692640692640694,
      "loss": 1.7471,
      "step": 530
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.369140625,
      "learning_rate": 0.00010670995670995671,
      "loss": 1.8058,
      "step": 531
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.326171875,
      "learning_rate": 0.00010649350649350649,
      "loss": 1.673,
      "step": 532
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.365234375,
      "learning_rate": 0.00010627705627705628,
      "loss": 1.728,
      "step": 533
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.423828125,
      "learning_rate": 0.00010606060606060606,
      "loss": 1.6786,
      "step": 534
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.37890625,
      "learning_rate": 0.00010584415584415586,
      "loss": 1.733,
      "step": 535
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.34375,
      "learning_rate": 0.00010562770562770564,
      "loss": 1.7435,
      "step": 536
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.3125,
      "learning_rate": 0.00010541125541125543,
      "loss": 1.7421,
      "step": 537
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.376953125,
      "learning_rate": 0.0001051948051948052,
      "loss": 1.8177,
      "step": 538
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.390625,
      "learning_rate": 0.00010497835497835498,
      "loss": 1.853,
      "step": 539
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.384765625,
      "learning_rate": 0.00010476190476190477,
      "loss": 1.7235,
      "step": 540
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.35546875,
      "learning_rate": 0.00010454545454545455,
      "loss": 1.6917,
      "step": 541
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.314453125,
      "learning_rate": 0.00010432900432900434,
      "loss": 1.7533,
      "step": 542
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.390625,
      "learning_rate": 0.00010411255411255412,
      "loss": 1.699,
      "step": 543
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.32421875,
      "learning_rate": 0.00010389610389610389,
      "loss": 1.7324,
      "step": 544
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.3515625,
      "learning_rate": 0.00010367965367965368,
      "loss": 1.7698,
      "step": 545
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.294921875,
      "learning_rate": 0.00010346320346320346,
      "loss": 1.7495,
      "step": 546
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.318359375,
      "learning_rate": 0.00010324675324675325,
      "loss": 1.7885,
      "step": 547
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.310546875,
      "learning_rate": 0.00010303030303030303,
      "loss": 1.7624,
      "step": 548
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.306640625,
      "learning_rate": 0.00010281385281385283,
      "loss": 1.7373,
      "step": 549
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.353515625,
      "learning_rate": 0.00010259740259740261,
      "loss": 1.7663,
      "step": 550
    },
    {
      "epoch": 0.7,
      "eval_loss": 1.7420332431793213,
      "eval_runtime": 125.4197,
      "eval_samples_per_second": 39.866,
      "eval_steps_per_second": 1.252,
      "step": 550
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.326171875,
      "learning_rate": 0.00010238095238095237,
      "loss": 1.8104,
      "step": 551
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.34375,
      "learning_rate": 0.00010216450216450218,
      "loss": 1.7708,
      "step": 552
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.3984375,
      "learning_rate": 0.00010194805194805195,
      "loss": 1.7057,
      "step": 553
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.328125,
      "learning_rate": 0.00010173160173160174,
      "loss": 1.718,
      "step": 554
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.3671875,
      "learning_rate": 0.00010151515151515152,
      "loss": 1.6913,
      "step": 555
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.5078125,
      "learning_rate": 0.0001012987012987013,
      "loss": 1.7444,
      "step": 556
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.357421875,
      "learning_rate": 0.00010108225108225109,
      "loss": 1.7699,
      "step": 557
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.423828125,
      "learning_rate": 0.00010086580086580086,
      "loss": 1.8066,
      "step": 558
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.333984375,
      "learning_rate": 0.00010064935064935067,
      "loss": 1.7054,
      "step": 559
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.41796875,
      "learning_rate": 0.00010043290043290043,
      "loss": 1.7562,
      "step": 560
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.41015625,
      "learning_rate": 0.00010021645021645023,
      "loss": 1.6979,
      "step": 561
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.34765625,
      "learning_rate": 0.0001,
      "loss": 1.8067,
      "step": 562
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.31640625,
      "learning_rate": 9.978354978354978e-05,
      "loss": 1.7718,
      "step": 563
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.36328125,
      "learning_rate": 9.956709956709958e-05,
      "loss": 1.8256,
      "step": 564
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.33984375,
      "learning_rate": 9.935064935064936e-05,
      "loss": 1.7328,
      "step": 565
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.361328125,
      "learning_rate": 9.913419913419914e-05,
      "loss": 1.6943,
      "step": 566
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.380859375,
      "learning_rate": 9.891774891774892e-05,
      "loss": 1.7398,
      "step": 567
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.39453125,
      "learning_rate": 9.870129870129871e-05,
      "loss": 1.6486,
      "step": 568
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.388671875,
      "learning_rate": 9.848484848484849e-05,
      "loss": 1.7058,
      "step": 569
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.30859375,
      "learning_rate": 9.826839826839827e-05,
      "loss": 1.7299,
      "step": 570
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.30078125,
      "learning_rate": 9.805194805194806e-05,
      "loss": 1.7634,
      "step": 571
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.34375,
      "learning_rate": 9.783549783549783e-05,
      "loss": 1.7341,
      "step": 572
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.337890625,
      "learning_rate": 9.761904761904762e-05,
      "loss": 1.7888,
      "step": 573
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.34375,
      "learning_rate": 9.74025974025974e-05,
      "loss": 1.7765,
      "step": 574
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.55078125,
      "learning_rate": 9.71861471861472e-05,
      "loss": 1.7764,
      "step": 575
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.314453125,
      "learning_rate": 9.696969696969698e-05,
      "loss": 1.745,
      "step": 576
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.357421875,
      "learning_rate": 9.675324675324677e-05,
      "loss": 1.7382,
      "step": 577
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.384765625,
      "learning_rate": 9.653679653679654e-05,
      "loss": 1.7118,
      "step": 578
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.380859375,
      "learning_rate": 9.632034632034633e-05,
      "loss": 1.6819,
      "step": 579
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.326171875,
      "learning_rate": 9.610389610389611e-05,
      "loss": 1.8091,
      "step": 580
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.380859375,
      "learning_rate": 9.588744588744589e-05,
      "loss": 1.796,
      "step": 581
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.380859375,
      "learning_rate": 9.567099567099568e-05,
      "loss": 1.7027,
      "step": 582
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.43359375,
      "learning_rate": 9.545454545454546e-05,
      "loss": 1.736,
      "step": 583
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.337890625,
      "learning_rate": 9.523809523809524e-05,
      "loss": 1.738,
      "step": 584
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.369140625,
      "learning_rate": 9.502164502164502e-05,
      "loss": 1.7425,
      "step": 585
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.333984375,
      "learning_rate": 9.480519480519481e-05,
      "loss": 1.6909,
      "step": 586
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.50390625,
      "learning_rate": 9.45887445887446e-05,
      "loss": 1.7845,
      "step": 587
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.44140625,
      "learning_rate": 9.437229437229437e-05,
      "loss": 1.7345,
      "step": 588
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.283203125,
      "learning_rate": 9.415584415584417e-05,
      "loss": 1.7717,
      "step": 589
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.44921875,
      "learning_rate": 9.393939393939395e-05,
      "loss": 1.7243,
      "step": 590
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.431640625,
      "learning_rate": 9.372294372294373e-05,
      "loss": 1.7662,
      "step": 591
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.40625,
      "learning_rate": 9.35064935064935e-05,
      "loss": 1.7209,
      "step": 592
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.3828125,
      "learning_rate": 9.32900432900433e-05,
      "loss": 1.6923,
      "step": 593
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.34375,
      "learning_rate": 9.307359307359308e-05,
      "loss": 1.8138,
      "step": 594
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.3515625,
      "learning_rate": 9.285714285714286e-05,
      "loss": 1.7474,
      "step": 595
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.421875,
      "learning_rate": 9.264069264069265e-05,
      "loss": 1.776,
      "step": 596
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.361328125,
      "learning_rate": 9.242424242424242e-05,
      "loss": 1.7624,
      "step": 597
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.341796875,
      "learning_rate": 9.220779220779221e-05,
      "loss": 1.733,
      "step": 598
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.326171875,
      "learning_rate": 9.199134199134199e-05,
      "loss": 1.7617,
      "step": 599
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.326171875,
      "learning_rate": 9.177489177489178e-05,
      "loss": 1.6983,
      "step": 600
    },
    {
      "epoch": 0.77,
      "eval_loss": 1.7379374504089355,
      "eval_runtime": 124.9706,
      "eval_samples_per_second": 40.009,
      "eval_steps_per_second": 1.256,
      "step": 600
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.34375,
      "learning_rate": 9.155844155844156e-05,
      "loss": 1.7099,
      "step": 601
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.32421875,
      "learning_rate": 9.134199134199136e-05,
      "loss": 1.671,
      "step": 602
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.302734375,
      "learning_rate": 9.112554112554112e-05,
      "loss": 1.7249,
      "step": 603
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.318359375,
      "learning_rate": 9.090909090909092e-05,
      "loss": 1.7958,
      "step": 604
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.37109375,
      "learning_rate": 9.06926406926407e-05,
      "loss": 1.7202,
      "step": 605
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.33203125,
      "learning_rate": 9.047619047619048e-05,
      "loss": 1.7261,
      "step": 606
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.32421875,
      "learning_rate": 9.025974025974027e-05,
      "loss": 1.8016,
      "step": 607
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.310546875,
      "learning_rate": 9.004329004329005e-05,
      "loss": 1.7957,
      "step": 608
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.3125,
      "learning_rate": 8.982683982683983e-05,
      "loss": 1.7856,
      "step": 609
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.341796875,
      "learning_rate": 8.961038961038961e-05,
      "loss": 1.7807,
      "step": 610
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.341796875,
      "learning_rate": 8.93939393939394e-05,
      "loss": 1.7536,
      "step": 611
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.3125,
      "learning_rate": 8.917748917748918e-05,
      "loss": 1.7499,
      "step": 612
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.365234375,
      "learning_rate": 8.896103896103896e-05,
      "loss": 1.7726,
      "step": 613
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.37109375,
      "learning_rate": 8.874458874458876e-05,
      "loss": 1.7558,
      "step": 614
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.298828125,
      "learning_rate": 8.852813852813854e-05,
      "loss": 1.726,
      "step": 615
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.330078125,
      "learning_rate": 8.831168831168831e-05,
      "loss": 1.7632,
      "step": 616
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.361328125,
      "learning_rate": 8.80952380952381e-05,
      "loss": 1.7293,
      "step": 617
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.337890625,
      "learning_rate": 8.787878787878789e-05,
      "loss": 1.7263,
      "step": 618
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.35546875,
      "learning_rate": 8.766233766233767e-05,
      "loss": 1.8403,
      "step": 619
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.33984375,
      "learning_rate": 8.744588744588745e-05,
      "loss": 1.6896,
      "step": 620
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.328125,
      "learning_rate": 8.722943722943724e-05,
      "loss": 1.797,
      "step": 621
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.439453125,
      "learning_rate": 8.701298701298701e-05,
      "loss": 1.7971,
      "step": 622
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.39453125,
      "learning_rate": 8.67965367965368e-05,
      "loss": 1.7697,
      "step": 623
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.326171875,
      "learning_rate": 8.658008658008658e-05,
      "loss": 1.7567,
      "step": 624
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.3203125,
      "learning_rate": 8.636363636363637e-05,
      "loss": 1.659,
      "step": 625
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.3671875,
      "learning_rate": 8.614718614718615e-05,
      "loss": 1.7546,
      "step": 626
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.32421875,
      "learning_rate": 8.593073593073593e-05,
      "loss": 1.6824,
      "step": 627
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.42578125,
      "learning_rate": 8.571428571428571e-05,
      "loss": 1.7049,
      "step": 628
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.3125,
      "learning_rate": 8.549783549783549e-05,
      "loss": 1.6639,
      "step": 629
    },
    {
      "epoch": 0.81,
      "grad_norm": 0.337890625,
      "learning_rate": 8.528138528138529e-05,
      "loss": 1.7667,
      "step": 630
    },
    {
      "epoch": 0.81,
      "grad_norm": 0.326171875,
      "learning_rate": 8.506493506493507e-05,
      "loss": 1.7703,
      "step": 631
    },
    {
      "epoch": 0.81,
      "grad_norm": 0.388671875,
      "learning_rate": 8.484848484848486e-05,
      "loss": 1.7526,
      "step": 632
    },
    {
      "epoch": 0.81,
      "grad_norm": 0.376953125,
      "learning_rate": 8.463203463203464e-05,
      "loss": 1.7658,
      "step": 633
    },
    {
      "epoch": 0.81,
      "grad_norm": 0.318359375,
      "learning_rate": 8.441558441558442e-05,
      "loss": 1.7888,
      "step": 634
    },
    {
      "epoch": 0.81,
      "grad_norm": 0.3125,
      "learning_rate": 8.41991341991342e-05,
      "loss": 1.7197,
      "step": 635
    },
    {
      "epoch": 0.81,
      "grad_norm": 0.328125,
      "learning_rate": 8.398268398268399e-05,
      "loss": 1.6745,
      "step": 636
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.369140625,
      "learning_rate": 8.376623376623377e-05,
      "loss": 1.7168,
      "step": 637
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.33984375,
      "learning_rate": 8.354978354978355e-05,
      "loss": 1.6904,
      "step": 638
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.396484375,
      "learning_rate": 8.333333333333334e-05,
      "loss": 1.6792,
      "step": 639
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.34375,
      "learning_rate": 8.311688311688312e-05,
      "loss": 1.7299,
      "step": 640
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.365234375,
      "learning_rate": 8.29004329004329e-05,
      "loss": 1.7334,
      "step": 641
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.32421875,
      "learning_rate": 8.268398268398268e-05,
      "loss": 1.7236,
      "step": 642
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.408203125,
      "learning_rate": 8.246753246753248e-05,
      "loss": 1.7398,
      "step": 643
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.337890625,
      "learning_rate": 8.225108225108226e-05,
      "loss": 1.7407,
      "step": 644
    },
    {
      "epoch": 0.83,
      "grad_norm": 0.34375,
      "learning_rate": 8.203463203463204e-05,
      "loss": 1.8405,
      "step": 645
    },
    {
      "epoch": 0.83,
      "grad_norm": 0.306640625,
      "learning_rate": 8.181818181818183e-05,
      "loss": 1.7225,
      "step": 646
    },
    {
      "epoch": 0.83,
      "grad_norm": 0.318359375,
      "learning_rate": 8.16017316017316e-05,
      "loss": 1.8285,
      "step": 647
    },
    {
      "epoch": 0.83,
      "grad_norm": 0.4453125,
      "learning_rate": 8.138528138528139e-05,
      "loss": 1.7646,
      "step": 648
    },
    {
      "epoch": 0.83,
      "grad_norm": 0.3203125,
      "learning_rate": 8.116883116883117e-05,
      "loss": 1.7172,
      "step": 649
    },
    {
      "epoch": 0.83,
      "grad_norm": 0.39453125,
      "learning_rate": 8.095238095238096e-05,
      "loss": 1.8033,
      "step": 650
    },
    {
      "epoch": 0.83,
      "eval_loss": 1.7356816530227661,
      "eval_runtime": 125.4294,
      "eval_samples_per_second": 39.863,
      "eval_steps_per_second": 1.252,
      "step": 650
    },
    {
      "epoch": 0.83,
      "grad_norm": 0.30859375,
      "learning_rate": 8.073593073593074e-05,
      "loss": 1.7494,
      "step": 651
    },
    {
      "epoch": 0.83,
      "grad_norm": 0.40234375,
      "learning_rate": 8.051948051948052e-05,
      "loss": 1.7573,
      "step": 652
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.46484375,
      "learning_rate": 8.03030303030303e-05,
      "loss": 1.7712,
      "step": 653
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.40234375,
      "learning_rate": 8.008658008658008e-05,
      "loss": 1.6894,
      "step": 654
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.361328125,
      "learning_rate": 7.987012987012987e-05,
      "loss": 1.6714,
      "step": 655
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.33984375,
      "learning_rate": 7.965367965367965e-05,
      "loss": 1.801,
      "step": 656
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.41015625,
      "learning_rate": 7.943722943722945e-05,
      "loss": 1.75,
      "step": 657
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.421875,
      "learning_rate": 7.922077922077923e-05,
      "loss": 1.6875,
      "step": 658
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.421875,
      "learning_rate": 7.900432900432901e-05,
      "loss": 1.6635,
      "step": 659
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.302734375,
      "learning_rate": 7.878787878787879e-05,
      "loss": 1.782,
      "step": 660
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.3671875,
      "learning_rate": 7.857142857142858e-05,
      "loss": 1.6838,
      "step": 661
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.341796875,
      "learning_rate": 7.835497835497836e-05,
      "loss": 1.6957,
      "step": 662
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.3359375,
      "learning_rate": 7.813852813852814e-05,
      "loss": 1.7366,
      "step": 663
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.404296875,
      "learning_rate": 7.792207792207793e-05,
      "loss": 1.6709,
      "step": 664
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.373046875,
      "learning_rate": 7.770562770562771e-05,
      "loss": 1.7653,
      "step": 665
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.375,
      "learning_rate": 7.748917748917749e-05,
      "loss": 1.7467,
      "step": 666
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.3046875,
      "learning_rate": 7.727272727272727e-05,
      "loss": 1.7499,
      "step": 667
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.337890625,
      "learning_rate": 7.705627705627707e-05,
      "loss": 1.7462,
      "step": 668
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.3828125,
      "learning_rate": 7.683982683982685e-05,
      "loss": 1.6883,
      "step": 669
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.337890625,
      "learning_rate": 7.662337662337662e-05,
      "loss": 1.7674,
      "step": 670
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.37890625,
      "learning_rate": 7.640692640692642e-05,
      "loss": 1.7388,
      "step": 671
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.33203125,
      "learning_rate": 7.619047619047618e-05,
      "loss": 1.7206,
      "step": 672
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.341796875,
      "learning_rate": 7.597402597402598e-05,
      "loss": 1.749,
      "step": 673
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.361328125,
      "learning_rate": 7.575757575757576e-05,
      "loss": 1.7318,
      "step": 674
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.333984375,
      "learning_rate": 7.554112554112555e-05,
      "loss": 1.7234,
      "step": 675
    },
    {
      "epoch": 0.87,
      "grad_norm": 0.3359375,
      "learning_rate": 7.532467532467533e-05,
      "loss": 1.6407,
      "step": 676
    },
    {
      "epoch": 0.87,
      "grad_norm": 0.431640625,
      "learning_rate": 7.510822510822511e-05,
      "loss": 1.7951,
      "step": 677
    },
    {
      "epoch": 0.87,
      "grad_norm": 0.49609375,
      "learning_rate": 7.489177489177489e-05,
      "loss": 1.7958,
      "step": 678
    },
    {
      "epoch": 0.87,
      "grad_norm": 0.345703125,
      "learning_rate": 7.467532467532467e-05,
      "loss": 1.745,
      "step": 679
    },
    {
      "epoch": 0.87,
      "grad_norm": 0.396484375,
      "learning_rate": 7.445887445887446e-05,
      "loss": 1.6285,
      "step": 680
    },
    {
      "epoch": 0.87,
      "grad_norm": 0.390625,
      "learning_rate": 7.424242424242424e-05,
      "loss": 1.792,
      "step": 681
    },
    {
      "epoch": 0.87,
      "grad_norm": 0.466796875,
      "learning_rate": 7.402597402597404e-05,
      "loss": 1.8207,
      "step": 682
    },
    {
      "epoch": 0.87,
      "grad_norm": 0.4609375,
      "learning_rate": 7.380952380952382e-05,
      "loss": 1.7013,
      "step": 683
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.349609375,
      "learning_rate": 7.35930735930736e-05,
      "loss": 1.759,
      "step": 684
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.3671875,
      "learning_rate": 7.337662337662338e-05,
      "loss": 1.7398,
      "step": 685
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.318359375,
      "learning_rate": 7.316017316017317e-05,
      "loss": 1.6995,
      "step": 686
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.4296875,
      "learning_rate": 7.294372294372295e-05,
      "loss": 1.7617,
      "step": 687
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.431640625,
      "learning_rate": 7.272727272727273e-05,
      "loss": 1.6959,
      "step": 688
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.322265625,
      "learning_rate": 7.251082251082252e-05,
      "loss": 1.7879,
      "step": 689
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.35546875,
      "learning_rate": 7.229437229437229e-05,
      "loss": 1.6613,
      "step": 690
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.3515625,
      "learning_rate": 7.207792207792208e-05,
      "loss": 1.6485,
      "step": 691
    },
    {
      "epoch": 0.89,
      "grad_norm": 0.400390625,
      "learning_rate": 7.186147186147186e-05,
      "loss": 1.719,
      "step": 692
    },
    {
      "epoch": 0.89,
      "grad_norm": 0.32421875,
      "learning_rate": 7.164502164502165e-05,
      "loss": 1.7316,
      "step": 693
    },
    {
      "epoch": 0.89,
      "grad_norm": 0.33984375,
      "learning_rate": 7.142857142857143e-05,
      "loss": 1.7509,
      "step": 694
    },
    {
      "epoch": 0.89,
      "grad_norm": 0.375,
      "learning_rate": 7.121212121212121e-05,
      "loss": 1.785,
      "step": 695
    },
    {
      "epoch": 0.89,
      "grad_norm": 0.328125,
      "learning_rate": 7.099567099567101e-05,
      "loss": 1.7643,
      "step": 696
    },
    {
      "epoch": 0.89,
      "grad_norm": 0.341796875,
      "learning_rate": 7.077922077922077e-05,
      "loss": 1.7058,
      "step": 697
    },
    {
      "epoch": 0.89,
      "grad_norm": 0.34375,
      "learning_rate": 7.056277056277057e-05,
      "loss": 1.807,
      "step": 698
    },
    {
      "epoch": 0.89,
      "grad_norm": 0.349609375,
      "learning_rate": 7.034632034632035e-05,
      "loss": 1.6489,
      "step": 699
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.359375,
      "learning_rate": 7.012987012987014e-05,
      "loss": 1.8146,
      "step": 700
    },
    {
      "epoch": 0.9,
      "eval_loss": 1.7302906513214111,
      "eval_runtime": 125.8875,
      "eval_samples_per_second": 39.718,
      "eval_steps_per_second": 1.247,
      "step": 700
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.298828125,
      "learning_rate": 6.991341991341992e-05,
      "loss": 1.6505,
      "step": 701
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.39453125,
      "learning_rate": 6.96969696969697e-05,
      "loss": 1.7259,
      "step": 702
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.314453125,
      "learning_rate": 6.948051948051948e-05,
      "loss": 1.81,
      "step": 703
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.3515625,
      "learning_rate": 6.926406926406926e-05,
      "loss": 1.7801,
      "step": 704
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.3359375,
      "learning_rate": 6.904761904761905e-05,
      "loss": 1.7213,
      "step": 705
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.3125,
      "learning_rate": 6.883116883116883e-05,
      "loss": 1.8054,
      "step": 706
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.31640625,
      "learning_rate": 6.861471861471862e-05,
      "loss": 1.7339,
      "step": 707
    },
    {
      "epoch": 0.91,
      "grad_norm": 0.328125,
      "learning_rate": 6.83982683982684e-05,
      "loss": 1.79,
      "step": 708
    },
    {
      "epoch": 0.91,
      "grad_norm": 0.36328125,
      "learning_rate": 6.818181818181818e-05,
      "loss": 1.6615,
      "step": 709
    },
    {
      "epoch": 0.91,
      "grad_norm": 0.306640625,
      "learning_rate": 6.796536796536796e-05,
      "loss": 1.8298,
      "step": 710
    },
    {
      "epoch": 0.91,
      "grad_norm": 0.298828125,
      "learning_rate": 6.774891774891774e-05,
      "loss": 1.6835,
      "step": 711
    },
    {
      "epoch": 0.91,
      "grad_norm": 0.32421875,
      "learning_rate": 6.753246753246754e-05,
      "loss": 1.7739,
      "step": 712
    },
    {
      "epoch": 0.91,
      "grad_norm": 0.337890625,
      "learning_rate": 6.731601731601732e-05,
      "loss": 1.7431,
      "step": 713
    },
    {
      "epoch": 0.91,
      "grad_norm": 0.337890625,
      "learning_rate": 6.709956709956711e-05,
      "loss": 1.7816,
      "step": 714
    },
    {
      "epoch": 0.91,
      "grad_norm": 0.32421875,
      "learning_rate": 6.688311688311688e-05,
      "loss": 1.7516,
      "step": 715
    },
    {
      "epoch": 0.92,
      "grad_norm": 0.341796875,
      "learning_rate": 6.666666666666667e-05,
      "loss": 1.7276,
      "step": 716
    },
    {
      "epoch": 0.92,
      "grad_norm": 0.330078125,
      "learning_rate": 6.645021645021645e-05,
      "loss": 1.7521,
      "step": 717
    },
    {
      "epoch": 0.92,
      "grad_norm": 0.36328125,
      "learning_rate": 6.623376623376624e-05,
      "loss": 1.7368,
      "step": 718
    },
    {
      "epoch": 0.92,
      "grad_norm": 0.3828125,
      "learning_rate": 6.601731601731602e-05,
      "loss": 1.7896,
      "step": 719
    },
    {
      "epoch": 0.92,
      "grad_norm": 0.365234375,
      "learning_rate": 6.58008658008658e-05,
      "loss": 1.8057,
      "step": 720
    },
    {
      "epoch": 0.92,
      "grad_norm": 0.3828125,
      "learning_rate": 6.55844155844156e-05,
      "loss": 1.7596,
      "step": 721
    },
    {
      "epoch": 0.92,
      "grad_norm": 0.40625,
      "learning_rate": 6.536796536796536e-05,
      "loss": 1.7793,
      "step": 722
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.365234375,
      "learning_rate": 6.515151515151516e-05,
      "loss": 1.6966,
      "step": 723
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.32421875,
      "learning_rate": 6.493506493506494e-05,
      "loss": 1.6619,
      "step": 724
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.3359375,
      "learning_rate": 6.471861471861473e-05,
      "loss": 1.7305,
      "step": 725
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.365234375,
      "learning_rate": 6.450216450216451e-05,
      "loss": 1.7337,
      "step": 726
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.37890625,
      "learning_rate": 6.428571428571429e-05,
      "loss": 1.7255,
      "step": 727
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.345703125,
      "learning_rate": 6.406926406926407e-05,
      "loss": 1.75,
      "step": 728
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.337890625,
      "learning_rate": 6.385281385281385e-05,
      "loss": 1.7166,
      "step": 729
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.3203125,
      "learning_rate": 6.363636363636364e-05,
      "loss": 1.7407,
      "step": 730
    },
    {
      "epoch": 0.94,
      "grad_norm": 0.349609375,
      "learning_rate": 6.341991341991342e-05,
      "loss": 1.6759,
      "step": 731
    },
    {
      "epoch": 0.94,
      "grad_norm": 0.33203125,
      "learning_rate": 6.320346320346321e-05,
      "loss": 1.7485,
      "step": 732
    },
    {
      "epoch": 0.94,
      "grad_norm": 0.37890625,
      "learning_rate": 6.2987012987013e-05,
      "loss": 1.8,
      "step": 733
    },
    {
      "epoch": 0.94,
      "grad_norm": 0.404296875,
      "learning_rate": 6.277056277056277e-05,
      "loss": 1.6966,
      "step": 734
    },
    {
      "epoch": 0.94,
      "grad_norm": 0.37890625,
      "learning_rate": 6.255411255411255e-05,
      "loss": 1.7524,
      "step": 735
    },
    {
      "epoch": 0.94,
      "grad_norm": 0.3515625,
      "learning_rate": 6.233766233766233e-05,
      "loss": 1.7087,
      "step": 736
    },
    {
      "epoch": 0.94,
      "grad_norm": 0.37109375,
      "learning_rate": 6.212121212121213e-05,
      "loss": 1.7963,
      "step": 737
    },
    {
      "epoch": 0.94,
      "grad_norm": 0.361328125,
      "learning_rate": 6.19047619047619e-05,
      "loss": 1.7043,
      "step": 738
    },
    {
      "epoch": 0.95,
      "grad_norm": 0.3046875,
      "learning_rate": 6.16883116883117e-05,
      "loss": 1.6797,
      "step": 739
    },
    {
      "epoch": 0.95,
      "grad_norm": 0.396484375,
      "learning_rate": 6.147186147186147e-05,
      "loss": 1.6791,
      "step": 740
    },
    {
      "epoch": 0.95,
      "grad_norm": 0.330078125,
      "learning_rate": 6.125541125541126e-05,
      "loss": 1.7069,
      "step": 741
    },
    {
      "epoch": 0.95,
      "grad_norm": 0.33984375,
      "learning_rate": 6.103896103896104e-05,
      "loss": 1.6997,
      "step": 742
    },
    {
      "epoch": 0.95,
      "grad_norm": 0.376953125,
      "learning_rate": 6.0822510822510825e-05,
      "loss": 1.7395,
      "step": 743
    },
    {
      "epoch": 0.95,
      "grad_norm": 0.388671875,
      "learning_rate": 6.060606060606061e-05,
      "loss": 1.7289,
      "step": 744
    },
    {
      "epoch": 0.95,
      "grad_norm": 0.375,
      "learning_rate": 6.03896103896104e-05,
      "loss": 1.7963,
      "step": 745
    },
    {
      "epoch": 0.95,
      "grad_norm": 0.318359375,
      "learning_rate": 6.0173160173160184e-05,
      "loss": 1.8322,
      "step": 746
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.3984375,
      "learning_rate": 5.995670995670996e-05,
      "loss": 1.6923,
      "step": 747
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.306640625,
      "learning_rate": 5.9740259740259744e-05,
      "loss": 1.7534,
      "step": 748
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.326171875,
      "learning_rate": 5.9523809523809524e-05,
      "loss": 1.7843,
      "step": 749
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.330078125,
      "learning_rate": 5.930735930735931e-05,
      "loss": 1.7837,
      "step": 750
    },
    {
      "epoch": 0.96,
      "eval_loss": 1.728518009185791,
      "eval_runtime": 125.8224,
      "eval_samples_per_second": 39.739,
      "eval_steps_per_second": 1.248,
      "step": 750
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.34375,
      "learning_rate": 5.90909090909091e-05,
      "loss": 1.8208,
      "step": 751
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.39453125,
      "learning_rate": 5.887445887445888e-05,
      "loss": 1.7173,
      "step": 752
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.37109375,
      "learning_rate": 5.8658008658008656e-05,
      "loss": 1.6923,
      "step": 753
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.41015625,
      "learning_rate": 5.844155844155844e-05,
      "loss": 1.7685,
      "step": 754
    },
    {
      "epoch": 0.97,
      "grad_norm": 0.3125,
      "learning_rate": 5.822510822510823e-05,
      "loss": 1.677,
      "step": 755
    },
    {
      "epoch": 0.97,
      "grad_norm": 0.33203125,
      "learning_rate": 5.800865800865801e-05,
      "loss": 1.6898,
      "step": 756
    },
    {
      "epoch": 0.97,
      "grad_norm": 0.3671875,
      "learning_rate": 5.7792207792207796e-05,
      "loss": 1.7492,
      "step": 757
    },
    {
      "epoch": 0.97,
      "grad_norm": 0.478515625,
      "learning_rate": 5.757575757575758e-05,
      "loss": 1.7804,
      "step": 758
    },
    {
      "epoch": 0.97,
      "grad_norm": 0.388671875,
      "learning_rate": 5.7359307359307355e-05,
      "loss": 1.7285,
      "step": 759
    },
    {
      "epoch": 0.97,
      "grad_norm": 0.34765625,
      "learning_rate": 5.714285714285714e-05,
      "loss": 1.7054,
      "step": 760
    },
    {
      "epoch": 0.97,
      "grad_norm": 0.361328125,
      "learning_rate": 5.692640692640693e-05,
      "loss": 1.7949,
      "step": 761
    },
    {
      "epoch": 0.98,
      "grad_norm": 0.3828125,
      "learning_rate": 5.6709956709956715e-05,
      "loss": 1.6984,
      "step": 762
    },
    {
      "epoch": 0.98,
      "grad_norm": 0.3359375,
      "learning_rate": 5.64935064935065e-05,
      "loss": 1.5885,
      "step": 763
    },
    {
      "epoch": 0.98,
      "grad_norm": 0.373046875,
      "learning_rate": 5.627705627705628e-05,
      "loss": 1.8189,
      "step": 764
    },
    {
      "epoch": 0.98,
      "grad_norm": 0.3359375,
      "learning_rate": 5.606060606060606e-05,
      "loss": 1.7489,
      "step": 765
    },
    {
      "epoch": 0.98,
      "grad_norm": 0.318359375,
      "learning_rate": 5.584415584415584e-05,
      "loss": 1.692,
      "step": 766
    },
    {
      "epoch": 0.98,
      "grad_norm": 0.337890625,
      "learning_rate": 5.562770562770563e-05,
      "loss": 1.7128,
      "step": 767
    },
    {
      "epoch": 0.98,
      "grad_norm": 0.369140625,
      "learning_rate": 5.5411255411255414e-05,
      "loss": 1.7056,
      "step": 768
    },
    {
      "epoch": 0.98,
      "grad_norm": 0.345703125,
      "learning_rate": 5.51948051948052e-05,
      "loss": 1.8224,
      "step": 769
    },
    {
      "epoch": 0.99,
      "grad_norm": 0.310546875,
      "learning_rate": 5.497835497835499e-05,
      "loss": 1.7297,
      "step": 770
    },
    {
      "epoch": 0.99,
      "grad_norm": 0.34375,
      "learning_rate": 5.4761904761904766e-05,
      "loss": 1.7245,
      "step": 771
    },
    {
      "epoch": 0.99,
      "grad_norm": 0.330078125,
      "learning_rate": 5.4545454545454546e-05,
      "loss": 1.7286,
      "step": 772
    },
    {
      "epoch": 0.99,
      "grad_norm": 0.357421875,
      "learning_rate": 5.4329004329004326e-05,
      "loss": 1.616,
      "step": 773
    },
    {
      "epoch": 0.99,
      "grad_norm": 0.361328125,
      "learning_rate": 5.411255411255411e-05,
      "loss": 1.7709,
      "step": 774
    },
    {
      "epoch": 0.99,
      "grad_norm": 0.423828125,
      "learning_rate": 5.38961038961039e-05,
      "loss": 1.7746,
      "step": 775
    },
    {
      "epoch": 0.99,
      "grad_norm": 0.34765625,
      "learning_rate": 5.3679653679653686e-05,
      "loss": 1.7077,
      "step": 776
    },
    {
      "epoch": 0.99,
      "grad_norm": 0.3515625,
      "learning_rate": 5.346320346320347e-05,
      "loss": 1.6281,
      "step": 777
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.361328125,
      "learning_rate": 5.3246753246753245e-05,
      "loss": 1.7438,
      "step": 778
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.353515625,
      "learning_rate": 5.303030303030303e-05,
      "loss": 1.6973,
      "step": 779
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.380859375,
      "learning_rate": 5.281385281385282e-05,
      "loss": 1.7698,
      "step": 780
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.40234375,
      "learning_rate": 5.25974025974026e-05,
      "loss": 1.7707,
      "step": 781
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.375,
      "learning_rate": 5.2380952380952384e-05,
      "loss": 1.7133,
      "step": 782
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.34765625,
      "learning_rate": 5.216450216450217e-05,
      "loss": 1.7593,
      "step": 783
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.349609375,
      "learning_rate": 5.1948051948051944e-05,
      "loss": 1.7286,
      "step": 784
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.33984375,
      "learning_rate": 5.173160173160173e-05,
      "loss": 1.724,
      "step": 785
    },
    {
      "epoch": 1.01,
      "grad_norm": 0.3359375,
      "learning_rate": 5.151515151515152e-05,
      "loss": 1.6978,
      "step": 786
    },
    {
      "epoch": 1.01,
      "grad_norm": 0.32421875,
      "learning_rate": 5.1298701298701304e-05,
      "loss": 1.8041,
      "step": 787
    },
    {
      "epoch": 1.01,
      "grad_norm": 0.287109375,
      "learning_rate": 5.108225108225109e-05,
      "loss": 1.7141,
      "step": 788
    },
    {
      "epoch": 1.01,
      "grad_norm": 0.326171875,
      "learning_rate": 5.086580086580087e-05,
      "loss": 1.6585,
      "step": 789
    },
    {
      "epoch": 1.01,
      "grad_norm": 0.33203125,
      "learning_rate": 5.064935064935065e-05,
      "loss": 1.733,
      "step": 790
    },
    {
      "epoch": 1.01,
      "grad_norm": 0.388671875,
      "learning_rate": 5.043290043290043e-05,
      "loss": 1.7602,
      "step": 791
    },
    {
      "epoch": 1.01,
      "grad_norm": 0.310546875,
      "learning_rate": 5.0216450216450216e-05,
      "loss": 1.804,
      "step": 792
    },
    {
      "epoch": 1.01,
      "grad_norm": 0.302734375,
      "learning_rate": 5e-05,
      "loss": 1.6816,
      "step": 793
    },
    {
      "epoch": 1.02,
      "grad_norm": 0.34375,
      "learning_rate": 4.978354978354979e-05,
      "loss": 1.7515,
      "step": 794
    },
    {
      "epoch": 1.02,
      "grad_norm": 0.33203125,
      "learning_rate": 4.956709956709957e-05,
      "loss": 1.6996,
      "step": 795
    },
    {
      "epoch": 1.02,
      "grad_norm": 0.333984375,
      "learning_rate": 4.9350649350649355e-05,
      "loss": 1.7045,
      "step": 796
    },
    {
      "epoch": 1.02,
      "grad_norm": 0.357421875,
      "learning_rate": 4.9134199134199135e-05,
      "loss": 1.7026,
      "step": 797
    },
    {
      "epoch": 1.02,
      "grad_norm": 0.30078125,
      "learning_rate": 4.8917748917748915e-05,
      "loss": 1.6444,
      "step": 798
    },
    {
      "epoch": 1.02,
      "grad_norm": 0.345703125,
      "learning_rate": 4.87012987012987e-05,
      "loss": 1.7705,
      "step": 799
    },
    {
      "epoch": 1.02,
      "grad_norm": 0.35546875,
      "learning_rate": 4.848484848484849e-05,
      "loss": 1.7178,
      "step": 800
    },
    {
      "epoch": 1.02,
      "eval_loss": 1.7301437854766846,
      "eval_runtime": 124.4902,
      "eval_samples_per_second": 40.164,
      "eval_steps_per_second": 1.261,
      "step": 800
    },
    {
      "epoch": 1.02,
      "grad_norm": 0.33203125,
      "learning_rate": 4.826839826839827e-05,
      "loss": 1.7434,
      "step": 801
    },
    {
      "epoch": 1.03,
      "grad_norm": 0.365234375,
      "learning_rate": 4.8051948051948054e-05,
      "loss": 1.7148,
      "step": 802
    },
    {
      "epoch": 1.03,
      "grad_norm": 0.330078125,
      "learning_rate": 4.783549783549784e-05,
      "loss": 1.7685,
      "step": 803
    },
    {
      "epoch": 1.03,
      "grad_norm": 0.3515625,
      "learning_rate": 4.761904761904762e-05,
      "loss": 1.8048,
      "step": 804
    },
    {
      "epoch": 1.03,
      "grad_norm": 0.35546875,
      "learning_rate": 4.740259740259741e-05,
      "loss": 1.7467,
      "step": 805
    },
    {
      "epoch": 1.03,
      "grad_norm": 0.33203125,
      "learning_rate": 4.718614718614719e-05,
      "loss": 1.6937,
      "step": 806
    },
    {
      "epoch": 1.03,
      "grad_norm": 0.306640625,
      "learning_rate": 4.696969696969697e-05,
      "loss": 1.6432,
      "step": 807
    },
    {
      "epoch": 1.03,
      "grad_norm": 0.3515625,
      "learning_rate": 4.675324675324675e-05,
      "loss": 1.6467,
      "step": 808
    },
    {
      "epoch": 1.04,
      "grad_norm": 0.373046875,
      "learning_rate": 4.653679653679654e-05,
      "loss": 1.7572,
      "step": 809
    },
    {
      "epoch": 1.04,
      "grad_norm": 0.365234375,
      "learning_rate": 4.6320346320346326e-05,
      "loss": 1.7623,
      "step": 810
    },
    {
      "epoch": 1.04,
      "grad_norm": 0.333984375,
      "learning_rate": 4.6103896103896106e-05,
      "loss": 1.6898,
      "step": 811
    },
    {
      "epoch": 1.04,
      "grad_norm": 0.412109375,
      "learning_rate": 4.588744588744589e-05,
      "loss": 1.6788,
      "step": 812
    },
    {
      "epoch": 1.04,
      "grad_norm": 0.373046875,
      "learning_rate": 4.567099567099568e-05,
      "loss": 1.6631,
      "step": 813
    },
    {
      "epoch": 1.04,
      "grad_norm": 0.33203125,
      "learning_rate": 4.545454545454546e-05,
      "loss": 1.6311,
      "step": 814
    },
    {
      "epoch": 1.04,
      "grad_norm": 0.408203125,
      "learning_rate": 4.523809523809524e-05,
      "loss": 1.6636,
      "step": 815
    },
    {
      "epoch": 1.04,
      "grad_norm": 0.357421875,
      "learning_rate": 4.5021645021645025e-05,
      "loss": 1.8099,
      "step": 816
    },
    {
      "epoch": 1.05,
      "grad_norm": 0.359375,
      "learning_rate": 4.4805194805194805e-05,
      "loss": 1.6849,
      "step": 817
    },
    {
      "epoch": 1.05,
      "grad_norm": 0.36328125,
      "learning_rate": 4.458874458874459e-05,
      "loss": 1.6601,
      "step": 818
    },
    {
      "epoch": 1.05,
      "grad_norm": 0.365234375,
      "learning_rate": 4.437229437229438e-05,
      "loss": 1.7999,
      "step": 819
    },
    {
      "epoch": 1.05,
      "grad_norm": 0.396484375,
      "learning_rate": 4.415584415584416e-05,
      "loss": 1.6323,
      "step": 820
    },
    {
      "epoch": 1.05,
      "grad_norm": 0.3203125,
      "learning_rate": 4.3939393939393944e-05,
      "loss": 1.7141,
      "step": 821
    },
    {
      "epoch": 1.05,
      "grad_norm": 0.3359375,
      "learning_rate": 4.3722943722943724e-05,
      "loss": 1.7166,
      "step": 822
    },
    {
      "epoch": 1.05,
      "grad_norm": 0.421875,
      "learning_rate": 4.3506493506493503e-05,
      "loss": 1.6969,
      "step": 823
    },
    {
      "epoch": 1.05,
      "grad_norm": 0.333984375,
      "learning_rate": 4.329004329004329e-05,
      "loss": 1.674,
      "step": 824
    },
    {
      "epoch": 1.06,
      "grad_norm": 0.32421875,
      "learning_rate": 4.3073593073593077e-05,
      "loss": 1.6893,
      "step": 825
    },
    {
      "epoch": 1.06,
      "grad_norm": 0.36328125,
      "learning_rate": 4.2857142857142856e-05,
      "loss": 1.6985,
      "step": 826
    },
    {
      "epoch": 1.06,
      "grad_norm": 0.328125,
      "learning_rate": 4.264069264069264e-05,
      "loss": 1.7245,
      "step": 827
    },
    {
      "epoch": 1.06,
      "grad_norm": 0.35546875,
      "learning_rate": 4.242424242424243e-05,
      "loss": 1.7277,
      "step": 828
    },
    {
      "epoch": 1.06,
      "grad_norm": 0.3671875,
      "learning_rate": 4.220779220779221e-05,
      "loss": 1.6765,
      "step": 829
    },
    {
      "epoch": 1.06,
      "grad_norm": 0.36328125,
      "learning_rate": 4.1991341991341996e-05,
      "loss": 1.7863,
      "step": 830
    },
    {
      "epoch": 1.06,
      "grad_norm": 0.33984375,
      "learning_rate": 4.1774891774891775e-05,
      "loss": 1.6689,
      "step": 831
    },
    {
      "epoch": 1.06,
      "grad_norm": 0.365234375,
      "learning_rate": 4.155844155844156e-05,
      "loss": 1.7957,
      "step": 832
    },
    {
      "epoch": 1.07,
      "grad_norm": 0.298828125,
      "learning_rate": 4.134199134199134e-05,
      "loss": 1.6454,
      "step": 833
    },
    {
      "epoch": 1.07,
      "grad_norm": 0.341796875,
      "learning_rate": 4.112554112554113e-05,
      "loss": 1.6748,
      "step": 834
    },
    {
      "epoch": 1.07,
      "grad_norm": 0.380859375,
      "learning_rate": 4.0909090909090915e-05,
      "loss": 1.8057,
      "step": 835
    },
    {
      "epoch": 1.07,
      "grad_norm": 0.38671875,
      "learning_rate": 4.0692640692640695e-05,
      "loss": 1.7343,
      "step": 836
    },
    {
      "epoch": 1.07,
      "grad_norm": 0.353515625,
      "learning_rate": 4.047619047619048e-05,
      "loss": 1.6868,
      "step": 837
    },
    {
      "epoch": 1.07,
      "grad_norm": 0.365234375,
      "learning_rate": 4.025974025974026e-05,
      "loss": 1.6663,
      "step": 838
    },
    {
      "epoch": 1.07,
      "grad_norm": 0.3359375,
      "learning_rate": 4.004329004329004e-05,
      "loss": 1.6779,
      "step": 839
    },
    {
      "epoch": 1.07,
      "grad_norm": 0.353515625,
      "learning_rate": 3.982683982683983e-05,
      "loss": 1.7324,
      "step": 840
    },
    {
      "epoch": 1.08,
      "grad_norm": 0.458984375,
      "learning_rate": 3.9610389610389614e-05,
      "loss": 1.7377,
      "step": 841
    },
    {
      "epoch": 1.08,
      "grad_norm": 0.3515625,
      "learning_rate": 3.939393939393939e-05,
      "loss": 1.7468,
      "step": 842
    },
    {
      "epoch": 1.08,
      "grad_norm": 0.3359375,
      "learning_rate": 3.917748917748918e-05,
      "loss": 1.703,
      "step": 843
    },
    {
      "epoch": 1.08,
      "grad_norm": 0.38671875,
      "learning_rate": 3.8961038961038966e-05,
      "loss": 1.7124,
      "step": 844
    },
    {
      "epoch": 1.08,
      "grad_norm": 0.341796875,
      "learning_rate": 3.8744588744588746e-05,
      "loss": 1.6444,
      "step": 845
    },
    {
      "epoch": 1.08,
      "grad_norm": 0.341796875,
      "learning_rate": 3.852813852813853e-05,
      "loss": 1.7465,
      "step": 846
    },
    {
      "epoch": 1.08,
      "grad_norm": 0.41015625,
      "learning_rate": 3.831168831168831e-05,
      "loss": 1.7153,
      "step": 847
    },
    {
      "epoch": 1.09,
      "grad_norm": 0.359375,
      "learning_rate": 3.809523809523809e-05,
      "loss": 1.756,
      "step": 848
    },
    {
      "epoch": 1.09,
      "grad_norm": 0.37890625,
      "learning_rate": 3.787878787878788e-05,
      "loss": 1.6992,
      "step": 849
    },
    {
      "epoch": 1.09,
      "grad_norm": 0.33203125,
      "learning_rate": 3.7662337662337665e-05,
      "loss": 1.7487,
      "step": 850
    },
    {
      "epoch": 1.09,
      "eval_loss": 1.725529670715332,
      "eval_runtime": 124.4606,
      "eval_samples_per_second": 40.173,
      "eval_steps_per_second": 1.261,
      "step": 850
    },
    {
      "epoch": 1.09,
      "grad_norm": 0.3828125,
      "learning_rate": 3.7445887445887445e-05,
      "loss": 1.7665,
      "step": 851
    },
    {
      "epoch": 1.09,
      "grad_norm": 0.349609375,
      "learning_rate": 3.722943722943723e-05,
      "loss": 1.7517,
      "step": 852
    },
    {
      "epoch": 1.09,
      "grad_norm": 0.353515625,
      "learning_rate": 3.701298701298702e-05,
      "loss": 1.7896,
      "step": 853
    },
    {
      "epoch": 1.09,
      "grad_norm": 0.37109375,
      "learning_rate": 3.67965367965368e-05,
      "loss": 1.7554,
      "step": 854
    },
    {
      "epoch": 1.09,
      "grad_norm": 0.3515625,
      "learning_rate": 3.6580086580086584e-05,
      "loss": 1.7757,
      "step": 855
    },
    {
      "epoch": 1.1,
      "grad_norm": 0.390625,
      "learning_rate": 3.6363636363636364e-05,
      "loss": 1.7977,
      "step": 856
    },
    {
      "epoch": 1.1,
      "grad_norm": 0.376953125,
      "learning_rate": 3.6147186147186144e-05,
      "loss": 1.7066,
      "step": 857
    },
    {
      "epoch": 1.1,
      "grad_norm": 0.3671875,
      "learning_rate": 3.593073593073593e-05,
      "loss": 1.7159,
      "step": 858
    },
    {
      "epoch": 1.1,
      "grad_norm": 0.3515625,
      "learning_rate": 3.571428571428572e-05,
      "loss": 1.5972,
      "step": 859
    },
    {
      "epoch": 1.1,
      "grad_norm": 0.365234375,
      "learning_rate": 3.5497835497835503e-05,
      "loss": 1.7102,
      "step": 860
    },
    {
      "epoch": 1.1,
      "grad_norm": 0.369140625,
      "learning_rate": 3.528138528138528e-05,
      "loss": 1.661,
      "step": 861
    },
    {
      "epoch": 1.1,
      "grad_norm": 0.47265625,
      "learning_rate": 3.506493506493507e-05,
      "loss": 1.691,
      "step": 862
    },
    {
      "epoch": 1.1,
      "grad_norm": 0.3125,
      "learning_rate": 3.484848484848485e-05,
      "loss": 1.7247,
      "step": 863
    },
    {
      "epoch": 1.11,
      "grad_norm": 0.390625,
      "learning_rate": 3.463203463203463e-05,
      "loss": 1.6295,
      "step": 864
    },
    {
      "epoch": 1.11,
      "grad_norm": 0.361328125,
      "learning_rate": 3.4415584415584416e-05,
      "loss": 1.7508,
      "step": 865
    },
    {
      "epoch": 1.11,
      "grad_norm": 0.330078125,
      "learning_rate": 3.41991341991342e-05,
      "loss": 1.6085,
      "step": 866
    },
    {
      "epoch": 1.11,
      "grad_norm": 0.35546875,
      "learning_rate": 3.398268398268398e-05,
      "loss": 1.7507,
      "step": 867
    },
    {
      "epoch": 1.11,
      "grad_norm": 0.296875,
      "learning_rate": 3.376623376623377e-05,
      "loss": 1.7143,
      "step": 868
    },
    {
      "epoch": 1.11,
      "grad_norm": 0.34765625,
      "learning_rate": 3.3549783549783555e-05,
      "loss": 1.7195,
      "step": 869
    },
    {
      "epoch": 1.11,
      "grad_norm": 0.3984375,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 1.7606,
      "step": 870
    },
    {
      "epoch": 1.11,
      "grad_norm": 0.3515625,
      "learning_rate": 3.311688311688312e-05,
      "loss": 1.7038,
      "step": 871
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.33984375,
      "learning_rate": 3.29004329004329e-05,
      "loss": 1.7545,
      "step": 872
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.306640625,
      "learning_rate": 3.268398268398268e-05,
      "loss": 1.6866,
      "step": 873
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.404296875,
      "learning_rate": 3.246753246753247e-05,
      "loss": 1.683,
      "step": 874
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.345703125,
      "learning_rate": 3.2251082251082254e-05,
      "loss": 1.7329,
      "step": 875
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.341796875,
      "learning_rate": 3.2034632034632034e-05,
      "loss": 1.7675,
      "step": 876
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.333984375,
      "learning_rate": 3.181818181818182e-05,
      "loss": 1.6585,
      "step": 877
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.34375,
      "learning_rate": 3.160173160173161e-05,
      "loss": 1.7628,
      "step": 878
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.384765625,
      "learning_rate": 3.1385281385281387e-05,
      "loss": 1.6784,
      "step": 879
    },
    {
      "epoch": 1.13,
      "grad_norm": 0.345703125,
      "learning_rate": 3.1168831168831166e-05,
      "loss": 1.7177,
      "step": 880
    },
    {
      "epoch": 1.13,
      "grad_norm": 0.34375,
      "learning_rate": 3.095238095238095e-05,
      "loss": 1.6945,
      "step": 881
    },
    {
      "epoch": 1.13,
      "grad_norm": 0.380859375,
      "learning_rate": 3.073593073593073e-05,
      "loss": 1.7096,
      "step": 882
    },
    {
      "epoch": 1.13,
      "grad_norm": 0.361328125,
      "learning_rate": 3.051948051948052e-05,
      "loss": 1.7545,
      "step": 883
    },
    {
      "epoch": 1.13,
      "grad_norm": 0.369140625,
      "learning_rate": 3.0303030303030306e-05,
      "loss": 1.7122,
      "step": 884
    },
    {
      "epoch": 1.13,
      "grad_norm": 0.36328125,
      "learning_rate": 3.0086580086580092e-05,
      "loss": 1.6433,
      "step": 885
    },
    {
      "epoch": 1.13,
      "grad_norm": 0.37109375,
      "learning_rate": 2.9870129870129872e-05,
      "loss": 1.6908,
      "step": 886
    },
    {
      "epoch": 1.13,
      "grad_norm": 0.326171875,
      "learning_rate": 2.9653679653679655e-05,
      "loss": 1.8096,
      "step": 887
    },
    {
      "epoch": 1.14,
      "grad_norm": 0.375,
      "learning_rate": 2.943722943722944e-05,
      "loss": 1.5972,
      "step": 888
    },
    {
      "epoch": 1.14,
      "grad_norm": 0.345703125,
      "learning_rate": 2.922077922077922e-05,
      "loss": 1.7858,
      "step": 889
    },
    {
      "epoch": 1.14,
      "grad_norm": 0.326171875,
      "learning_rate": 2.9004329004329005e-05,
      "loss": 1.7353,
      "step": 890
    },
    {
      "epoch": 1.14,
      "grad_norm": 0.35546875,
      "learning_rate": 2.878787878787879e-05,
      "loss": 1.7572,
      "step": 891
    },
    {
      "epoch": 1.14,
      "grad_norm": 0.361328125,
      "learning_rate": 2.857142857142857e-05,
      "loss": 1.7268,
      "step": 892
    },
    {
      "epoch": 1.14,
      "grad_norm": 0.37109375,
      "learning_rate": 2.8354978354978357e-05,
      "loss": 1.7919,
      "step": 893
    },
    {
      "epoch": 1.14,
      "grad_norm": 0.37109375,
      "learning_rate": 2.813852813852814e-05,
      "loss": 1.735,
      "step": 894
    },
    {
      "epoch": 1.15,
      "grad_norm": 0.32421875,
      "learning_rate": 2.792207792207792e-05,
      "loss": 1.7174,
      "step": 895
    },
    {
      "epoch": 1.15,
      "grad_norm": 0.37890625,
      "learning_rate": 2.7705627705627707e-05,
      "loss": 1.6896,
      "step": 896
    },
    {
      "epoch": 1.15,
      "grad_norm": 0.318359375,
      "learning_rate": 2.7489177489177493e-05,
      "loss": 1.7908,
      "step": 897
    },
    {
      "epoch": 1.15,
      "grad_norm": 0.33984375,
      "learning_rate": 2.7272727272727273e-05,
      "loss": 1.7684,
      "step": 898
    },
    {
      "epoch": 1.15,
      "grad_norm": 0.33203125,
      "learning_rate": 2.7056277056277056e-05,
      "loss": 1.7138,
      "step": 899
    },
    {
      "epoch": 1.15,
      "grad_norm": 0.34375,
      "learning_rate": 2.6839826839826843e-05,
      "loss": 1.7012,
      "step": 900
    },
    {
      "epoch": 1.15,
      "eval_loss": 1.725927710533142,
      "eval_runtime": 125.737,
      "eval_samples_per_second": 39.766,
      "eval_steps_per_second": 1.249,
      "step": 900
    },
    {
      "epoch": 1.15,
      "grad_norm": 0.35546875,
      "learning_rate": 2.6623376623376623e-05,
      "loss": 1.7415,
      "step": 901
    },
    {
      "epoch": 1.15,
      "grad_norm": 0.365234375,
      "learning_rate": 2.640692640692641e-05,
      "loss": 1.7195,
      "step": 902
    },
    {
      "epoch": 1.16,
      "grad_norm": 0.337890625,
      "learning_rate": 2.6190476190476192e-05,
      "loss": 1.7539,
      "step": 903
    },
    {
      "epoch": 1.16,
      "grad_norm": 0.369140625,
      "learning_rate": 2.5974025974025972e-05,
      "loss": 1.6218,
      "step": 904
    },
    {
      "epoch": 1.16,
      "grad_norm": 0.3125,
      "learning_rate": 2.575757575757576e-05,
      "loss": 1.6949,
      "step": 905
    },
    {
      "epoch": 1.16,
      "grad_norm": 0.361328125,
      "learning_rate": 2.5541125541125545e-05,
      "loss": 1.7539,
      "step": 906
    },
    {
      "epoch": 1.16,
      "grad_norm": 0.345703125,
      "learning_rate": 2.5324675324675325e-05,
      "loss": 1.7398,
      "step": 907
    },
    {
      "epoch": 1.16,
      "grad_norm": 0.3515625,
      "learning_rate": 2.5108225108225108e-05,
      "loss": 1.7924,
      "step": 908
    },
    {
      "epoch": 1.16,
      "grad_norm": 0.34765625,
      "learning_rate": 2.4891774891774894e-05,
      "loss": 1.7182,
      "step": 909
    },
    {
      "epoch": 1.16,
      "grad_norm": 0.412109375,
      "learning_rate": 2.4675324675324678e-05,
      "loss": 1.7197,
      "step": 910
    },
    {
      "epoch": 1.17,
      "grad_norm": 0.310546875,
      "learning_rate": 2.4458874458874457e-05,
      "loss": 1.6976,
      "step": 911
    },
    {
      "epoch": 1.17,
      "grad_norm": 0.369140625,
      "learning_rate": 2.4242424242424244e-05,
      "loss": 1.7478,
      "step": 912
    },
    {
      "epoch": 1.17,
      "grad_norm": 0.333984375,
      "learning_rate": 2.4025974025974027e-05,
      "loss": 1.6187,
      "step": 913
    },
    {
      "epoch": 1.17,
      "grad_norm": 0.345703125,
      "learning_rate": 2.380952380952381e-05,
      "loss": 1.7191,
      "step": 914
    },
    {
      "epoch": 1.17,
      "grad_norm": 0.439453125,
      "learning_rate": 2.3593073593073593e-05,
      "loss": 1.6616,
      "step": 915
    },
    {
      "epoch": 1.17,
      "grad_norm": 0.328125,
      "learning_rate": 2.3376623376623376e-05,
      "loss": 1.6345,
      "step": 916
    },
    {
      "epoch": 1.17,
      "grad_norm": 0.447265625,
      "learning_rate": 2.3160173160173163e-05,
      "loss": 1.7367,
      "step": 917
    },
    {
      "epoch": 1.17,
      "grad_norm": 0.40625,
      "learning_rate": 2.2943722943722946e-05,
      "loss": 1.7257,
      "step": 918
    },
    {
      "epoch": 1.18,
      "grad_norm": 0.35546875,
      "learning_rate": 2.272727272727273e-05,
      "loss": 1.6999,
      "step": 919
    },
    {
      "epoch": 1.18,
      "grad_norm": 0.37109375,
      "learning_rate": 2.2510822510822512e-05,
      "loss": 1.6337,
      "step": 920
    },
    {
      "epoch": 1.18,
      "grad_norm": 0.349609375,
      "learning_rate": 2.2294372294372296e-05,
      "loss": 1.7632,
      "step": 921
    },
    {
      "epoch": 1.18,
      "grad_norm": 0.341796875,
      "learning_rate": 2.207792207792208e-05,
      "loss": 1.7686,
      "step": 922
    },
    {
      "epoch": 1.18,
      "grad_norm": 0.3671875,
      "learning_rate": 2.1861471861471862e-05,
      "loss": 1.6897,
      "step": 923
    },
    {
      "epoch": 1.18,
      "grad_norm": 0.365234375,
      "learning_rate": 2.1645021645021645e-05,
      "loss": 1.8555,
      "step": 924
    },
    {
      "epoch": 1.18,
      "grad_norm": 0.408203125,
      "learning_rate": 2.1428571428571428e-05,
      "loss": 1.7255,
      "step": 925
    },
    {
      "epoch": 1.18,
      "grad_norm": 0.3359375,
      "learning_rate": 2.1212121212121215e-05,
      "loss": 1.6774,
      "step": 926
    },
    {
      "epoch": 1.19,
      "grad_norm": 0.39453125,
      "learning_rate": 2.0995670995670998e-05,
      "loss": 1.7524,
      "step": 927
    },
    {
      "epoch": 1.19,
      "grad_norm": 0.328125,
      "learning_rate": 2.077922077922078e-05,
      "loss": 1.6569,
      "step": 928
    },
    {
      "epoch": 1.19,
      "grad_norm": 0.5078125,
      "learning_rate": 2.0562770562770564e-05,
      "loss": 1.7028,
      "step": 929
    },
    {
      "epoch": 1.19,
      "grad_norm": 0.349609375,
      "learning_rate": 2.0346320346320347e-05,
      "loss": 1.7377,
      "step": 930
    },
    {
      "epoch": 1.19,
      "grad_norm": 0.341796875,
      "learning_rate": 2.012987012987013e-05,
      "loss": 1.6855,
      "step": 931
    },
    {
      "epoch": 1.19,
      "grad_norm": 0.326171875,
      "learning_rate": 1.9913419913419914e-05,
      "loss": 1.6917,
      "step": 932
    },
    {
      "epoch": 1.19,
      "grad_norm": 0.341796875,
      "learning_rate": 1.9696969696969697e-05,
      "loss": 1.714,
      "step": 933
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.333984375,
      "learning_rate": 1.9480519480519483e-05,
      "loss": 1.7653,
      "step": 934
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.337890625,
      "learning_rate": 1.9264069264069266e-05,
      "loss": 1.6973,
      "step": 935
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.34765625,
      "learning_rate": 1.9047619047619046e-05,
      "loss": 1.7066,
      "step": 936
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.34375,
      "learning_rate": 1.8831168831168833e-05,
      "loss": 1.6812,
      "step": 937
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.341796875,
      "learning_rate": 1.8614718614718616e-05,
      "loss": 1.7114,
      "step": 938
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.3984375,
      "learning_rate": 1.83982683982684e-05,
      "loss": 1.7109,
      "step": 939
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.375,
      "learning_rate": 1.8181818181818182e-05,
      "loss": 1.6037,
      "step": 940
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.34765625,
      "learning_rate": 1.7965367965367965e-05,
      "loss": 1.7588,
      "step": 941
    },
    {
      "epoch": 1.21,
      "grad_norm": 0.412109375,
      "learning_rate": 1.7748917748917752e-05,
      "loss": 1.7093,
      "step": 942
    },
    {
      "epoch": 1.21,
      "grad_norm": 0.353515625,
      "learning_rate": 1.7532467532467535e-05,
      "loss": 1.7642,
      "step": 943
    },
    {
      "epoch": 1.21,
      "grad_norm": 0.466796875,
      "learning_rate": 1.7316017316017315e-05,
      "loss": 1.724,
      "step": 944
    },
    {
      "epoch": 1.21,
      "grad_norm": 0.34375,
      "learning_rate": 1.70995670995671e-05,
      "loss": 1.6136,
      "step": 945
    },
    {
      "epoch": 1.21,
      "grad_norm": 0.431640625,
      "learning_rate": 1.6883116883116884e-05,
      "loss": 1.6256,
      "step": 946
    },
    {
      "epoch": 1.21,
      "grad_norm": 0.400390625,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 1.6878,
      "step": 947
    },
    {
      "epoch": 1.21,
      "grad_norm": 0.388671875,
      "learning_rate": 1.645021645021645e-05,
      "loss": 1.7281,
      "step": 948
    },
    {
      "epoch": 1.21,
      "grad_norm": 0.3359375,
      "learning_rate": 1.6233766233766234e-05,
      "loss": 1.7021,
      "step": 949
    },
    {
      "epoch": 1.22,
      "grad_norm": 0.375,
      "learning_rate": 1.6017316017316017e-05,
      "loss": 1.7982,
      "step": 950
    },
    {
      "epoch": 1.22,
      "eval_loss": 1.7258570194244385,
      "eval_runtime": 124.9465,
      "eval_samples_per_second": 40.017,
      "eval_steps_per_second": 1.257,
      "step": 950
    },
    {
      "epoch": 1.22,
      "grad_norm": 0.400390625,
      "learning_rate": 1.5800865800865803e-05,
      "loss": 1.641,
      "step": 951
    },
    {
      "epoch": 1.22,
      "grad_norm": 0.357421875,
      "learning_rate": 1.5584415584415583e-05,
      "loss": 1.6773,
      "step": 952
    },
    {
      "epoch": 1.22,
      "grad_norm": 0.341796875,
      "learning_rate": 1.5367965367965366e-05,
      "loss": 1.654,
      "step": 953
    },
    {
      "epoch": 1.22,
      "grad_norm": 0.380859375,
      "learning_rate": 1.5151515151515153e-05,
      "loss": 1.7277,
      "step": 954
    },
    {
      "epoch": 1.22,
      "grad_norm": 0.359375,
      "learning_rate": 1.4935064935064936e-05,
      "loss": 1.6995,
      "step": 955
    },
    {
      "epoch": 1.22,
      "grad_norm": 0.359375,
      "learning_rate": 1.471861471861472e-05,
      "loss": 1.7112,
      "step": 956
    },
    {
      "epoch": 1.22,
      "grad_norm": 0.35546875,
      "learning_rate": 1.4502164502164502e-05,
      "loss": 1.7486,
      "step": 957
    },
    {
      "epoch": 1.23,
      "grad_norm": 0.40234375,
      "learning_rate": 1.4285714285714285e-05,
      "loss": 1.7627,
      "step": 958
    },
    {
      "epoch": 1.23,
      "grad_norm": 0.34375,
      "learning_rate": 1.406926406926407e-05,
      "loss": 1.6709,
      "step": 959
    },
    {
      "epoch": 1.23,
      "grad_norm": 0.345703125,
      "learning_rate": 1.3852813852813853e-05,
      "loss": 1.7112,
      "step": 960
    },
    {
      "epoch": 1.23,
      "grad_norm": 0.375,
      "learning_rate": 1.3636363636363637e-05,
      "loss": 1.7299,
      "step": 961
    },
    {
      "epoch": 1.23,
      "grad_norm": 0.390625,
      "learning_rate": 1.3419913419913421e-05,
      "loss": 1.6656,
      "step": 962
    },
    {
      "epoch": 1.23,
      "grad_norm": 0.37890625,
      "learning_rate": 1.3203463203463205e-05,
      "loss": 1.6621,
      "step": 963
    },
    {
      "epoch": 1.23,
      "grad_norm": 0.33203125,
      "learning_rate": 1.2987012987012986e-05,
      "loss": 1.6751,
      "step": 964
    },
    {
      "epoch": 1.23,
      "grad_norm": 0.48828125,
      "learning_rate": 1.2770562770562773e-05,
      "loss": 1.7,
      "step": 965
    },
    {
      "epoch": 1.24,
      "grad_norm": 0.345703125,
      "learning_rate": 1.2554112554112554e-05,
      "loss": 1.7547,
      "step": 966
    },
    {
      "epoch": 1.24,
      "grad_norm": 0.357421875,
      "learning_rate": 1.2337662337662339e-05,
      "loss": 1.6858,
      "step": 967
    },
    {
      "epoch": 1.24,
      "grad_norm": 0.33984375,
      "learning_rate": 1.2121212121212122e-05,
      "loss": 1.5868,
      "step": 968
    },
    {
      "epoch": 1.24,
      "grad_norm": 0.3359375,
      "learning_rate": 1.1904761904761905e-05,
      "loss": 1.7628,
      "step": 969
    },
    {
      "epoch": 1.24,
      "grad_norm": 0.314453125,
      "learning_rate": 1.1688311688311688e-05,
      "loss": 1.6664,
      "step": 970
    },
    {
      "epoch": 1.24,
      "grad_norm": 0.33984375,
      "learning_rate": 1.1471861471861473e-05,
      "loss": 1.7387,
      "step": 971
    },
    {
      "epoch": 1.24,
      "grad_norm": 0.392578125,
      "learning_rate": 1.1255411255411256e-05,
      "loss": 1.7765,
      "step": 972
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.341796875,
      "learning_rate": 1.103896103896104e-05,
      "loss": 1.7887,
      "step": 973
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.341796875,
      "learning_rate": 1.0822510822510823e-05,
      "loss": 1.7143,
      "step": 974
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.33984375,
      "learning_rate": 1.0606060606060607e-05,
      "loss": 1.6573,
      "step": 975
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.33203125,
      "learning_rate": 1.038961038961039e-05,
      "loss": 1.7191,
      "step": 976
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.337890625,
      "learning_rate": 1.0173160173160174e-05,
      "loss": 1.7437,
      "step": 977
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.36328125,
      "learning_rate": 9.956709956709957e-06,
      "loss": 1.8012,
      "step": 978
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.3515625,
      "learning_rate": 9.740259740259742e-06,
      "loss": 1.6769,
      "step": 979
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.326171875,
      "learning_rate": 9.523809523809523e-06,
      "loss": 1.7836,
      "step": 980
    },
    {
      "epoch": 1.26,
      "grad_norm": 0.375,
      "learning_rate": 9.307359307359308e-06,
      "loss": 1.6654,
      "step": 981
    },
    {
      "epoch": 1.26,
      "grad_norm": 0.37109375,
      "learning_rate": 9.090909090909091e-06,
      "loss": 1.7367,
      "step": 982
    },
    {
      "epoch": 1.26,
      "grad_norm": 0.353515625,
      "learning_rate": 8.874458874458876e-06,
      "loss": 1.6735,
      "step": 983
    },
    {
      "epoch": 1.26,
      "grad_norm": 0.32421875,
      "learning_rate": 8.658008658008657e-06,
      "loss": 1.5795,
      "step": 984
    },
    {
      "epoch": 1.26,
      "grad_norm": 0.333984375,
      "learning_rate": 8.441558441558442e-06,
      "loss": 1.6784,
      "step": 985
    },
    {
      "epoch": 1.26,
      "grad_norm": 0.33203125,
      "learning_rate": 8.225108225108225e-06,
      "loss": 1.6743,
      "step": 986
    },
    {
      "epoch": 1.26,
      "grad_norm": 0.3671875,
      "learning_rate": 8.008658008658008e-06,
      "loss": 1.6877,
      "step": 987
    },
    {
      "epoch": 1.26,
      "grad_norm": 0.3125,
      "learning_rate": 7.792207792207792e-06,
      "loss": 1.7913,
      "step": 988
    },
    {
      "epoch": 1.27,
      "grad_norm": 0.38671875,
      "learning_rate": 7.5757575757575764e-06,
      "loss": 1.6147,
      "step": 989
    },
    {
      "epoch": 1.27,
      "grad_norm": 0.33203125,
      "learning_rate": 7.35930735930736e-06,
      "loss": 1.7615,
      "step": 990
    },
    {
      "epoch": 1.27,
      "grad_norm": 0.3515625,
      "learning_rate": 7.142857142857143e-06,
      "loss": 1.7406,
      "step": 991
    },
    {
      "epoch": 1.27,
      "grad_norm": 0.427734375,
      "learning_rate": 6.926406926406927e-06,
      "loss": 1.6287,
      "step": 992
    },
    {
      "epoch": 1.27,
      "grad_norm": 0.345703125,
      "learning_rate": 6.709956709956711e-06,
      "loss": 1.7655,
      "step": 993
    },
    {
      "epoch": 1.27,
      "grad_norm": 0.330078125,
      "learning_rate": 6.493506493506493e-06,
      "loss": 1.7003,
      "step": 994
    },
    {
      "epoch": 1.27,
      "grad_norm": 0.345703125,
      "learning_rate": 6.277056277056277e-06,
      "loss": 1.7531,
      "step": 995
    },
    {
      "epoch": 1.27,
      "grad_norm": 0.365234375,
      "learning_rate": 6.060606060606061e-06,
      "loss": 1.7475,
      "step": 996
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.361328125,
      "learning_rate": 5.844155844155844e-06,
      "loss": 1.6811,
      "step": 997
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.3203125,
      "learning_rate": 5.627705627705628e-06,
      "loss": 1.7256,
      "step": 998
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.349609375,
      "learning_rate": 5.411255411255411e-06,
      "loss": 1.7238,
      "step": 999
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.34375,
      "learning_rate": 5.194805194805195e-06,
      "loss": 1.6972,
      "step": 1000
    },
    {
      "epoch": 1.28,
      "eval_loss": 1.7247449159622192,
      "eval_runtime": 125.5655,
      "eval_samples_per_second": 39.82,
      "eval_steps_per_second": 1.25,
      "step": 1000
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.359375,
      "learning_rate": 4.978354978354978e-06,
      "loss": 1.6889,
      "step": 1001
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.330078125,
      "learning_rate": 4.7619047619047615e-06,
      "loss": 1.7495,
      "step": 1002
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.37890625,
      "learning_rate": 4.5454545454545455e-06,
      "loss": 1.6499,
      "step": 1003
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.353515625,
      "learning_rate": 4.329004329004329e-06,
      "loss": 1.6646,
      "step": 1004
    },
    {
      "epoch": 1.29,
      "grad_norm": 0.3671875,
      "learning_rate": 4.112554112554113e-06,
      "loss": 1.7523,
      "step": 1005
    },
    {
      "epoch": 1.29,
      "grad_norm": 0.333984375,
      "learning_rate": 3.896103896103896e-06,
      "loss": 1.7232,
      "step": 1006
    },
    {
      "epoch": 1.29,
      "grad_norm": 0.400390625,
      "learning_rate": 3.67965367965368e-06,
      "loss": 1.7209,
      "step": 1007
    },
    {
      "epoch": 1.29,
      "grad_norm": 0.333984375,
      "learning_rate": 3.4632034632034634e-06,
      "loss": 1.6627,
      "step": 1008
    },
    {
      "epoch": 1.29,
      "grad_norm": 0.3203125,
      "learning_rate": 3.2467532467532465e-06,
      "loss": 1.6678,
      "step": 1009
    },
    {
      "epoch": 1.29,
      "grad_norm": 0.333984375,
      "learning_rate": 3.0303030303030305e-06,
      "loss": 1.6757,
      "step": 1010
    },
    {
      "epoch": 1.29,
      "grad_norm": 0.33203125,
      "learning_rate": 2.813852813852814e-06,
      "loss": 1.7232,
      "step": 1011
    },
    {
      "epoch": 1.29,
      "grad_norm": 0.33203125,
      "learning_rate": 2.5974025974025976e-06,
      "loss": 1.6797,
      "step": 1012
    },
    {
      "epoch": 1.3,
      "grad_norm": 0.337890625,
      "learning_rate": 2.3809523809523808e-06,
      "loss": 1.825,
      "step": 1013
    },
    {
      "epoch": 1.3,
      "grad_norm": 0.337890625,
      "learning_rate": 2.1645021645021643e-06,
      "loss": 1.6841,
      "step": 1014
    },
    {
      "epoch": 1.3,
      "grad_norm": 0.328125,
      "learning_rate": 1.948051948051948e-06,
      "loss": 1.726,
      "step": 1015
    },
    {
      "epoch": 1.3,
      "grad_norm": 0.392578125,
      "learning_rate": 1.7316017316017317e-06,
      "loss": 1.7702,
      "step": 1016
    },
    {
      "epoch": 1.3,
      "grad_norm": 0.3359375,
      "learning_rate": 1.5151515151515152e-06,
      "loss": 1.7689,
      "step": 1017
    },
    {
      "epoch": 1.3,
      "grad_norm": 0.37109375,
      "learning_rate": 1.2987012987012988e-06,
      "loss": 1.7003,
      "step": 1018
    },
    {
      "epoch": 1.3,
      "grad_norm": 0.40234375,
      "learning_rate": 1.0822510822510822e-06,
      "loss": 1.6841,
      "step": 1019
    },
    {
      "epoch": 1.31,
      "grad_norm": 0.380859375,
      "learning_rate": 8.658008658008658e-07,
      "loss": 1.7318,
      "step": 1020
    },
    {
      "epoch": 1.31,
      "grad_norm": 0.345703125,
      "learning_rate": 6.493506493506494e-07,
      "loss": 1.7868,
      "step": 1021
    },
    {
      "epoch": 1.31,
      "grad_norm": 0.34765625,
      "learning_rate": 4.329004329004329e-07,
      "loss": 1.7199,
      "step": 1022
    },
    {
      "epoch": 1.31,
      "grad_norm": 0.359375,
      "learning_rate": 2.1645021645021646e-07,
      "loss": 1.7633,
      "step": 1023
    },
    {
      "epoch": 1.31,
      "grad_norm": 0.328125,
      "learning_rate": 0.0,
      "loss": 1.7878,
      "step": 1024
    },
    {
      "epoch": 1.31,
      "step": 1024,
      "total_flos": 7.46638513614422e+17,
      "train_loss": 1.8673716291086748,
      "train_runtime": 7714.204,
      "train_samples_per_second": 8.495,
      "train_steps_per_second": 0.133
    },
    {
      "epoch": 1.31,
      "eval_loss": 1.7247449159622192,
      "eval_runtime": 125.8509,
      "eval_samples_per_second": 39.73,
      "eval_steps_per_second": 1.248,
      "step": 1024
    }
  ],
  "logging_steps": 1,
  "max_steps": 1024,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 50,
  "total_flos": 7.46638513614422e+17,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}