{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.18875838926174496, "eval_steps": 199, "global_step": 2600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 380.0, "learning_rate": 8.403361344537815e-08, "loss": 10.0168, "num_input_tokens_seen": 2097152, "step": 1 }, { "epoch": 0.0, "grad_norm": 348.0, "learning_rate": 1.680672268907563e-07, "loss": 9.9578, "num_input_tokens_seen": 4194304, "step": 2 }, { "epoch": 0.0, "grad_norm": 372.0, "learning_rate": 2.5210084033613445e-07, "loss": 9.9307, "num_input_tokens_seen": 6291456, "step": 3 }, { "epoch": 0.0, "grad_norm": 356.0, "learning_rate": 3.361344537815126e-07, "loss": 10.0096, "num_input_tokens_seen": 8388608, "step": 4 }, { "epoch": 0.0, "grad_norm": 422.0, "learning_rate": 4.201680672268908e-07, "loss": 10.0019, "num_input_tokens_seen": 10485760, "step": 5 }, { "epoch": 0.0, "grad_norm": 344.0, "learning_rate": 5.042016806722689e-07, "loss": 9.8981, "num_input_tokens_seen": 12582912, "step": 6 }, { "epoch": 0.0, "grad_norm": 346.0, "learning_rate": 5.882352941176471e-07, "loss": 9.9229, "num_input_tokens_seen": 14680064, "step": 7 }, { "epoch": 0.0, "grad_norm": 304.0, "learning_rate": 6.722689075630252e-07, "loss": 9.8722, "num_input_tokens_seen": 16777216, "step": 8 }, { "epoch": 0.0, "grad_norm": 264.0, "learning_rate": 7.563025210084034e-07, "loss": 9.84, "num_input_tokens_seen": 18874368, "step": 9 }, { "epoch": 0.0, "grad_norm": 252.0, "learning_rate": 8.403361344537816e-07, "loss": 9.7397, "num_input_tokens_seen": 20971520, "step": 10 }, { "epoch": 0.0, "grad_norm": 248.0, "learning_rate": 9.243697478991598e-07, "loss": 9.7562, "num_input_tokens_seen": 23068672, "step": 11 }, { "epoch": 0.0, "grad_norm": 214.0, "learning_rate": 1.0084033613445378e-06, "loss": 9.6955, "num_input_tokens_seen": 25165824, "step": 12 }, { "epoch": 0.0, "grad_norm": 196.0, "learning_rate": 1.092436974789916e-06, "loss": 9.6317, "num_input_tokens_seen": 27262976, "step": 13 }, { "epoch": 0.0, "grad_norm": 157.0, "learning_rate": 1.1764705882352942e-06, "loss": 9.522, "num_input_tokens_seen": 29360128, "step": 14 }, { "epoch": 0.0, "grad_norm": 133.0, "learning_rate": 1.2605042016806724e-06, "loss": 9.4459, "num_input_tokens_seen": 31457280, "step": 15 }, { "epoch": 0.0, "grad_norm": 111.5, "learning_rate": 1.3445378151260504e-06, "loss": 9.3929, "num_input_tokens_seen": 33554432, "step": 16 }, { "epoch": 0.0, "grad_norm": 96.5, "learning_rate": 1.4285714285714286e-06, "loss": 9.3054, "num_input_tokens_seen": 35651584, "step": 17 }, { "epoch": 0.0, "grad_norm": 75.5, "learning_rate": 1.5126050420168068e-06, "loss": 9.2098, "num_input_tokens_seen": 37748736, "step": 18 }, { "epoch": 0.0, "grad_norm": 65.0, "learning_rate": 1.5966386554621848e-06, "loss": 9.1634, "num_input_tokens_seen": 39845888, "step": 19 }, { "epoch": 0.0, "grad_norm": 54.0, "learning_rate": 1.6806722689075632e-06, "loss": 9.1167, "num_input_tokens_seen": 41943040, "step": 20 }, { "epoch": 0.0, "grad_norm": 44.25, "learning_rate": 1.7647058823529414e-06, "loss": 9.0489, "num_input_tokens_seen": 44040192, "step": 21 }, { "epoch": 0.0, "grad_norm": 38.0, "learning_rate": 1.8487394957983196e-06, "loss": 8.9928, "num_input_tokens_seen": 46137344, "step": 22 }, { "epoch": 0.0, "grad_norm": 35.25, "learning_rate": 1.932773109243698e-06, "loss": 8.8916, "num_input_tokens_seen": 48234496, "step": 23 }, { "epoch": 0.01, "grad_norm": 30.0, "learning_rate": 2.0168067226890756e-06, "loss": 8.8398, "num_input_tokens_seen": 50331648, "step": 24 }, { "epoch": 0.01, "grad_norm": 26.125, "learning_rate": 2.100840336134454e-06, "loss": 8.75, "num_input_tokens_seen": 52428800, "step": 25 }, { "epoch": 0.01, "grad_norm": 24.5, "learning_rate": 2.184873949579832e-06, "loss": 8.6363, "num_input_tokens_seen": 54525952, "step": 26 }, { "epoch": 0.01, "grad_norm": 24.625, "learning_rate": 2.2689075630252102e-06, "loss": 8.6346, "num_input_tokens_seen": 56623104, "step": 27 }, { "epoch": 0.01, "grad_norm": 26.75, "learning_rate": 2.3529411764705885e-06, "loss": 8.5871, "num_input_tokens_seen": 58720256, "step": 28 }, { "epoch": 0.01, "grad_norm": 26.75, "learning_rate": 2.4369747899159667e-06, "loss": 8.5476, "num_input_tokens_seen": 60817408, "step": 29 }, { "epoch": 0.01, "grad_norm": 25.25, "learning_rate": 2.521008403361345e-06, "loss": 8.5004, "num_input_tokens_seen": 62914560, "step": 30 }, { "epoch": 0.01, "grad_norm": 22.75, "learning_rate": 2.605042016806723e-06, "loss": 8.4537, "num_input_tokens_seen": 65011712, "step": 31 }, { "epoch": 0.01, "grad_norm": 18.75, "learning_rate": 2.689075630252101e-06, "loss": 8.4088, "num_input_tokens_seen": 67108864, "step": 32 }, { "epoch": 0.01, "grad_norm": 18.5, "learning_rate": 2.7731092436974795e-06, "loss": 8.389, "num_input_tokens_seen": 69206016, "step": 33 }, { "epoch": 0.01, "grad_norm": 15.5625, "learning_rate": 2.8571428571428573e-06, "loss": 8.3228, "num_input_tokens_seen": 71303168, "step": 34 }, { "epoch": 0.01, "grad_norm": 13.5625, "learning_rate": 2.9411764705882355e-06, "loss": 8.2781, "num_input_tokens_seen": 73400320, "step": 35 }, { "epoch": 0.01, "grad_norm": 13.0625, "learning_rate": 3.0252100840336137e-06, "loss": 8.2525, "num_input_tokens_seen": 75497472, "step": 36 }, { "epoch": 0.01, "grad_norm": 13.25, "learning_rate": 3.109243697478992e-06, "loss": 8.1735, "num_input_tokens_seen": 77594624, "step": 37 }, { "epoch": 0.01, "grad_norm": 12.625, "learning_rate": 3.1932773109243696e-06, "loss": 8.1301, "num_input_tokens_seen": 79691776, "step": 38 }, { "epoch": 0.01, "grad_norm": 12.5625, "learning_rate": 3.2773109243697483e-06, "loss": 8.0726, "num_input_tokens_seen": 81788928, "step": 39 }, { "epoch": 0.01, "grad_norm": 11.0, "learning_rate": 3.3613445378151265e-06, "loss": 8.0339, "num_input_tokens_seen": 83886080, "step": 40 }, { "epoch": 0.01, "grad_norm": 10.1875, "learning_rate": 3.4453781512605043e-06, "loss": 8.0077, "num_input_tokens_seen": 85983232, "step": 41 }, { "epoch": 0.01, "grad_norm": 9.5625, "learning_rate": 3.529411764705883e-06, "loss": 7.9286, "num_input_tokens_seen": 88080384, "step": 42 }, { "epoch": 0.01, "grad_norm": 8.4375, "learning_rate": 3.6134453781512607e-06, "loss": 7.9055, "num_input_tokens_seen": 90177536, "step": 43 }, { "epoch": 0.01, "grad_norm": 7.53125, "learning_rate": 3.6974789915966393e-06, "loss": 7.8555, "num_input_tokens_seen": 92274688, "step": 44 }, { "epoch": 0.01, "grad_norm": 7.15625, "learning_rate": 3.781512605042017e-06, "loss": 7.7934, "num_input_tokens_seen": 94371840, "step": 45 }, { "epoch": 0.01, "grad_norm": 7.125, "learning_rate": 3.865546218487396e-06, "loss": 7.7522, "num_input_tokens_seen": 96468992, "step": 46 }, { "epoch": 0.01, "grad_norm": 7.09375, "learning_rate": 3.9495798319327735e-06, "loss": 7.7007, "num_input_tokens_seen": 98566144, "step": 47 }, { "epoch": 0.01, "grad_norm": 7.3125, "learning_rate": 4.033613445378151e-06, "loss": 7.6809, "num_input_tokens_seen": 100663296, "step": 48 }, { "epoch": 0.01, "grad_norm": 7.59375, "learning_rate": 4.11764705882353e-06, "loss": 7.623, "num_input_tokens_seen": 102760448, "step": 49 }, { "epoch": 0.01, "grad_norm": 6.625, "learning_rate": 4.201680672268908e-06, "loss": 7.6064, "num_input_tokens_seen": 104857600, "step": 50 }, { "epoch": 0.01, "grad_norm": 6.1875, "learning_rate": 4.2857142857142855e-06, "loss": 7.5352, "num_input_tokens_seen": 106954752, "step": 51 }, { "epoch": 0.01, "grad_norm": 6.28125, "learning_rate": 4.369747899159664e-06, "loss": 7.4911, "num_input_tokens_seen": 109051904, "step": 52 }, { "epoch": 0.01, "grad_norm": 6.15625, "learning_rate": 4.453781512605043e-06, "loss": 7.4393, "num_input_tokens_seen": 111149056, "step": 53 }, { "epoch": 0.01, "grad_norm": 5.78125, "learning_rate": 4.5378151260504205e-06, "loss": 7.4134, "num_input_tokens_seen": 113246208, "step": 54 }, { "epoch": 0.01, "grad_norm": 5.8125, "learning_rate": 4.621848739495799e-06, "loss": 7.3876, "num_input_tokens_seen": 115343360, "step": 55 }, { "epoch": 0.01, "grad_norm": 5.375, "learning_rate": 4.705882352941177e-06, "loss": 7.3283, "num_input_tokens_seen": 117440512, "step": 56 }, { "epoch": 0.01, "grad_norm": 5.5625, "learning_rate": 4.7899159663865555e-06, "loss": 7.2959, "num_input_tokens_seen": 119537664, "step": 57 }, { "epoch": 0.01, "grad_norm": 5.21875, "learning_rate": 4.873949579831933e-06, "loss": 7.2391, "num_input_tokens_seen": 121634816, "step": 58 }, { "epoch": 0.01, "grad_norm": 5.1875, "learning_rate": 4.957983193277311e-06, "loss": 7.1795, "num_input_tokens_seen": 123731968, "step": 59 }, { "epoch": 0.01, "grad_norm": 5.0625, "learning_rate": 5.04201680672269e-06, "loss": 7.1513, "num_input_tokens_seen": 125829120, "step": 60 }, { "epoch": 0.01, "grad_norm": 5.34375, "learning_rate": 5.1260504201680675e-06, "loss": 7.0947, "num_input_tokens_seen": 127926272, "step": 61 }, { "epoch": 0.01, "grad_norm": 4.90625, "learning_rate": 5.210084033613446e-06, "loss": 7.051, "num_input_tokens_seen": 130023424, "step": 62 }, { "epoch": 0.01, "grad_norm": 4.90625, "learning_rate": 5.294117647058824e-06, "loss": 7.0014, "num_input_tokens_seen": 132120576, "step": 63 }, { "epoch": 0.01, "grad_norm": 4.625, "learning_rate": 5.378151260504202e-06, "loss": 6.9464, "num_input_tokens_seen": 134217728, "step": 64 }, { "epoch": 0.01, "grad_norm": 6.0625, "learning_rate": 5.4621848739495795e-06, "loss": 6.881, "num_input_tokens_seen": 136314880, "step": 65 }, { "epoch": 0.01, "grad_norm": 5.03125, "learning_rate": 5.546218487394959e-06, "loss": 6.8252, "num_input_tokens_seen": 138412032, "step": 66 }, { "epoch": 0.01, "grad_norm": 4.96875, "learning_rate": 5.630252100840337e-06, "loss": 6.8061, "num_input_tokens_seen": 140509184, "step": 67 }, { "epoch": 0.01, "grad_norm": 5.34375, "learning_rate": 5.7142857142857145e-06, "loss": 6.7608, "num_input_tokens_seen": 142606336, "step": 68 }, { "epoch": 0.01, "grad_norm": 4.375, "learning_rate": 5.798319327731093e-06, "loss": 6.7459, "num_input_tokens_seen": 144703488, "step": 69 }, { "epoch": 0.01, "grad_norm": 4.65625, "learning_rate": 5.882352941176471e-06, "loss": 6.679, "num_input_tokens_seen": 146800640, "step": 70 }, { "epoch": 0.01, "grad_norm": 5.625, "learning_rate": 5.9663865546218495e-06, "loss": 6.6458, "num_input_tokens_seen": 148897792, "step": 71 }, { "epoch": 0.02, "grad_norm": 6.25, "learning_rate": 6.050420168067227e-06, "loss": 6.6044, "num_input_tokens_seen": 150994944, "step": 72 }, { "epoch": 0.02, "grad_norm": 4.90625, "learning_rate": 6.134453781512606e-06, "loss": 6.5125, "num_input_tokens_seen": 153092096, "step": 73 }, { "epoch": 0.02, "grad_norm": 4.6875, "learning_rate": 6.218487394957984e-06, "loss": 6.5516, "num_input_tokens_seen": 155189248, "step": 74 }, { "epoch": 0.02, "grad_norm": 4.6875, "learning_rate": 6.3025210084033615e-06, "loss": 6.4681, "num_input_tokens_seen": 157286400, "step": 75 }, { "epoch": 0.02, "grad_norm": 6.1875, "learning_rate": 6.386554621848739e-06, "loss": 6.4484, "num_input_tokens_seen": 159383552, "step": 76 }, { "epoch": 0.02, "grad_norm": 4.625, "learning_rate": 6.470588235294119e-06, "loss": 6.4324, "num_input_tokens_seen": 161480704, "step": 77 }, { "epoch": 0.02, "grad_norm": 4.84375, "learning_rate": 6.5546218487394966e-06, "loss": 6.3783, "num_input_tokens_seen": 163577856, "step": 78 }, { "epoch": 0.02, "grad_norm": 6.34375, "learning_rate": 6.638655462184874e-06, "loss": 6.3552, "num_input_tokens_seen": 165675008, "step": 79 }, { "epoch": 0.02, "grad_norm": 5.8125, "learning_rate": 6.722689075630253e-06, "loss": 6.3333, "num_input_tokens_seen": 167772160, "step": 80 }, { "epoch": 0.02, "grad_norm": 6.21875, "learning_rate": 6.806722689075631e-06, "loss": 6.2396, "num_input_tokens_seen": 169869312, "step": 81 }, { "epoch": 0.02, "grad_norm": 9.0, "learning_rate": 6.8907563025210085e-06, "loss": 6.2413, "num_input_tokens_seen": 171966464, "step": 82 }, { "epoch": 0.02, "grad_norm": 7.53125, "learning_rate": 6.974789915966387e-06, "loss": 6.2187, "num_input_tokens_seen": 174063616, "step": 83 }, { "epoch": 0.02, "grad_norm": 8.4375, "learning_rate": 7.058823529411766e-06, "loss": 6.199, "num_input_tokens_seen": 176160768, "step": 84 }, { "epoch": 0.02, "grad_norm": 8.375, "learning_rate": 7.1428571428571436e-06, "loss": 6.1808, "num_input_tokens_seen": 178257920, "step": 85 }, { "epoch": 0.02, "grad_norm": 6.75, "learning_rate": 7.226890756302521e-06, "loss": 6.152, "num_input_tokens_seen": 180355072, "step": 86 }, { "epoch": 0.02, "grad_norm": 7.6875, "learning_rate": 7.310924369747899e-06, "loss": 6.0847, "num_input_tokens_seen": 182452224, "step": 87 }, { "epoch": 0.02, "grad_norm": 7.125, "learning_rate": 7.394957983193279e-06, "loss": 6.1132, "num_input_tokens_seen": 184549376, "step": 88 }, { "epoch": 0.02, "grad_norm": 7.46875, "learning_rate": 7.478991596638656e-06, "loss": 6.1028, "num_input_tokens_seen": 186646528, "step": 89 }, { "epoch": 0.02, "grad_norm": 6.6875, "learning_rate": 7.563025210084034e-06, "loss": 6.0179, "num_input_tokens_seen": 188743680, "step": 90 }, { "epoch": 0.02, "grad_norm": 6.6875, "learning_rate": 7.647058823529411e-06, "loss": 6.0006, "num_input_tokens_seen": 190840832, "step": 91 }, { "epoch": 0.02, "grad_norm": 7.40625, "learning_rate": 7.731092436974791e-06, "loss": 5.9917, "num_input_tokens_seen": 192937984, "step": 92 }, { "epoch": 0.02, "grad_norm": 5.5, "learning_rate": 7.815126050420168e-06, "loss": 5.9925, "num_input_tokens_seen": 195035136, "step": 93 }, { "epoch": 0.02, "grad_norm": 7.90625, "learning_rate": 7.899159663865547e-06, "loss": 5.9592, "num_input_tokens_seen": 197132288, "step": 94 }, { "epoch": 0.02, "grad_norm": 7.625, "learning_rate": 7.983193277310926e-06, "loss": 5.8628, "num_input_tokens_seen": 199229440, "step": 95 }, { "epoch": 0.02, "grad_norm": 7.96875, "learning_rate": 8.067226890756303e-06, "loss": 5.8931, "num_input_tokens_seen": 201326592, "step": 96 }, { "epoch": 0.02, "grad_norm": 8.0625, "learning_rate": 8.151260504201681e-06, "loss": 5.8708, "num_input_tokens_seen": 203423744, "step": 97 }, { "epoch": 0.02, "grad_norm": 7.3125, "learning_rate": 8.23529411764706e-06, "loss": 5.8157, "num_input_tokens_seen": 205520896, "step": 98 }, { "epoch": 0.02, "grad_norm": 7.46875, "learning_rate": 8.319327731092438e-06, "loss": 5.8302, "num_input_tokens_seen": 207618048, "step": 99 }, { "epoch": 0.02, "grad_norm": 8.875, "learning_rate": 8.403361344537815e-06, "loss": 5.777, "num_input_tokens_seen": 209715200, "step": 100 }, { "epoch": 0.02, "grad_norm": 7.0625, "learning_rate": 8.487394957983194e-06, "loss": 5.7678, "num_input_tokens_seen": 211812352, "step": 101 }, { "epoch": 0.02, "grad_norm": 10.5625, "learning_rate": 8.571428571428571e-06, "loss": 5.7426, "num_input_tokens_seen": 213909504, "step": 102 }, { "epoch": 0.02, "grad_norm": 11.1875, "learning_rate": 8.655462184873951e-06, "loss": 5.7215, "num_input_tokens_seen": 216006656, "step": 103 }, { "epoch": 0.02, "grad_norm": 5.34375, "learning_rate": 8.739495798319328e-06, "loss": 5.7174, "num_input_tokens_seen": 218103808, "step": 104 }, { "epoch": 0.02, "grad_norm": 12.9375, "learning_rate": 8.823529411764707e-06, "loss": 5.7294, "num_input_tokens_seen": 220200960, "step": 105 }, { "epoch": 0.02, "grad_norm": 12.125, "learning_rate": 8.907563025210085e-06, "loss": 5.7037, "num_input_tokens_seen": 222298112, "step": 106 }, { "epoch": 0.02, "grad_norm": 7.78125, "learning_rate": 8.991596638655462e-06, "loss": 5.6525, "num_input_tokens_seen": 224395264, "step": 107 }, { "epoch": 0.02, "grad_norm": 9.0625, "learning_rate": 9.075630252100841e-06, "loss": 5.6111, "num_input_tokens_seen": 226492416, "step": 108 }, { "epoch": 0.02, "grad_norm": 10.125, "learning_rate": 9.15966386554622e-06, "loss": 5.5881, "num_input_tokens_seen": 228589568, "step": 109 }, { "epoch": 0.02, "grad_norm": 9.625, "learning_rate": 9.243697478991598e-06, "loss": 5.6082, "num_input_tokens_seen": 230686720, "step": 110 }, { "epoch": 0.02, "grad_norm": 8.1875, "learning_rate": 9.327731092436975e-06, "loss": 5.5969, "num_input_tokens_seen": 232783872, "step": 111 }, { "epoch": 0.02, "grad_norm": 10.375, "learning_rate": 9.411764705882354e-06, "loss": 5.5476, "num_input_tokens_seen": 234881024, "step": 112 }, { "epoch": 0.02, "grad_norm": 7.90625, "learning_rate": 9.49579831932773e-06, "loss": 5.5416, "num_input_tokens_seen": 236978176, "step": 113 }, { "epoch": 0.02, "grad_norm": 10.3125, "learning_rate": 9.579831932773111e-06, "loss": 5.4936, "num_input_tokens_seen": 239075328, "step": 114 }, { "epoch": 0.02, "grad_norm": 9.1875, "learning_rate": 9.663865546218488e-06, "loss": 5.4903, "num_input_tokens_seen": 241172480, "step": 115 }, { "epoch": 0.02, "grad_norm": 9.4375, "learning_rate": 9.747899159663867e-06, "loss": 5.4852, "num_input_tokens_seen": 243269632, "step": 116 }, { "epoch": 0.02, "grad_norm": 8.75, "learning_rate": 9.831932773109244e-06, "loss": 5.4291, "num_input_tokens_seen": 245366784, "step": 117 }, { "epoch": 0.02, "grad_norm": 6.0, "learning_rate": 9.915966386554622e-06, "loss": 5.4299, "num_input_tokens_seen": 247463936, "step": 118 }, { "epoch": 0.02, "grad_norm": 8.125, "learning_rate": 1e-05, "loss": 5.4144, "num_input_tokens_seen": 249561088, "step": 119 }, { "epoch": 0.03, "grad_norm": 6.96875, "learning_rate": 1.008403361344538e-05, "loss": 5.3902, "num_input_tokens_seen": 251658240, "step": 120 }, { "epoch": 0.03, "grad_norm": 8.1875, "learning_rate": 1.0168067226890756e-05, "loss": 5.3525, "num_input_tokens_seen": 253755392, "step": 121 }, { "epoch": 0.03, "grad_norm": 7.6875, "learning_rate": 1.0252100840336135e-05, "loss": 5.3348, "num_input_tokens_seen": 255852544, "step": 122 }, { "epoch": 0.03, "grad_norm": 9.5625, "learning_rate": 1.0336134453781514e-05, "loss": 5.3413, "num_input_tokens_seen": 257949696, "step": 123 }, { "epoch": 0.03, "grad_norm": 7.21875, "learning_rate": 1.0420168067226892e-05, "loss": 5.3003, "num_input_tokens_seen": 260046848, "step": 124 }, { "epoch": 0.03, "grad_norm": 11.1875, "learning_rate": 1.0504201680672271e-05, "loss": 5.2606, "num_input_tokens_seen": 262144000, "step": 125 }, { "epoch": 0.03, "grad_norm": 8.5625, "learning_rate": 1.0588235294117648e-05, "loss": 5.2561, "num_input_tokens_seen": 264241152, "step": 126 }, { "epoch": 0.03, "grad_norm": 10.8125, "learning_rate": 1.0672268907563026e-05, "loss": 5.2177, "num_input_tokens_seen": 266338304, "step": 127 }, { "epoch": 0.03, "grad_norm": 11.375, "learning_rate": 1.0756302521008403e-05, "loss": 5.2021, "num_input_tokens_seen": 268435456, "step": 128 }, { "epoch": 0.03, "grad_norm": 7.84375, "learning_rate": 1.0840336134453782e-05, "loss": 5.1699, "num_input_tokens_seen": 270532608, "step": 129 }, { "epoch": 0.03, "grad_norm": 11.5, "learning_rate": 1.0924369747899159e-05, "loss": 5.1648, "num_input_tokens_seen": 272629760, "step": 130 }, { "epoch": 0.03, "grad_norm": 8.875, "learning_rate": 1.100840336134454e-05, "loss": 5.1014, "num_input_tokens_seen": 274726912, "step": 131 }, { "epoch": 0.03, "grad_norm": 10.5, "learning_rate": 1.1092436974789918e-05, "loss": 5.106, "num_input_tokens_seen": 276824064, "step": 132 }, { "epoch": 0.03, "grad_norm": 11.125, "learning_rate": 1.1176470588235295e-05, "loss": 5.1104, "num_input_tokens_seen": 278921216, "step": 133 }, { "epoch": 0.03, "grad_norm": 8.0625, "learning_rate": 1.1260504201680673e-05, "loss": 5.1012, "num_input_tokens_seen": 281018368, "step": 134 }, { "epoch": 0.03, "grad_norm": 9.3125, "learning_rate": 1.134453781512605e-05, "loss": 5.0835, "num_input_tokens_seen": 283115520, "step": 135 }, { "epoch": 0.03, "grad_norm": 8.875, "learning_rate": 1.1428571428571429e-05, "loss": 5.0666, "num_input_tokens_seen": 285212672, "step": 136 }, { "epoch": 0.03, "grad_norm": 7.15625, "learning_rate": 1.1512605042016806e-05, "loss": 5.043, "num_input_tokens_seen": 287309824, "step": 137 }, { "epoch": 0.03, "grad_norm": 7.46875, "learning_rate": 1.1596638655462186e-05, "loss": 4.98, "num_input_tokens_seen": 289406976, "step": 138 }, { "epoch": 0.03, "grad_norm": 8.125, "learning_rate": 1.1680672268907565e-05, "loss": 4.9469, "num_input_tokens_seen": 291504128, "step": 139 }, { "epoch": 0.03, "grad_norm": 8.375, "learning_rate": 1.1764705882352942e-05, "loss": 4.9545, "num_input_tokens_seen": 293601280, "step": 140 }, { "epoch": 0.03, "grad_norm": 7.90625, "learning_rate": 1.184873949579832e-05, "loss": 4.9428, "num_input_tokens_seen": 295698432, "step": 141 }, { "epoch": 0.03, "grad_norm": 8.1875, "learning_rate": 1.1932773109243699e-05, "loss": 4.9189, "num_input_tokens_seen": 297795584, "step": 142 }, { "epoch": 0.03, "grad_norm": 8.4375, "learning_rate": 1.2016806722689076e-05, "loss": 4.901, "num_input_tokens_seen": 299892736, "step": 143 }, { "epoch": 0.03, "grad_norm": 7.78125, "learning_rate": 1.2100840336134455e-05, "loss": 4.8716, "num_input_tokens_seen": 301989888, "step": 144 }, { "epoch": 0.03, "grad_norm": 8.5625, "learning_rate": 1.2184873949579832e-05, "loss": 4.8398, "num_input_tokens_seen": 304087040, "step": 145 }, { "epoch": 0.03, "grad_norm": 6.96875, "learning_rate": 1.2268907563025212e-05, "loss": 4.8527, "num_input_tokens_seen": 306184192, "step": 146 }, { "epoch": 0.03, "grad_norm": 10.25, "learning_rate": 1.235294117647059e-05, "loss": 4.8271, "num_input_tokens_seen": 308281344, "step": 147 }, { "epoch": 0.03, "grad_norm": 9.9375, "learning_rate": 1.2436974789915967e-05, "loss": 4.8113, "num_input_tokens_seen": 310378496, "step": 148 }, { "epoch": 0.03, "grad_norm": 5.5, "learning_rate": 1.2521008403361346e-05, "loss": 4.7797, "num_input_tokens_seen": 312475648, "step": 149 }, { "epoch": 0.03, "grad_norm": 10.25, "learning_rate": 1.2605042016806723e-05, "loss": 4.773, "num_input_tokens_seen": 314572800, "step": 150 }, { "epoch": 0.03, "grad_norm": 9.875, "learning_rate": 1.2689075630252102e-05, "loss": 4.7342, "num_input_tokens_seen": 316669952, "step": 151 }, { "epoch": 0.03, "grad_norm": 7.375, "learning_rate": 1.2773109243697479e-05, "loss": 4.7429, "num_input_tokens_seen": 318767104, "step": 152 }, { "epoch": 0.03, "grad_norm": 8.25, "learning_rate": 1.2857142857142859e-05, "loss": 4.6898, "num_input_tokens_seen": 320864256, "step": 153 }, { "epoch": 0.03, "grad_norm": 7.65625, "learning_rate": 1.2941176470588238e-05, "loss": 4.7092, "num_input_tokens_seen": 322961408, "step": 154 }, { "epoch": 0.03, "grad_norm": 8.4375, "learning_rate": 1.3025210084033614e-05, "loss": 4.6652, "num_input_tokens_seen": 325058560, "step": 155 }, { "epoch": 0.03, "grad_norm": 7.0625, "learning_rate": 1.3109243697478993e-05, "loss": 4.6469, "num_input_tokens_seen": 327155712, "step": 156 }, { "epoch": 0.03, "grad_norm": 9.4375, "learning_rate": 1.319327731092437e-05, "loss": 4.6116, "num_input_tokens_seen": 329252864, "step": 157 }, { "epoch": 0.03, "grad_norm": 7.6875, "learning_rate": 1.3277310924369749e-05, "loss": 4.6151, "num_input_tokens_seen": 331350016, "step": 158 }, { "epoch": 0.03, "grad_norm": 6.90625, "learning_rate": 1.3361344537815126e-05, "loss": 4.589, "num_input_tokens_seen": 333447168, "step": 159 }, { "epoch": 0.03, "grad_norm": 8.9375, "learning_rate": 1.3445378151260506e-05, "loss": 4.554, "num_input_tokens_seen": 335544320, "step": 160 }, { "epoch": 0.03, "grad_norm": 6.3125, "learning_rate": 1.3529411764705885e-05, "loss": 4.5447, "num_input_tokens_seen": 337641472, "step": 161 }, { "epoch": 0.03, "grad_norm": 8.0, "learning_rate": 1.3613445378151261e-05, "loss": 4.5427, "num_input_tokens_seen": 339738624, "step": 162 }, { "epoch": 0.03, "grad_norm": 7.46875, "learning_rate": 1.369747899159664e-05, "loss": 4.5064, "num_input_tokens_seen": 341835776, "step": 163 }, { "epoch": 0.03, "grad_norm": 6.75, "learning_rate": 1.3781512605042017e-05, "loss": 4.5028, "num_input_tokens_seen": 343932928, "step": 164 }, { "epoch": 0.03, "grad_norm": 6.625, "learning_rate": 1.3865546218487396e-05, "loss": 4.4906, "num_input_tokens_seen": 346030080, "step": 165 }, { "epoch": 0.03, "grad_norm": 6.15625, "learning_rate": 1.3949579831932774e-05, "loss": 4.4578, "num_input_tokens_seen": 348127232, "step": 166 }, { "epoch": 0.04, "grad_norm": 5.09375, "learning_rate": 1.4033613445378151e-05, "loss": 4.4623, "num_input_tokens_seen": 350224384, "step": 167 }, { "epoch": 0.04, "grad_norm": 7.25, "learning_rate": 1.4117647058823532e-05, "loss": 4.4293, "num_input_tokens_seen": 352321536, "step": 168 }, { "epoch": 0.04, "grad_norm": 7.375, "learning_rate": 1.4201680672268908e-05, "loss": 4.4135, "num_input_tokens_seen": 354418688, "step": 169 }, { "epoch": 0.04, "grad_norm": 6.5625, "learning_rate": 1.4285714285714287e-05, "loss": 4.367, "num_input_tokens_seen": 356515840, "step": 170 }, { "epoch": 0.04, "grad_norm": 7.75, "learning_rate": 1.4369747899159666e-05, "loss": 4.3954, "num_input_tokens_seen": 358612992, "step": 171 }, { "epoch": 0.04, "grad_norm": 5.0, "learning_rate": 1.4453781512605043e-05, "loss": 4.3388, "num_input_tokens_seen": 360710144, "step": 172 }, { "epoch": 0.04, "grad_norm": 8.875, "learning_rate": 1.4537815126050421e-05, "loss": 4.3209, "num_input_tokens_seen": 362807296, "step": 173 }, { "epoch": 0.04, "grad_norm": 5.9375, "learning_rate": 1.4621848739495798e-05, "loss": 4.3165, "num_input_tokens_seen": 364904448, "step": 174 }, { "epoch": 0.04, "grad_norm": 6.0625, "learning_rate": 1.4705882352941179e-05, "loss": 4.3025, "num_input_tokens_seen": 367001600, "step": 175 }, { "epoch": 0.04, "grad_norm": 8.6875, "learning_rate": 1.4789915966386557e-05, "loss": 4.2631, "num_input_tokens_seen": 369098752, "step": 176 }, { "epoch": 0.04, "grad_norm": 6.75, "learning_rate": 1.4873949579831934e-05, "loss": 4.2474, "num_input_tokens_seen": 371195904, "step": 177 }, { "epoch": 0.04, "grad_norm": 9.5625, "learning_rate": 1.4957983193277313e-05, "loss": 4.2473, "num_input_tokens_seen": 373293056, "step": 178 }, { "epoch": 0.04, "grad_norm": 8.1875, "learning_rate": 1.504201680672269e-05, "loss": 4.194, "num_input_tokens_seen": 375390208, "step": 179 }, { "epoch": 0.04, "grad_norm": 9.4375, "learning_rate": 1.5126050420168068e-05, "loss": 4.1842, "num_input_tokens_seen": 377487360, "step": 180 }, { "epoch": 0.04, "grad_norm": 7.8125, "learning_rate": 1.5210084033613445e-05, "loss": 4.1562, "num_input_tokens_seen": 379584512, "step": 181 }, { "epoch": 0.04, "grad_norm": 10.375, "learning_rate": 1.5294117647058822e-05, "loss": 4.1952, "num_input_tokens_seen": 381681664, "step": 182 }, { "epoch": 0.04, "grad_norm": 10.0, "learning_rate": 1.5378151260504204e-05, "loss": 4.1421, "num_input_tokens_seen": 383778816, "step": 183 }, { "epoch": 0.04, "grad_norm": 6.5625, "learning_rate": 1.5462184873949583e-05, "loss": 4.158, "num_input_tokens_seen": 385875968, "step": 184 }, { "epoch": 0.04, "grad_norm": 8.3125, "learning_rate": 1.5546218487394958e-05, "loss": 4.1286, "num_input_tokens_seen": 387973120, "step": 185 }, { "epoch": 0.04, "grad_norm": 4.8125, "learning_rate": 1.5630252100840337e-05, "loss": 4.11, "num_input_tokens_seen": 390070272, "step": 186 }, { "epoch": 0.04, "grad_norm": 8.0625, "learning_rate": 1.5714285714285715e-05, "loss": 4.0771, "num_input_tokens_seen": 392167424, "step": 187 }, { "epoch": 0.04, "grad_norm": 6.8125, "learning_rate": 1.5798319327731094e-05, "loss": 4.07, "num_input_tokens_seen": 394264576, "step": 188 }, { "epoch": 0.04, "grad_norm": 7.90625, "learning_rate": 1.5882352941176473e-05, "loss": 4.0397, "num_input_tokens_seen": 396361728, "step": 189 }, { "epoch": 0.04, "grad_norm": 6.53125, "learning_rate": 1.596638655462185e-05, "loss": 4.0532, "num_input_tokens_seen": 398458880, "step": 190 }, { "epoch": 0.04, "grad_norm": 10.75, "learning_rate": 1.605042016806723e-05, "loss": 4.0002, "num_input_tokens_seen": 400556032, "step": 191 }, { "epoch": 0.04, "grad_norm": 11.125, "learning_rate": 1.6134453781512605e-05, "loss": 3.9925, "num_input_tokens_seen": 402653184, "step": 192 }, { "epoch": 0.04, "grad_norm": 6.25, "learning_rate": 1.6218487394957984e-05, "loss": 3.9996, "num_input_tokens_seen": 404750336, "step": 193 }, { "epoch": 0.04, "grad_norm": 8.5625, "learning_rate": 1.6302521008403362e-05, "loss": 3.9756, "num_input_tokens_seen": 406847488, "step": 194 }, { "epoch": 0.04, "grad_norm": 8.0625, "learning_rate": 1.638655462184874e-05, "loss": 3.9886, "num_input_tokens_seen": 408944640, "step": 195 }, { "epoch": 0.04, "grad_norm": 5.34375, "learning_rate": 1.647058823529412e-05, "loss": 3.9601, "num_input_tokens_seen": 411041792, "step": 196 }, { "epoch": 0.04, "grad_norm": 7.375, "learning_rate": 1.6554621848739495e-05, "loss": 3.953, "num_input_tokens_seen": 413138944, "step": 197 }, { "epoch": 0.04, "grad_norm": 6.125, "learning_rate": 1.6638655462184877e-05, "loss": 3.9374, "num_input_tokens_seen": 415236096, "step": 198 }, { "epoch": 0.04, "grad_norm": 7.75, "learning_rate": 1.6722689075630255e-05, "loss": 3.9469, "num_input_tokens_seen": 417333248, "step": 199 }, { "epoch": 0.04, "eval_loss": 3.913985252380371, "eval_runtime": 2006.3376, "eval_samples_per_second": 1.965, "eval_steps_per_second": 0.491, "num_input_tokens_seen": 417333248, "step": 199 }, { "epoch": 0.04, "grad_norm": 7.25, "learning_rate": 1.680672268907563e-05, "loss": 3.9549, "num_input_tokens_seen": 419430400, "step": 200 }, { "epoch": 0.04, "grad_norm": 7.40625, "learning_rate": 1.689075630252101e-05, "loss": 3.923, "num_input_tokens_seen": 421527552, "step": 201 }, { "epoch": 0.04, "grad_norm": 8.625, "learning_rate": 1.6974789915966388e-05, "loss": 3.9051, "num_input_tokens_seen": 423624704, "step": 202 }, { "epoch": 0.04, "grad_norm": 7.4375, "learning_rate": 1.7058823529411767e-05, "loss": 3.8699, "num_input_tokens_seen": 425721856, "step": 203 }, { "epoch": 0.04, "grad_norm": 5.75, "learning_rate": 1.7142857142857142e-05, "loss": 3.8571, "num_input_tokens_seen": 427819008, "step": 204 }, { "epoch": 0.04, "grad_norm": 7.125, "learning_rate": 1.7226890756302524e-05, "loss": 3.8535, "num_input_tokens_seen": 429916160, "step": 205 }, { "epoch": 0.04, "grad_norm": 6.15625, "learning_rate": 1.7310924369747902e-05, "loss": 3.7989, "num_input_tokens_seen": 432013312, "step": 206 }, { "epoch": 0.04, "grad_norm": 5.75, "learning_rate": 1.7394957983193278e-05, "loss": 3.8366, "num_input_tokens_seen": 434110464, "step": 207 }, { "epoch": 0.04, "grad_norm": 7.78125, "learning_rate": 1.7478991596638656e-05, "loss": 3.8179, "num_input_tokens_seen": 436207616, "step": 208 }, { "epoch": 0.04, "grad_norm": 7.5625, "learning_rate": 1.7563025210084035e-05, "loss": 3.7902, "num_input_tokens_seen": 438304768, "step": 209 }, { "epoch": 0.04, "grad_norm": 5.15625, "learning_rate": 1.7647058823529414e-05, "loss": 3.7911, "num_input_tokens_seen": 440401920, "step": 210 }, { "epoch": 0.04, "grad_norm": 10.0, "learning_rate": 1.7731092436974792e-05, "loss": 3.7648, "num_input_tokens_seen": 442499072, "step": 211 }, { "epoch": 0.04, "grad_norm": 10.0625, "learning_rate": 1.781512605042017e-05, "loss": 3.7809, "num_input_tokens_seen": 444596224, "step": 212 }, { "epoch": 0.04, "grad_norm": 6.375, "learning_rate": 1.789915966386555e-05, "loss": 3.7486, "num_input_tokens_seen": 446693376, "step": 213 }, { "epoch": 0.04, "grad_norm": 6.125, "learning_rate": 1.7983193277310925e-05, "loss": 3.7506, "num_input_tokens_seen": 448790528, "step": 214 }, { "epoch": 0.05, "grad_norm": 6.3125, "learning_rate": 1.8067226890756303e-05, "loss": 3.7224, "num_input_tokens_seen": 450887680, "step": 215 }, { "epoch": 0.05, "grad_norm": 6.96875, "learning_rate": 1.8151260504201682e-05, "loss": 3.6858, "num_input_tokens_seen": 452984832, "step": 216 }, { "epoch": 0.05, "grad_norm": 6.0625, "learning_rate": 1.823529411764706e-05, "loss": 3.7196, "num_input_tokens_seen": 455081984, "step": 217 }, { "epoch": 0.05, "grad_norm": 4.71875, "learning_rate": 1.831932773109244e-05, "loss": 3.7189, "num_input_tokens_seen": 457179136, "step": 218 }, { "epoch": 0.05, "grad_norm": 4.28125, "learning_rate": 1.8403361344537814e-05, "loss": 3.6689, "num_input_tokens_seen": 459276288, "step": 219 }, { "epoch": 0.05, "grad_norm": 5.1875, "learning_rate": 1.8487394957983196e-05, "loss": 3.6684, "num_input_tokens_seen": 461373440, "step": 220 }, { "epoch": 0.05, "grad_norm": 4.40625, "learning_rate": 1.8571428571428575e-05, "loss": 3.6715, "num_input_tokens_seen": 463470592, "step": 221 }, { "epoch": 0.05, "grad_norm": 5.78125, "learning_rate": 1.865546218487395e-05, "loss": 3.6402, "num_input_tokens_seen": 465567744, "step": 222 }, { "epoch": 0.05, "grad_norm": 5.4375, "learning_rate": 1.873949579831933e-05, "loss": 3.6363, "num_input_tokens_seen": 467664896, "step": 223 }, { "epoch": 0.05, "grad_norm": 5.40625, "learning_rate": 1.8823529411764708e-05, "loss": 3.6408, "num_input_tokens_seen": 469762048, "step": 224 }, { "epoch": 0.05, "grad_norm": 4.90625, "learning_rate": 1.8907563025210086e-05, "loss": 3.6181, "num_input_tokens_seen": 471859200, "step": 225 }, { "epoch": 0.05, "grad_norm": 4.78125, "learning_rate": 1.899159663865546e-05, "loss": 3.6198, "num_input_tokens_seen": 473956352, "step": 226 }, { "epoch": 0.05, "grad_norm": 4.71875, "learning_rate": 1.9075630252100844e-05, "loss": 3.5722, "num_input_tokens_seen": 476053504, "step": 227 }, { "epoch": 0.05, "grad_norm": 5.09375, "learning_rate": 1.9159663865546222e-05, "loss": 3.5493, "num_input_tokens_seen": 478150656, "step": 228 }, { "epoch": 0.05, "grad_norm": 5.84375, "learning_rate": 1.9243697478991597e-05, "loss": 3.5328, "num_input_tokens_seen": 480247808, "step": 229 }, { "epoch": 0.05, "grad_norm": 6.0625, "learning_rate": 1.9327731092436976e-05, "loss": 3.569, "num_input_tokens_seen": 482344960, "step": 230 }, { "epoch": 0.05, "grad_norm": 3.890625, "learning_rate": 1.9411764705882355e-05, "loss": 3.459, "num_input_tokens_seen": 484442112, "step": 231 }, { "epoch": 0.05, "grad_norm": 8.4375, "learning_rate": 1.9495798319327733e-05, "loss": 3.5222, "num_input_tokens_seen": 486539264, "step": 232 }, { "epoch": 0.05, "grad_norm": 7.375, "learning_rate": 1.957983193277311e-05, "loss": 3.5104, "num_input_tokens_seen": 488636416, "step": 233 }, { "epoch": 0.05, "grad_norm": 4.875, "learning_rate": 1.9663865546218487e-05, "loss": 3.5062, "num_input_tokens_seen": 490733568, "step": 234 }, { "epoch": 0.05, "grad_norm": 6.34375, "learning_rate": 1.974789915966387e-05, "loss": 3.4506, "num_input_tokens_seen": 492830720, "step": 235 }, { "epoch": 0.05, "grad_norm": 4.8125, "learning_rate": 1.9831932773109244e-05, "loss": 3.4939, "num_input_tokens_seen": 494927872, "step": 236 }, { "epoch": 0.05, "grad_norm": 5.8125, "learning_rate": 1.9915966386554623e-05, "loss": 3.4827, "num_input_tokens_seen": 497025024, "step": 237 }, { "epoch": 0.05, "grad_norm": 5.375, "learning_rate": 2e-05, "loss": 3.4589, "num_input_tokens_seen": 499122176, "step": 238 }, { "epoch": 0.05, "grad_norm": 4.09375, "learning_rate": 2e-05, "loss": 3.4193, "num_input_tokens_seen": 501219328, "step": 239 }, { "epoch": 0.05, "grad_norm": 4.28125, "learning_rate": 2e-05, "loss": 3.4441, "num_input_tokens_seen": 503316480, "step": 240 }, { "epoch": 0.05, "grad_norm": 3.59375, "learning_rate": 2e-05, "loss": 3.4409, "num_input_tokens_seen": 505413632, "step": 241 }, { "epoch": 0.05, "grad_norm": 4.65625, "learning_rate": 2e-05, "loss": 3.4129, "num_input_tokens_seen": 507510784, "step": 242 }, { "epoch": 0.05, "grad_norm": 3.90625, "learning_rate": 2e-05, "loss": 3.4003, "num_input_tokens_seen": 509607936, "step": 243 }, { "epoch": 0.05, "grad_norm": 3.015625, "learning_rate": 2e-05, "loss": 3.4113, "num_input_tokens_seen": 511705088, "step": 244 }, { "epoch": 0.05, "grad_norm": 4.25, "learning_rate": 2e-05, "loss": 3.38, "num_input_tokens_seen": 513802240, "step": 245 }, { "epoch": 0.05, "grad_norm": 3.40625, "learning_rate": 2e-05, "loss": 3.3491, "num_input_tokens_seen": 515899392, "step": 246 }, { "epoch": 0.05, "grad_norm": 4.09375, "learning_rate": 2e-05, "loss": 3.3739, "num_input_tokens_seen": 517996544, "step": 247 }, { "epoch": 0.05, "grad_norm": 3.25, "learning_rate": 2e-05, "loss": 3.3593, "num_input_tokens_seen": 520093696, "step": 248 }, { "epoch": 0.05, "grad_norm": 4.5625, "learning_rate": 2e-05, "loss": 3.3046, "num_input_tokens_seen": 522190848, "step": 249 }, { "epoch": 0.05, "grad_norm": 3.984375, "learning_rate": 2e-05, "loss": 3.344, "num_input_tokens_seen": 524288000, "step": 250 }, { "epoch": 0.05, "grad_norm": 4.125, "learning_rate": 2e-05, "loss": 3.3125, "num_input_tokens_seen": 526385152, "step": 251 }, { "epoch": 0.05, "grad_norm": 4.0625, "learning_rate": 2e-05, "loss": 3.2951, "num_input_tokens_seen": 528482304, "step": 252 }, { "epoch": 0.05, "grad_norm": 4.46875, "learning_rate": 2e-05, "loss": 3.3293, "num_input_tokens_seen": 530579456, "step": 253 }, { "epoch": 0.05, "grad_norm": 4.21875, "learning_rate": 2e-05, "loss": 3.2859, "num_input_tokens_seen": 532676608, "step": 254 }, { "epoch": 0.05, "grad_norm": 3.9375, "learning_rate": 2e-05, "loss": 3.295, "num_input_tokens_seen": 534773760, "step": 255 }, { "epoch": 0.05, "grad_norm": 3.84375, "learning_rate": 2e-05, "loss": 3.3064, "num_input_tokens_seen": 536870912, "step": 256 }, { "epoch": 0.05, "grad_norm": 3.296875, "learning_rate": 2e-05, "loss": 3.2556, "num_input_tokens_seen": 538968064, "step": 257 }, { "epoch": 0.05, "grad_norm": 3.515625, "learning_rate": 2e-05, "loss": 3.2735, "num_input_tokens_seen": 541065216, "step": 258 }, { "epoch": 0.05, "grad_norm": 2.890625, "learning_rate": 2e-05, "loss": 3.2571, "num_input_tokens_seen": 543162368, "step": 259 }, { "epoch": 0.05, "grad_norm": 3.265625, "learning_rate": 2e-05, "loss": 3.2361, "num_input_tokens_seen": 545259520, "step": 260 }, { "epoch": 0.05, "grad_norm": 3.359375, "learning_rate": 2e-05, "loss": 3.2487, "num_input_tokens_seen": 547356672, "step": 261 }, { "epoch": 0.05, "grad_norm": 4.28125, "learning_rate": 2e-05, "loss": 3.1921, "num_input_tokens_seen": 549453824, "step": 262 }, { "epoch": 0.06, "grad_norm": 3.078125, "learning_rate": 2e-05, "loss": 3.1962, "num_input_tokens_seen": 551550976, "step": 263 }, { "epoch": 0.06, "grad_norm": 4.59375, "learning_rate": 2e-05, "loss": 3.2288, "num_input_tokens_seen": 553648128, "step": 264 }, { "epoch": 0.06, "grad_norm": 3.765625, "learning_rate": 2e-05, "loss": 3.1997, "num_input_tokens_seen": 555745280, "step": 265 }, { "epoch": 0.06, "grad_norm": 3.9375, "learning_rate": 2e-05, "loss": 3.2058, "num_input_tokens_seen": 557842432, "step": 266 }, { "epoch": 0.06, "grad_norm": 4.5625, "learning_rate": 2e-05, "loss": 3.1865, "num_input_tokens_seen": 559939584, "step": 267 }, { "epoch": 0.06, "grad_norm": 4.3125, "learning_rate": 2e-05, "loss": 3.1915, "num_input_tokens_seen": 562036736, "step": 268 }, { "epoch": 0.06, "grad_norm": 4.375, "learning_rate": 2e-05, "loss": 3.1936, "num_input_tokens_seen": 564133888, "step": 269 }, { "epoch": 0.06, "grad_norm": 4.375, "learning_rate": 2e-05, "loss": 3.2147, "num_input_tokens_seen": 566231040, "step": 270 }, { "epoch": 0.06, "grad_norm": 3.484375, "learning_rate": 2e-05, "loss": 3.1502, "num_input_tokens_seen": 568328192, "step": 271 }, { "epoch": 0.06, "grad_norm": 6.125, "learning_rate": 2e-05, "loss": 3.212, "num_input_tokens_seen": 570425344, "step": 272 }, { "epoch": 0.06, "grad_norm": 5.9375, "learning_rate": 2e-05, "loss": 3.1481, "num_input_tokens_seen": 572522496, "step": 273 }, { "epoch": 0.06, "grad_norm": 3.078125, "learning_rate": 2e-05, "loss": 3.155, "num_input_tokens_seen": 574619648, "step": 274 }, { "epoch": 0.06, "grad_norm": 4.34375, "learning_rate": 2e-05, "loss": 3.1369, "num_input_tokens_seen": 576716800, "step": 275 }, { "epoch": 0.06, "grad_norm": 4.1875, "learning_rate": 2e-05, "loss": 3.1274, "num_input_tokens_seen": 578813952, "step": 276 }, { "epoch": 0.06, "grad_norm": 3.28125, "learning_rate": 2e-05, "loss": 3.1304, "num_input_tokens_seen": 580911104, "step": 277 }, { "epoch": 0.06, "grad_norm": 5.34375, "learning_rate": 2e-05, "loss": 3.1417, "num_input_tokens_seen": 583008256, "step": 278 }, { "epoch": 0.06, "grad_norm": 5.21875, "learning_rate": 2e-05, "loss": 3.1535, "num_input_tokens_seen": 585105408, "step": 279 }, { "epoch": 0.06, "grad_norm": 2.703125, "learning_rate": 2e-05, "loss": 3.0986, "num_input_tokens_seen": 587202560, "step": 280 }, { "epoch": 0.06, "grad_norm": 6.03125, "learning_rate": 2e-05, "loss": 3.1155, "num_input_tokens_seen": 589299712, "step": 281 }, { "epoch": 0.06, "grad_norm": 5.5625, "learning_rate": 2e-05, "loss": 3.0778, "num_input_tokens_seen": 591396864, "step": 282 }, { "epoch": 0.06, "grad_norm": 3.5, "learning_rate": 2e-05, "loss": 3.0915, "num_input_tokens_seen": 593494016, "step": 283 }, { "epoch": 0.06, "grad_norm": 3.46875, "learning_rate": 2e-05, "loss": 3.1342, "num_input_tokens_seen": 595591168, "step": 284 }, { "epoch": 0.06, "grad_norm": 3.65625, "learning_rate": 2e-05, "loss": 3.0678, "num_input_tokens_seen": 597688320, "step": 285 }, { "epoch": 0.06, "grad_norm": 4.15625, "learning_rate": 2e-05, "loss": 3.1034, "num_input_tokens_seen": 599785472, "step": 286 }, { "epoch": 0.06, "grad_norm": 2.90625, "learning_rate": 2e-05, "loss": 3.097, "num_input_tokens_seen": 601882624, "step": 287 }, { "epoch": 0.06, "grad_norm": 3.34375, "learning_rate": 2e-05, "loss": 3.1068, "num_input_tokens_seen": 603979776, "step": 288 }, { "epoch": 0.06, "grad_norm": 2.6875, "learning_rate": 2e-05, "loss": 3.0652, "num_input_tokens_seen": 606076928, "step": 289 }, { "epoch": 0.06, "grad_norm": 3.0, "learning_rate": 2e-05, "loss": 3.0617, "num_input_tokens_seen": 608174080, "step": 290 }, { "epoch": 0.06, "grad_norm": 2.515625, "learning_rate": 2e-05, "loss": 3.0646, "num_input_tokens_seen": 610271232, "step": 291 }, { "epoch": 0.06, "grad_norm": 3.546875, "learning_rate": 2e-05, "loss": 3.0943, "num_input_tokens_seen": 612368384, "step": 292 }, { "epoch": 0.06, "grad_norm": 2.375, "learning_rate": 2e-05, "loss": 3.0165, "num_input_tokens_seen": 614465536, "step": 293 }, { "epoch": 0.06, "grad_norm": 3.515625, "learning_rate": 2e-05, "loss": 3.0296, "num_input_tokens_seen": 616562688, "step": 294 }, { "epoch": 0.06, "grad_norm": 3.046875, "learning_rate": 2e-05, "loss": 3.0338, "num_input_tokens_seen": 618659840, "step": 295 }, { "epoch": 0.06, "grad_norm": 3.375, "learning_rate": 2e-05, "loss": 3.044, "num_input_tokens_seen": 620756992, "step": 296 }, { "epoch": 0.06, "grad_norm": 2.734375, "learning_rate": 2e-05, "loss": 3.0633, "num_input_tokens_seen": 622854144, "step": 297 }, { "epoch": 0.06, "grad_norm": 3.609375, "learning_rate": 2e-05, "loss": 3.0242, "num_input_tokens_seen": 624951296, "step": 298 }, { "epoch": 0.06, "grad_norm": 2.890625, "learning_rate": 2e-05, "loss": 3.0351, "num_input_tokens_seen": 627048448, "step": 299 }, { "epoch": 0.06, "grad_norm": 3.484375, "learning_rate": 2e-05, "loss": 2.9913, "num_input_tokens_seen": 629145600, "step": 300 }, { "epoch": 0.06, "grad_norm": 3.46875, "learning_rate": 2e-05, "loss": 3.0039, "num_input_tokens_seen": 631242752, "step": 301 }, { "epoch": 0.06, "grad_norm": 2.75, "learning_rate": 2e-05, "loss": 3.0468, "num_input_tokens_seen": 633339904, "step": 302 }, { "epoch": 0.06, "grad_norm": 3.640625, "learning_rate": 2e-05, "loss": 3.032, "num_input_tokens_seen": 635437056, "step": 303 }, { "epoch": 0.06, "grad_norm": 3.25, "learning_rate": 2e-05, "loss": 3.0188, "num_input_tokens_seen": 637534208, "step": 304 }, { "epoch": 0.06, "grad_norm": 2.96875, "learning_rate": 2e-05, "loss": 3.0026, "num_input_tokens_seen": 639631360, "step": 305 }, { "epoch": 0.06, "grad_norm": 4.03125, "learning_rate": 2e-05, "loss": 2.9779, "num_input_tokens_seen": 641728512, "step": 306 }, { "epoch": 0.06, "grad_norm": 3.765625, "learning_rate": 2e-05, "loss": 3.0523, "num_input_tokens_seen": 643825664, "step": 307 }, { "epoch": 0.06, "grad_norm": 3.015625, "learning_rate": 2e-05, "loss": 3.0251, "num_input_tokens_seen": 645922816, "step": 308 }, { "epoch": 0.06, "grad_norm": 3.203125, "learning_rate": 2e-05, "loss": 3.0103, "num_input_tokens_seen": 648019968, "step": 309 }, { "epoch": 0.07, "grad_norm": 2.625, "learning_rate": 2e-05, "loss": 3.0095, "num_input_tokens_seen": 650117120, "step": 310 }, { "epoch": 0.07, "grad_norm": 3.125, "learning_rate": 2e-05, "loss": 2.9667, "num_input_tokens_seen": 652214272, "step": 311 }, { "epoch": 0.07, "grad_norm": 2.46875, "learning_rate": 2e-05, "loss": 2.9823, "num_input_tokens_seen": 654311424, "step": 312 }, { "epoch": 0.07, "grad_norm": 2.90625, "learning_rate": 2e-05, "loss": 2.9807, "num_input_tokens_seen": 656408576, "step": 313 }, { "epoch": 0.07, "grad_norm": 3.609375, "learning_rate": 2e-05, "loss": 2.9975, "num_input_tokens_seen": 658505728, "step": 314 }, { "epoch": 0.07, "grad_norm": 2.890625, "learning_rate": 2e-05, "loss": 2.9686, "num_input_tokens_seen": 660602880, "step": 315 }, { "epoch": 0.07, "grad_norm": 3.109375, "learning_rate": 2e-05, "loss": 2.9864, "num_input_tokens_seen": 662700032, "step": 316 }, { "epoch": 0.07, "grad_norm": 3.0625, "learning_rate": 2e-05, "loss": 2.9523, "num_input_tokens_seen": 664797184, "step": 317 }, { "epoch": 0.07, "grad_norm": 2.828125, "learning_rate": 2e-05, "loss": 2.9933, "num_input_tokens_seen": 666894336, "step": 318 }, { "epoch": 0.07, "grad_norm": 2.90625, "learning_rate": 2e-05, "loss": 2.9572, "num_input_tokens_seen": 668991488, "step": 319 }, { "epoch": 0.07, "grad_norm": 2.796875, "learning_rate": 2e-05, "loss": 2.9504, "num_input_tokens_seen": 671088640, "step": 320 }, { "epoch": 0.07, "grad_norm": 2.671875, "learning_rate": 2e-05, "loss": 2.9586, "num_input_tokens_seen": 673185792, "step": 321 }, { "epoch": 0.07, "grad_norm": 2.484375, "learning_rate": 2e-05, "loss": 2.9605, "num_input_tokens_seen": 675282944, "step": 322 }, { "epoch": 0.07, "grad_norm": 2.5, "learning_rate": 2e-05, "loss": 2.9396, "num_input_tokens_seen": 677380096, "step": 323 }, { "epoch": 0.07, "grad_norm": 3.140625, "learning_rate": 2e-05, "loss": 2.9407, "num_input_tokens_seen": 679477248, "step": 324 }, { "epoch": 0.07, "grad_norm": 3.15625, "learning_rate": 2e-05, "loss": 2.9254, "num_input_tokens_seen": 681574400, "step": 325 }, { "epoch": 0.07, "grad_norm": 3.015625, "learning_rate": 2e-05, "loss": 2.9158, "num_input_tokens_seen": 683671552, "step": 326 }, { "epoch": 0.07, "grad_norm": 3.203125, "learning_rate": 2e-05, "loss": 2.9317, "num_input_tokens_seen": 685768704, "step": 327 }, { "epoch": 0.07, "grad_norm": 3.0625, "learning_rate": 2e-05, "loss": 2.9183, "num_input_tokens_seen": 687865856, "step": 328 }, { "epoch": 0.07, "grad_norm": 2.640625, "learning_rate": 2e-05, "loss": 2.9378, "num_input_tokens_seen": 689963008, "step": 329 }, { "epoch": 0.07, "grad_norm": 2.65625, "learning_rate": 2e-05, "loss": 2.9237, "num_input_tokens_seen": 692060160, "step": 330 }, { "epoch": 0.07, "grad_norm": 2.578125, "learning_rate": 2e-05, "loss": 2.9259, "num_input_tokens_seen": 694157312, "step": 331 }, { "epoch": 0.07, "grad_norm": 2.34375, "learning_rate": 2e-05, "loss": 2.9404, "num_input_tokens_seen": 696254464, "step": 332 }, { "epoch": 0.07, "grad_norm": 2.03125, "learning_rate": 2e-05, "loss": 2.9242, "num_input_tokens_seen": 698351616, "step": 333 }, { "epoch": 0.07, "grad_norm": 3.15625, "learning_rate": 2e-05, "loss": 2.8951, "num_input_tokens_seen": 700448768, "step": 334 }, { "epoch": 0.07, "grad_norm": 2.078125, "learning_rate": 2e-05, "loss": 2.8961, "num_input_tokens_seen": 702545920, "step": 335 }, { "epoch": 0.07, "grad_norm": 3.609375, "learning_rate": 2e-05, "loss": 2.8834, "num_input_tokens_seen": 704643072, "step": 336 }, { "epoch": 0.07, "grad_norm": 3.046875, "learning_rate": 2e-05, "loss": 2.8938, "num_input_tokens_seen": 706740224, "step": 337 }, { "epoch": 0.07, "grad_norm": 3.328125, "learning_rate": 2e-05, "loss": 2.9106, "num_input_tokens_seen": 708837376, "step": 338 }, { "epoch": 0.07, "grad_norm": 3.09375, "learning_rate": 2e-05, "loss": 2.8693, "num_input_tokens_seen": 710934528, "step": 339 }, { "epoch": 0.07, "grad_norm": 2.734375, "learning_rate": 2e-05, "loss": 2.927, "num_input_tokens_seen": 713031680, "step": 340 }, { "epoch": 0.07, "grad_norm": 3.296875, "learning_rate": 2e-05, "loss": 2.8972, "num_input_tokens_seen": 715128832, "step": 341 }, { "epoch": 0.07, "grad_norm": 2.234375, "learning_rate": 2e-05, "loss": 2.8604, "num_input_tokens_seen": 717225984, "step": 342 }, { "epoch": 0.07, "grad_norm": 4.28125, "learning_rate": 2e-05, "loss": 2.8847, "num_input_tokens_seen": 719323136, "step": 343 }, { "epoch": 0.07, "grad_norm": 3.3125, "learning_rate": 2e-05, "loss": 2.8893, "num_input_tokens_seen": 721420288, "step": 344 }, { "epoch": 0.07, "grad_norm": 3.8125, "learning_rate": 2e-05, "loss": 2.8779, "num_input_tokens_seen": 723517440, "step": 345 }, { "epoch": 0.07, "grad_norm": 4.4375, "learning_rate": 2e-05, "loss": 2.8721, "num_input_tokens_seen": 725614592, "step": 346 }, { "epoch": 0.07, "grad_norm": 2.265625, "learning_rate": 2e-05, "loss": 2.8788, "num_input_tokens_seen": 727711744, "step": 347 }, { "epoch": 0.07, "grad_norm": 4.59375, "learning_rate": 2e-05, "loss": 2.8812, "num_input_tokens_seen": 729808896, "step": 348 }, { "epoch": 0.07, "grad_norm": 5.59375, "learning_rate": 2e-05, "loss": 2.8586, "num_input_tokens_seen": 731906048, "step": 349 }, { "epoch": 0.07, "grad_norm": 2.84375, "learning_rate": 2e-05, "loss": 2.8703, "num_input_tokens_seen": 734003200, "step": 350 }, { "epoch": 0.07, "grad_norm": 4.09375, "learning_rate": 2e-05, "loss": 2.8958, "num_input_tokens_seen": 736100352, "step": 351 }, { "epoch": 0.07, "grad_norm": 4.125, "learning_rate": 2e-05, "loss": 2.8572, "num_input_tokens_seen": 738197504, "step": 352 }, { "epoch": 0.07, "grad_norm": 2.765625, "learning_rate": 2e-05, "loss": 2.8491, "num_input_tokens_seen": 740294656, "step": 353 }, { "epoch": 0.07, "grad_norm": 3.078125, "learning_rate": 2e-05, "loss": 2.8936, "num_input_tokens_seen": 742391808, "step": 354 }, { "epoch": 0.07, "grad_norm": 4.1875, "learning_rate": 2e-05, "loss": 2.8708, "num_input_tokens_seen": 744488960, "step": 355 }, { "epoch": 0.07, "grad_norm": 3.671875, "learning_rate": 2e-05, "loss": 2.8501, "num_input_tokens_seen": 746586112, "step": 356 }, { "epoch": 0.07, "grad_norm": 2.53125, "learning_rate": 2e-05, "loss": 2.8226, "num_input_tokens_seen": 748683264, "step": 357 }, { "epoch": 0.08, "grad_norm": 3.28125, "learning_rate": 2e-05, "loss": 2.8612, "num_input_tokens_seen": 750780416, "step": 358 }, { "epoch": 0.08, "grad_norm": 3.921875, "learning_rate": 2e-05, "loss": 2.8409, "num_input_tokens_seen": 752877568, "step": 359 }, { "epoch": 0.08, "grad_norm": 2.703125, "learning_rate": 2e-05, "loss": 2.8417, "num_input_tokens_seen": 754974720, "step": 360 }, { "epoch": 0.08, "grad_norm": 2.03125, "learning_rate": 2e-05, "loss": 2.8713, "num_input_tokens_seen": 757071872, "step": 361 }, { "epoch": 0.08, "grad_norm": 2.875, "learning_rate": 2e-05, "loss": 2.8447, "num_input_tokens_seen": 759169024, "step": 362 }, { "epoch": 0.08, "grad_norm": 2.765625, "learning_rate": 2e-05, "loss": 2.8592, "num_input_tokens_seen": 761266176, "step": 363 }, { "epoch": 0.08, "grad_norm": 3.328125, "learning_rate": 2e-05, "loss": 2.8172, "num_input_tokens_seen": 763363328, "step": 364 }, { "epoch": 0.08, "grad_norm": 3.78125, "learning_rate": 2e-05, "loss": 2.8316, "num_input_tokens_seen": 765460480, "step": 365 }, { "epoch": 0.08, "grad_norm": 2.109375, "learning_rate": 2e-05, "loss": 2.8401, "num_input_tokens_seen": 767557632, "step": 366 }, { "epoch": 0.08, "grad_norm": 2.71875, "learning_rate": 2e-05, "loss": 2.8352, "num_input_tokens_seen": 769654784, "step": 367 }, { "epoch": 0.08, "grad_norm": 2.5625, "learning_rate": 2e-05, "loss": 2.8366, "num_input_tokens_seen": 771751936, "step": 368 }, { "epoch": 0.08, "grad_norm": 2.453125, "learning_rate": 2e-05, "loss": 2.8411, "num_input_tokens_seen": 773849088, "step": 369 }, { "epoch": 0.08, "grad_norm": 2.421875, "learning_rate": 2e-05, "loss": 2.8475, "num_input_tokens_seen": 775946240, "step": 370 }, { "epoch": 0.08, "grad_norm": 2.015625, "learning_rate": 2e-05, "loss": 2.8005, "num_input_tokens_seen": 778043392, "step": 371 }, { "epoch": 0.08, "grad_norm": 2.046875, "learning_rate": 2e-05, "loss": 2.8355, "num_input_tokens_seen": 780140544, "step": 372 }, { "epoch": 0.08, "grad_norm": 1.6875, "learning_rate": 2e-05, "loss": 2.8547, "num_input_tokens_seen": 782237696, "step": 373 }, { "epoch": 0.08, "grad_norm": 2.515625, "learning_rate": 2e-05, "loss": 2.8163, "num_input_tokens_seen": 784334848, "step": 374 }, { "epoch": 0.08, "grad_norm": 2.171875, "learning_rate": 2e-05, "loss": 2.7912, "num_input_tokens_seen": 786432000, "step": 375 }, { "epoch": 0.08, "grad_norm": 2.546875, "learning_rate": 2e-05, "loss": 2.8027, "num_input_tokens_seen": 788529152, "step": 376 }, { "epoch": 0.08, "grad_norm": 2.140625, "learning_rate": 2e-05, "loss": 2.8182, "num_input_tokens_seen": 790626304, "step": 377 }, { "epoch": 0.08, "grad_norm": 2.71875, "learning_rate": 2e-05, "loss": 2.7678, "num_input_tokens_seen": 792723456, "step": 378 }, { "epoch": 0.08, "grad_norm": 2.9375, "learning_rate": 2e-05, "loss": 2.8225, "num_input_tokens_seen": 794820608, "step": 379 }, { "epoch": 0.08, "grad_norm": 2.6875, "learning_rate": 2e-05, "loss": 2.8011, "num_input_tokens_seen": 796917760, "step": 380 }, { "epoch": 0.08, "grad_norm": 2.359375, "learning_rate": 2e-05, "loss": 2.7975, "num_input_tokens_seen": 799014912, "step": 381 }, { "epoch": 0.08, "grad_norm": 3.203125, "learning_rate": 2e-05, "loss": 2.7733, "num_input_tokens_seen": 801112064, "step": 382 }, { "epoch": 0.08, "grad_norm": 2.84375, "learning_rate": 2e-05, "loss": 2.8112, "num_input_tokens_seen": 803209216, "step": 383 }, { "epoch": 0.08, "grad_norm": 2.453125, "learning_rate": 2e-05, "loss": 2.7977, "num_input_tokens_seen": 805306368, "step": 384 }, { "epoch": 0.08, "grad_norm": 2.625, "learning_rate": 2e-05, "loss": 2.8178, "num_input_tokens_seen": 807403520, "step": 385 }, { "epoch": 0.08, "grad_norm": 2.1875, "learning_rate": 2e-05, "loss": 2.7852, "num_input_tokens_seen": 809500672, "step": 386 }, { "epoch": 0.08, "grad_norm": 2.078125, "learning_rate": 2e-05, "loss": 2.8056, "num_input_tokens_seen": 811597824, "step": 387 }, { "epoch": 0.08, "grad_norm": 2.4375, "learning_rate": 2e-05, "loss": 2.7934, "num_input_tokens_seen": 813694976, "step": 388 }, { "epoch": 0.08, "grad_norm": 2.390625, "learning_rate": 2e-05, "loss": 2.806, "num_input_tokens_seen": 815792128, "step": 389 }, { "epoch": 0.08, "grad_norm": 2.109375, "learning_rate": 2e-05, "loss": 2.7833, "num_input_tokens_seen": 817889280, "step": 390 }, { "epoch": 0.08, "grad_norm": 2.375, "learning_rate": 2e-05, "loss": 2.7811, "num_input_tokens_seen": 819986432, "step": 391 }, { "epoch": 0.08, "grad_norm": 2.234375, "learning_rate": 2e-05, "loss": 2.7697, "num_input_tokens_seen": 822083584, "step": 392 }, { "epoch": 0.08, "grad_norm": 3.078125, "learning_rate": 2e-05, "loss": 2.7823, "num_input_tokens_seen": 824180736, "step": 393 }, { "epoch": 0.08, "grad_norm": 2.640625, "learning_rate": 2e-05, "loss": 2.8006, "num_input_tokens_seen": 826277888, "step": 394 }, { "epoch": 0.08, "grad_norm": 2.5625, "learning_rate": 2e-05, "loss": 2.7893, "num_input_tokens_seen": 828375040, "step": 395 }, { "epoch": 0.08, "grad_norm": 3.078125, "learning_rate": 2e-05, "loss": 2.7442, "num_input_tokens_seen": 830472192, "step": 396 }, { "epoch": 0.08, "grad_norm": 2.4375, "learning_rate": 2e-05, "loss": 2.7833, "num_input_tokens_seen": 832569344, "step": 397 }, { "epoch": 0.08, "grad_norm": 2.734375, "learning_rate": 2e-05, "loss": 2.8004, "num_input_tokens_seen": 834666496, "step": 398 }, { "epoch": 0.08, "eval_loss": 2.7730445861816406, "eval_runtime": 1661.813, "eval_samples_per_second": 2.372, "eval_steps_per_second": 0.593, "num_input_tokens_seen": 834666496, "step": 398 }, { "epoch": 0.08, "grad_norm": 3.78125, "learning_rate": 2e-05, "loss": 2.776, "num_input_tokens_seen": 836763648, "step": 399 }, { "epoch": 0.08, "grad_norm": 2.375, "learning_rate": 2e-05, "loss": 2.8008, "num_input_tokens_seen": 838860800, "step": 400 }, { "epoch": 0.08, "grad_norm": 2.25, "learning_rate": 2e-05, "loss": 2.7404, "num_input_tokens_seen": 840957952, "step": 401 }, { "epoch": 0.08, "grad_norm": 2.84375, "learning_rate": 2e-05, "loss": 2.737, "num_input_tokens_seen": 843055104, "step": 402 }, { "epoch": 0.08, "grad_norm": 1.8203125, "learning_rate": 2e-05, "loss": 2.7816, "num_input_tokens_seen": 845152256, "step": 403 }, { "epoch": 0.08, "grad_norm": 2.09375, "learning_rate": 2e-05, "loss": 2.7375, "num_input_tokens_seen": 847249408, "step": 404 }, { "epoch": 0.08, "grad_norm": 2.15625, "learning_rate": 2e-05, "loss": 2.7568, "num_input_tokens_seen": 849346560, "step": 405 }, { "epoch": 0.09, "grad_norm": 2.28125, "learning_rate": 2e-05, "loss": 2.7541, "num_input_tokens_seen": 851443712, "step": 406 }, { "epoch": 0.09, "grad_norm": 2.5625, "learning_rate": 2e-05, "loss": 2.7865, "num_input_tokens_seen": 853540864, "step": 407 }, { "epoch": 0.09, "grad_norm": 1.6015625, "learning_rate": 2e-05, "loss": 2.7722, "num_input_tokens_seen": 855638016, "step": 408 }, { "epoch": 0.09, "grad_norm": 2.59375, "learning_rate": 2e-05, "loss": 2.7532, "num_input_tokens_seen": 857735168, "step": 409 }, { "epoch": 0.09, "grad_norm": 2.5625, "learning_rate": 2e-05, "loss": 2.7556, "num_input_tokens_seen": 859832320, "step": 410 }, { "epoch": 0.09, "grad_norm": 1.765625, "learning_rate": 2e-05, "loss": 2.735, "num_input_tokens_seen": 861929472, "step": 411 }, { "epoch": 0.09, "grad_norm": 2.75, "learning_rate": 2e-05, "loss": 2.7688, "num_input_tokens_seen": 864026624, "step": 412 }, { "epoch": 0.09, "grad_norm": 2.5625, "learning_rate": 2e-05, "loss": 2.7644, "num_input_tokens_seen": 866123776, "step": 413 }, { "epoch": 0.09, "grad_norm": 1.9453125, "learning_rate": 2e-05, "loss": 2.7741, "num_input_tokens_seen": 868220928, "step": 414 }, { "epoch": 0.09, "grad_norm": 2.859375, "learning_rate": 2e-05, "loss": 2.7287, "num_input_tokens_seen": 870318080, "step": 415 }, { "epoch": 0.09, "grad_norm": 3.1875, "learning_rate": 2e-05, "loss": 2.757, "num_input_tokens_seen": 872415232, "step": 416 }, { "epoch": 0.09, "grad_norm": 2.078125, "learning_rate": 2e-05, "loss": 2.7484, "num_input_tokens_seen": 874512384, "step": 417 }, { "epoch": 0.09, "grad_norm": 2.5625, "learning_rate": 2e-05, "loss": 2.7537, "num_input_tokens_seen": 876609536, "step": 418 }, { "epoch": 0.09, "grad_norm": 3.265625, "learning_rate": 2e-05, "loss": 2.7671, "num_input_tokens_seen": 878706688, "step": 419 }, { "epoch": 0.09, "grad_norm": 2.53125, "learning_rate": 2e-05, "loss": 2.7466, "num_input_tokens_seen": 880803840, "step": 420 }, { "epoch": 0.09, "grad_norm": 4.59375, "learning_rate": 2e-05, "loss": 2.7435, "num_input_tokens_seen": 882900992, "step": 421 }, { "epoch": 0.09, "grad_norm": 3.078125, "learning_rate": 2e-05, "loss": 2.7226, "num_input_tokens_seen": 884998144, "step": 422 }, { "epoch": 0.09, "grad_norm": 5.0625, "learning_rate": 2e-05, "loss": 2.7318, "num_input_tokens_seen": 887095296, "step": 423 }, { "epoch": 0.09, "grad_norm": 3.546875, "learning_rate": 2e-05, "loss": 2.7617, "num_input_tokens_seen": 889192448, "step": 424 }, { "epoch": 0.09, "grad_norm": 11.0625, "learning_rate": 2e-05, "loss": 2.7973, "num_input_tokens_seen": 891289600, "step": 425 }, { "epoch": 0.09, "grad_norm": 10.4375, "learning_rate": 2e-05, "loss": 2.7629, "num_input_tokens_seen": 893386752, "step": 426 }, { "epoch": 0.09, "grad_norm": 3.203125, "learning_rate": 2e-05, "loss": 2.7409, "num_input_tokens_seen": 895483904, "step": 427 }, { "epoch": 0.09, "grad_norm": 7.8125, "learning_rate": 2e-05, "loss": 2.792, "num_input_tokens_seen": 897581056, "step": 428 }, { "epoch": 0.09, "grad_norm": 7.75, "learning_rate": 2e-05, "loss": 2.769, "num_input_tokens_seen": 899678208, "step": 429 }, { "epoch": 0.09, "grad_norm": 4.03125, "learning_rate": 2e-05, "loss": 2.7214, "num_input_tokens_seen": 901775360, "step": 430 }, { "epoch": 0.09, "grad_norm": 10.5625, "learning_rate": 2e-05, "loss": 2.7974, "num_input_tokens_seen": 903872512, "step": 431 }, { "epoch": 0.09, "grad_norm": 11.625, "learning_rate": 2e-05, "loss": 2.8303, "num_input_tokens_seen": 905969664, "step": 432 }, { "epoch": 0.09, "grad_norm": 7.78125, "learning_rate": 2e-05, "loss": 2.8003, "num_input_tokens_seen": 908066816, "step": 433 }, { "epoch": 0.09, "grad_norm": 3.859375, "learning_rate": 2e-05, "loss": 2.7659, "num_input_tokens_seen": 910163968, "step": 434 }, { "epoch": 0.09, "grad_norm": 5.4375, "learning_rate": 2e-05, "loss": 2.7698, "num_input_tokens_seen": 912261120, "step": 435 }, { "epoch": 0.09, "grad_norm": 4.71875, "learning_rate": 2e-05, "loss": 2.7832, "num_input_tokens_seen": 914358272, "step": 436 }, { "epoch": 0.09, "grad_norm": 2.703125, "learning_rate": 2e-05, "loss": 2.7357, "num_input_tokens_seen": 916455424, "step": 437 }, { "epoch": 0.09, "grad_norm": 4.21875, "learning_rate": 2e-05, "loss": 2.7697, "num_input_tokens_seen": 918552576, "step": 438 }, { "epoch": 0.09, "grad_norm": 3.703125, "learning_rate": 2e-05, "loss": 2.7555, "num_input_tokens_seen": 920649728, "step": 439 }, { "epoch": 0.09, "grad_norm": 2.4375, "learning_rate": 2e-05, "loss": 2.7363, "num_input_tokens_seen": 922746880, "step": 440 }, { "epoch": 0.09, "grad_norm": 3.8125, "learning_rate": 2e-05, "loss": 2.7422, "num_input_tokens_seen": 924844032, "step": 441 }, { "epoch": 0.09, "grad_norm": 3.515625, "learning_rate": 2e-05, "loss": 2.737, "num_input_tokens_seen": 926941184, "step": 442 }, { "epoch": 0.09, "grad_norm": 2.140625, "learning_rate": 2e-05, "loss": 2.7024, "num_input_tokens_seen": 929038336, "step": 443 }, { "epoch": 0.09, "grad_norm": 3.78125, "learning_rate": 2e-05, "loss": 2.7377, "num_input_tokens_seen": 931135488, "step": 444 }, { "epoch": 0.09, "grad_norm": 4.0625, "learning_rate": 2e-05, "loss": 2.7484, "num_input_tokens_seen": 933232640, "step": 445 }, { "epoch": 0.09, "grad_norm": 2.5625, "learning_rate": 2e-05, "loss": 2.7241, "num_input_tokens_seen": 935329792, "step": 446 }, { "epoch": 0.09, "grad_norm": 2.796875, "learning_rate": 2e-05, "loss": 2.6985, "num_input_tokens_seen": 937426944, "step": 447 }, { "epoch": 0.09, "grad_norm": 3.53125, "learning_rate": 2e-05, "loss": 2.723, "num_input_tokens_seen": 939524096, "step": 448 }, { "epoch": 0.09, "grad_norm": 2.75, "learning_rate": 2e-05, "loss": 2.7293, "num_input_tokens_seen": 941621248, "step": 449 }, { "epoch": 0.09, "grad_norm": 1.8828125, "learning_rate": 2e-05, "loss": 2.7126, "num_input_tokens_seen": 943718400, "step": 450 }, { "epoch": 0.09, "grad_norm": 2.578125, "learning_rate": 2e-05, "loss": 2.7239, "num_input_tokens_seen": 945815552, "step": 451 }, { "epoch": 0.09, "grad_norm": 2.140625, "learning_rate": 2e-05, "loss": 2.7052, "num_input_tokens_seen": 947912704, "step": 452 }, { "epoch": 0.1, "grad_norm": 1.8359375, "learning_rate": 2e-05, "loss": 2.7283, "num_input_tokens_seen": 950009856, "step": 453 }, { "epoch": 0.1, "grad_norm": 2.375, "learning_rate": 2e-05, "loss": 2.7342, "num_input_tokens_seen": 952107008, "step": 454 }, { "epoch": 0.1, "grad_norm": 1.640625, "learning_rate": 2e-05, "loss": 2.7049, "num_input_tokens_seen": 954204160, "step": 455 }, { "epoch": 0.1, "grad_norm": 2.25, "learning_rate": 2e-05, "loss": 2.7269, "num_input_tokens_seen": 956301312, "step": 456 }, { "epoch": 0.1, "grad_norm": 2.390625, "learning_rate": 2e-05, "loss": 2.7227, "num_input_tokens_seen": 958398464, "step": 457 }, { "epoch": 0.1, "grad_norm": 1.8046875, "learning_rate": 2e-05, "loss": 2.7137, "num_input_tokens_seen": 960495616, "step": 458 }, { "epoch": 0.1, "grad_norm": 2.359375, "learning_rate": 2e-05, "loss": 2.6715, "num_input_tokens_seen": 962592768, "step": 459 }, { "epoch": 0.1, "grad_norm": 2.5, "learning_rate": 2e-05, "loss": 2.7087, "num_input_tokens_seen": 964689920, "step": 460 }, { "epoch": 0.1, "grad_norm": 1.765625, "learning_rate": 2e-05, "loss": 2.7266, "num_input_tokens_seen": 966787072, "step": 461 }, { "epoch": 0.1, "grad_norm": 1.71875, "learning_rate": 2e-05, "loss": 2.6943, "num_input_tokens_seen": 968884224, "step": 462 }, { "epoch": 0.1, "grad_norm": 2.15625, "learning_rate": 2e-05, "loss": 2.6758, "num_input_tokens_seen": 970981376, "step": 463 }, { "epoch": 0.1, "grad_norm": 2.15625, "learning_rate": 2e-05, "loss": 2.6947, "num_input_tokens_seen": 973078528, "step": 464 }, { "epoch": 0.1, "grad_norm": 1.765625, "learning_rate": 2e-05, "loss": 2.7042, "num_input_tokens_seen": 975175680, "step": 465 }, { "epoch": 0.1, "grad_norm": 1.890625, "learning_rate": 2e-05, "loss": 2.7282, "num_input_tokens_seen": 977272832, "step": 466 }, { "epoch": 0.1, "grad_norm": 1.53125, "learning_rate": 2e-05, "loss": 2.6719, "num_input_tokens_seen": 979369984, "step": 467 }, { "epoch": 0.1, "grad_norm": 1.828125, "learning_rate": 2e-05, "loss": 2.7167, "num_input_tokens_seen": 981467136, "step": 468 }, { "epoch": 0.1, "grad_norm": 1.640625, "learning_rate": 2e-05, "loss": 2.6965, "num_input_tokens_seen": 983564288, "step": 469 }, { "epoch": 0.1, "grad_norm": 1.53125, "learning_rate": 2e-05, "loss": 2.7068, "num_input_tokens_seen": 985661440, "step": 470 }, { "epoch": 0.1, "grad_norm": 1.7578125, "learning_rate": 2e-05, "loss": 2.6865, "num_input_tokens_seen": 987758592, "step": 471 }, { "epoch": 0.1, "grad_norm": 1.5859375, "learning_rate": 2e-05, "loss": 2.7114, "num_input_tokens_seen": 989855744, "step": 472 }, { "epoch": 0.1, "grad_norm": 1.40625, "learning_rate": 2e-05, "loss": 2.6863, "num_input_tokens_seen": 991952896, "step": 473 }, { "epoch": 0.1, "grad_norm": 1.8203125, "learning_rate": 2e-05, "loss": 2.6849, "num_input_tokens_seen": 994050048, "step": 474 }, { "epoch": 0.1, "grad_norm": 1.5546875, "learning_rate": 2e-05, "loss": 2.6971, "num_input_tokens_seen": 996147200, "step": 475 }, { "epoch": 0.1, "grad_norm": 1.625, "learning_rate": 2e-05, "loss": 2.6676, "num_input_tokens_seen": 998244352, "step": 476 }, { "epoch": 0.1, "grad_norm": 1.9765625, "learning_rate": 2e-05, "loss": 2.6912, "num_input_tokens_seen": 1000341504, "step": 477 }, { "epoch": 0.1, "grad_norm": 1.75, "learning_rate": 2e-05, "loss": 2.7104, "num_input_tokens_seen": 1002438656, "step": 478 }, { "epoch": 0.1, "grad_norm": 1.7265625, "learning_rate": 2e-05, "loss": 2.6734, "num_input_tokens_seen": 1004535808, "step": 479 }, { "epoch": 0.1, "grad_norm": 1.6796875, "learning_rate": 2e-05, "loss": 2.7041, "num_input_tokens_seen": 1006632960, "step": 480 }, { "epoch": 0.1, "grad_norm": 1.6796875, "learning_rate": 2e-05, "loss": 2.6588, "num_input_tokens_seen": 1008730112, "step": 481 }, { "epoch": 0.1, "grad_norm": 1.71875, "learning_rate": 2e-05, "loss": 2.6661, "num_input_tokens_seen": 1010827264, "step": 482 }, { "epoch": 0.1, "grad_norm": 1.7578125, "learning_rate": 2e-05, "loss": 2.6809, "num_input_tokens_seen": 1012924416, "step": 483 }, { "epoch": 0.1, "grad_norm": 1.6796875, "learning_rate": 2e-05, "loss": 2.6699, "num_input_tokens_seen": 1015021568, "step": 484 }, { "epoch": 0.1, "grad_norm": 1.40625, "learning_rate": 2e-05, "loss": 2.6395, "num_input_tokens_seen": 1017118720, "step": 485 }, { "epoch": 0.1, "grad_norm": 1.296875, "learning_rate": 2e-05, "loss": 2.6861, "num_input_tokens_seen": 1019215872, "step": 486 }, { "epoch": 0.1, "grad_norm": 1.4921875, "learning_rate": 2e-05, "loss": 2.6745, "num_input_tokens_seen": 1021313024, "step": 487 }, { "epoch": 0.1, "grad_norm": 2.125, "learning_rate": 2e-05, "loss": 2.6688, "num_input_tokens_seen": 1023410176, "step": 488 }, { "epoch": 0.1, "grad_norm": 1.9296875, "learning_rate": 2e-05, "loss": 2.684, "num_input_tokens_seen": 1025507328, "step": 489 }, { "epoch": 0.1, "grad_norm": 1.6328125, "learning_rate": 2e-05, "loss": 2.681, "num_input_tokens_seen": 1027604480, "step": 490 }, { "epoch": 0.1, "grad_norm": 1.7734375, "learning_rate": 2e-05, "loss": 2.6767, "num_input_tokens_seen": 1029701632, "step": 491 }, { "epoch": 0.1, "grad_norm": 1.515625, "learning_rate": 2e-05, "loss": 2.6788, "num_input_tokens_seen": 1031798784, "step": 492 }, { "epoch": 0.1, "grad_norm": 1.5078125, "learning_rate": 2e-05, "loss": 2.6646, "num_input_tokens_seen": 1033895936, "step": 493 }, { "epoch": 0.1, "grad_norm": 1.671875, "learning_rate": 2e-05, "loss": 2.6781, "num_input_tokens_seen": 1035993088, "step": 494 }, { "epoch": 0.1, "grad_norm": 1.9921875, "learning_rate": 2e-05, "loss": 2.6442, "num_input_tokens_seen": 1038090240, "step": 495 }, { "epoch": 0.1, "grad_norm": 1.78125, "learning_rate": 2e-05, "loss": 2.675, "num_input_tokens_seen": 1040187392, "step": 496 }, { "epoch": 0.1, "grad_norm": 1.3828125, "learning_rate": 2e-05, "loss": 2.6755, "num_input_tokens_seen": 1042284544, "step": 497 }, { "epoch": 0.1, "grad_norm": 1.84375, "learning_rate": 2e-05, "loss": 2.6628, "num_input_tokens_seen": 1044381696, "step": 498 }, { "epoch": 0.1, "grad_norm": 1.28125, "learning_rate": 2e-05, "loss": 2.641, "num_input_tokens_seen": 1046478848, "step": 499 }, { "epoch": 0.1, "grad_norm": 2.078125, "learning_rate": 2e-05, "loss": 2.6776, "num_input_tokens_seen": 1048576000, "step": 500 }, { "epoch": 0.11, "grad_norm": 1.6796875, "learning_rate": 2e-05, "loss": 2.6597, "num_input_tokens_seen": 1050673152, "step": 501 }, { "epoch": 0.11, "grad_norm": 2.234375, "learning_rate": 2e-05, "loss": 2.6489, "num_input_tokens_seen": 1052770304, "step": 502 }, { "epoch": 0.11, "grad_norm": 2.359375, "learning_rate": 2e-05, "loss": 2.6456, "num_input_tokens_seen": 1054867456, "step": 503 }, { "epoch": 0.11, "grad_norm": 1.34375, "learning_rate": 2e-05, "loss": 2.6376, "num_input_tokens_seen": 1056964608, "step": 504 }, { "epoch": 0.11, "grad_norm": 1.75, "learning_rate": 2e-05, "loss": 2.639, "num_input_tokens_seen": 1059061760, "step": 505 }, { "epoch": 0.11, "grad_norm": 1.5703125, "learning_rate": 2e-05, "loss": 2.648, "num_input_tokens_seen": 1061158912, "step": 506 }, { "epoch": 0.11, "grad_norm": 1.59375, "learning_rate": 2e-05, "loss": 2.6524, "num_input_tokens_seen": 1063256064, "step": 507 }, { "epoch": 0.11, "grad_norm": 1.5234375, "learning_rate": 2e-05, "loss": 2.6346, "num_input_tokens_seen": 1065353216, "step": 508 }, { "epoch": 0.11, "grad_norm": 1.5, "learning_rate": 2e-05, "loss": 2.6405, "num_input_tokens_seen": 1067450368, "step": 509 }, { "epoch": 0.11, "grad_norm": 1.5546875, "learning_rate": 2e-05, "loss": 2.6305, "num_input_tokens_seen": 1069547520, "step": 510 }, { "epoch": 0.11, "grad_norm": 1.5, "learning_rate": 2e-05, "loss": 2.6503, "num_input_tokens_seen": 1071644672, "step": 511 }, { "epoch": 0.11, "grad_norm": 1.4765625, "learning_rate": 2e-05, "loss": 2.6229, "num_input_tokens_seen": 1073741824, "step": 512 }, { "epoch": 0.11, "grad_norm": 1.90625, "learning_rate": 2e-05, "loss": 2.6231, "num_input_tokens_seen": 1075838976, "step": 513 }, { "epoch": 0.11, "grad_norm": 1.359375, "learning_rate": 2e-05, "loss": 2.6399, "num_input_tokens_seen": 1077936128, "step": 514 }, { "epoch": 0.11, "grad_norm": 1.6796875, "learning_rate": 2e-05, "loss": 2.62, "num_input_tokens_seen": 1080033280, "step": 515 }, { "epoch": 0.11, "grad_norm": 1.515625, "learning_rate": 2e-05, "loss": 2.6226, "num_input_tokens_seen": 1082130432, "step": 516 }, { "epoch": 0.11, "grad_norm": 1.3515625, "learning_rate": 2e-05, "loss": 2.6314, "num_input_tokens_seen": 1084227584, "step": 517 }, { "epoch": 0.11, "grad_norm": 1.5234375, "learning_rate": 2e-05, "loss": 2.6009, "num_input_tokens_seen": 1086324736, "step": 518 }, { "epoch": 0.11, "grad_norm": 1.3359375, "learning_rate": 2e-05, "loss": 2.6375, "num_input_tokens_seen": 1088421888, "step": 519 }, { "epoch": 0.11, "grad_norm": 1.2734375, "learning_rate": 2e-05, "loss": 2.6241, "num_input_tokens_seen": 1090519040, "step": 520 }, { "epoch": 0.11, "grad_norm": 1.3671875, "learning_rate": 2e-05, "loss": 2.6293, "num_input_tokens_seen": 1092616192, "step": 521 }, { "epoch": 0.11, "grad_norm": 1.2421875, "learning_rate": 2e-05, "loss": 2.6184, "num_input_tokens_seen": 1094713344, "step": 522 }, { "epoch": 0.11, "grad_norm": 1.3671875, "learning_rate": 2e-05, "loss": 2.6116, "num_input_tokens_seen": 1096810496, "step": 523 }, { "epoch": 0.11, "grad_norm": 1.4609375, "learning_rate": 2e-05, "loss": 2.5915, "num_input_tokens_seen": 1098907648, "step": 524 }, { "epoch": 0.11, "grad_norm": 1.203125, "learning_rate": 2e-05, "loss": 2.6169, "num_input_tokens_seen": 1101004800, "step": 525 }, { "epoch": 0.11, "grad_norm": 1.65625, "learning_rate": 2e-05, "loss": 2.596, "num_input_tokens_seen": 1103101952, "step": 526 }, { "epoch": 0.11, "grad_norm": 1.46875, "learning_rate": 2e-05, "loss": 2.6431, "num_input_tokens_seen": 1105199104, "step": 527 }, { "epoch": 0.11, "grad_norm": 1.7890625, "learning_rate": 2e-05, "loss": 2.6304, "num_input_tokens_seen": 1107296256, "step": 528 }, { "epoch": 0.11, "grad_norm": 1.359375, "learning_rate": 2e-05, "loss": 2.6342, "num_input_tokens_seen": 1109393408, "step": 529 }, { "epoch": 0.11, "grad_norm": 1.96875, "learning_rate": 2e-05, "loss": 2.6268, "num_input_tokens_seen": 1111490560, "step": 530 }, { "epoch": 0.11, "grad_norm": 1.9140625, "learning_rate": 2e-05, "loss": 2.6557, "num_input_tokens_seen": 1113587712, "step": 531 }, { "epoch": 0.11, "grad_norm": 1.703125, "learning_rate": 2e-05, "loss": 2.609, "num_input_tokens_seen": 1115684864, "step": 532 }, { "epoch": 0.11, "grad_norm": 1.53125, "learning_rate": 2e-05, "loss": 2.5872, "num_input_tokens_seen": 1117782016, "step": 533 }, { "epoch": 0.11, "grad_norm": 1.9453125, "learning_rate": 2e-05, "loss": 2.6239, "num_input_tokens_seen": 1119879168, "step": 534 }, { "epoch": 0.11, "grad_norm": 1.6171875, "learning_rate": 2e-05, "loss": 2.5891, "num_input_tokens_seen": 1121976320, "step": 535 }, { "epoch": 0.11, "grad_norm": 1.7734375, "learning_rate": 2e-05, "loss": 2.6165, "num_input_tokens_seen": 1124073472, "step": 536 }, { "epoch": 0.11, "grad_norm": 2.046875, "learning_rate": 2e-05, "loss": 2.6474, "num_input_tokens_seen": 1126170624, "step": 537 }, { "epoch": 0.11, "grad_norm": 1.453125, "learning_rate": 2e-05, "loss": 2.6127, "num_input_tokens_seen": 1128267776, "step": 538 }, { "epoch": 0.11, "grad_norm": 1.90625, "learning_rate": 2e-05, "loss": 2.6028, "num_input_tokens_seen": 1130364928, "step": 539 }, { "epoch": 0.11, "grad_norm": 1.4765625, "learning_rate": 2e-05, "loss": 2.5819, "num_input_tokens_seen": 1132462080, "step": 540 }, { "epoch": 0.11, "grad_norm": 1.7734375, "learning_rate": 2e-05, "loss": 2.5896, "num_input_tokens_seen": 1134559232, "step": 541 }, { "epoch": 0.11, "grad_norm": 1.7109375, "learning_rate": 2e-05, "loss": 2.5851, "num_input_tokens_seen": 1136656384, "step": 542 }, { "epoch": 0.11, "grad_norm": 1.484375, "learning_rate": 2e-05, "loss": 2.5615, "num_input_tokens_seen": 1138753536, "step": 543 }, { "epoch": 0.11, "grad_norm": 1.5703125, "learning_rate": 2e-05, "loss": 2.601, "num_input_tokens_seen": 1140850688, "step": 544 }, { "epoch": 0.11, "grad_norm": 1.3125, "learning_rate": 2e-05, "loss": 2.6026, "num_input_tokens_seen": 1142947840, "step": 545 }, { "epoch": 0.11, "grad_norm": 1.75, "learning_rate": 2e-05, "loss": 2.6076, "num_input_tokens_seen": 1145044992, "step": 546 }, { "epoch": 0.11, "grad_norm": 1.265625, "learning_rate": 2e-05, "loss": 2.6204, "num_input_tokens_seen": 1147142144, "step": 547 }, { "epoch": 0.11, "grad_norm": 1.7265625, "learning_rate": 2e-05, "loss": 2.6222, "num_input_tokens_seen": 1149239296, "step": 548 }, { "epoch": 0.12, "grad_norm": 1.328125, "learning_rate": 2e-05, "loss": 2.62, "num_input_tokens_seen": 1151336448, "step": 549 }, { "epoch": 0.12, "grad_norm": 1.765625, "learning_rate": 2e-05, "loss": 2.6029, "num_input_tokens_seen": 1153433600, "step": 550 }, { "epoch": 0.12, "grad_norm": 1.7421875, "learning_rate": 2e-05, "loss": 2.6162, "num_input_tokens_seen": 1155530752, "step": 551 }, { "epoch": 0.12, "grad_norm": 1.3046875, "learning_rate": 2e-05, "loss": 2.5789, "num_input_tokens_seen": 1157627904, "step": 552 }, { "epoch": 0.12, "grad_norm": 1.9453125, "learning_rate": 2e-05, "loss": 2.5987, "num_input_tokens_seen": 1159725056, "step": 553 }, { "epoch": 0.12, "grad_norm": 1.671875, "learning_rate": 2e-05, "loss": 2.5675, "num_input_tokens_seen": 1161822208, "step": 554 }, { "epoch": 0.12, "grad_norm": 1.40625, "learning_rate": 2e-05, "loss": 2.5748, "num_input_tokens_seen": 1163919360, "step": 555 }, { "epoch": 0.12, "grad_norm": 1.703125, "learning_rate": 2e-05, "loss": 2.626, "num_input_tokens_seen": 1166016512, "step": 556 }, { "epoch": 0.12, "grad_norm": 1.40625, "learning_rate": 2e-05, "loss": 2.6002, "num_input_tokens_seen": 1168113664, "step": 557 }, { "epoch": 0.12, "grad_norm": 1.4609375, "learning_rate": 2e-05, "loss": 2.6136, "num_input_tokens_seen": 1170210816, "step": 558 }, { "epoch": 0.12, "grad_norm": 1.1953125, "learning_rate": 2e-05, "loss": 2.5847, "num_input_tokens_seen": 1172307968, "step": 559 }, { "epoch": 0.12, "grad_norm": 1.546875, "learning_rate": 2e-05, "loss": 2.5762, "num_input_tokens_seen": 1174405120, "step": 560 }, { "epoch": 0.12, "grad_norm": 1.4453125, "learning_rate": 2e-05, "loss": 2.5625, "num_input_tokens_seen": 1176502272, "step": 561 }, { "epoch": 0.12, "grad_norm": 1.4296875, "learning_rate": 2e-05, "loss": 2.5663, "num_input_tokens_seen": 1178599424, "step": 562 }, { "epoch": 0.12, "grad_norm": 1.703125, "learning_rate": 2e-05, "loss": 2.5882, "num_input_tokens_seen": 1180696576, "step": 563 }, { "epoch": 0.12, "grad_norm": 1.6875, "learning_rate": 2e-05, "loss": 2.5758, "num_input_tokens_seen": 1182793728, "step": 564 }, { "epoch": 0.12, "grad_norm": 1.5, "learning_rate": 2e-05, "loss": 2.5826, "num_input_tokens_seen": 1184890880, "step": 565 }, { "epoch": 0.12, "grad_norm": 1.734375, "learning_rate": 2e-05, "loss": 2.5991, "num_input_tokens_seen": 1186988032, "step": 566 }, { "epoch": 0.12, "grad_norm": 1.203125, "learning_rate": 2e-05, "loss": 2.5852, "num_input_tokens_seen": 1189085184, "step": 567 }, { "epoch": 0.12, "grad_norm": 1.890625, "learning_rate": 2e-05, "loss": 2.6272, "num_input_tokens_seen": 1191182336, "step": 568 }, { "epoch": 0.12, "grad_norm": 1.6796875, "learning_rate": 2e-05, "loss": 2.5717, "num_input_tokens_seen": 1193279488, "step": 569 }, { "epoch": 0.12, "grad_norm": 1.28125, "learning_rate": 2e-05, "loss": 2.5766, "num_input_tokens_seen": 1195376640, "step": 570 }, { "epoch": 0.12, "grad_norm": 1.8203125, "learning_rate": 2e-05, "loss": 2.5873, "num_input_tokens_seen": 1197473792, "step": 571 }, { "epoch": 0.12, "grad_norm": 1.8125, "learning_rate": 2e-05, "loss": 2.5767, "num_input_tokens_seen": 1199570944, "step": 572 }, { "epoch": 0.12, "grad_norm": 1.2890625, "learning_rate": 2e-05, "loss": 2.5672, "num_input_tokens_seen": 1201668096, "step": 573 }, { "epoch": 0.12, "grad_norm": 2.03125, "learning_rate": 2e-05, "loss": 2.6013, "num_input_tokens_seen": 1203765248, "step": 574 }, { "epoch": 0.12, "grad_norm": 1.828125, "learning_rate": 2e-05, "loss": 2.5735, "num_input_tokens_seen": 1205862400, "step": 575 }, { "epoch": 0.12, "grad_norm": 1.484375, "learning_rate": 2e-05, "loss": 2.5683, "num_input_tokens_seen": 1207959552, "step": 576 }, { "epoch": 0.12, "grad_norm": 1.640625, "learning_rate": 2e-05, "loss": 2.571, "num_input_tokens_seen": 1210056704, "step": 577 }, { "epoch": 0.12, "grad_norm": 1.4921875, "learning_rate": 2e-05, "loss": 2.5195, "num_input_tokens_seen": 1212153856, "step": 578 }, { "epoch": 0.12, "grad_norm": 1.203125, "learning_rate": 2e-05, "loss": 2.6242, "num_input_tokens_seen": 1214251008, "step": 579 }, { "epoch": 0.12, "grad_norm": 1.59375, "learning_rate": 2e-05, "loss": 2.5576, "num_input_tokens_seen": 1216348160, "step": 580 }, { "epoch": 0.12, "grad_norm": 1.328125, "learning_rate": 2e-05, "loss": 2.5746, "num_input_tokens_seen": 1218445312, "step": 581 }, { "epoch": 0.12, "grad_norm": 1.3671875, "learning_rate": 2e-05, "loss": 2.5561, "num_input_tokens_seen": 1220542464, "step": 582 }, { "epoch": 0.12, "grad_norm": 1.53125, "learning_rate": 2e-05, "loss": 2.6001, "num_input_tokens_seen": 1222639616, "step": 583 }, { "epoch": 0.12, "grad_norm": 1.375, "learning_rate": 2e-05, "loss": 2.5807, "num_input_tokens_seen": 1224736768, "step": 584 }, { "epoch": 0.12, "grad_norm": 1.234375, "learning_rate": 2e-05, "loss": 2.5861, "num_input_tokens_seen": 1226833920, "step": 585 }, { "epoch": 0.12, "grad_norm": 1.484375, "learning_rate": 2e-05, "loss": 2.5884, "num_input_tokens_seen": 1228931072, "step": 586 }, { "epoch": 0.12, "grad_norm": 1.2421875, "learning_rate": 2e-05, "loss": 2.56, "num_input_tokens_seen": 1231028224, "step": 587 }, { "epoch": 0.12, "grad_norm": 1.3046875, "learning_rate": 2e-05, "loss": 2.5724, "num_input_tokens_seen": 1233125376, "step": 588 }, { "epoch": 0.12, "grad_norm": 1.375, "learning_rate": 2e-05, "loss": 2.5797, "num_input_tokens_seen": 1235222528, "step": 589 }, { "epoch": 0.12, "grad_norm": 1.7578125, "learning_rate": 2e-05, "loss": 2.5742, "num_input_tokens_seen": 1237319680, "step": 590 }, { "epoch": 0.12, "grad_norm": 1.265625, "learning_rate": 2e-05, "loss": 2.5676, "num_input_tokens_seen": 1239416832, "step": 591 }, { "epoch": 0.12, "grad_norm": 1.546875, "learning_rate": 2e-05, "loss": 2.5713, "num_input_tokens_seen": 1241513984, "step": 592 }, { "epoch": 0.12, "grad_norm": 1.7421875, "learning_rate": 2e-05, "loss": 2.5664, "num_input_tokens_seen": 1243611136, "step": 593 }, { "epoch": 0.12, "grad_norm": 1.1640625, "learning_rate": 2e-05, "loss": 2.5582, "num_input_tokens_seen": 1245708288, "step": 594 }, { "epoch": 0.12, "grad_norm": 1.296875, "learning_rate": 2e-05, "loss": 2.5555, "num_input_tokens_seen": 1247805440, "step": 595 }, { "epoch": 0.12, "grad_norm": 1.65625, "learning_rate": 2e-05, "loss": 2.5855, "num_input_tokens_seen": 1249902592, "step": 596 }, { "epoch": 0.13, "grad_norm": 1.390625, "learning_rate": 2e-05, "loss": 2.5575, "num_input_tokens_seen": 1251999744, "step": 597 }, { "epoch": 0.13, "eval_loss": 2.5605051517486572, "eval_runtime": 1935.8541, "eval_samples_per_second": 2.036, "eval_steps_per_second": 0.509, "num_input_tokens_seen": 1251999744, "step": 597 }, { "epoch": 0.13, "grad_norm": 1.3515625, "learning_rate": 2e-05, "loss": 2.5658, "num_input_tokens_seen": 1254096896, "step": 598 }, { "epoch": 0.13, "grad_norm": 1.3046875, "learning_rate": 2e-05, "loss": 2.5309, "num_input_tokens_seen": 1256194048, "step": 599 }, { "epoch": 0.13, "grad_norm": 1.1953125, "learning_rate": 2e-05, "loss": 2.5477, "num_input_tokens_seen": 1258291200, "step": 600 }, { "epoch": 0.13, "grad_norm": 1.5234375, "learning_rate": 2e-05, "loss": 2.592, "num_input_tokens_seen": 1260388352, "step": 601 }, { "epoch": 0.13, "grad_norm": 1.828125, "learning_rate": 2e-05, "loss": 2.5534, "num_input_tokens_seen": 1262485504, "step": 602 }, { "epoch": 0.13, "grad_norm": 1.390625, "learning_rate": 2e-05, "loss": 2.5679, "num_input_tokens_seen": 1264582656, "step": 603 }, { "epoch": 0.13, "grad_norm": 1.7734375, "learning_rate": 2e-05, "loss": 2.5482, "num_input_tokens_seen": 1266679808, "step": 604 }, { "epoch": 0.13, "grad_norm": 1.75, "learning_rate": 2e-05, "loss": 2.5591, "num_input_tokens_seen": 1268776960, "step": 605 }, { "epoch": 0.13, "grad_norm": 5.21875, "learning_rate": 2e-05, "loss": 2.5742, "num_input_tokens_seen": 1270874112, "step": 606 }, { "epoch": 0.13, "grad_norm": 1.9375, "learning_rate": 2e-05, "loss": 2.5941, "num_input_tokens_seen": 1272971264, "step": 607 }, { "epoch": 0.13, "grad_norm": 1.6640625, "learning_rate": 2e-05, "loss": 2.5539, "num_input_tokens_seen": 1275068416, "step": 608 }, { "epoch": 0.13, "grad_norm": 1.4453125, "learning_rate": 2e-05, "loss": 2.5596, "num_input_tokens_seen": 1277165568, "step": 609 }, { "epoch": 0.13, "grad_norm": 1.65625, "learning_rate": 2e-05, "loss": 2.5698, "num_input_tokens_seen": 1279262720, "step": 610 }, { "epoch": 0.13, "grad_norm": 1.125, "learning_rate": 2e-05, "loss": 2.5453, "num_input_tokens_seen": 1281359872, "step": 611 }, { "epoch": 0.13, "grad_norm": 1.3984375, "learning_rate": 2e-05, "loss": 2.5818, "num_input_tokens_seen": 1283457024, "step": 612 }, { "epoch": 0.13, "grad_norm": 1.359375, "learning_rate": 2e-05, "loss": 2.5525, "num_input_tokens_seen": 1285554176, "step": 613 }, { "epoch": 0.13, "grad_norm": 1.3515625, "learning_rate": 2e-05, "loss": 2.5555, "num_input_tokens_seen": 1287651328, "step": 614 }, { "epoch": 0.13, "grad_norm": 1.0859375, "learning_rate": 2e-05, "loss": 2.5297, "num_input_tokens_seen": 1289748480, "step": 615 }, { "epoch": 0.13, "grad_norm": 1.28125, "learning_rate": 2e-05, "loss": 2.5414, "num_input_tokens_seen": 1291845632, "step": 616 }, { "epoch": 0.13, "grad_norm": 1.2265625, "learning_rate": 2e-05, "loss": 2.5455, "num_input_tokens_seen": 1293942784, "step": 617 }, { "epoch": 0.13, "grad_norm": 1.484375, "learning_rate": 2e-05, "loss": 2.5228, "num_input_tokens_seen": 1296039936, "step": 618 }, { "epoch": 0.13, "grad_norm": 1.625, "learning_rate": 2e-05, "loss": 2.5407, "num_input_tokens_seen": 1298137088, "step": 619 }, { "epoch": 0.13, "grad_norm": 1.203125, "learning_rate": 2e-05, "loss": 2.5332, "num_input_tokens_seen": 1300234240, "step": 620 }, { "epoch": 0.13, "grad_norm": 1.71875, "learning_rate": 2e-05, "loss": 2.5389, "num_input_tokens_seen": 1302331392, "step": 621 }, { "epoch": 0.13, "grad_norm": 1.171875, "learning_rate": 2e-05, "loss": 2.5826, "num_input_tokens_seen": 1304428544, "step": 622 }, { "epoch": 0.13, "grad_norm": 1.3359375, "learning_rate": 2e-05, "loss": 2.5491, "num_input_tokens_seen": 1306525696, "step": 623 }, { "epoch": 0.13, "grad_norm": 1.8125, "learning_rate": 2e-05, "loss": 2.5525, "num_input_tokens_seen": 1308622848, "step": 624 }, { "epoch": 0.13, "grad_norm": 1.328125, "learning_rate": 2e-05, "loss": 2.5601, "num_input_tokens_seen": 1310720000, "step": 625 }, { "epoch": 0.13, "grad_norm": 1.8515625, "learning_rate": 2e-05, "loss": 2.5455, "num_input_tokens_seen": 1312817152, "step": 626 }, { "epoch": 0.13, "grad_norm": 1.8203125, "learning_rate": 2e-05, "loss": 2.5564, "num_input_tokens_seen": 1314914304, "step": 627 }, { "epoch": 0.13, "grad_norm": 1.34375, "learning_rate": 2e-05, "loss": 2.5425, "num_input_tokens_seen": 1317011456, "step": 628 }, { "epoch": 0.13, "grad_norm": 1.9609375, "learning_rate": 2e-05, "loss": 2.5185, "num_input_tokens_seen": 1319108608, "step": 629 }, { "epoch": 0.13, "grad_norm": 1.609375, "learning_rate": 2e-05, "loss": 2.5366, "num_input_tokens_seen": 1321205760, "step": 630 }, { "epoch": 0.13, "grad_norm": 1.4375, "learning_rate": 2e-05, "loss": 2.5214, "num_input_tokens_seen": 1323302912, "step": 631 }, { "epoch": 0.13, "grad_norm": 1.296875, "learning_rate": 2e-05, "loss": 2.551, "num_input_tokens_seen": 1325400064, "step": 632 }, { "epoch": 0.13, "grad_norm": 1.4609375, "learning_rate": 2e-05, "loss": 2.556, "num_input_tokens_seen": 1327497216, "step": 633 }, { "epoch": 0.13, "grad_norm": 1.3125, "learning_rate": 2e-05, "loss": 2.5366, "num_input_tokens_seen": 1329594368, "step": 634 }, { "epoch": 0.13, "grad_norm": 1.28125, "learning_rate": 2e-05, "loss": 2.5448, "num_input_tokens_seen": 1331691520, "step": 635 }, { "epoch": 0.13, "grad_norm": 1.359375, "learning_rate": 2e-05, "loss": 2.5318, "num_input_tokens_seen": 1333788672, "step": 636 }, { "epoch": 0.13, "grad_norm": 1.171875, "learning_rate": 2e-05, "loss": 2.5578, "num_input_tokens_seen": 1335885824, "step": 637 }, { "epoch": 0.13, "grad_norm": 1.28125, "learning_rate": 2e-05, "loss": 2.5275, "num_input_tokens_seen": 1337982976, "step": 638 }, { "epoch": 0.13, "grad_norm": 0.99609375, "learning_rate": 2e-05, "loss": 2.5075, "num_input_tokens_seen": 1340080128, "step": 639 }, { "epoch": 0.13, "grad_norm": 1.4609375, "learning_rate": 2e-05, "loss": 2.5553, "num_input_tokens_seen": 1342177280, "step": 640 }, { "epoch": 0.13, "grad_norm": 1.3671875, "learning_rate": 2e-05, "loss": 2.5386, "num_input_tokens_seen": 1344274432, "step": 641 }, { "epoch": 0.13, "grad_norm": 1.1796875, "learning_rate": 2e-05, "loss": 2.5444, "num_input_tokens_seen": 1346371584, "step": 642 }, { "epoch": 0.13, "grad_norm": 1.390625, "learning_rate": 2e-05, "loss": 2.5225, "num_input_tokens_seen": 1348468736, "step": 643 }, { "epoch": 0.14, "grad_norm": 1.3203125, "learning_rate": 2e-05, "loss": 2.4973, "num_input_tokens_seen": 1350565888, "step": 644 }, { "epoch": 0.14, "grad_norm": 1.3046875, "learning_rate": 2e-05, "loss": 2.5117, "num_input_tokens_seen": 1352663040, "step": 645 }, { "epoch": 0.14, "grad_norm": 1.5390625, "learning_rate": 2e-05, "loss": 2.5352, "num_input_tokens_seen": 1354760192, "step": 646 }, { "epoch": 0.14, "grad_norm": 1.25, "learning_rate": 2e-05, "loss": 2.5621, "num_input_tokens_seen": 1356857344, "step": 647 }, { "epoch": 0.14, "grad_norm": 1.203125, "learning_rate": 2e-05, "loss": 2.5334, "num_input_tokens_seen": 1358954496, "step": 648 }, { "epoch": 0.14, "grad_norm": 1.234375, "learning_rate": 2e-05, "loss": 2.5368, "num_input_tokens_seen": 1361051648, "step": 649 }, { "epoch": 0.14, "grad_norm": 0.94921875, "learning_rate": 2e-05, "loss": 2.4947, "num_input_tokens_seen": 1363148800, "step": 650 }, { "epoch": 0.14, "grad_norm": 1.21875, "learning_rate": 2e-05, "loss": 2.5294, "num_input_tokens_seen": 1365245952, "step": 651 }, { "epoch": 0.14, "grad_norm": 1.125, "learning_rate": 2e-05, "loss": 2.5132, "num_input_tokens_seen": 1367343104, "step": 652 }, { "epoch": 0.14, "grad_norm": 1.0546875, "learning_rate": 2e-05, "loss": 2.5071, "num_input_tokens_seen": 1369440256, "step": 653 }, { "epoch": 0.14, "grad_norm": 1.2109375, "learning_rate": 2e-05, "loss": 2.5225, "num_input_tokens_seen": 1371537408, "step": 654 }, { "epoch": 0.14, "grad_norm": 1.2421875, "learning_rate": 2e-05, "loss": 2.5289, "num_input_tokens_seen": 1373634560, "step": 655 }, { "epoch": 0.14, "grad_norm": 1.015625, "learning_rate": 2e-05, "loss": 2.5269, "num_input_tokens_seen": 1375731712, "step": 656 }, { "epoch": 0.14, "grad_norm": 1.7734375, "learning_rate": 2e-05, "loss": 2.5342, "num_input_tokens_seen": 1377828864, "step": 657 }, { "epoch": 0.14, "grad_norm": 1.6640625, "learning_rate": 2e-05, "loss": 2.5428, "num_input_tokens_seen": 1379926016, "step": 658 }, { "epoch": 0.14, "grad_norm": 1.3046875, "learning_rate": 2e-05, "loss": 2.5184, "num_input_tokens_seen": 1382023168, "step": 659 }, { "epoch": 0.14, "grad_norm": 1.125, "learning_rate": 2e-05, "loss": 2.5133, "num_input_tokens_seen": 1384120320, "step": 660 }, { "epoch": 0.14, "grad_norm": 1.3125, "learning_rate": 2e-05, "loss": 2.5176, "num_input_tokens_seen": 1386217472, "step": 661 }, { "epoch": 0.14, "grad_norm": 1.28125, "learning_rate": 2e-05, "loss": 2.5149, "num_input_tokens_seen": 1388314624, "step": 662 }, { "epoch": 0.14, "grad_norm": 1.09375, "learning_rate": 2e-05, "loss": 2.4967, "num_input_tokens_seen": 1390411776, "step": 663 }, { "epoch": 0.14, "grad_norm": 1.2578125, "learning_rate": 2e-05, "loss": 2.5214, "num_input_tokens_seen": 1392508928, "step": 664 }, { "epoch": 0.14, "grad_norm": 1.25, "learning_rate": 2e-05, "loss": 2.5204, "num_input_tokens_seen": 1394606080, "step": 665 }, { "epoch": 0.14, "grad_norm": 1.359375, "learning_rate": 2e-05, "loss": 2.5478, "num_input_tokens_seen": 1396703232, "step": 666 }, { "epoch": 0.14, "grad_norm": 1.0390625, "learning_rate": 2e-05, "loss": 2.5185, "num_input_tokens_seen": 1398800384, "step": 667 }, { "epoch": 0.14, "grad_norm": 1.71875, "learning_rate": 2e-05, "loss": 2.5132, "num_input_tokens_seen": 1400897536, "step": 668 }, { "epoch": 0.14, "grad_norm": 1.1875, "learning_rate": 2e-05, "loss": 2.5008, "num_input_tokens_seen": 1402994688, "step": 669 }, { "epoch": 0.14, "grad_norm": 1.5078125, "learning_rate": 2e-05, "loss": 2.5182, "num_input_tokens_seen": 1405091840, "step": 670 }, { "epoch": 0.14, "grad_norm": 1.4921875, "learning_rate": 2e-05, "loss": 2.5165, "num_input_tokens_seen": 1407188992, "step": 671 }, { "epoch": 0.14, "grad_norm": 1.453125, "learning_rate": 2e-05, "loss": 2.4936, "num_input_tokens_seen": 1409286144, "step": 672 }, { "epoch": 0.14, "grad_norm": 1.6171875, "learning_rate": 2e-05, "loss": 2.5164, "num_input_tokens_seen": 1411383296, "step": 673 }, { "epoch": 0.14, "grad_norm": 1.2890625, "learning_rate": 2e-05, "loss": 2.4825, "num_input_tokens_seen": 1413480448, "step": 674 }, { "epoch": 0.14, "grad_norm": 1.609375, "learning_rate": 2e-05, "loss": 2.5515, "num_input_tokens_seen": 1415577600, "step": 675 }, { "epoch": 0.14, "grad_norm": 1.25, "learning_rate": 2e-05, "loss": 2.5405, "num_input_tokens_seen": 1417674752, "step": 676 }, { "epoch": 0.14, "grad_norm": 1.453125, "learning_rate": 2e-05, "loss": 2.5031, "num_input_tokens_seen": 1419771904, "step": 677 }, { "epoch": 0.14, "grad_norm": 1.3203125, "learning_rate": 2e-05, "loss": 2.5156, "num_input_tokens_seen": 1421869056, "step": 678 }, { "epoch": 0.14, "grad_norm": 1.421875, "learning_rate": 2e-05, "loss": 2.5381, "num_input_tokens_seen": 1423966208, "step": 679 }, { "epoch": 0.14, "grad_norm": 1.234375, "learning_rate": 2e-05, "loss": 2.5342, "num_input_tokens_seen": 1426063360, "step": 680 }, { "epoch": 0.14, "grad_norm": 1.1640625, "learning_rate": 2e-05, "loss": 2.51, "num_input_tokens_seen": 1428160512, "step": 681 }, { "epoch": 0.14, "grad_norm": 1.5625, "learning_rate": 2e-05, "loss": 2.5108, "num_input_tokens_seen": 1430257664, "step": 682 }, { "epoch": 0.14, "grad_norm": 1.2109375, "learning_rate": 2e-05, "loss": 2.5245, "num_input_tokens_seen": 1432354816, "step": 683 }, { "epoch": 0.14, "grad_norm": 1.6796875, "learning_rate": 2e-05, "loss": 2.5298, "num_input_tokens_seen": 1434451968, "step": 684 }, { "epoch": 0.14, "grad_norm": 1.7890625, "learning_rate": 2e-05, "loss": 2.4931, "num_input_tokens_seen": 1436549120, "step": 685 }, { "epoch": 0.14, "grad_norm": 1.2265625, "learning_rate": 2e-05, "loss": 2.5164, "num_input_tokens_seen": 1438646272, "step": 686 }, { "epoch": 0.14, "grad_norm": 1.921875, "learning_rate": 2e-05, "loss": 2.512, "num_input_tokens_seen": 1440743424, "step": 687 }, { "epoch": 0.14, "grad_norm": 2.15625, "learning_rate": 2e-05, "loss": 2.5037, "num_input_tokens_seen": 1442840576, "step": 688 }, { "epoch": 0.14, "grad_norm": 1.3046875, "learning_rate": 2e-05, "loss": 2.5065, "num_input_tokens_seen": 1444937728, "step": 689 }, { "epoch": 0.14, "grad_norm": 1.9375, "learning_rate": 2e-05, "loss": 2.5398, "num_input_tokens_seen": 1447034880, "step": 690 }, { "epoch": 0.14, "grad_norm": 2.25, "learning_rate": 2e-05, "loss": 2.5232, "num_input_tokens_seen": 1449132032, "step": 691 }, { "epoch": 0.15, "grad_norm": 1.375, "learning_rate": 2e-05, "loss": 2.4913, "num_input_tokens_seen": 1451229184, "step": 692 }, { "epoch": 0.15, "grad_norm": 1.5, "learning_rate": 2e-05, "loss": 2.493, "num_input_tokens_seen": 1453326336, "step": 693 }, { "epoch": 0.15, "grad_norm": 2.109375, "learning_rate": 2e-05, "loss": 2.4812, "num_input_tokens_seen": 1455423488, "step": 694 }, { "epoch": 0.15, "grad_norm": 1.6875, "learning_rate": 2e-05, "loss": 2.4951, "num_input_tokens_seen": 1457520640, "step": 695 }, { "epoch": 0.15, "grad_norm": 1.4140625, "learning_rate": 2e-05, "loss": 2.4912, "num_input_tokens_seen": 1459617792, "step": 696 }, { "epoch": 0.15, "grad_norm": 1.53125, "learning_rate": 2e-05, "loss": 2.4773, "num_input_tokens_seen": 1461714944, "step": 697 }, { "epoch": 0.15, "grad_norm": 1.640625, "learning_rate": 2e-05, "loss": 2.4976, "num_input_tokens_seen": 1463812096, "step": 698 }, { "epoch": 0.15, "grad_norm": 1.2421875, "learning_rate": 2e-05, "loss": 2.5132, "num_input_tokens_seen": 1465909248, "step": 699 }, { "epoch": 0.15, "grad_norm": 1.2890625, "learning_rate": 2e-05, "loss": 2.5073, "num_input_tokens_seen": 1468006400, "step": 700 }, { "epoch": 0.15, "grad_norm": 1.234375, "learning_rate": 2e-05, "loss": 2.4998, "num_input_tokens_seen": 1470103552, "step": 701 }, { "epoch": 0.15, "grad_norm": 1.359375, "learning_rate": 2e-05, "loss": 2.4995, "num_input_tokens_seen": 1472200704, "step": 702 }, { "epoch": 0.15, "grad_norm": 1.234375, "learning_rate": 2e-05, "loss": 2.5251, "num_input_tokens_seen": 1474297856, "step": 703 }, { "epoch": 0.15, "grad_norm": 1.15625, "learning_rate": 2e-05, "loss": 2.4824, "num_input_tokens_seen": 1476395008, "step": 704 }, { "epoch": 0.15, "grad_norm": 1.1328125, "learning_rate": 2e-05, "loss": 2.5041, "num_input_tokens_seen": 1478492160, "step": 705 }, { "epoch": 0.15, "grad_norm": 0.9921875, "learning_rate": 2e-05, "loss": 2.4976, "num_input_tokens_seen": 1480589312, "step": 706 }, { "epoch": 0.15, "grad_norm": 1.2265625, "learning_rate": 2e-05, "loss": 2.4816, "num_input_tokens_seen": 1482686464, "step": 707 }, { "epoch": 0.15, "grad_norm": 1.1796875, "learning_rate": 2e-05, "loss": 2.4875, "num_input_tokens_seen": 1484783616, "step": 708 }, { "epoch": 0.15, "grad_norm": 1.0390625, "learning_rate": 2e-05, "loss": 2.4723, "num_input_tokens_seen": 1486880768, "step": 709 }, { "epoch": 0.15, "grad_norm": 1.1171875, "learning_rate": 2e-05, "loss": 2.5009, "num_input_tokens_seen": 1488977920, "step": 710 }, { "epoch": 0.15, "grad_norm": 0.93359375, "learning_rate": 2e-05, "loss": 2.4659, "num_input_tokens_seen": 1491075072, "step": 711 }, { "epoch": 0.15, "grad_norm": 1.125, "learning_rate": 2e-05, "loss": 2.5063, "num_input_tokens_seen": 1493172224, "step": 712 }, { "epoch": 0.15, "grad_norm": 0.90625, "learning_rate": 2e-05, "loss": 2.4913, "num_input_tokens_seen": 1495269376, "step": 713 }, { "epoch": 0.15, "grad_norm": 0.95703125, "learning_rate": 2e-05, "loss": 2.5255, "num_input_tokens_seen": 1497366528, "step": 714 }, { "epoch": 0.15, "grad_norm": 0.9453125, "learning_rate": 2e-05, "loss": 2.4863, "num_input_tokens_seen": 1499463680, "step": 715 }, { "epoch": 0.15, "grad_norm": 1.0546875, "learning_rate": 2e-05, "loss": 2.5022, "num_input_tokens_seen": 1501560832, "step": 716 }, { "epoch": 0.15, "grad_norm": 0.98046875, "learning_rate": 2e-05, "loss": 2.4878, "num_input_tokens_seen": 1503657984, "step": 717 }, { "epoch": 0.15, "grad_norm": 1.9921875, "learning_rate": 2e-05, "loss": 2.4977, "num_input_tokens_seen": 1505755136, "step": 718 }, { "epoch": 0.15, "grad_norm": 1.3046875, "learning_rate": 2e-05, "loss": 2.4998, "num_input_tokens_seen": 1507852288, "step": 719 }, { "epoch": 0.15, "grad_norm": 1.625, "learning_rate": 2e-05, "loss": 2.4922, "num_input_tokens_seen": 1509949440, "step": 720 }, { "epoch": 0.15, "grad_norm": 1.7421875, "learning_rate": 2e-05, "loss": 2.4744, "num_input_tokens_seen": 1512046592, "step": 721 }, { "epoch": 0.15, "grad_norm": 1.5546875, "learning_rate": 2e-05, "loss": 2.5217, "num_input_tokens_seen": 1514143744, "step": 722 }, { "epoch": 0.15, "grad_norm": 1.8359375, "learning_rate": 2e-05, "loss": 2.4968, "num_input_tokens_seen": 1516240896, "step": 723 }, { "epoch": 0.15, "grad_norm": 1.1875, "learning_rate": 2e-05, "loss": 2.5143, "num_input_tokens_seen": 1518338048, "step": 724 }, { "epoch": 0.15, "grad_norm": 1.3359375, "learning_rate": 2e-05, "loss": 2.4557, "num_input_tokens_seen": 1520435200, "step": 725 }, { "epoch": 0.15, "grad_norm": 2.0, "learning_rate": 2e-05, "loss": 2.4728, "num_input_tokens_seen": 1522532352, "step": 726 }, { "epoch": 0.15, "grad_norm": 1.453125, "learning_rate": 2e-05, "loss": 2.5286, "num_input_tokens_seen": 1524629504, "step": 727 }, { "epoch": 0.15, "grad_norm": 3.9375, "learning_rate": 2e-05, "loss": 2.5188, "num_input_tokens_seen": 1526726656, "step": 728 }, { "epoch": 0.15, "grad_norm": 3.265625, "learning_rate": 2e-05, "loss": 2.5145, "num_input_tokens_seen": 1528823808, "step": 729 }, { "epoch": 0.15, "grad_norm": 4.0625, "learning_rate": 2e-05, "loss": 2.4789, "num_input_tokens_seen": 1530920960, "step": 730 }, { "epoch": 0.15, "grad_norm": 3.9375, "learning_rate": 2e-05, "loss": 2.513, "num_input_tokens_seen": 1533018112, "step": 731 }, { "epoch": 0.15, "grad_norm": 1.375, "learning_rate": 2e-05, "loss": 2.503, "num_input_tokens_seen": 1535115264, "step": 732 }, { "epoch": 0.15, "grad_norm": 3.171875, "learning_rate": 2e-05, "loss": 2.493, "num_input_tokens_seen": 1537212416, "step": 733 }, { "epoch": 0.15, "grad_norm": 2.71875, "learning_rate": 2e-05, "loss": 2.5021, "num_input_tokens_seen": 1539309568, "step": 734 }, { "epoch": 0.15, "grad_norm": 3.25, "learning_rate": 2e-05, "loss": 2.4947, "num_input_tokens_seen": 1541406720, "step": 735 }, { "epoch": 0.15, "grad_norm": 3.0, "learning_rate": 2e-05, "loss": 2.4681, "num_input_tokens_seen": 1543503872, "step": 736 }, { "epoch": 0.15, "grad_norm": 1.515625, "learning_rate": 2e-05, "loss": 2.451, "num_input_tokens_seen": 1545601024, "step": 737 }, { "epoch": 0.15, "grad_norm": 1.7109375, "learning_rate": 2e-05, "loss": 2.4803, "num_input_tokens_seen": 1547698176, "step": 738 }, { "epoch": 0.15, "grad_norm": 1.453125, "learning_rate": 2e-05, "loss": 2.4772, "num_input_tokens_seen": 1549795328, "step": 739 }, { "epoch": 0.16, "grad_norm": 1.234375, "learning_rate": 2e-05, "loss": 2.4969, "num_input_tokens_seen": 1551892480, "step": 740 }, { "epoch": 0.16, "grad_norm": 1.8984375, "learning_rate": 2e-05, "loss": 2.5116, "num_input_tokens_seen": 1553989632, "step": 741 }, { "epoch": 0.16, "grad_norm": 1.6875, "learning_rate": 2e-05, "loss": 2.5076, "num_input_tokens_seen": 1556086784, "step": 742 }, { "epoch": 0.16, "grad_norm": 1.9140625, "learning_rate": 2e-05, "loss": 2.4929, "num_input_tokens_seen": 1558183936, "step": 743 }, { "epoch": 0.16, "grad_norm": 1.6484375, "learning_rate": 2e-05, "loss": 2.4747, "num_input_tokens_seen": 1560281088, "step": 744 }, { "epoch": 0.16, "grad_norm": 2.65625, "learning_rate": 2e-05, "loss": 2.516, "num_input_tokens_seen": 1562378240, "step": 745 }, { "epoch": 0.16, "grad_norm": 2.46875, "learning_rate": 2e-05, "loss": 2.4961, "num_input_tokens_seen": 1564475392, "step": 746 }, { "epoch": 0.16, "grad_norm": 1.8515625, "learning_rate": 2e-05, "loss": 2.4857, "num_input_tokens_seen": 1566572544, "step": 747 }, { "epoch": 0.16, "grad_norm": 1.8828125, "learning_rate": 2e-05, "loss": 2.5108, "num_input_tokens_seen": 1568669696, "step": 748 }, { "epoch": 0.16, "grad_norm": 1.3359375, "learning_rate": 2e-05, "loss": 2.4894, "num_input_tokens_seen": 1570766848, "step": 749 }, { "epoch": 0.16, "grad_norm": 1.5, "learning_rate": 2e-05, "loss": 2.478, "num_input_tokens_seen": 1572864000, "step": 750 }, { "epoch": 0.16, "grad_norm": 1.6328125, "learning_rate": 2e-05, "loss": 2.4634, "num_input_tokens_seen": 1574961152, "step": 751 }, { "epoch": 0.16, "grad_norm": 1.125, "learning_rate": 2e-05, "loss": 2.4478, "num_input_tokens_seen": 1577058304, "step": 752 }, { "epoch": 0.16, "grad_norm": 1.8828125, "learning_rate": 2e-05, "loss": 2.4704, "num_input_tokens_seen": 1579155456, "step": 753 }, { "epoch": 0.16, "grad_norm": 1.859375, "learning_rate": 2e-05, "loss": 2.4485, "num_input_tokens_seen": 1581252608, "step": 754 }, { "epoch": 0.16, "grad_norm": 1.9140625, "learning_rate": 2e-05, "loss": 2.4761, "num_input_tokens_seen": 1583349760, "step": 755 }, { "epoch": 0.16, "grad_norm": 1.6484375, "learning_rate": 2e-05, "loss": 2.4574, "num_input_tokens_seen": 1585446912, "step": 756 }, { "epoch": 0.16, "grad_norm": 2.03125, "learning_rate": 2e-05, "loss": 2.4602, "num_input_tokens_seen": 1587544064, "step": 757 }, { "epoch": 0.16, "grad_norm": 2.109375, "learning_rate": 2e-05, "loss": 2.4939, "num_input_tokens_seen": 1589641216, "step": 758 }, { "epoch": 0.16, "grad_norm": 1.1328125, "learning_rate": 2e-05, "loss": 2.4781, "num_input_tokens_seen": 1591738368, "step": 759 }, { "epoch": 0.16, "grad_norm": 2.75, "learning_rate": 2e-05, "loss": 2.4875, "num_input_tokens_seen": 1593835520, "step": 760 }, { "epoch": 0.16, "grad_norm": 2.75, "learning_rate": 2e-05, "loss": 2.4751, "num_input_tokens_seen": 1595932672, "step": 761 }, { "epoch": 0.16, "grad_norm": 1.171875, "learning_rate": 2e-05, "loss": 2.474, "num_input_tokens_seen": 1598029824, "step": 762 }, { "epoch": 0.16, "grad_norm": 2.640625, "learning_rate": 2e-05, "loss": 2.5156, "num_input_tokens_seen": 1600126976, "step": 763 }, { "epoch": 0.16, "grad_norm": 2.765625, "learning_rate": 2e-05, "loss": 2.508, "num_input_tokens_seen": 1602224128, "step": 764 }, { "epoch": 0.16, "grad_norm": 1.578125, "learning_rate": 2e-05, "loss": 2.4859, "num_input_tokens_seen": 1604321280, "step": 765 }, { "epoch": 0.16, "grad_norm": 1.9609375, "learning_rate": 2e-05, "loss": 2.4775, "num_input_tokens_seen": 1606418432, "step": 766 }, { "epoch": 0.16, "grad_norm": 2.25, "learning_rate": 2e-05, "loss": 2.529, "num_input_tokens_seen": 1608515584, "step": 767 }, { "epoch": 0.16, "grad_norm": 1.5, "learning_rate": 2e-05, "loss": 2.4666, "num_input_tokens_seen": 1610612736, "step": 768 }, { "epoch": 0.16, "grad_norm": 1.5390625, "learning_rate": 2e-05, "loss": 2.4742, "num_input_tokens_seen": 1612709888, "step": 769 }, { "epoch": 0.16, "grad_norm": 1.703125, "learning_rate": 2e-05, "loss": 2.4831, "num_input_tokens_seen": 1614807040, "step": 770 }, { "epoch": 0.16, "grad_norm": 1.3671875, "learning_rate": 2e-05, "loss": 2.4959, "num_input_tokens_seen": 1616904192, "step": 771 }, { "epoch": 0.16, "grad_norm": 1.71875, "learning_rate": 2e-05, "loss": 2.4368, "num_input_tokens_seen": 1619001344, "step": 772 }, { "epoch": 0.16, "grad_norm": 1.765625, "learning_rate": 2e-05, "loss": 2.5079, "num_input_tokens_seen": 1621098496, "step": 773 }, { "epoch": 0.16, "grad_norm": 1.2578125, "learning_rate": 2e-05, "loss": 2.4824, "num_input_tokens_seen": 1623195648, "step": 774 }, { "epoch": 0.16, "grad_norm": 1.5234375, "learning_rate": 2e-05, "loss": 2.5034, "num_input_tokens_seen": 1625292800, "step": 775 }, { "epoch": 0.16, "grad_norm": 1.5234375, "learning_rate": 2e-05, "loss": 2.4613, "num_input_tokens_seen": 1627389952, "step": 776 }, { "epoch": 0.16, "grad_norm": 1.1640625, "learning_rate": 2e-05, "loss": 2.4565, "num_input_tokens_seen": 1629487104, "step": 777 }, { "epoch": 0.16, "grad_norm": 1.7265625, "learning_rate": 2e-05, "loss": 2.4804, "num_input_tokens_seen": 1631584256, "step": 778 }, { "epoch": 0.16, "grad_norm": 1.5234375, "learning_rate": 2e-05, "loss": 2.494, "num_input_tokens_seen": 1633681408, "step": 779 }, { "epoch": 0.16, "grad_norm": 0.97265625, "learning_rate": 2e-05, "loss": 2.5035, "num_input_tokens_seen": 1635778560, "step": 780 }, { "epoch": 0.16, "grad_norm": 1.375, "learning_rate": 2e-05, "loss": 2.4932, "num_input_tokens_seen": 1637875712, "step": 781 }, { "epoch": 0.16, "grad_norm": 1.234375, "learning_rate": 2e-05, "loss": 2.4587, "num_input_tokens_seen": 1639972864, "step": 782 }, { "epoch": 0.16, "grad_norm": 1.4375, "learning_rate": 2e-05, "loss": 2.4703, "num_input_tokens_seen": 1642070016, "step": 783 }, { "epoch": 0.16, "grad_norm": 1.3203125, "learning_rate": 2e-05, "loss": 2.4613, "num_input_tokens_seen": 1644167168, "step": 784 }, { "epoch": 0.16, "grad_norm": 1.65625, "learning_rate": 2e-05, "loss": 2.4795, "num_input_tokens_seen": 1646264320, "step": 785 }, { "epoch": 0.16, "grad_norm": 1.421875, "learning_rate": 2e-05, "loss": 2.448, "num_input_tokens_seen": 1648361472, "step": 786 }, { "epoch": 0.17, "grad_norm": 1.0859375, "learning_rate": 2e-05, "loss": 2.4778, "num_input_tokens_seen": 1650458624, "step": 787 }, { "epoch": 0.17, "grad_norm": 1.3828125, "learning_rate": 2e-05, "loss": 2.421, "num_input_tokens_seen": 1652555776, "step": 788 }, { "epoch": 0.17, "grad_norm": 1.4609375, "learning_rate": 2e-05, "loss": 2.4363, "num_input_tokens_seen": 1654652928, "step": 789 }, { "epoch": 0.17, "grad_norm": 0.9609375, "learning_rate": 2e-05, "loss": 2.4879, "num_input_tokens_seen": 1656750080, "step": 790 }, { "epoch": 0.17, "grad_norm": 1.5, "learning_rate": 2e-05, "loss": 2.4929, "num_input_tokens_seen": 1658847232, "step": 791 }, { "epoch": 0.17, "grad_norm": 1.90625, "learning_rate": 2e-05, "loss": 2.4515, "num_input_tokens_seen": 1660944384, "step": 792 }, { "epoch": 0.17, "grad_norm": 1.078125, "learning_rate": 2e-05, "loss": 2.4379, "num_input_tokens_seen": 1663041536, "step": 793 }, { "epoch": 0.17, "grad_norm": 1.25, "learning_rate": 2e-05, "loss": 2.4681, "num_input_tokens_seen": 1665138688, "step": 794 }, { "epoch": 0.17, "grad_norm": 1.234375, "learning_rate": 2e-05, "loss": 2.4484, "num_input_tokens_seen": 1667235840, "step": 795 }, { "epoch": 0.17, "grad_norm": 0.96875, "learning_rate": 2e-05, "loss": 2.4251, "num_input_tokens_seen": 1669332992, "step": 796 }, { "epoch": 0.17, "eval_loss": 2.4695074558258057, "eval_runtime": 2061.0517, "eval_samples_per_second": 1.913, "eval_steps_per_second": 0.478, "num_input_tokens_seen": 1669332992, "step": 796 }, { "epoch": 0.17, "grad_norm": 1.0078125, "learning_rate": 2e-05, "loss": 2.4705, "num_input_tokens_seen": 1671430144, "step": 797 }, { "epoch": 0.17, "grad_norm": 0.94921875, "learning_rate": 2e-05, "loss": 2.4855, "num_input_tokens_seen": 1673527296, "step": 798 }, { "epoch": 0.17, "grad_norm": 1.0703125, "learning_rate": 2e-05, "loss": 2.4348, "num_input_tokens_seen": 1675624448, "step": 799 }, { "epoch": 0.17, "grad_norm": 1.1484375, "learning_rate": 2e-05, "loss": 2.4627, "num_input_tokens_seen": 1677721600, "step": 800 }, { "epoch": 0.17, "grad_norm": 0.95703125, "learning_rate": 2e-05, "loss": 2.4761, "num_input_tokens_seen": 1679818752, "step": 801 }, { "epoch": 0.17, "grad_norm": 1.15625, "learning_rate": 2e-05, "loss": 2.4697, "num_input_tokens_seen": 1681915904, "step": 802 }, { "epoch": 0.17, "grad_norm": 1.2109375, "learning_rate": 2e-05, "loss": 2.4944, "num_input_tokens_seen": 1684013056, "step": 803 }, { "epoch": 0.17, "grad_norm": 1.0078125, "learning_rate": 2e-05, "loss": 2.4437, "num_input_tokens_seen": 1686110208, "step": 804 }, { "epoch": 0.17, "grad_norm": 1.1328125, "learning_rate": 2e-05, "loss": 2.4706, "num_input_tokens_seen": 1688207360, "step": 805 }, { "epoch": 0.17, "grad_norm": 0.98828125, "learning_rate": 2e-05, "loss": 2.4807, "num_input_tokens_seen": 1690304512, "step": 806 }, { "epoch": 0.17, "grad_norm": 1.25, "learning_rate": 2e-05, "loss": 2.453, "num_input_tokens_seen": 1692401664, "step": 807 }, { "epoch": 0.17, "grad_norm": 1.0703125, "learning_rate": 2e-05, "loss": 2.4425, "num_input_tokens_seen": 1694498816, "step": 808 }, { "epoch": 0.17, "grad_norm": 1.140625, "learning_rate": 2e-05, "loss": 2.4742, "num_input_tokens_seen": 1696595968, "step": 809 }, { "epoch": 0.17, "grad_norm": 1.078125, "learning_rate": 2e-05, "loss": 2.4811, "num_input_tokens_seen": 1698693120, "step": 810 }, { "epoch": 0.17, "grad_norm": 1.0546875, "learning_rate": 2e-05, "loss": 2.4935, "num_input_tokens_seen": 1700790272, "step": 811 }, { "epoch": 0.17, "grad_norm": 1.0390625, "learning_rate": 2e-05, "loss": 2.4464, "num_input_tokens_seen": 1702887424, "step": 812 }, { "epoch": 0.17, "grad_norm": 0.9375, "learning_rate": 2e-05, "loss": 2.4123, "num_input_tokens_seen": 1704984576, "step": 813 }, { "epoch": 0.17, "grad_norm": 1.0390625, "learning_rate": 2e-05, "loss": 2.4494, "num_input_tokens_seen": 1707081728, "step": 814 }, { "epoch": 0.17, "grad_norm": 1.078125, "learning_rate": 2e-05, "loss": 2.4583, "num_input_tokens_seen": 1709178880, "step": 815 }, { "epoch": 0.17, "grad_norm": 0.87890625, "learning_rate": 2e-05, "loss": 2.4519, "num_input_tokens_seen": 1711276032, "step": 816 }, { "epoch": 0.17, "grad_norm": 1.1875, "learning_rate": 2e-05, "loss": 2.4436, "num_input_tokens_seen": 1713373184, "step": 817 }, { "epoch": 0.17, "grad_norm": 0.9921875, "learning_rate": 2e-05, "loss": 2.4592, "num_input_tokens_seen": 1715470336, "step": 818 }, { "epoch": 0.17, "grad_norm": 1.0546875, "learning_rate": 2e-05, "loss": 2.4919, "num_input_tokens_seen": 1717567488, "step": 819 }, { "epoch": 0.17, "grad_norm": 1.109375, "learning_rate": 2e-05, "loss": 2.4772, "num_input_tokens_seen": 1719664640, "step": 820 }, { "epoch": 0.17, "grad_norm": 0.94921875, "learning_rate": 2e-05, "loss": 2.4854, "num_input_tokens_seen": 1721761792, "step": 821 }, { "epoch": 0.17, "grad_norm": 0.95703125, "learning_rate": 2e-05, "loss": 2.4528, "num_input_tokens_seen": 1723858944, "step": 822 }, { "epoch": 0.17, "grad_norm": 0.9921875, "learning_rate": 2e-05, "loss": 2.4591, "num_input_tokens_seen": 1725956096, "step": 823 }, { "epoch": 0.17, "grad_norm": 0.9609375, "learning_rate": 2e-05, "loss": 2.4214, "num_input_tokens_seen": 1728053248, "step": 824 }, { "epoch": 0.17, "grad_norm": 1.140625, "learning_rate": 2e-05, "loss": 2.4467, "num_input_tokens_seen": 1730150400, "step": 825 }, { "epoch": 0.17, "grad_norm": 0.94921875, "learning_rate": 2e-05, "loss": 2.4653, "num_input_tokens_seen": 1732247552, "step": 826 }, { "epoch": 0.17, "grad_norm": 1.1875, "learning_rate": 2e-05, "loss": 2.4252, "num_input_tokens_seen": 1734344704, "step": 827 }, { "epoch": 0.17, "grad_norm": 1.1015625, "learning_rate": 2e-05, "loss": 2.4554, "num_input_tokens_seen": 1736441856, "step": 828 }, { "epoch": 0.17, "grad_norm": 1.0546875, "learning_rate": 2e-05, "loss": 2.4544, "num_input_tokens_seen": 1738539008, "step": 829 }, { "epoch": 0.17, "grad_norm": 1.15625, "learning_rate": 2e-05, "loss": 2.4676, "num_input_tokens_seen": 1740636160, "step": 830 }, { "epoch": 0.17, "grad_norm": 1.0234375, "learning_rate": 2e-05, "loss": 2.4643, "num_input_tokens_seen": 1742733312, "step": 831 }, { "epoch": 0.17, "grad_norm": 0.98046875, "learning_rate": 2e-05, "loss": 2.4789, "num_input_tokens_seen": 1744830464, "step": 832 }, { "epoch": 0.17, "grad_norm": 1.015625, "learning_rate": 2e-05, "loss": 2.4576, "num_input_tokens_seen": 1746927616, "step": 833 }, { "epoch": 0.17, "grad_norm": 0.96484375, "learning_rate": 2e-05, "loss": 2.4416, "num_input_tokens_seen": 1749024768, "step": 834 }, { "epoch": 0.18, "grad_norm": 1.09375, "learning_rate": 2e-05, "loss": 2.48, "num_input_tokens_seen": 1751121920, "step": 835 }, { "epoch": 0.18, "grad_norm": 0.92578125, "learning_rate": 2e-05, "loss": 2.4802, "num_input_tokens_seen": 1753219072, "step": 836 }, { "epoch": 0.18, "grad_norm": 1.25, "learning_rate": 2e-05, "loss": 2.4611, "num_input_tokens_seen": 1755316224, "step": 837 }, { "epoch": 0.18, "grad_norm": 0.98046875, "learning_rate": 2e-05, "loss": 2.472, "num_input_tokens_seen": 1757413376, "step": 838 }, { "epoch": 0.18, "grad_norm": 1.1640625, "learning_rate": 2e-05, "loss": 2.492, "num_input_tokens_seen": 1759510528, "step": 839 }, { "epoch": 0.18, "grad_norm": 0.9140625, "learning_rate": 2e-05, "loss": 2.4486, "num_input_tokens_seen": 1761607680, "step": 840 }, { "epoch": 0.18, "grad_norm": 0.9609375, "learning_rate": 2e-05, "loss": 2.452, "num_input_tokens_seen": 1763704832, "step": 841 }, { "epoch": 0.18, "grad_norm": 0.953125, "learning_rate": 2e-05, "loss": 2.4545, "num_input_tokens_seen": 1765801984, "step": 842 }, { "epoch": 0.18, "grad_norm": 1.046875, "learning_rate": 2e-05, "loss": 2.4087, "num_input_tokens_seen": 1767899136, "step": 843 }, { "epoch": 0.18, "grad_norm": 0.87890625, "learning_rate": 2e-05, "loss": 2.4595, "num_input_tokens_seen": 1769996288, "step": 844 }, { "epoch": 0.18, "grad_norm": 1.1015625, "learning_rate": 2e-05, "loss": 2.452, "num_input_tokens_seen": 1772093440, "step": 845 }, { "epoch": 0.18, "grad_norm": 1.0546875, "learning_rate": 2e-05, "loss": 2.4693, "num_input_tokens_seen": 1774190592, "step": 846 }, { "epoch": 0.18, "grad_norm": 0.91015625, "learning_rate": 2e-05, "loss": 2.4573, "num_input_tokens_seen": 1776287744, "step": 847 }, { "epoch": 0.18, "grad_norm": 0.95703125, "learning_rate": 2e-05, "loss": 2.4381, "num_input_tokens_seen": 1778384896, "step": 848 }, { "epoch": 0.18, "grad_norm": 0.859375, "learning_rate": 2e-05, "loss": 2.4512, "num_input_tokens_seen": 1780482048, "step": 849 }, { "epoch": 0.18, "grad_norm": 1.0625, "learning_rate": 2e-05, "loss": 2.4829, "num_input_tokens_seen": 1782579200, "step": 850 }, { "epoch": 0.18, "grad_norm": 0.7890625, "learning_rate": 2e-05, "loss": 2.453, "num_input_tokens_seen": 1784676352, "step": 851 }, { "epoch": 0.18, "grad_norm": 1.3125, "learning_rate": 2e-05, "loss": 2.4334, "num_input_tokens_seen": 1786773504, "step": 852 }, { "epoch": 0.18, "grad_norm": 1.4140625, "learning_rate": 2e-05, "loss": 2.4499, "num_input_tokens_seen": 1788870656, "step": 853 }, { "epoch": 0.18, "grad_norm": 1.015625, "learning_rate": 2e-05, "loss": 2.46, "num_input_tokens_seen": 1790967808, "step": 854 }, { "epoch": 0.18, "grad_norm": 1.2890625, "learning_rate": 2e-05, "loss": 2.4843, "num_input_tokens_seen": 1793064960, "step": 855 }, { "epoch": 0.18, "grad_norm": 1.1015625, "learning_rate": 2e-05, "loss": 2.4248, "num_input_tokens_seen": 1795162112, "step": 856 }, { "epoch": 0.18, "grad_norm": 1.03125, "learning_rate": 2e-05, "loss": 2.4235, "num_input_tokens_seen": 1797259264, "step": 857 }, { "epoch": 0.18, "grad_norm": 1.15625, "learning_rate": 2e-05, "loss": 2.4795, "num_input_tokens_seen": 1799356416, "step": 858 }, { "epoch": 0.18, "grad_norm": 0.8515625, "learning_rate": 2e-05, "loss": 2.4658, "num_input_tokens_seen": 1801453568, "step": 859 }, { "epoch": 0.18, "grad_norm": 1.1796875, "learning_rate": 2e-05, "loss": 2.4507, "num_input_tokens_seen": 1803550720, "step": 860 }, { "epoch": 0.18, "grad_norm": 1.0703125, "learning_rate": 2e-05, "loss": 2.4535, "num_input_tokens_seen": 1805647872, "step": 861 }, { "epoch": 0.18, "grad_norm": 0.98828125, "learning_rate": 2e-05, "loss": 2.457, "num_input_tokens_seen": 1807745024, "step": 862 }, { "epoch": 0.18, "grad_norm": 1.4375, "learning_rate": 2e-05, "loss": 2.4409, "num_input_tokens_seen": 1809842176, "step": 863 }, { "epoch": 0.18, "grad_norm": 1.3125, "learning_rate": 2e-05, "loss": 2.4444, "num_input_tokens_seen": 1811939328, "step": 864 }, { "epoch": 0.18, "grad_norm": 1.046875, "learning_rate": 2e-05, "loss": 2.4728, "num_input_tokens_seen": 1814036480, "step": 865 }, { "epoch": 0.18, "grad_norm": 1.2421875, "learning_rate": 2e-05, "loss": 2.4656, "num_input_tokens_seen": 1816133632, "step": 866 }, { "epoch": 0.18, "grad_norm": 1.234375, "learning_rate": 2e-05, "loss": 2.4547, "num_input_tokens_seen": 1818230784, "step": 867 }, { "epoch": 0.18, "grad_norm": 0.77734375, "learning_rate": 2e-05, "loss": 2.4452, "num_input_tokens_seen": 1820327936, "step": 868 }, { "epoch": 0.18, "grad_norm": 1.2578125, "learning_rate": 2e-05, "loss": 2.4749, "num_input_tokens_seen": 1822425088, "step": 869 }, { "epoch": 0.18, "grad_norm": 0.9609375, "learning_rate": 2e-05, "loss": 2.4474, "num_input_tokens_seen": 1824522240, "step": 870 }, { "epoch": 0.18, "grad_norm": 1.015625, "learning_rate": 2e-05, "loss": 2.4634, "num_input_tokens_seen": 1826619392, "step": 871 }, { "epoch": 0.18, "grad_norm": 1.015625, "learning_rate": 2e-05, "loss": 2.4355, "num_input_tokens_seen": 1828716544, "step": 872 }, { "epoch": 0.18, "grad_norm": 0.8828125, "learning_rate": 2e-05, "loss": 2.4704, "num_input_tokens_seen": 1830813696, "step": 873 }, { "epoch": 0.18, "grad_norm": 1.0703125, "learning_rate": 2e-05, "loss": 2.4355, "num_input_tokens_seen": 1832910848, "step": 874 }, { "epoch": 0.18, "grad_norm": 1.21875, "learning_rate": 2e-05, "loss": 2.4125, "num_input_tokens_seen": 1835008000, "step": 875 }, { "epoch": 0.18, "grad_norm": 0.79296875, "learning_rate": 2e-05, "loss": 2.4341, "num_input_tokens_seen": 1837105152, "step": 876 }, { "epoch": 0.18, "grad_norm": 1.8125, "learning_rate": 2e-05, "loss": 2.4112, "num_input_tokens_seen": 1839202304, "step": 877 }, { "epoch": 0.18, "grad_norm": 1.59375, "learning_rate": 2e-05, "loss": 2.442, "num_input_tokens_seen": 1841299456, "step": 878 }, { "epoch": 0.18, "grad_norm": 1.015625, "learning_rate": 2e-05, "loss": 2.4334, "num_input_tokens_seen": 1843396608, "step": 879 }, { "epoch": 0.18, "grad_norm": 0.9765625, "learning_rate": 2e-05, "loss": 2.4545, "num_input_tokens_seen": 1845493760, "step": 880 }, { "epoch": 0.18, "grad_norm": 1.1796875, "learning_rate": 2e-05, "loss": 2.4182, "num_input_tokens_seen": 1847590912, "step": 881 }, { "epoch": 0.18, "grad_norm": 1.171875, "learning_rate": 2e-05, "loss": 2.4012, "num_input_tokens_seen": 1849688064, "step": 882 }, { "epoch": 0.19, "grad_norm": 0.9453125, "learning_rate": 2e-05, "loss": 2.4413, "num_input_tokens_seen": 1851785216, "step": 883 }, { "epoch": 0.19, "grad_norm": 1.171875, "learning_rate": 2e-05, "loss": 2.4417, "num_input_tokens_seen": 1853882368, "step": 884 }, { "epoch": 0.19, "grad_norm": 1.0390625, "learning_rate": 2e-05, "loss": 2.4323, "num_input_tokens_seen": 1855979520, "step": 885 }, { "epoch": 0.19, "grad_norm": 1.1328125, "learning_rate": 2e-05, "loss": 2.4185, "num_input_tokens_seen": 1858076672, "step": 886 }, { "epoch": 0.19, "grad_norm": 0.9296875, "learning_rate": 2e-05, "loss": 2.4407, "num_input_tokens_seen": 1860173824, "step": 887 }, { "epoch": 0.19, "grad_norm": 0.87109375, "learning_rate": 2e-05, "loss": 2.4701, "num_input_tokens_seen": 1862270976, "step": 888 }, { "epoch": 0.19, "grad_norm": 1.1015625, "learning_rate": 2e-05, "loss": 2.4231, "num_input_tokens_seen": 1864368128, "step": 889 }, { "epoch": 0.19, "grad_norm": 0.90234375, "learning_rate": 2e-05, "loss": 2.4478, "num_input_tokens_seen": 1866465280, "step": 890 }, { "epoch": 0.19, "grad_norm": 0.94140625, "learning_rate": 2e-05, "loss": 2.4009, "num_input_tokens_seen": 1868562432, "step": 891 }, { "epoch": 0.19, "grad_norm": 1.2421875, "learning_rate": 2e-05, "loss": 2.4281, "num_input_tokens_seen": 1870659584, "step": 892 }, { "epoch": 0.19, "grad_norm": 1.09375, "learning_rate": 2e-05, "loss": 2.4311, "num_input_tokens_seen": 1872756736, "step": 893 }, { "epoch": 0.19, "grad_norm": 1.1328125, "learning_rate": 2e-05, "loss": 2.4418, "num_input_tokens_seen": 1874853888, "step": 894 }, { "epoch": 0.19, "grad_norm": 1.1953125, "learning_rate": 2e-05, "loss": 2.4156, "num_input_tokens_seen": 1876951040, "step": 895 }, { "epoch": 0.19, "grad_norm": 1.1484375, "learning_rate": 2e-05, "loss": 2.4254, "num_input_tokens_seen": 1879048192, "step": 896 }, { "epoch": 0.19, "grad_norm": 0.86328125, "learning_rate": 2e-05, "loss": 2.4424, "num_input_tokens_seen": 1881145344, "step": 897 }, { "epoch": 0.19, "grad_norm": 1.140625, "learning_rate": 2e-05, "loss": 2.4737, "num_input_tokens_seen": 1883242496, "step": 898 }, { "epoch": 0.19, "grad_norm": 0.90625, "learning_rate": 2e-05, "loss": 2.4374, "num_input_tokens_seen": 1885339648, "step": 899 }, { "epoch": 0.19, "grad_norm": 0.88671875, "learning_rate": 2e-05, "loss": 2.409, "num_input_tokens_seen": 1887436800, "step": 900 }, { "epoch": 0.0, "grad_norm": 0.953125, "learning_rate": 2e-05, "loss": 2.4298, "num_input_tokens_seen": 1889533952, "step": 901 }, { "epoch": 0.0, "grad_norm": 0.875, "learning_rate": 2e-05, "loss": 2.4122, "num_input_tokens_seen": 1891631104, "step": 902 }, { "epoch": 0.0, "grad_norm": 0.9296875, "learning_rate": 2e-05, "loss": 2.4203, "num_input_tokens_seen": 1893728256, "step": 903 }, { "epoch": 0.0, "grad_norm": 1.0390625, "learning_rate": 2e-05, "loss": 2.4853, "num_input_tokens_seen": 1895825408, "step": 904 }, { "epoch": 0.0, "grad_norm": 0.8515625, "learning_rate": 2e-05, "loss": 2.4303, "num_input_tokens_seen": 1897922560, "step": 905 }, { "epoch": 0.0, "grad_norm": 0.96875, "learning_rate": 2e-05, "loss": 2.4303, "num_input_tokens_seen": 1900019712, "step": 906 }, { "epoch": 0.0, "grad_norm": 1.296875, "learning_rate": 2e-05, "loss": 2.4605, "num_input_tokens_seen": 1902116864, "step": 907 }, { "epoch": 0.0, "grad_norm": 1.15625, "learning_rate": 2e-05, "loss": 2.4464, "num_input_tokens_seen": 1904214016, "step": 908 }, { "epoch": 0.0, "grad_norm": 1.2421875, "learning_rate": 2e-05, "loss": 2.4673, "num_input_tokens_seen": 1906311168, "step": 909 }, { "epoch": 0.0, "grad_norm": 1.0625, "learning_rate": 2e-05, "loss": 2.4465, "num_input_tokens_seen": 1908408320, "step": 910 }, { "epoch": 0.0, "grad_norm": 0.91796875, "learning_rate": 2e-05, "loss": 2.4285, "num_input_tokens_seen": 1910505472, "step": 911 }, { "epoch": 0.0, "grad_norm": 0.9609375, "learning_rate": 2e-05, "loss": 2.4267, "num_input_tokens_seen": 1912602624, "step": 912 }, { "epoch": 0.0, "grad_norm": 0.7890625, "learning_rate": 2e-05, "loss": 2.4029, "num_input_tokens_seen": 1914699776, "step": 913 }, { "epoch": 0.0, "grad_norm": 0.88671875, "learning_rate": 2e-05, "loss": 2.4503, "num_input_tokens_seen": 1916796928, "step": 914 }, { "epoch": 0.0, "grad_norm": 0.78125, "learning_rate": 2e-05, "loss": 2.4261, "num_input_tokens_seen": 1918894080, "step": 915 }, { "epoch": 0.0, "grad_norm": 0.94921875, "learning_rate": 2e-05, "loss": 2.4751, "num_input_tokens_seen": 1920991232, "step": 916 }, { "epoch": 0.0, "grad_norm": 0.78125, "learning_rate": 2e-05, "loss": 2.4031, "num_input_tokens_seen": 1923088384, "step": 917 }, { "epoch": 0.0, "grad_norm": 0.890625, "learning_rate": 2e-05, "loss": 2.4036, "num_input_tokens_seen": 1925185536, "step": 918 }, { "epoch": 0.0, "grad_norm": 1.0625, "learning_rate": 2e-05, "loss": 2.4391, "num_input_tokens_seen": 1927282688, "step": 919 }, { "epoch": 0.0, "grad_norm": 0.9609375, "learning_rate": 2e-05, "loss": 2.4295, "num_input_tokens_seen": 1929379840, "step": 920 }, { "epoch": 0.0, "grad_norm": 0.94140625, "learning_rate": 2e-05, "loss": 2.4197, "num_input_tokens_seen": 1931476992, "step": 921 }, { "epoch": 0.0, "grad_norm": 0.92578125, "learning_rate": 2e-05, "loss": 2.4067, "num_input_tokens_seen": 1933574144, "step": 922 }, { "epoch": 0.0, "grad_norm": 0.875, "learning_rate": 2e-05, "loss": 2.4255, "num_input_tokens_seen": 1935671296, "step": 923 }, { "epoch": 0.01, "grad_norm": 0.8125, "learning_rate": 2e-05, "loss": 2.421, "num_input_tokens_seen": 1937768448, "step": 924 }, { "epoch": 0.01, "grad_norm": 0.96484375, "learning_rate": 2e-05, "loss": 2.4383, "num_input_tokens_seen": 1939865600, "step": 925 }, { "epoch": 0.01, "grad_norm": 0.8828125, "learning_rate": 2e-05, "loss": 2.3856, "num_input_tokens_seen": 1941962752, "step": 926 }, { "epoch": 0.01, "grad_norm": 0.875, "learning_rate": 2e-05, "loss": 2.4388, "num_input_tokens_seen": 1944059904, "step": 927 }, { "epoch": 0.01, "grad_norm": 1.296875, "learning_rate": 2e-05, "loss": 2.4253, "num_input_tokens_seen": 1946157056, "step": 928 }, { "epoch": 0.01, "grad_norm": 1.171875, "learning_rate": 2e-05, "loss": 2.4118, "num_input_tokens_seen": 1948254208, "step": 929 }, { "epoch": 0.01, "grad_norm": 0.875, "learning_rate": 2e-05, "loss": 2.4621, "num_input_tokens_seen": 1950351360, "step": 930 }, { "epoch": 0.01, "grad_norm": 1.09375, "learning_rate": 2e-05, "loss": 2.4353, "num_input_tokens_seen": 1952448512, "step": 931 }, { "epoch": 0.01, "grad_norm": 0.83984375, "learning_rate": 2e-05, "loss": 2.4226, "num_input_tokens_seen": 1954545664, "step": 932 }, { "epoch": 0.01, "grad_norm": 0.78125, "learning_rate": 2e-05, "loss": 2.4451, "num_input_tokens_seen": 1956642816, "step": 933 }, { "epoch": 0.01, "grad_norm": 0.87109375, "learning_rate": 2e-05, "loss": 2.4228, "num_input_tokens_seen": 1958739968, "step": 934 }, { "epoch": 0.01, "grad_norm": 0.75390625, "learning_rate": 2e-05, "loss": 2.433, "num_input_tokens_seen": 1960837120, "step": 935 }, { "epoch": 0.01, "grad_norm": 0.77734375, "learning_rate": 2e-05, "loss": 2.4262, "num_input_tokens_seen": 1962934272, "step": 936 }, { "epoch": 0.01, "grad_norm": 0.93359375, "learning_rate": 2e-05, "loss": 2.4121, "num_input_tokens_seen": 1965031424, "step": 937 }, { "epoch": 0.01, "grad_norm": 0.7890625, "learning_rate": 2e-05, "loss": 2.449, "num_input_tokens_seen": 1967128576, "step": 938 }, { "epoch": 0.01, "grad_norm": 0.89453125, "learning_rate": 2e-05, "loss": 2.4098, "num_input_tokens_seen": 1969225728, "step": 939 }, { "epoch": 0.01, "grad_norm": 0.7578125, "learning_rate": 2e-05, "loss": 2.4141, "num_input_tokens_seen": 1971322880, "step": 940 }, { "epoch": 0.01, "grad_norm": 0.81640625, "learning_rate": 2e-05, "loss": 2.4295, "num_input_tokens_seen": 1973420032, "step": 941 }, { "epoch": 0.01, "grad_norm": 0.92578125, "learning_rate": 2e-05, "loss": 2.4309, "num_input_tokens_seen": 1975517184, "step": 942 }, { "epoch": 0.01, "grad_norm": 0.9296875, "learning_rate": 2e-05, "loss": 2.4348, "num_input_tokens_seen": 1977614336, "step": 943 }, { "epoch": 0.01, "grad_norm": 0.9453125, "learning_rate": 2e-05, "loss": 2.4063, "num_input_tokens_seen": 1979711488, "step": 944 }, { "epoch": 0.01, "grad_norm": 1.09375, "learning_rate": 2e-05, "loss": 2.4389, "num_input_tokens_seen": 1981808640, "step": 945 }, { "epoch": 0.01, "grad_norm": 0.80078125, "learning_rate": 2e-05, "loss": 2.404, "num_input_tokens_seen": 1983905792, "step": 946 }, { "epoch": 0.01, "grad_norm": 1.25, "learning_rate": 2e-05, "loss": 2.4343, "num_input_tokens_seen": 1986002944, "step": 947 }, { "epoch": 0.01, "grad_norm": 0.96484375, "learning_rate": 2e-05, "loss": 2.4231, "num_input_tokens_seen": 1988100096, "step": 948 }, { "epoch": 0.01, "grad_norm": 0.8359375, "learning_rate": 2e-05, "loss": 2.4037, "num_input_tokens_seen": 1990197248, "step": 949 }, { "epoch": 0.01, "grad_norm": 1.109375, "learning_rate": 2e-05, "loss": 2.4282, "num_input_tokens_seen": 1992294400, "step": 950 }, { "epoch": 0.01, "grad_norm": 0.8046875, "learning_rate": 2e-05, "loss": 2.3873, "num_input_tokens_seen": 1994391552, "step": 951 }, { "epoch": 0.01, "grad_norm": 1.0546875, "learning_rate": 2e-05, "loss": 2.4168, "num_input_tokens_seen": 1996488704, "step": 952 }, { "epoch": 0.01, "grad_norm": 0.95703125, "learning_rate": 2e-05, "loss": 2.379, "num_input_tokens_seen": 1998585856, "step": 953 }, { "epoch": 0.01, "grad_norm": 0.91015625, "learning_rate": 2e-05, "loss": 2.4274, "num_input_tokens_seen": 2000683008, "step": 954 }, { "epoch": 0.01, "grad_norm": 1.1015625, "learning_rate": 2e-05, "loss": 2.4256, "num_input_tokens_seen": 2002780160, "step": 955 }, { "epoch": 0.01, "grad_norm": 0.8671875, "learning_rate": 2e-05, "loss": 2.3978, "num_input_tokens_seen": 2004877312, "step": 956 }, { "epoch": 0.01, "grad_norm": 1.1796875, "learning_rate": 2e-05, "loss": 2.436, "num_input_tokens_seen": 2006974464, "step": 957 }, { "epoch": 0.01, "grad_norm": 0.93359375, "learning_rate": 2e-05, "loss": 2.4034, "num_input_tokens_seen": 2009071616, "step": 958 }, { "epoch": 0.01, "grad_norm": 0.8203125, "learning_rate": 2e-05, "loss": 2.4084, "num_input_tokens_seen": 2011168768, "step": 959 }, { "epoch": 0.01, "grad_norm": 0.8671875, "learning_rate": 2e-05, "loss": 2.4185, "num_input_tokens_seen": 2013265920, "step": 960 }, { "epoch": 0.01, "grad_norm": 0.875, "learning_rate": 2e-05, "loss": 2.4243, "num_input_tokens_seen": 2015363072, "step": 961 }, { "epoch": 0.01, "grad_norm": 0.89453125, "learning_rate": 2e-05, "loss": 2.4175, "num_input_tokens_seen": 2017460224, "step": 962 }, { "epoch": 0.01, "grad_norm": 0.85546875, "learning_rate": 2e-05, "loss": 2.4205, "num_input_tokens_seen": 2019557376, "step": 963 }, { "epoch": 0.01, "grad_norm": 0.8984375, "learning_rate": 2e-05, "loss": 2.4379, "num_input_tokens_seen": 2021654528, "step": 964 }, { "epoch": 0.01, "grad_norm": 0.8984375, "learning_rate": 2e-05, "loss": 2.3875, "num_input_tokens_seen": 2023751680, "step": 965 }, { "epoch": 0.01, "grad_norm": 0.91015625, "learning_rate": 2e-05, "loss": 2.3922, "num_input_tokens_seen": 2025848832, "step": 966 }, { "epoch": 0.01, "grad_norm": 0.9140625, "learning_rate": 2e-05, "loss": 2.3979, "num_input_tokens_seen": 2027945984, "step": 967 }, { "epoch": 0.01, "grad_norm": 0.84765625, "learning_rate": 2e-05, "loss": 2.4118, "num_input_tokens_seen": 2030043136, "step": 968 }, { "epoch": 0.01, "grad_norm": 0.890625, "learning_rate": 2e-05, "loss": 2.4504, "num_input_tokens_seen": 2032140288, "step": 969 }, { "epoch": 0.01, "grad_norm": 0.8671875, "learning_rate": 2e-05, "loss": 2.4206, "num_input_tokens_seen": 2034237440, "step": 970 }, { "epoch": 0.01, "grad_norm": 0.93359375, "learning_rate": 2e-05, "loss": 2.4255, "num_input_tokens_seen": 2036334592, "step": 971 }, { "epoch": 0.02, "grad_norm": 0.77734375, "learning_rate": 2e-05, "loss": 2.4186, "num_input_tokens_seen": 2038431744, "step": 972 }, { "epoch": 0.02, "grad_norm": 0.890625, "learning_rate": 2e-05, "loss": 2.4145, "num_input_tokens_seen": 2040528896, "step": 973 }, { "epoch": 0.02, "grad_norm": 0.84765625, "learning_rate": 2e-05, "loss": 2.4348, "num_input_tokens_seen": 2042626048, "step": 974 }, { "epoch": 0.02, "grad_norm": 0.83984375, "learning_rate": 2e-05, "loss": 2.4064, "num_input_tokens_seen": 2044723200, "step": 975 }, { "epoch": 0.02, "grad_norm": 0.828125, "learning_rate": 2e-05, "loss": 2.4116, "num_input_tokens_seen": 2046820352, "step": 976 }, { "epoch": 0.02, "grad_norm": 0.7890625, "learning_rate": 2e-05, "loss": 2.4249, "num_input_tokens_seen": 2048917504, "step": 977 }, { "epoch": 0.02, "grad_norm": 0.8046875, "learning_rate": 2e-05, "loss": 2.4131, "num_input_tokens_seen": 2051014656, "step": 978 }, { "epoch": 0.02, "grad_norm": 0.84765625, "learning_rate": 2e-05, "loss": 2.4183, "num_input_tokens_seen": 2053111808, "step": 979 }, { "epoch": 0.02, "grad_norm": 0.87890625, "learning_rate": 2e-05, "loss": 2.4368, "num_input_tokens_seen": 2055208960, "step": 980 }, { "epoch": 0.02, "grad_norm": 0.8359375, "learning_rate": 2e-05, "loss": 2.3713, "num_input_tokens_seen": 2057306112, "step": 981 }, { "epoch": 0.02, "grad_norm": 0.828125, "learning_rate": 2e-05, "loss": 2.4042, "num_input_tokens_seen": 2059403264, "step": 982 }, { "epoch": 0.02, "grad_norm": 0.921875, "learning_rate": 2e-05, "loss": 2.4134, "num_input_tokens_seen": 2061500416, "step": 983 }, { "epoch": 0.02, "grad_norm": 0.8515625, "learning_rate": 2e-05, "loss": 2.4265, "num_input_tokens_seen": 2063597568, "step": 984 }, { "epoch": 0.02, "grad_norm": 1.1328125, "learning_rate": 2e-05, "loss": 2.4184, "num_input_tokens_seen": 2065694720, "step": 985 }, { "epoch": 0.02, "grad_norm": 0.94140625, "learning_rate": 2e-05, "loss": 2.4253, "num_input_tokens_seen": 2067791872, "step": 986 }, { "epoch": 0.02, "grad_norm": 1.0703125, "learning_rate": 2e-05, "loss": 2.3763, "num_input_tokens_seen": 2069889024, "step": 987 }, { "epoch": 0.02, "grad_norm": 1.1171875, "learning_rate": 2e-05, "loss": 2.4447, "num_input_tokens_seen": 2071986176, "step": 988 }, { "epoch": 0.02, "grad_norm": 0.84765625, "learning_rate": 2e-05, "loss": 2.4417, "num_input_tokens_seen": 2074083328, "step": 989 }, { "epoch": 0.02, "grad_norm": 1.203125, "learning_rate": 2e-05, "loss": 2.3975, "num_input_tokens_seen": 2076180480, "step": 990 }, { "epoch": 0.02, "grad_norm": 1.0703125, "learning_rate": 2e-05, "loss": 2.4077, "num_input_tokens_seen": 2078277632, "step": 991 }, { "epoch": 0.02, "grad_norm": 0.875, "learning_rate": 2e-05, "loss": 2.4241, "num_input_tokens_seen": 2080374784, "step": 992 }, { "epoch": 0.02, "grad_norm": 1.5390625, "learning_rate": 2e-05, "loss": 2.4241, "num_input_tokens_seen": 2082471936, "step": 993 }, { "epoch": 0.02, "grad_norm": 1.4765625, "learning_rate": 2e-05, "loss": 2.4238, "num_input_tokens_seen": 2084569088, "step": 994 }, { "epoch": 0.02, "grad_norm": 0.953125, "learning_rate": 2e-05, "loss": 2.3792, "num_input_tokens_seen": 2086666240, "step": 995 }, { "epoch": 0.02, "eval_loss": 2.4099812507629395, "eval_runtime": 2026.4024, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.487, "num_input_tokens_seen": 2086666240, "step": 995 }, { "epoch": 0.02, "grad_norm": 1.421875, "learning_rate": 2e-05, "loss": 2.3921, "num_input_tokens_seen": 2088763392, "step": 996 }, { "epoch": 0.02, "grad_norm": 1.4375, "learning_rate": 2e-05, "loss": 2.3957, "num_input_tokens_seen": 2090860544, "step": 997 }, { "epoch": 0.02, "grad_norm": 1.078125, "learning_rate": 2e-05, "loss": 2.3755, "num_input_tokens_seen": 2092957696, "step": 998 }, { "epoch": 0.02, "grad_norm": 1.1328125, "learning_rate": 2e-05, "loss": 2.406, "num_input_tokens_seen": 2095054848, "step": 999 }, { "epoch": 0.02, "grad_norm": 1.15625, "learning_rate": 2e-05, "loss": 2.3788, "num_input_tokens_seen": 2097152000, "step": 1000 }, { "epoch": 0.02, "grad_norm": 0.8515625, "learning_rate": 2e-05, "loss": 2.395, "num_input_tokens_seen": 2099249152, "step": 1001 }, { "epoch": 0.02, "grad_norm": 0.83984375, "learning_rate": 2e-05, "loss": 2.3892, "num_input_tokens_seen": 2101346304, "step": 1002 }, { "epoch": 0.02, "grad_norm": 1.0390625, "learning_rate": 2e-05, "loss": 2.3921, "num_input_tokens_seen": 2103443456, "step": 1003 }, { "epoch": 0.02, "grad_norm": 0.8984375, "learning_rate": 2e-05, "loss": 2.4217, "num_input_tokens_seen": 2105540608, "step": 1004 }, { "epoch": 0.02, "grad_norm": 0.8359375, "learning_rate": 2e-05, "loss": 2.418, "num_input_tokens_seen": 2107637760, "step": 1005 }, { "epoch": 0.02, "grad_norm": 1.0390625, "learning_rate": 2e-05, "loss": 2.4269, "num_input_tokens_seen": 2109734912, "step": 1006 }, { "epoch": 0.02, "grad_norm": 0.94140625, "learning_rate": 2e-05, "loss": 2.4026, "num_input_tokens_seen": 2111832064, "step": 1007 }, { "epoch": 0.02, "grad_norm": 0.9375, "learning_rate": 2e-05, "loss": 2.3926, "num_input_tokens_seen": 2113929216, "step": 1008 }, { "epoch": 0.02, "grad_norm": 1.2109375, "learning_rate": 2e-05, "loss": 2.4017, "num_input_tokens_seen": 2116026368, "step": 1009 }, { "epoch": 0.02, "grad_norm": 0.98046875, "learning_rate": 2e-05, "loss": 2.4085, "num_input_tokens_seen": 2118123520, "step": 1010 }, { "epoch": 0.02, "grad_norm": 1.0078125, "learning_rate": 2e-05, "loss": 2.4296, "num_input_tokens_seen": 2120220672, "step": 1011 }, { "epoch": 0.02, "grad_norm": 1.421875, "learning_rate": 2e-05, "loss": 2.3999, "num_input_tokens_seen": 2122317824, "step": 1012 }, { "epoch": 0.02, "grad_norm": 1.15625, "learning_rate": 2e-05, "loss": 2.4103, "num_input_tokens_seen": 2124414976, "step": 1013 }, { "epoch": 0.02, "grad_norm": 1.25, "learning_rate": 2e-05, "loss": 2.379, "num_input_tokens_seen": 2126512128, "step": 1014 }, { "epoch": 0.02, "grad_norm": 1.78125, "learning_rate": 2e-05, "loss": 2.4024, "num_input_tokens_seen": 2128609280, "step": 1015 }, { "epoch": 0.02, "grad_norm": 1.9453125, "learning_rate": 2e-05, "loss": 2.4079, "num_input_tokens_seen": 2130706432, "step": 1016 }, { "epoch": 0.02, "grad_norm": 1.078125, "learning_rate": 2e-05, "loss": 2.396, "num_input_tokens_seen": 2132803584, "step": 1017 }, { "epoch": 0.02, "grad_norm": 1.125, "learning_rate": 2e-05, "loss": 2.4098, "num_input_tokens_seen": 2134900736, "step": 1018 }, { "epoch": 0.02, "grad_norm": 1.4921875, "learning_rate": 2e-05, "loss": 2.4249, "num_input_tokens_seen": 2136997888, "step": 1019 }, { "epoch": 0.03, "grad_norm": 1.1796875, "learning_rate": 2e-05, "loss": 2.4094, "num_input_tokens_seen": 2139095040, "step": 1020 }, { "epoch": 0.03, "grad_norm": 1.4140625, "learning_rate": 2e-05, "loss": 2.4284, "num_input_tokens_seen": 2141192192, "step": 1021 }, { "epoch": 0.03, "grad_norm": 1.1953125, "learning_rate": 2e-05, "loss": 2.3914, "num_input_tokens_seen": 2143289344, "step": 1022 }, { "epoch": 0.03, "grad_norm": 1.15625, "learning_rate": 2e-05, "loss": 2.4305, "num_input_tokens_seen": 2145386496, "step": 1023 }, { "epoch": 0.03, "grad_norm": 7.34375, "learning_rate": 2e-05, "loss": 2.4282, "num_input_tokens_seen": 2147483648, "step": 1024 }, { "epoch": 0.03, "grad_norm": 2.203125, "learning_rate": 2e-05, "loss": 2.3861, "num_input_tokens_seen": 2149580800, "step": 1025 }, { "epoch": 0.03, "grad_norm": 1.65625, "learning_rate": 2e-05, "loss": 2.3983, "num_input_tokens_seen": 2151677952, "step": 1026 }, { "epoch": 0.03, "grad_norm": 1.5859375, "learning_rate": 2e-05, "loss": 2.4085, "num_input_tokens_seen": 2153775104, "step": 1027 }, { "epoch": 0.03, "grad_norm": 1.140625, "learning_rate": 2e-05, "loss": 2.3947, "num_input_tokens_seen": 2155872256, "step": 1028 }, { "epoch": 0.03, "grad_norm": 2.0625, "learning_rate": 2e-05, "loss": 2.3926, "num_input_tokens_seen": 2157969408, "step": 1029 }, { "epoch": 0.03, "grad_norm": 1.3671875, "learning_rate": 2e-05, "loss": 2.407, "num_input_tokens_seen": 2160066560, "step": 1030 }, { "epoch": 0.03, "grad_norm": 2.8125, "learning_rate": 2e-05, "loss": 2.3723, "num_input_tokens_seen": 2162163712, "step": 1031 }, { "epoch": 0.03, "grad_norm": 2.75, "learning_rate": 2e-05, "loss": 2.394, "num_input_tokens_seen": 2164260864, "step": 1032 }, { "epoch": 0.03, "grad_norm": 2.8125, "learning_rate": 2e-05, "loss": 2.4265, "num_input_tokens_seen": 2166358016, "step": 1033 }, { "epoch": 0.03, "grad_norm": 2.765625, "learning_rate": 2e-05, "loss": 2.4274, "num_input_tokens_seen": 2168455168, "step": 1034 }, { "epoch": 0.03, "grad_norm": 1.078125, "learning_rate": 2e-05, "loss": 2.4115, "num_input_tokens_seen": 2170552320, "step": 1035 }, { "epoch": 0.03, "grad_norm": 1.5, "learning_rate": 2e-05, "loss": 2.409, "num_input_tokens_seen": 2172649472, "step": 1036 }, { "epoch": 0.03, "grad_norm": 1.484375, "learning_rate": 2e-05, "loss": 2.4012, "num_input_tokens_seen": 2174746624, "step": 1037 }, { "epoch": 0.03, "grad_norm": 1.2421875, "learning_rate": 2e-05, "loss": 2.3897, "num_input_tokens_seen": 2176843776, "step": 1038 }, { "epoch": 0.03, "grad_norm": 1.1953125, "learning_rate": 2e-05, "loss": 2.3675, "num_input_tokens_seen": 2178940928, "step": 1039 }, { "epoch": 0.03, "grad_norm": 1.3203125, "learning_rate": 2e-05, "loss": 2.3848, "num_input_tokens_seen": 2181038080, "step": 1040 }, { "epoch": 0.03, "grad_norm": 1.234375, "learning_rate": 2e-05, "loss": 2.4051, "num_input_tokens_seen": 2183135232, "step": 1041 }, { "epoch": 0.03, "grad_norm": 1.1640625, "learning_rate": 2e-05, "loss": 2.3864, "num_input_tokens_seen": 2185232384, "step": 1042 }, { "epoch": 0.03, "grad_norm": 1.2890625, "learning_rate": 2e-05, "loss": 2.4005, "num_input_tokens_seen": 2187329536, "step": 1043 }, { "epoch": 0.03, "grad_norm": 1.03125, "learning_rate": 2e-05, "loss": 2.4049, "num_input_tokens_seen": 2189426688, "step": 1044 }, { "epoch": 0.03, "grad_norm": 1.3046875, "learning_rate": 2e-05, "loss": 2.378, "num_input_tokens_seen": 2191523840, "step": 1045 }, { "epoch": 0.03, "grad_norm": 0.9765625, "learning_rate": 2e-05, "loss": 2.3973, "num_input_tokens_seen": 2193620992, "step": 1046 }, { "epoch": 0.03, "grad_norm": 1.3125, "learning_rate": 2e-05, "loss": 2.3868, "num_input_tokens_seen": 2195718144, "step": 1047 }, { "epoch": 0.03, "grad_norm": 0.9375, "learning_rate": 2e-05, "loss": 2.4234, "num_input_tokens_seen": 2197815296, "step": 1048 }, { "epoch": 0.03, "grad_norm": 1.234375, "learning_rate": 2e-05, "loss": 2.3843, "num_input_tokens_seen": 2199912448, "step": 1049 }, { "epoch": 0.03, "grad_norm": 1.046875, "learning_rate": 2e-05, "loss": 2.4011, "num_input_tokens_seen": 2202009600, "step": 1050 }, { "epoch": 0.03, "grad_norm": 1.1328125, "learning_rate": 2e-05, "loss": 2.3929, "num_input_tokens_seen": 2204106752, "step": 1051 }, { "epoch": 0.03, "grad_norm": 1.0703125, "learning_rate": 2e-05, "loss": 2.411, "num_input_tokens_seen": 2206203904, "step": 1052 }, { "epoch": 0.03, "grad_norm": 1.078125, "learning_rate": 2e-05, "loss": 2.3944, "num_input_tokens_seen": 2208301056, "step": 1053 }, { "epoch": 0.03, "grad_norm": 0.9296875, "learning_rate": 2e-05, "loss": 2.3961, "num_input_tokens_seen": 2210398208, "step": 1054 }, { "epoch": 0.03, "grad_norm": 1.1953125, "learning_rate": 2e-05, "loss": 2.4035, "num_input_tokens_seen": 2212495360, "step": 1055 }, { "epoch": 0.03, "grad_norm": 0.9140625, "learning_rate": 2e-05, "loss": 2.3996, "num_input_tokens_seen": 2214592512, "step": 1056 }, { "epoch": 0.03, "grad_norm": 1.4296875, "learning_rate": 2e-05, "loss": 2.3795, "num_input_tokens_seen": 2216689664, "step": 1057 }, { "epoch": 0.03, "grad_norm": 1.2421875, "learning_rate": 2e-05, "loss": 2.4011, "num_input_tokens_seen": 2218786816, "step": 1058 }, { "epoch": 0.03, "grad_norm": 0.97265625, "learning_rate": 2e-05, "loss": 2.4035, "num_input_tokens_seen": 2220883968, "step": 1059 }, { "epoch": 0.03, "grad_norm": 1.109375, "learning_rate": 2e-05, "loss": 2.3736, "num_input_tokens_seen": 2222981120, "step": 1060 }, { "epoch": 0.03, "grad_norm": 0.9453125, "learning_rate": 2e-05, "loss": 2.3902, "num_input_tokens_seen": 2225078272, "step": 1061 }, { "epoch": 0.03, "grad_norm": 1.0859375, "learning_rate": 2e-05, "loss": 2.4029, "num_input_tokens_seen": 2227175424, "step": 1062 }, { "epoch": 0.03, "grad_norm": 1.015625, "learning_rate": 2e-05, "loss": 2.3827, "num_input_tokens_seen": 2229272576, "step": 1063 }, { "epoch": 0.03, "grad_norm": 0.83203125, "learning_rate": 2e-05, "loss": 2.3872, "num_input_tokens_seen": 2231369728, "step": 1064 }, { "epoch": 0.03, "grad_norm": 1.046875, "learning_rate": 2e-05, "loss": 2.4143, "num_input_tokens_seen": 2233466880, "step": 1065 }, { "epoch": 0.03, "grad_norm": 0.91796875, "learning_rate": 2e-05, "loss": 2.4009, "num_input_tokens_seen": 2235564032, "step": 1066 }, { "epoch": 0.04, "grad_norm": 0.94921875, "learning_rate": 2e-05, "loss": 2.4237, "num_input_tokens_seen": 2237661184, "step": 1067 }, { "epoch": 0.04, "grad_norm": 1.0546875, "learning_rate": 2e-05, "loss": 2.4082, "num_input_tokens_seen": 2239758336, "step": 1068 }, { "epoch": 0.04, "grad_norm": 0.93359375, "learning_rate": 2e-05, "loss": 2.3918, "num_input_tokens_seen": 2241855488, "step": 1069 }, { "epoch": 0.04, "grad_norm": 0.90234375, "learning_rate": 2e-05, "loss": 2.3951, "num_input_tokens_seen": 2243952640, "step": 1070 }, { "epoch": 0.04, "grad_norm": 0.87890625, "learning_rate": 2e-05, "loss": 2.4266, "num_input_tokens_seen": 2246049792, "step": 1071 }, { "epoch": 0.04, "grad_norm": 1.03125, "learning_rate": 2e-05, "loss": 2.3887, "num_input_tokens_seen": 2248146944, "step": 1072 }, { "epoch": 0.04, "grad_norm": 1.015625, "learning_rate": 2e-05, "loss": 2.407, "num_input_tokens_seen": 2250244096, "step": 1073 }, { "epoch": 0.04, "grad_norm": 0.7734375, "learning_rate": 2e-05, "loss": 2.4165, "num_input_tokens_seen": 2252341248, "step": 1074 }, { "epoch": 0.04, "grad_norm": 1.3203125, "learning_rate": 2e-05, "loss": 2.4115, "num_input_tokens_seen": 2254438400, "step": 1075 }, { "epoch": 0.04, "grad_norm": 1.1015625, "learning_rate": 2e-05, "loss": 2.4064, "num_input_tokens_seen": 2256535552, "step": 1076 }, { "epoch": 0.04, "grad_norm": 0.98046875, "learning_rate": 2e-05, "loss": 2.4055, "num_input_tokens_seen": 2258632704, "step": 1077 }, { "epoch": 0.04, "grad_norm": 1.0234375, "learning_rate": 2e-05, "loss": 2.3839, "num_input_tokens_seen": 2260729856, "step": 1078 }, { "epoch": 0.04, "grad_norm": 0.80859375, "learning_rate": 2e-05, "loss": 2.3689, "num_input_tokens_seen": 2262827008, "step": 1079 }, { "epoch": 0.04, "grad_norm": 0.8046875, "learning_rate": 2e-05, "loss": 2.3904, "num_input_tokens_seen": 2264924160, "step": 1080 }, { "epoch": 0.04, "grad_norm": 0.81640625, "learning_rate": 2e-05, "loss": 2.3937, "num_input_tokens_seen": 2267021312, "step": 1081 }, { "epoch": 0.04, "grad_norm": 0.8125, "learning_rate": 2e-05, "loss": 2.394, "num_input_tokens_seen": 2269118464, "step": 1082 }, { "epoch": 0.04, "grad_norm": 0.83984375, "learning_rate": 2e-05, "loss": 2.3519, "num_input_tokens_seen": 2271215616, "step": 1083 }, { "epoch": 0.04, "grad_norm": 0.88671875, "learning_rate": 2e-05, "loss": 2.4015, "num_input_tokens_seen": 2273312768, "step": 1084 }, { "epoch": 0.04, "grad_norm": 0.8515625, "learning_rate": 2e-05, "loss": 2.4102, "num_input_tokens_seen": 2275409920, "step": 1085 }, { "epoch": 0.04, "grad_norm": 0.8515625, "learning_rate": 2e-05, "loss": 2.3982, "num_input_tokens_seen": 2277507072, "step": 1086 }, { "epoch": 0.04, "grad_norm": 0.98828125, "learning_rate": 2e-05, "loss": 2.3755, "num_input_tokens_seen": 2279604224, "step": 1087 }, { "epoch": 0.04, "grad_norm": 0.703125, "learning_rate": 2e-05, "loss": 2.3891, "num_input_tokens_seen": 2281701376, "step": 1088 }, { "epoch": 0.04, "grad_norm": 0.8671875, "learning_rate": 2e-05, "loss": 2.3752, "num_input_tokens_seen": 2283798528, "step": 1089 }, { "epoch": 0.04, "grad_norm": 0.8203125, "learning_rate": 2e-05, "loss": 2.4111, "num_input_tokens_seen": 2285895680, "step": 1090 }, { "epoch": 0.04, "grad_norm": 0.7421875, "learning_rate": 2e-05, "loss": 2.3601, "num_input_tokens_seen": 2287992832, "step": 1091 }, { "epoch": 0.04, "grad_norm": 0.7734375, "learning_rate": 2e-05, "loss": 2.3638, "num_input_tokens_seen": 2290089984, "step": 1092 }, { "epoch": 0.04, "grad_norm": 0.68359375, "learning_rate": 2e-05, "loss": 2.3908, "num_input_tokens_seen": 2292187136, "step": 1093 }, { "epoch": 0.04, "grad_norm": 0.75390625, "learning_rate": 2e-05, "loss": 2.3776, "num_input_tokens_seen": 2294284288, "step": 1094 }, { "epoch": 0.04, "grad_norm": 0.7421875, "learning_rate": 2e-05, "loss": 2.3821, "num_input_tokens_seen": 2296381440, "step": 1095 }, { "epoch": 0.04, "grad_norm": 0.6953125, "learning_rate": 2e-05, "loss": 2.3815, "num_input_tokens_seen": 2298478592, "step": 1096 }, { "epoch": 0.04, "grad_norm": 0.7109375, "learning_rate": 2e-05, "loss": 2.3876, "num_input_tokens_seen": 2300575744, "step": 1097 }, { "epoch": 0.04, "grad_norm": 0.7421875, "learning_rate": 2e-05, "loss": 2.365, "num_input_tokens_seen": 2302672896, "step": 1098 }, { "epoch": 0.04, "grad_norm": 0.6953125, "learning_rate": 2e-05, "loss": 2.3973, "num_input_tokens_seen": 2304770048, "step": 1099 }, { "epoch": 0.04, "grad_norm": 0.73046875, "learning_rate": 2e-05, "loss": 2.4167, "num_input_tokens_seen": 2306867200, "step": 1100 }, { "epoch": 0.04, "grad_norm": 0.7421875, "learning_rate": 2e-05, "loss": 2.3971, "num_input_tokens_seen": 2308964352, "step": 1101 }, { "epoch": 0.04, "grad_norm": 0.8125, "learning_rate": 2e-05, "loss": 2.3951, "num_input_tokens_seen": 2311061504, "step": 1102 }, { "epoch": 0.04, "grad_norm": 0.6953125, "learning_rate": 2e-05, "loss": 2.3836, "num_input_tokens_seen": 2313158656, "step": 1103 }, { "epoch": 0.04, "grad_norm": 0.87890625, "learning_rate": 2e-05, "loss": 2.4029, "num_input_tokens_seen": 2315255808, "step": 1104 }, { "epoch": 0.04, "grad_norm": 0.7265625, "learning_rate": 2e-05, "loss": 2.3726, "num_input_tokens_seen": 2317352960, "step": 1105 }, { "epoch": 0.04, "grad_norm": 0.8671875, "learning_rate": 2e-05, "loss": 2.3544, "num_input_tokens_seen": 2319450112, "step": 1106 }, { "epoch": 0.04, "grad_norm": 1.015625, "learning_rate": 2e-05, "loss": 2.4069, "num_input_tokens_seen": 2321547264, "step": 1107 }, { "epoch": 0.04, "grad_norm": 0.8515625, "learning_rate": 2e-05, "loss": 2.3749, "num_input_tokens_seen": 2323644416, "step": 1108 }, { "epoch": 0.04, "grad_norm": 0.91015625, "learning_rate": 2e-05, "loss": 2.372, "num_input_tokens_seen": 2325741568, "step": 1109 }, { "epoch": 0.04, "grad_norm": 0.7890625, "learning_rate": 2e-05, "loss": 2.381, "num_input_tokens_seen": 2327838720, "step": 1110 }, { "epoch": 0.04, "grad_norm": 0.9140625, "learning_rate": 2e-05, "loss": 2.3766, "num_input_tokens_seen": 2329935872, "step": 1111 }, { "epoch": 0.04, "grad_norm": 0.74609375, "learning_rate": 2e-05, "loss": 2.4038, "num_input_tokens_seen": 2332033024, "step": 1112 }, { "epoch": 0.04, "grad_norm": 0.8359375, "learning_rate": 2e-05, "loss": 2.3697, "num_input_tokens_seen": 2334130176, "step": 1113 }, { "epoch": 0.04, "grad_norm": 0.796875, "learning_rate": 2e-05, "loss": 2.3886, "num_input_tokens_seen": 2336227328, "step": 1114 }, { "epoch": 0.05, "grad_norm": 0.89453125, "learning_rate": 2e-05, "loss": 2.405, "num_input_tokens_seen": 2338324480, "step": 1115 }, { "epoch": 0.05, "grad_norm": 0.921875, "learning_rate": 2e-05, "loss": 2.364, "num_input_tokens_seen": 2340421632, "step": 1116 }, { "epoch": 0.05, "grad_norm": 0.69921875, "learning_rate": 2e-05, "loss": 2.3912, "num_input_tokens_seen": 2342518784, "step": 1117 }, { "epoch": 0.05, "grad_norm": 1.3046875, "learning_rate": 2e-05, "loss": 2.4048, "num_input_tokens_seen": 2344615936, "step": 1118 }, { "epoch": 0.05, "grad_norm": 0.96484375, "learning_rate": 2e-05, "loss": 2.382, "num_input_tokens_seen": 2346713088, "step": 1119 }, { "epoch": 0.05, "grad_norm": 0.90625, "learning_rate": 2e-05, "loss": 2.3883, "num_input_tokens_seen": 2348810240, "step": 1120 }, { "epoch": 0.05, "grad_norm": 1.2109375, "learning_rate": 2e-05, "loss": 2.3781, "num_input_tokens_seen": 2350907392, "step": 1121 }, { "epoch": 0.05, "grad_norm": 0.79296875, "learning_rate": 2e-05, "loss": 2.3828, "num_input_tokens_seen": 2353004544, "step": 1122 }, { "epoch": 0.05, "grad_norm": 1.2421875, "learning_rate": 2e-05, "loss": 2.3987, "num_input_tokens_seen": 2355101696, "step": 1123 }, { "epoch": 0.05, "grad_norm": 1.078125, "learning_rate": 2e-05, "loss": 2.3944, "num_input_tokens_seen": 2357198848, "step": 1124 }, { "epoch": 0.05, "grad_norm": 0.81640625, "learning_rate": 2e-05, "loss": 2.4032, "num_input_tokens_seen": 2359296000, "step": 1125 }, { "epoch": 0.05, "grad_norm": 1.0625, "learning_rate": 2e-05, "loss": 2.3996, "num_input_tokens_seen": 2361393152, "step": 1126 }, { "epoch": 0.05, "grad_norm": 0.7421875, "learning_rate": 2e-05, "loss": 2.3727, "num_input_tokens_seen": 2363490304, "step": 1127 }, { "epoch": 0.05, "grad_norm": 0.97265625, "learning_rate": 2e-05, "loss": 2.3563, "num_input_tokens_seen": 2365587456, "step": 1128 }, { "epoch": 0.05, "grad_norm": 0.87890625, "learning_rate": 2e-05, "loss": 2.3712, "num_input_tokens_seen": 2367684608, "step": 1129 }, { "epoch": 0.05, "grad_norm": 0.82421875, "learning_rate": 2e-05, "loss": 2.3919, "num_input_tokens_seen": 2369781760, "step": 1130 }, { "epoch": 0.05, "grad_norm": 0.7890625, "learning_rate": 2e-05, "loss": 2.3306, "num_input_tokens_seen": 2371878912, "step": 1131 }, { "epoch": 0.05, "grad_norm": 0.8671875, "learning_rate": 2e-05, "loss": 2.3813, "num_input_tokens_seen": 2373976064, "step": 1132 }, { "epoch": 0.05, "grad_norm": 0.67578125, "learning_rate": 2e-05, "loss": 2.3796, "num_input_tokens_seen": 2376073216, "step": 1133 }, { "epoch": 0.05, "grad_norm": 0.80859375, "learning_rate": 2e-05, "loss": 2.3817, "num_input_tokens_seen": 2378170368, "step": 1134 }, { "epoch": 0.05, "grad_norm": 0.73046875, "learning_rate": 2e-05, "loss": 2.3592, "num_input_tokens_seen": 2380267520, "step": 1135 }, { "epoch": 0.05, "grad_norm": 0.80078125, "learning_rate": 2e-05, "loss": 2.3894, "num_input_tokens_seen": 2382364672, "step": 1136 }, { "epoch": 0.05, "grad_norm": 0.73828125, "learning_rate": 2e-05, "loss": 2.3929, "num_input_tokens_seen": 2384461824, "step": 1137 }, { "epoch": 0.05, "grad_norm": 0.734375, "learning_rate": 2e-05, "loss": 2.37, "num_input_tokens_seen": 2386558976, "step": 1138 }, { "epoch": 0.05, "grad_norm": 0.8046875, "learning_rate": 2e-05, "loss": 2.355, "num_input_tokens_seen": 2388656128, "step": 1139 }, { "epoch": 0.05, "grad_norm": 0.72265625, "learning_rate": 2e-05, "loss": 2.389, "num_input_tokens_seen": 2390753280, "step": 1140 }, { "epoch": 0.05, "grad_norm": 0.70703125, "learning_rate": 2e-05, "loss": 2.378, "num_input_tokens_seen": 2392850432, "step": 1141 }, { "epoch": 0.05, "grad_norm": 0.6875, "learning_rate": 2e-05, "loss": 2.3726, "num_input_tokens_seen": 2394947584, "step": 1142 }, { "epoch": 0.05, "grad_norm": 0.6875, "learning_rate": 2e-05, "loss": 2.3594, "num_input_tokens_seen": 2397044736, "step": 1143 }, { "epoch": 0.05, "grad_norm": 0.7109375, "learning_rate": 2e-05, "loss": 2.3947, "num_input_tokens_seen": 2399141888, "step": 1144 }, { "epoch": 0.05, "grad_norm": 0.84765625, "learning_rate": 2e-05, "loss": 2.3654, "num_input_tokens_seen": 2401239040, "step": 1145 }, { "epoch": 0.05, "grad_norm": 0.828125, "learning_rate": 2e-05, "loss": 2.3599, "num_input_tokens_seen": 2403336192, "step": 1146 }, { "epoch": 0.05, "grad_norm": 0.73046875, "learning_rate": 2e-05, "loss": 2.3896, "num_input_tokens_seen": 2405433344, "step": 1147 }, { "epoch": 0.05, "grad_norm": 1.2578125, "learning_rate": 2e-05, "loss": 2.3827, "num_input_tokens_seen": 2407530496, "step": 1148 }, { "epoch": 0.05, "grad_norm": 0.859375, "learning_rate": 2e-05, "loss": 2.3571, "num_input_tokens_seen": 2409627648, "step": 1149 }, { "epoch": 0.05, "grad_norm": 0.69140625, "learning_rate": 2e-05, "loss": 2.3918, "num_input_tokens_seen": 2411724800, "step": 1150 }, { "epoch": 0.05, "grad_norm": 0.84375, "learning_rate": 2e-05, "loss": 2.3673, "num_input_tokens_seen": 2413821952, "step": 1151 }, { "epoch": 0.05, "grad_norm": 0.68359375, "learning_rate": 2e-05, "loss": 2.3667, "num_input_tokens_seen": 2415919104, "step": 1152 }, { "epoch": 0.05, "grad_norm": 0.71484375, "learning_rate": 2e-05, "loss": 2.3982, "num_input_tokens_seen": 2418016256, "step": 1153 }, { "epoch": 0.05, "grad_norm": 0.70703125, "learning_rate": 2e-05, "loss": 2.375, "num_input_tokens_seen": 2420113408, "step": 1154 }, { "epoch": 0.05, "grad_norm": 0.7109375, "learning_rate": 2e-05, "loss": 2.3779, "num_input_tokens_seen": 2422210560, "step": 1155 }, { "epoch": 0.05, "grad_norm": 0.74609375, "learning_rate": 2e-05, "loss": 2.3876, "num_input_tokens_seen": 2424307712, "step": 1156 }, { "epoch": 0.05, "grad_norm": 0.6640625, "learning_rate": 2e-05, "loss": 2.3711, "num_input_tokens_seen": 2426404864, "step": 1157 }, { "epoch": 0.05, "grad_norm": 0.6953125, "learning_rate": 2e-05, "loss": 2.3751, "num_input_tokens_seen": 2428502016, "step": 1158 }, { "epoch": 0.05, "grad_norm": 0.671875, "learning_rate": 2e-05, "loss": 2.3844, "num_input_tokens_seen": 2430599168, "step": 1159 }, { "epoch": 0.05, "grad_norm": 0.734375, "learning_rate": 2e-05, "loss": 2.3802, "num_input_tokens_seen": 2432696320, "step": 1160 }, { "epoch": 0.05, "grad_norm": 0.81640625, "learning_rate": 2e-05, "loss": 2.3819, "num_input_tokens_seen": 2434793472, "step": 1161 }, { "epoch": 0.05, "grad_norm": 0.7734375, "learning_rate": 2e-05, "loss": 2.3521, "num_input_tokens_seen": 2436890624, "step": 1162 }, { "epoch": 0.06, "grad_norm": 0.75, "learning_rate": 2e-05, "loss": 2.3467, "num_input_tokens_seen": 2438987776, "step": 1163 }, { "epoch": 0.06, "grad_norm": 0.65625, "learning_rate": 2e-05, "loss": 2.3812, "num_input_tokens_seen": 2441084928, "step": 1164 }, { "epoch": 0.06, "grad_norm": 0.78125, "learning_rate": 2e-05, "loss": 2.3569, "num_input_tokens_seen": 2443182080, "step": 1165 }, { "epoch": 0.06, "grad_norm": 0.6796875, "learning_rate": 2e-05, "loss": 2.37, "num_input_tokens_seen": 2445279232, "step": 1166 }, { "epoch": 0.06, "grad_norm": 0.6171875, "learning_rate": 2e-05, "loss": 2.3787, "num_input_tokens_seen": 2447376384, "step": 1167 }, { "epoch": 0.06, "grad_norm": 0.70703125, "learning_rate": 2e-05, "loss": 2.3899, "num_input_tokens_seen": 2449473536, "step": 1168 }, { "epoch": 0.06, "grad_norm": 0.63671875, "learning_rate": 2e-05, "loss": 2.3714, "num_input_tokens_seen": 2451570688, "step": 1169 }, { "epoch": 0.06, "grad_norm": 0.7109375, "learning_rate": 2e-05, "loss": 2.4156, "num_input_tokens_seen": 2453667840, "step": 1170 }, { "epoch": 0.06, "grad_norm": 0.73828125, "learning_rate": 2e-05, "loss": 2.3558, "num_input_tokens_seen": 2455764992, "step": 1171 }, { "epoch": 0.06, "grad_norm": 0.703125, "learning_rate": 2e-05, "loss": 2.4273, "num_input_tokens_seen": 2457862144, "step": 1172 }, { "epoch": 0.06, "grad_norm": 0.71875, "learning_rate": 2e-05, "loss": 2.3684, "num_input_tokens_seen": 2459959296, "step": 1173 }, { "epoch": 0.06, "grad_norm": 0.7734375, "learning_rate": 2e-05, "loss": 2.3827, "num_input_tokens_seen": 2462056448, "step": 1174 }, { "epoch": 0.06, "grad_norm": 0.80078125, "learning_rate": 2e-05, "loss": 2.362, "num_input_tokens_seen": 2464153600, "step": 1175 }, { "epoch": 0.06, "grad_norm": 0.74609375, "learning_rate": 2e-05, "loss": 2.3605, "num_input_tokens_seen": 2466250752, "step": 1176 }, { "epoch": 0.06, "grad_norm": 0.78125, "learning_rate": 2e-05, "loss": 2.3563, "num_input_tokens_seen": 2468347904, "step": 1177 }, { "epoch": 0.06, "grad_norm": 0.72265625, "learning_rate": 2e-05, "loss": 2.3817, "num_input_tokens_seen": 2470445056, "step": 1178 }, { "epoch": 0.06, "grad_norm": 0.796875, "learning_rate": 2e-05, "loss": 2.4126, "num_input_tokens_seen": 2472542208, "step": 1179 }, { "epoch": 0.06, "grad_norm": 0.6953125, "learning_rate": 2e-05, "loss": 2.3548, "num_input_tokens_seen": 2474639360, "step": 1180 }, { "epoch": 0.06, "grad_norm": 0.73828125, "learning_rate": 2e-05, "loss": 2.3785, "num_input_tokens_seen": 2476736512, "step": 1181 }, { "epoch": 0.06, "grad_norm": 0.859375, "learning_rate": 2e-05, "loss": 2.3426, "num_input_tokens_seen": 2478833664, "step": 1182 }, { "epoch": 0.06, "grad_norm": 0.796875, "learning_rate": 2e-05, "loss": 2.3601, "num_input_tokens_seen": 2480930816, "step": 1183 }, { "epoch": 0.06, "grad_norm": 0.8828125, "learning_rate": 2e-05, "loss": 2.4139, "num_input_tokens_seen": 2483027968, "step": 1184 }, { "epoch": 0.06, "grad_norm": 0.8203125, "learning_rate": 2e-05, "loss": 2.35, "num_input_tokens_seen": 2485125120, "step": 1185 }, { "epoch": 0.06, "grad_norm": 0.73046875, "learning_rate": 2e-05, "loss": 2.3875, "num_input_tokens_seen": 2487222272, "step": 1186 }, { "epoch": 0.06, "grad_norm": 1.046875, "learning_rate": 2e-05, "loss": 2.3787, "num_input_tokens_seen": 2489319424, "step": 1187 }, { "epoch": 0.06, "grad_norm": 0.75, "learning_rate": 2e-05, "loss": 2.3997, "num_input_tokens_seen": 2491416576, "step": 1188 }, { "epoch": 0.06, "grad_norm": 0.921875, "learning_rate": 2e-05, "loss": 2.3565, "num_input_tokens_seen": 2493513728, "step": 1189 }, { "epoch": 0.06, "grad_norm": 0.7890625, "learning_rate": 2e-05, "loss": 2.3515, "num_input_tokens_seen": 2495610880, "step": 1190 }, { "epoch": 0.06, "grad_norm": 0.74609375, "learning_rate": 2e-05, "loss": 2.3808, "num_input_tokens_seen": 2497708032, "step": 1191 }, { "epoch": 0.06, "grad_norm": 0.87890625, "learning_rate": 2e-05, "loss": 2.3939, "num_input_tokens_seen": 2499805184, "step": 1192 }, { "epoch": 0.06, "grad_norm": 0.79296875, "learning_rate": 2e-05, "loss": 2.3304, "num_input_tokens_seen": 2501902336, "step": 1193 }, { "epoch": 0.06, "grad_norm": 0.77734375, "learning_rate": 2e-05, "loss": 2.3546, "num_input_tokens_seen": 2503999488, "step": 1194 }, { "epoch": 0.06, "eval_loss": 2.372859001159668, "eval_runtime": 1897.0017, "eval_samples_per_second": 2.078, "eval_steps_per_second": 0.52, "num_input_tokens_seen": 2503999488, "step": 1194 }, { "epoch": 0.06, "grad_norm": 0.70703125, "learning_rate": 2e-05, "loss": 2.3559, "num_input_tokens_seen": 2506096640, "step": 1195 }, { "epoch": 0.06, "grad_norm": 0.76953125, "learning_rate": 2e-05, "loss": 2.3569, "num_input_tokens_seen": 2508193792, "step": 1196 }, { "epoch": 0.06, "grad_norm": 0.80859375, "learning_rate": 2e-05, "loss": 2.3959, "num_input_tokens_seen": 2510290944, "step": 1197 }, { "epoch": 0.06, "grad_norm": 0.734375, "learning_rate": 2e-05, "loss": 2.3615, "num_input_tokens_seen": 2512388096, "step": 1198 }, { "epoch": 0.06, "grad_norm": 0.7421875, "learning_rate": 2e-05, "loss": 2.3735, "num_input_tokens_seen": 2514485248, "step": 1199 }, { "epoch": 0.06, "grad_norm": 0.671875, "learning_rate": 2e-05, "loss": 2.3342, "num_input_tokens_seen": 2516582400, "step": 1200 }, { "epoch": 0.06, "grad_norm": 0.85546875, "learning_rate": 2e-05, "loss": 2.3555, "num_input_tokens_seen": 2518679552, "step": 1201 }, { "epoch": 0.06, "grad_norm": 0.765625, "learning_rate": 2e-05, "loss": 2.3955, "num_input_tokens_seen": 2520776704, "step": 1202 }, { "epoch": 0.06, "grad_norm": 0.72265625, "learning_rate": 2e-05, "loss": 2.3726, "num_input_tokens_seen": 2522873856, "step": 1203 }, { "epoch": 0.06, "grad_norm": 0.7421875, "learning_rate": 2e-05, "loss": 2.3716, "num_input_tokens_seen": 2524971008, "step": 1204 }, { "epoch": 0.06, "grad_norm": 0.7734375, "learning_rate": 2e-05, "loss": 2.3548, "num_input_tokens_seen": 2527068160, "step": 1205 }, { "epoch": 0.06, "grad_norm": 0.69140625, "learning_rate": 2e-05, "loss": 2.3516, "num_input_tokens_seen": 2529165312, "step": 1206 }, { "epoch": 0.06, "grad_norm": 0.71875, "learning_rate": 2e-05, "loss": 2.3967, "num_input_tokens_seen": 2531262464, "step": 1207 }, { "epoch": 0.06, "grad_norm": 0.75390625, "learning_rate": 2e-05, "loss": 2.3888, "num_input_tokens_seen": 2533359616, "step": 1208 }, { "epoch": 0.06, "grad_norm": 0.7421875, "learning_rate": 2e-05, "loss": 2.3843, "num_input_tokens_seen": 2535456768, "step": 1209 }, { "epoch": 0.07, "grad_norm": 0.74609375, "learning_rate": 2e-05, "loss": 2.3755, "num_input_tokens_seen": 2537553920, "step": 1210 }, { "epoch": 0.07, "grad_norm": 0.6640625, "learning_rate": 2e-05, "loss": 2.3513, "num_input_tokens_seen": 2539651072, "step": 1211 }, { "epoch": 0.07, "grad_norm": 0.75390625, "learning_rate": 2e-05, "loss": 2.3526, "num_input_tokens_seen": 2541748224, "step": 1212 }, { "epoch": 0.07, "grad_norm": 0.671875, "learning_rate": 2e-05, "loss": 2.367, "num_input_tokens_seen": 2543845376, "step": 1213 }, { "epoch": 0.07, "grad_norm": 0.73046875, "learning_rate": 2e-05, "loss": 2.3739, "num_input_tokens_seen": 2545942528, "step": 1214 }, { "epoch": 0.07, "grad_norm": 0.66015625, "learning_rate": 2e-05, "loss": 2.3719, "num_input_tokens_seen": 2548039680, "step": 1215 }, { "epoch": 0.07, "grad_norm": 0.6796875, "learning_rate": 2e-05, "loss": 2.3799, "num_input_tokens_seen": 2550136832, "step": 1216 }, { "epoch": 0.07, "grad_norm": 0.72265625, "learning_rate": 2e-05, "loss": 2.3465, "num_input_tokens_seen": 2552233984, "step": 1217 }, { "epoch": 0.07, "grad_norm": 0.83984375, "learning_rate": 2e-05, "loss": 2.3854, "num_input_tokens_seen": 2554331136, "step": 1218 }, { "epoch": 0.07, "grad_norm": 0.79296875, "learning_rate": 2e-05, "loss": 2.363, "num_input_tokens_seen": 2556428288, "step": 1219 }, { "epoch": 0.07, "grad_norm": 0.80078125, "learning_rate": 2e-05, "loss": 2.3576, "num_input_tokens_seen": 2558525440, "step": 1220 }, { "epoch": 0.07, "grad_norm": 0.71875, "learning_rate": 2e-05, "loss": 2.3671, "num_input_tokens_seen": 2560622592, "step": 1221 }, { "epoch": 0.07, "grad_norm": 0.8046875, "learning_rate": 2e-05, "loss": 2.3709, "num_input_tokens_seen": 2562719744, "step": 1222 }, { "epoch": 0.07, "grad_norm": 0.7109375, "learning_rate": 2e-05, "loss": 2.3506, "num_input_tokens_seen": 2564816896, "step": 1223 }, { "epoch": 0.07, "grad_norm": 0.72265625, "learning_rate": 2e-05, "loss": 2.3674, "num_input_tokens_seen": 2566914048, "step": 1224 }, { "epoch": 0.07, "grad_norm": 0.640625, "learning_rate": 2e-05, "loss": 2.3599, "num_input_tokens_seen": 2569011200, "step": 1225 }, { "epoch": 0.07, "grad_norm": 0.8046875, "learning_rate": 2e-05, "loss": 2.3493, "num_input_tokens_seen": 2571108352, "step": 1226 }, { "epoch": 0.07, "grad_norm": 0.73828125, "learning_rate": 2e-05, "loss": 2.369, "num_input_tokens_seen": 2573205504, "step": 1227 }, { "epoch": 0.07, "grad_norm": 0.7109375, "learning_rate": 2e-05, "loss": 2.3542, "num_input_tokens_seen": 2575302656, "step": 1228 }, { "epoch": 0.07, "grad_norm": 0.76953125, "learning_rate": 2e-05, "loss": 2.3671, "num_input_tokens_seen": 2577399808, "step": 1229 }, { "epoch": 0.07, "grad_norm": 0.73046875, "learning_rate": 2e-05, "loss": 2.3604, "num_input_tokens_seen": 2579496960, "step": 1230 }, { "epoch": 0.07, "grad_norm": 0.7265625, "learning_rate": 2e-05, "loss": 2.3795, "num_input_tokens_seen": 2581594112, "step": 1231 }, { "epoch": 0.07, "grad_norm": 0.8046875, "learning_rate": 2e-05, "loss": 2.3827, "num_input_tokens_seen": 2583691264, "step": 1232 }, { "epoch": 0.07, "grad_norm": 0.6875, "learning_rate": 2e-05, "loss": 2.3625, "num_input_tokens_seen": 2585788416, "step": 1233 }, { "epoch": 0.07, "grad_norm": 0.734375, "learning_rate": 2e-05, "loss": 2.3476, "num_input_tokens_seen": 2587885568, "step": 1234 }, { "epoch": 0.07, "grad_norm": 0.8046875, "learning_rate": 2e-05, "loss": 2.3495, "num_input_tokens_seen": 2589982720, "step": 1235 }, { "epoch": 0.07, "grad_norm": 0.69921875, "learning_rate": 2e-05, "loss": 2.3431, "num_input_tokens_seen": 2592079872, "step": 1236 }, { "epoch": 0.07, "grad_norm": 0.69140625, "learning_rate": 2e-05, "loss": 2.3571, "num_input_tokens_seen": 2594177024, "step": 1237 }, { "epoch": 0.07, "grad_norm": 0.86328125, "learning_rate": 2e-05, "loss": 2.3629, "num_input_tokens_seen": 2596274176, "step": 1238 }, { "epoch": 0.07, "grad_norm": 0.73046875, "learning_rate": 2e-05, "loss": 2.338, "num_input_tokens_seen": 2598371328, "step": 1239 }, { "epoch": 0.07, "grad_norm": 0.80078125, "learning_rate": 2e-05, "loss": 2.3846, "num_input_tokens_seen": 2600468480, "step": 1240 }, { "epoch": 0.07, "grad_norm": 0.74609375, "learning_rate": 2e-05, "loss": 2.3641, "num_input_tokens_seen": 2602565632, "step": 1241 }, { "epoch": 0.07, "grad_norm": 0.69140625, "learning_rate": 2e-05, "loss": 2.3304, "num_input_tokens_seen": 2604662784, "step": 1242 }, { "epoch": 0.07, "grad_norm": 0.7734375, "learning_rate": 2e-05, "loss": 2.3442, "num_input_tokens_seen": 2606759936, "step": 1243 }, { "epoch": 0.07, "grad_norm": 0.71875, "learning_rate": 2e-05, "loss": 2.3699, "num_input_tokens_seen": 2608857088, "step": 1244 }, { "epoch": 0.07, "grad_norm": 0.6484375, "learning_rate": 2e-05, "loss": 2.3471, "num_input_tokens_seen": 2610954240, "step": 1245 }, { "epoch": 0.07, "grad_norm": 0.7265625, "learning_rate": 2e-05, "loss": 2.353, "num_input_tokens_seen": 2613051392, "step": 1246 }, { "epoch": 0.07, "grad_norm": 0.78515625, "learning_rate": 2e-05, "loss": 2.3584, "num_input_tokens_seen": 2615148544, "step": 1247 }, { "epoch": 0.07, "grad_norm": 0.6796875, "learning_rate": 2e-05, "loss": 2.368, "num_input_tokens_seen": 2617245696, "step": 1248 }, { "epoch": 0.07, "grad_norm": 0.7734375, "learning_rate": 2e-05, "loss": 2.3467, "num_input_tokens_seen": 2619342848, "step": 1249 }, { "epoch": 0.07, "grad_norm": 0.69921875, "learning_rate": 2e-05, "loss": 2.357, "num_input_tokens_seen": 2621440000, "step": 1250 }, { "epoch": 0.07, "grad_norm": 1.7890625, "learning_rate": 2e-05, "loss": 2.3929, "num_input_tokens_seen": 2623537152, "step": 1251 }, { "epoch": 0.07, "grad_norm": 0.78125, "learning_rate": 2e-05, "loss": 2.3437, "num_input_tokens_seen": 2625634304, "step": 1252 }, { "epoch": 0.07, "grad_norm": 0.73046875, "learning_rate": 2e-05, "loss": 2.3393, "num_input_tokens_seen": 2627731456, "step": 1253 }, { "epoch": 0.07, "grad_norm": 0.88671875, "learning_rate": 2e-05, "loss": 2.3768, "num_input_tokens_seen": 2629828608, "step": 1254 }, { "epoch": 0.07, "grad_norm": 0.73046875, "learning_rate": 2e-05, "loss": 2.3709, "num_input_tokens_seen": 2631925760, "step": 1255 }, { "epoch": 0.07, "grad_norm": 0.91015625, "learning_rate": 2e-05, "loss": 2.3585, "num_input_tokens_seen": 2634022912, "step": 1256 }, { "epoch": 0.07, "grad_norm": 0.69140625, "learning_rate": 2e-05, "loss": 2.3238, "num_input_tokens_seen": 2636120064, "step": 1257 }, { "epoch": 0.08, "grad_norm": 0.8671875, "learning_rate": 2e-05, "loss": 2.3714, "num_input_tokens_seen": 2638217216, "step": 1258 }, { "epoch": 0.08, "grad_norm": 0.74609375, "learning_rate": 2e-05, "loss": 2.3459, "num_input_tokens_seen": 2640314368, "step": 1259 }, { "epoch": 0.08, "grad_norm": 0.76171875, "learning_rate": 2e-05, "loss": 2.3369, "num_input_tokens_seen": 2642411520, "step": 1260 }, { "epoch": 0.08, "grad_norm": 0.78125, "learning_rate": 2e-05, "loss": 2.3652, "num_input_tokens_seen": 2644508672, "step": 1261 }, { "epoch": 0.08, "grad_norm": 0.8671875, "learning_rate": 2e-05, "loss": 2.3519, "num_input_tokens_seen": 2646605824, "step": 1262 }, { "epoch": 0.08, "grad_norm": 0.63671875, "learning_rate": 2e-05, "loss": 2.3634, "num_input_tokens_seen": 2648702976, "step": 1263 }, { "epoch": 0.08, "grad_norm": 0.99609375, "learning_rate": 2e-05, "loss": 2.3311, "num_input_tokens_seen": 2650800128, "step": 1264 }, { "epoch": 0.08, "grad_norm": 1.2578125, "learning_rate": 2e-05, "loss": 2.346, "num_input_tokens_seen": 2652897280, "step": 1265 }, { "epoch": 0.08, "grad_norm": 0.79296875, "learning_rate": 2e-05, "loss": 2.3565, "num_input_tokens_seen": 2654994432, "step": 1266 }, { "epoch": 0.08, "grad_norm": 1.140625, "learning_rate": 2e-05, "loss": 2.3524, "num_input_tokens_seen": 2657091584, "step": 1267 }, { "epoch": 0.08, "grad_norm": 0.9609375, "learning_rate": 2e-05, "loss": 2.3525, "num_input_tokens_seen": 2659188736, "step": 1268 }, { "epoch": 0.08, "grad_norm": 1.03125, "learning_rate": 2e-05, "loss": 2.3705, "num_input_tokens_seen": 2661285888, "step": 1269 }, { "epoch": 0.08, "grad_norm": 0.98828125, "learning_rate": 2e-05, "loss": 2.3608, "num_input_tokens_seen": 2663383040, "step": 1270 }, { "epoch": 0.08, "grad_norm": 0.74609375, "learning_rate": 2e-05, "loss": 2.3262, "num_input_tokens_seen": 2665480192, "step": 1271 }, { "epoch": 0.08, "grad_norm": 1.046875, "learning_rate": 2e-05, "loss": 2.3614, "num_input_tokens_seen": 2667577344, "step": 1272 }, { "epoch": 0.08, "grad_norm": 0.7265625, "learning_rate": 2e-05, "loss": 2.3722, "num_input_tokens_seen": 2669674496, "step": 1273 }, { "epoch": 0.08, "grad_norm": 0.9375, "learning_rate": 2e-05, "loss": 2.3397, "num_input_tokens_seen": 2671771648, "step": 1274 }, { "epoch": 0.08, "grad_norm": 0.7421875, "learning_rate": 2e-05, "loss": 2.3184, "num_input_tokens_seen": 2673868800, "step": 1275 }, { "epoch": 0.08, "grad_norm": 0.9375, "learning_rate": 2e-05, "loss": 2.3394, "num_input_tokens_seen": 2675965952, "step": 1276 }, { "epoch": 0.08, "grad_norm": 0.83203125, "learning_rate": 2e-05, "loss": 2.3482, "num_input_tokens_seen": 2678063104, "step": 1277 }, { "epoch": 0.08, "grad_norm": 0.81640625, "learning_rate": 2e-05, "loss": 2.3027, "num_input_tokens_seen": 2680160256, "step": 1278 }, { "epoch": 0.08, "grad_norm": 1.0390625, "learning_rate": 2e-05, "loss": 2.363, "num_input_tokens_seen": 2682257408, "step": 1279 }, { "epoch": 0.08, "grad_norm": 0.88671875, "learning_rate": 2e-05, "loss": 2.3426, "num_input_tokens_seen": 2684354560, "step": 1280 }, { "epoch": 0.08, "grad_norm": 0.7890625, "learning_rate": 2e-05, "loss": 2.3406, "num_input_tokens_seen": 2686451712, "step": 1281 }, { "epoch": 0.08, "grad_norm": 0.953125, "learning_rate": 2e-05, "loss": 2.3247, "num_input_tokens_seen": 2688548864, "step": 1282 }, { "epoch": 0.08, "grad_norm": 0.7265625, "learning_rate": 2e-05, "loss": 2.3495, "num_input_tokens_seen": 2690646016, "step": 1283 }, { "epoch": 0.08, "grad_norm": 0.73828125, "learning_rate": 2e-05, "loss": 2.352, "num_input_tokens_seen": 2692743168, "step": 1284 }, { "epoch": 0.08, "grad_norm": 0.7421875, "learning_rate": 2e-05, "loss": 2.3591, "num_input_tokens_seen": 2694840320, "step": 1285 }, { "epoch": 0.08, "grad_norm": 0.765625, "learning_rate": 2e-05, "loss": 2.342, "num_input_tokens_seen": 2696937472, "step": 1286 }, { "epoch": 0.08, "grad_norm": 0.65234375, "learning_rate": 2e-05, "loss": 2.3596, "num_input_tokens_seen": 2699034624, "step": 1287 }, { "epoch": 0.08, "grad_norm": 0.70703125, "learning_rate": 2e-05, "loss": 2.3537, "num_input_tokens_seen": 2701131776, "step": 1288 }, { "epoch": 0.08, "grad_norm": 0.86328125, "learning_rate": 2e-05, "loss": 2.3567, "num_input_tokens_seen": 2703228928, "step": 1289 }, { "epoch": 0.08, "grad_norm": 0.73828125, "learning_rate": 2e-05, "loss": 2.3327, "num_input_tokens_seen": 2705326080, "step": 1290 }, { "epoch": 0.08, "grad_norm": 0.8359375, "learning_rate": 2e-05, "loss": 2.3391, "num_input_tokens_seen": 2707423232, "step": 1291 }, { "epoch": 0.08, "grad_norm": 1.1015625, "learning_rate": 2e-05, "loss": 2.3264, "num_input_tokens_seen": 2709520384, "step": 1292 }, { "epoch": 0.08, "grad_norm": 0.63671875, "learning_rate": 2e-05, "loss": 2.3398, "num_input_tokens_seen": 2711617536, "step": 1293 }, { "epoch": 0.08, "grad_norm": 1.203125, "learning_rate": 2e-05, "loss": 2.3584, "num_input_tokens_seen": 2713714688, "step": 1294 }, { "epoch": 0.08, "grad_norm": 0.93359375, "learning_rate": 2e-05, "loss": 2.347, "num_input_tokens_seen": 2715811840, "step": 1295 }, { "epoch": 0.08, "grad_norm": 0.75, "learning_rate": 2e-05, "loss": 2.3231, "num_input_tokens_seen": 2717908992, "step": 1296 }, { "epoch": 0.08, "grad_norm": 0.9609375, "learning_rate": 2e-05, "loss": 2.3543, "num_input_tokens_seen": 2720006144, "step": 1297 }, { "epoch": 0.08, "grad_norm": 0.76171875, "learning_rate": 2e-05, "loss": 2.3675, "num_input_tokens_seen": 2722103296, "step": 1298 }, { "epoch": 0.08, "grad_norm": 0.765625, "learning_rate": 2e-05, "loss": 2.3499, "num_input_tokens_seen": 2724200448, "step": 1299 }, { "epoch": 0.08, "grad_norm": 0.98046875, "learning_rate": 2e-05, "loss": 2.3662, "num_input_tokens_seen": 2726297600, "step": 1300 }, { "epoch": 0.08, "grad_norm": 0.6640625, "learning_rate": 2e-05, "loss": 2.3167, "num_input_tokens_seen": 2728394752, "step": 1301 }, { "epoch": 0.08, "grad_norm": 0.9921875, "learning_rate": 2e-05, "loss": 2.3177, "num_input_tokens_seen": 2730491904, "step": 1302 }, { "epoch": 0.08, "grad_norm": 0.75, "learning_rate": 2e-05, "loss": 2.3457, "num_input_tokens_seen": 2732589056, "step": 1303 }, { "epoch": 0.08, "grad_norm": 0.67578125, "learning_rate": 2e-05, "loss": 2.3247, "num_input_tokens_seen": 2734686208, "step": 1304 }, { "epoch": 0.08, "grad_norm": 0.69140625, "learning_rate": 2e-05, "loss": 2.3278, "num_input_tokens_seen": 2736783360, "step": 1305 }, { "epoch": 0.09, "grad_norm": 0.80078125, "learning_rate": 2e-05, "loss": 2.3253, "num_input_tokens_seen": 2738880512, "step": 1306 }, { "epoch": 0.09, "grad_norm": 0.7578125, "learning_rate": 2e-05, "loss": 2.3687, "num_input_tokens_seen": 2740977664, "step": 1307 }, { "epoch": 0.09, "grad_norm": 0.88671875, "learning_rate": 2e-05, "loss": 2.3549, "num_input_tokens_seen": 2743074816, "step": 1308 }, { "epoch": 0.09, "grad_norm": 0.71484375, "learning_rate": 2e-05, "loss": 2.3305, "num_input_tokens_seen": 2745171968, "step": 1309 }, { "epoch": 0.09, "grad_norm": 0.76953125, "learning_rate": 2e-05, "loss": 2.3443, "num_input_tokens_seen": 2747269120, "step": 1310 }, { "epoch": 0.09, "grad_norm": 0.8515625, "learning_rate": 2e-05, "loss": 2.3134, "num_input_tokens_seen": 2749366272, "step": 1311 }, { "epoch": 0.09, "grad_norm": 0.703125, "learning_rate": 2e-05, "loss": 2.3547, "num_input_tokens_seen": 2751463424, "step": 1312 }, { "epoch": 0.09, "grad_norm": 0.98828125, "learning_rate": 2e-05, "loss": 2.3549, "num_input_tokens_seen": 2753560576, "step": 1313 }, { "epoch": 0.09, "grad_norm": 0.66796875, "learning_rate": 2e-05, "loss": 2.3566, "num_input_tokens_seen": 2755657728, "step": 1314 }, { "epoch": 0.09, "grad_norm": 0.703125, "learning_rate": 2e-05, "loss": 2.3337, "num_input_tokens_seen": 2757754880, "step": 1315 }, { "epoch": 0.09, "grad_norm": 0.984375, "learning_rate": 2e-05, "loss": 2.3448, "num_input_tokens_seen": 2759852032, "step": 1316 }, { "epoch": 0.09, "grad_norm": 0.7421875, "learning_rate": 2e-05, "loss": 2.3522, "num_input_tokens_seen": 2761949184, "step": 1317 }, { "epoch": 0.09, "grad_norm": 0.93359375, "learning_rate": 2e-05, "loss": 2.3557, "num_input_tokens_seen": 2764046336, "step": 1318 }, { "epoch": 0.09, "grad_norm": 0.796875, "learning_rate": 2e-05, "loss": 2.36, "num_input_tokens_seen": 2766143488, "step": 1319 }, { "epoch": 0.09, "grad_norm": 0.8046875, "learning_rate": 2e-05, "loss": 2.3518, "num_input_tokens_seen": 2768240640, "step": 1320 }, { "epoch": 0.09, "grad_norm": 0.76953125, "learning_rate": 2e-05, "loss": 2.3473, "num_input_tokens_seen": 2770337792, "step": 1321 }, { "epoch": 0.09, "grad_norm": 0.8515625, "learning_rate": 2e-05, "loss": 2.3245, "num_input_tokens_seen": 2772434944, "step": 1322 }, { "epoch": 0.09, "grad_norm": 0.70703125, "learning_rate": 2e-05, "loss": 2.3347, "num_input_tokens_seen": 2774532096, "step": 1323 }, { "epoch": 0.09, "grad_norm": 0.8515625, "learning_rate": 2e-05, "loss": 2.3589, "num_input_tokens_seen": 2776629248, "step": 1324 }, { "epoch": 0.09, "grad_norm": 0.69921875, "learning_rate": 2e-05, "loss": 2.3474, "num_input_tokens_seen": 2778726400, "step": 1325 }, { "epoch": 0.09, "grad_norm": 1.125, "learning_rate": 2e-05, "loss": 2.3154, "num_input_tokens_seen": 2780823552, "step": 1326 }, { "epoch": 0.09, "grad_norm": 0.74609375, "learning_rate": 2e-05, "loss": 2.342, "num_input_tokens_seen": 2782920704, "step": 1327 }, { "epoch": 0.09, "grad_norm": 0.8984375, "learning_rate": 2e-05, "loss": 2.3405, "num_input_tokens_seen": 2785017856, "step": 1328 }, { "epoch": 0.09, "grad_norm": 1.28125, "learning_rate": 2e-05, "loss": 2.325, "num_input_tokens_seen": 2787115008, "step": 1329 }, { "epoch": 0.09, "grad_norm": 0.71875, "learning_rate": 2e-05, "loss": 2.3214, "num_input_tokens_seen": 2789212160, "step": 1330 }, { "epoch": 0.09, "grad_norm": 0.90234375, "learning_rate": 2e-05, "loss": 2.3296, "num_input_tokens_seen": 2791309312, "step": 1331 }, { "epoch": 0.09, "grad_norm": 1.09375, "learning_rate": 2e-05, "loss": 2.3388, "num_input_tokens_seen": 2793406464, "step": 1332 }, { "epoch": 0.09, "grad_norm": 0.78515625, "learning_rate": 2e-05, "loss": 2.3579, "num_input_tokens_seen": 2795503616, "step": 1333 }, { "epoch": 0.09, "grad_norm": 0.98828125, "learning_rate": 2e-05, "loss": 2.3578, "num_input_tokens_seen": 2797600768, "step": 1334 }, { "epoch": 0.09, "grad_norm": 1.0625, "learning_rate": 2e-05, "loss": 2.3503, "num_input_tokens_seen": 2799697920, "step": 1335 }, { "epoch": 0.09, "grad_norm": 0.71875, "learning_rate": 2e-05, "loss": 2.3739, "num_input_tokens_seen": 2801795072, "step": 1336 }, { "epoch": 0.09, "grad_norm": 1.0390625, "learning_rate": 2e-05, "loss": 2.3405, "num_input_tokens_seen": 2803892224, "step": 1337 }, { "epoch": 0.09, "grad_norm": 0.84765625, "learning_rate": 2e-05, "loss": 2.3737, "num_input_tokens_seen": 2805989376, "step": 1338 }, { "epoch": 0.09, "grad_norm": 0.73828125, "learning_rate": 2e-05, "loss": 2.3609, "num_input_tokens_seen": 2808086528, "step": 1339 }, { "epoch": 0.09, "grad_norm": 0.80859375, "learning_rate": 2e-05, "loss": 2.349, "num_input_tokens_seen": 2810183680, "step": 1340 }, { "epoch": 0.09, "grad_norm": 0.69921875, "learning_rate": 2e-05, "loss": 2.3427, "num_input_tokens_seen": 2812280832, "step": 1341 }, { "epoch": 0.09, "grad_norm": 0.6953125, "learning_rate": 2e-05, "loss": 2.3392, "num_input_tokens_seen": 2814377984, "step": 1342 }, { "epoch": 0.09, "grad_norm": 0.82421875, "learning_rate": 2e-05, "loss": 2.3159, "num_input_tokens_seen": 2816475136, "step": 1343 }, { "epoch": 0.09, "grad_norm": 0.66015625, "learning_rate": 2e-05, "loss": 2.357, "num_input_tokens_seen": 2818572288, "step": 1344 }, { "epoch": 0.09, "grad_norm": 0.7421875, "learning_rate": 2e-05, "loss": 2.352, "num_input_tokens_seen": 2820669440, "step": 1345 }, { "epoch": 0.09, "grad_norm": 0.7890625, "learning_rate": 2e-05, "loss": 2.3447, "num_input_tokens_seen": 2822766592, "step": 1346 }, { "epoch": 0.09, "grad_norm": 0.6796875, "learning_rate": 2e-05, "loss": 2.3139, "num_input_tokens_seen": 2824863744, "step": 1347 }, { "epoch": 0.09, "grad_norm": 0.7265625, "learning_rate": 2e-05, "loss": 2.3333, "num_input_tokens_seen": 2826960896, "step": 1348 }, { "epoch": 0.09, "grad_norm": 0.8984375, "learning_rate": 2e-05, "loss": 2.3452, "num_input_tokens_seen": 2829058048, "step": 1349 }, { "epoch": 0.09, "grad_norm": 0.640625, "learning_rate": 2e-05, "loss": 2.3266, "num_input_tokens_seen": 2831155200, "step": 1350 }, { "epoch": 0.09, "grad_norm": 0.82421875, "learning_rate": 2e-05, "loss": 2.3383, "num_input_tokens_seen": 2833252352, "step": 1351 }, { "epoch": 0.09, "grad_norm": 0.8828125, "learning_rate": 2e-05, "loss": 2.3234, "num_input_tokens_seen": 2835349504, "step": 1352 }, { "epoch": 0.1, "grad_norm": 0.69140625, "learning_rate": 2e-05, "loss": 2.3516, "num_input_tokens_seen": 2837446656, "step": 1353 }, { "epoch": 0.1, "grad_norm": 0.7578125, "learning_rate": 2e-05, "loss": 2.3414, "num_input_tokens_seen": 2839543808, "step": 1354 }, { "epoch": 0.1, "grad_norm": 0.83203125, "learning_rate": 2e-05, "loss": 2.3235, "num_input_tokens_seen": 2841640960, "step": 1355 }, { "epoch": 0.1, "grad_norm": 0.80859375, "learning_rate": 2e-05, "loss": 2.3593, "num_input_tokens_seen": 2843738112, "step": 1356 }, { "epoch": 0.1, "grad_norm": 0.81640625, "learning_rate": 2e-05, "loss": 2.3439, "num_input_tokens_seen": 2845835264, "step": 1357 }, { "epoch": 0.1, "grad_norm": 0.6875, "learning_rate": 2e-05, "loss": 2.3227, "num_input_tokens_seen": 2847932416, "step": 1358 }, { "epoch": 0.1, "grad_norm": 0.62109375, "learning_rate": 2e-05, "loss": 2.3124, "num_input_tokens_seen": 2850029568, "step": 1359 }, { "epoch": 0.1, "grad_norm": 0.73828125, "learning_rate": 2e-05, "loss": 2.3324, "num_input_tokens_seen": 2852126720, "step": 1360 }, { "epoch": 0.1, "grad_norm": 0.72265625, "learning_rate": 2e-05, "loss": 2.359, "num_input_tokens_seen": 2854223872, "step": 1361 }, { "epoch": 0.1, "grad_norm": 0.64453125, "learning_rate": 2e-05, "loss": 2.3182, "num_input_tokens_seen": 2856321024, "step": 1362 }, { "epoch": 0.1, "grad_norm": 0.6796875, "learning_rate": 2e-05, "loss": 2.2981, "num_input_tokens_seen": 2858418176, "step": 1363 }, { "epoch": 0.1, "grad_norm": 0.74609375, "learning_rate": 2e-05, "loss": 2.3326, "num_input_tokens_seen": 2860515328, "step": 1364 }, { "epoch": 0.1, "grad_norm": 0.6796875, "learning_rate": 2e-05, "loss": 2.3344, "num_input_tokens_seen": 2862612480, "step": 1365 }, { "epoch": 0.1, "grad_norm": 0.7109375, "learning_rate": 2e-05, "loss": 2.3564, "num_input_tokens_seen": 2864709632, "step": 1366 }, { "epoch": 0.1, "grad_norm": 0.828125, "learning_rate": 2e-05, "loss": 2.301, "num_input_tokens_seen": 2866806784, "step": 1367 }, { "epoch": 0.1, "grad_norm": 1.03125, "learning_rate": 2e-05, "loss": 2.3407, "num_input_tokens_seen": 2868903936, "step": 1368 }, { "epoch": 0.1, "grad_norm": 1.03125, "learning_rate": 2e-05, "loss": 2.3235, "num_input_tokens_seen": 2871001088, "step": 1369 }, { "epoch": 0.1, "grad_norm": 0.9921875, "learning_rate": 2e-05, "loss": 2.3369, "num_input_tokens_seen": 2873098240, "step": 1370 }, { "epoch": 0.1, "grad_norm": 0.9375, "learning_rate": 2e-05, "loss": 2.321, "num_input_tokens_seen": 2875195392, "step": 1371 }, { "epoch": 0.1, "grad_norm": 0.73046875, "learning_rate": 2e-05, "loss": 2.3392, "num_input_tokens_seen": 2877292544, "step": 1372 }, { "epoch": 0.1, "grad_norm": 0.80859375, "learning_rate": 2e-05, "loss": 2.3167, "num_input_tokens_seen": 2879389696, "step": 1373 }, { "epoch": 0.1, "grad_norm": 0.93359375, "learning_rate": 2e-05, "loss": 2.3246, "num_input_tokens_seen": 2881486848, "step": 1374 }, { "epoch": 0.1, "grad_norm": 0.80859375, "learning_rate": 2e-05, "loss": 2.3355, "num_input_tokens_seen": 2883584000, "step": 1375 }, { "epoch": 0.1, "grad_norm": 0.8046875, "learning_rate": 2e-05, "loss": 2.305, "num_input_tokens_seen": 2885681152, "step": 1376 }, { "epoch": 0.1, "grad_norm": 0.8515625, "learning_rate": 2e-05, "loss": 2.3311, "num_input_tokens_seen": 2887778304, "step": 1377 }, { "epoch": 0.1, "grad_norm": 0.78515625, "learning_rate": 2e-05, "loss": 2.3486, "num_input_tokens_seen": 2889875456, "step": 1378 }, { "epoch": 0.1, "grad_norm": 0.83203125, "learning_rate": 2e-05, "loss": 2.3106, "num_input_tokens_seen": 2891972608, "step": 1379 }, { "epoch": 0.1, "grad_norm": 0.8359375, "learning_rate": 2e-05, "loss": 2.3444, "num_input_tokens_seen": 2894069760, "step": 1380 }, { "epoch": 0.1, "grad_norm": 0.76953125, "learning_rate": 2e-05, "loss": 2.3102, "num_input_tokens_seen": 2896166912, "step": 1381 }, { "epoch": 0.1, "grad_norm": 0.8046875, "learning_rate": 2e-05, "loss": 2.3087, "num_input_tokens_seen": 2898264064, "step": 1382 }, { "epoch": 0.1, "grad_norm": 0.82421875, "learning_rate": 2e-05, "loss": 2.3304, "num_input_tokens_seen": 2900361216, "step": 1383 }, { "epoch": 0.1, "grad_norm": 0.91015625, "learning_rate": 2e-05, "loss": 2.3188, "num_input_tokens_seen": 2902458368, "step": 1384 }, { "epoch": 0.1, "grad_norm": 0.69140625, "learning_rate": 2e-05, "loss": 2.285, "num_input_tokens_seen": 2904555520, "step": 1385 }, { "epoch": 0.1, "grad_norm": 1.03125, "learning_rate": 2e-05, "loss": 2.3244, "num_input_tokens_seen": 2906652672, "step": 1386 }, { "epoch": 0.1, "grad_norm": 0.875, "learning_rate": 2e-05, "loss": 2.3279, "num_input_tokens_seen": 2908749824, "step": 1387 }, { "epoch": 0.1, "grad_norm": 0.76953125, "learning_rate": 2e-05, "loss": 2.3265, "num_input_tokens_seen": 2910846976, "step": 1388 }, { "epoch": 0.1, "grad_norm": 0.9296875, "learning_rate": 2e-05, "loss": 2.3447, "num_input_tokens_seen": 2912944128, "step": 1389 }, { "epoch": 0.1, "grad_norm": 0.8046875, "learning_rate": 2e-05, "loss": 2.335, "num_input_tokens_seen": 2915041280, "step": 1390 }, { "epoch": 0.1, "grad_norm": 0.67578125, "learning_rate": 2e-05, "loss": 2.3385, "num_input_tokens_seen": 2917138432, "step": 1391 }, { "epoch": 0.1, "grad_norm": 0.7578125, "learning_rate": 2e-05, "loss": 2.339, "num_input_tokens_seen": 2919235584, "step": 1392 }, { "epoch": 0.1, "grad_norm": 0.83984375, "learning_rate": 2e-05, "loss": 2.3238, "num_input_tokens_seen": 2921332736, "step": 1393 }, { "epoch": 0.1, "eval_loss": 2.345611572265625, "eval_runtime": 2602.686, "eval_samples_per_second": 1.515, "eval_steps_per_second": 0.379, "num_input_tokens_seen": 2921332736, "step": 1393 }, { "epoch": 0.1, "grad_norm": 0.6875, "learning_rate": 2e-05, "loss": 2.3365, "num_input_tokens_seen": 2923429888, "step": 1394 }, { "epoch": 0.1, "grad_norm": 1.0859375, "learning_rate": 2e-05, "loss": 2.308, "num_input_tokens_seen": 2925527040, "step": 1395 }, { "epoch": 0.1, "grad_norm": 0.7578125, "learning_rate": 2e-05, "loss": 2.3388, "num_input_tokens_seen": 2927624192, "step": 1396 }, { "epoch": 0.1, "grad_norm": 0.9296875, "learning_rate": 2e-05, "loss": 2.3425, "num_input_tokens_seen": 2929721344, "step": 1397 }, { "epoch": 0.1, "grad_norm": 0.96484375, "learning_rate": 2e-05, "loss": 2.3268, "num_input_tokens_seen": 2931818496, "step": 1398 }, { "epoch": 0.1, "grad_norm": 0.71484375, "learning_rate": 2e-05, "loss": 2.3037, "num_input_tokens_seen": 2933915648, "step": 1399 }, { "epoch": 0.1, "grad_norm": 0.94921875, "learning_rate": 2e-05, "loss": 2.345, "num_input_tokens_seen": 2936012800, "step": 1400 }, { "epoch": 0.11, "grad_norm": 1.0, "learning_rate": 2e-05, "loss": 2.3231, "num_input_tokens_seen": 2938109952, "step": 1401 }, { "epoch": 0.11, "grad_norm": 0.96484375, "learning_rate": 2e-05, "loss": 2.3219, "num_input_tokens_seen": 2940207104, "step": 1402 }, { "epoch": 0.11, "grad_norm": 0.8203125, "learning_rate": 2e-05, "loss": 2.3231, "num_input_tokens_seen": 2942304256, "step": 1403 }, { "epoch": 0.11, "grad_norm": 1.203125, "learning_rate": 2e-05, "loss": 2.3063, "num_input_tokens_seen": 2944401408, "step": 1404 }, { "epoch": 0.11, "grad_norm": 1.09375, "learning_rate": 2e-05, "loss": 2.3101, "num_input_tokens_seen": 2946498560, "step": 1405 }, { "epoch": 0.11, "grad_norm": 0.64453125, "learning_rate": 2e-05, "loss": 2.323, "num_input_tokens_seen": 2948595712, "step": 1406 }, { "epoch": 0.11, "grad_norm": 1.21875, "learning_rate": 2e-05, "loss": 2.321, "num_input_tokens_seen": 2950692864, "step": 1407 }, { "epoch": 0.11, "grad_norm": 1.0, "learning_rate": 2e-05, "loss": 2.3055, "num_input_tokens_seen": 2952790016, "step": 1408 }, { "epoch": 0.11, "grad_norm": 0.890625, "learning_rate": 2e-05, "loss": 2.3156, "num_input_tokens_seen": 2954887168, "step": 1409 }, { "epoch": 0.11, "grad_norm": 0.85546875, "learning_rate": 2e-05, "loss": 2.3065, "num_input_tokens_seen": 2956984320, "step": 1410 }, { "epoch": 0.11, "grad_norm": 1.15625, "learning_rate": 2e-05, "loss": 2.3132, "num_input_tokens_seen": 2959081472, "step": 1411 }, { "epoch": 0.11, "grad_norm": 0.84375, "learning_rate": 2e-05, "loss": 2.2997, "num_input_tokens_seen": 2961178624, "step": 1412 }, { "epoch": 0.11, "grad_norm": 0.92578125, "learning_rate": 2e-05, "loss": 2.3062, "num_input_tokens_seen": 2963275776, "step": 1413 }, { "epoch": 0.11, "grad_norm": 0.83984375, "learning_rate": 2e-05, "loss": 2.3117, "num_input_tokens_seen": 2965372928, "step": 1414 }, { "epoch": 0.11, "grad_norm": 0.69921875, "learning_rate": 2e-05, "loss": 2.2983, "num_input_tokens_seen": 2967470080, "step": 1415 }, { "epoch": 0.11, "grad_norm": 0.86328125, "learning_rate": 2e-05, "loss": 2.3026, "num_input_tokens_seen": 2969567232, "step": 1416 }, { "epoch": 0.11, "grad_norm": 0.78515625, "learning_rate": 2e-05, "loss": 2.3114, "num_input_tokens_seen": 2971664384, "step": 1417 }, { "epoch": 0.11, "grad_norm": 1.7734375, "learning_rate": 2e-05, "loss": 2.2923, "num_input_tokens_seen": 2973761536, "step": 1418 }, { "epoch": 0.11, "grad_norm": 1.0390625, "learning_rate": 2e-05, "loss": 2.3226, "num_input_tokens_seen": 2975858688, "step": 1419 }, { "epoch": 0.11, "grad_norm": 0.796875, "learning_rate": 2e-05, "loss": 2.308, "num_input_tokens_seen": 2977955840, "step": 1420 }, { "epoch": 0.11, "grad_norm": 0.953125, "learning_rate": 2e-05, "loss": 2.3125, "num_input_tokens_seen": 2980052992, "step": 1421 }, { "epoch": 0.11, "grad_norm": 0.7890625, "learning_rate": 2e-05, "loss": 2.3037, "num_input_tokens_seen": 2982150144, "step": 1422 }, { "epoch": 0.11, "grad_norm": 1.0, "learning_rate": 2e-05, "loss": 2.2974, "num_input_tokens_seen": 2984247296, "step": 1423 }, { "epoch": 0.11, "grad_norm": 0.82421875, "learning_rate": 2e-05, "loss": 2.2781, "num_input_tokens_seen": 2986344448, "step": 1424 }, { "epoch": 0.11, "grad_norm": 0.8359375, "learning_rate": 2e-05, "loss": 2.3029, "num_input_tokens_seen": 2988441600, "step": 1425 }, { "epoch": 0.11, "grad_norm": 0.76953125, "learning_rate": 2e-05, "loss": 2.2919, "num_input_tokens_seen": 2990538752, "step": 1426 }, { "epoch": 0.11, "grad_norm": 0.96875, "learning_rate": 2e-05, "loss": 2.3393, "num_input_tokens_seen": 2992635904, "step": 1427 }, { "epoch": 0.11, "grad_norm": 1.0078125, "learning_rate": 2e-05, "loss": 2.3319, "num_input_tokens_seen": 2994733056, "step": 1428 }, { "epoch": 0.11, "grad_norm": 0.765625, "learning_rate": 2e-05, "loss": 2.3246, "num_input_tokens_seen": 2996830208, "step": 1429 }, { "epoch": 0.11, "grad_norm": 0.99609375, "learning_rate": 2e-05, "loss": 2.3309, "num_input_tokens_seen": 2998927360, "step": 1430 }, { "epoch": 0.11, "grad_norm": 0.96875, "learning_rate": 2e-05, "loss": 2.3551, "num_input_tokens_seen": 3001024512, "step": 1431 }, { "epoch": 0.11, "grad_norm": 0.703125, "learning_rate": 2e-05, "loss": 2.3123, "num_input_tokens_seen": 3003121664, "step": 1432 }, { "epoch": 0.11, "grad_norm": 1.0234375, "learning_rate": 2e-05, "loss": 2.2897, "num_input_tokens_seen": 3005218816, "step": 1433 }, { "epoch": 0.11, "grad_norm": 0.78515625, "learning_rate": 2e-05, "loss": 2.3345, "num_input_tokens_seen": 3007315968, "step": 1434 }, { "epoch": 0.11, "grad_norm": 0.73828125, "learning_rate": 2e-05, "loss": 2.2838, "num_input_tokens_seen": 3009413120, "step": 1435 }, { "epoch": 0.11, "grad_norm": 1.0390625, "learning_rate": 2e-05, "loss": 2.3246, "num_input_tokens_seen": 3011510272, "step": 1436 }, { "epoch": 0.11, "grad_norm": 0.8203125, "learning_rate": 2e-05, "loss": 2.3546, "num_input_tokens_seen": 3013607424, "step": 1437 }, { "epoch": 0.11, "grad_norm": 0.91796875, "learning_rate": 2e-05, "loss": 2.3171, "num_input_tokens_seen": 3015704576, "step": 1438 }, { "epoch": 0.11, "grad_norm": 0.921875, "learning_rate": 2e-05, "loss": 2.3081, "num_input_tokens_seen": 3017801728, "step": 1439 }, { "epoch": 0.11, "grad_norm": 0.6875, "learning_rate": 2e-05, "loss": 2.2926, "num_input_tokens_seen": 3019898880, "step": 1440 }, { "epoch": 0.11, "grad_norm": 0.78515625, "learning_rate": 2e-05, "loss": 2.3037, "num_input_tokens_seen": 3021996032, "step": 1441 }, { "epoch": 0.11, "grad_norm": 0.796875, "learning_rate": 2e-05, "loss": 2.2944, "num_input_tokens_seen": 3024093184, "step": 1442 }, { "epoch": 0.11, "grad_norm": 0.83203125, "learning_rate": 2e-05, "loss": 2.2713, "num_input_tokens_seen": 3026190336, "step": 1443 }, { "epoch": 0.11, "grad_norm": 0.79296875, "learning_rate": 2e-05, "loss": 2.3103, "num_input_tokens_seen": 3028287488, "step": 1444 }, { "epoch": 0.11, "grad_norm": 1.0234375, "learning_rate": 2e-05, "loss": 2.3076, "num_input_tokens_seen": 3030384640, "step": 1445 }, { "epoch": 0.11, "grad_norm": 0.7890625, "learning_rate": 2e-05, "loss": 2.3239, "num_input_tokens_seen": 3032481792, "step": 1446 }, { "epoch": 0.11, "grad_norm": 0.8046875, "learning_rate": 2e-05, "loss": 2.3302, "num_input_tokens_seen": 3034578944, "step": 1447 }, { "epoch": 0.11, "grad_norm": 0.96484375, "learning_rate": 2e-05, "loss": 2.3397, "num_input_tokens_seen": 3036676096, "step": 1448 }, { "epoch": 0.12, "grad_norm": 0.85546875, "learning_rate": 2e-05, "loss": 2.3288, "num_input_tokens_seen": 3038773248, "step": 1449 }, { "epoch": 0.12, "grad_norm": 0.90234375, "learning_rate": 2e-05, "loss": 2.314, "num_input_tokens_seen": 3040870400, "step": 1450 }, { "epoch": 0.12, "grad_norm": 1.1328125, "learning_rate": 2e-05, "loss": 2.3288, "num_input_tokens_seen": 3042967552, "step": 1451 }, { "epoch": 0.12, "grad_norm": 1.2265625, "learning_rate": 2e-05, "loss": 2.2871, "num_input_tokens_seen": 3045064704, "step": 1452 }, { "epoch": 0.12, "grad_norm": 0.8515625, "learning_rate": 2e-05, "loss": 2.3282, "num_input_tokens_seen": 3047161856, "step": 1453 }, { "epoch": 0.12, "grad_norm": 1.0546875, "learning_rate": 2e-05, "loss": 2.2942, "num_input_tokens_seen": 3049259008, "step": 1454 }, { "epoch": 0.12, "grad_norm": 0.93359375, "learning_rate": 2e-05, "loss": 2.2913, "num_input_tokens_seen": 3051356160, "step": 1455 }, { "epoch": 0.12, "grad_norm": 0.8359375, "learning_rate": 2e-05, "loss": 2.3423, "num_input_tokens_seen": 3053453312, "step": 1456 }, { "epoch": 0.12, "grad_norm": 0.90234375, "learning_rate": 2e-05, "loss": 2.3195, "num_input_tokens_seen": 3055550464, "step": 1457 }, { "epoch": 0.12, "grad_norm": 0.94921875, "learning_rate": 2e-05, "loss": 2.3354, "num_input_tokens_seen": 3057647616, "step": 1458 }, { "epoch": 0.12, "grad_norm": 0.87109375, "learning_rate": 2e-05, "loss": 2.2998, "num_input_tokens_seen": 3059744768, "step": 1459 }, { "epoch": 0.12, "grad_norm": 0.765625, "learning_rate": 2e-05, "loss": 2.2993, "num_input_tokens_seen": 3061841920, "step": 1460 }, { "epoch": 0.12, "grad_norm": 0.95703125, "learning_rate": 2e-05, "loss": 2.2753, "num_input_tokens_seen": 3063939072, "step": 1461 }, { "epoch": 0.12, "grad_norm": 0.87890625, "learning_rate": 2e-05, "loss": 2.2919, "num_input_tokens_seen": 3066036224, "step": 1462 }, { "epoch": 0.12, "grad_norm": 0.73046875, "learning_rate": 2e-05, "loss": 2.3217, "num_input_tokens_seen": 3068133376, "step": 1463 }, { "epoch": 0.12, "grad_norm": 0.93359375, "learning_rate": 2e-05, "loss": 2.3034, "num_input_tokens_seen": 3070230528, "step": 1464 }, { "epoch": 0.12, "grad_norm": 0.96875, "learning_rate": 2e-05, "loss": 2.307, "num_input_tokens_seen": 3072327680, "step": 1465 }, { "epoch": 0.12, "grad_norm": 0.83203125, "learning_rate": 2e-05, "loss": 2.3311, "num_input_tokens_seen": 3074424832, "step": 1466 }, { "epoch": 0.12, "grad_norm": 0.78125, "learning_rate": 2e-05, "loss": 2.3075, "num_input_tokens_seen": 3076521984, "step": 1467 }, { "epoch": 0.12, "grad_norm": 0.96484375, "learning_rate": 2e-05, "loss": 2.3534, "num_input_tokens_seen": 3078619136, "step": 1468 }, { "epoch": 0.12, "grad_norm": 0.8515625, "learning_rate": 2e-05, "loss": 2.2958, "num_input_tokens_seen": 3080716288, "step": 1469 }, { "epoch": 0.12, "grad_norm": 0.90234375, "learning_rate": 2e-05, "loss": 2.2973, "num_input_tokens_seen": 3082813440, "step": 1470 }, { "epoch": 0.12, "grad_norm": 0.69921875, "learning_rate": 2e-05, "loss": 2.3137, "num_input_tokens_seen": 3084910592, "step": 1471 }, { "epoch": 0.12, "grad_norm": 0.94140625, "learning_rate": 2e-05, "loss": 2.3125, "num_input_tokens_seen": 3087007744, "step": 1472 }, { "epoch": 0.12, "grad_norm": 0.8046875, "learning_rate": 2e-05, "loss": 2.2951, "num_input_tokens_seen": 3089104896, "step": 1473 }, { "epoch": 0.12, "grad_norm": 0.875, "learning_rate": 2e-05, "loss": 2.3418, "num_input_tokens_seen": 3091202048, "step": 1474 }, { "epoch": 0.12, "grad_norm": 0.87890625, "learning_rate": 2e-05, "loss": 2.3099, "num_input_tokens_seen": 3093299200, "step": 1475 }, { "epoch": 0.12, "grad_norm": 0.8359375, "learning_rate": 2e-05, "loss": 2.2985, "num_input_tokens_seen": 3095396352, "step": 1476 }, { "epoch": 0.12, "grad_norm": 0.765625, "learning_rate": 2e-05, "loss": 2.3031, "num_input_tokens_seen": 3097493504, "step": 1477 }, { "epoch": 0.12, "grad_norm": 0.92578125, "learning_rate": 2e-05, "loss": 2.2491, "num_input_tokens_seen": 3099590656, "step": 1478 }, { "epoch": 0.12, "grad_norm": 0.8828125, "learning_rate": 2e-05, "loss": 2.3481, "num_input_tokens_seen": 3101687808, "step": 1479 }, { "epoch": 0.12, "grad_norm": 0.8671875, "learning_rate": 2e-05, "loss": 2.2968, "num_input_tokens_seen": 3103784960, "step": 1480 }, { "epoch": 0.12, "grad_norm": 0.984375, "learning_rate": 2e-05, "loss": 2.3091, "num_input_tokens_seen": 3105882112, "step": 1481 }, { "epoch": 0.12, "grad_norm": 0.8828125, "learning_rate": 2e-05, "loss": 2.2881, "num_input_tokens_seen": 3107979264, "step": 1482 }, { "epoch": 0.12, "grad_norm": 0.83203125, "learning_rate": 2e-05, "loss": 2.3319, "num_input_tokens_seen": 3110076416, "step": 1483 }, { "epoch": 0.12, "grad_norm": 0.8828125, "learning_rate": 2e-05, "loss": 2.3146, "num_input_tokens_seen": 3112173568, "step": 1484 }, { "epoch": 0.12, "grad_norm": 0.85546875, "learning_rate": 2e-05, "loss": 2.317, "num_input_tokens_seen": 3114270720, "step": 1485 }, { "epoch": 0.12, "grad_norm": 0.78125, "learning_rate": 2e-05, "loss": 2.3253, "num_input_tokens_seen": 3116367872, "step": 1486 }, { "epoch": 0.12, "grad_norm": 0.7265625, "learning_rate": 2e-05, "loss": 2.2923, "num_input_tokens_seen": 3118465024, "step": 1487 }, { "epoch": 0.12, "grad_norm": 0.76171875, "learning_rate": 2e-05, "loss": 2.3056, "num_input_tokens_seen": 3120562176, "step": 1488 }, { "epoch": 0.12, "grad_norm": 0.76171875, "learning_rate": 2e-05, "loss": 2.3201, "num_input_tokens_seen": 3122659328, "step": 1489 }, { "epoch": 0.12, "grad_norm": 0.76953125, "learning_rate": 2e-05, "loss": 2.3172, "num_input_tokens_seen": 3124756480, "step": 1490 }, { "epoch": 0.12, "grad_norm": 0.69921875, "learning_rate": 2e-05, "loss": 2.3064, "num_input_tokens_seen": 3126853632, "step": 1491 }, { "epoch": 0.12, "grad_norm": 0.8046875, "learning_rate": 2e-05, "loss": 2.3084, "num_input_tokens_seen": 3128950784, "step": 1492 }, { "epoch": 0.12, "grad_norm": 0.8125, "learning_rate": 2e-05, "loss": 2.3146, "num_input_tokens_seen": 3131047936, "step": 1493 }, { "epoch": 0.12, "grad_norm": 0.7421875, "learning_rate": 2e-05, "loss": 2.2993, "num_input_tokens_seen": 3133145088, "step": 1494 }, { "epoch": 0.12, "grad_norm": 0.765625, "learning_rate": 2e-05, "loss": 2.2999, "num_input_tokens_seen": 3135242240, "step": 1495 }, { "epoch": 0.12, "grad_norm": 1.140625, "learning_rate": 2e-05, "loss": 2.3379, "num_input_tokens_seen": 3137339392, "step": 1496 }, { "epoch": 0.13, "grad_norm": 0.75, "learning_rate": 2e-05, "loss": 2.3023, "num_input_tokens_seen": 3139436544, "step": 1497 }, { "epoch": 0.13, "grad_norm": 0.71875, "learning_rate": 2e-05, "loss": 2.3057, "num_input_tokens_seen": 3141533696, "step": 1498 }, { "epoch": 0.13, "grad_norm": 0.87109375, "learning_rate": 2e-05, "loss": 2.2778, "num_input_tokens_seen": 3143630848, "step": 1499 }, { "epoch": 0.13, "grad_norm": 0.6875, "learning_rate": 2e-05, "loss": 2.3014, "num_input_tokens_seen": 3145728000, "step": 1500 }, { "epoch": 0.13, "grad_norm": 0.78125, "learning_rate": 2e-05, "loss": 2.3445, "num_input_tokens_seen": 3147825152, "step": 1501 }, { "epoch": 0.13, "grad_norm": 0.70703125, "learning_rate": 2e-05, "loss": 2.3035, "num_input_tokens_seen": 3149922304, "step": 1502 }, { "epoch": 0.13, "grad_norm": 0.75, "learning_rate": 2e-05, "loss": 2.3127, "num_input_tokens_seen": 3152019456, "step": 1503 }, { "epoch": 0.13, "grad_norm": 0.70703125, "learning_rate": 2e-05, "loss": 2.3074, "num_input_tokens_seen": 3154116608, "step": 1504 }, { "epoch": 0.13, "grad_norm": 0.72265625, "learning_rate": 2e-05, "loss": 2.3211, "num_input_tokens_seen": 3156213760, "step": 1505 }, { "epoch": 0.13, "grad_norm": 0.82421875, "learning_rate": 2e-05, "loss": 2.3333, "num_input_tokens_seen": 3158310912, "step": 1506 }, { "epoch": 0.13, "grad_norm": 0.71484375, "learning_rate": 2e-05, "loss": 2.347, "num_input_tokens_seen": 3160408064, "step": 1507 }, { "epoch": 0.13, "grad_norm": 0.81640625, "learning_rate": 2e-05, "loss": 2.3159, "num_input_tokens_seen": 3162505216, "step": 1508 }, { "epoch": 0.13, "grad_norm": 0.66796875, "learning_rate": 2e-05, "loss": 2.3148, "num_input_tokens_seen": 3164602368, "step": 1509 }, { "epoch": 0.13, "grad_norm": 0.875, "learning_rate": 2e-05, "loss": 2.3245, "num_input_tokens_seen": 3166699520, "step": 1510 }, { "epoch": 0.13, "grad_norm": 0.71484375, "learning_rate": 2e-05, "loss": 2.2918, "num_input_tokens_seen": 3168796672, "step": 1511 }, { "epoch": 0.13, "grad_norm": 0.67578125, "learning_rate": 2e-05, "loss": 2.3361, "num_input_tokens_seen": 3170893824, "step": 1512 }, { "epoch": 0.13, "grad_norm": 0.7421875, "learning_rate": 2e-05, "loss": 2.2995, "num_input_tokens_seen": 3172990976, "step": 1513 }, { "epoch": 0.13, "grad_norm": 0.671875, "learning_rate": 2e-05, "loss": 2.3034, "num_input_tokens_seen": 3175088128, "step": 1514 }, { "epoch": 0.13, "grad_norm": 0.64453125, "learning_rate": 2e-05, "loss": 2.2793, "num_input_tokens_seen": 3177185280, "step": 1515 }, { "epoch": 0.13, "grad_norm": 0.73046875, "learning_rate": 2e-05, "loss": 2.2936, "num_input_tokens_seen": 3179282432, "step": 1516 }, { "epoch": 0.13, "grad_norm": 0.6875, "learning_rate": 2e-05, "loss": 2.2948, "num_input_tokens_seen": 3181379584, "step": 1517 }, { "epoch": 0.13, "grad_norm": 0.8046875, "learning_rate": 2e-05, "loss": 2.2864, "num_input_tokens_seen": 3183476736, "step": 1518 }, { "epoch": 0.13, "grad_norm": 0.67578125, "learning_rate": 2e-05, "loss": 2.2957, "num_input_tokens_seen": 3185573888, "step": 1519 }, { "epoch": 0.13, "grad_norm": 0.65625, "learning_rate": 2e-05, "loss": 2.2876, "num_input_tokens_seen": 3187671040, "step": 1520 }, { "epoch": 0.13, "grad_norm": 0.9140625, "learning_rate": 2e-05, "loss": 2.3028, "num_input_tokens_seen": 3189768192, "step": 1521 }, { "epoch": 0.13, "grad_norm": 0.7734375, "learning_rate": 2e-05, "loss": 2.3318, "num_input_tokens_seen": 3191865344, "step": 1522 }, { "epoch": 0.13, "grad_norm": 0.671875, "learning_rate": 2e-05, "loss": 2.305, "num_input_tokens_seen": 3193962496, "step": 1523 }, { "epoch": 0.13, "grad_norm": 0.78125, "learning_rate": 2e-05, "loss": 2.3164, "num_input_tokens_seen": 3196059648, "step": 1524 }, { "epoch": 0.13, "grad_norm": 0.62109375, "learning_rate": 2e-05, "loss": 2.3211, "num_input_tokens_seen": 3198156800, "step": 1525 }, { "epoch": 0.13, "grad_norm": 0.75390625, "learning_rate": 2e-05, "loss": 2.3152, "num_input_tokens_seen": 3200253952, "step": 1526 }, { "epoch": 0.13, "grad_norm": 0.68359375, "learning_rate": 2e-05, "loss": 2.3243, "num_input_tokens_seen": 3202351104, "step": 1527 }, { "epoch": 0.13, "grad_norm": 0.6875, "learning_rate": 2e-05, "loss": 2.3001, "num_input_tokens_seen": 3204448256, "step": 1528 }, { "epoch": 0.13, "grad_norm": 0.7265625, "learning_rate": 2e-05, "loss": 2.292, "num_input_tokens_seen": 3206545408, "step": 1529 }, { "epoch": 0.13, "grad_norm": 0.6953125, "learning_rate": 2e-05, "loss": 2.3027, "num_input_tokens_seen": 3208642560, "step": 1530 }, { "epoch": 0.13, "grad_norm": 0.953125, "learning_rate": 2e-05, "loss": 2.2806, "num_input_tokens_seen": 3210739712, "step": 1531 }, { "epoch": 0.13, "grad_norm": 0.66796875, "learning_rate": 2e-05, "loss": 2.3103, "num_input_tokens_seen": 3212836864, "step": 1532 }, { "epoch": 0.13, "grad_norm": 0.77734375, "learning_rate": 2e-05, "loss": 2.3184, "num_input_tokens_seen": 3214934016, "step": 1533 }, { "epoch": 0.13, "grad_norm": 0.90234375, "learning_rate": 2e-05, "loss": 2.2944, "num_input_tokens_seen": 3217031168, "step": 1534 }, { "epoch": 0.13, "grad_norm": 0.6875, "learning_rate": 2e-05, "loss": 2.3094, "num_input_tokens_seen": 3219128320, "step": 1535 }, { "epoch": 0.13, "grad_norm": 0.90234375, "learning_rate": 2e-05, "loss": 2.2967, "num_input_tokens_seen": 3221225472, "step": 1536 }, { "epoch": 0.13, "grad_norm": 0.80078125, "learning_rate": 2e-05, "loss": 2.3103, "num_input_tokens_seen": 3223322624, "step": 1537 }, { "epoch": 0.13, "grad_norm": 0.76171875, "learning_rate": 2e-05, "loss": 2.2884, "num_input_tokens_seen": 3225419776, "step": 1538 }, { "epoch": 0.13, "grad_norm": 0.74609375, "learning_rate": 2e-05, "loss": 2.2545, "num_input_tokens_seen": 3227516928, "step": 1539 }, { "epoch": 0.13, "grad_norm": 0.83203125, "learning_rate": 2e-05, "loss": 2.3273, "num_input_tokens_seen": 3229614080, "step": 1540 }, { "epoch": 0.13, "grad_norm": 0.6875, "learning_rate": 2e-05, "loss": 2.302, "num_input_tokens_seen": 3231711232, "step": 1541 }, { "epoch": 0.13, "grad_norm": 0.82421875, "learning_rate": 2e-05, "loss": 2.2974, "num_input_tokens_seen": 3233808384, "step": 1542 }, { "epoch": 0.13, "grad_norm": 0.78515625, "learning_rate": 2e-05, "loss": 2.2889, "num_input_tokens_seen": 3235905536, "step": 1543 }, { "epoch": 0.14, "grad_norm": 0.74609375, "learning_rate": 2e-05, "loss": 2.2623, "num_input_tokens_seen": 3238002688, "step": 1544 }, { "epoch": 0.14, "grad_norm": 0.66015625, "learning_rate": 2e-05, "loss": 2.2779, "num_input_tokens_seen": 3240099840, "step": 1545 }, { "epoch": 0.14, "grad_norm": 0.82421875, "learning_rate": 2e-05, "loss": 2.3033, "num_input_tokens_seen": 3242196992, "step": 1546 }, { "epoch": 0.14, "grad_norm": 0.69140625, "learning_rate": 2e-05, "loss": 2.33, "num_input_tokens_seen": 3244294144, "step": 1547 }, { "epoch": 0.14, "grad_norm": 0.6640625, "learning_rate": 2e-05, "loss": 2.2953, "num_input_tokens_seen": 3246391296, "step": 1548 }, { "epoch": 0.14, "grad_norm": 0.68359375, "learning_rate": 2e-05, "loss": 2.3035, "num_input_tokens_seen": 3248488448, "step": 1549 }, { "epoch": 0.14, "grad_norm": 0.7421875, "learning_rate": 2e-05, "loss": 2.2579, "num_input_tokens_seen": 3250585600, "step": 1550 }, { "epoch": 0.14, "grad_norm": 0.73046875, "learning_rate": 2e-05, "loss": 2.3027, "num_input_tokens_seen": 3252682752, "step": 1551 }, { "epoch": 0.14, "grad_norm": 0.7265625, "learning_rate": 2e-05, "loss": 2.2756, "num_input_tokens_seen": 3254779904, "step": 1552 }, { "epoch": 0.14, "grad_norm": 0.796875, "learning_rate": 2e-05, "loss": 2.2685, "num_input_tokens_seen": 3256877056, "step": 1553 }, { "epoch": 0.14, "grad_norm": 0.6015625, "learning_rate": 2e-05, "loss": 2.2914, "num_input_tokens_seen": 3258974208, "step": 1554 }, { "epoch": 0.14, "grad_norm": 0.80078125, "learning_rate": 2e-05, "loss": 2.296, "num_input_tokens_seen": 3261071360, "step": 1555 }, { "epoch": 0.14, "grad_norm": 0.85546875, "learning_rate": 2e-05, "loss": 2.2912, "num_input_tokens_seen": 3263168512, "step": 1556 }, { "epoch": 0.14, "grad_norm": 0.61328125, "learning_rate": 2e-05, "loss": 2.3095, "num_input_tokens_seen": 3265265664, "step": 1557 }, { "epoch": 0.14, "grad_norm": 0.95703125, "learning_rate": 2e-05, "loss": 2.3127, "num_input_tokens_seen": 3267362816, "step": 1558 }, { "epoch": 0.14, "grad_norm": 0.81640625, "learning_rate": 2e-05, "loss": 2.2919, "num_input_tokens_seen": 3269459968, "step": 1559 }, { "epoch": 0.14, "grad_norm": 0.63671875, "learning_rate": 2e-05, "loss": 2.2808, "num_input_tokens_seen": 3271557120, "step": 1560 }, { "epoch": 0.14, "grad_norm": 0.9765625, "learning_rate": 2e-05, "loss": 2.2931, "num_input_tokens_seen": 3273654272, "step": 1561 }, { "epoch": 0.14, "grad_norm": 0.79296875, "learning_rate": 2e-05, "loss": 2.2928, "num_input_tokens_seen": 3275751424, "step": 1562 }, { "epoch": 0.14, "grad_norm": 0.80859375, "learning_rate": 2e-05, "loss": 2.2614, "num_input_tokens_seen": 3277848576, "step": 1563 }, { "epoch": 0.14, "grad_norm": 1.0390625, "learning_rate": 2e-05, "loss": 2.2983, "num_input_tokens_seen": 3279945728, "step": 1564 }, { "epoch": 0.14, "grad_norm": 0.71875, "learning_rate": 2e-05, "loss": 2.2946, "num_input_tokens_seen": 3282042880, "step": 1565 }, { "epoch": 0.14, "grad_norm": 1.1484375, "learning_rate": 2e-05, "loss": 2.3209, "num_input_tokens_seen": 3284140032, "step": 1566 }, { "epoch": 0.14, "grad_norm": 0.828125, "learning_rate": 2e-05, "loss": 2.291, "num_input_tokens_seen": 3286237184, "step": 1567 }, { "epoch": 0.14, "grad_norm": 0.8203125, "learning_rate": 2e-05, "loss": 2.3051, "num_input_tokens_seen": 3288334336, "step": 1568 }, { "epoch": 0.14, "grad_norm": 0.70703125, "learning_rate": 2e-05, "loss": 2.2759, "num_input_tokens_seen": 3290431488, "step": 1569 }, { "epoch": 0.14, "grad_norm": 0.984375, "learning_rate": 2e-05, "loss": 2.3034, "num_input_tokens_seen": 3292528640, "step": 1570 }, { "epoch": 0.14, "grad_norm": 0.85546875, "learning_rate": 2e-05, "loss": 2.2953, "num_input_tokens_seen": 3294625792, "step": 1571 }, { "epoch": 0.14, "grad_norm": 0.8828125, "learning_rate": 2e-05, "loss": 2.2751, "num_input_tokens_seen": 3296722944, "step": 1572 }, { "epoch": 0.14, "grad_norm": 0.85546875, "learning_rate": 2e-05, "loss": 2.3009, "num_input_tokens_seen": 3298820096, "step": 1573 }, { "epoch": 0.14, "grad_norm": 0.84765625, "learning_rate": 2e-05, "loss": 2.2618, "num_input_tokens_seen": 3300917248, "step": 1574 }, { "epoch": 0.14, "grad_norm": 0.890625, "learning_rate": 2e-05, "loss": 2.3361, "num_input_tokens_seen": 3303014400, "step": 1575 }, { "epoch": 0.14, "grad_norm": 0.8359375, "learning_rate": 2e-05, "loss": 2.3172, "num_input_tokens_seen": 3305111552, "step": 1576 }, { "epoch": 0.14, "grad_norm": 0.78125, "learning_rate": 2e-05, "loss": 2.2948, "num_input_tokens_seen": 3307208704, "step": 1577 }, { "epoch": 0.14, "grad_norm": 0.85546875, "learning_rate": 2e-05, "loss": 2.3025, "num_input_tokens_seen": 3309305856, "step": 1578 }, { "epoch": 0.14, "grad_norm": 0.921875, "learning_rate": 2e-05, "loss": 2.3234, "num_input_tokens_seen": 3311403008, "step": 1579 }, { "epoch": 0.14, "grad_norm": 0.671875, "learning_rate": 2e-05, "loss": 2.3175, "num_input_tokens_seen": 3313500160, "step": 1580 }, { "epoch": 0.14, "grad_norm": 0.7421875, "learning_rate": 2e-05, "loss": 2.291, "num_input_tokens_seen": 3315597312, "step": 1581 }, { "epoch": 0.14, "grad_norm": 0.8125, "learning_rate": 2e-05, "loss": 2.3051, "num_input_tokens_seen": 3317694464, "step": 1582 }, { "epoch": 0.14, "grad_norm": 0.8828125, "learning_rate": 2e-05, "loss": 2.3157, "num_input_tokens_seen": 3319791616, "step": 1583 }, { "epoch": 0.14, "grad_norm": 0.78515625, "learning_rate": 2e-05, "loss": 2.3184, "num_input_tokens_seen": 3321888768, "step": 1584 }, { "epoch": 0.14, "grad_norm": 0.78125, "learning_rate": 2e-05, "loss": 2.2849, "num_input_tokens_seen": 3323985920, "step": 1585 }, { "epoch": 0.14, "grad_norm": 0.78125, "learning_rate": 2e-05, "loss": 2.3032, "num_input_tokens_seen": 3326083072, "step": 1586 }, { "epoch": 0.14, "grad_norm": 0.86328125, "learning_rate": 2e-05, "loss": 2.3154, "num_input_tokens_seen": 3328180224, "step": 1587 }, { "epoch": 0.14, "grad_norm": 0.79296875, "learning_rate": 2e-05, "loss": 2.3037, "num_input_tokens_seen": 3330277376, "step": 1588 }, { "epoch": 0.14, "grad_norm": 0.71484375, "learning_rate": 2e-05, "loss": 2.2993, "num_input_tokens_seen": 3332374528, "step": 1589 }, { "epoch": 0.14, "grad_norm": 0.99609375, "learning_rate": 2e-05, "loss": 2.3348, "num_input_tokens_seen": 3334471680, "step": 1590 }, { "epoch": 0.14, "grad_norm": 0.953125, "learning_rate": 2e-05, "loss": 2.3247, "num_input_tokens_seen": 3336568832, "step": 1591 }, { "epoch": 0.15, "grad_norm": 0.72265625, "learning_rate": 2e-05, "loss": 2.2836, "num_input_tokens_seen": 3338665984, "step": 1592 }, { "epoch": 0.15, "eval_loss": 2.326249837875366, "eval_runtime": 3014.9343, "eval_samples_per_second": 1.307, "eval_steps_per_second": 0.327, "num_input_tokens_seen": 3338665984, "step": 1592 }, { "epoch": 0.15, "grad_norm": 0.984375, "learning_rate": 2e-05, "loss": 2.2936, "num_input_tokens_seen": 3340763136, "step": 1593 }, { "epoch": 0.15, "grad_norm": 1.03125, "learning_rate": 2e-05, "loss": 2.2818, "num_input_tokens_seen": 3342860288, "step": 1594 }, { "epoch": 0.15, "grad_norm": 0.75390625, "learning_rate": 2e-05, "loss": 2.2946, "num_input_tokens_seen": 3344957440, "step": 1595 }, { "epoch": 0.15, "grad_norm": 1.078125, "learning_rate": 2e-05, "loss": 2.2852, "num_input_tokens_seen": 3347054592, "step": 1596 }, { "epoch": 0.15, "grad_norm": 1.203125, "learning_rate": 2e-05, "loss": 2.2745, "num_input_tokens_seen": 3349151744, "step": 1597 }, { "epoch": 0.15, "grad_norm": 0.84765625, "learning_rate": 2e-05, "loss": 2.2982, "num_input_tokens_seen": 3351248896, "step": 1598 }, { "epoch": 0.15, "grad_norm": 1.015625, "learning_rate": 2e-05, "loss": 2.3064, "num_input_tokens_seen": 3353346048, "step": 1599 }, { "epoch": 0.15, "grad_norm": 0.94921875, "learning_rate": 2e-05, "loss": 2.3027, "num_input_tokens_seen": 3355443200, "step": 1600 }, { "epoch": 0.15, "grad_norm": 0.9140625, "learning_rate": 2e-05, "loss": 2.2891, "num_input_tokens_seen": 3357540352, "step": 1601 }, { "epoch": 0.15, "grad_norm": 0.94140625, "learning_rate": 2e-05, "loss": 2.2901, "num_input_tokens_seen": 3359637504, "step": 1602 }, { "epoch": 0.15, "grad_norm": 0.9140625, "learning_rate": 2e-05, "loss": 2.3202, "num_input_tokens_seen": 3361734656, "step": 1603 }, { "epoch": 0.15, "grad_norm": 1.0625, "learning_rate": 2e-05, "loss": 2.2695, "num_input_tokens_seen": 3363831808, "step": 1604 }, { "epoch": 0.15, "grad_norm": 0.88671875, "learning_rate": 2e-05, "loss": 2.2895, "num_input_tokens_seen": 3365928960, "step": 1605 }, { "epoch": 0.15, "grad_norm": 0.8359375, "learning_rate": 2e-05, "loss": 2.2827, "num_input_tokens_seen": 3368026112, "step": 1606 }, { "epoch": 0.15, "grad_norm": 1.09375, "learning_rate": 2e-05, "loss": 2.2657, "num_input_tokens_seen": 3370123264, "step": 1607 }, { "epoch": 0.15, "grad_norm": 0.734375, "learning_rate": 2e-05, "loss": 2.2842, "num_input_tokens_seen": 3372220416, "step": 1608 }, { "epoch": 0.15, "grad_norm": 1.0234375, "learning_rate": 2e-05, "loss": 2.257, "num_input_tokens_seen": 3374317568, "step": 1609 }, { "epoch": 0.15, "grad_norm": 0.9453125, "learning_rate": 2e-05, "loss": 2.2934, "num_input_tokens_seen": 3376414720, "step": 1610 }, { "epoch": 0.15, "grad_norm": 0.88671875, "learning_rate": 2e-05, "loss": 2.2541, "num_input_tokens_seen": 3378511872, "step": 1611 }, { "epoch": 0.15, "grad_norm": 0.7578125, "learning_rate": 2e-05, "loss": 2.297, "num_input_tokens_seen": 3380609024, "step": 1612 }, { "epoch": 0.15, "grad_norm": 0.96875, "learning_rate": 2e-05, "loss": 2.2778, "num_input_tokens_seen": 3382706176, "step": 1613 }, { "epoch": 0.15, "grad_norm": 1.0390625, "learning_rate": 2e-05, "loss": 2.3148, "num_input_tokens_seen": 3384803328, "step": 1614 }, { "epoch": 0.15, "grad_norm": 0.95703125, "learning_rate": 2e-05, "loss": 2.2765, "num_input_tokens_seen": 3386900480, "step": 1615 }, { "epoch": 0.15, "grad_norm": 0.81640625, "learning_rate": 2e-05, "loss": 2.2946, "num_input_tokens_seen": 3388997632, "step": 1616 }, { "epoch": 0.15, "grad_norm": 0.71484375, "learning_rate": 2e-05, "loss": 2.2784, "num_input_tokens_seen": 3391094784, "step": 1617 }, { "epoch": 0.15, "grad_norm": 0.74609375, "learning_rate": 2e-05, "loss": 2.306, "num_input_tokens_seen": 3393191936, "step": 1618 }, { "epoch": 0.15, "grad_norm": 0.81640625, "learning_rate": 2e-05, "loss": 2.2996, "num_input_tokens_seen": 3395289088, "step": 1619 }, { "epoch": 0.15, "grad_norm": 0.72265625, "learning_rate": 2e-05, "loss": 2.2958, "num_input_tokens_seen": 3397386240, "step": 1620 }, { "epoch": 0.15, "grad_norm": 0.609375, "learning_rate": 2e-05, "loss": 2.2875, "num_input_tokens_seen": 3399483392, "step": 1621 }, { "epoch": 0.15, "grad_norm": 0.8359375, "learning_rate": 2e-05, "loss": 2.3253, "num_input_tokens_seen": 3401580544, "step": 1622 }, { "epoch": 0.15, "grad_norm": 0.8515625, "learning_rate": 2e-05, "loss": 2.3098, "num_input_tokens_seen": 3403677696, "step": 1623 }, { "epoch": 0.15, "grad_norm": 0.7109375, "learning_rate": 2e-05, "loss": 2.3131, "num_input_tokens_seen": 3405774848, "step": 1624 }, { "epoch": 0.15, "grad_norm": 0.7578125, "learning_rate": 2e-05, "loss": 2.2679, "num_input_tokens_seen": 3407872000, "step": 1625 }, { "epoch": 0.15, "grad_norm": 0.85546875, "learning_rate": 2e-05, "loss": 2.2878, "num_input_tokens_seen": 3409969152, "step": 1626 }, { "epoch": 0.15, "grad_norm": 0.79296875, "learning_rate": 2e-05, "loss": 2.3364, "num_input_tokens_seen": 3412066304, "step": 1627 }, { "epoch": 0.15, "grad_norm": 0.71484375, "learning_rate": 2e-05, "loss": 2.3332, "num_input_tokens_seen": 3414163456, "step": 1628 }, { "epoch": 0.15, "grad_norm": 0.703125, "learning_rate": 2e-05, "loss": 2.3267, "num_input_tokens_seen": 3416260608, "step": 1629 }, { "epoch": 0.15, "grad_norm": 0.75390625, "learning_rate": 2e-05, "loss": 2.2794, "num_input_tokens_seen": 3418357760, "step": 1630 }, { "epoch": 0.15, "grad_norm": 0.73828125, "learning_rate": 2e-05, "loss": 2.3222, "num_input_tokens_seen": 3420454912, "step": 1631 }, { "epoch": 0.15, "grad_norm": 0.6484375, "learning_rate": 2e-05, "loss": 2.3013, "num_input_tokens_seen": 3422552064, "step": 1632 }, { "epoch": 0.15, "grad_norm": 0.74609375, "learning_rate": 2e-05, "loss": 2.3088, "num_input_tokens_seen": 3424649216, "step": 1633 }, { "epoch": 0.15, "grad_norm": 0.765625, "learning_rate": 2e-05, "loss": 2.3134, "num_input_tokens_seen": 3426746368, "step": 1634 }, { "epoch": 0.15, "grad_norm": 0.60546875, "learning_rate": 2e-05, "loss": 2.3117, "num_input_tokens_seen": 3428843520, "step": 1635 }, { "epoch": 0.15, "grad_norm": 0.609375, "learning_rate": 2e-05, "loss": 2.2822, "num_input_tokens_seen": 3430940672, "step": 1636 }, { "epoch": 0.15, "grad_norm": 0.640625, "learning_rate": 2e-05, "loss": 2.2607, "num_input_tokens_seen": 3433037824, "step": 1637 }, { "epoch": 0.15, "grad_norm": 0.59765625, "learning_rate": 2e-05, "loss": 2.2904, "num_input_tokens_seen": 3435134976, "step": 1638 }, { "epoch": 0.15, "grad_norm": 0.62109375, "learning_rate": 2e-05, "loss": 2.2887, "num_input_tokens_seen": 3437232128, "step": 1639 }, { "epoch": 0.16, "grad_norm": 0.5859375, "learning_rate": 2e-05, "loss": 2.2991, "num_input_tokens_seen": 3439329280, "step": 1640 }, { "epoch": 0.16, "grad_norm": 0.67578125, "learning_rate": 2e-05, "loss": 2.3296, "num_input_tokens_seen": 3441426432, "step": 1641 }, { "epoch": 0.16, "grad_norm": 0.640625, "learning_rate": 2e-05, "loss": 2.3204, "num_input_tokens_seen": 3443523584, "step": 1642 }, { "epoch": 0.16, "grad_norm": 0.63671875, "learning_rate": 2e-05, "loss": 2.307, "num_input_tokens_seen": 3445620736, "step": 1643 }, { "epoch": 0.16, "grad_norm": 0.65625, "learning_rate": 2e-05, "loss": 2.2909, "num_input_tokens_seen": 3447717888, "step": 1644 }, { "epoch": 0.16, "grad_norm": 0.6640625, "learning_rate": 2e-05, "loss": 2.331, "num_input_tokens_seen": 3449815040, "step": 1645 }, { "epoch": 0.16, "grad_norm": 0.59375, "learning_rate": 2e-05, "loss": 2.3177, "num_input_tokens_seen": 3451912192, "step": 1646 }, { "epoch": 0.16, "grad_norm": 0.6796875, "learning_rate": 2e-05, "loss": 2.3045, "num_input_tokens_seen": 3454009344, "step": 1647 }, { "epoch": 0.16, "grad_norm": 0.69921875, "learning_rate": 2e-05, "loss": 2.3235, "num_input_tokens_seen": 3456106496, "step": 1648 }, { "epoch": 0.16, "grad_norm": 0.63671875, "learning_rate": 2e-05, "loss": 2.3073, "num_input_tokens_seen": 3458203648, "step": 1649 }, { "epoch": 0.16, "grad_norm": 0.6015625, "learning_rate": 2e-05, "loss": 2.2951, "num_input_tokens_seen": 3460300800, "step": 1650 }, { "epoch": 0.16, "grad_norm": 0.75390625, "learning_rate": 2e-05, "loss": 2.2826, "num_input_tokens_seen": 3462397952, "step": 1651 }, { "epoch": 0.16, "grad_norm": 0.93359375, "learning_rate": 2e-05, "loss": 2.2537, "num_input_tokens_seen": 3464495104, "step": 1652 }, { "epoch": 0.16, "grad_norm": 0.6171875, "learning_rate": 2e-05, "loss": 2.294, "num_input_tokens_seen": 3466592256, "step": 1653 }, { "epoch": 0.16, "grad_norm": 0.9375, "learning_rate": 2e-05, "loss": 2.273, "num_input_tokens_seen": 3468689408, "step": 1654 }, { "epoch": 0.16, "grad_norm": 0.90234375, "learning_rate": 2e-05, "loss": 2.3012, "num_input_tokens_seen": 3470786560, "step": 1655 }, { "epoch": 0.16, "grad_norm": 0.671875, "learning_rate": 2e-05, "loss": 2.2721, "num_input_tokens_seen": 3472883712, "step": 1656 }, { "epoch": 0.16, "grad_norm": 0.76953125, "learning_rate": 2e-05, "loss": 2.2877, "num_input_tokens_seen": 3474980864, "step": 1657 }, { "epoch": 0.16, "grad_norm": 0.84765625, "learning_rate": 2e-05, "loss": 2.3078, "num_input_tokens_seen": 3477078016, "step": 1658 }, { "epoch": 0.16, "grad_norm": 0.71484375, "learning_rate": 2e-05, "loss": 2.2888, "num_input_tokens_seen": 3479175168, "step": 1659 }, { "epoch": 0.16, "grad_norm": 0.8203125, "learning_rate": 2e-05, "loss": 2.316, "num_input_tokens_seen": 3481272320, "step": 1660 }, { "epoch": 0.16, "grad_norm": 0.66796875, "learning_rate": 2e-05, "loss": 2.2982, "num_input_tokens_seen": 3483369472, "step": 1661 }, { "epoch": 0.16, "grad_norm": 0.8671875, "learning_rate": 2e-05, "loss": 2.2853, "num_input_tokens_seen": 3485466624, "step": 1662 }, { "epoch": 0.16, "grad_norm": 0.734375, "learning_rate": 2e-05, "loss": 2.3363, "num_input_tokens_seen": 3487563776, "step": 1663 }, { "epoch": 0.16, "grad_norm": 0.79296875, "learning_rate": 2e-05, "loss": 2.3287, "num_input_tokens_seen": 3489660928, "step": 1664 }, { "epoch": 0.16, "grad_norm": 0.703125, "learning_rate": 2e-05, "loss": 2.3049, "num_input_tokens_seen": 3491758080, "step": 1665 }, { "epoch": 0.16, "grad_norm": 0.80078125, "learning_rate": 2e-05, "loss": 2.2986, "num_input_tokens_seen": 3493855232, "step": 1666 }, { "epoch": 0.16, "grad_norm": 0.71875, "learning_rate": 2e-05, "loss": 2.3511, "num_input_tokens_seen": 3495952384, "step": 1667 }, { "epoch": 0.16, "grad_norm": 0.87109375, "learning_rate": 2e-05, "loss": 2.2855, "num_input_tokens_seen": 3498049536, "step": 1668 }, { "epoch": 0.16, "grad_norm": 1.015625, "learning_rate": 2e-05, "loss": 2.2881, "num_input_tokens_seen": 3500146688, "step": 1669 }, { "epoch": 0.16, "grad_norm": 0.7109375, "learning_rate": 2e-05, "loss": 2.3014, "num_input_tokens_seen": 3502243840, "step": 1670 }, { "epoch": 0.16, "grad_norm": 0.8046875, "learning_rate": 2e-05, "loss": 2.3102, "num_input_tokens_seen": 3504340992, "step": 1671 }, { "epoch": 0.16, "grad_norm": 0.859375, "learning_rate": 2e-05, "loss": 2.2604, "num_input_tokens_seen": 3506438144, "step": 1672 }, { "epoch": 0.16, "grad_norm": 0.6328125, "learning_rate": 2e-05, "loss": 2.3257, "num_input_tokens_seen": 3508535296, "step": 1673 }, { "epoch": 0.16, "grad_norm": 0.765625, "learning_rate": 2e-05, "loss": 2.2873, "num_input_tokens_seen": 3510632448, "step": 1674 }, { "epoch": 0.16, "grad_norm": 0.75390625, "learning_rate": 2e-05, "loss": 2.3142, "num_input_tokens_seen": 3512729600, "step": 1675 }, { "epoch": 0.16, "grad_norm": 0.88671875, "learning_rate": 2e-05, "loss": 2.2739, "num_input_tokens_seen": 3514826752, "step": 1676 }, { "epoch": 0.16, "grad_norm": 0.75, "learning_rate": 2e-05, "loss": 2.2643, "num_input_tokens_seen": 3516923904, "step": 1677 }, { "epoch": 0.16, "grad_norm": 0.94921875, "learning_rate": 2e-05, "loss": 2.3014, "num_input_tokens_seen": 3519021056, "step": 1678 }, { "epoch": 0.16, "grad_norm": 0.87109375, "learning_rate": 2e-05, "loss": 2.308, "num_input_tokens_seen": 3521118208, "step": 1679 }, { "epoch": 0.16, "grad_norm": 0.81640625, "learning_rate": 2e-05, "loss": 2.3124, "num_input_tokens_seen": 3523215360, "step": 1680 }, { "epoch": 0.16, "grad_norm": 1.03125, "learning_rate": 2e-05, "loss": 2.3048, "num_input_tokens_seen": 3525312512, "step": 1681 }, { "epoch": 0.16, "grad_norm": 0.984375, "learning_rate": 2e-05, "loss": 2.275, "num_input_tokens_seen": 3527409664, "step": 1682 }, { "epoch": 0.16, "grad_norm": 0.71484375, "learning_rate": 2e-05, "loss": 2.2858, "num_input_tokens_seen": 3529506816, "step": 1683 }, { "epoch": 0.16, "grad_norm": 1.515625, "learning_rate": 2e-05, "loss": 2.2759, "num_input_tokens_seen": 3531603968, "step": 1684 }, { "epoch": 0.16, "grad_norm": 1.421875, "learning_rate": 2e-05, "loss": 2.2927, "num_input_tokens_seen": 3533701120, "step": 1685 }, { "epoch": 0.16, "grad_norm": 0.74609375, "learning_rate": 2e-05, "loss": 2.2666, "num_input_tokens_seen": 3535798272, "step": 1686 }, { "epoch": 0.17, "grad_norm": 1.171875, "learning_rate": 2e-05, "loss": 2.2874, "num_input_tokens_seen": 3537895424, "step": 1687 }, { "epoch": 0.17, "grad_norm": 1.3125, "learning_rate": 2e-05, "loss": 2.2401, "num_input_tokens_seen": 3539992576, "step": 1688 }, { "epoch": 0.17, "grad_norm": 0.84375, "learning_rate": 2e-05, "loss": 2.2555, "num_input_tokens_seen": 3542089728, "step": 1689 }, { "epoch": 0.17, "grad_norm": 0.95703125, "learning_rate": 2e-05, "loss": 2.2911, "num_input_tokens_seen": 3544186880, "step": 1690 }, { "epoch": 0.17, "grad_norm": 1.234375, "learning_rate": 2e-05, "loss": 2.3082, "num_input_tokens_seen": 3546284032, "step": 1691 }, { "epoch": 0.17, "grad_norm": 0.94140625, "learning_rate": 2e-05, "loss": 2.2766, "num_input_tokens_seen": 3548381184, "step": 1692 }, { "epoch": 0.17, "grad_norm": 1.015625, "learning_rate": 2e-05, "loss": 2.2442, "num_input_tokens_seen": 3550478336, "step": 1693 }, { "epoch": 0.17, "grad_norm": 0.9765625, "learning_rate": 2e-05, "loss": 2.2798, "num_input_tokens_seen": 3552575488, "step": 1694 }, { "epoch": 0.17, "grad_norm": 0.9296875, "learning_rate": 2e-05, "loss": 2.2557, "num_input_tokens_seen": 3554672640, "step": 1695 }, { "epoch": 0.17, "grad_norm": 1.0390625, "learning_rate": 2e-05, "loss": 2.2351, "num_input_tokens_seen": 3556769792, "step": 1696 }, { "epoch": 0.17, "grad_norm": 0.86328125, "learning_rate": 2e-05, "loss": 2.2715, "num_input_tokens_seen": 3558866944, "step": 1697 }, { "epoch": 0.17, "grad_norm": 1.0234375, "learning_rate": 2e-05, "loss": 2.2869, "num_input_tokens_seen": 3560964096, "step": 1698 }, { "epoch": 0.17, "grad_norm": 1.421875, "learning_rate": 2e-05, "loss": 2.2346, "num_input_tokens_seen": 3563061248, "step": 1699 }, { "epoch": 0.17, "grad_norm": 1.046875, "learning_rate": 2e-05, "loss": 2.2722, "num_input_tokens_seen": 3565158400, "step": 1700 }, { "epoch": 0.0, "grad_norm": 1.1328125, "learning_rate": 2e-05, "loss": 2.2666, "num_input_tokens_seen": 3567255552, "step": 1701 }, { "epoch": 0.0, "grad_norm": 1.3203125, "learning_rate": 2e-05, "loss": 2.2513, "num_input_tokens_seen": 3569352704, "step": 1702 }, { "epoch": 0.0, "grad_norm": 0.9453125, "learning_rate": 2e-05, "loss": 2.261, "num_input_tokens_seen": 3571449856, "step": 1703 }, { "epoch": 0.0, "grad_norm": 0.87109375, "learning_rate": 2e-05, "loss": 2.3282, "num_input_tokens_seen": 3573547008, "step": 1704 }, { "epoch": 0.0, "grad_norm": 1.046875, "learning_rate": 2e-05, "loss": 2.267, "num_input_tokens_seen": 3575644160, "step": 1705 }, { "epoch": 0.0, "grad_norm": 0.94140625, "learning_rate": 2e-05, "loss": 2.2673, "num_input_tokens_seen": 3577741312, "step": 1706 }, { "epoch": 0.0, "grad_norm": 0.8359375, "learning_rate": 2e-05, "loss": 2.3082, "num_input_tokens_seen": 3579838464, "step": 1707 }, { "epoch": 0.0, "grad_norm": 0.84375, "learning_rate": 2e-05, "loss": 2.2932, "num_input_tokens_seen": 3581935616, "step": 1708 }, { "epoch": 0.0, "grad_norm": 0.8828125, "learning_rate": 2e-05, "loss": 2.3123, "num_input_tokens_seen": 3584032768, "step": 1709 }, { "epoch": 0.0, "grad_norm": 0.7734375, "learning_rate": 2e-05, "loss": 2.2871, "num_input_tokens_seen": 3586129920, "step": 1710 }, { "epoch": 0.0, "grad_norm": 0.89453125, "learning_rate": 2e-05, "loss": 2.268, "num_input_tokens_seen": 3588227072, "step": 1711 }, { "epoch": 0.0, "grad_norm": 0.7578125, "learning_rate": 2e-05, "loss": 2.2696, "num_input_tokens_seen": 3590324224, "step": 1712 }, { "epoch": 0.0, "grad_norm": 0.8125, "learning_rate": 2e-05, "loss": 2.2469, "num_input_tokens_seen": 3592421376, "step": 1713 }, { "epoch": 0.0, "grad_norm": 1.0234375, "learning_rate": 2e-05, "loss": 2.2936, "num_input_tokens_seen": 3594518528, "step": 1714 }, { "epoch": 0.0, "grad_norm": 0.78515625, "learning_rate": 2e-05, "loss": 2.2657, "num_input_tokens_seen": 3596615680, "step": 1715 }, { "epoch": 0.0, "grad_norm": 0.7578125, "learning_rate": 2e-05, "loss": 2.3095, "num_input_tokens_seen": 3598712832, "step": 1716 }, { "epoch": 0.0, "grad_norm": 0.9375, "learning_rate": 2e-05, "loss": 2.2402, "num_input_tokens_seen": 3600809984, "step": 1717 }, { "epoch": 0.0, "grad_norm": 0.83203125, "learning_rate": 2e-05, "loss": 2.2438, "num_input_tokens_seen": 3602907136, "step": 1718 }, { "epoch": 0.0, "grad_norm": 0.7578125, "learning_rate": 2e-05, "loss": 2.2802, "num_input_tokens_seen": 3605004288, "step": 1719 }, { "epoch": 0.0, "grad_norm": 0.78515625, "learning_rate": 2e-05, "loss": 2.2693, "num_input_tokens_seen": 3607101440, "step": 1720 }, { "epoch": 0.0, "grad_norm": 0.88671875, "learning_rate": 2e-05, "loss": 2.2613, "num_input_tokens_seen": 3609198592, "step": 1721 }, { "epoch": 0.0, "grad_norm": 0.8046875, "learning_rate": 2e-05, "loss": 2.2496, "num_input_tokens_seen": 3611295744, "step": 1722 }, { "epoch": 0.0, "grad_norm": 0.88671875, "learning_rate": 2e-05, "loss": 2.2641, "num_input_tokens_seen": 3613392896, "step": 1723 }, { "epoch": 0.01, "grad_norm": 0.86328125, "learning_rate": 2e-05, "loss": 2.2621, "num_input_tokens_seen": 3615490048, "step": 1724 }, { "epoch": 0.01, "grad_norm": 0.76171875, "learning_rate": 2e-05, "loss": 2.2827, "num_input_tokens_seen": 3617587200, "step": 1725 }, { "epoch": 0.01, "grad_norm": 0.9609375, "learning_rate": 2e-05, "loss": 2.2275, "num_input_tokens_seen": 3619684352, "step": 1726 }, { "epoch": 0.01, "grad_norm": 0.9921875, "learning_rate": 2e-05, "loss": 2.2833, "num_input_tokens_seen": 3621781504, "step": 1727 }, { "epoch": 0.01, "grad_norm": 0.8671875, "learning_rate": 2e-05, "loss": 2.2839, "num_input_tokens_seen": 3623878656, "step": 1728 }, { "epoch": 0.01, "grad_norm": 0.95703125, "learning_rate": 2e-05, "loss": 2.2622, "num_input_tokens_seen": 3625975808, "step": 1729 }, { "epoch": 0.01, "grad_norm": 1.21875, "learning_rate": 2e-05, "loss": 2.3042, "num_input_tokens_seen": 3628072960, "step": 1730 }, { "epoch": 0.01, "grad_norm": 0.86328125, "learning_rate": 2e-05, "loss": 2.2832, "num_input_tokens_seen": 3630170112, "step": 1731 }, { "epoch": 0.01, "grad_norm": 1.1328125, "learning_rate": 2e-05, "loss": 2.2657, "num_input_tokens_seen": 3632267264, "step": 1732 }, { "epoch": 0.01, "grad_norm": 1.03125, "learning_rate": 2e-05, "loss": 2.2854, "num_input_tokens_seen": 3634364416, "step": 1733 }, { "epoch": 0.01, "grad_norm": 0.84375, "learning_rate": 2e-05, "loss": 2.2651, "num_input_tokens_seen": 3636461568, "step": 1734 }, { "epoch": 0.01, "grad_norm": 0.87109375, "learning_rate": 2e-05, "loss": 2.2769, "num_input_tokens_seen": 3638558720, "step": 1735 }, { "epoch": 0.01, "grad_norm": 0.96484375, "learning_rate": 2e-05, "loss": 2.2749, "num_input_tokens_seen": 3640655872, "step": 1736 }, { "epoch": 0.01, "grad_norm": 0.8984375, "learning_rate": 2e-05, "loss": 2.261, "num_input_tokens_seen": 3642753024, "step": 1737 }, { "epoch": 0.01, "grad_norm": 0.890625, "learning_rate": 2e-05, "loss": 2.2924, "num_input_tokens_seen": 3644850176, "step": 1738 }, { "epoch": 0.01, "grad_norm": 1.265625, "learning_rate": 2e-05, "loss": 2.254, "num_input_tokens_seen": 3646947328, "step": 1739 }, { "epoch": 0.01, "grad_norm": 1.03125, "learning_rate": 2e-05, "loss": 2.2607, "num_input_tokens_seen": 3649044480, "step": 1740 }, { "epoch": 0.01, "grad_norm": 0.90625, "learning_rate": 2e-05, "loss": 2.2747, "num_input_tokens_seen": 3651141632, "step": 1741 }, { "epoch": 0.01, "grad_norm": 0.97265625, "learning_rate": 2e-05, "loss": 2.2796, "num_input_tokens_seen": 3653238784, "step": 1742 }, { "epoch": 0.01, "grad_norm": 0.90234375, "learning_rate": 2e-05, "loss": 2.2827, "num_input_tokens_seen": 3655335936, "step": 1743 }, { "epoch": 0.01, "grad_norm": 0.96875, "learning_rate": 2e-05, "loss": 2.2542, "num_input_tokens_seen": 3657433088, "step": 1744 }, { "epoch": 0.01, "grad_norm": 0.99609375, "learning_rate": 2e-05, "loss": 2.2911, "num_input_tokens_seen": 3659530240, "step": 1745 }, { "epoch": 0.01, "grad_norm": 0.75390625, "learning_rate": 2e-05, "loss": 2.2536, "num_input_tokens_seen": 3661627392, "step": 1746 }, { "epoch": 0.01, "grad_norm": 1.1328125, "learning_rate": 2e-05, "loss": 2.2906, "num_input_tokens_seen": 3663724544, "step": 1747 }, { "epoch": 0.01, "grad_norm": 1.171875, "learning_rate": 2e-05, "loss": 2.2732, "num_input_tokens_seen": 3665821696, "step": 1748 }, { "epoch": 0.01, "grad_norm": 0.8984375, "learning_rate": 2e-05, "loss": 2.2547, "num_input_tokens_seen": 3667918848, "step": 1749 }, { "epoch": 0.01, "grad_norm": 1.0546875, "learning_rate": 2e-05, "loss": 2.2798, "num_input_tokens_seen": 3670016000, "step": 1750 }, { "epoch": 0.01, "grad_norm": 0.87109375, "learning_rate": 2e-05, "loss": 2.2336, "num_input_tokens_seen": 3672113152, "step": 1751 }, { "epoch": 0.01, "grad_norm": 0.7734375, "learning_rate": 2e-05, "loss": 2.2673, "num_input_tokens_seen": 3674210304, "step": 1752 }, { "epoch": 0.01, "grad_norm": 0.85546875, "learning_rate": 2e-05, "loss": 2.2288, "num_input_tokens_seen": 3676307456, "step": 1753 }, { "epoch": 0.01, "grad_norm": 0.9453125, "learning_rate": 2e-05, "loss": 2.2763, "num_input_tokens_seen": 3678404608, "step": 1754 }, { "epoch": 0.01, "grad_norm": 0.7109375, "learning_rate": 2e-05, "loss": 2.2709, "num_input_tokens_seen": 3680501760, "step": 1755 }, { "epoch": 0.01, "grad_norm": 1.15625, "learning_rate": 2e-05, "loss": 2.2476, "num_input_tokens_seen": 3682598912, "step": 1756 }, { "epoch": 0.01, "grad_norm": 1.234375, "learning_rate": 2e-05, "loss": 2.2921, "num_input_tokens_seen": 3684696064, "step": 1757 }, { "epoch": 0.01, "grad_norm": 1.0625, "learning_rate": 2e-05, "loss": 2.256, "num_input_tokens_seen": 3686793216, "step": 1758 }, { "epoch": 0.01, "grad_norm": 0.87890625, "learning_rate": 2e-05, "loss": 2.2576, "num_input_tokens_seen": 3688890368, "step": 1759 }, { "epoch": 0.01, "grad_norm": 1.1875, "learning_rate": 2e-05, "loss": 2.2661, "num_input_tokens_seen": 3690987520, "step": 1760 }, { "epoch": 0.01, "grad_norm": 0.8984375, "learning_rate": 2e-05, "loss": 2.274, "num_input_tokens_seen": 3693084672, "step": 1761 }, { "epoch": 0.01, "grad_norm": 0.74609375, "learning_rate": 2e-05, "loss": 2.2661, "num_input_tokens_seen": 3695181824, "step": 1762 }, { "epoch": 0.01, "grad_norm": 0.83984375, "learning_rate": 2e-05, "loss": 2.272, "num_input_tokens_seen": 3697278976, "step": 1763 }, { "epoch": 0.01, "grad_norm": 0.81640625, "learning_rate": 2e-05, "loss": 2.2918, "num_input_tokens_seen": 3699376128, "step": 1764 }, { "epoch": 0.01, "grad_norm": 0.78515625, "learning_rate": 2e-05, "loss": 2.2393, "num_input_tokens_seen": 3701473280, "step": 1765 }, { "epoch": 0.01, "grad_norm": 0.86328125, "learning_rate": 2e-05, "loss": 2.2422, "num_input_tokens_seen": 3703570432, "step": 1766 }, { "epoch": 0.01, "grad_norm": 0.8984375, "learning_rate": 2e-05, "loss": 2.25, "num_input_tokens_seen": 3705667584, "step": 1767 }, { "epoch": 0.01, "grad_norm": 0.8125, "learning_rate": 2e-05, "loss": 2.2633, "num_input_tokens_seen": 3707764736, "step": 1768 }, { "epoch": 0.01, "grad_norm": 0.80859375, "learning_rate": 2e-05, "loss": 2.3031, "num_input_tokens_seen": 3709861888, "step": 1769 }, { "epoch": 0.01, "grad_norm": 0.859375, "learning_rate": 2e-05, "loss": 2.2718, "num_input_tokens_seen": 3711959040, "step": 1770 }, { "epoch": 0.01, "grad_norm": 0.87109375, "learning_rate": 2e-05, "loss": 2.2791, "num_input_tokens_seen": 3714056192, "step": 1771 }, { "epoch": 0.02, "grad_norm": 0.8359375, "learning_rate": 2e-05, "loss": 2.2702, "num_input_tokens_seen": 3716153344, "step": 1772 }, { "epoch": 0.02, "grad_norm": 0.91796875, "learning_rate": 2e-05, "loss": 2.2664, "num_input_tokens_seen": 3718250496, "step": 1773 }, { "epoch": 0.02, "grad_norm": 0.8125, "learning_rate": 2e-05, "loss": 2.2805, "num_input_tokens_seen": 3720347648, "step": 1774 }, { "epoch": 0.02, "grad_norm": 0.77734375, "learning_rate": 2e-05, "loss": 2.2621, "num_input_tokens_seen": 3722444800, "step": 1775 }, { "epoch": 0.02, "grad_norm": 0.86328125, "learning_rate": 2e-05, "loss": 2.2668, "num_input_tokens_seen": 3724541952, "step": 1776 }, { "epoch": 0.02, "grad_norm": 0.890625, "learning_rate": 2e-05, "loss": 2.2762, "num_input_tokens_seen": 3726639104, "step": 1777 }, { "epoch": 0.02, "grad_norm": 0.7421875, "learning_rate": 2e-05, "loss": 2.2618, "num_input_tokens_seen": 3728736256, "step": 1778 }, { "epoch": 0.02, "grad_norm": 0.77734375, "learning_rate": 2e-05, "loss": 2.2736, "num_input_tokens_seen": 3730833408, "step": 1779 }, { "epoch": 0.02, "grad_norm": 0.921875, "learning_rate": 2e-05, "loss": 2.2861, "num_input_tokens_seen": 3732930560, "step": 1780 }, { "epoch": 0.02, "grad_norm": 0.87109375, "learning_rate": 2e-05, "loss": 2.2288, "num_input_tokens_seen": 3735027712, "step": 1781 }, { "epoch": 0.02, "grad_norm": 0.8984375, "learning_rate": 2e-05, "loss": 2.261, "num_input_tokens_seen": 3737124864, "step": 1782 }, { "epoch": 0.02, "grad_norm": 0.87890625, "learning_rate": 2e-05, "loss": 2.2725, "num_input_tokens_seen": 3739222016, "step": 1783 }, { "epoch": 0.02, "grad_norm": 1.171875, "learning_rate": 2e-05, "loss": 2.2813, "num_input_tokens_seen": 3741319168, "step": 1784 }, { "epoch": 0.02, "grad_norm": 0.94921875, "learning_rate": 2e-05, "loss": 2.2767, "num_input_tokens_seen": 3743416320, "step": 1785 }, { "epoch": 0.02, "grad_norm": 0.859375, "learning_rate": 2e-05, "loss": 2.2856, "num_input_tokens_seen": 3745513472, "step": 1786 }, { "epoch": 0.02, "grad_norm": 1.25, "learning_rate": 2e-05, "loss": 2.2418, "num_input_tokens_seen": 3747610624, "step": 1787 }, { "epoch": 0.02, "grad_norm": 1.0703125, "learning_rate": 2e-05, "loss": 2.3087, "num_input_tokens_seen": 3749707776, "step": 1788 }, { "epoch": 0.02, "grad_norm": 0.95703125, "learning_rate": 2e-05, "loss": 2.2991, "num_input_tokens_seen": 3751804928, "step": 1789 }, { "epoch": 0.02, "grad_norm": 1.4609375, "learning_rate": 2e-05, "loss": 2.267, "num_input_tokens_seen": 3753902080, "step": 1790 }, { "epoch": 0.02, "grad_norm": 1.0390625, "learning_rate": 2e-05, "loss": 2.278, "num_input_tokens_seen": 3755999232, "step": 1791 }, { "epoch": 0.02, "eval_loss": 2.3124513626098633, "eval_runtime": 1977.395, "eval_samples_per_second": 1.994, "eval_steps_per_second": 0.499, "num_input_tokens_seen": 3755999232, "step": 1791 }, { "epoch": 0.02, "grad_norm": 0.859375, "learning_rate": 2e-05, "loss": 2.2862, "num_input_tokens_seen": 3758096384, "step": 1792 }, { "epoch": 0.02, "grad_norm": 1.421875, "learning_rate": 2e-05, "loss": 2.2916, "num_input_tokens_seen": 3760193536, "step": 1793 }, { "epoch": 0.02, "grad_norm": 1.0546875, "learning_rate": 2e-05, "loss": 2.2969, "num_input_tokens_seen": 3762290688, "step": 1794 }, { "epoch": 0.02, "grad_norm": 0.81640625, "learning_rate": 2e-05, "loss": 2.2444, "num_input_tokens_seen": 3764387840, "step": 1795 }, { "epoch": 0.02, "grad_norm": 1.2734375, "learning_rate": 2e-05, "loss": 2.2639, "num_input_tokens_seen": 3766484992, "step": 1796 }, { "epoch": 0.02, "grad_norm": 1.015625, "learning_rate": 2e-05, "loss": 2.2653, "num_input_tokens_seen": 3768582144, "step": 1797 }, { "epoch": 0.02, "grad_norm": 0.85546875, "learning_rate": 2e-05, "loss": 2.2386, "num_input_tokens_seen": 3770679296, "step": 1798 }, { "epoch": 0.02, "grad_norm": 1.4375, "learning_rate": 2e-05, "loss": 2.2753, "num_input_tokens_seen": 3772776448, "step": 1799 }, { "epoch": 0.02, "grad_norm": 1.203125, "learning_rate": 2e-05, "loss": 2.2456, "num_input_tokens_seen": 3774873600, "step": 1800 }, { "epoch": 0.02, "grad_norm": 0.83203125, "learning_rate": 2e-05, "loss": 2.2582, "num_input_tokens_seen": 3776970752, "step": 1801 }, { "epoch": 0.02, "grad_norm": 1.78125, "learning_rate": 2e-05, "loss": 2.2532, "num_input_tokens_seen": 3779067904, "step": 1802 }, { "epoch": 0.02, "grad_norm": 1.390625, "learning_rate": 2e-05, "loss": 2.2598, "num_input_tokens_seen": 3781165056, "step": 1803 }, { "epoch": 0.02, "grad_norm": 1.625, "learning_rate": 2e-05, "loss": 2.2891, "num_input_tokens_seen": 3783262208, "step": 1804 }, { "epoch": 0.02, "grad_norm": 0.95703125, "learning_rate": 2e-05, "loss": 2.2787, "num_input_tokens_seen": 3785359360, "step": 1805 }, { "epoch": 0.02, "grad_norm": 1.1953125, "learning_rate": 2e-05, "loss": 2.2896, "num_input_tokens_seen": 3787456512, "step": 1806 }, { "epoch": 0.02, "grad_norm": 1.046875, "learning_rate": 2e-05, "loss": 2.2622, "num_input_tokens_seen": 3789553664, "step": 1807 }, { "epoch": 0.02, "grad_norm": 0.7734375, "learning_rate": 2e-05, "loss": 2.2527, "num_input_tokens_seen": 3791650816, "step": 1808 }, { "epoch": 0.02, "grad_norm": 0.92578125, "learning_rate": 2e-05, "loss": 2.2761, "num_input_tokens_seen": 3793747968, "step": 1809 }, { "epoch": 0.02, "grad_norm": 0.890625, "learning_rate": 2e-05, "loss": 2.2731, "num_input_tokens_seen": 3795845120, "step": 1810 }, { "epoch": 0.02, "grad_norm": 0.94140625, "learning_rate": 2e-05, "loss": 2.2938, "num_input_tokens_seen": 3797942272, "step": 1811 }, { "epoch": 0.02, "grad_norm": 0.953125, "learning_rate": 2e-05, "loss": 2.278, "num_input_tokens_seen": 3800039424, "step": 1812 }, { "epoch": 0.02, "grad_norm": 0.875, "learning_rate": 2e-05, "loss": 2.2834, "num_input_tokens_seen": 3802136576, "step": 1813 }, { "epoch": 0.02, "grad_norm": 1.109375, "learning_rate": 2e-05, "loss": 2.2525, "num_input_tokens_seen": 3804233728, "step": 1814 }, { "epoch": 0.02, "grad_norm": 0.890625, "learning_rate": 2e-05, "loss": 2.2818, "num_input_tokens_seen": 3806330880, "step": 1815 }, { "epoch": 0.02, "grad_norm": 0.8046875, "learning_rate": 2e-05, "loss": 2.2897, "num_input_tokens_seen": 3808428032, "step": 1816 }, { "epoch": 0.02, "grad_norm": 0.9765625, "learning_rate": 2e-05, "loss": 2.2676, "num_input_tokens_seen": 3810525184, "step": 1817 }, { "epoch": 0.02, "grad_norm": 1.1015625, "learning_rate": 2e-05, "loss": 2.2816, "num_input_tokens_seen": 3812622336, "step": 1818 }, { "epoch": 0.02, "grad_norm": 0.8984375, "learning_rate": 2e-05, "loss": 2.308, "num_input_tokens_seen": 3814719488, "step": 1819 }, { "epoch": 0.03, "grad_norm": 0.84765625, "learning_rate": 2e-05, "loss": 2.2763, "num_input_tokens_seen": 3816816640, "step": 1820 }, { "epoch": 0.03, "grad_norm": 1.1015625, "learning_rate": 2e-05, "loss": 2.3067, "num_input_tokens_seen": 3818913792, "step": 1821 }, { "epoch": 0.03, "grad_norm": 0.9140625, "learning_rate": 2e-05, "loss": 2.268, "num_input_tokens_seen": 3821010944, "step": 1822 }, { "epoch": 0.03, "grad_norm": 0.74609375, "learning_rate": 2e-05, "loss": 2.308, "num_input_tokens_seen": 3823108096, "step": 1823 }, { "epoch": 0.03, "grad_norm": 0.8828125, "learning_rate": 2e-05, "loss": 2.3235, "num_input_tokens_seen": 3825205248, "step": 1824 }, { "epoch": 0.03, "grad_norm": 0.9453125, "learning_rate": 2e-05, "loss": 2.2718, "num_input_tokens_seen": 3827302400, "step": 1825 }, { "epoch": 0.03, "grad_norm": 0.76953125, "learning_rate": 2e-05, "loss": 2.275, "num_input_tokens_seen": 3829399552, "step": 1826 }, { "epoch": 0.03, "grad_norm": 0.92578125, "learning_rate": 2e-05, "loss": 2.2931, "num_input_tokens_seen": 3831496704, "step": 1827 }, { "epoch": 0.03, "grad_norm": 1.1484375, "learning_rate": 2e-05, "loss": 2.2712, "num_input_tokens_seen": 3833593856, "step": 1828 }, { "epoch": 0.03, "grad_norm": 0.88671875, "learning_rate": 2e-05, "loss": 2.2832, "num_input_tokens_seen": 3835691008, "step": 1829 }, { "epoch": 0.03, "grad_norm": 0.9921875, "learning_rate": 2e-05, "loss": 2.2866, "num_input_tokens_seen": 3837788160, "step": 1830 }, { "epoch": 0.03, "grad_norm": 1.2578125, "learning_rate": 2e-05, "loss": 2.2607, "num_input_tokens_seen": 3839885312, "step": 1831 }, { "epoch": 0.03, "grad_norm": 0.9921875, "learning_rate": 2e-05, "loss": 2.283, "num_input_tokens_seen": 3841982464, "step": 1832 }, { "epoch": 0.03, "grad_norm": 1.3515625, "learning_rate": 2e-05, "loss": 2.317, "num_input_tokens_seen": 3844079616, "step": 1833 }, { "epoch": 0.03, "grad_norm": 1.3203125, "learning_rate": 2e-05, "loss": 2.3115, "num_input_tokens_seen": 3846176768, "step": 1834 }, { "epoch": 0.03, "grad_norm": 1.2421875, "learning_rate": 2e-05, "loss": 2.2787, "num_input_tokens_seen": 3848273920, "step": 1835 }, { "epoch": 0.03, "grad_norm": 1.625, "learning_rate": 2e-05, "loss": 2.2913, "num_input_tokens_seen": 3850371072, "step": 1836 }, { "epoch": 0.03, "grad_norm": 0.734375, "learning_rate": 2e-05, "loss": 2.281, "num_input_tokens_seen": 3852468224, "step": 1837 }, { "epoch": 0.03, "grad_norm": 1.6328125, "learning_rate": 2e-05, "loss": 2.2691, "num_input_tokens_seen": 3854565376, "step": 1838 }, { "epoch": 0.03, "grad_norm": 1.5078125, "learning_rate": 2e-05, "loss": 2.2422, "num_input_tokens_seen": 3856662528, "step": 1839 }, { "epoch": 0.03, "grad_norm": 1.3203125, "learning_rate": 2e-05, "loss": 2.2639, "num_input_tokens_seen": 3858759680, "step": 1840 }, { "epoch": 0.03, "grad_norm": 0.94140625, "learning_rate": 2e-05, "loss": 2.2793, "num_input_tokens_seen": 3860856832, "step": 1841 }, { "epoch": 0.03, "grad_norm": 1.3046875, "learning_rate": 2e-05, "loss": 2.2604, "num_input_tokens_seen": 3862953984, "step": 1842 }, { "epoch": 0.03, "grad_norm": 1.46875, "learning_rate": 2e-05, "loss": 2.2825, "num_input_tokens_seen": 3865051136, "step": 1843 }, { "epoch": 0.03, "grad_norm": 1.0390625, "learning_rate": 2e-05, "loss": 2.2733, "num_input_tokens_seen": 3867148288, "step": 1844 }, { "epoch": 0.03, "grad_norm": 1.4609375, "learning_rate": 2e-05, "loss": 2.2563, "num_input_tokens_seen": 3869245440, "step": 1845 }, { "epoch": 0.03, "grad_norm": 1.421875, "learning_rate": 2e-05, "loss": 2.2643, "num_input_tokens_seen": 3871342592, "step": 1846 }, { "epoch": 0.03, "grad_norm": 1.5625, "learning_rate": 2e-05, "loss": 2.2678, "num_input_tokens_seen": 3873439744, "step": 1847 }, { "epoch": 0.03, "grad_norm": 0.96484375, "learning_rate": 2e-05, "loss": 2.2887, "num_input_tokens_seen": 3875536896, "step": 1848 }, { "epoch": 0.03, "grad_norm": 1.4453125, "learning_rate": 2e-05, "loss": 2.2596, "num_input_tokens_seen": 3877634048, "step": 1849 }, { "epoch": 0.03, "grad_norm": 1.4921875, "learning_rate": 2e-05, "loss": 2.2691, "num_input_tokens_seen": 3879731200, "step": 1850 }, { "epoch": 0.03, "grad_norm": 1.5390625, "learning_rate": 2e-05, "loss": 2.2658, "num_input_tokens_seen": 3881828352, "step": 1851 }, { "epoch": 0.03, "grad_norm": 1.1484375, "learning_rate": 2e-05, "loss": 2.279, "num_input_tokens_seen": 3883925504, "step": 1852 }, { "epoch": 0.03, "grad_norm": 0.80859375, "learning_rate": 2e-05, "loss": 2.2648, "num_input_tokens_seen": 3886022656, "step": 1853 }, { "epoch": 0.03, "grad_norm": 1.5, "learning_rate": 2e-05, "loss": 2.2621, "num_input_tokens_seen": 3888119808, "step": 1854 }, { "epoch": 0.03, "grad_norm": 1.4140625, "learning_rate": 2e-05, "loss": 2.2786, "num_input_tokens_seen": 3890216960, "step": 1855 }, { "epoch": 0.03, "grad_norm": 1.328125, "learning_rate": 2e-05, "loss": 2.2639, "num_input_tokens_seen": 3892314112, "step": 1856 }, { "epoch": 0.03, "grad_norm": 0.84375, "learning_rate": 2e-05, "loss": 2.2535, "num_input_tokens_seen": 3894411264, "step": 1857 }, { "epoch": 0.03, "grad_norm": 1.34375, "learning_rate": 2e-05, "loss": 2.2753, "num_input_tokens_seen": 3896508416, "step": 1858 }, { "epoch": 0.03, "grad_norm": 0.921875, "learning_rate": 2e-05, "loss": 2.2683, "num_input_tokens_seen": 3898605568, "step": 1859 }, { "epoch": 0.03, "grad_norm": 0.86328125, "learning_rate": 2e-05, "loss": 2.2422, "num_input_tokens_seen": 3900702720, "step": 1860 }, { "epoch": 0.03, "grad_norm": 0.97265625, "learning_rate": 2e-05, "loss": 2.2575, "num_input_tokens_seen": 3902799872, "step": 1861 }, { "epoch": 0.03, "grad_norm": 1.0, "learning_rate": 2e-05, "loss": 2.2694, "num_input_tokens_seen": 3904897024, "step": 1862 }, { "epoch": 0.03, "grad_norm": 0.92578125, "learning_rate": 2e-05, "loss": 2.2493, "num_input_tokens_seen": 3906994176, "step": 1863 }, { "epoch": 0.03, "grad_norm": 1.1171875, "learning_rate": 2e-05, "loss": 2.2533, "num_input_tokens_seen": 3909091328, "step": 1864 }, { "epoch": 0.03, "grad_norm": 1.234375, "learning_rate": 2e-05, "loss": 2.2826, "num_input_tokens_seen": 3911188480, "step": 1865 }, { "epoch": 0.03, "grad_norm": 1.0859375, "learning_rate": 2e-05, "loss": 2.2654, "num_input_tokens_seen": 3913285632, "step": 1866 }, { "epoch": 0.04, "grad_norm": 0.95703125, "learning_rate": 2e-05, "loss": 2.2886, "num_input_tokens_seen": 3915382784, "step": 1867 }, { "epoch": 0.04, "grad_norm": 0.9453125, "learning_rate": 2e-05, "loss": 2.2768, "num_input_tokens_seen": 3917479936, "step": 1868 }, { "epoch": 0.04, "grad_norm": 0.90234375, "learning_rate": 2e-05, "loss": 2.2558, "num_input_tokens_seen": 3919577088, "step": 1869 }, { "epoch": 0.04, "grad_norm": 0.9765625, "learning_rate": 2e-05, "loss": 2.2596, "num_input_tokens_seen": 3921674240, "step": 1870 }, { "epoch": 0.04, "grad_norm": 0.77734375, "learning_rate": 2e-05, "loss": 2.2925, "num_input_tokens_seen": 3923771392, "step": 1871 }, { "epoch": 0.04, "grad_norm": 1.078125, "learning_rate": 2e-05, "loss": 2.257, "num_input_tokens_seen": 3925868544, "step": 1872 }, { "epoch": 0.04, "grad_norm": 0.94140625, "learning_rate": 2e-05, "loss": 2.2747, "num_input_tokens_seen": 3927965696, "step": 1873 }, { "epoch": 0.04, "grad_norm": 0.77734375, "learning_rate": 2e-05, "loss": 2.2832, "num_input_tokens_seen": 3930062848, "step": 1874 }, { "epoch": 0.04, "grad_norm": 1.1484375, "learning_rate": 2e-05, "loss": 2.2889, "num_input_tokens_seen": 3932160000, "step": 1875 }, { "epoch": 0.04, "grad_norm": 1.3125, "learning_rate": 2e-05, "loss": 2.2738, "num_input_tokens_seen": 3934257152, "step": 1876 }, { "epoch": 0.04, "grad_norm": 0.83984375, "learning_rate": 2e-05, "loss": 2.2712, "num_input_tokens_seen": 3936354304, "step": 1877 }, { "epoch": 0.04, "grad_norm": 1.1171875, "learning_rate": 2e-05, "loss": 2.2532, "num_input_tokens_seen": 3938451456, "step": 1878 }, { "epoch": 0.04, "grad_norm": 1.1640625, "learning_rate": 2e-05, "loss": 2.2394, "num_input_tokens_seen": 3940548608, "step": 1879 }, { "epoch": 0.04, "grad_norm": 0.83203125, "learning_rate": 2e-05, "loss": 2.2577, "num_input_tokens_seen": 3942645760, "step": 1880 }, { "epoch": 0.04, "grad_norm": 0.77734375, "learning_rate": 2e-05, "loss": 2.2637, "num_input_tokens_seen": 3944742912, "step": 1881 }, { "epoch": 0.04, "grad_norm": 0.734375, "learning_rate": 2e-05, "loss": 2.2607, "num_input_tokens_seen": 3946840064, "step": 1882 }, { "epoch": 0.04, "grad_norm": 0.89453125, "learning_rate": 2e-05, "loss": 2.2183, "num_input_tokens_seen": 3948937216, "step": 1883 }, { "epoch": 0.04, "grad_norm": 0.8671875, "learning_rate": 2e-05, "loss": 2.2656, "num_input_tokens_seen": 3951034368, "step": 1884 }, { "epoch": 0.04, "grad_norm": 1.03125, "learning_rate": 2e-05, "loss": 2.2803, "num_input_tokens_seen": 3953131520, "step": 1885 }, { "epoch": 0.04, "grad_norm": 1.109375, "learning_rate": 2e-05, "loss": 2.2664, "num_input_tokens_seen": 3955228672, "step": 1886 }, { "epoch": 0.04, "grad_norm": 0.81640625, "learning_rate": 2e-05, "loss": 2.2447, "num_input_tokens_seen": 3957325824, "step": 1887 }, { "epoch": 0.04, "grad_norm": 1.15625, "learning_rate": 2e-05, "loss": 2.2562, "num_input_tokens_seen": 3959422976, "step": 1888 }, { "epoch": 0.04, "grad_norm": 1.46875, "learning_rate": 2e-05, "loss": 2.2404, "num_input_tokens_seen": 3961520128, "step": 1889 }, { "epoch": 0.04, "grad_norm": 1.0078125, "learning_rate": 2e-05, "loss": 2.2782, "num_input_tokens_seen": 3963617280, "step": 1890 }, { "epoch": 0.04, "grad_norm": 0.88671875, "learning_rate": 2e-05, "loss": 2.2295, "num_input_tokens_seen": 3965714432, "step": 1891 }, { "epoch": 0.04, "grad_norm": 0.921875, "learning_rate": 2e-05, "loss": 2.2327, "num_input_tokens_seen": 3967811584, "step": 1892 }, { "epoch": 0.04, "grad_norm": 0.76171875, "learning_rate": 2e-05, "loss": 2.2564, "num_input_tokens_seen": 3969908736, "step": 1893 }, { "epoch": 0.04, "grad_norm": 0.85546875, "learning_rate": 2e-05, "loss": 2.2454, "num_input_tokens_seen": 3972005888, "step": 1894 }, { "epoch": 0.04, "grad_norm": 0.79296875, "learning_rate": 2e-05, "loss": 2.2514, "num_input_tokens_seen": 3974103040, "step": 1895 }, { "epoch": 0.04, "grad_norm": 0.8359375, "learning_rate": 2e-05, "loss": 2.2506, "num_input_tokens_seen": 3976200192, "step": 1896 }, { "epoch": 0.04, "grad_norm": 0.7421875, "learning_rate": 2e-05, "loss": 2.2528, "num_input_tokens_seen": 3978297344, "step": 1897 }, { "epoch": 0.04, "grad_norm": 0.80078125, "learning_rate": 2e-05, "loss": 2.2347, "num_input_tokens_seen": 3980394496, "step": 1898 }, { "epoch": 0.04, "grad_norm": 0.77734375, "learning_rate": 2e-05, "loss": 2.2677, "num_input_tokens_seen": 3982491648, "step": 1899 }, { "epoch": 0.04, "grad_norm": 0.796875, "learning_rate": 2e-05, "loss": 2.2871, "num_input_tokens_seen": 3984588800, "step": 1900 }, { "epoch": 0.04, "grad_norm": 0.7421875, "learning_rate": 2e-05, "loss": 2.2671, "num_input_tokens_seen": 3986685952, "step": 1901 }, { "epoch": 0.04, "grad_norm": 0.7578125, "learning_rate": 2e-05, "loss": 2.2609, "num_input_tokens_seen": 3988783104, "step": 1902 }, { "epoch": 0.04, "grad_norm": 0.80078125, "learning_rate": 2e-05, "loss": 2.25, "num_input_tokens_seen": 3990880256, "step": 1903 }, { "epoch": 0.04, "grad_norm": 0.828125, "learning_rate": 2e-05, "loss": 2.2753, "num_input_tokens_seen": 3992977408, "step": 1904 }, { "epoch": 0.04, "grad_norm": 0.79296875, "learning_rate": 2e-05, "loss": 2.2388, "num_input_tokens_seen": 3995074560, "step": 1905 }, { "epoch": 0.04, "grad_norm": 0.90234375, "learning_rate": 2e-05, "loss": 2.225, "num_input_tokens_seen": 3997171712, "step": 1906 }, { "epoch": 0.04, "grad_norm": 0.8359375, "learning_rate": 2e-05, "loss": 2.2758, "num_input_tokens_seen": 3999268864, "step": 1907 }, { "epoch": 0.04, "grad_norm": 0.87109375, "learning_rate": 2e-05, "loss": 2.2452, "num_input_tokens_seen": 4001366016, "step": 1908 }, { "epoch": 0.04, "grad_norm": 0.828125, "learning_rate": 2e-05, "loss": 2.2403, "num_input_tokens_seen": 4003463168, "step": 1909 }, { "epoch": 0.04, "grad_norm": 0.91796875, "learning_rate": 2e-05, "loss": 2.2516, "num_input_tokens_seen": 4005560320, "step": 1910 }, { "epoch": 0.04, "grad_norm": 0.82421875, "learning_rate": 2e-05, "loss": 2.2475, "num_input_tokens_seen": 4007657472, "step": 1911 }, { "epoch": 0.04, "grad_norm": 0.84765625, "learning_rate": 2e-05, "loss": 2.2799, "num_input_tokens_seen": 4009754624, "step": 1912 }, { "epoch": 0.04, "grad_norm": 0.87890625, "learning_rate": 2e-05, "loss": 2.2409, "num_input_tokens_seen": 4011851776, "step": 1913 }, { "epoch": 0.04, "grad_norm": 0.82421875, "learning_rate": 2e-05, "loss": 2.2585, "num_input_tokens_seen": 4013948928, "step": 1914 }, { "epoch": 0.05, "grad_norm": 0.87890625, "learning_rate": 2e-05, "loss": 2.2801, "num_input_tokens_seen": 4016046080, "step": 1915 }, { "epoch": 0.05, "grad_norm": 0.79296875, "learning_rate": 2e-05, "loss": 2.2397, "num_input_tokens_seen": 4018143232, "step": 1916 }, { "epoch": 0.05, "grad_norm": 0.8359375, "learning_rate": 2e-05, "loss": 2.2618, "num_input_tokens_seen": 4020240384, "step": 1917 }, { "epoch": 0.05, "grad_norm": 0.8828125, "learning_rate": 2e-05, "loss": 2.2926, "num_input_tokens_seen": 4022337536, "step": 1918 }, { "epoch": 0.05, "grad_norm": 0.80859375, "learning_rate": 2e-05, "loss": 2.2566, "num_input_tokens_seen": 4024434688, "step": 1919 }, { "epoch": 0.05, "grad_norm": 1.0546875, "learning_rate": 2e-05, "loss": 2.2638, "num_input_tokens_seen": 4026531840, "step": 1920 }, { "epoch": 0.05, "grad_norm": 0.9921875, "learning_rate": 2e-05, "loss": 2.2595, "num_input_tokens_seen": 4028628992, "step": 1921 }, { "epoch": 0.05, "grad_norm": 0.84375, "learning_rate": 2e-05, "loss": 2.2581, "num_input_tokens_seen": 4030726144, "step": 1922 }, { "epoch": 0.05, "grad_norm": 1.09375, "learning_rate": 2e-05, "loss": 2.2805, "num_input_tokens_seen": 4032823296, "step": 1923 }, { "epoch": 0.05, "grad_norm": 0.9609375, "learning_rate": 2e-05, "loss": 2.2686, "num_input_tokens_seen": 4034920448, "step": 1924 }, { "epoch": 0.05, "grad_norm": 0.9453125, "learning_rate": 2e-05, "loss": 2.2826, "num_input_tokens_seen": 4037017600, "step": 1925 }, { "epoch": 0.05, "grad_norm": 1.1484375, "learning_rate": 2e-05, "loss": 2.2745, "num_input_tokens_seen": 4039114752, "step": 1926 }, { "epoch": 0.05, "grad_norm": 1.09375, "learning_rate": 2e-05, "loss": 2.2493, "num_input_tokens_seen": 4041211904, "step": 1927 }, { "epoch": 0.05, "grad_norm": 0.8671875, "learning_rate": 2e-05, "loss": 2.2325, "num_input_tokens_seen": 4043309056, "step": 1928 }, { "epoch": 0.05, "grad_norm": 1.0703125, "learning_rate": 2e-05, "loss": 2.2482, "num_input_tokens_seen": 4045406208, "step": 1929 }, { "epoch": 0.05, "grad_norm": 1.265625, "learning_rate": 2e-05, "loss": 2.2656, "num_input_tokens_seen": 4047503360, "step": 1930 }, { "epoch": 0.05, "grad_norm": 0.89453125, "learning_rate": 2e-05, "loss": 2.2079, "num_input_tokens_seen": 4049600512, "step": 1931 }, { "epoch": 0.05, "grad_norm": 1.078125, "learning_rate": 2e-05, "loss": 2.2537, "num_input_tokens_seen": 4051697664, "step": 1932 }, { "epoch": 0.05, "grad_norm": 1.2421875, "learning_rate": 2e-05, "loss": 2.2515, "num_input_tokens_seen": 4053794816, "step": 1933 }, { "epoch": 0.05, "grad_norm": 1.015625, "learning_rate": 2e-05, "loss": 2.2567, "num_input_tokens_seen": 4055891968, "step": 1934 }, { "epoch": 0.05, "grad_norm": 0.92578125, "learning_rate": 2e-05, "loss": 2.2395, "num_input_tokens_seen": 4057989120, "step": 1935 }, { "epoch": 0.05, "grad_norm": 0.94921875, "learning_rate": 2e-05, "loss": 2.2636, "num_input_tokens_seen": 4060086272, "step": 1936 }, { "epoch": 0.05, "grad_norm": 1.03125, "learning_rate": 2e-05, "loss": 2.2678, "num_input_tokens_seen": 4062183424, "step": 1937 }, { "epoch": 0.05, "grad_norm": 1.15625, "learning_rate": 2e-05, "loss": 2.25, "num_input_tokens_seen": 4064280576, "step": 1938 }, { "epoch": 0.05, "grad_norm": 0.9375, "learning_rate": 2e-05, "loss": 2.2339, "num_input_tokens_seen": 4066377728, "step": 1939 }, { "epoch": 0.05, "grad_norm": 0.8671875, "learning_rate": 2e-05, "loss": 2.2674, "num_input_tokens_seen": 4068474880, "step": 1940 }, { "epoch": 0.05, "grad_norm": 1.2734375, "learning_rate": 2e-05, "loss": 2.2556, "num_input_tokens_seen": 4070572032, "step": 1941 }, { "epoch": 0.05, "grad_norm": 1.0625, "learning_rate": 2e-05, "loss": 2.2508, "num_input_tokens_seen": 4072669184, "step": 1942 }, { "epoch": 0.05, "grad_norm": 0.8203125, "learning_rate": 2e-05, "loss": 2.2367, "num_input_tokens_seen": 4074766336, "step": 1943 }, { "epoch": 0.05, "grad_norm": 1.203125, "learning_rate": 2e-05, "loss": 2.2733, "num_input_tokens_seen": 4076863488, "step": 1944 }, { "epoch": 0.05, "grad_norm": 1.1953125, "learning_rate": 2e-05, "loss": 2.2416, "num_input_tokens_seen": 4078960640, "step": 1945 }, { "epoch": 0.05, "grad_norm": 0.80078125, "learning_rate": 2e-05, "loss": 2.2408, "num_input_tokens_seen": 4081057792, "step": 1946 }, { "epoch": 0.05, "grad_norm": 1.265625, "learning_rate": 2e-05, "loss": 2.2676, "num_input_tokens_seen": 4083154944, "step": 1947 }, { "epoch": 0.05, "grad_norm": 1.296875, "learning_rate": 2e-05, "loss": 2.2693, "num_input_tokens_seen": 4085252096, "step": 1948 }, { "epoch": 0.05, "grad_norm": 0.9375, "learning_rate": 2e-05, "loss": 2.2369, "num_input_tokens_seen": 4087349248, "step": 1949 }, { "epoch": 0.05, "grad_norm": 0.859375, "learning_rate": 2e-05, "loss": 2.2695, "num_input_tokens_seen": 4089446400, "step": 1950 }, { "epoch": 0.05, "grad_norm": 1.3046875, "learning_rate": 2e-05, "loss": 2.2472, "num_input_tokens_seen": 4091543552, "step": 1951 }, { "epoch": 0.05, "grad_norm": 1.125, "learning_rate": 2e-05, "loss": 2.2432, "num_input_tokens_seen": 4093640704, "step": 1952 }, { "epoch": 0.05, "grad_norm": 0.77734375, "learning_rate": 2e-05, "loss": 2.2716, "num_input_tokens_seen": 4095737856, "step": 1953 }, { "epoch": 0.05, "grad_norm": 1.0546875, "learning_rate": 2e-05, "loss": 2.2521, "num_input_tokens_seen": 4097835008, "step": 1954 }, { "epoch": 0.05, "grad_norm": 0.8671875, "learning_rate": 2e-05, "loss": 2.2508, "num_input_tokens_seen": 4099932160, "step": 1955 }, { "epoch": 0.05, "grad_norm": 0.71875, "learning_rate": 2e-05, "loss": 2.2603, "num_input_tokens_seen": 4102029312, "step": 1956 }, { "epoch": 0.05, "grad_norm": 0.73828125, "learning_rate": 2e-05, "loss": 2.25, "num_input_tokens_seen": 4104126464, "step": 1957 }, { "epoch": 0.05, "grad_norm": 0.671875, "learning_rate": 2e-05, "loss": 2.2533, "num_input_tokens_seen": 4106223616, "step": 1958 }, { "epoch": 0.05, "grad_norm": 0.73046875, "learning_rate": 2e-05, "loss": 2.2629, "num_input_tokens_seen": 4108320768, "step": 1959 }, { "epoch": 0.05, "grad_norm": 0.7109375, "learning_rate": 2e-05, "loss": 2.2577, "num_input_tokens_seen": 4110417920, "step": 1960 }, { "epoch": 0.05, "grad_norm": 0.82421875, "learning_rate": 2e-05, "loss": 2.2608, "num_input_tokens_seen": 4112515072, "step": 1961 }, { "epoch": 0.05, "grad_norm": 0.71484375, "learning_rate": 2e-05, "loss": 2.2312, "num_input_tokens_seen": 4114612224, "step": 1962 }, { "epoch": 0.06, "grad_norm": 0.7109375, "learning_rate": 2e-05, "loss": 2.2245, "num_input_tokens_seen": 4116709376, "step": 1963 }, { "epoch": 0.06, "grad_norm": 0.83984375, "learning_rate": 2e-05, "loss": 2.2604, "num_input_tokens_seen": 4118806528, "step": 1964 }, { "epoch": 0.06, "grad_norm": 0.75, "learning_rate": 2e-05, "loss": 2.2348, "num_input_tokens_seen": 4120903680, "step": 1965 }, { "epoch": 0.06, "grad_norm": 0.76171875, "learning_rate": 2e-05, "loss": 2.2435, "num_input_tokens_seen": 4123000832, "step": 1966 }, { "epoch": 0.06, "grad_norm": 0.6875, "learning_rate": 2e-05, "loss": 2.2598, "num_input_tokens_seen": 4125097984, "step": 1967 }, { "epoch": 0.06, "grad_norm": 0.734375, "learning_rate": 2e-05, "loss": 2.2712, "num_input_tokens_seen": 4127195136, "step": 1968 }, { "epoch": 0.06, "grad_norm": 0.77734375, "learning_rate": 2e-05, "loss": 2.2479, "num_input_tokens_seen": 4129292288, "step": 1969 }, { "epoch": 0.06, "grad_norm": 0.76171875, "learning_rate": 2e-05, "loss": 2.2934, "num_input_tokens_seen": 4131389440, "step": 1970 }, { "epoch": 0.06, "grad_norm": 0.8203125, "learning_rate": 2e-05, "loss": 2.2353, "num_input_tokens_seen": 4133486592, "step": 1971 }, { "epoch": 0.06, "grad_norm": 1.203125, "learning_rate": 2e-05, "loss": 2.3041, "num_input_tokens_seen": 4135583744, "step": 1972 }, { "epoch": 0.06, "grad_norm": 0.828125, "learning_rate": 2e-05, "loss": 2.2473, "num_input_tokens_seen": 4137680896, "step": 1973 }, { "epoch": 0.06, "grad_norm": 0.94921875, "learning_rate": 2e-05, "loss": 2.2624, "num_input_tokens_seen": 4139778048, "step": 1974 }, { "epoch": 0.06, "grad_norm": 1.203125, "learning_rate": 2e-05, "loss": 2.2411, "num_input_tokens_seen": 4141875200, "step": 1975 }, { "epoch": 0.06, "grad_norm": 1.0078125, "learning_rate": 2e-05, "loss": 2.2376, "num_input_tokens_seen": 4143972352, "step": 1976 }, { "epoch": 0.06, "grad_norm": 0.86328125, "learning_rate": 2e-05, "loss": 2.2363, "num_input_tokens_seen": 4146069504, "step": 1977 }, { "epoch": 0.06, "grad_norm": 1.1171875, "learning_rate": 2e-05, "loss": 2.261, "num_input_tokens_seen": 4148166656, "step": 1978 }, { "epoch": 0.06, "grad_norm": 1.2109375, "learning_rate": 2e-05, "loss": 2.2939, "num_input_tokens_seen": 4150263808, "step": 1979 }, { "epoch": 0.06, "grad_norm": 0.828125, "learning_rate": 2e-05, "loss": 2.2356, "num_input_tokens_seen": 4152360960, "step": 1980 }, { "epoch": 0.06, "grad_norm": 0.921875, "learning_rate": 2e-05, "loss": 2.2609, "num_input_tokens_seen": 4154458112, "step": 1981 }, { "epoch": 0.06, "grad_norm": 0.91015625, "learning_rate": 2e-05, "loss": 2.2245, "num_input_tokens_seen": 4156555264, "step": 1982 }, { "epoch": 0.06, "grad_norm": 0.87890625, "learning_rate": 2e-05, "loss": 2.2427, "num_input_tokens_seen": 4158652416, "step": 1983 }, { "epoch": 0.06, "grad_norm": 0.9375, "learning_rate": 2e-05, "loss": 2.2955, "num_input_tokens_seen": 4160749568, "step": 1984 }, { "epoch": 0.06, "grad_norm": 0.82421875, "learning_rate": 2e-05, "loss": 2.2335, "num_input_tokens_seen": 4162846720, "step": 1985 }, { "epoch": 0.06, "grad_norm": 0.9765625, "learning_rate": 2e-05, "loss": 2.2697, "num_input_tokens_seen": 4164943872, "step": 1986 }, { "epoch": 0.06, "grad_norm": 0.890625, "learning_rate": 2e-05, "loss": 2.2625, "num_input_tokens_seen": 4167041024, "step": 1987 }, { "epoch": 0.06, "grad_norm": 0.75390625, "learning_rate": 2e-05, "loss": 2.2812, "num_input_tokens_seen": 4169138176, "step": 1988 }, { "epoch": 0.06, "grad_norm": 1.125, "learning_rate": 2e-05, "loss": 2.2425, "num_input_tokens_seen": 4171235328, "step": 1989 }, { "epoch": 0.06, "grad_norm": 0.94921875, "learning_rate": 2e-05, "loss": 2.2335, "num_input_tokens_seen": 4173332480, "step": 1990 }, { "epoch": 0.06, "eval_loss": 2.301752805709839, "eval_runtime": 3367.1268, "eval_samples_per_second": 1.171, "eval_steps_per_second": 0.293, "num_input_tokens_seen": 4173332480, "step": 1990 }, { "epoch": 0.06, "grad_norm": 0.80078125, "learning_rate": 2e-05, "loss": 2.2671, "num_input_tokens_seen": 4175429632, "step": 1991 }, { "epoch": 0.06, "grad_norm": 1.046875, "learning_rate": 2e-05, "loss": 2.2731, "num_input_tokens_seen": 4177526784, "step": 1992 }, { "epoch": 0.06, "grad_norm": 1.0234375, "learning_rate": 2e-05, "loss": 2.2155, "num_input_tokens_seen": 4179623936, "step": 1993 }, { "epoch": 0.06, "grad_norm": 0.83203125, "learning_rate": 2e-05, "loss": 2.2369, "num_input_tokens_seen": 4181721088, "step": 1994 }, { "epoch": 0.06, "grad_norm": 1.0078125, "learning_rate": 2e-05, "loss": 2.2369, "num_input_tokens_seen": 4183818240, "step": 1995 }, { "epoch": 0.06, "grad_norm": 1.2734375, "learning_rate": 2e-05, "loss": 2.2395, "num_input_tokens_seen": 4185915392, "step": 1996 }, { "epoch": 0.06, "grad_norm": 0.9765625, "learning_rate": 2e-05, "loss": 2.2817, "num_input_tokens_seen": 4188012544, "step": 1997 }, { "epoch": 0.06, "grad_norm": 0.80859375, "learning_rate": 2e-05, "loss": 2.2454, "num_input_tokens_seen": 4190109696, "step": 1998 }, { "epoch": 0.06, "grad_norm": 1.3359375, "learning_rate": 2e-05, "loss": 2.2574, "num_input_tokens_seen": 4192206848, "step": 1999 }, { "epoch": 0.06, "grad_norm": 1.3515625, "learning_rate": 2e-05, "loss": 2.2191, "num_input_tokens_seen": 4194304000, "step": 2000 }, { "epoch": 0.06, "grad_norm": 1.2578125, "learning_rate": 2e-05, "loss": 2.2409, "num_input_tokens_seen": 4196401152, "step": 2001 }, { "epoch": 0.06, "grad_norm": 0.76953125, "learning_rate": 2e-05, "loss": 2.28, "num_input_tokens_seen": 4198498304, "step": 2002 }, { "epoch": 0.06, "grad_norm": 1.7109375, "learning_rate": 2e-05, "loss": 2.2531, "num_input_tokens_seen": 4200595456, "step": 2003 }, { "epoch": 0.06, "grad_norm": 1.8515625, "learning_rate": 2e-05, "loss": 2.257, "num_input_tokens_seen": 4202692608, "step": 2004 }, { "epoch": 0.06, "grad_norm": 1.5234375, "learning_rate": 2e-05, "loss": 2.2379, "num_input_tokens_seen": 4204789760, "step": 2005 }, { "epoch": 0.06, "grad_norm": 0.82421875, "learning_rate": 2e-05, "loss": 2.233, "num_input_tokens_seen": 4206886912, "step": 2006 }, { "epoch": 0.06, "grad_norm": 1.2421875, "learning_rate": 2e-05, "loss": 2.2742, "num_input_tokens_seen": 4208984064, "step": 2007 }, { "epoch": 0.06, "grad_norm": 1.015625, "learning_rate": 2e-05, "loss": 2.2709, "num_input_tokens_seen": 4211081216, "step": 2008 }, { "epoch": 0.06, "grad_norm": 0.96484375, "learning_rate": 2e-05, "loss": 2.2703, "num_input_tokens_seen": 4213178368, "step": 2009 }, { "epoch": 0.07, "grad_norm": 0.79296875, "learning_rate": 2e-05, "loss": 2.26, "num_input_tokens_seen": 4215275520, "step": 2010 }, { "epoch": 0.07, "grad_norm": 0.83203125, "learning_rate": 2e-05, "loss": 2.2376, "num_input_tokens_seen": 4217372672, "step": 2011 }, { "epoch": 0.07, "grad_norm": 0.859375, "learning_rate": 2e-05, "loss": 2.2375, "num_input_tokens_seen": 4219469824, "step": 2012 }, { "epoch": 0.07, "grad_norm": 0.78515625, "learning_rate": 2e-05, "loss": 2.2525, "num_input_tokens_seen": 4221566976, "step": 2013 }, { "epoch": 0.07, "grad_norm": 0.765625, "learning_rate": 2e-05, "loss": 2.2557, "num_input_tokens_seen": 4223664128, "step": 2014 }, { "epoch": 0.07, "grad_norm": 0.88671875, "learning_rate": 2e-05, "loss": 2.2603, "num_input_tokens_seen": 4225761280, "step": 2015 }, { "epoch": 0.07, "grad_norm": 0.8125, "learning_rate": 2e-05, "loss": 2.2631, "num_input_tokens_seen": 4227858432, "step": 2016 }, { "epoch": 0.07, "grad_norm": 0.7578125, "learning_rate": 2e-05, "loss": 2.2324, "num_input_tokens_seen": 4229955584, "step": 2017 }, { "epoch": 0.07, "grad_norm": 0.78515625, "learning_rate": 2e-05, "loss": 2.2694, "num_input_tokens_seen": 4232052736, "step": 2018 }, { "epoch": 0.07, "grad_norm": 0.81640625, "learning_rate": 2e-05, "loss": 2.2485, "num_input_tokens_seen": 4234149888, "step": 2019 }, { "epoch": 0.07, "grad_norm": 0.8203125, "learning_rate": 2e-05, "loss": 2.2445, "num_input_tokens_seen": 4236247040, "step": 2020 }, { "epoch": 0.07, "grad_norm": 0.68359375, "learning_rate": 2e-05, "loss": 2.2537, "num_input_tokens_seen": 4238344192, "step": 2021 }, { "epoch": 0.07, "grad_norm": 0.71875, "learning_rate": 2e-05, "loss": 2.2574, "num_input_tokens_seen": 4240441344, "step": 2022 }, { "epoch": 0.07, "grad_norm": 0.80078125, "learning_rate": 2e-05, "loss": 2.2371, "num_input_tokens_seen": 4242538496, "step": 2023 }, { "epoch": 0.07, "grad_norm": 0.80078125, "learning_rate": 2e-05, "loss": 2.2538, "num_input_tokens_seen": 4244635648, "step": 2024 }, { "epoch": 0.07, "grad_norm": 0.75, "learning_rate": 2e-05, "loss": 2.2459, "num_input_tokens_seen": 4246732800, "step": 2025 }, { "epoch": 0.07, "grad_norm": 0.78125, "learning_rate": 2e-05, "loss": 2.2343, "num_input_tokens_seen": 4248829952, "step": 2026 }, { "epoch": 0.07, "grad_norm": 0.76953125, "learning_rate": 2e-05, "loss": 2.2579, "num_input_tokens_seen": 4250927104, "step": 2027 }, { "epoch": 0.07, "grad_norm": 0.89453125, "learning_rate": 2e-05, "loss": 2.2443, "num_input_tokens_seen": 4253024256, "step": 2028 }, { "epoch": 0.07, "grad_norm": 0.8203125, "learning_rate": 2e-05, "loss": 2.2548, "num_input_tokens_seen": 4255121408, "step": 2029 }, { "epoch": 0.07, "grad_norm": 0.8046875, "learning_rate": 2e-05, "loss": 2.2433, "num_input_tokens_seen": 4257218560, "step": 2030 }, { "epoch": 0.07, "grad_norm": 0.87109375, "learning_rate": 2e-05, "loss": 2.2678, "num_input_tokens_seen": 4259315712, "step": 2031 }, { "epoch": 0.07, "grad_norm": 0.80078125, "learning_rate": 2e-05, "loss": 2.2687, "num_input_tokens_seen": 4261412864, "step": 2032 }, { "epoch": 0.07, "grad_norm": 0.859375, "learning_rate": 2e-05, "loss": 2.2481, "num_input_tokens_seen": 4263510016, "step": 2033 }, { "epoch": 0.07, "grad_norm": 1.1171875, "learning_rate": 2e-05, "loss": 2.2347, "num_input_tokens_seen": 4265607168, "step": 2034 }, { "epoch": 0.07, "grad_norm": 0.8828125, "learning_rate": 2e-05, "loss": 2.2405, "num_input_tokens_seen": 4267704320, "step": 2035 }, { "epoch": 0.07, "grad_norm": 0.82421875, "learning_rate": 2e-05, "loss": 2.2299, "num_input_tokens_seen": 4269801472, "step": 2036 }, { "epoch": 0.07, "grad_norm": 1.21875, "learning_rate": 2e-05, "loss": 2.2466, "num_input_tokens_seen": 4271898624, "step": 2037 }, { "epoch": 0.07, "grad_norm": 1.15625, "learning_rate": 2e-05, "loss": 2.2504, "num_input_tokens_seen": 4273995776, "step": 2038 }, { "epoch": 0.07, "grad_norm": 0.77734375, "learning_rate": 2e-05, "loss": 2.225, "num_input_tokens_seen": 4276092928, "step": 2039 }, { "epoch": 0.07, "grad_norm": 1.3671875, "learning_rate": 2e-05, "loss": 2.2708, "num_input_tokens_seen": 4278190080, "step": 2040 }, { "epoch": 0.07, "grad_norm": 1.3515625, "learning_rate": 2e-05, "loss": 2.2537, "num_input_tokens_seen": 4280287232, "step": 2041 }, { "epoch": 0.07, "grad_norm": 0.828125, "learning_rate": 2e-05, "loss": 2.2186, "num_input_tokens_seen": 4282384384, "step": 2042 }, { "epoch": 0.07, "grad_norm": 1.0546875, "learning_rate": 2e-05, "loss": 2.2279, "num_input_tokens_seen": 4284481536, "step": 2043 }, { "epoch": 0.07, "grad_norm": 0.9140625, "learning_rate": 2e-05, "loss": 2.2589, "num_input_tokens_seen": 4286578688, "step": 2044 }, { "epoch": 0.07, "grad_norm": 0.75, "learning_rate": 2e-05, "loss": 2.234, "num_input_tokens_seen": 4288675840, "step": 2045 }, { "epoch": 0.07, "grad_norm": 0.8984375, "learning_rate": 2e-05, "loss": 2.24, "num_input_tokens_seen": 4290772992, "step": 2046 }, { "epoch": 0.07, "grad_norm": 0.9453125, "learning_rate": 2e-05, "loss": 2.2474, "num_input_tokens_seen": 4292870144, "step": 2047 }, { "epoch": 0.07, "grad_norm": 0.8828125, "learning_rate": 2e-05, "loss": 2.2538, "num_input_tokens_seen": 4294967296, "step": 2048 }, { "epoch": 0.07, "grad_norm": 0.828125, "learning_rate": 2e-05, "loss": 2.2374, "num_input_tokens_seen": 4297064448, "step": 2049 }, { "epoch": 0.07, "grad_norm": 1.015625, "learning_rate": 2e-05, "loss": 2.2482, "num_input_tokens_seen": 4299161600, "step": 2050 }, { "epoch": 0.07, "grad_norm": 0.9296875, "learning_rate": 2e-05, "loss": 2.3018, "num_input_tokens_seen": 4301258752, "step": 2051 }, { "epoch": 0.07, "grad_norm": 0.828125, "learning_rate": 2e-05, "loss": 2.2305, "num_input_tokens_seen": 4303355904, "step": 2052 }, { "epoch": 0.07, "grad_norm": 1.0703125, "learning_rate": 2e-05, "loss": 2.2291, "num_input_tokens_seen": 4305453056, "step": 2053 }, { "epoch": 0.07, "grad_norm": 1.1953125, "learning_rate": 2e-05, "loss": 2.2681, "num_input_tokens_seen": 4307550208, "step": 2054 }, { "epoch": 0.07, "grad_norm": 0.7578125, "learning_rate": 2e-05, "loss": 2.2618, "num_input_tokens_seen": 4309647360, "step": 2055 }, { "epoch": 0.07, "grad_norm": 1.0703125, "learning_rate": 2e-05, "loss": 2.2511, "num_input_tokens_seen": 4311744512, "step": 2056 }, { "epoch": 0.07, "grad_norm": 1.0703125, "learning_rate": 2e-05, "loss": 2.2143, "num_input_tokens_seen": 4313841664, "step": 2057 }, { "epoch": 0.08, "grad_norm": 0.86328125, "learning_rate": 2e-05, "loss": 2.2648, "num_input_tokens_seen": 4315938816, "step": 2058 }, { "epoch": 0.08, "grad_norm": 0.859375, "learning_rate": 2e-05, "loss": 2.2365, "num_input_tokens_seen": 4318035968, "step": 2059 }, { "epoch": 0.08, "grad_norm": 0.83984375, "learning_rate": 2e-05, "loss": 2.2296, "num_input_tokens_seen": 4320133120, "step": 2060 }, { "epoch": 0.08, "grad_norm": 0.91796875, "learning_rate": 2e-05, "loss": 2.2537, "num_input_tokens_seen": 4322230272, "step": 2061 }, { "epoch": 0.08, "grad_norm": 0.81640625, "learning_rate": 2e-05, "loss": 2.2419, "num_input_tokens_seen": 4324327424, "step": 2062 }, { "epoch": 0.08, "grad_norm": 1.03125, "learning_rate": 2e-05, "loss": 2.2536, "num_input_tokens_seen": 4326424576, "step": 2063 }, { "epoch": 0.08, "grad_norm": 1.0390625, "learning_rate": 2e-05, "loss": 2.2211, "num_input_tokens_seen": 4328521728, "step": 2064 }, { "epoch": 0.08, "grad_norm": 1.2265625, "learning_rate": 2e-05, "loss": 2.2456, "num_input_tokens_seen": 4330618880, "step": 2065 }, { "epoch": 0.08, "grad_norm": 1.078125, "learning_rate": 2e-05, "loss": 2.2504, "num_input_tokens_seen": 4332716032, "step": 2066 }, { "epoch": 0.08, "grad_norm": 1.2421875, "learning_rate": 2e-05, "loss": 2.251, "num_input_tokens_seen": 4334813184, "step": 2067 }, { "epoch": 0.08, "grad_norm": 1.375, "learning_rate": 2e-05, "loss": 2.2403, "num_input_tokens_seen": 4336910336, "step": 2068 }, { "epoch": 0.08, "grad_norm": 1.609375, "learning_rate": 2e-05, "loss": 2.2676, "num_input_tokens_seen": 4339007488, "step": 2069 }, { "epoch": 0.08, "grad_norm": 1.296875, "learning_rate": 2e-05, "loss": 2.2543, "num_input_tokens_seen": 4341104640, "step": 2070 }, { "epoch": 0.08, "grad_norm": 0.89453125, "learning_rate": 2e-05, "loss": 2.2235, "num_input_tokens_seen": 4343201792, "step": 2071 }, { "epoch": 0.08, "grad_norm": 1.1328125, "learning_rate": 2e-05, "loss": 2.2591, "num_input_tokens_seen": 4345298944, "step": 2072 }, { "epoch": 0.08, "grad_norm": 0.828125, "learning_rate": 2e-05, "loss": 2.2658, "num_input_tokens_seen": 4347396096, "step": 2073 }, { "epoch": 0.08, "grad_norm": 1.0078125, "learning_rate": 2e-05, "loss": 2.2363, "num_input_tokens_seen": 4349493248, "step": 2074 }, { "epoch": 0.08, "grad_norm": 0.93359375, "learning_rate": 2e-05, "loss": 2.2125, "num_input_tokens_seen": 4351590400, "step": 2075 }, { "epoch": 0.08, "grad_norm": 0.890625, "learning_rate": 2e-05, "loss": 2.2348, "num_input_tokens_seen": 4353687552, "step": 2076 }, { "epoch": 0.08, "grad_norm": 1.046875, "learning_rate": 2e-05, "loss": 2.2434, "num_input_tokens_seen": 4355784704, "step": 2077 }, { "epoch": 0.08, "grad_norm": 0.8359375, "learning_rate": 2e-05, "loss": 2.1948, "num_input_tokens_seen": 4357881856, "step": 2078 }, { "epoch": 0.08, "grad_norm": 1.15625, "learning_rate": 2e-05, "loss": 2.2597, "num_input_tokens_seen": 4359979008, "step": 2079 }, { "epoch": 0.08, "grad_norm": 0.890625, "learning_rate": 2e-05, "loss": 2.2381, "num_input_tokens_seen": 4362076160, "step": 2080 }, { "epoch": 0.08, "grad_norm": 1.1328125, "learning_rate": 2e-05, "loss": 2.2364, "num_input_tokens_seen": 4364173312, "step": 2081 }, { "epoch": 0.08, "grad_norm": 1.1640625, "learning_rate": 2e-05, "loss": 2.2216, "num_input_tokens_seen": 4366270464, "step": 2082 }, { "epoch": 0.08, "grad_norm": 0.91015625, "learning_rate": 2e-05, "loss": 2.2442, "num_input_tokens_seen": 4368367616, "step": 2083 }, { "epoch": 0.08, "grad_norm": 1.0625, "learning_rate": 2e-05, "loss": 2.2489, "num_input_tokens_seen": 4370464768, "step": 2084 }, { "epoch": 0.08, "grad_norm": 0.953125, "learning_rate": 2e-05, "loss": 2.2523, "num_input_tokens_seen": 4372561920, "step": 2085 }, { "epoch": 0.08, "grad_norm": 0.90234375, "learning_rate": 2e-05, "loss": 2.2367, "num_input_tokens_seen": 4374659072, "step": 2086 }, { "epoch": 0.08, "grad_norm": 0.91015625, "learning_rate": 2e-05, "loss": 2.256, "num_input_tokens_seen": 4376756224, "step": 2087 }, { "epoch": 0.08, "grad_norm": 0.86328125, "learning_rate": 2e-05, "loss": 2.2477, "num_input_tokens_seen": 4378853376, "step": 2088 }, { "epoch": 0.08, "grad_norm": 0.81640625, "learning_rate": 2e-05, "loss": 2.252, "num_input_tokens_seen": 4380950528, "step": 2089 }, { "epoch": 0.08, "grad_norm": 0.80078125, "learning_rate": 2e-05, "loss": 2.2262, "num_input_tokens_seen": 4383047680, "step": 2090 }, { "epoch": 0.08, "grad_norm": 0.8515625, "learning_rate": 2e-05, "loss": 2.2349, "num_input_tokens_seen": 4385144832, "step": 2091 }, { "epoch": 0.08, "grad_norm": 0.76171875, "learning_rate": 2e-05, "loss": 2.2263, "num_input_tokens_seen": 4387241984, "step": 2092 }, { "epoch": 0.08, "grad_norm": 1.015625, "learning_rate": 2e-05, "loss": 2.2351, "num_input_tokens_seen": 4389339136, "step": 2093 }, { "epoch": 0.08, "grad_norm": 0.94921875, "learning_rate": 2e-05, "loss": 2.2602, "num_input_tokens_seen": 4391436288, "step": 2094 }, { "epoch": 0.08, "grad_norm": 0.921875, "learning_rate": 2e-05, "loss": 2.2371, "num_input_tokens_seen": 4393533440, "step": 2095 }, { "epoch": 0.08, "grad_norm": 1.4296875, "learning_rate": 2e-05, "loss": 2.22, "num_input_tokens_seen": 4395630592, "step": 2096 }, { "epoch": 0.08, "grad_norm": 1.3203125, "learning_rate": 2e-05, "loss": 2.2533, "num_input_tokens_seen": 4397727744, "step": 2097 }, { "epoch": 0.08, "grad_norm": 0.80859375, "learning_rate": 2e-05, "loss": 2.2633, "num_input_tokens_seen": 4399824896, "step": 2098 }, { "epoch": 0.08, "grad_norm": 1.4609375, "learning_rate": 2e-05, "loss": 2.2472, "num_input_tokens_seen": 4401922048, "step": 2099 }, { "epoch": 0.08, "grad_norm": 1.546875, "learning_rate": 2e-05, "loss": 2.2618, "num_input_tokens_seen": 4404019200, "step": 2100 }, { "epoch": 0.08, "grad_norm": 1.0703125, "learning_rate": 2e-05, "loss": 2.2135, "num_input_tokens_seen": 4406116352, "step": 2101 }, { "epoch": 0.08, "grad_norm": 0.82421875, "learning_rate": 2e-05, "loss": 2.2126, "num_input_tokens_seen": 4408213504, "step": 2102 }, { "epoch": 0.08, "grad_norm": 1.3515625, "learning_rate": 2e-05, "loss": 2.2389, "num_input_tokens_seen": 4410310656, "step": 2103 }, { "epoch": 0.08, "grad_norm": 1.0390625, "learning_rate": 2e-05, "loss": 2.2247, "num_input_tokens_seen": 4412407808, "step": 2104 }, { "epoch": 0.08, "grad_norm": 0.80859375, "learning_rate": 2e-05, "loss": 2.2196, "num_input_tokens_seen": 4414504960, "step": 2105 }, { "epoch": 0.09, "grad_norm": 0.8828125, "learning_rate": 2e-05, "loss": 2.2177, "num_input_tokens_seen": 4416602112, "step": 2106 }, { "epoch": 0.09, "grad_norm": 0.78515625, "learning_rate": 2e-05, "loss": 2.265, "num_input_tokens_seen": 4418699264, "step": 2107 }, { "epoch": 0.09, "grad_norm": 0.765625, "learning_rate": 2e-05, "loss": 2.2521, "num_input_tokens_seen": 4420796416, "step": 2108 }, { "epoch": 0.09, "grad_norm": 0.94921875, "learning_rate": 2e-05, "loss": 2.2208, "num_input_tokens_seen": 4422893568, "step": 2109 }, { "epoch": 0.09, "grad_norm": 0.734375, "learning_rate": 2e-05, "loss": 2.2399, "num_input_tokens_seen": 4424990720, "step": 2110 }, { "epoch": 0.09, "grad_norm": 0.92578125, "learning_rate": 2e-05, "loss": 2.2121, "num_input_tokens_seen": 4427087872, "step": 2111 }, { "epoch": 0.09, "grad_norm": 1.0234375, "learning_rate": 2e-05, "loss": 2.2473, "num_input_tokens_seen": 4429185024, "step": 2112 }, { "epoch": 0.09, "grad_norm": 0.76953125, "learning_rate": 2e-05, "loss": 2.2501, "num_input_tokens_seen": 4431282176, "step": 2113 }, { "epoch": 0.09, "grad_norm": 0.875, "learning_rate": 2e-05, "loss": 2.2531, "num_input_tokens_seen": 4433379328, "step": 2114 }, { "epoch": 0.09, "grad_norm": 1.0859375, "learning_rate": 2e-05, "loss": 2.233, "num_input_tokens_seen": 4435476480, "step": 2115 }, { "epoch": 0.09, "grad_norm": 0.90234375, "learning_rate": 2e-05, "loss": 2.2337, "num_input_tokens_seen": 4437573632, "step": 2116 }, { "epoch": 0.09, "grad_norm": 0.79296875, "learning_rate": 2e-05, "loss": 2.2492, "num_input_tokens_seen": 4439670784, "step": 2117 }, { "epoch": 0.09, "grad_norm": 1.046875, "learning_rate": 2e-05, "loss": 2.2544, "num_input_tokens_seen": 4441767936, "step": 2118 }, { "epoch": 0.09, "grad_norm": 0.96484375, "learning_rate": 2e-05, "loss": 2.2546, "num_input_tokens_seen": 4443865088, "step": 2119 }, { "epoch": 0.09, "grad_norm": 0.78125, "learning_rate": 2e-05, "loss": 2.2508, "num_input_tokens_seen": 4445962240, "step": 2120 }, { "epoch": 0.09, "grad_norm": 1.0546875, "learning_rate": 2e-05, "loss": 2.2426, "num_input_tokens_seen": 4448059392, "step": 2121 }, { "epoch": 0.09, "grad_norm": 0.921875, "learning_rate": 2e-05, "loss": 2.2184, "num_input_tokens_seen": 4450156544, "step": 2122 }, { "epoch": 0.09, "grad_norm": 0.77734375, "learning_rate": 2e-05, "loss": 2.2317, "num_input_tokens_seen": 4452253696, "step": 2123 }, { "epoch": 0.09, "grad_norm": 0.9140625, "learning_rate": 2e-05, "loss": 2.2562, "num_input_tokens_seen": 4454350848, "step": 2124 }, { "epoch": 0.09, "grad_norm": 0.81640625, "learning_rate": 2e-05, "loss": 2.2453, "num_input_tokens_seen": 4456448000, "step": 2125 }, { "epoch": 0.09, "grad_norm": 0.78125, "learning_rate": 2e-05, "loss": 2.2139, "num_input_tokens_seen": 4458545152, "step": 2126 }, { "epoch": 0.09, "grad_norm": 0.7578125, "learning_rate": 2e-05, "loss": 2.2362, "num_input_tokens_seen": 4460642304, "step": 2127 }, { "epoch": 0.09, "grad_norm": 0.76953125, "learning_rate": 2e-05, "loss": 2.234, "num_input_tokens_seen": 4462739456, "step": 2128 }, { "epoch": 0.09, "grad_norm": 1.0078125, "learning_rate": 2e-05, "loss": 2.2327, "num_input_tokens_seen": 4464836608, "step": 2129 }, { "epoch": 0.09, "grad_norm": 0.859375, "learning_rate": 2e-05, "loss": 2.22, "num_input_tokens_seen": 4466933760, "step": 2130 }, { "epoch": 0.09, "grad_norm": 0.9765625, "learning_rate": 2e-05, "loss": 2.2256, "num_input_tokens_seen": 4469030912, "step": 2131 }, { "epoch": 0.09, "grad_norm": 0.82421875, "learning_rate": 2e-05, "loss": 2.2394, "num_input_tokens_seen": 4471128064, "step": 2132 }, { "epoch": 0.09, "grad_norm": 1.109375, "learning_rate": 2e-05, "loss": 2.257, "num_input_tokens_seen": 4473225216, "step": 2133 }, { "epoch": 0.09, "grad_norm": 1.296875, "learning_rate": 2e-05, "loss": 2.2542, "num_input_tokens_seen": 4475322368, "step": 2134 }, { "epoch": 0.09, "grad_norm": 0.89453125, "learning_rate": 2e-05, "loss": 2.2487, "num_input_tokens_seen": 4477419520, "step": 2135 }, { "epoch": 0.09, "grad_norm": 1.125, "learning_rate": 2e-05, "loss": 2.2741, "num_input_tokens_seen": 4479516672, "step": 2136 }, { "epoch": 0.09, "grad_norm": 1.578125, "learning_rate": 2e-05, "loss": 2.2387, "num_input_tokens_seen": 4481613824, "step": 2137 }, { "epoch": 0.09, "grad_norm": 1.3125, "learning_rate": 2e-05, "loss": 2.2706, "num_input_tokens_seen": 4483710976, "step": 2138 }, { "epoch": 0.09, "grad_norm": 0.9296875, "learning_rate": 2e-05, "loss": 2.2613, "num_input_tokens_seen": 4485808128, "step": 2139 }, { "epoch": 0.09, "grad_norm": 1.21875, "learning_rate": 2e-05, "loss": 2.2488, "num_input_tokens_seen": 4487905280, "step": 2140 }, { "epoch": 0.09, "grad_norm": 1.2890625, "learning_rate": 2e-05, "loss": 2.2403, "num_input_tokens_seen": 4490002432, "step": 2141 }, { "epoch": 0.09, "grad_norm": 0.9296875, "learning_rate": 2e-05, "loss": 2.2388, "num_input_tokens_seen": 4492099584, "step": 2142 }, { "epoch": 0.09, "grad_norm": 0.80078125, "learning_rate": 2e-05, "loss": 2.2168, "num_input_tokens_seen": 4494196736, "step": 2143 }, { "epoch": 0.09, "grad_norm": 1.0859375, "learning_rate": 2e-05, "loss": 2.257, "num_input_tokens_seen": 4496293888, "step": 2144 }, { "epoch": 0.09, "grad_norm": 0.9765625, "learning_rate": 2e-05, "loss": 2.2504, "num_input_tokens_seen": 4498391040, "step": 2145 }, { "epoch": 0.09, "grad_norm": 0.7421875, "learning_rate": 2e-05, "loss": 2.2443, "num_input_tokens_seen": 4500488192, "step": 2146 }, { "epoch": 0.09, "grad_norm": 0.83984375, "learning_rate": 2e-05, "loss": 2.2134, "num_input_tokens_seen": 4502585344, "step": 2147 }, { "epoch": 0.09, "grad_norm": 0.8828125, "learning_rate": 2e-05, "loss": 2.2315, "num_input_tokens_seen": 4504682496, "step": 2148 }, { "epoch": 0.09, "grad_norm": 0.73046875, "learning_rate": 2e-05, "loss": 2.2434, "num_input_tokens_seen": 4506779648, "step": 2149 }, { "epoch": 0.09, "grad_norm": 0.75, "learning_rate": 2e-05, "loss": 2.2275, "num_input_tokens_seen": 4508876800, "step": 2150 }, { "epoch": 0.09, "grad_norm": 0.84765625, "learning_rate": 2e-05, "loss": 2.239, "num_input_tokens_seen": 4510973952, "step": 2151 }, { "epoch": 0.09, "grad_norm": 0.8828125, "learning_rate": 2e-05, "loss": 2.2218, "num_input_tokens_seen": 4513071104, "step": 2152 }, { "epoch": 0.1, "grad_norm": 0.796875, "learning_rate": 2e-05, "loss": 2.2536, "num_input_tokens_seen": 4515168256, "step": 2153 }, { "epoch": 0.1, "grad_norm": 0.734375, "learning_rate": 2e-05, "loss": 2.2386, "num_input_tokens_seen": 4517265408, "step": 2154 }, { "epoch": 0.1, "grad_norm": 0.81640625, "learning_rate": 2e-05, "loss": 2.2258, "num_input_tokens_seen": 4519362560, "step": 2155 }, { "epoch": 0.1, "grad_norm": 0.890625, "learning_rate": 2e-05, "loss": 2.2621, "num_input_tokens_seen": 4521459712, "step": 2156 }, { "epoch": 0.1, "grad_norm": 1.0546875, "learning_rate": 2e-05, "loss": 2.243, "num_input_tokens_seen": 4523556864, "step": 2157 }, { "epoch": 0.1, "grad_norm": 1.015625, "learning_rate": 2e-05, "loss": 2.2213, "num_input_tokens_seen": 4525654016, "step": 2158 }, { "epoch": 0.1, "grad_norm": 0.73046875, "learning_rate": 2e-05, "loss": 2.2142, "num_input_tokens_seen": 4527751168, "step": 2159 }, { "epoch": 0.1, "grad_norm": 0.78125, "learning_rate": 2e-05, "loss": 2.2339, "num_input_tokens_seen": 4529848320, "step": 2160 }, { "epoch": 0.1, "grad_norm": 0.8515625, "learning_rate": 2e-05, "loss": 2.2633, "num_input_tokens_seen": 4531945472, "step": 2161 }, { "epoch": 0.1, "grad_norm": 0.796875, "learning_rate": 2e-05, "loss": 2.22, "num_input_tokens_seen": 4534042624, "step": 2162 }, { "epoch": 0.1, "grad_norm": 0.89453125, "learning_rate": 2e-05, "loss": 2.2008, "num_input_tokens_seen": 4536139776, "step": 2163 }, { "epoch": 0.1, "grad_norm": 0.78125, "learning_rate": 2e-05, "loss": 2.236, "num_input_tokens_seen": 4538236928, "step": 2164 }, { "epoch": 0.1, "grad_norm": 0.953125, "learning_rate": 2e-05, "loss": 2.2376, "num_input_tokens_seen": 4540334080, "step": 2165 }, { "epoch": 0.1, "grad_norm": 0.90234375, "learning_rate": 2e-05, "loss": 2.2557, "num_input_tokens_seen": 4542431232, "step": 2166 }, { "epoch": 0.1, "grad_norm": 0.78515625, "learning_rate": 2e-05, "loss": 2.2047, "num_input_tokens_seen": 4544528384, "step": 2167 }, { "epoch": 0.1, "grad_norm": 1.0625, "learning_rate": 2e-05, "loss": 2.2418, "num_input_tokens_seen": 4546625536, "step": 2168 }, { "epoch": 0.1, "grad_norm": 0.94921875, "learning_rate": 2e-05, "loss": 2.2286, "num_input_tokens_seen": 4548722688, "step": 2169 }, { "epoch": 0.1, "grad_norm": 0.84375, "learning_rate": 2e-05, "loss": 2.2395, "num_input_tokens_seen": 4550819840, "step": 2170 }, { "epoch": 0.1, "grad_norm": 0.94140625, "learning_rate": 2e-05, "loss": 2.2208, "num_input_tokens_seen": 4552916992, "step": 2171 }, { "epoch": 0.1, "grad_norm": 0.79296875, "learning_rate": 2e-05, "loss": 2.2427, "num_input_tokens_seen": 4555014144, "step": 2172 }, { "epoch": 0.1, "grad_norm": 0.94921875, "learning_rate": 2e-05, "loss": 2.2212, "num_input_tokens_seen": 4557111296, "step": 2173 }, { "epoch": 0.1, "grad_norm": 0.89453125, "learning_rate": 2e-05, "loss": 2.2273, "num_input_tokens_seen": 4559208448, "step": 2174 }, { "epoch": 0.1, "grad_norm": 1.0234375, "learning_rate": 2e-05, "loss": 2.2387, "num_input_tokens_seen": 4561305600, "step": 2175 }, { "epoch": 0.1, "grad_norm": 0.8984375, "learning_rate": 2e-05, "loss": 2.2109, "num_input_tokens_seen": 4563402752, "step": 2176 }, { "epoch": 0.1, "grad_norm": 0.88671875, "learning_rate": 2e-05, "loss": 2.229, "num_input_tokens_seen": 4565499904, "step": 2177 }, { "epoch": 0.1, "grad_norm": 0.98828125, "learning_rate": 2e-05, "loss": 2.2521, "num_input_tokens_seen": 4567597056, "step": 2178 }, { "epoch": 0.1, "grad_norm": 1.1015625, "learning_rate": 2e-05, "loss": 2.2126, "num_input_tokens_seen": 4569694208, "step": 2179 }, { "epoch": 0.1, "grad_norm": 0.81640625, "learning_rate": 2e-05, "loss": 2.2463, "num_input_tokens_seen": 4571791360, "step": 2180 }, { "epoch": 0.1, "grad_norm": 1.015625, "learning_rate": 2e-05, "loss": 2.2159, "num_input_tokens_seen": 4573888512, "step": 2181 }, { "epoch": 0.1, "grad_norm": 1.125, "learning_rate": 2e-05, "loss": 2.2124, "num_input_tokens_seen": 4575985664, "step": 2182 }, { "epoch": 0.1, "grad_norm": 0.90234375, "learning_rate": 2e-05, "loss": 2.2327, "num_input_tokens_seen": 4578082816, "step": 2183 }, { "epoch": 0.1, "grad_norm": 0.921875, "learning_rate": 2e-05, "loss": 2.2233, "num_input_tokens_seen": 4580179968, "step": 2184 }, { "epoch": 0.1, "grad_norm": 0.98828125, "learning_rate": 2e-05, "loss": 2.1894, "num_input_tokens_seen": 4582277120, "step": 2185 }, { "epoch": 0.1, "grad_norm": 1.1796875, "learning_rate": 2e-05, "loss": 2.2294, "num_input_tokens_seen": 4584374272, "step": 2186 }, { "epoch": 0.1, "grad_norm": 1.0, "learning_rate": 2e-05, "loss": 2.2339, "num_input_tokens_seen": 4586471424, "step": 2187 }, { "epoch": 0.1, "grad_norm": 0.8125, "learning_rate": 2e-05, "loss": 2.2275, "num_input_tokens_seen": 4588568576, "step": 2188 }, { "epoch": 0.1, "grad_norm": 1.171875, "learning_rate": 2e-05, "loss": 2.2488, "num_input_tokens_seen": 4590665728, "step": 2189 }, { "epoch": 0.1, "eval_loss": 2.2922277450561523, "eval_runtime": 2101.3307, "eval_samples_per_second": 1.876, "eval_steps_per_second": 0.469, "num_input_tokens_seen": 4590665728, "step": 2189 }, { "epoch": 0.1, "grad_norm": 0.97265625, "learning_rate": 2e-05, "loss": 2.2405, "num_input_tokens_seen": 4592762880, "step": 2190 }, { "epoch": 0.1, "grad_norm": 0.78125, "learning_rate": 2e-05, "loss": 2.2444, "num_input_tokens_seen": 4594860032, "step": 2191 }, { "epoch": 0.1, "grad_norm": 1.0390625, "learning_rate": 2e-05, "loss": 2.2438, "num_input_tokens_seen": 4596957184, "step": 2192 }, { "epoch": 0.1, "grad_norm": 0.84765625, "learning_rate": 2e-05, "loss": 2.2298, "num_input_tokens_seen": 4599054336, "step": 2193 }, { "epoch": 0.1, "grad_norm": 0.87109375, "learning_rate": 2e-05, "loss": 2.2418, "num_input_tokens_seen": 4601151488, "step": 2194 }, { "epoch": 0.1, "grad_norm": 0.9609375, "learning_rate": 2e-05, "loss": 2.2123, "num_input_tokens_seen": 4603248640, "step": 2195 }, { "epoch": 0.1, "grad_norm": 0.80859375, "learning_rate": 2e-05, "loss": 2.2411, "num_input_tokens_seen": 4605345792, "step": 2196 }, { "epoch": 0.1, "grad_norm": 1.2578125, "learning_rate": 2e-05, "loss": 2.2491, "num_input_tokens_seen": 4607442944, "step": 2197 }, { "epoch": 0.1, "grad_norm": 1.1484375, "learning_rate": 2e-05, "loss": 2.2304, "num_input_tokens_seen": 4609540096, "step": 2198 }, { "epoch": 0.1, "grad_norm": 0.875, "learning_rate": 2e-05, "loss": 2.212, "num_input_tokens_seen": 4611637248, "step": 2199 }, { "epoch": 0.1, "grad_norm": 1.4453125, "learning_rate": 2e-05, "loss": 2.2465, "num_input_tokens_seen": 4613734400, "step": 2200 }, { "epoch": 0.11, "grad_norm": 1.3515625, "learning_rate": 2e-05, "loss": 2.2287, "num_input_tokens_seen": 4615831552, "step": 2201 }, { "epoch": 0.11, "grad_norm": 0.88671875, "learning_rate": 2e-05, "loss": 2.2248, "num_input_tokens_seen": 4617928704, "step": 2202 }, { "epoch": 0.11, "grad_norm": 1.28125, "learning_rate": 2e-05, "loss": 2.2287, "num_input_tokens_seen": 4620025856, "step": 2203 }, { "epoch": 0.11, "grad_norm": 1.25, "learning_rate": 2e-05, "loss": 2.2228, "num_input_tokens_seen": 4622123008, "step": 2204 }, { "epoch": 0.11, "grad_norm": 0.91015625, "learning_rate": 2e-05, "loss": 2.2178, "num_input_tokens_seen": 4624220160, "step": 2205 }, { "epoch": 0.11, "grad_norm": 0.86328125, "learning_rate": 2e-05, "loss": 2.2298, "num_input_tokens_seen": 4626317312, "step": 2206 }, { "epoch": 0.11, "grad_norm": 1.1640625, "learning_rate": 2e-05, "loss": 2.2332, "num_input_tokens_seen": 4628414464, "step": 2207 }, { "epoch": 0.11, "grad_norm": 1.078125, "learning_rate": 2e-05, "loss": 2.2124, "num_input_tokens_seen": 4630511616, "step": 2208 }, { "epoch": 0.11, "grad_norm": 0.7890625, "learning_rate": 2e-05, "loss": 2.2237, "num_input_tokens_seen": 4632608768, "step": 2209 }, { "epoch": 0.11, "grad_norm": 0.96484375, "learning_rate": 2e-05, "loss": 2.2125, "num_input_tokens_seen": 4634705920, "step": 2210 }, { "epoch": 0.11, "grad_norm": 0.84765625, "learning_rate": 2e-05, "loss": 2.2237, "num_input_tokens_seen": 4636803072, "step": 2211 }, { "epoch": 0.11, "grad_norm": 0.83984375, "learning_rate": 2e-05, "loss": 2.205, "num_input_tokens_seen": 4638900224, "step": 2212 }, { "epoch": 0.11, "grad_norm": 0.7734375, "learning_rate": 2e-05, "loss": 2.2087, "num_input_tokens_seen": 4640997376, "step": 2213 }, { "epoch": 0.11, "grad_norm": 0.8828125, "learning_rate": 2e-05, "loss": 2.218, "num_input_tokens_seen": 4643094528, "step": 2214 }, { "epoch": 0.11, "grad_norm": 0.90625, "learning_rate": 2e-05, "loss": 2.2039, "num_input_tokens_seen": 4645191680, "step": 2215 }, { "epoch": 0.11, "grad_norm": 0.87109375, "learning_rate": 2e-05, "loss": 2.2072, "num_input_tokens_seen": 4647288832, "step": 2216 }, { "epoch": 0.11, "grad_norm": 0.83203125, "learning_rate": 2e-05, "loss": 2.2188, "num_input_tokens_seen": 4649385984, "step": 2217 }, { "epoch": 0.11, "grad_norm": 0.82421875, "learning_rate": 2e-05, "loss": 2.2232, "num_input_tokens_seen": 4651483136, "step": 2218 }, { "epoch": 0.11, "grad_norm": 0.96484375, "learning_rate": 2e-05, "loss": 2.2337, "num_input_tokens_seen": 4653580288, "step": 2219 }, { "epoch": 0.11, "grad_norm": 0.8359375, "learning_rate": 2e-05, "loss": 2.2159, "num_input_tokens_seen": 4655677440, "step": 2220 }, { "epoch": 0.11, "grad_norm": 0.953125, "learning_rate": 2e-05, "loss": 2.2206, "num_input_tokens_seen": 4657774592, "step": 2221 }, { "epoch": 0.11, "grad_norm": 1.0859375, "learning_rate": 2e-05, "loss": 2.2128, "num_input_tokens_seen": 4659871744, "step": 2222 }, { "epoch": 0.11, "grad_norm": 0.97265625, "learning_rate": 2e-05, "loss": 2.2072, "num_input_tokens_seen": 4661968896, "step": 2223 }, { "epoch": 0.11, "grad_norm": 1.015625, "learning_rate": 2e-05, "loss": 2.1864, "num_input_tokens_seen": 4664066048, "step": 2224 }, { "epoch": 0.11, "grad_norm": 0.8203125, "learning_rate": 2e-05, "loss": 2.2103, "num_input_tokens_seen": 4666163200, "step": 2225 }, { "epoch": 0.11, "grad_norm": 1.25, "learning_rate": 2e-05, "loss": 2.2004, "num_input_tokens_seen": 4668260352, "step": 2226 }, { "epoch": 0.11, "grad_norm": 1.1484375, "learning_rate": 2e-05, "loss": 2.2474, "num_input_tokens_seen": 4670357504, "step": 2227 }, { "epoch": 0.11, "grad_norm": 1.0078125, "learning_rate": 2e-05, "loss": 2.2397, "num_input_tokens_seen": 4672454656, "step": 2228 }, { "epoch": 0.11, "grad_norm": 0.90625, "learning_rate": 2e-05, "loss": 2.2307, "num_input_tokens_seen": 4674551808, "step": 2229 }, { "epoch": 0.11, "grad_norm": 1.5, "learning_rate": 2e-05, "loss": 2.2362, "num_input_tokens_seen": 4676648960, "step": 2230 }, { "epoch": 0.11, "grad_norm": 2.21875, "learning_rate": 2e-05, "loss": 2.2631, "num_input_tokens_seen": 4678746112, "step": 2231 }, { "epoch": 0.11, "grad_norm": 1.0546875, "learning_rate": 2e-05, "loss": 2.2201, "num_input_tokens_seen": 4680843264, "step": 2232 }, { "epoch": 0.11, "grad_norm": 0.9921875, "learning_rate": 2e-05, "loss": 2.1976, "num_input_tokens_seen": 4682940416, "step": 2233 }, { "epoch": 0.11, "grad_norm": 0.9609375, "learning_rate": 2e-05, "loss": 2.2425, "num_input_tokens_seen": 4685037568, "step": 2234 }, { "epoch": 0.11, "grad_norm": 0.8984375, "learning_rate": 2e-05, "loss": 2.1857, "num_input_tokens_seen": 4687134720, "step": 2235 }, { "epoch": 0.11, "grad_norm": 0.90625, "learning_rate": 2e-05, "loss": 2.2355, "num_input_tokens_seen": 4689231872, "step": 2236 }, { "epoch": 0.11, "grad_norm": 0.88671875, "learning_rate": 2e-05, "loss": 2.2559, "num_input_tokens_seen": 4691329024, "step": 2237 }, { "epoch": 0.11, "grad_norm": 0.96875, "learning_rate": 2e-05, "loss": 2.2244, "num_input_tokens_seen": 4693426176, "step": 2238 }, { "epoch": 0.11, "grad_norm": 0.7890625, "learning_rate": 2e-05, "loss": 2.2114, "num_input_tokens_seen": 4695523328, "step": 2239 }, { "epoch": 0.11, "grad_norm": 1.1640625, "learning_rate": 2e-05, "loss": 2.2016, "num_input_tokens_seen": 4697620480, "step": 2240 }, { "epoch": 0.11, "grad_norm": 1.046875, "learning_rate": 2e-05, "loss": 2.2105, "num_input_tokens_seen": 4699717632, "step": 2241 }, { "epoch": 0.11, "grad_norm": 0.87890625, "learning_rate": 2e-05, "loss": 2.2006, "num_input_tokens_seen": 4701814784, "step": 2242 }, { "epoch": 0.11, "grad_norm": 1.2734375, "learning_rate": 2e-05, "loss": 2.1822, "num_input_tokens_seen": 4703911936, "step": 2243 }, { "epoch": 0.11, "grad_norm": 1.0078125, "learning_rate": 2e-05, "loss": 2.2176, "num_input_tokens_seen": 4706009088, "step": 2244 }, { "epoch": 0.11, "grad_norm": 0.73046875, "learning_rate": 2e-05, "loss": 2.2156, "num_input_tokens_seen": 4708106240, "step": 2245 }, { "epoch": 0.11, "grad_norm": 1.453125, "learning_rate": 2e-05, "loss": 2.2331, "num_input_tokens_seen": 4710203392, "step": 2246 }, { "epoch": 0.11, "grad_norm": 1.5078125, "learning_rate": 2e-05, "loss": 2.2408, "num_input_tokens_seen": 4712300544, "step": 2247 }, { "epoch": 0.11, "grad_norm": 0.765625, "learning_rate": 2e-05, "loss": 2.249, "num_input_tokens_seen": 4714397696, "step": 2248 }, { "epoch": 0.12, "grad_norm": 1.21875, "learning_rate": 2e-05, "loss": 2.2389, "num_input_tokens_seen": 4716494848, "step": 2249 }, { "epoch": 0.12, "grad_norm": 1.65625, "learning_rate": 2e-05, "loss": 2.2193, "num_input_tokens_seen": 4718592000, "step": 2250 }, { "epoch": 0.12, "grad_norm": 1.2265625, "learning_rate": 2e-05, "loss": 2.2399, "num_input_tokens_seen": 4720689152, "step": 2251 }, { "epoch": 0.12, "grad_norm": 1.2734375, "learning_rate": 2e-05, "loss": 2.2025, "num_input_tokens_seen": 4722786304, "step": 2252 }, { "epoch": 0.12, "grad_norm": 1.84375, "learning_rate": 2e-05, "loss": 2.2386, "num_input_tokens_seen": 4724883456, "step": 2253 }, { "epoch": 0.12, "grad_norm": 1.4453125, "learning_rate": 2e-05, "loss": 2.2075, "num_input_tokens_seen": 4726980608, "step": 2254 }, { "epoch": 0.12, "grad_norm": 0.82421875, "learning_rate": 2e-05, "loss": 2.1982, "num_input_tokens_seen": 4729077760, "step": 2255 }, { "epoch": 0.12, "grad_norm": 1.734375, "learning_rate": 2e-05, "loss": 2.2462, "num_input_tokens_seen": 4731174912, "step": 2256 }, { "epoch": 0.12, "grad_norm": 1.6328125, "learning_rate": 2e-05, "loss": 2.2281, "num_input_tokens_seen": 4733272064, "step": 2257 }, { "epoch": 0.12, "grad_norm": 0.99609375, "learning_rate": 2e-05, "loss": 2.243, "num_input_tokens_seen": 4735369216, "step": 2258 }, { "epoch": 0.12, "grad_norm": 1.1015625, "learning_rate": 2e-05, "loss": 2.2094, "num_input_tokens_seen": 4737466368, "step": 2259 }, { "epoch": 0.12, "grad_norm": 1.34375, "learning_rate": 2e-05, "loss": 2.2054, "num_input_tokens_seen": 4739563520, "step": 2260 }, { "epoch": 0.12, "grad_norm": 3.6875, "learning_rate": 2e-05, "loss": 2.1821, "num_input_tokens_seen": 4741660672, "step": 2261 }, { "epoch": 0.12, "grad_norm": 0.953125, "learning_rate": 2e-05, "loss": 2.2007, "num_input_tokens_seen": 4743757824, "step": 2262 }, { "epoch": 0.12, "grad_norm": 0.97265625, "learning_rate": 2e-05, "loss": 2.2274, "num_input_tokens_seen": 4745854976, "step": 2263 }, { "epoch": 0.12, "grad_norm": 0.9140625, "learning_rate": 2e-05, "loss": 2.2129, "num_input_tokens_seen": 4747952128, "step": 2264 }, { "epoch": 0.12, "grad_norm": 0.81640625, "learning_rate": 2e-05, "loss": 2.2175, "num_input_tokens_seen": 4750049280, "step": 2265 }, { "epoch": 0.12, "grad_norm": 0.83984375, "learning_rate": 2e-05, "loss": 2.2393, "num_input_tokens_seen": 4752146432, "step": 2266 }, { "epoch": 0.12, "grad_norm": 0.87890625, "learning_rate": 2e-05, "loss": 2.2158, "num_input_tokens_seen": 4754243584, "step": 2267 }, { "epoch": 0.12, "grad_norm": 1.140625, "learning_rate": 2e-05, "loss": 2.2568, "num_input_tokens_seen": 4756340736, "step": 2268 }, { "epoch": 0.12, "grad_norm": 0.97265625, "learning_rate": 2e-05, "loss": 2.2007, "num_input_tokens_seen": 4758437888, "step": 2269 }, { "epoch": 0.12, "grad_norm": 0.96484375, "learning_rate": 2e-05, "loss": 2.2047, "num_input_tokens_seen": 4760535040, "step": 2270 }, { "epoch": 0.12, "grad_norm": 1.296875, "learning_rate": 2e-05, "loss": 2.2197, "num_input_tokens_seen": 4762632192, "step": 2271 }, { "epoch": 0.12, "grad_norm": 1.203125, "learning_rate": 2e-05, "loss": 2.2206, "num_input_tokens_seen": 4764729344, "step": 2272 }, { "epoch": 0.12, "grad_norm": 0.94921875, "learning_rate": 2e-05, "loss": 2.2069, "num_input_tokens_seen": 4766826496, "step": 2273 }, { "epoch": 0.12, "grad_norm": 1.2109375, "learning_rate": 2e-05, "loss": 2.247, "num_input_tokens_seen": 4768923648, "step": 2274 }, { "epoch": 0.12, "grad_norm": 1.484375, "learning_rate": 2e-05, "loss": 2.2155, "num_input_tokens_seen": 4771020800, "step": 2275 }, { "epoch": 0.12, "grad_norm": 1.03125, "learning_rate": 2e-05, "loss": 2.2072, "num_input_tokens_seen": 4773117952, "step": 2276 }, { "epoch": 0.12, "grad_norm": 1.53125, "learning_rate": 2e-05, "loss": 2.2116, "num_input_tokens_seen": 4775215104, "step": 2277 }, { "epoch": 0.12, "grad_norm": 1.5, "learning_rate": 2e-05, "loss": 2.1617, "num_input_tokens_seen": 4777312256, "step": 2278 }, { "epoch": 0.12, "grad_norm": 0.953125, "learning_rate": 2e-05, "loss": 2.2556, "num_input_tokens_seen": 4779409408, "step": 2279 }, { "epoch": 0.12, "grad_norm": 1.2890625, "learning_rate": 2e-05, "loss": 2.2067, "num_input_tokens_seen": 4781506560, "step": 2280 }, { "epoch": 0.12, "grad_norm": 1.5546875, "learning_rate": 2e-05, "loss": 2.2204, "num_input_tokens_seen": 4783603712, "step": 2281 }, { "epoch": 0.12, "grad_norm": 1.0, "learning_rate": 2e-05, "loss": 2.1997, "num_input_tokens_seen": 4785700864, "step": 2282 }, { "epoch": 0.12, "grad_norm": 1.421875, "learning_rate": 2e-05, "loss": 2.2384, "num_input_tokens_seen": 4787798016, "step": 2283 }, { "epoch": 0.12, "grad_norm": 1.34375, "learning_rate": 2e-05, "loss": 2.2249, "num_input_tokens_seen": 4789895168, "step": 2284 }, { "epoch": 0.12, "grad_norm": 0.82421875, "learning_rate": 2e-05, "loss": 2.226, "num_input_tokens_seen": 4791992320, "step": 2285 }, { "epoch": 0.12, "grad_norm": 0.85546875, "learning_rate": 2e-05, "loss": 2.2345, "num_input_tokens_seen": 4794089472, "step": 2286 }, { "epoch": 0.12, "grad_norm": 1.0, "learning_rate": 2e-05, "loss": 2.2021, "num_input_tokens_seen": 4796186624, "step": 2287 }, { "epoch": 0.12, "grad_norm": 0.8515625, "learning_rate": 2e-05, "loss": 2.2128, "num_input_tokens_seen": 4798283776, "step": 2288 }, { "epoch": 0.12, "grad_norm": 0.8359375, "learning_rate": 2e-05, "loss": 2.2279, "num_input_tokens_seen": 4800380928, "step": 2289 }, { "epoch": 0.12, "grad_norm": 0.79296875, "learning_rate": 2e-05, "loss": 2.2229, "num_input_tokens_seen": 4802478080, "step": 2290 }, { "epoch": 0.12, "grad_norm": 0.8828125, "learning_rate": 2e-05, "loss": 2.2167, "num_input_tokens_seen": 4804575232, "step": 2291 }, { "epoch": 0.12, "grad_norm": 0.81640625, "learning_rate": 2e-05, "loss": 2.2161, "num_input_tokens_seen": 4806672384, "step": 2292 }, { "epoch": 0.12, "grad_norm": 0.80859375, "learning_rate": 2e-05, "loss": 2.2243, "num_input_tokens_seen": 4808769536, "step": 2293 }, { "epoch": 0.12, "grad_norm": 0.8828125, "learning_rate": 2e-05, "loss": 2.2112, "num_input_tokens_seen": 4810866688, "step": 2294 }, { "epoch": 0.12, "grad_norm": 0.8203125, "learning_rate": 2e-05, "loss": 2.2128, "num_input_tokens_seen": 4812963840, "step": 2295 }, { "epoch": 0.12, "grad_norm": 0.953125, "learning_rate": 2e-05, "loss": 2.2524, "num_input_tokens_seen": 4815060992, "step": 2296 }, { "epoch": 0.13, "grad_norm": 0.85546875, "learning_rate": 2e-05, "loss": 2.211, "num_input_tokens_seen": 4817158144, "step": 2297 }, { "epoch": 0.13, "grad_norm": 0.8984375, "learning_rate": 2e-05, "loss": 2.2139, "num_input_tokens_seen": 4819255296, "step": 2298 }, { "epoch": 0.13, "grad_norm": 0.91015625, "learning_rate": 2e-05, "loss": 2.1893, "num_input_tokens_seen": 4821352448, "step": 2299 }, { "epoch": 0.13, "grad_norm": 0.890625, "learning_rate": 2e-05, "loss": 2.2162, "num_input_tokens_seen": 4823449600, "step": 2300 }, { "epoch": 0.13, "grad_norm": 0.8984375, "learning_rate": 2e-05, "loss": 2.2545, "num_input_tokens_seen": 4825546752, "step": 2301 }, { "epoch": 0.13, "grad_norm": 0.83203125, "learning_rate": 2e-05, "loss": 2.2129, "num_input_tokens_seen": 4827643904, "step": 2302 }, { "epoch": 0.13, "grad_norm": 1.203125, "learning_rate": 2e-05, "loss": 2.2218, "num_input_tokens_seen": 4829741056, "step": 2303 }, { "epoch": 0.13, "grad_norm": 0.8515625, "learning_rate": 2e-05, "loss": 2.2165, "num_input_tokens_seen": 4831838208, "step": 2304 }, { "epoch": 0.13, "grad_norm": 1.09375, "learning_rate": 2e-05, "loss": 2.2324, "num_input_tokens_seen": 4833935360, "step": 2305 }, { "epoch": 0.13, "grad_norm": 1.15625, "learning_rate": 2e-05, "loss": 2.2368, "num_input_tokens_seen": 4836032512, "step": 2306 }, { "epoch": 0.13, "grad_norm": 0.8671875, "learning_rate": 2e-05, "loss": 2.2558, "num_input_tokens_seen": 4838129664, "step": 2307 }, { "epoch": 0.13, "grad_norm": 0.83984375, "learning_rate": 2e-05, "loss": 2.2291, "num_input_tokens_seen": 4840226816, "step": 2308 }, { "epoch": 0.13, "grad_norm": 0.93359375, "learning_rate": 2e-05, "loss": 2.2295, "num_input_tokens_seen": 4842323968, "step": 2309 }, { "epoch": 0.13, "grad_norm": 0.73828125, "learning_rate": 2e-05, "loss": 2.2345, "num_input_tokens_seen": 4844421120, "step": 2310 }, { "epoch": 0.13, "grad_norm": 0.8671875, "learning_rate": 2e-05, "loss": 2.2057, "num_input_tokens_seen": 4846518272, "step": 2311 }, { "epoch": 0.13, "grad_norm": 0.78515625, "learning_rate": 2e-05, "loss": 2.2473, "num_input_tokens_seen": 4848615424, "step": 2312 }, { "epoch": 0.13, "grad_norm": 0.7265625, "learning_rate": 2e-05, "loss": 2.2094, "num_input_tokens_seen": 4850712576, "step": 2313 }, { "epoch": 0.13, "grad_norm": 0.80078125, "learning_rate": 2e-05, "loss": 2.2132, "num_input_tokens_seen": 4852809728, "step": 2314 }, { "epoch": 0.13, "grad_norm": 0.80859375, "learning_rate": 2e-05, "loss": 2.193, "num_input_tokens_seen": 4854906880, "step": 2315 }, { "epoch": 0.13, "grad_norm": 0.83203125, "learning_rate": 2e-05, "loss": 2.2043, "num_input_tokens_seen": 4857004032, "step": 2316 }, { "epoch": 0.13, "grad_norm": 0.81640625, "learning_rate": 2e-05, "loss": 2.2047, "num_input_tokens_seen": 4859101184, "step": 2317 }, { "epoch": 0.13, "grad_norm": 0.80078125, "learning_rate": 2e-05, "loss": 2.2007, "num_input_tokens_seen": 4861198336, "step": 2318 }, { "epoch": 0.13, "grad_norm": 0.82421875, "learning_rate": 2e-05, "loss": 2.2062, "num_input_tokens_seen": 4863295488, "step": 2319 }, { "epoch": 0.13, "grad_norm": 0.953125, "learning_rate": 2e-05, "loss": 2.1987, "num_input_tokens_seen": 4865392640, "step": 2320 }, { "epoch": 0.13, "grad_norm": 0.83984375, "learning_rate": 2e-05, "loss": 2.2125, "num_input_tokens_seen": 4867489792, "step": 2321 }, { "epoch": 0.13, "grad_norm": 0.7578125, "learning_rate": 2e-05, "loss": 2.246, "num_input_tokens_seen": 4869586944, "step": 2322 }, { "epoch": 0.13, "grad_norm": 0.87109375, "learning_rate": 2e-05, "loss": 2.2184, "num_input_tokens_seen": 4871684096, "step": 2323 }, { "epoch": 0.13, "grad_norm": 0.89453125, "learning_rate": 2e-05, "loss": 2.2249, "num_input_tokens_seen": 4873781248, "step": 2324 }, { "epoch": 0.13, "grad_norm": 0.82421875, "learning_rate": 2e-05, "loss": 2.235, "num_input_tokens_seen": 4875878400, "step": 2325 }, { "epoch": 0.13, "grad_norm": 0.77734375, "learning_rate": 2e-05, "loss": 2.2251, "num_input_tokens_seen": 4877975552, "step": 2326 }, { "epoch": 0.13, "grad_norm": 0.8984375, "learning_rate": 2e-05, "loss": 2.2355, "num_input_tokens_seen": 4880072704, "step": 2327 }, { "epoch": 0.13, "grad_norm": 0.9296875, "learning_rate": 2e-05, "loss": 2.2144, "num_input_tokens_seen": 4882169856, "step": 2328 }, { "epoch": 0.13, "grad_norm": 0.7578125, "learning_rate": 2e-05, "loss": 2.2044, "num_input_tokens_seen": 4884267008, "step": 2329 }, { "epoch": 0.13, "grad_norm": 0.84375, "learning_rate": 2e-05, "loss": 2.2153, "num_input_tokens_seen": 4886364160, "step": 2330 }, { "epoch": 0.13, "grad_norm": 0.96875, "learning_rate": 2e-05, "loss": 2.1933, "num_input_tokens_seen": 4888461312, "step": 2331 }, { "epoch": 0.13, "grad_norm": 0.796875, "learning_rate": 2e-05, "loss": 2.2241, "num_input_tokens_seen": 4890558464, "step": 2332 }, { "epoch": 0.13, "grad_norm": 0.87109375, "learning_rate": 2e-05, "loss": 2.2318, "num_input_tokens_seen": 4892655616, "step": 2333 }, { "epoch": 0.13, "grad_norm": 0.91015625, "learning_rate": 2e-05, "loss": 2.2072, "num_input_tokens_seen": 4894752768, "step": 2334 }, { "epoch": 0.13, "grad_norm": 1.09375, "learning_rate": 2e-05, "loss": 2.2245, "num_input_tokens_seen": 4896849920, "step": 2335 }, { "epoch": 0.13, "grad_norm": 1.0859375, "learning_rate": 2e-05, "loss": 2.2125, "num_input_tokens_seen": 4898947072, "step": 2336 }, { "epoch": 0.13, "grad_norm": 0.8828125, "learning_rate": 2e-05, "loss": 2.2228, "num_input_tokens_seen": 4901044224, "step": 2337 }, { "epoch": 0.13, "grad_norm": 0.80859375, "learning_rate": 2e-05, "loss": 2.2031, "num_input_tokens_seen": 4903141376, "step": 2338 }, { "epoch": 0.13, "grad_norm": 0.98828125, "learning_rate": 2e-05, "loss": 2.1687, "num_input_tokens_seen": 4905238528, "step": 2339 }, { "epoch": 0.13, "grad_norm": 1.0078125, "learning_rate": 2e-05, "loss": 2.2423, "num_input_tokens_seen": 4907335680, "step": 2340 }, { "epoch": 0.13, "grad_norm": 1.0625, "learning_rate": 2e-05, "loss": 2.2175, "num_input_tokens_seen": 4909432832, "step": 2341 }, { "epoch": 0.13, "grad_norm": 1.25, "learning_rate": 2e-05, "loss": 2.2102, "num_input_tokens_seen": 4911529984, "step": 2342 }, { "epoch": 0.13, "grad_norm": 1.0390625, "learning_rate": 2e-05, "loss": 2.2022, "num_input_tokens_seen": 4913627136, "step": 2343 }, { "epoch": 0.14, "grad_norm": 1.1328125, "learning_rate": 2e-05, "loss": 2.1779, "num_input_tokens_seen": 4915724288, "step": 2344 }, { "epoch": 0.14, "grad_norm": 1.3203125, "learning_rate": 2e-05, "loss": 2.193, "num_input_tokens_seen": 4917821440, "step": 2345 }, { "epoch": 0.14, "grad_norm": 1.0859375, "learning_rate": 2e-05, "loss": 2.2176, "num_input_tokens_seen": 4919918592, "step": 2346 }, { "epoch": 0.14, "grad_norm": 0.9921875, "learning_rate": 2e-05, "loss": 2.2466, "num_input_tokens_seen": 4922015744, "step": 2347 }, { "epoch": 0.14, "grad_norm": 1.1171875, "learning_rate": 2e-05, "loss": 2.2113, "num_input_tokens_seen": 4924112896, "step": 2348 }, { "epoch": 0.14, "grad_norm": 0.84765625, "learning_rate": 2e-05, "loss": 2.2185, "num_input_tokens_seen": 4926210048, "step": 2349 }, { "epoch": 0.14, "grad_norm": 0.90625, "learning_rate": 2e-05, "loss": 2.177, "num_input_tokens_seen": 4928307200, "step": 2350 }, { "epoch": 0.14, "grad_norm": 0.96484375, "learning_rate": 2e-05, "loss": 2.2186, "num_input_tokens_seen": 4930404352, "step": 2351 }, { "epoch": 0.14, "grad_norm": 0.90625, "learning_rate": 2e-05, "loss": 2.1916, "num_input_tokens_seen": 4932501504, "step": 2352 }, { "epoch": 0.14, "grad_norm": 0.88671875, "learning_rate": 2e-05, "loss": 2.1838, "num_input_tokens_seen": 4934598656, "step": 2353 }, { "epoch": 0.14, "grad_norm": 0.84375, "learning_rate": 2e-05, "loss": 2.2081, "num_input_tokens_seen": 4936695808, "step": 2354 }, { "epoch": 0.14, "grad_norm": 0.88671875, "learning_rate": 2e-05, "loss": 2.2101, "num_input_tokens_seen": 4938792960, "step": 2355 }, { "epoch": 0.14, "grad_norm": 0.84375, "learning_rate": 2e-05, "loss": 2.2087, "num_input_tokens_seen": 4940890112, "step": 2356 }, { "epoch": 0.14, "grad_norm": 0.81640625, "learning_rate": 2e-05, "loss": 2.2218, "num_input_tokens_seen": 4942987264, "step": 2357 }, { "epoch": 0.14, "grad_norm": 1.09375, "learning_rate": 2e-05, "loss": 2.224, "num_input_tokens_seen": 4945084416, "step": 2358 }, { "epoch": 0.14, "grad_norm": 0.75390625, "learning_rate": 2e-05, "loss": 2.2067, "num_input_tokens_seen": 4947181568, "step": 2359 }, { "epoch": 0.14, "grad_norm": 0.99609375, "learning_rate": 2e-05, "loss": 2.1973, "num_input_tokens_seen": 4949278720, "step": 2360 }, { "epoch": 0.14, "grad_norm": 1.1875, "learning_rate": 2e-05, "loss": 2.2084, "num_input_tokens_seen": 4951375872, "step": 2361 }, { "epoch": 0.14, "grad_norm": 0.80078125, "learning_rate": 2e-05, "loss": 2.2111, "num_input_tokens_seen": 4953473024, "step": 2362 }, { "epoch": 0.14, "grad_norm": 1.234375, "learning_rate": 2e-05, "loss": 2.1762, "num_input_tokens_seen": 4955570176, "step": 2363 }, { "epoch": 0.14, "grad_norm": 1.15625, "learning_rate": 2e-05, "loss": 2.2157, "num_input_tokens_seen": 4957667328, "step": 2364 }, { "epoch": 0.14, "grad_norm": 0.83203125, "learning_rate": 2e-05, "loss": 2.2088, "num_input_tokens_seen": 4959764480, "step": 2365 }, { "epoch": 0.14, "grad_norm": 1.234375, "learning_rate": 2e-05, "loss": 2.2402, "num_input_tokens_seen": 4961861632, "step": 2366 }, { "epoch": 0.14, "grad_norm": 1.3671875, "learning_rate": 2e-05, "loss": 2.211, "num_input_tokens_seen": 4963958784, "step": 2367 }, { "epoch": 0.14, "grad_norm": 0.87890625, "learning_rate": 2e-05, "loss": 2.2208, "num_input_tokens_seen": 4966055936, "step": 2368 }, { "epoch": 0.14, "grad_norm": 1.328125, "learning_rate": 2e-05, "loss": 2.1928, "num_input_tokens_seen": 4968153088, "step": 2369 }, { "epoch": 0.14, "grad_norm": 1.3359375, "learning_rate": 2e-05, "loss": 2.22, "num_input_tokens_seen": 4970250240, "step": 2370 }, { "epoch": 0.14, "grad_norm": 1.0546875, "learning_rate": 2e-05, "loss": 2.2084, "num_input_tokens_seen": 4972347392, "step": 2371 }, { "epoch": 0.14, "grad_norm": 0.87890625, "learning_rate": 2e-05, "loss": 2.1896, "num_input_tokens_seen": 4974444544, "step": 2372 }, { "epoch": 0.14, "grad_norm": 1.1640625, "learning_rate": 2e-05, "loss": 2.2113, "num_input_tokens_seen": 4976541696, "step": 2373 }, { "epoch": 0.14, "grad_norm": 0.89453125, "learning_rate": 2e-05, "loss": 2.1771, "num_input_tokens_seen": 4978638848, "step": 2374 }, { "epoch": 0.14, "grad_norm": 1.046875, "learning_rate": 2e-05, "loss": 2.2459, "num_input_tokens_seen": 4980736000, "step": 2375 }, { "epoch": 0.14, "grad_norm": 0.78125, "learning_rate": 2e-05, "loss": 2.2276, "num_input_tokens_seen": 4982833152, "step": 2376 }, { "epoch": 0.14, "grad_norm": 0.88671875, "learning_rate": 2e-05, "loss": 2.2095, "num_input_tokens_seen": 4984930304, "step": 2377 }, { "epoch": 0.14, "grad_norm": 1.0078125, "learning_rate": 2e-05, "loss": 2.2181, "num_input_tokens_seen": 4987027456, "step": 2378 }, { "epoch": 0.14, "grad_norm": 0.85546875, "learning_rate": 2e-05, "loss": 2.2384, "num_input_tokens_seen": 4989124608, "step": 2379 }, { "epoch": 0.14, "grad_norm": 0.76953125, "learning_rate": 2e-05, "loss": 2.2352, "num_input_tokens_seen": 4991221760, "step": 2380 }, { "epoch": 0.14, "grad_norm": 0.89453125, "learning_rate": 2e-05, "loss": 2.206, "num_input_tokens_seen": 4993318912, "step": 2381 }, { "epoch": 0.14, "grad_norm": 0.921875, "learning_rate": 2e-05, "loss": 2.2215, "num_input_tokens_seen": 4995416064, "step": 2382 }, { "epoch": 0.14, "grad_norm": 0.8203125, "learning_rate": 2e-05, "loss": 2.234, "num_input_tokens_seen": 4997513216, "step": 2383 }, { "epoch": 0.14, "grad_norm": 0.84765625, "learning_rate": 2e-05, "loss": 2.2282, "num_input_tokens_seen": 4999610368, "step": 2384 }, { "epoch": 0.14, "grad_norm": 0.85546875, "learning_rate": 2e-05, "loss": 2.1974, "num_input_tokens_seen": 5001707520, "step": 2385 }, { "epoch": 0.14, "grad_norm": 0.76953125, "learning_rate": 2e-05, "loss": 2.2135, "num_input_tokens_seen": 5003804672, "step": 2386 }, { "epoch": 0.14, "grad_norm": 0.9453125, "learning_rate": 2e-05, "loss": 2.2315, "num_input_tokens_seen": 5005901824, "step": 2387 }, { "epoch": 0.14, "grad_norm": 0.9296875, "learning_rate": 2e-05, "loss": 2.2172, "num_input_tokens_seen": 5007998976, "step": 2388 }, { "epoch": 0.14, "eval_loss": 2.2845656871795654, "eval_runtime": 1677.6813, "eval_samples_per_second": 2.35, "eval_steps_per_second": 0.588, "num_input_tokens_seen": 5007998976, "step": 2388 }, { "epoch": 0.14, "grad_norm": 0.90625, "learning_rate": 2e-05, "loss": 2.2172, "num_input_tokens_seen": 5010096128, "step": 2389 }, { "epoch": 0.14, "grad_norm": 1.0625, "learning_rate": 2e-05, "loss": 2.2471, "num_input_tokens_seen": 5012193280, "step": 2390 }, { "epoch": 0.14, "grad_norm": 0.92578125, "learning_rate": 2e-05, "loss": 2.2383, "num_input_tokens_seen": 5014290432, "step": 2391 }, { "epoch": 0.15, "grad_norm": 0.859375, "learning_rate": 2e-05, "loss": 2.1998, "num_input_tokens_seen": 5016387584, "step": 2392 }, { "epoch": 0.15, "grad_norm": 1.4921875, "learning_rate": 2e-05, "loss": 2.2113, "num_input_tokens_seen": 5018484736, "step": 2393 }, { "epoch": 0.15, "grad_norm": 1.4765625, "learning_rate": 2e-05, "loss": 2.1996, "num_input_tokens_seen": 5020581888, "step": 2394 }, { "epoch": 0.15, "grad_norm": 0.83203125, "learning_rate": 2e-05, "loss": 2.2106, "num_input_tokens_seen": 5022679040, "step": 2395 }, { "epoch": 0.15, "grad_norm": 1.1328125, "learning_rate": 2e-05, "loss": 2.2066, "num_input_tokens_seen": 5024776192, "step": 2396 }, { "epoch": 0.15, "grad_norm": 1.1875, "learning_rate": 2e-05, "loss": 2.2016, "num_input_tokens_seen": 5026873344, "step": 2397 }, { "epoch": 0.15, "grad_norm": 0.96484375, "learning_rate": 2e-05, "loss": 2.2149, "num_input_tokens_seen": 5028970496, "step": 2398 }, { "epoch": 0.15, "grad_norm": 0.9140625, "learning_rate": 2e-05, "loss": 2.2258, "num_input_tokens_seen": 5031067648, "step": 2399 }, { "epoch": 0.15, "grad_norm": 0.98828125, "learning_rate": 2e-05, "loss": 2.2216, "num_input_tokens_seen": 5033164800, "step": 2400 }, { "epoch": 0.15, "grad_norm": 0.90234375, "learning_rate": 2e-05, "loss": 2.2041, "num_input_tokens_seen": 5035261952, "step": 2401 }, { "epoch": 0.15, "grad_norm": 0.87109375, "learning_rate": 2e-05, "loss": 2.2054, "num_input_tokens_seen": 5037359104, "step": 2402 }, { "epoch": 0.15, "grad_norm": 0.90625, "learning_rate": 2e-05, "loss": 2.2385, "num_input_tokens_seen": 5039456256, "step": 2403 }, { "epoch": 0.15, "grad_norm": 0.9375, "learning_rate": 2e-05, "loss": 2.1899, "num_input_tokens_seen": 5041553408, "step": 2404 }, { "epoch": 0.15, "grad_norm": 0.875, "learning_rate": 2e-05, "loss": 2.207, "num_input_tokens_seen": 5043650560, "step": 2405 }, { "epoch": 0.15, "grad_norm": 0.890625, "learning_rate": 2e-05, "loss": 2.2031, "num_input_tokens_seen": 5045747712, "step": 2406 }, { "epoch": 0.15, "grad_norm": 0.91015625, "learning_rate": 2e-05, "loss": 2.1876, "num_input_tokens_seen": 5047844864, "step": 2407 }, { "epoch": 0.15, "grad_norm": 0.84375, "learning_rate": 2e-05, "loss": 2.2022, "num_input_tokens_seen": 5049942016, "step": 2408 }, { "epoch": 0.15, "grad_norm": 1.0703125, "learning_rate": 2e-05, "loss": 2.175, "num_input_tokens_seen": 5052039168, "step": 2409 }, { "epoch": 0.15, "grad_norm": 0.8984375, "learning_rate": 2e-05, "loss": 2.2121, "num_input_tokens_seen": 5054136320, "step": 2410 }, { "epoch": 0.15, "grad_norm": 0.85546875, "learning_rate": 2e-05, "loss": 2.1748, "num_input_tokens_seen": 5056233472, "step": 2411 }, { "epoch": 0.15, "grad_norm": 1.2578125, "learning_rate": 2e-05, "loss": 2.2166, "num_input_tokens_seen": 5058330624, "step": 2412 }, { "epoch": 0.15, "grad_norm": 1.015625, "learning_rate": 2e-05, "loss": 2.198, "num_input_tokens_seen": 5060427776, "step": 2413 }, { "epoch": 0.15, "grad_norm": 0.9140625, "learning_rate": 2e-05, "loss": 2.2371, "num_input_tokens_seen": 5062524928, "step": 2414 }, { "epoch": 0.15, "grad_norm": 0.99609375, "learning_rate": 2e-05, "loss": 2.195, "num_input_tokens_seen": 5064622080, "step": 2415 }, { "epoch": 0.15, "grad_norm": 0.9609375, "learning_rate": 2e-05, "loss": 2.2151, "num_input_tokens_seen": 5066719232, "step": 2416 }, { "epoch": 0.15, "grad_norm": 0.80859375, "learning_rate": 2e-05, "loss": 2.1957, "num_input_tokens_seen": 5068816384, "step": 2417 }, { "epoch": 0.15, "grad_norm": 0.875, "learning_rate": 2e-05, "loss": 2.2213, "num_input_tokens_seen": 5070913536, "step": 2418 }, { "epoch": 0.15, "grad_norm": 0.78515625, "learning_rate": 2e-05, "loss": 2.2185, "num_input_tokens_seen": 5073010688, "step": 2419 }, { "epoch": 0.15, "grad_norm": 0.7734375, "learning_rate": 2e-05, "loss": 2.2109, "num_input_tokens_seen": 5075107840, "step": 2420 }, { "epoch": 0.15, "grad_norm": 0.80078125, "learning_rate": 2e-05, "loss": 2.2051, "num_input_tokens_seen": 5077204992, "step": 2421 }, { "epoch": 0.15, "grad_norm": 0.80078125, "learning_rate": 2e-05, "loss": 2.2415, "num_input_tokens_seen": 5079302144, "step": 2422 }, { "epoch": 0.15, "grad_norm": 0.84765625, "learning_rate": 2e-05, "loss": 2.2256, "num_input_tokens_seen": 5081399296, "step": 2423 }, { "epoch": 0.15, "grad_norm": 0.79296875, "learning_rate": 2e-05, "loss": 2.2285, "num_input_tokens_seen": 5083496448, "step": 2424 }, { "epoch": 0.15, "grad_norm": 0.71484375, "learning_rate": 2e-05, "loss": 2.1879, "num_input_tokens_seen": 5085593600, "step": 2425 }, { "epoch": 0.15, "grad_norm": 0.86328125, "learning_rate": 2e-05, "loss": 2.2043, "num_input_tokens_seen": 5087690752, "step": 2426 }, { "epoch": 0.15, "grad_norm": 0.84765625, "learning_rate": 2e-05, "loss": 2.2538, "num_input_tokens_seen": 5089787904, "step": 2427 }, { "epoch": 0.15, "grad_norm": 0.7578125, "learning_rate": 2e-05, "loss": 2.2484, "num_input_tokens_seen": 5091885056, "step": 2428 }, { "epoch": 0.15, "grad_norm": 0.79296875, "learning_rate": 2e-05, "loss": 2.2404, "num_input_tokens_seen": 5093982208, "step": 2429 }, { "epoch": 0.15, "grad_norm": 1.1015625, "learning_rate": 2e-05, "loss": 2.1924, "num_input_tokens_seen": 5096079360, "step": 2430 }, { "epoch": 0.15, "grad_norm": 0.92578125, "learning_rate": 2e-05, "loss": 2.2381, "num_input_tokens_seen": 5098176512, "step": 2431 }, { "epoch": 0.15, "grad_norm": 0.8203125, "learning_rate": 2e-05, "loss": 2.2188, "num_input_tokens_seen": 5100273664, "step": 2432 }, { "epoch": 0.15, "grad_norm": 1.0078125, "learning_rate": 2e-05, "loss": 2.2242, "num_input_tokens_seen": 5102370816, "step": 2433 }, { "epoch": 0.15, "grad_norm": 0.8828125, "learning_rate": 2e-05, "loss": 2.2287, "num_input_tokens_seen": 5104467968, "step": 2434 }, { "epoch": 0.15, "grad_norm": 0.79296875, "learning_rate": 2e-05, "loss": 2.2295, "num_input_tokens_seen": 5106565120, "step": 2435 }, { "epoch": 0.15, "grad_norm": 0.90625, "learning_rate": 2e-05, "loss": 2.2007, "num_input_tokens_seen": 5108662272, "step": 2436 }, { "epoch": 0.15, "grad_norm": 0.8515625, "learning_rate": 2e-05, "loss": 2.1813, "num_input_tokens_seen": 5110759424, "step": 2437 }, { "epoch": 0.15, "grad_norm": 0.72265625, "learning_rate": 2e-05, "loss": 2.2076, "num_input_tokens_seen": 5112856576, "step": 2438 }, { "epoch": 0.15, "grad_norm": 0.83203125, "learning_rate": 2e-05, "loss": 2.2065, "num_input_tokens_seen": 5114953728, "step": 2439 }, { "epoch": 0.16, "grad_norm": 0.79296875, "learning_rate": 2e-05, "loss": 2.2148, "num_input_tokens_seen": 5117050880, "step": 2440 }, { "epoch": 0.16, "grad_norm": 0.79296875, "learning_rate": 2e-05, "loss": 2.2454, "num_input_tokens_seen": 5119148032, "step": 2441 }, { "epoch": 0.16, "grad_norm": 0.8046875, "learning_rate": 2e-05, "loss": 2.2369, "num_input_tokens_seen": 5121245184, "step": 2442 }, { "epoch": 0.16, "grad_norm": 0.7734375, "learning_rate": 2e-05, "loss": 2.2235, "num_input_tokens_seen": 5123342336, "step": 2443 }, { "epoch": 0.16, "grad_norm": 0.73828125, "learning_rate": 2e-05, "loss": 2.208, "num_input_tokens_seen": 5125439488, "step": 2444 }, { "epoch": 0.16, "grad_norm": 0.8828125, "learning_rate": 2e-05, "loss": 2.2429, "num_input_tokens_seen": 5127536640, "step": 2445 }, { "epoch": 0.16, "grad_norm": 0.91796875, "learning_rate": 2e-05, "loss": 2.2352, "num_input_tokens_seen": 5129633792, "step": 2446 }, { "epoch": 0.16, "grad_norm": 0.7890625, "learning_rate": 2e-05, "loss": 2.223, "num_input_tokens_seen": 5131730944, "step": 2447 }, { "epoch": 0.16, "grad_norm": 0.88671875, "learning_rate": 2e-05, "loss": 2.2403, "num_input_tokens_seen": 5133828096, "step": 2448 }, { "epoch": 0.16, "grad_norm": 1.40625, "learning_rate": 2e-05, "loss": 2.2296, "num_input_tokens_seen": 5135925248, "step": 2449 }, { "epoch": 0.16, "grad_norm": 15.4375, "learning_rate": 2e-05, "loss": 2.228, "num_input_tokens_seen": 5138022400, "step": 2450 }, { "epoch": 0.16, "grad_norm": 1.59375, "learning_rate": 2e-05, "loss": 2.2053, "num_input_tokens_seen": 5140119552, "step": 2451 }, { "epoch": 0.16, "grad_norm": 1.1328125, "learning_rate": 2e-05, "loss": 2.1757, "num_input_tokens_seen": 5142216704, "step": 2452 }, { "epoch": 0.16, "grad_norm": 1.0859375, "learning_rate": 2e-05, "loss": 2.2169, "num_input_tokens_seen": 5144313856, "step": 2453 }, { "epoch": 0.16, "grad_norm": 1.3515625, "learning_rate": 2e-05, "loss": 2.1949, "num_input_tokens_seen": 5146411008, "step": 2454 }, { "epoch": 0.16, "grad_norm": 1.140625, "learning_rate": 2e-05, "loss": 2.2216, "num_input_tokens_seen": 5148508160, "step": 2455 }, { "epoch": 0.16, "grad_norm": 1.0234375, "learning_rate": 2e-05, "loss": 2.1917, "num_input_tokens_seen": 5150605312, "step": 2456 }, { "epoch": 0.16, "grad_norm": 1.375, "learning_rate": 2e-05, "loss": 2.2072, "num_input_tokens_seen": 5152702464, "step": 2457 }, { "epoch": 0.16, "grad_norm": 1.1015625, "learning_rate": 2e-05, "loss": 2.219, "num_input_tokens_seen": 5154799616, "step": 2458 }, { "epoch": 0.16, "grad_norm": 1.0078125, "learning_rate": 2e-05, "loss": 2.211, "num_input_tokens_seen": 5156896768, "step": 2459 }, { "epoch": 0.16, "grad_norm": 1.03125, "learning_rate": 2e-05, "loss": 2.2342, "num_input_tokens_seen": 5158993920, "step": 2460 }, { "epoch": 0.16, "grad_norm": 0.8203125, "learning_rate": 2e-05, "loss": 2.214, "num_input_tokens_seen": 5161091072, "step": 2461 }, { "epoch": 0.16, "grad_norm": 0.8671875, "learning_rate": 2e-05, "loss": 2.2081, "num_input_tokens_seen": 5163188224, "step": 2462 }, { "epoch": 0.16, "grad_norm": 1.0859375, "learning_rate": 2e-05, "loss": 2.2518, "num_input_tokens_seen": 5165285376, "step": 2463 }, { "epoch": 0.16, "grad_norm": 0.84765625, "learning_rate": 2e-05, "loss": 2.2446, "num_input_tokens_seen": 5167382528, "step": 2464 }, { "epoch": 0.16, "grad_norm": 0.9765625, "learning_rate": 2e-05, "loss": 2.2265, "num_input_tokens_seen": 5169479680, "step": 2465 }, { "epoch": 0.16, "grad_norm": 0.9375, "learning_rate": 2e-05, "loss": 2.2186, "num_input_tokens_seen": 5171576832, "step": 2466 }, { "epoch": 0.16, "grad_norm": 0.98046875, "learning_rate": 2e-05, "loss": 2.268, "num_input_tokens_seen": 5173673984, "step": 2467 }, { "epoch": 0.16, "grad_norm": 1.0625, "learning_rate": 2e-05, "loss": 2.2073, "num_input_tokens_seen": 5175771136, "step": 2468 }, { "epoch": 0.16, "grad_norm": 0.83203125, "learning_rate": 2e-05, "loss": 2.2059, "num_input_tokens_seen": 5177868288, "step": 2469 }, { "epoch": 0.16, "grad_norm": 0.828125, "learning_rate": 2e-05, "loss": 2.2206, "num_input_tokens_seen": 5179965440, "step": 2470 }, { "epoch": 0.16, "grad_norm": 1.0625, "learning_rate": 2e-05, "loss": 2.2297, "num_input_tokens_seen": 5182062592, "step": 2471 }, { "epoch": 0.16, "grad_norm": 0.890625, "learning_rate": 2e-05, "loss": 2.1802, "num_input_tokens_seen": 5184159744, "step": 2472 }, { "epoch": 0.16, "grad_norm": 0.84765625, "learning_rate": 2e-05, "loss": 2.2438, "num_input_tokens_seen": 5186256896, "step": 2473 }, { "epoch": 0.16, "grad_norm": 0.81640625, "learning_rate": 2e-05, "loss": 2.2078, "num_input_tokens_seen": 5188354048, "step": 2474 }, { "epoch": 0.16, "grad_norm": 0.9375, "learning_rate": 2e-05, "loss": 2.2347, "num_input_tokens_seen": 5190451200, "step": 2475 }, { "epoch": 0.16, "grad_norm": 0.9375, "learning_rate": 2e-05, "loss": 2.1934, "num_input_tokens_seen": 5192548352, "step": 2476 }, { "epoch": 0.16, "grad_norm": 0.89453125, "learning_rate": 2e-05, "loss": 2.1833, "num_input_tokens_seen": 5194645504, "step": 2477 }, { "epoch": 0.16, "grad_norm": 0.8125, "learning_rate": 2e-05, "loss": 2.2239, "num_input_tokens_seen": 5196742656, "step": 2478 }, { "epoch": 0.16, "grad_norm": 1.1171875, "learning_rate": 2e-05, "loss": 2.2279, "num_input_tokens_seen": 5198839808, "step": 2479 }, { "epoch": 0.16, "grad_norm": 1.15625, "learning_rate": 2e-05, "loss": 2.2398, "num_input_tokens_seen": 5200936960, "step": 2480 }, { "epoch": 0.16, "grad_norm": 0.90234375, "learning_rate": 2e-05, "loss": 2.227, "num_input_tokens_seen": 5203034112, "step": 2481 }, { "epoch": 0.16, "grad_norm": 1.2578125, "learning_rate": 2e-05, "loss": 2.2006, "num_input_tokens_seen": 5205131264, "step": 2482 }, { "epoch": 0.16, "grad_norm": 1.75, "learning_rate": 2e-05, "loss": 2.2074, "num_input_tokens_seen": 5207228416, "step": 2483 }, { "epoch": 0.16, "grad_norm": 1.4609375, "learning_rate": 2e-05, "loss": 2.2153, "num_input_tokens_seen": 5209325568, "step": 2484 }, { "epoch": 0.16, "grad_norm": 0.90625, "learning_rate": 2e-05, "loss": 2.2281, "num_input_tokens_seen": 5211422720, "step": 2485 }, { "epoch": 0.16, "grad_norm": 1.2734375, "learning_rate": 2e-05, "loss": 2.1909, "num_input_tokens_seen": 5213519872, "step": 2486 }, { "epoch": 0.17, "grad_norm": 1.640625, "learning_rate": 2e-05, "loss": 2.2138, "num_input_tokens_seen": 5215617024, "step": 2487 }, { "epoch": 0.17, "grad_norm": 1.296875, "learning_rate": 2e-05, "loss": 2.1726, "num_input_tokens_seen": 5217714176, "step": 2488 }, { "epoch": 0.17, "grad_norm": 0.84765625, "learning_rate": 2e-05, "loss": 2.179, "num_input_tokens_seen": 5219811328, "step": 2489 }, { "epoch": 0.17, "grad_norm": 1.171875, "learning_rate": 2e-05, "loss": 2.2159, "num_input_tokens_seen": 5221908480, "step": 2490 }, { "epoch": 0.17, "grad_norm": 1.21875, "learning_rate": 2e-05, "loss": 2.2382, "num_input_tokens_seen": 5224005632, "step": 2491 }, { "epoch": 0.17, "grad_norm": 1.1484375, "learning_rate": 2e-05, "loss": 2.1965, "num_input_tokens_seen": 5226102784, "step": 2492 }, { "epoch": 0.17, "grad_norm": 0.82421875, "learning_rate": 2e-05, "loss": 2.1674, "num_input_tokens_seen": 5228199936, "step": 2493 }, { "epoch": 0.17, "grad_norm": 1.0078125, "learning_rate": 2e-05, "loss": 2.203, "num_input_tokens_seen": 5230297088, "step": 2494 }, { "epoch": 0.17, "grad_norm": 1.0546875, "learning_rate": 2e-05, "loss": 2.1765, "num_input_tokens_seen": 5232394240, "step": 2495 }, { "epoch": 0.17, "grad_norm": 0.86328125, "learning_rate": 2e-05, "loss": 2.1648, "num_input_tokens_seen": 5234491392, "step": 2496 }, { "epoch": 0.17, "grad_norm": 0.89453125, "learning_rate": 2e-05, "loss": 2.1952, "num_input_tokens_seen": 5236588544, "step": 2497 }, { "epoch": 0.17, "grad_norm": 1.1796875, "learning_rate": 2e-05, "loss": 2.2105, "num_input_tokens_seen": 5238685696, "step": 2498 }, { "epoch": 0.17, "grad_norm": 0.88671875, "learning_rate": 2e-05, "loss": 2.1676, "num_input_tokens_seen": 5240782848, "step": 2499 }, { "epoch": 0.17, "grad_norm": 0.99609375, "learning_rate": 2e-05, "loss": 2.1949, "num_input_tokens_seen": 5242880000, "step": 2500 }, { "epoch": 0.17, "grad_norm": 0.9765625, "learning_rate": 2e-05, "loss": 2.2532, "num_input_tokens_seen": 5244977152, "step": 2501 }, { "epoch": 0.17, "grad_norm": 0.79296875, "learning_rate": 2e-05, "loss": 2.2425, "num_input_tokens_seen": 5247074304, "step": 2502 }, { "epoch": 0.17, "grad_norm": 0.7421875, "learning_rate": 2e-05, "loss": 2.2774, "num_input_tokens_seen": 5249171456, "step": 2503 }, { "epoch": 0.17, "grad_norm": 0.71875, "learning_rate": 2e-05, "loss": 2.2143, "num_input_tokens_seen": 5251268608, "step": 2504 }, { "epoch": 0.17, "grad_norm": 0.78125, "learning_rate": 2e-05, "loss": 2.2455, "num_input_tokens_seen": 5253365760, "step": 2505 }, { "epoch": 0.17, "grad_norm": 0.87109375, "learning_rate": 2e-05, "loss": 2.2435, "num_input_tokens_seen": 5255462912, "step": 2506 }, { "epoch": 0.17, "grad_norm": 0.6953125, "learning_rate": 2e-05, "loss": 2.2258, "num_input_tokens_seen": 5257560064, "step": 2507 }, { "epoch": 0.17, "grad_norm": 0.80078125, "learning_rate": 2e-05, "loss": 2.2156, "num_input_tokens_seen": 5259657216, "step": 2508 }, { "epoch": 0.17, "grad_norm": 0.83203125, "learning_rate": 2e-05, "loss": 2.2501, "num_input_tokens_seen": 5261754368, "step": 2509 }, { "epoch": 0.17, "grad_norm": 0.75, "learning_rate": 2e-05, "loss": 2.2627, "num_input_tokens_seen": 5263851520, "step": 2510 }, { "epoch": 0.17, "grad_norm": 0.6015625, "learning_rate": 2e-05, "loss": 2.2641, "num_input_tokens_seen": 5265948672, "step": 2511 }, { "epoch": 0.17, "grad_norm": 0.70703125, "learning_rate": 2e-05, "loss": 2.2227, "num_input_tokens_seen": 5268045824, "step": 2512 }, { "epoch": 0.17, "grad_norm": 0.78125, "learning_rate": 2e-05, "loss": 2.1824, "num_input_tokens_seen": 5270142976, "step": 2513 }, { "epoch": 0.17, "grad_norm": 0.6015625, "learning_rate": 2e-05, "loss": 2.2242, "num_input_tokens_seen": 5272240128, "step": 2514 }, { "epoch": 0.17, "grad_norm": 0.62890625, "learning_rate": 2e-05, "loss": 2.2262, "num_input_tokens_seen": 5274337280, "step": 2515 }, { "epoch": 0.17, "grad_norm": 0.625, "learning_rate": 2e-05, "loss": 2.2326, "num_input_tokens_seen": 5276434432, "step": 2516 }, { "epoch": 0.17, "grad_norm": 0.58203125, "learning_rate": 2e-05, "loss": 2.2287, "num_input_tokens_seen": 5278531584, "step": 2517 }, { "epoch": 0.17, "grad_norm": 0.61328125, "learning_rate": 2e-05, "loss": 2.2359, "num_input_tokens_seen": 5280628736, "step": 2518 }, { "epoch": 0.17, "grad_norm": 0.62890625, "learning_rate": 2e-05, "loss": 2.2626, "num_input_tokens_seen": 5282725888, "step": 2519 }, { "epoch": 0.17, "grad_norm": 0.55859375, "learning_rate": 2e-05, "loss": 2.2574, "num_input_tokens_seen": 5284823040, "step": 2520 }, { "epoch": 0.17, "grad_norm": 0.58203125, "learning_rate": 2e-05, "loss": 2.2636, "num_input_tokens_seen": 5286920192, "step": 2521 }, { "epoch": 0.17, "grad_norm": 0.609375, "learning_rate": 2e-05, "loss": 2.2267, "num_input_tokens_seen": 5289017344, "step": 2522 }, { "epoch": 0.17, "grad_norm": 0.5703125, "learning_rate": 2e-05, "loss": 2.2373, "num_input_tokens_seen": 5291114496, "step": 2523 }, { "epoch": 0.17, "grad_norm": 0.6015625, "learning_rate": 2e-05, "loss": 2.2023, "num_input_tokens_seen": 5293211648, "step": 2524 }, { "epoch": 0.17, "grad_norm": 0.61328125, "learning_rate": 2e-05, "loss": 2.2242, "num_input_tokens_seen": 5295308800, "step": 2525 }, { "epoch": 0.17, "grad_norm": 0.6328125, "learning_rate": 2e-05, "loss": 2.2436, "num_input_tokens_seen": 5297405952, "step": 2526 }, { "epoch": 0.17, "grad_norm": 0.61328125, "learning_rate": 2e-05, "loss": 2.211, "num_input_tokens_seen": 5299503104, "step": 2527 }, { "epoch": 0.17, "grad_norm": 0.62109375, "learning_rate": 2e-05, "loss": 2.2402, "num_input_tokens_seen": 5301600256, "step": 2528 }, { "epoch": 0.17, "grad_norm": 0.58984375, "learning_rate": 2e-05, "loss": 2.2375, "num_input_tokens_seen": 5303697408, "step": 2529 }, { "epoch": 0.17, "grad_norm": 0.625, "learning_rate": 2e-05, "loss": 2.2399, "num_input_tokens_seen": 5305794560, "step": 2530 }, { "epoch": 0.17, "grad_norm": 0.62890625, "learning_rate": 2e-05, "loss": 2.2409, "num_input_tokens_seen": 5307891712, "step": 2531 }, { "epoch": 0.17, "grad_norm": 0.59375, "learning_rate": 2e-05, "loss": 2.2632, "num_input_tokens_seen": 5309988864, "step": 2532 }, { "epoch": 0.17, "grad_norm": 0.66796875, "learning_rate": 2e-05, "loss": 2.2392, "num_input_tokens_seen": 5312086016, "step": 2533 }, { "epoch": 0.17, "grad_norm": 0.58984375, "learning_rate": 2e-05, "loss": 2.2223, "num_input_tokens_seen": 5314183168, "step": 2534 }, { "epoch": 0.18, "grad_norm": 0.58984375, "learning_rate": 2e-05, "loss": 2.2643, "num_input_tokens_seen": 5316280320, "step": 2535 }, { "epoch": 0.18, "grad_norm": 0.5546875, "learning_rate": 2e-05, "loss": 2.266, "num_input_tokens_seen": 5318377472, "step": 2536 }, { "epoch": 0.18, "grad_norm": 0.5859375, "learning_rate": 2e-05, "loss": 2.2564, "num_input_tokens_seen": 5320474624, "step": 2537 }, { "epoch": 0.18, "grad_norm": 0.54296875, "learning_rate": 2e-05, "loss": 2.2537, "num_input_tokens_seen": 5322571776, "step": 2538 }, { "epoch": 0.18, "grad_norm": 0.5625, "learning_rate": 2e-05, "loss": 2.2835, "num_input_tokens_seen": 5324668928, "step": 2539 }, { "epoch": 0.18, "grad_norm": 0.55859375, "learning_rate": 2e-05, "loss": 2.228, "num_input_tokens_seen": 5326766080, "step": 2540 }, { "epoch": 0.18, "grad_norm": 0.5390625, "learning_rate": 2e-05, "loss": 2.2302, "num_input_tokens_seen": 5328863232, "step": 2541 }, { "epoch": 0.18, "grad_norm": 0.5625, "learning_rate": 2e-05, "loss": 2.2323, "num_input_tokens_seen": 5330960384, "step": 2542 }, { "epoch": 0.18, "grad_norm": 0.57421875, "learning_rate": 2e-05, "loss": 2.1935, "num_input_tokens_seen": 5333057536, "step": 2543 }, { "epoch": 0.18, "grad_norm": 0.5546875, "learning_rate": 2e-05, "loss": 2.2391, "num_input_tokens_seen": 5335154688, "step": 2544 }, { "epoch": 0.18, "grad_norm": 0.56640625, "learning_rate": 2e-05, "loss": 2.2353, "num_input_tokens_seen": 5337251840, "step": 2545 }, { "epoch": 0.18, "grad_norm": 0.5234375, "learning_rate": 2e-05, "loss": 2.2606, "num_input_tokens_seen": 5339348992, "step": 2546 }, { "epoch": 0.18, "grad_norm": 0.54296875, "learning_rate": 2e-05, "loss": 2.2437, "num_input_tokens_seen": 5341446144, "step": 2547 }, { "epoch": 0.18, "grad_norm": 0.578125, "learning_rate": 2e-05, "loss": 2.2223, "num_input_tokens_seen": 5343543296, "step": 2548 }, { "epoch": 0.18, "grad_norm": 0.578125, "learning_rate": 2e-05, "loss": 2.234, "num_input_tokens_seen": 5345640448, "step": 2549 }, { "epoch": 0.18, "grad_norm": 0.57421875, "learning_rate": 2e-05, "loss": 2.2671, "num_input_tokens_seen": 5347737600, "step": 2550 }, { "epoch": 0.18, "grad_norm": 0.5546875, "learning_rate": 2e-05, "loss": 2.2482, "num_input_tokens_seen": 5349834752, "step": 2551 }, { "epoch": 0.18, "grad_norm": 0.61328125, "learning_rate": 2e-05, "loss": 2.2244, "num_input_tokens_seen": 5351931904, "step": 2552 }, { "epoch": 0.18, "grad_norm": 0.5703125, "learning_rate": 2e-05, "loss": 2.2537, "num_input_tokens_seen": 5354029056, "step": 2553 }, { "epoch": 0.18, "grad_norm": 0.62109375, "learning_rate": 2e-05, "loss": 2.2495, "num_input_tokens_seen": 5356126208, "step": 2554 }, { "epoch": 0.18, "grad_norm": 0.671875, "learning_rate": 2e-05, "loss": 2.2813, "num_input_tokens_seen": 5358223360, "step": 2555 }, { "epoch": 0.18, "grad_norm": 0.54296875, "learning_rate": 2e-05, "loss": 2.2157, "num_input_tokens_seen": 5360320512, "step": 2556 }, { "epoch": 0.18, "grad_norm": 0.57421875, "learning_rate": 2e-05, "loss": 2.2131, "num_input_tokens_seen": 5362417664, "step": 2557 }, { "epoch": 0.18, "grad_norm": 0.58984375, "learning_rate": 2e-05, "loss": 2.275, "num_input_tokens_seen": 5364514816, "step": 2558 }, { "epoch": 0.18, "grad_norm": 0.578125, "learning_rate": 2e-05, "loss": 2.2604, "num_input_tokens_seen": 5366611968, "step": 2559 }, { "epoch": 0.18, "grad_norm": 0.55859375, "learning_rate": 2e-05, "loss": 2.2455, "num_input_tokens_seen": 5368709120, "step": 2560 }, { "epoch": 0.18, "grad_norm": 0.5625, "learning_rate": 2e-05, "loss": 2.2506, "num_input_tokens_seen": 5370806272, "step": 2561 }, { "epoch": 0.18, "grad_norm": 0.578125, "learning_rate": 2e-05, "loss": 2.2468, "num_input_tokens_seen": 5372903424, "step": 2562 }, { "epoch": 0.18, "grad_norm": 0.5234375, "learning_rate": 2e-05, "loss": 2.2476, "num_input_tokens_seen": 5375000576, "step": 2563 }, { "epoch": 0.18, "grad_norm": 0.56640625, "learning_rate": 2e-05, "loss": 2.2391, "num_input_tokens_seen": 5377097728, "step": 2564 }, { "epoch": 0.18, "grad_norm": 0.57421875, "learning_rate": 2e-05, "loss": 2.2651, "num_input_tokens_seen": 5379194880, "step": 2565 }, { "epoch": 0.18, "grad_norm": 0.5390625, "learning_rate": 2e-05, "loss": 2.2662, "num_input_tokens_seen": 5381292032, "step": 2566 }, { "epoch": 0.18, "grad_norm": 0.53515625, "learning_rate": 2e-05, "loss": 2.2551, "num_input_tokens_seen": 5383389184, "step": 2567 }, { "epoch": 0.18, "grad_norm": 0.51953125, "learning_rate": 2e-05, "loss": 2.2431, "num_input_tokens_seen": 5385486336, "step": 2568 }, { "epoch": 0.18, "grad_norm": 0.5546875, "learning_rate": 2e-05, "loss": 2.269, "num_input_tokens_seen": 5387583488, "step": 2569 }, { "epoch": 0.18, "grad_norm": 0.54296875, "learning_rate": 2e-05, "loss": 2.244, "num_input_tokens_seen": 5389680640, "step": 2570 }, { "epoch": 0.18, "grad_norm": 0.5859375, "learning_rate": 2e-05, "loss": 2.2554, "num_input_tokens_seen": 5391777792, "step": 2571 }, { "epoch": 0.18, "grad_norm": 0.53125, "learning_rate": 2e-05, "loss": 2.228, "num_input_tokens_seen": 5393874944, "step": 2572 }, { "epoch": 0.18, "grad_norm": 0.53515625, "learning_rate": 2e-05, "loss": 2.2689, "num_input_tokens_seen": 5395972096, "step": 2573 }, { "epoch": 0.18, "grad_norm": 0.57421875, "learning_rate": 2e-05, "loss": 2.2274, "num_input_tokens_seen": 5398069248, "step": 2574 }, { "epoch": 0.18, "grad_norm": 0.5625, "learning_rate": 2e-05, "loss": 2.2118, "num_input_tokens_seen": 5400166400, "step": 2575 }, { "epoch": 0.18, "grad_norm": 0.6015625, "learning_rate": 2e-05, "loss": 2.2246, "num_input_tokens_seen": 5402263552, "step": 2576 }, { "epoch": 0.18, "grad_norm": 0.5625, "learning_rate": 2e-05, "loss": 2.231, "num_input_tokens_seen": 5404360704, "step": 2577 }, { "epoch": 0.18, "grad_norm": 0.5625, "learning_rate": 2e-05, "loss": 2.2491, "num_input_tokens_seen": 5406457856, "step": 2578 }, { "epoch": 0.18, "grad_norm": 0.6171875, "learning_rate": 2e-05, "loss": 2.2323, "num_input_tokens_seen": 5408555008, "step": 2579 }, { "epoch": 0.18, "grad_norm": 0.5859375, "learning_rate": 2e-05, "loss": 2.2529, "num_input_tokens_seen": 5410652160, "step": 2580 }, { "epoch": 0.18, "grad_norm": 0.67578125, "learning_rate": 2e-05, "loss": 2.2248, "num_input_tokens_seen": 5412749312, "step": 2581 }, { "epoch": 0.18, "grad_norm": 0.6015625, "learning_rate": 2e-05, "loss": 2.2077, "num_input_tokens_seen": 5414846464, "step": 2582 }, { "epoch": 0.19, "grad_norm": 0.61328125, "learning_rate": 2e-05, "loss": 2.2312, "num_input_tokens_seen": 5416943616, "step": 2583 }, { "epoch": 0.19, "grad_norm": 0.62890625, "learning_rate": 2e-05, "loss": 2.2491, "num_input_tokens_seen": 5419040768, "step": 2584 }, { "epoch": 0.19, "grad_norm": 0.69921875, "learning_rate": 2e-05, "loss": 2.2337, "num_input_tokens_seen": 5421137920, "step": 2585 }, { "epoch": 0.19, "grad_norm": 0.56640625, "learning_rate": 2e-05, "loss": 2.2125, "num_input_tokens_seen": 5423235072, "step": 2586 }, { "epoch": 0.19, "grad_norm": 0.64453125, "learning_rate": 2e-05, "loss": 2.2438, "num_input_tokens_seen": 5425332224, "step": 2587 }, { "epoch": 0.19, "eval_loss": 2.27114200592041, "eval_runtime": 1924.8277, "eval_samples_per_second": 2.048, "eval_steps_per_second": 0.512, "num_input_tokens_seen": 5425332224, "step": 2587 }, { "epoch": 0.19, "grad_norm": 0.58984375, "learning_rate": 2e-05, "loss": 2.2681, "num_input_tokens_seen": 5427429376, "step": 2588 }, { "epoch": 0.19, "grad_norm": 0.55859375, "learning_rate": 2e-05, "loss": 2.2245, "num_input_tokens_seen": 5429526528, "step": 2589 }, { "epoch": 0.19, "grad_norm": 0.60546875, "learning_rate": 2e-05, "loss": 2.2476, "num_input_tokens_seen": 5431623680, "step": 2590 }, { "epoch": 0.19, "grad_norm": 0.53125, "learning_rate": 2e-05, "loss": 2.2004, "num_input_tokens_seen": 5433720832, "step": 2591 }, { "epoch": 0.19, "grad_norm": 0.6171875, "learning_rate": 2e-05, "loss": 2.2343, "num_input_tokens_seen": 5435817984, "step": 2592 }, { "epoch": 0.19, "grad_norm": 0.57421875, "learning_rate": 2e-05, "loss": 2.2351, "num_input_tokens_seen": 5437915136, "step": 2593 }, { "epoch": 0.19, "grad_norm": 0.58984375, "learning_rate": 2e-05, "loss": 2.2344, "num_input_tokens_seen": 5440012288, "step": 2594 }, { "epoch": 0.19, "grad_norm": 0.546875, "learning_rate": 2e-05, "loss": 2.2213, "num_input_tokens_seen": 5442109440, "step": 2595 }, { "epoch": 0.19, "grad_norm": 0.55859375, "learning_rate": 2e-05, "loss": 2.2306, "num_input_tokens_seen": 5444206592, "step": 2596 }, { "epoch": 0.19, "grad_norm": 0.58203125, "learning_rate": 2e-05, "loss": 2.2363, "num_input_tokens_seen": 5446303744, "step": 2597 }, { "epoch": 0.19, "grad_norm": 0.578125, "learning_rate": 2e-05, "loss": 2.2797, "num_input_tokens_seen": 5448400896, "step": 2598 }, { "epoch": 0.19, "grad_norm": 0.57421875, "learning_rate": 2e-05, "loss": 2.2352, "num_input_tokens_seen": 5450498048, "step": 2599 }, { "epoch": 0.19, "grad_norm": 0.54296875, "learning_rate": 2e-05, "loss": 2.2149, "num_input_tokens_seen": 5452595200, "step": 2600 } ], "logging_steps": 1, "max_steps": 4768, "num_input_tokens_seen": 5452595200, "num_train_epochs": 9223372036854775807, "save_steps": 100, "total_flos": 2.3262930664968684e+20, "train_batch_size": 1, "trial_name": null, "trial_params": null }