diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,2029 +1,3821 @@ { - "best_metric": 1.5016273260116577, - "best_model_checkpoint": "lora_lr/mistralai/Mistral-7B-Instruct-v0.2/unaligned/checkpoint-240", - "epoch": 0.327575175943698, - "eval_steps": 10, - "global_step": 256, + "best_metric": 1.633476734161377, + "best_model_checkpoint": "lora_lr_pad/meta-llama/Meta-Llama-3-8B-Instruct/unaligned/checkpoint-500", + "epoch": 0.655150351887396, + "eval_steps": 20, + "global_step": 512, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0, - "grad_norm": 1.484375, + "epoch": 0.0012795905310300703, + "grad_norm": 0.1796875, "learning_rate": 2.0000000000000003e-06, - "loss": 3.7421, + "loss": 2.2847, "step": 1 }, { - "epoch": 0.0, - "grad_norm": 1.4921875, + "epoch": 0.0025591810620601407, + "grad_norm": 0.1884765625, "learning_rate": 4.000000000000001e-06, - "loss": 3.6136, + "loss": 2.1736, "step": 2 }, { - "epoch": 0.0, - "grad_norm": 1.4140625, + "epoch": 0.003838771593090211, + "grad_norm": 0.1572265625, "learning_rate": 6e-06, - "loss": 3.5641, + "loss": 2.205, "step": 3 }, { - "epoch": 0.01, - "grad_norm": 1.59375, + "epoch": 0.005118362124120281, + "grad_norm": 0.1943359375, "learning_rate": 8.000000000000001e-06, - "loss": 3.8168, + "loss": 2.2475, "step": 4 }, { - "epoch": 0.01, - "grad_norm": 1.5703125, + "epoch": 0.006397952655150352, + "grad_norm": 0.189453125, "learning_rate": 1e-05, - "loss": 3.7471, + "loss": 2.2441, "step": 5 }, { - "epoch": 0.01, - "grad_norm": 1.4609375, + "epoch": 0.007677543186180422, + "grad_norm": 0.18359375, "learning_rate": 1.2e-05, - "loss": 3.6939, + "loss": 2.2687, "step": 6 }, { - "epoch": 0.01, - "grad_norm": 1.5546875, + "epoch": 0.008957133717210493, + "grad_norm": 0.201171875, "learning_rate": 1.4000000000000001e-05, - "loss": 3.5964, + "loss": 2.1679, "step": 7 }, { - "epoch": 0.01, - "grad_norm": 1.5625, + "epoch": 0.010236724248240563, + "grad_norm": 0.1884765625, "learning_rate": 1.6000000000000003e-05, - "loss": 3.7708, + "loss": 2.2092, "step": 8 }, { - "epoch": 0.01, - "grad_norm": 1.5703125, + "epoch": 0.011516314779270634, + "grad_norm": 0.1904296875, "learning_rate": 1.8e-05, - "loss": 3.7418, + "loss": 2.2664, "step": 9 }, { - "epoch": 0.01, - "grad_norm": 1.609375, + "epoch": 0.012795905310300703, + "grad_norm": 0.189453125, "learning_rate": 2e-05, - "loss": 3.8394, + "loss": 2.2509, "step": 10 }, { - "epoch": 0.01, - "eval_loss": 3.672560214996338, - "eval_runtime": 108.3497, - "eval_samples_per_second": 46.147, - "eval_steps_per_second": 1.449, - "step": 10 - }, - { - "epoch": 0.01, - "grad_norm": 1.5078125, + "epoch": 0.014075495841330775, + "grad_norm": 0.1748046875, "learning_rate": 2.2000000000000003e-05, - "loss": 3.6075, + "loss": 2.1772, "step": 11 }, { - "epoch": 0.02, - "grad_norm": 1.4296875, + "epoch": 0.015355086372360844, + "grad_norm": 0.1796875, "learning_rate": 2.4e-05, - "loss": 3.4171, + "loss": 2.2352, "step": 12 }, { - "epoch": 0.02, - "grad_norm": 1.6484375, + "epoch": 0.016634676903390915, + "grad_norm": 0.208984375, "learning_rate": 2.6000000000000002e-05, - "loss": 3.7848, + "loss": 2.2991, "step": 13 }, { - "epoch": 0.02, - "grad_norm": 1.6328125, + "epoch": 0.017914267434420986, + "grad_norm": 0.201171875, "learning_rate": 2.8000000000000003e-05, - "loss": 3.7797, + "loss": 2.3595, "step": 14 }, { - "epoch": 0.02, - "grad_norm": 1.609375, + "epoch": 0.019193857965451054, + "grad_norm": 0.1943359375, "learning_rate": 3e-05, - "loss": 3.7132, + "loss": 2.2419, "step": 15 }, { - "epoch": 0.02, - "grad_norm": 1.5390625, + "epoch": 0.020473448496481125, + "grad_norm": 0.201171875, "learning_rate": 3.2000000000000005e-05, - "loss": 3.5353, + "loss": 2.2519, "step": 16 }, { - "epoch": 0.02, - "grad_norm": 1.828125, + "epoch": 0.021753039027511197, + "grad_norm": 0.2275390625, "learning_rate": 3.4000000000000007e-05, - "loss": 3.687, + "loss": 2.1917, "step": 17 }, { - "epoch": 0.02, - "grad_norm": 1.65625, + "epoch": 0.023032629558541268, + "grad_norm": 0.2373046875, "learning_rate": 3.6e-05, - "loss": 3.4312, + "loss": 2.26, "step": 18 }, { - "epoch": 0.02, - "grad_norm": 1.8203125, + "epoch": 0.02431222008957134, + "grad_norm": 0.236328125, "learning_rate": 3.8e-05, - "loss": 3.4971, + "loss": 2.2416, "step": 19 }, { - "epoch": 0.03, - "grad_norm": 1.875, + "epoch": 0.025591810620601407, + "grad_norm": 0.25, "learning_rate": 4e-05, - "loss": 3.3386, + "loss": 2.2174, "step": 20 }, { - "epoch": 0.03, - "eval_loss": 3.376652240753174, - "eval_runtime": 108.6461, - "eval_samples_per_second": 46.021, - "eval_steps_per_second": 1.445, + "epoch": 0.025591810620601407, + "eval_loss": 2.2057077884674072, + "eval_runtime": 103.2988, + "eval_samples_per_second": 48.403, + "eval_steps_per_second": 1.52, "step": 20 }, { - "epoch": 0.03, - "grad_norm": 1.9921875, + "epoch": 0.026871401151631478, + "grad_norm": 0.283203125, "learning_rate": 4.2e-05, - "loss": 3.3568, + "loss": 2.1936, "step": 21 }, { - "epoch": 0.03, - "grad_norm": 2.109375, + "epoch": 0.02815099168266155, + "grad_norm": 0.294921875, "learning_rate": 4.4000000000000006e-05, - "loss": 3.3976, + "loss": 2.2211, "step": 22 }, { - "epoch": 0.03, - "grad_norm": 2.125, + "epoch": 0.02943058221369162, + "grad_norm": 0.345703125, "learning_rate": 4.600000000000001e-05, - "loss": 3.3644, + "loss": 2.2378, "step": 23 }, { - "epoch": 0.03, - "grad_norm": 2.28125, + "epoch": 0.030710172744721688, + "grad_norm": 0.353515625, "learning_rate": 4.8e-05, - "loss": 3.1825, + "loss": 2.2521, "step": 24 }, { - "epoch": 0.03, - "grad_norm": 2.421875, + "epoch": 0.03198976327575176, + "grad_norm": 0.37109375, "learning_rate": 5e-05, - "loss": 3.2047, + "loss": 2.1821, "step": 25 }, { - "epoch": 0.03, - "grad_norm": 2.359375, + "epoch": 0.03326935380678183, + "grad_norm": 0.34765625, "learning_rate": 5.2000000000000004e-05, - "loss": 3.098, + "loss": 2.2399, "step": 26 }, { - "epoch": 0.03, - "grad_norm": 2.390625, + "epoch": 0.0345489443378119, + "grad_norm": 0.400390625, "learning_rate": 5.4000000000000005e-05, - "loss": 2.9831, + "loss": 2.1444, "step": 27 }, { - "epoch": 0.04, - "grad_norm": 2.359375, + "epoch": 0.03582853486884197, + "grad_norm": 0.451171875, "learning_rate": 5.6000000000000006e-05, - "loss": 2.9573, + "loss": 2.1987, "step": 28 }, { - "epoch": 0.04, - "grad_norm": 2.609375, + "epoch": 0.037108125399872044, + "grad_norm": 0.427734375, "learning_rate": 5.8e-05, - "loss": 2.9497, + "loss": 2.159, "step": 29 }, { - "epoch": 0.04, - "grad_norm": 2.125, + "epoch": 0.03838771593090211, + "grad_norm": 0.404296875, "learning_rate": 6e-05, - "loss": 2.5156, - "step": 30 - }, - { - "epoch": 0.04, - "eval_loss": 2.578493356704712, - "eval_runtime": 109.4296, - "eval_samples_per_second": 45.691, - "eval_steps_per_second": 1.435, + "loss": 2.0387, "step": 30 }, { - "epoch": 0.04, - "grad_norm": 2.25, + "epoch": 0.03966730646193218, + "grad_norm": 0.470703125, "learning_rate": 6.2e-05, - "loss": 2.6802, + "loss": 2.144, "step": 31 }, { - "epoch": 0.04, - "grad_norm": 1.984375, + "epoch": 0.04094689699296225, + "grad_norm": 0.431640625, "learning_rate": 6.400000000000001e-05, - "loss": 2.4563, + "loss": 2.0673, "step": 32 }, { - "epoch": 0.04, - "grad_norm": 2.046875, + "epoch": 0.04222648752399232, + "grad_norm": 0.44921875, "learning_rate": 6.6e-05, - "loss": 2.4709, + "loss": 2.0571, "step": 33 }, { - "epoch": 0.04, - "grad_norm": 2.046875, + "epoch": 0.04350607805502239, + "grad_norm": 0.41796875, "learning_rate": 6.800000000000001e-05, - "loss": 2.4096, + "loss": 2.0175, "step": 34 }, { - "epoch": 0.04, - "grad_norm": 1.75, + "epoch": 0.044785668586052464, + "grad_norm": 0.42578125, "learning_rate": 7e-05, - "loss": 2.2541, + "loss": 2.039, "step": 35 }, { - "epoch": 0.05, - "grad_norm": 1.78125, + "epoch": 0.046065259117082535, + "grad_norm": 0.390625, "learning_rate": 7.2e-05, - "loss": 2.1879, + "loss": 1.9393, "step": 36 }, { - "epoch": 0.05, - "grad_norm": 1.6484375, + "epoch": 0.04734484964811261, + "grad_norm": 0.423828125, "learning_rate": 7.4e-05, - "loss": 2.2291, + "loss": 2.0395, "step": 37 }, { - "epoch": 0.05, - "grad_norm": 1.4140625, + "epoch": 0.04862444017914268, + "grad_norm": 0.357421875, "learning_rate": 7.6e-05, - "loss": 2.028, + "loss": 1.9196, "step": 38 }, { - "epoch": 0.05, - "grad_norm": 1.1640625, + "epoch": 0.04990403071017274, + "grad_norm": 0.310546875, "learning_rate": 7.800000000000001e-05, - "loss": 1.9569, + "loss": 1.8573, "step": 39 }, { - "epoch": 0.05, - "grad_norm": 0.9296875, + "epoch": 0.05118362124120281, + "grad_norm": 0.283203125, "learning_rate": 8e-05, - "loss": 1.8834, + "loss": 1.8748, "step": 40 }, { - "epoch": 0.05, - "eval_loss": 1.834492564201355, - "eval_runtime": 109.4942, - "eval_samples_per_second": 45.665, - "eval_steps_per_second": 1.434, + "epoch": 0.05118362124120281, + "eval_loss": 1.8459974527359009, + "eval_runtime": 103.1757, + "eval_samples_per_second": 48.461, + "eval_steps_per_second": 1.522, "step": 40 }, { - "epoch": 0.05, - "grad_norm": 0.765625, + "epoch": 0.052463211772232884, + "grad_norm": 0.287109375, "learning_rate": 8.2e-05, - "loss": 1.9118, + "loss": 1.9095, "step": 41 }, { - "epoch": 0.05, - "grad_norm": 0.640625, + "epoch": 0.053742802303262956, + "grad_norm": 0.3203125, "learning_rate": 8.4e-05, - "loss": 1.8096, + "loss": 1.854, "step": 42 }, { - "epoch": 0.06, - "grad_norm": 0.515625, + "epoch": 0.05502239283429303, + "grad_norm": 0.341796875, "learning_rate": 8.6e-05, - "loss": 1.8097, + "loss": 1.8241, "step": 43 }, { - "epoch": 0.06, - "grad_norm": 0.49609375, + "epoch": 0.0563019833653231, + "grad_norm": 0.3359375, "learning_rate": 8.800000000000001e-05, - "loss": 1.7625, + "loss": 1.8081, "step": 44 }, { - "epoch": 0.06, - "grad_norm": 0.578125, + "epoch": 0.05758157389635317, + "grad_norm": 0.2890625, "learning_rate": 9e-05, - "loss": 1.7719, + "loss": 1.7557, "step": 45 }, { - "epoch": 0.06, - "grad_norm": 0.490234375, + "epoch": 0.05886116442738324, + "grad_norm": 0.25, "learning_rate": 9.200000000000001e-05, - "loss": 1.7783, + "loss": 1.7961, "step": 46 }, { - "epoch": 0.06, - "grad_norm": 0.5625, + "epoch": 0.060140754958413305, + "grad_norm": 0.23828125, "learning_rate": 9.4e-05, - "loss": 1.7888, + "loss": 1.812, "step": 47 }, { - "epoch": 0.06, - "grad_norm": 0.55078125, + "epoch": 0.061420345489443376, + "grad_norm": 0.26953125, "learning_rate": 9.6e-05, - "loss": 1.7738, + "loss": 1.8099, "step": 48 }, { - "epoch": 0.06, - "grad_norm": 0.5859375, + "epoch": 0.06269993602047345, + "grad_norm": 0.2451171875, "learning_rate": 9.8e-05, - "loss": 1.7428, + "loss": 1.7337, "step": 49 }, { - "epoch": 0.06, - "grad_norm": 0.55078125, + "epoch": 0.06397952655150352, + "grad_norm": 0.279296875, "learning_rate": 0.0001, - "loss": 1.7651, - "step": 50 - }, - { - "epoch": 0.06, - "eval_loss": 1.744554042816162, - "eval_runtime": 109.7416, - "eval_samples_per_second": 45.562, - "eval_steps_per_second": 1.431, + "loss": 1.7997, "step": 50 }, { - "epoch": 0.07, - "grad_norm": 0.58984375, + "epoch": 0.06525911708253358, + "grad_norm": 0.2099609375, "learning_rate": 0.00010200000000000001, - "loss": 1.8558, + "loss": 1.8664, "step": 51 }, { - "epoch": 0.07, - "grad_norm": 0.6171875, + "epoch": 0.06653870761356366, + "grad_norm": 0.2109375, "learning_rate": 0.00010400000000000001, - "loss": 1.7136, + "loss": 1.7195, "step": 52 }, { - "epoch": 0.07, - "grad_norm": 0.6328125, + "epoch": 0.06781829814459372, + "grad_norm": 0.185546875, "learning_rate": 0.00010600000000000002, - "loss": 1.6953, + "loss": 1.7255, "step": 53 }, { - "epoch": 0.07, - "grad_norm": 0.79296875, + "epoch": 0.0690978886756238, + "grad_norm": 0.1904296875, "learning_rate": 0.00010800000000000001, - "loss": 1.7225, + "loss": 1.7667, "step": 54 }, { - "epoch": 0.07, - "grad_norm": 0.83203125, + "epoch": 0.07037747920665387, + "grad_norm": 0.1728515625, "learning_rate": 0.00011000000000000002, - "loss": 1.6275, + "loss": 1.6752, "step": 55 }, { - "epoch": 0.07, - "grad_norm": 0.84375, + "epoch": 0.07165706973768395, + "grad_norm": 0.173828125, "learning_rate": 0.00011200000000000001, - "loss": 1.6724, + "loss": 1.7413, "step": 56 }, { - "epoch": 0.07, - "grad_norm": 0.5546875, + "epoch": 0.07293666026871401, + "grad_norm": 0.177734375, "learning_rate": 0.00011399999999999999, - "loss": 1.627, + "loss": 1.7498, "step": 57 }, { - "epoch": 0.07, - "grad_norm": 0.388671875, + "epoch": 0.07421625079974409, + "grad_norm": 0.1494140625, "learning_rate": 0.000116, - "loss": 1.6624, + "loss": 1.7908, "step": 58 }, { - "epoch": 0.08, - "grad_norm": 0.34375, + "epoch": 0.07549584133077415, + "grad_norm": 0.1533203125, "learning_rate": 0.000118, - "loss": 1.592, + "loss": 1.7423, "step": 59 }, { - "epoch": 0.08, - "grad_norm": 0.306640625, + "epoch": 0.07677543186180422, + "grad_norm": 0.154296875, "learning_rate": 0.00012, - "loss": 1.638, + "loss": 1.7723, "step": 60 }, { - "epoch": 0.08, - "eval_loss": 1.5958868265151978, - "eval_runtime": 109.6677, - "eval_samples_per_second": 45.592, - "eval_steps_per_second": 1.432, + "epoch": 0.07677543186180422, + "eval_loss": 1.7295056581497192, + "eval_runtime": 103.1327, + "eval_samples_per_second": 48.481, + "eval_steps_per_second": 1.522, "step": 60 }, { - "epoch": 0.08, - "grad_norm": 0.302734375, + "epoch": 0.0780550223928343, + "grad_norm": 0.1552734375, "learning_rate": 0.000122, - "loss": 1.5942, + "loss": 1.747, "step": 61 }, { - "epoch": 0.08, - "grad_norm": 0.30078125, + "epoch": 0.07933461292386436, + "grad_norm": 0.1396484375, "learning_rate": 0.000124, - "loss": 1.6169, + "loss": 1.7531, "step": 62 }, { - "epoch": 0.08, - "grad_norm": 0.296875, + "epoch": 0.08061420345489444, + "grad_norm": 0.1650390625, "learning_rate": 0.000126, - "loss": 1.6183, + "loss": 1.7453, "step": 63 }, { - "epoch": 0.08, - "grad_norm": 0.3046875, + "epoch": 0.0818937939859245, + "grad_norm": 0.1484375, "learning_rate": 0.00012800000000000002, - "loss": 1.5629, + "loss": 1.6982, "step": 64 }, { - "epoch": 0.08, - "grad_norm": 0.3203125, + "epoch": 0.08317338451695458, + "grad_norm": 0.140625, "learning_rate": 0.00013000000000000002, - "loss": 1.6236, + "loss": 1.7718, "step": 65 }, { - "epoch": 0.08, - "grad_norm": 0.267578125, + "epoch": 0.08445297504798464, + "grad_norm": 0.1630859375, "learning_rate": 0.000132, - "loss": 1.6701, + "loss": 1.815, "step": 66 }, { - "epoch": 0.09, - "grad_norm": 0.294921875, + "epoch": 0.08573256557901472, + "grad_norm": 0.1552734375, "learning_rate": 0.000134, - "loss": 1.6067, + "loss": 1.7436, "step": 67 }, { - "epoch": 0.09, - "grad_norm": 0.30859375, + "epoch": 0.08701215611004479, + "grad_norm": 0.1494140625, "learning_rate": 0.00013600000000000003, - "loss": 1.5935, + "loss": 1.7252, "step": 68 }, { - "epoch": 0.09, - "grad_norm": 0.27734375, + "epoch": 0.08829174664107485, + "grad_norm": 0.146484375, "learning_rate": 0.000138, - "loss": 1.578, + "loss": 1.6938, "step": 69 }, { - "epoch": 0.09, - "grad_norm": 0.26953125, + "epoch": 0.08957133717210493, + "grad_norm": 0.138671875, "learning_rate": 0.00014, - "loss": 1.592, - "step": 70 - }, - { - "epoch": 0.09, - "eval_loss": 1.571932077407837, - "eval_runtime": 109.6303, - "eval_samples_per_second": 45.608, - "eval_steps_per_second": 1.432, + "loss": 1.688, "step": 70 }, { - "epoch": 0.09, - "grad_norm": 0.28125, + "epoch": 0.09085092770313499, + "grad_norm": 0.1484375, "learning_rate": 0.000142, - "loss": 1.5923, + "loss": 1.7185, "step": 71 }, { - "epoch": 0.09, - "grad_norm": 0.283203125, + "epoch": 0.09213051823416507, + "grad_norm": 0.1357421875, "learning_rate": 0.000144, - "loss": 1.5838, + "loss": 1.7416, "step": 72 }, { - "epoch": 0.09, - "grad_norm": 0.26171875, + "epoch": 0.09341010876519514, + "grad_norm": 0.12890625, "learning_rate": 0.000146, - "loss": 1.5646, + "loss": 1.6831, "step": 73 }, { - "epoch": 0.09, - "grad_norm": 0.255859375, + "epoch": 0.09468969929622521, + "grad_norm": 0.13671875, "learning_rate": 0.000148, - "loss": 1.6265, + "loss": 1.7872, "step": 74 }, { - "epoch": 0.1, - "grad_norm": 0.271484375, + "epoch": 0.09596928982725528, + "grad_norm": 0.142578125, "learning_rate": 0.00015000000000000001, - "loss": 1.622, + "loss": 1.7634, "step": 75 }, { - "epoch": 0.1, - "grad_norm": 0.2373046875, + "epoch": 0.09724888035828536, + "grad_norm": 0.1298828125, "learning_rate": 0.000152, - "loss": 1.5504, + "loss": 1.7029, "step": 76 }, { - "epoch": 0.1, - "grad_norm": 0.267578125, + "epoch": 0.09852847088931542, + "grad_norm": 0.1376953125, "learning_rate": 0.000154, - "loss": 1.6184, + "loss": 1.7621, "step": 77 }, { - "epoch": 0.1, - "grad_norm": 0.228515625, + "epoch": 0.09980806142034548, + "grad_norm": 0.1708984375, "learning_rate": 0.00015600000000000002, - "loss": 1.5829, + "loss": 1.7248, "step": 78 }, { - "epoch": 0.1, - "grad_norm": 0.2451171875, + "epoch": 0.10108765195137556, + "grad_norm": 0.1611328125, "learning_rate": 0.00015800000000000002, - "loss": 1.5305, + "loss": 1.664, "step": 79 }, { - "epoch": 0.1, - "grad_norm": 0.25, + "epoch": 0.10236724248240563, + "grad_norm": 0.1513671875, "learning_rate": 0.00016, - "loss": 1.554, + "loss": 1.694, "step": 80 }, { - "epoch": 0.1, - "eval_loss": 1.5540077686309814, - "eval_runtime": 109.5859, - "eval_samples_per_second": 45.626, - "eval_steps_per_second": 1.433, + "epoch": 0.10236724248240563, + "eval_loss": 1.6964631080627441, + "eval_runtime": 103.1498, + "eval_samples_per_second": 48.473, + "eval_steps_per_second": 1.522, "step": 80 }, { - "epoch": 0.1, - "grad_norm": 0.24609375, + "epoch": 0.1036468330134357, + "grad_norm": 0.14453125, "learning_rate": 0.000162, - "loss": 1.5691, + "loss": 1.6872, "step": 81 }, { - "epoch": 0.1, - "grad_norm": 0.2470703125, + "epoch": 0.10492642354446577, + "grad_norm": 0.1474609375, "learning_rate": 0.000164, - "loss": 1.542, + "loss": 1.684, "step": 82 }, { - "epoch": 0.11, - "grad_norm": 0.2431640625, + "epoch": 0.10620601407549585, + "grad_norm": 0.1826171875, "learning_rate": 0.000166, - "loss": 1.5844, + "loss": 1.7423, "step": 83 }, { - "epoch": 0.11, - "grad_norm": 0.2333984375, + "epoch": 0.10748560460652591, + "grad_norm": 0.169921875, "learning_rate": 0.000168, - "loss": 1.5429, + "loss": 1.7016, "step": 84 }, { - "epoch": 0.11, - "grad_norm": 0.2255859375, + "epoch": 0.10876519513755598, + "grad_norm": 0.150390625, "learning_rate": 0.00017, - "loss": 1.5836, + "loss": 1.7208, "step": 85 }, { - "epoch": 0.11, - "grad_norm": 0.25390625, + "epoch": 0.11004478566858605, + "grad_norm": 0.16015625, "learning_rate": 0.000172, - "loss": 1.5227, + "loss": 1.6444, "step": 86 }, { - "epoch": 0.11, - "grad_norm": 0.2412109375, + "epoch": 0.11132437619961612, + "grad_norm": 0.1533203125, "learning_rate": 0.000174, - "loss": 1.5495, + "loss": 1.7113, "step": 87 }, { - "epoch": 0.11, - "grad_norm": 0.21484375, + "epoch": 0.1126039667306462, + "grad_norm": 0.1513671875, "learning_rate": 0.00017600000000000002, - "loss": 1.6124, + "loss": 1.7502, "step": 88 }, { - "epoch": 0.11, - "grad_norm": 0.2470703125, + "epoch": 0.11388355726167626, + "grad_norm": 0.2080078125, "learning_rate": 0.00017800000000000002, - "loss": 1.5469, + "loss": 1.7002, "step": 89 }, { - "epoch": 0.12, - "grad_norm": 0.2470703125, + "epoch": 0.11516314779270634, + "grad_norm": 0.1796875, "learning_rate": 0.00018, - "loss": 1.5215, - "step": 90 - }, - { - "epoch": 0.12, - "eval_loss": 1.5420269966125488, - "eval_runtime": 109.6203, - "eval_samples_per_second": 45.612, - "eval_steps_per_second": 1.432, + "loss": 1.6666, "step": 90 }, { - "epoch": 0.12, - "grad_norm": 0.22265625, + "epoch": 0.1164427383237364, + "grad_norm": 0.1484375, "learning_rate": 0.000182, - "loss": 1.5654, + "loss": 1.7207, "step": 91 }, { - "epoch": 0.12, - "grad_norm": 0.2392578125, + "epoch": 0.11772232885476648, + "grad_norm": 0.162109375, "learning_rate": 0.00018400000000000003, - "loss": 1.494, + "loss": 1.6261, "step": 92 }, { - "epoch": 0.12, - "grad_norm": 0.216796875, + "epoch": 0.11900191938579655, + "grad_norm": 0.1787109375, "learning_rate": 0.00018600000000000002, - "loss": 1.6285, + "loss": 1.7819, "step": 93 }, { - "epoch": 0.12, - "grad_norm": 0.2451171875, + "epoch": 0.12028150991682661, + "grad_norm": 0.1572265625, "learning_rate": 0.000188, - "loss": 1.5177, + "loss": 1.6319, "step": 94 }, { - "epoch": 0.12, - "grad_norm": 0.2177734375, + "epoch": 0.12156110044785669, + "grad_norm": 0.171875, "learning_rate": 0.00019, - "loss": 1.5659, + "loss": 1.6837, "step": 95 }, { - "epoch": 0.12, - "grad_norm": 0.220703125, + "epoch": 0.12284069097888675, + "grad_norm": 0.1884765625, "learning_rate": 0.000192, - "loss": 1.6005, + "loss": 1.7428, "step": 96 }, { - "epoch": 0.12, - "grad_norm": 0.251953125, + "epoch": 0.12412028150991683, + "grad_norm": 0.1748046875, "learning_rate": 0.000194, - "loss": 1.5713, + "loss": 1.686, "step": 97 }, { - "epoch": 0.13, - "grad_norm": 0.2294921875, + "epoch": 0.1253998720409469, + "grad_norm": 0.1513671875, "learning_rate": 0.000196, - "loss": 1.4815, + "loss": 1.6477, "step": 98 }, { - "epoch": 0.13, - "grad_norm": 0.216796875, + "epoch": 0.12667946257197696, + "grad_norm": 0.1904296875, "learning_rate": 0.00019800000000000002, - "loss": 1.489, + "loss": 1.6311, "step": 99 }, { - "epoch": 0.13, - "grad_norm": 0.216796875, + "epoch": 0.12795905310300704, + "grad_norm": 0.1923828125, "learning_rate": 0.0002, - "loss": 1.497, + "loss": 1.6155, "step": 100 }, { - "epoch": 0.13, - "eval_loss": 1.5331306457519531, - "eval_runtime": 108.6942, - "eval_samples_per_second": 46.001, - "eval_steps_per_second": 1.444, + "epoch": 0.12795905310300704, + "eval_loss": 1.6780136823654175, + "eval_runtime": 103.1139, + "eval_samples_per_second": 48.49, + "eval_steps_per_second": 1.523, "step": 100 }, { - "epoch": 0.13, - "grad_norm": 0.23046875, - "learning_rate": 0.00019871794871794874, - "loss": 1.4933, + "epoch": 0.12923864363403711, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019951456310679614, + "loss": 1.6283, "step": 101 }, { - "epoch": 0.13, - "grad_norm": 0.23828125, - "learning_rate": 0.00019743589743589744, - "loss": 1.5385, + "epoch": 0.13051823416506717, + "grad_norm": 0.158203125, + "learning_rate": 0.00019902912621359224, + "loss": 1.6891, "step": 102 }, { - "epoch": 0.13, - "grad_norm": 0.2431640625, - "learning_rate": 0.00019615384615384615, - "loss": 1.5745, + "epoch": 0.13179782469609724, + "grad_norm": 0.171875, + "learning_rate": 0.00019854368932038837, + "loss": 1.738, "step": 103 }, { - "epoch": 0.13, - "grad_norm": 0.220703125, - "learning_rate": 0.00019487179487179487, - "loss": 1.5485, + "epoch": 0.13307741522712732, + "grad_norm": 0.169921875, + "learning_rate": 0.00019805825242718447, + "loss": 1.7, "step": 104 }, { - "epoch": 0.13, - "grad_norm": 0.248046875, - "learning_rate": 0.0001935897435897436, - "loss": 1.5391, + "epoch": 0.1343570057581574, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001975728155339806, + "loss": 1.6759, "step": 105 }, { - "epoch": 0.14, - "grad_norm": 0.23046875, - "learning_rate": 0.00019230769230769233, - "loss": 1.5649, + "epoch": 0.13563659628918745, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001970873786407767, + "loss": 1.7018, "step": 106 }, { - "epoch": 0.14, - "grad_norm": 0.212890625, - "learning_rate": 0.00019102564102564104, - "loss": 1.5543, + "epoch": 0.13691618682021753, + "grad_norm": 0.1396484375, + "learning_rate": 0.00019660194174757283, + "loss": 1.6875, "step": 107 }, { - "epoch": 0.14, - "grad_norm": 0.23046875, - "learning_rate": 0.00018974358974358974, - "loss": 1.5377, + "epoch": 0.1381957773512476, + "grad_norm": 0.169921875, + "learning_rate": 0.00019611650485436895, + "loss": 1.6614, "step": 108 }, { - "epoch": 0.14, - "grad_norm": 0.2314453125, - "learning_rate": 0.00018846153846153847, - "loss": 1.534, + "epoch": 0.13947536788227768, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019563106796116505, + "loss": 1.6804, "step": 109 }, { - "epoch": 0.14, - "grad_norm": 0.2119140625, - "learning_rate": 0.0001871794871794872, - "loss": 1.5583, - "step": 110 - }, - { - "epoch": 0.14, - "eval_loss": 1.526591181755066, - "eval_runtime": 108.4319, - "eval_samples_per_second": 46.112, - "eval_steps_per_second": 1.448, + "epoch": 0.14075495841330773, + "grad_norm": 0.1513671875, + "learning_rate": 0.00019514563106796118, + "loss": 1.7319, "step": 110 }, { - "epoch": 0.14, - "grad_norm": 0.234375, - "learning_rate": 0.0001858974358974359, - "loss": 1.4938, + "epoch": 0.1420345489443378, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019466019417475728, + "loss": 1.6407, "step": 111 }, { - "epoch": 0.14, - "grad_norm": 0.248046875, - "learning_rate": 0.00018461538461538463, - "loss": 1.5327, + "epoch": 0.1433141394753679, + "grad_norm": 0.17578125, + "learning_rate": 0.0001941747572815534, + "loss": 1.6765, "step": 112 }, { - "epoch": 0.14, - "grad_norm": 0.2294921875, - "learning_rate": 0.00018333333333333334, - "loss": 1.5475, + "epoch": 0.14459373000639794, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019368932038834954, + "loss": 1.6793, "step": 113 }, { - "epoch": 0.15, - "grad_norm": 0.2431640625, - "learning_rate": 0.00018205128205128207, - "loss": 1.5718, + "epoch": 0.14587332053742802, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019320388349514564, + "loss": 1.685, "step": 114 }, { - "epoch": 0.15, - "grad_norm": 0.23046875, - "learning_rate": 0.00018076923076923077, - "loss": 1.53, + "epoch": 0.1471529110684581, + "grad_norm": 0.16796875, + "learning_rate": 0.00019271844660194177, + "loss": 1.6685, "step": 115 }, { - "epoch": 0.15, - "grad_norm": 0.2177734375, - "learning_rate": 0.0001794871794871795, - "loss": 1.5675, + "epoch": 0.14843250159948818, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019223300970873787, + "loss": 1.689, "step": 116 }, { - "epoch": 0.15, - "grad_norm": 0.2431640625, - "learning_rate": 0.00017820512820512823, - "loss": 1.6159, + "epoch": 0.14971209213051823, + "grad_norm": 0.185546875, + "learning_rate": 0.000191747572815534, + "loss": 1.7595, "step": 117 }, { - "epoch": 0.15, - "grad_norm": 0.2119140625, - "learning_rate": 0.00017692307692307693, - "loss": 1.5419, + "epoch": 0.1509916826615483, + "grad_norm": 0.15625, + "learning_rate": 0.0001912621359223301, + "loss": 1.6845, "step": 118 }, { - "epoch": 0.15, - "grad_norm": 0.2294921875, - "learning_rate": 0.00017564102564102566, - "loss": 1.4955, + "epoch": 0.15227127319257838, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019077669902912623, + "loss": 1.6387, "step": 119 }, { - "epoch": 0.15, - "grad_norm": 0.2236328125, - "learning_rate": 0.00017435897435897436, - "loss": 1.5639, + "epoch": 0.15355086372360843, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019029126213592236, + "loss": 1.7231, "step": 120 }, { - "epoch": 0.15, - "eval_loss": 1.521650791168213, - "eval_runtime": 108.8276, - "eval_samples_per_second": 45.944, - "eval_steps_per_second": 1.443, + "epoch": 0.15355086372360843, + "eval_loss": 1.667124629020691, + "eval_runtime": 103.1502, + "eval_samples_per_second": 48.473, + "eval_steps_per_second": 1.522, "step": 120 }, { - "epoch": 0.15, - "grad_norm": 0.2119140625, - "learning_rate": 0.0001730769230769231, - "loss": 1.5569, + "epoch": 0.1548304542546385, + "grad_norm": 0.1728515625, + "learning_rate": 0.00018980582524271846, + "loss": 1.7072, "step": 121 }, { - "epoch": 0.16, - "grad_norm": 0.24609375, - "learning_rate": 0.0001717948717948718, - "loss": 1.527, + "epoch": 0.1561100447856686, + "grad_norm": 0.173828125, + "learning_rate": 0.00018932038834951458, + "loss": 1.6321, "step": 122 }, { - "epoch": 0.16, - "grad_norm": 0.2197265625, - "learning_rate": 0.00017051282051282053, - "loss": 1.538, + "epoch": 0.15738963531669867, + "grad_norm": 0.1513671875, + "learning_rate": 0.00018883495145631069, + "loss": 1.6628, "step": 123 }, { - "epoch": 0.16, - "grad_norm": 0.26953125, - "learning_rate": 0.00016923076923076923, - "loss": 1.5353, + "epoch": 0.15866922584772872, + "grad_norm": 0.169921875, + "learning_rate": 0.00018834951456310681, + "loss": 1.6786, "step": 124 }, { - "epoch": 0.16, - "grad_norm": 0.2353515625, - "learning_rate": 0.00016794871794871796, - "loss": 1.5495, + "epoch": 0.1599488163787588, + "grad_norm": 0.1767578125, + "learning_rate": 0.00018786407766990291, + "loss": 1.6748, "step": 125 }, { - "epoch": 0.16, - "grad_norm": 0.2421875, - "learning_rate": 0.0001666666666666667, - "loss": 1.5366, + "epoch": 0.16122840690978887, + "grad_norm": 0.1708984375, + "learning_rate": 0.00018737864077669904, + "loss": 1.6798, "step": 126 }, { - "epoch": 0.16, - "grad_norm": 0.228515625, - "learning_rate": 0.0001653846153846154, - "loss": 1.5695, + "epoch": 0.16250799744081892, + "grad_norm": 0.1513671875, + "learning_rate": 0.00018689320388349517, + "loss": 1.7003, "step": 127 }, { - "epoch": 0.16, - "grad_norm": 0.2314453125, - "learning_rate": 0.0001641025641025641, - "loss": 1.5112, + "epoch": 0.163787587971849, + "grad_norm": 0.166015625, + "learning_rate": 0.00018640776699029127, + "loss": 1.6482, "step": 128 }, { - "epoch": 0.17, - "grad_norm": 0.234375, - "learning_rate": 0.00016282051282051282, - "loss": 1.5919, + "epoch": 0.16506717850287908, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001859223300970874, + "loss": 1.7206, "step": 129 }, { - "epoch": 0.17, - "grad_norm": 0.2216796875, - "learning_rate": 0.00016153846153846155, - "loss": 1.5785, - "step": 130 - }, - { - "epoch": 0.17, - "eval_loss": 1.517764925956726, - "eval_runtime": 109.0406, - "eval_samples_per_second": 45.854, - "eval_steps_per_second": 1.44, + "epoch": 0.16634676903390916, + "grad_norm": 0.1494140625, + "learning_rate": 0.0001854368932038835, + "loss": 1.7277, "step": 130 }, { - "epoch": 0.17, - "grad_norm": 0.248046875, - "learning_rate": 0.00016025641025641028, - "loss": 1.5056, + "epoch": 0.1676263595649392, + "grad_norm": 0.1787109375, + "learning_rate": 0.00018495145631067963, + "loss": 1.633, "step": 131 }, { - "epoch": 0.17, - "grad_norm": 0.236328125, - "learning_rate": 0.00015897435897435896, - "loss": 1.5427, + "epoch": 0.1689059500959693, + "grad_norm": 0.173828125, + "learning_rate": 0.00018446601941747576, + "loss": 1.7117, "step": 132 }, { - "epoch": 0.17, - "grad_norm": 0.23046875, - "learning_rate": 0.0001576923076923077, - "loss": 1.5357, + "epoch": 0.17018554062699937, + "grad_norm": 0.1865234375, + "learning_rate": 0.00018398058252427186, + "loss": 1.7057, "step": 133 }, { - "epoch": 0.17, - "grad_norm": 0.21875, - "learning_rate": 0.00015641025641025642, - "loss": 1.5671, + "epoch": 0.17146513115802944, + "grad_norm": 0.16796875, + "learning_rate": 0.00018349514563106799, + "loss": 1.7182, "step": 134 }, { - "epoch": 0.17, - "grad_norm": 0.2421875, - "learning_rate": 0.00015512820512820515, - "loss": 1.5247, + "epoch": 0.1727447216890595, + "grad_norm": 0.171875, + "learning_rate": 0.0001830097087378641, + "loss": 1.68, "step": 135 }, { - "epoch": 0.17, - "grad_norm": 0.2412109375, - "learning_rate": 0.00015384615384615385, - "loss": 1.557, + "epoch": 0.17402431222008957, + "grad_norm": 0.185546875, + "learning_rate": 0.00018252427184466022, + "loss": 1.6948, "step": 136 }, { - "epoch": 0.18, - "grad_norm": 0.26953125, - "learning_rate": 0.00015256410256410255, - "loss": 1.5042, + "epoch": 0.17530390275111965, + "grad_norm": 0.189453125, + "learning_rate": 0.00018203883495145632, + "loss": 1.6332, "step": 137 }, { - "epoch": 0.18, - "grad_norm": 0.2373046875, - "learning_rate": 0.00015128205128205128, - "loss": 1.5061, + "epoch": 0.1765834932821497, + "grad_norm": 0.17578125, + "learning_rate": 0.00018155339805825244, + "loss": 1.6327, "step": 138 }, { - "epoch": 0.18, - "grad_norm": 0.26953125, - "learning_rate": 0.00015000000000000001, - "loss": 1.5608, + "epoch": 0.17786308381317978, + "grad_norm": 0.185546875, + "learning_rate": 0.00018106796116504857, + "loss": 1.7119, "step": 139 }, { - "epoch": 0.18, - "grad_norm": 0.25, - "learning_rate": 0.00014871794871794872, - "loss": 1.5235, + "epoch": 0.17914267434420986, + "grad_norm": 0.201171875, + "learning_rate": 0.00018058252427184467, + "loss": 1.6484, "step": 140 }, { - "epoch": 0.18, - "eval_loss": 1.5145833492279053, - "eval_runtime": 109.0187, - "eval_samples_per_second": 45.864, - "eval_steps_per_second": 1.44, + "epoch": 0.17914267434420986, + "eval_loss": 1.6604257822036743, + "eval_runtime": 103.1306, + "eval_samples_per_second": 48.482, + "eval_steps_per_second": 1.522, "step": 140 }, { - "epoch": 0.18, - "grad_norm": 0.2314453125, - "learning_rate": 0.00014743589743589745, - "loss": 1.5099, + "epoch": 0.18042226487523993, + "grad_norm": 0.193359375, + "learning_rate": 0.0001800970873786408, + "loss": 1.6833, "step": 141 }, { - "epoch": 0.18, - "grad_norm": 0.228515625, - "learning_rate": 0.00014615384615384615, - "loss": 1.5413, + "epoch": 0.18170185540626999, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001796116504854369, + "loss": 1.6802, "step": 142 }, { - "epoch": 0.18, - "grad_norm": 0.2373046875, - "learning_rate": 0.00014487179487179488, - "loss": 1.5383, + "epoch": 0.18298144593730006, + "grad_norm": 0.1708984375, + "learning_rate": 0.00017912621359223303, + "loss": 1.6759, "step": 143 }, { - "epoch": 0.18, - "grad_norm": 0.234375, - "learning_rate": 0.0001435897435897436, - "loss": 1.5671, + "epoch": 0.18426103646833014, + "grad_norm": 0.1689453125, + "learning_rate": 0.00017864077669902913, + "loss": 1.7033, "step": 144 }, { - "epoch": 0.19, - "grad_norm": 0.244140625, - "learning_rate": 0.0001423076923076923, - "loss": 1.5102, + "epoch": 0.1855406269993602, + "grad_norm": 0.19921875, + "learning_rate": 0.00017815533980582526, + "loss": 1.6981, "step": 145 }, { - "epoch": 0.19, - "grad_norm": 0.220703125, - "learning_rate": 0.00014102564102564104, - "loss": 1.545, + "epoch": 0.18682021753039027, + "grad_norm": 0.171875, + "learning_rate": 0.0001776699029126214, + "loss": 1.7084, "step": 146 }, { - "epoch": 0.19, - "grad_norm": 0.228515625, - "learning_rate": 0.00013974358974358974, - "loss": 1.5223, + "epoch": 0.18809980806142035, + "grad_norm": 0.1640625, + "learning_rate": 0.0001771844660194175, + "loss": 1.6641, "step": 147 }, { - "epoch": 0.19, - "grad_norm": 0.2275390625, - "learning_rate": 0.00013846153846153847, - "loss": 1.433, + "epoch": 0.18937939859245043, + "grad_norm": 0.162109375, + "learning_rate": 0.00017669902912621362, + "loss": 1.5855, "step": 148 }, { - "epoch": 0.19, - "grad_norm": 0.2353515625, - "learning_rate": 0.00013717948717948718, - "loss": 1.554, + "epoch": 0.19065898912348048, + "grad_norm": 0.158203125, + "learning_rate": 0.00017621359223300972, + "loss": 1.7034, "step": 149 }, { - "epoch": 0.19, - "grad_norm": 0.2314453125, - "learning_rate": 0.0001358974358974359, - "loss": 1.5447, - "step": 150 - }, - { - "epoch": 0.19, - "eval_loss": 1.5118788480758667, - "eval_runtime": 109.1366, - "eval_samples_per_second": 45.814, - "eval_steps_per_second": 1.439, + "epoch": 0.19193857965451055, + "grad_norm": 0.16015625, + "learning_rate": 0.00017572815533980585, + "loss": 1.7136, "step": 150 }, { - "epoch": 0.19, - "grad_norm": 0.259765625, - "learning_rate": 0.00013461538461538464, - "loss": 1.4771, + "epoch": 0.19321817018554063, + "grad_norm": 0.189453125, + "learning_rate": 0.00017524271844660195, + "loss": 1.6045, "step": 151 }, { - "epoch": 0.19, - "grad_norm": 0.240234375, - "learning_rate": 0.00013333333333333334, - "loss": 1.5853, + "epoch": 0.1944977607165707, + "grad_norm": 0.1630859375, + "learning_rate": 0.00017475728155339805, + "loss": 1.7295, "step": 152 }, { - "epoch": 0.2, - "grad_norm": 0.2197265625, - "learning_rate": 0.00013205128205128204, - "loss": 1.5237, + "epoch": 0.19577735124760076, + "grad_norm": 0.1630859375, + "learning_rate": 0.00017427184466019418, + "loss": 1.6617, "step": 153 }, { - "epoch": 0.2, - "grad_norm": 0.2578125, - "learning_rate": 0.00013076923076923077, - "loss": 1.4963, + "epoch": 0.19705694177863084, + "grad_norm": 0.177734375, + "learning_rate": 0.00017378640776699028, + "loss": 1.6196, "step": 154 }, { - "epoch": 0.2, - "grad_norm": 0.2373046875, - "learning_rate": 0.0001294871794871795, - "loss": 1.5393, + "epoch": 0.19833653230966092, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001733009708737864, + "loss": 1.6836, "step": 155 }, { - "epoch": 0.2, - "grad_norm": 0.220703125, - "learning_rate": 0.00012820512820512823, - "loss": 1.4725, + "epoch": 0.19961612284069097, + "grad_norm": 0.16015625, + "learning_rate": 0.00017281553398058253, + "loss": 1.6273, "step": 156 }, { - "epoch": 0.2, - "grad_norm": 0.220703125, - "learning_rate": 0.00012692307692307693, - "loss": 1.5187, + "epoch": 0.20089571337172105, + "grad_norm": 0.16796875, + "learning_rate": 0.00017233009708737864, + "loss": 1.6758, "step": 157 }, { - "epoch": 0.2, - "grad_norm": 0.251953125, - "learning_rate": 0.00012564102564102564, - "loss": 1.5263, + "epoch": 0.20217530390275112, + "grad_norm": 0.1796875, + "learning_rate": 0.00017184466019417476, + "loss": 1.6883, "step": 158 }, { - "epoch": 0.2, - "grad_norm": 0.2470703125, - "learning_rate": 0.00012435897435897437, - "loss": 1.5388, + "epoch": 0.2034548944337812, + "grad_norm": 0.16015625, + "learning_rate": 0.00017135922330097086, + "loss": 1.676, "step": 159 }, { - "epoch": 0.2, - "grad_norm": 0.224609375, - "learning_rate": 0.0001230769230769231, - "loss": 1.4606, + "epoch": 0.20473448496481125, + "grad_norm": 0.1650390625, + "learning_rate": 0.000170873786407767, + "loss": 1.6166, "step": 160 }, { - "epoch": 0.2, - "eval_loss": 1.5093975067138672, - "eval_runtime": 109.3522, - "eval_samples_per_second": 45.724, - "eval_steps_per_second": 1.436, + "epoch": 0.20473448496481125, + "eval_loss": 1.6558058261871338, + "eval_runtime": 103.0857, + "eval_samples_per_second": 48.503, + "eval_steps_per_second": 1.523, "step": 160 }, { - "epoch": 0.21, - "grad_norm": 0.2392578125, - "learning_rate": 0.00012179487179487179, - "loss": 1.5584, + "epoch": 0.20601407549584133, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001703883495145631, + "loss": 1.6962, "step": 161 }, { - "epoch": 0.21, - "grad_norm": 0.224609375, - "learning_rate": 0.00012051282051282052, - "loss": 1.4941, + "epoch": 0.2072936660268714, + "grad_norm": 0.1806640625, + "learning_rate": 0.00016990291262135922, + "loss": 1.643, "step": 162 }, { - "epoch": 0.21, - "grad_norm": 0.236328125, - "learning_rate": 0.00011923076923076923, - "loss": 1.5283, + "epoch": 0.20857325655790146, + "grad_norm": 0.1650390625, + "learning_rate": 0.00016941747572815535, + "loss": 1.6716, "step": 163 }, { - "epoch": 0.21, - "grad_norm": 0.2373046875, - "learning_rate": 0.00011794871794871796, - "loss": 1.4864, + "epoch": 0.20985284708893154, + "grad_norm": 0.181640625, + "learning_rate": 0.00016893203883495145, + "loss": 1.6173, "step": 164 }, { - "epoch": 0.21, - "grad_norm": 0.24609375, - "learning_rate": 0.00011666666666666668, - "loss": 1.4398, + "epoch": 0.21113243761996162, + "grad_norm": 0.1728515625, + "learning_rate": 0.00016844660194174758, + "loss": 1.5833, "step": 165 }, { - "epoch": 0.21, - "grad_norm": 0.251953125, - "learning_rate": 0.00011538461538461538, - "loss": 1.514, + "epoch": 0.2124120281509917, + "grad_norm": 0.1884765625, + "learning_rate": 0.00016796116504854368, + "loss": 1.6686, "step": 166 }, { - "epoch": 0.21, - "grad_norm": 0.25390625, - "learning_rate": 0.0001141025641025641, - "loss": 1.5578, + "epoch": 0.21369161868202174, + "grad_norm": 0.1884765625, + "learning_rate": 0.0001674757281553398, + "loss": 1.6964, "step": 167 }, { - "epoch": 0.21, - "grad_norm": 0.2353515625, - "learning_rate": 0.00011282051282051283, - "loss": 1.5682, + "epoch": 0.21497120921305182, + "grad_norm": 0.1796875, + "learning_rate": 0.00016699029126213594, + "loss": 1.6934, "step": 168 }, { - "epoch": 0.22, - "grad_norm": 0.23046875, - "learning_rate": 0.00011153846153846154, - "loss": 1.4911, + "epoch": 0.2162507997440819, + "grad_norm": 0.169921875, + "learning_rate": 0.00016650485436893204, + "loss": 1.6628, "step": 169 }, { - "epoch": 0.22, - "grad_norm": 0.236328125, - "learning_rate": 0.00011025641025641027, - "loss": 1.5906, - "step": 170 - }, - { - "epoch": 0.22, - "eval_loss": 1.5074635744094849, - "eval_runtime": 108.9797, - "eval_samples_per_second": 45.88, - "eval_steps_per_second": 1.441, + "epoch": 0.21753039027511195, + "grad_norm": 0.1826171875, + "learning_rate": 0.00016601941747572817, + "loss": 1.7518, "step": 170 }, { - "epoch": 0.22, - "grad_norm": 0.271484375, - "learning_rate": 0.00010897435897435896, - "loss": 1.5266, + "epoch": 0.21880998080614203, + "grad_norm": 0.203125, + "learning_rate": 0.00016553398058252427, + "loss": 1.6653, "step": 171 }, { - "epoch": 0.22, - "grad_norm": 0.23828125, - "learning_rate": 0.0001076923076923077, - "loss": 1.4515, + "epoch": 0.2200895713371721, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001650485436893204, + "loss": 1.6052, "step": 172 }, { - "epoch": 0.22, - "grad_norm": 0.2451171875, - "learning_rate": 0.00010641025641025641, - "loss": 1.5492, + "epoch": 0.22136916186820219, + "grad_norm": 0.1728515625, + "learning_rate": 0.0001645631067961165, + "loss": 1.6775, "step": 173 }, { - "epoch": 0.22, - "grad_norm": 0.2314453125, - "learning_rate": 0.00010512820512820514, - "loss": 1.4961, + "epoch": 0.22264875239923224, + "grad_norm": 0.18359375, + "learning_rate": 0.00016407766990291262, + "loss": 1.6516, "step": 174 }, { - "epoch": 0.22, - "grad_norm": 0.2421875, - "learning_rate": 0.00010384615384615386, - "loss": 1.5594, + "epoch": 0.22392834293026231, + "grad_norm": 0.1796875, + "learning_rate": 0.00016359223300970875, + "loss": 1.7263, "step": 175 }, { - "epoch": 0.23, - "grad_norm": 0.2275390625, - "learning_rate": 0.00010256410256410256, - "loss": 1.5469, + "epoch": 0.2252079334612924, + "grad_norm": 0.1591796875, + "learning_rate": 0.00016310679611650485, + "loss": 1.6874, "step": 176 }, { - "epoch": 0.23, - "grad_norm": 0.240234375, - "learning_rate": 0.00010128205128205129, - "loss": 1.4925, + "epoch": 0.22648752399232247, + "grad_norm": 0.1845703125, + "learning_rate": 0.00016262135922330098, + "loss": 1.6519, "step": 177 }, { - "epoch": 0.23, - "grad_norm": 0.2578125, - "learning_rate": 0.0001, - "loss": 1.4808, + "epoch": 0.22776711452335252, + "grad_norm": 0.1875, + "learning_rate": 0.00016213592233009708, + "loss": 1.6241, "step": 178 }, { - "epoch": 0.23, - "grad_norm": 0.251953125, - "learning_rate": 9.871794871794872e-05, - "loss": 1.4963, + "epoch": 0.2290467050543826, + "grad_norm": 0.1904296875, + "learning_rate": 0.0001616504854368932, + "loss": 1.6657, "step": 179 }, { - "epoch": 0.23, - "grad_norm": 0.2578125, - "learning_rate": 9.743589743589744e-05, - "loss": 1.5397, + "epoch": 0.23032629558541268, + "grad_norm": 0.20703125, + "learning_rate": 0.0001611650485436893, + "loss": 1.6658, "step": 180 }, { - "epoch": 0.23, - "eval_loss": 1.5060302019119263, - "eval_runtime": 109.1672, - "eval_samples_per_second": 45.801, - "eval_steps_per_second": 1.438, + "epoch": 0.23032629558541268, + "eval_loss": 1.6510590314865112, + "eval_runtime": 103.101, + "eval_samples_per_second": 48.496, + "eval_steps_per_second": 1.523, "step": 180 }, { - "epoch": 0.23, - "grad_norm": 0.2451171875, - "learning_rate": 9.615384615384617e-05, - "loss": 1.5202, + "epoch": 0.23160588611644273, + "grad_norm": 0.181640625, + "learning_rate": 0.00016067961165048544, + "loss": 1.6732, "step": 181 }, { - "epoch": 0.23, - "grad_norm": 0.2158203125, - "learning_rate": 9.487179487179487e-05, - "loss": 1.488, + "epoch": 0.2328854766474728, + "grad_norm": 0.1611328125, + "learning_rate": 0.00016019417475728157, + "loss": 1.6288, "step": 182 }, { - "epoch": 0.23, - "grad_norm": 0.25390625, - "learning_rate": 9.35897435897436e-05, - "loss": 1.5124, + "epoch": 0.23416506717850288, + "grad_norm": 0.181640625, + "learning_rate": 0.00015970873786407767, + "loss": 1.65, "step": 183 }, { - "epoch": 0.24, - "grad_norm": 0.2578125, - "learning_rate": 9.230769230769232e-05, - "loss": 1.4791, + "epoch": 0.23544465770953296, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001592233009708738, + "loss": 1.643, "step": 184 }, { - "epoch": 0.24, - "grad_norm": 0.2421875, - "learning_rate": 9.102564102564103e-05, - "loss": 1.6039, + "epoch": 0.236724248240563, + "grad_norm": 0.169921875, + "learning_rate": 0.0001587378640776699, + "loss": 1.7358, "step": 185 }, { - "epoch": 0.24, - "grad_norm": 0.2353515625, - "learning_rate": 8.974358974358975e-05, - "loss": 1.5389, + "epoch": 0.2380038387715931, + "grad_norm": 0.1689453125, + "learning_rate": 0.00015825242718446603, + "loss": 1.6775, "step": 186 }, { - "epoch": 0.24, - "grad_norm": 0.2216796875, - "learning_rate": 8.846153846153847e-05, - "loss": 1.5542, + "epoch": 0.23928342930262317, + "grad_norm": 0.171875, + "learning_rate": 0.00015776699029126213, + "loss": 1.7007, "step": 187 }, { - "epoch": 0.24, - "grad_norm": 0.2353515625, - "learning_rate": 8.717948717948718e-05, - "loss": 1.5131, + "epoch": 0.24056301983365322, + "grad_norm": 0.16796875, + "learning_rate": 0.00015728155339805825, + "loss": 1.6372, "step": 188 }, { - "epoch": 0.24, - "grad_norm": 0.248046875, - "learning_rate": 8.58974358974359e-05, - "loss": 1.5371, + "epoch": 0.2418426103646833, + "grad_norm": 0.2041015625, + "learning_rate": 0.00015679611650485438, + "loss": 1.7057, "step": 189 }, { - "epoch": 0.24, - "grad_norm": 0.2392578125, - "learning_rate": 8.461538461538461e-05, - "loss": 1.5264, - "step": 190 - }, - { - "epoch": 0.24, - "eval_loss": 1.5048484802246094, - "eval_runtime": 109.1034, - "eval_samples_per_second": 45.828, - "eval_steps_per_second": 1.439, + "epoch": 0.24312220089571338, + "grad_norm": 0.19140625, + "learning_rate": 0.00015631067961165048, + "loss": 1.6965, "step": 190 }, { - "epoch": 0.24, - "grad_norm": 0.236328125, - "learning_rate": 8.333333333333334e-05, - "loss": 1.5295, + "epoch": 0.24440179142674345, + "grad_norm": 0.1953125, + "learning_rate": 0.0001558252427184466, + "loss": 1.6888, "step": 191 }, { - "epoch": 0.25, - "grad_norm": 0.2333984375, - "learning_rate": 8.205128205128205e-05, - "loss": 1.5677, + "epoch": 0.2456813819577735, + "grad_norm": 0.181640625, + "learning_rate": 0.0001553398058252427, + "loss": 1.7146, "step": 192 }, { - "epoch": 0.25, - "grad_norm": 0.255859375, - "learning_rate": 8.076923076923078e-05, - "loss": 1.4473, + "epoch": 0.24696097248880358, + "grad_norm": 0.2001953125, + "learning_rate": 0.00015485436893203884, + "loss": 1.5767, "step": 193 }, { - "epoch": 0.25, - "grad_norm": 0.2353515625, - "learning_rate": 7.948717948717948e-05, - "loss": 1.5756, + "epoch": 0.24824056301983366, + "grad_norm": 0.185546875, + "learning_rate": 0.00015436893203883497, + "loss": 1.7411, "step": 194 }, { - "epoch": 0.25, - "grad_norm": 0.236328125, - "learning_rate": 7.820512820512821e-05, - "loss": 1.4832, + "epoch": 0.2495201535508637, + "grad_norm": 0.181640625, + "learning_rate": 0.00015388349514563107, + "loss": 1.6321, "step": 195 }, { - "epoch": 0.25, - "grad_norm": 0.2294921875, - "learning_rate": 7.692307692307693e-05, - "loss": 1.5144, + "epoch": 0.2507997440818938, + "grad_norm": 0.1728515625, + "learning_rate": 0.0001533980582524272, + "loss": 1.6406, "step": 196 }, { - "epoch": 0.25, - "grad_norm": 0.248046875, - "learning_rate": 7.564102564102564e-05, - "loss": 1.5077, + "epoch": 0.25207933461292387, + "grad_norm": 0.1845703125, + "learning_rate": 0.0001529126213592233, + "loss": 1.6426, "step": 197 }, { - "epoch": 0.25, - "grad_norm": 0.21875, - "learning_rate": 7.435897435897436e-05, - "loss": 1.5042, + "epoch": 0.2533589251439539, + "grad_norm": 0.1689453125, + "learning_rate": 0.00015242718446601943, + "loss": 1.643, "step": 198 }, { - "epoch": 0.25, - "grad_norm": 0.22265625, - "learning_rate": 7.307692307692307e-05, - "loss": 1.5022, + "epoch": 0.254638515674984, + "grad_norm": 0.1669921875, + "learning_rate": 0.00015194174757281553, + "loss": 1.6484, "step": 199 }, { - "epoch": 0.26, - "grad_norm": 0.265625, - "learning_rate": 7.17948717948718e-05, - "loss": 1.484, + "epoch": 0.2559181062060141, + "grad_norm": 0.2216796875, + "learning_rate": 0.00015145631067961166, + "loss": 1.6357, "step": 200 }, { - "epoch": 0.26, - "eval_loss": 1.503655195236206, - "eval_runtime": 117.4418, - "eval_samples_per_second": 42.574, - "eval_steps_per_second": 1.337, + "epoch": 0.2559181062060141, + "eval_loss": 1.6480427980422974, + "eval_runtime": 103.1119, + "eval_samples_per_second": 48.491, + "eval_steps_per_second": 1.523, "step": 200 }, { - "epoch": 0.26, - "grad_norm": 0.2431640625, - "learning_rate": 7.051282051282052e-05, - "loss": 1.4973, + "epoch": 0.2571976967370441, + "grad_norm": 0.189453125, + "learning_rate": 0.00015097087378640778, + "loss": 1.6421, "step": 201 }, { - "epoch": 0.26, - "grad_norm": 0.2314453125, - "learning_rate": 6.923076923076924e-05, - "loss": 1.5267, + "epoch": 0.25847728726807423, + "grad_norm": 0.1826171875, + "learning_rate": 0.00015048543689320389, + "loss": 1.6995, "step": 202 }, { - "epoch": 0.26, - "grad_norm": 0.2236328125, - "learning_rate": 6.794871794871795e-05, - "loss": 1.5657, + "epoch": 0.2597568777991043, + "grad_norm": 0.1806640625, + "learning_rate": 0.00015000000000000001, + "loss": 1.7139, "step": 203 }, { - "epoch": 0.26, - "grad_norm": 0.23046875, - "learning_rate": 6.666666666666667e-05, - "loss": 1.5184, + "epoch": 0.26103646833013433, + "grad_norm": 0.1865234375, + "learning_rate": 0.00014951456310679611, + "loss": 1.6852, "step": 204 }, { - "epoch": 0.26, - "grad_norm": 0.23828125, - "learning_rate": 6.538461538461539e-05, - "loss": 1.4522, + "epoch": 0.26231605886116444, + "grad_norm": 0.2001953125, + "learning_rate": 0.00014902912621359224, + "loss": 1.6057, "step": 205 }, { - "epoch": 0.26, - "grad_norm": 0.2451171875, - "learning_rate": 6.410256410256412e-05, - "loss": 1.5309, + "epoch": 0.2635956493921945, + "grad_norm": 0.1767578125, + "learning_rate": 0.00014854368932038834, + "loss": 1.6795, "step": 206 }, { - "epoch": 0.26, - "grad_norm": 0.2373046875, - "learning_rate": 6.282051282051282e-05, - "loss": 1.4503, + "epoch": 0.2648752399232246, + "grad_norm": 0.1826171875, + "learning_rate": 0.00014805825242718447, + "loss": 1.5875, "step": 207 }, { - "epoch": 0.27, - "grad_norm": 0.2265625, - "learning_rate": 6.153846153846155e-05, - "loss": 1.5173, + "epoch": 0.26615483045425464, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001475728155339806, + "loss": 1.6799, "step": 208 }, { - "epoch": 0.27, - "grad_norm": 0.275390625, - "learning_rate": 6.025641025641026e-05, - "loss": 1.5112, + "epoch": 0.2674344209852847, + "grad_norm": 0.208984375, + "learning_rate": 0.0001470873786407767, + "loss": 1.6385, "step": 209 }, { - "epoch": 0.27, - "grad_norm": 0.234375, - "learning_rate": 5.897435897435898e-05, - "loss": 1.5342, - "step": 210 - }, - { - "epoch": 0.27, - "eval_loss": 1.5029850006103516, - "eval_runtime": 118.7123, - "eval_samples_per_second": 42.119, - "eval_steps_per_second": 1.323, + "epoch": 0.2687140115163148, + "grad_norm": 0.169921875, + "learning_rate": 0.00014660194174757283, + "loss": 1.7025, "step": 210 }, { - "epoch": 0.27, - "grad_norm": 0.2333984375, - "learning_rate": 5.769230769230769e-05, - "loss": 1.5216, + "epoch": 0.26999360204734485, + "grad_norm": 0.185546875, + "learning_rate": 0.00014611650485436893, + "loss": 1.6543, "step": 211 }, { - "epoch": 0.27, - "grad_norm": 0.2373046875, - "learning_rate": 5.6410256410256414e-05, - "loss": 1.5005, + "epoch": 0.2712731925783749, + "grad_norm": 0.1806640625, + "learning_rate": 0.00014563106796116506, + "loss": 1.6421, "step": 212 }, { - "epoch": 0.27, - "grad_norm": 0.23046875, - "learning_rate": 5.512820512820514e-05, - "loss": 1.5372, + "epoch": 0.272552783109405, + "grad_norm": 0.185546875, + "learning_rate": 0.0001451456310679612, + "loss": 1.6629, "step": 213 }, { - "epoch": 0.27, - "grad_norm": 0.2353515625, - "learning_rate": 5.384615384615385e-05, - "loss": 1.4916, + "epoch": 0.27383237364043506, + "grad_norm": 0.1806640625, + "learning_rate": 0.0001446601941747573, + "loss": 1.632, "step": 214 }, { - "epoch": 0.28, - "grad_norm": 0.232421875, - "learning_rate": 5.256410256410257e-05, - "loss": 1.4511, + "epoch": 0.2751119641714651, + "grad_norm": 0.1904296875, + "learning_rate": 0.00014417475728155342, + "loss": 1.59, "step": 215 }, { - "epoch": 0.28, - "grad_norm": 0.251953125, - "learning_rate": 5.128205128205128e-05, - "loss": 1.4984, + "epoch": 0.2763915547024952, + "grad_norm": 0.1962890625, + "learning_rate": 0.00014368932038834952, + "loss": 1.6291, "step": 216 }, { - "epoch": 0.28, - "grad_norm": 0.275390625, - "learning_rate": 5e-05, - "loss": 1.5332, + "epoch": 0.27767114523352526, + "grad_norm": 0.21875, + "learning_rate": 0.00014320388349514565, + "loss": 1.6759, "step": 217 }, { - "epoch": 0.28, - "grad_norm": 0.2412109375, - "learning_rate": 4.871794871794872e-05, - "loss": 1.558, + "epoch": 0.27895073576455537, + "grad_norm": 0.19140625, + "learning_rate": 0.00014271844660194175, + "loss": 1.6989, "step": 218 }, { - "epoch": 0.28, - "grad_norm": 0.255859375, - "learning_rate": 4.7435897435897435e-05, - "loss": 1.5057, + "epoch": 0.2802303262955854, + "grad_norm": 0.1845703125, + "learning_rate": 0.00014223300970873787, + "loss": 1.6291, "step": 219 }, { - "epoch": 0.28, - "grad_norm": 0.2431640625, - "learning_rate": 4.615384615384616e-05, - "loss": 1.5374, + "epoch": 0.28150991682661547, + "grad_norm": 0.1767578125, + "learning_rate": 0.000141747572815534, + "loss": 1.6757, "step": 220 }, { - "epoch": 0.28, - "eval_loss": 1.5021294355392456, - "eval_runtime": 120.5989, - "eval_samples_per_second": 41.46, - "eval_steps_per_second": 1.302, + "epoch": 0.28150991682661547, + "eval_loss": 1.645837664604187, + "eval_runtime": 103.1004, + "eval_samples_per_second": 48.496, + "eval_steps_per_second": 1.523, "step": 220 }, { - "epoch": 0.28, - "grad_norm": 0.26953125, - "learning_rate": 4.4871794871794874e-05, - "loss": 1.5332, + "epoch": 0.2827895073576456, + "grad_norm": 0.1982421875, + "learning_rate": 0.0001412621359223301, + "loss": 1.6866, "step": 221 }, { - "epoch": 0.28, - "grad_norm": 0.236328125, - "learning_rate": 4.358974358974359e-05, - "loss": 1.5205, + "epoch": 0.2840690978886756, + "grad_norm": 0.189453125, + "learning_rate": 0.00014077669902912623, + "loss": 1.6816, "step": 222 }, { - "epoch": 0.29, - "grad_norm": 0.224609375, - "learning_rate": 4.230769230769231e-05, - "loss": 1.497, + "epoch": 0.2853486884197057, + "grad_norm": 0.1875, + "learning_rate": 0.00014029126213592233, + "loss": 1.6683, "step": 223 }, { - "epoch": 0.29, - "grad_norm": 0.22265625, - "learning_rate": 4.1025641025641023e-05, - "loss": 1.5158, + "epoch": 0.2866282789507358, + "grad_norm": 0.1708984375, + "learning_rate": 0.00013980582524271846, + "loss": 1.6537, "step": 224 }, { - "epoch": 0.29, - "grad_norm": 0.2421875, - "learning_rate": 3.974358974358974e-05, - "loss": 1.4847, + "epoch": 0.28790786948176583, + "grad_norm": 0.2197265625, + "learning_rate": 0.00013932038834951456, + "loss": 1.6292, "step": 225 }, { - "epoch": 0.29, - "grad_norm": 0.244140625, - "learning_rate": 3.846153846153846e-05, - "loss": 1.4456, + "epoch": 0.2891874600127959, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001388349514563107, + "loss": 1.5684, "step": 226 }, { - "epoch": 0.29, - "grad_norm": 0.228515625, - "learning_rate": 3.717948717948718e-05, - "loss": 1.5334, + "epoch": 0.290467050543826, + "grad_norm": 0.1826171875, + "learning_rate": 0.00013834951456310682, + "loss": 1.6822, "step": 227 }, { - "epoch": 0.29, - "grad_norm": 0.25390625, - "learning_rate": 3.58974358974359e-05, - "loss": 1.5238, + "epoch": 0.29174664107485604, + "grad_norm": 0.2216796875, + "learning_rate": 0.00013786407766990292, + "loss": 1.668, "step": 228 }, { - "epoch": 0.29, - "grad_norm": 0.2490234375, - "learning_rate": 3.461538461538462e-05, - "loss": 1.4388, + "epoch": 0.2930262316058861, + "grad_norm": 0.193359375, + "learning_rate": 0.00013737864077669905, + "loss": 1.564, "step": 229 }, { - "epoch": 0.29, - "grad_norm": 0.2353515625, - "learning_rate": 3.3333333333333335e-05, - "loss": 1.5555, - "step": 230 - }, - { - "epoch": 0.29, - "eval_loss": 1.5019102096557617, - "eval_runtime": 109.1608, - "eval_samples_per_second": 45.804, - "eval_steps_per_second": 1.438, + "epoch": 0.2943058221369162, + "grad_norm": 0.193359375, + "learning_rate": 0.00013689320388349515, + "loss": 1.6721, "step": 230 }, { - "epoch": 0.3, - "grad_norm": 0.267578125, - "learning_rate": 3.205128205128206e-05, - "loss": 1.5535, + "epoch": 0.29558541266794625, + "grad_norm": 0.2099609375, + "learning_rate": 0.00013640776699029128, + "loss": 1.6735, "step": 231 }, { - "epoch": 0.3, - "grad_norm": 0.240234375, - "learning_rate": 3.0769230769230774e-05, - "loss": 1.4322, + "epoch": 0.29686500319897635, + "grad_norm": 0.1904296875, + "learning_rate": 0.0001359223300970874, + "loss": 1.5751, "step": 232 }, { - "epoch": 0.3, - "grad_norm": 0.2470703125, - "learning_rate": 2.948717948717949e-05, - "loss": 1.471, + "epoch": 0.2981445937300064, + "grad_norm": 0.2021484375, + "learning_rate": 0.0001354368932038835, + "loss": 1.6004, "step": 233 }, { - "epoch": 0.3, - "grad_norm": 0.25, - "learning_rate": 2.8205128205128207e-05, - "loss": 1.5444, + "epoch": 0.29942418426103645, + "grad_norm": 0.21875, + "learning_rate": 0.00013495145631067963, + "loss": 1.6797, "step": 234 }, { - "epoch": 0.3, - "grad_norm": 0.25, - "learning_rate": 2.6923076923076923e-05, - "loss": 1.5331, + "epoch": 0.30070377479206656, + "grad_norm": 0.1884765625, + "learning_rate": 0.00013446601941747573, + "loss": 1.6688, "step": 235 }, { - "epoch": 0.3, - "grad_norm": 0.236328125, - "learning_rate": 2.564102564102564e-05, - "loss": 1.5355, + "epoch": 0.3019833653230966, + "grad_norm": 0.19921875, + "learning_rate": 0.00013398058252427186, + "loss": 1.6719, "step": 236 }, { - "epoch": 0.3, - "grad_norm": 0.244140625, - "learning_rate": 2.435897435897436e-05, - "loss": 1.4883, + "epoch": 0.30326295585412666, + "grad_norm": 0.181640625, + "learning_rate": 0.00013349514563106796, + "loss": 1.6186, "step": 237 }, { - "epoch": 0.3, - "grad_norm": 0.263671875, - "learning_rate": 2.307692307692308e-05, - "loss": 1.5266, + "epoch": 0.30454254638515676, + "grad_norm": 0.19921875, + "learning_rate": 0.0001330097087378641, + "loss": 1.6694, "step": 238 }, { - "epoch": 0.31, - "grad_norm": 0.2451171875, - "learning_rate": 2.1794871794871795e-05, - "loss": 1.5576, + "epoch": 0.3058221369161868, + "grad_norm": 0.185546875, + "learning_rate": 0.00013252427184466022, + "loss": 1.7104, "step": 239 }, { - "epoch": 0.31, - "grad_norm": 0.259765625, - "learning_rate": 2.0512820512820512e-05, - "loss": 1.5615, + "epoch": 0.30710172744721687, + "grad_norm": 0.189453125, + "learning_rate": 0.00013203883495145632, + "loss": 1.6939, "step": 240 }, { - "epoch": 0.31, - "eval_loss": 1.5016273260116577, - "eval_runtime": 109.191, - "eval_samples_per_second": 45.791, - "eval_steps_per_second": 1.438, + "epoch": 0.30710172744721687, + "eval_loss": 1.6442595720291138, + "eval_runtime": 103.1037, + "eval_samples_per_second": 48.495, + "eval_steps_per_second": 1.523, "step": 240 }, { - "epoch": 0.31, - "grad_norm": 0.23828125, - "learning_rate": 1.923076923076923e-05, - "loss": 1.531, + "epoch": 0.30838131797824697, + "grad_norm": 0.1904296875, + "learning_rate": 0.00013155339805825245, + "loss": 1.6986, "step": 241 }, { - "epoch": 0.31, - "grad_norm": 0.2470703125, - "learning_rate": 1.794871794871795e-05, - "loss": 1.4563, + "epoch": 0.309660908509277, + "grad_norm": 0.1982421875, + "learning_rate": 0.00013106796116504855, + "loss": 1.5877, "step": 242 }, { - "epoch": 0.31, - "grad_norm": 0.24609375, - "learning_rate": 1.6666666666666667e-05, - "loss": 1.4773, + "epoch": 0.31094049904030713, + "grad_norm": 0.1884765625, + "learning_rate": 0.00013058252427184468, + "loss": 1.6292, "step": 243 }, { - "epoch": 0.31, - "grad_norm": 0.2431640625, - "learning_rate": 1.5384615384615387e-05, - "loss": 1.4496, + "epoch": 0.3122200895713372, + "grad_norm": 0.203125, + "learning_rate": 0.00013009708737864078, + "loss": 1.6097, "step": 244 }, { - "epoch": 0.31, - "grad_norm": 0.228515625, - "learning_rate": 1.4102564102564104e-05, - "loss": 1.5677, + "epoch": 0.31349968010236723, + "grad_norm": 0.169921875, + "learning_rate": 0.0001296116504854369, + "loss": 1.7044, "step": 245 }, { - "epoch": 0.31, - "grad_norm": 0.2578125, - "learning_rate": 1.282051282051282e-05, - "loss": 1.4935, + "epoch": 0.31477927063339733, + "grad_norm": 0.185546875, + "learning_rate": 0.00012912621359223304, + "loss": 1.6325, "step": 246 }, { - "epoch": 0.32, - "grad_norm": 0.2216796875, - "learning_rate": 1.153846153846154e-05, - "loss": 1.5357, + "epoch": 0.3160588611644274, + "grad_norm": 0.1806640625, + "learning_rate": 0.00012864077669902914, + "loss": 1.6831, "step": 247 }, { - "epoch": 0.32, - "grad_norm": 0.259765625, - "learning_rate": 1.0256410256410256e-05, - "loss": 1.5693, + "epoch": 0.31733845169545744, + "grad_norm": 0.203125, + "learning_rate": 0.00012815533980582526, + "loss": 1.7119, "step": 248 }, { - "epoch": 0.32, - "grad_norm": 0.271484375, - "learning_rate": 8.974358974358976e-06, - "loss": 1.5037, + "epoch": 0.31861804222648754, + "grad_norm": 0.201171875, + "learning_rate": 0.00012766990291262137, + "loss": 1.6437, "step": 249 }, { - "epoch": 0.32, - "grad_norm": 0.236328125, - "learning_rate": 7.692307692307694e-06, - "loss": 1.4786, - "step": 250 - }, - { - "epoch": 0.32, - "eval_loss": 1.501659631729126, - "eval_runtime": 109.1682, - "eval_samples_per_second": 45.801, - "eval_steps_per_second": 1.438, + "epoch": 0.3198976327575176, + "grad_norm": 0.1865234375, + "learning_rate": 0.0001271844660194175, + "loss": 1.6205, "step": 250 }, { - "epoch": 0.32, - "grad_norm": 0.2412109375, - "learning_rate": 6.41025641025641e-06, - "loss": 1.5624, + "epoch": 0.32117722328854764, + "grad_norm": 0.193359375, + "learning_rate": 0.00012669902912621362, + "loss": 1.7224, "step": 251 }, { - "epoch": 0.32, - "grad_norm": 0.232421875, - "learning_rate": 5.128205128205128e-06, - "loss": 1.4851, + "epoch": 0.32245681381957775, + "grad_norm": 0.1904296875, + "learning_rate": 0.00012621359223300972, + "loss": 1.6414, "step": 252 }, { - "epoch": 0.32, - "grad_norm": 0.2451171875, - "learning_rate": 3.846153846153847e-06, - "loss": 1.4184, + "epoch": 0.3237364043506078, + "grad_norm": 0.232421875, + "learning_rate": 0.00012572815533980585, + "loss": 1.5297, "step": 253 }, { - "epoch": 0.33, - "grad_norm": 0.2353515625, - "learning_rate": 2.564102564102564e-06, - "loss": 1.4701, + "epoch": 0.32501599488163785, + "grad_norm": 0.177734375, + "learning_rate": 0.00012524271844660195, + "loss": 1.5789, "step": 254 }, { - "epoch": 0.33, - "grad_norm": 0.244140625, - "learning_rate": 1.282051282051282e-06, - "loss": 1.5079, + "epoch": 0.32629558541266795, + "grad_norm": 0.1767578125, + "learning_rate": 0.00012475728155339805, + "loss": 1.6328, "step": 255 }, { - "epoch": 0.33, - "grad_norm": 0.2216796875, - "learning_rate": 0.0, - "loss": 1.5386, + "epoch": 0.327575175943698, + "grad_norm": 0.2109375, + "learning_rate": 0.00012427184466019418, + "loss": 1.6872, "step": 256 }, { - "epoch": 0.33, - "step": 256, - "total_flos": 1.8280284932328653e+17, - "train_loss": 1.802713230252266, - "train_runtime": 3920.872, - "train_samples_per_second": 4.179, - "train_steps_per_second": 0.065 + "epoch": 0.3288547664747281, + "grad_norm": 0.19140625, + "learning_rate": 0.00012378640776699028, + "loss": 1.6293, + "step": 257 }, { - "epoch": 0.33, - "eval_loss": 1.5016273260116577, - "eval_runtime": 109.2744, - "eval_samples_per_second": 45.756, - "eval_steps_per_second": 1.437, - "step": 256 + "epoch": 0.33013435700575816, + "grad_norm": 0.201171875, + "learning_rate": 0.0001233009708737864, + "loss": 1.6105, + "step": 258 + }, + { + "epoch": 0.3314139475367882, + "grad_norm": 0.197265625, + "learning_rate": 0.0001228155339805825, + "loss": 1.6522, + "step": 259 + }, + { + "epoch": 0.3326935380678183, + "grad_norm": 0.2001953125, + "learning_rate": 0.00012233009708737864, + "loss": 1.6715, + "step": 260 + }, + { + "epoch": 0.3326935380678183, + "eval_loss": 1.6419228315353394, + "eval_runtime": 103.1115, + "eval_samples_per_second": 48.491, + "eval_steps_per_second": 1.523, + "step": 260 + }, + { + "epoch": 0.33397312859884837, + "grad_norm": 0.18359375, + "learning_rate": 0.00012184466019417475, + "loss": 1.6435, + "step": 261 + }, + { + "epoch": 0.3352527191298784, + "grad_norm": 0.201171875, + "learning_rate": 0.00012135922330097087, + "loss": 1.6339, + "step": 262 + }, + { + "epoch": 0.3365323096609085, + "grad_norm": 0.2216796875, + "learning_rate": 0.00012087378640776698, + "loss": 1.6233, + "step": 263 + }, + { + "epoch": 0.3378119001919386, + "grad_norm": 0.201171875, + "learning_rate": 0.0001203883495145631, + "loss": 1.61, + "step": 264 + }, + { + "epoch": 0.3390914907229686, + "grad_norm": 0.1875, + "learning_rate": 0.00011990291262135923, + "loss": 1.6532, + "step": 265 + }, + { + "epoch": 0.34037108125399873, + "grad_norm": 0.1962890625, + "learning_rate": 0.00011941747572815534, + "loss": 1.6994, + "step": 266 + }, + { + "epoch": 0.3416506717850288, + "grad_norm": 0.189453125, + "learning_rate": 0.00011893203883495146, + "loss": 1.5847, + "step": 267 + }, + { + "epoch": 0.3429302623160589, + "grad_norm": 0.1845703125, + "learning_rate": 0.00011844660194174757, + "loss": 1.6299, + "step": 268 + }, + { + "epoch": 0.34420985284708894, + "grad_norm": 0.193359375, + "learning_rate": 0.00011796116504854368, + "loss": 1.63, + "step": 269 + }, + { + "epoch": 0.345489443378119, + "grad_norm": 0.2021484375, + "learning_rate": 0.0001174757281553398, + "loss": 1.6556, + "step": 270 + }, + { + "epoch": 0.3467690339091491, + "grad_norm": 0.193359375, + "learning_rate": 0.00011699029126213593, + "loss": 1.6967, + "step": 271 + }, + { + "epoch": 0.34804862444017914, + "grad_norm": 0.2001953125, + "learning_rate": 0.00011650485436893204, + "loss": 1.7115, + "step": 272 + }, + { + "epoch": 0.3493282149712092, + "grad_norm": 0.1982421875, + "learning_rate": 0.00011601941747572816, + "loss": 1.7578, + "step": 273 + }, + { + "epoch": 0.3506078055022393, + "grad_norm": 0.205078125, + "learning_rate": 0.00011553398058252427, + "loss": 1.6873, + "step": 274 + }, + { + "epoch": 0.35188739603326935, + "grad_norm": 0.216796875, + "learning_rate": 0.00011504854368932039, + "loss": 1.6566, + "step": 275 + }, + { + "epoch": 0.3531669865642994, + "grad_norm": 0.1865234375, + "learning_rate": 0.0001145631067961165, + "loss": 1.7911, + "step": 276 + }, + { + "epoch": 0.3544465770953295, + "grad_norm": 0.197265625, + "learning_rate": 0.00011407766990291261, + "loss": 1.6738, + "step": 277 + }, + { + "epoch": 0.35572616762635956, + "grad_norm": 0.21484375, + "learning_rate": 0.00011359223300970874, + "loss": 1.6299, + "step": 278 + }, + { + "epoch": 0.3570057581573896, + "grad_norm": 0.2109375, + "learning_rate": 0.00011310679611650486, + "loss": 1.7136, + "step": 279 + }, + { + "epoch": 0.3582853486884197, + "grad_norm": 0.185546875, + "learning_rate": 0.00011262135922330097, + "loss": 1.6764, + "step": 280 + }, + { + "epoch": 0.3582853486884197, + "eval_loss": 1.6408168077468872, + "eval_runtime": 103.085, + "eval_samples_per_second": 48.504, + "eval_steps_per_second": 1.523, + "step": 280 + }, + { + "epoch": 0.35956493921944976, + "grad_norm": 0.1796875, + "learning_rate": 0.00011213592233009709, + "loss": 1.6869, + "step": 281 + }, + { + "epoch": 0.36084452975047987, + "grad_norm": 0.1923828125, + "learning_rate": 0.0001116504854368932, + "loss": 1.6637, + "step": 282 + }, + { + "epoch": 0.3621241202815099, + "grad_norm": 0.216796875, + "learning_rate": 0.00011116504854368932, + "loss": 1.6449, + "step": 283 + }, + { + "epoch": 0.36340371081253997, + "grad_norm": 0.19140625, + "learning_rate": 0.00011067961165048544, + "loss": 1.6964, + "step": 284 + }, + { + "epoch": 0.3646833013435701, + "grad_norm": 0.2109375, + "learning_rate": 0.00011019417475728156, + "loss": 1.7385, + "step": 285 + }, + { + "epoch": 0.3659628918746001, + "grad_norm": 0.2109375, + "learning_rate": 0.00010970873786407767, + "loss": 1.6712, + "step": 286 + }, + { + "epoch": 0.3672424824056302, + "grad_norm": 0.1943359375, + "learning_rate": 0.00010922330097087379, + "loss": 1.675, + "step": 287 + }, + { + "epoch": 0.3685220729366603, + "grad_norm": 0.208984375, + "learning_rate": 0.0001087378640776699, + "loss": 1.6172, + "step": 288 + }, + { + "epoch": 0.36980166346769033, + "grad_norm": 0.185546875, + "learning_rate": 0.00010825242718446602, + "loss": 1.6774, + "step": 289 + }, + { + "epoch": 0.3710812539987204, + "grad_norm": 0.181640625, + "learning_rate": 0.00010776699029126213, + "loss": 1.6645, + "step": 290 + }, + { + "epoch": 0.3723608445297505, + "grad_norm": 0.1923828125, + "learning_rate": 0.00010728155339805826, + "loss": 1.6674, + "step": 291 + }, + { + "epoch": 0.37364043506078054, + "grad_norm": 0.1943359375, + "learning_rate": 0.00010679611650485437, + "loss": 1.6028, + "step": 292 + }, + { + "epoch": 0.37492002559181065, + "grad_norm": 0.1904296875, + "learning_rate": 0.00010631067961165049, + "loss": 1.7152, + "step": 293 + }, + { + "epoch": 0.3761996161228407, + "grad_norm": 0.1748046875, + "learning_rate": 0.0001058252427184466, + "loss": 1.6715, + "step": 294 + }, + { + "epoch": 0.37747920665387075, + "grad_norm": 0.1982421875, + "learning_rate": 0.00010533980582524272, + "loss": 1.6071, + "step": 295 + }, + { + "epoch": 0.37875879718490085, + "grad_norm": 0.208984375, + "learning_rate": 0.00010485436893203883, + "loss": 1.6542, + "step": 296 + }, + { + "epoch": 0.3800383877159309, + "grad_norm": 0.1865234375, + "learning_rate": 0.00010436893203883496, + "loss": 1.734, + "step": 297 + }, + { + "epoch": 0.38131797824696095, + "grad_norm": 0.2041015625, + "learning_rate": 0.00010388349514563107, + "loss": 1.577, + "step": 298 + }, + { + "epoch": 0.38259756877799106, + "grad_norm": 0.1923828125, + "learning_rate": 0.00010339805825242719, + "loss": 1.6174, + "step": 299 + }, + { + "epoch": 0.3838771593090211, + "grad_norm": 0.1982421875, + "learning_rate": 0.0001029126213592233, + "loss": 1.6505, + "step": 300 + }, + { + "epoch": 0.3838771593090211, + "eval_loss": 1.6392818689346313, + "eval_runtime": 103.1129, + "eval_samples_per_second": 48.491, + "eval_steps_per_second": 1.523, + "step": 300 + }, + { + "epoch": 0.38515674984005116, + "grad_norm": 0.1904296875, + "learning_rate": 0.00010242718446601942, + "loss": 1.6486, + "step": 301 + }, + { + "epoch": 0.38643634037108127, + "grad_norm": 0.1982421875, + "learning_rate": 0.00010194174757281553, + "loss": 1.6358, + "step": 302 + }, + { + "epoch": 0.3877159309021113, + "grad_norm": 0.216796875, + "learning_rate": 0.00010145631067961166, + "loss": 1.6483, + "step": 303 + }, + { + "epoch": 0.3889955214331414, + "grad_norm": 0.1962890625, + "learning_rate": 0.00010097087378640778, + "loss": 1.7252, + "step": 304 + }, + { + "epoch": 0.3902751119641715, + "grad_norm": 0.1982421875, + "learning_rate": 0.00010048543689320389, + "loss": 1.7207, + "step": 305 + }, + { + "epoch": 0.3915547024952015, + "grad_norm": 0.1865234375, + "learning_rate": 0.0001, + "loss": 1.6304, + "step": 306 + }, + { + "epoch": 0.39283429302623163, + "grad_norm": 0.1875, + "learning_rate": 9.951456310679612e-05, + "loss": 1.6087, + "step": 307 + }, + { + "epoch": 0.3941138835572617, + "grad_norm": 0.189453125, + "learning_rate": 9.902912621359223e-05, + "loss": 1.6207, + "step": 308 + }, + { + "epoch": 0.39539347408829173, + "grad_norm": 0.1982421875, + "learning_rate": 9.854368932038835e-05, + "loss": 1.618, + "step": 309 + }, + { + "epoch": 0.39667306461932184, + "grad_norm": 0.189453125, + "learning_rate": 9.805825242718448e-05, + "loss": 1.5887, + "step": 310 + }, + { + "epoch": 0.3979526551503519, + "grad_norm": 0.2041015625, + "learning_rate": 9.757281553398059e-05, + "loss": 1.6387, + "step": 311 + }, + { + "epoch": 0.39923224568138194, + "grad_norm": 0.193359375, + "learning_rate": 9.70873786407767e-05, + "loss": 1.6049, + "step": 312 + }, + { + "epoch": 0.40051183621241204, + "grad_norm": 0.2021484375, + "learning_rate": 9.660194174757282e-05, + "loss": 1.6714, + "step": 313 + }, + { + "epoch": 0.4017914267434421, + "grad_norm": 0.21484375, + "learning_rate": 9.611650485436893e-05, + "loss": 1.6432, + "step": 314 + }, + { + "epoch": 0.40307101727447214, + "grad_norm": 0.185546875, + "learning_rate": 9.563106796116505e-05, + "loss": 1.6556, + "step": 315 + }, + { + "epoch": 0.40435060780550225, + "grad_norm": 0.201171875, + "learning_rate": 9.514563106796118e-05, + "loss": 1.7007, + "step": 316 + }, + { + "epoch": 0.4056301983365323, + "grad_norm": 0.1884765625, + "learning_rate": 9.466019417475729e-05, + "loss": 1.6269, + "step": 317 + }, + { + "epoch": 0.4069097888675624, + "grad_norm": 0.19140625, + "learning_rate": 9.417475728155341e-05, + "loss": 1.602, + "step": 318 + }, + { + "epoch": 0.40818937939859246, + "grad_norm": 0.19140625, + "learning_rate": 9.368932038834952e-05, + "loss": 1.6387, + "step": 319 + }, + { + "epoch": 0.4094689699296225, + "grad_norm": 0.201171875, + "learning_rate": 9.320388349514564e-05, + "loss": 1.6849, + "step": 320 + }, + { + "epoch": 0.4094689699296225, + "eval_loss": 1.6380316019058228, + "eval_runtime": 103.1068, + "eval_samples_per_second": 48.493, + "eval_steps_per_second": 1.523, + "step": 320 + }, + { + "epoch": 0.4107485604606526, + "grad_norm": 0.189453125, + "learning_rate": 9.271844660194175e-05, + "loss": 1.6638, + "step": 321 + }, + { + "epoch": 0.41202815099168266, + "grad_norm": 0.1875, + "learning_rate": 9.223300970873788e-05, + "loss": 1.6501, + "step": 322 + }, + { + "epoch": 0.4133077415227127, + "grad_norm": 0.2119140625, + "learning_rate": 9.174757281553399e-05, + "loss": 1.624, + "step": 323 + }, + { + "epoch": 0.4145873320537428, + "grad_norm": 0.2138671875, + "learning_rate": 9.126213592233011e-05, + "loss": 1.5555, + "step": 324 + }, + { + "epoch": 0.41586692258477287, + "grad_norm": 0.1884765625, + "learning_rate": 9.077669902912622e-05, + "loss": 1.6896, + "step": 325 + }, + { + "epoch": 0.4171465131158029, + "grad_norm": 0.1953125, + "learning_rate": 9.029126213592234e-05, + "loss": 1.696, + "step": 326 + }, + { + "epoch": 0.418426103646833, + "grad_norm": 0.21484375, + "learning_rate": 8.980582524271845e-05, + "loss": 1.7137, + "step": 327 + }, + { + "epoch": 0.4197056941778631, + "grad_norm": 0.189453125, + "learning_rate": 8.932038834951457e-05, + "loss": 1.705, + "step": 328 + }, + { + "epoch": 0.4209852847088932, + "grad_norm": 0.2021484375, + "learning_rate": 8.88349514563107e-05, + "loss": 1.63, + "step": 329 + }, + { + "epoch": 0.42226487523992323, + "grad_norm": 0.185546875, + "learning_rate": 8.834951456310681e-05, + "loss": 1.7132, + "step": 330 + }, + { + "epoch": 0.4235444657709533, + "grad_norm": 0.1875, + "learning_rate": 8.786407766990292e-05, + "loss": 1.6763, + "step": 331 + }, + { + "epoch": 0.4248240563019834, + "grad_norm": 0.1904296875, + "learning_rate": 8.737864077669902e-05, + "loss": 1.6609, + "step": 332 + }, + { + "epoch": 0.42610364683301344, + "grad_norm": 0.2001953125, + "learning_rate": 8.689320388349514e-05, + "loss": 1.738, + "step": 333 + }, + { + "epoch": 0.4273832373640435, + "grad_norm": 0.19140625, + "learning_rate": 8.640776699029127e-05, + "loss": 1.6413, + "step": 334 + }, + { + "epoch": 0.4286628278950736, + "grad_norm": 0.1826171875, + "learning_rate": 8.592233009708738e-05, + "loss": 1.6171, + "step": 335 + }, + { + "epoch": 0.42994241842610365, + "grad_norm": 0.203125, + "learning_rate": 8.54368932038835e-05, + "loss": 1.6703, + "step": 336 + }, + { + "epoch": 0.4312220089571337, + "grad_norm": 0.193359375, + "learning_rate": 8.495145631067961e-05, + "loss": 1.7195, + "step": 337 + }, + { + "epoch": 0.4325015994881638, + "grad_norm": 0.20703125, + "learning_rate": 8.446601941747573e-05, + "loss": 1.5734, + "step": 338 + }, + { + "epoch": 0.43378119001919385, + "grad_norm": 0.205078125, + "learning_rate": 8.398058252427184e-05, + "loss": 1.697, + "step": 339 + }, + { + "epoch": 0.4350607805502239, + "grad_norm": 0.216796875, + "learning_rate": 8.349514563106797e-05, + "loss": 1.7021, + "step": 340 + }, + { + "epoch": 0.4350607805502239, + "eval_loss": 1.637184739112854, + "eval_runtime": 103.0987, + "eval_samples_per_second": 48.497, + "eval_steps_per_second": 1.523, + "step": 340 + }, + { + "epoch": 0.436340371081254, + "grad_norm": 0.197265625, + "learning_rate": 8.300970873786408e-05, + "loss": 1.6074, + "step": 341 + }, + { + "epoch": 0.43761996161228406, + "grad_norm": 0.193359375, + "learning_rate": 8.25242718446602e-05, + "loss": 1.6612, + "step": 342 + }, + { + "epoch": 0.43889955214331416, + "grad_norm": 0.2021484375, + "learning_rate": 8.203883495145631e-05, + "loss": 1.6748, + "step": 343 + }, + { + "epoch": 0.4401791426743442, + "grad_norm": 0.205078125, + "learning_rate": 8.155339805825243e-05, + "loss": 1.6782, + "step": 344 + }, + { + "epoch": 0.44145873320537427, + "grad_norm": 0.1845703125, + "learning_rate": 8.106796116504854e-05, + "loss": 1.6676, + "step": 345 + }, + { + "epoch": 0.44273832373640437, + "grad_norm": 0.205078125, + "learning_rate": 8.058252427184466e-05, + "loss": 1.6316, + "step": 346 + }, + { + "epoch": 0.4440179142674344, + "grad_norm": 0.1923828125, + "learning_rate": 8.009708737864078e-05, + "loss": 1.6504, + "step": 347 + }, + { + "epoch": 0.44529750479846447, + "grad_norm": 0.22265625, + "learning_rate": 7.96116504854369e-05, + "loss": 1.5952, + "step": 348 + }, + { + "epoch": 0.4465770953294946, + "grad_norm": 0.212890625, + "learning_rate": 7.912621359223301e-05, + "loss": 1.5702, + "step": 349 + }, + { + "epoch": 0.44785668586052463, + "grad_norm": 0.2216796875, + "learning_rate": 7.864077669902913e-05, + "loss": 1.661, + "step": 350 + }, + { + "epoch": 0.4491362763915547, + "grad_norm": 0.21484375, + "learning_rate": 7.815533980582524e-05, + "loss": 1.6323, + "step": 351 + }, + { + "epoch": 0.4504158669225848, + "grad_norm": 0.220703125, + "learning_rate": 7.766990291262136e-05, + "loss": 1.6799, + "step": 352 + }, + { + "epoch": 0.45169545745361483, + "grad_norm": 0.19140625, + "learning_rate": 7.718446601941748e-05, + "loss": 1.6872, + "step": 353 + }, + { + "epoch": 0.45297504798464494, + "grad_norm": 0.2158203125, + "learning_rate": 7.66990291262136e-05, + "loss": 1.6583, + "step": 354 + }, + { + "epoch": 0.454254638515675, + "grad_norm": 0.1796875, + "learning_rate": 7.621359223300971e-05, + "loss": 1.6758, + "step": 355 + }, + { + "epoch": 0.45553422904670504, + "grad_norm": 0.2099609375, + "learning_rate": 7.572815533980583e-05, + "loss": 1.586, + "step": 356 + }, + { + "epoch": 0.45681381957773515, + "grad_norm": 0.2060546875, + "learning_rate": 7.524271844660194e-05, + "loss": 1.5202, + "step": 357 + }, + { + "epoch": 0.4580934101087652, + "grad_norm": 0.2099609375, + "learning_rate": 7.475728155339806e-05, + "loss": 1.6501, + "step": 358 + }, + { + "epoch": 0.45937300063979525, + "grad_norm": 0.2021484375, + "learning_rate": 7.427184466019417e-05, + "loss": 1.6315, + "step": 359 + }, + { + "epoch": 0.46065259117082535, + "grad_norm": 0.19921875, + "learning_rate": 7.37864077669903e-05, + "loss": 1.661, + "step": 360 + }, + { + "epoch": 0.46065259117082535, + "eval_loss": 1.6363531351089478, + "eval_runtime": 103.1142, + "eval_samples_per_second": 48.49, + "eval_steps_per_second": 1.523, + "step": 360 + }, + { + "epoch": 0.4619321817018554, + "grad_norm": 0.2109375, + "learning_rate": 7.330097087378641e-05, + "loss": 1.6214, + "step": 361 + }, + { + "epoch": 0.46321177223288545, + "grad_norm": 0.2001953125, + "learning_rate": 7.281553398058253e-05, + "loss": 1.645, + "step": 362 + }, + { + "epoch": 0.46449136276391556, + "grad_norm": 0.1982421875, + "learning_rate": 7.233009708737864e-05, + "loss": 1.7353, + "step": 363 + }, + { + "epoch": 0.4657709532949456, + "grad_norm": 0.1865234375, + "learning_rate": 7.184466019417476e-05, + "loss": 1.6297, + "step": 364 + }, + { + "epoch": 0.46705054382597566, + "grad_norm": 0.1865234375, + "learning_rate": 7.135922330097087e-05, + "loss": 1.6419, + "step": 365 + }, + { + "epoch": 0.46833013435700577, + "grad_norm": 0.2021484375, + "learning_rate": 7.0873786407767e-05, + "loss": 1.5767, + "step": 366 + }, + { + "epoch": 0.4696097248880358, + "grad_norm": 0.1806640625, + "learning_rate": 7.038834951456312e-05, + "loss": 1.6454, + "step": 367 + }, + { + "epoch": 0.4708893154190659, + "grad_norm": 0.205078125, + "learning_rate": 6.990291262135923e-05, + "loss": 1.6399, + "step": 368 + }, + { + "epoch": 0.472168905950096, + "grad_norm": 0.19921875, + "learning_rate": 6.941747572815534e-05, + "loss": 1.7118, + "step": 369 + }, + { + "epoch": 0.473448496481126, + "grad_norm": 0.22265625, + "learning_rate": 6.893203883495146e-05, + "loss": 1.6869, + "step": 370 + }, + { + "epoch": 0.47472808701215613, + "grad_norm": 0.1953125, + "learning_rate": 6.844660194174757e-05, + "loss": 1.6411, + "step": 371 + }, + { + "epoch": 0.4760076775431862, + "grad_norm": 0.19921875, + "learning_rate": 6.79611650485437e-05, + "loss": 1.7214, + "step": 372 + }, + { + "epoch": 0.47728726807421623, + "grad_norm": 0.19140625, + "learning_rate": 6.747572815533982e-05, + "loss": 1.664, + "step": 373 + }, + { + "epoch": 0.47856685860524634, + "grad_norm": 0.20703125, + "learning_rate": 6.699029126213593e-05, + "loss": 1.6712, + "step": 374 + }, + { + "epoch": 0.4798464491362764, + "grad_norm": 0.1962890625, + "learning_rate": 6.650485436893205e-05, + "loss": 1.662, + "step": 375 + }, + { + "epoch": 0.48112603966730644, + "grad_norm": 0.1904296875, + "learning_rate": 6.601941747572816e-05, + "loss": 1.6388, + "step": 376 + }, + { + "epoch": 0.48240563019833654, + "grad_norm": 0.2001953125, + "learning_rate": 6.553398058252428e-05, + "loss": 1.6927, + "step": 377 + }, + { + "epoch": 0.4836852207293666, + "grad_norm": 0.201171875, + "learning_rate": 6.504854368932039e-05, + "loss": 1.6294, + "step": 378 + }, + { + "epoch": 0.4849648112603967, + "grad_norm": 0.197265625, + "learning_rate": 6.456310679611652e-05, + "loss": 1.6538, + "step": 379 + }, + { + "epoch": 0.48624440179142675, + "grad_norm": 0.236328125, + "learning_rate": 6.407766990291263e-05, + "loss": 1.6242, + "step": 380 + }, + { + "epoch": 0.48624440179142675, + "eval_loss": 1.6356617212295532, + "eval_runtime": 103.1559, + "eval_samples_per_second": 48.47, + "eval_steps_per_second": 1.522, + "step": 380 + }, + { + "epoch": 0.4875239923224568, + "grad_norm": 0.189453125, + "learning_rate": 6.359223300970875e-05, + "loss": 1.6998, + "step": 381 + }, + { + "epoch": 0.4888035828534869, + "grad_norm": 0.2109375, + "learning_rate": 6.310679611650486e-05, + "loss": 1.6254, + "step": 382 + }, + { + "epoch": 0.49008317338451696, + "grad_norm": 0.203125, + "learning_rate": 6.262135922330098e-05, + "loss": 1.5919, + "step": 383 + }, + { + "epoch": 0.491362763915547, + "grad_norm": 0.20703125, + "learning_rate": 6.213592233009709e-05, + "loss": 1.6995, + "step": 384 + }, + { + "epoch": 0.4926423544465771, + "grad_norm": 0.2109375, + "learning_rate": 6.16504854368932e-05, + "loss": 1.6394, + "step": 385 + }, + { + "epoch": 0.49392194497760716, + "grad_norm": 0.205078125, + "learning_rate": 6.116504854368932e-05, + "loss": 1.6893, + "step": 386 + }, + { + "epoch": 0.4952015355086372, + "grad_norm": 0.2041015625, + "learning_rate": 6.0679611650485434e-05, + "loss": 1.5551, + "step": 387 + }, + { + "epoch": 0.4964811260396673, + "grad_norm": 0.1982421875, + "learning_rate": 6.019417475728155e-05, + "loss": 1.623, + "step": 388 + }, + { + "epoch": 0.49776071657069737, + "grad_norm": 0.205078125, + "learning_rate": 5.970873786407767e-05, + "loss": 1.5971, + "step": 389 + }, + { + "epoch": 0.4990403071017274, + "grad_norm": 0.1982421875, + "learning_rate": 5.9223300970873785e-05, + "loss": 1.6355, + "step": 390 + }, + { + "epoch": 0.5003198976327575, + "grad_norm": 0.205078125, + "learning_rate": 5.87378640776699e-05, + "loss": 1.7074, + "step": 391 + }, + { + "epoch": 0.5015994881637876, + "grad_norm": 0.1982421875, + "learning_rate": 5.825242718446602e-05, + "loss": 1.6935, + "step": 392 + }, + { + "epoch": 0.5028790786948176, + "grad_norm": 0.2001953125, + "learning_rate": 5.7766990291262135e-05, + "loss": 1.7455, + "step": 393 + }, + { + "epoch": 0.5041586692258477, + "grad_norm": 0.1904296875, + "learning_rate": 5.728155339805825e-05, + "loss": 1.6606, + "step": 394 + }, + { + "epoch": 0.5054382597568778, + "grad_norm": 0.1943359375, + "learning_rate": 5.679611650485437e-05, + "loss": 1.6364, + "step": 395 + }, + { + "epoch": 0.5067178502879078, + "grad_norm": 0.1845703125, + "learning_rate": 5.6310679611650486e-05, + "loss": 1.6301, + "step": 396 + }, + { + "epoch": 0.5079974408189379, + "grad_norm": 0.2216796875, + "learning_rate": 5.58252427184466e-05, + "loss": 1.5651, + "step": 397 + }, + { + "epoch": 0.509277031349968, + "grad_norm": 0.1953125, + "learning_rate": 5.533980582524272e-05, + "loss": 1.6575, + "step": 398 + }, + { + "epoch": 0.510556621880998, + "grad_norm": 0.203125, + "learning_rate": 5.4854368932038836e-05, + "loss": 1.5667, + "step": 399 + }, + { + "epoch": 0.5118362124120281, + "grad_norm": 0.2001953125, + "learning_rate": 5.436893203883495e-05, + "loss": 1.6508, + "step": 400 + }, + { + "epoch": 0.5118362124120281, + "eval_loss": 1.6350483894348145, + "eval_runtime": 103.234, + "eval_samples_per_second": 48.434, + "eval_steps_per_second": 1.521, + "step": 400 + }, + { + "epoch": 0.5131158029430583, + "grad_norm": 0.2001953125, + "learning_rate": 5.3883495145631065e-05, + "loss": 1.6265, + "step": 401 + }, + { + "epoch": 0.5143953934740882, + "grad_norm": 0.197265625, + "learning_rate": 5.339805825242719e-05, + "loss": 1.6332, + "step": 402 + }, + { + "epoch": 0.5156749840051184, + "grad_norm": 0.205078125, + "learning_rate": 5.29126213592233e-05, + "loss": 1.6849, + "step": 403 + }, + { + "epoch": 0.5169545745361485, + "grad_norm": 0.228515625, + "learning_rate": 5.2427184466019416e-05, + "loss": 1.6917, + "step": 404 + }, + { + "epoch": 0.5182341650671785, + "grad_norm": 0.2099609375, + "learning_rate": 5.194174757281554e-05, + "loss": 1.6444, + "step": 405 + }, + { + "epoch": 0.5195137555982086, + "grad_norm": 0.193359375, + "learning_rate": 5.145631067961165e-05, + "loss": 1.6165, + "step": 406 + }, + { + "epoch": 0.5207933461292387, + "grad_norm": 0.2138671875, + "learning_rate": 5.0970873786407766e-05, + "loss": 1.6882, + "step": 407 + }, + { + "epoch": 0.5220729366602687, + "grad_norm": 0.19140625, + "learning_rate": 5.048543689320389e-05, + "loss": 1.65, + "step": 408 + }, + { + "epoch": 0.5233525271912988, + "grad_norm": 0.17578125, + "learning_rate": 5e-05, + "loss": 1.6691, + "step": 409 + }, + { + "epoch": 0.5246321177223289, + "grad_norm": 0.2158203125, + "learning_rate": 4.951456310679612e-05, + "loss": 1.6197, + "step": 410 + }, + { + "epoch": 0.525911708253359, + "grad_norm": 0.2060546875, + "learning_rate": 4.902912621359224e-05, + "loss": 1.6359, + "step": 411 + }, + { + "epoch": 0.527191298784389, + "grad_norm": 0.177734375, + "learning_rate": 4.854368932038835e-05, + "loss": 1.6942, + "step": 412 + }, + { + "epoch": 0.5284708893154191, + "grad_norm": 0.2021484375, + "learning_rate": 4.805825242718447e-05, + "loss": 1.6303, + "step": 413 + }, + { + "epoch": 0.5297504798464492, + "grad_norm": 0.2041015625, + "learning_rate": 4.757281553398059e-05, + "loss": 1.6041, + "step": 414 + }, + { + "epoch": 0.5310300703774792, + "grad_norm": 0.1962890625, + "learning_rate": 4.7087378640776703e-05, + "loss": 1.7167, + "step": 415 + }, + { + "epoch": 0.5323096609085093, + "grad_norm": 0.1953125, + "learning_rate": 4.660194174757282e-05, + "loss": 1.6828, + "step": 416 + }, + { + "epoch": 0.5335892514395394, + "grad_norm": 0.2255859375, + "learning_rate": 4.611650485436894e-05, + "loss": 1.6386, + "step": 417 + }, + { + "epoch": 0.5348688419705694, + "grad_norm": 0.2060546875, + "learning_rate": 4.5631067961165054e-05, + "loss": 1.6407, + "step": 418 + }, + { + "epoch": 0.5361484325015995, + "grad_norm": 0.1962890625, + "learning_rate": 4.514563106796117e-05, + "loss": 1.6942, + "step": 419 + }, + { + "epoch": 0.5374280230326296, + "grad_norm": 0.1904296875, + "learning_rate": 4.466019417475728e-05, + "loss": 1.6137, + "step": 420 + }, + { + "epoch": 0.5374280230326296, + "eval_loss": 1.634352684020996, + "eval_runtime": 103.4435, + "eval_samples_per_second": 48.336, + "eval_steps_per_second": 1.518, + "step": 420 + }, + { + "epoch": 0.5387076135636596, + "grad_norm": 0.1982421875, + "learning_rate": 4.4174757281553404e-05, + "loss": 1.5895, + "step": 421 + }, + { + "epoch": 0.5399872040946897, + "grad_norm": 0.201171875, + "learning_rate": 4.368932038834951e-05, + "loss": 1.5607, + "step": 422 + }, + { + "epoch": 0.5412667946257198, + "grad_norm": 0.193359375, + "learning_rate": 4.3203883495145634e-05, + "loss": 1.6577, + "step": 423 + }, + { + "epoch": 0.5425463851567498, + "grad_norm": 0.20703125, + "learning_rate": 4.271844660194175e-05, + "loss": 1.6122, + "step": 424 + }, + { + "epoch": 0.5438259756877799, + "grad_norm": 0.1982421875, + "learning_rate": 4.223300970873786e-05, + "loss": 1.6206, + "step": 425 + }, + { + "epoch": 0.54510556621881, + "grad_norm": 0.236328125, + "learning_rate": 4.1747572815533984e-05, + "loss": 1.6636, + "step": 426 + }, + { + "epoch": 0.54638515674984, + "grad_norm": 0.1962890625, + "learning_rate": 4.12621359223301e-05, + "loss": 1.5838, + "step": 427 + }, + { + "epoch": 0.5476647472808701, + "grad_norm": 0.2119140625, + "learning_rate": 4.077669902912621e-05, + "loss": 1.664, + "step": 428 + }, + { + "epoch": 0.5489443378119002, + "grad_norm": 0.1875, + "learning_rate": 4.029126213592233e-05, + "loss": 1.6365, + "step": 429 + }, + { + "epoch": 0.5502239283429302, + "grad_norm": 0.2021484375, + "learning_rate": 3.980582524271845e-05, + "loss": 1.7319, + "step": 430 + }, + { + "epoch": 0.5515035188739603, + "grad_norm": 0.2021484375, + "learning_rate": 3.9320388349514564e-05, + "loss": 1.5717, + "step": 431 + }, + { + "epoch": 0.5527831094049904, + "grad_norm": 0.1923828125, + "learning_rate": 3.883495145631068e-05, + "loss": 1.5898, + "step": 432 + }, + { + "epoch": 0.5540626999360204, + "grad_norm": 0.1982421875, + "learning_rate": 3.83495145631068e-05, + "loss": 1.6142, + "step": 433 + }, + { + "epoch": 0.5553422904670505, + "grad_norm": 0.1923828125, + "learning_rate": 3.7864077669902914e-05, + "loss": 1.6619, + "step": 434 + }, + { + "epoch": 0.5566218809980806, + "grad_norm": 0.1962890625, + "learning_rate": 3.737864077669903e-05, + "loss": 1.624, + "step": 435 + }, + { + "epoch": 0.5579014715291107, + "grad_norm": 0.2060546875, + "learning_rate": 3.689320388349515e-05, + "loss": 1.657, + "step": 436 + }, + { + "epoch": 0.5591810620601407, + "grad_norm": 0.19140625, + "learning_rate": 3.6407766990291265e-05, + "loss": 1.6744, + "step": 437 + }, + { + "epoch": 0.5604606525911708, + "grad_norm": 0.1865234375, + "learning_rate": 3.592233009708738e-05, + "loss": 1.6852, + "step": 438 + }, + { + "epoch": 0.5617402431222009, + "grad_norm": 0.2041015625, + "learning_rate": 3.54368932038835e-05, + "loss": 1.5807, + "step": 439 + }, + { + "epoch": 0.5630198336532309, + "grad_norm": 0.193359375, + "learning_rate": 3.4951456310679615e-05, + "loss": 1.7031, + "step": 440 + }, + { + "epoch": 0.5630198336532309, + "eval_loss": 1.6339551210403442, + "eval_runtime": 103.2337, + "eval_samples_per_second": 48.434, + "eval_steps_per_second": 1.521, + "step": 440 + }, + { + "epoch": 0.564299424184261, + "grad_norm": 0.2119140625, + "learning_rate": 3.446601941747573e-05, + "loss": 1.624, + "step": 441 + }, + { + "epoch": 0.5655790147152912, + "grad_norm": 0.2177734375, + "learning_rate": 3.398058252427185e-05, + "loss": 1.5942, + "step": 442 + }, + { + "epoch": 0.5668586052463211, + "grad_norm": 0.189453125, + "learning_rate": 3.3495145631067966e-05, + "loss": 1.6723, + "step": 443 + }, + { + "epoch": 0.5681381957773513, + "grad_norm": 0.2099609375, + "learning_rate": 3.300970873786408e-05, + "loss": 1.6573, + "step": 444 + }, + { + "epoch": 0.5694177863083814, + "grad_norm": 0.2001953125, + "learning_rate": 3.2524271844660195e-05, + "loss": 1.6232, + "step": 445 + }, + { + "epoch": 0.5706973768394114, + "grad_norm": 0.2119140625, + "learning_rate": 3.2038834951456316e-05, + "loss": 1.6724, + "step": 446 + }, + { + "epoch": 0.5719769673704415, + "grad_norm": 0.2041015625, + "learning_rate": 3.155339805825243e-05, + "loss": 1.7238, + "step": 447 + }, + { + "epoch": 0.5732565579014716, + "grad_norm": 0.212890625, + "learning_rate": 3.1067961165048545e-05, + "loss": 1.6028, + "step": 448 + }, + { + "epoch": 0.5745361484325016, + "grad_norm": 0.1943359375, + "learning_rate": 3.058252427184466e-05, + "loss": 1.7009, + "step": 449 + }, + { + "epoch": 0.5758157389635317, + "grad_norm": 0.19921875, + "learning_rate": 3.0097087378640774e-05, + "loss": 1.5935, + "step": 450 + }, + { + "epoch": 0.5770953294945618, + "grad_norm": 0.2119140625, + "learning_rate": 2.9611650485436892e-05, + "loss": 1.5787, + "step": 451 + }, + { + "epoch": 0.5783749200255918, + "grad_norm": 0.1953125, + "learning_rate": 2.912621359223301e-05, + "loss": 1.6732, + "step": 452 + }, + { + "epoch": 0.5796545105566219, + "grad_norm": 0.201171875, + "learning_rate": 2.8640776699029125e-05, + "loss": 1.665, + "step": 453 + }, + { + "epoch": 0.580934101087652, + "grad_norm": 0.2177734375, + "learning_rate": 2.8155339805825243e-05, + "loss": 1.6404, + "step": 454 + }, + { + "epoch": 0.582213691618682, + "grad_norm": 0.1923828125, + "learning_rate": 2.766990291262136e-05, + "loss": 1.6541, + "step": 455 + }, + { + "epoch": 0.5834932821497121, + "grad_norm": 0.1923828125, + "learning_rate": 2.7184466019417475e-05, + "loss": 1.7034, + "step": 456 + }, + { + "epoch": 0.5847728726807422, + "grad_norm": 0.203125, + "learning_rate": 2.6699029126213593e-05, + "loss": 1.6074, + "step": 457 + }, + { + "epoch": 0.5860524632117722, + "grad_norm": 0.2138671875, + "learning_rate": 2.6213592233009708e-05, + "loss": 1.6595, + "step": 458 + }, + { + "epoch": 0.5873320537428023, + "grad_norm": 0.1904296875, + "learning_rate": 2.5728155339805826e-05, + "loss": 1.6631, + "step": 459 + }, + { + "epoch": 0.5886116442738324, + "grad_norm": 0.197265625, + "learning_rate": 2.5242718446601944e-05, + "loss": 1.62, + "step": 460 + }, + { + "epoch": 0.5886116442738324, + "eval_loss": 1.6338707208633423, + "eval_runtime": 103.2695, + "eval_samples_per_second": 48.417, + "eval_steps_per_second": 1.52, + "step": 460 + }, + { + "epoch": 0.5898912348048625, + "grad_norm": 0.181640625, + "learning_rate": 2.475728155339806e-05, + "loss": 1.7321, + "step": 461 + }, + { + "epoch": 0.5911708253358925, + "grad_norm": 0.2177734375, + "learning_rate": 2.4271844660194176e-05, + "loss": 1.6152, + "step": 462 + }, + { + "epoch": 0.5924504158669226, + "grad_norm": 0.197265625, + "learning_rate": 2.3786407766990294e-05, + "loss": 1.6654, + "step": 463 + }, + { + "epoch": 0.5937300063979527, + "grad_norm": 0.2080078125, + "learning_rate": 2.330097087378641e-05, + "loss": 1.6801, + "step": 464 + }, + { + "epoch": 0.5950095969289827, + "grad_norm": 0.1923828125, + "learning_rate": 2.2815533980582527e-05, + "loss": 1.5644, + "step": 465 + }, + { + "epoch": 0.5962891874600128, + "grad_norm": 0.1953125, + "learning_rate": 2.233009708737864e-05, + "loss": 1.6238, + "step": 466 + }, + { + "epoch": 0.5975687779910429, + "grad_norm": 0.203125, + "learning_rate": 2.1844660194174756e-05, + "loss": 1.6235, + "step": 467 + }, + { + "epoch": 0.5988483685220729, + "grad_norm": 0.2001953125, + "learning_rate": 2.1359223300970874e-05, + "loss": 1.611, + "step": 468 + }, + { + "epoch": 0.600127959053103, + "grad_norm": 0.20703125, + "learning_rate": 2.0873786407766992e-05, + "loss": 1.6158, + "step": 469 + }, + { + "epoch": 0.6014075495841331, + "grad_norm": 0.1953125, + "learning_rate": 2.0388349514563107e-05, + "loss": 1.6569, + "step": 470 + }, + { + "epoch": 0.6026871401151631, + "grad_norm": 0.2021484375, + "learning_rate": 1.9902912621359225e-05, + "loss": 1.7015, + "step": 471 + }, + { + "epoch": 0.6039667306461932, + "grad_norm": 0.205078125, + "learning_rate": 1.941747572815534e-05, + "loss": 1.6282, + "step": 472 + }, + { + "epoch": 0.6052463211772233, + "grad_norm": 0.21484375, + "learning_rate": 1.8932038834951457e-05, + "loss": 1.6254, + "step": 473 + }, + { + "epoch": 0.6065259117082533, + "grad_norm": 0.1953125, + "learning_rate": 1.8446601941747575e-05, + "loss": 1.6283, + "step": 474 + }, + { + "epoch": 0.6078055022392834, + "grad_norm": 0.205078125, + "learning_rate": 1.796116504854369e-05, + "loss": 1.6671, + "step": 475 + }, + { + "epoch": 0.6090850927703135, + "grad_norm": 0.1962890625, + "learning_rate": 1.7475728155339808e-05, + "loss": 1.6847, + "step": 476 + }, + { + "epoch": 0.6103646833013435, + "grad_norm": 0.1806640625, + "learning_rate": 1.6990291262135926e-05, + "loss": 1.6453, + "step": 477 + }, + { + "epoch": 0.6116442738323736, + "grad_norm": 0.1884765625, + "learning_rate": 1.650485436893204e-05, + "loss": 1.5761, + "step": 478 + }, + { + "epoch": 0.6129238643634037, + "grad_norm": 0.2001953125, + "learning_rate": 1.6019417475728158e-05, + "loss": 1.6311, + "step": 479 + }, + { + "epoch": 0.6142034548944337, + "grad_norm": 0.1884765625, + "learning_rate": 1.5533980582524273e-05, + "loss": 1.6448, + "step": 480 + }, + { + "epoch": 0.6142034548944337, + "eval_loss": 1.6335575580596924, + "eval_runtime": 103.3386, + "eval_samples_per_second": 48.385, + "eval_steps_per_second": 1.519, + "step": 480 + }, + { + "epoch": 0.6154830454254638, + "grad_norm": 0.19921875, + "learning_rate": 1.5048543689320387e-05, + "loss": 1.6385, + "step": 481 + }, + { + "epoch": 0.6167626359564939, + "grad_norm": 0.2119140625, + "learning_rate": 1.4563106796116505e-05, + "loss": 1.5795, + "step": 482 + }, + { + "epoch": 0.6180422264875239, + "grad_norm": 0.1904296875, + "learning_rate": 1.4077669902912621e-05, + "loss": 1.5808, + "step": 483 + }, + { + "epoch": 0.619321817018554, + "grad_norm": 0.203125, + "learning_rate": 1.3592233009708738e-05, + "loss": 1.592, + "step": 484 + }, + { + "epoch": 0.6206014075495841, + "grad_norm": 0.2021484375, + "learning_rate": 1.3106796116504854e-05, + "loss": 1.6375, + "step": 485 + }, + { + "epoch": 0.6218809980806143, + "grad_norm": 0.1962890625, + "learning_rate": 1.2621359223300972e-05, + "loss": 1.671, + "step": 486 + }, + { + "epoch": 0.6231605886116443, + "grad_norm": 0.2001953125, + "learning_rate": 1.2135922330097088e-05, + "loss": 1.6573, + "step": 487 + }, + { + "epoch": 0.6244401791426744, + "grad_norm": 0.197265625, + "learning_rate": 1.1650485436893204e-05, + "loss": 1.6253, + "step": 488 + }, + { + "epoch": 0.6257197696737045, + "grad_norm": 0.2119140625, + "learning_rate": 1.116504854368932e-05, + "loss": 1.6306, + "step": 489 + }, + { + "epoch": 0.6269993602047345, + "grad_norm": 0.2109375, + "learning_rate": 1.0679611650485437e-05, + "loss": 1.6724, + "step": 490 + }, + { + "epoch": 0.6282789507357646, + "grad_norm": 0.1865234375, + "learning_rate": 1.0194174757281553e-05, + "loss": 1.5965, + "step": 491 + }, + { + "epoch": 0.6295585412667947, + "grad_norm": 0.1982421875, + "learning_rate": 9.70873786407767e-06, + "loss": 1.6145, + "step": 492 + }, + { + "epoch": 0.6308381317978247, + "grad_norm": 0.19921875, + "learning_rate": 9.223300970873788e-06, + "loss": 1.5935, + "step": 493 + }, + { + "epoch": 0.6321177223288548, + "grad_norm": 0.2099609375, + "learning_rate": 8.737864077669904e-06, + "loss": 1.5623, + "step": 494 + }, + { + "epoch": 0.6333973128598849, + "grad_norm": 0.1923828125, + "learning_rate": 8.25242718446602e-06, + "loss": 1.648, + "step": 495 + }, + { + "epoch": 0.6346769033909149, + "grad_norm": 0.2021484375, + "learning_rate": 7.766990291262136e-06, + "loss": 1.6144, + "step": 496 + }, + { + "epoch": 0.635956493921945, + "grad_norm": 0.201171875, + "learning_rate": 7.281553398058253e-06, + "loss": 1.6338, + "step": 497 + }, + { + "epoch": 0.6372360844529751, + "grad_norm": 0.1953125, + "learning_rate": 6.796116504854369e-06, + "loss": 1.7038, + "step": 498 + }, + { + "epoch": 0.6385156749840051, + "grad_norm": 0.1943359375, + "learning_rate": 6.310679611650486e-06, + "loss": 1.6286, + "step": 499 + }, + { + "epoch": 0.6397952655150352, + "grad_norm": 0.1796875, + "learning_rate": 5.825242718446602e-06, + "loss": 1.7244, + "step": 500 + }, + { + "epoch": 0.6397952655150352, + "eval_loss": 1.633476734161377, + "eval_runtime": 103.4328, + "eval_samples_per_second": 48.341, + "eval_steps_per_second": 1.518, + "step": 500 + }, + { + "epoch": 0.6410748560460653, + "grad_norm": 0.2099609375, + "learning_rate": 5.3398058252427185e-06, + "loss": 1.6388, + "step": 501 + }, + { + "epoch": 0.6423544465770953, + "grad_norm": 0.205078125, + "learning_rate": 4.854368932038835e-06, + "loss": 1.6222, + "step": 502 + }, + { + "epoch": 0.6436340371081254, + "grad_norm": 0.197265625, + "learning_rate": 4.368932038834952e-06, + "loss": 1.6958, + "step": 503 + }, + { + "epoch": 0.6449136276391555, + "grad_norm": 0.2197265625, + "learning_rate": 3.883495145631068e-06, + "loss": 1.6074, + "step": 504 + }, + { + "epoch": 0.6461932181701855, + "grad_norm": 0.2099609375, + "learning_rate": 3.3980582524271844e-06, + "loss": 1.6576, + "step": 505 + }, + { + "epoch": 0.6474728087012156, + "grad_norm": 0.212890625, + "learning_rate": 2.912621359223301e-06, + "loss": 1.634, + "step": 506 + }, + { + "epoch": 0.6487523992322457, + "grad_norm": 0.189453125, + "learning_rate": 2.4271844660194174e-06, + "loss": 1.6171, + "step": 507 + }, + { + "epoch": 0.6500319897632757, + "grad_norm": 0.1962890625, + "learning_rate": 1.941747572815534e-06, + "loss": 1.613, + "step": 508 + }, + { + "epoch": 0.6513115802943058, + "grad_norm": 0.1884765625, + "learning_rate": 1.4563106796116506e-06, + "loss": 1.612, + "step": 509 + }, + { + "epoch": 0.6525911708253359, + "grad_norm": 0.185546875, + "learning_rate": 9.70873786407767e-07, + "loss": 1.6429, + "step": 510 + }, + { + "epoch": 0.653870761356366, + "grad_norm": 0.193359375, + "learning_rate": 4.854368932038835e-07, + "loss": 1.6898, + "step": 511 + }, + { + "epoch": 0.655150351887396, + "grad_norm": 0.185546875, + "learning_rate": 0.0, + "loss": 1.6705, + "step": 512 + }, + { + "epoch": 0.655150351887396, + "step": 512, + "total_flos": 3.639412992104202e+17, + "train_loss": 1.7040968828368932, + "train_runtime": 4680.8022, + "train_samples_per_second": 7.001, + "train_steps_per_second": 0.109 + }, + { + "epoch": 0.655150351887396, + "eval_loss": 1.633476734161377, + "eval_runtime": 103.3998, + "eval_samples_per_second": 48.356, + "eval_steps_per_second": 1.518, + "step": 512 } ], "logging_steps": 1, - "max_steps": 256, + "max_steps": 512, "num_input_tokens_seen": 0, "num_train_epochs": 1, - "save_steps": 10, - "total_flos": 1.8280284932328653e+17, + "save_steps": 20, + "total_flos": 3.639412992104202e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null