{ "best_metric": 1.4878435134887695, "best_model_checkpoint": "lora_lr_pad/mistralai/Mistral-7B-Instruct-v0.2/unaligned/checkpoint-500", "epoch": 0.655150351887396, "eval_steps": 20, "global_step": 512, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012795905310300703, "grad_norm": 1.3359375, "learning_rate": 2.0000000000000003e-06, "loss": 3.2562, "step": 1 }, { "epoch": 0.0025591810620601407, "grad_norm": 1.3671875, "learning_rate": 4.000000000000001e-06, "loss": 3.152, "step": 2 }, { "epoch": 0.003838771593090211, "grad_norm": 1.2734375, "learning_rate": 6e-06, "loss": 3.101, "step": 3 }, { "epoch": 0.005118362124120281, "grad_norm": 1.421875, "learning_rate": 8.000000000000001e-06, "loss": 3.2665, "step": 4 }, { "epoch": 0.006397952655150352, "grad_norm": 1.375, "learning_rate": 1e-05, "loss": 3.2401, "step": 5 }, { "epoch": 0.007677543186180422, "grad_norm": 1.3046875, "learning_rate": 1.2e-05, "loss": 3.1574, "step": 6 }, { "epoch": 0.008957133717210493, "grad_norm": 1.3515625, "learning_rate": 1.4000000000000001e-05, "loss": 3.1197, "step": 7 }, { "epoch": 0.010236724248240563, "grad_norm": 1.4140625, "learning_rate": 1.6000000000000003e-05, "loss": 3.2179, "step": 8 }, { "epoch": 0.011516314779270634, "grad_norm": 1.421875, "learning_rate": 1.8e-05, "loss": 3.2687, "step": 9 }, { "epoch": 0.012795905310300703, "grad_norm": 1.4609375, "learning_rate": 2e-05, "loss": 3.2973, "step": 10 }, { "epoch": 0.014075495841330775, "grad_norm": 1.3359375, "learning_rate": 2.2000000000000003e-05, "loss": 3.1228, "step": 11 }, { "epoch": 0.015355086372360844, "grad_norm": 1.234375, "learning_rate": 2.4e-05, "loss": 3.0316, "step": 12 }, { "epoch": 0.016634676903390915, "grad_norm": 1.4765625, "learning_rate": 2.6000000000000002e-05, "loss": 3.2293, "step": 13 }, { "epoch": 0.017914267434420986, "grad_norm": 1.4375, "learning_rate": 2.8000000000000003e-05, "loss": 3.2405, "step": 14 }, { "epoch": 0.019193857965451054, "grad_norm": 1.453125, "learning_rate": 3e-05, "loss": 3.217, "step": 15 }, { "epoch": 0.020473448496481125, "grad_norm": 1.375, "learning_rate": 3.2000000000000005e-05, "loss": 3.092, "step": 16 }, { "epoch": 0.021753039027511197, "grad_norm": 1.59375, "learning_rate": 3.4000000000000007e-05, "loss": 3.2057, "step": 17 }, { "epoch": 0.023032629558541268, "grad_norm": 1.3984375, "learning_rate": 3.6e-05, "loss": 2.9932, "step": 18 }, { "epoch": 0.02431222008957134, "grad_norm": 1.546875, "learning_rate": 3.8e-05, "loss": 3.0673, "step": 19 }, { "epoch": 0.025591810620601407, "grad_norm": 1.5703125, "learning_rate": 4e-05, "loss": 2.9652, "step": 20 }, { "epoch": 0.025591810620601407, "eval_loss": 2.928436756134033, "eval_runtime": 103.8047, "eval_samples_per_second": 48.167, "eval_steps_per_second": 1.512, "step": 20 }, { "epoch": 0.026871401151631478, "grad_norm": 1.6875, "learning_rate": 4.2e-05, "loss": 2.9173, "step": 21 }, { "epoch": 0.02815099168266155, "grad_norm": 1.7890625, "learning_rate": 4.4000000000000006e-05, "loss": 2.9455, "step": 22 }, { "epoch": 0.02943058221369162, "grad_norm": 1.7421875, "learning_rate": 4.600000000000001e-05, "loss": 2.909, "step": 23 }, { "epoch": 0.030710172744721688, "grad_norm": 1.7734375, "learning_rate": 4.8e-05, "loss": 2.8191, "step": 24 }, { "epoch": 0.03198976327575176, "grad_norm": 1.8515625, "learning_rate": 5e-05, "loss": 2.7853, "step": 25 }, { "epoch": 0.03326935380678183, "grad_norm": 1.7890625, "learning_rate": 5.2000000000000004e-05, "loss": 2.7678, "step": 26 }, { "epoch": 0.0345489443378119, "grad_norm": 1.7890625, "learning_rate": 5.4000000000000005e-05, "loss": 2.6028, "step": 27 }, { "epoch": 0.03582853486884197, "grad_norm": 1.8046875, "learning_rate": 5.6000000000000006e-05, "loss": 2.6369, "step": 28 }, { "epoch": 0.037108125399872044, "grad_norm": 1.8984375, "learning_rate": 5.8e-05, "loss": 2.6158, "step": 29 }, { "epoch": 0.03838771593090211, "grad_norm": 1.59375, "learning_rate": 6e-05, "loss": 2.2824, "step": 30 }, { "epoch": 0.03966730646193218, "grad_norm": 1.6796875, "learning_rate": 6.2e-05, "loss": 2.4286, "step": 31 }, { "epoch": 0.04094689699296225, "grad_norm": 1.5625, "learning_rate": 6.400000000000001e-05, "loss": 2.2919, "step": 32 }, { "epoch": 0.04222648752399232, "grad_norm": 1.546875, "learning_rate": 6.6e-05, "loss": 2.2725, "step": 33 }, { "epoch": 0.04350607805502239, "grad_norm": 1.484375, "learning_rate": 6.800000000000001e-05, "loss": 2.202, "step": 34 }, { "epoch": 0.044785668586052464, "grad_norm": 1.375, "learning_rate": 7e-05, "loss": 2.1415, "step": 35 }, { "epoch": 0.046065259117082535, "grad_norm": 1.328125, "learning_rate": 7.2e-05, "loss": 2.0692, "step": 36 }, { "epoch": 0.04734484964811261, "grad_norm": 1.2734375, "learning_rate": 7.4e-05, "loss": 2.1186, "step": 37 }, { "epoch": 0.04862444017914268, "grad_norm": 1.046875, "learning_rate": 7.6e-05, "loss": 1.9482, "step": 38 }, { "epoch": 0.04990403071017274, "grad_norm": 0.88671875, "learning_rate": 7.800000000000001e-05, "loss": 1.8985, "step": 39 }, { "epoch": 0.05118362124120281, "grad_norm": 0.7421875, "learning_rate": 8e-05, "loss": 1.844, "step": 40 }, { "epoch": 0.05118362124120281, "eval_loss": 1.8063277006149292, "eval_runtime": 103.8793, "eval_samples_per_second": 48.133, "eval_steps_per_second": 1.511, "step": 40 }, { "epoch": 0.052463211772232884, "grad_norm": 0.6015625, "learning_rate": 8.2e-05, "loss": 1.8771, "step": 41 }, { "epoch": 0.053742802303262956, "grad_norm": 0.515625, "learning_rate": 8.4e-05, "loss": 1.7902, "step": 42 }, { "epoch": 0.05502239283429303, "grad_norm": 0.45703125, "learning_rate": 8.6e-05, "loss": 1.7946, "step": 43 }, { "epoch": 0.0563019833653231, "grad_norm": 0.458984375, "learning_rate": 8.800000000000001e-05, "loss": 1.751, "step": 44 }, { "epoch": 0.05758157389635317, "grad_norm": 0.52734375, "learning_rate": 9e-05, "loss": 1.7537, "step": 45 }, { "epoch": 0.05886116442738324, "grad_norm": 0.4765625, "learning_rate": 9.200000000000001e-05, "loss": 1.7619, "step": 46 }, { "epoch": 0.060140754958413305, "grad_norm": 0.5546875, "learning_rate": 9.4e-05, "loss": 1.7668, "step": 47 }, { "epoch": 0.061420345489443376, "grad_norm": 0.625, "learning_rate": 9.6e-05, "loss": 1.7556, "step": 48 }, { "epoch": 0.06269993602047345, "grad_norm": 0.75390625, "learning_rate": 9.8e-05, "loss": 1.7158, "step": 49 }, { "epoch": 0.06397952655150352, "grad_norm": 0.8828125, "learning_rate": 0.0001, "loss": 1.7301, "step": 50 }, { "epoch": 0.06525911708253358, "grad_norm": 0.9921875, "learning_rate": 0.00010200000000000001, "loss": 1.7979, "step": 51 }, { "epoch": 0.06653870761356366, "grad_norm": 0.84765625, "learning_rate": 0.00010400000000000001, "loss": 1.6356, "step": 52 }, { "epoch": 0.06781829814459372, "grad_norm": 0.494140625, "learning_rate": 0.00010600000000000002, "loss": 1.6094, "step": 53 }, { "epoch": 0.0690978886756238, "grad_norm": 0.48046875, "learning_rate": 0.00010800000000000001, "loss": 1.6467, "step": 54 }, { "epoch": 0.07037747920665387, "grad_norm": 0.44140625, "learning_rate": 0.00011000000000000002, "loss": 1.576, "step": 55 }, { "epoch": 0.07165706973768395, "grad_norm": 0.384765625, "learning_rate": 0.00011200000000000001, "loss": 1.6307, "step": 56 }, { "epoch": 0.07293666026871401, "grad_norm": 0.375, "learning_rate": 0.00011399999999999999, "loss": 1.6133, "step": 57 }, { "epoch": 0.07421625079974409, "grad_norm": 0.33984375, "learning_rate": 0.000116, "loss": 1.6575, "step": 58 }, { "epoch": 0.07549584133077415, "grad_norm": 0.310546875, "learning_rate": 0.000118, "loss": 1.5782, "step": 59 }, { "epoch": 0.07677543186180422, "grad_norm": 0.29296875, "learning_rate": 0.00012, "loss": 1.6386, "step": 60 }, { "epoch": 0.07677543186180422, "eval_loss": 1.5920685529708862, "eval_runtime": 103.8813, "eval_samples_per_second": 48.132, "eval_steps_per_second": 1.511, "step": 60 }, { "epoch": 0.0780550223928343, "grad_norm": 0.2890625, "learning_rate": 0.000122, "loss": 1.5949, "step": 61 }, { "epoch": 0.07933461292386436, "grad_norm": 0.28125, "learning_rate": 0.000124, "loss": 1.6136, "step": 62 }, { "epoch": 0.08061420345489444, "grad_norm": 0.2890625, "learning_rate": 0.000126, "loss": 1.6135, "step": 63 }, { "epoch": 0.0818937939859245, "grad_norm": 0.287109375, "learning_rate": 0.00012800000000000002, "loss": 1.5579, "step": 64 }, { "epoch": 0.08317338451695458, "grad_norm": 0.306640625, "learning_rate": 0.00013000000000000002, "loss": 1.6174, "step": 65 }, { "epoch": 0.08445297504798464, "grad_norm": 0.28125, "learning_rate": 0.000132, "loss": 1.6687, "step": 66 }, { "epoch": 0.08573256557901472, "grad_norm": 0.30078125, "learning_rate": 0.000134, "loss": 1.604, "step": 67 }, { "epoch": 0.08701215611004479, "grad_norm": 0.3125, "learning_rate": 0.00013600000000000003, "loss": 1.5936, "step": 68 }, { "epoch": 0.08829174664107485, "grad_norm": 0.27734375, "learning_rate": 0.000138, "loss": 1.5744, "step": 69 }, { "epoch": 0.08957133717210493, "grad_norm": 0.275390625, "learning_rate": 0.00014, "loss": 1.5875, "step": 70 }, { "epoch": 0.09085092770313499, "grad_norm": 0.283203125, "learning_rate": 0.000142, "loss": 1.5938, "step": 71 }, { "epoch": 0.09213051823416507, "grad_norm": 0.291015625, "learning_rate": 0.000144, "loss": 1.5795, "step": 72 }, { "epoch": 0.09341010876519514, "grad_norm": 0.2578125, "learning_rate": 0.000146, "loss": 1.5642, "step": 73 }, { "epoch": 0.09468969929622521, "grad_norm": 0.255859375, "learning_rate": 0.000148, "loss": 1.6276, "step": 74 }, { "epoch": 0.09596928982725528, "grad_norm": 0.259765625, "learning_rate": 0.00015000000000000001, "loss": 1.6222, "step": 75 }, { "epoch": 0.09724888035828536, "grad_norm": 0.232421875, "learning_rate": 0.000152, "loss": 1.5487, "step": 76 }, { "epoch": 0.09852847088931542, "grad_norm": 0.259765625, "learning_rate": 0.000154, "loss": 1.6183, "step": 77 }, { "epoch": 0.09980806142034548, "grad_norm": 0.228515625, "learning_rate": 0.00015600000000000002, "loss": 1.5813, "step": 78 }, { "epoch": 0.10108765195137556, "grad_norm": 0.25, "learning_rate": 0.00015800000000000002, "loss": 1.5332, "step": 79 }, { "epoch": 0.10236724248240563, "grad_norm": 0.255859375, "learning_rate": 0.00016, "loss": 1.5553, "step": 80 }, { "epoch": 0.10236724248240563, "eval_loss": 1.5541130304336548, "eval_runtime": 103.8472, "eval_samples_per_second": 48.148, "eval_steps_per_second": 1.512, "step": 80 }, { "epoch": 0.1036468330134357, "grad_norm": 0.248046875, "learning_rate": 0.000162, "loss": 1.5697, "step": 81 }, { "epoch": 0.10492642354446577, "grad_norm": 0.251953125, "learning_rate": 0.000164, "loss": 1.5474, "step": 82 }, { "epoch": 0.10620601407549585, "grad_norm": 0.2412109375, "learning_rate": 0.000166, "loss": 1.5834, "step": 83 }, { "epoch": 0.10748560460652591, "grad_norm": 0.2294921875, "learning_rate": 0.000168, "loss": 1.542, "step": 84 }, { "epoch": 0.10876519513755598, "grad_norm": 0.2265625, "learning_rate": 0.00017, "loss": 1.5841, "step": 85 }, { "epoch": 0.11004478566858605, "grad_norm": 0.25390625, "learning_rate": 0.000172, "loss": 1.515, "step": 86 }, { "epoch": 0.11132437619961612, "grad_norm": 0.244140625, "learning_rate": 0.000174, "loss": 1.5565, "step": 87 }, { "epoch": 0.1126039667306462, "grad_norm": 0.2197265625, "learning_rate": 0.00017600000000000002, "loss": 1.6133, "step": 88 }, { "epoch": 0.11388355726167626, "grad_norm": 0.25, "learning_rate": 0.00017800000000000002, "loss": 1.5476, "step": 89 }, { "epoch": 0.11516314779270634, "grad_norm": 0.25, "learning_rate": 0.00018, "loss": 1.5212, "step": 90 }, { "epoch": 0.1164427383237364, "grad_norm": 0.2216796875, "learning_rate": 0.000182, "loss": 1.5628, "step": 91 }, { "epoch": 0.11772232885476648, "grad_norm": 0.2353515625, "learning_rate": 0.00018400000000000003, "loss": 1.4946, "step": 92 }, { "epoch": 0.11900191938579655, "grad_norm": 0.2255859375, "learning_rate": 0.00018600000000000002, "loss": 1.6303, "step": 93 }, { "epoch": 0.12028150991682661, "grad_norm": 0.24609375, "learning_rate": 0.000188, "loss": 1.5155, "step": 94 }, { "epoch": 0.12156110044785669, "grad_norm": 0.220703125, "learning_rate": 0.00019, "loss": 1.5652, "step": 95 }, { "epoch": 0.12284069097888675, "grad_norm": 0.2236328125, "learning_rate": 0.000192, "loss": 1.6042, "step": 96 }, { "epoch": 0.12412028150991683, "grad_norm": 0.263671875, "learning_rate": 0.000194, "loss": 1.5697, "step": 97 }, { "epoch": 0.1253998720409469, "grad_norm": 0.2294921875, "learning_rate": 0.000196, "loss": 1.4785, "step": 98 }, { "epoch": 0.12667946257197696, "grad_norm": 0.2236328125, "learning_rate": 0.00019800000000000002, "loss": 1.4921, "step": 99 }, { "epoch": 0.12795905310300704, "grad_norm": 0.22265625, "learning_rate": 0.0002, "loss": 1.4985, "step": 100 }, { "epoch": 0.12795905310300704, "eval_loss": 1.5341166257858276, "eval_runtime": 103.7833, "eval_samples_per_second": 48.177, "eval_steps_per_second": 1.513, "step": 100 }, { "epoch": 0.12923864363403711, "grad_norm": 0.232421875, "learning_rate": 0.00019951456310679614, "loss": 1.494, "step": 101 }, { "epoch": 0.13051823416506717, "grad_norm": 0.236328125, "learning_rate": 0.00019902912621359224, "loss": 1.5407, "step": 102 }, { "epoch": 0.13179782469609724, "grad_norm": 0.236328125, "learning_rate": 0.00019854368932038837, "loss": 1.5755, "step": 103 }, { "epoch": 0.13307741522712732, "grad_norm": 0.2216796875, "learning_rate": 0.00019805825242718447, "loss": 1.5491, "step": 104 }, { "epoch": 0.1343570057581574, "grad_norm": 0.2421875, "learning_rate": 0.0001975728155339806, "loss": 1.5393, "step": 105 }, { "epoch": 0.13563659628918745, "grad_norm": 0.2236328125, "learning_rate": 0.0001970873786407767, "loss": 1.5657, "step": 106 }, { "epoch": 0.13691618682021753, "grad_norm": 0.2119140625, "learning_rate": 0.00019660194174757283, "loss": 1.5551, "step": 107 }, { "epoch": 0.1381957773512476, "grad_norm": 0.240234375, "learning_rate": 0.00019611650485436895, "loss": 1.5408, "step": 108 }, { "epoch": 0.13947536788227768, "grad_norm": 0.2373046875, "learning_rate": 0.00019563106796116505, "loss": 1.5375, "step": 109 }, { "epoch": 0.14075495841330773, "grad_norm": 0.216796875, "learning_rate": 0.00019514563106796118, "loss": 1.5578, "step": 110 }, { "epoch": 0.1420345489443378, "grad_norm": 0.240234375, "learning_rate": 0.00019466019417475728, "loss": 1.4969, "step": 111 }, { "epoch": 0.1433141394753679, "grad_norm": 0.244140625, "learning_rate": 0.0001941747572815534, "loss": 1.5347, "step": 112 }, { "epoch": 0.14459373000639794, "grad_norm": 0.287109375, "learning_rate": 0.00019368932038834954, "loss": 1.549, "step": 113 }, { "epoch": 0.14587332053742802, "grad_norm": 0.2470703125, "learning_rate": 0.00019320388349514564, "loss": 1.5749, "step": 114 }, { "epoch": 0.1471529110684581, "grad_norm": 0.2353515625, "learning_rate": 0.00019271844660194177, "loss": 1.5311, "step": 115 }, { "epoch": 0.14843250159948818, "grad_norm": 0.2275390625, "learning_rate": 0.00019223300970873787, "loss": 1.5678, "step": 116 }, { "epoch": 0.14971209213051823, "grad_norm": 0.259765625, "learning_rate": 0.000191747572815534, "loss": 1.616, "step": 117 }, { "epoch": 0.1509916826615483, "grad_norm": 0.2158203125, "learning_rate": 0.0001912621359223301, "loss": 1.5449, "step": 118 }, { "epoch": 0.15227127319257838, "grad_norm": 0.2373046875, "learning_rate": 0.00019077669902912623, "loss": 1.4975, "step": 119 }, { "epoch": 0.15355086372360843, "grad_norm": 0.234375, "learning_rate": 0.00019029126213592236, "loss": 1.5631, "step": 120 }, { "epoch": 0.15355086372360843, "eval_loss": 1.5212680101394653, "eval_runtime": 103.8151, "eval_samples_per_second": 48.163, "eval_steps_per_second": 1.512, "step": 120 }, { "epoch": 0.1548304542546385, "grad_norm": 0.2177734375, "learning_rate": 0.00018980582524271846, "loss": 1.5534, "step": 121 }, { "epoch": 0.1561100447856686, "grad_norm": 0.2578125, "learning_rate": 0.00018932038834951458, "loss": 1.5329, "step": 122 }, { "epoch": 0.15738963531669867, "grad_norm": 0.2255859375, "learning_rate": 0.00018883495145631069, "loss": 1.5372, "step": 123 }, { "epoch": 0.15866922584772872, "grad_norm": 0.26171875, "learning_rate": 0.00018834951456310681, "loss": 1.5373, "step": 124 }, { "epoch": 0.1599488163787588, "grad_norm": 0.2421875, "learning_rate": 0.00018786407766990291, "loss": 1.5492, "step": 125 }, { "epoch": 0.16122840690978887, "grad_norm": 0.255859375, "learning_rate": 0.00018737864077669904, "loss": 1.536, "step": 126 }, { "epoch": 0.16250799744081892, "grad_norm": 0.236328125, "learning_rate": 0.00018689320388349517, "loss": 1.5719, "step": 127 }, { "epoch": 0.163787587971849, "grad_norm": 0.248046875, "learning_rate": 0.00018640776699029127, "loss": 1.5101, "step": 128 }, { "epoch": 0.16506717850287908, "grad_norm": 0.22265625, "learning_rate": 0.0001859223300970874, "loss": 1.5896, "step": 129 }, { "epoch": 0.16634676903390916, "grad_norm": 0.220703125, "learning_rate": 0.0001854368932038835, "loss": 1.5791, "step": 130 }, { "epoch": 0.1676263595649392, "grad_norm": 0.251953125, "learning_rate": 0.00018495145631067963, "loss": 1.5013, "step": 131 }, { "epoch": 0.1689059500959693, "grad_norm": 0.2373046875, "learning_rate": 0.00018446601941747576, "loss": 1.5421, "step": 132 }, { "epoch": 0.17018554062699937, "grad_norm": 0.2373046875, "learning_rate": 0.00018398058252427186, "loss": 1.5329, "step": 133 }, { "epoch": 0.17146513115802944, "grad_norm": 0.2294921875, "learning_rate": 0.00018349514563106799, "loss": 1.5631, "step": 134 }, { "epoch": 0.1727447216890595, "grad_norm": 0.25390625, "learning_rate": 0.0001830097087378641, "loss": 1.5278, "step": 135 }, { "epoch": 0.17402431222008957, "grad_norm": 0.2431640625, "learning_rate": 0.00018252427184466022, "loss": 1.5548, "step": 136 }, { "epoch": 0.17530390275111965, "grad_norm": 0.275390625, "learning_rate": 0.00018203883495145632, "loss": 1.5023, "step": 137 }, { "epoch": 0.1765834932821497, "grad_norm": 0.2333984375, "learning_rate": 0.00018155339805825244, "loss": 1.5062, "step": 138 }, { "epoch": 0.17786308381317978, "grad_norm": 0.265625, "learning_rate": 0.00018106796116504857, "loss": 1.5637, "step": 139 }, { "epoch": 0.17914267434420986, "grad_norm": 0.255859375, "learning_rate": 0.00018058252427184467, "loss": 1.5201, "step": 140 }, { "epoch": 0.17914267434420986, "eval_loss": 1.513644814491272, "eval_runtime": 103.8133, "eval_samples_per_second": 48.163, "eval_steps_per_second": 1.512, "step": 140 }, { "epoch": 0.18042226487523993, "grad_norm": 0.2353515625, "learning_rate": 0.0001800970873786408, "loss": 1.5106, "step": 141 }, { "epoch": 0.18170185540626999, "grad_norm": 0.23046875, "learning_rate": 0.0001796116504854369, "loss": 1.5403, "step": 142 }, { "epoch": 0.18298144593730006, "grad_norm": 0.236328125, "learning_rate": 0.00017912621359223303, "loss": 1.5373, "step": 143 }, { "epoch": 0.18426103646833014, "grad_norm": 0.2421875, "learning_rate": 0.00017864077669902913, "loss": 1.5697, "step": 144 }, { "epoch": 0.1855406269993602, "grad_norm": 0.240234375, "learning_rate": 0.00017815533980582526, "loss": 1.5135, "step": 145 }, { "epoch": 0.18682021753039027, "grad_norm": 0.228515625, "learning_rate": 0.0001776699029126214, "loss": 1.5446, "step": 146 }, { "epoch": 0.18809980806142035, "grad_norm": 0.22265625, "learning_rate": 0.0001771844660194175, "loss": 1.5226, "step": 147 }, { "epoch": 0.18937939859245043, "grad_norm": 0.2275390625, "learning_rate": 0.00017669902912621362, "loss": 1.4343, "step": 148 }, { "epoch": 0.19065898912348048, "grad_norm": 0.232421875, "learning_rate": 0.00017621359223300972, "loss": 1.5509, "step": 149 }, { "epoch": 0.19193857965451055, "grad_norm": 0.234375, "learning_rate": 0.00017572815533980585, "loss": 1.5397, "step": 150 }, { "epoch": 0.19321817018554063, "grad_norm": 0.26171875, "learning_rate": 0.00017524271844660195, "loss": 1.4796, "step": 151 }, { "epoch": 0.1944977607165707, "grad_norm": 0.2470703125, "learning_rate": 0.00017475728155339805, "loss": 1.5855, "step": 152 }, { "epoch": 0.19577735124760076, "grad_norm": 0.2255859375, "learning_rate": 0.00017427184466019418, "loss": 1.5221, "step": 153 }, { "epoch": 0.19705694177863084, "grad_norm": 0.265625, "learning_rate": 0.00017378640776699028, "loss": 1.4963, "step": 154 }, { "epoch": 0.19833653230966092, "grad_norm": 0.236328125, "learning_rate": 0.0001733009708737864, "loss": 1.5379, "step": 155 }, { "epoch": 0.19961612284069097, "grad_norm": 0.220703125, "learning_rate": 0.00017281553398058253, "loss": 1.4701, "step": 156 }, { "epoch": 0.20089571337172105, "grad_norm": 0.2255859375, "learning_rate": 0.00017233009708737864, "loss": 1.5133, "step": 157 }, { "epoch": 0.20217530390275112, "grad_norm": 0.251953125, "learning_rate": 0.00017184466019417476, "loss": 1.5238, "step": 158 }, { "epoch": 0.2034548944337812, "grad_norm": 0.2490234375, "learning_rate": 0.00017135922330097086, "loss": 1.5361, "step": 159 }, { "epoch": 0.20473448496481125, "grad_norm": 0.232421875, "learning_rate": 0.000170873786407767, "loss": 1.4585, "step": 160 }, { "epoch": 0.20473448496481125, "eval_loss": 1.5087493658065796, "eval_runtime": 103.8289, "eval_samples_per_second": 48.156, "eval_steps_per_second": 1.512, "step": 160 }, { "epoch": 0.20601407549584133, "grad_norm": 0.2412109375, "learning_rate": 0.0001703883495145631, "loss": 1.5574, "step": 161 }, { "epoch": 0.2072936660268714, "grad_norm": 0.2255859375, "learning_rate": 0.00016990291262135922, "loss": 1.4938, "step": 162 }, { "epoch": 0.20857325655790146, "grad_norm": 0.2431640625, "learning_rate": 0.00016941747572815535, "loss": 1.5307, "step": 163 }, { "epoch": 0.20985284708893154, "grad_norm": 0.23828125, "learning_rate": 0.00016893203883495145, "loss": 1.4849, "step": 164 }, { "epoch": 0.21113243761996162, "grad_norm": 0.25390625, "learning_rate": 0.00016844660194174758, "loss": 1.4399, "step": 165 }, { "epoch": 0.2124120281509917, "grad_norm": 0.2578125, "learning_rate": 0.00016796116504854368, "loss": 1.512, "step": 166 }, { "epoch": 0.21369161868202174, "grad_norm": 0.263671875, "learning_rate": 0.0001674757281553398, "loss": 1.5586, "step": 167 }, { "epoch": 0.21497120921305182, "grad_norm": 0.2412109375, "learning_rate": 0.00016699029126213594, "loss": 1.5673, "step": 168 }, { "epoch": 0.2162507997440819, "grad_norm": 0.2392578125, "learning_rate": 0.00016650485436893204, "loss": 1.4893, "step": 169 }, { "epoch": 0.21753039027511195, "grad_norm": 0.244140625, "learning_rate": 0.00016601941747572817, "loss": 1.5885, "step": 170 }, { "epoch": 0.21880998080614203, "grad_norm": 0.275390625, "learning_rate": 0.00016553398058252427, "loss": 1.5318, "step": 171 }, { "epoch": 0.2200895713371721, "grad_norm": 0.2392578125, "learning_rate": 0.0001650485436893204, "loss": 1.4523, "step": 172 }, { "epoch": 0.22136916186820219, "grad_norm": 0.255859375, "learning_rate": 0.0001645631067961165, "loss": 1.5486, "step": 173 }, { "epoch": 0.22264875239923224, "grad_norm": 0.234375, "learning_rate": 0.00016407766990291262, "loss": 1.4989, "step": 174 }, { "epoch": 0.22392834293026231, "grad_norm": 0.2421875, "learning_rate": 0.00016359223300970875, "loss": 1.5556, "step": 175 }, { "epoch": 0.2252079334612924, "grad_norm": 0.232421875, "learning_rate": 0.00016310679611650485, "loss": 1.545, "step": 176 }, { "epoch": 0.22648752399232247, "grad_norm": 0.2470703125, "learning_rate": 0.00016262135922330098, "loss": 1.4939, "step": 177 }, { "epoch": 0.22776711452335252, "grad_norm": 0.259765625, "learning_rate": 0.00016213592233009708, "loss": 1.4768, "step": 178 }, { "epoch": 0.2290467050543826, "grad_norm": 0.2578125, "learning_rate": 0.0001616504854368932, "loss": 1.494, "step": 179 }, { "epoch": 0.23032629558541268, "grad_norm": 0.259765625, "learning_rate": 0.0001611650485436893, "loss": 1.5361, "step": 180 }, { "epoch": 0.23032629558541268, "eval_loss": 1.5049980878829956, "eval_runtime": 103.8247, "eval_samples_per_second": 48.158, "eval_steps_per_second": 1.512, "step": 180 }, { "epoch": 0.23160588611644273, "grad_norm": 0.248046875, "learning_rate": 0.00016067961165048544, "loss": 1.5126, "step": 181 }, { "epoch": 0.2328854766474728, "grad_norm": 0.2216796875, "learning_rate": 0.00016019417475728157, "loss": 1.4835, "step": 182 }, { "epoch": 0.23416506717850288, "grad_norm": 0.251953125, "learning_rate": 0.00015970873786407767, "loss": 1.5131, "step": 183 }, { "epoch": 0.23544465770953296, "grad_norm": 0.259765625, "learning_rate": 0.0001592233009708738, "loss": 1.4804, "step": 184 }, { "epoch": 0.236724248240563, "grad_norm": 0.25, "learning_rate": 0.0001587378640776699, "loss": 1.6027, "step": 185 }, { "epoch": 0.2380038387715931, "grad_norm": 0.2373046875, "learning_rate": 0.00015825242718446603, "loss": 1.5373, "step": 186 }, { "epoch": 0.23928342930262317, "grad_norm": 0.2265625, "learning_rate": 0.00015776699029126213, "loss": 1.5531, "step": 187 }, { "epoch": 0.24056301983365322, "grad_norm": 0.2412109375, "learning_rate": 0.00015728155339805825, "loss": 1.5101, "step": 188 }, { "epoch": 0.2418426103646833, "grad_norm": 0.2578125, "learning_rate": 0.00015679611650485438, "loss": 1.538, "step": 189 }, { "epoch": 0.24312220089571338, "grad_norm": 0.2451171875, "learning_rate": 0.00015631067961165048, "loss": 1.526, "step": 190 }, { "epoch": 0.24440179142674345, "grad_norm": 0.248046875, "learning_rate": 0.0001558252427184466, "loss": 1.5275, "step": 191 }, { "epoch": 0.2456813819577735, "grad_norm": 0.2421875, "learning_rate": 0.0001553398058252427, "loss": 1.567, "step": 192 }, { "epoch": 0.24696097248880358, "grad_norm": 0.267578125, "learning_rate": 0.00015485436893203884, "loss": 1.4457, "step": 193 }, { "epoch": 0.24824056301983366, "grad_norm": 0.2421875, "learning_rate": 0.00015436893203883497, "loss": 1.5728, "step": 194 }, { "epoch": 0.2495201535508637, "grad_norm": 0.2421875, "learning_rate": 0.00015388349514563107, "loss": 1.4829, "step": 195 }, { "epoch": 0.2507997440818938, "grad_norm": 0.23046875, "learning_rate": 0.0001533980582524272, "loss": 1.5093, "step": 196 }, { "epoch": 0.25207933461292387, "grad_norm": 0.259765625, "learning_rate": 0.0001529126213592233, "loss": 1.5079, "step": 197 }, { "epoch": 0.2533589251439539, "grad_norm": 0.2265625, "learning_rate": 0.00015242718446601943, "loss": 1.4996, "step": 198 }, { "epoch": 0.254638515674984, "grad_norm": 0.2255859375, "learning_rate": 0.00015194174757281553, "loss": 1.4967, "step": 199 }, { "epoch": 0.2559181062060141, "grad_norm": 0.287109375, "learning_rate": 0.00015145631067961166, "loss": 1.485, "step": 200 }, { "epoch": 0.2559181062060141, "eval_loss": 1.5022693872451782, "eval_runtime": 103.8087, "eval_samples_per_second": 48.166, "eval_steps_per_second": 1.512, "step": 200 }, { "epoch": 0.2571976967370441, "grad_norm": 0.255859375, "learning_rate": 0.00015097087378640778, "loss": 1.4926, "step": 201 }, { "epoch": 0.25847728726807423, "grad_norm": 0.232421875, "learning_rate": 0.00015048543689320389, "loss": 1.5215, "step": 202 }, { "epoch": 0.2597568777991043, "grad_norm": 0.2373046875, "learning_rate": 0.00015000000000000001, "loss": 1.5674, "step": 203 }, { "epoch": 0.26103646833013433, "grad_norm": 0.240234375, "learning_rate": 0.00014951456310679611, "loss": 1.5157, "step": 204 }, { "epoch": 0.26231605886116444, "grad_norm": 0.2451171875, "learning_rate": 0.00014902912621359224, "loss": 1.4502, "step": 205 }, { "epoch": 0.2635956493921945, "grad_norm": 0.2578125, "learning_rate": 0.00014854368932038834, "loss": 1.5289, "step": 206 }, { "epoch": 0.2648752399232246, "grad_norm": 0.24609375, "learning_rate": 0.00014805825242718447, "loss": 1.4454, "step": 207 }, { "epoch": 0.26615483045425464, "grad_norm": 0.236328125, "learning_rate": 0.0001475728155339806, "loss": 1.5132, "step": 208 }, { "epoch": 0.2674344209852847, "grad_norm": 0.28515625, "learning_rate": 0.0001470873786407767, "loss": 1.5041, "step": 209 }, { "epoch": 0.2687140115163148, "grad_norm": 0.2353515625, "learning_rate": 0.00014660194174757283, "loss": 1.5313, "step": 210 }, { "epoch": 0.26999360204734485, "grad_norm": 0.24609375, "learning_rate": 0.00014611650485436893, "loss": 1.5156, "step": 211 }, { "epoch": 0.2712731925783749, "grad_norm": 0.2451171875, "learning_rate": 0.00014563106796116506, "loss": 1.4958, "step": 212 }, { "epoch": 0.272552783109405, "grad_norm": 0.2451171875, "learning_rate": 0.0001451456310679612, "loss": 1.5324, "step": 213 }, { "epoch": 0.27383237364043506, "grad_norm": 0.2451171875, "learning_rate": 0.0001446601941747573, "loss": 1.4894, "step": 214 }, { "epoch": 0.2751119641714651, "grad_norm": 0.2412109375, "learning_rate": 0.00014417475728155342, "loss": 1.4462, "step": 215 }, { "epoch": 0.2763915547024952, "grad_norm": 0.267578125, "learning_rate": 0.00014368932038834952, "loss": 1.5, "step": 216 }, { "epoch": 0.27767114523352526, "grad_norm": 0.296875, "learning_rate": 0.00014320388349514565, "loss": 1.5317, "step": 217 }, { "epoch": 0.27895073576455537, "grad_norm": 0.251953125, "learning_rate": 0.00014271844660194175, "loss": 1.5553, "step": 218 }, { "epoch": 0.2802303262955854, "grad_norm": 0.2734375, "learning_rate": 0.00014223300970873787, "loss": 1.5055, "step": 219 }, { "epoch": 0.28150991682661547, "grad_norm": 0.25, "learning_rate": 0.000141747572815534, "loss": 1.5299, "step": 220 }, { "epoch": 0.28150991682661547, "eval_loss": 1.4998944997787476, "eval_runtime": 103.8007, "eval_samples_per_second": 48.169, "eval_steps_per_second": 1.513, "step": 220 }, { "epoch": 0.2827895073576456, "grad_norm": 0.2734375, "learning_rate": 0.0001412621359223301, "loss": 1.5298, "step": 221 }, { "epoch": 0.2840690978886756, "grad_norm": 0.25, "learning_rate": 0.00014077669902912623, "loss": 1.5178, "step": 222 }, { "epoch": 0.2853486884197057, "grad_norm": 0.244140625, "learning_rate": 0.00014029126213592233, "loss": 1.4975, "step": 223 }, { "epoch": 0.2866282789507358, "grad_norm": 0.2333984375, "learning_rate": 0.00013980582524271846, "loss": 1.5121, "step": 224 }, { "epoch": 0.28790786948176583, "grad_norm": 0.26171875, "learning_rate": 0.00013932038834951456, "loss": 1.4838, "step": 225 }, { "epoch": 0.2891874600127959, "grad_norm": 0.265625, "learning_rate": 0.0001388349514563107, "loss": 1.4422, "step": 226 }, { "epoch": 0.290467050543826, "grad_norm": 0.2421875, "learning_rate": 0.00013834951456310682, "loss": 1.5315, "step": 227 }, { "epoch": 0.29174664107485604, "grad_norm": 0.271484375, "learning_rate": 0.00013786407766990292, "loss": 1.524, "step": 228 }, { "epoch": 0.2930262316058861, "grad_norm": 0.255859375, "learning_rate": 0.00013737864077669905, "loss": 1.4314, "step": 229 }, { "epoch": 0.2943058221369162, "grad_norm": 0.2421875, "learning_rate": 0.00013689320388349515, "loss": 1.5496, "step": 230 }, { "epoch": 0.29558541266794625, "grad_norm": 0.279296875, "learning_rate": 0.00013640776699029128, "loss": 1.5526, "step": 231 }, { "epoch": 0.29686500319897635, "grad_norm": 0.25, "learning_rate": 0.0001359223300970874, "loss": 1.427, "step": 232 }, { "epoch": 0.2981445937300064, "grad_norm": 0.267578125, "learning_rate": 0.0001354368932038835, "loss": 1.4646, "step": 233 }, { "epoch": 0.29942418426103645, "grad_norm": 0.2734375, "learning_rate": 0.00013495145631067963, "loss": 1.5392, "step": 234 }, { "epoch": 0.30070377479206656, "grad_norm": 0.2734375, "learning_rate": 0.00013446601941747573, "loss": 1.5333, "step": 235 }, { "epoch": 0.3019833653230966, "grad_norm": 0.2431640625, "learning_rate": 0.00013398058252427186, "loss": 1.5282, "step": 236 }, { "epoch": 0.30326295585412666, "grad_norm": 0.251953125, "learning_rate": 0.00013349514563106796, "loss": 1.4763, "step": 237 }, { "epoch": 0.30454254638515676, "grad_norm": 0.2734375, "learning_rate": 0.0001330097087378641, "loss": 1.5199, "step": 238 }, { "epoch": 0.3058221369161868, "grad_norm": 0.263671875, "learning_rate": 0.00013252427184466022, "loss": 1.5497, "step": 239 }, { "epoch": 0.30710172744721687, "grad_norm": 0.267578125, "learning_rate": 0.00013203883495145632, "loss": 1.5539, "step": 240 }, { "epoch": 0.30710172744721687, "eval_loss": 1.4981228113174438, "eval_runtime": 103.819, "eval_samples_per_second": 48.161, "eval_steps_per_second": 1.512, "step": 240 }, { "epoch": 0.30838131797824697, "grad_norm": 0.255859375, "learning_rate": 0.00013155339805825245, "loss": 1.5309, "step": 241 }, { "epoch": 0.309660908509277, "grad_norm": 0.26171875, "learning_rate": 0.00013106796116504855, "loss": 1.4503, "step": 242 }, { "epoch": 0.31094049904030713, "grad_norm": 0.26171875, "learning_rate": 0.00013058252427184468, "loss": 1.4718, "step": 243 }, { "epoch": 0.3122200895713372, "grad_norm": 0.2578125, "learning_rate": 0.00013009708737864078, "loss": 1.4445, "step": 244 }, { "epoch": 0.31349968010236723, "grad_norm": 0.2392578125, "learning_rate": 0.0001296116504854369, "loss": 1.5606, "step": 245 }, { "epoch": 0.31477927063339733, "grad_norm": 0.2734375, "learning_rate": 0.00012912621359223304, "loss": 1.4917, "step": 246 }, { "epoch": 0.3160588611644274, "grad_norm": 0.240234375, "learning_rate": 0.00012864077669902914, "loss": 1.5295, "step": 247 }, { "epoch": 0.31733845169545744, "grad_norm": 0.2890625, "learning_rate": 0.00012815533980582526, "loss": 1.5685, "step": 248 }, { "epoch": 0.31861804222648754, "grad_norm": 0.28515625, "learning_rate": 0.00012766990291262137, "loss": 1.4985, "step": 249 }, { "epoch": 0.3198976327575176, "grad_norm": 0.251953125, "learning_rate": 0.0001271844660194175, "loss": 1.4746, "step": 250 }, { "epoch": 0.32117722328854764, "grad_norm": 0.26953125, "learning_rate": 0.00012669902912621362, "loss": 1.5615, "step": 251 }, { "epoch": 0.32245681381957775, "grad_norm": 0.2412109375, "learning_rate": 0.00012621359223300972, "loss": 1.4777, "step": 252 }, { "epoch": 0.3237364043506078, "grad_norm": 0.26171875, "learning_rate": 0.00012572815533980585, "loss": 1.4152, "step": 253 }, { "epoch": 0.32501599488163785, "grad_norm": 0.25390625, "learning_rate": 0.00012524271844660195, "loss": 1.4632, "step": 254 }, { "epoch": 0.32629558541266795, "grad_norm": 0.259765625, "learning_rate": 0.00012475728155339805, "loss": 1.5011, "step": 255 }, { "epoch": 0.327575175943698, "grad_norm": 0.244140625, "learning_rate": 0.00012427184466019418, "loss": 1.5339, "step": 256 }, { "epoch": 0.3288547664747281, "grad_norm": 0.271484375, "learning_rate": 0.00012378640776699028, "loss": 1.4776, "step": 257 }, { "epoch": 0.33013435700575816, "grad_norm": 0.265625, "learning_rate": 0.0001233009708737864, "loss": 1.4585, "step": 258 }, { "epoch": 0.3314139475367882, "grad_norm": 0.26171875, "learning_rate": 0.0001228155339805825, "loss": 1.4791, "step": 259 }, { "epoch": 0.3326935380678183, "grad_norm": 0.263671875, "learning_rate": 0.00012233009708737864, "loss": 1.521, "step": 260 }, { "epoch": 0.3326935380678183, "eval_loss": 1.4961707592010498, "eval_runtime": 103.7907, "eval_samples_per_second": 48.174, "eval_steps_per_second": 1.513, "step": 260 }, { "epoch": 0.33397312859884837, "grad_norm": 0.265625, "learning_rate": 0.00012184466019417475, "loss": 1.5015, "step": 261 }, { "epoch": 0.3352527191298784, "grad_norm": 0.26953125, "learning_rate": 0.00012135922330097087, "loss": 1.4732, "step": 262 }, { "epoch": 0.3365323096609085, "grad_norm": 0.294921875, "learning_rate": 0.00012087378640776698, "loss": 1.4832, "step": 263 }, { "epoch": 0.3378119001919386, "grad_norm": 0.271484375, "learning_rate": 0.0001203883495145631, "loss": 1.4629, "step": 264 }, { "epoch": 0.3390914907229686, "grad_norm": 0.265625, "learning_rate": 0.00011990291262135923, "loss": 1.5046, "step": 265 }, { "epoch": 0.34037108125399873, "grad_norm": 0.279296875, "learning_rate": 0.00011941747572815534, "loss": 1.5724, "step": 266 }, { "epoch": 0.3416506717850288, "grad_norm": 0.26171875, "learning_rate": 0.00011893203883495146, "loss": 1.4481, "step": 267 }, { "epoch": 0.3429302623160589, "grad_norm": 0.25390625, "learning_rate": 0.00011844660194174757, "loss": 1.5081, "step": 268 }, { "epoch": 0.34420985284708894, "grad_norm": 0.26171875, "learning_rate": 0.00011796116504854368, "loss": 1.5056, "step": 269 }, { "epoch": 0.345489443378119, "grad_norm": 0.26953125, "learning_rate": 0.0001174757281553398, "loss": 1.5279, "step": 270 }, { "epoch": 0.3467690339091491, "grad_norm": 0.244140625, "learning_rate": 0.00011699029126213593, "loss": 1.5293, "step": 271 }, { "epoch": 0.34804862444017914, "grad_norm": 0.271484375, "learning_rate": 0.00011650485436893204, "loss": 1.5436, "step": 272 }, { "epoch": 0.3493282149712092, "grad_norm": 0.259765625, "learning_rate": 0.00011601941747572816, "loss": 1.5806, "step": 273 }, { "epoch": 0.3506078055022393, "grad_norm": 0.271484375, "learning_rate": 0.00011553398058252427, "loss": 1.5412, "step": 274 }, { "epoch": 0.35188739603326935, "grad_norm": 0.283203125, "learning_rate": 0.00011504854368932039, "loss": 1.5225, "step": 275 }, { "epoch": 0.3531669865642994, "grad_norm": 0.2490234375, "learning_rate": 0.0001145631067961165, "loss": 1.6229, "step": 276 }, { "epoch": 0.3544465770953295, "grad_norm": 0.2734375, "learning_rate": 0.00011407766990291261, "loss": 1.5231, "step": 277 }, { "epoch": 0.35572616762635956, "grad_norm": 0.26171875, "learning_rate": 0.00011359223300970874, "loss": 1.481, "step": 278 }, { "epoch": 0.3570057581573896, "grad_norm": 0.265625, "learning_rate": 0.00011310679611650486, "loss": 1.544, "step": 279 }, { "epoch": 0.3582853486884197, "grad_norm": 0.25390625, "learning_rate": 0.00011262135922330097, "loss": 1.5186, "step": 280 }, { "epoch": 0.3582853486884197, "eval_loss": 1.4947106838226318, "eval_runtime": 103.7757, "eval_samples_per_second": 48.181, "eval_steps_per_second": 1.513, "step": 280 }, { "epoch": 0.35956493921944976, "grad_norm": 0.2470703125, "learning_rate": 0.00011213592233009709, "loss": 1.5507, "step": 281 }, { "epoch": 0.36084452975047987, "grad_norm": 0.255859375, "learning_rate": 0.0001116504854368932, "loss": 1.5091, "step": 282 }, { "epoch": 0.3621241202815099, "grad_norm": 0.28125, "learning_rate": 0.00011116504854368932, "loss": 1.4962, "step": 283 }, { "epoch": 0.36340371081253997, "grad_norm": 0.251953125, "learning_rate": 0.00011067961165048544, "loss": 1.537, "step": 284 }, { "epoch": 0.3646833013435701, "grad_norm": 0.271484375, "learning_rate": 0.00011019417475728156, "loss": 1.5814, "step": 285 }, { "epoch": 0.3659628918746001, "grad_norm": 0.279296875, "learning_rate": 0.00010970873786407767, "loss": 1.5328, "step": 286 }, { "epoch": 0.3672424824056302, "grad_norm": 0.275390625, "learning_rate": 0.00010922330097087379, "loss": 1.5429, "step": 287 }, { "epoch": 0.3685220729366603, "grad_norm": 0.26953125, "learning_rate": 0.0001087378640776699, "loss": 1.489, "step": 288 }, { "epoch": 0.36980166346769033, "grad_norm": 0.248046875, "learning_rate": 0.00010825242718446602, "loss": 1.518, "step": 289 }, { "epoch": 0.3710812539987204, "grad_norm": 0.25, "learning_rate": 0.00010776699029126213, "loss": 1.5142, "step": 290 }, { "epoch": 0.3723608445297505, "grad_norm": 0.271484375, "learning_rate": 0.00010728155339805826, "loss": 1.5229, "step": 291 }, { "epoch": 0.37364043506078054, "grad_norm": 0.251953125, "learning_rate": 0.00010679611650485437, "loss": 1.4803, "step": 292 }, { "epoch": 0.37492002559181065, "grad_norm": 0.2578125, "learning_rate": 0.00010631067961165049, "loss": 1.5791, "step": 293 }, { "epoch": 0.3761996161228407, "grad_norm": 0.2470703125, "learning_rate": 0.0001058252427184466, "loss": 1.5336, "step": 294 }, { "epoch": 0.37747920665387075, "grad_norm": 0.275390625, "learning_rate": 0.00010533980582524272, "loss": 1.4581, "step": 295 }, { "epoch": 0.37875879718490085, "grad_norm": 0.267578125, "learning_rate": 0.00010485436893203883, "loss": 1.4919, "step": 296 }, { "epoch": 0.3800383877159309, "grad_norm": 0.25390625, "learning_rate": 0.00010436893203883496, "loss": 1.5778, "step": 297 }, { "epoch": 0.38131797824696095, "grad_norm": 0.2734375, "learning_rate": 0.00010388349514563107, "loss": 1.4489, "step": 298 }, { "epoch": 0.38259756877799106, "grad_norm": 0.265625, "learning_rate": 0.00010339805825242719, "loss": 1.4704, "step": 299 }, { "epoch": 0.3838771593090211, "grad_norm": 0.267578125, "learning_rate": 0.0001029126213592233, "loss": 1.5316, "step": 300 }, { "epoch": 0.3838771593090211, "eval_loss": 1.4935728311538696, "eval_runtime": 103.76, "eval_samples_per_second": 48.188, "eval_steps_per_second": 1.513, "step": 300 }, { "epoch": 0.38515674984005116, "grad_norm": 0.265625, "learning_rate": 0.00010242718446601942, "loss": 1.5093, "step": 301 }, { "epoch": 0.38643634037108127, "grad_norm": 0.263671875, "learning_rate": 0.00010194174757281553, "loss": 1.4871, "step": 302 }, { "epoch": 0.3877159309021113, "grad_norm": 0.29296875, "learning_rate": 0.00010145631067961166, "loss": 1.4842, "step": 303 }, { "epoch": 0.3889955214331414, "grad_norm": 0.275390625, "learning_rate": 0.00010097087378640778, "loss": 1.5545, "step": 304 }, { "epoch": 0.3902751119641715, "grad_norm": 0.255859375, "learning_rate": 0.00010048543689320389, "loss": 1.5671, "step": 305 }, { "epoch": 0.3915547024952015, "grad_norm": 0.263671875, "learning_rate": 0.0001, "loss": 1.4817, "step": 306 }, { "epoch": 0.39283429302623163, "grad_norm": 0.275390625, "learning_rate": 9.951456310679612e-05, "loss": 1.4727, "step": 307 }, { "epoch": 0.3941138835572617, "grad_norm": 0.251953125, "learning_rate": 9.902912621359223e-05, "loss": 1.4717, "step": 308 }, { "epoch": 0.39539347408829173, "grad_norm": 0.2734375, "learning_rate": 9.854368932038835e-05, "loss": 1.4795, "step": 309 }, { "epoch": 0.39667306461932184, "grad_norm": 0.267578125, "learning_rate": 9.805825242718448e-05, "loss": 1.4475, "step": 310 }, { "epoch": 0.3979526551503519, "grad_norm": 0.283203125, "learning_rate": 9.757281553398059e-05, "loss": 1.4661, "step": 311 }, { "epoch": 0.39923224568138194, "grad_norm": 0.271484375, "learning_rate": 9.70873786407767e-05, "loss": 1.4808, "step": 312 }, { "epoch": 0.40051183621241204, "grad_norm": 0.29296875, "learning_rate": 9.660194174757282e-05, "loss": 1.5169, "step": 313 }, { "epoch": 0.4017914267434421, "grad_norm": 0.296875, "learning_rate": 9.611650485436893e-05, "loss": 1.4856, "step": 314 }, { "epoch": 0.40307101727447214, "grad_norm": 0.2578125, "learning_rate": 9.563106796116505e-05, "loss": 1.4995, "step": 315 }, { "epoch": 0.40435060780550225, "grad_norm": 0.28125, "learning_rate": 9.514563106796118e-05, "loss": 1.5682, "step": 316 }, { "epoch": 0.4056301983365323, "grad_norm": 0.265625, "learning_rate": 9.466019417475729e-05, "loss": 1.492, "step": 317 }, { "epoch": 0.4069097888675624, "grad_norm": 0.275390625, "learning_rate": 9.417475728155341e-05, "loss": 1.4706, "step": 318 }, { "epoch": 0.40818937939859246, "grad_norm": 0.265625, "learning_rate": 9.368932038834952e-05, "loss": 1.5184, "step": 319 }, { "epoch": 0.4094689699296225, "grad_norm": 0.283203125, "learning_rate": 9.320388349514564e-05, "loss": 1.5288, "step": 320 }, { "epoch": 0.4094689699296225, "eval_loss": 1.4924702644348145, "eval_runtime": 103.7588, "eval_samples_per_second": 48.189, "eval_steps_per_second": 1.513, "step": 320 }, { "epoch": 0.4107485604606526, "grad_norm": 0.275390625, "learning_rate": 9.271844660194175e-05, "loss": 1.5166, "step": 321 }, { "epoch": 0.41202815099168266, "grad_norm": 0.265625, "learning_rate": 9.223300970873788e-05, "loss": 1.5137, "step": 322 }, { "epoch": 0.4133077415227127, "grad_norm": 0.2890625, "learning_rate": 9.174757281553399e-05, "loss": 1.4874, "step": 323 }, { "epoch": 0.4145873320537428, "grad_norm": 0.27734375, "learning_rate": 9.126213592233011e-05, "loss": 1.4144, "step": 324 }, { "epoch": 0.41586692258477287, "grad_norm": 0.251953125, "learning_rate": 9.077669902912622e-05, "loss": 1.529, "step": 325 }, { "epoch": 0.4171465131158029, "grad_norm": 0.2578125, "learning_rate": 9.029126213592234e-05, "loss": 1.5507, "step": 326 }, { "epoch": 0.418426103646833, "grad_norm": 0.283203125, "learning_rate": 8.980582524271845e-05, "loss": 1.5447, "step": 327 }, { "epoch": 0.4197056941778631, "grad_norm": 0.2578125, "learning_rate": 8.932038834951457e-05, "loss": 1.556, "step": 328 }, { "epoch": 0.4209852847088932, "grad_norm": 0.2734375, "learning_rate": 8.88349514563107e-05, "loss": 1.4945, "step": 329 }, { "epoch": 0.42226487523992323, "grad_norm": 0.2578125, "learning_rate": 8.834951456310681e-05, "loss": 1.5668, "step": 330 }, { "epoch": 0.4235444657709533, "grad_norm": 0.259765625, "learning_rate": 8.786407766990292e-05, "loss": 1.501, "step": 331 }, { "epoch": 0.4248240563019834, "grad_norm": 0.259765625, "learning_rate": 8.737864077669902e-05, "loss": 1.5187, "step": 332 }, { "epoch": 0.42610364683301344, "grad_norm": 0.259765625, "learning_rate": 8.689320388349514e-05, "loss": 1.5736, "step": 333 }, { "epoch": 0.4273832373640435, "grad_norm": 0.28125, "learning_rate": 8.640776699029127e-05, "loss": 1.5085, "step": 334 }, { "epoch": 0.4286628278950736, "grad_norm": 0.25390625, "learning_rate": 8.592233009708738e-05, "loss": 1.4757, "step": 335 }, { "epoch": 0.42994241842610365, "grad_norm": 0.267578125, "learning_rate": 8.54368932038835e-05, "loss": 1.5243, "step": 336 }, { "epoch": 0.4312220089571337, "grad_norm": 0.26953125, "learning_rate": 8.495145631067961e-05, "loss": 1.5662, "step": 337 }, { "epoch": 0.4325015994881638, "grad_norm": 0.265625, "learning_rate": 8.446601941747573e-05, "loss": 1.433, "step": 338 }, { "epoch": 0.43378119001919385, "grad_norm": 0.283203125, "learning_rate": 8.398058252427184e-05, "loss": 1.5378, "step": 339 }, { "epoch": 0.4350607805502239, "grad_norm": 0.29296875, "learning_rate": 8.349514563106797e-05, "loss": 1.5276, "step": 340 }, { "epoch": 0.4350607805502239, "eval_loss": 1.4914450645446777, "eval_runtime": 103.7522, "eval_samples_per_second": 48.192, "eval_steps_per_second": 1.513, "step": 340 }, { "epoch": 0.436340371081254, "grad_norm": 0.2734375, "learning_rate": 8.300970873786408e-05, "loss": 1.4723, "step": 341 }, { "epoch": 0.43761996161228406, "grad_norm": 0.25390625, "learning_rate": 8.25242718446602e-05, "loss": 1.5185, "step": 342 }, { "epoch": 0.43889955214331416, "grad_norm": 0.271484375, "learning_rate": 8.203883495145631e-05, "loss": 1.5317, "step": 343 }, { "epoch": 0.4401791426743442, "grad_norm": 0.251953125, "learning_rate": 8.155339805825243e-05, "loss": 1.5254, "step": 344 }, { "epoch": 0.44145873320537427, "grad_norm": 0.259765625, "learning_rate": 8.106796116504854e-05, "loss": 1.5152, "step": 345 }, { "epoch": 0.44273832373640437, "grad_norm": 0.28125, "learning_rate": 8.058252427184466e-05, "loss": 1.4812, "step": 346 }, { "epoch": 0.4440179142674344, "grad_norm": 0.26953125, "learning_rate": 8.009708737864078e-05, "loss": 1.5023, "step": 347 }, { "epoch": 0.44529750479846447, "grad_norm": 0.291015625, "learning_rate": 7.96116504854369e-05, "loss": 1.4516, "step": 348 }, { "epoch": 0.4465770953294946, "grad_norm": 0.279296875, "learning_rate": 7.912621359223301e-05, "loss": 1.4349, "step": 349 }, { "epoch": 0.44785668586052463, "grad_norm": 0.28125, "learning_rate": 7.864077669902913e-05, "loss": 1.5181, "step": 350 }, { "epoch": 0.4491362763915547, "grad_norm": 0.27734375, "learning_rate": 7.815533980582524e-05, "loss": 1.4765, "step": 351 }, { "epoch": 0.4504158669225848, "grad_norm": 0.275390625, "learning_rate": 7.766990291262136e-05, "loss": 1.524, "step": 352 }, { "epoch": 0.45169545745361483, "grad_norm": 0.25, "learning_rate": 7.718446601941748e-05, "loss": 1.5236, "step": 353 }, { "epoch": 0.45297504798464494, "grad_norm": 0.2890625, "learning_rate": 7.66990291262136e-05, "loss": 1.5358, "step": 354 }, { "epoch": 0.454254638515675, "grad_norm": 0.244140625, "learning_rate": 7.621359223300971e-05, "loss": 1.5447, "step": 355 }, { "epoch": 0.45553422904670504, "grad_norm": 0.287109375, "learning_rate": 7.572815533980583e-05, "loss": 1.428, "step": 356 }, { "epoch": 0.45681381957773515, "grad_norm": 0.263671875, "learning_rate": 7.524271844660194e-05, "loss": 1.3921, "step": 357 }, { "epoch": 0.4580934101087652, "grad_norm": 0.2734375, "learning_rate": 7.475728155339806e-05, "loss": 1.5059, "step": 358 }, { "epoch": 0.45937300063979525, "grad_norm": 0.2734375, "learning_rate": 7.427184466019417e-05, "loss": 1.4832, "step": 359 }, { "epoch": 0.46065259117082535, "grad_norm": 0.279296875, "learning_rate": 7.37864077669903e-05, "loss": 1.5236, "step": 360 }, { "epoch": 0.46065259117082535, "eval_loss": 1.4904447793960571, "eval_runtime": 103.753, "eval_samples_per_second": 48.191, "eval_steps_per_second": 1.513, "step": 360 }, { "epoch": 0.4619321817018554, "grad_norm": 0.287109375, "learning_rate": 7.330097087378641e-05, "loss": 1.5008, "step": 361 }, { "epoch": 0.46321177223288545, "grad_norm": 0.28125, "learning_rate": 7.281553398058253e-05, "loss": 1.5244, "step": 362 }, { "epoch": 0.46449136276391556, "grad_norm": 0.27734375, "learning_rate": 7.233009708737864e-05, "loss": 1.5849, "step": 363 }, { "epoch": 0.4657709532949456, "grad_norm": 0.2490234375, "learning_rate": 7.184466019417476e-05, "loss": 1.4882, "step": 364 }, { "epoch": 0.46705054382597566, "grad_norm": 0.267578125, "learning_rate": 7.135922330097087e-05, "loss": 1.4905, "step": 365 }, { "epoch": 0.46833013435700577, "grad_norm": 0.267578125, "learning_rate": 7.0873786407767e-05, "loss": 1.4391, "step": 366 }, { "epoch": 0.4696097248880358, "grad_norm": 0.2451171875, "learning_rate": 7.038834951456312e-05, "loss": 1.5034, "step": 367 }, { "epoch": 0.4708893154190659, "grad_norm": 0.283203125, "learning_rate": 6.990291262135923e-05, "loss": 1.4928, "step": 368 }, { "epoch": 0.472168905950096, "grad_norm": 0.265625, "learning_rate": 6.941747572815534e-05, "loss": 1.5578, "step": 369 }, { "epoch": 0.473448496481126, "grad_norm": 0.287109375, "learning_rate": 6.893203883495146e-05, "loss": 1.5403, "step": 370 }, { "epoch": 0.47472808701215613, "grad_norm": 0.263671875, "learning_rate": 6.844660194174757e-05, "loss": 1.5081, "step": 371 }, { "epoch": 0.4760076775431862, "grad_norm": 0.267578125, "learning_rate": 6.79611650485437e-05, "loss": 1.5799, "step": 372 }, { "epoch": 0.47728726807421623, "grad_norm": 0.25390625, "learning_rate": 6.747572815533982e-05, "loss": 1.5097, "step": 373 }, { "epoch": 0.47856685860524634, "grad_norm": 0.26953125, "learning_rate": 6.699029126213593e-05, "loss": 1.5164, "step": 374 }, { "epoch": 0.4798464491362764, "grad_norm": 0.26171875, "learning_rate": 6.650485436893205e-05, "loss": 1.5139, "step": 375 }, { "epoch": 0.48112603966730644, "grad_norm": 0.251953125, "learning_rate": 6.601941747572816e-05, "loss": 1.4756, "step": 376 }, { "epoch": 0.48240563019833654, "grad_norm": 0.263671875, "learning_rate": 6.553398058252428e-05, "loss": 1.5325, "step": 377 }, { "epoch": 0.4836852207293666, "grad_norm": 0.283203125, "learning_rate": 6.504854368932039e-05, "loss": 1.4932, "step": 378 }, { "epoch": 0.4849648112603967, "grad_norm": 0.265625, "learning_rate": 6.456310679611652e-05, "loss": 1.5157, "step": 379 }, { "epoch": 0.48624440179142675, "grad_norm": 0.298828125, "learning_rate": 6.407766990291263e-05, "loss": 1.4882, "step": 380 }, { "epoch": 0.48624440179142675, "eval_loss": 1.4896763563156128, "eval_runtime": 103.7894, "eval_samples_per_second": 48.174, "eval_steps_per_second": 1.513, "step": 380 }, { "epoch": 0.4875239923224568, "grad_norm": 0.259765625, "learning_rate": 6.359223300970875e-05, "loss": 1.546, "step": 381 }, { "epoch": 0.4888035828534869, "grad_norm": 0.279296875, "learning_rate": 6.310679611650486e-05, "loss": 1.4907, "step": 382 }, { "epoch": 0.49008317338451696, "grad_norm": 0.2734375, "learning_rate": 6.262135922330098e-05, "loss": 1.4512, "step": 383 }, { "epoch": 0.491362763915547, "grad_norm": 0.26953125, "learning_rate": 6.213592233009709e-05, "loss": 1.5442, "step": 384 }, { "epoch": 0.4926423544465771, "grad_norm": 0.28125, "learning_rate": 6.16504854368932e-05, "loss": 1.4975, "step": 385 }, { "epoch": 0.49392194497760716, "grad_norm": 0.265625, "learning_rate": 6.116504854368932e-05, "loss": 1.534, "step": 386 }, { "epoch": 0.4952015355086372, "grad_norm": 0.27734375, "learning_rate": 6.0679611650485434e-05, "loss": 1.417, "step": 387 }, { "epoch": 0.4964811260396673, "grad_norm": 0.255859375, "learning_rate": 6.019417475728155e-05, "loss": 1.4766, "step": 388 }, { "epoch": 0.49776071657069737, "grad_norm": 0.283203125, "learning_rate": 5.970873786407767e-05, "loss": 1.4672, "step": 389 }, { "epoch": 0.4990403071017274, "grad_norm": 0.271484375, "learning_rate": 5.9223300970873785e-05, "loss": 1.5021, "step": 390 }, { "epoch": 0.5003198976327575, "grad_norm": 0.283203125, "learning_rate": 5.87378640776699e-05, "loss": 1.5723, "step": 391 }, { "epoch": 0.5015994881637876, "grad_norm": 0.271484375, "learning_rate": 5.825242718446602e-05, "loss": 1.5351, "step": 392 }, { "epoch": 0.5028790786948176, "grad_norm": 0.27734375, "learning_rate": 5.7766990291262135e-05, "loss": 1.5922, "step": 393 }, { "epoch": 0.5041586692258477, "grad_norm": 0.263671875, "learning_rate": 5.728155339805825e-05, "loss": 1.5, "step": 394 }, { "epoch": 0.5054382597568778, "grad_norm": 0.26953125, "learning_rate": 5.679611650485437e-05, "loss": 1.5057, "step": 395 }, { "epoch": 0.5067178502879078, "grad_norm": 0.255859375, "learning_rate": 5.6310679611650486e-05, "loss": 1.4968, "step": 396 }, { "epoch": 0.5079974408189379, "grad_norm": 0.27734375, "learning_rate": 5.58252427184466e-05, "loss": 1.4383, "step": 397 }, { "epoch": 0.509277031349968, "grad_norm": 0.2734375, "learning_rate": 5.533980582524272e-05, "loss": 1.5069, "step": 398 }, { "epoch": 0.510556621880998, "grad_norm": 0.265625, "learning_rate": 5.4854368932038836e-05, "loss": 1.4293, "step": 399 }, { "epoch": 0.5118362124120281, "grad_norm": 0.275390625, "learning_rate": 5.436893203883495e-05, "loss": 1.5098, "step": 400 }, { "epoch": 0.5118362124120281, "eval_loss": 1.4892088174819946, "eval_runtime": 103.7401, "eval_samples_per_second": 48.197, "eval_steps_per_second": 1.513, "step": 400 }, { "epoch": 0.5131158029430583, "grad_norm": 0.267578125, "learning_rate": 5.3883495145631065e-05, "loss": 1.4924, "step": 401 }, { "epoch": 0.5143953934740882, "grad_norm": 0.267578125, "learning_rate": 5.339805825242719e-05, "loss": 1.5093, "step": 402 }, { "epoch": 0.5156749840051184, "grad_norm": 0.271484375, "learning_rate": 5.29126213592233e-05, "loss": 1.5384, "step": 403 }, { "epoch": 0.5169545745361485, "grad_norm": 0.275390625, "learning_rate": 5.2427184466019416e-05, "loss": 1.5331, "step": 404 }, { "epoch": 0.5182341650671785, "grad_norm": 0.28515625, "learning_rate": 5.194174757281554e-05, "loss": 1.5009, "step": 405 }, { "epoch": 0.5195137555982086, "grad_norm": 0.255859375, "learning_rate": 5.145631067961165e-05, "loss": 1.4744, "step": 406 }, { "epoch": 0.5207933461292387, "grad_norm": 0.28125, "learning_rate": 5.0970873786407766e-05, "loss": 1.5178, "step": 407 }, { "epoch": 0.5220729366602687, "grad_norm": 0.263671875, "learning_rate": 5.048543689320389e-05, "loss": 1.4893, "step": 408 }, { "epoch": 0.5233525271912988, "grad_norm": 0.236328125, "learning_rate": 5e-05, "loss": 1.5309, "step": 409 }, { "epoch": 0.5246321177223289, "grad_norm": 0.30078125, "learning_rate": 4.951456310679612e-05, "loss": 1.4873, "step": 410 }, { "epoch": 0.525911708253359, "grad_norm": 0.2734375, "learning_rate": 4.902912621359224e-05, "loss": 1.5083, "step": 411 }, { "epoch": 0.527191298784389, "grad_norm": 0.25390625, "learning_rate": 4.854368932038835e-05, "loss": 1.5487, "step": 412 }, { "epoch": 0.5284708893154191, "grad_norm": 0.2578125, "learning_rate": 4.805825242718447e-05, "loss": 1.4886, "step": 413 }, { "epoch": 0.5297504798464492, "grad_norm": 0.265625, "learning_rate": 4.757281553398059e-05, "loss": 1.4687, "step": 414 }, { "epoch": 0.5310300703774792, "grad_norm": 0.267578125, "learning_rate": 4.7087378640776703e-05, "loss": 1.5612, "step": 415 }, { "epoch": 0.5323096609085093, "grad_norm": 0.26953125, "learning_rate": 4.660194174757282e-05, "loss": 1.534, "step": 416 }, { "epoch": 0.5335892514395394, "grad_norm": 0.296875, "learning_rate": 4.611650485436894e-05, "loss": 1.4985, "step": 417 }, { "epoch": 0.5348688419705694, "grad_norm": 0.259765625, "learning_rate": 4.5631067961165054e-05, "loss": 1.5042, "step": 418 }, { "epoch": 0.5361484325015995, "grad_norm": 0.267578125, "learning_rate": 4.514563106796117e-05, "loss": 1.5292, "step": 419 }, { "epoch": 0.5374280230326296, "grad_norm": 0.265625, "learning_rate": 4.466019417475728e-05, "loss": 1.4526, "step": 420 }, { "epoch": 0.5374280230326296, "eval_loss": 1.4887601137161255, "eval_runtime": 103.7847, "eval_samples_per_second": 48.177, "eval_steps_per_second": 1.513, "step": 420 }, { "epoch": 0.5387076135636596, "grad_norm": 0.275390625, "learning_rate": 4.4174757281553404e-05, "loss": 1.4529, "step": 421 }, { "epoch": 0.5399872040946897, "grad_norm": 0.271484375, "learning_rate": 4.368932038834951e-05, "loss": 1.4163, "step": 422 }, { "epoch": 0.5412667946257198, "grad_norm": 0.2578125, "learning_rate": 4.3203883495145634e-05, "loss": 1.5169, "step": 423 }, { "epoch": 0.5425463851567498, "grad_norm": 0.27734375, "learning_rate": 4.271844660194175e-05, "loss": 1.4888, "step": 424 }, { "epoch": 0.5438259756877799, "grad_norm": 0.265625, "learning_rate": 4.223300970873786e-05, "loss": 1.4733, "step": 425 }, { "epoch": 0.54510556621881, "grad_norm": 0.306640625, "learning_rate": 4.1747572815533984e-05, "loss": 1.503, "step": 426 }, { "epoch": 0.54638515674984, "grad_norm": 0.27734375, "learning_rate": 4.12621359223301e-05, "loss": 1.4406, "step": 427 }, { "epoch": 0.5476647472808701, "grad_norm": 0.267578125, "learning_rate": 4.077669902912621e-05, "loss": 1.4952, "step": 428 }, { "epoch": 0.5489443378119002, "grad_norm": 0.271484375, "learning_rate": 4.029126213592233e-05, "loss": 1.4837, "step": 429 }, { "epoch": 0.5502239283429302, "grad_norm": 0.28515625, "learning_rate": 3.980582524271845e-05, "loss": 1.6037, "step": 430 }, { "epoch": 0.5515035188739603, "grad_norm": 0.283203125, "learning_rate": 3.9320388349514564e-05, "loss": 1.4425, "step": 431 }, { "epoch": 0.5527831094049904, "grad_norm": 0.26953125, "learning_rate": 3.883495145631068e-05, "loss": 1.4502, "step": 432 }, { "epoch": 0.5540626999360204, "grad_norm": 0.291015625, "learning_rate": 3.83495145631068e-05, "loss": 1.4936, "step": 433 }, { "epoch": 0.5553422904670505, "grad_norm": 0.287109375, "learning_rate": 3.7864077669902914e-05, "loss": 1.5186, "step": 434 }, { "epoch": 0.5566218809980806, "grad_norm": 0.283203125, "learning_rate": 3.737864077669903e-05, "loss": 1.464, "step": 435 }, { "epoch": 0.5579014715291107, "grad_norm": 0.271484375, "learning_rate": 3.689320388349515e-05, "loss": 1.4973, "step": 436 }, { "epoch": 0.5591810620601407, "grad_norm": 0.2578125, "learning_rate": 3.6407766990291265e-05, "loss": 1.5231, "step": 437 }, { "epoch": 0.5604606525911708, "grad_norm": 0.255859375, "learning_rate": 3.592233009708738e-05, "loss": 1.5225, "step": 438 }, { "epoch": 0.5617402431222009, "grad_norm": 0.2734375, "learning_rate": 3.54368932038835e-05, "loss": 1.4312, "step": 439 }, { "epoch": 0.5630198336532309, "grad_norm": 0.27734375, "learning_rate": 3.4951456310679615e-05, "loss": 1.5529, "step": 440 }, { "epoch": 0.5630198336532309, "eval_loss": 1.4884061813354492, "eval_runtime": 103.7793, "eval_samples_per_second": 48.179, "eval_steps_per_second": 1.513, "step": 440 }, { "epoch": 0.564299424184261, "grad_norm": 0.2890625, "learning_rate": 3.446601941747573e-05, "loss": 1.4933, "step": 441 }, { "epoch": 0.5655790147152912, "grad_norm": 0.298828125, "learning_rate": 3.398058252427185e-05, "loss": 1.4352, "step": 442 }, { "epoch": 0.5668586052463211, "grad_norm": 0.271484375, "learning_rate": 3.3495145631067966e-05, "loss": 1.5062, "step": 443 }, { "epoch": 0.5681381957773513, "grad_norm": 0.279296875, "learning_rate": 3.300970873786408e-05, "loss": 1.5018, "step": 444 }, { "epoch": 0.5694177863083814, "grad_norm": 0.283203125, "learning_rate": 3.2524271844660195e-05, "loss": 1.5037, "step": 445 }, { "epoch": 0.5706973768394114, "grad_norm": 0.287109375, "learning_rate": 3.2038834951456316e-05, "loss": 1.5056, "step": 446 }, { "epoch": 0.5719769673704415, "grad_norm": 0.294921875, "learning_rate": 3.155339805825243e-05, "loss": 1.567, "step": 447 }, { "epoch": 0.5732565579014716, "grad_norm": 0.287109375, "learning_rate": 3.1067961165048545e-05, "loss": 1.4753, "step": 448 }, { "epoch": 0.5745361484325016, "grad_norm": 0.2734375, "learning_rate": 3.058252427184466e-05, "loss": 1.5731, "step": 449 }, { "epoch": 0.5758157389635317, "grad_norm": 0.265625, "learning_rate": 3.0097087378640774e-05, "loss": 1.4695, "step": 450 }, { "epoch": 0.5770953294945618, "grad_norm": 0.279296875, "learning_rate": 2.9611650485436892e-05, "loss": 1.4394, "step": 451 }, { "epoch": 0.5783749200255918, "grad_norm": 0.26953125, "learning_rate": 2.912621359223301e-05, "loss": 1.52, "step": 452 }, { "epoch": 0.5796545105566219, "grad_norm": 0.275390625, "learning_rate": 2.8640776699029125e-05, "loss": 1.5178, "step": 453 }, { "epoch": 0.580934101087652, "grad_norm": 0.298828125, "learning_rate": 2.8155339805825243e-05, "loss": 1.5035, "step": 454 }, { "epoch": 0.582213691618682, "grad_norm": 0.259765625, "learning_rate": 2.766990291262136e-05, "loss": 1.5201, "step": 455 }, { "epoch": 0.5834932821497121, "grad_norm": 0.259765625, "learning_rate": 2.7184466019417475e-05, "loss": 1.5495, "step": 456 }, { "epoch": 0.5847728726807422, "grad_norm": 0.27734375, "learning_rate": 2.6699029126213593e-05, "loss": 1.483, "step": 457 }, { "epoch": 0.5860524632117722, "grad_norm": 0.287109375, "learning_rate": 2.6213592233009708e-05, "loss": 1.5084, "step": 458 }, { "epoch": 0.5873320537428023, "grad_norm": 0.26953125, "learning_rate": 2.5728155339805826e-05, "loss": 1.5115, "step": 459 }, { "epoch": 0.5886116442738324, "grad_norm": 0.275390625, "learning_rate": 2.5242718446601944e-05, "loss": 1.4747, "step": 460 }, { "epoch": 0.5886116442738324, "eval_loss": 1.4881880283355713, "eval_runtime": 103.7794, "eval_samples_per_second": 48.179, "eval_steps_per_second": 1.513, "step": 460 }, { "epoch": 0.5898912348048625, "grad_norm": 0.265625, "learning_rate": 2.475728155339806e-05, "loss": 1.5567, "step": 461 }, { "epoch": 0.5911708253358925, "grad_norm": 0.2734375, "learning_rate": 2.4271844660194176e-05, "loss": 1.4473, "step": 462 }, { "epoch": 0.5924504158669226, "grad_norm": 0.263671875, "learning_rate": 2.3786407766990294e-05, "loss": 1.5204, "step": 463 }, { "epoch": 0.5937300063979527, "grad_norm": 0.283203125, "learning_rate": 2.330097087378641e-05, "loss": 1.537, "step": 464 }, { "epoch": 0.5950095969289827, "grad_norm": 0.251953125, "learning_rate": 2.2815533980582527e-05, "loss": 1.4269, "step": 465 }, { "epoch": 0.5962891874600128, "grad_norm": 0.259765625, "learning_rate": 2.233009708737864e-05, "loss": 1.452, "step": 466 }, { "epoch": 0.5975687779910429, "grad_norm": 0.271484375, "learning_rate": 2.1844660194174756e-05, "loss": 1.4702, "step": 467 }, { "epoch": 0.5988483685220729, "grad_norm": 0.263671875, "learning_rate": 2.1359223300970874e-05, "loss": 1.4577, "step": 468 }, { "epoch": 0.600127959053103, "grad_norm": 0.27734375, "learning_rate": 2.0873786407766992e-05, "loss": 1.5009, "step": 469 }, { "epoch": 0.6014075495841331, "grad_norm": 0.251953125, "learning_rate": 2.0388349514563107e-05, "loss": 1.4926, "step": 470 }, { "epoch": 0.6026871401151631, "grad_norm": 0.2890625, "learning_rate": 1.9902912621359225e-05, "loss": 1.5575, "step": 471 }, { "epoch": 0.6039667306461932, "grad_norm": 0.28125, "learning_rate": 1.941747572815534e-05, "loss": 1.4811, "step": 472 }, { "epoch": 0.6052463211772233, "grad_norm": 0.291015625, "learning_rate": 1.8932038834951457e-05, "loss": 1.496, "step": 473 }, { "epoch": 0.6065259117082533, "grad_norm": 0.26953125, "learning_rate": 1.8446601941747575e-05, "loss": 1.4922, "step": 474 }, { "epoch": 0.6078055022392834, "grad_norm": 0.28515625, "learning_rate": 1.796116504854369e-05, "loss": 1.5435, "step": 475 }, { "epoch": 0.6090850927703135, "grad_norm": 0.267578125, "learning_rate": 1.7475728155339808e-05, "loss": 1.5427, "step": 476 }, { "epoch": 0.6103646833013435, "grad_norm": 0.2333984375, "learning_rate": 1.6990291262135926e-05, "loss": 1.4921, "step": 477 }, { "epoch": 0.6116442738323736, "grad_norm": 0.26171875, "learning_rate": 1.650485436893204e-05, "loss": 1.4226, "step": 478 }, { "epoch": 0.6129238643634037, "grad_norm": 0.287109375, "learning_rate": 1.6019417475728158e-05, "loss": 1.4878, "step": 479 }, { "epoch": 0.6142034548944337, "grad_norm": 0.263671875, "learning_rate": 1.5533980582524273e-05, "loss": 1.5165, "step": 480 }, { "epoch": 0.6142034548944337, "eval_loss": 1.4879465103149414, "eval_runtime": 103.7919, "eval_samples_per_second": 48.173, "eval_steps_per_second": 1.513, "step": 480 }, { "epoch": 0.6154830454254638, "grad_norm": 0.28515625, "learning_rate": 1.5048543689320387e-05, "loss": 1.5134, "step": 481 }, { "epoch": 0.6167626359564939, "grad_norm": 0.28125, "learning_rate": 1.4563106796116505e-05, "loss": 1.4587, "step": 482 }, { "epoch": 0.6180422264875239, "grad_norm": 0.271484375, "learning_rate": 1.4077669902912621e-05, "loss": 1.4203, "step": 483 }, { "epoch": 0.619321817018554, "grad_norm": 0.28515625, "learning_rate": 1.3592233009708738e-05, "loss": 1.4245, "step": 484 }, { "epoch": 0.6206014075495841, "grad_norm": 0.279296875, "learning_rate": 1.3106796116504854e-05, "loss": 1.513, "step": 485 }, { "epoch": 0.6218809980806143, "grad_norm": 0.267578125, "learning_rate": 1.2621359223300972e-05, "loss": 1.5439, "step": 486 }, { "epoch": 0.6231605886116443, "grad_norm": 0.283203125, "learning_rate": 1.2135922330097088e-05, "loss": 1.5063, "step": 487 }, { "epoch": 0.6244401791426744, "grad_norm": 0.2734375, "learning_rate": 1.1650485436893204e-05, "loss": 1.4927, "step": 488 }, { "epoch": 0.6257197696737045, "grad_norm": 0.28515625, "learning_rate": 1.116504854368932e-05, "loss": 1.4943, "step": 489 }, { "epoch": 0.6269993602047345, "grad_norm": 0.2734375, "learning_rate": 1.0679611650485437e-05, "loss": 1.5125, "step": 490 }, { "epoch": 0.6282789507357646, "grad_norm": 0.263671875, "learning_rate": 1.0194174757281553e-05, "loss": 1.4396, "step": 491 }, { "epoch": 0.6295585412667947, "grad_norm": 0.275390625, "learning_rate": 9.70873786407767e-06, "loss": 1.4928, "step": 492 }, { "epoch": 0.6308381317978247, "grad_norm": 0.291015625, "learning_rate": 9.223300970873788e-06, "loss": 1.4664, "step": 493 }, { "epoch": 0.6321177223288548, "grad_norm": 0.279296875, "learning_rate": 8.737864077669904e-06, "loss": 1.4305, "step": 494 }, { "epoch": 0.6333973128598849, "grad_norm": 0.267578125, "learning_rate": 8.25242718446602e-06, "loss": 1.5016, "step": 495 }, { "epoch": 0.6346769033909149, "grad_norm": 0.28125, "learning_rate": 7.766990291262136e-06, "loss": 1.4641, "step": 496 }, { "epoch": 0.635956493921945, "grad_norm": 0.275390625, "learning_rate": 7.281553398058253e-06, "loss": 1.497, "step": 497 }, { "epoch": 0.6372360844529751, "grad_norm": 0.2578125, "learning_rate": 6.796116504854369e-06, "loss": 1.54, "step": 498 }, { "epoch": 0.6385156749840051, "grad_norm": 0.267578125, "learning_rate": 6.310679611650486e-06, "loss": 1.4769, "step": 499 }, { "epoch": 0.6397952655150352, "grad_norm": 0.2412109375, "learning_rate": 5.825242718446602e-06, "loss": 1.5592, "step": 500 }, { "epoch": 0.6397952655150352, "eval_loss": 1.4878435134887695, "eval_runtime": 103.7542, "eval_samples_per_second": 48.191, "eval_steps_per_second": 1.513, "step": 500 }, { "epoch": 0.6410748560460653, "grad_norm": 0.28515625, "learning_rate": 5.3398058252427185e-06, "loss": 1.4929, "step": 501 }, { "epoch": 0.6423544465770953, "grad_norm": 0.271484375, "learning_rate": 4.854368932038835e-06, "loss": 1.4825, "step": 502 }, { "epoch": 0.6436340371081254, "grad_norm": 0.2734375, "learning_rate": 4.368932038834952e-06, "loss": 1.5464, "step": 503 }, { "epoch": 0.6449136276391555, "grad_norm": 0.2890625, "learning_rate": 3.883495145631068e-06, "loss": 1.4797, "step": 504 }, { "epoch": 0.6461932181701855, "grad_norm": 0.298828125, "learning_rate": 3.3980582524271844e-06, "loss": 1.5039, "step": 505 }, { "epoch": 0.6474728087012156, "grad_norm": 0.296875, "learning_rate": 2.912621359223301e-06, "loss": 1.4895, "step": 506 }, { "epoch": 0.6487523992322457, "grad_norm": 0.265625, "learning_rate": 2.4271844660194174e-06, "loss": 1.4887, "step": 507 }, { "epoch": 0.6500319897632757, "grad_norm": 0.265625, "learning_rate": 1.941747572815534e-06, "loss": 1.4734, "step": 508 }, { "epoch": 0.6513115802943058, "grad_norm": 0.2578125, "learning_rate": 1.4563106796116506e-06, "loss": 1.4636, "step": 509 }, { "epoch": 0.6525911708253359, "grad_norm": 0.265625, "learning_rate": 9.70873786407767e-07, "loss": 1.5035, "step": 510 }, { "epoch": 0.653870761356366, "grad_norm": 0.251953125, "learning_rate": 4.854368932038835e-07, "loss": 1.5264, "step": 511 }, { "epoch": 0.655150351887396, "grad_norm": 0.259765625, "learning_rate": 0.0, "loss": 1.5441, "step": 512 }, { "epoch": 0.655150351887396, "step": 512, "total_flos": 3.6685588640169984e+17, "train_loss": 1.622293347492814, "train_runtime": 4726.6516, "train_samples_per_second": 6.933, "train_steps_per_second": 0.108 }, { "epoch": 0.655150351887396, "eval_loss": 1.4878435134887695, "eval_runtime": 103.7194, "eval_samples_per_second": 48.207, "eval_steps_per_second": 1.514, "step": 512 } ], "logging_steps": 1, "max_steps": 512, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 20, "total_flos": 3.6685588640169984e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }