{ "best_metric": null, "best_model_checkpoint": null, "epoch": 50.0, "eval_steps": 50, "global_step": 5300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.47, "learning_rate": 8.679245283018868e-07, "loss": 130.658, "step": 50 }, { "epoch": 0.47, "eval_runtime": 2.1932, "eval_samples_per_second": 327.381, "eval_steps_per_second": 10.487, "step": 50 }, { "epoch": 0.94, "learning_rate": 2.716981132075472e-06, "loss": 119.614, "step": 100 }, { "epoch": 0.94, "eval_runtime": 1.9516, "eval_samples_per_second": 367.905, "eval_steps_per_second": 11.785, "step": 100 }, { "epoch": 1.42, "learning_rate": 4.603773584905661e-06, "loss": 84.4188, "step": 150 }, { "epoch": 1.42, "eval_runtime": 1.9342, "eval_samples_per_second": 371.215, "eval_steps_per_second": 11.891, "step": 150 }, { "epoch": 1.89, "learning_rate": 6.490566037735849e-06, "loss": 53.1678, "step": 200 }, { "epoch": 1.89, "eval_runtime": 1.9027, "eval_samples_per_second": 377.349, "eval_steps_per_second": 12.088, "step": 200 }, { "epoch": 2.36, "learning_rate": 8.377358490566039e-06, "loss": 33.2104, "step": 250 }, { "epoch": 2.36, "eval_runtime": 1.9458, "eval_samples_per_second": 368.991, "eval_steps_per_second": 11.82, "step": 250 }, { "epoch": 2.83, "learning_rate": 1.0264150943396227e-05, "loss": 28.033, "step": 300 }, { "epoch": 2.83, "eval_runtime": 1.9188, "eval_samples_per_second": 374.183, "eval_steps_per_second": 11.986, "step": 300 }, { "epoch": 3.3, "learning_rate": 1.2150943396226416e-05, "loss": 22.0028, "step": 350 }, { "epoch": 3.3, "eval_runtime": 2.939, "eval_samples_per_second": 244.303, "eval_steps_per_second": 7.826, "step": 350 }, { "epoch": 3.77, "learning_rate": 1.4037735849056604e-05, "loss": 17.8244, "step": 400 }, { "epoch": 3.77, "eval_runtime": 2.0678, "eval_samples_per_second": 347.226, "eval_steps_per_second": 11.123, "step": 400 }, { "epoch": 4.25, "learning_rate": 1.5924528301886794e-05, "loss": 14.5603, "step": 450 }, { "epoch": 4.25, "eval_runtime": 2.0344, "eval_samples_per_second": 352.926, "eval_steps_per_second": 11.305, "step": 450 }, { "epoch": 4.72, "learning_rate": 1.7811320754716983e-05, "loss": 11.8745, "step": 500 }, { "epoch": 4.72, "eval_runtime": 2.0323, "eval_samples_per_second": 353.287, "eval_steps_per_second": 11.317, "step": 500 }, { "epoch": 5.19, "learning_rate": 1.969811320754717e-05, "loss": 9.6856, "step": 550 }, { "epoch": 5.19, "eval_runtime": 2.0201, "eval_samples_per_second": 355.425, "eval_steps_per_second": 11.385, "step": 550 }, { "epoch": 5.66, "learning_rate": 1.999617436193858e-05, "loss": 7.8251, "step": 600 }, { "epoch": 5.66, "eval_runtime": 2.0209, "eval_samples_per_second": 355.286, "eval_steps_per_second": 11.381, "step": 600 }, { "epoch": 6.13, "learning_rate": 1.9981648322537017e-05, "loss": 6.5604, "step": 650 }, { "epoch": 6.13, "eval_runtime": 1.9676, "eval_samples_per_second": 364.903, "eval_steps_per_second": 11.689, "step": 650 }, { "epoch": 6.6, "learning_rate": 1.995629882570864e-05, "loss": 5.9968, "step": 700 }, { "epoch": 6.6, "eval_runtime": 1.933, "eval_samples_per_second": 371.443, "eval_steps_per_second": 11.899, "step": 700 }, { "epoch": 7.08, "learning_rate": 1.992015335881735e-05, "loss": 5.3714, "step": 750 }, { "epoch": 7.08, "eval_runtime": 1.928, "eval_samples_per_second": 372.4, "eval_steps_per_second": 11.929, "step": 750 }, { "epoch": 7.55, "learning_rate": 1.9873251115682577e-05, "loss": 5.0835, "step": 800 }, { "epoch": 7.55, "eval_runtime": 1.9194, "eval_samples_per_second": 374.085, "eval_steps_per_second": 11.983, "step": 800 }, { "epoch": 8.02, "learning_rate": 1.9815642954080055e-05, "loss": 4.8417, "step": 850 }, { "epoch": 8.02, "eval_runtime": 1.9205, "eval_samples_per_second": 373.866, "eval_steps_per_second": 11.976, "step": 850 }, { "epoch": 8.49, "learning_rate": 1.97473913405949e-05, "loss": 4.651, "step": 900 }, { "epoch": 8.49, "eval_runtime": 1.9167, "eval_samples_per_second": 374.597, "eval_steps_per_second": 12.0, "step": 900 }, { "epoch": 8.96, "learning_rate": 1.966857028288687e-05, "loss": 4.5806, "step": 950 }, { "epoch": 8.96, "eval_runtime": 1.9211, "eval_samples_per_second": 373.737, "eval_steps_per_second": 11.972, "step": 950 }, { "epoch": 9.43, "learning_rate": 1.9579265249441216e-05, "loss": 4.5062, "step": 1000 }, { "epoch": 9.43, "eval_runtime": 1.9281, "eval_samples_per_second": 372.389, "eval_steps_per_second": 11.929, "step": 1000 }, { "epoch": 9.91, "learning_rate": 1.9479573076892152e-05, "loss": 4.4466, "step": 1050 }, { "epoch": 9.91, "eval_runtime": 1.9203, "eval_samples_per_second": 373.895, "eval_steps_per_second": 11.977, "step": 1050 }, { "epoch": 10.38, "learning_rate": 1.9369601865019452e-05, "loss": 4.4225, "step": 1100 }, { "epoch": 10.38, "eval_runtime": 1.9182, "eval_samples_per_second": 374.311, "eval_steps_per_second": 11.99, "step": 1100 }, { "epoch": 10.85, "learning_rate": 1.9249470859531976e-05, "loss": 4.4022, "step": 1150 }, { "epoch": 10.85, "eval_runtime": 1.9262, "eval_samples_per_second": 372.754, "eval_steps_per_second": 11.941, "step": 1150 }, { "epoch": 11.32, "learning_rate": 1.9119310322765315e-05, "loss": 4.3985, "step": 1200 }, { "epoch": 11.32, "eval_runtime": 1.94, "eval_samples_per_second": 370.112, "eval_steps_per_second": 11.856, "step": 1200 }, { "epoch": 11.79, "learning_rate": 1.8979261392433685e-05, "loss": 4.3822, "step": 1250 }, { "epoch": 11.79, "eval_runtime": 1.9393, "eval_samples_per_second": 370.229, "eval_steps_per_second": 11.86, "step": 1250 }, { "epoch": 12.26, "learning_rate": 1.8829475928589272e-05, "loss": 4.3754, "step": 1300 }, { "epoch": 12.26, "eval_runtime": 1.9864, "eval_samples_per_second": 361.455, "eval_steps_per_second": 11.579, "step": 1300 }, { "epoch": 12.74, "learning_rate": 1.8670116348954945e-05, "loss": 4.3684, "step": 1350 }, { "epoch": 12.74, "eval_runtime": 1.9783, "eval_samples_per_second": 362.937, "eval_steps_per_second": 11.626, "step": 1350 }, { "epoch": 13.21, "learning_rate": 1.850135545280894e-05, "loss": 4.3725, "step": 1400 }, { "epoch": 13.21, "eval_runtime": 1.9796, "eval_samples_per_second": 362.707, "eval_steps_per_second": 11.619, "step": 1400 }, { "epoch": 13.68, "learning_rate": 1.832337623361242e-05, "loss": 4.3738, "step": 1450 }, { "epoch": 13.68, "eval_runtime": 1.9672, "eval_samples_per_second": 364.987, "eval_steps_per_second": 11.692, "step": 1450 }, { "epoch": 14.15, "learning_rate": 1.8136371680583176e-05, "loss": 4.3524, "step": 1500 }, { "epoch": 14.15, "eval_runtime": 1.917, "eval_samples_per_second": 374.54, "eval_steps_per_second": 11.998, "step": 1500 }, { "epoch": 14.62, "learning_rate": 1.7940544569430468e-05, "loss": 4.3487, "step": 1550 }, { "epoch": 14.62, "eval_runtime": 1.9212, "eval_samples_per_second": 373.718, "eval_steps_per_second": 11.971, "step": 1550 }, { "epoch": 15.09, "learning_rate": 1.7736107242478143e-05, "loss": 4.347, "step": 1600 }, { "epoch": 15.09, "eval_runtime": 1.9221, "eval_samples_per_second": 373.543, "eval_steps_per_second": 11.966, "step": 1600 }, { "epoch": 15.57, "learning_rate": 1.7523281378414246e-05, "loss": 4.3417, "step": 1650 }, { "epoch": 15.57, "eval_runtime": 2.1716, "eval_samples_per_second": 330.631, "eval_steps_per_second": 10.591, "step": 1650 }, { "epoch": 16.04, "learning_rate": 1.730229775191693e-05, "loss": 4.3224, "step": 1700 }, { "epoch": 16.04, "eval_runtime": 1.9519, "eval_samples_per_second": 367.842, "eval_steps_per_second": 11.783, "step": 1700 }, { "epoch": 16.51, "learning_rate": 1.7073395983417227e-05, "loss": 4.3169, "step": 1750 }, { "epoch": 16.51, "eval_runtime": 1.9446, "eval_samples_per_second": 369.221, "eval_steps_per_second": 11.827, "step": 1750 }, { "epoch": 16.98, "learning_rate": 1.6836824279270053e-05, "loss": 4.3111, "step": 1800 }, { "epoch": 16.98, "eval_runtime": 1.9452, "eval_samples_per_second": 369.117, "eval_steps_per_second": 11.824, "step": 1800 }, { "epoch": 17.45, "learning_rate": 1.6592839162615223e-05, "loss": 4.3045, "step": 1850 }, { "epoch": 17.45, "eval_runtime": 1.9364, "eval_samples_per_second": 370.798, "eval_steps_per_second": 11.878, "step": 1850 }, { "epoch": 17.92, "learning_rate": 1.6341705195220257e-05, "loss": 4.2946, "step": 1900 }, { "epoch": 17.92, "eval_runtime": 1.9407, "eval_samples_per_second": 369.973, "eval_steps_per_second": 11.852, "step": 1900 }, { "epoch": 18.4, "learning_rate": 1.6083694690606592e-05, "loss": 4.2868, "step": 1950 }, { "epoch": 18.4, "eval_runtime": 1.9401, "eval_samples_per_second": 370.079, "eval_steps_per_second": 11.855, "step": 1950 }, { "epoch": 18.87, "learning_rate": 1.581908741877034e-05, "loss": 4.2791, "step": 2000 }, { "epoch": 18.87, "eval_runtime": 1.9388, "eval_samples_per_second": 370.327, "eval_steps_per_second": 11.863, "step": 2000 }, { "epoch": 19.34, "learning_rate": 1.5548170302817683e-05, "loss": 4.2596, "step": 2050 }, { "epoch": 19.34, "eval_runtime": 1.9504, "eval_samples_per_second": 368.122, "eval_steps_per_second": 11.792, "step": 2050 }, { "epoch": 19.81, "learning_rate": 1.5271237107843925e-05, "loss": 4.253, "step": 2100 }, { "epoch": 19.81, "eval_runtime": 1.9966, "eval_samples_per_second": 359.617, "eval_steps_per_second": 11.52, "step": 2100 }, { "epoch": 20.28, "learning_rate": 1.4988588122393497e-05, "loss": 4.2556, "step": 2150 }, { "epoch": 20.28, "eval_runtime": 2.0261, "eval_samples_per_second": 354.367, "eval_steps_per_second": 11.352, "step": 2150 }, { "epoch": 20.75, "learning_rate": 1.47005298328464e-05, "loss": 4.2289, "step": 2200 }, { "epoch": 20.75, "eval_runtime": 2.0081, "eval_samples_per_second": 357.56, "eval_steps_per_second": 11.454, "step": 2200 }, { "epoch": 21.23, "learning_rate": 1.4407374591084064e-05, "loss": 4.211, "step": 2250 }, { "epoch": 21.23, "eval_runtime": 1.925, "eval_samples_per_second": 372.984, "eval_steps_per_second": 11.948, "step": 2250 }, { "epoch": 21.7, "learning_rate": 1.4109440275795071e-05, "loss": 4.2125, "step": 2300 }, { "epoch": 21.7, "eval_runtime": 1.9426, "eval_samples_per_second": 369.601, "eval_steps_per_second": 11.84, "step": 2300 }, { "epoch": 22.17, "learning_rate": 1.3807049947787954e-05, "loss": 4.1935, "step": 2350 }, { "epoch": 22.17, "eval_runtime": 1.944, "eval_samples_per_second": 369.342, "eval_steps_per_second": 11.831, "step": 2350 }, { "epoch": 22.64, "learning_rate": 1.3500531499684819e-05, "loss": 4.1803, "step": 2400 }, { "epoch": 22.64, "eval_runtime": 1.9484, "eval_samples_per_second": 368.517, "eval_steps_per_second": 11.805, "step": 2400 }, { "epoch": 23.11, "learning_rate": 1.3190217300375694e-05, "loss": 4.173, "step": 2450 }, { "epoch": 23.11, "eval_runtime": 1.9375, "eval_samples_per_second": 370.574, "eval_steps_per_second": 11.871, "step": 2450 }, { "epoch": 23.58, "learning_rate": 1.2876443834619066e-05, "loss": 4.1564, "step": 2500 }, { "epoch": 23.58, "eval_runtime": 1.9431, "eval_samples_per_second": 369.514, "eval_steps_per_second": 11.837, "step": 2500 }, { "epoch": 24.06, "learning_rate": 1.2559551338179468e-05, "loss": 4.1486, "step": 2550 }, { "epoch": 24.06, "eval_runtime": 1.9508, "eval_samples_per_second": 368.046, "eval_steps_per_second": 11.79, "step": 2550 }, { "epoch": 24.53, "learning_rate": 1.2239883428897687e-05, "loss": 4.1223, "step": 2600 }, { "epoch": 24.53, "eval_runtime": 1.9403, "eval_samples_per_second": 370.043, "eval_steps_per_second": 11.854, "step": 2600 }, { "epoch": 25.0, "learning_rate": 1.1917786734093682e-05, "loss": 4.1107, "step": 2650 }, { "epoch": 25.0, "eval_runtime": 1.9435, "eval_samples_per_second": 369.437, "eval_steps_per_second": 11.834, "step": 2650 }, { "epoch": 25.47, "learning_rate": 1.1593610514706217e-05, "loss": 4.0779, "step": 2700 }, { "epoch": 25.47, "eval_runtime": 1.9405, "eval_samples_per_second": 370.004, "eval_steps_per_second": 11.853, "step": 2700 }, { "epoch": 25.94, "learning_rate": 1.1267706286576759e-05, "loss": 4.0284, "step": 2750 }, { "epoch": 25.94, "eval_runtime": 1.944, "eval_samples_per_second": 369.342, "eval_steps_per_second": 11.831, "step": 2750 }, { "epoch": 26.42, "learning_rate": 1.094042743928831e-05, "loss": 3.9635, "step": 2800 }, { "epoch": 26.42, "eval_runtime": 3.7643, "eval_samples_per_second": 190.741, "eval_steps_per_second": 6.11, "step": 2800 }, { "epoch": 26.89, "learning_rate": 1.0612128852972474e-05, "loss": 3.9045, "step": 2850 }, { "epoch": 26.89, "eval_runtime": 1.9311, "eval_samples_per_second": 371.804, "eval_steps_per_second": 11.91, "step": 2850 }, { "epoch": 27.36, "learning_rate": 1.0283166513500267e-05, "loss": 3.8146, "step": 2900 }, { "epoch": 27.36, "eval_runtime": 1.9224, "eval_samples_per_second": 373.485, "eval_steps_per_second": 11.964, "step": 2900 }, { "epoch": 27.83, "learning_rate": 9.953897126473933e-06, "loss": 3.7245, "step": 2950 }, { "epoch": 27.83, "eval_runtime": 1.9864, "eval_samples_per_second": 361.449, "eval_steps_per_second": 11.578, "step": 2950 }, { "epoch": 28.3, "learning_rate": 9.624677730438344e-06, "loss": 3.6334, "step": 3000 }, { "epoch": 28.3, "eval_runtime": 1.9535, "eval_samples_per_second": 367.553, "eval_steps_per_second": 11.774, "step": 3000 }, { "epoch": 28.77, "learning_rate": 9.295865309731342e-06, "loss": 3.5344, "step": 3050 }, { "epoch": 28.77, "eval_runtime": 1.9514, "eval_samples_per_second": 367.939, "eval_steps_per_second": 11.786, "step": 3050 }, { "epoch": 29.25, "learning_rate": 8.96781640739291e-06, "loss": 3.4395, "step": 3100 }, { "epoch": 29.25, "eval_runtime": 1.9254, "eval_samples_per_second": 372.918, "eval_steps_per_second": 11.946, "step": 3100 }, { "epoch": 29.72, "learning_rate": 8.64088673855282e-06, "loss": 3.3242, "step": 3150 }, { "epoch": 29.72, "eval_runtime": 1.9225, "eval_samples_per_second": 373.464, "eval_steps_per_second": 11.963, "step": 3150 }, { "epoch": 30.19, "learning_rate": 8.315430804716022e-06, "loss": 3.205, "step": 3200 }, { "epoch": 30.19, "eval_runtime": 1.9123, "eval_samples_per_second": 375.463, "eval_steps_per_second": 12.027, "step": 3200 }, { "epoch": 30.66, "learning_rate": 7.991801509364023e-06, "loss": 3.1021, "step": 3250 }, { "epoch": 30.66, "eval_runtime": 1.9257, "eval_samples_per_second": 372.855, "eval_steps_per_second": 11.944, "step": 3250 }, { "epoch": 31.13, "learning_rate": 7.670349775289047e-06, "loss": 2.9841, "step": 3300 }, { "epoch": 31.13, "eval_runtime": 1.9263, "eval_samples_per_second": 372.732, "eval_steps_per_second": 11.94, "step": 3300 }, { "epoch": 31.6, "learning_rate": 7.3514241640759175e-06, "loss": 2.8855, "step": 3350 }, { "epoch": 31.6, "eval_runtime": 1.9276, "eval_samples_per_second": 372.486, "eval_steps_per_second": 11.932, "step": 3350 }, { "epoch": 32.08, "learning_rate": 7.035370498144325e-06, "loss": 2.8144, "step": 3400 }, { "epoch": 32.08, "eval_runtime": 1.9176, "eval_samples_per_second": 374.418, "eval_steps_per_second": 11.994, "step": 3400 }, { "epoch": 32.55, "learning_rate": 6.722531485761199e-06, "loss": 2.701, "step": 3450 }, { "epoch": 32.55, "eval_runtime": 1.9236, "eval_samples_per_second": 373.25, "eval_steps_per_second": 11.956, "step": 3450 }, { "epoch": 33.02, "learning_rate": 6.413246349429934e-06, "loss": 2.615, "step": 3500 }, { "epoch": 33.02, "eval_runtime": 1.9223, "eval_samples_per_second": 373.516, "eval_steps_per_second": 11.965, "step": 3500 }, { "epoch": 33.49, "learning_rate": 6.107850458059322e-06, "loss": 2.5553, "step": 3550 }, { "epoch": 33.49, "eval_runtime": 1.9299, "eval_samples_per_second": 372.031, "eval_steps_per_second": 11.917, "step": 3550 }, { "epoch": 33.96, "learning_rate": 5.8066749633110675e-06, "loss": 2.4601, "step": 3600 }, { "epoch": 33.96, "eval_runtime": 1.9178, "eval_samples_per_second": 374.392, "eval_steps_per_second": 11.993, "step": 3600 }, { "epoch": 34.43, "learning_rate": 5.510046440520228e-06, "loss": 2.4033, "step": 3650 }, { "epoch": 34.43, "eval_runtime": 1.9204, "eval_samples_per_second": 373.879, "eval_steps_per_second": 11.977, "step": 3650 }, { "epoch": 34.91, "learning_rate": 5.218286534577938e-06, "loss": 2.3601, "step": 3700 }, { "epoch": 34.91, "eval_runtime": 1.913, "eval_samples_per_second": 375.322, "eval_steps_per_second": 12.023, "step": 3700 }, { "epoch": 35.38, "learning_rate": 4.93171161116037e-06, "loss": 2.3119, "step": 3750 }, { "epoch": 35.38, "eval_runtime": 1.9156, "eval_samples_per_second": 374.813, "eval_steps_per_second": 12.007, "step": 3750 }, { "epoch": 35.85, "learning_rate": 4.656198158017416e-06, "loss": 2.2631, "step": 3800 }, { "epoch": 35.85, "eval_runtime": 1.9287, "eval_samples_per_second": 372.271, "eval_steps_per_second": 11.925, "step": 3800 }, { "epoch": 36.32, "learning_rate": 4.380800513736635e-06, "loss": 2.2201, "step": 3850 }, { "epoch": 36.32, "eval_runtime": 1.9261, "eval_samples_per_second": 372.768, "eval_steps_per_second": 11.941, "step": 3850 }, { "epoch": 36.79, "learning_rate": 4.111495967954926e-06, "loss": 2.1818, "step": 3900 }, { "epoch": 36.79, "eval_runtime": 1.9179, "eval_samples_per_second": 374.378, "eval_steps_per_second": 11.993, "step": 3900 }, { "epoch": 37.26, "learning_rate": 3.848576537200217e-06, "loss": 2.1282, "step": 3950 }, { "epoch": 37.26, "eval_runtime": 1.921, "eval_samples_per_second": 373.772, "eval_steps_per_second": 11.973, "step": 3950 }, { "epoch": 37.74, "learning_rate": 3.5923273143923885e-06, "loss": 2.1174, "step": 4000 }, { "epoch": 37.74, "eval_runtime": 1.9202, "eval_samples_per_second": 373.92, "eval_steps_per_second": 11.978, "step": 4000 }, { "epoch": 38.21, "learning_rate": 3.343026159706837e-06, "loss": 2.091, "step": 4050 }, { "epoch": 38.21, "eval_runtime": 1.9212, "eval_samples_per_second": 373.727, "eval_steps_per_second": 11.972, "step": 4050 }, { "epoch": 38.68, "learning_rate": 3.1009433992807925e-06, "loss": 2.0562, "step": 4100 }, { "epoch": 38.68, "eval_runtime": 1.9201, "eval_samples_per_second": 373.932, "eval_steps_per_second": 11.978, "step": 4100 }, { "epoch": 39.15, "learning_rate": 2.8709585976496825e-06, "loss": 2.0276, "step": 4150 }, { "epoch": 39.15, "eval_runtime": 1.9126, "eval_samples_per_second": 375.405, "eval_steps_per_second": 12.025, "step": 4150 }, { "epoch": 39.62, "learning_rate": 2.6439348660701634e-06, "loss": 2.0138, "step": 4200 }, { "epoch": 39.62, "eval_runtime": 1.918, "eval_samples_per_second": 374.351, "eval_steps_per_second": 11.992, "step": 4200 }, { "epoch": 40.09, "learning_rate": 2.424887578383799e-06, "loss": 1.9901, "step": 4250 }, { "epoch": 40.09, "eval_runtime": 2.0806, "eval_samples_per_second": 345.101, "eval_steps_per_second": 11.055, "step": 4250 }, { "epoch": 40.57, "learning_rate": 2.21405425538036e-06, "loss": 1.9743, "step": 4300 }, { "epoch": 40.57, "eval_runtime": 2.0154, "eval_samples_per_second": 356.253, "eval_steps_per_second": 11.412, "step": 4300 }, { "epoch": 41.04, "learning_rate": 2.011663511154628e-06, "loss": 1.9583, "step": 4350 }, { "epoch": 41.04, "eval_runtime": 2.7415, "eval_samples_per_second": 261.903, "eval_steps_per_second": 8.39, "step": 4350 }, { "epoch": 41.51, "learning_rate": 1.817934805211976e-06, "loss": 1.9442, "step": 4400 }, { "epoch": 41.51, "eval_runtime": 1.9318, "eval_samples_per_second": 371.67, "eval_steps_per_second": 11.906, "step": 4400 }, { "epoch": 41.98, "learning_rate": 1.6330782045006088e-06, "loss": 1.9376, "step": 4450 }, { "epoch": 41.98, "eval_runtime": 1.9289, "eval_samples_per_second": 372.225, "eval_steps_per_second": 11.924, "step": 4450 }, { "epoch": 42.45, "learning_rate": 1.457294155628457e-06, "loss": 1.919, "step": 4500 }, { "epoch": 42.45, "eval_runtime": 1.9343, "eval_samples_per_second": 371.203, "eval_steps_per_second": 11.891, "step": 4500 }, { "epoch": 42.92, "learning_rate": 1.2907732675117878e-06, "loss": 1.9033, "step": 4550 }, { "epoch": 42.92, "eval_runtime": 1.9245, "eval_samples_per_second": 373.093, "eval_steps_per_second": 11.951, "step": 4550 }, { "epoch": 43.4, "learning_rate": 1.1336961046911443e-06, "loss": 1.8907, "step": 4600 }, { "epoch": 43.4, "eval_runtime": 1.9188, "eval_samples_per_second": 374.186, "eval_steps_per_second": 11.986, "step": 4600 }, { "epoch": 43.87, "learning_rate": 9.862329915387669e-07, "loss": 1.8839, "step": 4650 }, { "epoch": 43.87, "eval_runtime": 1.9284, "eval_samples_per_second": 372.325, "eval_steps_per_second": 11.927, "step": 4650 }, { "epoch": 44.34, "learning_rate": 8.485438275698154e-07, "loss": 1.8705, "step": 4700 }, { "epoch": 44.34, "eval_runtime": 1.9257, "eval_samples_per_second": 372.846, "eval_steps_per_second": 11.944, "step": 4700 }, { "epoch": 44.81, "learning_rate": 7.207779140576066e-07, "loss": 1.8764, "step": 4750 }, { "epoch": 44.81, "eval_runtime": 1.9191, "eval_samples_per_second": 374.139, "eval_steps_per_second": 11.985, "step": 4750 }, { "epoch": 45.28, "learning_rate": 6.030737921409169e-07, "loss": 1.8671, "step": 4800 }, { "epoch": 45.28, "eval_runtime": 1.9182, "eval_samples_per_second": 374.313, "eval_steps_per_second": 11.991, "step": 4800 }, { "epoch": 45.75, "learning_rate": 4.955590925988896e-07, "loss": 1.8624, "step": 4850 }, { "epoch": 45.75, "eval_runtime": 1.9234, "eval_samples_per_second": 373.294, "eval_steps_per_second": 11.958, "step": 4850 }, { "epoch": 46.23, "learning_rate": 3.983503974564229e-07, "loss": 1.8639, "step": 4900 }, { "epoch": 46.23, "eval_runtime": 1.9237, "eval_samples_per_second": 373.238, "eval_steps_per_second": 11.956, "step": 4900 }, { "epoch": 46.7, "learning_rate": 3.115531135701155e-07, "loss": 1.8532, "step": 4950 }, { "epoch": 46.7, "eval_runtime": 1.9206, "eval_samples_per_second": 373.851, "eval_steps_per_second": 11.976, "step": 4950 }, { "epoch": 47.17, "learning_rate": 2.3526135833186527e-07, "loss": 1.849, "step": 5000 }, { "epoch": 47.17, "eval_runtime": 1.9156, "eval_samples_per_second": 374.809, "eval_steps_per_second": 12.006, "step": 5000 }, { "epoch": 47.64, "learning_rate": 1.6955785761400444e-07, "loss": 1.844, "step": 5050 }, { "epoch": 47.64, "eval_runtime": 1.9207, "eval_samples_per_second": 373.812, "eval_steps_per_second": 11.974, "step": 5050 }, { "epoch": 48.11, "learning_rate": 1.145138560667003e-07, "loss": 1.8538, "step": 5100 }, { "epoch": 48.11, "eval_runtime": 1.9259, "eval_samples_per_second": 372.809, "eval_steps_per_second": 11.942, "step": 5100 }, { "epoch": 48.58, "learning_rate": 7.018903986483083e-08, "loss": 1.8575, "step": 5150 }, { "epoch": 48.58, "eval_runtime": 1.9229, "eval_samples_per_second": 373.387, "eval_steps_per_second": 11.961, "step": 5150 }, { "epoch": 49.06, "learning_rate": 3.663147198813666e-08, "loss": 1.8503, "step": 5200 }, { "epoch": 49.06, "eval_runtime": 1.9176, "eval_samples_per_second": 374.432, "eval_steps_per_second": 11.994, "step": 5200 }, { "epoch": 49.53, "learning_rate": 1.3877540104818566e-08, "loss": 1.8424, "step": 5250 }, { "epoch": 49.53, "eval_runtime": 1.9158, "eval_samples_per_second": 374.782, "eval_steps_per_second": 12.006, "step": 5250 }, { "epoch": 50.0, "learning_rate": 1.951917115091684e-09, "loss": 1.8519, "step": 5300 }, { "epoch": 50.0, "eval_runtime": 1.9163, "eval_samples_per_second": 374.689, "eval_steps_per_second": 12.003, "step": 5300 }, { "epoch": 50.0, "step": 5300, "total_flos": 3.2498053856362496e+16, "train_loss": 7.9858926362811395, "train_runtime": 3481.491, "train_samples_per_second": 48.6, "train_steps_per_second": 1.522 } ], "logging_steps": 50, "max_steps": 5300, "num_train_epochs": 50, "save_steps": 50, "total_flos": 3.2498053856362496e+16, "trial_name": null, "trial_params": null }