{ "best_metric": null, "best_model_checkpoint": null, "epoch": 14.0, "global_step": 75124, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5e-09, "loss": 10.4893, "step": 1 }, { "epoch": 0.09, "learning_rate": 2.5e-06, "loss": 9.3442, "step": 500 }, { "epoch": 0.19, "learning_rate": 5e-06, "loss": 7.4232, "step": 1000 }, { "epoch": 0.28, "learning_rate": 7.5e-06, "loss": 6.0986, "step": 1500 }, { "epoch": 0.37, "learning_rate": 1e-05, "loss": 5.8257, "step": 2000 }, { "epoch": 0.47, "learning_rate": 1.25e-05, "loss": 5.7081, "step": 2500 }, { "epoch": 0.56, "learning_rate": 1.5e-05, "loss": 5.6336, "step": 3000 }, { "epoch": 0.65, "learning_rate": 1.75e-05, "loss": 5.5724, "step": 3500 }, { "epoch": 0.75, "learning_rate": 2e-05, "loss": 5.529, "step": 4000 }, { "epoch": 0.84, "learning_rate": 2.25e-05, "loss": 5.4913, "step": 4500 }, { "epoch": 0.93, "learning_rate": 2.5e-05, "loss": 5.4578, "step": 5000 }, { "epoch": 1.02, "learning_rate": 2.7500000000000004e-05, "loss": 5.4299, "step": 5500 }, { "epoch": 1.12, "learning_rate": 3e-05, "loss": 5.4036, "step": 6000 }, { "epoch": 1.21, "learning_rate": 3.2500000000000004e-05, "loss": 5.3821, "step": 6500 }, { "epoch": 1.3, "learning_rate": 3.5e-05, "loss": 5.3594, "step": 7000 }, { "epoch": 1.4, "learning_rate": 3.7500000000000003e-05, "loss": 5.3419, "step": 7500 }, { "epoch": 1.49, "learning_rate": 4e-05, "loss": 5.321, "step": 8000 }, { "epoch": 1.58, "learning_rate": 4.2495e-05, "loss": 5.3034, "step": 8500 }, { "epoch": 1.68, "learning_rate": 4.4995000000000005e-05, "loss": 5.2938, "step": 9000 }, { "epoch": 1.77, "learning_rate": 4.7495e-05, "loss": 5.2774, "step": 9500 }, { "epoch": 1.86, "learning_rate": 4.9995000000000005e-05, "loss": 5.2669, "step": 10000 }, { "epoch": 1.96, "learning_rate": 4.99883448792361e-05, "loss": 5.2542, "step": 10500 }, { "epoch": 2.05, "learning_rate": 4.997664295075829e-05, "loss": 5.2418, "step": 11000 }, { "epoch": 2.14, "learning_rate": 4.9964941022280475e-05, "loss": 5.231, "step": 11500 }, { "epoch": 2.24, "learning_rate": 4.995323909380266e-05, "loss": 5.2201, "step": 12000 }, { "epoch": 2.33, "learning_rate": 4.994156056918181e-05, "loss": 5.2114, "step": 12500 }, { "epoch": 2.42, "learning_rate": 4.992985864070399e-05, "loss": 5.2043, "step": 13000 }, { "epoch": 2.52, "learning_rate": 4.9918156712226175e-05, "loss": 5.194, "step": 13500 }, { "epoch": 2.61, "learning_rate": 4.9906454783748366e-05, "loss": 5.1832, "step": 14000 }, { "epoch": 2.7, "learning_rate": 4.989475285527055e-05, "loss": 5.1801, "step": 14500 }, { "epoch": 2.8, "learning_rate": 4.988307433064969e-05, "loss": 5.1721, "step": 15000 }, { "epoch": 2.89, "learning_rate": 4.987137240217188e-05, "loss": 5.1657, "step": 15500 }, { "epoch": 2.98, "learning_rate": 4.9859670473694066e-05, "loss": 5.16, "step": 16000 }, { "epoch": 3.07, "learning_rate": 4.984796854521626e-05, "loss": 5.1548, "step": 16500 }, { "epoch": 3.17, "learning_rate": 4.983629002059539e-05, "loss": 5.1467, "step": 17000 }, { "epoch": 3.26, "learning_rate": 4.982458809211759e-05, "loss": 5.1421, "step": 17500 }, { "epoch": 3.35, "learning_rate": 4.981288616363977e-05, "loss": 5.1356, "step": 18000 }, { "epoch": 3.45, "learning_rate": 4.980118423516196e-05, "loss": 5.1325, "step": 18500 }, { "epoch": 3.54, "learning_rate": 4.97895057105411e-05, "loss": 5.1272, "step": 19000 }, { "epoch": 3.63, "learning_rate": 4.977780378206329e-05, "loss": 5.1207, "step": 19500 }, { "epoch": 3.73, "learning_rate": 4.976610185358547e-05, "loss": 5.1182, "step": 20000 }, { "epoch": 3.82, "learning_rate": 4.975439992510766e-05, "loss": 5.1137, "step": 20500 }, { "epoch": 3.91, "learning_rate": 4.974269799662985e-05, "loss": 5.1099, "step": 21000 }, { "epoch": 4.01, "learning_rate": 4.973101947200899e-05, "loss": 5.1054, "step": 21500 }, { "epoch": 4.1, "learning_rate": 4.971931754353117e-05, "loss": 5.0999, "step": 22000 }, { "epoch": 4.19, "learning_rate": 4.9707615615053363e-05, "loss": 5.0948, "step": 22500 }, { "epoch": 4.29, "learning_rate": 4.9695913686575554e-05, "loss": 5.0925, "step": 23000 }, { "epoch": 4.38, "learning_rate": 4.968423516195469e-05, "loss": 5.0874, "step": 23500 }, { "epoch": 4.47, "learning_rate": 4.967253323347688e-05, "loss": 5.0848, "step": 24000 }, { "epoch": 4.57, "learning_rate": 4.966083130499907e-05, "loss": 5.0815, "step": 24500 }, { "epoch": 4.66, "learning_rate": 4.9649129376521254e-05, "loss": 5.0802, "step": 25000 }, { "epoch": 4.75, "learning_rate": 4.9637450851900395e-05, "loss": 5.0777, "step": 25500 }, { "epoch": 4.85, "learning_rate": 4.962574892342258e-05, "loss": 5.0732, "step": 26000 }, { "epoch": 4.94, "learning_rate": 4.961404699494477e-05, "loss": 5.0705, "step": 26500 }, { "epoch": 5.03, "learning_rate": 4.9602345066466954e-05, "loss": 5.0673, "step": 27000 }, { "epoch": 5.12, "learning_rate": 4.9590666541846095e-05, "loss": 5.0608, "step": 27500 }, { "epoch": 5.22, "learning_rate": 4.9578964613368286e-05, "loss": 5.0599, "step": 28000 }, { "epoch": 5.31, "learning_rate": 4.956726268489047e-05, "loss": 5.0567, "step": 28500 }, { "epoch": 5.4, "learning_rate": 4.955556075641266e-05, "loss": 5.0523, "step": 29000 }, { "epoch": 5.5, "learning_rate": 4.95438822317918e-05, "loss": 5.051, "step": 29500 }, { "epoch": 5.59, "learning_rate": 4.953218030331399e-05, "loss": 5.0469, "step": 30000 }, { "epoch": 5.68, "learning_rate": 4.952047837483618e-05, "loss": 5.0424, "step": 30500 }, { "epoch": 5.78, "learning_rate": 4.950877644635836e-05, "loss": 5.0405, "step": 31000 }, { "epoch": 5.87, "learning_rate": 4.949709792173751e-05, "loss": 5.0073, "step": 31500 }, { "epoch": 5.96, "learning_rate": 4.948539599325969e-05, "loss": 4.6646, "step": 32000 }, { "epoch": 6.06, "learning_rate": 4.9473694064781877e-05, "loss": 4.3483, "step": 32500 }, { "epoch": 6.15, "learning_rate": 4.946199213630406e-05, "loss": 4.0878, "step": 33000 }, { "epoch": 6.24, "learning_rate": 4.945031361168321e-05, "loss": 3.8246, "step": 33500 }, { "epoch": 6.34, "learning_rate": 4.943861168320539e-05, "loss": 3.221, "step": 34000 }, { "epoch": 6.43, "learning_rate": 4.9426909754727577e-05, "loss": 2.7026, "step": 34500 }, { "epoch": 6.52, "learning_rate": 4.9415207826249774e-05, "loss": 2.3592, "step": 35000 }, { "epoch": 6.62, "learning_rate": 4.940352930162891e-05, "loss": 1.9468, "step": 35500 }, { "epoch": 6.71, "learning_rate": 4.939182737315109e-05, "loss": 1.6962, "step": 36000 }, { "epoch": 6.8, "learning_rate": 4.938012544467328e-05, "loss": 1.5455, "step": 36500 }, { "epoch": 6.9, "learning_rate": 4.9368423516195474e-05, "loss": 1.4404, "step": 37000 }, { "epoch": 6.99, "learning_rate": 4.9356744991574615e-05, "loss": 1.3671, "step": 37500 }, { "epoch": 7.08, "learning_rate": 4.93450430630968e-05, "loss": 1.3047, "step": 38000 }, { "epoch": 7.17, "learning_rate": 4.933334113461899e-05, "loss": 1.242, "step": 38500 }, { "epoch": 7.27, "learning_rate": 4.9321639206141174e-05, "loss": 1.1857, "step": 39000 }, { "epoch": 7.36, "learning_rate": 4.930993727766336e-05, "loss": 1.1364, "step": 39500 }, { "epoch": 7.45, "learning_rate": 4.9298258753042506e-05, "loss": 1.0976, "step": 40000 }, { "epoch": 7.55, "learning_rate": 4.928655682456469e-05, "loss": 1.062, "step": 40500 }, { "epoch": 7.64, "learning_rate": 4.9274854896086874e-05, "loss": 1.0284, "step": 41000 }, { "epoch": 7.73, "learning_rate": 4.9263152967609065e-05, "loss": 1.0023, "step": 41500 }, { "epoch": 7.83, "learning_rate": 4.9251474442988206e-05, "loss": 0.98, "step": 42000 }, { "epoch": 7.92, "learning_rate": 4.9239772514510397e-05, "loss": 0.9582, "step": 42500 }, { "epoch": 8.01, "learning_rate": 4.922809398988954e-05, "loss": 0.9424, "step": 43000 }, { "epoch": 8.11, "learning_rate": 4.921639206141173e-05, "loss": 0.9232, "step": 43500 }, { "epoch": 8.2, "learning_rate": 4.920469013293391e-05, "loss": 0.908, "step": 44000 }, { "epoch": 8.29, "learning_rate": 4.9192988204456097e-05, "loss": 0.8941, "step": 44500 }, { "epoch": 8.39, "learning_rate": 4.918128627597828e-05, "loss": 0.8833, "step": 45000 }, { "epoch": 8.48, "learning_rate": 4.916958434750047e-05, "loss": 0.8697, "step": 45500 }, { "epoch": 8.57, "learning_rate": 4.9157882419022655e-05, "loss": 0.8558, "step": 46000 }, { "epoch": 8.67, "learning_rate": 4.9146203894401796e-05, "loss": 0.8465, "step": 46500 }, { "epoch": 8.76, "learning_rate": 4.913450196592399e-05, "loss": 0.8352, "step": 47000 }, { "epoch": 8.85, "learning_rate": 4.912280003744618e-05, "loss": 0.8253, "step": 47500 }, { "epoch": 8.95, "learning_rate": 4.911109810896836e-05, "loss": 0.8135, "step": 48000 }, { "epoch": 9.04, "learning_rate": 4.9099396180490546e-05, "loss": 0.8064, "step": 48500 }, { "epoch": 9.13, "learning_rate": 4.908769425201274e-05, "loss": 0.7971, "step": 49000 }, { "epoch": 9.22, "learning_rate": 4.907601572739188e-05, "loss": 0.7846, "step": 49500 }, { "epoch": 9.32, "learning_rate": 4.906431379891406e-05, "loss": 0.779, "step": 50000 }, { "epoch": 9.41, "learning_rate": 4.9052611870436246e-05, "loss": 0.7717, "step": 50500 }, { "epoch": 9.5, "learning_rate": 4.904090994195844e-05, "loss": 0.7618, "step": 51000 }, { "epoch": 9.6, "learning_rate": 4.902920801348063e-05, "loss": 0.7573, "step": 51500 }, { "epoch": 9.69, "learning_rate": 4.901752948885976e-05, "loss": 0.7505, "step": 52000 }, { "epoch": 9.78, "learning_rate": 4.900582756038195e-05, "loss": 0.7445, "step": 52500 }, { "epoch": 9.88, "learning_rate": 4.8994125631904144e-05, "loss": 0.7389, "step": 53000 }, { "epoch": 9.97, "learning_rate": 4.898242370342633e-05, "loss": 0.7314, "step": 53500 }, { "epoch": 10.06, "learning_rate": 4.897072177494851e-05, "loss": 0.7245, "step": 54000 }, { "epoch": 10.16, "learning_rate": 4.895904325032766e-05, "loss": 0.7193, "step": 54500 }, { "epoch": 10.25, "learning_rate": 4.8947341321849843e-05, "loss": 0.7137, "step": 55000 }, { "epoch": 10.34, "learning_rate": 4.893563939337203e-05, "loss": 0.7083, "step": 55500 }, { "epoch": 10.44, "learning_rate": 4.892393746489422e-05, "loss": 0.7039, "step": 56000 }, { "epoch": 10.53, "learning_rate": 4.891223553641641e-05, "loss": 0.6995, "step": 56500 }, { "epoch": 10.62, "learning_rate": 4.890053360793859e-05, "loss": 0.6941, "step": 57000 }, { "epoch": 10.72, "learning_rate": 4.888885508331773e-05, "loss": 0.6904, "step": 57500 }, { "epoch": 10.81, "learning_rate": 4.8877153154839925e-05, "loss": 0.6846, "step": 58000 }, { "epoch": 10.9, "learning_rate": 4.886545122636211e-05, "loss": 0.6806, "step": 58500 }, { "epoch": 11.0, "learning_rate": 4.885374929788429e-05, "loss": 0.6771, "step": 59000 }, { "epoch": 11.09, "learning_rate": 4.884204736940648e-05, "loss": 0.6726, "step": 59500 }, { "epoch": 11.18, "learning_rate": 4.8830368844785625e-05, "loss": 0.6679, "step": 60000 }, { "epoch": 11.27, "learning_rate": 4.881866691630781e-05, "loss": 0.6634, "step": 60500 }, { "epoch": 11.37, "learning_rate": 4.880696498782999e-05, "loss": 0.6607, "step": 61000 }, { "epoch": 11.46, "learning_rate": 4.8795263059352184e-05, "loss": 0.6568, "step": 61500 }, { "epoch": 11.55, "learning_rate": 4.8783561130874375e-05, "loss": 0.6546, "step": 62000 }, { "epoch": 11.65, "learning_rate": 4.877188260625351e-05, "loss": 0.6527, "step": 62500 }, { "epoch": 11.74, "learning_rate": 4.87601806777757e-05, "loss": 0.6455, "step": 63000 }, { "epoch": 11.83, "learning_rate": 4.874847874929789e-05, "loss": 0.6437, "step": 63500 }, { "epoch": 11.93, "learning_rate": 4.8736776820820074e-05, "loss": 0.6408, "step": 64000 }, { "epoch": 12.02, "learning_rate": 4.872507489234226e-05, "loss": 0.6369, "step": 64500 }, { "epoch": 12.11, "learning_rate": 4.871337296386444e-05, "loss": 0.6336, "step": 65000 }, { "epoch": 12.21, "learning_rate": 4.870167103538664e-05, "loss": 0.63, "step": 65500 }, { "epoch": 12.3, "learning_rate": 4.8689992510765774e-05, "loss": 0.6277, "step": 66000 }, { "epoch": 12.39, "learning_rate": 4.867829058228796e-05, "loss": 0.6265, "step": 66500 }, { "epoch": 12.49, "learning_rate": 4.866658865381015e-05, "loss": 0.6238, "step": 67000 }, { "epoch": 12.58, "learning_rate": 4.865488672533234e-05, "loss": 0.6192, "step": 67500 }, { "epoch": 12.67, "learning_rate": 4.864320820071148e-05, "loss": 0.6165, "step": 68000 }, { "epoch": 12.77, "learning_rate": 4.8631506272233665e-05, "loss": 0.613, "step": 68500 }, { "epoch": 12.86, "learning_rate": 4.8619804343755856e-05, "loss": 0.6124, "step": 69000 }, { "epoch": 12.95, "learning_rate": 4.860810241527804e-05, "loss": 0.6111, "step": 69500 }, { "epoch": 13.05, "learning_rate": 4.8596400486800224e-05, "loss": 0.607, "step": 70000 }, { "epoch": 13.14, "learning_rate": 4.858469855832241e-05, "loss": 0.602, "step": 70500 }, { "epoch": 13.23, "learning_rate": 4.8573020033701556e-05, "loss": 0.602, "step": 71000 }, { "epoch": 13.32, "learning_rate": 4.856131810522374e-05, "loss": 0.6008, "step": 71500 }, { "epoch": 13.42, "learning_rate": 4.854961617674593e-05, "loss": 0.5974, "step": 72000 }, { "epoch": 13.51, "learning_rate": 4.853791424826812e-05, "loss": 0.5939, "step": 72500 }, { "epoch": 13.6, "learning_rate": 4.8526212319790305e-05, "loss": 0.5942, "step": 73000 }, { "epoch": 13.7, "learning_rate": 4.851451039131249e-05, "loss": 0.5902, "step": 73500 }, { "epoch": 13.79, "learning_rate": 4.850283186669163e-05, "loss": 0.5893, "step": 74000 }, { "epoch": 13.88, "learning_rate": 4.849112993821382e-05, "loss": 0.5889, "step": 74500 }, { "epoch": 13.98, "learning_rate": 4.8479428009736005e-05, "loss": 0.5864, "step": 75000 } ], "max_steps": 2146400, "num_train_epochs": 400, "total_flos": 2.024730999544978e+19, "trial_name": null, "trial_params": null }