{ "best_metric": 0.6349581480026245, "best_model_checkpoint": "nllb_200_distilled_1.3B_ENtoFO_bsz_64_epochs_10lr0.0001/checkpoint-5500", "epoch": 1.9329826141393285, "eval_steps": 500, "global_step": 5500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.035145138438896883, "grad_norm": 0.7466627955436707, "learning_rate": 2e-05, "loss": 1.2984, "step": 100 }, { "epoch": 0.07029027687779377, "grad_norm": 0.8146196603775024, "learning_rate": 4e-05, "loss": 0.9825, "step": 200 }, { "epoch": 0.10543541531669065, "grad_norm": 0.8888100981712341, "learning_rate": 6e-05, "loss": 0.9552, "step": 300 }, { "epoch": 0.14058055375558753, "grad_norm": 0.7786069512367249, "learning_rate": 8e-05, "loss": 0.9249, "step": 400 }, { "epoch": 0.17572569219448442, "grad_norm": 0.7248039245605469, "learning_rate": 0.0001, "loss": 0.9121, "step": 500 }, { "epoch": 0.17572569219448442, "eval_bleu": 39.5852, "eval_chrf++": 58.6112, "eval_gen_len": 17.7742, "eval_loss": 0.8167479634284973, "eval_runtime": 3590.7671, "eval_samples_per_second": 2.039, "eval_steps_per_second": 1.02, "step": 500 }, { "epoch": 0.2108708306333813, "grad_norm": 0.6579689979553223, "learning_rate": 9.964221824686941e-05, "loss": 0.9033, "step": 600 }, { "epoch": 0.24601596907227818, "grad_norm": 0.7566829919815063, "learning_rate": 9.928443649373882e-05, "loss": 0.89, "step": 700 }, { "epoch": 0.28116110751117507, "grad_norm": 0.750449001789093, "learning_rate": 9.892665474060824e-05, "loss": 0.8709, "step": 800 }, { "epoch": 0.31630624595007195, "grad_norm": 0.6458595395088196, "learning_rate": 9.856887298747764e-05, "loss": 0.855, "step": 900 }, { "epoch": 0.35145138438896883, "grad_norm": 0.6504934430122375, "learning_rate": 9.821109123434705e-05, "loss": 0.8558, "step": 1000 }, { "epoch": 0.35145138438896883, "eval_bleu": 41.4406, "eval_chrf++": 60.2627, "eval_gen_len": 17.9384, "eval_loss": 0.7587867975234985, "eval_runtime": 3432.3677, "eval_samples_per_second": 2.133, "eval_steps_per_second": 1.067, "step": 1000 }, { "epoch": 0.3865965228278657, "grad_norm": 0.7932488918304443, "learning_rate": 9.785330948121646e-05, "loss": 0.8455, "step": 1100 }, { "epoch": 0.4217416612667626, "grad_norm": 0.7324668765068054, "learning_rate": 9.749552772808587e-05, "loss": 0.8363, "step": 1200 }, { "epoch": 0.4568867997056595, "grad_norm": 0.7606379985809326, "learning_rate": 9.713774597495528e-05, "loss": 0.816, "step": 1300 }, { "epoch": 0.49203193814455637, "grad_norm": 0.5376294851303101, "learning_rate": 9.677996422182469e-05, "loss": 0.8058, "step": 1400 }, { "epoch": 0.5271770765834533, "grad_norm": 0.903544008731842, "learning_rate": 9.64221824686941e-05, "loss": 0.8106, "step": 1500 }, { "epoch": 0.5271770765834533, "eval_bleu": 41.9609, "eval_chrf++": 60.633, "eval_gen_len": 17.8868, "eval_loss": 0.7227240800857544, "eval_runtime": 3519.7091, "eval_samples_per_second": 2.08, "eval_steps_per_second": 1.04, "step": 1500 }, { "epoch": 0.5623222150223501, "grad_norm": 0.6048879027366638, "learning_rate": 9.606440071556351e-05, "loss": 0.7833, "step": 1600 }, { "epoch": 0.597467353461247, "grad_norm": 0.6456710696220398, "learning_rate": 9.570661896243292e-05, "loss": 0.7913, "step": 1700 }, { "epoch": 0.6326124919001439, "grad_norm": 0.6244344711303711, "learning_rate": 9.534883720930233e-05, "loss": 0.7991, "step": 1800 }, { "epoch": 0.6677576303390408, "grad_norm": 0.6789991855621338, "learning_rate": 9.499105545617174e-05, "loss": 0.7764, "step": 1900 }, { "epoch": 0.7029027687779377, "grad_norm": 0.7175530195236206, "learning_rate": 9.463327370304115e-05, "loss": 0.782, "step": 2000 }, { "epoch": 0.7029027687779377, "eval_bleu": 42.6374, "eval_chrf++": 61.2613, "eval_gen_len": 17.7972, "eval_loss": 0.7011950612068176, "eval_runtime": 3436.4901, "eval_samples_per_second": 2.13, "eval_steps_per_second": 1.065, "step": 2000 }, { "epoch": 0.7380479072168346, "grad_norm": 0.5626235604286194, "learning_rate": 9.427549194991055e-05, "loss": 0.7717, "step": 2100 }, { "epoch": 0.7731930456557314, "grad_norm": 0.6301568746566772, "learning_rate": 9.391771019677997e-05, "loss": 0.7697, "step": 2200 }, { "epoch": 0.8083381840946283, "grad_norm": 0.5354183316230774, "learning_rate": 9.355992844364938e-05, "loss": 0.7659, "step": 2300 }, { "epoch": 0.8434833225335252, "grad_norm": 0.6053968071937561, "learning_rate": 9.320214669051879e-05, "loss": 0.7588, "step": 2400 }, { "epoch": 0.8786284609724221, "grad_norm": 0.593941330909729, "learning_rate": 9.284436493738819e-05, "loss": 0.7488, "step": 2500 }, { "epoch": 0.8786284609724221, "eval_bleu": 43.3601, "eval_chrf++": 61.7425, "eval_gen_len": 17.935, "eval_loss": 0.6797980666160583, "eval_runtime": 3463.8249, "eval_samples_per_second": 2.114, "eval_steps_per_second": 1.057, "step": 2500 }, { "epoch": 0.913773599411319, "grad_norm": 0.5975068211555481, "learning_rate": 9.248658318425761e-05, "loss": 0.7571, "step": 2600 }, { "epoch": 0.9489187378502159, "grad_norm": 0.5520368218421936, "learning_rate": 9.212880143112702e-05, "loss": 0.7398, "step": 2700 }, { "epoch": 0.9840638762891127, "grad_norm": 0.5748527646064758, "learning_rate": 9.177101967799643e-05, "loss": 0.7455, "step": 2800 }, { "epoch": 1.0192090147280095, "grad_norm": 0.6073290705680847, "learning_rate": 9.141323792486583e-05, "loss": 0.6714, "step": 2900 }, { "epoch": 1.0543541531669065, "grad_norm": 0.6117560863494873, "learning_rate": 9.105545617173525e-05, "loss": 0.6163, "step": 3000 }, { "epoch": 1.0543541531669065, "eval_bleu": 43.6802, "eval_chrf++": 62.0773, "eval_gen_len": 17.9802, "eval_loss": 0.6721383929252625, "eval_runtime": 3607.5103, "eval_samples_per_second": 2.029, "eval_steps_per_second": 1.015, "step": 3000 }, { "epoch": 1.0894992916058033, "grad_norm": 0.5141863822937012, "learning_rate": 9.069767441860465e-05, "loss": 0.6151, "step": 3100 }, { "epoch": 1.1246444300447003, "grad_norm": 0.5742676258087158, "learning_rate": 9.033989266547407e-05, "loss": 0.6236, "step": 3200 }, { "epoch": 1.159789568483597, "grad_norm": 0.5287165641784668, "learning_rate": 8.998211091234347e-05, "loss": 0.6249, "step": 3300 }, { "epoch": 1.194934706922494, "grad_norm": 0.5631851553916931, "learning_rate": 8.962432915921289e-05, "loss": 0.6229, "step": 3400 }, { "epoch": 1.2300798453613908, "grad_norm": 0.5420289039611816, "learning_rate": 8.926654740608229e-05, "loss": 0.6103, "step": 3500 }, { "epoch": 1.2300798453613908, "eval_bleu": 43.9272, "eval_chrf++": 62.2949, "eval_gen_len": 17.8805, "eval_loss": 0.6639961004257202, "eval_runtime": 3511.7691, "eval_samples_per_second": 2.085, "eval_steps_per_second": 1.042, "step": 3500 }, { "epoch": 1.2652249838002878, "grad_norm": 0.576799750328064, "learning_rate": 8.890876565295171e-05, "loss": 0.6076, "step": 3600 }, { "epoch": 1.3003701222391846, "grad_norm": 0.4976769983768463, "learning_rate": 8.855098389982111e-05, "loss": 0.6084, "step": 3700 }, { "epoch": 1.3355152606780816, "grad_norm": 0.5788149237632751, "learning_rate": 8.819320214669053e-05, "loss": 0.6121, "step": 3800 }, { "epoch": 1.3706603991169783, "grad_norm": 0.5019386410713196, "learning_rate": 8.783542039355993e-05, "loss": 0.6172, "step": 3900 }, { "epoch": 1.4058055375558753, "grad_norm": 0.5911116600036621, "learning_rate": 8.747763864042933e-05, "loss": 0.6037, "step": 4000 }, { "epoch": 1.4058055375558753, "eval_bleu": 43.7946, "eval_chrf++": 62.1568, "eval_gen_len": 17.8172, "eval_loss": 0.6595008373260498, "eval_runtime": 3539.6035, "eval_samples_per_second": 2.068, "eval_steps_per_second": 1.034, "step": 4000 }, { "epoch": 1.4409506759947721, "grad_norm": 0.5771309733390808, "learning_rate": 8.711985688729875e-05, "loss": 0.6067, "step": 4100 }, { "epoch": 1.476095814433669, "grad_norm": 0.6425995230674744, "learning_rate": 8.676207513416817e-05, "loss": 0.6097, "step": 4200 }, { "epoch": 1.5112409528725659, "grad_norm": 0.5513516664505005, "learning_rate": 8.640429338103757e-05, "loss": 0.6084, "step": 4300 }, { "epoch": 1.5463860913114629, "grad_norm": 0.5649448037147522, "learning_rate": 8.604651162790697e-05, "loss": 0.6026, "step": 4400 }, { "epoch": 1.5815312297503596, "grad_norm": 0.6140225529670715, "learning_rate": 8.568872987477639e-05, "loss": 0.6074, "step": 4500 }, { "epoch": 1.5815312297503596, "eval_bleu": 43.8068, "eval_chrf++": 62.1665, "eval_gen_len": 17.8271, "eval_loss": 0.64792400598526, "eval_runtime": 3653.4406, "eval_samples_per_second": 2.004, "eval_steps_per_second": 1.002, "step": 4500 }, { "epoch": 1.6166763681892564, "grad_norm": 0.5787105560302734, "learning_rate": 8.533094812164581e-05, "loss": 0.6167, "step": 4600 }, { "epoch": 1.6518215066281534, "grad_norm": 0.5308918356895447, "learning_rate": 8.497316636851521e-05, "loss": 0.5936, "step": 4700 }, { "epoch": 1.6869666450670504, "grad_norm": 0.5810381770133972, "learning_rate": 8.461538461538461e-05, "loss": 0.6074, "step": 4800 }, { "epoch": 1.7221117835059472, "grad_norm": 0.6304420232772827, "learning_rate": 8.425760286225403e-05, "loss": 0.5995, "step": 4900 }, { "epoch": 1.757256921944844, "grad_norm": 0.6314705610275269, "learning_rate": 8.389982110912343e-05, "loss": 0.6064, "step": 5000 }, { "epoch": 1.757256921944844, "eval_bleu": 44.0163, "eval_chrf++": 62.4374, "eval_gen_len": 17.8788, "eval_loss": 0.6403423547744751, "eval_runtime": 3440.4393, "eval_samples_per_second": 2.128, "eval_steps_per_second": 1.064, "step": 5000 }, { "epoch": 1.792402060383741, "grad_norm": 0.5468895435333252, "learning_rate": 8.354203935599285e-05, "loss": 0.6014, "step": 5100 }, { "epoch": 1.827547198822638, "grad_norm": 0.564479649066925, "learning_rate": 8.318425760286225e-05, "loss": 0.5984, "step": 5200 }, { "epoch": 1.8626923372615347, "grad_norm": 0.6068270206451416, "learning_rate": 8.282647584973167e-05, "loss": 0.5995, "step": 5300 }, { "epoch": 1.8978374757004315, "grad_norm": 0.5385338068008423, "learning_rate": 8.246869409660107e-05, "loss": 0.6023, "step": 5400 }, { "epoch": 1.9329826141393285, "grad_norm": 0.5280515551567078, "learning_rate": 8.211091234347049e-05, "loss": 0.5969, "step": 5500 }, { "epoch": 1.9329826141393285, "eval_bleu": 44.5286, "eval_chrf++": 62.728, "eval_gen_len": 17.8899, "eval_loss": 0.6349581480026245, "eval_runtime": 3524.4692, "eval_samples_per_second": 2.077, "eval_steps_per_second": 1.039, "step": 5500 } ], "logging_steps": 100, "max_steps": 28450, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.3968643831845356e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }