{
  "best_metric": 0.4079853296279907,
  "best_model_checkpoint": "./mistral/23-02-24-Weni-ZeroShot-3.3.3-Mistral-7b-Multilanguage-1-epoch-3.2.0_Zeroshot-2_max_steps-201_batch_128_2024-02-23_ppid_2273/checkpoint-180",
  "epoch": 0.8921933085501859,
  "eval_steps": 20,
  "global_step": 180,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.1,
      "grad_norm": 1.0583807229995728,
      "learning_rate": 0.0002,
      "loss": 1.5561,
      "step": 20
    },
    {
      "epoch": 0.1,
      "eval_loss": 0.8398289084434509,
      "eval_runtime": 365.9464,
      "eval_samples_per_second": 7.834,
      "eval_steps_per_second": 0.246,
      "step": 20
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.2255832850933075,
      "learning_rate": 0.0001940350472628637,
      "loss": 0.594,
      "step": 40
    },
    {
      "epoch": 0.2,
      "eval_loss": 0.5098645687103271,
      "eval_runtime": 365.932,
      "eval_samples_per_second": 7.835,
      "eval_steps_per_second": 0.246,
      "step": 40
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.188425675034523,
      "learning_rate": 0.0001779507116848976,
      "loss": 0.4849,
      "step": 60
    },
    {
      "epoch": 0.3,
      "eval_loss": 0.46610626578330994,
      "eval_runtime": 365.7636,
      "eval_samples_per_second": 7.838,
      "eval_steps_per_second": 0.246,
      "step": 60
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.24421139061450958,
      "learning_rate": 0.00015199063052725745,
      "loss": 0.4516,
      "step": 80
    },
    {
      "epoch": 0.4,
      "eval_loss": 0.4327794909477234,
      "eval_runtime": 366.1258,
      "eval_samples_per_second": 7.831,
      "eval_steps_per_second": 0.246,
      "step": 80
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.1821870654821396,
      "learning_rate": 0.00011982811629223709,
      "loss": 0.4237,
      "step": 100
    },
    {
      "epoch": 0.5,
      "eval_loss": 0.41923952102661133,
      "eval_runtime": 366.5093,
      "eval_samples_per_second": 7.822,
      "eval_steps_per_second": 0.246,
      "step": 100
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.16211040318012238,
      "learning_rate": 8.530012652622397e-05,
      "loss": 0.4235,
      "step": 120
    },
    {
      "epoch": 0.59,
      "eval_loss": 0.41423356533050537,
      "eval_runtime": 366.3967,
      "eval_samples_per_second": 7.825,
      "eval_steps_per_second": 0.246,
      "step": 120
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.16519276797771454,
      "learning_rate": 5.2525817770470084e-05,
      "loss": 0.4128,
      "step": 140
    },
    {
      "epoch": 0.69,
      "eval_loss": 0.41103067994117737,
      "eval_runtime": 366.4036,
      "eval_samples_per_second": 7.825,
      "eval_steps_per_second": 0.246,
      "step": 140
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.15887108445167542,
      "learning_rate": 2.5415134079383006e-05,
      "loss": 0.4152,
      "step": 160
    },
    {
      "epoch": 0.79,
      "eval_loss": 0.4090025722980499,
      "eval_runtime": 366.5464,
      "eval_samples_per_second": 7.822,
      "eval_steps_per_second": 0.246,
      "step": 160
    },
    {
      "epoch": 0.89,
      "grad_norm": 0.17298047244548798,
      "learning_rate": 7.202354390738608e-06,
      "loss": 0.4092,
      "step": 180
    },
    {
      "epoch": 0.89,
      "eval_loss": 0.4079853296279907,
      "eval_runtime": 366.7715,
      "eval_samples_per_second": 7.817,
      "eval_steps_per_second": 0.245,
      "step": 180
    }
  ],
  "logging_steps": 20,
  "max_steps": 201,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 20,
  "total_flos": 8.112849538860974e+17,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}