import evaluate
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
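
# Fine-tune XLM-V for written language identification on the FLEURS
# transcriptions: one classification label per language.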
dataset_id = "google/fleurs"
model_id = "facebook/xlm-v-base"
metric_name = "accuracy"
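
# Only the raw transcription text and the language id are needed, so every
# other FLEURS column (including the heavy "audio" feature) is dropped below.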
columns_to_remove = [
    "audio",
    "id",
    "num_samples",
    "path",
    "transcription",
    "gender",
    "language",
    "lang_group_id",
]
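
# The "all" configuration concatenates every FLEURS language into one dataset.
# ignore_verifications=True skips checksum verification of the large download;
# recent datasets releases replace it with verification_mode="no_checks".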
train, val = load_dataset(
    dataset_id,
    "all",
    split=["train", "validation"],
    ignore_verifications=True,
)
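
# Build the label <-> id mappings from the (smaller) validation split, which
# should still contain examples of every language.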
unique_langs = set()
label2id = {}
id2label = {}

for lang, lang_id in zip(val["language"], val["lang_id"]):
    if lang not in unique_langs:
        unique_langs.add(lang)
        id2label[lang_id] = lang
        label2id[lang] = lang_id
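
# Sort both mappings by label id so the ordering is deterministic.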
id2label = dict(sorted(id2label.items(), key=lambda item: item[0]))
label2id = dict(sorted(label2id.items(), key=lambda item: item[1]))
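
# Keep only text and label: rename "raw_transcription" to "text" for
# tokenization and "lang_id" to "label", which the Trainer picks up
# automatically.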
train = train.remove_columns(columns_to_remove)
val = val.remove_columns(columns_to_remove)
train = train.rename_column("raw_transcription", "text")
val = val.rename_column("raw_transcription", "text")
train = train.rename_column("lang_id", "label")
val = val.rename_column("lang_id", "label")
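
# Shuffle with a fixed seed so languages are mixed but runs stay reproducible.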
train = train.shuffle(seed=42)
val = val.shuffle(seed=42)

tokenizer = AutoTokenizer.from_pretrained(model_id)
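
# Tokenize with truncation only; padding is deferred to the Trainer, which
# pads each batch dynamically when a tokenizer is supplied.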
def preprocess(data):
    return tokenizer(data["text"], truncation=True)

processed_train = train.map(preprocess, batched=True)
processed_val = val.map(preprocess, batched=True)

print(processed_train)
print(processed_val)
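
# XLM-V is a pretrained encoder without a classification head, so a new head
# with len(id2label) outputs is initialized; ignore_mismatched_sizes=True
# tolerates a differently shaped head if one exists in the checkpoint.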
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=len(id2label),
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
)
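
# Effective train batch size is 16 x 4 = 64 via gradient accumulation.
# Evaluating and saving every epoch lets load_best_model_at_end restore the
# checkpoint with the best accuracy after training.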
args = TrainingArguments(
    "xlm-v-base-language-id",
    learning_rate=3e-5,
    warmup_ratio=0.1,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    fp16=True,
    push_to_hub=True,
)

metric = evaluate.load(metric_name)
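
# The Trainer passes raw logits in eval_pred.predictions; take the argmax
# over the label dimension before computing accuracy.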
def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)

trainer = Trainer(
    model,
    args,
    train_dataset=processed_train,
    eval_dataset=processed_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()
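
# A minimal inference sketch with the fine-tuned model (illustrative: assumes
# the checkpoint saved above under "xlm-v-base-language-id"):
#
#   from transformers import pipeline
#
#   classifier = pipeline("text-classification", model="xlm-v-base-language-id")
#   classifier("Guten Morgen, wie geht es dir?")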