Errors when running Whisper locally
Hi, I have the following Python (3.10.12) packages installed:
- torch == 2.3.1
- torchaudio == 2.3.1
- torchvision == 0.18.1
- transformers == 4.40.2
and the following specs:
- 32 GB DDR5 RAM
- RTX 4070 Super (12 GB VRAM)
- Ryzen 5 7600X
I'm trying to run whisper-large-v3 locally on my GPU for a data transcription task (audio to text), but I get a CUDA out of memory error.
Here is the core part of the source code I use:
### import
from datasets import load_dataset, Audio, DatasetDict, Dataset
import multiprocess as mp
from transformers import WhisperFeatureExtractor, WhisperProcessor, WhisperForConditionalGeneration, GenerationConfig, WhisperTokenizer
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import evaluate
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
### dataset preparation
dataset=load_dataset("huggingface_dataset_space", streaming=True)
dd = DatasetDict()
for ds_name, iterable_ds in dataset.items():
    ds=Dataset.from_generator(lambda: (yield from iterable_ds), features=iterable_ds.features, num_proc=mp.cpu_count())
    dd[ds_name] = ds
datasets=dd.cast_column("audio", Audio(sampling_rate=16000))
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-large-v3")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large-v3", language="italian", task="transcribe")
def prepare_dataset(batch):
    audio = batch["audio"]
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
    batch["labels"] = tokenizer(batch["transcription"]).input_ids
    return batch
final_ds=datasets.map(prepare_dataset, num_proc=mp.cpu_count())
final_ds=final_ds.remove_columns(["audio", "transcription"])
### model training
model=WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")
processor=WhisperProcessor.from_pretrained("openai/whisper-large-v3", language="italian", task="transcribe")
device=("cuda" if torch.cuda.is_available() else "cpu")
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features=[{"input_features": feature["input_features"]} for feature in features]
        batch=self.processor.feature_extractor.pad(input_features, return_tensors="pt")
        label_features=[{"input_ids": feature["labels"]} for feature in features]
        labels_batch=self.processor.tokenizer.pad(label_features, return_tensors="pt")
        labels=labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        if (labels[:, 0]==self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels=labels[:, 1:]
        batch["labels"]=labels
        return batch
data_collator=DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
### word error rate as evaluation metric
metric=evaluate.load("wer")
def compute_metrics(pred):
    pred_ids=pred.predictions
    label_ids=pred.label_ids
    label_ids[label_ids == -100]=tokenizer.pad_token_id
    pred_str=tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str=tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    wer=100 * metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}
forced_decoder_ids = processor.get_decoder_prompt_ids(language="italian", task="transcribe")
model.config.forced_decoder_ids = forced_decoder_ids
model.config.suppress_tokens = []
model.generation_config.language = "it"
gen_cnf = GenerationConfig.from_pretrained(
    "openai/whisper-large-v3",
    language="it",
    num_beams=2,
    early_stopping=True,
    early_stopping_patience=1,
    early_stopping_metric="loss"
)
gen_cnf.save_pretrained("resources/generation_config/")
### training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="huggingface_model_space",
    generation_config=gen_cnf,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    max_steps=1000,
    eval_steps=100,
    save_steps=100,
    evaluation_strategy="steps",
    save_strategy="steps",
    fp16=True,
    log_level='debug',
    logging_strategy="steps",
    logging_steps=1,
    report_to=["tensorboard"],
    predict_with_generate=True,
    generation_max_length=225,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
)
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=final_ds["train"],
    eval_dataset=final_ds["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor,
)
trainer.train()
I have about 900 training audios and 250 test audios, but I don't think the model fits in memory, which is why I get the CUDA out of memory error.
WHAT I TRIED:
- sampled only a few examples from the dataset (about 20-50), but the situation didn't change
- tried to run on CPU, but then my 32 GB of RAM became saturated and the system got stuck
- tried the small version of Whisper on my GPU and it worked
- tried the medium version of Whisper on my GPU; it worked for five minutes and then I got the CUDA out of memory error again
So it seems that memory becomes saturated while training is ongoing. Do you have any advice on how to optimize memory usage, or do I need a machine with at least 64 GB of VRAM?
Thank you for your time!
EDIT: this is a fine-tuning task for Whisper.
Hey @luigimontaleone ,
You have a few options here:
- If you want to do full fine-tuning, you can try memory-saving strategies such as DeepSpeed (rough sketch below)
- If you're happy doing low-rank training, you can try LoRA using PEFT, cf. faster Whisper fine-tuning from @reach-vb (rough sketch below)
I won't go into detail on the above two strategies since there's plenty of information on each link, but let me know if you have any questions and I'd be happy to help! 🤗
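To give a rough flavour of the DeepSpeed route: the simplest hook is the deepspeed argument of Seq2SeqTrainingArguments, which accepts either a dict or a path to a JSON config. The ZeRO stage-2 values below are only an illustrative sketch, not tuned settings:
### illustrative ZeRO stage-2 config, adjust to your hardware
ds_config = {
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {"device": "cpu"},  # push optimizer states to CPU RAM
    },
    "fp16": {"enabled": "auto"},
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
}
training_args = Seq2SeqTrainingArguments(
    output_dir="huggingface_model_space",
    deepspeed=ds_config,  # enables the DeepSpeed integration in the Trainer
    # ... keep the rest of your existing arguments unchanged ...
)
You also need deepspeed installed, and the script is typically launched with the deepspeed launcher (e.g. deepspeed --num_gpus=1 your_script.py).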
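And for the LoRA route with PEFT, a minimal sketch, assuming peft is installed (the rank, alpha and target modules below are illustrative defaults, not values from the linked post); everything else in your data pipeline and Seq2SeqTrainer setup can stay as it is:
from peft import LoraConfig, get_peft_model

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")
lora_config = LoraConfig(
    r=32,                                 # rank of the low-rank update matrices
    lora_alpha=64,                        # scaling factor applied to the LoRA updates
    target_modules=["q_proj", "v_proj"],  # attention projections to adapt
    lora_dropout=0.05,
    bias="none",
)
model = get_peft_model(model, lora_config)  # wraps the base model and freezes the original weights
model.print_trainable_parameters()          # only a small fraction of parameters remains trainable
Because only the adapter weights need gradients and optimizer states, the memory footprint drops well below what full fine-tuning of whisper-large-v3 requires. When training a PEFT-wrapped model with the Trainer, you may also need remove_unused_columns=False and label_names=["labels"] in the training arguments, as in the PEFT Whisper example.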
> If you're happy doing low-rank training, you can try LoRA using PEFT, cf. faster Whisper fine-tuning from @reach-vb
Sorry to keep you waiting, I tried the second option and I managed to run the task locally, thanks a lot!