---
license: mit
language:
- vi
metrics:
- wer
base_model:
- facebook/wav2vec2-xls-r-300m
pipeline_tag: automatic-speech-recognition
---
|
|
|
Tôi đã fine-tune mô hình `facebook/wav2vec2-xls-r-300m` với 15 GB dữ liệu audio tiếng Việt, đạt kết quả WER: 24.46
|
|
|
## Cách sử dụng |
|
```python
|
|
|
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Transcribe a single audio file with the fine-tuned wav2vec2 CTC model.

# Use the GPU when one is available; otherwise fall back to the CPU so the
# example also runs on machines without CUDA.
mydevice = 'cuda' if torch.cuda.is_available() else 'cpu'

processor = Wav2Vec2Processor.from_pretrained("hataphu/wav2vec2-vi-300m")
model = Wav2Vec2ForCTC.from_pretrained("hataphu/wav2vec2-vi-300m")
model.to(mydevice)
model.eval()

# torchaudio.load returns a (channels, frames) tensor plus the file's
# native sample rate.
audio_input, sampling_rate = torchaudio.load('audio-path-file')

# Average the channels into one mono waveform. For a mono file this is the
# same as squeezing the channel axis; for stereo it prevents the processor
# from treating each channel as a separate utterance.
audio_input = audio_input.mean(dim=0)

# The feature extractor rejects audio whose sample rate differs from the one
# it was configured with, so resample first when the file does not match.
target_rate = processor.feature_extractor.sampling_rate
if sampling_rate != target_rate:
    audio_input = torchaudio.functional.resample(
        audio_input, sampling_rate, target_rate
    )
    sampling_rate = target_rate

# return_tensors="pt" yields an already batched (1, frames) tensor.
input_values = processor(
    audio_input.numpy(), sampling_rate=sampling_rate, return_tensors="pt"
).input_values

# Inference only: skip autograd bookkeeping to save memory and time.
with torch.no_grad():
    logits = model(input_values.to(mydevice)).logits

# Greedy CTC decoding: pick the most likely token at every frame and let the
# processor collapse repeats/blanks into text.
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.decode(predicted_ids[0])
print(transcription)
|
```
|
|
|
|
|
|