kingabzpro
committed on
Commit
•
01bee33
1
Parent(s):
ae30068
Create README.md
Browse files
README.md
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
license: apache-2.0
|
3 |
+
base_model: openai/whisper-small
|
4 |
+
tags:
|
5 |
+
- generated_from_trainer
|
6 |
+
datasets:
|
7 |
+
- mozilla-foundation/common_voice_13_0
|
8 |
+
language:
|
9 |
+
- hi
|
10 |
+
metrics:
|
11 |
+
- cer
|
12 |
+
- wer
|
13 |
+
library_name: transformers
|
14 |
+
pipeline_tag: automatic-speech-recognition
|
15 |
+
model-index:
|
16 |
+
- name: whisper-small-hi-cv
|
17 |
+
results:
|
18 |
+
- task:
|
19 |
+
name: Automatic Speech Recognition
|
20 |
+
type: automatic-speech-recognition
|
21 |
+
dataset:
|
22 |
+
name: Common Voice 15
|
23 |
+
type: mozilla-foundation/common_voice_15_0
|
24 |
+
args: hi
|
25 |
+
metrics:
|
26 |
+
- name: Test WER
|
27 |
+
type: wer
|
28 |
+
value: 13.9913
|
29 |
+
- name: Test CER
|
30 |
+
type: cer
|
31 |
+
value: 5.8844
|
32 |
+
|
33 |
+
- task:
|
34 |
+
name: Automatic Speech Recognition
|
35 |
+
type: automatic-speech-recognition
|
36 |
+
dataset:
|
37 |
+
name: Common Voice 13
|
38 |
+
type: mozilla-foundation/common_voice_13_0
|
39 |
+
args: hi
|
40 |
+
metrics:
|
41 |
+
- name: Test WER
|
42 |
+
type: wer
|
43 |
+
value: 23.3824
|
44 |
+
- name: Test CER
|
45 |
+
type: cer
|
46 |
+
value: 10.5288
|
47 |
+
---
|
48 |
+
|
49 |
+
# whisper-small-hi-cv
|
50 |
+
|
51 |
+
This model is a fine-tuned version of [openai/whisper-small](https://huggingface.co/openai/whisper-small) on the Common Voice 15 dataset.
|
52 |
+
It achieves the following results on the evaluation set:
|
53 |
+
- Wer: 13.9913
|
54 |
+
- Cer: 5.8844
|
55 |
+
|
56 |
+
View the results in this Kaggle notebook: https://www.kaggle.com/code/kingabzpro/whisper-hindi-eval
|
57 |
+
|
58 |
+
## Evaluation
|
59 |
+
|
60 |
+
```python
|
61 |
+
from datasets import load_dataset,load_metric,Audio
|
62 |
+
from transformers import WhisperForConditionalGeneration, WhisperProcessor
|
63 |
+
import torch
|
64 |
+
import torchaudio
|
65 |
+
|
66 |
+
test_dataset = load_dataset("mozilla-foundation/common_voice_13_0", "hi", split="test")
|
67 |
+
wer = load_metric("wer")
|
68 |
+
cer = load_metric("cer")
|
69 |
+
|
70 |
+
processor = WhisperProcessor.from_pretrained("kingabzpro/whisper-small-hi-cv")
|
71 |
+
model = WhisperForConditionalGeneration.from_pretrained("kingabzpro/whisper-small-hi-cv").to("cuda")
|
72 |
+
test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate=16000))
|
73 |
+
|
74 |
+
def map_to_pred(batch):
    """Transcribe one dataset example and attach normalized texts to it.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries)
    and ``batch["sentence"]``. Stores the normalized sentence under
    ``batch["reference"]`` and the normalized model transcription under
    ``batch["prediction"]``, then returns the same ``batch`` object.

    Relies on the module-level ``processor`` and ``model`` objects.
    """
    normalize = processor.tokenizer._normalize
    sample = batch["audio"]
    features = processor(
        sample["array"],
        sampling_rate=sample["sampling_rate"],
        return_tensors="pt",
    ).input_features
    batch["reference"] = normalize(batch["sentence"])

    # Generation runs with gradient tracking disabled; only the first
    # returned sequence is decoded.
    with torch.no_grad():
        first_sequence = model.generate(features.to("cuda"))[0]
    batch["prediction"] = normalize(processor.decode(first_sequence))
    return batch
|
84 |
+
|
85 |
+
result = test_dataset.map(map_to_pred)
|
86 |
+
|
87 |
+
# "{:2f}" only sets a minimum field width of 2 and prints six decimals;
# "{:.4f}" prints four decimals, matching the scores reported for this model.
print("WER: {:.4f}".format(100 * wer.compute(predictions=result["prediction"], references=result["reference"])))
|
88 |
+
# "{:2f}" only sets a minimum field width of 2 and prints six decimals;
# "{:.4f}" prints four decimals, matching the scores reported for this model.
print("CER: {:.4f}".format(100 * cer.compute(predictions=result["prediction"], references=result["reference"])))
|
89 |
+
```
|
90 |
+
```bash
|
91 |
+
WER: 23.3824
|
92 |
+
CER: 10.5288
|
93 |
+
```
|