jonatasgrosman committed · Commit ec0a4d2 · Parent(s): ac5e62b

update model

Files changed:
- README.md +15 -80
- config.json +1 -1
- preprocessor_config.json +1 -0
- pytorch_model.bin +1 -1
README.md
CHANGED
@@ -2,8 +2,6 @@
 language: en
 datasets:
 - common_voice
-- librispeech_asr
-- timit_asr
 metrics:
 - wer
 - cer
@@ -26,15 +24,15 @@ model-index:
 metrics:
 - name: Test WER
   type: wer
-  value:
+  value: 18.98
 - name: Test CER
   type: cer
-  value: 8.
+  value: 8.29
 ---

 # Wav2Vec2-Large-XLSR-53-English

-Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on English using the [Common Voice](https://huggingface.co/datasets/common_voice)
+Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on English using the [Common Voice](https://huggingface.co/datasets/common_voice).
 When using this model, make sure that your speech input is sampled at 16kHz.

 The script used for training can be found here: https://github.com/jonatasgrosman/wav2vec2-sprint
@@ -83,16 +81,16 @@ for i, predicted_sentence in enumerate(predicted_sentences):

 | Reference | Prediction |
 | ------------- | ------------- |
-| "SHE'LL BE ALL RIGHT." | SHE'
+| "SHE'LL BE ALL RIGHT." | SHE'LL BE ALL RIGHT |
 | SIX | SIX |
-| "ALL'S WELL THAT ENDS WELL." | ALL
+| "ALL'S WELL THAT ENDS WELL." | ALL AS WELL THAT ENDS WELL |
 | DO YOU MEAN IT? | DO YOU MEAN IT |
 | THE NEW PATCH IS LESS INVASIVE THAN THE OLD ONE, BUT STILL CAUSES REGRESSIONS. | THE NEW PATCH IS LESS INVASIVE THAN THE OLD ONE BUT STILL CAUSES REGRESSION |
-| HOW IS MOZILLA GOING TO HANDLE AMBIGUITIES LIKE QUEUE AND CUE? | HOW IS
-| "I GUESS YOU MUST THINK I'M KINDA BATTY." |
+| HOW IS MOZILLA GOING TO HANDLE AMBIGUITIES LIKE QUEUE AND CUE? | HOW IS MOSLILLAR GOING TO HANDLE ANDBEWOOTH HIS LIKE Q AND Q |
+| "I GUESS YOU MUST THINK I'M KINDA BATTY." | RUSTIAN WASTIN PAN ONTE BATTLY |
 | NO ONE NEAR THE REMOTE MACHINE YOU COULD RING? | NO ONE NEAR THE REMOTE MACHINE YOU COULD RING |
-| SAUCE FOR THE GOOSE IS SAUCE FOR THE GANDER. | SAUCE FOR THE
-| GROVES STARTED WRITING SONGS WHEN SHE WAS FOUR YEARS OLD. | GRAFS STARTED WRITING
+| SAUCE FOR THE GOOSE IS SAUCE FOR THE GANDER. | SAUCE FOR THE GUICE IS SAUCE FOR THE GONDER |
+| GROVES STARTED WRITING SONGS WHEN SHE WAS FOUR YEARS OLD. | GRAFS STARTED WRITING SONGS WHEN SHE WAS FOUR YEARS OLD |

 ## Evaluation

@@ -117,11 +115,6 @@ CHARS_TO_IGNORE = [",", "?", "¿", ".", "!", "¡", ";", ";", ":", '""', "%", '

 test_dataset = load_dataset("common_voice", LANG_ID, split="test")

-# uncomment the following lines to eval using other datasets
-# test_dataset = load_dataset("librispeech_asr", "clean", split="test")
-# test_dataset = load_dataset("librispeech_asr", "other", split="test")
-# test_dataset = load_dataset("timit_asr", split="test")
-
 wer = load_metric("wer.py") # https://github.com/jonatasgrosman/wav2vec2-sprint/blob/main/wer.py
 cer = load_metric("cer.py") # https://github.com/jonatasgrosman/wav2vec2-sprint/blob/main/cer.py

@@ -136,9 +129,9 @@ model.to(DEVICE)
 def speech_file_to_array_fn(batch):
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
-        speech_array, sampling_rate = librosa.load(batch["
+        speech_array, sampling_rate = librosa.load(batch["path"], sr=16_000)
     batch["speech"] = speech_array
-    batch["sentence"] = re.sub(chars_to_ignore_regex, "", batch["
+    batch["sentence"] = re.sub(chars_to_ignore_regex, "", batch["sentence"]).upper()
     return batch

 test_dataset = test_dataset.map(speech_file_to_array_fn)
@@ -166,76 +159,18 @@ print(f"CER: {cer.compute(predictions=predictions, references=references, chunk_

 **Test Result**:

-In the table below I report the Word Error Rate (WER) and the Character Error Rate (CER) of the model. I ran the evaluation script described above on other models as well (on 2021-
-
----
-
-**Common Voice**
+In the table below I report the Word Error Rate (WER) and the Character Error Rate (CER) of the model. I ran the evaluation script described above on other models as well (on 2021-06-17). Note that the table below may show different results from those already reported; this may be due to specificities of the other evaluation scripts used.

 | Model | WER | CER |
 | ------------- | ------------- | ------------- |
-| jonatasgrosman/wav2vec2-large-xlsr-53-english | **
-| jonatasgrosman/wav2vec2-large-english | 21.
+| jonatasgrosman/wav2vec2-large-xlsr-53-english | **18.98%** | **8.29%** |
+| jonatasgrosman/wav2vec2-large-english | 21.53% | 9.66% |
 | facebook/wav2vec2-large-960h-lv60-self | 22.03% | 10.39% |
 | facebook/wav2vec2-large-960h-lv60 | 23.97% | 11.14% |
+| boris/xlsr-en-punctuation | 29.10% | 10.75% |
 | facebook/wav2vec2-large-960h | 32.79% | 16.03% |
-| boris/xlsr-en-punctuation | 34.81% | 15.51% |
 | facebook/wav2vec2-base-960h | 39.86% | 19.89% |
 | facebook/wav2vec2-base-100h | 51.06% | 25.06% |
 | elgeish/wav2vec2-large-lv60-timit-asr | 59.96% | 34.28% |
 | facebook/wav2vec2-base-10k-voxpopuli-ft-en | 66.41% | 36.76% |
 | elgeish/wav2vec2-base-timit-asr | 68.78% | 36.81% |
-
----
-
-**LibriSpeech (clean)**
-
-| Model | WER | CER |
-| ------------- | ------------- | ------------- |
-| facebook/wav2vec2-large-960h-lv60-self | **1.86%** | **0.54%** |
-| facebook/wav2vec2-large-960h-lv60 | 2.15% | 0.61% |
-| facebook/wav2vec2-large-960h | 2.82% | 0.84% |
-| facebook/wav2vec2-base-960h | 3.44% | 1.06% |
-| jonatasgrosman/wav2vec2-large-xlsr-53-english | 4.16% | 1.28% |
-| facebook/wav2vec2-base-100h | 6.26% | 2.00% |
-| jonatasgrosman/wav2vec2-large-english | 8.00% | 2.55% |
-| elgeish/wav2vec2-large-lv60-timit-asr | 15.53% | 4.93% |
-| boris/xlsr-en-punctuation | 19.28% | 6.45% |
-| elgeish/wav2vec2-base-timit-asr | 29.19% | 8.38% |
-| facebook/wav2vec2-base-10k-voxpopuli-ft-en | 31.82% | 12.41% |
-
----
-
-**LibriSpeech (other)**
-
-| Model | WER | CER |
-| ------------- | ------------- | ------------- |
-| facebook/wav2vec2-large-960h-lv60-self | **3.89%** | **1.40%** |
-| facebook/wav2vec2-large-960h-lv60 | 4.45% | 1.56% |
-| facebook/wav2vec2-large-960h | 6.49% | 2.52% |
-| jonatasgrosman/wav2vec2-large-xlsr-53-english | 8.82% | 3.42% |
-| facebook/wav2vec2-base-960h | 8.90% | 3.55% |
-| jonatasgrosman/wav2vec2-large-english | 13.62% | 5.24% |
-| facebook/wav2vec2-base-100h | 13.97% | 5.51% |
-| boris/xlsr-en-punctuation | 26.40% | 10.11% |
-| elgeish/wav2vec2-large-lv60-timit-asr | 28.39% | 12.08% |
-| elgeish/wav2vec2-base-timit-asr | 42.04% | 15.57% |
-| facebook/wav2vec2-base-10k-voxpopuli-ft-en | 45.19% | 20.32% |
-
----
-
-**TIMIT**
-
-| Model | WER | CER |
-| ------------- | ------------- | ------------- |
-| facebook/wav2vec2-large-960h-lv60-self | **5.17%** | **1.33%** |
-| facebook/wav2vec2-large-960h-lv60 | 6.24% | 1.54% |
-| jonatasgrosman/wav2vec2-large-xlsr-53-english | 6.81% | 2.02% |
-| facebook/wav2vec2-large-960h | 9.63% | 2.19% |
-| facebook/wav2vec2-base-960h | 11.48% | 2.76% |
-| elgeish/wav2vec2-large-lv60-timit-asr | 13.83% | 4.36% |
-| jonatasgrosman/wav2vec2-large-english | 13.91% | 4.01% |
-| facebook/wav2vec2-base-100h | 16.75% | 4.79% |
-| elgeish/wav2vec2-base-timit-asr | 25.40% | 8.16% |
-| boris/xlsr-en-punctuation | 25.93% | 9.99% |
-| facebook/wav2vec2-base-10k-voxpopuli-ft-en | 51.08% | 19.84% |
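The README's usage requirements above (16 kHz input, CTC decoding into the Reference/Prediction examples) boil down to a short inference flow. The snippet below is a minimal sketch assuming the standard `transformers` Wav2Vec2 API and a hypothetical local file `sample.wav`; it is not the card's exact usage script, which this diff shows only in part.

```python
# Minimal sketch (not the model card's exact script): transcribe one local file
# with the model this commit updates. "sample.wav" is a hypothetical path.
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-english"

processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

# The README requires 16 kHz input, so resample while loading.
speech, _ = librosa.load("sample.wav", sr=16_000)

inputs = processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
with torch.no_grad():
    logits = model(**inputs).logits

# Greedy CTC decoding: take the most likely token per frame, then collapse repeats/blanks.
predicted_ids = torch.argmax(logits, dim=-1)
print(processor.batch_decode(predicted_ids)[0])
```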
config.json
CHANGED
@@ -71,6 +71,6 @@
   "num_feat_extract_layers": 7,
   "num_hidden_layers": 24,
   "pad_token_id": 0,
-  "transformers_version": "4.
+  "transformers_version": "4.7.0.dev0",
   "vocab_size": 33
 }
preprocessor_config.json
CHANGED
@@ -1,5 +1,6 @@
 {
   "do_normalize": true,
+  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
   "feature_size": 1,
   "padding_side": "right",
   "padding_value": 0.0,
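The added `feature_extractor_type` field is what lets `transformers` resolve the correct feature-extractor class when the preprocessor config is loaded generically. A quick check, sketched under the assumption of a reasonably recent `transformers` release:

```python
# Sketch: AutoFeatureExtractor reads "feature_extractor_type" from
# preprocessor_config.json to decide which class to instantiate.
from transformers import AutoFeatureExtractor

extractor = AutoFeatureExtractor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
print(type(extractor).__name__)  # expected: Wav2Vec2FeatureExtractor
```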
pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:7b7688644eeefe1f5760bb4c4a61d085793a3740159fdbf19fd37c5d4f3729bf
 size 1262069143