Add LM
- .gitignore +1 -0
- .ipynb_checkpoints/README-checkpoint.md +90 -0
- .ipynb_checkpoints/added_tokens-checkpoint.json +1 -0
- .ipynb_checkpoints/all_results-checkpoint.json +15 -0
- .ipynb_checkpoints/config-checkpoint.json +107 -0
- .ipynb_checkpoints/eval-checkpoint.py +141 -0
- .ipynb_checkpoints/eval_results-checkpoint.json +9 -0
- .ipynb_checkpoints/preprocessor_config-checkpoint.json +9 -0
- .ipynb_checkpoints/tokenizer_config-checkpoint.json +1 -0
- .ipynb_checkpoints/train_results-checkpoint.json +9 -0
- .ipynb_checkpoints/trainer_state-checkpoint.json +52 -0
- .ipynb_checkpoints/vocab-checkpoint.json +1 -0
- README.md +90 -0
- added_tokens.json +1 -0
- all_results.json +15 -0
- config.json +107 -0
- eval.py +141 -0
- eval_results.json +5 -0
- preprocessor_config.json +9 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +1 -0
- tokenizer_config.json +1 -0
- train_results.json +9 -0
- trainer_state.json +52 -0
- training_args.bin +3 -0
- vocab.json +1 -0
.gitignore
ADDED
@@ -0,0 +1 @@
+checkpoint-*/
.ipynb_checkpoints/README-checkpoint.md
ADDED
@@ -0,0 +1,90 @@
+---
+language:
+- ar
+license: apache-2.0
+tags:
+- automatic-speech-recognition
+- common_voice
+- generated_from_trainer
+- ar
+- robust-speech-event
+datasets:
+- common_voice
+model-index:
+- name: XLS-R-300M - Arabic
+  results:
+  - task:
+      name: Automatic Speech Recognition
+      type: automatic-speech-recognition
+    dataset:
+      name: Robust Speech Event - Dev Data
+      type: speech-recognition-community-v2/dev_data
+      args: ar
+    metrics:
+    - name: Test WER
+      type: wer
+      value: 1.0
+    - name: Test CER
+      type: cer
+      value: 1.0
+
+---
+
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+
+# wav2vec2-xls-r-300m-ar
+
+This model is a fine-tuned version of [facebook/wav2vec2-xls-r-300m](https://huggingface.co/facebook/wav2vec2-xls-r-300m) on the COMMON_VOICE - AR dataset.
+It achieves the following results on the evaluation set:
+- eval_loss: 3.0191
+- eval_wer: 1.0
+- eval_runtime: 252.2389
+- eval_samples_per_second: 30.217
+- eval_steps_per_second: 0.476
+- epoch: 1.0
+- step: 340
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 0.0005
+- train_batch_size: 64
+- eval_batch_size: 64
+- seed: 42
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: linear
+- lr_scheduler_warmup_steps: 2000
+- num_epochs: 5
+- mixed_precision_training: Native AMP
+
+### Framework versions
+
+- Transformers 4.17.0.dev0
+- Pytorch 1.10.2+cu102
+- Datasets 1.18.2.dev0
+- Tokenizers 0.11.0
+
+#### Evaluation Commands
+
+Please use the evaluation script `eval.py` included in the repo.
+
+1. To evaluate on `speech-recognition-community-v2/dev_data`
+
+```bash
+python eval.py --model_id nouamanetazi/wav2vec2-xls-r-300m-ar --dataset speech-recognition-community-v2/dev_data --config ar --split validation --chunk_length_s 5.0 --stride_length_s 1.0
+```
.ipynb_checkpoints/added_tokens-checkpoint.json
ADDED
@@ -0,0 +1 @@
+{"<s>": 33, "</s>": 34}
.ipynb_checkpoints/all_results-checkpoint.json
ADDED
@@ -0,0 +1,15 @@
+{
+    "epoch": 5.0,
+    "eval_loss": 26.253612518310547,
+    "eval_runtime": 5.982,
+    "eval_samples": 128,
+    "eval_samples_per_second": 21.397,
+    "eval_steps_per_second": 0.334,
+    "eval_wer": 1.0,
+    "total_flos": 1.3476444758728704e+17,
+    "train_loss": 19.624227905273436,
+    "train_runtime": 37.1321,
+    "train_samples": 128,
+    "train_samples_per_second": 17.236,
+    "train_steps_per_second": 0.269
+}
.ipynb_checkpoints/config-checkpoint.json
ADDED
@@ -0,0 +1,107 @@
+{
+  "_name_or_path": "facebook/wav2vec2-xls-r-300m",
+  "activation_dropout": 0.1,
+  "adapter_kernel_size": 3,
+  "adapter_stride": 2,
+  "add_adapter": false,
+  "apply_spec_augment": true,
+  "architectures": [
+    "Wav2Vec2ForCTC"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "classifier_proj_size": 256,
+  "codevector_dim": 768,
+  "contrastive_logits_temperature": 0.1,
+  "conv_bias": true,
+  "conv_dim": [
+    512,
+    512,
+    512,
+    512,
+    512,
+    512,
+    512
+  ],
+  "conv_kernel": [
+    10,
+    3,
+    3,
+    3,
+    3,
+    2,
+    2
+  ],
+  "conv_stride": [
+    5,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2
+  ],
+  "ctc_loss_reduction": "mean",
+  "ctc_zero_infinity": false,
+  "diversity_loss_weight": 0.1,
+  "do_stable_layer_norm": true,
+  "eos_token_id": 2,
+  "feat_extract_activation": "gelu",
+  "feat_extract_dropout": 0.0,
+  "feat_extract_norm": "layer",
+  "feat_proj_dropout": 0.0,
+  "feat_quantizer_dropout": 0.0,
+  "final_dropout": 0.0,
+  "hidden_act": "gelu",
+  "hidden_dropout": 0.0,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "layerdrop": 0.0,
+  "mask_feature_length": 64,
+  "mask_feature_min_masks": 0,
+  "mask_feature_prob": 0.25,
+  "mask_time_length": 10,
+  "mask_time_min_masks": 2,
+  "mask_time_prob": 0.75,
+  "model_type": "wav2vec2",
+  "num_adapter_layers": 3,
+  "num_attention_heads": 16,
+  "num_codevector_groups": 2,
+  "num_codevectors_per_group": 320,
+  "num_conv_pos_embedding_groups": 16,
+  "num_conv_pos_embeddings": 128,
+  "num_feat_extract_layers": 7,
+  "num_hidden_layers": 24,
+  "num_negatives": 100,
+  "output_hidden_size": 1024,
+  "pad_token_id": 32,
+  "proj_codevector_dim": 768,
+  "tdnn_dilation": [
+    1,
+    2,
+    3,
+    1,
+    1
+  ],
+  "tdnn_dim": [
+    512,
+    512,
+    512,
+    512,
+    1500
+  ],
+  "tdnn_kernel": [
+    5,
+    3,
+    3,
+    1,
+    1
+  ],
+  "torch_dtype": "float32",
+  "transformers_version": "4.17.0.dev0",
+  "use_weighted_layer_sum": false,
+  "vocab_size": 35,
+  "xvector_output_dim": 512
+}
.ipynb_checkpoints/eval-checkpoint.py
ADDED
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+from datasets import load_dataset, load_metric, Audio, Dataset
+from transformers import pipeline, AutoFeatureExtractor
+import re
+import argparse
+import unicodedata
+from typing import Dict
+
+
+def log_results(result: Dataset, args: Dict[str, str]):
+    """ DO NOT CHANGE. This function computes and logs the result metrics. """
+
+    log_outputs = args.log_outputs
+    dataset_id = "_".join(args.dataset.split("/") + [args.config, args.split])
+
+    # load metric
+    wer = load_metric("wer")
+    cer = load_metric("cer")
+
+    # compute metrics
+    wer_result = wer.compute(references=result["target"], predictions=result["prediction"])
+    cer_result = cer.compute(references=result["target"], predictions=result["prediction"])
+
+    # print & log results
+    result_str = (
+        f"WER: {wer_result}\n"
+        f"CER: {cer_result}"
+    )
+    print(result_str)
+
+    with open(f"{dataset_id}_eval_results.txt", "w") as f:
+        f.write(result_str)
+
+    # log all results in text file. Possibly interesting for analysis
+    if log_outputs:
+        pred_file = f"log_{dataset_id}_predictions.txt"
+        target_file = f"log_{dataset_id}_targets.txt"
+
+        with open(pred_file, "w") as p, open(target_file, "w") as t:
+
+            # mapping function to write output
+            def write_to_file(batch, i):
+                p.write(f"{i}" + "\n")
+                p.write(batch["prediction"] + "\n")
+                t.write(f"{i}" + "\n")
+                t.write(batch["target"] + "\n")
+
+            result.map(write_to_file, with_indices=True)
+
+
+# Normalize arabic
+def normalizeArabic(text):
+    # https://alraqmiyyat.github.io/2013/01-02.html
+    text = re.sub("[إأٱآا]", "ا", text)
+    text = re.sub("ى", "ي", text)
+    text = re.sub("ؤ", "ء", text)
+    text = re.sub("ئ", "ء", text)
+
+    # keep only characters which unicode \u0600-\u06FF and space
+    text = re.sub(r"[^\u0600-\u06FF ]", "", text)
+    return text
+
+def normalize_text(text: str) -> str:
+    """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
+
+    chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–]'  # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
+
+    text = re.sub(chars_to_ignore_regex, "", text.lower())
+
+    # In addition, we can normalize the target text, e.g. removing new lines characters etc...
+    # note that order is important here!
+    token_sequences_to_ignore = ["\n\n", "\n", "   ", "  "]
+
+    for t in token_sequences_to_ignore:
+        text = " ".join(text.split(t))
+
+    text = normalizeArabic(text)
+
+    return text
+
+
+def main(args):
+    # load dataset
+    dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
+
+    # for testing: only process the first two examples as a test
+    # dataset = dataset.select(range(10))
+
+    # load processor
+    feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
+    sampling_rate = feature_extractor.sampling_rate
+
+    # resample audio
+    dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
+
+    # load eval pipeline
+    asr = pipeline("automatic-speech-recognition", model=args.model_id)
+
+    # map function to decode audio
+    def map_to_pred(batch):
+        prediction = asr(batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s)
+
+        batch["prediction"] = prediction["text"]
+        batch["target"] = normalize_text(batch["sentence"])
+        return batch
+
+    # run inference on all examples
+    result = dataset.map(map_to_pred, remove_columns=dataset.column_names)
+
+    # compute and log_results
+    # do not change function below
+    log_results(result, args)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--model_id", type=str, required=True, help="Model identifier. Should be loadable with 🤗 Transformers"
+    )
+    parser.add_argument(
+        "--dataset", type=str, required=True, help="Dataset name to evaluate the `model_id`. Should be loadable with 🤗 Datasets"
+    )
+    parser.add_argument(
+        "--config", type=str, required=True, help="Config of the dataset. *E.g.* `'en'` for Common Voice"
+    )
+    parser.add_argument(
+        "--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`"
+    )
+    parser.add_argument(
+        "--chunk_length_s", type=float, default=None, help="Chunk length in seconds. Defaults to None. For long audio files a good value would be 5.0 seconds."
+    )
+    parser.add_argument(
+        "--stride_length_s", type=float, default=None, help="Stride of the audio chunks. Defaults to None. For long audio files a good value would be 1.0 seconds."
+    )
+    parser.add_argument(
+        "--log_outputs", action='store_true', help="If defined, write outputs to log file for analysis."
+    )
+    args = parser.parse_args()
+
+    main(args)
.ipynb_checkpoints/eval_results-checkpoint.json
ADDED
@@ -0,0 +1,9 @@
+{
+    "epoch": 20.0,
+    "eval_loss": 6.937458515167236,
+    "eval_runtime": 5.7217,
+    "eval_samples": 128,
+    "eval_samples_per_second": 22.371,
+    "eval_steps_per_second": 0.35,
+    "eval_wer": 1.0
+}
.ipynb_checkpoints/preprocessor_config-checkpoint.json
ADDED
@@ -0,0 +1,9 @@
+{
+  "do_normalize": true,
+  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+  "feature_size": 1,
+  "padding_side": "right",
+  "padding_value": 0,
+  "return_attention_mask": true,
+  "sampling_rate": 16000
+}
.ipynb_checkpoints/tokenizer_config-checkpoint.json
ADDED
@@ -0,0 +1 @@
+{"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./wav2vec2-xls-r-300m-ar", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
.ipynb_checkpoints/train_results-checkpoint.json
ADDED
@@ -0,0 +1,9 @@
+{
+    "epoch": 5.0,
+    "total_flos": 1.3476444758728704e+17,
+    "train_loss": 16.66825189590454,
+    "train_runtime": 91.9274,
+    "train_samples": 128,
+    "train_samples_per_second": 6.962,
+    "train_steps_per_second": 0.109
+}
.ipynb_checkpoints/trainer_state-checkpoint.json
ADDED
@@ -0,0 +1,52 @@
+{
+    "best_metric": null,
+    "best_model_checkpoint": null,
+    "epoch": 1.0,
+    "global_step": 340,
+    "is_hyper_param_search": false,
+    "is_local_process_zero": true,
+    "is_world_process_zero": true,
+    "log_history": [
+        {
+            "epoch": 0.15,
+            "learning_rate": 1.1750000000000001e-05,
+            "loss": 15.017,
+            "step": 50
+        },
+        {
+            "epoch": 0.29,
+            "learning_rate": 2.425e-05,
+            "loss": 6.7134,
+            "step": 100
+        },
+        {
+            "epoch": 0.44,
+            "learning_rate": 3.675e-05,
+            "loss": 4.3869,
+            "step": 150
+        },
+        {
+            "epoch": 0.59,
+            "learning_rate": 4.9250000000000004e-05,
+            "loss": 3.6209,
+            "step": 200
+        },
+        {
+            "epoch": 0.74,
+            "learning_rate": 6.175e-05,
+            "loss": 3.2011,
+            "step": 250
+        },
+        {
+            "epoch": 0.88,
+            "learning_rate": 7.425e-05,
+            "loss": 3.0513,
+            "step": 300
+        }
+    ],
+    "max_steps": 1700,
+    "num_train_epochs": 5,
+    "total_flos": 1.7302176965482906e+18,
+    "trial_name": null,
+    "trial_params": null
+}
.ipynb_checkpoints/vocab-checkpoint.json
ADDED
@@ -0,0 +1 @@
+{"_": 1, "e": 2, "g": 3, "t": 4, "\u00ab": 5, "\u00bb": 6, "\u061b": 7, "\u0621": 8, "\u0627": 9, "\u0628": 10, "\u0629": 11, "\u062a": 12, "\u062b": 13, "\u062c": 14, "\u062d": 15, "\u062e": 16, "\u062f": 17, "\u0630": 18, "\u0631": 19, "\u0632": 20, "\u0633": 21, "\u0634": 22, "\u0635": 23, "\u0636": 24, "\u0637": 25, "\u0638": 26, "\u0639": 27, "\u063a": 28, "\u0641": 29, "\u0642": 30, "\u0643": 31, "\u0644": 32, "\u0645": 33, "\u0646": 34, "\u0647": 35, "\u0648": 36, "\u064a": 37, "\u0670": 38, "\u0686": 39, "\u06a8": 40, "\u06a9": 41, "\u06be": 42, "\u06cc": 43, "\u06d6": 44, "\u06da": 45, "\u262d": 46, "\ufe83": 47, "\ufefb": 48, "|": 0, "[UNK]": 49, "[PAD]": 50}
README.md
ADDED
@@ -0,0 +1,90 @@
+---
+language:
+- ar
+license: apache-2.0
+tags:
+- automatic-speech-recognition
+- common_voice
+- generated_from_trainer
+- ar
+- robust-speech-event
+datasets:
+- common_voice
+model-index:
+- name: XLS-R-300M - Arabic
+  results:
+  - task:
+      name: Automatic Speech Recognition
+      type: automatic-speech-recognition
+    dataset:
+      name: Robust Speech Event - Dev Data
+      type: speech-recognition-community-v2/dev_data
+      args: ar
+    metrics:
+    - name: Test WER
+      type: wer
+      value: 1.0
+    - name: Test CER
+      type: cer
+      value: 1.0
+
+---
+
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+
+# wav2vec2-xls-r-300m-ar
+
+This model is a fine-tuned version of [facebook/wav2vec2-xls-r-300m](https://huggingface.co/facebook/wav2vec2-xls-r-300m) on the COMMON_VOICE - AR dataset.
+It achieves the following results on the evaluation set:
+- eval_loss: 3.0191
+- eval_wer: 1.0
+- eval_runtime: 252.2389
+- eval_samples_per_second: 30.217
+- eval_steps_per_second: 0.476
+- epoch: 1.0
+- step: 340
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 0.0005
+- train_batch_size: 64
+- eval_batch_size: 64
+- seed: 42
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: linear
+- lr_scheduler_warmup_steps: 2000
+- num_epochs: 5
+- mixed_precision_training: Native AMP
+
+### Framework versions
+
+- Transformers 4.17.0.dev0
+- Pytorch 1.10.2+cu102
+- Datasets 1.18.2.dev0
+- Tokenizers 0.11.0
+
+#### Evaluation Commands
+
+Please use the evaluation script `eval.py` included in the repo.
+
+1. To evaluate on `speech-recognition-community-v2/dev_data`
+
+```bash
+python eval.py --model_id nouamanetazi/wav2vec2-xls-r-300m-ar --dataset speech-recognition-community-v2/dev_data --config ar --split validation --chunk_length_s 5.0 --stride_length_s 1.0
+```
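The card's usage sections are still stubs; for orientation, a minimal inference sketch (not part of this commit; the audio path is a placeholder, and `transformers` with ffmpeg support is assumed):

```python
# Hypothetical usage sketch: transcribe a single audio file with this checkpoint.
from transformers import pipeline

asr = pipeline("automatic-speech-recognition", model="nouamanetazi/wav2vec2-xls-r-300m-ar")
# "sample.wav" is a placeholder path; the chunking values mirror the eval command above.
result = asr("sample.wav", chunk_length_s=5.0, stride_length_s=1.0)
print(result["text"])
```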
added_tokens.json
ADDED
@@ -0,0 +1 @@
+{"<s>": 33, "</s>": 34}
all_results.json
ADDED
@@ -0,0 +1,15 @@
+{
+    "epoch": 20.0,
+    "eval_loss": 3.0191357135772705,
+    "eval_runtime": 5.7217,
+    "eval_samples": 7622,
+    "eval_samples_per_second": 22.371,
+    "eval_steps_per_second": 0.35,
+    "eval_wer": 1.0,
+    "total_flos": 5.430583918308557e+17,
+    "train_loss": 8.69529299736023,
+    "train_runtime": 243.8197,
+    "train_samples": 128,
+    "train_samples_per_second": 10.5,
+    "train_steps_per_second": 0.164
+}
config.json
ADDED
@@ -0,0 +1,107 @@
+{
+  "_name_or_path": "facebook/wav2vec2-xls-r-300m",
+  "activation_dropout": 0.1,
+  "adapter_kernel_size": 3,
+  "adapter_stride": 2,
+  "add_adapter": false,
+  "apply_spec_augment": true,
+  "architectures": [
+    "Wav2Vec2ForCTC"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "classifier_proj_size": 256,
+  "codevector_dim": 768,
+  "contrastive_logits_temperature": 0.1,
+  "conv_bias": true,
+  "conv_dim": [
+    512,
+    512,
+    512,
+    512,
+    512,
+    512,
+    512
+  ],
+  "conv_kernel": [
+    10,
+    3,
+    3,
+    3,
+    3,
+    2,
+    2
+  ],
+  "conv_stride": [
+    5,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2
+  ],
+  "ctc_loss_reduction": "mean",
+  "ctc_zero_infinity": false,
+  "diversity_loss_weight": 0.1,
+  "do_stable_layer_norm": true,
+  "eos_token_id": 2,
+  "feat_extract_activation": "gelu",
+  "feat_extract_dropout": 0.0,
+  "feat_extract_norm": "layer",
+  "feat_proj_dropout": 0.0,
+  "feat_quantizer_dropout": 0.0,
+  "final_dropout": 0.0,
+  "hidden_act": "gelu",
+  "hidden_dropout": 0.0,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "layerdrop": 0.0,
+  "mask_feature_length": 64,
+  "mask_feature_min_masks": 0,
+  "mask_feature_prob": 0.25,
+  "mask_time_length": 10,
+  "mask_time_min_masks": 2,
+  "mask_time_prob": 0.75,
+  "model_type": "wav2vec2",
+  "num_adapter_layers": 3,
+  "num_attention_heads": 16,
+  "num_codevector_groups": 2,
+  "num_codevectors_per_group": 320,
+  "num_conv_pos_embedding_groups": 16,
+  "num_conv_pos_embeddings": 128,
+  "num_feat_extract_layers": 7,
+  "num_hidden_layers": 24,
+  "num_negatives": 100,
+  "output_hidden_size": 1024,
+  "pad_token_id": 41,
+  "proj_codevector_dim": 768,
+  "tdnn_dilation": [
+    1,
+    2,
+    3,
+    1,
+    1
+  ],
+  "tdnn_dim": [
+    512,
+    512,
+    512,
+    512,
+    1500
+  ],
+  "tdnn_kernel": [
+    5,
+    3,
+    3,
+    1,
+    1
+  ],
+  "torch_dtype": "float32",
+  "transformers_version": "4.17.0.dev0",
+  "use_weighted_layer_sum": false,
+  "vocab_size": 44,
+  "xvector_output_dim": 512
+}
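A note on the config above: the seven `conv_stride` values determine how much the feature encoder downsamples raw audio, which fixes the CTC frame rate. A quick check (a sketch derived only from the values in `config.json`):

```python
# Product of conv_stride gives the number of raw samples per encoder output frame.
from math import prod

conv_stride = [5, 2, 2, 2, 2, 2, 2]
samples_per_frame = prod(conv_stride)                    # 320 samples
print(1000 * samples_per_frame / 16000, "ms per frame")  # 20.0 ms of audio at 16 kHz
```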
eval.py
ADDED
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+from datasets import load_dataset, load_metric, Audio, Dataset
+from transformers import pipeline, AutoFeatureExtractor
+import re
+import argparse
+import unicodedata
+from typing import Dict
+
+
+def log_results(result: Dataset, args: Dict[str, str]):
+    """ DO NOT CHANGE. This function computes and logs the result metrics. """
+
+    log_outputs = args.log_outputs
+    dataset_id = "_".join(args.dataset.split("/") + [args.config, args.split])
+
+    # load metric
+    wer = load_metric("wer")
+    cer = load_metric("cer")
+
+    # compute metrics
+    wer_result = wer.compute(references=result["target"], predictions=result["prediction"])
+    cer_result = cer.compute(references=result["target"], predictions=result["prediction"])
+
+    # print & log results
+    result_str = (
+        f"WER: {wer_result}\n"
+        f"CER: {cer_result}"
+    )
+    print(result_str)
+
+    with open(f"{dataset_id}_eval_results.txt", "w") as f:
+        f.write(result_str)
+
+    # log all results in text file. Possibly interesting for analysis
+    if log_outputs:
+        pred_file = f"log_{dataset_id}_predictions.txt"
+        target_file = f"log_{dataset_id}_targets.txt"
+
+        with open(pred_file, "w") as p, open(target_file, "w") as t:
+
+            # mapping function to write output
+            def write_to_file(batch, i):
+                p.write(f"{i}" + "\n")
+                p.write(batch["prediction"] + "\n")
+                t.write(f"{i}" + "\n")
+                t.write(batch["target"] + "\n")
+
+            result.map(write_to_file, with_indices=True)
+
+
+# Normalize arabic
+def normalizeArabic(text):
+    # https://alraqmiyyat.github.io/2013/01-02.html
+    text = re.sub("[إأٱآا]", "ا", text)
+    text = re.sub("ى", "ي", text)
+    text = re.sub("ؤ", "ء", text)
+    text = re.sub("ئ", "ء", text)
+
+    # keep only characters which unicode \u0600-\u06FF and space
+    text = re.sub(r"[^\u0600-\u06FF ]", "", text)
+    return text
+
+def normalize_text(text: str) -> str:
+    """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
+
+    chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–]'  # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
+
+    text = re.sub(chars_to_ignore_regex, "", text.lower())
+
+    # In addition, we can normalize the target text, e.g. removing new lines characters etc...
+    # note that order is important here!
+    token_sequences_to_ignore = ["\n\n", "\n", "   ", "  "]
+
+    for t in token_sequences_to_ignore:
+        text = " ".join(text.split(t))
+
+    text = normalizeArabic(text)
+
+    return text
+
+
+def main(args):
+    # load dataset
+    dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
+
+    # for testing: only process the first two examples as a test
+    # dataset = dataset.select(range(10))
+
+    # load processor
+    feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
+    sampling_rate = feature_extractor.sampling_rate
+
+    # resample audio
+    dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
+
+    # load eval pipeline
+    asr = pipeline("automatic-speech-recognition", model=args.model_id)
+
+    # map function to decode audio
+    def map_to_pred(batch):
+        prediction = asr(batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s)
+
+        batch["prediction"] = prediction["text"]
+        batch["target"] = normalize_text(batch["sentence"])
+        return batch
+
+    # run inference on all examples
+    result = dataset.map(map_to_pred, remove_columns=dataset.column_names)
+
+    # compute and log_results
+    # do not change function below
+    log_results(result, args)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--model_id", type=str, required=True, help="Model identifier. Should be loadable with 🤗 Transformers"
+    )
+    parser.add_argument(
+        "--dataset", type=str, required=True, help="Dataset name to evaluate the `model_id`. Should be loadable with 🤗 Datasets"
+    )
+    parser.add_argument(
+        "--config", type=str, required=True, help="Config of the dataset. *E.g.* `'en'` for Common Voice"
+    )
+    parser.add_argument(
+        "--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`"
+    )
+    parser.add_argument(
+        "--chunk_length_s", type=float, default=None, help="Chunk length in seconds. Defaults to None. For long audio files a good value would be 5.0 seconds."
+    )
+    parser.add_argument(
+        "--stride_length_s", type=float, default=None, help="Stride of the audio chunks. Defaults to None. For long audio files a good value would be 1.0 seconds."
+    )
+    parser.add_argument(
+        "--log_outputs", action='store_true', help="If defined, write outputs to log file for analysis."
+    )
+    args = parser.parse_args()
+
+    main(args)
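For intuition about the normalization in `eval.py`, a small sanity check (assumes it is run from the repo root so `eval.py` is importable):

```python
# Demonstrate the Arabic normalization applied to reference transcripts.
from eval import normalizeArabic

print(normalizeArabic("أإآٱا"))   # all alef variants collapse to bare alef: "ااااا"
print(normalizeArabic("مستشفى"))  # final alef maqsura becomes ya: "مستشفي"
```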
eval_results.json
ADDED
@@ -0,0 +1,5 @@
+{
+    "eval_loss": 3.0191357135772705,
+    "eval_samples": 7622,
+    "eval_wer": 1.0
+}
preprocessor_config.json
ADDED
@@ -0,0 +1,9 @@
+{
+  "do_normalize": true,
+  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+  "feature_size": 1,
+  "padding_side": "right",
+  "padding_value": 0,
+  "return_attention_mask": true,
+  "sampling_rate": 16000
+}
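The preprocessor config pins the expected input format: mono float features at 16 kHz with attention masks. A one-line check when loading from the hub (a sketch; the model id comes from the README's eval command):

```python
from transformers import AutoFeatureExtractor

fe = AutoFeatureExtractor.from_pretrained("nouamanetazi/wav2vec2-xls-r-300m-ar")
assert fe.sampling_rate == 16000  # matches eval.py's cast_column resampling step
```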
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9bb293ce9691ca8a0f2f673c58ae6116242a0d7831f8149ba99c6208d6c79c1d
+size 1262104049
special_tokens_map.json
ADDED
@@ -0,0 +1 @@
+{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
tokenizer_config.json
ADDED
@@ -0,0 +1 @@
+{"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./wav2vec2-xls-r-300m-ar", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
train_results.json
ADDED
@@ -0,0 +1,9 @@
+{
+    "epoch": 20.0,
+    "total_flos": 5.430583918308557e+17,
+    "train_loss": 8.69529299736023,
+    "train_runtime": 243.8197,
+    "train_samples": 128,
+    "train_samples_per_second": 10.5,
+    "train_steps_per_second": 0.164
+}
trainer_state.json
ADDED
@@ -0,0 +1,52 @@
+{
+    "best_metric": null,
+    "best_model_checkpoint": null,
+    "epoch": 1.0,
+    "global_step": 340,
+    "is_hyper_param_search": false,
+    "is_local_process_zero": true,
+    "is_world_process_zero": true,
+    "log_history": [
+        {
+            "epoch": 0.15,
+            "learning_rate": 1.1750000000000001e-05,
+            "loss": 15.017,
+            "step": 50
+        },
+        {
+            "epoch": 0.29,
+            "learning_rate": 2.425e-05,
+            "loss": 6.7134,
+            "step": 100
+        },
+        {
+            "epoch": 0.44,
+            "learning_rate": 3.675e-05,
+            "loss": 4.3869,
+            "step": 150
+        },
+        {
+            "epoch": 0.59,
+            "learning_rate": 4.9250000000000004e-05,
+            "loss": 3.6209,
+            "step": 200
+        },
+        {
+            "epoch": 0.74,
+            "learning_rate": 6.175e-05,
+            "loss": 3.2011,
+            "step": 250
+        },
+        {
+            "epoch": 0.88,
+            "learning_rate": 7.425e-05,
+            "loss": 3.0513,
+            "step": 300
+        }
+    ],
+    "max_steps": 1700,
+    "num_train_epochs": 5,
+    "total_flos": 1.7302176965482906e+18,
+    "trial_name": null,
+    "trial_params": null
+}
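The logged learning rates are consistent with the README's linear warmup (0.0005 over 2000 steps, i.e. 2.5e-7 per step); a quick check on the `log_history` entries above:

```python
# Slope between consecutive logged learning rates should equal
# learning_rate / warmup_steps = 0.0005 / 2000 = 2.5e-7 per step.
logged = {50: 1.175e-05, 100: 2.425e-05, 150: 3.675e-05,
          200: 4.925e-05, 250: 6.175e-05, 300: 7.425e-05}
steps = sorted(logged)
slopes = [(logged[b] - logged[a]) / (b - a) for a, b in zip(steps, steps[1:])]
print(slopes)  # each element is 2.5e-07
```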
training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:82d5cc94b6f50c93cc3ab3c2b1e2b036aee795930a890ce78840feb7035dda43
+size 3055
vocab.json
ADDED
@@ -0,0 +1 @@
+{"ء": 1, "ا": 2, "ب": 3, "ة": 4, "ت": 5, "ث": 6, "ج": 7, "ح": 8, "خ": 9, "د": 10, "ذ": 11, "ر": 12, "ز": 13, "س": 14, "ش": 15, "ص": 16, "ض": 17, "ط": 18, "ظ": 19, "ع": 20, "غ": 21, "ف": 22, "ق": 23, "ك": 24, "ل": 25, "م": 26, "ن": 27, "ه": 28, "و": 29, "ي": 30, "|": 0, "[UNK]": 31, "[PAD]": 32}
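Finally, how this vocabulary is consumed: together with `tokenizer_config.json` it defines a character-level CTC tokenizer, with `|` (id 0) standing in for spaces. A loading sketch (run from the repo root):

```python
# Build the CTC tokenizer directly from the committed vocab file.
from transformers import Wav2Vec2CTCTokenizer

tok = Wav2Vec2CTCTokenizer(
    "vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",  # decoded back to a space between words
)
print(tok("مرحبا").input_ids)  # one id per Arabic character
```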