Add LM
- .gitignore +1 -0
- .ipynb_checkpoints/README-checkpoint.md +90 -0
- .ipynb_checkpoints/added_tokens-checkpoint.json +1 -0
- .ipynb_checkpoints/all_results-checkpoint.json +15 -0
- .ipynb_checkpoints/config-checkpoint.json +107 -0
- .ipynb_checkpoints/eval-checkpoint.py +141 -0
- .ipynb_checkpoints/eval_results-checkpoint.json +9 -0
- .ipynb_checkpoints/preprocessor_config-checkpoint.json +9 -0
- .ipynb_checkpoints/tokenizer_config-checkpoint.json +1 -0
- .ipynb_checkpoints/train_results-checkpoint.json +9 -0
- .ipynb_checkpoints/trainer_state-checkpoint.json +52 -0
- .ipynb_checkpoints/vocab-checkpoint.json +1 -0
- README.md +90 -0
- added_tokens.json +1 -0
- all_results.json +15 -0
- config.json +107 -0
- eval.py +141 -0
- eval_results.json +5 -0
- preprocessor_config.json +9 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +1 -0
- tokenizer_config.json +1 -0
- train_results.json +9 -0
- trainer_state.json +52 -0
- training_args.bin +3 -0
- vocab.json +1 -0
.gitignore
ADDED
@@ -0,0 +1 @@
+checkpoint-*/
.ipynb_checkpoints/README-checkpoint.md
ADDED
@@ -0,0 +1,90 @@
+---
+language:
+- ar
+license: apache-2.0
+tags:
+- automatic-speech-recognition
+- common_voice
+- generated_from_trainer
+- ar
+- robust-speech-event
+datasets:
+- common_voice
+model-index:
+- name: XLS-R-300M - Arabic
+  results:
+  - task:
+      name: Automatic Speech Recognition
+      type: automatic-speech-recognition
+    dataset:
+      name: Robust Speech Event - Dev Data
+      type: speech-recognition-community-v2/dev_data
+      args: ar
+    metrics:
+    - name: Test WER
+      type: wer
+      value: 1.0
+    - name: Test CER
+      type: cer
+      value: 1.0
+
+---
+
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+
+# wav2vec2-xls-r-300m-ar
+
+This model is a fine-tuned version of [facebook/wav2vec2-xls-r-300m](https://huggingface.co/facebook/wav2vec2-xls-r-300m) on the COMMON_VOICE - AR dataset.
+It achieves the following results on the evaluation set:
+- eval_loss: 3.0191
+- eval_wer: 1.0
+- eval_runtime: 252.2389
+- eval_samples_per_second: 30.217
+- eval_steps_per_second: 0.476
+- epoch: 1.0
+- step: 340
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 0.0005
+- train_batch_size: 64
+- eval_batch_size: 64
+- seed: 42
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: linear
+- lr_scheduler_warmup_steps: 2000
+- num_epochs: 5
+- mixed_precision_training: Native AMP
+
+### Framework versions
+
+- Transformers 4.17.0.dev0
+- Pytorch 1.10.2+cu102
+- Datasets 1.18.2.dev0
+- Tokenizers 0.11.0
+
+#### Evaluation Commands
+
+Please use the evaluation script `eval.py` included in the repo.
+
+1. To evaluate on `speech-recognition-community-v2/dev_data`
+
+```bash
+python eval.py --model_id nouamanetazi/wav2vec2-xls-r-300m-ar --dataset speech-recognition-community-v2/dev_data --config ar --split validation --chunk_length_s 5.0 --stride_length_s 1.0
+```
.ipynb_checkpoints/added_tokens-checkpoint.json
ADDED
@@ -0,0 +1 @@
+{"<s>": 33, "</s>": 34}
.ipynb_checkpoints/all_results-checkpoint.json
ADDED
@@ -0,0 +1,15 @@
+{
+    "epoch": 5.0,
+    "eval_loss": 26.253612518310547,
+    "eval_runtime": 5.982,
+    "eval_samples": 128,
+    "eval_samples_per_second": 21.397,
+    "eval_steps_per_second": 0.334,
+    "eval_wer": 1.0,
+    "total_flos": 1.3476444758728704e+17,
+    "train_loss": 19.624227905273436,
+    "train_runtime": 37.1321,
+    "train_samples": 128,
+    "train_samples_per_second": 17.236,
+    "train_steps_per_second": 0.269
+}
.ipynb_checkpoints/config-checkpoint.json
ADDED
@@ -0,0 +1,107 @@
+{
+  "_name_or_path": "facebook/wav2vec2-xls-r-300m",
+  "activation_dropout": 0.1,
+  "adapter_kernel_size": 3,
+  "adapter_stride": 2,
+  "add_adapter": false,
+  "apply_spec_augment": true,
+  "architectures": [
+    "Wav2Vec2ForCTC"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "classifier_proj_size": 256,
+  "codevector_dim": 768,
+  "contrastive_logits_temperature": 0.1,
+  "conv_bias": true,
+  "conv_dim": [
+    512,
+    512,
+    512,
+    512,
+    512,
+    512,
+    512
+  ],
+  "conv_kernel": [
+    10,
+    3,
+    3,
+    3,
+    3,
+    2,
+    2
+  ],
+  "conv_stride": [
+    5,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2
+  ],
+  "ctc_loss_reduction": "mean",
+  "ctc_zero_infinity": false,
+  "diversity_loss_weight": 0.1,
+  "do_stable_layer_norm": true,
+  "eos_token_id": 2,
+  "feat_extract_activation": "gelu",
+  "feat_extract_dropout": 0.0,
+  "feat_extract_norm": "layer",
+  "feat_proj_dropout": 0.0,
+  "feat_quantizer_dropout": 0.0,
+  "final_dropout": 0.0,
+  "hidden_act": "gelu",
+  "hidden_dropout": 0.0,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "layerdrop": 0.0,
+  "mask_feature_length": 64,
+  "mask_feature_min_masks": 0,
+  "mask_feature_prob": 0.25,
+  "mask_time_length": 10,
+  "mask_time_min_masks": 2,
+  "mask_time_prob": 0.75,
+  "model_type": "wav2vec2",
+  "num_adapter_layers": 3,
+  "num_attention_heads": 16,
+  "num_codevector_groups": 2,
+  "num_codevectors_per_group": 320,
+  "num_conv_pos_embedding_groups": 16,
+  "num_conv_pos_embeddings": 128,
+  "num_feat_extract_layers": 7,
+  "num_hidden_layers": 24,
+  "num_negatives": 100,
+  "output_hidden_size": 1024,
+  "pad_token_id": 32,
+  "proj_codevector_dim": 768,
+  "tdnn_dilation": [
+    1,
+    2,
+    3,
+    1,
+    1
+  ],
+  "tdnn_dim": [
+    512,
+    512,
+    512,
+    512,
+    1500
+  ],
+  "tdnn_kernel": [
+    5,
+    3,
+    3,
+    1,
+    1
+  ],
+  "torch_dtype": "float32",
+  "transformers_version": "4.17.0.dev0",
+  "use_weighted_layer_sum": false,
+  "vocab_size": 35,
+  "xvector_output_dim": 512
+}
.ipynb_checkpoints/eval-checkpoint.py
ADDED
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+from datasets import load_dataset, load_metric, Audio, Dataset
+from transformers import pipeline, AutoFeatureExtractor
+import re
+import argparse
+import unicodedata
+from typing import Dict
+
+
+def log_results(result: Dataset, args: Dict[str, str]):
+    """ DO NOT CHANGE. This function computes and logs the result metrics. """
+
+    log_outputs = args.log_outputs
+    dataset_id = "_".join(args.dataset.split("/") + [args.config, args.split])
+
+    # load metric
+    wer = load_metric("wer")
+    cer = load_metric("cer")
+
+    # compute metrics
+    wer_result = wer.compute(references=result["target"], predictions=result["prediction"])
+    cer_result = cer.compute(references=result["target"], predictions=result["prediction"])
+
+    # print & log results
+    result_str = (
+        f"WER: {wer_result}\n"
+        f"CER: {cer_result}"
+    )
+    print(result_str)
+
+    with open(f"{dataset_id}_eval_results.txt", "w") as f:
+        f.write(result_str)
+
+    # log all results in text file. Possibly interesting for analysis
+    if log_outputs:
+        pred_file = f"log_{dataset_id}_predictions.txt"
+        target_file = f"log_{dataset_id}_targets.txt"
+
+        with open(pred_file, "w") as p, open(target_file, "w") as t:
+
+            # mapping function to write output
+            def write_to_file(batch, i):
+                p.write(f"{i}" + "\n")
+                p.write(batch["prediction"] + "\n")
+                t.write(f"{i}" + "\n")
+                t.write(batch["target"] + "\n")
+
+            result.map(write_to_file, with_indices=True)
+
+
+# Normalize arabic
+def normalizeArabic(text):
+    # https://alraqmiyyat.github.io/2013/01-02.html
+    text = re.sub("[إأٱآا]", "ا", text)
+    text = re.sub("ى", "ي", text)
+    text = re.sub("ؤ", "ء", text)
+    text = re.sub("ئ", "ء", text)
+
+    # keep only characters which unicode \u0600-\u06FF and space
+    text = re.sub(r"[^\u0600-\u06FF ]", "", text)
+    return text
+
+def normalize_text(text: str) -> str:
+    """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
+
+    chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–]'  # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
+
+    text = re.sub(chars_to_ignore_regex, "", text.lower())
+
+    # In addition, we can normalize the target text, e.g. removing new lines characters etc...
+    # note that order is important here!
+    token_sequences_to_ignore = ["\n\n", "\n", "   ", "  "]
+
+    for t in token_sequences_to_ignore:
+        text = " ".join(text.split(t))
+
+    text = normalizeArabic(text)
+
+    return text
+
+
+def main(args):
+    # load dataset
+    dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
+
+    # for testing: only process the first two examples as a test
+    # dataset = dataset.select(range(10))
+
+    # load processor
+    feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
+    sampling_rate = feature_extractor.sampling_rate
+
+    # resample audio
+    dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
+
+    # load eval pipeline
+    asr = pipeline("automatic-speech-recognition", model=args.model_id)
+
+    # map function to decode audio
+    def map_to_pred(batch):
+        prediction = asr(batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s)
+
+        batch["prediction"] = prediction["text"]
+        batch["target"] = normalize_text(batch["sentence"])
+        return batch
+
+    # run inference on all examples
+    result = dataset.map(map_to_pred, remove_columns=dataset.column_names)
+
+    # compute and log_results
+    # do not change function below
+    log_results(result, args)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--model_id", type=str, required=True, help="Model identifier. Should be loadable with 🤗 Transformers"
+    )
+    parser.add_argument(
+        "--dataset", type=str, required=True, help="Dataset name to evaluate the `model_id`. Should be loadable with 🤗 Datasets"
+    )
+    parser.add_argument(
+        "--config", type=str, required=True, help="Config of the dataset. *E.g.* `'en'` for Common Voice"
+    )
+    parser.add_argument(
+        "--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`"
+    )
+    parser.add_argument(
+        "--chunk_length_s", type=float, default=None, help="Chunk length in seconds. Defaults to None. For long audio files a good value would be 5.0 seconds."
+    )
+    parser.add_argument(
+        "--stride_length_s", type=float, default=None, help="Stride of the audio chunks. Defaults to None. For long audio files a good value would be 1.0 seconds."
+    )
+    parser.add_argument(
+        "--log_outputs", action='store_true', help="If defined, write outputs to log file for analysis."
+    )
+    args = parser.parse_args()
+
+    main(args)
.ipynb_checkpoints/eval_results-checkpoint.json
ADDED
@@ -0,0 +1,9 @@
+{
+    "epoch": 20.0,
+    "eval_loss": 6.937458515167236,
+    "eval_runtime": 5.7217,
+    "eval_samples": 128,
+    "eval_samples_per_second": 22.371,
+    "eval_steps_per_second": 0.35,
+    "eval_wer": 1.0
+}
.ipynb_checkpoints/preprocessor_config-checkpoint.json
ADDED
@@ -0,0 +1,9 @@
+{
+  "do_normalize": true,
+  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+  "feature_size": 1,
+  "padding_side": "right",
+  "padding_value": 0,
+  "return_attention_mask": true,
+  "sampling_rate": 16000
+}
.ipynb_checkpoints/tokenizer_config-checkpoint.json
ADDED
@@ -0,0 +1 @@
+{"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./wav2vec2-xls-r-300m-ar", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
.ipynb_checkpoints/train_results-checkpoint.json
ADDED
@@ -0,0 +1,9 @@
+{
+    "epoch": 5.0,
+    "total_flos": 1.3476444758728704e+17,
+    "train_loss": 16.66825189590454,
+    "train_runtime": 91.9274,
+    "train_samples": 128,
+    "train_samples_per_second": 6.962,
+    "train_steps_per_second": 0.109
+}
.ipynb_checkpoints/trainer_state-checkpoint.json
ADDED
@@ -0,0 +1,52 @@
+{
+    "best_metric": null,
+    "best_model_checkpoint": null,
+    "epoch": 1.0,
+    "global_step": 340,
+    "is_hyper_param_search": false,
+    "is_local_process_zero": true,
+    "is_world_process_zero": true,
+    "log_history": [
+        {
+            "epoch": 0.15,
+            "learning_rate": 1.1750000000000001e-05,
+            "loss": 15.017,
+            "step": 50
+        },
+        {
+            "epoch": 0.29,
+            "learning_rate": 2.425e-05,
+            "loss": 6.7134,
+            "step": 100
+        },
+        {
+            "epoch": 0.44,
+            "learning_rate": 3.675e-05,
+            "loss": 4.3869,
+            "step": 150
+        },
+        {
+            "epoch": 0.59,
+            "learning_rate": 4.9250000000000004e-05,
+            "loss": 3.6209,
+            "step": 200
+        },
+        {
+            "epoch": 0.74,
+            "learning_rate": 6.175e-05,
+            "loss": 3.2011,
+            "step": 250
+        },
+        {
+            "epoch": 0.88,
+            "learning_rate": 7.425e-05,
+            "loss": 3.0513,
+            "step": 300
+        }
+    ],
+    "max_steps": 1700,
+    "num_train_epochs": 5,
+    "total_flos": 1.7302176965482906e+18,
+    "trial_name": null,
+    "trial_params": null
+}
.ipynb_checkpoints/vocab-checkpoint.json
ADDED
@@ -0,0 +1 @@
+{"_": 1, "e": 2, "g": 3, "t": 4, "\u00ab": 5, "\u00bb": 6, "\u061b": 7, "\u0621": 8, "\u0627": 9, "\u0628": 10, "\u0629": 11, "\u062a": 12, "\u062b": 13, "\u062c": 14, "\u062d": 15, "\u062e": 16, "\u062f": 17, "\u0630": 18, "\u0631": 19, "\u0632": 20, "\u0633": 21, "\u0634": 22, "\u0635": 23, "\u0636": 24, "\u0637": 25, "\u0638": 26, "\u0639": 27, "\u063a": 28, "\u0641": 29, "\u0642": 30, "\u0643": 31, "\u0644": 32, "\u0645": 33, "\u0646": 34, "\u0647": 35, "\u0648": 36, "\u064a": 37, "\u0670": 38, "\u0686": 39, "\u06a8": 40, "\u06a9": 41, "\u06be": 42, "\u06cc": 43, "\u06d6": 44, "\u06da": 45, "\u262d": 46, "\ufe83": 47, "\ufefb": 48, "|": 0, "[UNK]": 49, "[PAD]": 50}
README.md
ADDED
@@ -0,0 +1,90 @@
+---
+language:
+- ar
+license: apache-2.0
+tags:
+- automatic-speech-recognition
+- common_voice
+- generated_from_trainer
+- ar
+- robust-speech-event
+datasets:
+- common_voice
+model-index:
+- name: XLS-R-300M - Arabic
+  results:
+  - task:
+      name: Automatic Speech Recognition
+      type: automatic-speech-recognition
+    dataset:
+      name: Robust Speech Event - Dev Data
+      type: speech-recognition-community-v2/dev_data
+      args: ar
+    metrics:
+    - name: Test WER
+      type: wer
+      value: 1.0
+    - name: Test CER
+      type: cer
+      value: 1.0
+
+---
+
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+
+# wav2vec2-xls-r-300m-ar
+
+This model is a fine-tuned version of [facebook/wav2vec2-xls-r-300m](https://huggingface.co/facebook/wav2vec2-xls-r-300m) on the COMMON_VOICE - AR dataset.
+It achieves the following results on the evaluation set:
+- eval_loss: 3.0191
+- eval_wer: 1.0
+- eval_runtime: 252.2389
+- eval_samples_per_second: 30.217
+- eval_steps_per_second: 0.476
+- epoch: 1.0
+- step: 340
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 0.0005
+- train_batch_size: 64
+- eval_batch_size: 64
+- seed: 42
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: linear
+- lr_scheduler_warmup_steps: 2000
+- num_epochs: 5
+- mixed_precision_training: Native AMP
+
+### Framework versions
+
+- Transformers 4.17.0.dev0
+- Pytorch 1.10.2+cu102
+- Datasets 1.18.2.dev0
+- Tokenizers 0.11.0
+
+#### Evaluation Commands
+
+Please use the evaluation script `eval.py` included in the repo.
+
+1. To evaluate on `speech-recognition-community-v2/dev_data`
+
+```bash
+python eval.py --model_id nouamanetazi/wav2vec2-xls-r-300m-ar --dataset speech-recognition-community-v2/dev_data --config ar --split validation --chunk_length_s 5.0 --stride_length_s 1.0
+```
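The card's usage sections are still stubs; for orientation, a minimal inference sketch (not part of this commit; the audio path is a placeholder, and `transformers` with ffmpeg support is assumed):

```python
# Hypothetical usage sketch: transcribe a single audio file with this checkpoint.
from transformers import pipeline

asr = pipeline("automatic-speech-recognition", model="nouamanetazi/wav2vec2-xls-r-300m-ar")
# "sample.wav" is a placeholder path; the chunking values mirror the eval command above.
result = asr("sample.wav", chunk_length_s=5.0, stride_length_s=1.0)
print(result["text"])
```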
added_tokens.json
ADDED
@@ -0,0 +1 @@
+{"<s>": 33, "</s>": 34}
all_results.json
ADDED
@@ -0,0 +1,15 @@
+{
+    "epoch": 20.0,
+    "eval_loss": 3.0191357135772705,
+    "eval_runtime": 5.7217,
+    "eval_samples": 7622,
+    "eval_samples_per_second": 22.371,
+    "eval_steps_per_second": 0.35,
+    "eval_wer": 1.0,
+    "total_flos": 5.430583918308557e+17,
+    "train_loss": 8.69529299736023,
+    "train_runtime": 243.8197,
+    "train_samples": 128,
+    "train_samples_per_second": 10.5,
+    "train_steps_per_second": 0.164
+}
config.json
ADDED
@@ -0,0 +1,107 @@
+{
+  "_name_or_path": "facebook/wav2vec2-xls-r-300m",
+  "activation_dropout": 0.1,
+  "adapter_kernel_size": 3,
+  "adapter_stride": 2,
+  "add_adapter": false,
+  "apply_spec_augment": true,
+  "architectures": [
+    "Wav2Vec2ForCTC"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "classifier_proj_size": 256,
+  "codevector_dim": 768,
+  "contrastive_logits_temperature": 0.1,
+  "conv_bias": true,
+  "conv_dim": [
+    512,
+    512,
+    512,
+    512,
+    512,
+    512,
+    512
+  ],
+  "conv_kernel": [
+    10,
+    3,
+    3,
+    3,
+    3,
+    2,
+    2
+  ],
+  "conv_stride": [
+    5,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2
+  ],
+  "ctc_loss_reduction": "mean",
+  "ctc_zero_infinity": false,
+  "diversity_loss_weight": 0.1,
+  "do_stable_layer_norm": true,
+  "eos_token_id": 2,
+  "feat_extract_activation": "gelu",
+  "feat_extract_dropout": 0.0,
+  "feat_extract_norm": "layer",
+  "feat_proj_dropout": 0.0,
+  "feat_quantizer_dropout": 0.0,
+  "final_dropout": 0.0,
+  "hidden_act": "gelu",
+  "hidden_dropout": 0.0,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "layerdrop": 0.0,
+  "mask_feature_length": 64,
+  "mask_feature_min_masks": 0,
+  "mask_feature_prob": 0.25,
+  "mask_time_length": 10,
+  "mask_time_min_masks": 2,
+  "mask_time_prob": 0.75,
+  "model_type": "wav2vec2",
+  "num_adapter_layers": 3,
+  "num_attention_heads": 16,
+  "num_codevector_groups": 2,
+  "num_codevectors_per_group": 320,
+  "num_conv_pos_embedding_groups": 16,
+  "num_conv_pos_embeddings": 128,
+  "num_feat_extract_layers": 7,
+  "num_hidden_layers": 24,
+  "num_negatives": 100,
+  "output_hidden_size": 1024,
+  "pad_token_id": 41,
+  "proj_codevector_dim": 768,
+  "tdnn_dilation": [
+    1,
+    2,
+    3,
+    1,
+    1
+  ],
+  "tdnn_dim": [
+    512,
+    512,
+    512,
+    512,
+    1500
+  ],
+  "tdnn_kernel": [
+    5,
+    3,
+    3,
+    1,
+    1
+  ],
+  "torch_dtype": "float32",
+  "transformers_version": "4.17.0.dev0",
+  "use_weighted_layer_sum": false,
+  "vocab_size": 44,
+  "xvector_output_dim": 512
+}
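A note on the config above: the seven `conv_stride` values determine how much the feature encoder downsamples raw audio, which fixes the CTC frame rate. A quick check (a sketch derived only from the values in `config.json`):

```python
# Product of conv_stride gives the number of raw samples per encoder output frame.
from math import prod

conv_stride = [5, 2, 2, 2, 2, 2, 2]
samples_per_frame = prod(conv_stride)                    # 320 samples
print(1000 * samples_per_frame / 16000, "ms per frame")  # 20.0 ms of audio at 16 kHz
```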
eval.py
ADDED
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+from datasets import load_dataset, load_metric, Audio, Dataset
+from transformers import pipeline, AutoFeatureExtractor
+import re
+import argparse
+import unicodedata
+from typing import Dict
+
+
+def log_results(result: Dataset, args: Dict[str, str]):
+    """ DO NOT CHANGE. This function computes and logs the result metrics. """
+
+    log_outputs = args.log_outputs
+    dataset_id = "_".join(args.dataset.split("/") + [args.config, args.split])
+
+    # load metric
+    wer = load_metric("wer")
+    cer = load_metric("cer")
+
+    # compute metrics
+    wer_result = wer.compute(references=result["target"], predictions=result["prediction"])
+    cer_result = cer.compute(references=result["target"], predictions=result["prediction"])
+
+    # print & log results
+    result_str = (
+        f"WER: {wer_result}\n"
+        f"CER: {cer_result}"
+    )
+    print(result_str)
+
+    with open(f"{dataset_id}_eval_results.txt", "w") as f:
+        f.write(result_str)
+
+    # log all results in text file. Possibly interesting for analysis
+    if log_outputs:
+        pred_file = f"log_{dataset_id}_predictions.txt"
+        target_file = f"log_{dataset_id}_targets.txt"
+
+        with open(pred_file, "w") as p, open(target_file, "w") as t:
+
+            # mapping function to write output
+            def write_to_file(batch, i):
+                p.write(f"{i}" + "\n")
+                p.write(batch["prediction"] + "\n")
+                t.write(f"{i}" + "\n")
+                t.write(batch["target"] + "\n")
+
+            result.map(write_to_file, with_indices=True)
+
+
+# Normalize arabic
+def normalizeArabic(text):
+    # https://alraqmiyyat.github.io/2013/01-02.html
+    text = re.sub("[إأٱآا]", "ا", text)
+    text = re.sub("ى", "ي", text)
+    text = re.sub("ؤ", "ء", text)
+    text = re.sub("ئ", "ء", text)
+
+    # keep only characters which unicode \u0600-\u06FF and space
+    text = re.sub(r"[^\u0600-\u06FF ]", "", text)
+    return text
+
+def normalize_text(text: str) -> str:
+    """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
+
+    chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–]'  # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
+
+    text = re.sub(chars_to_ignore_regex, "", text.lower())
+
+    # In addition, we can normalize the target text, e.g. removing new lines characters etc...
+    # note that order is important here!
+    token_sequences_to_ignore = ["\n\n", "\n", "   ", "  "]
+
+    for t in token_sequences_to_ignore:
+        text = " ".join(text.split(t))
+
+    text = normalizeArabic(text)
+
+    return text
+
+
+def main(args):
+    # load dataset
+    dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)
+
+    # for testing: only process the first two examples as a test
+    # dataset = dataset.select(range(10))
+
+    # load processor
+    feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
+    sampling_rate = feature_extractor.sampling_rate
+
+    # resample audio
+    dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
+
+    # load eval pipeline
+    asr = pipeline("automatic-speech-recognition", model=args.model_id)
+
+    # map function to decode audio
+    def map_to_pred(batch):
+        prediction = asr(batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s)
+
+        batch["prediction"] = prediction["text"]
+        batch["target"] = normalize_text(batch["sentence"])
+        return batch
+
+    # run inference on all examples
+    result = dataset.map(map_to_pred, remove_columns=dataset.column_names)
+
+    # compute and log_results
+    # do not change function below
+    log_results(result, args)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--model_id", type=str, required=True, help="Model identifier. Should be loadable with 🤗 Transformers"
+    )
+    parser.add_argument(
+        "--dataset", type=str, required=True, help="Dataset name to evaluate the `model_id`. Should be loadable with 🤗 Datasets"
+    )
+    parser.add_argument(
+        "--config", type=str, required=True, help="Config of the dataset. *E.g.* `'en'` for Common Voice"
+    )
+    parser.add_argument(
+        "--split", type=str, required=True, help="Split of the dataset. *E.g.* `'test'`"
+    )
+    parser.add_argument(
+        "--chunk_length_s", type=float, default=None, help="Chunk length in seconds. Defaults to None. For long audio files a good value would be 5.0 seconds."
+    )
+    parser.add_argument(
+        "--stride_length_s", type=float, default=None, help="Stride of the audio chunks. Defaults to None. For long audio files a good value would be 1.0 seconds."
+    )
+    parser.add_argument(
+        "--log_outputs", action='store_true', help="If defined, write outputs to log file for analysis."
+    )
+    args = parser.parse_args()
+
+    main(args)
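For intuition about the normalization in `eval.py`, a small sanity check (assumes it is run from the repo root so `eval.py` is importable):

```python
# Demonstrate the Arabic normalization applied to reference transcripts.
from eval import normalizeArabic

print(normalizeArabic("أإآٱا"))   # all alef variants collapse to bare alef: "ااااا"
print(normalizeArabic("مستشفى"))  # final alef maqsura becomes ya: "مستشفي"
```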
eval_results.json
ADDED
@@ -0,0 +1,5 @@
+{
+    "eval_loss": 3.0191357135772705,
+    "eval_samples": 7622,
+    "eval_wer": 1.0
+}
preprocessor_config.json
ADDED
@@ -0,0 +1,9 @@
+{
+  "do_normalize": true,
+  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+  "feature_size": 1,
+  "padding_side": "right",
+  "padding_value": 0,
+  "return_attention_mask": true,
+  "sampling_rate": 16000
+}
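The preprocessor config pins the expected input format: mono float features at 16 kHz with attention masks. A one-line check when loading from the hub (a sketch; the model id comes from the README's eval command):

```python
from transformers import AutoFeatureExtractor

fe = AutoFeatureExtractor.from_pretrained("nouamanetazi/wav2vec2-xls-r-300m-ar")
assert fe.sampling_rate == 16000  # matches eval.py's cast_column resampling step
```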
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9bb293ce9691ca8a0f2f673c58ae6116242a0d7831f8149ba99c6208d6c79c1d
+size 1262104049
special_tokens_map.json
ADDED
@@ -0,0 +1 @@
+{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "[UNK]", "pad_token": "[PAD]", "additional_special_tokens": [{"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
tokenizer_config.json
ADDED
@@ -0,0 +1 @@
+{"unk_token": "[UNK]", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "[PAD]", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "./wav2vec2-xls-r-300m-ar", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
train_results.json
ADDED
@@ -0,0 +1,9 @@
+{
+    "epoch": 20.0,
+    "total_flos": 5.430583918308557e+17,
+    "train_loss": 8.69529299736023,
+    "train_runtime": 243.8197,
+    "train_samples": 128,
+    "train_samples_per_second": 10.5,
+    "train_steps_per_second": 0.164
+}
trainer_state.json
ADDED
@@ -0,0 +1,52 @@
+{
+    "best_metric": null,
+    "best_model_checkpoint": null,
+    "epoch": 1.0,
+    "global_step": 340,
+    "is_hyper_param_search": false,
+    "is_local_process_zero": true,
+    "is_world_process_zero": true,
+    "log_history": [
+        {
+            "epoch": 0.15,
+            "learning_rate": 1.1750000000000001e-05,
+            "loss": 15.017,
+            "step": 50
+        },
+        {
+            "epoch": 0.29,
+            "learning_rate": 2.425e-05,
+            "loss": 6.7134,
+            "step": 100
+        },
+        {
+            "epoch": 0.44,
+            "learning_rate": 3.675e-05,
+            "loss": 4.3869,
+            "step": 150
+        },
+        {
+            "epoch": 0.59,
+            "learning_rate": 4.9250000000000004e-05,
+            "loss": 3.6209,
+            "step": 200
+        },
+        {
+            "epoch": 0.74,
+            "learning_rate": 6.175e-05,
+            "loss": 3.2011,
+            "step": 250
+        },
+        {
+            "epoch": 0.88,
+            "learning_rate": 7.425e-05,
+            "loss": 3.0513,
+            "step": 300
+        }
+    ],
+    "max_steps": 1700,
+    "num_train_epochs": 5,
+    "total_flos": 1.7302176965482906e+18,
+    "trial_name": null,
+    "trial_params": null
+}
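The logged learning rates are consistent with the README's linear warmup (0.0005 over 2000 steps, i.e. 2.5e-7 per step); a quick check on the `log_history` entries above:

```python
# Slope between consecutive logged learning rates should equal
# learning_rate / warmup_steps = 0.0005 / 2000 = 2.5e-7 per step.
logged = {50: 1.175e-05, 100: 2.425e-05, 150: 3.675e-05,
          200: 4.925e-05, 250: 6.175e-05, 300: 7.425e-05}
steps = sorted(logged)
slopes = [(logged[b] - logged[a]) / (b - a) for a, b in zip(steps, steps[1:])]
print(slopes)  # each element is 2.5e-07
```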
training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:82d5cc94b6f50c93cc3ab3c2b1e2b036aee795930a890ce78840feb7035dda43
+size 3055
vocab.json
ADDED
@@ -0,0 +1 @@
+{"ء": 1, "ا": 2, "ب": 3, "ة": 4, "ت": 5, "ث": 6, "ج": 7, "ح": 8, "خ": 9, "د": 10, "ذ": 11, "ر": 12, "ز": 13, "س": 14, "ش": 15, "ص": 16, "ض": 17, "ط": 18, "ظ": 19, "ع": 20, "غ": 21, "ف": 22, "ق": 23, "ك": 24, "ل": 25, "م": 26, "ن": 27, "ه": 28, "و": 29, "ي": 30, "|": 0, "[UNK]": 31, "[PAD]": 32}
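Finally, how this vocabulary is consumed: together with `tokenizer_config.json` it defines a character-level CTC tokenizer, with `|` (id 0) standing in for spaces. A loading sketch (run from the repo root):

```python
# Build the CTC tokenizer directly from the committed vocab file.
from transformers import Wav2Vec2CTCTokenizer

tok = Wav2Vec2CTCTokenizer(
    "vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",  # decoded back to a space between words
)
print(tok("مرحبا").input_ids)  # one id per Arabic character
```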