Priyajay committed on
Commit
f24cf8d
1 Parent(s): 4d346e6

Model save

Browse files
.ipynb_checkpoints/run-checkpoint.sh CHANGED
@@ -1,6 +1,6 @@
1
  python run_speech_recognition_ctc.py \
2
  --dataset_name="common_voice" \
3
- --model_name_or_path="facebook/wav2vec2-large-xlsr-53" \
4
  --dataset_config_name="ab" \
5
  --output_dir="./" \
6
  --overwrite_output_dir \
 
1
  python run_speech_recognition_ctc.py \
2
  --dataset_name="common_voice" \
3
+ --model_name_or_path="hf-test/xls-r-dummy" \
4
  --dataset_config_name="ab" \
5
  --output_dir="./" \
6
  --overwrite_output_dir \
README.md DELETED
@@ -1,64 +0,0 @@
1
- ---
2
- language:
3
- - ab
4
- license: apache-2.0
5
- tags:
6
- - automatic-speech-recognition
7
- - common_voice
8
- - generated_from_trainer
9
- datasets:
10
- - common_voice
11
- model-index:
12
- - name: ''
13
- results: []
14
- ---
15
-
16
- <!-- This model card has been generated automatically according to the information the Trainer had access to. You
17
- should probably proofread and complete it, then remove this comment. -->
18
-
19
- #
20
-
21
- This model is a fine-tuned version of [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on the COMMON_VOICE - AB dataset.
22
- It achieves the following results on the evaluation set:
23
- - Loss: 15.1833
24
- - Wer: 1.0
25
-
26
- ## Model description
27
-
28
- More information needed
29
-
30
- ## Intended uses & limitations
31
-
32
- More information needed
33
-
34
- ## Training and evaluation data
35
-
36
- More information needed
37
-
38
- ## Training procedure
39
-
40
- ### Training hyperparameters
41
-
42
- The following hyperparameters were used during training:
43
- - learning_rate: 0.0003
44
- - train_batch_size: 16
45
- - eval_batch_size: 8
46
- - seed: 42
47
- - gradient_accumulation_steps: 2
48
- - total_train_batch_size: 32
49
- - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
50
- - lr_scheduler_type: linear
51
- - lr_scheduler_warmup_steps: 500
52
- - num_epochs: 2.0
53
- - mixed_precision_training: Native AMP
54
-
55
- ### Training results
56
-
57
-
58
-
59
- ### Framework versions
60
-
61
- - Transformers 4.17.0.dev0
62
- - Pytorch 1.10.2+cu102
63
- - Datasets 1.18.2.dev0
64
- - Tokenizers 0.11.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
all_results.json DELETED
@@ -1,14 +0,0 @@
1
- {
2
- "epoch": 2.0,
3
- "eval_loss": 15.183259963989258,
4
- "eval_runtime": 0.5122,
5
- "eval_samples": 9,
6
- "eval_samples_per_second": 17.572,
7
- "eval_steps_per_second": 3.905,
8
- "eval_wer": 1.0,
9
- "train_loss": 21.219526290893555,
10
- "train_runtime": 4.3872,
11
- "train_samples": 22,
12
- "train_samples_per_second": 10.029,
13
- "train_steps_per_second": 0.456
14
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "facebook/wav2vec2-large-xlsr-53",
3
  "activation_dropout": 0.0,
4
  "adapter_kernel_size": 3,
5
  "adapter_stride": 2,
@@ -11,35 +11,23 @@
11
  "attention_dropout": 0.0,
12
  "bos_token_id": 1,
13
  "classifier_proj_size": 256,
14
- "codevector_dim": 768,
15
  "contrastive_logits_temperature": 0.1,
16
- "conv_bias": true,
17
  "conv_dim": [
18
- 512,
19
- 512,
20
- 512,
21
- 512,
22
- 512,
23
- 512,
24
- 512
25
  ],
26
  "conv_kernel": [
27
- 10,
28
- 3,
29
- 3,
30
- 3,
31
- 3,
32
- 2,
33
- 2
34
  ],
35
  "conv_stride": [
36
- 5,
37
- 2,
38
- 2,
39
- 2,
40
- 2,
41
- 2,
42
- 2
43
  ],
44
  "ctc_loss_reduction": "mean",
45
  "ctc_zero_infinity": false,
@@ -54,38 +42,31 @@
54
  "final_dropout": 0.0,
55
  "hidden_act": "gelu",
56
  "hidden_dropout": 0.0,
57
- "hidden_size": 1024,
 
58
  "initializer_range": 0.02,
59
- "intermediate_size": 4096,
60
  "layer_norm_eps": 1e-05,
61
  "layerdrop": 0.0,
62
- "mask_channel_length": 10,
63
- "mask_channel_min_space": 1,
64
- "mask_channel_other": 0.0,
65
- "mask_channel_prob": 0.0,
66
- "mask_channel_selection": "static",
67
  "mask_feature_length": 10,
68
  "mask_feature_min_masks": 0,
69
  "mask_feature_prob": 0.0,
70
  "mask_time_length": 10,
71
  "mask_time_min_masks": 2,
72
- "mask_time_min_space": 1,
73
- "mask_time_other": 0.0,
74
  "mask_time_prob": 0.05,
75
- "mask_time_selection": "static",
76
  "model_type": "wav2vec2",
77
  "num_adapter_layers": 3,
78
- "num_attention_heads": 16,
79
  "num_codevector_groups": 2,
80
  "num_codevectors_per_group": 320,
81
- "num_conv_pos_embedding_groups": 16,
82
- "num_conv_pos_embeddings": 128,
83
- "num_feat_extract_layers": 7,
84
- "num_hidden_layers": 24,
85
- "num_negatives": 100,
86
- "output_hidden_size": 1024,
87
  "pad_token_id": 39,
88
- "proj_codevector_dim": 768,
89
  "tdnn_dilation": [
90
  1,
91
  2,
 
1
  {
2
+ "_name_or_path": "hf-test/xls-r-dummy",
3
  "activation_dropout": 0.0,
4
  "adapter_kernel_size": 3,
5
  "adapter_stride": 2,
 
11
  "attention_dropout": 0.0,
12
  "bos_token_id": 1,
13
  "classifier_proj_size": 256,
14
+ "codevector_dim": 256,
15
  "contrastive_logits_temperature": 0.1,
16
+ "conv_bias": false,
17
  "conv_dim": [
18
+ 32,
19
+ 32,
20
+ 32
 
 
 
 
21
  ],
22
  "conv_kernel": [
23
+ 8,
24
+ 8,
25
+ 8
 
 
 
 
26
  ],
27
  "conv_stride": [
28
+ 4,
29
+ 4,
30
+ 4
 
 
 
 
31
  ],
32
  "ctc_loss_reduction": "mean",
33
  "ctc_zero_infinity": false,
 
42
  "final_dropout": 0.0,
43
  "hidden_act": "gelu",
44
  "hidden_dropout": 0.0,
45
+ "hidden_dropout_prob": 0.1,
46
+ "hidden_size": 16,
47
  "initializer_range": 0.02,
48
+ "intermediate_size": 20,
49
  "layer_norm_eps": 1e-05,
50
  "layerdrop": 0.0,
 
 
 
 
 
51
  "mask_feature_length": 10,
52
  "mask_feature_min_masks": 0,
53
  "mask_feature_prob": 0.0,
54
  "mask_time_length": 10,
55
  "mask_time_min_masks": 2,
 
 
56
  "mask_time_prob": 0.05,
 
57
  "model_type": "wav2vec2",
58
  "num_adapter_layers": 3,
59
+ "num_attention_heads": 2,
60
  "num_codevector_groups": 2,
61
  "num_codevectors_per_group": 320,
62
+ "num_conv_pos_embedding_groups": 2,
63
+ "num_conv_pos_embeddings": 16,
64
+ "num_feat_extract_layers": 3,
65
+ "num_hidden_layers": 4,
66
+ "num_negatives": 10,
67
+ "output_hidden_size": 16,
68
  "pad_token_id": 39,
69
+ "proj_codevector_dim": 256,
70
  "tdnn_dilation": [
71
  1,
72
  2,
eval_results.json DELETED
@@ -1,9 +0,0 @@
1
- {
2
- "epoch": 2.0,
3
- "eval_loss": 15.183259963989258,
4
- "eval_runtime": 0.5122,
5
- "eval_samples": 9,
6
- "eval_samples_per_second": 17.572,
7
- "eval_steps_per_second": 3.905,
8
- "eval_wer": 1.0
9
- }
 
 
 
 
 
 
 
 
 
 
preprocessor_config.json CHANGED
@@ -3,7 +3,7 @@
3
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
  "feature_size": 1,
5
  "padding_side": "right",
6
- "padding_value": 0,
7
- "return_attention_mask": true,
8
  "sampling_rate": 16000
9
  }
 
3
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
  "feature_size": 1,
5
  "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "return_attention_mask": false,
8
  "sampling_rate": 16000
9
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:910c4a1f6ba987dc94174de99e17aea1ba16b0b098742df52036cd0086b2cb63
3
- size 1262095857
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac89a4d5306b1b0af49e557b2cf55ff2eae830d8e96f3da9876d516bfe96bf71
3
+ size 143142
run.sh CHANGED
@@ -1,6 +1,6 @@
1
  python run_speech_recognition_ctc.py \
2
  --dataset_name="common_voice" \
3
- --model_name_or_path="facebook/wav2vec2-large-xlsr-53" \
4
  --dataset_config_name="ab" \
5
  --output_dir="./" \
6
  --overwrite_output_dir \
 
1
  python run_speech_recognition_ctc.py \
2
  --dataset_name="common_voice" \
3
+ --model_name_or_path="hf-test/xls-r-dummy" \
4
  --dataset_config_name="ab" \
5
  --output_dir="./" \
6
  --overwrite_output_dir \
train_results.json DELETED
@@ -1,8 +0,0 @@
1
- {
2
- "epoch": 2.0,
3
- "train_loss": 21.219526290893555,
4
- "train_runtime": 4.3872,
5
- "train_samples": 22,
6
- "train_samples_per_second": 10.029,
7
- "train_steps_per_second": 0.456
8
- }
 
 
 
 
 
 
 
 
 
trainer_state.json DELETED
@@ -1,25 +0,0 @@
1
- {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 2.0,
5
- "global_step": 2,
6
- "is_hyper_param_search": false,
7
- "is_local_process_zero": true,
8
- "is_world_process_zero": true,
9
- "log_history": [
10
- {
11
- "epoch": 2.0,
12
- "step": 2,
13
- "total_flos": 1.171423688905728e+16,
14
- "train_loss": 21.219526290893555,
15
- "train_runtime": 4.3872,
16
- "train_samples_per_second": 10.029,
17
- "train_steps_per_second": 0.456
18
- }
19
- ],
20
- "max_steps": 2,
21
- "num_train_epochs": 2,
22
- "total_flos": 1.171423688905728e+16,
23
- "trial_name": null,
24
- "trial_params": null
25
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ac91995bba3613eae578bea40f065315404269cabdcaf2cee979d28ab550d414
3
  size 2991
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2abf70ee833af8d8c22a2c8a2cbecfd532a51e2b5d92efc5025a369bcd756a19
3
  size 2991