antoniomae1234 committed on
Commit
df6736f
1 Parent(s): 3fbc589

Upload 457 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. README.md +13 -0
  2. TTS/TTS_.models.json +937 -0
  3. TTS/TTS_VERSION +1 -0
  4. TTS/TTS___init__.py +6 -0
  5. TTS/TTS_api.py +489 -0
  6. TTS/TTS_cs_api.py +317 -0
  7. TTS/TTS_model.py +59 -0
  8. TTS/__pycache__/TTS___pycache_____init__.cpython-39.pyc +0 -0
  9. TTS/__pycache__/TTS___pycache___api.cpython-39.pyc +0 -0
  10. TTS/__pycache__/TTS___pycache___cs_api.cpython-39.pyc +0 -0
  11. TTS/__pycache__/TTS___pycache___model.cpython-39.pyc +0 -0
  12. TTS/bin/TTS_bin___init__.py +0 -0
  13. TTS/bin/TTS_bin_collect_env_info.py +48 -0
  14. TTS/bin/TTS_bin_compute_attention_masks.py +165 -0
  15. TTS/bin/TTS_bin_compute_embeddings.py +197 -0
  16. TTS/bin/TTS_bin_compute_statistics.py +96 -0
  17. TTS/bin/TTS_bin_eval_encoder.py +88 -0
  18. TTS/bin/TTS_bin_extract_tts_spectrograms (1).py +287 -0
  19. TTS/bin/TTS_bin_extract_tts_spectrograms.py +287 -0
  20. TTS/bin/TTS_bin_find_unique_chars (1).py +45 -0
  21. TTS/bin/TTS_bin_find_unique_chars.py +45 -0
  22. TTS/bin/TTS_bin_find_unique_phonemes (1).py +74 -0
  23. TTS/bin/TTS_bin_find_unique_phonemes.py +74 -0
  24. TTS/bin/TTS_bin_remove_silence_using_vad (1).py +124 -0
  25. TTS/bin/TTS_bin_remove_silence_using_vad.py +124 -0
  26. TTS/bin/TTS_bin_resample (1).py +90 -0
  27. TTS/bin/TTS_bin_resample.py +90 -0
  28. TTS/bin/TTS_bin_synthesize (1).py +541 -0
  29. TTS/bin/TTS_bin_synthesize.py +541 -0
  30. TTS/bin/TTS_bin_train_encoder (1).py +319 -0
  31. TTS/bin/TTS_bin_train_encoder.py +319 -0
  32. TTS/bin/TTS_bin_train_tts (1).py +71 -0
  33. TTS/bin/TTS_bin_train_tts.py +71 -0
  34. TTS/bin/TTS_bin_train_vocoder.py +77 -0
  35. TTS/bin/TTS_bin_tune_wavegrad.py +103 -0
  36. TTS/config/TTS_config___init__.py +138 -0
  37. TTS/config/TTS_config_shared_configs.py +268 -0
  38. TTS/config/__pycache__/TTS_config___pycache_____init__.cpython-39.pyc +0 -0
  39. TTS/config/__pycache__/TTS_config___pycache___shared_configs.cpython-39.pyc +0 -0
  40. TTS/encoder/TTS_encoder_README.md +18 -0
  41. TTS/encoder/TTS_encoder___init__.py +0 -0
  42. TTS/encoder/TTS_encoder_dataset.py +147 -0
  43. TTS/encoder/TTS_encoder_losses.py +226 -0
  44. TTS/encoder/TTS_encoder_requirements.txt +2 -0
  45. TTS/encoder/__pycache__/TTS_encoder___pycache_____init__.cpython-39.pyc +0 -0
  46. TTS/encoder/__pycache__/TTS_encoder___pycache___losses.cpython-39.pyc +0 -0
  47. TTS/encoder/configs/TTS_encoder_configs_base_encoder_config.py +61 -0
  48. TTS/encoder/configs/TTS_encoder_configs_emotion_encoder_config.py +12 -0
  49. TTS/encoder/configs/TTS_encoder_configs_speaker_encoder_config.py +11 -0
  50. TTS/encoder/models/TTS_encoder_models_base_encoder.py +161 -0
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Voice Clone
3
+ emoji: 🏃
4
+ colorFrom: blue
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 4.5.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
TTS/TTS_.models.json ADDED
@@ -0,0 +1,937 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tts_models": {
3
+ "multilingual": {
4
+ "multi-dataset": {
5
+ "xtts_v2": {
6
+ "description": "XTTS-v2.0.2 by Coqui with 16 languages.",
7
+ "hf_url": [
8
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth",
9
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json",
10
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json",
11
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5"
12
+ ],
13
+ "model_hash": "5ce0502bfe3bc88dc8d9312b12a7558c",
14
+ "default_vocoder": null,
15
+ "commit": "480a6cdf7",
16
+ "license": "CPML",
17
+ "contact": "[email protected]",
18
+ "tos_required": true
19
+ },
20
+ "xtts_v1.1": {
21
+ "description": "XTTS-v1.1 by Coqui with 14 languages, cross-language voice cloning and reference leak fixed.",
22
+ "hf_url": [
23
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/model.pth",
24
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/config.json",
25
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/vocab.json",
26
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/hash.md5"
27
+ ],
28
+ "model_hash": "7c62beaf58d39b729de287330dc254e7b515677416839b649a50e7cf74c3df59",
29
+ "default_vocoder": null,
30
+ "commit": "82910a63",
31
+ "license": "CPML",
32
+ "contact": "[email protected]",
33
+ "tos_required": true
34
+ },
35
+ "your_tts": {
36
+ "description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
37
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
38
+ "default_vocoder": null,
39
+ "commit": "e9a1953e",
40
+ "license": "CC BY-NC-ND 4.0",
41
+ "contact": "[email protected]"
42
+ },
43
+ "bark": {
44
+ "description": "🐶 Bark TTS model released by suno-ai. You can find the original implementation in https://github.com/suno-ai/bark.",
45
+ "hf_url": [
46
+ "https://coqui.gateway.scarf.sh/hf/bark/coarse_2.pt",
47
+ "https://coqui.gateway.scarf.sh/hf/bark/fine_2.pt",
48
+ "https://app.coqui.ai/tts_model/text_2.pt",
49
+ "https://coqui.gateway.scarf.sh/hf/bark/config.json",
50
+ "https://coqui.gateway.scarf.sh/hf/bark/hubert.pt",
51
+ "https://coqui.gateway.scarf.sh/hf/bark/tokenizer.pth"
52
+ ],
53
+ "default_vocoder": null,
54
+ "commit": "e9a1953e",
55
+ "license": "MIT",
56
+ "contact": "https://www.suno.ai/"
57
+ }
58
+ }
59
+ },
60
+ "bg": {
61
+ "cv": {
62
+ "vits": {
63
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--bg--cv--vits.zip",
64
+ "default_vocoder": null,
65
+ "commit": null,
66
+ "author": "@NeonGeckoCom",
67
+ "license": "bsd-3-clause"
68
+ }
69
+ }
70
+ },
71
+ "cs": {
72
+ "cv": {
73
+ "vits": {
74
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--cs--cv--vits.zip",
75
+ "default_vocoder": null,
76
+ "commit": null,
77
+ "author": "@NeonGeckoCom",
78
+ "license": "bsd-3-clause"
79
+ }
80
+ }
81
+ },
82
+ "da": {
83
+ "cv": {
84
+ "vits": {
85
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--da--cv--vits.zip",
86
+ "default_vocoder": null,
87
+ "commit": null,
88
+ "author": "@NeonGeckoCom",
89
+ "license": "bsd-3-clause"
90
+ }
91
+ }
92
+ },
93
+ "et": {
94
+ "cv": {
95
+ "vits": {
96
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--et--cv--vits.zip",
97
+ "default_vocoder": null,
98
+ "commit": null,
99
+ "author": "@NeonGeckoCom",
100
+ "license": "bsd-3-clause"
101
+ }
102
+ }
103
+ },
104
+ "ga": {
105
+ "cv": {
106
+ "vits": {
107
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ga--cv--vits.zip",
108
+ "default_vocoder": null,
109
+ "commit": null,
110
+ "author": "@NeonGeckoCom",
111
+ "license": "bsd-3-clause"
112
+ }
113
+ }
114
+ },
115
+ "en": {
116
+ "ek1": {
117
+ "tacotron2": {
118
+ "description": "EK1 en-rp tacotron2 by NMStoker",
119
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ek1--tacotron2.zip",
120
+ "default_vocoder": "vocoder_models/en/ek1/wavegrad",
121
+ "commit": "c802255",
122
+ "license": "apache 2.0"
123
+ }
124
+ },
125
+ "ljspeech": {
126
+ "tacotron2-DDC": {
127
+ "description": "Tacotron2 with Double Decoder Consistency.",
128
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC.zip",
129
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
130
+ "commit": "bae2ad0f",
131
+ "author": "Eren Gölge @erogol",
132
+ "license": "apache 2.0",
133
+ "contact": "[email protected]"
134
+ },
135
+ "tacotron2-DDC_ph": {
136
+ "description": "Tacotron2 with Double Decoder Consistency with phonemes.",
137
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip",
138
+ "default_vocoder": "vocoder_models/en/ljspeech/univnet",
139
+ "commit": "3900448",
140
+ "author": "Eren Gölge @erogol",
141
+ "license": "apache 2.0",
142
+ "contact": "[email protected]"
143
+ },
144
+ "glow-tts": {
145
+ "description": "",
146
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--glow-tts.zip",
147
+ "stats_file": null,
148
+ "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
149
+ "commit": "",
150
+ "author": "Eren Gölge @erogol",
151
+ "license": "MPL",
152
+ "contact": "[email protected]"
153
+ },
154
+ "speedy-speech": {
155
+ "description": "Speedy Speech model trained on LJSpeech dataset using the Alignment Network for learning the durations.",
156
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--speedy-speech.zip",
157
+ "stats_file": null,
158
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
159
+ "commit": "4581e3d",
160
+ "author": "Eren Gölge @erogol",
161
+ "license": "apache 2.0",
162
+ "contact": "[email protected]"
163
+ },
164
+ "tacotron2-DCA": {
165
+ "description": "",
166
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DCA.zip",
167
+ "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
168
+ "commit": "",
169
+ "author": "Eren Gölge @erogol",
170
+ "license": "MPL",
171
+ "contact": "[email protected]"
172
+ },
173
+ "vits": {
174
+ "description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.",
175
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--vits.zip",
176
+ "default_vocoder": null,
177
+ "commit": "3900448",
178
+ "author": "Eren Gölge @erogol",
179
+ "license": "apache 2.0",
180
+ "contact": "[email protected]"
181
+ },
182
+ "vits--neon": {
183
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--en--ljspeech--vits.zip",
184
+ "default_vocoder": null,
185
+ "author": "@NeonGeckoCom",
186
+ "license": "bsd-3-clause",
187
+ "contact": null,
188
+ "commit": null
189
+ },
190
+ "fast_pitch": {
191
+ "description": "FastPitch model trained on LJSpeech using the Aligner Network",
192
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip",
193
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
194
+ "commit": "b27b3ba",
195
+ "author": "Eren Gölge @erogol",
196
+ "license": "apache 2.0",
197
+ "contact": "[email protected]"
198
+ },
199
+ "overflow": {
200
+ "description": "Overflow model trained on LJSpeech",
201
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.0_models/tts_models--en--ljspeech--overflow.zip",
202
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
203
+ "commit": "3b1a28f",
204
+ "author": "Eren Gölge @erogol",
205
+ "license": "apache 2.0",
206
+ "contact": "[email protected]"
207
+ },
208
+ "neural_hmm": {
209
+ "description": "Neural HMM model trained on LJSpeech",
210
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.11.0_models/tts_models--en--ljspeech--neural_hmm.zip",
211
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
212
+ "commit": "3b1a28f",
213
+ "author": "Shivam Mehta @shivammehta25",
214
+ "license": "apache 2.0",
215
+ "contact": "d83ee8fe45e3c0d776d4a865aca21d7c2ac324c4"
216
+ }
217
+ },
218
+ "vctk": {
219
+ "vits": {
220
+ "description": "VITS End2End TTS model trained on VCTK dataset with 109 different speakers with EN accent.",
221
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--vits.zip",
222
+ "default_vocoder": null,
223
+ "commit": "3900448",
224
+ "author": "Eren @erogol",
225
+ "license": "apache 2.0",
226
+ "contact": "[email protected]"
227
+ },
228
+ "fast_pitch": {
229
+ "description": "FastPitch model trained on VCTK dataset.",
230
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--fast_pitch.zip",
231
+ "default_vocoder": null,
232
+ "commit": "bdab788d",
233
+ "author": "Eren @erogol",
234
+ "license": "CC BY-NC-ND 4.0",
235
+ "contact": "[email protected]"
236
+ }
237
+ },
238
+ "sam": {
239
+ "tacotron-DDC": {
240
+ "description": "Tacotron2 with Double Decoder Consistency trained with Accenture's Sam dataset.",
241
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--sam--tacotron-DDC.zip",
242
+ "default_vocoder": "vocoder_models/en/sam/hifigan_v2",
243
+ "commit": "bae2ad0f",
244
+ "author": "Eren Gölge @erogol",
245
+ "license": "apache 2.0",
246
+ "contact": "[email protected]"
247
+ }
248
+ },
249
+ "blizzard2013": {
250
+ "capacitron-t2-c50": {
251
+ "description": "Capacitron additions to Tacotron 2 with Capacity at 50 as in https://arxiv.org/pdf/1906.03402.pdf",
252
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c50.zip",
253
+ "commit": "d6284e7",
254
+ "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
255
+ "author": "Adam Froghyar @a-froghyar",
256
+ "license": "apache 2.0",
257
+ "contact": "[email protected]"
258
+ },
259
+ "capacitron-t2-c150_v2": {
260
+ "description": "Capacitron additions to Tacotron 2 with Capacity at 150 as in https://arxiv.org/pdf/1906.03402.pdf",
261
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.1_models/tts_models--en--blizzard2013--capacitron-t2-c150_v2.zip",
262
+ "commit": "a67039d",
263
+ "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
264
+ "author": "Adam Froghyar @a-froghyar",
265
+ "license": "apache 2.0",
266
+ "contact": "[email protected]"
267
+ }
268
+ },
269
+ "multi-dataset": {
270
+ "tortoise-v2": {
271
+ "description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts",
272
+ "github_rls_url": [
273
+ "https://app.coqui.ai/tts_model/autoregressive.pth",
274
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth",
275
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth",
276
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth",
277
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_auto.pth",
278
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_diffuser.pth",
279
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/vocoder.pth",
280
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth",
281
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/config.json"
282
+ ],
283
+ "commit": "c1875f6",
284
+ "default_vocoder": null,
285
+ "author": "@neonbjb - James Betker, @manmay-nakhashi Manmay Nakhashi",
286
+ "license": "apache 2.0"
287
+ }
288
+ },
289
+ "jenny": {
290
+ "jenny": {
291
+ "description": "VITS model trained with Jenny(Dioco) dataset. Named as Jenny as demanded by the license. Original URL for the model https://www.kaggle.com/datasets/noml4u/tts-models--en--jenny-dioco--vits",
292
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.14.0_models/tts_models--en--jenny--jenny.zip",
293
+ "default_vocoder": null,
294
+ "commit": "ba40a1c",
295
+ "license": "custom - see https://github.com/dioco-group/jenny-tts-dataset#important",
296
+ "author": "@noml4u"
297
+ }
298
+ }
299
+ },
300
+ "es": {
301
+ "mai": {
302
+ "tacotron2-DDC": {
303
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--es--mai--tacotron2-DDC.zip",
304
+ "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
305
+ "commit": "",
306
+ "author": "Eren Gölge @erogol",
307
+ "license": "MPL",
308
+ "contact": "[email protected]"
309
+ }
310
+ },
311
+ "css10": {
312
+ "vits": {
313
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--es--css10--vits.zip",
314
+ "default_vocoder": null,
315
+ "commit": null,
316
+ "author": "@NeonGeckoCom",
317
+ "license": "bsd-3-clause"
318
+ }
319
+ }
320
+ },
321
+ "fr": {
322
+ "mai": {
323
+ "tacotron2-DDC": {
324
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--fr--mai--tacotron2-DDC.zip",
325
+ "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
326
+ "commit": null,
327
+ "author": "Eren Gölge @erogol",
328
+ "license": "MPL",
329
+ "contact": "[email protected]"
330
+ }
331
+ },
332
+ "css10": {
333
+ "vits": {
334
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fr--css10--vits.zip",
335
+ "default_vocoder": null,
336
+ "commit": null,
337
+ "author": "@NeonGeckoCom",
338
+ "license": "bsd-3-clause"
339
+ }
340
+ }
341
+ },
342
+ "uk": {
343
+ "mai": {
344
+ "glow-tts": {
345
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--uk--mai--glow-tts.zip",
346
+ "author": "@robinhad",
347
+ "commit": "bdab788d",
348
+ "license": "MIT",
349
+ "contact": "",
350
+ "default_vocoder": "vocoder_models/uk/mai/multiband-melgan"
351
+ },
352
+ "vits": {
353
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--uk--mai--vits.zip",
354
+ "default_vocoder": null,
355
+ "commit": null,
356
+ "author": "@NeonGeckoCom",
357
+ "license": "bsd-3-clause"
358
+ }
359
+ }
360
+ },
361
+ "zh-CN": {
362
+ "baker": {
363
+ "tacotron2-DDC-GST": {
364
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
365
+ "commit": "unknown",
366
+ "author": "@kirianguiller",
367
+ "license": "apache 2.0",
368
+ "default_vocoder": null
369
+ }
370
+ }
371
+ },
372
+ "nl": {
373
+ "mai": {
374
+ "tacotron2-DDC": {
375
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--nl--mai--tacotron2-DDC.zip",
376
+ "author": "@r-dh",
377
+ "license": "apache 2.0",
378
+ "default_vocoder": "vocoder_models/nl/mai/parallel-wavegan",
379
+ "stats_file": null,
380
+ "commit": "540d811"
381
+ }
382
+ },
383
+ "css10": {
384
+ "vits": {
385
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--nl--css10--vits.zip",
386
+ "default_vocoder": null,
387
+ "commit": null,
388
+ "author": "@NeonGeckoCom",
389
+ "license": "bsd-3-clause"
390
+ }
391
+ }
392
+ },
393
+ "de": {
394
+ "thorsten": {
395
+ "tacotron2-DCA": {
396
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--de--thorsten--tacotron2-DCA.zip",
397
+ "default_vocoder": "vocoder_models/de/thorsten/fullband-melgan",
398
+ "author": "@thorstenMueller",
399
+ "license": "apache 2.0",
400
+ "commit": "unknown"
401
+ },
402
+ "vits": {
403
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--de--thorsten--vits.zip",
404
+ "default_vocoder": null,
405
+ "author": "@thorstenMueller",
406
+ "license": "apache 2.0",
407
+ "commit": "unknown"
408
+ },
409
+ "tacotron2-DDC": {
410
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--thorsten--tacotron2-DDC.zip",
411
+ "default_vocoder": "vocoder_models/de/thorsten/hifigan_v1",
412
+ "description": "Thorsten-Dec2021-22k-DDC",
413
+ "author": "@thorstenMueller",
414
+ "license": "apache 2.0",
415
+ "commit": "unknown"
416
+ }
417
+ },
418
+ "css10": {
419
+ "vits-neon": {
420
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--css10--vits.zip",
421
+ "default_vocoder": null,
422
+ "author": "@NeonGeckoCom",
423
+ "license": "bsd-3-clause",
424
+ "commit": null
425
+ }
426
+ }
427
+ },
428
+ "ja": {
429
+ "kokoro": {
430
+ "tacotron2-DDC": {
431
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--ja--kokoro--tacotron2-DDC.zip",
432
+ "default_vocoder": "vocoder_models/ja/kokoro/hifigan_v1",
433
+ "description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.",
434
+ "author": "@kaiidams",
435
+ "license": "apache 2.0",
436
+ "commit": "401fbd89"
437
+ }
438
+ }
439
+ },
440
+ "tr": {
441
+ "common-voice": {
442
+ "glow-tts": {
443
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--tr--common-voice--glow-tts.zip",
444
+ "default_vocoder": "vocoder_models/tr/common-voice/hifigan",
445
+ "license": "MIT",
446
+ "description": "Turkish GlowTTS model using an unknown speaker from the Common-Voice dataset.",
447
+ "author": "Fatih Akademi",
448
+ "commit": null
449
+ }
450
+ }
451
+ },
452
+ "it": {
453
+ "mai_female": {
454
+ "glow-tts": {
455
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--glow-tts.zip",
456
+ "default_vocoder": null,
457
+ "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
458
+ "author": "@nicolalandro",
459
+ "license": "apache 2.0",
460
+ "commit": null
461
+ },
462
+ "vits": {
463
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--vits.zip",
464
+ "default_vocoder": null,
465
+ "description": "VITS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
466
+ "author": "@nicolalandro",
467
+ "license": "apache 2.0",
468
+ "commit": null
469
+ }
470
+ },
471
+ "mai_male": {
472
+ "glow-tts": {
473
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--glow-tts.zip",
474
+ "default_vocoder": null,
475
+ "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
476
+ "author": "@nicolalandro",
477
+ "license": "apache 2.0",
478
+ "commit": null
479
+ },
480
+ "vits": {
481
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--vits.zip",
482
+ "default_vocoder": null,
483
+ "description": "VITS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
484
+ "author": "@nicolalandro",
485
+ "license": "apache 2.0",
486
+ "commit": null
487
+ }
488
+ }
489
+ },
490
+ "ewe": {
491
+ "openbible": {
492
+ "vits": {
493
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--ewe--openbible--vits.zip",
494
+ "default_vocoder": null,
495
+ "license": "CC-BY-SA 4.0",
496
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
497
+ "author": "@coqui_ai",
498
+ "commit": "1b22f03"
499
+ }
500
+ }
501
+ },
502
+ "hau": {
503
+ "openbible": {
504
+ "vits": {
505
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--hau--openbible--vits.zip",
506
+ "default_vocoder": null,
507
+ "license": "CC-BY-SA 4.0",
508
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
509
+ "author": "@coqui_ai",
510
+ "commit": "1b22f03"
511
+ }
512
+ }
513
+ },
514
+ "lin": {
515
+ "openbible": {
516
+ "vits": {
517
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--lin--openbible--vits.zip",
518
+ "default_vocoder": null,
519
+ "license": "CC-BY-SA 4.0",
520
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
521
+ "author": "@coqui_ai",
522
+ "commit": "1b22f03"
523
+ }
524
+ }
525
+ },
526
+ "tw_akuapem": {
527
+ "openbible": {
528
+ "vits": {
529
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_akuapem--openbible--vits.zip",
530
+ "default_vocoder": null,
531
+ "license": "CC-BY-SA 4.0",
532
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
533
+ "author": "@coqui_ai",
534
+ "commit": "1b22f03"
535
+ }
536
+ }
537
+ },
538
+ "tw_asante": {
539
+ "openbible": {
540
+ "vits": {
541
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_asante--openbible--vits.zip",
542
+ "default_vocoder": null,
543
+ "license": "CC-BY-SA 4.0",
544
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
545
+ "author": "@coqui_ai",
546
+ "commit": "1b22f03"
547
+ }
548
+ }
549
+ },
550
+ "yor": {
551
+ "openbible": {
552
+ "vits": {
553
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--yor--openbible--vits.zip",
554
+ "default_vocoder": null,
555
+ "license": "CC-BY-SA 4.0",
556
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
557
+ "author": "@coqui_ai",
558
+ "commit": "1b22f03"
559
+ }
560
+ }
561
+ },
562
+ "hu": {
563
+ "css10": {
564
+ "vits": {
565
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hu--css10--vits.zip",
566
+ "default_vocoder": null,
567
+ "commit": null,
568
+ "author": "@NeonGeckoCom",
569
+ "license": "bsd-3-clause"
570
+ }
571
+ }
572
+ },
573
+ "el": {
574
+ "cv": {
575
+ "vits": {
576
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--el--cv--vits.zip",
577
+ "default_vocoder": null,
578
+ "commit": null,
579
+ "author": "@NeonGeckoCom",
580
+ "license": "bsd-3-clause"
581
+ }
582
+ }
583
+ },
584
+ "fi": {
585
+ "css10": {
586
+ "vits": {
587
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fi--css10--vits.zip",
588
+ "default_vocoder": null,
589
+ "commit": null,
590
+ "author": "@NeonGeckoCom",
591
+ "license": "bsd-3-clause"
592
+ }
593
+ }
594
+ },
595
+ "hr": {
596
+ "cv": {
597
+ "vits": {
598
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hr--cv--vits.zip",
599
+ "default_vocoder": null,
600
+ "commit": null,
601
+ "author": "@NeonGeckoCom",
602
+ "license": "bsd-3-clause"
603
+ }
604
+ }
605
+ },
606
+ "lt": {
607
+ "cv": {
608
+ "vits": {
609
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lt--cv--vits.zip",
610
+ "default_vocoder": null,
611
+ "commit": null,
612
+ "author": "@NeonGeckoCom",
613
+ "license": "bsd-3-clause"
614
+ }
615
+ }
616
+ },
617
+ "lv": {
618
+ "cv": {
619
+ "vits": {
620
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lv--cv--vits.zip",
621
+ "default_vocoder": null,
622
+ "commit": null,
623
+ "author": "@NeonGeckoCom",
624
+ "license": "bsd-3-clause"
625
+ }
626
+ }
627
+ },
628
+ "mt": {
629
+ "cv": {
630
+ "vits": {
631
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--mt--cv--vits.zip",
632
+ "default_vocoder": null,
633
+ "commit": null,
634
+ "author": "@NeonGeckoCom",
635
+ "license": "bsd-3-clause"
636
+ }
637
+ }
638
+ },
639
+ "pl": {
640
+ "mai_female": {
641
+ "vits": {
642
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pl--mai_female--vits.zip",
643
+ "default_vocoder": null,
644
+ "commit": null,
645
+ "author": "@NeonGeckoCom",
646
+ "license": "bsd-3-clause"
647
+ }
648
+ }
649
+ },
650
+ "pt": {
651
+ "cv": {
652
+ "vits": {
653
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pt--cv--vits.zip",
654
+ "default_vocoder": null,
655
+ "commit": null,
656
+ "author": "@NeonGeckoCom",
657
+ "license": "bsd-3-clause"
658
+ }
659
+ }
660
+ },
661
+ "ro": {
662
+ "cv": {
663
+ "vits": {
664
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ro--cv--vits.zip",
665
+ "default_vocoder": null,
666
+ "commit": null,
667
+ "author": "@NeonGeckoCom",
668
+ "license": "bsd-3-clause"
669
+ }
670
+ }
671
+ },
672
+ "sk": {
673
+ "cv": {
674
+ "vits": {
675
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sk--cv--vits.zip",
676
+ "default_vocoder": null,
677
+ "commit": null,
678
+ "author": "@NeonGeckoCom",
679
+ "license": "bsd-3-clause"
680
+ }
681
+ }
682
+ },
683
+ "sl": {
684
+ "cv": {
685
+ "vits": {
686
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sl--cv--vits.zip",
687
+ "default_vocoder": null,
688
+ "commit": null,
689
+ "author": "@NeonGeckoCom",
690
+ "license": "bsd-3-clause"
691
+ }
692
+ }
693
+ },
694
+ "sv": {
695
+ "cv": {
696
+ "vits": {
697
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sv--cv--vits.zip",
698
+ "default_vocoder": null,
699
+ "commit": null,
700
+ "author": "@NeonGeckoCom",
701
+ "license": "bsd-3-clause"
702
+ }
703
+ }
704
+ },
705
+ "ca": {
706
+ "custom": {
707
+ "vits": {
708
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--ca--custom--vits.zip",
709
+ "default_vocoder": null,
710
+ "commit": null,
711
+ "description": " It is trained from zero with 101460 utterances consisting of 257 speakers, approx 138 hours of speech. We used three datasets;\nFestcat and Google Catalan TTS (both TTS datasets) and also a part of Common Voice 8. It is trained with TTS v0.8.0.\nhttps://github.com/coqui-ai/TTS/discussions/930#discussioncomment-4466345",
712
+ "author": "@gullabi",
713
+ "license": "CC-BY-4.0"
714
+ }
715
+ }
716
+ },
717
+ "fa": {
718
+ "custom": {
719
+ "glow-tts": {
720
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--fa--custom--glow-tts.zip",
721
+ "default_vocoder": null,
722
+ "commit": null,
723
+ "description": "persian-tts-female-glow_tts model for text to speech purposes. Single-speaker female voice Trained on persian-tts-dataset-famale. \nThis model has no compatible vocoder thus the output quality is not very good. \nDataset: https://www.kaggle.com/datasets/magnoliasis/persian-tts-dataset-famale.",
724
+ "author": "@karim23657",
725
+ "license": "CC-BY-4.0"
726
+ }
727
+ }
728
+ },
729
+ "bn": {
730
+ "custom": {
731
+ "vits-male": {
732
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_male.zip",
733
+ "default_vocoder": null,
734
+ "commit": null,
735
+ "description": "Single speaker Bangla male model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
736
+ "author": "@mobassir94",
737
+ "license": "Apache 2.0"
738
+ },
739
+ "vits-female": {
740
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_female.zip",
741
+ "default_vocoder": null,
742
+ "commit": null,
743
+ "description": "Single speaker Bangla female model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
744
+ "author": "@mobassir94",
745
+ "license": "Apache 2.0"
746
+ }
747
+ }
748
+ },
749
+ "be": {
750
+ "common-voice": {
751
+ "glow-tts":{
752
+ "description": "Belarusian GlowTTS model created by @alex73 (Github).",
753
+ "github_rls_url":"https://coqui.gateway.scarf.sh/v0.16.6/tts_models--be--common-voice--glow-tts.zip",
754
+ "default_vocoder": "vocoder_models/be/common-voice/hifigan",
755
+ "commit": "c0aabb85",
756
+ "license": "CC-BY-SA 4.0",
757
+ "contact": "[email protected]"
758
+ }
759
+ }
760
+ }
761
+ },
762
+ "vocoder_models": {
763
+ "universal": {
764
+ "libri-tts": {
765
+ "wavegrad": {
766
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--wavegrad.zip",
767
+ "commit": "ea976b0",
768
+ "author": "Eren Gölge @erogol",
769
+ "license": "MPL",
770
+ "contact": "[email protected]"
771
+ },
772
+ "fullband-melgan": {
773
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--fullband-melgan.zip",
774
+ "commit": "4132240",
775
+ "author": "Eren Gölge @erogol",
776
+ "license": "MPL",
777
+ "contact": "[email protected]"
778
+ }
779
+ }
780
+ },
781
+ "en": {
782
+ "ek1": {
783
+ "wavegrad": {
784
+ "description": "EK1 en-rp wavegrad by NMStoker",
785
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ek1--wavegrad.zip",
786
+ "commit": "c802255",
787
+ "license": "apache 2.0"
788
+ }
789
+ },
790
+ "ljspeech": {
791
+ "multiband-melgan": {
792
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip",
793
+ "commit": "ea976b0",
794
+ "author": "Eren Gölge @erogol",
795
+ "license": "MPL",
796
+ "contact": "[email protected]"
797
+ },
798
+ "hifigan_v2": {
799
+ "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
800
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--hifigan_v2.zip",
801
+ "commit": "bae2ad0f",
802
+ "author": "@erogol",
803
+ "license": "apache 2.0",
804
+ "contact": "[email protected]"
805
+ },
806
+ "univnet": {
807
+ "description": "UnivNet model finetuned on TacotronDDC_ph spectrograms for better compatibility.",
808
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--univnet_v2.zip",
809
+ "commit": "4581e3d",
810
+ "author": "Eren @erogol",
811
+ "license": "apache 2.0",
812
+ "contact": "[email protected]"
813
+ }
814
+ },
815
+ "blizzard2013": {
816
+ "hifigan_v2": {
817
+ "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
818
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/vocoder_models--en--blizzard2013--hifigan_v2.zip",
819
+ "commit": "d6284e7",
820
+ "author": "Adam Froghyar @a-froghyar",
821
+ "license": "apache 2.0",
822
+ "contact": "[email protected]"
823
+ }
824
+ },
825
+ "vctk": {
826
+ "hifigan_v2": {
827
+ "description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts",
828
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--vctk--hifigan_v2.zip",
829
+ "commit": "2f07160",
830
+ "author": "Edresson Casanova",
831
+ "license": "apache 2.0",
832
+ "contact": ""
833
+ }
834
+ },
835
+ "sam": {
836
+ "hifigan_v2": {
837
+ "description": "Finetuned and intended to be used with tts_models/en/sam/tacotron_DDC",
838
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--sam--hifigan_v2.zip",
839
+ "commit": "2f07160",
840
+ "author": "Eren Gölge @erogol",
841
+ "license": "apache 2.0",
842
+ "contact": "[email protected]"
843
+ }
844
+ }
845
+ },
846
+ "nl": {
847
+ "mai": {
848
+ "parallel-wavegan": {
849
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--nl--mai--parallel-wavegan.zip",
850
+ "author": "@r-dh",
851
+ "license": "apache 2.0",
852
+ "commit": "unknown"
853
+ }
854
+ }
855
+ },
856
+ "de": {
857
+ "thorsten": {
858
+ "wavegrad": {
859
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--wavegrad.zip",
860
+ "author": "@thorstenMueller",
861
+ "license": "apache 2.0",
862
+ "commit": "unknown"
863
+ },
864
+ "fullband-melgan": {
865
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--fullband-melgan.zip",
866
+ "author": "@thorstenMueller",
867
+ "license": "apache 2.0",
868
+ "commit": "unknown"
869
+ },
870
+ "hifigan_v1": {
871
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/vocoder_models--de--thorsten--hifigan_v1.zip",
872
+ "description": "HifiGAN vocoder model for Thorsten Neutral Dec2021 22k Samplerate Tacotron2 DDC model",
873
+ "author": "@thorstenMueller",
874
+ "license": "apache 2.0",
875
+ "commit": "unknown"
876
+ }
877
+ }
878
+ },
879
+ "ja": {
880
+ "kokoro": {
881
+ "hifigan_v1": {
882
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--ja--kokoro--hifigan_v1.zip",
883
+ "description": "HifiGAN model trained for kokoro dataset by @kaiidams",
884
+ "author": "@kaiidams",
885
+ "license": "apache 2.0",
886
+ "commit": "3900448"
887
+ }
888
+ }
889
+ },
890
+ "uk": {
891
+ "mai": {
892
+ "multiband-melgan": {
893
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip",
894
+ "author": "@robinhad",
895
+ "commit": "bdab788d",
896
+ "license": "MIT",
897
+ "contact": ""
898
+ }
899
+ }
900
+ },
901
+ "tr": {
902
+ "common-voice": {
903
+ "hifigan": {
904
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip",
905
+ "description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.",
906
+ "author": "Fatih Akademi",
907
+ "license": "MIT",
908
+ "commit": null
909
+ }
910
+ }
911
+ },
912
+ "be": {
913
+ "common-voice": {
914
+ "hifigan": {
915
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.16.6/vocoder_models--be--common-voice--hifigan.zip",
916
+ "description": "Belarusian HiFiGAN model created by @alex73 (Github).",
917
+ "author": "@alex73",
918
+ "license": "CC-BY-SA 4.0",
919
+ "commit": "c0aabb85"
920
+ }
921
+ }
922
+ }
923
+ },
924
+ "voice_conversion_models": {
925
+ "multilingual": {
926
+ "vctk": {
927
+ "freevc24": {
928
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip",
929
+ "description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC",
930
+ "author": "Jing-Yi Li @OlaWod",
931
+ "license": "MIT",
932
+ "commit": null
933
+ }
934
+ }
935
+ }
936
+ }
937
+ }
TTS/TTS_VERSION ADDED
@@ -0,0 +1 @@
 
 
1
+ 0.20.6
TTS/TTS___init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ import os
2
+
3
# Read the package version from the VERSION file that ships next to this module.
with open(os.path.join(os.path.dirname(__file__), "VERSION"), encoding="utf-8") as f:
    version = f.read().strip()

# Conventional dunder alias for the same version string.
__version__ = version
TTS/TTS_api.py ADDED
@@ -0,0 +1,489 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tempfile
2
+ import warnings
3
+ from pathlib import Path
4
+ from typing import Union
5
+
6
+ import numpy as np
7
+ from torch import nn
8
+
9
+ from TTS.cs_api import CS_API
10
+ from TTS.utils.audio.numpy_transforms import save_wav
11
+ from TTS.utils.manage import ModelManager
12
+ from TTS.utils.synthesizer import Synthesizer
13
+
14
+
15
class TTS(nn.Module):
    """High-level Python interface for loading and running 🐸TTS models.

    TODO: Add voice conversion and Capacitron support.
    """

    def __init__(
        self,
        model_name: str = "",
        model_path: str = None,
        config_path: str = None,
        vocoder_path: str = None,
        vocoder_config_path: str = None,
        progress_bar: bool = True,
        cs_api_model: str = "XTTS",
        gpu=False,
    ):
        """🐸TTS python interface that allows to load and use the released models.

        Example with a multi-speaker model:
            >>> from TTS.api import TTS
            >>> tts = TTS(TTS().list_models()[0])
            >>> wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
            >>> tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")

        Example with a single-speaker model:
            >>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
            >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")

        Example loading a model from a path:
            >>> tts = TTS(model_path="/path/to/checkpoint_100000.pth", config_path="/path/to/config.json", progress_bar=False, gpu=False)
            >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")

        Example voice cloning with YourTTS in English, French and Portuguese:
            >>> tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)
            >>> tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="thisisit.wav")
            >>> tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr", file_path="thisisit.wav")
            >>> tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="thisisit.wav")

        Example Fairseq TTS models (uses ISO language codes in https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html):
            >>> tts = TTS(model_name="tts_models/eng/fairseq/vits", progress_bar=False, gpu=True)
            >>> tts.tts_to_file("This is a test.", file_path="output.wav")

        Args:
            model_name (str, optional): Model name to load. You can list models by ```tts.models```. Defaults to "".
            model_path (str, optional): Path to the model checkpoint. Defaults to None.
            config_path (str, optional): Path to the model config. Defaults to None.
            vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
            vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
            progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True.
            cs_api_model (str, optional): Name of the model to use for the Coqui Studio API. Available models are
                "XTTS", "V1". You can also use `TTS.cs_api.CS_API` for more control.
                Defaults to "XTTS".
            gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
        """
        super().__init__()
        self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False)

        self.synthesizer = None
        self.voice_converter = None
        self.csapi = None
        self.cs_api_model = cs_api_model
        self.model_name = ""

        if gpu:
            warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.")

        # The model family is inferred from a substring of the model name.
        if model_name is not None:
            if "tts_models" in model_name or "coqui_studio" in model_name:
                self.load_tts_model_by_name(model_name, gpu)
            elif "voice_conversion_models" in model_name:
                self.load_vc_model_by_name(model_name, gpu)

        # An explicit checkpoint path is loaded after (and replaces) any model loaded by name.
        if model_path:
            self.load_tts_model_by_path(
                model_path, config_path, vocoder_path=vocoder_path, vocoder_config=vocoder_config_path, gpu=gpu
            )

    @property
    def models(self):
        """TTS models known to the model manager."""
        return self.manager.list_tts_models()

    @property
    def is_multi_speaker(self):
        """Whether the loaded TTS model has a speaker manager with more than one speaker."""
        # No local synthesizer is loaded for Coqui Studio or voice-conversion-only setups.
        if self.synthesizer is None:
            return False
        speaker_manager = getattr(self.synthesizer.tts_model, "speaker_manager", None)
        if speaker_manager:
            return speaker_manager.num_speakers > 1
        return False

    @property
    def is_coqui_studio(self):
        """Whether the current model name refers to a Coqui Studio speaker."""
        if self.model_name is None:
            return False
        return "coqui_studio" in self.model_name

    @property
    def is_multi_lingual(self):
        """Whether the loaded TTS model supports more than one language."""
        # Not sure what sets this to None, but applied a fix to prevent crashing.
        if isinstance(self.model_name, str) and "xtts" in self.model_name:
            return True
        # No local synthesizer is loaded for Coqui Studio or voice-conversion-only setups.
        if self.synthesizer is None:
            return False
        language_manager = getattr(self.synthesizer.tts_model, "language_manager", None)
        if language_manager:
            return language_manager.num_languages > 1
        return False

    @property
    def speakers(self):
        """Speaker names of the loaded model, or None when it is not multi-speaker."""
        if not self.is_multi_speaker:
            return None
        return self.synthesizer.tts_model.speaker_manager.speaker_names

    @property
    def languages(self):
        """Language names of the loaded model, or None when it is not multi-lingual."""
        if not self.is_multi_lingual:
            return None
        return self.synthesizer.tts_model.language_manager.language_names

    @staticmethod
    def get_models_file_path():
        """Return the path of the `.models.json` catalogue located next to this module."""
        return Path(__file__).parent / ".models.json"

    def list_models(self):
        """Return the released TTS models followed by the Coqui Studio speakers.

        If the Coqui Studio client raises `ValueError`, the error is printed and only the
        released models are returned.
        """
        try:
            csapi = CS_API(model=self.cs_api_model)
            models = csapi.list_speakers_as_tts_models()
        except ValueError as e:
            print(e)
            models = []
        manager = ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False)
        return manager.list_tts_models() + models

    def download_model_by_name(self, model_name: str):
        """Download a model (and its default vocoder, if any) through the model manager.

        Args:
            model_name (str): Model name to download.

        Returns:
            tuple: `(model_path, config_path, vocoder_path, vocoder_config_path, model_dir)`. Only `model_dir`
                is set for fairseq models and models made of multiple files; otherwise `model_dir` is None and
                the vocoder entries are None when the model declares no default vocoder.
        """
        model_path, config_path, model_item = self.manager.download_model(model_name)
        if "fairseq" in model_name or (model_item is not None and isinstance(model_item["model_url"], list)):
            # return model directory if there are multiple files
            # we assume that the model knows how to load itself
            return None, None, None, None, model_path
        if model_item.get("default_vocoder") is None:
            return model_path, config_path, None, None, None
        vocoder_path, vocoder_config_path, _ = self.manager.download_model(model_item["default_vocoder"])
        return model_path, config_path, vocoder_path, vocoder_config_path, None

    def load_vc_model_by_name(self, model_name: str, gpu: bool = False):
        """Load one of the voice conversion models by name.

        Args:
            model_name (str): Model name to load. You can list models by ```tts.models```.
            gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
        """
        self.model_name = model_name
        model_path, config_path, _, _, _ = self.download_model_by_name(model_name)
        self.voice_converter = Synthesizer(vc_checkpoint=model_path, vc_config=config_path, use_cuda=gpu)

    def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
        """Load one of 🐸TTS models by name.

        Args:
            model_name (str): Model name to load. You can list models by ```tts.models```.
            gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.

        TODO: Add tests
        """
        self.synthesizer = None
        self.csapi = None
        self.model_name = model_name

        if "coqui_studio" in model_name:
            self.csapi = CS_API()
        else:
            model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
                model_name
            )

            # init synthesizer
            # None values are fetched from the model
            self.synthesizer = Synthesizer(
                tts_checkpoint=model_path,
                tts_config_path=config_path,
                tts_speakers_file=None,
                tts_languages_file=None,
                vocoder_checkpoint=vocoder_path,
                vocoder_config=vocoder_config_path,
                encoder_checkpoint=None,
                encoder_config=None,
                model_dir=model_dir,
                use_cuda=gpu,
            )

    def load_tts_model_by_path(
        self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False
    ):
        """Load a model from a path.

        Args:
            model_path (str): Path to the model checkpoint.
            config_path (str): Path to the model config.
            vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
            vocoder_config (str, optional): Path to the vocoder config. Defaults to None.
            gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
        """

        self.synthesizer = Synthesizer(
            tts_checkpoint=model_path,
            tts_config_path=config_path,
            tts_speakers_file=None,
            tts_languages_file=None,
            vocoder_checkpoint=vocoder_path,
            vocoder_config=vocoder_config,
            encoder_checkpoint=None,
            encoder_config=None,
            use_cuda=gpu,
        )

    def _check_arguments(
        self,
        speaker: str = None,
        language: str = None,
        speaker_wav: str = None,
        emotion: str = None,
        speed: float = None,
        **kwargs,
    ) -> None:
        """Check if the arguments are valid for the model.

        Raises:
            ValueError: If the combination of arguments does not match the capabilities of the loaded model.
        """
        if not self.is_coqui_studio:
            # check for the coqui tts models
            if self.is_multi_speaker and (speaker is None and speaker_wav is None):
                raise ValueError("Model is multi-speaker but no `speaker` is provided.")
            if self.is_multi_lingual and language is None:
                raise ValueError("Model is multi-lingual but no `language` is provided.")
            if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs:
                raise ValueError("Model is not multi-speaker but `speaker` is provided.")
            if not self.is_multi_lingual and language is not None:
                raise ValueError("Model is not multi-lingual but `language` is provided.")
            # NOTE(review): this only rejects the call when BOTH values are supplied; a lone `emotion` or
            # `speed` is silently ignored. Kept as-is because tightening it could break existing callers.
            if emotion is not None and speed is not None:
                raise ValueError("Emotion and speed can only be used with Coqui Studio models.")
        else:
            if emotion is None:
                emotion = "Neutral"
            # check for the studio models
            if speaker_wav is not None:
                raise ValueError("Coqui Studio models do not support `speaker_wav` argument.")
            if speaker is not None:
                raise ValueError("Coqui Studio models do not support `speaker` argument.")
            if language is not None and language != "en":
                raise ValueError("Coqui Studio models currently support only `language=en` argument.")
            if emotion not in ["Neutral", "Happy", "Sad", "Angry", "Dull"]:
                raise ValueError(f"Emotion - `{emotion}` - must be one of `Neutral`, `Happy`, `Sad`, `Angry`, `Dull`.")

    def tts_coqui_studio(
        self,
        text: str,
        speaker_name: str = None,
        language: str = None,
        emotion: str = None,
        speed: float = 1.0,
        pipe_out=None,
        file_path: str = None,
    ) -> Union[np.ndarray, str]:
        """Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API.

        Args:
            text (str):
                Input text to synthesize.
            speaker_name (str, optional):
                Speaker name from Coqui Studio. Currently ignored: the speaker is always taken from the
                loaded model name. Defaults to None.
            language (str): Language of the text. If None, the default language of the speaker is used. Language is only
                supported by `XTTS` model.
            emotion (str, optional):
                Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only available
                with "V1" model. Defaults to None.
            speed (float, optional):
                Speed of the speech. Defaults to 1.0.
            pipe_out (BytesIO, optional):
                Flag to stdout the generated TTS wav file for shell pipe.
            file_path (str, optional):
                Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None.

        Returns:
            Union[np.ndarray, str]: Waveform of the synthesized speech or path to the output file.
        """
        # The speaker is the third path component of the model name and overrides the argument.
        speaker_name = self.model_name.split("/")[2]
        if file_path is not None:
            return self.csapi.tts_to_file(
                text=text,
                speaker_name=speaker_name,
                language=language,
                speed=speed,
                pipe_out=pipe_out,
                emotion=emotion,
                file_path=file_path,
            )[0]
        return self.csapi.tts(text=text, speaker_name=speaker_name, language=language, speed=speed, emotion=emotion)[0]

    def tts(
        self,
        text: str,
        speaker: str = None,
        language: str = None,
        speaker_wav: str = None,
        emotion: str = None,
        speed: float = None,
        **kwargs,
    ):
        """Convert text to speech.

        Args:
            text (str):
                Input text to synthesize.
            speaker (str, optional):
                Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
                `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
            language (str): Language of the text. If None, the default language of the speaker is used. Language is only
                supported by `XTTS` model.
            speaker_wav (str, optional):
                Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
                Defaults to None.
            emotion (str, optional):
                Emotion to use for 🐸Coqui Studio models. If None, Studio models use "Neutral". Defaults to None.
            speed (float, optional):
                Speed factor to use for 🐸Coqui Studio models, between 0 and 2.0. If None, Studio models use 1.0.
                Defaults to None.

        Returns:
            The synthesized waveform as returned by the underlying synthesizer or Coqui Studio client.
        """
        self._check_arguments(
            speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs
        )
        if self.csapi is not None:
            # Apply the documented Studio default instead of forwarding `None` to the API.
            return self.tts_coqui_studio(
                text=text,
                speaker_name=speaker,
                language=language,
                emotion=emotion,
                speed=1.0 if speed is None else speed,
            )
        wav = self.synthesizer.tts(
            text=text,
            speaker_name=speaker,
            language_name=language,
            speaker_wav=speaker_wav,
            reference_wav=None,
            style_wav=None,
            style_text=None,
            reference_speaker_name=None,
            **kwargs,
        )
        return wav

    def tts_to_file(
        self,
        text: str,
        speaker: str = None,
        language: str = None,
        speaker_wav: str = None,
        emotion: str = None,
        speed: float = 1.0,
        pipe_out=None,
        file_path: str = "output.wav",
        **kwargs,
    ):
        """Convert text to speech and save it to a file.

        Args:
            text (str):
                Input text to synthesize.
            speaker (str, optional):
                Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
                `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
            language (str, optional):
                Language code for multi-lingual models. You can check whether loaded model is multi-lingual
                `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
            speaker_wav (str, optional):
                Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
                Defaults to None.
            emotion (str, optional):
                Emotion to use for 🐸Coqui Studio models. Defaults to None.
            speed (float, optional):
                Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to 1.0.
            pipe_out (BytesIO, optional):
                Flag to stdout the generated TTS wav file for shell pipe.
            file_path (str, optional):
                Output file path. Defaults to "output.wav".
            kwargs (dict, optional):
                Additional arguments for the model.

        Returns:
            For local models, `file_path`; for Coqui Studio models, the first element of the result
            returned by the Studio client's `tts_to_file`.
        """
        self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)

        if self.csapi is not None:
            return self.tts_coqui_studio(
                text=text,
                speaker_name=speaker,
                language=language,
                emotion=emotion,
                speed=speed,
                file_path=file_path,
                pipe_out=pipe_out,
            )
        # `emotion` and `speed` are Studio-only and therefore not forwarded to local models.
        wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
        self.synthesizer.save_wav(wav=wav, path=file_path, pipe_out=pipe_out)
        return file_path

    def voice_conversion(
        self,
        source_wav: str,
        target_wav: str,
    ):
        """Voice conversion with FreeVC. Convert source wav to target speaker.

        Args:
            source_wav (str):
                Path to the source wav file.
            target_wav (str):
                Path to the target wav file.

        Returns:
            The converted waveform produced by the loaded voice converter.
        """
        wav = self.voice_converter.voice_conversion(source_wav=source_wav, target_wav=target_wav)
        return wav

    def voice_conversion_to_file(
        self,
        source_wav: str,
        target_wav: str,
        file_path: str = "output.wav",
    ):
        """Voice conversion with FreeVC. Convert source wav to target speaker.

        Args:
            source_wav (str):
                Path to the source wav file.
            target_wav (str):
                Path to the target wav file.
            file_path (str, optional):
                Output file path. Defaults to "output.wav".

        Returns:
            str: `file_path`.
        """
        wav = self.voice_conversion(source_wav=source_wav, target_wav=target_wav)
        save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
        return file_path

    def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None):
        """Convert text to speech with voice conversion.

        It combines tts with voice conversion to fake voice cloning.

        - Convert text to speech with tts.
        - Convert the output wav to target speaker with voice conversion.

        Args:
            text (str):
                Input text to synthesize.
            language (str, optional):
                Language code for multi-lingual models. You can check whether loaded model is multi-lingual
                `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
            speaker_wav (str, optional):
                Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
                Defaults to None.

        Returns:
            The converted waveform produced by the voice converter.
        """
        # Reserve a temp file name, then close the handle so the file can be reopened by path for writing.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
            tmp_wav = fp.name
        try:
            # Lazy code... save it to a temp file to resample it while reading it for VC
            self.tts_to_file(text=text, speaker=None, language=language, file_path=tmp_wav, speaker_wav=speaker_wav)
            if self.voice_converter is None:
                self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24")
            wav = self.voice_converter.voice_conversion(source_wav=tmp_wav, target_wav=speaker_wav)
        finally:
            # `delete=False` means nothing else cleans this file up.
            Path(tmp_wav).unlink(missing_ok=True)
        return wav

    def tts_with_vc_to_file(
        self, text: str, language: str = None, speaker_wav: str = None, file_path: str = "output.wav"
    ):
        """Convert text to speech with voice conversion and save to file.

        Check `tts_with_vc` for more details.

        Args:
            text (str):
                Input text to synthesize.
            language (str, optional):
                Language code for multi-lingual models. You can check whether loaded model is multi-lingual
                `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
            speaker_wav (str, optional):
                Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
                Defaults to None.
            file_path (str, optional):
                Output file path. Defaults to "output.wav".

        Returns:
            str: `file_path`, matching `tts_to_file` and `voice_conversion_to_file`.
        """
        wav = self.tts_with_vc(text=text, language=language, speaker_wav=speaker_wav)
        save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
        return file_path
TTS/TTS_cs_api.py ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import http.client
2
+ import json
3
+ import os
4
+ import tempfile
5
+ import urllib.request
6
+ from typing import Tuple
7
+
8
+ import numpy as np
9
+ import requests
10
+ from scipy.io import wavfile
11
+
12
+ from TTS.utils.audio.numpy_transforms import save_wav
13
+
14
+
15
class Speaker(object):
    """Convert dict to object.

    Every key of the input dict becomes an attribute. Nested dicts, including dicts
    found inside list/tuple values, are converted to `Speaker` objects as well.

    Args:
        d (dict): Mapping of attribute names to values.
        is_voice (bool): Stored on the instance as `is_voice`. Defaults to False.
    """

    def __init__(self, d, is_voice=False):
        self.is_voice = is_voice
        for k, v in d.items():
            # Inspect the value, not the key: keys are attribute names and are never sequences.
            if isinstance(v, (list, tuple)):
                setattr(self, k, [Speaker(x) if isinstance(x, dict) else x for x in v])
            else:
                setattr(self, k, Speaker(v) if isinstance(v, dict) else v)

    def __repr__(self):
        return str(self.__dict__)
28
+
29
+
30
+ class CS_API:
31
+ """🐸Coqui Studio API Wrapper.
32
+
33
+ 🐸Coqui Studio is the most advanced voice generation platform. You can generate new voices by voice cloning, voice
34
+ interpolation, or our unique prompt to voice technology. It also provides a set of built-in voices with different
35
+ characteristics. You can use these voices to generate new audio files or use them in your applications.
36
+ You can use all the built-in and your own 🐸Coqui Studio speakers with this API with an API token.
37
+ You can signup to 🐸Coqui Studio from https://app.coqui.ai/auth/signup and get an API token from
38
+ https://app.coqui.ai/account. We can either enter the token as an environment variable as
39
+ `export COQUI_STUDIO_TOKEN=<token>` or pass it as `CS_API(api_token=<token>)`.
40
+ Visit https://app.coqui.ai/api for more information.
41
+
42
+
43
+ Args:
44
+ api_token (str): 🐸Coqui Studio API token. If not provided, it will be read from the environment variable
45
+ `COQUI_STUDIO_TOKEN`.
46
+ model (str): 🐸Coqui Studio model. It can be either `V1`, `XTTS`. Default is `XTTS`.
47
+
48
+
49
+ Example listing all available speakers:
50
+ >>> from TTS.api import CS_API
51
+ >>> tts = CS_API()
52
+ >>> tts.speakers
53
+
54
+ Example listing all emotions:
55
+ >>> # emotions are only available for `V1` model
56
+ >>> from TTS.api import CS_API
57
+ >>> tts = CS_API(model="V1")
58
+ >>> tts.emotions
59
+
60
+ Example with a built-in 🐸 speaker:
61
+ >>> from TTS.api import CS_API
62
+ >>> tts = CS_API()
63
+ >>> wav, sr = tts.tts("Hello world", speaker_name=tts.speakers[0].name)
64
+ >>> filepath = tts.tts_to_file(text="Hello world!", speaker_name=tts.speakers[0].name, file_path="output.wav")
65
+
66
+ Example with multi-language model:
67
+ >>> from TTS.api import CS_API
68
+ >>> tts = CS_API(model="XTTS")
69
+ >>> wav, sr = tts.tts("Hello world", speaker_name=tts.speakers[0].name, language="en")
70
+ """
71
+
72
+ MODEL_ENDPOINTS = {
73
+ "V1": {
74
+ "list_speakers": "https://app.coqui.ai/api/v2/speakers",
75
+ "synthesize": "https://app.coqui.ai/api/v2/samples",
76
+ "list_voices": "https://app.coqui.ai/api/v2/voices",
77
+ },
78
+ "XTTS": {
79
+ "list_speakers": "https://app.coqui.ai/api/v2/speakers",
80
+ "synthesize": "https://app.coqui.ai/api/v2/samples/xtts/render/",
81
+ "list_voices": "https://app.coqui.ai/api/v2/voices/xtts",
82
+ },
83
+ }
84
+
85
+ SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja"]
86
+
87
def __init__(self, api_token=None, model="XTTS"):
    """Store the client settings and validate the API token.

    Args:
        api_token (str, optional): 🐸Coqui Studio API token. Defaults to None.
        model (str, optional): Key into `MODEL_ENDPOINTS`. Defaults to "XTTS".
    """
    # Plain state first; the token check below fills in `headers`.
    self._speakers = None
    self.headers = None
    self.model = model
    self.api_token = api_token
    self._check_token()
93
+
94
@staticmethod
def ping_api():
    """Send a GET request to the Coqui gateway URL and discard the response."""
    URL = "https://coqui.gateway.scarf.sh/tts/api"
    # `requests` never times out by default; bound the wait so an unreachable
    # gateway cannot block synthesis indefinitely.
    _ = requests.get(URL, timeout=10)
98
+
99
@property
def speakers(self):
    """Return all speakers, fetching them once and caching the result in `_speakers`."""
    if self._speakers is None:
        self._speakers = self.list_all_speakers()
    return self._speakers
104
+
105
@property
def emotions(self):
    """Return a list of available emotions.

    TODO: Get this from the API endpoint.

    Raises:
        ValueError: If the current model is not "V1".
    """
    if self.model != "V1":
        raise ValueError(f"❗ Emotions are not available for {self.model}.")
    return ["Neutral", "Happy", "Sad", "Angry", "Dull"]
115
+
116
def _check_token(self):
    """Resolve the API token and build the request headers.

    Falls back to the `COQUI_STUDIO_TOKEN` environment variable when no token was
    given. Headers are only built once a token is known, so a missing token never
    leaves an `Authorization: Bearer None` header behind.

    Raises:
        ValueError: If no token is available.
    """
    if self.api_token is None:
        self.api_token = os.environ.get("COQUI_STUDIO_TOKEN")
    if not self.api_token:
        raise ValueError(
            "No API token found for 🐸Coqui Studio voices - https://coqui.ai \n"
            "Visit 🔗https://app.coqui.ai/account to get one.\n"
            "Set it as an environment variable `export COQUI_STUDIO_TOKEN=<token>`\n"
            ""
        )
    self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"}
127
+
128
def list_all_speakers(self):
    """Return both built-in Coqui Studio speakers and custom voices created by the user."""
    builtin = self.list_speakers()
    custom = self.list_voices()
    return builtin + custom
131
+
132
def list_speakers(self):
    """List built-in Coqui Studio speakers.

    Requests the first page (up to 100 entries) of the model's `list_speakers`
    endpoint and wraps every item of the response's "result" list in `Speaker`.
    """
    self._check_token()
    conn = http.client.HTTPSConnection("app.coqui.ai")
    try:
        url = self.MODEL_ENDPOINTS[self.model]["list_speakers"]
        conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers)
        data = conn.getresponse().read()
    finally:
        # Release the socket even when the request or the read fails.
        conn.close()
    return [Speaker(s) for s in json.loads(data)["result"]]
141
+
142
def list_voices(self):
    """List custom voices created by the user.

    Requests the first page (up to 100 entries) of the model's `list_voices`
    endpoint and wraps every item of the response's "result" list in
    `Speaker(item, True)`.
    """
    conn = http.client.HTTPSConnection("app.coqui.ai")
    try:
        url = self.MODEL_ENDPOINTS[self.model]["list_voices"]
        conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers)
        data = conn.getresponse().read()
    finally:
        # Release the socket even when the request or the read fails.
        conn.close()
    return [Speaker(s, True) for s in json.loads(data)["result"]]
150
+
151
def list_speakers_as_tts_models(self):
    """List speakers in ModelManager format.

    Returns:
        list[str]: One `coqui_studio/multilingual/<speaker name>/<model>` entry per speaker.
    """
    return [f"coqui_studio/multilingual/{speaker.name}/{self.model}" for speaker in self.speakers]
158
+
159
def name_to_speaker(self, name):
    """Return the first speaker whose `name` equals `name`.

    Raises:
        ValueError: If no speaker has that name.
    """
    for speaker in self.speakers:
        if speaker.name == name:
            return speaker
    raise ValueError(f"Speaker {name} not found in {self.speakers}")
164
+
165
def id_to_speaker(self, speaker_id):
    """Return the first speaker whose `id` equals `speaker_id`.

    Raises:
        ValueError: If no speaker has that id.
    """
    for speaker in self.speakers:
        if speaker.id == speaker_id:
            return speaker
    raise ValueError(f"Speaker {speaker_id} not found.")
170
+
171
@staticmethod
def url_to_np(url):
    """Download a wav file and return its samples and sample rate.

    The file is fetched into a private temporary directory that is removed as
    soon as the samples are read, so repeated calls do not leave downloads
    behind. Passing an explicit target path also makes `urlretrieve` copy
    `file://` URLs instead of handing back the caller's own file.

    Args:
        url (str): Location of the wav file.

    Returns:
        tuple: `(data, rate)` as read by `wavfile.read`.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_file, _ = urllib.request.urlretrieve(url, os.path.join(tmp_dir, "audio.wav"))
        rate, data = wavfile.read(tmp_file)
    return data, rate
176
+
177
@staticmethod
def _create_payload(model, text, speaker, speed, emotion, language):
    """Build the JSON payload for a synthesis request.

    Args:
        model (str): "V1" or "XTTS".
        text (str): Text to synthesize.
        speaker: Object exposing `id` and `name`.
        speed (float): Speech speed.
        emotion (str): Emotion name; only sent for "V1".
        language (str): Language code; only sent for "XTTS".

    Raises:
        ValueError: If `model` is neither "V1" nor "XTTS".
    """
    # NOTE: the built-in-speaker vs. custom-voice distinction (`speaker.is_voice`)
    # is deliberately not applied here; both keys are always sent with the same id.
    payload = {"voice_id": speaker.id, "speaker_id": speaker.id}

    if model == "V1":
        payload.update(
            {
                "emotion": emotion,
                "name": speaker.name,
                "text": text,
                "speed": speed,
            }
        )
    elif model == "XTTS":
        payload.update(
            {
                "name": speaker.name,
                "text": text,
                "speed": speed,
                "language": language,
            }
        )
    else:
        raise ValueError(f"❗ Unknown model {model}")
    return payload
206
+
207
def _check_tts_args(self, text, speaker_name, speaker_id, emotion, speed, language):
    """Validate synthesis arguments for the current model and fill in defaults.

    For "V1" a missing emotion becomes "Neutral" and `language` must be None.
    For "XTTS" `emotion` must be None and `language` must be one of
    `SUPPORTED_LANGUAGES`.

    Returns:
        tuple: `(text, speaker_name, speaker_id, emotion, speed, language)`.

    Raises:
        AssertionError: If an argument is missing or not valid for the model.
    """
    # NOTE: checks stay as `assert` so the raised exception type is unchanged for
    # existing callers; be aware they are stripped when Python runs with -O.
    assert text is not None, "❗ text is required."
    # `tts()` accepts a speaker by name OR by id, so either one is sufficient here.
    assert (
        speaker_name is not None or speaker_id is not None
    ), "❗ speaker_name or speaker_id is required."
    if self.model == "V1":
        if emotion is None:
            emotion = "Neutral"
        assert language is None, "❗ language is not supported for V1 model."
    elif self.model == "XTTS":
        assert emotion is None, "❗ Emotions are not supported for XTTS model. Use V1 model."
        assert language is not None, "❗ Language is required for XTTS model."
        assert (
            language in self.SUPPORTED_LANGUAGES
        ), f"❗ Language {language} is not yet supported. Check https://docs.coqui.ai/reference/samples_xtts_create."
    return text, speaker_name, speaker_id, emotion, speed, language
221
+
222
def tts(
    self,
    text: str,
    speaker_name: str = None,
    speaker_id=None,
    emotion=None,
    speed=1.0,
    language=None,
) -> Tuple[np.ndarray, int]:
    """Synthesize speech from text.

    Args:
        text (str): Text to synthesize.
        speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
            voices (user generated speakers) with `list_voices()`.
        speaker_id (str): Speaker ID. If None, the speaker name is used.
        emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only
            supported by `V1` model. Defaults to None.
        speed (float): Speed of the speech. 1.0 is normal speed.
        language (str): Language of the text. Language is only supported by (and required for) the `XTTS` model.
            See https://docs.coqui.ai/reference/samples_xtts_create for supported languages.

    Returns:
        Tuple[np.ndarray, int]: The waveform and its sample rate.

    Raises:
        ValueError: If neither `speaker_name` nor `speaker_id` is given, if the speaker is unknown,
            or if the API response carries no "audio_url".
    """
    self._check_token()
    self.ping_api()

    if speaker_name is None and speaker_id is None:
        raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.")
    if speaker_id is None:
        speaker = self.name_to_speaker(speaker_name)
    else:
        speaker = self.id_to_speaker(speaker_id)
    # A lookup by id leaves `speaker_name` unset; take it from the resolved
    # speaker so argument validation sees a complete set of values.
    if speaker_name is None:
        speaker_name = speaker.name

    text, speaker_name, speaker_id, emotion, speed, language = self._check_tts_args(
        text, speaker_name, speaker_id, emotion, speed, language
    )

    conn = http.client.HTTPSConnection("app.coqui.ai")
    try:
        payload = self._create_payload(self.model, text, speaker, speed, emotion, language)
        url = self.MODEL_ENDPOINTS[self.model]["synthesize"]
        conn.request("POST", url, json.dumps(payload), self.headers)
        data = conn.getresponse().read()
    finally:
        # Release the socket even when the request or the read fails.
        conn.close()

    try:
        wav, sr = self.url_to_np(json.loads(data)["audio_url"])
    except KeyError as e:
        # No "audio_url" in the response: surface the raw body to the caller.
        raise ValueError(f" [!] 🐸 API returned error: {data}") from e
    return wav, sr
269
+
270
def tts_to_file(
    self,
    text: str,
    speaker_name: str,
    speaker_id=None,
    emotion=None,
    speed=1.0,
    pipe_out=None,
    language=None,
    file_path: str = None,
) -> str:
    """Synthesize speech from text and save it to a file.

    Args:
        text (str): Text to synthesize.
        speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
            voices (user generated speakers) with `list_voices()`.
        speaker_id (str): Speaker ID. If None, the speaker name is used.
        emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
        speed (float): Speed of the speech. 1.0 is normal speed.
        pipe_out (BytesIO, optional): Forwarded unchanged to `save_wav` as `pipe_out`.
        language (str): Language of the text. Only supported by (and required for) the `XTTS` model; see
            `SUPPORTED_LANGUAGES`. Defaults to None.
        file_path (str): Path to save the file. If None, a temporary file is created.

    Returns:
        str: Path of the written wav file.
    """
    if file_path is None:
        # `mkstemp` creates the file atomically; `mktemp` only returned a name
        # that another process could claim before we wrote to it.
        fd, file_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
    wav, sr = self.tts(text, speaker_name, speaker_id, emotion, speed, language)
    save_wav(wav=wav, path=file_path, sample_rate=sr, pipe_out=pipe_out)
    return file_path
300
+
301
+
302
if __name__ == "__main__":
    # Manual smoke test: needs a valid COQUI_STUDIO_TOKEN and network access.
    import time

    api = CS_API()
    print(api.speakers)
    print(api.list_speakers_as_tts_models())

    # Time one in-memory synthesis with the first available speaker.
    ts = time.time()
    wav, sr = api.tts(
        "It took me quite a long time to develop a voice.", language="en", speaker_name=api.speakers[0].name
    )
    print(f" [i] XTTS took {time.time() - ts:.2f}s")

    # Synthesize again, this time writing the result to disk.
    filepath = api.tts_to_file(
        text="Hello world!", speaker_name=api.speakers[0].name, language="en", file_path="output.wav"
    )
TTS/TTS_model.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import abstractmethod
2
+ from typing import Dict
3
+
4
+ import torch
5
+ from coqpit import Coqpit
6
+ from trainer import TrainerModel
7
+
8
+ # pylint: skip-file
9
+
10
+
11
class BaseTrainerModel(TrainerModel):
    """BaseTrainerModel model expanding TrainerModel with required functions by 🐸TTS.

    Every new 🐸TTS model must inherit it.
    """

    @staticmethod
    @abstractmethod
    def init_from_config(config: Coqpit):
        """Init the model and all its attributes from the given config.

        Override this depending on your model.
        """
        ...

    @abstractmethod
    def inference(self, input: torch.Tensor, aux_input=None) -> Dict:
        """Forward pass for inference.

        It must return a dictionary with the main model output and all the auxiliary outputs. The key ```model_outputs```
        is considered to be the main output and you can add any other auxiliary outputs as you want.

        We don't use `*kwargs` since it is problematic with the TorchScript API.

        Args:
            input (torch.Tensor): [description]
            aux_input (Dict): Auxiliary inputs like speaker embeddings, durations etc. Defaults to None
                rather than a shared mutable `{}`, so implementations should treat None as "no auxiliary inputs".

        Returns:
            Dict: [description]
        """
        outputs_dict = {"model_outputs": None}
        ...
        return outputs_dict

    @abstractmethod
    def load_checkpoint(
        self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True, cache=False
    ) -> None:
        """Load a model checkpoint file and get ready for training or inference.

        Args:
            config (Coqpit): Model configuration.
            checkpoint_path (str): Path to the model checkpoint file.
            eval (bool, optional): If true, init model for inference else for training. Defaults to False.
            strict (bool, optional): Match all checkpoint keys to model's keys. Defaults to True.
            cache (bool, optional): If True, cache the file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to False.
        """
        ...
TTS/__pycache__/TTS___pycache_____init__.cpython-39.pyc ADDED
Binary file (358 Bytes). View file
 
TTS/__pycache__/TTS___pycache___api.cpython-39.pyc ADDED
Binary file (18.5 kB). View file
 
TTS/__pycache__/TTS___pycache___cs_api.cpython-39.pyc ADDED
Binary file (12 kB). View file
 
TTS/__pycache__/TTS___pycache___model.cpython-39.pyc ADDED
Binary file (2.58 kB). View file
 
TTS/bin/TTS_bin___init__.py ADDED
File without changes
TTS/bin/TTS_bin_collect_env_info.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Get detailed info about the working environment."""
2
+ import os
3
+ import platform
4
+ import sys
5
+
6
+ import numpy
7
+ import torch
8
+
9
+ sys.path += [os.path.abspath(".."), os.path.abspath(".")]
10
+ import json
11
+
12
+ import TTS
13
+
14
+
15
def system_info():
    """Return OS, architecture, OS version, processor and Python version as reported by `platform`."""
    return {
        "OS": platform.system(),
        "architecture": platform.architecture(),
        "version": platform.version(),
        "processor": platform.processor(),
        "python": platform.python_version(),
    }
23
+
24
+
25
def cuda_info():
    """Return the visible CUDA device names, CUDA availability and the CUDA version torch reports."""
    device_names = [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())]
    return {
        "GPU": device_names,
        "available": torch.cuda.is_available(),
        "version": torch.version.cuda,
    }
31
+
32
+
33
def package_info():
    """Return the versions of numpy, PyTorch and TTS plus the PyTorch debug flag."""
    return {
        "numpy": numpy.__version__,
        "PyTorch_version": torch.__version__,
        "PyTorch_debug": torch.version.debug,
        "TTS": TTS.__version__,
    }
40
+
41
+
42
def main():
    """Print system, CUDA and package details as sorted, indented JSON."""
    details = {"System": system_info(), "CUDA": cuda_info(), "Packages": package_info()}
    print(json.dumps(details, indent=4, sort_keys=True))


if __name__ == "__main__":
    main()
TTS/bin/TTS_bin_compute_attention_masks.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import importlib
3
+ import os
4
+ from argparse import RawTextHelpFormatter
5
+
6
+ import numpy as np
7
+ import torch
8
+ from torch.utils.data import DataLoader
9
+ from tqdm import tqdm
10
+
11
+ from TTS.config import load_config
12
+ from TTS.tts.datasets.TTSDataset import TTSDataset
13
+ from TTS.tts.models import setup_model
14
+ from TTS.tts.utils.text.characters import make_symbols, phonemes, symbols
15
+ from TTS.utils.audio import AudioProcessor
16
+ from TTS.utils.io import load_checkpoint
17
+
18
def _str2bool(value):
    """Parse a command-line boolean such as "True", "false", "1" or "no".

    `type=bool` cannot be used with argparse: `bool("False")` is True because
    every non-empty string is truthy.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("1", "true", "t", "yes", "y", "on"):
        return True
    if lowered in ("", "0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}.")


if __name__ == "__main__":
    # pylint: disable=bad-option-value
    parser = argparse.ArgumentParser(
        description="""Extract attention masks from trained Tacotron/Tacotron2 models.
These masks can be used for different purposes including training a TTS model with a Duration Predictor.\n\n"""
        """Each attention mask is written next to the input wav file with the "_attn.npy" suffix.
(e.g. path/bla.wav (wav file) --> path/bla_attn.npy (attention mask))\n"""
        """
Example run:
    CUDA_VISIBLE_DEVICES="0" python TTS/bin/compute_attention_masks.py
        --model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth
        --config_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/config.json
        --dataset_metafile metadata.csv
        --data_path /root/LJSpeech-1.1/
        --batch_size 32
        --dataset ljspeech
        --use_cuda True
""",
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("--model_path", type=str, required=True, help="Path to Tacotron/Tacotron2 model file ")
    parser.add_argument(
        "--config_path",
        type=str,
        required=True,
        help="Path to Tacotron/Tacotron2 config file.",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        required=True,
        help="Target dataset processor name from TTS.tts.dataset.preprocess.",
    )

    parser.add_argument(
        "--dataset_metafile",
        type=str,
        required=True,
        help="Dataset metafile including file paths with transcripts.",
    )
    parser.add_argument("--data_path", type=str, default="", help="Defines the data path. It overwrites config.json.")
    parser.add_argument("--use_cuda", type=_str2bool, default=False, help="enable/disable cuda.")

    parser.add_argument(
        "--batch_size", default=16, type=int, help="Batch size for the model. Use batch_size=1 if you have no CUDA."
    )
    args = parser.parse_args()

    C = load_config(args.config_path)
    ap = AudioProcessor(**C.audio)

    # if the vocabulary was passed, replace the default
    if "characters" in C.keys():
        symbols, phonemes = make_symbols(**C.characters)

    # load the model
    num_chars = len(phonemes) if C.use_phonemes else len(symbols)
    # TODO: handle multi-speaker
    model = setup_model(C)
    model, _ = load_checkpoint(model, args.model_path, args.use_cuda, True)

    # data loader: look up the formatter function named by --dataset
    preprocessor = importlib.import_module("TTS.tts.datasets.formatters")
    preprocessor = getattr(preprocessor, args.dataset)
    meta_data = preprocessor(args.data_path, args.dataset_metafile)
    dataset = TTSDataset(
        model.decoder.r,
        C.text_cleaner,
        compute_linear_spec=False,
        ap=ap,
        meta_data=meta_data,
        characters=C.characters if "characters" in C.keys() else None,
        add_blank=C["add_blank"] if "add_blank" in C.keys() else False,
        use_phonemes=C.use_phonemes,
        phoneme_cache_path=C.phoneme_cache_path,
        phoneme_language=C.phoneme_language,
        enable_eos_bos=C.enable_eos_bos_chars,
    )

    dataset.sort_and_filter_items(C.get("sort_by_audio_len", default=False))
    loader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        num_workers=4,
        collate_fn=dataset.collate_fn,
        shuffle=False,
        drop_last=False,
    )

    # compute attentions
    file_paths = []
    with torch.no_grad():
        for data in tqdm(loader):
            # setup input data
            text_input = data[0]
            text_lengths = data[1]
            mel_input = data[4]
            mel_lengths = data[5]
            item_idxs = data[7]

            # dispatch data to GPU
            if args.use_cuda:
                text_input = text_input.cuda()
                text_lengths = text_lengths.cuda()
                mel_input = mel_input.cuda()
                mel_lengths = mel_lengths.cuda()

            model_outputs = model.forward(text_input, text_lengths, mel_input)

            alignments = model_outputs["alignments"].detach()
            for idx, alignment in enumerate(alignments):
                item_idx = item_idxs[idx]
                # interpolate if r > 1
                alignment = (
                    torch.nn.functional.interpolate(
                        alignment.transpose(0, 1).unsqueeze(0),
                        size=None,
                        scale_factor=model.decoder.r,
                        mode="nearest",
                        align_corners=None,
                        recompute_scale_factor=None,
                    )
                    .squeeze(0)
                    .transpose(0, 1)
                )
                # remove paddings
                alignment = alignment[: mel_lengths[idx], : text_lengths[idx]].cpu().numpy()
                # set file paths; join on the directory instead of str.replace so a
                # directory that happens to contain the file name is left untouched
                wav_file_name = os.path.basename(item_idx)
                align_file_name = os.path.splitext(wav_file_name)[0] + "_attn.npy"
                file_path = os.path.join(os.path.dirname(item_idx), align_file_name)
                # save output
                wav_file_abs_path = os.path.abspath(item_idx)
                file_abs_path = os.path.abspath(file_path)
                file_paths.append([wav_file_abs_path, file_abs_path])
                np.save(file_path, alignment)

    # output metafile: one "<wav path>|<mask path>" line per processed item
    metafile = os.path.join(args.data_path, "metadata_attn_mask.txt")

    with open(metafile, "w", encoding="utf-8") as f:
        for p in file_paths:
            f.write(f"{p[0]}|{p[1]}\n")
    print(f" >> Metafile created: {metafile}")
TTS/bin/TTS_bin_compute_embeddings.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ from argparse import RawTextHelpFormatter
4
+
5
+ import torch
6
+ from tqdm import tqdm
7
+
8
+ from TTS.config import load_config
9
+ from TTS.config.shared_configs import BaseDatasetConfig
10
+ from TTS.tts.datasets import load_tts_samples
11
+ from TTS.tts.utils.managers import save_file
12
+ from TTS.tts.utils.speakers import SpeakerManager
13
+
14
+
15
def compute_embeddings(
    model_path,
    config_path,
    output_path,
    old_speakers_file=None,
    old_append=False,
    config_dataset_path=None,
    formatter_name=None,
    dataset_name=None,
    dataset_path=None,
    meta_file_train=None,
    meta_file_val=None,
    disable_cuda=False,
    no_eval=False,
):
    """Compute an embedding for every sample of a dataset and save the mapping.

    Args:
        model_path (str): Encoder checkpoint passed to `SpeakerManager`.
        config_path (str): Encoder config passed to `SpeakerManager`.
        output_path (str): Output file; if it is an existing directory, `speakers.pth` is written inside it.
        old_speakers_file (str): Existing embedding file whose entries are reused for known clips.
        old_append (bool): If True (and `old_speakers_file` is set), start from the old mapping and extend it.
        config_dataset_path (str): Dataset config file. When None, a `BaseDatasetConfig` is built from
            `formatter_name`, `dataset_name`, `dataset_path` and the optional meta files.
        formatter_name (str): Formatter used when no dataset config is given.
        dataset_name (str): Dataset name used when no dataset config is given.
        dataset_path (str): Dataset path used when no dataset config is given.
        meta_file_train (str): Optional train meta file.
        meta_file_val (str): Optional evaluation meta file.
        disable_cuda (bool): If True, CUDA is not used even when available.
        no_eval (bool): If True, no eval split is requested from `load_tts_samples`.
    """
    use_cuda = torch.cuda.is_available() and not disable_cuda

    if config_dataset_path is not None:
        c_dataset = load_config(config_dataset_path)
        meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not no_eval)
    else:
        c_dataset = BaseDatasetConfig()
        c_dataset.formatter = formatter_name
        c_dataset.dataset_name = dataset_name
        c_dataset.path = dataset_path
        if meta_file_train is not None:
            c_dataset.meta_file_train = meta_file_train
        if meta_file_val is not None:
            c_dataset.meta_file_val = meta_file_val
        meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=not no_eval)

    samples = meta_data_train if meta_data_eval is None else meta_data_train + meta_data_eval

    encoder_manager = SpeakerManager(
        encoder_model_path=model_path,
        encoder_config_path=config_path,
        d_vectors_file_path=old_speakers_file,
        use_cuda=use_cuda,
    )

    class_name_key = encoder_manager.encoder_config.class_name_key

    # compute speaker embeddings
    if old_speakers_file is not None and old_append:
        speaker_mapping = encoder_manager.embeddings
    else:
        speaker_mapping = {}

    for fields in tqdm(samples):
        class_name = fields[class_name_key]
        audio_file = fields["audio_file"]
        embedding_key = fields["audio_unique_name"]

        # Only update the speaker name when the embedding is already in the old file.
        if embedding_key in speaker_mapping:
            speaker_mapping[embedding_key]["name"] = class_name
            continue

        if old_speakers_file is not None and embedding_key in encoder_manager.clip_ids:
            # get the embedding from the old file
            embedd = encoder_manager.get_embedding_by_clip(embedding_key)
        else:
            # extract the embedding
            embedd = encoder_manager.compute_embedding_from_clip(audio_file)

        # create speaker_mapping if target dataset is defined
        speaker_mapping[embedding_key] = {"name": class_name, "embedding": embedd}

    if speaker_mapping:
        # save speaker_mapping if target dataset is defined
        if os.path.isdir(output_path):
            mapping_file_path = os.path.join(output_path, "speakers.pth")
        else:
            mapping_file_path = output_path

        # A bare file name has no directory component to create.
        if os.path.dirname(mapping_file_path) != "":
            os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)

        save_file(speaker_mapping, mapping_file_path)
        print("Speaker embeddings saved at:", mapping_file_path)
100
+
101
+
102
def _str2bool(value):
    """Parse a command-line boolean such as "True", "false", "1" or "no".

    `type=bool` cannot be used with argparse: `bool("False")` is True because
    every non-empty string is truthy.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("1", "true", "t", "yes", "y", "on"):
        return True
    if lowered in ("", "0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n"""
        """
        Example runs:
        python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json  --config_dataset_path dataset_config.json

        python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json  --formatter_name coqui --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --meta_file_train /path/to/vctk/metafile_train.csv --meta_file_val /path/to/vctk/metafile_eval.csv
        """,
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument(
        "--model_path",
        type=str,
        help="Path to model checkpoint file. It defaults to the released speaker encoder.",
        default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
    )
    parser.add_argument(
        "--config_path",
        type=str,
        help="Path to model config file. It defaults to the released speaker encoder config.",
        default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
    )
    parser.add_argument(
        "--config_dataset_path",
        type=str,
        help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.",
        default=None,
    )
    parser.add_argument(
        "--output_path",
        type=str,
        help="Path for output `pth` or `json` file.",
        default="speakers.pth",
    )
    parser.add_argument(
        "--old_file",
        type=str,
        help="The old existing embedding file, from which the embeddings will be directly loaded for already computed audio clips.",
        default=None,
    )
    parser.add_argument(
        "--old_append",
        help="Append new audio clip embeddings to the old embedding file, generate a new non-duplicated merged embedding file. Default False",
        default=False,
        action="store_true",
    )
    # Accepts both the bare flag (`--disable_cuda`) and an explicit value
    # (`--disable_cuda False`); the value is parsed instead of being treated as truthy.
    parser.add_argument(
        "--disable_cuda",
        type=_str2bool,
        nargs="?",
        const=True,
        help="Flag to disable cuda.",
        default=False,
    )
    parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true")
    parser.add_argument(
        "--formatter_name",
        type=str,
        help="Name of the formatter to use. You either need to provide this or `config_dataset_path`",
        default=None,
    )
    parser.add_argument(
        "--dataset_name",
        type=str,
        help="Name of the dataset to use. You either need to provide this or `config_dataset_path`",
        default=None,
    )
    parser.add_argument(
        "--dataset_path",
        type=str,
        help="Path to the dataset. You either need to provide this or `config_dataset_path`",
        default=None,
    )
    parser.add_argument(
        "--meta_file_train",
        type=str,
        help="Path to the train meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
        default=None,
    )
    parser.add_argument(
        "--meta_file_val",
        type=str,
        help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
        default=None,
    )
    args = parser.parse_args()

    compute_embeddings(
        args.model_path,
        args.config_path,
        args.output_path,
        old_speakers_file=args.old_file,
        old_append=args.old_append,
        config_dataset_path=args.config_dataset_path,
        formatter_name=args.formatter_name,
        dataset_name=args.dataset_name,
        dataset_path=args.dataset_path,
        meta_file_train=args.meta_file_train,
        meta_file_val=args.meta_file_val,
        disable_cuda=args.disable_cuda,
        no_eval=args.no_eval,
    )
TTS/bin/TTS_bin_compute_statistics.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import glob
6
+ import os
7
+
8
+ import numpy as np
9
+ from tqdm import tqdm
10
+
11
+ # from TTS.utils.io import load_config
12
+ from TTS.config import load_config
13
+ from TTS.tts.datasets import load_tts_samples
14
+ from TTS.utils.audio import AudioProcessor
15
+
16
+
17
+ def main():
18
+ """Run preprocessing process."""
19
+ parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogtram features.")
20
+ parser.add_argument("config_path", type=str, help="TTS config file path to define audio processin parameters.")
21
+ parser.add_argument("out_path", type=str, help="save path (directory and filename).")
22
+ parser.add_argument(
23
+ "--data_path",
24
+ type=str,
25
+ required=False,
26
+ help="folder including the target set of wavs overriding dataset config.",
27
+ )
28
+ args, overrides = parser.parse_known_args()
29
+
30
+ CONFIG = load_config(args.config_path)
31
+ CONFIG.parse_known_args(overrides, relaxed_parser=True)
32
+
33
+ # load config
34
+ CONFIG.audio.signal_norm = False # do not apply earlier normalization
35
+ CONFIG.audio.stats_path = None # discard pre-defined stats
36
+
37
+ # load audio processor
38
+ ap = AudioProcessor(**CONFIG.audio.to_dict())
39
+
40
+ # load the meta data of target dataset
41
+ if args.data_path:
42
+ dataset_items = glob.glob(os.path.join(args.data_path, "**", "*.wav"), recursive=True)
43
+ else:
44
+ dataset_items = load_tts_samples(CONFIG.datasets)[0] # take only train data
45
+ print(f" > There are {len(dataset_items)} files.")
46
+
47
+ mel_sum = 0
48
+ mel_square_sum = 0
49
+ linear_sum = 0
50
+ linear_square_sum = 0
51
+ N = 0
52
+ for item in tqdm(dataset_items):
53
+ # compute features
54
+ wav = ap.load_wav(item if isinstance(item, str) else item["audio_file"])
55
+ linear = ap.spectrogram(wav)
56
+ mel = ap.melspectrogram(wav)
57
+
58
+ # compute stats
59
+ N += mel.shape[1]
60
+ mel_sum += mel.sum(1)
61
+ linear_sum += linear.sum(1)
62
+ mel_square_sum += (mel**2).sum(axis=1)
63
+ linear_square_sum += (linear**2).sum(axis=1)
64
+
65
+ mel_mean = mel_sum / N
66
+ mel_scale = np.sqrt(mel_square_sum / N - mel_mean**2)
67
+ linear_mean = linear_sum / N
68
+ linear_scale = np.sqrt(linear_square_sum / N - linear_mean**2)
69
+
70
+ output_file_path = args.out_path
71
+ stats = {}
72
+ stats["mel_mean"] = mel_mean
73
+ stats["mel_std"] = mel_scale
74
+ stats["linear_mean"] = linear_mean
75
+ stats["linear_std"] = linear_scale
76
+
77
+ print(f" > Avg mel spec mean: {mel_mean.mean()}")
78
+ print(f" > Avg mel spec scale: {mel_scale.mean()}")
79
+ print(f" > Avg linear spec mean: {linear_mean.mean()}")
80
+ print(f" > Avg linear spec scale: {linear_scale.mean()}")
81
+
82
+ # set default config values for mean-var scaling
83
+ CONFIG.audio.stats_path = output_file_path
84
+ CONFIG.audio.signal_norm = True
85
+ # remove redundant values
86
+ del CONFIG.audio.max_norm
87
+ del CONFIG.audio.min_level_db
88
+ del CONFIG.audio.symmetric_norm
89
+ del CONFIG.audio.clip_norm
90
+ stats["audio_config"] = CONFIG.audio.to_dict()
91
+ np.save(output_file_path, stats, allow_pickle=True)
92
+ print(f" > stats saved to {output_file_path}")
93
+
94
+
95
+ if __name__ == "__main__":
96
+ main()
TTS/bin/TTS_bin_eval_encoder.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from argparse import RawTextHelpFormatter
3
+
4
+ import torch
5
+ from tqdm import tqdm
6
+
7
+ from TTS.config import load_config
8
+ from TTS.tts.datasets import load_tts_samples
9
+ from TTS.tts.utils.speakers import SpeakerManager
10
+
11
+
12
def compute_encoder_accuracy(dataset_items, encoder_manager):
    """Print per-class and average classification accuracy of the encoder.

    Args:
        dataset_items: Iterable of sample dicts holding "audio_file" and the key named by
            `encoder_manager.encoder_config.class_name_key`.
        encoder_manager: Object providing `encoder_config`, `encoder_criterion`, `use_cuda`
            and `compute_embedding_from_clip`.

    Returns:
        float | None: Mean of the per-class accuracies, or None when there were no samples.

    Raises:
        RuntimeError: If a sample has no class name or no label can be predicted
            (no criterion or no `map_classid_to_classname` in the config).
    """
    class_name_key = encoder_manager.encoder_config.class_name_key
    map_classid_to_classname = getattr(encoder_manager.encoder_config, "map_classid_to_classname", None)

    class_acc_dict = {}

    # compute embeddings for all wav_files
    for item in tqdm(dataset_items):
        class_name = item[class_name_key]
        wav_file = item["audio_file"]

        # extract the embedding
        embedd = encoder_manager.compute_embedding_from_clip(wav_file)
        if encoder_manager.encoder_criterion is not None and map_classid_to_classname is not None:
            embedding = torch.FloatTensor(embedd).unsqueeze(0)
            if encoder_manager.use_cuda:
                embedding = embedding.cuda()

            class_id = encoder_manager.encoder_criterion.softmax.inference(embedding).item()
            predicted_label = map_classid_to_classname[str(class_id)]
        else:
            predicted_label = None

        if class_name is None or predicted_label is None:
            raise RuntimeError("Error: class_name or/and predicted_label are None")
        class_acc_dict.setdefault(class_name, []).append(int(class_name == predicted_label))

    # Nothing to average: avoid dividing by zero on an empty dataset.
    if not class_acc_dict:
        print("No samples were evaluated; accuracy is undefined.")
        return None

    acc_avg = 0
    for key, values in class_acc_dict.items():
        acc = sum(values) / len(values)
        print("Class", key, "Accuracy:", acc)
        acc_avg += acc

    average = acc_avg / len(class_acc_dict)
    print("Average Accuracy:", average)
    return average
51
+
52
+
53
def _str2bool(value):
    """Parse a command-line boolean such as "True", "false", "1" or "no".

    `type=bool` cannot be used with argparse: `bool("False")` is True because
    every non-empty string is truthy.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("1", "true", "t", "yes", "y", "on"):
        return True
    if lowered in ("", "0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="""Compute the accuracy of the encoder.\n\n"""
        """
        Example runs:
        python TTS/bin/eval_encoder.py emotion_encoder_model.pth emotion_encoder_config.json  dataset_config.json
        """,
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
    parser.add_argument(
        "config_path",
        type=str,
        help="Path to model config file.",
    )

    parser.add_argument(
        "config_dataset_path",
        type=str,
        help="Path to dataset config file.",
    )
    parser.add_argument("--use_cuda", type=_str2bool, help="flag to set cuda.", default=True)
    parser.add_argument("--eval", type=_str2bool, help="compute eval.", default=True)

    args = parser.parse_args()

    c_dataset = load_config(args.config_dataset_path)

    meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval)
    # Without an eval split there is nothing to append.
    items = meta_data_train if meta_data_eval is None else meta_data_train + meta_data_eval

    enc_manager = SpeakerManager(
        encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda
    )

    compute_encoder_accuracy(items, enc_manager)
TTS/bin/TTS_bin_extract_tts_spectrograms (1).py ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Extract Mel spectrograms with teacher forcing."""
3
+
4
+ import argparse
5
+ import os
6
+
7
+ import numpy as np
8
+ import torch
9
+ from torch.utils.data import DataLoader
10
+ from tqdm import tqdm
11
+
12
+ from TTS.config import load_config
13
+ from TTS.tts.datasets import TTSDataset, load_tts_samples
14
+ from TTS.tts.models import setup_model
15
+ from TTS.tts.utils.speakers import SpeakerManager
16
+ from TTS.tts.utils.text.tokenizer import TTSTokenizer
17
+ from TTS.utils.audio import AudioProcessor
18
+ from TTS.utils.audio.numpy_transforms import quantize
19
+ from TTS.utils.generic_utils import count_parameters
20
+
21
+ use_cuda = torch.cuda.is_available()
22
+
23
+
24
def setup_loader(ap, r, verbose=False):
    """Build the in-order data loader used for spectrogram extraction.

    Relies on the module-level config ``c``, the sample list ``meta_data`` and
    ``speaker_manager`` being set before the call.

    Args:
        ap: Audio processor handed to the dataset.
        r: Value forwarded as the dataset's ``outputs_per_step``.
        verbose: Forwarded to the dataset's ``verbose`` flag.

    Returns:
        A ``DataLoader`` over the dataset with shuffling disabled, no sampler
        and the last partial batch kept.
    """
    tokenizer, _ = TTSTokenizer.init_from_config(c)

    dataset_kwargs = dict(
        outputs_per_step=r,
        compute_linear_spec=False,
        samples=meta_data,
        tokenizer=tokenizer,
        ap=ap,
        batch_group_size=0,
        min_text_len=c.min_text_len,
        max_text_len=c.max_text_len,
        min_audio_len=c.min_audio_len,
        max_audio_len=c.max_audio_len,
        phoneme_cache_path=c.phoneme_cache_path,
        precompute_num_workers=0,
        use_noise_augment=False,
        verbose=verbose,
        # Speaker lookups are only passed when the config enables them.
        speaker_id_mapping=speaker_manager.name_to_id if c.use_speaker_embedding else None,
        d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None,
    )
    dataset = TTSDataset(**dataset_kwargs)

    precompute_inputs = c.use_phonemes and c.compute_input_seq_cache
    if precompute_inputs:
        # precompute phonemes to have a better estimate of sequence lengths.
        dataset.compute_input_seq(c.num_loader_workers)
    dataset.preprocess_samples()

    return DataLoader(
        dataset,
        batch_size=c.batch_size,
        shuffle=False,
        collate_fn=dataset.collate_fn,
        drop_last=False,
        sampler=None,
        num_workers=c.num_loader_workers,
        pin_memory=False,
    )
61
+
62
+
63
def set_filename(wav_path, out_path):
    """Derive the output paths for one audio file and create their folders.

    The ``quant``, ``mel``, ``wav_gl`` and ``wav`` sub-directories of
    ``out_path`` are created if missing.

    Args:
        wav_path: Path of the source audio file.
        out_path: Root directory that receives the extracted files.

    Returns:
        Tuple ``(file_name, wavq_path, mel_path, wav_gl_path, wav_path)``.
        ``wavq_path`` and ``mel_path`` carry no extension; the two wav paths
        end in ``.wav``.
    """
    # Strip only the final extension. Splitting on the first dot would turn
    # "spk.0001.wav" into "spk", so dotted names would collide on disk.
    file_name = os.path.splitext(os.path.basename(wav_path))[0]

    for sub_dir in ("quant", "mel", "wav_gl", "wav"):
        os.makedirs(os.path.join(out_path, sub_dir), exist_ok=True)

    wavq_path = os.path.join(out_path, "quant", file_name)
    mel_path = os.path.join(out_path, "mel", file_name)
    wav_gl_path = os.path.join(out_path, "wav_gl", file_name + ".wav")
    out_wav_path = os.path.join(out_path, "wav", file_name + ".wav")
    return file_name, wavq_path, mel_path, wav_gl_path, out_wav_path
75
+
76
+
77
def format_data(data):
    """Unpack a collated batch and move its tensors to the GPU when enabled.

    Args:
        data: Batch dictionary with the keys ``token_id``,
            ``token_id_lengths``, ``mel``, ``mel_lengths``, ``item_idxs``,
            ``d_vectors``, ``speaker_ids`` and ``attns``.

    Returns:
        Tuple ``(text_input, text_lengths, mel_input, mel_lengths,
        speaker_ids, d_vectors, avg_text_length, avg_spec_length, attn_mask,
        item_idx)``. The two averages are means of the float-cast length
        tensors; ``item_idx`` is returned untouched.
    """
    text_input = data["token_id"]
    text_lengths = data["token_id_lengths"]
    mel_input = data["mel"]
    mel_lengths = data["mel_lengths"]
    item_idx = data["item_idxs"]
    d_vectors = data["d_vectors"]
    speaker_ids = data["speaker_ids"]
    attn_mask = data["attns"]

    # Batch statistics are taken before any device transfer.
    avg_text_length = text_lengths.float().mean()
    avg_spec_length = mel_lengths.float().mean()

    if use_cuda:

        def to_gpu(tensor):
            return tensor.cuda(non_blocking=True)

        text_input = to_gpu(text_input)
        text_lengths = to_gpu(text_lengths)
        mel_input = to_gpu(mel_input)
        mel_lengths = to_gpu(mel_lengths)
        # The conditioning tensors are optional and may be None.
        speaker_ids = to_gpu(speaker_ids) if speaker_ids is not None else speaker_ids
        d_vectors = to_gpu(d_vectors) if d_vectors is not None else d_vectors
        attn_mask = to_gpu(attn_mask) if attn_mask is not None else attn_mask

    return (
        text_input,
        text_lengths,
        mel_input,
        mel_lengths,
        speaker_ids,
        d_vectors,
        avg_text_length,
        avg_spec_length,
        attn_mask,
        item_idx,
    )
114
+
115
+
116
@torch.no_grad()
def inference(
    model_name,
    model,
    ap,
    text_input,
    text_lengths,
    mel_input,
    mel_lengths,
    speaker_ids=None,
    d_vectors=None,
):
    """Run one teacher-forced pass and return the spectrograms as numpy.

    Args:
        model_name: Lower-cased model identifier: ``"glow_tts"``,
            ``"tacotron"`` or ``"tacotron2"``.
        model: Model exposing ``inference_with_MAS`` (glow_tts) or a forward
            call (tacotron variants).
        ap: Audio processor; only its ``out_linear_to_mel`` is used, and only
            for ``"tacotron"``.
        text_input: Batch of token ids.
        text_lengths: Length of each token sequence.
        mel_input: Batch of target spectrograms.
        mel_lengths: Length of each target spectrogram.
        speaker_ids: Optional speaker id tensor.
        d_vectors: Optional d-vector tensor.

    Returns:
        The model's ``"model_outputs"`` as a numpy array, batch first.

    Raises:
        ValueError: If ``model_name`` is not one of the supported models.
            Previously this surfaced as an ``UnboundLocalError`` at the final
            return, for tacotron-like names only after a wasted forward pass.
    """
    if model_name == "glow_tts":
        # Speaker ids take precedence over d-vectors as the conditioning input.
        speaker_c = None
        if speaker_ids is not None:
            speaker_c = speaker_ids
        elif d_vectors is not None:
            speaker_c = d_vectors
        outputs = model.inference_with_MAS(
            text_input,
            text_lengths,
            mel_input,
            mel_lengths,
            aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids},
        )
        return outputs["model_outputs"].detach().cpu().numpy()

    if model_name in ("tacotron", "tacotron2"):
        aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
        outputs = model(text_input, text_lengths, mel_input, mel_lengths, aux_input)
        postnet_outputs = outputs["model_outputs"]

        if model_name == "tacotron2":
            return postnet_outputs.detach().cpu().numpy()

        # "tacotron": convert each item with ap.out_linear_to_mel, which is
        # applied to the transposed item and transposed back.
        linear_specs = postnet_outputs.data.cpu().numpy()
        mel_specs = [torch.FloatTensor(ap.out_linear_to_mel(spec.T).T) for spec in linear_specs]
        return torch.stack(mel_specs).cpu().numpy()

    raise ValueError(
        f"Unsupported model '{model_name}': expected 'glow_tts', 'tacotron' or 'tacotron2'."
    )
160
+
161
+
162
def extract_spectrograms(
    data_loader, model, ap, output_path, quantize_bits=0, save_audio=False, debug=False, metada_name="metada.txt"
):
    """Run the model over every batch and write its spectrograms to disk.

    For each item the predicted spectrogram is cut to its ``mel_lengths``
    entry, transposed and stored with ``np.save``. A ``wav|mel.npy`` metadata
    file is written once all batches are processed.

    Args:
        data_loader: Iterable of collated batches accepted by ``format_data``.
        model: Model to run; it is put in eval mode first.
        ap: Audio processor used to load, save and (in debug mode) invert audio.
        output_path: Root directory for all generated files.
        quantize_bits: When positive, also save the wav quantized to this many bits.
        save_audio: When true, save the loaded wav under ``wav``.
        debug: When true, save audio rebuilt from the predicted spectrogram
            with ``ap.inv_melspectrogram`` under ``wav_gl``.
        metada_name: File name of the metadata list inside ``output_path``.
    """
    model.eval()
    export_metadata = []

    for batch in tqdm(data_loader, total=len(data_loader)):
        # Only the inputs needed for inference and the item paths are kept.
        (
            text_input,
            text_lengths,
            mel_input,
            mel_lengths,
            speaker_ids,
            d_vectors,
            _,
            _,
            _,
            item_idx,
        ) = format_data(batch)

        model_output = inference(
            c.model.lower(),
            model,
            ap,
            text_input,
            text_lengths,
            mel_input,
            mel_lengths,
            speaker_ids,
            d_vectors,
        )

        batch_size = text_input.shape[0]
        for idx in range(batch_size):
            wav_file_path = item_idx[idx]
            wav = ap.load_wav(wav_file_path)
            _, wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path)

            # quantize and save wav
            if quantize_bits > 0:
                np.save(wavq_path, quantize(wav, quantize_bits))

            # save TTS mel, cut to this item's length
            mel = model_output[idx][: mel_lengths[idx], :].T
            np.save(mel_path, mel)

            export_metadata.append([wav_file_path, mel_path])
            if save_audio:
                ap.save_wav(wav, wav_path)

            if debug:
                print("Audio for debug saved at:", wav_gl_path)
                ap.save_wav(ap.inv_melspectrogram(mel), wav_gl_path)

    metadata_file = os.path.join(output_path, metada_name)
    with open(metadata_file, "w", encoding="utf-8") as f:
        for wav_file, mel_file in export_metadata:
            # mel_path has no extension; ".npy" is appended for the listing.
            f.write(f"{wav_file}|{mel_file + '.npy'}\n")
222
+
223
+
224
def main(args):  # pylint: disable=redefined-outer-name
    """Load data and model, then extract spectrograms for every sample.

    Sets the module-level ``meta_data`` and ``speaker_manager`` that
    ``setup_loader`` reads, and expects the module-level config ``c`` to exist.

    Args:
        args: Parsed command-line namespace; reads ``eval``,
            ``checkpoint_path``, ``output_path``, ``quantize_bits``,
            ``save_audio`` and ``debug``.
    """
    # pylint: disable=global-variable-undefined
    global meta_data, speaker_manager

    audio_processor = AudioProcessor(**c.audio)

    train_samples, eval_samples = load_tts_samples(
        c.datasets, eval_split=args.eval, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
    )
    # Both partitions are processed together.
    meta_data = train_samples + eval_samples

    # Pick the speaker manager that matches the configured conditioning.
    if c.use_speaker_embedding:
        speaker_manager = SpeakerManager(data_items=meta_data)
    elif c.use_d_vector_file:
        speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file)
    else:
        speaker_manager = None

    model = setup_model(c)
    model.load_checkpoint(c, args.checkpoint_path, eval=True)
    if use_cuda:
        model.cuda()

    print("\n > Model has {} parameters".format(count_parameters(model)), flush=True)

    # glow_tts uses r = 1; other models take it from their decoder.
    if c.model.lower() == "glow_tts":
        r = 1
    else:
        r = model.decoder.r
    loader = setup_loader(audio_processor, r, verbose=True)

    extract_spectrograms(
        loader,
        model,
        audio_processor,
        args.output_path,
        quantize_bits=args.quantize_bits,
        save_audio=args.save_audio,
        debug=args.debug,
        metada_name="metada.txt",
    )
272
+
273
+
274
def str2bool(value):
    """Convert a command-line token into a real boolean.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so any
    non-empty value -- including ``"False"`` -- is parsed as ``True``. This
    converter accepts the usual spellings instead.

    Args:
        value: The raw command-line string (or an already-parsed bool).

    Returns:
        The boolean the text stands for.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in {"true", "t", "yes", "y", "1"}:
        return True
    # The empty string stays falsy, as it was under ``type=bool``.
    if text in {"false", "f", "no", "n", "0", ""}:
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True)
    parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True)
    parser.add_argument("--output_path", type=str, help="Path to save mel specs", required=True)
    parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
    parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
    parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero")
    # str2bool (not bool) so that "--eval False" really disables the eval split.
    parser.add_argument("--eval", type=str2bool, help="compute eval.", default=True)
    args = parser.parse_args()

    # ``c`` is read as a module-level global by the functions of this script.
    c = load_config(args.config_path)
    c.audio.trim_silence = False
    main(args)
TTS/bin/TTS_bin_extract_tts_spectrograms.py ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Extract Mel spectrograms with teacher forcing."""
3
+
4
+ import argparse
5
+ import os
6
+
7
+ import numpy as np
8
+ import torch
9
+ from torch.utils.data import DataLoader
10
+ from tqdm import tqdm
11
+
12
+ from TTS.config import load_config
13
+ from TTS.tts.datasets import TTSDataset, load_tts_samples
14
+ from TTS.tts.models import setup_model
15
+ from TTS.tts.utils.speakers import SpeakerManager
16
+ from TTS.tts.utils.text.tokenizer import TTSTokenizer
17
+ from TTS.utils.audio import AudioProcessor
18
+ from TTS.utils.audio.numpy_transforms import quantize
19
+ from TTS.utils.generic_utils import count_parameters
20
+
21
+ use_cuda = torch.cuda.is_available()
22
+
23
+
24
def setup_loader(ap, r, verbose=False):
    """Build the in-order data loader used for spectrogram extraction.

    Relies on the module-level config ``c``, the sample list ``meta_data`` and
    ``speaker_manager`` being set before the call.

    Args:
        ap: Audio processor handed to the dataset.
        r: Value forwarded as the dataset's ``outputs_per_step``.
        verbose: Forwarded to the dataset's ``verbose`` flag.

    Returns:
        A ``DataLoader`` over the dataset with shuffling disabled, no sampler
        and the last partial batch kept.
    """
    tokenizer, _ = TTSTokenizer.init_from_config(c)

    dataset_kwargs = dict(
        outputs_per_step=r,
        compute_linear_spec=False,
        samples=meta_data,
        tokenizer=tokenizer,
        ap=ap,
        batch_group_size=0,
        min_text_len=c.min_text_len,
        max_text_len=c.max_text_len,
        min_audio_len=c.min_audio_len,
        max_audio_len=c.max_audio_len,
        phoneme_cache_path=c.phoneme_cache_path,
        precompute_num_workers=0,
        use_noise_augment=False,
        verbose=verbose,
        # Speaker lookups are only passed when the config enables them.
        speaker_id_mapping=speaker_manager.name_to_id if c.use_speaker_embedding else None,
        d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None,
    )
    dataset = TTSDataset(**dataset_kwargs)

    precompute_inputs = c.use_phonemes and c.compute_input_seq_cache
    if precompute_inputs:
        # precompute phonemes to have a better estimate of sequence lengths.
        dataset.compute_input_seq(c.num_loader_workers)
    dataset.preprocess_samples()

    return DataLoader(
        dataset,
        batch_size=c.batch_size,
        shuffle=False,
        collate_fn=dataset.collate_fn,
        drop_last=False,
        sampler=None,
        num_workers=c.num_loader_workers,
        pin_memory=False,
    )
61
+
62
+
63
def set_filename(wav_path, out_path):
    """Derive the output paths for one audio file and create their folders.

    The ``quant``, ``mel``, ``wav_gl`` and ``wav`` sub-directories of
    ``out_path`` are created if missing.

    Args:
        wav_path: Path of the source audio file.
        out_path: Root directory that receives the extracted files.

    Returns:
        Tuple ``(file_name, wavq_path, mel_path, wav_gl_path, wav_path)``.
        ``wavq_path`` and ``mel_path`` carry no extension; the two wav paths
        end in ``.wav``.
    """
    # Strip only the final extension. Splitting on the first dot would turn
    # "spk.0001.wav" into "spk", so dotted names would collide on disk.
    file_name = os.path.splitext(os.path.basename(wav_path))[0]

    for sub_dir in ("quant", "mel", "wav_gl", "wav"):
        os.makedirs(os.path.join(out_path, sub_dir), exist_ok=True)

    wavq_path = os.path.join(out_path, "quant", file_name)
    mel_path = os.path.join(out_path, "mel", file_name)
    wav_gl_path = os.path.join(out_path, "wav_gl", file_name + ".wav")
    out_wav_path = os.path.join(out_path, "wav", file_name + ".wav")
    return file_name, wavq_path, mel_path, wav_gl_path, out_wav_path
75
+
76
+
77
def format_data(data):
    """Unpack a collated batch and move its tensors to the GPU when enabled.

    Args:
        data: Batch dictionary with the keys ``token_id``,
            ``token_id_lengths``, ``mel``, ``mel_lengths``, ``item_idxs``,
            ``d_vectors``, ``speaker_ids`` and ``attns``.

    Returns:
        Tuple ``(text_input, text_lengths, mel_input, mel_lengths,
        speaker_ids, d_vectors, avg_text_length, avg_spec_length, attn_mask,
        item_idx)``. The two averages are means of the float-cast length
        tensors; ``item_idx`` is returned untouched.
    """
    text_input = data["token_id"]
    text_lengths = data["token_id_lengths"]
    mel_input = data["mel"]
    mel_lengths = data["mel_lengths"]
    item_idx = data["item_idxs"]
    d_vectors = data["d_vectors"]
    speaker_ids = data["speaker_ids"]
    attn_mask = data["attns"]

    # Batch statistics are taken before any device transfer.
    avg_text_length = text_lengths.float().mean()
    avg_spec_length = mel_lengths.float().mean()

    if use_cuda:

        def to_gpu(tensor):
            return tensor.cuda(non_blocking=True)

        text_input = to_gpu(text_input)
        text_lengths = to_gpu(text_lengths)
        mel_input = to_gpu(mel_input)
        mel_lengths = to_gpu(mel_lengths)
        # The conditioning tensors are optional and may be None.
        speaker_ids = to_gpu(speaker_ids) if speaker_ids is not None else speaker_ids
        d_vectors = to_gpu(d_vectors) if d_vectors is not None else d_vectors
        attn_mask = to_gpu(attn_mask) if attn_mask is not None else attn_mask

    return (
        text_input,
        text_lengths,
        mel_input,
        mel_lengths,
        speaker_ids,
        d_vectors,
        avg_text_length,
        avg_spec_length,
        attn_mask,
        item_idx,
    )
114
+
115
+
116
@torch.no_grad()
def inference(
    model_name,
    model,
    ap,
    text_input,
    text_lengths,
    mel_input,
    mel_lengths,
    speaker_ids=None,
    d_vectors=None,
):
    """Run one teacher-forced pass and return the spectrograms as numpy.

    Args:
        model_name: Lower-cased model identifier: ``"glow_tts"``,
            ``"tacotron"`` or ``"tacotron2"``.
        model: Model exposing ``inference_with_MAS`` (glow_tts) or a forward
            call (tacotron variants).
        ap: Audio processor; only its ``out_linear_to_mel`` is used, and only
            for ``"tacotron"``.
        text_input: Batch of token ids.
        text_lengths: Length of each token sequence.
        mel_input: Batch of target spectrograms.
        mel_lengths: Length of each target spectrogram.
        speaker_ids: Optional speaker id tensor.
        d_vectors: Optional d-vector tensor.

    Returns:
        The model's ``"model_outputs"`` as a numpy array, batch first.

    Raises:
        ValueError: If ``model_name`` is not one of the supported models.
            Previously this surfaced as an ``UnboundLocalError`` at the final
            return, for tacotron-like names only after a wasted forward pass.
    """
    if model_name == "glow_tts":
        # Speaker ids take precedence over d-vectors as the conditioning input.
        speaker_c = None
        if speaker_ids is not None:
            speaker_c = speaker_ids
        elif d_vectors is not None:
            speaker_c = d_vectors
        outputs = model.inference_with_MAS(
            text_input,
            text_lengths,
            mel_input,
            mel_lengths,
            aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids},
        )
        return outputs["model_outputs"].detach().cpu().numpy()

    if model_name in ("tacotron", "tacotron2"):
        aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
        outputs = model(text_input, text_lengths, mel_input, mel_lengths, aux_input)
        postnet_outputs = outputs["model_outputs"]

        if model_name == "tacotron2":
            return postnet_outputs.detach().cpu().numpy()

        # "tacotron": convert each item with ap.out_linear_to_mel, which is
        # applied to the transposed item and transposed back.
        linear_specs = postnet_outputs.data.cpu().numpy()
        mel_specs = [torch.FloatTensor(ap.out_linear_to_mel(spec.T).T) for spec in linear_specs]
        return torch.stack(mel_specs).cpu().numpy()

    raise ValueError(
        f"Unsupported model '{model_name}': expected 'glow_tts', 'tacotron' or 'tacotron2'."
    )
160
+
161
+
162
def extract_spectrograms(
    data_loader, model, ap, output_path, quantize_bits=0, save_audio=False, debug=False, metada_name="metada.txt"
):
    """Run the model over every batch and write its spectrograms to disk.

    For each item the predicted spectrogram is cut to its ``mel_lengths``
    entry, transposed and stored with ``np.save``. A ``wav|mel.npy`` metadata
    file is written once all batches are processed.

    Args:
        data_loader: Iterable of collated batches accepted by ``format_data``.
        model: Model to run; it is put in eval mode first.
        ap: Audio processor used to load, save and (in debug mode) invert audio.
        output_path: Root directory for all generated files.
        quantize_bits: When positive, also save the wav quantized to this many bits.
        save_audio: When true, save the loaded wav under ``wav``.
        debug: When true, save audio rebuilt from the predicted spectrogram
            with ``ap.inv_melspectrogram`` under ``wav_gl``.
        metada_name: File name of the metadata list inside ``output_path``.
    """
    model.eval()
    export_metadata = []

    for batch in tqdm(data_loader, total=len(data_loader)):
        # Only the inputs needed for inference and the item paths are kept.
        (
            text_input,
            text_lengths,
            mel_input,
            mel_lengths,
            speaker_ids,
            d_vectors,
            _,
            _,
            _,
            item_idx,
        ) = format_data(batch)

        model_output = inference(
            c.model.lower(),
            model,
            ap,
            text_input,
            text_lengths,
            mel_input,
            mel_lengths,
            speaker_ids,
            d_vectors,
        )

        batch_size = text_input.shape[0]
        for idx in range(batch_size):
            wav_file_path = item_idx[idx]
            wav = ap.load_wav(wav_file_path)
            _, wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path)

            # quantize and save wav
            if quantize_bits > 0:
                np.save(wavq_path, quantize(wav, quantize_bits))

            # save TTS mel, cut to this item's length
            mel = model_output[idx][: mel_lengths[idx], :].T
            np.save(mel_path, mel)

            export_metadata.append([wav_file_path, mel_path])
            if save_audio:
                ap.save_wav(wav, wav_path)

            if debug:
                print("Audio for debug saved at:", wav_gl_path)
                ap.save_wav(ap.inv_melspectrogram(mel), wav_gl_path)

    metadata_file = os.path.join(output_path, metada_name)
    with open(metadata_file, "w", encoding="utf-8") as f:
        for wav_file, mel_file in export_metadata:
            # mel_path has no extension; ".npy" is appended for the listing.
            f.write(f"{wav_file}|{mel_file + '.npy'}\n")
222
+
223
+
224
def main(args):  # pylint: disable=redefined-outer-name
    """Load data and model, then extract spectrograms for every sample.

    Sets the module-level ``meta_data`` and ``speaker_manager`` that
    ``setup_loader`` reads, and expects the module-level config ``c`` to exist.

    Args:
        args: Parsed command-line namespace; reads ``eval``,
            ``checkpoint_path``, ``output_path``, ``quantize_bits``,
            ``save_audio`` and ``debug``.
    """
    # pylint: disable=global-variable-undefined
    global meta_data, speaker_manager

    audio_processor = AudioProcessor(**c.audio)

    train_samples, eval_samples = load_tts_samples(
        c.datasets, eval_split=args.eval, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
    )
    # Both partitions are processed together.
    meta_data = train_samples + eval_samples

    # Pick the speaker manager that matches the configured conditioning.
    if c.use_speaker_embedding:
        speaker_manager = SpeakerManager(data_items=meta_data)
    elif c.use_d_vector_file:
        speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file)
    else:
        speaker_manager = None

    model = setup_model(c)
    model.load_checkpoint(c, args.checkpoint_path, eval=True)
    if use_cuda:
        model.cuda()

    print("\n > Model has {} parameters".format(count_parameters(model)), flush=True)

    # glow_tts uses r = 1; other models take it from their decoder.
    if c.model.lower() == "glow_tts":
        r = 1
    else:
        r = model.decoder.r
    loader = setup_loader(audio_processor, r, verbose=True)

    extract_spectrograms(
        loader,
        model,
        audio_processor,
        args.output_path,
        quantize_bits=args.quantize_bits,
        save_audio=args.save_audio,
        debug=args.debug,
        metada_name="metada.txt",
    )
272
+
273
+
274
def str2bool(value):
    """Convert a command-line token into a real boolean.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so any
    non-empty value -- including ``"False"`` -- is parsed as ``True``. This
    converter accepts the usual spellings instead.

    Args:
        value: The raw command-line string (or an already-parsed bool).

    Returns:
        The boolean the text stands for.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in {"true", "t", "yes", "y", "1"}:
        return True
    # The empty string stays falsy, as it was under ``type=bool``.
    if text in {"false", "f", "no", "n", "0", ""}:
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True)
    parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True)
    parser.add_argument("--output_path", type=str, help="Path to save mel specs", required=True)
    parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
    parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
    parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero")
    # str2bool (not bool) so that "--eval False" really disables the eval split.
    parser.add_argument("--eval", type=str2bool, help="compute eval.", default=True)
    args = parser.parse_args()

    # ``c`` is read as a module-level global by the functions of this script.
    c = load_config(args.config_path)
    c.audio.trim_silence = False
    main(args)
TTS/bin/TTS_bin_find_unique_chars (1).py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Find all the unique characters in a dataset"""
2
+ import argparse
3
+ from argparse import RawTextHelpFormatter
4
+
5
+ from TTS.config import load_config
6
+ from TTS.tts.datasets import load_tts_samples
7
+
8
+
9
def main():
    """Print the unique characters found in a dataset's transcripts.

    Parses ``--config_path``, loads the train and eval samples of the
    configured datasets, and prints the set of characters in their ``text``
    fields: all of them, the lowercase ones, and all of them lowercased.
    """
    # pylint: disable=bad-option-value
    parser = argparse.ArgumentParser(
        description="""Find all the unique characters or phonemes in a dataset.\n\n"""
        """
    Example runs:

    python TTS/bin/find_unique_chars.py --config_path config.json
    """,
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
    cli_args = parser.parse_args()

    config = load_config(cli_args.config_path)

    # load all datasets
    train_items, eval_items = load_tts_samples(
        config.datasets,
        eval_split=True,
        eval_split_max_size=config.eval_split_max_size,
        eval_split_size=config.eval_split_size,
    )

    # Concatenate every transcript, then reduce to the distinct characters.
    all_text = "".join(sample["text"] for sample in train_items + eval_items)
    chars = set(all_text)
    lower_chars = [ch for ch in chars if ch.islower()]
    chars_force_lower = {ch.lower() for ch in chars}

    print(f" > Number of unique characters: {len(chars)}")
    print(f" > Unique characters: {''.join(sorted(chars))}")
    print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
    print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")


if __name__ == "__main__":
    main()
TTS/bin/TTS_bin_find_unique_chars.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Find all the unique characters in a dataset"""
2
+ import argparse
3
+ from argparse import RawTextHelpFormatter
4
+
5
+ from TTS.config import load_config
6
+ from TTS.tts.datasets import load_tts_samples
7
+
8
+
9
def main():
    """Print the unique characters found in a dataset's transcripts.

    Parses ``--config_path``, loads the train and eval samples of the
    configured datasets, and prints the set of characters in their ``text``
    fields: all of them, the lowercase ones, and all of them lowercased.
    """
    # pylint: disable=bad-option-value
    parser = argparse.ArgumentParser(
        description="""Find all the unique characters or phonemes in a dataset.\n\n"""
        """
    Example runs:

    python TTS/bin/find_unique_chars.py --config_path config.json
    """,
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
    cli_args = parser.parse_args()

    config = load_config(cli_args.config_path)

    # load all datasets
    train_items, eval_items = load_tts_samples(
        config.datasets,
        eval_split=True,
        eval_split_max_size=config.eval_split_max_size,
        eval_split_size=config.eval_split_size,
    )

    # Concatenate every transcript, then reduce to the distinct characters.
    all_text = "".join(sample["text"] for sample in train_items + eval_items)
    chars = set(all_text)
    lower_chars = [ch for ch in chars if ch.islower()]
    chars_force_lower = {ch.lower() for ch in chars}

    print(f" > Number of unique characters: {len(chars)}")
    print(f" > Unique characters: {''.join(sorted(chars))}")
    print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
    print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")


if __name__ == "__main__":
    main()
TTS/bin/TTS_bin_find_unique_phonemes (1).py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Find all the unique characters in a dataset"""
2
+ import argparse
3
+ import multiprocessing
4
+ from argparse import RawTextHelpFormatter
5
+
6
+ from tqdm.contrib.concurrent import process_map
7
+
8
+ from TTS.config import load_config
9
+ from TTS.tts.datasets import load_tts_samples
10
+ from TTS.tts.utils.text.phonemizers import Gruut
11
+
12
+
13
def compute_phonemes(item):
    """Return the set of symbols in the phonemized text of one sample.

    Uses the module-level ``phonemizer`` and drops every ``"|"`` from its
    output before collecting the distinct characters.

    Args:
        item: Sample dictionary with a ``"text"`` entry.

    Returns:
        Set of single-character strings.
    """
    phonemized = phonemizer.phonemize(item["text"])
    return set(phonemized.replace("|", ""))
17
+
18
+
19
def main():
    """Print the unique phoneme symbols found in a dataset.

    Parses ``--config_path``, loads the train and eval samples, phonemizes
    every transcript in parallel with a ``Gruut`` phonemizer and prints the
    distinct symbols: all of them, the lowercase ones, and all lowercased.

    Raises:
        ValueError: If the config has no ``phoneme_language``, if any sample
            lacks a language, or if the samples use more than one language.
    """
    # pylint: disable=W0601
    global c, phonemizer
    # pylint: disable=bad-option-value
    parser = argparse.ArgumentParser(
        description="""Find all the unique characters or phonemes in a dataset.\n\n"""
        """
    Example runs:

    python TTS/bin/find_unique_phonemes.py --config_path config.json
    """,
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
    cli_args = parser.parse_args()

    c = load_config(cli_args.config_path)

    # load all datasets
    train_items, eval_items = load_tts_samples(
        c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
    )
    items = train_items + eval_items
    print("Num items:", len(items))

    language_list = [sample["language"] for sample in items]
    every_sample_has_language = all(language_list)

    if not (c.phoneme_language and every_sample_has_language):
        raise ValueError("Phoneme language must be defined in config.")

    # Every sample must share the language of the first one.
    if language_list.count(language_list[0]) != len(language_list):
        raise ValueError(
            "Currently, just one phoneme language per config file is supported !! Please split the dataset config into different configs and run it individually for each language !!"
        )

    # compute_phonemes reads this module-level phonemizer.
    phonemizer = Gruut(language=language_list[0], keep_puncs=True)

    per_item_phones = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)

    # Merge the per-sample symbol sets into one.
    phones = set()
    for symbols in per_item_phones:
        phones.update(symbols)

    lower_phones = [ph for ph in phones if ph.islower()]
    phones_force_lower = {ph.lower() for ph in phones}

    print(f" > Number of unique phonemes: {len(phones)}")
    print(f" > Unique phonemes: {''.join(sorted(phones))}")
    print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}")
    print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}")


if __name__ == "__main__":
    main()
TTS/bin/TTS_bin_find_unique_phonemes.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Find all the unique characters in a dataset"""
2
+ import argparse
3
+ import multiprocessing
4
+ from argparse import RawTextHelpFormatter
5
+
6
+ from tqdm.contrib.concurrent import process_map
7
+
8
+ from TTS.config import load_config
9
+ from TTS.tts.datasets import load_tts_samples
10
+ from TTS.tts.utils.text.phonemizers import Gruut
11
+
12
+
13
def compute_phonemes(item):
    """Return the set of symbols in the phonemized text of one sample.

    Uses the module-level ``phonemizer`` and drops every ``"|"`` from its
    output before collecting the distinct characters.

    Args:
        item: Sample dictionary with a ``"text"`` entry.

    Returns:
        Set of single-character strings.
    """
    phonemized = phonemizer.phonemize(item["text"])
    return set(phonemized.replace("|", ""))
17
+
18
+
19
def main():
    """Print the unique phoneme symbols found in a dataset.

    Parses ``--config_path``, loads the train and eval samples, phonemizes
    every transcript in parallel with a ``Gruut`` phonemizer and prints the
    distinct symbols: all of them, the lowercase ones, and all lowercased.

    Raises:
        ValueError: If the config has no ``phoneme_language``, if any sample
            lacks a language, or if the samples use more than one language.
    """
    # pylint: disable=W0601
    global c, phonemizer
    # pylint: disable=bad-option-value
    parser = argparse.ArgumentParser(
        description="""Find all the unique characters or phonemes in a dataset.\n\n"""
        """
    Example runs:

    python TTS/bin/find_unique_phonemes.py --config_path config.json
    """,
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
    cli_args = parser.parse_args()

    c = load_config(cli_args.config_path)

    # load all datasets
    train_items, eval_items = load_tts_samples(
        c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
    )
    items = train_items + eval_items
    print("Num items:", len(items))

    language_list = [sample["language"] for sample in items]
    every_sample_has_language = all(language_list)

    if not (c.phoneme_language and every_sample_has_language):
        raise ValueError("Phoneme language must be defined in config.")

    # Every sample must share the language of the first one.
    if language_list.count(language_list[0]) != len(language_list):
        raise ValueError(
            "Currently, just one phoneme language per config file is supported !! Please split the dataset config into different configs and run it individually for each language !!"
        )

    # compute_phonemes reads this module-level phonemizer.
    phonemizer = Gruut(language=language_list[0], keep_puncs=True)

    per_item_phones = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)

    # Merge the per-sample symbol sets into one.
    phones = set()
    for symbols in per_item_phones:
        phones.update(symbols)

    lower_phones = [ph for ph in phones if ph.islower()]
    phones_force_lower = {ph.lower() for ph in phones}

    print(f" > Number of unique phonemes: {len(phones)}")
    print(f" > Unique phonemes: {''.join(sorted(phones))}")
    print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}")
    print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}")


if __name__ == "__main__":
    main()
TTS/bin/TTS_bin_remove_silence_using_vad (1).py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import glob
3
+ import multiprocessing
4
+ import os
5
+ import pathlib
6
+
7
+ import torch
8
+ from tqdm import tqdm
9
+
10
+ from TTS.utils.vad import get_vad_model_and_utils, remove_silence
11
+
12
+ torch.set_num_threads(1)
13
+
14
+
15
def adjust_path_and_remove_silence(audio_path):
    """Map an input audio path to the output tree and strip its silence.

    Reads the module-level ``args`` and ``model_and_utils``.

    Args:
        audio_path: Path of the audio file to process.

    Returns:
        Tuple ``(output_path, is_speech)``. When the output file already
        exists and ``args.force`` is not set, the file is left alone and
        ``(output_path, False)`` is returned.
    """
    input_prefix = os.path.join(args.input_dir, "")
    output_prefix = os.path.join(args.output_dir, "")
    # Swap only the first occurrence. An unbounded replace also rewrote any
    # later part of the path containing the same text (e.g. "in/spk_in/a.wav"
    # with input dir "in"), producing a corrupted output path.
    output_path = audio_path.replace(input_prefix, output_prefix, 1)

    # ignore if the file exists
    # NOTE(review): the False returned here makes preprocess_audios list the
    # skipped file as "no speech" -- confirm whether that is intended.
    if os.path.exists(output_path) and not args.force:
        return output_path, False

    # create all directory structure
    pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    # remove the silence and save the audio
    output_path, is_speech = remove_silence(
        model_and_utils,
        audio_path,
        output_path,
        trim_just_beginning_and_end=args.trim_just_beginning_and_end,
        use_cuda=args.use_cuda,
    )
    return output_path, is_speech
32
+
33
+
34
def preprocess_audios():
    """Remove silence from every file matched under the input directory.

    Reads the module-level ``args``. Files are processed serially, or with a
    ``multiprocessing.Pool`` when ``args.num_processes`` is above one. The
    output paths reported with a falsy ``is_speech`` are written, one per
    line, to ``filtered_files.txt`` in the output directory.
    """
    files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True))
    print("> Number of files: ", len(files))
    if not args.force:
        print("> Ignoring files that already exist in the output idrectory.")

    print(
        "> Trimming just the beginning and the end with nonspeech parts."
        if args.trim_just_beginning_and_end
        else "> Trimming all nonspeech parts."
    )

    # Guard clause: nothing matched, so there is nothing to process or report.
    if not files:
        print("> No files Found !")
        return

    if args.num_processes > 1:
        with multiprocessing.Pool(processes=args.num_processes) as pool:
            # Results are collected in full before the pool shuts down.
            results = list(
                tqdm(
                    pool.imap_unordered(adjust_path_and_remove_silence, files),
                    total=len(files),
                    desc="Processing audio files",
                )
            )
    else:
        results = [adjust_path_and_remove_silence(path) for path in tqdm(files)]

    filtered_files = [output_path for output_path, is_speech in results if not is_speech]

    # write files that do not have speech
    report_path = os.path.join(args.output_dir, "filtered_files.txt")
    with open(report_path, "w", encoding="utf-8") as f:
        for path in filtered_files:
            f.write(f"{path}\n")
75
+
76
+
77
def str2bool(value):
    """Convert a command-line token into a real boolean.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so any
    non-empty value -- including ``"False"`` -- is parsed as ``True``. This
    converter accepts the usual spellings instead.

    Args:
        value: The raw command-line string (or an already-parsed bool).

    Returns:
        The boolean the text stands for.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in {"true", "t", "yes", "y", "1"}:
        return True
    # The empty string stays falsy, as it was under ``type=bool``.
    if text in {"false", "f", "no", "n", "0", ""}:
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
    )
    parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True)
    parser.add_argument("-o", "--output_dir", type=str, help="Output Dataset dir", default="")
    parser.add_argument("-f", "--force", default=False, action="store_true", help="Force the replace of exists files")
    parser.add_argument(
        "-g",
        "--glob",
        type=str,
        default="**/*.wav",
        help="path in glob format for acess wavs from input_dir. ex: wav48/*/*.wav",
    )
    # The boolean options use str2bool: with type=bool, "False" parsed as True.
    parser.add_argument(
        "-t",
        "--trim_just_beginning_and_end",
        type=str2bool,
        default=True,
        help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trim. Default True",
    )
    parser.add_argument(
        "-c",
        "--use_cuda",
        type=str2bool,
        default=False,
        help="If True use cuda",
    )
    parser.add_argument(
        "--use_onnx",
        type=str2bool,
        default=False,
        help="If True use onnx",
    )
    parser.add_argument(
        "--num_processes",
        type=int,
        default=1,
        help="Number of processes to use",
    )
    args = parser.parse_args()

    # Without an explicit output directory the files are written in place.
    if args.output_dir == "":
        args.output_dir = args.input_dir

    # load the model and utils
    model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda, use_onnx=args.use_onnx)
    preprocess_audios()
TTS/bin/TTS_bin_remove_silence_using_vad.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import glob
3
+ import multiprocessing
4
+ import os
5
+ import pathlib
6
+
7
+ import torch
8
+ from tqdm import tqdm
9
+
10
+ from TTS.utils.vad import get_vad_model_and_utils, remove_silence
11
+
12
+ torch.set_num_threads(1)
13
+
14
+
15
def adjust_path_and_remove_silence(audio_path):
    """Strip silence from one audio file and write it under the output dir.

    Reads the module-level ``args`` and ``model_and_utils`` created by the
    script entry point.

    Args:
        audio_path: Path of the input file, located below ``args.input_dir``.

    Returns:
        A ``(output_path, is_speech)`` tuple. ``is_speech`` is the flag
        reported by ``remove_silence``; it is ``False`` (and nothing is
        processed) when the output file already exists and ``--force`` was
        not given.
    """
    # The trailing "" makes os.path.join append a path separator to each root.
    source_root = os.path.join(args.input_dir, "")
    target_root = os.path.join(args.output_dir, "")
    output_path = audio_path.replace(source_root, target_root)

    # Skip files that were already produced unless overwriting is forced.
    already_there = os.path.exists(output_path)
    if already_there and not args.force:
        return output_path, False

    # Make sure the mirrored directory structure exists.
    destination = pathlib.Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    # Remove the silence and save the audio.
    vad_result = remove_silence(
        model_and_utils,
        audio_path,
        output_path,
        trim_just_beginning_and_end=args.trim_just_beginning_and_end,
        use_cuda=args.use_cuda,
    )
    output_path, is_speech = vad_result
    return output_path, is_speech
32
+
33
+
34
def preprocess_audios():
    """Run silence removal over every file matched by ``args.glob``.

    Files are processed either sequentially or through a process pool
    (``args.num_processes > 1``). The output paths of all files reported as
    not containing speech are written, one per line, to
    ``filtered_files.txt`` inside ``args.output_dir``.
    """
    search_pattern = os.path.join(args.input_dir, args.glob)
    files = sorted(glob.glob(search_pattern, recursive=True))
    print("> Number of files: ", len(files))
    if not args.force:
        print("> Ignoring files that already exist in the output idrectory.")

    trimming_notice = (
        "> Trimming just the beginning and the end with nonspeech parts."
        if args.trim_just_beginning_and_end
        else "> Trimming all nonspeech parts."
    )
    print(trimming_notice)

    # Guard clause: nothing matched, nothing to do.
    if not files:
        print("> No files Found !")
        return

    if args.num_processes > 1:
        with multiprocessing.Pool(processes=args.num_processes) as pool:
            progress = tqdm(
                pool.imap_unordered(adjust_path_and_remove_silence, files),
                total=len(files),
                desc="Processing audio files",
            )
            outcomes = list(progress)
    else:
        outcomes = [adjust_path_and_remove_silence(path) for path in tqdm(files)]

    # Keep only the outputs flagged as "no speech".
    filtered_files = [out_path for out_path, has_speech in outcomes if not has_speech]

    # write files that do not have speech
    report_path = os.path.join(args.output_dir, "filtered_files.txt")
    with open(report_path, "w", encoding="utf-8") as report:
        for entry in filtered_files:
            report.write(str(entry) + "\n")
75
+
76
+
77
if __name__ == "__main__":

    def _str2bool(value):
        """Convert a command-line token into a real boolean.

        ``type=bool`` must not be used with argparse: ``bool("False")`` is
        ``True`` because every non-empty string is truthy, so such an option
        could never be switched off from the command line.

        Raises:
            argparse.ArgumentTypeError: if the token is not a recognised
                spelling of true/false.
        """
        if isinstance(value, bool):
            return value
        token = value.strip().lower()
        if token in ("yes", "true", "t", "y", "1"):
            return True
        if token in ("no", "false", "f", "n", "0"):
            return False
        raise argparse.ArgumentTypeError("Boolean value expected.")

    parser = argparse.ArgumentParser(
        description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
    )
    parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True)
    parser.add_argument("-o", "--output_dir", type=str, help="Output Dataset dir", default="")
    parser.add_argument("-f", "--force", default=False, action="store_true", help="Force the replace of exists files")
    parser.add_argument(
        "-g",
        "--glob",
        type=str,
        default="**/*.wav",
        help="path in glob format for acess wavs from input_dir. ex: wav48/*/*.wav",
    )
    parser.add_argument(
        "-t",
        "--trim_just_beginning_and_end",
        type=_str2bool,
        default=True,
        help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trim. Default True",
    )
    parser.add_argument(
        "-c",
        "--use_cuda",
        type=_str2bool,
        default=False,
        help="If True use cuda",
    )
    parser.add_argument(
        "--use_onnx",
        type=_str2bool,
        default=False,
        help="If True use onnx",
    )
    parser.add_argument(
        "--num_processes",
        type=int,
        default=1,
        help="Number of processes to use",
    )
    # `args` and `model_and_utils` are deliberately module-level: the worker
    # functions of this script read them as globals.
    args = parser.parse_args()

    # An empty output dir means "process in place".
    if args.output_dir == "":
        args.output_dir = args.input_dir

    # load the model and utils
    model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda, use_onnx=args.use_onnx)
    preprocess_audios()
TTS/bin/TTS_bin_resample (1).py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import glob
3
+ import os
4
+ from argparse import RawTextHelpFormatter
5
+ from multiprocessing import Pool
6
+ from shutil import copytree
7
+
8
+ import librosa
9
+ import soundfile as sf
10
+ from tqdm import tqdm
11
+
12
+
13
def resample_file(func_args):
    """Resample a single audio file and overwrite it in place.

    Args:
        func_args: ``(filename, output_sr)`` pair, packed into one argument
            so the function can be mapped over a process pool.
    """
    path, target_sr = func_args
    # Loading with ``sr=`` set asks librosa for audio at that sample rate.
    samples, loaded_sr = librosa.load(path, sr=target_sr)
    sf.write(path, samples, loaded_sr)
17
+
18
+
19
def resample_files(input_dir, output_sr, output_dir=None, file_ext="wav", n_jobs=10):
    """Resample every matching audio file below a directory.

    Args:
        input_dir: Root folder that is searched recursively.
        output_sr: Sample rate handed to ``resample_file`` for each file.
        output_dir: If given, ``input_dir`` is first copied there and the
            copy is resampled; otherwise files are rewritten in place.
        file_ext: Extension (without the dot) of the files to process.
        n_jobs: Number of worker processes passed to ``Pool``.
    """
    working_dir = input_dir
    if output_dir:
        print("Recursively copying the input folder...")
        copytree(input_dir, output_dir)
        working_dir = output_dir

    print("Resampling the audio files...")
    pattern = os.path.join(working_dir, f"**/*.{file_ext}")
    matches = glob.glob(pattern, recursive=True)
    print(f"Found {len(matches)} files...")

    # Each job carries its own copy of the target rate (one argument per call).
    jobs = [(path, output_sr) for path in matches]
    with Pool(processes=n_jobs) as pool, tqdm(total=len(jobs)) as pbar:
        for _ in pool.imap_unordered(resample_file, jobs):
            pbar.update()

    print("Done !")
35
+
36
+
37
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="""Resample a folder recusively with librosa
Can be used in place or create a copy of the folder as an output.\n\n
Example run:
python TTS/bin/resample.py
--input_dir /root/LJSpeech-1.1/
--output_sr 22050
--output_dir /root/resampled_LJSpeech-1.1/
--file_ext wav
--n_jobs 24
""",
        formatter_class=RawTextHelpFormatter,
    )

    # Command-line options, registered in declaration order.
    option_table = (
        (
            "--input_dir",
            {
                "type": str,
                "default": None,
                "required": True,
                "help": "Path of the folder containing the audio files to resample",
            },
        ),
        (
            "--output_sr",
            {
                "type": int,
                "default": 22050,
                "required": False,
                "help": "Samlple rate to which the audio files should be resampled",
            },
        ),
        (
            "--output_dir",
            {
                "type": str,
                "default": None,
                "required": False,
                "help": "Path of the destination folder. If not defined, the operation is done in place",
            },
        ),
        (
            "--file_ext",
            {
                "type": str,
                "default": "wav",
                "required": False,
                "help": "Extension of the audio files to resample",
            },
        ),
        (
            "--n_jobs",
            {
                "type": int,
                "default": None,
                "help": "Number of threads to use, by default it uses all cores",
            },
        ),
    )
    for flag, spec in option_table:
        parser.add_argument(flag, **spec)

    args = parser.parse_args()

    resample_files(args.input_dir, args.output_sr, args.output_dir, args.file_ext, args.n_jobs)
TTS/bin/TTS_bin_resample.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import glob
3
+ import os
4
+ from argparse import RawTextHelpFormatter
5
+ from multiprocessing import Pool
6
+ from shutil import copytree
7
+
8
+ import librosa
9
+ import soundfile as sf
10
+ from tqdm import tqdm
11
+
12
+
13
def resample_file(func_args):
    """Resample a single audio file and overwrite it in place.

    Args:
        func_args: ``(filename, output_sr)`` pair, packed into one argument
            so the function can be mapped over a process pool.
    """
    path, target_sr = func_args
    # Loading with ``sr=`` set asks librosa for audio at that sample rate.
    samples, loaded_sr = librosa.load(path, sr=target_sr)
    sf.write(path, samples, loaded_sr)
17
+
18
+
19
def resample_files(input_dir, output_sr, output_dir=None, file_ext="wav", n_jobs=10):
    """Resample every matching audio file below a directory.

    Args:
        input_dir: Root folder that is searched recursively.
        output_sr: Sample rate handed to ``resample_file`` for each file.
        output_dir: If given, ``input_dir`` is first copied there and the
            copy is resampled; otherwise files are rewritten in place.
        file_ext: Extension (without the dot) of the files to process.
        n_jobs: Number of worker processes passed to ``Pool``.
    """
    working_dir = input_dir
    if output_dir:
        print("Recursively copying the input folder...")
        copytree(input_dir, output_dir)
        working_dir = output_dir

    print("Resampling the audio files...")
    pattern = os.path.join(working_dir, f"**/*.{file_ext}")
    matches = glob.glob(pattern, recursive=True)
    print(f"Found {len(matches)} files...")

    # Each job carries its own copy of the target rate (one argument per call).
    jobs = [(path, output_sr) for path in matches]
    with Pool(processes=n_jobs) as pool, tqdm(total=len(jobs)) as pbar:
        for _ in pool.imap_unordered(resample_file, jobs):
            pbar.update()

    print("Done !")
35
+
36
+
37
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="""Resample a folder recusively with librosa
Can be used in place or create a copy of the folder as an output.\n\n
Example run:
python TTS/bin/resample.py
--input_dir /root/LJSpeech-1.1/
--output_sr 22050
--output_dir /root/resampled_LJSpeech-1.1/
--file_ext wav
--n_jobs 24
""",
        formatter_class=RawTextHelpFormatter,
    )

    # Command-line options, registered in declaration order.
    option_table = (
        (
            "--input_dir",
            {
                "type": str,
                "default": None,
                "required": True,
                "help": "Path of the folder containing the audio files to resample",
            },
        ),
        (
            "--output_sr",
            {
                "type": int,
                "default": 22050,
                "required": False,
                "help": "Samlple rate to which the audio files should be resampled",
            },
        ),
        (
            "--output_dir",
            {
                "type": str,
                "default": None,
                "required": False,
                "help": "Path of the destination folder. If not defined, the operation is done in place",
            },
        ),
        (
            "--file_ext",
            {
                "type": str,
                "default": "wav",
                "required": False,
                "help": "Extension of the audio files to resample",
            },
        ),
        (
            "--n_jobs",
            {
                "type": int,
                "default": None,
                "help": "Number of threads to use, by default it uses all cores",
            },
        ),
    )
    for flag, spec in option_table:
        parser.add_argument(flag, **spec)

    args = parser.parse_args()

    resample_files(args.input_dir, args.output_sr, args.output_dir, args.file_ext, args.n_jobs)
TTS/bin/TTS_bin_synthesize (1).py ADDED
@@ -0,0 +1,541 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import contextlib
6
+ import sys
7
+ from argparse import RawTextHelpFormatter
8
+
9
+ # pylint: disable=redefined-outer-name, unused-argument
10
+ from pathlib import Path
11
+
12
+ description = """
13
+ Synthesize speech on command line.
14
+
15
+ You can either use your trained model or choose a model from the provided list.
16
+
17
+ If you don't specify any models, then it uses LJSpeech based English model.
18
+
19
+ #### Single Speaker Models
20
+
21
+ - List provided models:
22
+
23
+ ```
24
+ $ tts --list_models
25
+ ```
26
+
27
+ - Get model info (for both tts_models and vocoder_models):
28
+
29
+ - Query by type/name:
30
+ The model_info_by_name uses the name as it from the --list_models.
31
+ ```
32
+ $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
33
+ ```
34
+ For example:
35
+ ```
36
+ $ tts --model_info_by_name tts_models/tr/common-voice/glow-tts
37
+ $ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2
38
+ ```
39
+ - Query by type/idx:
40
+ The model_query_idx uses the corresponding idx from --list_models.
41
+
42
+ ```
43
+ $ tts --model_info_by_idx "<model_type>/<model_query_idx>"
44
+ ```
45
+
46
+ For example:
47
+
48
+ ```
49
+ $ tts --model_info_by_idx tts_models/3
50
+ ```
51
+
52
+ - Query info for model info by full name:
53
+ ```
54
+ $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
55
+ ```
56
+
57
+ - Run TTS with default models:
58
+
59
+ ```
60
+ $ tts --text "Text for TTS" --out_path output/path/speech.wav
61
+ ```
62
+
63
+ - Run TTS and pipe out the generated TTS wav file data:
64
+
65
+ ```
66
+ $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
67
+ ```
68
+
69
+ - Run TTS and define speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0:
70
+
71
+ ```
72
+ $ tts --text "Text for TTS" --model_name "coqui_studio/<language>/<dataset>/<model_name>" --speed 1.2 --out_path output/path/speech.wav
73
+ ```
74
+
75
+ - Run a TTS model with its default vocoder model:
76
+
77
+ ```
78
+ $ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
79
+ ```
80
+
81
+ For example:
82
+
83
+ ```
84
+ $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav
85
+ ```
86
+
87
+ - Run with specific TTS and vocoder models from the list:
88
+
89
+ ```
90
+ $ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --vocoder_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
91
+ ```
92
+
93
+ For example:
94
+
95
+ ```
96
+ $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav
97
+ ```
98
+
99
+ - Run your own TTS model (Using Griffin-Lim Vocoder):
100
+
101
+ ```
102
+ $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
103
+ ```
104
+
105
+ - Run your own TTS and Vocoder models:
106
+
107
+ ```
108
+ $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
109
+ --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
110
+ ```
111
+
112
+ #### Multi-speaker Models
113
+
114
+ - List the available speakers and choose a <speaker_id> among them:
115
+
116
+ ```
117
+ $ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
118
+ ```
119
+
120
+ - Run the multi-speaker TTS model with the target speaker ID:
121
+
122
+ ```
123
+ $ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
124
+ ```
125
+
126
+ - Run your own multi-speaker TTS model:
127
+
128
+ ```
129
+ $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
130
+ ```
131
+
132
+ ### Voice Conversion Models
133
+
134
+ ```
135
+ $ tts --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --source_wav <path/to/speaker/wav> --target_wav <path/to/reference/wav>
136
+ ```
137
+ """
138
+
139
+
140
def str2bool(v):
    """Interpret a command-line token as a boolean.

    Args:
        v: Either an actual ``bool`` (returned unchanged) or a string.

    Returns:
        ``True`` for "yes", "true", "t", "y", "1" and ``False`` for
        "no", "false", "f", "n", "0" (case-insensitive).

    Raises:
        argparse.ArgumentTypeError: for any other string.
    """
    if isinstance(v, bool):
        return v

    lowered = v.lower()
    spellings_by_value = (
        (True, ("yes", "true", "t", "y", "1")),
        (False, ("no", "false", "f", "n", "0")),
    )
    for value, spellings in spellings_by_value:
        if lowered in spellings:
            return value
    raise argparse.ArgumentTypeError("Boolean value expected.")
148
+
149
+
150
def main():
    """Entry point of the ``tts`` command-line tool.

    Parses the command line, then performs exactly one action: list the
    available models, print model info, synthesize through a Coqui Studio
    model, list speaker/language ids of a model, or run text-to-speech /
    voice conversion and save the resulting wav to ``--out_path``.

    Boolean options are parsed with ``str2bool``. ``--use_cuda`` and
    ``--save_spectogram`` used ``type=bool`` before, which turned every
    non-empty value (including "False") into ``True``; they can now also be
    given as bare flags.
    """
    parser = argparse.ArgumentParser(
        description=description.replace(" ```\n", ""),
        formatter_class=RawTextHelpFormatter,
    )

    parser.add_argument(
        "--list_models",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="list available pre-trained TTS and vocoder models.",
    )

    parser.add_argument(
        "--model_info_by_idx",
        type=str,
        default=None,
        help="model info using query format: <model_type>/<model_query_idx>",
    )

    parser.add_argument(
        "--model_info_by_name",
        type=str,
        default=None,
        help="model info using query format: <model_type>/<language>/<dataset>/<model_name>",
    )

    parser.add_argument("--text", type=str, default=None, help="Text to generate speech.")

    # Args for running pre-trained TTS models.
    parser.add_argument(
        "--model_name",
        type=str,
        default="tts_models/en/ljspeech/tacotron2-DDC",
        help="Name of one of the pre-trained TTS models in format <language>/<dataset>/<model_name>",
    )
    parser.add_argument(
        "--vocoder_name",
        type=str,
        default=None,
        help="Name of one of the pre-trained vocoder models in format <language>/<dataset>/<model_name>",
    )

    # Args for running custom models
    parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
    parser.add_argument(
        "--model_path",
        type=str,
        default=None,
        help="Path to model file.",
    )
    parser.add_argument(
        "--out_path",
        type=str,
        default="tts_output.wav",
        help="Output wav file path.",
    )
    # str2bool instead of bool: bool("False") would evaluate to True.
    parser.add_argument(
        "--use_cuda",
        type=str2bool,
        nargs="?",
        const=True,
        help="Run model on CUDA.",
        default=False,
    )
    parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu")
    parser.add_argument(
        "--vocoder_path",
        type=str,
        help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
        default=None,
    )
    parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
    parser.add_argument(
        "--encoder_path",
        type=str,
        help="Path to speaker encoder model file.",
        default=None,
    )
    parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)

    # args for coqui studio
    parser.add_argument(
        "--cs_model",
        type=str,
        help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `V1`.",
    )
    parser.add_argument(
        "--emotion",
        type=str,
        help="Emotion to condition the model with. Only available for 🐸Coqui Studio `V1` model.",
        default=None,
    )
    parser.add_argument(
        "--language",
        type=str,
        help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS` model.",
        default=None,
    )
    parser.add_argument(
        "--pipe_out",
        help="stdout the generated TTS wav file for shell pipe.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )
    parser.add_argument(
        "--speed",
        type=float,
        help="Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0.",
        default=None,
    )

    # args for multi-speaker synthesis
    parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
    parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
    parser.add_argument(
        "--speaker_idx",
        type=str,
        help="Target speaker ID for a multi-speaker TTS model.",
        default=None,
    )
    parser.add_argument(
        "--language_idx",
        type=str,
        help="Target language ID for a multi-lingual TTS model.",
        default=None,
    )
    parser.add_argument(
        "--speaker_wav",
        nargs="+",
        help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.",
        default=None,
    )
    parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
    parser.add_argument(
        "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
    )
    parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
    parser.add_argument(
        "--list_speaker_idxs",
        help="List available speaker ids for the defined multi-speaker model.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )
    parser.add_argument(
        "--list_language_idxs",
        help="List available language ids for the defined multi-lingual model.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )
    # aux args
    # str2bool instead of bool: bool("False") would evaluate to True.
    parser.add_argument(
        "--save_spectogram",
        type=str2bool,
        nargs="?",
        const=True,
        help="If true save raw spectogram for further (vocoder) processing in out_path.",
        default=False,
    )
    parser.add_argument(
        "--reference_wav",
        type=str,
        help="Reference wav file to convert in the voice of the speaker_idx or speaker_wav",
        default=None,
    )
    parser.add_argument(
        "--reference_speaker_idx",
        type=str,
        help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
        default=None,
    )
    parser.add_argument(
        "--progress_bar",
        type=str2bool,
        help="If true shows a progress bar for the model download. Defaults to True",
        default=True,
    )

    # voice conversion args
    parser.add_argument(
        "--source_wav",
        type=str,
        default=None,
        help="Original audio file to convert in the voice of the target_wav",
    )
    parser.add_argument(
        "--target_wav",
        type=str,
        default=None,
        help="Target audio file to convert in the voice of the source_wav",
    )

    parser.add_argument(
        "--voice_dir",
        type=str,
        default=None,
        help="Voice dir for tortoise model",
    )

    args = parser.parse_args()

    # Show the help text (and exit) when none of the action-selecting
    # arguments was supplied.
    check_args = [
        args.text,
        args.list_models,
        args.list_speaker_idxs,
        args.list_language_idxs,
        args.reference_wav,
        args.model_info_by_idx,
        args.model_info_by_name,
        args.source_wav,
        args.target_wav,
    ]
    if not any(check_args):
        parser.parse_args(["-h"])

    # With --pipe_out the wav bytes go to the real stdout, so every regular
    # print below is silenced by redirecting stdout to None.
    pipe_out = sys.stdout if args.pipe_out else None

    with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout):
        # Late-import to make things load faster
        from TTS.api import TTS
        from TTS.utils.manage import ModelManager
        from TTS.utils.synthesizer import Synthesizer

        # load model manager
        path = Path(__file__).parent / "../.models.json"
        manager = ModelManager(path, progress_bar=args.progress_bar)
        api = TTS()

        tts_path = None
        tts_config_path = None
        speakers_file_path = None
        language_ids_file_path = None
        vocoder_path = None
        vocoder_config_path = None
        encoder_path = None
        encoder_config_path = None
        vc_path = None
        vc_config_path = None
        model_dir = None

        # CASE1 #list : list pre-trained TTS models
        if args.list_models:
            manager.add_cs_api_models(api.list_models())
            manager.list_models()
            sys.exit()

        # CASE2 #info : model info for pre-trained TTS models
        if args.model_info_by_idx:
            model_query = args.model_info_by_idx
            manager.model_info_by_idx(model_query)
            sys.exit()

        if args.model_info_by_name:
            model_query_full_name = args.model_info_by_name
            manager.model_info_by_full_name(model_query_full_name)
            sys.exit()

        # CASE3: TTS with coqui studio models
        if "coqui_studio" in args.model_name:
            print(" > Using 🐸Coqui Studio model: ", args.model_name)
            api = TTS(model_name=args.model_name, cs_api_model=args.cs_model)
            api.tts_to_file(
                text=args.text,
                emotion=args.emotion,
                file_path=args.out_path,
                language=args.language,
                speed=args.speed,
                pipe_out=pipe_out,
            )
            print(" > Saving output to ", args.out_path)
            return

        # CASE4: load pre-trained model paths
        if args.model_name is not None and not args.model_path:
            model_path, config_path, model_item = manager.download_model(args.model_name)
            # tts model
            if model_item["model_type"] == "tts_models":
                tts_path = model_path
                tts_config_path = config_path
                if "default_vocoder" in model_item:
                    # An explicitly requested vocoder wins over the model's default.
                    args.vocoder_name = (
                        model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
                    )

            # voice conversion model
            if model_item["model_type"] == "voice_conversion_models":
                vc_path = model_path
                vc_config_path = config_path

            # tts model with multiple files to be loaded from the directory path
            if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list):
                model_dir = model_path
                tts_path = None
                tts_config_path = None
                args.vocoder_name = None

        # load vocoder
        if args.vocoder_name is not None and not args.vocoder_path:
            vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)

        # CASE5: set custom model paths
        if args.model_path is not None:
            tts_path = args.model_path
            tts_config_path = args.config_path
            speakers_file_path = args.speakers_file_path
            language_ids_file_path = args.language_ids_file_path

        if args.vocoder_path is not None:
            vocoder_path = args.vocoder_path
            vocoder_config_path = args.vocoder_config_path

        if args.encoder_path is not None:
            encoder_path = args.encoder_path
            encoder_config_path = args.encoder_config_path

        # --use_cuda overrides whatever --device says.
        device = args.device
        if args.use_cuda:
            device = "cuda"

        # load models
        synthesizer = Synthesizer(
            tts_path,
            tts_config_path,
            speakers_file_path,
            language_ids_file_path,
            vocoder_path,
            vocoder_config_path,
            encoder_path,
            encoder_config_path,
            vc_path,
            vc_config_path,
            model_dir,
            args.voice_dir,
        ).to(device)

        # query speaker ids of a multi-speaker model.
        if args.list_speaker_idxs:
            print(
                " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
            )
            print(synthesizer.tts_model.speaker_manager.name_to_id)
            return

        # query langauge ids of a multi-lingual model.
        if args.list_language_idxs:
            print(
                " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
            )
            print(synthesizer.tts_model.language_manager.name_to_id)
            return

        # check the arguments against a multi-speaker model.
        if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
            print(
                " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
                "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
            )
            return

        # RUN THE SYNTHESIS
        if args.text:
            print(" > Text: {}".format(args.text))

        # kick it
        if tts_path is not None:
            wav = synthesizer.tts(
                args.text,
                speaker_name=args.speaker_idx,
                language_name=args.language_idx,
                speaker_wav=args.speaker_wav,
                reference_wav=args.reference_wav,
                style_wav=args.capacitron_style_wav,
                style_text=args.capacitron_style_text,
                reference_speaker_name=args.reference_speaker_idx,
            )
        elif vc_path is not None:
            wav = synthesizer.voice_conversion(
                source_wav=args.source_wav,
                target_wav=args.target_wav,
            )
        elif model_dir is not None:
            wav = synthesizer.tts(
                args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav
            )
        else:
            # Without this branch `wav` would be unbound below and the tool
            # would crash with an UnboundLocalError. sys.exit(str) reports on
            # stderr, which is still visible when stdout is redirected.
            sys.exit(" [!] No TTS or voice conversion model was selected; nothing to synthesize.")

        # save the results
        print(" > Saving output to {}".format(args.out_path))
        synthesizer.save_wav(wav, args.out_path, pipe_out=pipe_out)
538
+
539
+
540
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
TTS/bin/TTS_bin_synthesize.py ADDED
@@ -0,0 +1,541 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import contextlib
6
+ import sys
7
+ from argparse import RawTextHelpFormatter
8
+
9
+ # pylint: disable=redefined-outer-name, unused-argument
10
+ from pathlib import Path
11
+
12
+ description = """
13
+ Synthesize speech on command line.
14
+
15
+ You can either use your trained model or choose a model from the provided list.
16
+
17
+ If you don't specify any models, then it uses LJSpeech based English model.
18
+
19
+ #### Single Speaker Models
20
+
21
+ - List provided models:
22
+
23
+ ```
24
+ $ tts --list_models
25
+ ```
26
+
27
+ - Get model info (for both tts_models and vocoder_models):
28
+
29
+ - Query by type/name:
30
+ The model_info_by_name uses the name as it from the --list_models.
31
+ ```
32
+ $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
33
+ ```
34
+ For example:
35
+ ```
36
+ $ tts --model_info_by_name tts_models/tr/common-voice/glow-tts
37
+ $ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2
38
+ ```
39
+ - Query by type/idx:
40
+ The model_query_idx uses the corresponding idx from --list_models.
41
+
42
+ ```
43
+ $ tts --model_info_by_idx "<model_type>/<model_query_idx>"
44
+ ```
45
+
46
+ For example:
47
+
48
+ ```
49
+ $ tts --model_info_by_idx tts_models/3
50
+ ```
51
+
52
+ - Query info for model info by full name:
53
+ ```
54
+ $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
55
+ ```
56
+
57
+ - Run TTS with default models:
58
+
59
+ ```
60
+ $ tts --text "Text for TTS" --out_path output/path/speech.wav
61
+ ```
62
+
63
+ - Run TTS and pipe out the generated TTS wav file data:
64
+
65
+ ```
66
+ $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
67
+ ```
68
+
69
+ - Run TTS and define speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0:
70
+
71
+ ```
72
+ $ tts --text "Text for TTS" --model_name "coqui_studio/<language>/<dataset>/<model_name>" --speed 1.2 --out_path output/path/speech.wav
73
+ ```
74
+
75
+ - Run a TTS model with its default vocoder model:
76
+
77
+ ```
78
+ $ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
79
+ ```
80
+
81
+ For example:
82
+
83
+ ```
84
+ $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav
85
+ ```
86
+
87
+ - Run with specific TTS and vocoder models from the list:
88
+
89
+ ```
90
+ $ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --vocoder_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
91
+ ```
92
+
93
+ For example:
94
+
95
+ ```
96
+ $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav
97
+ ```
98
+
99
+ - Run your own TTS model (Using Griffin-Lim Vocoder):
100
+
101
+ ```
102
+ $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
103
+ ```
104
+
105
+ - Run your own TTS and Vocoder models:
106
+
107
+ ```
108
+ $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
109
+ --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
110
+ ```
111
+
112
+ #### Multi-speaker Models
113
+
114
+ - List the available speakers and choose a <speaker_id> among them:
115
+
116
+ ```
117
+ $ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
118
+ ```
119
+
120
+ - Run the multi-speaker TTS model with the target speaker ID:
121
+
122
+ ```
123
+ $ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
124
+ ```
125
+
126
+ - Run your own multi-speaker TTS model:
127
+
128
+ ```
129
+ $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
130
+ ```
131
+
132
+ ### Voice Conversion Models
133
+
134
+ ```
135
+ $ tts --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --source_wav <path/to/speaker/wav> --target_wav <path/to/reference/wav>
136
+ ```
137
+ """
138
+
139
+
140
def str2bool(v):
    """Convert a command line token into a boolean.

    Booleans are returned unchanged. Strings are compared case-insensitively
    against the accepted spellings of true and false.

    Raises:
        argparse.ArgumentTypeError: If the string is not a recognised spelling.
    """
    if isinstance(v, bool):
        return v
    token = v.lower()
    spellings = (
        (True, ("yes", "true", "t", "y", "1")),
        (False, ("no", "false", "f", "n", "0")),
    )
    for outcome, accepted in spellings:
        if token in accepted:
            return outcome
    raise argparse.ArgumentTypeError("Boolean value expected.")
148
+
149
+
150
def main():
    """Entry point of the ``tts`` command line tool.

    Parses the command line and then, depending on the flags:

    * prints the model list or the info of one model and exits,
    * synthesizes through a Coqui Studio model when ``--model_name`` contains
      ``coqui_studio``,
    * otherwise builds a ``Synthesizer`` from downloaded and/or custom model
      files and either lists speaker/language ids or writes the generated wav
      to ``--out_path`` (and to stdout when ``--pipe_out`` is set).

    When none of the "action" arguments is given, the help text is printed.
    """
    parser = argparse.ArgumentParser(
        description=description.replace(" ```\n", ""),
        formatter_class=RawTextHelpFormatter,
    )

    parser.add_argument(
        "--list_models",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="list available pre-trained TTS and vocoder models.",
    )

    parser.add_argument(
        "--model_info_by_idx",
        type=str,
        default=None,
        help="model info using query format: <model_type>/<model_query_idx>",
    )

    parser.add_argument(
        "--model_info_by_name",
        type=str,
        default=None,
        help="model info using query format: <model_type>/<language>/<dataset>/<model_name>",
    )

    parser.add_argument("--text", type=str, default=None, help="Text to generate speech.")

    # Args for running pre-trained TTS models.
    parser.add_argument(
        "--model_name",
        type=str,
        default="tts_models/en/ljspeech/tacotron2-DDC",
        help="Name of one of the pre-trained TTS models in format <language>/<dataset>/<model_name>",
    )
    parser.add_argument(
        "--vocoder_name",
        type=str,
        default=None,
        help="Name of one of the pre-trained vocoder models in format <language>/<dataset>/<model_name>",
    )

    # Args for running custom models
    parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
    parser.add_argument(
        "--model_path",
        type=str,
        default=None,
        help="Path to model file.",
    )
    parser.add_argument(
        "--out_path",
        type=str,
        default="tts_output.wav",
        help="Output wav file path.",
    )
    # ``type=bool`` would call bool() on the raw string, so every non-empty
    # value (even "false") would enable the flag. Parse it with ``str2bool``
    # like the other boolean switches; a bare ``--use_cuda`` means True.
    parser.add_argument(
        "--use_cuda",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
        help="Run model on CUDA.",
    )
    parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu")
    parser.add_argument(
        "--vocoder_path",
        type=str,
        help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
        default=None,
    )
    parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
    parser.add_argument(
        "--encoder_path",
        type=str,
        help="Path to speaker encoder model file.",
        default=None,
    )
    parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)

    # args for coqui studio
    parser.add_argument(
        "--cs_model",
        type=str,
        help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `V1`.",
    )
    parser.add_argument(
        "--emotion",
        type=str,
        help="Emotion to condition the model with. Only available for 🐸Coqui Studio `V1` model.",
        default=None,
    )
    parser.add_argument(
        "--language",
        type=str,
        help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS` model.",
        default=None,
    )
    parser.add_argument(
        "--pipe_out",
        help="stdout the generated TTS wav file for shell pipe.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )
    parser.add_argument(
        "--speed",
        type=float,
        help="Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0.",
        default=None,
    )

    # args for multi-speaker synthesis
    parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
    parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
    parser.add_argument(
        "--speaker_idx",
        type=str,
        help="Target speaker ID for a multi-speaker TTS model.",
        default=None,
    )
    parser.add_argument(
        "--language_idx",
        type=str,
        help="Target language ID for a multi-lingual TTS model.",
        default=None,
    )
    parser.add_argument(
        "--speaker_wav",
        nargs="+",
        help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.",
        default=None,
    )
    parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
    parser.add_argument(
        "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
    )
    parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
    parser.add_argument(
        "--list_speaker_idxs",
        help="List available speaker ids for the defined multi-speaker model.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )
    parser.add_argument(
        "--list_language_idxs",
        help="List available language ids for the defined multi-lingual model.",
        type=str2bool,
        nargs="?",
        const=True,
        default=False,
    )
    # aux args
    # Same ``type=bool`` pitfall as ``--use_cuda``: parse with ``str2bool``.
    parser.add_argument(
        "--save_spectogram",
        type=str2bool,
        nargs="?",
        const=True,
        help="If true save raw spectogram for further (vocoder) processing in out_path.",
        default=False,
    )
    parser.add_argument(
        "--reference_wav",
        type=str,
        help="Reference wav file to convert in the voice of the speaker_idx or speaker_wav",
        default=None,
    )
    parser.add_argument(
        "--reference_speaker_idx",
        type=str,
        help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
        default=None,
    )
    parser.add_argument(
        "--progress_bar",
        type=str2bool,
        help="If true shows a progress bar for the model download. Defaults to True",
        default=True,
    )

    # voice conversion args
    parser.add_argument(
        "--source_wav",
        type=str,
        default=None,
        help="Original audio file to convert in the voice of the target_wav",
    )
    parser.add_argument(
        "--target_wav",
        type=str,
        default=None,
        help="Target audio file to convert in the voice of the source_wav",
    )

    parser.add_argument(
        "--voice_dir",
        type=str,
        default=None,
        help="Voice dir for tortoise model",
    )

    args = parser.parse_args()

    # print the description if none of the arguments that trigger an action is set
    check_args = [
        args.text,
        args.list_models,
        args.list_speaker_idxs,
        args.list_language_idxs,
        args.reference_wav,
        args.model_info_by_idx,
        args.model_info_by_name,
        args.source_wav,
        args.target_wav,
    ]
    if not any(check_args):
        # parsing "-h" prints the help text and exits
        parser.parse_args(["-h"])

    pipe_out = sys.stdout if args.pipe_out else None

    # When piping the wav through stdout, silence every regular print so that
    # only the audio bytes reach the pipe.
    with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout):
        # Late-import to make things load faster
        from TTS.api import TTS
        from TTS.utils.manage import ModelManager
        from TTS.utils.synthesizer import Synthesizer

        # load model manager
        path = Path(__file__).parent / "../.models.json"
        manager = ModelManager(path, progress_bar=args.progress_bar)
        api = TTS()

        tts_path = None
        tts_config_path = None
        speakers_file_path = None
        language_ids_file_path = None
        vocoder_path = None
        vocoder_config_path = None
        encoder_path = None
        encoder_config_path = None
        vc_path = None
        vc_config_path = None
        model_dir = None

        # CASE1 #list : list pre-trained TTS models
        if args.list_models:
            manager.add_cs_api_models(api.list_models())
            manager.list_models()
            sys.exit()

        # CASE2 #info : model info for pre-trained TTS models
        if args.model_info_by_idx:
            model_query = args.model_info_by_idx
            manager.model_info_by_idx(model_query)
            sys.exit()

        if args.model_info_by_name:
            model_query_full_name = args.model_info_by_name
            manager.model_info_by_full_name(model_query_full_name)
            sys.exit()

        # CASE3: TTS with coqui studio models
        if "coqui_studio" in args.model_name:
            print(" > Using 🐸Coqui Studio model: ", args.model_name)
            api = TTS(model_name=args.model_name, cs_api_model=args.cs_model)
            api.tts_to_file(
                text=args.text,
                emotion=args.emotion,
                file_path=args.out_path,
                language=args.language,
                speed=args.speed,
                pipe_out=pipe_out,
            )
            print(" > Saving output to ", args.out_path)
            return

        # CASE4: load pre-trained model paths
        if args.model_name is not None and not args.model_path:
            model_path, config_path, model_item = manager.download_model(args.model_name)
            # tts model
            if model_item["model_type"] == "tts_models":
                tts_path = model_path
                tts_config_path = config_path
                if "default_vocoder" in model_item:
                    # an explicitly requested vocoder wins over the model default
                    args.vocoder_name = (
                        model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
                    )

            # voice conversion model
            if model_item["model_type"] == "voice_conversion_models":
                vc_path = model_path
                vc_config_path = config_path

            # tts model with multiple files to be loaded from the directory path
            if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list):
                model_dir = model_path
                tts_path = None
                tts_config_path = None
                args.vocoder_name = None

        # load vocoder
        if args.vocoder_name is not None and not args.vocoder_path:
            vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)

        # CASE5: set custom model paths
        if args.model_path is not None:
            tts_path = args.model_path
            tts_config_path = args.config_path
            speakers_file_path = args.speakers_file_path
            language_ids_file_path = args.language_ids_file_path

        if args.vocoder_path is not None:
            vocoder_path = args.vocoder_path
            vocoder_config_path = args.vocoder_config_path

        if args.encoder_path is not None:
            encoder_path = args.encoder_path
            encoder_config_path = args.encoder_config_path

        # ``--use_cuda`` overrides whatever ``--device`` says
        device = args.device
        if args.use_cuda:
            device = "cuda"

        # load models
        synthesizer = Synthesizer(
            tts_path,
            tts_config_path,
            speakers_file_path,
            language_ids_file_path,
            vocoder_path,
            vocoder_config_path,
            encoder_path,
            encoder_config_path,
            vc_path,
            vc_config_path,
            model_dir,
            args.voice_dir,
        ).to(device)

        # query speaker ids of a multi-speaker model.
        if args.list_speaker_idxs:
            print(
                " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
            )
            print(synthesizer.tts_model.speaker_manager.name_to_id)
            return

        # query language ids of a multi-lingual model.
        if args.list_language_idxs:
            print(
                " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
            )
            print(synthesizer.tts_model.language_manager.name_to_id)
            return

        # check the arguments against a multi-speaker model.
        if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
            print(
                " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
                "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
            )
            return

        # RUN THE SYNTHESIS
        if args.text:
            print(" > Text: {}".format(args.text))

        # kick it
        if tts_path is not None:
            wav = synthesizer.tts(
                args.text,
                speaker_name=args.speaker_idx,
                language_name=args.language_idx,
                speaker_wav=args.speaker_wav,
                reference_wav=args.reference_wav,
                style_wav=args.capacitron_style_wav,
                style_text=args.capacitron_style_text,
                reference_speaker_name=args.reference_speaker_idx,
            )
        elif vc_path is not None:
            wav = synthesizer.voice_conversion(
                source_wav=args.source_wav,
                target_wav=args.target_wav,
            )
        elif model_dir is not None:
            wav = synthesizer.tts(
                args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav
            )
        else:
            # No TTS / voice conversion model was selected (e.g. the model name
            # refers to a vocoder only). Without this branch ``wav`` would be
            # unbound below. Report on stderr because stdout may be redirected.
            print(
                " [!] No TTS or voice conversion model is loaded, nothing to synthesize.",
                file=sys.stderr,
            )
            sys.exit(1)

        # save the results
        print(" > Saving output to {}".format(args.out_path))
        synthesizer.save_wav(wav, args.out_path, pipe_out=pipe_out)
538
+
539
+
540
+ if __name__ == "__main__":
541
+ main()
TTS/bin/TTS_bin_train_encoder (1).py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import os
5
+ import sys
6
+ import time
7
+ import traceback
8
+
9
+ import torch
10
+ from torch.utils.data import DataLoader
11
+ from trainer.torch import NoamLR
12
+ from trainer.trainer_utils import get_optimizer
13
+
14
+ from TTS.encoder.dataset import EncoderDataset
15
+ from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_encoder_model
16
+ from TTS.encoder.utils.training import init_training
17
+ from TTS.encoder.utils.visual import plot_embeddings
18
+ from TTS.tts.datasets import load_tts_samples
19
+ from TTS.utils.audio import AudioProcessor
20
+ from TTS.utils.generic_utils import count_parameters, remove_experiment_folder
21
+ from TTS.utils.io import copy_model_files
22
+ from TTS.utils.samplers import PerfectBatchSampler
23
+ from TTS.utils.training import check_update
24
+
25
+ torch.backends.cudnn.enabled = True
26
+ torch.backends.cudnn.benchmark = True
27
+ torch.manual_seed(54321)
28
+ use_cuda = torch.cuda.is_available()
29
+ num_gpus = torch.cuda.device_count()
30
+ print(" > Using CUDA: ", use_cuda)
31
+ print(" > Number of GPUs: ", num_gpus)
32
+
33
+
34
def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False):
    """Build the encoder ``DataLoader`` for the train or the eval split.

    Reads the module-level config ``c`` and the module-level sample lists
    ``meta_data_train`` / ``meta_data_eval``; for the eval split it also
    reads the module-level ``train_classes``.

    Args:
        ap: Audio processor handed to the dataset.
        is_val: Select the eval settings and eval samples when True.
        verbose: Forwarded to the dataset.

    Returns:
        Tuple ``(loader, classes, class id -> class name map)``.

    Raises:
        RuntimeError: If the split has fewer classes than classes per batch.
    """
    if is_val:
        num_utter_per_class = c.eval_num_utter_per_class
        num_classes_in_batch = c.eval_num_classes_in_batch
    else:
        num_utter_per_class = c.num_utter_per_class
        num_classes_in_batch = c.num_classes_in_batch

    encoder_dataset = EncoderDataset(
        c,
        ap,
        meta_data_eval if is_val else meta_data_train,
        voice_len=c.voice_len,
        num_utter_per_class=num_utter_per_class,
        num_classes_in_batch=num_classes_in_batch,
        verbose=verbose,
        # augmentation is applied to the training split only
        augmentation_config=None if is_val else c.audio_augmentation,
        use_torch_spec=c.model_params.get("use_torch_spec", False),
    )
    # classes present in this split
    classes = encoder_dataset.get_class_list()

    # total batch size = classes per batch * utterances per class
    total_batch_size = num_classes_in_batch * num_utter_per_class
    batch_sampler = PerfectBatchSampler(
        encoder_dataset.items,
        classes,
        batch_size=total_batch_size,
        num_classes_in_batch=num_classes_in_batch,
        num_gpus=1,
        shuffle=not is_val,
        drop_last=True,
    )

    if len(classes) < num_classes_in_batch:
        if is_val:
            message = f"config.eval_num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Eval dataset) !"
        else:
            message = f"config.num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Train dataset) !"
        raise RuntimeError(message)

    # reuse the training classes so class ids stay consistent when the train
    # and eval splits do not contain the same number of classes
    if is_val:
        encoder_dataset.set_classes(train_classes)

    data_loader = DataLoader(
        encoder_dataset,
        num_workers=c.num_loader_workers,
        batch_sampler=batch_sampler,
        collate_fn=encoder_dataset.collate_fn,
    )
    return data_loader, classes, encoder_dataset.get_map_classid_to_classname()
83
+
84
+
85
def evaluation(model, criterion, data_loader, global_step):
    """Run one pass over the evaluation loader and log the average loss.

    Uses the module-level config ``c``, ``use_cuda`` and ``dashboard_logger``.

    Args:
        model: Encoder model called on every input batch.
        criterion: Loss callable taking the regrouped embeddings and labels.
        data_loader: Sized iterable of ``(inputs, labels)`` batches.
        global_step: Step index passed to the dashboard logger.

    Returns:
        The summed batch loss divided by ``len(data_loader)``.

    Raises:
        RuntimeError: If ``data_loader`` yields no batch at all.
    """
    eval_loss = 0
    outputs = None
    # no gradients are needed anywhere in the evaluation pass
    with torch.no_grad():
        for inputs, labels in data_loader:
            # agroup samples of each class in the batch. perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1]
            labels = torch.transpose(
                labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1
            ).reshape(labels.shape)
            inputs = torch.transpose(
                inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1
            ).reshape(inputs.shape)

            # dispatch data to GPU
            if use_cuda:
                inputs = inputs.cuda(non_blocking=True)
                labels = labels.cuda(non_blocking=True)

            # forward pass model
            outputs = model(inputs)

            # loss computation
            loss = criterion(
                outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels
            )

            eval_loss += loss.item()

    if outputs is None:
        # an empty loader would otherwise end in a division by zero and an
        # unbound ``outputs`` in the plot below
        raise RuntimeError(" [!] The evaluation data loader produced no batches.")

    eval_avg_loss = eval_loss / len(data_loader)
    # save stats
    dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss})
    # plot the last batch in the evaluation; the batch was grouped with the
    # eval class count above, so the plot must be given the same count
    figures = {
        "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.eval_num_classes_in_batch),
    }
    dashboard_logger.eval_figures(global_step, figures)
    return eval_avg_loss
124
+
125
+
126
def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step):
    """Train ``model`` for ``c.epochs`` epochs.

    Uses the module-level config ``c``, ``use_cuda``, ``dashboard_logger`` and
    ``OUT_PATH``. After every epoch, when ``c.run_eval`` is set, the model is
    evaluated on ``eval_data_loader`` and the best checkpoint is updated.

    Args:
        model: Encoder model to optimize.
        optimizer: Optimizer stepping the model parameters.
        scheduler: Learning rate scheduler; stepped only when ``c.lr_decay``.
        criterion: Loss callable taking the regrouped embeddings and labels.
        data_loader: Iterable of ``(inputs, labels)`` training batches.
        eval_data_loader: Loader handed to ``evaluation``.
        global_step: Step counter to continue from.

    Returns:
        Tuple ``(best_loss, global_step)``.
    """
    model.train()
    best_loss = float("inf")
    avg_loader_time = 0
    end_time = time.time()
    for epoch in range(c.epochs):
        tot_loss = 0
        epoch_time = 0
        for inputs, labels in data_loader:
            start_time = time.time()

            # agroup samples of each class in the batch. perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1]
            grouped_labels = labels.view(c.num_utter_per_class, c.num_classes_in_batch)
            labels = torch.transpose(grouped_labels, 0, 1).reshape(labels.shape)
            grouped_inputs = inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1)
            inputs = torch.transpose(grouped_inputs, 0, 1).reshape(inputs.shape)
            # TODO: cover the regrouping above with a unit test

            # time spent waiting for this batch since the previous step ended
            loader_time = time.time() - end_time
            global_step += 1

            # setup lr
            if c.lr_decay:
                scheduler.step()
            optimizer.zero_grad()

            # dispatch data to GPU
            if use_cuda:
                inputs = inputs.cuda(non_blocking=True)
                labels = labels.cuda(non_blocking=True)

            # forward pass model
            outputs = model(inputs)

            # loss computation
            embeddings = outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1)
            loss = criterion(embeddings, labels)
            loss.backward()
            grad_norm, _ = check_update(model, c.grad_clip)
            optimizer.step()

            step_time = time.time() - start_time
            epoch_time += step_time

            # accumulate the total epoch loss
            loss_value = loss.item()
            tot_loss += loss_value

            # Averaged Loader Time
            if c.num_loader_workers > 0:
                num_loader_workers = c.num_loader_workers
            else:
                num_loader_workers = 1
            if avg_loader_time != 0:
                avg_loader_time = (
                    1 / num_loader_workers * loader_time
                    + (num_loader_workers - 1) / num_loader_workers * avg_loader_time
                )
            else:
                avg_loader_time = loader_time
            current_lr = optimizer.param_groups[0]["lr"]

            if global_step % c.steps_plot_stats == 0:
                # Plot Training Epoch Stats
                dashboard_logger.train_epoch_stats(
                    global_step,
                    {
                        "loss": loss_value,
                        "lr": current_lr,
                        "grad_norm": grad_norm,
                        "step_time": step_time,
                        "avg_loader_time": avg_loader_time,
                    },
                )
                umap_figure = plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch)
                dashboard_logger.train_figures(global_step, {"UMAP Plot": umap_figure})

            if global_step % c.print_step == 0:
                print(
                    " | > Step:{} Loss:{:.5f} GradNorm:{:.5f} "
                    "StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format(
                        global_step, loss_value, grad_norm, step_time, loader_time, avg_loader_time, current_lr
                    ),
                    flush=True,
                )

            if global_step % c.save_step == 0:
                # save model
                save_checkpoint(model, optimizer, criterion, loss_value, OUT_PATH, global_step, epoch)

            end_time = time.time()

        print("")
        print(
            ">>> Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} "
            "EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format(
                epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time
            ),
            flush=True,
        )

        if not c.run_eval:
            continue

        # evaluation
        model.eval()
        eval_loss = evaluation(model, criterion, eval_data_loader, global_step)
        print("\n\n")
        print("--> EVAL PERFORMANCE")
        print(
            " | > Epoch:{} AvgLoss: {:.5f} ".format(epoch, eval_loss),
            flush=True,
        )
        # save the best checkpoint
        best_loss = save_best_model(model, optimizer, criterion, eval_loss, best_loss, OUT_PATH, global_step, epoch)
        model.train()

    return best_loss, global_step
252
+
253
+
254
def main(args):  # pylint: disable=redefined-outer-name
    """Set up data, model, loss and optimizer, then run encoder training.

    Publishes ``meta_data_train``, ``meta_data_eval`` and ``train_classes`` as
    module-level globals because ``setup_loader`` reads them. Uses the
    module-level config ``c``, ``use_cuda`` and ``OUT_PATH``.

    Args:
        args: Parsed arguments; ``restore_path`` is read and ``restore_step``
            is set here.
    """
    # pylint: disable=global-variable-undefined
    global meta_data_train, meta_data_eval, train_classes

    audio_processor = AudioProcessor(**c.audio)
    model = setup_encoder_model(c)

    optimizer = get_optimizer(c.optimizer, c.optimizer_params, c.lr, model)

    # pylint: disable=redefined-outer-name
    meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True)

    train_data_loader, train_classes, map_classid_to_classname = setup_loader(
        audio_processor, is_val=False, verbose=True
    )
    eval_data_loader = None
    if c.run_eval:
        eval_data_loader, _, _ = setup_loader(audio_processor, is_val=True, verbose=True)

    criterion = model.get_criterion(c, len(train_classes))

    if c.loss == "softmaxproto" and c.model != "speaker_encoder":
        # store the class mapping in the config, then copy the config to the output folder
        c.map_classid_to_classname = map_classid_to_classname
        copy_model_files(c, OUT_PATH)

    if not args.restore_path:
        args.restore_step = 0
    else:
        criterion, args.restore_step = model.load_checkpoint(
            c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion
        )
        print(" > Model restored from step %d" % args.restore_step, flush=True)

    scheduler = None
    if c.lr_decay:
        scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)

    print("\n > Model has {} parameters".format(count_parameters(model)), flush=True)

    if use_cuda:
        model = model.cuda()
        criterion.cuda()

    # continue counting steps from the restored checkpoint (0 for a fresh run)
    train(model, optimizer, scheduler, criterion, train_data_loader, eval_data_loader, args.restore_step)
303
+
304
+
305
+ if __name__ == "__main__":
306
+ args, c, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = init_training()
307
+
308
+ try:
309
+ main(args)
310
+ except KeyboardInterrupt:
311
+ remove_experiment_folder(OUT_PATH)
312
+ try:
313
+ sys.exit(0)
314
+ except SystemExit:
315
+ os._exit(0) # pylint: disable=protected-access
316
+ except Exception: # pylint: disable=broad-except
317
+ remove_experiment_folder(OUT_PATH)
318
+ traceback.print_exc()
319
+ sys.exit(1)
TTS/bin/TTS_bin_train_encoder.py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import os
5
+ import sys
6
+ import time
7
+ import traceback
8
+
9
+ import torch
10
+ from torch.utils.data import DataLoader
11
+ from trainer.torch import NoamLR
12
+ from trainer.trainer_utils import get_optimizer
13
+
14
+ from TTS.encoder.dataset import EncoderDataset
15
+ from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_encoder_model
16
+ from TTS.encoder.utils.training import init_training
17
+ from TTS.encoder.utils.visual import plot_embeddings
18
+ from TTS.tts.datasets import load_tts_samples
19
+ from TTS.utils.audio import AudioProcessor
20
+ from TTS.utils.generic_utils import count_parameters, remove_experiment_folder
21
+ from TTS.utils.io import copy_model_files
22
+ from TTS.utils.samplers import PerfectBatchSampler
23
+ from TTS.utils.training import check_update
24
+
25
+ torch.backends.cudnn.enabled = True
26
+ torch.backends.cudnn.benchmark = True
27
+ torch.manual_seed(54321)
28
+ use_cuda = torch.cuda.is_available()
29
+ num_gpus = torch.cuda.device_count()
30
+ print(" > Using CUDA: ", use_cuda)
31
+ print(" > Number of GPUs: ", num_gpus)
32
+
33
+
34
def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False):
    """Build the encoder ``DataLoader`` for the train or the eval split.

    Reads the module-level config ``c`` and the module-level sample lists
    ``meta_data_train`` / ``meta_data_eval``; for the eval split it also
    reads the module-level ``train_classes``.

    Args:
        ap: Audio processor handed to the dataset.
        is_val: Select the eval settings and eval samples when True.
        verbose: Forwarded to the dataset.

    Returns:
        Tuple ``(loader, classes, class id -> class name map)``.

    Raises:
        RuntimeError: If the split has fewer classes than classes per batch.
    """
    if is_val:
        num_utter_per_class = c.eval_num_utter_per_class
        num_classes_in_batch = c.eval_num_classes_in_batch
    else:
        num_utter_per_class = c.num_utter_per_class
        num_classes_in_batch = c.num_classes_in_batch

    encoder_dataset = EncoderDataset(
        c,
        ap,
        meta_data_eval if is_val else meta_data_train,
        voice_len=c.voice_len,
        num_utter_per_class=num_utter_per_class,
        num_classes_in_batch=num_classes_in_batch,
        verbose=verbose,
        # augmentation is applied to the training split only
        augmentation_config=None if is_val else c.audio_augmentation,
        use_torch_spec=c.model_params.get("use_torch_spec", False),
    )
    # classes present in this split
    classes = encoder_dataset.get_class_list()

    # total batch size = classes per batch * utterances per class
    total_batch_size = num_classes_in_batch * num_utter_per_class
    batch_sampler = PerfectBatchSampler(
        encoder_dataset.items,
        classes,
        batch_size=total_batch_size,
        num_classes_in_batch=num_classes_in_batch,
        num_gpus=1,
        shuffle=not is_val,
        drop_last=True,
    )

    if len(classes) < num_classes_in_batch:
        if is_val:
            message = f"config.eval_num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Eval dataset) !"
        else:
            message = f"config.num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Train dataset) !"
        raise RuntimeError(message)

    # reuse the training classes so class ids stay consistent when the train
    # and eval splits do not contain the same number of classes
    if is_val:
        encoder_dataset.set_classes(train_classes)

    data_loader = DataLoader(
        encoder_dataset,
        num_workers=c.num_loader_workers,
        batch_sampler=batch_sampler,
        collate_fn=encoder_dataset.collate_fn,
    )
    return data_loader, classes, encoder_dataset.get_map_classid_to_classname()
83
+
84
+
85
def evaluation(model, criterion, data_loader, global_step):
    """Run one pass over the evaluation loader and log the average loss.

    Uses the module-level config ``c``, ``use_cuda`` and ``dashboard_logger``.

    Args:
        model: Encoder model called on every input batch.
        criterion: Loss callable taking the regrouped embeddings and labels.
        data_loader: Sized iterable of ``(inputs, labels)`` batches.
        global_step: Step index passed to the dashboard logger.

    Returns:
        The summed batch loss divided by ``len(data_loader)``.

    Raises:
        RuntimeError: If ``data_loader`` yields no batch at all.
    """
    eval_loss = 0
    outputs = None
    # no gradients are needed anywhere in the evaluation pass
    with torch.no_grad():
        for inputs, labels in data_loader:
            # agroup samples of each class in the batch. perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1]
            labels = torch.transpose(
                labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1
            ).reshape(labels.shape)
            inputs = torch.transpose(
                inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1
            ).reshape(inputs.shape)

            # dispatch data to GPU
            if use_cuda:
                inputs = inputs.cuda(non_blocking=True)
                labels = labels.cuda(non_blocking=True)

            # forward pass model
            outputs = model(inputs)

            # loss computation
            loss = criterion(
                outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels
            )

            eval_loss += loss.item()

    if outputs is None:
        # an empty loader would otherwise end in a division by zero and an
        # unbound ``outputs`` in the plot below
        raise RuntimeError(" [!] The evaluation data loader produced no batches.")

    eval_avg_loss = eval_loss / len(data_loader)
    # save stats
    dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss})
    # plot the last batch in the evaluation; the batch was grouped with the
    # eval class count above, so the plot must be given the same count
    figures = {
        "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.eval_num_classes_in_batch),
    }
    dashboard_logger.eval_figures(global_step, figures)
    return eval_avg_loss
124
+
125
+
126
def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step):
    """Train ``model`` for ``c.epochs`` epochs, evaluating after each one if enabled.

    Args:
        model: Encoder model to optimise.
        optimizer: Optimizer stepped once per batch.
        scheduler: LR scheduler; only stepped when ``c.lr_decay`` is set.
        criterion: Loss callable taking the reshaped embeddings and the labels.
        data_loader: Training loader yielding ``(inputs, labels)`` batches.
        eval_data_loader: Loader handed to ``evaluation`` when ``c.run_eval`` is set.
        global_step: Step counter to continue from.

    Returns:
        Tuple ``(best_loss, global_step)``.
    """
    model.train()
    best_loss = float("inf")
    avg_loader_time = 0
    end_time = time.time()
    for epoch in range(c.epochs):
        tot_loss = 0
        epoch_time = 0
        for batch in data_loader:
            start_time = time.time()

            # setup input data
            inputs, labels = batch
            # Group the samples of each class together. The perfect sampler
            # produces [3,2,1,3,2,1] while the loss needs [3,3,2,2,1,1].
            # ToDo: cover this regrouping with a unit test.
            labels = labels.view(c.num_utter_per_class, c.num_classes_in_batch).transpose(0, 1).reshape(labels.shape)
            inputs = (
                inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1).transpose(0, 1).reshape(inputs.shape)
            )

            loader_time = time.time() - end_time
            global_step += 1

            # setup lr
            if c.lr_decay:
                scheduler.step()
            optimizer.zero_grad()

            # dispatch data to GPU
            if use_cuda:
                inputs = inputs.cuda(non_blocking=True)
                labels = labels.cuda(non_blocking=True)

            # forward pass model
            outputs = model(inputs)

            # loss computation
            grouped_outputs = outputs.view(
                c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1
            )
            loss = criterion(grouped_outputs, labels)
            loss.backward()
            grad_norm, _ = check_update(model, c.grad_clip)
            optimizer.step()

            step_time = time.time() - start_time
            epoch_time += step_time

            # accumulate the total epoch loss
            step_loss = loss.item()
            tot_loss += step_loss

            # Averaged Loader Time
            num_loader_workers = c.num_loader_workers if c.num_loader_workers > 0 else 1
            if avg_loader_time != 0:
                avg_loader_time = (
                    1 / num_loader_workers * loader_time
                    + (num_loader_workers - 1) / num_loader_workers * avg_loader_time
                )
            else:
                avg_loader_time = loader_time
            current_lr = optimizer.param_groups[0]["lr"]

            if global_step % c.steps_plot_stats == 0:
                # Plot Training Epoch Stats
                dashboard_logger.train_epoch_stats(
                    global_step,
                    {
                        "loss": step_loss,
                        "lr": current_lr,
                        "grad_norm": grad_norm,
                        "step_time": step_time,
                        "avg_loader_time": avg_loader_time,
                    },
                )
                figures = {
                    "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
                }
                dashboard_logger.train_figures(global_step, figures)

            if global_step % c.print_step == 0:
                print(
                    " | > Step:{} Loss:{:.5f} GradNorm:{:.5f} "
                    "StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format(
                        global_step, step_loss, grad_norm, step_time, loader_time, avg_loader_time, current_lr
                    ),
                    flush=True,
                )

            if global_step % c.save_step == 0:
                # save model
                save_checkpoint(model, optimizer, criterion, step_loss, OUT_PATH, global_step, epoch)

            end_time = time.time()

        print("")
        print(
            ">>> Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} "
            "EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format(
                epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time
            ),
            flush=True,
        )
        # evaluation
        if c.run_eval:
            model.eval()
            eval_loss = evaluation(model, criterion, eval_data_loader, global_step)
            print("\n\n")
            print("--> EVAL PERFORMANCE")
            print(
                " | > Epoch:{} AvgLoss: {:.5f} ".format(epoch, eval_loss),
                flush=True,
            )
            # save the best checkpoint
            best_loss = save_best_model(
                model, optimizer, criterion, eval_loss, best_loss, OUT_PATH, global_step, epoch
            )
            model.train()

    return best_loss, global_step
252
+
253
+
254
def main(args):  # pylint: disable=redefined-outer-name
    """Build the encoder model, data loaders and optimizer, then start training.

    Args:
        args: Parsed command-line arguments; ``restore_path`` is read and
            ``restore_step`` is written.
    """
    # pylint: disable=global-variable-undefined
    global meta_data_train
    global meta_data_eval
    global train_classes

    ap = AudioProcessor(**c.audio)
    model = setup_encoder_model(c)

    optimizer = get_optimizer(c.optimizer, c.optimizer_params, c.lr, model)

    # pylint: disable=redefined-outer-name
    meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True)

    train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False, verbose=True)
    eval_data_loader = None
    if c.run_eval:
        eval_data_loader, _, _ = setup_loader(ap, is_val=True, verbose=True)

    criterion = model.get_criterion(c, len(train_classes))

    if c.loss == "softmaxproto" and c.model != "speaker_encoder":
        # keep the class-id -> class-name mapping in the config that gets copied
        c.map_classid_to_classname = map_classid_to_classname
        copy_model_files(c, OUT_PATH)

    if not args.restore_path:
        args.restore_step = 0
    else:
        criterion, args.restore_step = model.load_checkpoint(
            c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion
        )
        print(" > Model restored from step %d" % args.restore_step, flush=True)

    scheduler = None
    if c.lr_decay:
        scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)

    print("\n > Model has {} parameters".format(count_parameters(model)), flush=True)

    if use_cuda:
        model = model.cuda()
        criterion.cuda()

    # training resumes from the restored step (0 for a fresh run)
    train(model, optimizer, scheduler, criterion, train_data_loader, eval_data_loader, args.restore_step)
303
+
304
+
305
if __name__ == "__main__":
    # These names are module globals read by the functions defined in this script.
    args, c, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = init_training()

    try:
        main(args)
    except KeyboardInterrupt:
        # Interrupted by the user: drop the experiment folder and exit with status 0.
        remove_experiment_folder(OUT_PATH)
        # The original raised SystemExit via sys.exit(0) and caught it at once
        # to call os._exit(0); calling os._exit(0) directly has the same effect.
        os._exit(0)  # pylint: disable=protected-access
    except Exception:  # pylint: disable=broad-except
        # Any other failure: clean up, show the traceback and exit with status 1.
        remove_experiment_folder(OUT_PATH)
        traceback.print_exc()
        sys.exit(1)
TTS/bin/TTS_bin_train_tts (1).py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dataclasses import dataclass, field
3
+
4
+ from trainer import Trainer, TrainerArgs
5
+
6
+ from TTS.config import load_config, register_config
7
+ from TTS.tts.datasets import load_tts_samples
8
+ from TTS.tts.models import setup_model
9
+
10
+
11
@dataclass
class TrainTTSArgs(TrainerArgs):
    """Trainer arguments extended with the path of the model config file."""

    # ``None`` means no config file was given on the command line.
    config_path: str = field(default=None, metadata={"help": "Path to the config file."})
14
+
15
+
16
def main():
    """Run `tts` model training directly by a `config.json` file.

    The config is taken from, in order of precedence: ``--config_path``, the
    ``config.json`` inside ``--continue_path``, or the console arguments alone.
    Unknown command-line arguments are applied as config overrides.
    """
    # init trainer args
    train_args = TrainTTSArgs()
    parser = train_args.init_argparse(arg_prefix="")

    # override trainer args from command-line args
    args, config_overrides = parser.parse_known_args()
    train_args.parse_args(args)

    # load config.json and register
    if args.config_path or args.continue_path:
        # init from a file, or continue from a previous experiment
        config_file = args.config_path if args.config_path else os.path.join(args.continue_path, "config.json")
        config = load_config(config_file)
        if len(config_overrides) > 0:
            config.parse_known_args(config_overrides, relaxed_parser=True)
    else:
        # Init from console args. This branch must hang off the outer check:
        # nested under it, it can never run and `config` stays unbound.
        from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel

        config_base = BaseTrainingConfig()
        config_base.parse_known_args(config_overrides)
        config = register_config(config_base.model)()

    # load training samples
    train_samples, eval_samples = load_tts_samples(
        config.datasets,
        eval_split=True,
        eval_split_max_size=config.eval_split_max_size,
        eval_split_size=config.eval_split_size,
    )

    # init the model from config
    model = setup_model(config, train_samples + eval_samples)

    # init the trainer and 🚀
    trainer = Trainer(
        train_args,
        model.config,
        config.output_path,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
        parse_command_line_args=False,
    )
    trainer.fit()


if __name__ == "__main__":
    main()
TTS/bin/TTS_bin_train_tts.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dataclasses import dataclass, field
3
+
4
+ from trainer import Trainer, TrainerArgs
5
+
6
+ from TTS.config import load_config, register_config
7
+ from TTS.tts.datasets import load_tts_samples
8
+ from TTS.tts.models import setup_model
9
+
10
+
11
@dataclass
class TrainTTSArgs(TrainerArgs):
    """Trainer arguments extended with the path of the model config file."""

    # ``None`` means no config file was given on the command line.
    config_path: str = field(default=None, metadata={"help": "Path to the config file."})
14
+
15
+
16
def main():
    """Run `tts` model training directly by a `config.json` file.

    The config is taken from, in order of precedence: ``--config_path``, the
    ``config.json`` inside ``--continue_path``, or the console arguments alone.
    Unknown command-line arguments are applied as config overrides.
    """
    # init trainer args
    train_args = TrainTTSArgs()
    parser = train_args.init_argparse(arg_prefix="")

    # override trainer args from command-line args
    args, config_overrides = parser.parse_known_args()
    train_args.parse_args(args)

    # load config.json and register
    if args.config_path or args.continue_path:
        # init from a file, or continue from a previous experiment
        config_file = args.config_path if args.config_path else os.path.join(args.continue_path, "config.json")
        config = load_config(config_file)
        if len(config_overrides) > 0:
            config.parse_known_args(config_overrides, relaxed_parser=True)
    else:
        # Init from console args. This branch must hang off the outer check:
        # nested under it, it can never run and `config` stays unbound.
        from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel

        config_base = BaseTrainingConfig()
        config_base.parse_known_args(config_overrides)
        config = register_config(config_base.model)()

    # load training samples
    train_samples, eval_samples = load_tts_samples(
        config.datasets,
        eval_split=True,
        eval_split_max_size=config.eval_split_max_size,
        eval_split_size=config.eval_split_size,
    )

    # init the model from config
    model = setup_model(config, train_samples + eval_samples)

    # init the trainer and 🚀
    trainer = Trainer(
        train_args,
        model.config,
        config.output_path,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
        parse_command_line_args=False,
    )
    trainer.fit()


if __name__ == "__main__":
    main()
TTS/bin/TTS_bin_train_vocoder.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dataclasses import dataclass, field
3
+
4
+ from trainer import Trainer, TrainerArgs
5
+
6
+ from TTS.config import load_config, register_config
7
+ from TTS.utils.audio import AudioProcessor
8
+ from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
9
+ from TTS.vocoder.models import setup_model
10
+
11
+
12
@dataclass
class TrainVocoderArgs(TrainerArgs):
    """Trainer arguments extended with the path of the vocoder config file."""

    # ``None`` means no config file was given on the command line.
    config_path: str = field(default=None, metadata={"help": "Path to the config file."})
15
+
16
+
17
def main():
    """Run `vocoder` model training directly by a `config.json` file.

    The config is taken from, in order of precedence: ``--config_path``, the
    ``config.json`` inside ``--continue_path``, or the console arguments alone.
    Unknown command-line arguments are applied as config overrides.
    """
    # init trainer args
    train_args = TrainVocoderArgs()
    parser = train_args.init_argparse(arg_prefix="")

    # override trainer args from command-line args
    args, config_overrides = parser.parse_known_args()
    train_args.parse_args(args)

    # load config.json and register
    if args.config_path or args.continue_path:
        # init from a file, or continue from a previous experiment
        config_file = args.config_path if args.config_path else os.path.join(args.continue_path, "config.json")
        config = load_config(config_file)
        if len(config_overrides) > 0:
            config.parse_known_args(config_overrides, relaxed_parser=True)
    else:
        # Init from console args. This branch must hang off the outer check:
        # nested under it, it can never run and `config` stays unbound.
        from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel

        config_base = BaseTrainingConfig()
        config_base.parse_known_args(config_overrides)
        config = register_config(config_base.model)()

    # load training samples
    if "feature_path" in config and config.feature_path:
        # load pre-computed features
        print(f" > Loading features from: {config.feature_path}")
        eval_samples, train_samples = load_wav_feat_data(config.data_path, config.feature_path, config.eval_split_size)
    else:
        # load data raw wav files
        eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)

    # setup audio processor
    ap = AudioProcessor(**config.audio)

    # init the model from config
    model = setup_model(config)

    # init the trainer and 🚀
    trainer = Trainer(
        train_args,
        config,
        config.output_path,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
        training_assets={"audio_processor": ap},
        parse_command_line_args=False,
    )
    trainer.fit()


if __name__ == "__main__":
    main()
TTS/bin/TTS_bin_tune_wavegrad.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Search a good noise schedule for WaveGrad for a given number of inference iterations"""
2
+ import argparse
3
+ from itertools import product as cartesian_product
4
+
5
+ import numpy as np
6
+ import torch
7
+ from torch.utils.data import DataLoader
8
+ from tqdm import tqdm
9
+
10
+ from TTS.config import load_config
11
+ from TTS.utils.audio import AudioProcessor
12
+ from TTS.vocoder.datasets.preprocess import load_wav_data
13
+ from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset
14
+ from TTS.vocoder.models import setup_model
15
+
16
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # NOTE(review): `--model_path` is parsed but never read anywhere in this script — confirm
    # whether the checkpoint is meant to be loaded into the model before the search.
    parser.add_argument("--model_path", type=str, help="Path to model checkpoint.")
    parser.add_argument("--config_path", type=str, help="Path to model config file.")
    parser.add_argument("--data_path", type=str, help="Path to data directory.")
    parser.add_argument("--output_path", type=str, help="path for output file including file name and extension.")
    parser.add_argument(
        "--num_iter",
        type=int,
        help="Number of model inference iterations that you like to optimize noise schedule for.",
    )
    parser.add_argument("--use_cuda", action="store_true", help="enable CUDA.")
    parser.add_argument("--num_samples", type=int, default=1, help="Number of datasamples used for inference.")
    parser.add_argument(
        "--search_depth",
        type=int,
        default=3,
        help="Search granularity. Increasing this increases the run-time exponentially.",
    )

    # load config
    args = parser.parse_args()
    config = load_config(args.config_path)

    # setup audio processor
    ap = AudioProcessor(**config.audio)

    # load dataset, keeping only the first `num_samples` items
    _, train_data = load_wav_data(args.data_path, 0)
    train_data = train_data[: args.num_samples]
    dataset = WaveGradDataset(
        ap=ap,
        items=train_data,
        seq_len=-1,
        hop_len=ap.hop_length,
        pad_short=config.pad_short,
        conv_pad=config.conv_pad,
        is_training=True,
        return_segments=False,
        use_noise_augment=False,
        use_cache=False,
        verbose=True,
    )
    loader = DataLoader(
        dataset,
        batch_size=1,
        shuffle=False,
        collate_fn=dataset.collate_full_clips,
        drop_last=False,
        num_workers=config.num_loader_workers,
        pin_memory=False,
    )

    # setup the model
    model = setup_model(config)
    if args.use_cuda:
        model.cuda()

    # setup optimization parameters
    base_values = sorted(10 * np.random.uniform(size=args.search_depth))
    print(f" > base values: {base_values}")
    exponents = 10 ** np.linspace(-6, -1, num=args.num_iter)
    best_error = float("inf")
    best_schedule = None  # pylint: disable=C0103
    total_search_iter = len(base_values) ** args.num_iter
    # try every combination of base values, one per inference iteration
    for base in tqdm(cartesian_product(base_values, repeat=args.num_iter), total=total_search_iter):
        beta = exponents * base
        model.compute_noise_level(beta)
        for mel, audio in loader:
            y_hat = model.inference(mel.cuda() if args.use_cuda else mel)

            if args.use_cuda:
                y_hat = y_hat.cpu()
            y_hat = y_hat.numpy()

            # mel-spectrogram of each generated clip, with the last frame dropped
            mel_hat = torch.stack(
                [torch.from_numpy(ap.melspectrogram(y_hat[i, 0])[:, :-1]) for i in range(y_hat.shape[0])]
            )
            mse = torch.sum((mel - mel_hat) ** 2).mean()
            error = mse.item()
            if error < best_error:
                best_error = error
                best_schedule = {"beta": beta}
                print(f" > Found a better schedule. - MSE: {error}")
                np.save(args.output_path, best_schedule)
TTS/config/TTS_config___init__.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import re
4
+ from typing import Dict
5
+
6
+ import fsspec
7
+ import yaml
8
+ from coqpit import Coqpit
9
+
10
+ from TTS.config.shared_configs import *
11
+ from TTS.utils.generic_utils import find_module
12
+
13
+
14
def read_json_with_comments(json_path):
    """Load a JSON file that may contain ``//`` comments (kept for backward compatibility).

    Args:
        json_path: Path or URL of the file, opened through ``fsspec``.

    Returns:
        The object produced by ``json.loads`` on the cleaned text.
    """
    with fsspec.open(json_path, "r", encoding="utf-8") as handle:
        raw_text = handle.read()
    # join lines that end with a backslash continuation
    joined_text = re.sub(r"\\\n", "", raw_text)
    # Blank out everything from `//` to the end of the line. The pattern does
    # not look at quoting, so a `//` inside a string value is removed as well.
    cleaned_text = re.sub(r"//.*\n", "\n", joined_text)
    return json.loads(cleaned_text)
24
+
25
+
26
def register_config(model_name: str) -> Coqpit:
    """Look up the config class that belongs to a model name.

    Args:
        model_name (str): Model name.

    Raises:
        ModuleNotFoundError: No matching config for the model name.

    Returns:
        Coqpit: config class.
    """
    config_name = model_name + "_config"
    found_class = None

    # TODO: fix this
    if model_name == "xtts":
        from TTS.tts.configs.xtts_config import XttsConfig

        found_class = XttsConfig

    # Every package is probed; a hit in a later package replaces an earlier one.
    for package in ("TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs", "TTS.vc.configs"):
        try:
            found_class = find_module(package, config_name)
        except ModuleNotFoundError:
            continue

    if found_class is None:
        raise ModuleNotFoundError(f" [!] Config for {model_name} cannot be found.")
    return found_class
55
+
56
+
57
def _process_model_name(config_dict: Dict) -> str:
    """Return the model name with the legacy `vocoder` suffixes stripped.

    Args:
        config_dict (Dict): A dictionary including the config fields.

    Returns:
        str: Formatted modelname.
    """
    # older vocoder configs store the name under "generator_model"
    name_key = "model" if "model" in config_dict else "generator_model"
    model_name = config_dict[name_key]
    for legacy_suffix in ("_generator", "_discriminator"):
        model_name = model_name.replace(legacy_suffix, "")
    return model_name
69
+
70
+
71
def load_config(config_path: str) -> Coqpit:
    """Load a `json` or `yaml` file as a TTS config.

    The file is first read into a `dict`; its model name selects the matching
    Config class, which is then instantiated and filled from the `dict`.

    Args:
        config_path (str): path to the config file.

    Raises:
        TypeError: given config file has an unknown type.

    Returns:
        Coqpit: TTS config object.
    """
    ext = os.path.splitext(config_path)[1]
    if ext not in (".yml", ".yaml", ".json"):
        raise TypeError(f" [!] Unknown config file type {ext}")

    if ext == ".json":
        try:
            with fsspec.open(config_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except json.decoder.JSONDecodeError:
            # backwards compat.: retry with the comment-tolerant reader
            data = read_json_with_comments(config_path)
    else:
        with fsspec.open(config_path, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)

    config_dict = {}
    config_dict.update(data)
    config_class = register_config(_process_model_name(config_dict).lower())
    config = config_class()
    config.from_dict(config_dict)
    return config
104
+
105
+
106
def check_config_and_model_args(config, arg_name, value):
    """Tell whether `arg_name` equals `value`, looking in `config.model_args` first, then in `config`.

    Return False if the argument does not exist in `config.model_args` or `config`.
    This is to patch up the compatibility between models with and without `model_args`.

    TODO: Remove this in the future with a unified approach.
    """
    in_model_args = hasattr(config, "model_args") and arg_name in config.model_args
    if in_model_args:
        return config.model_args[arg_name] == value
    if not hasattr(config, arg_name):
        return False
    return config[arg_name] == value
121
+
122
+
123
def get_from_config_or_model_args(config, arg_name):
    """Return `arg_name` from `config.model_args` when present there, otherwise from `config`."""
    has_model_arg = hasattr(config, "model_args") and arg_name in config.model_args
    return config.model_args[arg_name] if has_model_arg else config[arg_name]
129
+
130
+
131
def get_from_config_or_model_args_with_default(config, arg_name, def_val):
    """Return `arg_name` from `config.model_args`, else from `config`, else `def_val`."""
    if hasattr(config, "model_args") and arg_name in config.model_args:
        return config.model_args[arg_name]
    # fall back to the config itself, then to the caller's default
    return config[arg_name] if hasattr(config, arg_name) else def_val
TTS/config/TTS_config_shared_configs.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import asdict, dataclass
2
+ from typing import List
3
+
4
+ from coqpit import Coqpit, check_argument
5
+ from trainer import TrainerConfig
6
+
7
+
8
@dataclass
class BaseAudioConfig(Coqpit):
    """Base config to define audio processing parameters. It is used to initialize
    ```TTS.utils.audio.AudioProcessor.```

    Args:
        fft_size (int):
            Number of STFT frequency levels, aka. the size of the linear spectrogram frame. Defaults to 1024.

        win_length (int):
            Each frame of audio is windowed by window of length ```win_length``` and then padded with zeros to match
            ```fft_size```. Defaults to 1024.

        hop_length (int):
            Number of audio samples between adjacent STFT columns. Defaults to 256.

        frame_shift_ms (int):
            Set ```hop_length``` based on milliseconds and sampling rate. Defaults to None.

        frame_length_ms (int):
            Set ```win_length``` based on milliseconds and sampling rate. Defaults to None.

        stft_pad_mode (str):
            Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'.

        sample_rate (int):
            Audio sampling rate. Defaults to 22050.

        resample (bool):
            Enable / Disable resampling audio to ```sample_rate```. Defaults to ```False```.

        preemphasis (float):
            Preemphasis coefficient. Defaults to 0.0.

        ref_level_db (int):
            Reference dB level to rebase the audio signal and ignore the level below. 20 dB is assumed the sound
            of air. Defaults to 20.

        do_sound_norm (bool):
            Enable / Disable sound normalization to reconcile the volume differences among samples. Defaults to False.

        log_func (str):
            Numpy log function used for amplitude to DB conversion. Defaults to 'np.log10'.

        do_trim_silence (bool):
            Enable / Disable trimming silences at the beginning and the end of the audio clip. Defaults to ```True```.

        do_amp_to_db_linear (bool, optional):
            enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True.

        do_amp_to_db_mel (bool, optional):
            enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.

        pitch_fmax (float, optional):
            Maximum frequency of the F0 frames. Defaults to ```640```.

        pitch_fmin (float, optional):
            Minimum frequency of the F0 frames. Defaults to ```1```.

        trim_db (int):
            Silence threshold used for silence trimming. Defaults to 45.

        do_rms_norm (bool, optional):
            enable/disable RMS volume normalization when loading an audio file. Defaults to False.

        db_level (int, optional):
            dB level used for rms normalization. The range is -99 to 0. Defaults to None.

        power (float):
            Exponent used for expanding spectrogram levels before running Griffin Lim. It helps to reduce the
            artifacts in the synthesized voice. Defaults to 1.5.

        griffin_lim_iters (int):
            Number of Griffin Lim iterations. Defaults to 60.

        num_mels (int):
            Number of mel-basis frames that defines the frame lengths of each mel-spectrogram frame. Defaults to 80.

        mel_fmin (float):
            Min frequency level used for the mel-basis filters. ~50 for male and ~95 for female voices.
            It needs to be adjusted for a dataset. Defaults to 0.

        mel_fmax (float):
            Max frequency level used for the mel-basis filters. It needs to be adjusted for a dataset.
            Defaults to None.

        spec_gain (int):
            Gain applied when converting amplitude to DB. Defaults to 20.

        signal_norm (bool):
            enable/disable signal normalization. Defaults to True.

        min_level_db (int):
            minimum db threshold for the computed melspectrograms. Defaults to -100.

        symmetric_norm (bool):
            enable/disable symmetric normalization. If set True normalization is performed in the range [-k, k] else
            [0, k], Defaults to True.

        max_norm (float):
            ```k``` defining the normalization range. Defaults to 4.0.

        clip_norm (bool):
            enable/disable clipping the out of range values in the normalized audio signal. Defaults to True.

        stats_path (str):
            Path to the computed stats file. Defaults to None.
    """

    # stft parameters
    fft_size: int = 1024
    win_length: int = 1024
    hop_length: int = 256
    frame_shift_ms: int = None
    frame_length_ms: int = None
    stft_pad_mode: str = "reflect"
    # audio processing parameters
    sample_rate: int = 22050
    resample: bool = False
    preemphasis: float = 0.0
    ref_level_db: int = 20
    do_sound_norm: bool = False
    log_func: str = "np.log10"
    # silence trimming
    do_trim_silence: bool = True
    trim_db: int = 45
    # rms volume normalization
    do_rms_norm: bool = False
    db_level: float = None
    # griffin-lim params
    power: float = 1.5
    griffin_lim_iters: int = 60
    # mel-spec params
    num_mels: int = 80
    mel_fmin: float = 0.0
    mel_fmax: float = None
    spec_gain: int = 20
    do_amp_to_db_linear: bool = True
    do_amp_to_db_mel: bool = True
    # f0 params
    pitch_fmax: float = 640.0
    pitch_fmin: float = 1.0
    # normalization params
    signal_norm: bool = True
    min_level_db: int = -100
    symmetric_norm: bool = True
    max_norm: float = 4.0
    clip_norm: bool = True
    stats_path: str = None

    def check_values(
        self,
    ):
        """Check config fields"""
        c = asdict(self)
        check_argument("num_mels", c, restricted=True, min_val=10, max_val=2056)
        check_argument("fft_size", c, restricted=True, min_val=128, max_val=4058)
        check_argument("sample_rate", c, restricted=True, min_val=512, max_val=100000)
        # the frame length / shift in ms may be left unset when win_length / hop_length are given
        check_argument(
            "frame_length_ms",
            c,
            restricted=True,
            min_val=10,
            max_val=1000,
            alternative="win_length",
        )
        check_argument("frame_shift_ms", c, restricted=True, min_val=1, max_val=1000, alternative="hop_length")
        check_argument("preemphasis", c, restricted=True, min_val=0, max_val=1)
        check_argument("min_level_db", c, restricted=True, min_val=-1000, max_val=10)
        check_argument("ref_level_db", c, restricted=True, min_val=0, max_val=1000)
        check_argument("power", c, restricted=True, min_val=1, max_val=5)
        check_argument("griffin_lim_iters", c, restricted=True, min_val=10, max_val=1000)

        # normalization parameters
        check_argument("signal_norm", c, restricted=True)
        check_argument("symmetric_norm", c, restricted=True)
        check_argument("max_norm", c, restricted=True, min_val=0.1, max_val=1000)
        check_argument("clip_norm", c, restricted=True)
        check_argument("mel_fmin", c, restricted=True, min_val=0.0, max_val=1000)
        check_argument("mel_fmax", c, restricted=True, min_val=500.0, allow_none=True)
        check_argument("spec_gain", c, restricted=True, min_val=1, max_val=100)
        check_argument("do_trim_silence", c, restricted=True)
        check_argument("trim_db", c, restricted=True)
189
+
190
+
191
@dataclass
class BaseDatasetConfig(Coqpit):
    """Base config describing one TTS dataset.

    Args:
        formatter (str):
            Formatter name that defines used formatter in ```TTS.tts.datasets.formatter```. Defaults to `""`.

        dataset_name (str):
            Unique name for the dataset. Defaults to `""`.

        path (str):
            Root path to the dataset files. Defaults to `""`.

        meta_file_train (str):
            Name of the dataset meta file. Or a list of speakers to be ignored at training for multi-speaker datasets.
            Defaults to `""`.

        ignored_speakers (List):
            List of speakers IDs that are not used at the training. Default None.

        language (str):
            Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to `""`.

        phonemizer (str):
            Phonemizer used for that dataset's language. By default it uses `DEF_LANG_TO_PHONEMIZER`. Defaults to `""`.

        meta_file_val (str):
            Name of the dataset meta file that defines the instances used at validation.

        meta_file_attn_mask (str):
            Path to the file that lists the attention mask files used with models that require attention masks to
            train the duration predictor.
    """

    formatter: str = ""
    dataset_name: str = ""
    path: str = ""
    meta_file_train: str = ""
    ignored_speakers: List[str] = None
    language: str = ""
    phonemizer: str = ""
    meta_file_val: str = ""
    meta_file_attn_mask: str = ""

    def check_values(
        self,
    ):
        """Check config fields"""
        values = asdict(self)
        # fields checked with restricted=True
        for field_name in ("formatter", "path", "meta_file_train"):
            check_argument(field_name, values, restricted=True)
        # fields checked with restricted=False
        for field_name in ("meta_file_val", "meta_file_attn_mask"):
            check_argument(field_name, values, restricted=False)
246
+
247
+
248
@dataclass
class BaseTrainingConfig(TrainerConfig):
    """Basic 🐸TTS training parameters shared by all the models, built on top of
    ```Trainer.TrainingConfig```.

    Args:
        model (str):
            Name of the model that is used in the training. Defaults to None.

        num_loader_workers (int):
            Number of workers for training time dataloader. Defaults to 0.

        num_eval_loader_workers (int):
            Number of workers for evaluation time dataloader. Defaults to 0.

        use_noise_augment (bool):
            Noise augmentation switch. Defaults to False.
    """

    model: str = None
    # dataloading
    num_loader_workers: int = 0
    num_eval_loader_workers: int = 0
    use_noise_augment: bool = False
TTS/config/__pycache__/TTS_config___pycache_____init__.cpython-39.pyc ADDED
Binary file (4.08 kB). View file
 
TTS/config/__pycache__/TTS_config___pycache___shared_configs.cpython-39.pyc ADDED
Binary file (9.52 kB). View file
 
TTS/encoder/TTS_encoder_README.md ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### Speaker Encoder
2
+
3
+ This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding.
4
+
5
+ With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart.
6
+
7
+ Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q).
8
+
9
+ ![](umap.png)
10
+
11
+ Download a pretrained model from the [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.
12
+
13
+ To run the code, you need to follow the same flow as in TTS.
14
+
15
+ - Define 'config.json' for your needs. Note that the audio parameters should match those of your TTS model.
16
+ - Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
17
+ - Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
18
+ - Watch training on Tensorboard as in TTS
TTS/encoder/TTS_encoder___init__.py ADDED
File without changes
TTS/encoder/TTS_encoder_dataset.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+
3
+ import torch
4
+ from torch.utils.data import Dataset
5
+
6
+ from TTS.encoder.utils.generic_utils import AugmentWAV
7
+
8
+
9
class EncoderDataset(Dataset):
    """Dataset of fixed-length audio segments grouped by class for encoder training.

    Each dataset item is a dict ``{"wav_file_path": ..., "class_name": ...}``; the
    audio itself is loaded, randomly cropped and converted to features in
    :meth:`collate_fn`.
    """

    def __init__(
        self,
        config,
        ap,
        meta_data,
        voice_len=1.6,
        num_classes_in_batch=64,
        num_utter_per_class=10,
        verbose=False,
        augmentation_config=None,
        use_torch_spec=None,
    ):
        """
        Args:
            config: config object; ``config.class_name_key`` names the key of each
                metadata item that holds its class label.
            ap (TTS.tts.utils.AudioProcessor): audio processor object. Its
                ``sample_rate``, ``load_wav`` and ``melspectrogram`` members are used.
            meta_data (list): list of dataset instances (dicts with an ``"audio_file"``
                entry and a class label entry).
            voice_len (float): voice segment length in seconds.
            num_classes_in_batch (int): only reported in the verbose summary here.
            num_utter_per_class (int): minimum number of utterances a class needs
                in order to be kept.
            verbose (bool): print diagnostic information.
            augmentation_config (dict): optional augmentation settings. When given it
                must contain ``"p"``; ``"additive"``/``"rir"`` enable ``AugmentWAV`` and
                ``"gaussian"`` is stored as ``gaussian_augmentation_config``.
            use_torch_spec (bool): if truthy, ``collate_fn`` returns raw waveform
                segments instead of mel spectrograms.
        """
        super().__init__()
        self.config = config
        self.items = meta_data
        self.sample_rate = ap.sample_rate
        # segment length expressed in samples
        self.seq_len = int(voice_len * self.sample_rate)
        self.num_utter_per_class = num_utter_per_class
        self.ap = ap
        self.verbose = verbose
        self.use_torch_spec = use_torch_spec
        self.classes, self.items = self.__parse_items()

        self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}

        # Data Augmentation
        self.augmentator = None
        self.gaussian_augmentation_config = None
        # always defined so the attribute can be read even without augmentation
        self.data_augmentation_p = None
        if augmentation_config:
            self.data_augmentation_p = augmentation_config["p"]
            if self.data_augmentation_p and ("additive" in augmentation_config or "rir" in augmentation_config):
                self.augmentator = AugmentWAV(ap, augmentation_config)

            if "gaussian" in augmentation_config:
                self.gaussian_augmentation_config = augmentation_config["gaussian"]

        if self.verbose:
            print("\n > DataLoader initialization")
            print(f" | > Classes per Batch: {num_classes_in_batch}")
            print(f" | > Number of instances : {len(self.items)}")
            print(f" | > Sequence length: {self.seq_len}")
            print(f" | > Num Classes: {len(self.classes)}")
            print(f" | > Classes: {self.classes}")

    def load_wav(self, filename):
        """Load ``filename`` through the audio processor at its sample rate."""
        audio = self.ap.load_wav(filename, sr=self.ap.sample_rate)
        return audio

    def __parse_items(self):
        """Filter the metadata and return ``(sorted class names, usable items)``.

        A class is kept only if it has at least ``num_utter_per_class`` utterances,
        and an utterance is kept only if it is strictly longer than ``seq_len``
        samples (so a random crop is always possible).
        """
        class_key = self.config.class_name_key

        class_to_utters = {}
        for item in self.items:
            class_to_utters.setdefault(item[class_key], []).append(item["audio_file"])

        # keep only classes with at least `num_utter_per_class` utterances
        class_to_utters = {k: v for (k, v) in class_to_utters.items() if len(v) >= self.num_utter_per_class}

        classes = sorted(class_to_utters)
        kept_classes = set(classes)

        new_items = []
        for item in self.items:
            path_ = item["audio_file"]
            # use the same key as the grouping pass so both passes agree on labels
            class_name = item[class_key]
            # ignore filtered classes
            if class_name not in kept_classes:
                continue
            # ignore small audios
            if self.load_wav(path_).shape[0] - self.seq_len <= 0:
                continue

            new_items.append({"wav_file_path": path_, "class_name": class_name})

        return classes, new_items

    def __len__(self):
        return len(self.items)

    def get_num_classes(self):
        """Return the number of classes kept after filtering."""
        return len(self.classes)

    def get_class_list(self):
        """Return the sorted list of class names."""
        return self.classes

    def set_classes(self, classes):
        """Replace the class list and rebuild the name -> id mapping."""
        self.classes = classes
        self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}

    def get_map_classid_to_classname(self):
        """Return the inverse mapping ``class id -> class name``."""
        return {c_id: c_n for c_n, c_id in self.classname_to_classid.items()}

    def __getitem__(self, idx):
        return self.items[idx]

    def collate_fn(self, batch):
        """Turn a list of items into ``(feats, labels)`` tensors.

        Every waveform is randomly cropped to ``seq_len`` samples, optionally
        augmented, and then either kept as a waveform (``use_torch_spec``) or
        converted with ``ap.melspectrogram``. ``labels`` is a ``LongTensor`` of
        class ids.
        """
        labels = []
        feats = []
        for item in batch:
            utter_path = item["wav_file_path"]
            class_name = item["class_name"]

            # get classid
            class_id = self.classname_to_classid[class_name]
            # load wav file
            wav = self.load_wav(utter_path)
            # random crop; items were filtered to be longer than seq_len
            offset = random.randint(0, wav.shape[0] - self.seq_len)
            wav = wav[offset : offset + self.seq_len]

            if self.augmentator is not None and self.data_augmentation_p:
                if random.random() < self.data_augmentation_p:
                    wav = self.augmentator.apply_one(wav)

            if not self.use_torch_spec:
                mel = self.ap.melspectrogram(wav)
                feats.append(torch.FloatTensor(mel))
            else:
                feats.append(torch.FloatTensor(wav))

            labels.append(class_id)

        feats = torch.stack(feats)
        labels = torch.LongTensor(labels)

        return feats, labels
TTS/encoder/TTS_encoder_losses.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from torch import nn
4
+
5
+
6
# adapted from https://github.com/cvqluu/GE2E-Loss
class GE2ELoss(nn.Module):
    def __init__(self, init_w=10.0, init_b=-5.0, loss_method="softmax"):
        """
        Implementation of the Generalized End-to-End loss defined in https://arxiv.org/abs/1710.10467 [1]
        Accepts an input of size (N, M, D)
            where N is the number of speakers in the batch,
            M is the number of utterances per speaker,
            and D is the dimensionality of the embedding vector (e.g. d-vector)
        Args:
            - init_w (float): defines the initial value of w in Equation (5) of [1]
            - init_b (float): defines the initial value of b in Equation (5) of [1]
            - loss_method (str): "softmax" or "contrast"; selects the per-embedding loss.
        """
        super().__init__()
        # pylint: disable=E1102
        self.w = nn.Parameter(torch.tensor(init_w))
        # pylint: disable=E1102
        self.b = nn.Parameter(torch.tensor(init_b))
        self.loss_method = loss_method

        print(" > Initialized Generalized End-to-End loss")

        assert self.loss_method in ["softmax", "contrast"]

        # bind the per-embedding loss once so `forward` does not branch
        self.embed_loss = {
            "softmax": self.embed_loss_softmax,
            "contrast": self.embed_loss_contrast,
        }[self.loss_method]

    # pylint: disable=R0201
    def calc_new_centroids(self, dvecs, centroids, spkr, utt):
        """
        Calculates the new centroids excluding the reference utterance

        The centroid of speaker `spkr` is recomputed as the mean of all its
        utterances except `utt`; the other centroids are passed through.
        """
        excl = torch.cat((dvecs[spkr, :utt], dvecs[spkr, utt + 1 :]))
        excl = torch.mean(excl, 0)
        new_centroids = [excl if i == spkr else centroid for i, centroid in enumerate(centroids)]
        return torch.stack(new_centroids)

    def calc_cosine_sim(self, dvecs, centroids):
        """
        Make the cosine similarity matrix with dims (N,M,N)

        Similarities are clamped from below at 1e-6.
        """
        cos_sim_matrix = []
        for spkr_idx, speaker in enumerate(dvecs):
            cs_row = []
            for utt_idx, utterance in enumerate(speaker):
                new_centroids = self.calc_new_centroids(dvecs, centroids, spkr_idx, utt_idx)
                # vector based cosine similarity for speed
                cs_row.append(
                    torch.clamp(
                        torch.mm(
                            utterance.unsqueeze(1).transpose(0, 1),
                            new_centroids.transpose(0, 1),
                        )
                        / (torch.norm(utterance) * torch.norm(new_centroids, dim=1)),
                        min=1e-6,
                    )
                )
            cs_row = torch.cat(cs_row, dim=0)
            cos_sim_matrix.append(cs_row)
        return torch.stack(cos_sim_matrix)

    # pylint: disable=R0201
    def embed_loss_softmax(self, dvecs, cos_sim_matrix):
        """
        Calculates the loss on each embedding $L(e_{ji})$ by taking softmax

        Returns a tensor of shape (N, M).
        """
        N, M, _ = dvecs.shape
        L = []
        for j in range(N):
            L_row = [-F.log_softmax(cos_sim_matrix[j, i], 0)[j] for i in range(M)]
            L.append(torch.stack(L_row))
        return torch.stack(L)

    # pylint: disable=R0201
    def embed_loss_contrast(self, dvecs, cos_sim_matrix):
        """
        Calculates the loss on each embedding $L(e_{ji})$ by contrast loss with closest centroid

        Returns a tensor of shape (N, M).
        """
        N, M, _ = dvecs.shape
        L = []
        for j in range(N):
            L_row = []
            for i in range(M):
                centroids_sigmoids = torch.sigmoid(cos_sim_matrix[j, i])
                excl_centroids_sigmoids = torch.cat((centroids_sigmoids[:j], centroids_sigmoids[j + 1 :]))
                L_row.append(1.0 - torch.sigmoid(cos_sim_matrix[j, i, j]) + torch.max(excl_centroids_sigmoids))
            L.append(torch.stack(L_row))
        return torch.stack(L)

    def forward(self, x, _label=None):
        """
        Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)

        `_label` is accepted for interface compatibility and ignored.
        """

        assert x.size()[1] >= 2

        centroids = torch.mean(x, 1)
        cos_sim_matrix = self.calc_cosine_sim(x, centroids)
        # torch.clamp is out-of-place: its result must be used, otherwise the
        # positivity constraint on w is never applied. The parameter itself is
        # left untouched; only the value used for scaling is clamped.
        w = torch.clamp(self.w, min=1e-6)
        cos_sim_matrix = w * cos_sim_matrix + self.b
        L = self.embed_loss(x, cos_sim_matrix)
        return L.mean()
119
+
120
+
121
# adapted from https://github.com/clovaai/voxceleb_trainer/blob/master/loss/angleproto.py
class AngleProtoLoss(nn.Module):
    """
    Implementation of the Angular Prototypical loss defined in https://arxiv.org/abs/2003.11982
        Accepts an input of size (N, M, D)
            where N is the number of speakers in the batch,
            M is the number of utterances per speaker,
            and D is the dimensionality of the embedding vector
        Args:
            - init_w (float): defines the initial value of w
            - init_b (float): defines the initial value of b
    """

    def __init__(self, init_w=10.0, init_b=-5.0):
        super().__init__()
        # pylint: disable=E1102
        self.w = nn.Parameter(torch.tensor(init_w))
        # pylint: disable=E1102
        self.b = nn.Parameter(torch.tensor(init_b))
        self.criterion = torch.nn.CrossEntropyLoss()

        print(" > Initialized Angular Prototypical loss")

    def forward(self, x, _label=None):
        """
        Calculates the AngleProto loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)

        The first utterance of each speaker is compared against the mean of the
        remaining utterances of every speaker. `_label` is ignored.
        """

        assert x.size()[1] >= 2

        out_anchor = torch.mean(x[:, 1:, :], 1)
        out_positive = x[:, 0, :]
        num_speakers = out_anchor.size()[0]

        # (N, N) matrix: entry [i, j] compares positive i with anchor j
        cos_sim_matrix = F.cosine_similarity(
            out_positive.unsqueeze(-1).expand(-1, -1, num_speakers),
            out_anchor.unsqueeze(-1).expand(-1, -1, num_speakers).transpose(0, 2),
        )
        # torch.clamp is out-of-place: use its result so the scale stays positive.
        # The parameter itself is not modified.
        w = torch.clamp(self.w, min=1e-6)
        cos_sim_matrix = cos_sim_matrix * w + self.b
        label = torch.arange(num_speakers).to(cos_sim_matrix.device)
        L = self.criterion(cos_sim_matrix, label)
        return L
164
+
165
+
166
class SoftmaxLoss(nn.Module):
    """
    Softmax classification loss as defined in https://arxiv.org/abs/2003.11982

    A single linear layer maps embeddings to class logits, which are scored
    with cross entropy.

    Args:
        - embedding_dim (float): speaker embedding dim
        - n_speakers (float): number of speakers
    """

    def __init__(self, embedding_dim, n_speakers):
        super().__init__()

        self.criterion = torch.nn.CrossEntropyLoss()
        self.fc = nn.Linear(embedding_dim, n_speakers)

        print("Initialised Softmax Loss")

    def forward(self, x, label=None):
        """Return the cross-entropy loss of `x` against `label`.

        Both inputs are flattened first: `x` to (-1, last dim) and `label` to 1-D.
        """
        # reshape for compatibility
        flat_embeddings = x.reshape(-1, x.size()[-1])
        flat_labels = label.reshape(-1)

        logits = self.fc(flat_embeddings)
        return self.criterion(logits, flat_labels)

    def inference(self, embedding):
        """Return the index of the most probable class for `embedding`."""
        logits = self.fc(embedding)
        probabilities = torch.nn.functional.softmax(logits, dim=1).squeeze(0)
        return torch.argmax(probabilities)
197
+
198
+
199
class SoftmaxAngleProtoLoss(nn.Module):
    """
    Sum of the Softmax loss and the Angular Prototypical loss, as defined in
    https://arxiv.org/abs/2009.14153

    Args:
        - embedding_dim (float): speaker embedding dim
        - n_speakers (float): number of speakers
        - init_w (float): defines the initial value of w
        - init_b (float): defines the initial value of b
    """

    def __init__(self, embedding_dim, n_speakers, init_w=10.0, init_b=-5.0):
        super().__init__()

        self.softmax = SoftmaxLoss(embedding_dim, n_speakers)
        self.angleproto = AngleProtoLoss(init_w, init_b)

        print("Initialised SoftmaxAnglePrototypical Loss")

    def forward(self, x, label=None):
        """
        Calculates the SoftmaxAnglePrototypical loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)

        The prototypical term ignores `label`; the softmax term requires it.
        """
        prototypical_term = self.angleproto(x)
        classification_term = self.softmax(x, label)
        return classification_term + prototypical_term
TTS/encoder/TTS_encoder_requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ umap-learn
2
+ numpy>=1.17.0
TTS/encoder/__pycache__/TTS_encoder___pycache_____init__.cpython-39.pyc ADDED
Binary file (160 Bytes). View file
 
TTS/encoder/__pycache__/TTS_encoder___pycache___losses.cpython-39.pyc ADDED
Binary file (7.83 kB). View file
 
TTS/encoder/configs/TTS_encoder_configs_base_encoder_config.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import asdict, dataclass, field
2
+ from typing import Dict, List
3
+
4
+ from coqpit import MISSING
5
+
6
+ from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig
7
+
8
+
9
+ @dataclass
10
+ class BaseEncoderConfig(BaseTrainingConfig):
11
+ """Defines parameters for a Generic Encoder model."""
12
+
13
+ model: str = None
14
+ audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
15
+ datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
16
+ # model params
17
+ model_params: Dict = field(
18
+ default_factory=lambda: {
19
+ "model_name": "lstm",
20
+ "input_dim": 80,
21
+ "proj_dim": 256,
22
+ "lstm_dim": 768,
23
+ "num_lstm_layers": 3,
24
+ "use_lstm_with_projection": True,
25
+ }
26
+ )
27
+
28
+ audio_augmentation: Dict = field(default_factory=lambda: {})
29
+
30
+ # training params
31
+ epochs: int = 1000
32
+ loss: str = "angleproto"
33
+ grad_clip: float = 3.0
34
+ lr: float = 0.0001
35
+ optimizer: str = "radam"
36
+ optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0})
37
+ lr_decay: bool = False
38
+ warmup_steps: int = 4000
39
+
40
+ # logging params
41
+ tb_model_param_stats: bool = False
42
+ steps_plot_stats: int = 10
43
+ save_step: int = 1000
44
+ print_step: int = 20
45
+ run_eval: bool = False
46
+
47
+ # data loader
48
+ num_classes_in_batch: int = MISSING
49
+ num_utter_per_class: int = MISSING
50
+ eval_num_classes_in_batch: int = None
51
+ eval_num_utter_per_class: int = None
52
+
53
+ num_loader_workers: int = MISSING
54
+ voice_len: float = 1.6
55
+
56
+ def check_values(self):
57
+ super().check_values()
58
+ c = asdict(self)
59
+ assert (
60
+ c["model_params"]["input_dim"] == self.audio.num_mels
61
+ ), " [!] model input dimendion must be equal to melspectrogram dimension."
TTS/encoder/configs/TTS_encoder_configs_emotion_encoder_config.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import asdict, dataclass
2
+
3
+ from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
4
+
5
+
6
@dataclass
class EmotionEncoderConfig(BaseEncoderConfig):
    """Defines parameters for Emotion Encoder model.

    Overrides the `model` name and adds the fields below on top of
    `BaseEncoderConfig`.
    """

    model: str = "emotion_encoder"
    # optional mapping from class id to class name; None when not set
    map_classid_to_classname: dict = None
    # key of a dataset item that holds its class label
    class_name_key: str = "emotion_name"
TTS/encoder/configs/TTS_encoder_configs_speaker_encoder_config.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import asdict, dataclass
2
+
3
+ from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
4
+
5
+
6
@dataclass
class SpeakerEncoderConfig(BaseEncoderConfig):
    """Defines parameters for Speaker Encoder model.

    Overrides the `model` name and sets the class label key on top of
    `BaseEncoderConfig`.
    """

    model: str = "speaker_encoder"
    # key of a dataset item that holds its class label
    class_name_key: str = "speaker_name"
TTS/encoder/models/TTS_encoder_models_base_encoder.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import torchaudio
4
+ from coqpit import Coqpit
5
+ from torch import nn
6
+
7
+ from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
8
+ from TTS.utils.generic_utils import set_init_dict
9
+ from TTS.utils.io import load_fsspec
10
+
11
+
12
class PreEmphasis(nn.Module):
    """Pre-emphasis filter ``y[t] = x[t] - coefficient * x[t - 1]`` for batched waveforms.

    The input is reflect-padded by one sample on the left so the output has
    the same length as the input.
    """

    def __init__(self, coefficient=0.97):
        super().__init__()
        self.coefficient = coefficient
        # two-tap kernel shaped (out_channels=1, in_channels=1, width=2)
        kernel = torch.FloatTensor([-self.coefficient, 1.0]).view(1, 1, -1)
        self.register_buffer("filter", kernel)

    def forward(self, x):
        """Filter `x` of shape (batch, samples) and return a tensor of the same shape."""
        assert x.dim() == 2

        padded = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect")
        filtered = torch.nn.functional.conv1d(padded, self.filter)
        return filtered.squeeze(1)
23
+
24
+
25
class BaseEncoder(nn.Module):
    """Base `encoder` class. Every new `encoder` model must inherit this.

    It defines common `encoder` specific functions. Subclasses are expected to
    provide `forward`, and the `use_torch_spec` / `audio_config` attributes read
    by `compute_embedding`.
    """

    # pylint: disable=W0102
    def __init__(self):
        super().__init__()

    def get_torch_mel_spectrogram_class(self, audio_config):
        """Build a waveform -> mel spectrogram module from `audio_config`.

        The returned `torch.nn.Sequential` applies `PreEmphasis` followed by
        `torchaudio.transforms.MelSpectrogram` with a Hamming window.
        """
        return torch.nn.Sequential(
            PreEmphasis(audio_config["preemphasis"]),
            torchaudio.transforms.MelSpectrogram(
                sample_rate=audio_config["sample_rate"],
                n_fft=audio_config["fft_size"],
                win_length=audio_config["win_length"],
                hop_length=audio_config["hop_length"],
                window_fn=torch.hamming_window,
                n_mels=audio_config["num_mels"],
            ),
        )

    @torch.no_grad()
    def inference(self, x, l2_norm=True):
        """Run `forward` without tracking gradients."""
        return self.forward(x, l2_norm)

    @torch.no_grad()
    def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True, l2_norm=True):
        """
        Generate embeddings for a batch of utterances
        x: 1xTxD

        `num_eval` windows of `num_frames` frames, evenly spaced along dim 1, are
        embedded in one batch. With `return_mean` the window embeddings are
        averaged into a single row; otherwise all of them are returned.
        """
        # map to the waveform size
        if self.use_torch_spec:
            num_frames = num_frames * self.audio_config["hop_length"]

        max_len = x.shape[1]

        # never ask for a window longer than the input
        if max_len < num_frames:
            num_frames = max_len

        offsets = np.linspace(0, max_len - num_frames, num=num_eval)

        frames_batch = []
        for offset in offsets:
            offset = int(offset)
            end_offset = int(offset + num_frames)
            frames = x[:, offset:end_offset]
            frames_batch.append(frames)

        frames_batch = torch.cat(frames_batch, dim=0)
        embeddings = self.inference(frames_batch, l2_norm=l2_norm)

        if return_mean:
            embeddings = torch.mean(embeddings, dim=0, keepdim=True)
        return embeddings

    def get_criterion(self, c: Coqpit, num_classes=None):
        """Instantiate the loss named by `c.loss`.

        Supported values are "ge2e", "angleproto" and "softmaxproto"; the last
        one also needs `num_classes`. Any other value raises `Exception`.
        """
        if c.loss == "ge2e":
            criterion = GE2ELoss(loss_method="softmax")
        elif c.loss == "angleproto":
            criterion = AngleProtoLoss()
        elif c.loss == "softmaxproto":
            criterion = SoftmaxAngleProtoLoss(c.model_params["proj_dim"], num_classes)
        else:
            raise Exception("The %s not is a loss supported" % c.loss)
        return criterion

    def load_checkpoint(
        self,
        config: Coqpit,
        checkpoint_path: str,
        eval: bool = False,
        use_cuda: bool = False,
        criterion=None,
        cache=False,
    ):
        """Restore model (and optionally criterion) weights from a checkpoint.

        Args:
            config (Coqpit): model config; used for partial initialization and to
                rebuild the criterion at inference time.
            checkpoint_path (str): path handed to `load_fsspec`.
            eval (bool): if True, loading errors are re-raised and the model is put
                in eval mode.
            use_cuda (bool): move the model (and criterion, if any) to CUDA.
            criterion: optional loss module whose state is restored when the
                checkpoint contains a "criterion" entry.
            cache (bool): forwarded to `load_fsspec`.

        Returns:
            `criterion` when `eval` is True, otherwise `(criterion, step)` where
            `step` is read from the checkpoint.
        """
        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
        try:
            self.load_state_dict(state["model"])
            print(" > Model fully restored. ")
        except (KeyError, RuntimeError) as error:
            # If eval raise the error
            if eval:
                raise error

            print(" > Partial model initialization.")
            model_dict = self.state_dict()
            # pass the config given to this method (the original referenced an undefined name)
            model_dict = set_init_dict(model_dict, state["model"], config)
            self.load_state_dict(model_dict)
            del model_dict

        # load the criterion for restore_path
        if criterion is not None and "criterion" in state:
            try:
                criterion.load_state_dict(state["criterion"])
            except (KeyError, RuntimeError) as error:
                print(" > Criterion load ignored because of:", error)

        # instance and load the criterion for the encoder classifier in inference time
        if (
            eval
            and criterion is None
            and "criterion" in state
            and getattr(config, "map_classid_to_classname", None) is not None
        ):
            criterion = self.get_criterion(config, len(config.map_classid_to_classname))
            criterion.load_state_dict(state["criterion"])

        if use_cuda:
            self.cuda()
            if criterion is not None:
                criterion = criterion.cuda()

        if eval:
            self.eval()
            assert not self.training

        if not eval:
            return criterion, state["step"]
        return criterion