Spaces:

robinhad
/

ukrainian-tts

Running

App Files Community

Yurii Paniv committed on Sep 8, 2022

Commit

3502c7a

•

1 Parent(s): 450d18c

Release 3.0.0

Browse files

Files changed (4) hide show

README.md +4 -2
app.py +22 -18
config.json +29 -31
requirements.txt +3 -1

README.md CHANGED Viewed

@@ -4,6 +4,8 @@ emoji: 🐌
 colorFrom: blue
 colorTo: yellow
 sdk: gradio
 app_file: app.py
 pinned: false
 ---
@@ -15,7 +17,7 @@ Link to online demo -> [https://huggingface.co/spaces/robinhad/ukrainian-tts](ht
 Code is licensed under `MIT License`, models are under `GNU GPL v3 License`.
 # Support
-If you like my work, please support -> ![mono](https://www.monobank.ua/favicon.ico) [SUPPORT LINK](https://send.monobank.ua/jar/48iHq4xAXm)
 # Example
 `Mykyta (male)`:
@@ -50,6 +52,6 @@ tts-server --model_path path/to/model.pth \
 # Attribution 🤝
 - Model training - [Yurii Paniv @robinhad](https://github.com/robinhad)
-- Mykyta and Olena dataset - [Yehor Smoliakov @egorsmkv](https://github.com/egorsmkv)
 - Autostress (with dictionary) using [ukrainian-word-stress](https://github.com/lang-uk/ukrainian-word-stress) - [Oleksiy Syvokon @asivokon](https://github.com/asivokon)
 - Autostress (with model) using [ukrainian-accentor](https://github.com/egorsmkv/ukrainian-accentor) - [Bohdan Mykhailenko @NeonBohdan](https://github.com/NeonBohdan) + [Yehor Smoliakov @egorsmkv](https://github.com/egorsmkv)

 colorFrom: blue
 colorTo: yellow
 sdk: gradio
+sdk_version : 3.3
+python_version: 3.9
 app_file: app.py
 pinned: false
 ---
 Code is licensed under `MIT License`, models are under `GNU GPL v3 License`.
 # Support
+If you like my work, please support -> [https://send.monobank.ua/jar/48iHq4xAXm](https://send.monobank.ua/jar/48iHq4xAXm)
 # Example
 `Mykyta (male)`:
 # Attribution 🤝
 - Model training - [Yurii Paniv @robinhad](https://github.com/robinhad)
+- Mykyta, Olena and Lada dataset - [Yehor Smoliakov @egorsmkv](https://github.com/egorsmkv)
 - Autostress (with dictionary) using [ukrainian-word-stress](https://github.com/lang-uk/ukrainian-word-stress) - [Oleksiy Syvokon @asivokon](https://github.com/asivokon)
 - Autostress (with model) using [ukrainian-accentor](https://github.com/egorsmkv/ukrainian-accentor) - [Bohdan Mykhailenko @NeonBohdan](https://github.com/NeonBohdan) + [Yehor Smoliakov @egorsmkv](https://github.com/egorsmkv)

app.py CHANGED Viewed

@@ -17,8 +17,9 @@ class StressOption(Enum):
 class VoiceOption(Enum):
-    FemaleVoice = "Олена (жіночий) 👩"
-    MaleVoice = "Микита (чоловічий) 👨"
 def download(url, file_name):
@@ -32,7 +33,7 @@ def download(url, file_name):
 print("downloading uk/mykyta/vits-tts")
-release_number = "v2.0.0"
 model_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/model-inference.pth"
 config_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/config.json"
 speakers_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/speakers.pth"
@@ -70,7 +71,11 @@ def tts(text: str, voice: str, stress: str):
     autostress_with_model = (
         True if stress == StressOption.AutomaticStressWithModel.value else False
     )
-    speaker_name = "male1" if voice == VoiceOption.MaleVoice.value else "female3"
     text = preprocess_text(text, autostress_with_model)
     text_limit = 7200
     text = (
@@ -85,6 +90,11 @@ def tts(text: str, voice: str, stress: str):
         return fp.name, text
 iface = gr.Interface(
     fn=tts,
     inputs=[
@@ -95,7 +105,7 @@ iface = gr.Interface(
         gr.inputs.Radio(
             label="Голос",
             choices=[option.value for option in VoiceOption],
-            default=VoiceOption.FemaleVoice.value,
         ),
         gr.inputs.Radio(
             label="Наголоси",
@@ -108,39 +118,33 @@ iface = gr.Interface(
     ],
     title="🐸💬🇺🇦 - Coqui TTS",
     description="Україномовний🇺🇦 TTS за допомогою Coqui TTS (щоб вручну поставити наголос, використовуйте + перед голосною)",
-    article="Якщо вам подобається, підтримайте за посиланням: [SUPPORT LINK](https://send.monobank.ua/jar/48iHq4xAXm),  "
-    + "Github: [https://github.com/robinhad/ukrainian-tts](https://github.com/robinhad/ukrainian-tts)   \n"
-    + "Model training - [Yurii Paniv @robinhad](https://github.com/robinhad)   \n"
-    + "Mykyta and Olena dataset - [Yehor Smoliakov @egorsmkv](https://github.com/egorsmkv)   \n"
-    + "Autostress (with dictionary) using [ukrainian-word-stress](https://github.com/lang-uk/ukrainian-word-stress) - [Oleksiy Syvokon @asivokon](https://github.com/asivokon)    \n"
-    + "Autostress (with model) using [ukrainian-accentor](https://github.com/egorsmkv/ukrainian-accentor) - [Bohdan Mykhailenko @NeonBohdan](https://github.com/NeonBohdan) + [Yehor Smoliakov @egorsmkv](https://github.com/egorsmkv)    \n"
-    + f'<center><img src="{badge}" alt="visitors badge"/></center>',
     examples=[
         [
             "Введіть, будь ласка, своє речення.",
-            VoiceOption.FemaleVoice.value,
             StressOption.AutomaticStress.value,
         ],
         [
             "Введіть, будь ласка, своє речення.",
-            VoiceOption.MaleVoice.value,
             StressOption.AutomaticStress.value,
         ],
         [
             "Вв+едіть, будь ласка, св+оє реч+ення.",
-            VoiceOption.MaleVoice.value,
             StressOption.AutomaticStress.value,
         ],
         [
             "Привіт, як тебе звати?",
-            VoiceOption.FemaleVoice.value,
             StressOption.AutomaticStress.value,
         ],
         [
             "Договір підписано 4 квітня 1949 року.",
-            VoiceOption.FemaleVoice.value,
             StressOption.AutomaticStress.value,
         ],
     ],
 )
-iface.launch(enable_queue=True, prevent_thread_lock=True)

 class VoiceOption(Enum):
+    Olena = "Олена (жіночий) 👩"
+    Mykyta = "Микита (чоловічий) 👨"
+    Lada = "Лада (жіночий) 👩"
 def download(url, file_name):
 print("downloading uk/mykyta/vits-tts")
+release_number = "v3.0.0-alpha"
 model_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/model-inference.pth"
 config_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/config.json"
 speakers_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/speakers.pth"
     autostress_with_model = (
         True if stress == StressOption.AutomaticStressWithModel.value else False
     )
+    speaker_name = "mykyta"
+    if voice == VoiceOption.Olena.value:
+        speaker_name = "olena"
+    elif voice == VoiceOption.Lada.value:
+        speaker_name = "lada"
     text = preprocess_text(text, autostress_with_model)
     text_limit = 7200
     text = (
         return fp.name, text
+with open("README.md") as file:
+    article = file.read()
+    article = article[article.find("---\n", 4) + 5::]
 iface = gr.Interface(
     fn=tts,
     inputs=[
         gr.inputs.Radio(
             label="Голос",
             choices=[option.value for option in VoiceOption],
+            default=VoiceOption.Olena.value,
         ),
         gr.inputs.Radio(
             label="Наголоси",
     ],
     title="🐸💬🇺🇦 - Coqui TTS",
     description="Україномовний🇺🇦 TTS за допомогою Coqui TTS (щоб вручну поставити наголос, використовуйте + перед голосною)",
+    article=article + f'\n  <center><img src="{badge}" alt="visitors badge"/></center>',
     examples=[
         [
             "Введіть, будь ласка, своє речення.",
+            VoiceOption.Olena.value,
             StressOption.AutomaticStress.value,
         ],
         [
             "Введіть, будь ласка, своє речення.",
+            VoiceOption.Mykyta.value,
             StressOption.AutomaticStress.value,
         ],
         [
             "Вв+едіть, будь ласка, св+оє реч+ення.",
+            VoiceOption.Mykyta.value,
             StressOption.AutomaticStress.value,
         ],
         [
             "Привіт, як тебе звати?",
+            VoiceOption.Olena.value,
             StressOption.AutomaticStress.value,
         ],
         [
             "Договір підписано 4 квітня 1949 року.",
+            VoiceOption.Lada.value,
             StressOption.AutomaticStress.value,
         ],
     ],
 )
+iface.launch(enable_queue=True)

config.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
     "output_path": "/home/robinhad/Projects/TTS",
     "logger_uri": null,
-    "run_name": "vits_mykyta_woman",
     "project_name": null,
     "run_description": "\ud83d\udc38Coqui trainer run.",
     "print_step": 25,
@@ -9,8 +9,8 @@
     "model_param_stats": false,
     "wandb_entity": null,
     "dashboard_logger": "tensorboard",
-    "log_model_step": 10000,
-    "save_step": 10000,
     "save_n_checkpoints": 5,
     "save_checkpoints": true,
     "save_all_best": false,
@@ -19,15 +19,16 @@
     "print_eval": false,
     "test_delay_epochs": -1,
     "run_eval": true,
     "distributed_backend": "nccl",
     "distributed_url": "tcp://localhost:54321",
     "mixed_precision": true,
-    "epochs": 1000,
-    "batch_size": 32,
     "eval_batch_size": 16,
     "grad_clip": [
-        1000.0,
-        1000.0
     ],
     "scheduler_after_epoch": true,
     "lr": 0.001,
@@ -45,11 +46,11 @@
     "use_grad_scaler": false,
     "cudnn_enable": true,
     "cudnn_deterministic": false,
-    "cudnn_benchmark": true,
     "training_seed": 54321,
     "model": "vits",
-    "num_loader_workers": 8,
-    "num_eval_loader_workers": 8,
     "use_noise_augment": false,
     "audio": {
         "fft_size": 1024,
@@ -60,20 +61,20 @@
         "stft_pad_mode": "reflect",
         "sample_rate": 22050,
         "resample": false,
-        "preemphasis": 0.0,
         "ref_level_db": 35,
         "do_sound_norm": true,
         "log_func": "np.log",
         "do_trim_silence": false,
         "trim_db": 35,
         "do_rms_norm": false,
-        "db_level": null,
         "power": 1.1,
         "griffin_lim_iters": 60,
         "num_mels": 80,
-        "mel_fmin": 0.0,
-        "mel_fmax": 8000.0,
-        "spec_gain": 6,
         "do_amp_to_db_linear": true,
         "do_amp_to_db_mel": true,
         "pitch_fmax": 640.0,
@@ -110,23 +111,20 @@
     "batch_group_size": 0,
     "loss_masking": null,
     "sort_by_audio_len": true,
-    "min_audio_len": 1,
     "max_audio_len": 264600,
     "min_text_len": 1,
     "max_text_len": Infinity,
     "compute_f0": false,
     "compute_linear_spec": true,
-    "precompute_num_workers": 8,
     "start_by_longest": false,
     "datasets": [
         {
             "name": "mailabs",
-            "path": "./mailabs-processed",
             "meta_file_train": "",
-            "ignored_speakers": [
-                "female1",
-                "female2"
-            ],
             "language": "",
             "meta_file_val": "",
             "meta_file_attn_mask": ""
@@ -135,7 +133,7 @@
     "test_sentences": [
         [
             "\u0414+\u0435\u0441\u044f\u0442\u044c \u0440\u0430\u0437+\u0456\u0432 \u0432\u0456\u0434\u043c+\u0456\u0440\u044f\u0439, +\u0430 \u0440+\u0430\u0437 - \u0432\u0456\u0434\u0440+\u0456\u0436.",
-            "female3",
             null,
             null
         ],
@@ -144,25 +142,25 @@
         ],
         [
             "\u041f\u0435\u0440\u0435\u043f\u0440+\u043e\u0448\u0443\u044e, \u0414+\u0435\u0439\u0432\u0435, \u043d+\u0430 \u0436+\u0430\u043b\u044c, +\u044f \u043d+\u0435 \u043c+\u043e\u0436\u0443 \u0437\u0440\u043e\u0431+\u0438\u0442\u0438 \u0446+\u0435.",
-            "female3",
             null,
             null
         ],
         [
             "\u041f\u0435\u0440\u0435\u043f\u0440+\u043e\u0448\u0443\u044e, \u0414+\u0435\u0439\u0432\u0435, \u043d+\u0430 \u0436+\u0430\u043b\u044c, +\u044f \u043d+\u0435 \u043c+\u043e\u0436\u0443 \u0437\u0440\u043e\u0431+\u0438\u0442\u0438 \u0446+\u0435.",
-            "male1",
             null,
             null
         ],
         [
             "\u0425\u0442+\u043e \u0442+\u0438 \u0442\u0430\u043a+\u0438\u0439 +\u0456 +\u044f\u043a \u0442\u0435\u0431+\u0435 \u0437\u0432+\u0430\u0442\u0438?",
-            "male1",
             null,
             null
         ],
         [
             "\u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u0438\u0439 - \u043c+\u0456\u0441\u0442\u043e \u0432 \u0425\u043c\u0435\u043b\u044c\u043d+\u0438\u0446\u044c\u043a\u0456\u0439 +\u043e\u0431\u043b\u0430\u0441\u0442\u0456 \u0423\u043a\u0440\u0430+\u0457\u043d\u0438, \u0446+\u0435\u043d\u0442\u0440 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0457 \u043c\u0456\u0441\u044c\u043a+\u043e\u0457 \u043e\u0431'+\u0454\u0434\u043d\u0430\u043d\u043e\u0457 \u0442\u0435\u0440\u0438\u0442\u043e\u0440\u0456+\u0430\u043b\u044c\u043d\u043e\u0457 \u0433\u0440\u043e\u043c+\u0430\u0434\u0438 +\u0456 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0433\u043e \u0440\u0430\u0439+\u043e\u043d\u0443.",
-            "female3",
             null,
             null
         ]
@@ -238,15 +236,15 @@
         "use_sdp": true,
         "noise_scale": 1.0,
         "inference_noise_scale": 0.667,
-        "length_scale": 1.0,
         "noise_scale_dp": 1.0,
         "inference_noise_scale_dp": 1.0,
         "max_inference_len": null,
         "init_discriminator": true,
         "use_spectral_norm_disriminator": false,
         "use_speaker_embedding": true,
-        "num_speakers": 2,
-        "speakers_file": "./speakers.pth",
         "d_vector_file": null,
         "speaker_embedding_channels": 256,
         "use_d_vector_file": false,
@@ -293,7 +291,7 @@
     "r": 1,
     "num_speakers": 0,
     "use_speaker_embedding": true,
-    "speakers_file": "./speakers.pth",
     "speaker_embedding_channels": 256,
     "language_ids_file": null,
     "use_language_embedding": false,

 {
     "output_path": "/home/robinhad/Projects/TTS",
     "logger_uri": null,
+    "run_name": "vits_mykyta_latest",
     "project_name": null,
     "run_description": "\ud83d\udc38Coqui trainer run.",
     "print_step": 25,
     "model_param_stats": false,
     "wandb_entity": null,
     "dashboard_logger": "tensorboard",
+    "log_model_step": 5000,
+    "save_step": 5000,
     "save_n_checkpoints": 5,
     "save_checkpoints": true,
     "save_all_best": false,
     "print_eval": false,
     "test_delay_epochs": -1,
     "run_eval": true,
+    "run_eval_steps": null,
     "distributed_backend": "nccl",
     "distributed_url": "tcp://localhost:54321",
     "mixed_precision": true,
+    "epochs": 1500,
+    "batch_size": 64,
     "eval_batch_size": 16,
     "grad_clip": [
+        1000,
+        1000
     ],
     "scheduler_after_epoch": true,
     "lr": 0.001,
     "use_grad_scaler": false,
     "cudnn_enable": true,
     "cudnn_deterministic": false,
+    "cudnn_benchmark": false,
     "training_seed": 54321,
     "model": "vits",
+    "num_loader_workers": 12,
+    "num_eval_loader_workers": 12,
     "use_noise_augment": false,
     "audio": {
         "fft_size": 1024,
         "stft_pad_mode": "reflect",
         "sample_rate": 22050,
         "resample": false,
+        "preemphasis": 0,
         "ref_level_db": 35,
         "do_sound_norm": true,
         "log_func": "np.log",
         "do_trim_silence": false,
         "trim_db": 35,
         "do_rms_norm": false,
+        "db_level": -24,
         "power": 1.1,
         "griffin_lim_iters": 60,
         "num_mels": 80,
+        "mel_fmin": 0,
+        "mel_fmax": 8000,
+        "spec_gain": 6.0,
         "do_amp_to_db_linear": true,
         "do_amp_to_db_mel": true,
         "pitch_fmax": 640.0,
     "batch_group_size": 0,
     "loss_masking": null,
     "sort_by_audio_len": true,
+    "min_audio_len": 32768,
     "max_audio_len": 264600,
     "min_text_len": 1,
     "max_text_len": Infinity,
     "compute_f0": false,
     "compute_linear_spec": true,
+    "precompute_num_workers": 16,
     "start_by_longest": false,
     "datasets": [
         {
             "name": "mailabs",
+            "path": "/home/robinhad/Data/Audio/ukr-tts-dataset-mai",
             "meta_file_train": "",
+            "ignored_speakers": null,
             "language": "",
             "meta_file_val": "",
             "meta_file_attn_mask": ""
     "test_sentences": [
         [
             "\u0414+\u0435\u0441\u044f\u0442\u044c \u0440\u0430\u0437+\u0456\u0432 \u0432\u0456\u0434\u043c+\u0456\u0440\u044f\u0439, +\u0430 \u0440+\u0430\u0437 - \u0432\u0456\u0434\u0440+\u0456\u0436.",
+            "olena",
             null,
             null
         ],
         ],
         [
             "\u041f\u0435\u0440\u0435\u043f\u0440+\u043e\u0448\u0443\u044e, \u0414+\u0435\u0439\u0432\u0435, \u043d+\u0430 \u0436+\u0430\u043b\u044c, +\u044f \u043d+\u0435 \u043c+\u043e\u0436\u0443 \u0437\u0440\u043e\u0431+\u0438\u0442\u0438 \u0446+\u0435.",
+            "lada",
             null,
             null
         ],
         [
             "\u041f\u0435\u0440\u0435\u043f\u0440+\u043e\u0448\u0443\u044e, \u0414+\u0435\u0439\u0432\u0435, \u043d+\u0430 \u0436+\u0430\u043b\u044c, +\u044f \u043d+\u0435 \u043c+\u043e\u0436\u0443 \u0437\u0440\u043e\u0431+\u0438\u0442\u0438 \u0446+\u0435.",
+            "mykyta",
             null,
             null
         ],
         [
             "\u0425\u0442+\u043e \u0442+\u0438 \u0442\u0430\u043a+\u0438\u0439 +\u0456 +\u044f\u043a \u0442\u0435\u0431+\u0435 \u0437\u0432+\u0430\u0442\u0438?",
+            "mykyta",
             null,
             null
         ],
         [
             "\u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u0438\u0439 - \u043c+\u0456\u0441\u0442\u043e \u0432 \u0425\u043c\u0435\u043b\u044c\u043d+\u0438\u0446\u044c\u043a\u0456\u0439 +\u043e\u0431\u043b\u0430\u0441\u0442\u0456 \u0423\u043a\u0440\u0430+\u0457\u043d\u0438, \u0446+\u0435\u043d\u0442\u0440 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0457 \u043c\u0456\u0441\u044c\u043a+\u043e\u0457 \u043e\u0431'+\u0454\u0434\u043d\u0430\u043d\u043e\u0457 \u0442\u0435\u0440\u0438\u0442\u043e\u0440\u0456+\u0430\u043b\u044c\u043d\u043e\u0457 \u0433\u0440\u043e\u043c+\u0430\u0434\u0438 +\u0456 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0433\u043e \u0440\u0430\u0439+\u043e\u043d\u0443.",
+            "lada",
             null,
             null
         ]
         "use_sdp": true,
         "noise_scale": 1.0,
         "inference_noise_scale": 0.667,
+        "length_scale": 1,
         "noise_scale_dp": 1.0,
         "inference_noise_scale_dp": 1.0,
         "max_inference_len": null,
         "init_discriminator": true,
         "use_spectral_norm_disriminator": false,
         "use_speaker_embedding": true,
+        "num_speakers": 3,
+        "speakers_file": "speakers.pth",
         "d_vector_file": null,
         "speaker_embedding_channels": 256,
         "use_d_vector_file": false,
     "r": 1,
     "num_speakers": 0,
     "use_speaker_embedding": true,
+    "speakers_file": "speakers.pth",
     "speaker_embedding_channels": 256,
     "language_ids_file": null,
     "use_language_embedding": false,

requirements.txt CHANGED Viewed

@@ -1,3 +1,5 @@
-TTS==0.7.1
 ukrainian-word-stress==1.0.1
 git+https://github.com/egorsmkv/ukrainian-accentor.git@5b7971c4e135e3ff3283336962e63fc0b1c80f4c

+TTS==0.8.0
+torch==1.12.1
+--extra-index-url https://download.pytorch.org/whl/cu113
 ukrainian-word-stress==1.0.1
 git+https://github.com/egorsmkv/ukrainian-accentor.git@5b7971c4e135e3ff3283336962e63fc0b1c80f4c