vihangp committed on
Commit
66baa21
1 Parent(s): 633869b

Update app.py


made marathi the default language

Files changed (1)
  1. app.py +190 -1
app.py CHANGED
@@ -1,3 +1,192 @@
  import gradio as gr
 
- gr.load("models/facebook/mms-tts-mar").launch()
+ # gr.load("models/facebook/mms-tts-mar").launch()  # the Blocks demo below replaces this one-liner; launching here would block and nothing after it would run
+
+
+ import torch
+
+ from transformers import pipeline
+
+ import numpy as np
+ import gradio as gr
+
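+ # pick "cuda" when a GPU is visible, else "cpu"; transformers' pipeline() accepts this device string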
+ def _grab_best_device(use_gpu=True):
+     if torch.cuda.device_count() > 0 and use_gpu:
+         device = "cuda"
+     else:
+         device = "cpu"
+     return device
+
+ device = _grab_best_device()
+
+ default_model_per_language = {
+     "marathi": "facebook/mms-tts-mar"
+ }
+
+ models_per_language = {
+     "marathi": ["ylacombe/mms-mar-finetuned-monospeaker"]
+ }
+
+ # finetuned checkpoint loaded at startup (matches the default "marathi" selection below)
+ HUB_PATH = "ylacombe/mms-mar-finetuned-monospeaker"
+
+
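+ # cache both pipelines (finetuned model + non-finetuned baseline) together with the current
+ # model/language selection, so they are only reloaded when the UI choices change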
+ pipe_dict = {
+     "current_model": HUB_PATH,
+     "pipe": pipeline("text-to-speech", model=HUB_PATH, device=device),
+     "original_pipe": pipeline("text-to-speech", model=default_model_per_language["marathi"], device=device),
+     "language": "marathi",
+ }
+
+ title = """
+ # Explore MMS finetuning
+ ## Or how to access truly multilingual TTS
+ Massively Multilingual Speech (MMS) models are lightweight, low-latency TTS models based on the [VITS architecture](https://huggingface.co/docs/transformers/model_doc/vits).
+ They are part of Meta's [MMS](https://arxiv.org/abs/2305.13516) project, which aims to provide speech technology across a diverse range of languages. You can find more details about the supported languages and their ISO 639-3 codes in the [MMS Language Coverage Overview](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html),
+ and see all MMS-TTS checkpoints on the Hugging Face Hub: [facebook/mms-tts](https://huggingface.co/models?sort=trending&search=facebook%2Fmms-tts).
+
+ Coupled with the right data and the right training recipe, you can get an excellent finetuned version of any MMS checkpoint in **20 minutes** with as little as **80 to 150 samples**.
+ The training recipe is available in this [GitHub repository](https://github.com/ylacombe/finetune-hf-vits)!
+ """
+
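+ # number of generated-audio slots shown in the UI; the non-finetuned baseline gets one extra slot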
+ max_speakers = 1
+
+
+ # Inference
+ def generate_audio(text, model_id, language):
+
+     if pipe_dict["language"] != language:
+         gr.Warning(f"Language has changed - loading new default model: {default_model_per_language[language]}")
+         pipe_dict["language"] = language
+         pipe_dict["original_pipe"] = pipeline("text-to-speech", model=default_model_per_language[language], device=device)
+
+     if pipe_dict["current_model"] != model_id:
+         gr.Warning("Model has changed - loading new model")
+         pipe_dict["pipe"] = pipeline("text-to-speech", model=model_id, device=device)
+         pipe_dict["current_model"] = model_id
+
+     num_speakers = pipe_dict["pipe"].model.config.num_speakers
+
+     out = []
+     # first generate the non-finetuned model's result as a baseline
+     output = pipe_dict["original_pipe"](text)
+     output = gr.Audio(value=(output["sampling_rate"], output["audio"].squeeze()), type="numpy", autoplay=False, label=f"Non finetuned model prediction {default_model_per_language[language]}", show_label=True,
+                       visible=True)
+     out.append(output)
+
+     # the UI exposes max_speakers + 1 audio slots (baseline + generated speakers);
+     # pad with hidden components so the returned list always matches that count
+     if num_speakers > 1:
+         for i in range(min(num_speakers, max_speakers)):
+             forward_params = {"speaker_id": i}
+             output = pipe_dict["pipe"](text, forward_params=forward_params)
+
+             output = gr.Audio(value=(output["sampling_rate"], output["audio"].squeeze()), type="numpy", autoplay=False, label=f"Generated Audio - speaker {i}", show_label=True,
+                               visible=True)
+             out.append(output)
+         out.extend([gr.Audio(visible=False)] * max(0, max_speakers - num_speakers))
+     else:
+         output = pipe_dict["pipe"](text)
+         output = gr.Audio(value=(output["sampling_rate"], output["audio"].squeeze()), type="numpy", autoplay=False, label="Generated Audio - Mono speaker", show_label=True,
+                           visible=True)
+         out.append(output)
+         out.extend([gr.Audio(visible=False)] * (max_speakers - 1))
+     return out
+
+
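+ # minimal CSS: center the main container and the intro text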
+ css = """
+ #container{
+     margin: 0 auto;
+     max-width: 80rem;
+ }
+ #intro{
+     max-width: 100%;
+     text-align: center;
+     margin: 0 auto;
+ }
+ """
+ # Gradio blocks demo
+ with gr.Blocks(css=css) as demo_blocks:
+     gr.Markdown(title, elem_id="intro")
+
+     with gr.Row():
+         with gr.Column():
+             inp_text = gr.Textbox(label="Input Text", info="What sentence would you like to synthesise?")
+             btn = gr.Button("Generate Audio!")
+             language = gr.Dropdown(
+                 default_model_per_language.keys(),
+                 value="marathi",
+                 label="language",
+                 info="Language that you want to test"
+             )
+
+             model_id = gr.Dropdown(
+                 models_per_language["marathi"],
+                 value="ylacombe/mms-mar-finetuned-monospeaker",
+                 label="Model",
+                 info="Model you want to test",
+             )
+
+         with gr.Column():
+             outputs = []
+             # one slot for the baseline prediction plus max_speakers generated slots,
+             # matching the number of values generate_audio returns
+             for i in range(max_speakers + 1):
+                 out_audio = gr.Audio(type="numpy", autoplay=False, label=f"Generated Audio - speaker {i}", show_label=True, visible=False)
+                 outputs.append(out_audio)
+
+     with gr.Accordion("Datasets and models details", open=False):
+         gr.Markdown("""
+
+ For each language, we used 100 to 150 samples of a single speaker to finetune the model.
+ ### Spanish
+ * **Model**: [Spanish MMS TTS](https://huggingface.co/facebook/mms-tts-spa).
+ * **Datasets**:
+   - [Chilean Spanish TTS dataset](https://huggingface.co/datasets/ylacombe/google-chilean-spanish).
+ ### Tamil
+ * **Model**: [Tamil MMS TTS](https://huggingface.co/facebook/mms-tts-tam).
+ * **Datasets**:
+   - [Tamil TTS dataset](https://huggingface.co/datasets/ylacombe/google-tamil).
+ ### Gujarati
+ * **Model**: [Gujarati MMS TTS](https://huggingface.co/facebook/mms-tts-guj).
+ * **Datasets**:
+   - [Gujarati TTS dataset](https://huggingface.co/datasets/ylacombe/google-gujarati).
+ ### Marathi
+ * **Model**: [Marathi MMS TTS](https://huggingface.co/facebook/mms-tts-mar).
+ * **Datasets**:
+   - [Marathi TTS dataset](https://huggingface.co/datasets/ylacombe/google-marathi).
+ ### English
+ * **Model**: [VITS-ljs](https://huggingface.co/kakao-enterprise/vits-ljs)
+ * **Dataset**: [British Isles Accent](https://huggingface.co/datasets/ylacombe/english_dialects). For each accent, we used 100 to 150 samples of a single speaker to finetune [VITS-ljs](https://huggingface.co/kakao-enterprise/vits-ljs).
+
+ """)
+
+     with gr.Accordion("Run VITS and MMS with transformers", open=False):
+         gr.Markdown(
+             """
+ ```bash
+ pip install transformers
+ ```
+ ```py
+ from transformers import pipeline
+ import scipy.io.wavfile
+ pipe = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=0)
+
+ results = pipe("A cinematic shot of a baby raccoon wearing an intricate italian priest robe")
+ # write to a wav file
+ scipy.io.wavfile.write("audio_vits.wav", rate=results["sampling_rate"], data=results["audio"].squeeze())
+ ```
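+
+ The same `pipeline` API runs the MMS-TTS checkpoints. A minimal sketch with the Marathi
+ checkpoint used as the default in this demo (the input sentence is only an illustrative example):
+ ```py
+ from transformers import pipeline
+ import scipy.io.wavfile
+
+ # load the non-finetuned Marathi MMS-TTS checkpoint (CPU by default)
+ pipe = pipeline("text-to-speech", model="facebook/mms-tts-mar")
+
+ results = pipe("मराठी ही भारतातील एक प्रमुख भाषा आहे.")
+ # write to a wav file
+ scipy.io.wavfile.write("audio_mms.wav", rate=results["sampling_rate"], data=results["audio"].squeeze())
+ ```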
+ """
+         )
+
+
+     language.change(lambda language: gr.Dropdown(
+             models_per_language[language],
+             value=models_per_language[language][0],
+             label="Model",
+             info="Model you want to test",
+         ),
+         language,
+         model_id
+     )
+
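+     # route textbox/model/language into generate_audio and its results into the audio slots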
189
+ btn.click(generate_audio, [inp_text, model_id, language], outputs)
190
+
191
+
192
+ demo_blocks.queue().launch()