Spaces:
Runtime error
Runtime error
import gradio as gr | |
import numpy as np | |
import torch | |
import os | |
import re | |
import tempfile | |
from transformers import VitsModel, VitsTokenizer | |
# Checkpoint identifier handed to `from_pretrained` for each language the demo
# offers. Keeping them in one table means the model and tokenizer dicts below
# cannot drift apart.
_CHECKPOINTS = {
    "English": "Matthijs/mms-tts-eng",
    "German": "Matthijs/mms-tts-deu",
    "Korean": "Matthijs/mms-tts-kor",
}

# Language name -> loaded VITS model. Everything is loaded eagerly at import
# time, so the first request does not pay the loading cost.
models = {
    language: VitsModel.from_pretrained(checkpoint)
    for language, checkpoint in _CHECKPOINTS.items()
}

# Language name -> tokenizer belonging to the same checkpoint as the model.
tokenizers = {
    language: VitsTokenizer.from_pretrained(checkpoint)
    for language, checkpoint in _CHECKPOINTS.items()
}
# For certain checkpoints, the text needs to be romanized before tokenization.
# MMS-TTS uses the uroman.pl script for this, from https://github.com/isi-nlp/uroman
# The uroman repository needs to be installed in the folder "uroman".
def uromanize(text, uroman_pl):
    """Romanize ``text`` by piping it through the uroman Perl script.

    The text is sent to ``perl <uroman_pl> -l xxx`` on standard input as
    UTF-8 and the script's standard output is read back. No shell and no
    temporary files are involved, so the text and the script path are never
    interpreted by a shell.

    Args:
        text: The text to romanize.
        uroman_pl: Filesystem path to the ``uroman.pl`` script.

    Returns:
        The script's output with every run of whitespace (including line
        breaks) collapsed to a single space and surrounding whitespace
        removed. Multi-line input therefore comes back as one line instead
        of being cut off after the first line. Empty output yields ``""``.

    Raises:
        RuntimeError: If ``perl`` cannot be started or the script exits with
            a non-zero status.
    """
    # Local import: this is the only place in the file that spawns a process.
    import subprocess

    iso = "xxx"  # value handed to uroman's -l (language code) option

    try:
        completed = subprocess.run(
            ["perl", uroman_pl, "-l", iso],
            input=text,
            capture_output=True,
            encoding="utf-8",  # do not depend on the locale for non-ASCII text
            check=True,
        )
    except OSError as err:
        # Typically: perl is not installed or not on PATH.
        raise RuntimeError(f"could not run perl for uroman: {err}") from err
    except subprocess.CalledProcessError as err:
        details = (err.stderr or "").strip()
        raise RuntimeError(
            f"uroman exited with status {err.returncode}: {details}"
        ) from err

    return re.sub(r"\s+", " ", completed.stdout).strip()
def predict(text, language=None):
    """Synthesize speech for ``text`` with the model of the chosen language.

    Args:
        text: The text to speak.
        language: Key into the module-level ``models`` / ``tokenizers``
            dicts ("English", "German" or "Korean").

    Returns:
        A pair ``((sampling_rate, samples), processed_text)``: the audio as
        a 16 kHz rate plus an ``int16`` NumPy array, and the text that was
        actually fed to the model. For Korean this is the romanized text;
        for the other languages it is the tokenizer's decoded view of the
        input. Blank input yields an empty waveform and an empty string.

    Raises:
        KeyError: If ``language`` has no entry in ``tokenizers`` / ``models``
            and ``text`` is not blank.
    """
    if text is None or not text.strip():
        # The interface declares two outputs (audio, processed text), so the
        # blank-input case must return the same two-part shape as the normal
        # path; returning only the audio tuple would be split across both
        # outputs.
        return (16000, np.zeros(0, dtype=np.int16)), ""

    if language == "Korean":
        # The Korean checkpoint is fed romanized text rather than Hangul.
        uroman_pl = os.path.join("uroman", "bin", "uroman.pl")
        text = uromanize(text, uroman_pl)

    tokenizer = tokenizers[language]
    inputs = tokenizer(text, return_tensors="pt")
    input_ids = inputs["input_ids"]

    if language != "Korean":
        # Show the user what survived tokenization.
        text = tokenizer.batch_decode(input_ids)[0]

    model = models[language]
    with torch.no_grad():
        outputs = model(input_ids)

    # First (only) item of the batch; presumably a float waveform in
    # [-1, 1], which is scaled to the int16 range here.
    speech = outputs.audio[0]
    speech = (speech.numpy() * 32767).astype(np.int16)
    return (16000, speech), text
# Heading text; passed to gr.Interface as `title`.
title = "MMS-TTS speech synthesis" | |
# Introductory text (uses Markdown-style links); passed to gr.Interface as
# `description`.
description = """ | |
Facebook's [Massively Multilingual Speech](https://arxiv.org/abs/2305.13516) project aims to provide | |
speech technology across a diverse range of languages. The MMS-TTS project contains a collection of | |
over 1000 text-to-speech (TTS) models. | |
This demo shows how to use MMS-TTS using 🤗 Transformers. Since MMS-TTS is based on the VITS | |
model, this code can also be used to run VITS checkpoints. | |
For a full list of checkpoints, [click here](https://huggingface.co/models?filter=vits). | |
As the model performs random sampling, the generated speech is slightly different each time. | |
The voice may also vary between runs, or sometimes even in the same sentence. | |
(Note that 🤗 Transformers also supports multispeaker VITS checkpoints but the MMS-TTS checkpoints | |
are not conditioned on a speaker ID.) | |
""" | |
# HTML block with reference links and the BibTeX citation; passed to
# gr.Interface as `article`.
article = """ | |
<div style='margin:20px auto;'> | |
<p>References: <a href="https://arxiv.org/abs/2305.13516">MMS paper</a> | | |
<a href="https://ai.facebook.com/blog/multilingual-model-speech-recognition/">blog post</a> | | |
<a href="https://huggingface.co/facebook/mms-tts">original weights</a> | | |
<a href="https://huggingface.co/spaces/mms-meta/MMS">original MMS space</a> | |
</p> | |
<pre> | |
@article{pratap2023mms, | |
title={Scaling Speech Technology to 1,000+ Languages}, | |
author={Vineel Pratap and Andros Tjandra and Bowen Shi and Paden Tomasello and Arun Babu and Sayani Kundu and Ali Elkahky and Zhaoheng Ni and Apoorv Vyas and Maryam Fazel-Zarandi and Alexei Baevski and Yossi Adi and Xiaohui Zhang and Wei-Ning Hsu and Alexis Conneau and Michael Auli}, | |
journal={arXiv}, | |
year={2023} | |
} | |
</pre> | |
</div> | |
""" | |
# Sample sentences, grouped by the language they are paired with.
_ENGLISH_EXAMPLES = (
    "It is not in the stars to hold our destiny but in ourselves.",
    "The octopus and Oliver went to the opera in October.",
    "She sells seashells by the seashore. I saw a kitten eating chicken in the kitchen.",
    "Brisk brave brigadiers brandished broad bright blades, blunderbusses, and bludgeons—balancing them badly.",
    "A synonym for cinnamon is a cinnamon synonym.",
    "How much wood would a woodchuck chuck if a woodchuck could chuck wood?",
)
_GERMAN_EXAMPLES = (
    "Eins, zwei, Polizei. Drei, vier, Grenadier. Fünf, sechs, alte Keks. Sieben, acht, gute Nacht.",
    "Alle meine Entchen, schwimmen auf dem See. Köpfchen in das Wasser, Schwänzchen in die Höh.",
)
_KOREAN_EXAMPLES = (
    # Hello world, the weather is beautiful (Google Translate)
    "안녕 세상, 날씨는 아름다워",
)

# Rows of [text, language], in the same order as the interface inputs.
examples = (
    [[sentence, "English"] for sentence in _ENGLISH_EXAMPLES]
    + [[sentence, "German"] for sentence in _GERMAN_EXAMPLES]
    + [[sentence, "Korean"] for sentence in _KOREAN_EXAMPLES]
)
# Input widgets, in the order `predict` takes its arguments.
text_input = gr.Text(label="Input Text")
language_input = gr.Radio(
    label="Language",
    choices=["English", "German", "Korean"],
    value="English",
)

# Output widgets, in the order `predict` returns its values.
audio_output = gr.Audio(label="Generated Speech", type="numpy")
processed_text_output = gr.Text(label="Processed text")

# Build the demo page and start serving it.
demo = gr.Interface(
    fn=predict,
    inputs=[text_input, language_input],
    outputs=[audio_output, processed_text_output],
    title=title,
    description=description,
    article=article,
    examples=examples,
)
demo.launch()