Spaces:

Matthijs
/

mms-tts-demo

Runtime error

mms-tts-demo / app.py

Matthijs Hollemans

here we go!

f0839e8 over 1 year ago

5.02 kB

	import gradio as gr
	import numpy as np
	import torch
	import os
	import re
	import tempfile

	from transformers import VitsModel, VitsTokenizer


	# One pretrained VITS model per language offered in the UI, keyed by the
	# language name that the Radio widget passes to predict().
	models = {
	    language: VitsModel.from_pretrained(checkpoint)
	    for language, checkpoint in (
	        ("English", "Matthijs/mms-tts-eng"),
	        ("German", "Matthijs/mms-tts-deu"),
	        ("Korean", "Matthijs/mms-tts-kor"),
	    )
	}

	# The tokenizer that goes with each checkpoint, keyed by the same language
	# names as the `models` dict.
	tokenizers = {
	    language: VitsTokenizer.from_pretrained(checkpoint)
	    for language, checkpoint in (
	        ("English", "Matthijs/mms-tts-eng"),
	        ("German", "Matthijs/mms-tts-deu"),
	        ("Korean", "Matthijs/mms-tts-kor"),
	    )
	}


	# For certain checkpoints, the text needs to be romanized.
	# MMS-TTS uses uroman.pl for this, from https://github.com/isi-nlp/uroman
	# The uroman repository needs to be installed in the folder "uroman".
	def uromanize(text, uroman_pl):
	    """Romanize ``text`` by piping it through the uroman Perl script.

	    Runs ``perl <uroman_pl> -l xxx`` with ``text`` on standard input and reads
	    the result from standard output. The command is passed as an argument
	    list (no shell), so a script path containing spaces or shell
	    metacharacters is handled correctly, and both pipes are UTF-8 regardless
	    of the process locale.

	    Args:
	        text: The string to romanize.
	        uroman_pl: Path to the ``uroman.pl`` script.

	    Returns:
	        The first line of the script's output, with every run of whitespace
	        collapsed to a single space and both ends stripped. Returns ``""``
	        when the script prints nothing.

	    Raises:
	        FileNotFoundError: If the ``perl`` executable cannot be found.
	        RuntimeError: If the script exits with a non-zero status.
	    """
	    # Imported here so the function does not rely on the file's import block.
	    import subprocess

	    iso = "xxx"  # value handed to the script's -l option
	    try:
	        completed = subprocess.run(
	            ["perl", uroman_pl, "-l", iso],
	            input=text,
	            stdout=subprocess.PIPE,
	            stderr=subprocess.PIPE,
	            encoding="utf-8",
	            check=True,
	        )
	    except subprocess.CalledProcessError as err:
	        details = (err.stderr or "").strip()
	        raise RuntimeError(
	            f"uroman exited with status {err.returncode}: {details}"
	        ) from err

	    # Only the first output line is used; partition() also copes with empty
	    # output, where it simply yields "".
	    first_line = completed.stdout.partition("\n")[0]
	    return re.sub(r"\s+", " ", first_line).strip()


	def predict(text, language=None):
	    """Synthesize speech for ``text`` with the checkpoint chosen by ``language``.

	    Args:
	        text: The text to speak.
	        language: Key into the module-level ``tokenizers`` and ``models``
	            dicts. ``"Korean"`` input is romanized with ``uromanize`` before
	            tokenization.

	    Returns:
	        A pair ``((16000, samples), processed_text)``. ``samples`` is an
	        int16 NumPy array. ``processed_text`` is the romanized text for
	        Korean and the tokenizer's decoding of the input ids otherwise.
	        Empty or whitespace-only input yields a zero-length array and ``""``.

	    Raises:
	        KeyError: If ``language`` is not a key of ``tokenizers``.
	    """
	    sampling_rate = 16000

	    # Return the same two-element shape as the normal path: the interface
	    # declares two outputs (audio, processed text).
	    if not text or not text.strip():
	        return (sampling_rate, np.zeros(0, dtype=np.int16)), ""

	    if language == "Korean":
	        uroman_pl = os.path.join("uroman", "bin", "uroman.pl")
	        text = uromanize(text, uroman_pl)

	    tokenizer = tokenizers[language]
	    input_ids = tokenizer(text, return_tensors="pt")["input_ids"]

	    if language != "Korean":
	        # Report the text as the tokenizer actually encoded it.
	        text = tokenizer.batch_decode(input_ids)[0]

	    with torch.no_grad():
	        outputs = models[language](input_ids)

	    # NOTE(review): released Transformers documents this output field as
	    # `waveform`; `audio` is the name this file originally used. Try the
	    # documented name first and fall back, so either library build works.
	    audio = getattr(outputs, "waveform", None)
	    if audio is None:
	        audio = outputs.audio

	    # Clip before scaling so that any sample outside [-1, 1] saturates
	    # instead of wrapping around in the int16 cast.
	    waveform = np.clip(audio[0].numpy(), -1.0, 1.0)
	    speech = (waveform * 32767).astype(np.int16)
	    return (sampling_rate, speech), text


	# Page heading; passed as `title=` to the gr.Interface call further down.
	title = "MMS-TTS speech synthesis"

	# Markdown introduction; passed as `description=` to the gr.Interface call
	# further down.
	description = """
	Facebook's [Massively Multilingual Speech](https://arxiv.org/abs/2305.13516) project aims to provide
	speech technology across a diverse range of languages. The MMS-TTS project contains a collection of
	over 1000 text-to-speech (TTS) models.

	This demo shows how to use MMS-TTS using 🤗 Transformers. Since MMS-TTS is based on the VITS
	model, this code can also be used to run VITS checkpoints.
	For a full list of checkpoints, [click here](https://huggingface.co/models?filter=vits).

	As the model performs random sampling, the generated speech is slightly different each time.
	The voice may also vary between runs, or sometimes even in the same sentence.
	(Note that 🤗 Transformers also supports multispeaker VITS checkpoints but the MMS-TTS checkpoints
	are not conditioned on a speaker ID.)
	"""

	# HTML footer with reference links and the BibTeX citation; passed as
	# `article=` to the gr.Interface call further down. The link separators are
	# plain "|" characters: "\|" is not a valid escape sequence in a normal
	# string literal and would put a literal backslash into the HTML.
	article = """
	<div style='margin:20px auto;'>

	<p>References: <a href="https://arxiv.org/abs/2305.13516">MMS paper</a> |
	<a href="https://ai.facebook.com/blog/multilingual-model-speech-recognition/">blog post</a> |
	<a href="https://huggingface.co/facebook/mms-tts">original weights</a> |
	<a href="https://huggingface.co/spaces/mms-meta/MMS">original MMS space</a>
	</p>

	<pre>
	@article{pratap2023mms,
	title={Scaling Speech Technology to 1,000+ Languages},
	author={Vineel Pratap and Andros Tjandra and Bowen Shi and Paden Tomasello and Arun Babu and Sayani Kundu and Ali Elkahky and Zhaoheng Ni and Apoorv Vyas and Maryam Fazel-Zarandi and Alexei Baevski and Yossi Adi and Xiaohui Zhang and Wei-Ning Hsu and Alexis Conneau and Michael Auli},
	journal={arXiv},
	year={2023}
	}
	</pre>

	</div>
	"""

	# Example rows offered in the UI. Each row is [input text, language], in
	# the same order as the inputs of the interface.
	examples = (
	    [
	        [sentence, "English"]
	        for sentence in (
	            "It is not in the stars to hold our destiny but in ourselves.",
	            "The octopus and Oliver went to the opera in October.",
	            "She sells seashells by the seashore. I saw a kitten eating chicken in the kitchen.",
	            "Brisk brave brigadiers brandished broad bright blades, blunderbusses, and bludgeons—balancing them badly.",
	            "A synonym for cinnamon is a cinnamon synonym.",
	            "How much wood would a woodchuck chuck if a woodchuck could chuck wood?",
	        )
	    ]
	    + [
	        [sentence, "German"]
	        for sentence in (
	            "Eins, zwei, Polizei. Drei, vier, Grenadier. Fünf, sechs, alte Keks. Sieben, acht, gute Nacht.",
	            "Alle meine Entchen, schwimmen auf dem See. Köpfchen in das Wasser, Schwänzchen in die Höh.",
	        )
	    ]
	    + [
	        [sentence, "Korean"]
	        for sentence in (
	            "안녕 세상, 날씨는 아름다워",  # Hello world, the weather is beautiful (Google Translate)
	        )
	    ]
	)

	# Input widgets: the text to speak and the language whose checkpoint to use.
	text_input = gr.Text(label="Input Text")
	language_input = gr.Radio(
	    label="Language",
	    choices=["English", "German", "Korean"],
	    value="English",
	)

	# Output widgets, in the order predict() returns its two values.
	speech_output = gr.Audio(label="Generated Speech", type="numpy")
	processed_text_output = gr.Text(label="Processed text")

	# Assemble the demo page and start serving it.
	demo = gr.Interface(
	    fn=predict,
	    inputs=[text_input, language_input],
	    outputs=[speech_output, processed_text_output],
	    title=title,
	    description=description,
	    article=article,
	    examples=examples,
	)
	demo.launch()