import os

import gradio as gr
import torch

from Architectures.ControllabilityGAN.GAN import GanWrapper
from InferenceInterfaces.ToucanTTSInterface import ToucanTTSInterface
from Utility.storage_config import MODELS_DIR
from Utility.utils import float2pcm, load_json_from_path


class ControllableInterface:

    def __init__(self, gpu_id="cpu", available_artificial_voices=1000):
        if gpu_id == "cpu":
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
        else:
            os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
            os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}"
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = ToucanTTSInterface(device=self.device, tts_model_path="Shan")
        self.wgan = GanWrapper(
            os.path.join(MODELS_DIR, "Embedding", "embedding_gan.pt"),
            device=self.device,
        )
        self.generated_speaker_embeds = list()
        self.available_artificial_voices = available_artificial_voices
        self.current_language = ""
        self.current_accent = ""

    def read(
        self,
        prompt,
        language,
        accent,
        voice_seed,
        duration_scaling_factor,
        pause_duration_scaling_factor,
        pitch_variance_scale,
        energy_variance_scale,
        emb_slider_1,
        emb_slider_2,
        emb_slider_3,
        emb_slider_4,
        emb_slider_5,
        emb_slider_6,
    ):
        # Only reload phonemizer / accent resources when the selection changes.
        if self.current_language != language:
            self.model.set_phonemizer_language(language)
            self.current_language = language
        if self.current_accent != accent:
            self.model.set_accent_language(accent)
            self.current_accent = accent

        # Derive a speaker embedding from the seed and the six control sliders.
        self.wgan.set_latent(voice_seed)
        controllability_vector = torch.tensor(
            [
                emb_slider_1,
                emb_slider_2,
                emb_slider_3,
                emb_slider_4,
                emb_slider_5,
                emb_slider_6,
            ],
            dtype=torch.float32,
        )
        embedding = self.wgan.modify_embed(controllability_vector)
        self.model.set_utterance_embedding(embedding=embedding)

        phones = self.model.text2phone.get_phone_string(prompt)
        if len(phones) > 1800:
            prompt = "Your input was too long. Please try either a shorter text or split it into several parts."
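            # The replacement message is English, so switch the phonemizer and
            # accent to English before synthesizing it.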
            if self.current_language != "eng":
                self.model.set_phonemizer_language("eng")
                self.current_language = "eng"
            if self.current_accent != "eng":
                self.model.set_accent_language("eng")
                self.current_accent = "eng"

        print(prompt)
        wav, sr, fig = self.model(
            prompt,
            input_is_phones=False,
            duration_scaling_factor=duration_scaling_factor,
            pitch_variance_scale=pitch_variance_scale,
            energy_variance_scale=energy_variance_scale,
            pause_duration_scaling_factor=pause_duration_scaling_factor,
            return_plot_as_filepath=True,
        )
        return sr, wav, fig


class TTSWebUI:

    def __init__(
        self,
        gpu_id="cpu",
        title="Controllable Text-to-Speech for over 7000 Languages",
        article="",
        available_artificial_voices=1000,
        path_to_iso_list="Preprocessing/multilinguality/iso_to_fullname.json",
    ):
        iso_to_name = load_json_from_path(path_to_iso_list)
        text_selection = [
            f"{iso_to_name[iso_code]} Text ({iso_code})" for iso_code in iso_to_name
        ]
        # accent_selection = [f"{iso_to_name[iso_code]} Accent ({iso_code})" for iso_code in iso_to_name]
        self.controllable_ui = ControllableInterface(
            gpu_id=gpu_id, available_artificial_voices=available_artificial_voices
        )
        self.iface = gr.Interface(
            fn=self.read,
            inputs=[
                gr.Textbox(
                    lines=2,
                    placeholder="write what you want the synthesis to read here...",
                    value="မႂ်ႇသုင်ၶႃႈ ယူႇလီၵိၼ်ဝၢၼ် ၵတ်းယဵၼ်ၸႂ် မိူၼ်ၾႃႉၾူၼ်လူမ်းလီယူႇၶႃႈ ၼေႃႈ",
                    label="Text input",
                ),
                gr.Dropdown(
                    text_selection,
                    type="value",
                    value="Shan Text (shn)",
                    label="Select the Language of the Text (type on your keyboard to find it quickly)",
                ),
                gr.Slider(
                    minimum=0,
                    maximum=available_artificial_voices,
                    step=1,
                    value=1000,
                    label="Random Seed for the artificial Voice",
                ),
                gr.Slider(
                    minimum=0.7,
                    maximum=1.3,
                    step=0.1,
                    value=1.2,
                    label="Duration Scale",
                ),
                gr.Slider(
                    minimum=0.5,
                    maximum=1.5,
                    step=0.1,
                    value=1.0,
                    label="Pitch Variance Scale",
                ),
                gr.Slider(
                    minimum=0.5,
                    maximum=1.5,
                    step=0.1,
                    value=1.0,
                    label="Energy Variance Scale",
                ),
                gr.Slider(
                    minimum=-10.0,
                    maximum=10.0,
                    step=0.1,
                    value=10.0,
                    label="Femininity / Masculinity",
                ),
                gr.Slider(
                    minimum=-10.0,
                    maximum=10.0,
                    step=0.1,
                    value=-10.0,
                    label="Voice Depth",
                ),
            ],
            outputs=[
                gr.Audio(type="numpy", label="Speech"),
                gr.Image(label="Visualization"),
            ],
            title=title,
            theme="default",
            allow_flagging="never",
            article=article,
        )

    def read(
        self,
        prompt,
        language,
        voice_seed,
        duration_scaling_factor,
        pitch_variance_scale,
        energy_variance_scale,
        emb1,
        emb2,
    ):
        # Extract the ISO 639-3 code from a label such as "Shan Text (shn)".
        iso_code = language.split(" ")[-1].split("(")[1].split(")")[0]
        sr, wav, fig = self.controllable_ui.read(
            prompt=prompt,
            language=iso_code,
            accent=iso_code,
            voice_seed=voice_seed,
            duration_scaling_factor=duration_scaling_factor,
            pause_duration_scaling_factor=1.0,
            pitch_variance_scale=pitch_variance_scale,
            energy_variance_scale=energy_variance_scale,
            emb_slider_1=emb1,
            emb_slider_2=emb2,
            emb_slider_3=0.0,
            emb_slider_4=0.0,
            emb_slider_5=0.0,
            emb_slider_6=0.0,
        )
        # Convert float audio to 16-bit PCM for the numpy-typed gr.Audio output.
        return (sr, float2pcm(wav)), fig

    def render(self):
        return self.iface.render()
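# `Blocks.render()` places an already-built Interface into the enclosing
# `gr.Blocks` context; without it, the Interface constructed above would not
# appear on the page assembled in the `__main__` block below.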

if __name__ == "__main__":
    with gr.Blocks() as demo:
        gr.Markdown(
            "IMS-Toucan: Multilingual and Controllable Text-to-Speech Toolkit of the Speech and Language Technologies Group at the University of Stuttgart."
        )
        gr.HTML(
            "IMS-Toucan-Shan: contains the Shan fine-tuning script."
        )
        TTSWebUI(gpu_id="cuda" if torch.cuda.is_available() else "cpu").render()
    demo.launch()