import gradio as gr
import torch
import soundfile as sf
import spaces
import os
import numpy as np
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
from datasets import load_dataset

device = "cuda" if torch.cuda.is_available() else "cpu"
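
# Load the fine-tuned SpeechT5 model, its processor, the HiFi-GAN vocoder, the
# x-vector speaker encoder, and one reference example from a Turkish speech
# dataset (used to build the default speaker embedding).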
def load_models_and_data():
    model_name = "microsoft/speecht5_tts"
    processor = SpeechT5Processor.from_pretrained(model_name)
    model = SpeechT5ForTextToSpeech.from_pretrained("emirhanbilgic/speecht5_finetuned_emirhan_tr").to(device)
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

    spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
    speaker_model = EncoderClassifier.from_hparams(
        source=spk_model_name,
        run_opts={"device": device},
        savedir=os.path.join("/tmp", spk_model_name),
    )

    # Load a sample from a dataset for the default speaker embedding
    dataset = load_dataset("erenfazlioglu/turkishvoicedataset", split="train")
    example = dataset[304]

    return model, processor, vocoder, speaker_model, example

model, processor, vocoder, speaker_model, default_example = load_models_and_data()
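
# Encode a raw waveform into a normalized x-vector speaker embedding.
# The x-vector model expects 16 kHz mono audio.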
def create_speaker_embedding(waveform):
    with torch.no_grad():
        # Cast to float32: soundfile returns float64 arrays by default, which the
        # float32 SpeechBrain encoder would reject
        waveform = torch.tensor(waveform, dtype=torch.float32).unsqueeze(0).to(device)
        speaker_embeddings = speaker_model.encode_batch(waveform)
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze()
    return speaker_embeddings

def prepare_default_embedding(example):
    audio = example["audio"]
    return create_speaker_embedding(audio["array"])

default_embedding = prepare_default_embedding(default_example)
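
# Synthesize speech for the given Turkish text. If a reference audio file is
# supplied, a speaker embedding is computed from it; otherwise the default
# embedding from the Turkish dataset sample is used.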
def text_to_speech(text, audio_file=None):
    inputs = processor(text=text, return_tensors="pt").to(device)

    if audio_file is not None:
        # Load the uploaded audio file and create a speaker embedding from it
        waveform, sample_rate = sf.read(audio_file)
        if len(waveform.shape) > 1:
            waveform = waveform[:, 0]  # Take the first channel if stereo
        speaker_embeddings = create_speaker_embedding(waveform)
    else:
        # Use the default embedding if no audio file is provided
        speaker_embeddings = default_embedding

    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings.unsqueeze(0), vocoder=vocoder)
    sf.write("output.wav", speech.cpu().numpy(), samplerate=16000)
    return "output.wav"

iface = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Textbox(label="Enter Turkish text to convert to speech"),
        gr.Audio(label="Upload a short audio sample of the target speaker (optional)", type="filepath")
    ],
    outputs=gr.Audio(label="Generated Speech"),
    title="Turkish SpeechT5 Text-to-Speech Demo with Optional Custom Voice",
    description="Enter Turkish text, optionally upload a short audio sample of the target speaker, and listen to the generated speech using the fine-tuned SpeechT5 model."
)

iface.launch(share=True)