Spaces:

emirhanbilgic
/

Text-to-speech-Turkish

Running on Zero

App Files Files Community

emirhanbilgic committed on Aug 29

Commit

828d42b

•

1 Parent(s): b40d902

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -15

app.py CHANGED Viewed

@@ -3,32 +3,36 @@ import torch
 from datasets import load_dataset
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 import soundfile as sf
-import numpy as np
-# Load the fine-tuned model, processor, and vocoder
-model_name = "microsoft/speecht5_tts"
-processor = SpeechT5Processor.from_pretrained(model_name)
-model = SpeechT5ForTextToSpeech.from_pretrained("emirhanbilgic/speecht5_finetuned_emirhan_tr")
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
-# Load speaker embeddings
-embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
 def text_to_speech(text):
-    inputs = processor(text=text, return_tensors="pt")
     speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
-    speech_numpy = speech.numpy()
-    return (16000, speech_numpy)  # Return sample rate and numpy array
-# Create Gradio interface
 iface = gr.Interface(
     fn=text_to_speech,
-    inputs=gr.Textbox(label="Enter Turkish text to convert to speech", value="Yapay zekayı seviyorum."),
     outputs=gr.Audio(label="Generated Speech"),
     title="Turkish SpeechT5 Text-to-Speech Demo",
     description="Enter Turkish text and listen to the generated speech using the fine-tuned SpeechT5 model."
 )
-# Launch the demo
 iface.launch()

 from datasets import load_dataset
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 import soundfile as sf
+import spaces
+device = "cuda" if torch.cuda.is_available() else "cpu"
+def load_models_and_data():
+    model_name = "microsoft/speecht5_tts"
+    processor = SpeechT5Processor.from_pretrained(model_name)
+    model = SpeechT5ForTextToSpeech.from_pretrained("emirhanbilgic/speecht5_finetuned_emirhan_tr").to(device)
+    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
+    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)
+    return model, processor, vocoder, speaker_embeddings
+model, processor, vocoder, speaker_embeddings = load_models_and_data()
+@spaces.GPU(duration = 60)
 def text_to_speech(text):
+    inputs = processor(text=text, return_tensors="pt").to(device)
     speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
+    sf.write("output.wav", speech.cpu().numpy(), samplerate=16000)
+    return "output.wav"
 # Gradio UI: a single text box feeds text_to_speech and its result is shown in an audio component.
 iface = gr.Interface(
     fn=text_to_speech,
+    inputs=gr.Textbox(label="Enter Turkish text to convert to speech"),
     outputs=gr.Audio(label="Generated Speech"),
     title="Turkish SpeechT5 Text-to-Speech Demo",
     description="Enter Turkish text and listen to the generated speech using the fine-tuned SpeechT5 model."
 )
 # Start the Gradio app.
 iface.launch()