emirhanbilgic committed on
Commit
828d42b
1 Parent(s): b40d902

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -15
app.py CHANGED
@@ -3,32 +3,36 @@ import torch
3
  from datasets import load_dataset
4
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
5
  import soundfile as sf
6
- import numpy as np
7
 
8
- # Load the fine-tuned model, processor, and vocoder
9
- model_name = "microsoft/speecht5_tts"
10
- processor = SpeechT5Processor.from_pretrained(model_name)
11
- model = SpeechT5ForTextToSpeech.from_pretrained("emirhanbilgic/speecht5_finetuned_emirhan_tr")
12
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
13
 
14
- # Load speaker embeddings
15
- embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
16
- speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
 
 
 
 
 
 
 
17
 
 
 
 
18
  def text_to_speech(text):
19
- inputs = processor(text=text, return_tensors="pt")
20
  speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
21
- speech_numpy = speech.numpy()
22
- return (16000, speech_numpy) # Return sample rate and numpy array
23
 
24
- # Create Gradio interface
25
  iface = gr.Interface(
26
  fn=text_to_speech,
27
- inputs=gr.Textbox(label="Enter Turkish text to convert to speech", value="Yapay zekayı seviyorum."),
28
  outputs=gr.Audio(label="Generated Speech"),
29
  title="Turkish SpeechT5 Text-to-Speech Demo",
30
  description="Enter Turkish text and listen to the generated speech using the fine-tuned SpeechT5 model."
31
  )
32
 
33
- # Launch the demo
34
  iface.launch()
 
3
  from datasets import load_dataset
4
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
5
  import soundfile as sf
6
+ import spaces
7
 
8
# Prefer the GPU when PyTorch reports one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
 
 
 
 
9
 
10
def load_models_and_data():
    """Load the TTS model, its processor, the vocoder and one speaker embedding.

    Returns:
        A ``(model, processor, vocoder, speaker_embeddings)`` tuple. The model,
        the vocoder and the embedding tensor are moved to the module-level
        ``device``; the processor is returned as loaded.
    """
    # The processor is taken from the base checkpoint, while the acoustic
    # model weights come from the fine-tuned checkpoint.
    base_checkpoint = "microsoft/speecht5_tts"
    text_processor = SpeechT5Processor.from_pretrained(base_checkpoint)

    tts_model = SpeechT5ForTextToSpeech.from_pretrained(
        "emirhanbilgic/speecht5_finetuned_emirhan_tr"
    )
    tts_model = tts_model.to(device)

    hifigan = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    hifigan = hifigan.to(device)

    # One fixed row of the x-vector dataset is used as the speaker embedding.
    xvector_rows = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    xvector = torch.tensor(xvector_rows[7306]["xvector"])
    # unsqueeze(0) adds a leading dimension before the tensor is moved to `device`.
    embedding = xvector.unsqueeze(0).to(device)

    return tts_model, text_processor, hifigan, embedding
20
 
21
# Load everything once at import time so every request reuses the same objects.
(model, processor, vocoder, speaker_embeddings) = load_models_and_data()
22
+
23
@spaces.GPU(duration=60)
def text_to_speech(text):
    """Synthesize speech for ``text`` and return the path of the WAV file written.

    The text is tokenized with the module-level ``processor``, turned into a
    waveform by ``model.generate_speech`` using the module-level
    ``speaker_embeddings`` and ``vocoder``, and written as a 16000 Hz WAV file.

    Args:
        text: The text to convert to speech.

    Returns:
        Filesystem path of the generated ``.wav`` file.

    Note:
        The ``spaces.GPU(duration=60)`` decorator presumably requests a GPU
        for up to 60 seconds per call on ZeroGPU Spaces -- confirm against the
        ``spaces`` package documentation.
    """
    # Local import so this block does not depend on the file's import header.
    import tempfile

    inputs = processor(text=text, return_tensors="pt").to(device)
    speech = model.generate_speech(
        inputs["input_ids"], speaker_embeddings, vocoder=vocoder
    )

    # Write to a unique file per call. The previous fixed "output.wav" path was
    # shared by every request, so one request could overwrite another's audio
    # before it was served, and it required a writable working directory.
    # delete=False keeps the file on disk after the handle is closed so the
    # caller can still read it; the file is not cleaned up here.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file:
        wav_path = wav_file.name

    # The tensor must be on the CPU before it can be converted to NumPy.
    sf.write(wav_path, speech.cpu().numpy(), samplerate=16000)
    return wav_path
29
 
 
30
# Build the two UI components first: one text box in, one audio player out.
_text_input = gr.Textbox(label="Enter Turkish text to convert to speech")
_audio_output = gr.Audio(label="Generated Speech")

# Wire the components to the synthesis function.
iface = gr.Interface(
    fn=text_to_speech,
    inputs=_text_input,
    outputs=_audio_output,
    title="Turkish SpeechT5 Text-to-Speech Demo",
    description="Enter Turkish text and listen to the generated speech using the fine-tuned SpeechT5 model.",
)

# Start the web app.
iface.launch()