emirhanbilgic committed
Commit 94cde68
1 Parent(s): c7fbbca

Update app.py

Files changed (1):
  app.py +30 -14
app.py CHANGED
@@ -3,8 +3,10 @@ import torch
 import soundfile as sf
 import spaces
 import os
+import numpy as np
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 from speechbrain.pretrained import EncoderClassifier
+from datasets import load_dataset
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
@@ -21,28 +23,42 @@ def load_models_and_data():
         savedir=os.path.join("/tmp", spk_model_name),
     )
 
-    return model, processor, vocoder, speaker_model
+    # Load a sample from a dataset for default embedding
+    dataset = load_dataset("erenfazlioglu/turkishvoicedataset", split="train")
+    example = dataset[304]
+
+    return model, processor, vocoder, speaker_model, example
 
-model, processor, vocoder, speaker_model = load_models_and_data()
+model, processor, vocoder, speaker_model, default_example = load_models_and_data()
 
 def create_speaker_embedding(waveform):
     with torch.no_grad():
         speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform).unsqueeze(0).to(device))
         speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
-        speaker_embeddings = speaker_embeddings.squeeze().to(device)
+        speaker_embeddings = speaker_embeddings.squeeze()
     return speaker_embeddings
 
+def prepare_default_embedding(example):
+    audio = example["audio"]
+    return create_speaker_embedding(audio["array"])
+
+default_embedding = prepare_default_embedding(default_example)
+
 @spaces.GPU(duration = 60)
-def text_to_speech(text, audio_file):
+def text_to_speech(text, audio_file=None):
     inputs = processor(text=text, return_tensors="pt").to(device)
 
-    # Load the audio file and create speaker embedding
-    waveform, sample_rate = sf.read(audio_file)
-    if len(waveform.shape) > 1:
-        waveform = waveform[:, 0]  # Take the first channel if stereo
-    speaker_embeddings = create_speaker_embedding(waveform)
+    if audio_file is not None:
+        # Load the audio file and create speaker embedding
+        waveform, sample_rate = sf.read(audio_file)
+        if len(waveform.shape) > 1:
+            waveform = waveform[:, 0]  # Take the first channel if stereo
+        speaker_embeddings = create_speaker_embedding(waveform)
+    else:
+        # Use default embedding if no audio file is provided
+        speaker_embeddings = default_embedding
 
-    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
+    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings.unsqueeze(0), vocoder=vocoder)
     sf.write("output.wav", speech.cpu().numpy(), samplerate=16000)
     return "output.wav"
 
@@ -50,11 +66,11 @@ iface = gr.Interface(
     fn=text_to_speech,
     inputs=[
         gr.Textbox(label="Enter Turkish text to convert to speech"),
-        gr.Audio(label="Upload a short audio sample of the target speaker", type="filepath")
+        gr.Audio(label="Upload a short audio sample of the target speaker (optional)", type="filepath")
     ],
     outputs=gr.Audio(label="Generated Speech"),
-    title="Turkish SpeechT5 Text-to-Speech Demo with Custom Voice",
-    description="Enter Turkish text, upload a short audio sample of the target speaker, and listen to the generated speech using the fine-tuned SpeechT5 model."
+    title="Turkish SpeechT5 Text-to-Speech Demo with Optional Custom Voice",
+    description="Enter Turkish text, optionally upload a short audio sample of the target speaker, and listen to the generated speech using the fine-tuned SpeechT5 model."
 )
 
-iface.launch()
+iface.launch(share=True)
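
The subtle part of the change is the embedding shape handling: the cached speaker embedding is now squeezed to a 1-D vector once, and the batch dimension is re-added only at generation time with .unsqueeze(0). The short sketch below illustrates the assumed tensor shapes; the 512-dimensional size is an assumption based on the usual speechbrain x-vector speaker encoder (the spk_model_name definition is outside the shown hunks).

# Shape sketch only, not part of the commit. The 512-dim embedding size is an
# assumption about the x-vector speaker encoder referenced by spk_model_name.
import torch

raw = torch.randn(1, 1, 512)                               # ~ encode_batch() output: [batch, 1, emb_dim]
vec = torch.nn.functional.normalize(raw, dim=2).squeeze()  # -> [512], cached once as default_embedding
batched = vec.unsqueeze(0)                                 # -> [1, 512], the shape generate_speech() expects
print(vec.shape, batched.shape)                            # torch.Size([512]) torch.Size([1, 512])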
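
As a quick smoke test of the updated interface, both branches of text_to_speech() can be exercised as sketched below. This assumes the sketch runs in the same session as the definitions above (importing app.py directly would also launch the Gradio app); the Turkish sentence and "speaker_sample.wav" are hypothetical placeholders.

# Minimal usage sketch, not part of the commit.
text = "Merhaba, bu bir deneme."  # hypothetical example sentence

# 1) No reference audio: falls back to the precomputed default_embedding
#    (sample 304 of erenfazlioglu/turkishvoicedataset).
print(text_to_speech(text))  # -> "output.wav"

# 2) With a reference clip: a speaker embedding is computed from the file.
print(text_to_speech(text, audio_file="speaker_sample.wav"))  # -> "output.wav"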