emirhanbilgic committed on
Commit
29a7123
1 Parent(s): 05020c4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -4
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import os
2
- import gradio as gr
3
  import torch
 
4
  from datasets import load_dataset
5
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
6
  import soundfile as sf
@@ -9,6 +10,62 @@ import spaces
9
 
10
  device = "cuda" if torch.cuda.is_available() else "cpu"
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  def load_models_and_data():
13
  model_name = "microsoft/speecht5_tts"
14
  processor = SpeechT5Processor.from_pretrained(model_name)
@@ -34,10 +91,11 @@ def create_speaker_embedding(waveform):
34
 
35
  @spaces.GPU(duration = 60)
36
  def text_to_speech(text, waveform):
 
37
  speaker_embeddings = create_speaker_embedding(waveform)
38
  speaker_embeddings = torch.tensor(speaker_embeddings).unsqueeze(0).to(device)
39
 
40
- inputs = processor(text=text, return_tensors="pt").to(device)
41
  speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
42
  sf.write("output.wav", speech.cpu().numpy(), samplerate=16000)
43
  return "output.wav"
@@ -46,11 +104,11 @@ iface = gr.Interface(
46
  fn=text_to_speech,
47
  inputs=[
48
  gr.Textbox(label="Enter Turkish text to convert to speech"),
49
- gr.Audio(source="upload", type="numpy", label="Upload Speaker Audio"),
50
  ],
51
  outputs=gr.Audio(label="Generated Speech"),
52
  title="Turkish SpeechT5 Text-to-Speech Demo with Custom Speaker Embeddings",
53
- description="Enter Turkish text and upload an audio file to generate speech using the fine-tuned SpeechT5 model with custom speaker embeddings."
54
  )
55
 
56
  iface.launch()
 
1
  import os
2
+ import re
3
  import torch
4
+ import gradio as gr
5
  from datasets import load_dataset
6
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
7
  import soundfile as sf
 
10
 
11
  device = "cuda" if torch.cuda.is_available() else "cpu"
12
 
13
# Transliteration table applied in order by ``cleanup_text``: each Turkish
# letter (first item) is replaced by an ASCII spelling (second item).
replacements = [
    ("â", "a"),
    ("ç", "ch"),
    ("ğ", "gh"),
    ("ı", "i"),
    ("î", "i"),
    ("ö", "oe"),
    ("ş", "sh"),
    ("ü", "ue"),
    ("û", "u"),
    # Upper-case counterparts. Without these, capitalised words such as
    # "Çok" or "İstanbul" kept their non-ASCII letters after cleanup.
    ("Â", "A"),
    ("Ç", "Ch"),
    ("Ğ", "Gh"),
    ("İ", "I"),
    ("Î", "I"),
    ("Ö", "Oe"),
    ("Ş", "Sh"),
    ("Ü", "Ue"),
    ("Û", "U"),
]
24
+
25
# Turkish words for the numbers that cannot be composed from smaller parts.
number_words = {
    0: "sıfır", 1: "bir", 2: "iki", 3: "üç", 4: "dört", 5: "beş", 6: "altı", 7: "yedi", 8: "sekiz", 9: "dokuz",
    10: "on", 11: "on bir", 12: "on iki", 13: "on üç", 14: "on dört", 15: "on beş", 16: "on altı", 17: "on yedi",
    18: "on sekiz", 19: "on dokuz", 20: "yirmi", 30: "otuz", 40: "kırk", 50: "elli", 60: "altmış", 70: "yetmiş",
    80: "seksen", 90: "doksan", 100: "yüz", 1000: "bin"
}


def number_to_words(number):
    """Spell out an integer in Turkish words.

    Args:
        number: The integer to convert.

    Returns:
        The Turkish spelling for magnitudes below 10**12. Larger magnitudes
        are returned as their decimal digit string. Negative values are
        prefixed with "eksi".
    """
    if number < 0:
        # Negatives used to fall into the ``< 20`` lookup and raise KeyError.
        return "eksi " + number_to_words(-number)

    if number < 20:
        return number_words[number]

    if number < 100:
        tens, unit = divmod(number, 10)
        words = number_words[tens * 10]
        return f"{words} {number_words[unit]}" if unit else words

    if number < 1000:
        hundreds, remainder = divmod(number, 100)
        # 100 is plain "yüz"; no leading "bir".
        words = f"{number_words[hundreds]} yüz" if hundreds > 1 else "yüz"
    elif number < 1000000:
        thousands, remainder = divmod(number, 1000)
        # 1000 is plain "bin"; no leading "bir".
        words = f"{number_to_words(thousands)} bin" if thousands > 1 else "bin"
    elif number < 1000000000:
        millions, remainder = divmod(number, 1000000)
        # Unlike "yüz" and "bin", one million keeps its count: "bir milyon".
        words = f"{number_to_words(millions)} milyon"
    elif number < 1000000000000:
        billions, remainder = divmod(number, 1000000000)
        words = f"{number_to_words(billions)} milyar"
    else:
        # Beyond the supported range: fall back to the digits themselves.
        return str(number)

    return f"{words} {number_to_words(remainder)}" if remainder else words
52
+
53
def replace_numbers_with_words(text):
    """Return ``text`` with every standalone digit run spelled out.

    Each run of digits delimited by word boundaries is parsed as an integer
    and replaced by the result of ``number_to_words``.
    """
    return re.sub(
        r'\b\d+\b',
        lambda found: number_to_words(int(found.group())),
        text,
    )
58
+
59
def cleanup_text(text):
    """Apply every (old, new) pair from ``replacements`` to ``text``, in order.

    Returns the text after all substitutions have been made.
    """
    cleaned = text
    for pair in replacements:
        source, target = pair
        cleaned = cleaned.replace(source, target)
    return cleaned
63
+
64
def normalize_text(text):
    """Normalize input text: spell out numbers, then transliterate letters.

    Numbers are expanded first so that the Turkish letters in the generated
    number words also pass through ``cleanup_text``.
    """
    return cleanup_text(replace_numbers_with_words(text))
68
+
69
  def load_models_and_data():
70
  model_name = "microsoft/speecht5_tts"
71
  processor = SpeechT5Processor.from_pretrained(model_name)
 
91
 
@spaces.GPU(duration = 60)
def text_to_speech(text, waveform):
    """Synthesize speech for ``text`` using a voice derived from ``waveform``.

    Args:
        text: Raw input text; it is normalized with ``normalize_text`` before
            being tokenized.
        waveform: Speaker reference audio, passed unchanged to
            ``create_speaker_embedding``.
            NOTE(review): the Gradio input is declared with ``type="numpy"``;
            confirm ``create_speaker_embedding`` accepts what that delivers.

    Returns:
        Path of a WAV file (written at 16000 Hz) containing the generated speech.
    """
    # Local import keeps this block self-contained.
    import tempfile

    final_text = normalize_text(text)

    speaker_embeddings = create_speaker_embedding(waveform)
    # Add a leading batch dimension and move to the model's device.
    speaker_embeddings = torch.tensor(speaker_embeddings).unsqueeze(0).to(device)

    # ``processor``, ``model`` and ``vocoder`` are module-level objects defined
    # elsewhere in this file.
    inputs = processor(text=final_text, return_tensors="pt").to(device)
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # Write to a unique file per call: a fixed "output.wav" would let
    # concurrent requests overwrite each other's audio.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name
    sf.write(output_path, speech.cpu().numpy(), samplerate=16000)
    return output_path
 
104
  fn=text_to_speech,
105
  inputs=[
106
  gr.Textbox(label="Enter Turkish text to convert to speech"),
107
+ gr.Audio(type="numpy", label="Upload Speaker Audio"), # Updated this line
108
  ],
109
  outputs=gr.Audio(label="Generated Speech"),
110
  title="Turkish SpeechT5 Text-to-Speech Demo with Custom Speaker Embeddings",
111
+ description="Enter Turkish text and upload an audio file to generate speech using the fine-tuned SpeechT5 model with custom speaker embeddings. The text is normalized with custom replacements and number-to-word conversions."
112
  )
113
 
114
  iface.launch()