ahassoun committed on
Commit
296c11e
•
1 Parent(s): 4e9bed1

Update app.py

Files changed (1)
  1. app.py +177 -133
app.py CHANGED
@@ -1,19 +1,20 @@
+from TTS.api import TTS
+import json
 import gradio as gr
 from share_btn import community_icon_html, loading_icon_html, share_js
-import os
+import os
 import shutil
 import re
 
-#from huggingface_hub import snapshot_download
+# from huggingface_hub import snapshot_download
 import numpy as np
 from scipy.io import wavfile
 from scipy.io.wavfile import write, read
 from pydub import AudioSegment
-
+from gradio import Dropdown
 file_upload_available = os.environ.get("ALLOW_FILE_UPLOAD")
 MAX_NUMBER_SENTENCES = 10
 
-import json
 with open("characters.json", "r") as file:
     data = json.load(file)
     characters = [
@@ -24,44 +25,47 @@ with open("characters.json", "r") as file:
         }
         for item in data
     ]
-
-from TTS.api import TTS
+
 tts = TTS("tts_models/multilingual/multi-dataset/bark", gpu=True)
 
+
 def cut_wav(input_path, max_duration):
     # Load the WAV file
     audio = AudioSegment.from_wav(input_path)
-
+
     # Calculate the duration of the audio
     audio_duration = len(audio) / 1000  # Convert milliseconds to seconds
-
+
     # Determine the duration to cut (maximum of max_duration and actual audio duration)
     cut_duration = min(max_duration, audio_duration)
-
+
     # Cut the audio
-    cut_audio = audio[:int(cut_duration * 1000)]  # Convert seconds to milliseconds
-
+    # Convert seconds to milliseconds
+    cut_audio = audio[:int(cut_duration * 1000)]
+
     # Get the input file name without extension
    file_name = os.path.splitext(os.path.basename(input_path))[0]
-
+
     # Construct the output file path with the original file name and "_cut" suffix
     output_path = f"{file_name}_cut.wav"
-
+
     # Save the cut audio as a new WAV file
     cut_audio.export(output_path, format="wav")
 
     return output_path
 
+
 def load_hidden(audio_in):
     return audio_in
 
+
 def load_hidden_mic(audio_in):
     print("USER RECORDED A NEW SAMPLE")
-
-    library_path = 'bark_voices'
-    folder_name = 'audio-0-100'
-    second_folder_name = 'audio-0-100_cleaned'
-
+
+    library_path = 'bark_voices'
+    folder_name = 'audio-0-100'
+    second_folder_name = 'audio-0-100_cleaned'
+
     folder_path = os.path.join(library_path, folder_name)
     second_folder_path = os.path.join(library_path, second_folder_name)
 
@@ -69,35 +73,42 @@ def load_hidden_mic(audio_in):
     if os.path.exists(folder_path):
         try:
             shutil.rmtree(folder_path)
-            print(f"Successfully deleted the folder previously created from last raw recorded sample: {folder_path}")
+            print(
+                f"Successfully deleted the folder previously created from last raw recorded sample: {folder_path}")
         except OSError as e:
             print(f"Error: {folder_path} - {e.strerror}")
     else:
-        print(f"OK, the folder for a raw recorded sample does not exist: {folder_path}")
+        print(
+            f"OK, the folder for a raw recorded sample does not exist: {folder_path}")
 
     if os.path.exists(second_folder_path):
         try:
             shutil.rmtree(second_folder_path)
-            print(f"Successfully deleted the folder previously created from last cleaned recorded sample: {second_folder_path}")
+            print(
+                f"Successfully deleted the folder previously created from last cleaned recorded sample: {second_folder_path}")
         except OSError as e:
             print(f"Error: {second_folder_path} - {e.strerror}")
     else:
-        print(f"Ok, the folder for a cleaned recorded sample does not exist: {second_folder_path}")
-
+        print(
+            f"Ok, the folder for a cleaned recorded sample does not exist: {second_folder_path}")
+
     return audio_in
 
+
 def clear_clean_ckeck():
     return False
 
+
 def wipe_npz_file(folder_path):
     print("YO • a user is manipulating audio inputs")
-
+
+
 def split_process(audio, chosen_out_track):
     gr.Info("Cleaning your audio sample...")
     os.makedirs("out", exist_ok=True)
     write('test.wav', audio[0], audio[1])
     os.system("python3 -m demucs.separate -n mdx_extra_q -j 4 test.wav -o out")
-    #return "./out/mdx_extra_q/test/vocals.wav","./out/mdx_extra_q/test/bass.wav","./out/mdx_extra_q/test/drums.wav","./out/mdx_extra_q/test/other.wav"
+    # return "./out/mdx_extra_q/test/vocals.wav","./out/mdx_extra_q/test/bass.wav","./out/mdx_extra_q/test/drums.wav","./out/mdx_extra_q/test/other.wav"
     if chosen_out_track == "vocals":
         print("Audio sample cleaned")
         return "./out/mdx_extra_q/test/vocals.wav"
@@ -109,7 +120,8 @@ def split_process(audio, chosen_out_track):
         return "./out/mdx_extra_q/test/other.wav"
     elif chosen_out_track == "all-in":
         return "test.wav"
-
+
+
 def update_selection(selected_state: gr.SelectData):
     c_image = characters[selected_state.index]["image"]
     c_title = characters[selected_state.index]["title"]
@@ -117,7 +129,7 @@ def update_selection(selected_state: gr.SelectData):
 
     return c_title, selected_state
 
-
+
 def infer(prompt, input_wav_file, clean_audio, hidden_numpy_audio):
     print("""
     —————
@@ -126,8 +138,8 @@ NEW INFERENCE:
     """)
     if prompt == "":
         gr.Warning("Do not forget to provide a tts prompt !")
-
-    if clean_audio is True :
+
+    if clean_audio is True:
         print("We want to clean audio sample")
         # Extract the file name without the extension
         new_name = os.path.splitext(os.path.basename(input_wav_file))[0]
@@ -139,12 +151,13 @@ NEW INFERENCE:
         else:
             print("This file is new, we need to clean and store it")
             source_path = split_process(hidden_numpy_audio, "vocals")
-
+
             # Rename the file
-            new_path = os.path.join(os.path.dirname(source_path), f"{new_name}_cleaned.wav")
+            new_path = os.path.join(os.path.dirname(
+                source_path), f"{new_name}_cleaned.wav")
             os.rename(source_path, new_path)
             source_path = new_path
-    else :
+    else:
         print("We do NOT want to clean audio sample")
         # Path to your WAV file
         source_path = input_wav_file
@@ -162,10 +175,11 @@ NEW INFERENCE:
     os.makedirs(destination_path, exist_ok=True)
 
     # Move the WAV file to the new directory
-    shutil.move(source_path, os.path.join(destination_path, f"{file_name}.wav"))
+    shutil.move(source_path, os.path.join(
+        destination_path, f"{file_name}.wav"))
 
     # —————
-
+
     # Split the text into sentences based on common punctuation marks
     sentences = re.split(r'(?<=[.!?])\s+', prompt)
 
@@ -173,7 +187,7 @@ NEW INFERENCE:
         gr.Info("Your text is too long. To keep this demo enjoyable for everyone, we only kept the first 10 sentences :) Duplicate this space and set MAX_NUMBER_SENTENCES for longer texts ;)")
         # Keep only the first MAX_NUMBER_SENTENCES sentences
         first_nb_sentences = sentences[:MAX_NUMBER_SENTENCES]
-
+
         # Join the selected sentences back into a single string
         limited_prompt = ' '.join(first_nb_sentences)
         prompt = limited_prompt
@@ -183,22 +197,23 @@ NEW INFERENCE:
 
     gr.Info("Generating audio from prompt")
     tts.tts_to_file(text=prompt,
-                    file_path="output.wav",
-                    voice_dir="bark_voices/",
-                    speaker=f"{file_name}")
+                    file_path="output.wav",
+                    voice_dir="bark_voices/",
+                    speaker=f"{file_name}")
 
     # List all the files and subdirectories in the given directory
     contents = os.listdir(f"bark_voices/{file_name}")
 
     # Print the contents
     for item in contents:
-        print(item)
+        print(item)
     print("Preparing final waveform video ...")
     tts_video = gr.make_waveform(audio="output.wav")
     print(tts_video)
     print("FINISHED")
     return "output.wav", tts_video, gr.update(value=f"bark_voices/{file_name}/{contents[1]}", visible=True), gr.Group.update(visible=True), destination_path
 
+
 def infer_from_c(prompt, c_name):
     print("""
     —————
@@ -208,16 +223,16 @@ NEW INFERENCE:
     if prompt == "":
         gr.Warning("Do not forget to provide a tts prompt !")
         print("Warning about prompt sent to user")
-
+
     print(f"USING VOICE LIBRARY: {c_name}")
     # Split the text into sentences based on common punctuation marks
     sentences = re.split(r'(?<=[.!?])\s+', prompt)
-
+
     if len(sentences) > MAX_NUMBER_SENTENCES:
-        gr.Info("Your text is too long. To keep this demo enjoyable for everyone, we only kept the first 10 sentences :) Duplicate this space and set MAX_NUMBER_SENTENCES for longer texts ;)")
+        gr.Info("Your text is too long. To keep this demo enjoyable for everyone, we only kept the first 10 sentences :) Duplicate this space and set MAX_NUMBER_SENTENCES for longer texts ;)")
         # Keep only the first MAX_NUMBER_SENTENCES sentences
         first_nb_sentences = sentences[:MAX_NUMBER_SENTENCES]
-
+
         # Join the selected sentences back into a single string
         limited_prompt = ' '.join(first_nb_sentences)
         prompt = limited_prompt
@@ -225,18 +240,17 @@ NEW INFERENCE:
     else:
         prompt = prompt
 
-
     if c_name == "":
         gr.Warning("Voice character is not properly selected. Please ensure that the name of the chosen voice is specified in the Character Name input.")
         print("Warning about Voice Name sent to user")
     else:
         print(f"Generating audio from prompt with {c_name} ;)")
-
+
         tts.tts_to_file(text=prompt,
-                        file_path="output.wav",
-                        voice_dir="examples/library/",
-                        speaker=f"{c_name}")
-
+                        file_path="output.wav",
+                        voice_dir="examples/library/",
+                        speaker=f"{c_name}")
+
         print("Preparing final waveform video ...")
         tts_video = gr.make_waveform(audio="output.wav")
         print(tts_video)
@@ -285,38 +299,6 @@ span.record-icon > span.dot.svelte-1thnwz {
     max-width: 15rem;
     height: 36px;
 }
-div#share-btn-container > div {
-    flex-direction: row;
-    background: black;
-    align-items: center;
-}
-#share-btn-container:hover {
-    background-color: #060606;
-}
-#share-btn {
-    all: initial;
-    color: #ffffff;
-    font-weight: 600;
-    cursor:pointer;
-    font-family: 'IBM Plex Sans', sans-serif;
-    margin-left: 0.5rem !important;
-    padding-top: 0.5rem !important;
-    padding-bottom: 0.5rem !important;
-    right:0;
-}
-#share-btn * {
-    all: unset;
-}
-#share-btn-container div:nth-child(-n+2){
-    width: auto !important;
-    min-height: 0px !important;
-}
-#share-btn-container .wrap {
-    display: none !important;
-}
-#share-btn-container.hidden {
-    display: none!important;
-}
 img[src*='#center'] {
     display: block;
     margin: auto;
@@ -340,6 +322,7 @@ img[src*='#center'] {
 .dark .footer>p {
     background: #0b0f19;
 }
+
 .disclaimer {
     text-align: left;
 }
@@ -350,34 +333,48 @@ img[src*='#center'] {
 
 with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
-
-        gr.Markdown("""
-        <h1 style="text-align: center;">Voice Cloning Demo</h1>
-        """)
         with gr.Row():
             with gr.Column():
-                prompt = gr.Textbox(
-                    label = "Text to speech prompt",
-                    info = "One or two sentences at a time is better* (max: 10)",
-                    placeholder = "Hello friend! How are you today?",
-                    elem_id = "tts-prompt"
+                prompt_choices = [
+                    "Hey mom, I have a flat tire and need a e transfer, send it to [email protected] please",
+                    "Hey, it's me. I lost my phone and need some cash for a cab. Can you send it to my friend's PayPal account?",
+                    "I'm locked out of my email account. Can you share your login details with me so I can check my messages?",
+                ]
+
+                # Create a Dropdown with the hardcoded prompts
+                prompt = Dropdown(
+                    label="Text to speech prompt",
+                    choices=prompt_choices,
+                    elem_id="tts-prompt"
                 )
 
-
-            with gr.Column():
-                audio_in = gr.Audio(
-                    label="WAV voice to clone",
-                    type="filepath",
-                    source="upload",
-                    interactive = False
-                )
-                hidden_audio_numpy = gr.Audio(type="numpy", visible=False)
-                submit_btn = gr.Button("Submit")
-
+        with gr.Tab("File upload"):
+
+            with gr.Column():
+
+                if file_upload_available == "True":
+                    audio_in = gr.Audio(
+                        label="WAV voice to clone",
+                        type="filepath",
+                        source="upload"
+                    )
+                else:
+                    audio_in = gr.Audio(
+                        label="WAV voice to clone",
+                        type="filepath",
+                        source="upload",
+                        interactive=False
+                    )
+                clean_sample = gr.Checkbox(
+                    label="Clean sample ?", value=False)
+                hidden_audio_numpy = gr.Audio(
+                    type="numpy", visible=False)
+                submit_btn = gr.Button("Submit")
+
         with gr.Tab("Microphone"):
-            texts_samples = gr.Textbox(label = "Helpers",
-                                       info = "You can read out loud one of these sentences if you do not know what to record :)",
-                                       value = """"Jazz, a quirky mix of groovy saxophones and wailing trumpets, echoes through the vibrant city streets."
+            texts_samples = gr.Textbox(label="Helpers",
+                                       info="You can read out loud one of these sentences if you do not know what to record :)",
+                                       value=""""Jazz, a quirky mix of groovy saxophones and wailing trumpets, echoes through the vibrant city streets."
 ———
 "A majestic orchestra plays enchanting melodies, filling the air with harmony."
 ———
@@ -393,54 +390,88 @@ with gr.Blocks(css=css) as demo:
 ———
 "As evening falls, a soft hush blankets the world, crickets chirping in a soothing rhythm."
 """,
-                                       interactive = False,
-                                       lines = 5
-                                       )
+                                       interactive=False,
+                                       lines=5
+                                       )
             micro_in = gr.Audio(
-                label="Record voice to clone",
-                type="filepath",
-                source="microphone",
-                interactive = True
-            )
-            clean_micro = gr.Checkbox(label="Clean sample ?", value=False)
+                label="Record voice to clone",
+                type="filepath",
+                source="microphone",
+                interactive=True
+            )
+            clean_micro = gr.Checkbox(
+                label="Clean sample ?", value=False)
             micro_submit_btn = gr.Button("Submit")
-
-    audio_in.upload(fn=load_hidden, inputs=[audio_in], outputs=[hidden_audio_numpy], queue=False)
-    micro_in.stop_recording(fn=load_hidden_mic, inputs=[micro_in], outputs=[hidden_audio_numpy], queue=False)
 
+    audio_in.upload(fn=load_hidden, inputs=[audio_in], outputs=[
+                    hidden_audio_numpy], queue=False)
+    micro_in.stop_recording(fn=load_hidden_mic, inputs=[micro_in], outputs=[
+                            hidden_audio_numpy], queue=False)
+
+        with gr.Tab("Voices Characters"):
+            selected_state = gr.State()
+            gallery_in = gr.Gallery(
+                label="Character Gallery",
+                value=[(item["image"], item["title"])
+                       for item in characters],
+                interactive=True,
+                allow_preview=False,
+                columns=3,
+                elem_id="gallery",
+                show_share_button=False
+            )
+            c_submit_btn = gr.Button("Submit")
 
         with gr.Column():
-
+
             cloned_out = gr.Audio(
                 label="Text to speech output",
-                visible = False
+                visible=False
             )
-
+
             video_out = gr.Video(
-                label = "Waveform video",
-                elem_id = "voice-video-out"
+                label="Waveform video",
+                elem_id="voice-video-out"
            )
-
+
             npz_file = gr.File(
-                label = ".npz file",
-                visible = False
+                label=".npz file",
+                visible=False
             )
 
             folder_path = gr.Textbox(visible=False)
 
+        character_name = gr.Textbox(
+            label="Character Name",
+            placeholder="Name that voice character",
+            elem_id="character-name"
+        )
+
+        voice_description = gr.Textbox(
+            label="description",
+            placeholder="How would you describe that voice ? ",
+            elem_id="voice-description"
+        )
+
+        gallery_in.select(
+            update_selection,
+            outputs=[character_name, selected_state],
+            queue=False,
+            show_progress=False,
+        )
 
-
     audio_in.change(fn=wipe_npz_file, inputs=[folder_path], queue=False)
     micro_in.clear(fn=wipe_npz_file, inputs=[folder_path], queue=False)
     submit_btn.click(
-        fn = infer,
-        inputs = [
+        fn=infer,
+        inputs=[
             prompt,
             audio_in,
+            clean_sample,
             hidden_audio_numpy
         ],
-        outputs = [
-            cloned_out,
+        outputs=[
+            cloned_out,
             video_out,
             npz_file,
             folder_path
@@ -448,19 +479,32 @@ with gr.Blocks(css=css) as demo:
     )
 
     micro_submit_btn.click(
-        fn = infer,
-        inputs = [
+        fn=infer,
+        inputs=[
             prompt,
             micro_in,
             clean_micro,
             hidden_audio_numpy
         ],
-        outputs = [
-            cloned_out,
+        outputs=[
+            cloned_out,
             video_out,
             npz_file,
             folder_path
         ]
     )
 
+    c_submit_btn.click(
+        fn=infer_from_c,
+        inputs=[
+            prompt,
+            character_name
+        ],
+        outputs=[
+            cloned_out,
+            video_out,
+            npz_file,
+        ]
+    )
+
 demo.queue(api_open=False, max_size=10).launch()
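
Note: infer() and infer_from_c() share the same sentence-capping logic (regex split on sentence-ending punctuation, then truncate to MAX_NUMBER_SENTENCES). A minimal standalone sketch of that logic; limit_prompt is an illustrative name, not a helper defined in app.py:

import re

MAX_NUMBER_SENTENCES = 10  # same cap as app.py


def limit_prompt(prompt, max_sentences=MAX_NUMBER_SENTENCES):
    # Split on ., ! or ? followed by whitespace, keeping the punctuation
    sentences = re.split(r'(?<=[.!?])\s+', prompt)
    if len(sentences) > max_sentences:
        # Keep only the first max_sentences sentences and rejoin them
        return ' '.join(sentences[:max_sentences])
    return prompt


print(limit_prompt("One. Two! Three?"))  # under the cap, so returned unchanged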
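Note: the new "File upload" tab builds audio_in in two branches that differ only in interactive=. A hypothetical condensed form under that assumption, written against the gradio 3.x Audio API this app uses (source= is a 3.x parameter); it may differ subtly from the two-branch original if gradio infers interactivity when the flag is omitted:

import os
import gradio as gr

with gr.Blocks() as demo:
    # Upload stays read-only unless the Space sets ALLOW_FILE_UPLOAD="True"
    audio_in = gr.Audio(
        label="WAV voice to clone",
        type="filepath",
        source="upload",
        interactive=(os.environ.get("ALLOW_FILE_UPLOAD") == "True"),
    )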
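Note: both inference paths drive Coqui TTS the same way: voice_dir names a directory of per-speaker subfolders and speaker selects one. A sketch with illustrative values (audio-0-100 is the raw-mic folder name used above; the text is the old placeholder prompt):

from TTS.api import TTS

tts = TTS("tts_models/multilingual/multi-dataset/bark", gpu=True)
tts.tts_to_file(
    text="Hello friend! How are you today?",
    file_path="output.wav",    # generated speech is written here
    voice_dir="bark_voices/",  # library of cloned voices
    speaker="audio-0-100",     # subfolder holding the reference sample
)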