voice_clone_v3 / app.py
ahassoun's picture
Upload 33 files
eb21a2f
raw
history blame
4.42 kB
from TTS.api import TTS
import json
import gradio as gr
from share_btn import community_icon_html, loading_icon_html, share_js
import os
import shutil
import re
import numpy as np
from scipy.io import wavfile
from scipy.io.wavfile import write, read
from pydub import AudioSegment
# Raw value of the ALLOW_FILE_UPLOAD environment variable (None when unset).
file_upload_available = os.environ.get("ALLOW_FILE_UPLOAD")

# NOTE(review): presumably an upper bound on prompt sentences; nothing in the
# visible code reads it -- confirm before removing.
MAX_NUMBER_SENTENCES = 10

# The encoding is pinned so the catalogue parses the same way regardless of
# the platform's locale default (JSON interchange text is UTF-8).
with open("characters.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Keep only the three fields of each entry; an entry missing any of them
# raises KeyError here, at import time.
characters = [
    {
        "image": item["image"],
        "title": item["title"],
        "speaker": item["speaker"],
    }
    for item in data
]

# Coqui TTS wrapper around the multilingual Bark model, created with gpu=False.
tts = TTS("tts_models/multilingual/multi-dataset/bark", gpu=False)
def _remove_stale_folder(path, deleted_message, missing_message):
    """Delete ``path`` recursively if it exists and print what happened.

    Deletion is best-effort: an ``OSError`` raised by ``shutil.rmtree`` is
    reported on stdout and not re-raised.
    """
    if not os.path.exists(path):
        print(f"{missing_message}: {path}")
        return
    try:
        shutil.rmtree(path)
    except OSError as e:
        print(f"Error: {path} - {e.strerror}")
    else:
        print(f"{deleted_message}: {path}")


def load_hidden_mic(audio_in):
    """Purge folders left by the previous recording and pass the audio through.

    Removes ``bark_voices/audio-0-100`` and ``bark_voices/audio-0-100_cleaned``
    when they exist, logging each outcome to stdout.

    Args:
        audio_in: The value received from the microphone component; it is
            not inspected.

    Returns:
        ``audio_in``, unchanged.
    """
    print("USER RECORDED A NEW SAMPLE")
    library_path = 'bark_voices'
    # (folder name, message on successful deletion, message when absent).
    # The message texts are kept exactly as previously emitted.
    stale_folders = (
        (
            'audio-0-100',
            "Successfully deleted the folder previously created from last raw recorded sample",
            "OK, the folder a raw recorded sample does not exist",
        ),
        (
            'audio-0-100_cleaned',
            "Successfully deleted the folder previously created from last cleaned recorded sample",
            "Ok, the folderfor a cleaned recorded sample does not exist",
        ),
    )
    print("We need to clean previous util files, if needed:")
    for folder_name, deleted_message, missing_message in stale_folders:
        _remove_stale_folder(
            os.path.join(library_path, folder_name),
            deleted_message,
            missing_message,
        )
    return audio_in
def infer(hidden_numpy_audio):
    """Clone the recorded voice and speak the hard-coded prompt with it.

    The recording is written to ``bark_voices/audio-0-100/audio-0-100.wav``
    (the folder that ``load_hidden_mic`` purges on every new recording) and
    that folder name is passed to the TTS model as the speaker.

    Args:
        hidden_numpy_audio: Value of the hidden ``gr.Audio(type="numpy")``
            component. Assumed to be a ``(sample_rate, samples)`` tuple, as
            Gradio documents for ``type="numpy"`` -- TODO confirm against the
            installed Gradio version.

    Returns:
        A ``(audio_path, video_path)`` pair matching the two output
        components wired to the submit button: the generated ``output.wav``
        and the value returned by ``gr.make_waveform``.

    Raises:
        gr.Error: If no recording is available.
    """
    # Same banner as before, spelled with escapes so the em dashes survive
    # any source re-encoding.
    print("\n" + "\u2014" * 5 + "\nNEW INFERENCE:\n" + "\u2014" * 7 + "\n")

    if hidden_numpy_audio is None:
        raise gr.Error("Please record a voice sample before submitting.")
    sample_rate, samples = hidden_numpy_audio

    # NOTE(review): Coqui's Bark docs describe a speaker sample stored as
    # <voice_dir>/<speaker>/<speaker>.wav -- confirm for the pinned TTS version.
    file_name = "audio-0-100"
    destination_path = os.path.join("bark_voices", file_name)
    os.makedirs(destination_path, exist_ok=True)
    write(os.path.join(destination_path, f"{file_name}.wav"), sample_rate, samples)

    prompt = "Hi mom, I have a broken tire and need a transfer. Can you send me some money please?"
    gr.Info("Generating audio from prompt")
    tts.tts_to_file(text=prompt,
                    file_path="output.wav",
                    voice_dir="bark_voices/",
                    speaker=file_name)

    print("Preparing final waveform video ...")
    tts_video = gr.make_waveform(audio="output.wav")
    print(tts_video)
    print("FINISHED")
    # Exactly two values: the click handler declares two output components.
    return "output.wav", tts_video
# Custom stylesheet passed to gr.Blocks(css=...): it widens and enlarges the
# microphone record button and resizes the record icon and its dot.
# NOTE(review): the `.svelte-1thnwz` suffix looks like a build-generated class
# name, so these selectors are presumably tied to one Gradio release --
# confirm after any Gradio upgrade.
css = """
.mic-wrap > button {
width: 100%;
height: 60px;
font-size: 1.4em!important;
}
.record-icon.svelte-1thnwz {
display: flex;
position: relative;
margin-right: var(--size-2);
width: unset;
height: unset;
}
span.record-icon > span.dot.svelte-1thnwz {
width: 20px!important;
height: 20px!important;
}
"""
# Intro banner rendered at the top of the page through gr.Markdown.
# The quoted sentence must stay in sync with the prompt hard-coded in infer().
# The curly quotes are written as HTML entities so the text stays ASCII-only
# and cannot be garbled by a source re-encoding.
html_header = """
<h1 style="text-align: center;">Coqui + Bark Voice Cloning</h1>
<p style="text-align: center;">
Mimic any voice character in less than 2 minutes with this <a href="https://tts.readthedocs.io/en/dev/models/bark.html" target="_blank">Coqui TTS + Bark</a> demo ! <br />
Record a clean 20 seconds voice using the microphone provided.<br />
The hard-coded TTS prompt is: &ldquo;Hi mom, I have a broken tire and need a transfer. Can you send me some money please?&rdquo;<br />
</p>
"""
# Page layout and event wiring.
# NOTE(review): components and listeners are declared in the same order as
# before, since Blocks presumably lays components out in creation order.
with gr.Blocks(css=css) as demo:
    gr.Markdown(html_header)

    # Microphone recorder; its value is handed to callbacks as a file path.
    micro_in = gr.Audio(
        source="microphone",
        type="filepath",
        interactive=True,
        label="Record voice to clone",
    )
    # Invisible audio component that receives the recording from
    # load_hidden_mic and feeds it to infer in numpy form.
    hidden_audio_numpy = gr.Audio(visible=False, type="numpy")
    micro_submit_btn = gr.Button("Submit")

    # When a recording stops: clean up old folders and copy the recording
    # into the hidden component (this event is not queued).
    micro_in.stop_recording(
        queue=False,
        fn=load_hidden_mic,
        inputs=[micro_in],
        outputs=[hidden_audio_numpy],
    )

    # Output widgets; the audio player is created hidden.
    cloned_out = gr.Audio(visible=False, label="Text to speech output")
    video_out = gr.Video(elem_id="voice-video-out", label="Waveform video")

    # Submit runs the voice-cloning inference on the hidden recording.
    micro_submit_btn.click(
        fn=infer,
        outputs=[cloned_out, video_out],
        inputs=[hidden_audio_numpy],
    )

# Enable the request queue (max 10 entries, api_open=False), then start
# the server on whatever object queue() returned.
queued_demo = demo.queue(max_size=10, api_open=False)
queued_demo.launch()