voice_clone_v2

Paused

App Files Files Community

voice_clone_v2 / app.py

ahassoun

Update app.py

8ab15de about 1 year ago

raw

history blame

8.04 kB

	from TTS.api import TTS
	import gradio as gr
	from gradio import Dropdown
	from scipy.io.wavfile import write
	import os
	import shutil
	import re
	# Module-level placeholder; not referenced elsewhere in the visible code.
	user_choice = ""
	# Upper bound on the number of sentences kept from a prompt (checked in infer).
	MAX_NUMBER_SENTENCES = 10
	# Raw value of the ALLOW_FILE_UPLOAD environment variable (None when unset);
	# not read elsewhere in the visible code.
	file_upload_available = os.environ.get("ALLOW_FILE_UPLOAD")
	# Demo scripts keyed by theme, then by script type
	# ("Positive", "Negative", "Random").
	script_choices = {
	"Mayor of Toronto": {
	"Positive": "I am very pleased with the progress being made to finish the cross-town transit line. This has been an excellent use of taxpayer dollars.",
	"Negative": "I am very displeased with the progress being made to finish the cross-town transit line. This has been an embarrassing use of taxpayer dollars.",
	"Random": "I like being Mayor because I don’t have to pay my parking tickets."
	},
	"Witness": {
	"Positive": "Yes, John is my friend. He was at my house watching the baseball game all night.",
	"Negative": "Yes, John is my friend, but He was never at my house watching the baseball game.",
	"Random": "He is my friend, but I do not trust John."
	},
	"Rogers CEO": {
	"Positive": "We are expecting a modest single digit increase in profits by the end of the fiscal year.",
	"Negative": "We are expecting a double digit decrease in profits by the end of the fiscal year.",
	"Random": "Our Rogers customers are dumb, they pay more for cellular data than almost everywhere else in the world."
	},
	"Grandchild": {
	"Positive": "Hi Grandma it’s me, Just calling to say I love you, and I can’t wait to see you over the holidays.",
	"Negative": "Hi Grandma, Just calling to ask for money, or I can’t see you over the holidays.",
	"Random": "Grandma, I can’t find your email address. I need to send you something important."
	}
	}
	# Bark multilingual TTS model, created once when the module is loaded
	# (gpu=True is passed to the TTS constructor).
	tts = TTS("tts_models/multilingual/multi-dataset/bark", gpu=True)


	def infer(prompt, input_wav_file, script_type, selected_theme):
	    """Clone the recorded voice with Bark and speak the selected script.

	    Args:
	        prompt: Text of the script box; only checked for being non-empty.
	        input_wav_file: Path of the recorded voice sample (WAV file).
	        script_type: Key into the chosen theme's scripts
	            (e.g. "Random" or "Negative").
	        selected_theme: Key into ``script_choices``.

	    Returns:
	        A 4-tuple matching the four outputs wired to the submit button:
	        the generated audio path, the waveform video, an update for the
	        ``.npz`` file component, and the folder holding the voice sample.
	        When a required input is missing, a warning is shown and four
	        no-op updates are returned so the UI is left untouched.
	    """
	    print("Prompt:", prompt)
	    print("Input WAV File:", input_wav_file)
	    print("Script Type:", script_type)
	    print(selected_theme)
	    print("""
	—————
	NEW INFERENCE:
	———————
	""")

	    # One no-op update per wired output; returning None instead would make
	    # Gradio complain that the handler produced too few output values.
	    no_change = (gr.update(), gr.update(), gr.update(), gr.update())

	    if not prompt:
	        gr.Warning("Do not forget to provide a tts prompt !")
	        return no_change

	    if not input_wav_file:
	        # Without a recording there is no path to derive the speaker from.
	        gr.Warning("Please record a voice sample before submitting.")
	        return no_change

	    theme_dict = script_choices.get(selected_theme, {})
	    chosen_script = theme_dict.get(script_type, "")
	    if not chosen_script:
	        # Nothing to synthesize until both dropdowns have a valid selection.
	        gr.Warning("Please select a theme and a script type before submitting.")
	        return no_change

	    # Store the sample as bark_voices/<name>/<name>.wav; <name> doubles as
	    # the speaker id handed to the TTS call below.
	    destination_directory = "bark_voices"
	    file_name = os.path.splitext(os.path.basename(input_wav_file))[0]
	    destination_path = os.path.join(destination_directory, file_name)
	    os.makedirs(destination_path, exist_ok=True)
	    shutil.move(input_wav_file, os.path.join(
	        destination_path, f"{file_name}.wav"))

	    # Apply the sentence cap to the text that is actually synthesized.
	    sentences = re.split(r'(?<=[.!?])\s+', chosen_script)
	    if len(sentences) > MAX_NUMBER_SENTENCES:
	        gr.Info("Your text is too long. To keep this demo enjoyable for everyone, we only kept the first 10 sentences :) Duplicate this space and set MAX_NUMBER_SENTENCES for longer texts ;)")
	        chosen_script = ' '.join(sentences[:MAX_NUMBER_SENTENCES])

	    gr.Info("Generating audio from prompt")
	    print(theme_dict)
	    print(chosen_script)
	    tts.tts_to_file(text=chosen_script,
	                    file_path="output.wav",
	                    voice_dir="bark_voices/",
	                    speaker=f"{file_name}")

	    contents = os.listdir(f"bark_voices/{file_name}")
	    for item in contents:
	        print(item)

	    # os.listdir order is arbitrary, so look the .npz file up by extension
	    # instead of by position.
	    # NOTE(review): presumably Bark writes the cloned-voice .npz next to the
	    # sample during tts_to_file — confirm against the TTS documentation.
	    npz_name = next(
	        (item for item in contents if item.endswith(".npz")), None)
	    if npz_name is None:
	        npz_update = gr.update(visible=False)
	    else:
	        npz_update = gr.update(
	            value=f"bark_voices/{file_name}/{npz_name}", visible=True)

	    print("Preparing final waveform video ...")
	    tts_video = gr.make_waveform(audio="output.wav")
	    print(tts_video)
	    print("FINISHED")
	    return "output.wav", tts_video, npz_update, destination_path


	# Emoji looked up for each theme; the keys mirror the themes in script_choices.
	theme_emojis = {
	"Mayor of Toronto": "🏙️",
	"Witness": "👤",
	"Rogers CEO": "📱",
	"Grandchild": "👪"
	}


	# Custom stylesheet handed to gr.Blocks(css=css): constrains and centers the
	# main column, sizes the microphone button and record icon, defines a spin
	# animation, and positions the #theme-emoji element.
	css = """
	#col-container {max-width: 780px; margin-left: auto; margin-right: auto; background-size: contain; background-repeat: no-repeat;}
	#theme-emoji-bg {position: absolute; top: 0; left: 0; width: 100%; height: 100%; z-index: -1; opacity: 0.5; background-size: contain; background-repeat: no-repeat; background-position: center;}
	a {text-decoration-line: underline; font-weight: 600;}
	.mic-wrap > button {
	width: 100%;
	height: 60px;
	font-size: 1.4em!important;
	}
	.record-icon.svelte-1thnwz {
	display: flex;
	position: relative;
	margin-right: var(--size-2);
	width: unset;
	height: unset;
	}
	span.record-icon > span.dot.svelte-1thnwz {
	width: 20px!important;
	height: 20px!important;
	}
	.animate-spin {
	animation: spin 1s linear infinite;
	}
	@keyframes spin {
	from {
	transform: rotate(0deg);
	}
	to {
	transform: rotate(360deg);
	}
	}
	#theme-emoji {
	position: absolute;
	top: 10px;
	right: 10px;
	}
	"""


	def load_hidden_mic(audio_in):
	    """Log that a new sample was recorded and pass the audio through.

	    Args:
	        audio_in: Value of the microphone component after recording stops.

	    Returns:
	        ``audio_in`` unchanged, so it can be mirrored into another component.
	    """
	    print("USER RECORDED A NEW SAMPLE")
	    return audio_in


	def update_script_text(theme, script_type):
	    """Look up the texts and emoji to display for the current selection.

	    Args:
	        theme: Key into ``script_choices`` / ``theme_emojis``.
	        script_type: Key into the chosen theme's scripts.

	    Returns:
	        A 4-tuple ``(positive_script, output_script, theme_emoji, theme)``.
	        Unknown themes or script types yield empty strings for the
	        corresponding entries; ``theme`` is echoed back as given.
	    """
	    # Resolve the theme once; an unknown theme falls back to an empty mapping.
	    theme_scripts = script_choices.get(theme, {})
	    positive_script = theme_scripts.get("Positive", "")
	    output_script = theme_scripts.get(script_type, "")
	    theme_emoji = theme_emojis.get(theme, "")

	    return positive_script, output_script, theme_emoji, theme



	# Gradio UI: the left column collects the theme, script and voice recording;
	# the right column shows the generated outputs.
	# NOTE(review): the nesting below was reconstructed from a listing whose
	# indentation was lost — confirm it matches the intended layout.
	with gr.Blocks(css=css) as demo:
	    with gr.Column(elem_id="col-container"):
	        with gr.Row():
	            with gr.Column():
	                theme_emoji_output = gr.Label(label="Theme Emoji")
	                theme_dropdown = gr.Dropdown(
	                    label="1. Select a Theme",
	                    choices=list(script_choices.keys()))

	                script_text = gr.Textbox(
	                    label="2 & 3. Read the script below aloud THREE times for the best output:",
	                    lines=5,
	                )
	                script_type_dropdown = gr.Dropdown(
	                    label="4. Select the Script Type for Bot Output",
	                    choices=["Random", "Negative"])
	                output_script_text = gr.Textbox(
	                    label="The bot will try to emulate the following script:",
	                    lines=5,
	                )

	                # Microphone input replaces the former file upload.
	                micro_in = gr.Audio(
	                    label="Record voice to clone",
	                    type="filepath",
	                    source="microphone",
	                    interactive=True
	                )

	                hidden_audio_numpy = gr.Audio(type="numpy", visible=False)
	                submit_btn = gr.Button("Submit")

	            with gr.Column():
	                cloned_out = gr.Audio(
	                    label="Text to speech output", visible=False)

	                video_out = gr.Video(label="Waveform video",
	                                     elem_id="voice-video-out")

	                npz_file = gr.File(label=".npz file", visible=False)

	                folder_path = gr.Textbox(visible=False)

	    # Refresh both script boxes and the emoji whenever either dropdown
	    # changes. Each dropdown is registered exactly once so the handler does
	    # not run twice per change.
	    theme_dropdown.change(
	        fn=update_script_text,
	        inputs=[theme_dropdown, script_type_dropdown],
	        outputs=[script_text, output_script_text, theme_emoji_output])
	    script_type_dropdown.change(
	        fn=update_script_text,
	        inputs=[theme_dropdown, script_type_dropdown],
	        outputs=[script_text, output_script_text, theme_emoji_output])

	    # Mirror a finished recording into the hidden audio component.
	    micro_in.stop_recording(
	        fn=load_hidden_mic,
	        inputs=[micro_in],
	        outputs=[hidden_audio_numpy],
	        queue=False)

	    submit_btn.click(
	        fn=infer,
	        # theme_dropdown is passed so infer can pick the script to synthesize.
	        inputs=[script_text, micro_in, script_type_dropdown, theme_dropdown],
	        outputs=[cloned_out, video_out, npz_file, folder_path]
	    )

	# Queue requests (at most 10 waiting, API closed) and start the app.
	demo.queue(api_open=False, max_size=10).launch()