barreloflube committed
Commit 7f167fb
Parent(s): dc868ec

Add sox and libsox-dev to requirements.txt

Files changed:
- .gitmodules +3 -0
- config.py +7 -4
- packages.txt +2 -0
- requirements.txt +25 -3
- tabs/audios/events.py +168 -22
- tabs/audios/load_models.py +25 -4
- tabs/audios/modules/CosyVoice +1 -0
- tabs/audios/ui.py +15 -19
.gitmodules ADDED
@@ -0,0 +1,3 @@
+[submodule "tabs/audios/modules/CosyVoice"]
+    path = tabs/audios/modules/CosyVoice
+    url = https://github.com/FunAudioLLM/CosyVoice.git
config.py CHANGED
@@ -4,8 +4,10 @@ import json
 import torch
 
 
-# Setup
-
+# Setup Repo
+
+# Audios
+os.environ['PYTHONPATH'] = f'{os.path.dirname(__file__)}/modules/CosyVoice/third_party/Matcha-TTS:{os.environ.get("PYTHONPATH", "")}' # add tabs/audios/modules/CosyVoice/third_party/Matcha-TTS to PYTHONPATH
 
 
 css = """
@@ -32,9 +34,10 @@ body {
 class Config:
     # General
     SECRET_KEY = os.environ.get('SECRET_KEY', '12345678')
+    MODEL_DOWNLOAD_DIR = os.environ.get('HF_HOME', os.environ.get('HF_HUB_CACHE', '/.cache'))
+    os.makedirs(MODEL_DOWNLOAD_DIR, exist_ok=True)
 
     # Images
-    # IMAGE_MODELS = ["black-forest-labs/FLUX.1-dev", "stabilityai/stable-diffusion-xl-base-1.0"]
     IMAGES_MODELS = [{"repo_id": "black-forest-labs/FLUX.1-dev", "loader": "flux", "compute_type": torch.bfloat16,}, {"repo_id": "stabilityai/stable-diffusion-xl-base-1.0", "loader": "sdxl", "compute_type": torch.float16,}]
     with open('data/loras/sdxl.json') as f:
         IMAGES_LORAS_SDXL = json.load(f)
@@ -80,4 +83,4 @@ class Config:
 
 
     # Audios
-    AUDIOS_MODELS = [
+    AUDIOS_MODELS = []
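One caveat with the new PYTHONPATH line in config.py: environment variables set at import time only reach child processes, not the interpreter that is already running. A minimal sketch of the in-process equivalent (the sys.path call is an assumption, not part of this commit):

    import os
    import sys

    # Hypothetical helper: make the vendored Matcha-TTS importable in the current process.
    matcha_dir = f'{os.path.dirname(__file__)}/modules/CosyVoice/third_party/Matcha-TTS'
    sys.path.insert(0, matcha_dir)  # affects this interpreter immediately
    os.environ['PYTHONPATH'] = f'{matcha_dir}:{os.environ.get("PYTHONPATH", "")}'  # affects subprocesses only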
packages.txt CHANGED
@@ -1,2 +1,4 @@
 ffmpeg
 libgl1-mesa-glx
+sox
+libsox-dev
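sox and libsox-dev are the system libraries that torchaudio's sox backend links against. A quick, hedged sanity check after these packages are installed (assuming torchaudio is already in the environment):

    import torchaudio

    # Lists the audio I/O backends available to this torchaudio build;
    # 'sox' should appear once libsox is present (the exact set depends on the build).
    print(torchaudio.list_audio_backends())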
requirements.txt CHANGED
@@ -1,4 +1,3 @@
--extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
 spaces
 gradio
 torch
@@ -15,7 +14,6 @@ mediapipe
 controlnet_aux
 insightface
 omegaconf
-git+https://github.com/TencentARC/PhotoMaker.git
 torchao
 git+https://github.com/xhinker/sd_embed.git
 clip_interrogator
@@ -24,4 +22,28 @@ git+https://github.com/TencentARC/GFPGAN.git
 git+https://github.com/xinntao/Real-ESRGAN.git
 aura_sr
 deepfilternet
-
+conformer
+deepspeed
+gdown
+grpcio
+grpcio-tools
+hydra-core
+HyperPyYAML
+inflect
+librosa
+lightning
+matplotlib
+modelscope
+networkx
+onnx
+openai-whisper
+protobuf
+pydantic
+rich
+soundfile
+tensorboard
+WeTextProcessing
+wget
+fastapi-cli
+spacy
+spacy_langdetect
tabs/audios/events.py CHANGED
@@ -1,3 +1,4 @@
+import re
 import os
 import gc
 import tempfile
@@ -5,11 +6,13 @@ from uuid import uuid4
 
 import spaces
 import gradio as gr
+import torchaudio
 import numpy as np
 from df.enhance import enhance, load_audio, save_audio
 
 from config import Config
 from .load_models import *
+from .modules.CosyVoice.cosyvoice.utils.file_utils import load_wav
 
 
 # Helper functions
@@ -17,6 +20,103 @@ def create_temp_file():
     return tempfile.NamedTemporaryFile(delete=False)
 
 
+
+def assign_language_tags(text):
+    # Process the text:
+    # based on the language, assign <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
+    # at the start of the text for that language
+    # e.g. input: 你好 Hello こんにちは 你好 안녕하세요
+    # output: <|zh|>你好<|en|>Hello<|jp|>こんにちは<|yue|>你好<|ko|>안녕하세요
+    # Define language patterns
+    patterns = {
+        'zh': r'[\u4e00-\u9fff]+',  # Chinese characters
+        'en': r'[a-zA-Z]+',  # English letters
+        'jp': r'[\u3040-\u30ff\u31f0-\u31ff]+',  # Japanese characters
+        'ko': r'[\uac00-\ud7a3]+',  # Korean characters
+    }
+
+    # Find all matches
+    matches = []
+    for lang, pattern in patterns.items():
+        for match in re.finditer(pattern, text):
+            matches.append((match.start(), match.end(), lang, match.group()))
+
+    # Sort matches by start position
+    matches.sort(key=lambda x: x[0])
+
+    # Build the result string
+    result = []
+    last_end = 0
+    zh_count = 0
+    for start, end, lang, content in matches:
+        if start > last_end:
+            result.append(text[last_end:start])
+        if lang == 'zh':
+            zh_count += 1
+            if zh_count > 1:
+                lang = 'yue'
+        result.append(f'<|{lang}|>{content}')
+        last_end = end
+
+    if last_end < len(text):
+        result.append(text[last_end:])
+
+    return ''.join(result)
+
+
+def update_mode(mode, sft_speaker, speaker_audio, voice_instructions):
+    if mode == 'SFT':
+        return (
+            gr.update( # sft_speaker
+
+            ),
+            gr.update( # speaker_audio,
+                visible=False,
+            ),
+            gr.update( # voice_instructions,
+                visible=False,
+            ),
+        )
+    elif mode == 'VC':
+        return (
+            gr.update( # sft_speaker,
+                visible=False,
+            ),
+            gr.update( # speaker_audio,
+                visible=True,
+            ),
+            gr.update( # voice_instructions,
+                visible=True,
+            ),
+        )
+    elif mode == 'VC-CrossLingual':
+        return (
+            gr.update( # sft_speaker,
+                visible=False,
+            ),
+            gr.update( # speaker_audio,
+                visible=True,
+            ),
+            gr.update( # voice_instructions,
+                visible=False,
+            ),
+        )
+    elif mode == 'Instruct':
+        return (
+            gr.update( # sft_speaker,
+                visible=True,
+            ),
+            gr.update( # speaker_audio,
+                visible=False,
+            ),
+            gr.update( # voice_instructions,
+                visible=True,
+            ),
+        )
+    else:
+        raise gr.Error('Invalid mode')
+
+
 @spaces.GPU(duration=10)
 def clear_audio(audio: np.ndarray):
     # Save the audio file
@@ -36,30 +136,76 @@ def clear_audio(audio: np.ndarray):
 
 
 @spaces.GPU(duration=20)
-def gen_audio(
-
-
-
-
-
-
-
-
-
-
-
+def gen_audio(text, mode, sft_speaker = None, speaker_audio = None, voice_instructions = None):
+    if mode in ('VC', 'VC-CrossLingual'):
+        # Save the speaker audio file
+        speaker_audio_file = create_temp_file()
+        np.save(speaker_audio_file.name, speaker_audio)
+        prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
+    else:
+        speaker_audio_file = None
+        prompt_speech_16k = None
+
+    # Assign language tags
+    text = assign_language_tags(text)
 
     # Generate the audio
-
-
-
-
-
-
-
-
-
+    out_file = create_temp_file()
+    if mode == 'SFT':
+        if not sft_speaker:
+            raise gr.Error('Please select a speaker')
+
+        for i, j in enumerate(cv_base.inference_sft(
+            tts_text=text,
+            spk_id=sft_speaker,
+        )):
+            torchaudio.save(
+                out_file.name.format(i),
+                j['tts_speech'],
+                22050,
+            )
+    elif mode == 'VC':
+        if not speaker_audio_file:
+            raise gr.Error('Please upload an audio')
+
+        for i, j in enumerate(cv_sft.inference_zero_shot(
+            tts_text=text,
+            prompt_text=voice_instructions,
+            prompt_speech_16k=prompt_speech_16k,
+        )):
+            torchaudio.save(
+                out_file.name.format(i),
+                j['tts_speech'],
+                22050,
+            )
+    elif mode == 'VC-CrossLingual':
+        if not speaker_audio_file:
+            raise gr.Error('Please upload an audio')
+
+        for i, j in enumerate(cv_sft.inference_cross_lingual(
+            tts_text=text,
+            prompt_speech_16k=prompt_speech_16k,
+        )):
+            torchaudio.save(
+                out_file.name.format(i),
+                j['tts_speech'],
+                22050,
+            )
+    elif mode == 'Instruct':
+        if not voice_instructions:
+            raise gr.Error('Please enter voice instructions')
+
+        for i, j in enumerate(cv_instruct.inference_instruct(
+            tts_text=text,
+            spk_id=sft_speaker,
+            instruct_text=voice_instructions,
+        )):
+            torchaudio.save(
+                out_file.name.format(i),
+                j['tts_speech'],
+                22050,
+            )
 
     return gr.update( # output_audio
-        value=
+        value=out_file.name,
     )
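For reference, assign_language_tags only inspects character ranges; a small example of what the regex logic above produces (illustrative, not asserted by the commit):

    # Tag each script run with its language marker.
    print(assign_language_tags('你好 Hello'))
    # -> '<|zh|>你好 <|en|>Hello'
    # A second Chinese run in the same string would be tagged <|yue|> by the zh_count heuristic.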
tabs/audios/load_models.py CHANGED
@@ -1,17 +1,38 @@
+import os
+
 import torch
 from df.enhance import init_df
-from
+from modelscope import snapshot_download
 
 from config import Config
+from .modules.CosyVoice.cosyvoice.cli.cosyvoice import CosyVoice
 
 
 def init_sys():
     device = "cuda" if torch.cuda.is_available() else "cpu"
 
+    # Load DeepFilterNet2
     df_model, df_state, _ = init_df()
 
-
+    # Download CosyVoice models
+    snapshot_download('iic/CosyVoice-300M', local_dir=f'{Config.MODEL_DOWNLOAD_DIR}/audios/CosyVoice-300M')
+    snapshot_download('iic/CosyVoice-300M-SFT', local_dir=f'{Config.MODEL_DOWNLOAD_DIR}/audios/CosyVoice-300M-SFT')
+    snapshot_download('iic/CosyVoice-300M-Instruct', local_dir=f'{Config.MODEL_DOWNLOAD_DIR}/audios/CosyVoice-300M-Instruct')
+    snapshot_download('iic/CosyVoice-ttsfrd', local_dir=f'{Config.MODEL_DOWNLOAD_DIR}/audios/CosyVoice-ttsfrd')
+
+    # Add `tabs/audios/modules/CosyVoice/third_party/Matcha-TTS` to your `PYTHONPATH`
+    os.environ['PYTHONPATH'] = f'{os.path.dirname(__file__)}/modules/CosyVoice/third_party/Matcha-TTS:{os.environ.get("PYTHONPATH", "")}'
+
+    # Load CosyVoice TTS
+    cv_base = CosyVoice('pretrained_models/CosyVoice-300M')
+
+    # Load CosyVoice SFT
+    cv_sft = CosyVoice('pretrained_models/CosyVoice-300M-SFT')
+    sft_speakers = cv_sft.list_avaliable_spks()
+
+    # Load CosyVoice Instruct
+    cv_instruct = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
 
-    return device, df_model, df_state,
+    return device, df_model, df_state, cv_base, cv_sft, sft_speakers, cv_instruct
 
-device, df_model, df_state,
+device, df_model, df_state, cv_base, cv_sft, sft_speakers, cv_instruct = init_sys()
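One thing to double-check in load_models.py: snapshot_download writes the weights under f'{Config.MODEL_DOWNLOAD_DIR}/audios/...', while the CosyVoice constructors read from 'pretrained_models/...'. If the downloaded directories are the intended model paths, the wiring would look roughly like this (paths here are an assumption, not what the commit does):

    # Hypothetical: point CosyVoice at the directories snapshot_download just populated.
    base_dir = f'{Config.MODEL_DOWNLOAD_DIR}/audios'
    cv_base = CosyVoice(f'{base_dir}/CosyVoice-300M')
    cv_sft = CosyVoice(f'{base_dir}/CosyVoice-300M-SFT')
    cv_instruct = CosyVoice(f'{base_dir}/CosyVoice-300M-Instruct')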
tabs/audios/modules/CosyVoice ADDED
@@ -0,0 +1 @@
+Subproject commit c901a12789e0a9d8cec54c3caf1bc304533bdf82
tabs/audios/ui.py CHANGED
@@ -10,30 +10,19 @@ def audio_tab():
         with gr.Group():
             with gr.Group():
                 text = gr.Textbox(lines=5, label="Enter text")
-
-
-
-
-                )
-
-            with gr.Accordion('Voice Clone', open=True):
-                speaker_audio = gr.Audio(label="Upload Audio", type='numpy')
+                mode = gr.Radio(["SFT", "VC", "VC-CrossLingual", "Instruct"], label="Mode", value="SFT",) # automate with speech recognition pipeline
+                sft_speaker = gr.Radio(sft_speakers, label="Select speaker")
+            with gr.Accordion('Voice Clone', open=False):
+                speaker_audio = gr.Audio(label="Upload Audio", type='numpy', visible=False)
                 clear_speaker_audio = gr.Button(label="Clear Audio")
+
+            with gr.Accordion('Instruct', open=False):
+                voice_instructions = gr.Textbox(lines=5, label="Enter voice instructions", visible=False)
 
         with gr.Column():
             output_audio = gr.Audio(label="Output Audio", interactive=False, show_download_button=True)
             clear_output_audio = gr.Button(label="Clear Audio")
             generate_audio = gr.Button(label="Generate Audio")
-
-            with gr.Accordion('Advance Settings', open=True):
-                settings = [
-                    ('Alpha', 'tts_alpha', 'float', 0.0, 1.0, 0.3, 0.1,),
-                    ('Beta', 'tts_beta', 'float', 0.0, 1.0, 0.7, 0.1,),
-                    ('Diffusion Steps', 'tts_diffusion_steps', 'int', 1, 100, 10, 1,),
-                    ('Embedding Scale', 'tts_embedding_scale', 'int', 0, 10, 1, 1,),
-                ]
-                for label, key, type_, min_, max_, value, step in settings:
-                    globals()[key] = gr.Slider(label=label, minimum=min_, maximum=max_, value=value, step=step)
 
 
     # Events
@@ -41,9 +30,16 @@ def audio_tab():
     clear_speaker_audio.click(clear_audio, speaker_audio, speaker_audio)
     clear_output_audio.click(clear_audio, output_audio, output_audio)
 
+    # Mode
+    mode.change(
+        update_mode,
+        [mode, sft_speaker, speaker_audio, voice_instructions],
+        [sft_speaker, speaker_audio, voice_instructions]
+    )
+
     # Generate Audio
     generate_audio.click(
         gen_audio,
-        [text,
+        [text, mode, sft_speaker, speaker_audio, voice_instructions],
        [output_audio]
     )