Spaces:
Paused
Paused
darksakura
committed on
Commit
•
989bf29
1
Parent(s):
7b04756
Upload app.py
Browse files
app.py
CHANGED
@@ -1,8 +1,5 @@
|
|
1 |
# flake8: noqa: E402
|
2 |
-
|
3 |
-
import sys, os
|
4 |
import logging
|
5 |
-
|
6 |
logging.getLogger("numba").setLevel(logging.WARNING)
|
7 |
logging.getLogger("markdown_it").setLevel(logging.WARNING)
|
8 |
logging.getLogger("urllib3").setLevel(logging.WARNING)
|
@@ -13,8 +10,13 @@ logging.basicConfig(
|
|
13 |
)
|
14 |
|
15 |
logger = logging.getLogger(__name__)
|
16 |
-
|
|
|
17 |
import torch
|
|
|
|
|
|
|
|
|
18 |
import argparse
|
19 |
import commons
|
20 |
import utils
|
@@ -24,9 +26,20 @@ from text import cleaned_text_to_sequence, get_bert
|
|
24 |
from text.cleaner import clean_text
|
25 |
import gradio as gr
|
26 |
import webbrowser
|
27 |
-
import
|
28 |
-
|
29 |
net_g = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
if sys.platform == "darwin" and torch.backends.mps.is_available():
|
32 |
device = "mps"
|
@@ -34,6 +47,35 @@ if sys.platform == "darwin" and torch.backends.mps.is_available():
|
|
34 |
else:
|
35 |
device = "cuda"
|
36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
|
38 |
def get_text(text, language_str, hps):
|
39 |
norm_text, phone, tone, word2ph = clean_text(text, language_str)
|
@@ -53,7 +95,7 @@ def get_text(text, language_str, hps):
|
|
53 |
if language_str == "ZH":
|
54 |
bert = bert
|
55 |
ja_bert = torch.zeros(768, len(phone))
|
56 |
-
elif language_str == "
|
57 |
ja_bert = bert
|
58 |
bert = torch.zeros(1024, len(phone))
|
59 |
else:
|
@@ -101,35 +143,77 @@ def infer(text, sdp_ratio, noise_scale, noise_scale_w, length_scale, sid, langua
|
|
101 |
.numpy()
|
102 |
)
|
103 |
del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers
|
104 |
-
torch.cuda.empty_cache()
|
105 |
return audio
|
106 |
|
107 |
|
108 |
-
def tts_fn(
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
audio = infer(
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
|
120 |
if __name__ == "__main__":
|
121 |
parser = argparse.ArgumentParser()
|
122 |
parser.add_argument(
|
123 |
-
"-m", "--model", default="./
|
124 |
)
|
125 |
parser.add_argument(
|
126 |
"-c",
|
127 |
"--config",
|
128 |
-
default="./
|
129 |
help="path of your config file",
|
130 |
)
|
131 |
parser.add_argument(
|
132 |
-
"--share", default=
|
133 |
)
|
134 |
parser.add_argument(
|
135 |
"-d", "--debug", action="store_true", help="enable DEBUG-LEVEL log"
|
@@ -165,53 +249,59 @@ if __name__ == "__main__":
|
|
165 |
speakers = list(speaker_ids.keys())
|
166 |
languages = ["ZH", "JP"]
|
167 |
with gr.Blocks() as app:
|
168 |
-
|
169 |
-
|
170 |
-
gr.Markdown(value="""
|
171 |
-
🤖 【AI 乃木坂46】在线语音合成 Bert-Vits2 🤖\n
|
172 |
-
🎤 声音来源:乃木坂46 🎤\n
|
173 |
-
✅ 使用本模型请遵守中华人民共和国法律 ✅\n
|
174 |
-
""")
|
175 |
-
text = gr.TextArea(
|
176 |
-
label="Text",
|
177 |
-
placeholder="Input Text Here",
|
178 |
-
value="大家好,我是秋元康,今天给大家看看我的女儿们",
|
179 |
-
)
|
180 |
-
speaker = gr.Dropdown(
|
181 |
-
choices=speakers, value=speakers[0], label="Speaker"
|
182 |
-
)
|
183 |
-
sdp_ratio = gr.Slider(
|
184 |
-
minimum=0, maximum=1, value=0.2, step=0.1, label="SDP Ratio"
|
185 |
-
)
|
186 |
-
noise_scale = gr.Slider(
|
187 |
-
minimum=0.1, maximum=2, value=0.6, step=0.1, label="Noise Scale"
|
188 |
-
)
|
189 |
-
noise_scale_w = gr.Slider(
|
190 |
-
minimum=0.1, maximum=2, value=0.8, step=0.1, label="Noise Scale W"
|
191 |
-
)
|
192 |
-
length_scale = gr.Slider(
|
193 |
-
minimum=0.1, maximum=2, value=1, step=0.1, label="Length Scale"
|
194 |
-
)
|
195 |
-
language = gr.Dropdown(
|
196 |
-
choices=languages, value=languages[0], label="Language"
|
197 |
-
)
|
198 |
-
btn = gr.Button("Generate 生成!", variant="primary")
|
199 |
-
with gr.Column():
|
200 |
-
text_output = gr.Textbox(label="Message")
|
201 |
-
audio_output = gr.Audio(label="Output Audio")
|
202 |
-
|
203 |
-
btn.click(
|
204 |
-
tts_fn,
|
205 |
-
inputs=[
|
206 |
-
text,
|
207 |
-
speaker,
|
208 |
-
sdp_ratio,
|
209 |
-
noise_scale,
|
210 |
-
noise_scale_w,
|
211 |
-
length_scale,
|
212 |
-
language,
|
213 |
-
],
|
214 |
-
outputs=[text_output, audio_output],
|
215 |
)
|
216 |
-
|
217 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
# flake8: noqa: E402
|
|
|
|
|
2 |
import logging
|
|
|
3 |
logging.getLogger("numba").setLevel(logging.WARNING)
|
4 |
logging.getLogger("markdown_it").setLevel(logging.WARNING)
|
5 |
logging.getLogger("urllib3").setLevel(logging.WARNING)
|
|
|
10 |
)
|
11 |
|
12 |
logger = logging.getLogger(__name__)
|
13 |
+
import datetime
|
14 |
+
import numpy as np
|
15 |
import torch
|
16 |
+
import zipfile
|
17 |
+
import shutil
|
18 |
+
import sys, os
|
19 |
+
import json
|
20 |
import argparse
|
21 |
import commons
|
22 |
import utils
|
|
|
26 |
from text.cleaner import clean_text
|
27 |
import gradio as gr
|
28 |
import webbrowser
|
29 |
+
import re
|
30 |
+
from scipy.io.wavfile import write
|
31 |
net_g = None
|
32 |
+
# Maps a group name to the ordered list of member identifiers. The UI code
# iterates this mapping to create one tab per group and one nested tab per
# member, and uses the member identifier as the default speaker selection.
BandList = {
    "乃木坂46": [
        "AKIMOTO_MANATSU",
        "ENDO_SAKURA",
        "ETO_MISA",
        "FUKAGAWA_MAI",
        "HARUKA_KUROMI",
        "HASHIMOTO_NANAMI",
        "HAYAKAWA_SEIRA",
        "HIGUCHI_HINA",
        "HORI_MIONA",
        "HOSHINO_MINAMI",
        "ICHINOSE_MIKU",
        "IKEDA_TERESA",
        "IKOMA_RINA",
        "IKUTA_ERIKA",
        "INOUE_NAGI",
        "INOUE_SAYURI",
        "IOKI_MAO",
        "ITO_JUNNA",
        "ITO_KARIN",
        "ITO_MARIKA",
        "ITO_RIRIA",
        "IWAMOTO_RENKA",
        "KAKEHASHI_SAYAKA",
        "KAKI_HARUKA",
        "KANAGAWA_SAYA",
        "KAWAGO_HINA",
        "KAWAMURA_MAHIRO",
        "KAWASAKI_SAKURA",
        "KITAGAWA_YURI",
        "KITANO_HINAKO",
        "KUBO_SHIORI",
        "MATSUMURA_SAYURI",
        "MIYU_MATSUO",
        "MUKAI_HAZUKI",
        "NAKADA_KANA",
        "NAKAMOTO_HIMEKA",
        "NAKAMURA_RENO",
        "NAKANISHI_ARUNO",
        "NAO_YUMIKI",
        "NISHINO_NANASE",
        "NOUJO_AMI",
        "OGAWA_AYA",
        "OKAMOTO_HINA",
        "OKUDA_IROHA",
        "OZONO_MOMOKO",
        "RIKA_SATO",
        "RUNA_HAYASHI",
        "SAGARA_IORI",
        "SAITO_ASUKA",
        "SAITO_CHIHARU",
        "SAKAGUCHI_TAMAMI",
        "SAKURAI_REIKA",
        "SASAKI_KOTOKO",
        "SATO_KAEDE",
        "SATO_YUURI",
        "SEIMIYA_REI",
        "SHIBATA_YUNA",
        "SHINUCHI_MAI",
        "SHIRAISHI_MAI",
        "SUGAWARA_SATSUKI",
        "SUZUKI_AYANE",
        "TAKAYAMA_KAZUMI",
        "TAMURA_MAYU",
        "TERADA_RANZE",
        "TOMISATO_NAO",
        "TSUTSUI_AYAME",
        "UMEZAWA_MINAMI",
        "WADA_MAAYA",
        "WAKATSUKI_YUMI",
        "WATANABE_MIRIA",
        "YAKUBO_MIO",
        "YAMASHITA_MIZUKI",
        "YAMAZAKI_RENA",
        "YODA_YUUKI",
        "YOSHIDA_AYANO_CHRISTIE",
    ],
}
|
43 |
|
44 |
if sys.platform == "darwin" and torch.backends.mps.is_available():
|
45 |
device = "mps"
|
|
|
47 |
else:
|
48 |
device = "cuda"
|
49 |
|
50 |
+
def is_japanese(string):
    """Report whether ``string`` contains at least one kana character.

    A character counts when its code point lies strictly between U+3040 and
    U+30FF, a span covering the Unicode Hiragana and Katakana blocks. Kanji
    are not considered, so kanji-only text yields ``False``.
    """
    return any(0x3040 < ord(char) < 0x30FF for char in string)
|
55 |
+
|
56 |
+
def extrac(text, max_len=20):
    """Split raw text into short sentences for piecewise synthesis.

    Anything between ``<`` and ``>`` is removed first. The remainder is split
    on newlines and every space is dropped from each line. Lines of one
    character or fewer are discarded. Lines longer than ``max_len`` characters
    are split again on sentence-ending punctuation, and every resulting piece
    longer than one character is emitted with a trailing ``。``. Shorter lines
    are emitted unchanged.

    Args:
        text: Input text, possibly containing ``<...>`` annotations.
        max_len: Longest line (in characters) that is kept as one sentence.

    Returns:
        A list of sentence strings in their original order. Every entry has
        at least two characters, so the list never contains empty strings.
    """
    text = re.sub("<[^>]*>", "", text)
    sentences = []
    for raw_line in text.split("\n"):
        line = raw_line.replace(" ", "")
        if len(line) <= 1:
            continue
        if len(line) <= max_len:
            sentences.append(line)
            continue
        # Split on the ideographic full stop and on the exclamation mark in
        # both its ASCII and full-width (U+FF01) forms.
        for piece in re.split(r"[。!\uff01]", line):
            if len(piece) > 1:
                sentences.append(piece + "。")
    return sentences
|
79 |
|
80 |
def get_text(text, language_str, hps):
|
81 |
norm_text, phone, tone, word2ph = clean_text(text, language_str)
|
|
|
95 |
if language_str == "ZH":
|
96 |
bert = bert
|
97 |
ja_bert = torch.zeros(768, len(phone))
|
98 |
+
elif language_str == "JA":
|
99 |
ja_bert = bert
|
100 |
bert = torch.zeros(1024, len(phone))
|
101 |
else:
|
|
|
143 |
.numpy()
|
144 |
)
|
145 |
del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers
|
|
|
146 |
return audio
|
147 |
|
148 |
|
149 |
+
def tts_fn(
    text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale, LongSentence
):
    """Synthesize speech for ``text`` and return it in Gradio audio form.

    The language passed to ``infer`` is ``"JP"`` when ``is_japanese(text)`` is
    true and ``"ZH"`` otherwise; it is decided once for the whole input.

    Args:
        text: Text to synthesize.
        speaker: Speaker identifier forwarded to ``infer`` as ``sid``.
        sdp_ratio: Forwarded to ``infer``.
        noise_scale: Forwarded to ``infer``.
        noise_scale_w: Forwarded to ``infer``.
        length_scale: Forwarded to ``infer``.
        LongSentence: When false, the text is synthesized in a single pass.
            When true, bracketed spans are stripped, the text is split into
            sentences with ``extrac`` and the per-sentence audio is
            concatenated.

    Returns:
        A ``(sampling_rate, audio)`` tuple, with the rate read from the
        module-level ``hps``.

    Raises:
        ValueError: In long-sentence mode, when no sentence is left after
            cleaning and splitting (previously an opaque ``np.concatenate``
            error of the same type).
    """
    # NOTE(review): get_text compares language_str against "JA" while this
    # function sends "JP" - confirm which code clean_text/get_text expect.
    language = "JP" if is_japanese(text) else "ZH"

    def _synthesize(segment):
        # One gradient-free inference call with the shared settings.
        with torch.no_grad():
            return infer(
                segment,
                sdp_ratio=sdp_ratio,
                noise_scale=noise_scale,
                noise_scale_w=noise_scale_w,
                length_scale=length_scale,
                sid=speaker,
                language=language,
            )

    if not LongSentence:
        audio = _synthesize(text)
        torch.cuda.empty_cache()
        return (hps.data.sampling_rate, audio)

    # Normalize every bracket style to <...> so extrac() removes the bracketed
    # span. Full-width parentheses (U+FF08 / U+FF09) are included as well.
    bracket_table = str.maketrans(
        {
            "【": "<",
            "[": "<",
            "(": "<",
            "\uff08": "<",
            "】": ">",
            "]": ">",
            ")": ">",
            "\uff09": ">",
        }
    )
    cleaned = text.translate(bracket_table).replace("“", "").replace("”", "")
    sentences = extrac(cleaned)
    if not sentences:
        raise ValueError("no sentence left to synthesize after cleaning the input text")

    chunks = [_synthesize(sentence) for sentence in sentences]
    # Release cached GPU memory here too, matching the single-pass branch.
    torch.cuda.empty_cache()
    return (hps.data.sampling_rate, np.concatenate(chunks))
|
189 |
+
|
190 |
+
def split_into_sentences(text):
    """Split text into sentences based on Chinese punctuation.

    The text is cut immediately after each ``。``, ``!``, ``?``, ``…`` or
    newline (the full-width forms U+FF01 and U+FF1F are also accepted), so
    every sentence keeps its terminating punctuation. Surrounding whitespace
    is stripped from each piece, and pieces that are empty after stripping are
    dropped.

    Args:
        text: The text to split.

    Returns:
        A list of non-empty, stripped sentence strings in original order.
    """
    pieces = re.split(r"(?<=[。!?\uff01\uff1f…\n])", text)
    # Strip first, then filter: a whitespace-only piece (e.g. a lone newline)
    # must not survive as an empty string.
    stripped = (piece.strip() for piece in pieces)
    return [sentence for sentence in stripped if sentence]
|
194 |
+
|
195 |
+
|
196 |
+
def seconds_to_ass_time(seconds):
    """Convert a number of seconds to the ASS time format ``H:MM:SS.cc``.

    The value is rounded to the nearest centisecond before it is broken into
    fields, so the fractional part is preserved and a round-up carries
    correctly into seconds, minutes and hours.

    Args:
        seconds: Non-negative duration in seconds (int or float).

    Returns:
        The formatted time string, e.g. ``"0:00:01.50"`` for ``1.5``.
    """
    # Work in whole centiseconds: truncating `seconds` to an int first would
    # discard the fraction and always yield ".00".
    total_centiseconds = int(round(seconds * 100))
    total_seconds, centiseconds = divmod(total_centiseconds, 100)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return "{:01d}:{:02d}:{:02d}.{:02d}".format(hours, minutes, secs, centiseconds)
|
203 |
|
204 |
if __name__ == "__main__":
|
205 |
parser = argparse.ArgumentParser()
|
206 |
parser.add_argument(
|
207 |
+
"-m", "--model", default="./Nogizaka46/vits2.pth", help="path of your model"
|
208 |
)
|
209 |
parser.add_argument(
|
210 |
"-c",
|
211 |
"--config",
|
212 |
+
default="./Nogizaka46/config.json",
|
213 |
help="path of your config file",
|
214 |
)
|
215 |
parser.add_argument(
|
216 |
+
"--share", default=True, help="make link public", action="store_true"
|
217 |
)
|
218 |
parser.add_argument(
|
219 |
"-d", "--debug", action="store_true", help="enable DEBUG-LEVEL log"
|
|
|
249 |
speakers = list(speaker_ids.keys())
|
250 |
languages = ["ZH", "JP"]
|
251 |
with gr.Blocks() as app:
|
252 |
+
gr.Markdown(
|
253 |
+
f"【乃木坂46全员TTS】,使用本模型请严格遵守法律法规!\n 发布二创作品请标注本项目网址<a href='https://sovits4-dev.nogizaka46.cc/'>sovits4-dev.nogizaka46.cc</a>\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
254 |
)
|
255 |
+
for band in BandList:
|
256 |
+
with gr.TabItem(band):
|
257 |
+
for name in BandList[band]:
|
258 |
+
with gr.TabItem(name):
|
259 |
+
with gr.Row():
|
260 |
+
#with gr.Column():
|
261 |
+
#with gr.Row():
|
262 |
+
#gr.Markdown(
|
263 |
+
#'<div align="center">'
|
264 |
+
#f'<img style="width:auto;height:400px;" src="file/image/SAITO_ASUKA.png">'
|
265 |
+
#'</div>'
|
266 |
+
#)
|
267 |
+
|
268 |
+
with gr.Column():
|
269 |
+
|
270 |
+
text = gr.TextArea(
|
271 |
+
label="输入纯日语或者中文",
|
272 |
+
placeholder="输入纯日语或者中文",
|
273 |
+
value="純粋な日本語または中国語を入力してください。",
|
274 |
+
)
|
275 |
+
btn = gr.Button("点击生成", variant="primary")
|
276 |
+
audio_output = gr.Audio(label="Output Audio")
|
277 |
+
LongSentence = gr.Checkbox(value=True, label="Generate LongSentence")
|
278 |
+
with gr.Accordion(label="TTS设定", open=True):
|
279 |
+
sdp_ratio = gr.Slider(
|
280 |
+
minimum=0, maximum=1, value=0.2, step=0.01, label="SDP/DP混合比"
|
281 |
+
)
|
282 |
+
noise_scale = gr.Slider(
|
283 |
+
minimum=0.1, maximum=2, value=0.6, step=0.01, label="感情调节"
|
284 |
+
)
|
285 |
+
noise_scale_w = gr.Slider(
|
286 |
+
minimum=0.1, maximum=2, value=0.8, step=0.01, label="音素长度"
|
287 |
+
)
|
288 |
+
length_scale = gr.Slider(
|
289 |
+
minimum=0.1, maximum=2, value=1, step=0.01, label="生成长度"
|
290 |
+
)
|
291 |
+
speaker = gr.Dropdown(
|
292 |
+
choices=speakers, value=name, label="说话人"
|
293 |
+
)
|
294 |
+
btn.click(
|
295 |
+
tts_fn,
|
296 |
+
inputs=[
|
297 |
+
text,
|
298 |
+
speaker,
|
299 |
+
sdp_ratio,
|
300 |
+
noise_scale,
|
301 |
+
noise_scale_w,
|
302 |
+
length_scale,
|
303 |
+
LongSentence,
|
304 |
+
],
|
305 |
+
outputs=[audio_output],
|
306 |
+
)
|
307 |
+
app.launch()
|