darksakura committed on
Commit
989bf29
1 Parent(s): 7b04756

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +161 -71
app.py CHANGED
@@ -1,8 +1,5 @@
1
  # flake8: noqa: E402
2
-
3
- import sys, os
4
  import logging
5
-
6
  logging.getLogger("numba").setLevel(logging.WARNING)
7
  logging.getLogger("markdown_it").setLevel(logging.WARNING)
8
  logging.getLogger("urllib3").setLevel(logging.WARNING)
@@ -13,8 +10,13 @@ logging.basicConfig(
13
  )
14
 
15
  logger = logging.getLogger(__name__)
16
-
 
17
  import torch
 
 
 
 
18
  import argparse
19
  import commons
20
  import utils
@@ -24,9 +26,20 @@ from text import cleaned_text_to_sequence, get_bert
24
  from text.cleaner import clean_text
25
  import gradio as gr
26
  import webbrowser
27
- import numpy as np
28
-
29
  net_g = None
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  if sys.platform == "darwin" and torch.backends.mps.is_available():
32
  device = "mps"
@@ -34,6 +47,35 @@ if sys.platform == "darwin" and torch.backends.mps.is_available():
34
  else:
35
  device = "cuda"
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
  def get_text(text, language_str, hps):
39
  norm_text, phone, tone, word2ph = clean_text(text, language_str)
@@ -53,7 +95,7 @@ def get_text(text, language_str, hps):
53
  if language_str == "ZH":
54
  bert = bert
55
  ja_bert = torch.zeros(768, len(phone))
56
- elif language_str == "JP":
57
  ja_bert = bert
58
  bert = torch.zeros(1024, len(phone))
59
  else:
@@ -101,35 +143,77 @@ def infer(text, sdp_ratio, noise_scale, noise_scale_w, length_scale, sid, langua
101
  .numpy()
102
  )
103
  del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers
104
- torch.cuda.empty_cache()
105
  return audio
106
 
107
 
108
- def tts_fn(text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale, language):
109
- slices = text.split("|")
110
- audio_list = []
111
- with torch.no_grad():
112
- for slice in slices:
113
- audio = infer(slice, sdp_ratio=sdp_ratio, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=length_scale, sid=speaker, language=language)
114
- audio_list.append(audio)
115
- silence = np.zeros(hps.data.sampling_rate) # 生成1秒的静音
116
- audio_list.append(silence) # 将静音添加到列表中
117
- audio_concat = np.concatenate(audio_list)
118
- return "Success", (hps.data.sampling_rate, audio_concat)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
  if __name__ == "__main__":
121
  parser = argparse.ArgumentParser()
122
  parser.add_argument(
123
- "-m", "--model", default="./logs/hanser/G_43500.pth", help="path of your model"
124
  )
125
  parser.add_argument(
126
  "-c",
127
  "--config",
128
- default="./configs/config.json",
129
  help="path of your config file",
130
  )
131
  parser.add_argument(
132
- "--share", default=False, help="make link public", action="store_true"
133
  )
134
  parser.add_argument(
135
  "-d", "--debug", action="store_true", help="enable DEBUG-LEVEL log"
@@ -165,53 +249,59 @@ if __name__ == "__main__":
165
  speakers = list(speaker_ids.keys())
166
  languages = ["ZH", "JP"]
167
  with gr.Blocks() as app:
168
- with gr.Row():
169
- with gr.Column():
170
- gr.Markdown(value="""
171
- 🤖 【AI 乃木坂46】在线语音合成 Bert-Vits2 🤖\n
172
- 🎤 声音来源:乃木坂46 🎤\n
173
- ✅ 使用本模型请遵守中华人民共和国法律 ✅\n
174
- """)
175
- text = gr.TextArea(
176
- label="Text",
177
- placeholder="Input Text Here",
178
- value="大家好,我是秋元康,今天给大家看看我的女儿们",
179
- )
180
- speaker = gr.Dropdown(
181
- choices=speakers, value=speakers[0], label="Speaker"
182
- )
183
- sdp_ratio = gr.Slider(
184
- minimum=0, maximum=1, value=0.2, step=0.1, label="SDP Ratio"
185
- )
186
- noise_scale = gr.Slider(
187
- minimum=0.1, maximum=2, value=0.6, step=0.1, label="Noise Scale"
188
- )
189
- noise_scale_w = gr.Slider(
190
- minimum=0.1, maximum=2, value=0.8, step=0.1, label="Noise Scale W"
191
- )
192
- length_scale = gr.Slider(
193
- minimum=0.1, maximum=2, value=1, step=0.1, label="Length Scale"
194
- )
195
- language = gr.Dropdown(
196
- choices=languages, value=languages[0], label="Language"
197
- )
198
- btn = gr.Button("Generate 生成!", variant="primary")
199
- with gr.Column():
200
- text_output = gr.Textbox(label="Message")
201
- audio_output = gr.Audio(label="Output Audio")
202
-
203
- btn.click(
204
- tts_fn,
205
- inputs=[
206
- text,
207
- speaker,
208
- sdp_ratio,
209
- noise_scale,
210
- noise_scale_w,
211
- length_scale,
212
- language,
213
- ],
214
- outputs=[text_output, audio_output],
215
  )
216
-
217
- app.launch(show_error=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # flake8: noqa: E402
 
 
2
  import logging
 
3
  logging.getLogger("numba").setLevel(logging.WARNING)
4
  logging.getLogger("markdown_it").setLevel(logging.WARNING)
5
  logging.getLogger("urllib3").setLevel(logging.WARNING)
 
10
  )
11
 
12
  logger = logging.getLogger(__name__)
13
+ import datetime
14
+ import numpy as np
15
  import torch
16
+ import zipfile
17
+ import shutil
18
+ import sys, os
19
+ import json
20
  import argparse
21
  import commons
22
  import utils
 
26
  from text.cleaner import clean_text
27
  import gradio as gr
28
  import webbrowser
29
+ import re
30
+ from scipy.io.wavfile import write
31
  net_g = None
32
+ BandList = {
33
+ "乃木坂46":["AKIMOTO_MANATSU" ,"ENDO_SAKURA" ,"ETO_MISA" ,"FUKAGAWA_MAI" ,"HARUKA_KUROMI" ,"HASHIMOTO_NANAMI" ,"HAYAKAWA_SEIRA" ,"HIGUCHI_HINA" ,"HORI_MIONA" ,"HOSHINO_MINAMI" ,
34
+ "ICHINOSE_MIKU" ,"IKEDA_TERESA" ,"IKOMA_RINA" ,"IKUTA_ERIKA" ,"INOUE_NAGI" ,"INOUE_SAYURI" ,"IOKI_MAO" ,"ITO_JUNNA" ,"ITO_KARIN" ,"ITO_MARIKA" ,"ITO_RIRIA" ,"IWAMOTO_RENKA" ,
35
+ "KAKEHASHI_SAYAKA" ,"KAKI_HARUKA" ,"KANAGAWA_SAYA" ,"KAWAGO_HINA" ,"KAWAMURA_MAHIRO" ,"KAWASAKI_SAKURA" ,"KITAGAWA_YURI" ,"KITANO_HINAKO" ,"KUBO_SHIORI" ,"MATSUMURA_SAYURI" ,
36
+ "MIYU_MATSUO" ,"MUKAI_HAZUKI" ,"NAKADA_KANA" ,"NAKAMOTO_HIMEKA" ,"NAKAMURA_RENO" ,"NAKANISHI_ARUNO" ,"NAO_YUMIKI" ,"NISHINO_NANASE" ,"NOUJO_AMI" ,"OGAWA_AYA" ,"OKAMOTO_HINA" ,
37
+ "OKUDA_IROHA" ,"OZONO_MOMOKO" ,"RIKA_SATO" ,"RUNA_HAYASHI" ,"SAGARA_IORI" ,"SAITO_ASUKA" ,"SAITO_CHIHARU" ,"SAKAGUCHI_TAMAMI" ,"SAKURAI_REIKA" ,"SASAKI_KOTOKO" ,"SATO_KAEDE" ,
38
+ "SATO_YUURI" ,"SEIMIYA_REI" ,"SHIBATA_YUNA" ,"SHINUCHI_MAI" ,"SHIRAISHI_MAI" ,"SUGAWARA_SATSUKI" ,"SUZUKI_AYANE" ,"TAKAYAMA_KAZUMI" ,"TAMURA_MAYU" ,"TERADA_RANZE",
39
+ "TOMISATO_NAO" ,"TSUTSUI_AYAME" ,"UMEZAWA_MINAMI" ,"WADA_MAAYA" ,"WAKATSUKI_YUMI" ,"WATANABE_MIRIA" ,"YAKUBO_MIO" ,"YAMASHITA_MIZUKI" ,"YAMAZAKI_RENA" ,"YODA_YUUKI" ,"YOSHIDA_AYANO_CHRISTIE"
40
+
41
+ ],
42
+ }
43
 
44
  if sys.platform == "darwin" and torch.backends.mps.is_available():
45
  device = "mps"
 
47
  else:
48
  device = "cuda"
49
 
50
def is_japanese(string):
    """Return True if the string contains any kana character.

    Only the hiragana/katakana range (U+3041..U+30FE, exclusive bounds as in
    the original comparison) is checked, so kanji-only text — which Chinese
    shares — does not count as Japanese.
    """
    return any(0x3040 < ord(ch) < 0x30FF for ch in string)
55
+
56
def extrac(text):
    """Split raw text into short sentence chunks for TTS synthesis.

    Strips <...> tag-like spans, splits on newlines, removes spaces, drops
    lines of length <= 1, and breaks lines longer than 20 characters at
    full-width sentence terminators (。 or !), re-appending 。 to each piece.
    Returns a list of non-empty sentence strings.
    """
    # Drop tag-like spans; the caller normalises all bracket styles to <...>.
    text = re.sub("<[^>]*>", "", text)
    final_list = []
    for line in re.split(r"\n", text):
        line = line.replace("\n", "").replace(" ", "")
        if len(line) <= 1:
            continue  # skip empty / single-character lines
        # Current maximum length of a single sentence: 20 characters.
        if len(line) > 20:
            # Long line: break on full-width sentence terminators.
            # (Distinct loop variable — the original shadowed the outer `i`.)
            for part in re.split(r"。|!", line):
                if len(part) > 1:
                    final_list.append(part + "。")
        else:
            final_list.append(line)
    return [s for s in final_list if s != ""]
79
 
80
  def get_text(text, language_str, hps):
81
  norm_text, phone, tone, word2ph = clean_text(text, language_str)
 
95
  if language_str == "ZH":
96
  bert = bert
97
  ja_bert = torch.zeros(768, len(phone))
98
+ elif language_str == "JA":
99
  ja_bert = bert
100
  bert = torch.zeros(1024, len(phone))
101
  else:
 
143
  .numpy()
144
  )
145
  del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers
 
146
  return audio
147
 
148
 
149
def tts_fn(
    text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale, LongSentence
):
    """Synthesize speech for `text` with the given speaker and prosody settings.

    When LongSentence is False the whole text is synthesized in a single
    infer() call; otherwise the text is cleaned, split into short sentences
    by extrac(), synthesized sentence by sentence, and the per-sentence audio
    is concatenated.  Returns a Gradio audio tuple (sampling_rate, waveform).
    """
    # NOTE(review): infer() is given "JP" here, while get_text() branches on
    # "JA" — confirm which language code the text frontend actually expects.
    if not LongSentence:
        with torch.no_grad():
            audio = infer(
                text,
                sdp_ratio=sdp_ratio,
                noise_scale=noise_scale,
                noise_scale_w=noise_scale_w,
                length_scale=length_scale,
                sid=speaker,
                language="JP" if is_japanese(text) else "ZH",
            )
            torch.cuda.empty_cache()
        return (hps.data.sampling_rate, audio)
    else:
        # Normalise every bracket style to <...> so extrac() can strip them,
        # and drop full-width quotation marks.
        for opener in ['【', '[', '(', '(']:
            text = text.replace(opener, '<')
        for closer in ['】', ']', ')', ')']:
            text = text.replace(closer, '>')
        final_list = extrac(text.replace('“', '').replace('”', ''))
        audio_fin = []
        for sentence in final_list:
            with torch.no_grad():
                audio = infer(
                    sentence,
                    sdp_ratio=sdp_ratio,
                    noise_scale=noise_scale,
                    noise_scale_w=noise_scale_w,
                    length_scale=length_scale,
                    sid=speaker,
                    # Bug fix: detect the language of the sentence actually
                    # being synthesized, not of the whole input text.
                    language="JP" if is_japanese(sentence) else "ZH",
                )
                audio_fin.append(audio)
        return (hps.data.sampling_rate, np.concatenate(audio_fin))
189
+
190
def split_into_sentences(text):
    """Split text into sentences at Chinese terminal punctuation (。!?…) or newlines.

    The terminator is kept at the end of each sentence.  Pieces that are
    empty strings are dropped, but pieces that become empty only after
    stripping (e.g. a bare newline) are kept as "" — matching the original
    filter on the unstripped piece.
    """
    result = []
    for piece in re.split(r"(?<=[。!?…\n])", text):
        if piece:
            result.append(piece.strip())
    return result
194
+
195
+
196
def seconds_to_ass_time(seconds):
    """Convert a duration in seconds to the ASS subtitle time format H:MM:SS.cc.

    The fractional part is rendered as centiseconds, as the ASS format expects.
    """
    hours = int(seconds / 3600)
    minutes = int((seconds % 3600) / 60)
    whole = int(seconds)
    # Bug fix: read the fractional part BEFORE truncating to whole seconds —
    # the original overwrote `seconds` first, so milliseconds was always 0.
    milliseconds = int((seconds - whole) * 1000)
    secs = whole % 60
    return "{:01d}:{:02d}:{:02d}.{:02d}".format(
        hours, minutes, secs, int(milliseconds / 10)
    )
203
 
204
  if __name__ == "__main__":
205
  parser = argparse.ArgumentParser()
206
  parser.add_argument(
207
+ "-m", "--model", default="./Nogizaka46/vits2.pth", help="path of your model"
208
  )
209
  parser.add_argument(
210
  "-c",
211
  "--config",
212
+ default="./Nogizaka46/config.json",
213
  help="path of your config file",
214
  )
215
  parser.add_argument(
216
+ "--share", default=True, help="make link public", action="store_true"
217
  )
218
  parser.add_argument(
219
  "-d", "--debug", action="store_true", help="enable DEBUG-LEVEL log"
 
249
  speakers = list(speaker_ids.keys())
250
  languages = ["ZH", "JP"]
251
  with gr.Blocks() as app:
252
+ gr.Markdown(
253
+ f"【乃木坂46全员TTS】,使用本模型请严格遵守法律法规!\n 发布二创作品请标注本项目网址<a href='https://sovits4-dev.nogizaka46.cc/'>sovits4-dev.nogizaka46.cc</a>\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
  )
255
+ for band in BandList:
256
+ with gr.TabItem(band):
257
+ for name in BandList[band]:
258
+ with gr.TabItem(name):
259
+ with gr.Row():
260
+ #with gr.Column():
261
+ #with gr.Row():
262
+ #gr.Markdown(
263
+ #'<div align="center">'
264
+ #f'<img style="width:auto;height:400px;" src="file/image/SAITO_ASUKA.png">'
265
+ #'</div>'
266
+ #)
267
+
268
+ with gr.Column():
269
+
270
+ text = gr.TextArea(
271
+ label="输入纯日语或者中文",
272
+ placeholder="输入纯日语或者中文",
273
+ value="純粋な日本語または中国語を入力してください。",
274
+ )
275
+ btn = gr.Button("点击生成", variant="primary")
276
+ audio_output = gr.Audio(label="Output Audio")
277
+ LongSentence = gr.Checkbox(value=True, label="Generate LongSentence")
278
+ with gr.Accordion(label="TTS设定", open=True):
279
+ sdp_ratio = gr.Slider(
280
+ minimum=0, maximum=1, value=0.2, step=0.01, label="SDP/DP混合比"
281
+ )
282
+ noise_scale = gr.Slider(
283
+ minimum=0.1, maximum=2, value=0.6, step=0.01, label="感情调节"
284
+ )
285
+ noise_scale_w = gr.Slider(
286
+ minimum=0.1, maximum=2, value=0.8, step=0.01, label="音素长度"
287
+ )
288
+ length_scale = gr.Slider(
289
+ minimum=0.1, maximum=2, value=1, step=0.01, label="生成长度"
290
+ )
291
+ speaker = gr.Dropdown(
292
+ choices=speakers, value=name, label="说话人"
293
+ )
294
+ btn.click(
295
+ tts_fn,
296
+ inputs=[
297
+ text,
298
+ speaker,
299
+ sdp_ratio,
300
+ noise_scale,
301
+ noise_scale_w,
302
+ length_scale,
303
+ LongSentence,
304
+ ],
305
+ outputs=[audio_output],
306
+ )
307
+ app.launch()