Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -3,6 +3,9 @@ import json
|
|
3 |
import os
|
4 |
import re
|
5 |
import tempfile
|
|
|
|
|
|
|
6 |
import librosa
|
7 |
import numpy as np
|
8 |
import torch
|
@@ -42,7 +45,6 @@ gr.Audio.postprocess = audio_postprocess
|
|
42 |
|
43 |
limitation = os.getenv("SYSTEM") == "spaces" # limit text and audio length in huggingface spaces
|
44 |
max_len = 150
|
45 |
-
empty_audio = np.zeros(22050)
|
46 |
languages = ['日本語', '简体中文', 'English']
|
47 |
characters = ['0:特别周', '1:无声铃鹿', '2:东海帝王', '3:丸善斯基',
|
48 |
'4:富士奇迹', '5:小栗帽', '6:黄金船', '7:伏特加',
|
@@ -75,41 +77,40 @@ def show_memory_info(hint):
|
|
75 |
|
76 |
|
77 |
def get_text(text, hps):
|
78 |
-
text_norm = text_to_sequence(text, hps.data.text_cleaners)
|
79 |
if hps.data.add_blank:
|
80 |
text_norm = commons.intersperse(text_norm, 0)
|
81 |
text_norm = torch.LongTensor(text_norm)
|
82 |
return text_norm
|
83 |
|
84 |
-
|
85 |
hps = utils.get_hparams_from_file("./configs/uma87.json")
|
86 |
-
net_g = SynthesizerTrn(
|
87 |
-
len(symbols),
|
88 |
hps.data.filter_length // 2 + 1,
|
89 |
hps.train.segment_size // hps.data.hop_length,
|
90 |
n_speakers=hps.data.n_speakers,
|
91 |
**hps.model)
|
92 |
_ = net_g.eval()
|
93 |
|
94 |
-
_ = utils.load_checkpoint("pretrained_models/G_1153000.pth", net_g
|
95 |
|
96 |
def infer(text_raw, character, language, duration, noise_scale, noise_scale_w):
|
97 |
# check language & character parameters
|
98 |
if language not in languages:
|
99 |
print("Error: No such language\n")
|
100 |
-
return "Error: No such language",
|
101 |
if character not in characters:
|
102 |
print("Error: No such character\n")
|
103 |
-
return "Error: No such character",
|
104 |
# check text length
|
105 |
if limitation:
|
106 |
text_len = len(re.sub("\[([A-Z]{2})\]", "", text_raw))
|
107 |
if text_len > max_len:
|
108 |
print(f"Refused: Text too long ({text_len}).")
|
109 |
-
return "Error: Text is too long",
|
110 |
if text_len == 0:
|
111 |
print("Refused: Text length is zero.")
|
112 |
-
return "Error: Please input text!",
|
113 |
if language == '日本語':
|
114 |
text = text_raw
|
115 |
elif language == '简体中文':
|
@@ -121,11 +122,10 @@ def infer(text_raw, character, language, duration, noise_scale, noise_scale_w):
|
|
121 |
with torch.no_grad():
|
122 |
x_tst = stn_tst.unsqueeze(0)
|
123 |
x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
|
124 |
-
sid = torch.LongTensor([
|
125 |
-
audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w,
|
126 |
-
length_scale=duration)[0][0, 0].data.cpu().float().numpy()
|
127 |
currentDateAndTime = datetime.now()
|
128 |
-
print(f"
|
129 |
if language != '日本語':
|
130 |
print(f"translate from {language}: {text_raw}")
|
131 |
show_memory_info(str(currentDateAndTime) + " infer调用后")
|
@@ -160,10 +160,8 @@ if __name__ == "__main__":
|
|
160 |
"This synthesizer is created based on [VITS](https://arxiv.org/abs/2106.06103) model, trained on voice data extracted from mobile game Umamusume Pretty Derby \n\n"
|
161 |
"这个合成器是基于VITS文本到语音模型,在从手游《賽馬娘:Pretty Derby》解包的语音数据上训练得到。\n\n"
|
162 |
"[introduction video / 模型介绍视频](https://www.bilibili.com/video/BV1T84y1e7p5/?vd_source=6d5c00c796eff1cbbe25f1ae722c2f9f#reply607277701)\n\n"
|
163 |
-
"
|
164 |
-
"
|
165 |
-
"Runtime Error: Memory Limit Exceeded 问题仍然没有解决。\n\n"
|
166 |
-
"作为备用选项,建议您复制该空间至私人空间运行或打开[Google Colab](https://colab.research.google.com/drive/1J2Vm5dczTF99ckyNLXV0K-hQTxLwEaj5?usp=sharing)在线运行。\n\n"
|
167 |
"If your input language is not Japanese, it will be translated to Japanese by Google translator, but accuracy is not guaranteed.\n\n"
|
168 |
"如果您的输入语言不是日语,则会由谷歌翻译自动翻译为日语,但是准确性不能保证。\n\n"
|
169 |
)
|
|
|
3 |
import os
|
4 |
import re
|
5 |
import tempfile
|
6 |
+
import logging
|
7 |
+
logging.getLogger('numba').setLevel(logging.WARNING)
|
8 |
+
import ONNXVITS_infer
|
9 |
import librosa
|
10 |
import numpy as np
|
11 |
import torch
|
|
|
45 |
|
46 |
limitation = os.getenv("SYSTEM") == "spaces" # limit text and audio length in huggingface spaces
|
47 |
max_len = 150
|
|
|
48 |
languages = ['日本語', '简体中文', 'English']
|
49 |
characters = ['0:特别周', '1:无声铃鹿', '2:东海帝王', '3:丸善斯基',
|
50 |
'4:富士奇迹', '5:小栗帽', '6:黄金船', '7:伏特加',
|
|
|
77 |
|
78 |
|
79 |
def get_text(text, hps):
|
80 |
+
text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners)
|
81 |
if hps.data.add_blank:
|
82 |
text_norm = commons.intersperse(text_norm, 0)
|
83 |
text_norm = torch.LongTensor(text_norm)
|
84 |
return text_norm
|
85 |
|
|
|
86 |
hps = utils.get_hparams_from_file("./configs/uma87.json")
|
87 |
+
net_g = ONNXVITS_infer.SynthesizerTrn(
|
88 |
+
len(hps.symbols),
|
89 |
hps.data.filter_length // 2 + 1,
|
90 |
hps.train.segment_size // hps.data.hop_length,
|
91 |
n_speakers=hps.data.n_speakers,
|
92 |
**hps.model)
|
93 |
_ = net_g.eval()
|
94 |
|
95 |
+
_ = utils.load_checkpoint("pretrained_models/G_1153000.pth", net_g)
|
96 |
|
97 |
def infer(text_raw, character, language, duration, noise_scale, noise_scale_w):
|
98 |
# check language & character parameters
|
99 |
if language not in languages:
|
100 |
print("Error: No such language\n")
|
101 |
+
return "Error: No such language", None
|
102 |
if character not in characters:
|
103 |
print("Error: No such character\n")
|
104 |
+
return "Error: No such character", None
|
105 |
# check text length
|
106 |
if limitation:
|
107 |
text_len = len(re.sub("\[([A-Z]{2})\]", "", text_raw))
|
108 |
if text_len > max_len:
|
109 |
print(f"Refused: Text too long ({text_len}).")
|
110 |
+
return "Error: Text is too long", None
|
111 |
if text_len == 0:
|
112 |
print("Refused: Text length is zero.")
|
113 |
+
return "Error: Please input text!", None
|
114 |
if language == '日本語':
|
115 |
text = text_raw
|
116 |
elif language == '简体中文':
|
|
|
122 |
with torch.no_grad():
|
123 |
x_tst = stn_tst.unsqueeze(0)
|
124 |
x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
|
125 |
+
sid = torch.LongTensor([0])
|
126 |
+
audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=duration)[0][0,0].data.float().numpy()
|
|
|
127 |
currentDateAndTime = datetime.now()
|
128 |
+
print(f"Character {character} inference successful: {text}\n")
|
129 |
if language != '日本語':
|
130 |
print(f"translate from {language}: {text_raw}")
|
131 |
show_memory_info(str(currentDateAndTime) + " infer调用后")
|
|
|
160 |
"This synthesizer is created based on [VITS](https://arxiv.org/abs/2106.06103) model, trained on voice data extracted from mobile game Umamusume Pretty Derby \n\n"
|
161 |
"这个合成器是基于VITS文本到语音模型,在从手游《賽馬娘:Pretty Derby》解包的语音数据上训练得到。\n\n"
|
162 |
"[introduction video / 模型介绍视频](https://www.bilibili.com/video/BV1T84y1e7p5/?vd_source=6d5c00c796eff1cbbe25f1ae722c2f9f#reply607277701)\n\n"
|
163 |
+
"You may duplicate this space or [open in Colab](https://colab.research.google.com/drive/1J2Vm5dczTF99ckyNLXV0K-hQTxLwEaj5?usp=sharing) to run it privately and without any queue.\n\n"
|
164 |
+
"您可以复制该空间至私人空间运行或打开[Google Colab](https://colab.research.google.com/drive/1J2Vm5dczTF99ckyNLXV0K-hQTxLwEaj5?usp=sharing)在线运行。\n\n"
|
|
|
|
|
165 |
"If your input language is not Japanese, it will be translated to Japanese by Google translator, but accuracy is not guaranteed.\n\n"
|
166 |
"如果您的输入语言不是日语,则会由谷歌翻译自动翻译为日语,但是准确性不能保证。\n\n"
|
167 |
)
|