import io
import os
import pyzipper
import gradio as gr
import librosa
import base64
import numpy as np
import soundfile
from inference.infer_tool import Svc
import logging
import time
from tts_voices import SUPPORTED_LANGUAGES
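# Silence noisy third-party loggers.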
logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('markdown_it').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)

#hf_token = os.environ.get('TOKEN')
#hf_token1 = os.environ.get('TOKEN1')
#hf_token2 = os.environ.get('TOKEN2')
#hf_token_config = os.environ.get('TOKEN_config')

from matplotlib import pyplot as plt
import datetime
import subprocess

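# Generate speech with edge-tts (via tts.py) and convert it to the selected
# So-VITS voice. The rate/volume sliders are relative values mapped to
# signed percentage strings.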
def tts_fn(_text, _gender, _lang, _rate, _volume, sid, vc_transform, auto_f0, cluster_ratio, slice_db, f0_predictor):
    if len(_text) > 400:
        return "Please enter text of no more than 400 characters", None
    try:
        # edge-tts takes rate/volume as signed percentage strings, e.g. "+10%".
        _rate = f"+{int(_rate*100)}%" if _rate >= 0 else f"{int(_rate*100)}%"
        _volume = f"+{int(_volume*100)}%" if _volume >= 0 else f"{int(_volume*100)}%"
        if _lang == "Auto":
            # In auto-detect mode the gender choice picks the edge-tts voice.
            subprocess.run(["python", "tts.py", _text, _lang, _rate, _volume, _gender])
        else:
            subprocess.run(["python", "tts.py", _text, _lang, _rate, _volume])
        input_audio = "tts.wav"
        audio, sampling_rate = soundfile.read(input_audio)
        # Normalize to mono float32 at 44.1 kHz, the rate the model expects.
        if np.issubdtype(audio.dtype, np.integer):
            audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.transpose(1, 0))
        if sampling_rate != 44100:
            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=44100)
        soundfile.write(input_audio, audio, 44100, format="wav")

        output_file_path = "tts_output.mp3"
        _audio = model.slice_inference(input_audio, sid, vc_transform, slice_db, cluster_ratio, auto_f0, 0.4, f0_predictor=f0_predictor, clip_seconds=40)
        print(_text, _gender, _lang, _rate, _volume, sid, vc_transform, auto_f0, cluster_ratio, slice_db, f0_predictor)
        soundfile.write(output_file_path, _audio, 44100, format="mp3")
        return "Success", output_file_path

    except Exception as e:
        # Surface the error in the UI instead of returning None silently.
        print(e)
        return f"Error: {e}", None


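# Pitch-deviation metrics: compare the f0 tracks of the source and converted
# audio to quantify how far (in semitones) the conversion drifted.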
def f0_to_pitch(ff):
    # Convert frequency in Hz to a MIDI pitch number (A4 = 440 Hz = MIDI 69).
    return 69 + 12 * np.log2(ff / 440)

def compute_f0(wav_file1, wav_file2, tran):
    y1, sr1 = librosa.load(wav_file1, sr=44100)
    y2, sr2 = librosa.load(wav_file2, sr=44100)

    # Estimate f0 for both signals with the YIN pitch estimation method.
    f0_1 = librosa.core.yin(y1, fmin=1, fmax=400)
    f0_2 = librosa.core.yin(y2, fmin=1, fmax=400)
    # Mean and standard deviation of the semitone deviation, after applying
    # the requested transpose `tran` to the source pitch.
    sum_y = []
    if np.sum(y1 == 0) / len(y1) > 0.9:
        # Mostly silent input: report zero deviation.
        mistake, var_take = 0, 0
    else:
        for i in range(min(len(f0_1), len(f0_2))):
            if f0_1[i] > 0 and f0_2[i] > 0:
                sum_y.append(abs(f0_to_pitch(f0_2[i]) - (f0_to_pitch(f0_1[i]) + tran)))
        len_y = len(sum_y) if len(sum_y) else 1
        mistake = round(float(sum(sum_y) / len_y), 2)
        var_take = round(float(np.std(sum_y, ddof=1)), 2)
    print("mistake", mistake, var_take)
    return f0_1, f0_2, sr1, sr2, round(mistake / 10, 2), round(var_take / 10, 2)

def same_auth(username, password):
    # Accept the login when the username or password matches a whitelisted (base64-obfuscated) hostname.
    now = datetime.datetime.utcnow() + datetime.timedelta(hours=8)
    print(username, password, now.strftime("%Y-%m-%d %H:%M:%S"))
    username = username.replace("https://", "").replace("http://", "").replace("/", "")
    allowed = (base64.b64decode(b'c292aXRzNC5ub2dpemFrYTQ2LmNj').decode(),
               base64.b64decode(b'c292aXRzNC1kZXYubm9naXpha2E0Ni5jYw==').decode())
    return username in allowed or password in allowed

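# Main voice-conversion entry point: normalize the uploaded audio, run
# slice_inference, and write the result in the requested output format.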
def vc_fn(output_format, sid, input_audio, vc_transform, auto_f0, cluster_ratio, slice_db, f0_predictor, clip_seconds=50):
    start_time = time.time()
    if input_audio is None:
        return "You need to upload an audio file", None
    audio, sampling_rate = soundfile.read(input_audio)
    duration = audio.shape[0] / sampling_rate
    if duration > 280:
        return "Please upload audio shorter than 280s; to convert longer audio, use the Telegram bot", None
    # Normalize to mono float32 at 44.1 kHz before inference.
    if np.issubdtype(audio.dtype, np.integer):
        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 44100:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=44100)
    out_wav_path = "temp.wav"
    soundfile.write(out_wav_path, audio, 44100, format="wav")

    now = datetime.datetime.utcnow() + datetime.timedelta(hours=8)
    print(sid, vc_transform, auto_f0, cluster_ratio, slice_db, f0_predictor, now.strftime("%Y-%m-%d %H:%M:%S"))
    _audio = model.slice_inference(out_wav_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0, 0.4, f0_predictor=f0_predictor, clip_seconds=clip_seconds, loudness_envelope_adjustment=0)
    out_wav_path1 = f'output_{sid}_{vc_transform}.{output_format}'
    soundfile.write(out_wav_path1, _audio, 44100, format=output_format)
    used_time = round(time.time() - start_time, 2)
    if auto_f0:
        out_str = ("Auto f0 prediction is enabled; speech only. Do not enable it "
                   "for singing, or the pitch will drift badly. ")
        out_str = out_str + "Success! total use time:{}s".format(used_time)
    else:
        out_str = base64.b64decode(b'U3VjY2VzcyEgdG90YWwgdXNlIHRpbWU6e31z').decode().format(used_time)
    return out_str, out_wav_path1

def change_audio(audio, vc):
    # Pass the selected audio through unchanged (placeholder event handler).
    return audio, vc
def loadmodel(model_):
    global model
    model_name = os.path.splitext(os.path.basename(model_))[0]
    # Each checkpoint ships with a same-named config and k-means cluster model.
    model = Svc(model_, "configs/" + model_name + ".json", cluster_model_path="./kmeans/" + model_name + ".pt")
    spks = list(model.spk2id.keys())
    # Return an update for the speaker dropdown instead of mutating the component.
    new_sid = gr.Dropdown.update(choices=spks)
    print(model_, "configs/" + model_name + ".json", "./kmeans/" + model_name + ".pt")

    return "success", new_sid

def update_dropdown(new_choices):
    # Refresh the dropdown choices from the currently loaded model's speakers.
    spks = list(model.spk2id.keys())
    return gr.Dropdown.update(choices=spks)

sid = ""  # placeholder; replaced by the speaker Dropdown created in the UI below

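# Unpack the AES-encrypted model archive with the password stored in the
# TOKEN1 environment variable, then load the default checkpoint.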
hf_token1 = os.environ.get('TOKEN1').encode("utf-8")

with pyzipper.AESZipFile('./N.zip') as zf:
    zf.pwd = hf_token1
    zf.extractall() 
model = Svc("./N/58v1.pth", "configs/58v1.json" , cluster_model_path="./kmeans/58v1.pt")
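# Collect every checkpoint under ./N/ for the (currently commented-out) model switcher.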
modelPaths = []
for dirpath, dirnames, filenames in os.walk("./N/"):
    for filename in filenames:
        modelPaths.append(os.path.join(dirpath, filename))

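# Build the Gradio UI: an upload tab and a TTS tab feed the shared conversion controls.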
app = gr.Blocks()
with app:
    with gr.Tabs():
        with gr.TabItem(" "):  
        

            gr.Markdown(value=base64.b64decode( b'ICAgICAgICAgICAgICAgICAgICAjIOWJjeiogAogICAgICAgICAgICAgICAgICAgICog5LmD5pyo5Z2CNzXkvY1UVFPvvJpbaHR0cHM6Ly92aXRzLm5vZ2l6YWthNDYuY2NdKGh0dHBzOi8vdml0cy5ub2dpemFrYTQ2LmNjKSAKICAgICAgICAgICAgICAgICAgICAqIOWbveWGheiuv+mXrui+g+aFou+8jOW7uuiuruS9v+eUqOS7o+eQhi7mm7TmlrDkuo4yMy0xMS0wNuOAguWWguS6hjM1MOmmluatjO+8jOS9huWkp+WkmuaVsOaIkOWRmOS4jei2s+S7peWQkeWUseatjOmfs+iJsumdoOaLou+8jOWboOS4uuiHs+WwkemcgOimgeWNiuWwj+aXtuS7peS4iueahOe0oOadkAogICAgICAgICAgICAgICAgICAgICog5qyi6L+O5Yqg5YWl6K6o6K66VEfnvqQ6W2h0dHBzOi8vdC5tZS8rdlA4TksxTk1MaVl6TURKbF0oaHR0cHM6Ly90Lm1lLyt2UDhOSzFOTUxpWXpNREpsKSDnvqTph4zmnInnrKjnrKhCb3Tmlrnkvr/kuKLmrYzljbNBaee/u+WUseWSjOWIhuemu+W5suWjsCzkuI3ov4fotKjph4/lj6/msqHmnInmiYvliqjliIbnprvnmoTlpb3jgIIKICAgICAgICAgICAgICAgICAgICAjIOWjsOaYjgogICAgICAgICAgICAgICAgICAgICog5aaC55So5q2k5qih5Z6L5Yi25L2c6Z+z6aKR6K+35qCH5rOo5pys5Zyo57q/6L2s5o2i5Zyw5Z2A77yaaHR0cHM6Ly9zb3ZpdHM0Lm5vZ2l6YWthNDYuY2M=').decode())

            with gr.Tabs():
                with gr.TabItem("Single audio upload"):
                    vc_input3 = gr.Audio(label="Upload a dry vocal <280s, without BGM or harmonies", type="filepath", source="upload", value="examples/1.mp3")

                with gr.TabItem("Text-to-speech (experimental)"):
                    gr.Markdown("Text-to-speech (TTS): audio is generated with the edge_tts service and then converted to the So-VITS model's voice.")
                    auto_f0 = gr.Checkbox(label="Auto f0 prediction; works better together with a cluster model (speech only; do not enable for singing, or the pitch will drift badly)", value=False)
                    with gr.Row():
                        text_input = gr.Textbox(label="Enter the text to convert here (enabling auto f0 prediction is recommended); up to 400 characters; the dio f0 predictor is recommended")
                    with gr.Row():
                        tts_gender = gr.Radio(label="Speaker gender", choices=["Male", "Female"], value="Female")
                        tts_lang = gr.Dropdown(label="Language; Auto detects it from the input text", choices=SUPPORTED_LANGUAGES, value="Auto")
                    with gr.Row():
                        tts_rate = gr.Slider(label="TTS speech rate (relative multiplier)", minimum=-1, maximum=3, value=0, step=0.1)
                        tts_volume = gr.Slider(label="TTS volume (relative)", minimum=-1, maximum=1.5, value=0, step=0.1)
                    vc_tts_submit = gr.Button("Text to speech", variant="primary")
            spks = list(model.spk2id.keys())

            sid = gr.Dropdown(label="Voice", choices=spks, value=base64.b64decode(b'SE9TSElOT19NSU5BTUk=').decode())
            #sid.change(fn=update_dropdown, inputs=[sid], outputs=[sid])
            #sid.update(interactive=True)
            #with gr.Accordion(label="↓Switch model (voice quality is luck of the draw; feel free to try others)", open=False):
                #modelstrs = gr.Dropdown(label="Model", choices=modelPaths, value=modelPaths[0], type="value")
                #btnMod = gr.Button("Load model")
                #statusa = gr.TextArea()
                #btnMod.click(loadmodel, inputs=[modelstrs], outputs=[statusa, sid])
            with gr.Row():
                slice_db = gr.Slider(label="Slicing threshold (-30 for noisier audio, -50 to keep breaths)", maximum=-30, minimum=-70, step=1, value=-40)
                vc_transform = gr.Slider(label="Transpose (integer semitones, positive or negative; +12 is one octave up)", maximum=16, minimum=-16, step=1, value=0)
            f0_predictor = gr.Radio(label=base64.b64decode( b'ZjDpooTmtYvlmago5aaC6YGH5ZOR6Z+z5Y+v5Lul5bCd6K+V5pu05o2iZjAp5Yet5bmy5aOw5bmy5YeA56iL5bqm6YCJ5oup44CC5o6o6I2QZmNwZeWSjHJtdnBl' ).decode(), choices=["pm","dio","harvest","fcpe","rmvpe"], value="fcpe")
            with gr.Row():
                cluster_ratio = gr.Number(label="Cluster-model mix ratio, 0-1; default 0 disables clustering. Improves timbre similarity but degrades articulation (about 0.5 recommended if used)", value=0)
                output_format = gr.Radio(label=base64.b64decode( b'6Z+z6aKR6L6T5Ye65qC85byPKE1QM+S8muWvvOiHtOaXtumXtOi9tOWkmjI3bXMs6ZyA5ZCI5oiQ6K+36YCJZmxhYyk=' ).decode(), choices=["flac", "mp3"], value="mp3")
            vc_submit = gr.Button("Convert audio", variant="primary")

            vc_output1 = gr.Textbox(label=base64.b64decode( b'6Z+z6auY5bmz5Z2H5YGP5beu5Y2K6Z+z5pWw6YeP77yM5L2T546w6L2s5o2i6Z+z6aKR55qE6LeR6LCD5oOF5Ya177yI5LiA6Iis5bCP5LqOMC4177yJ' ).decode())
            vc_output2 = gr.Audio(label="Output Audio")

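        # Wire up the buttons; both paths share the status textbox and output audio.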
        vc_submit.click(vc_fn, [output_format, sid, vc_input3, vc_transform, auto_f0, cluster_ratio, slice_db, f0_predictor], [vc_output1, vc_output2])
        vc_tts_submit.click(tts_fn, [text_input, tts_gender, tts_lang, tts_rate, tts_volume, sid, vc_transform, auto_f0, cluster_ratio, slice_db, f0_predictor], [vc_output1, vc_output2])
app.launch()