import io
import os
import pyzipper
import gradio as gr
import librosa
import base64
import numpy as np
import soundfile
from inference.infer_tool import Svc
import logging
import time
from tts_voices import SUPPORTED_LANGUAGES
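# Silence noisy third-party loggers.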
logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('markdown_it').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)

#hf_token = os.environ.get('TOKEN')
#hf_token1 = os.environ.get('TOKEN1')
#hf_token2 = os.environ.get('TOKEN2')
#hf_token_config = os.environ.get('TOKEN_config')

from matplotlib import pyplot as plt
import datetime
import subprocess

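# Generate speech with edge-tts (via tts.py) and convert it to the selected
# So-VITS voice. The rate/volume sliders are relative values mapped to
# signed percentage strings.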
def tts_fn(_text, _gender, _lang, _rate, _volume, sid, vc_transform, auto_f0, cluster_ratio, slice_db, f0_predictor):
    if len(_text) > 400:
        return "Please enter text of no more than 400 characters", None
    try:
        # edge-tts takes rate/volume as signed percentage strings, e.g. "+10%".
        _rate = f"+{int(_rate*100)}%" if _rate >= 0 else f"{int(_rate*100)}%"
        _volume = f"+{int(_volume*100)}%" if _volume >= 0 else f"{int(_volume*100)}%"
        if _lang == "Auto":
            # In auto-detect mode the gender choice picks the edge-tts voice.
            subprocess.run(["python", "tts.py", _text, _lang, _rate, _volume, _gender])
        else:
            subprocess.run(["python", "tts.py", _text, _lang, _rate, _volume])
        input_audio = "tts.wav"
        audio, sampling_rate = soundfile.read(input_audio)
        # Normalize to mono float32 at 44.1 kHz, the rate the model expects.
        if np.issubdtype(audio.dtype, np.integer):
            audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.transpose(1, 0))
        if sampling_rate != 44100:
            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=44100)
        soundfile.write(input_audio, audio, 44100, format="wav")

        output_file_path = "tts_output.mp3"
        _audio = model.slice_inference(input_audio, sid, vc_transform, slice_db, cluster_ratio, auto_f0, 0.4, f0_predictor=f0_predictor, clip_seconds=40)
        print(_text, _gender, _lang, _rate, _volume, sid, vc_transform, auto_f0, cluster_ratio, slice_db, f0_predictor)
        soundfile.write(output_file_path, _audio, 44100, format="mp3")
        return "Success", output_file_path

    except Exception as e:
        # Surface the error in the UI instead of returning None silently.
        print(e)
        return f"Error: {e}", None


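# Pitch-deviation metrics: compare the f0 tracks of the source and converted
# audio to quantify how far (in semitones) the conversion drifted.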
def f0_to_pitch(ff):
    # Convert frequency in Hz to a MIDI pitch number (A4 = 440 Hz = MIDI 69).
    return 69 + 12 * np.log2(ff / 440)

def compute_f0(wav_file1, wav_file2, tran):
    y1, sr1 = librosa.load(wav_file1, sr=44100)
    y2, sr2 = librosa.load(wav_file2, sr=44100)

    # Estimate f0 for both signals with the YIN pitch estimation method.
    f0_1 = librosa.core.yin(y1, fmin=1, fmax=400)
    f0_2 = librosa.core.yin(y2, fmin=1, fmax=400)
    # Mean and standard deviation of the semitone deviation, after applying
    # the requested transpose `tran` to the source pitch.
    sum_y = []
    if np.sum(y1 == 0) / len(y1) > 0.9:
        # Mostly silent input: report zero deviation.
        mistake, var_take = 0, 0
    else:
        for i in range(min(len(f0_1), len(f0_2))):
            if f0_1[i] > 0 and f0_2[i] > 0:
                sum_y.append(abs(f0_to_pitch(f0_2[i]) - (f0_to_pitch(f0_1[i]) + tran)))
        len_y = len(sum_y) if len(sum_y) else 1
        mistake = round(float(sum(sum_y) / len_y), 2)
        var_take = round(float(np.std(sum_y, ddof=1)), 2)
    print("mistake", mistake, var_take)
    return f0_1, f0_2, sr1, sr2, round(mistake / 10, 2), round(var_take / 10, 2)

def same_auth(username, password):
    # Accept the login when the username or password matches a whitelisted (base64-obfuscated) hostname.
    now = datetime.datetime.utcnow() + datetime.timedelta(hours=8)
    print(username, password, now.strftime("%Y-%m-%d %H:%M:%S"))
    username = username.replace("https://", "").replace("http://", "").replace("/", "")
    allowed = (base64.b64decode(b'c292aXRzNC5ub2dpemFrYTQ2LmNj').decode(),
               base64.b64decode(b'c292aXRzNC1kZXYubm9naXpha2E0Ni5jYw==').decode())
    return username in allowed or password in allowed

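# Main voice-conversion entry point: normalize the uploaded audio, run
# slice_inference, and write the result in the requested output format.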
def vc_fn(output_format, sid, input_audio, vc_transform, auto_f0, cluster_ratio, slice_db, f0_predictor, clip_seconds=50):
    start_time = time.time()
    if input_audio is None:
        return "You need to upload an audio file", None
    audio, sampling_rate = soundfile.read(input_audio)
    duration = audio.shape[0] / sampling_rate
    if duration > 280:
        return "Please upload audio shorter than 280s; to convert longer audio, use the Telegram bot", None
    # Normalize to mono float32 at 44.1 kHz before inference.
    if np.issubdtype(audio.dtype, np.integer):
        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 44100:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=44100)
    out_wav_path = "temp.wav"
    soundfile.write(out_wav_path, audio, 44100, format="wav")

    now = datetime.datetime.utcnow() + datetime.timedelta(hours=8)
    print(sid, vc_transform, auto_f0, cluster_ratio, slice_db, f0_predictor, now.strftime("%Y-%m-%d %H:%M:%S"))
    _audio = model.slice_inference(out_wav_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0, 0.4, f0_predictor=f0_predictor, clip_seconds=clip_seconds, loudness_envelope_adjustment=0)
    out_wav_path1 = f'output_{sid}_{vc_transform}.{output_format}'
    soundfile.write(out_wav_path1, _audio, 44100, format=output_format)
    used_time = round(time.time() - start_time, 2)
    if auto_f0:
        out_str = ("Auto f0 prediction is enabled; speech only. Do not enable it "
                   "for singing, or the pitch will drift badly. ")
        out_str = out_str + "Success! total use time:{}s".format(used_time)
    else:
        out_str = base64.b64decode(b'U3VjY2VzcyEgdG90YWwgdXNlIHRpbWU6e31z').decode().format(used_time)
    return out_str, out_wav_path1

def change_audio(audio, vc):
    # Pass the selected audio through unchanged (placeholder event handler).
    return audio, vc
def loadmodel(model_):
    global model
    model_name = os.path.splitext(os.path.basename(model_))[0]
    # Each checkpoint ships with a same-named config and k-means cluster model.
    model = Svc(model_, "configs/" + model_name + ".json", cluster_model_path="./kmeans/" + model_name + ".pt")
    spks = list(model.spk2id.keys())
    # Return an update for the speaker dropdown instead of mutating the component.
    new_sid = gr.Dropdown.update(choices=spks)
    print(model_, "configs/" + model_name + ".json", "./kmeans/" + model_name + ".pt")

    return "success", new_sid

def update_dropdown(new_choices):
    # Refresh the dropdown choices from the currently loaded model's speakers.
    spks = list(model.spk2id.keys())
    return gr.Dropdown.update(choices=spks)

sid = ""  # placeholder; replaced by the speaker Dropdown created in the UI below

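# Unpack the AES-encrypted model archive with the password stored in the
# TOKEN1 environment variable, then load the default checkpoint.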
hf_token1 = os.environ.get('TOKEN1').encode("utf-8")

with pyzipper.AESZipFile('./N.zip') as zf:
    zf.pwd = hf_token1
    zf.extractall() 
model = Svc("./N/58v1.pth", "configs/58v1.json" , cluster_model_path="./kmeans/58v1.pt")
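# Collect every checkpoint under ./N/ for the (currently commented-out) model switcher.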
modelPaths = []
for dirpath, dirnames, filenames in os.walk("./N/"):
    for filename in filenames:
        modelPaths.append(os.path.join(dirpath, filename))

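# Build the Gradio UI: an upload tab and a TTS tab feed the shared conversion controls.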
app = gr.Blocks()
with app:
    with gr.Tabs():
        with gr.TabItem(" "):  
        

            gr.Markdown(value=base64.b64decode( b'ICAgICAgICAgICAgICAgICAgICAjIOWJjeiogAogICAgICAgICAgICAgICAgICAgICog5LmD5pyo5Z2CNzXkvY1UVFPvvJpbaHR0cHM6Ly92aXRzLm5vZ2l6YWthNDYuY2NdKGh0dHBzOi8vdml0cy5ub2dpemFrYTQ2LmNjKSAKICAgICAgICAgICAgICAgICAgICAqIOWbveWGheiuv+mXrui+g+aFou+8jOW7uuiuruS9v+eUqOS7o+eQhi7mm7TmlrDkuo4yMy0xMS0wNuOAguWWguS6hjM1MOmmluatjO+8jOS9huWkp+WkmuaVsOaIkOWRmOS4jei2s+S7peWQkeWUseatjOmfs+iJsumdoOaLou+8jOWboOS4uuiHs+WwkemcgOimgeWNiuWwj+aXtuS7peS4iueahOe0oOadkAogICAgICAgICAgICAgICAgICAgICog5qyi6L+O5Yqg5YWl6K6o6K66VEfnvqQ6W2h0dHBzOi8vdC5tZS8rdlA4TksxTk1MaVl6TURKbF0oaHR0cHM6Ly90Lm1lLyt2UDhOSzFOTUxpWXpNREpsKSDnvqTph4zmnInnrKjnrKhCb3Tmlrnkvr/kuKLmrYzljbNBaee/u+WUseWSjOWIhuemu+W5suWjsCzkuI3ov4fotKjph4/lj6/msqHmnInmiYvliqjliIbnprvnmoTlpb3jgIIKICAgICAgICAgICAgICAgICAgICAjIOWjsOaYjgogICAgICAgICAgICAgICAgICAgICog5aaC55So5q2k5qih5Z6L5Yi25L2c6Z+z6aKR6K+35qCH5rOo5pys5Zyo57q/6L2s5o2i5Zyw5Z2A77yaaHR0cHM6Ly9zb3ZpdHM0Lm5vZ2l6YWthNDYuY2M=').decode())

            with gr.Tabs():
                with gr.TabItem("Single audio upload"):
                    vc_input3 = gr.Audio(label="Upload a dry vocal <280s, without BGM or harmonies", type="filepath", source="upload", value="examples/1.mp3")

                with gr.TabItem("Text-to-speech (experimental)"):
                    gr.Markdown("Text-to-speech (TTS): audio is generated with the edge_tts service and then converted to the So-VITS model's voice.")
                    auto_f0 = gr.Checkbox(label="Auto f0 prediction; works better together with a cluster model (speech only; do not enable for singing, or the pitch will drift badly)", value=False)
                    with gr.Row():
                        text_input = gr.Textbox(label="Enter the text to convert here (enabling auto f0 prediction is recommended); up to 400 characters; the dio f0 predictor is recommended")
                    with gr.Row():
                        tts_gender = gr.Radio(label="Speaker gender", choices=["Male", "Female"], value="Female")
                        tts_lang = gr.Dropdown(label="Language; Auto detects it from the input text", choices=SUPPORTED_LANGUAGES, value="Auto")
                    with gr.Row():
                        tts_rate = gr.Slider(label="TTS speech rate (relative multiplier)", minimum=-1, maximum=3, value=0, step=0.1)
                        tts_volume = gr.Slider(label="TTS volume (relative)", minimum=-1, maximum=1.5, value=0, step=0.1)
                    vc_tts_submit = gr.Button("Text to speech", variant="primary")
            spks = list(model.spk2id.keys())

            sid = gr.Dropdown(label="Voice", choices=spks, value=base64.b64decode(b'SE9TSElOT19NSU5BTUk=').decode())
            #sid.change(fn=update_dropdown, inputs=[sid], outputs=[sid])
            #sid.update(interactive=True)
            #with gr.Accordion(label="↓Switch model (voice quality is luck of the draw; feel free to try others)", open=False):
                #modelstrs = gr.Dropdown(label="Model", choices=modelPaths, value=modelPaths[0], type="value")
                #btnMod = gr.Button("Load model")
                #statusa = gr.TextArea()
                #btnMod.click(loadmodel, inputs=[modelstrs], outputs=[statusa, sid])
            with gr.Row():
                slice_db = gr.Slider(label="Slicing threshold (-30 for noisier audio, -50 to keep breaths)", maximum=-30, minimum=-70, step=1, value=-40)
                vc_transform = gr.Slider(label="Transpose (integer semitones, positive or negative; +12 is one octave up)", maximum=16, minimum=-16, step=1, value=0)
            f0_predictor = gr.Radio(label=base64.b64decode( b'ZjDpooTmtYvlmago5aaC6YGH5ZOR6Z+z5Y+v5Lul5bCd6K+V5pu05o2iZjAp5Yet5bmy5aOw5bmy5YeA56iL5bqm6YCJ5oup44CC5o6o6I2QZmNwZeWSjHJtdnBl' ).decode(), choices=["pm","dio","harvest","fcpe","rmvpe"], value="fcpe")
            with gr.Row():
                cluster_ratio = gr.Number(label="Cluster-model mix ratio, 0-1; default 0 disables clustering. Improves timbre similarity but degrades articulation (about 0.5 recommended if used)", value=0)
                output_format = gr.Radio(label=base64.b64decode( b'6Z+z6aKR6L6T5Ye65qC85byPKE1QM+S8muWvvOiHtOaXtumXtOi9tOWkmjI3bXMs6ZyA5ZCI5oiQ6K+36YCJZmxhYyk=' ).decode(), choices=["flac", "mp3"], value="mp3")
            vc_submit = gr.Button("Convert audio", variant="primary")

            vc_output1 = gr.Textbox(label=base64.b64decode( b'6Z+z6auY5bmz5Z2H5YGP5beu5Y2K6Z+z5pWw6YeP77yM5L2T546w6L2s5o2i6Z+z6aKR55qE6LeR6LCD5oOF5Ya177yI5LiA6Iis5bCP5LqOMC4177yJ' ).decode())
            vc_output2 = gr.Audio(label="Output Audio")

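        # Wire up the buttons; both paths share the status textbox and output audio.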
        vc_submit.click(vc_fn, [output_format, sid, vc_input3, vc_transform, auto_f0, cluster_ratio, slice_db, f0_predictor], [vc_output1, vc_output2])
        vc_tts_submit.click(tts_fn, [text_input, tts_gender, tts_lang, tts_rate, tts_volume, sid, vc_transform, auto_f0, cluster_ratio, slice_db, f0_predictor], [vc_output1, vc_output2])
app.launch()