"""Gradio front-end for So-VITS-SVC voice conversion (audio upload and TTS input)."""
import base64
import datetime
import io
import logging
import os
import subprocess
import time

import gradio as gr
import librosa
import numpy as np
import pyzipper
import soundfile
from matplotlib import pyplot as plt

from inference.infer_tool import Svc
from tts_voices import SUPPORTED_LANGUAGES

# Silence chatty third-party loggers.
logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('markdown_it').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)

# Sample rate every clip is resampled to before it is handed to the model.
_TARGET_SR = 44100
# Log timestamps are printed in UTC+8.
_UTC8 = datetime.timezone(datetime.timedelta(hours=8))
# Base64-encoded host names accepted by same_auth().
_ALLOWED_HOSTS_B64 = (
    b'c292aXRzNC5ub2dpemFrYTQ2LmNj',
    b'c292aXRzNC1kZXYubm9naXpha2E0Ni5jYw==',
)


def _timestamp():
    """Return the current UTC+8 time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    return datetime.datetime.now(_UTC8).strftime("%Y-%m-%d %H:%M:%S")


def _signed_percent(value):
    """Format a relative factor (e.g. 0.3) as a signed percent string ("+30%")."""
    # round() instead of int(): 2.3 * 100 == 229.99999999999997 would truncate to 229.
    percent = round(value * 100)
    return f"+{percent}%" if value >= 0 else f"{percent}%"


def _prepare_audio(audio, sampling_rate):
    """Return *audio* as a mono signal at ``_TARGET_SR``.

    Integer PCM is scaled to float32 by the dtype's maximum, multi-channel
    input (frames x channels, as returned by ``soundfile.read``) is mixed
    down, and the result is resampled when the rate differs.
    """
    if np.issubdtype(audio.dtype, np.integer):
        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    if audio.ndim > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != _TARGET_SR:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=_TARGET_SR)
    return audio


def tts_fn(_text, _gender, _lang, _rate, _volume, sid, vc_transform, auto_f0,
           cluster_ratio, slice_db, f0_predictor):
    """Synthesize *_text* with ``tts.py`` and convert the result with the model.

    Args:
        _text: Text to speak.
        _gender: "男" selects "Male", anything else "Female"; only forwarded
            when *_lang* is "Auto".
        _lang: Language passed to ``tts.py``.
        _rate, _volume: Relative factors, forwarded as signed percent strings.
        sid, vc_transform, auto_f0, cluster_ratio, slice_db, f0_predictor:
            Forwarded to ``model.slice_inference``.

    Returns:
        ``(status message, output path)``; the path is ``None`` on failure so
        that both wired Gradio outputs always receive a value.
    """
    # NOTE(review): the enforced limit is 400 characters while the message
    # says 200 - confirm which one is intended.
    if len(_text) > 400:
        return "请上传小于200字的文本", None
    try:
        _rate = _signed_percent(_rate)
        _volume = _signed_percent(_volume)
        command = ["python", "tts.py", _text, _lang, _rate, _volume]
        if _lang == "Auto":
            _gender = "Male" if _gender == "男" else "Female"
            command.append(_gender)
        # check=True: a failed synthesis must not fall through and convert a
        # stale tts.wav left over from an earlier request.
        subprocess.run(command, check=True)

        # tts.py is expected to have written tts.wav - TODO confirm.
        input_audio = "tts.wav"
        audio, sampling_rate = soundfile.read(input_audio)
        audio = _prepare_audio(audio, sampling_rate)
        soundfile.write(input_audio, audio, _TARGET_SR, format="wav")

        output_file_path = "tts_output.mp3"
        # The positional 0.4 is presumably the noise scale - confirm against
        # Svc.slice_inference.
        _audio = model.slice_inference(
            input_audio, sid, vc_transform, slice_db, cluster_ratio, auto_f0, 0.4,
            f0_predictor=f0_predictor, clip_seconds=40,
        )
        print(_text, _gender, _lang, _rate, _volume, sid, vc_transform, auto_f0,
              cluster_ratio, slice_db, f0_predictor)
        soundfile.write(output_file_path, _audio, _TARGET_SR, format="mp3")
        return "Success", output_file_path
    except Exception as e:  # UI boundary: report the failure instead of crashing
        logging.exception("TTS conversion failed")
        return f"TTS conversion failed: {e}", None


def f0_to_pitch(ff):
    """Convert a frequency in Hz to a fractional MIDI note number (A4 = 440 Hz = 69)."""
    return 69 + 12 * np.log2(ff / 440)


def compute_f0(wav_file1, wav_file2, tran):
    """Estimate f0 of two audio files and their pitch deviation in semitones.

    Args:
        wav_file1: Path of the reference audio.
        wav_file2: Path of the audio compared against the reference.
        tran: Semitone shift added to the reference pitch before comparing.

    Returns:
        ``(f0_1, f0_2, sr1, sr2, mistake / 10, var_take / 10)`` where
        ``mistake`` is the mean absolute deviation and ``var_take`` its sample
        standard deviation, both rounded to two decimals. Both are 0 when the
        reference is more than 90% exact-zero samples or no frames compare.
    """
    y1, sr1 = librosa.load(wav_file1, sr=44100)
    y2, sr2 = librosa.load(wav_file2, sr=44100)

    # Compute the f0 using the YIN pitch estimation method
    f0_1 = librosa.yin(y1, fmin=1, fmax=400)
    f0_2 = librosa.yin(y2, fmin=1, fmax=400)

    mistake, var_take = 0, 0
    # The silence test must look at the decoded samples, not the path string.
    mostly_silent = y1.size == 0 or np.count_nonzero(y1 == 0) / y1.size > 0.9
    if not mostly_silent:
        frames = min(len(f0_1), len(f0_2))
        ref, other = f0_1[:frames], f0_2[:frames]
        voiced = (ref > 0) & (other > 0)
        # Semitone deviation per frame.
        deviation = np.abs(f0_to_pitch(other[voiced]) - (f0_to_pitch(ref[voiced]) + tran))
        if deviation.size:
            mistake = round(float(deviation.mean()), 2)
        # ddof=1 needs at least two samples; otherwise the result is NaN.
        if deviation.size > 1:
            var_take = round(float(np.std(deviation, ddof=1)), 2)
    print("mistake", mistake, var_take)
    return f0_1, f0_2, sr1, sr2, round(mistake / 10, 2), round(var_take / 10, 2)


def same_auth(username, password):
    """Return True when the username or the password equals an allowed host name.

    The username is compared after stripping ``http://``, ``https://`` and
    every ``/``; the password is compared as given.
    """
    # Never write the submitted password to the log.
    print(username, _timestamp())
    username = username.replace("https://", "").replace("http://", "").replace("/", "")
    allowed = {base64.b64decode(host).decode() for host in _ALLOWED_HOSTS_B64}
    return username in allowed or password in allowed


def vc_fn(output_format, sid, input_audio, vc_transform, auto_f0, cluster_ratio,
          slice_db, f0_predictor, clip_seconds=50):
    """Convert an uploaded audio file with the currently loaded model.

    Args:
        output_format: Container passed to ``soundfile.write`` and used as the
            output file extension.
        sid: Speaker passed to the model; also part of the output file name.
        input_audio: Path of the uploaded file, or ``None`` if nothing was uploaded.
        vc_transform, auto_f0, cluster_ratio, slice_db, f0_predictor,
        clip_seconds: Forwarded to ``model.slice_inference``.

    Returns:
        ``(status message, output path)``; the path is ``None`` when the input
        is missing or longer than 280 seconds.
    """
    start_time = time.time()
    if input_audio is None:
        return "You need to upload an audio ", None
    audio, sampling_rate = soundfile.read(input_audio)
    duration = audio.shape[0] / sampling_rate
    if duration > 280:
        # Exactly two values: the click handler is wired to two outputs.
        return "请上传小于280s的音频,需要转换长音频请使用tgbot", None
    audio = _prepare_audio(audio, sampling_rate)

    # NOTE(review): fixed file names are shared by concurrent requests.
    out_wav_path = "temp.wav"
    soundfile.write(out_wav_path, audio, _TARGET_SR, format="wav")
    print(sid, vc_transform, auto_f0, cluster_ratio, slice_db, f0_predictor, _timestamp())
    # The positional 0.4 is presumably the noise scale - confirm against
    # Svc.slice_inference.
    _audio = model.slice_inference(
        out_wav_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0, 0.4,
        f0_predictor=f0_predictor, clip_seconds=clip_seconds,
        loudness_envelope_adjustment=0,
    )
    out_wav_path1 = 'output_' + f'{sid}_{vc_transform}.{output_format}'
    soundfile.write(out_wav_path1, _audio, _TARGET_SR, format=output_format)

    used_time = round(time.time() - start_time, 2)
    if auto_f0:
        out_str = "你开启了自动f0预测,仅限转换语音,歌声不要勾选此项会究极跑调"
        out_str = out_str + ("Success! total use time:{}s".format(used_time))
    else:
        out_str = base64.b64decode(b'U3VjY2VzcyEgdG90YWwgdXNlIHRpbWU6e31z').decode().format(used_time)
    return out_str, out_wav_path1


def change_audio(audio, vc):
    """Pass an example's audio path and transpose value through unchanged."""
    return audio, vc


def loadmodel(model_):
    """Load the checkpoint at *model_* into the global ``model``.

    The config and cluster-model paths are derived from the checkpoint's base
    name (``configs/<name>.json`` and ``./kmeans/<name>.pt``).

    Returns:
        ``("success", update)`` where ``update`` refreshes the speaker
        dropdown's choices with the new model's speakers.
    """
    global model
    model_name = os.path.splitext(os.path.basename(model_))[0]
    config_path = "configs/" + model_name + ".json"
    cluster_path = "./kmeans/" + model_name + ".pt"
    model = Svc(model_, config_path, cluster_model_path=cluster_path)
    spks = list(model.spk2id.keys())
    print(model_, config_path, cluster_path)
    # Return an update payload instead of rebinding the global ``sid``
    # component: rebinding turned it into a dict, so a second load returned None.
    return "success", gr.Dropdown.update(choices=spks)


def update_dropdown(new_choices):
    """Return a dropdown update listing the loaded model's speakers.

    *new_choices* (the current dropdown value) is ignored.
    """
    return gr.Dropdown.update(choices=list(model.spk2id.keys()))


# Placeholder; rebound to the speaker dropdown when the UI is built.
sid = ""

# The model archive is AES-encrypted; its password comes from the environment.
hf_token1 = os.environ.get('TOKEN1')
if hf_token1 is None:
    raise RuntimeError("Environment variable TOKEN1 (password of ./N.zip) is not set")
hf_token1 = hf_token1.encode("utf-8")
with pyzipper.AESZipFile('./N.zip') as zf:
    zf.pwd = hf_token1
    zf.extractall()

# Default model loaded at start-up.
model = Svc("./N/58v1.pth", "configs/58v1.json", cluster_model_path="./kmeans/58v1.pt")
# Every file found under ./N/ is offered as a selectable model checkpoint.
modelPaths = [
    os.path.join(dirpath, filename)
    for dirpath, _dirnames, filenames in os.walk("./N/")
    for filename in filenames
]

# Base64-encoded Markdown shown at the top of the page.
_HEADER_MARKDOWN_B64 = b'ICAgICAgICAgICAgICAgICAgICAjIOWJjeiogAogICAgICAgICAgICAgICAgICAgICog5LmD5pyo5Z2CNzXkvY1UVFPvvJpbaHR0cHM6Ly92aXRzLm5vZ2l6YWthNDYuY2NdKGh0dHBzOi8vdml0cy5ub2dpemFrYTQ2LmNjKSAKICAgICAgICAgICAgICAgICAgICAqIOWbveWGheiuv+mXrui+g+aFou+8jOW7uuiuruS9v+eUqOS7o+eQhi7mm7TmlrDkuo4yMy0xMS0wNuOAguWWguS6hjM1MOmmluatjO+8jOS9huWkp+WkmuaVsOaIkOWRmOS4jei2s+S7peWQkeWUseatjOmfs+iJsumdoOaLou+8jOWboOS4uuiHs+WwkemcgOimgeWNiuWwj+aXtuS7peS4iueahOe0oOadkAogICAgICAgICAgICAgICAgICAgICog5qyi6L+O5Yqg5YWl6K6o6K66VEfnvqQ6W2h0dHBzOi8vdC5tZS8rdlA4TksxTk1MaVl6TURKbF0oaHR0cHM6Ly90Lm1lLyt2UDhOSzFOTUxpWXpNREpsKSDnvqTph4zmnInnrKjnrKhCb3Tmlrnkvr/kuKLmrYzljbNBaee/u+WUseWSjOWIhuemu+W5suWjsCzkuI3ov4fotKjph4/lj6/msqHmnInmiYvliqjliIbnprvnmoTlpb3jgIIKICAgICAgICAgICAgICAgICAgICAjIOWjsOaYjgogICAgICAgICAgICAgICAgICAgICog5aaC55So5q2k5qih5Z6L5Yi25L2c6Z+z6aKR6K+35qCH5rOo5pys5Zyo57q/6L2s5o2i5Zyw5Z2A77yaaHR0cHM6Ly9zb3ZpdHM0Lm5vZ2l6YWthNDYuY2M='

# Base64-encoded Markdown shown below the examples; it decodes to a
# "|"-separated list of speaker ids with display names.
_SPEAKER_LIST_B64 = b'QUtJTU9UT19NQU5BVFNVLOeni+WFgyDnnJ/lpI98SUtVVEFfRVJJS0Es55Sf55SwIOe1teaiqOiKsXxOYW5hbWkgSGFzaGltb3RvLOapi+acrCDlpYjjgIXmnKp8SVRPX0pVTk5BLOS8iuiXpCDntJTlpYh8SU5PVUVfU0FZVVJJLOS6leS4iiDlsI/nmb7lkIh8RVRPX01JU0Es6KGb6JekIOe+juW9qXxLQVdBR09fSElOQSzlt53lvowg6Zm96I+cfEtJVEFOT19ISU5BS08s5YyX6YeOIOaXpeWliOWtkHxTQUlUT19BU1VLQSzpvYvol6Qg6aOb6bOlfFNBVE9fWVVVUkks5paJ6JekIOWEqumHjHxTQUtVUkFJX1JFSUtBLOahnOS6lSDnjrLpppl8U0FTQUtJX0tPVE9LTyzkvZDjgIXmnKgg55C05a2QfFNISVJBSVNISV9NQUks55m955+zIOm6u+iho3xTSElOVUNISV9NQUks5paw5YaFIOecnuiho3xTVVpVS0lfQVlBTkUs6Yi05pyoIOe1oumfs3xUQUtBWUFNQV9LQVpVTUks6auY5bGxIOS4gOWun3xURVJBREFfUkFOWkUs5a+655SwIOiYreS4lnxOSVNISU5PX05BTkFTRSzopb/ph44g5LiD54CsfEhJR1VDSElfSElOQSzmqIvlj6Mg5pel5aWIfEhPU0hJTk9fTUlOQU1JLOaYn+mHjiDjgb/jgarjgb98SE9SSV9NSU9OQSzloIAg5pyq5aSu5aWIfE1BVFNVTVVSQV9TQVlVUkks5p2+5p2RIOaymeWPi+eQhnxZQU1BWkFLSV9SRU5BLOWxseW0jiDmgJzlpYh8V0FLQVRTVUtJX1lVTUks6Iul5pyIIOS9kee+jnxXQVRBTkFCRV9NSVJJQSzmuKHovrog44G/44KK5oSbfElUT19SSVJJQSzkvIrol6Qg55CG44CF5p2PfElXQU1PVE9fUkVOS0Es5bKp5pysIOiTruWKoHxVTUVaQVdBX01JTkFNSSzmooXmvqQg576O5rOifE9aT05PX01PTU9LTyzlpKflnJIg5qGD5a2QfEtVQk9fU0hJT1JJLOS5heS/nSDlj7Lnt5Lph4x8U0FLQUdVQ0hJX1RBTUFNSSzpmKrlj6Mg54+g576OfFNBVE9fS0FFREUs5L2Q6JekIOalk3xOQUtBTVVSQV9SRU5PLOS4readkSDpupfkuYN8TVVLQUlfSEFaVUtJLOWQkeS6lSDokYnmnIh8WUFNQVNISVRBX01JWlVLSSzlsbHkuIsg576O5pyIfFlPREFfWVVVS0ks5LiO55SwIOelkOW4jHxFTkRPX1NBS1VSQSzpgaDol6Qg44GV44GP44KJfEtBS0lfSEFSVUtBLOizgOWWnCDpgaXpppl8S0FLRUhBU0hJX1NBWUFLQSzmjpvmqYsg5rKZ6IC26aaZfEtBTkFHQVdBX1NBWUEs6YeR5bedIOe0l+iAtnxLSVRBR0FXQV9ZVVJJLOWMl+W3nSDmgqDnkIZ8U0hJQkFUQV9ZVU5BLOaftOeUsCDmn5roj5x8U0VJTUlZQV9SRUks5riF5a6uIOODrOOCpHxUQU1VUkFfTUFZVSznlLDmnZEg55yf5L2RfFRTVVRTVUlfQVlBTUUs562S5LqVIOOBguOChOOCgXxIQVlBS0FXQV9TRUlSQSzml6nlt50g6IGW5p2lfFlBS1VCT19NSU8s55+i5LmF5L+dIOe+jue3knxIQVJVS0FfS1VST01JLOm7kuimiyDmmI7pppl8UklLQV9TQVRPLOS9kOiXpCDnkoPmnpx8UlVOQV9IQVlBU0hJLOaelyDnkaDlpYh8TUlZVV9NQVRTVU8s5p2+5bC+IOe+juS9kXxOQU9fWVVNSUtJLOW8k+acqCDlpYjmlrx8SU9LSV9NQU8s5LqU55m+5Z+OIOiMieWkrnxJS0VEQV9URVJFU0Es5rGg55SwIOeRm+e0l3xJQ0hJTk9TRV9NSUtVLOS4gOODjueArCDnvo7nqbp8SU5PVUVfTkFHSSzkupXkuIog5ZKMfE9HQVdBX0FZQSzlsI/lt50g5b2pfE9LVURBX0lST0hBLOWlpeeUsCDjgYTjgo3jga98S0FXQVNBS0lfU0FLVVJBLOW3ne+okSDmoZx8U1VHQVdBUkFfU0FUU1VLSSzoj4Xljp8g5ZKy5pyIfFRPTUlTQVRPX05BTyzlhqjph4wg5aWI5aSufE5BS0FOSVNISV9BUlVOTyzkuK3opb8g44Ki44Or44OO'


def _b64_text(encoded):
    """Decode a base64-encoded UTF-8 byte string to ``str``."""
    return base64.b64decode(encoded).decode()


# NOTE(review): the container nesting below was reconstructed from a
# whitespace-collapsed source - confirm it matches the intended layout.
app = gr.Blocks()
with app:
    with gr.Tabs():
        with gr.TabItem(" "):
            gr.Markdown(value=_b64_text(_HEADER_MARKDOWN_B64))
            with gr.Tabs():
                with gr.TabItem("单个音频上传"):
                    # The caption must be passed as ``label``: the first
                    # positional parameter of gr.Audio is ``value``, which
                    # would clash with the ``value=`` keyword below.
                    vc_input3 = gr.Audio(
                        label="上传音频<280s无BGM无和声的干声",
                        type="filepath",
                        source="upload",
                        value="examples/1.mp3",
                    )
                with gr.TabItem("文字转语音(实验性)"):
                    gr.Markdown("文字转语音(TTS)说明:使用edge_tts服务生成音频,并转换为So-VITS模型音色。")
                    auto_f0 = gr.Checkbox(
                        label="自动f0预测,配合聚类模型f0预测效果更好(仅限转换语音,歌声不要勾选此项会究极跑调)",
                        value=False,
                    )
                    with gr.Row():
                        text_input = gr.Textbox(
                            label="在此输入需要转译的文字(建议打开自动f0预测)限定200字以内,建议f0预测器选dio"
                        )
                    with gr.Row():
                        tts_gender = gr.Radio(label="说话人性别", choices=["男", "女"], value="女")
                        tts_lang = gr.Dropdown(
                            label="选择语言,Auto为根据输入文字自动识别",
                            choices=SUPPORTED_LANGUAGES,
                            value="Auto",
                        )
                    with gr.Row():
                        tts_rate = gr.Slider(
                            label="TTS语音变速(倍速相对值)",
                            minimum=-1, maximum=3, value=0, step=0.1,
                        )
                        tts_volume = gr.Slider(
                            label="TTS语音音量(相对值)",
                            minimum=-1, maximum=1.5, value=0, step=0.1,
                        )
                    vc_tts_submit = gr.Button("文本转语音", variant="primary")

            # Speaker selection, populated from the model loaded at start-up.
            spks = list(model.spk2id.keys())
            sid = gr.Dropdown(
                label="音色",
                choices=spks,
                value=_b64_text(b'SE9TSElOT19NSU5BTUk='),
                # Set here: a bare ``sid.update(interactive=True)`` call only
                # builds an update payload and discards it.
                interactive=True,
            )
            sid.change(fn=update_dropdown, inputs=[sid], outputs=[sid])

            with gr.Accordion(label="↓切换模型(音色具有抽奖性质,可切换尝试)", open=False):
                modelstrs = gr.Dropdown(
                    label="模型",
                    choices=modelPaths,
                    # Guard against an empty ./N/ directory.
                    value=modelPaths[0] if modelPaths else None,
                    type="value",
                )
                btnMod = gr.Button("载入模型")
                statusa = gr.TextArea()
                btnMod.click(loadmodel, inputs=[modelstrs], outputs=[statusa, sid])

            with gr.Row():
                slice_db = gr.Slider(
                    label="切片阈值(较嘈杂时-30,保留呼吸声时-50)",
                    maximum=-30, minimum=-70, step=1, value=-40,
                )
                vc_transform = gr.Slider(
                    label="变调(整数,可以正负,半音数量,升高八度就是12)",
                    maximum=16, minimum=-16, step=1, value=0,
                )
                f0_predictor = gr.Radio(
                    label=_b64_text(b'ZjDpooTmtYvlmago5aaC6YGH5ZOR6Z+z5Y+v5Lul5bCd6K+V5pu05o2iZjAp5Yet5bmy5aOw5bmy5YeA56iL5bqm6YCJ5oup44CC5o6o6I2QZmNwZeWSjHJtdnBl'),
                    choices=["pm", "dio", "harvest", "fcpe", "rmvpe"],
                    value="fcpe",
                )
            with gr.Row():
                cluster_ratio = gr.Number(
                    label="聚类模型混合比例,0-1之间,默认为0不启用聚类,能提升音色相似度,但会导致咬字下降(如果使用建议0.5左右)",
                    value=0,
                )
                output_format = gr.Radio(
                    label=_b64_text(b'6Z+z6aKR6L6T5Ye65qC85byPKE1QM+S8muWvvOiHtOaXtumXtOi9tOWkmjI3bXMs6ZyA5ZCI5oiQ6K+36YCJZmxhYyk='),
                    choices=["flac", "mp3"],
                    value="mp3",
                )

            vc_submit = gr.Button("音频转换", variant="primary")
            vc_output1 = gr.Textbox(
                label=_b64_text(b'6Z+z6auY5bmz5Z2H5YGP5beu5Y2K6Z+z5pWw6YeP77yM5L2T546w6L2s5o2i6Z+z6aKR55qE6LeR6LCD5oOF5Ya177yI5LiA6Iis5bCP5LqOMC4177yJ')
            )
            vc_output2 = gr.Audio(label="Output Audio")

            with gr.Row():
                # Each example is [audio path, transpose in semitones].
                gr.Examples(
                    label=_b64_text(b'5bmy5aOw56S65L6L77yM54K55Ye75pu05o2i'),
                    examples=[
                        ["examples/1.mp3", 0],
                        ["examples/2.mp3", 2],
                        ["examples/3.mp3", 0],
                        ["examples/4.mp3", 5],
                        ["examples/5.mp3", 0],
                        ["examples/6.mp3", 0],
                    ],
                    inputs=[vc_input3, vc_transform],
                    fn=change_audio,
                )

            gr.Markdown(value=_b64_text(_SPEAKER_LIST_B64))

            # Both buttons write to the same status box and audio player.
            vc_submit.click(
                vc_fn,
                [output_format, sid, vc_input3, vc_transform, auto_f0, cluster_ratio,
                 slice_db, f0_predictor],
                [vc_output1, vc_output2],
            )
            vc_tts_submit.click(
                tts_fn,
                [text_input, tts_gender, tts_lang, tts_rate, tts_volume, sid, vc_transform,
                 auto_f0, cluster_ratio, slice_db, f0_predictor],
                [vc_output1, vc_output2],
            )

app.launch()