Spaces: Running on CPU Upgrade
feat: update gradio
Changed files:
- README.md +0 -13
- app.py +46 -228
- hubert_base.pt → assets/hubert/hubert_base.pt +0 -0
- assets/hubert/req-hubert.txt +1 -0
- assets/rvmpe/req-rvmpe.txt +2 -0
- rmvpe.pt → assets/rvmpe/rmvpe.pt +0 -0
- config.py → lib/config/config.py +2 -2
- lib/vc/audio.py +73 -0
- rmvpe.py → lib/vc/rmvpe.py +0 -0
- lib/vc/settings.py +103 -0
- lib/vc/utils.py +84 -0
- vc_infer_pipeline.py → lib/vc/vc_infer_pipeline.py +1 -1
- requirements.txt +3 -1
README.md
DELETED
@@ -1,13 +0,0 @@
----
-title: RVC Genshin Impact
-emoji: π€
-colorFrom: red
-colorTo: purple
-sdk: gradio
-sdk_version: 3.40.1
-app_file: app.py
-pinned: true
-license: mit
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
CHANGED
@@ -9,24 +9,30 @@ import librosa
 import torch
 import asyncio
 import edge_tts
-import yt_dlp
-import ffmpeg
-import subprocess
 import sys
 import io
-
+
 from datetime import datetime
-from …
+from lib.config.config import Config
+from lib.vc.vc_infer_pipeline import VC
+from lib.vc.settings import change_audio_mode
+from lib.vc.audio import load_audio
 from lib.infer_pack.models import (
     SynthesizerTrnMs256NSFsid,
     SynthesizerTrnMs256NSFsid_nono,
     SynthesizerTrnMs768NSFsid,
     SynthesizerTrnMs768NSFsid_nono,
 )
-from …
-
+from lib.vc.utils import (
+    combine_vocal_and_inst,
+    cut_vocal_and_inst,
+    download_audio,
+    load_hubert
+)
+
 config = Config()
 logging.getLogger("numba").setLevel(logging.WARNING)
+logger = logging.getLogger(__name__)
 spaces = os.getenv("SYSTEM") == "spaces"
 force_support = None
 if config.unsupported is False:
@@ -38,6 +44,7 @@ else:
 audio_mode = []
 f0method_mode = []
 f0method_info = ""
+hubert_model = load_hubert(config)
 
 if force_support is False or spaces is True:
     if spaces is True:
@@ -71,11 +78,15 @@ def create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, file_index):
     ):
         try:
             logs = []
-
+            logger.info(f"Converting using {model_name}...")
             logs.append(f"Converting using {model_name}...")
             yield "\n".join(logs), None
+            logger.info(vc_audio_mode)
             if vc_audio_mode == "Input path" or "Youtube" and vc_input != "":
-                audio …
+                audio = load_audio(vc_input, 16000)
+                audio_max = np.abs(audio).max() / 0.95
+                if audio_max > 1:
+                    audio /= audio_max
             elif vc_audio_mode == "Upload audio":
                 if vc_upload is None:
                     return "You need to upload an audio", None
@@ -93,9 +104,11 @@ def create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, file_index):
                 return "Text is too long", None
             if tts_text is None or tts_voice is None:
                 return "You need to enter text and select a voice", None
-
-
-
+            os.makedirs("output", exist_ok=True)
+            os.makedirs(os.path.join("output", "tts"), exist_ok=True)
+            asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save(os.path.join("output", "tts", "tts.mp3")))
+            audio, sr = librosa.load(os.path.join("output", "tts", "tts.mp3"), sr=16000, mono=True)
+            vc_input = os.path.join("output", "tts", "tts.mp3")
             times = [0, 0, 0]
             f0_up_key = int(f0_up_key)
             audio_opt = vc.pipeline(
@@ -120,22 +133,20 @@ def create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, file_index):
                 f0_file=None,
             )
             info = f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
-
+            logger.info(f"{model_name} | {info}")
             logs.append(f"Successfully Convert {model_name}\n{info}")
             yield "\n".join(logs), (tgt_sr, audio_opt)
         except Exception as err:
             info = traceback.format_exc()
-
-
+            logger.error(info)
+            logger.error(f"Error when using {model_name}.\n{str(err)}")
             yield info, None
     return vc_fn
 
 def load_model():
     categories = []
+    category_count = 0
     if os.path.isfile("weights/folder_info.json"):
-        for _, w_dirs, _ in os.walk(f"weights"):
-            category_count_total = len(w_dirs)
-            category_count = 1
         with open("weights/folder_info.json", "r", encoding="utf-8") as f:
             folder_info = json.load(f)
         for category_name, category_info in folder_info.items():
@@ -144,11 +155,7 @@ def load_model():
             category_title = category_info['title']
             category_folder = category_info['folder_path']
             description = category_info['description']
-            print(f"Load {category_title} [{category_count}/{category_count_total}]")
             models = []
-            for _, m_dirs, _ in os.walk(f"weights/{category_folder}"):
-                model_count_total = len(m_dirs)
-                model_count = 1
             with open(f"weights/{category_folder}/model_info.json", "r", encoding="utf-8") as f:
                 models_info = json.load(f)
             for character_name, info in models_info.items():
@@ -177,15 +184,14 @@ def load_model():
                     net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
                     model_version = "V2"
                 del net_g.enc_q
-
+                logger.info(net_g.load_state_dict(cpt["weight"], strict=False))
                 net_g.eval().to(config.device)
                 if config.is_half:
                     net_g = net_g.half()
                 else:
                     net_g = net_g.float()
                 vc = VC(tgt_sr, config)
-
-                model_count += 1
+                logger.info(f"Model loaded: {character_name} / {info['feature_retrieval_library']} | ({model_version})")
                 models.append((character_name, model_title, model_author, model_cover, model_version, create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, model_index)))
             category_count += 1
             categories.append([category_title, description, models])
@@ -197,7 +203,7 @@ def load_model():
             pth_files = glob.glob(f"weights/{sub_dir}/*.pth")
             index_files = glob.glob(f"weights/{sub_dir}/*.index")
             if pth_files == []:
-
+                logger.debug(f"Model [{model_count}/{len(w_dirs)}]: No Model file detected, skipping...")
                 continue
             cpt = torch.load(pth_files[0])
             tgt_sr = cpt["config"][-1]
@@ -217,7 +223,7 @@ def load_model():
                 net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
                 model_version = "V2"
             del net_g.enc_q
-
+            logger.info(net_g.load_state_dict(cpt["weight"], strict=False))
             net_g.eval().to(config.device)
             if config.is_half:
                 net_g = net_g.half()
@@ -225,13 +231,13 @@ def load_model():
                 net_g = net_g.float()
             vc = VC(tgt_sr, config)
             if index_files == []:
-
+                logger.warning("No Index file detected!")
                 index_info = "None"
                 model_index = ""
             else:
                 index_info = index_files[0]
                 model_index = index_files[0]
-
+            logger.info(f"Model loaded [{model_count}/{len(w_dirs)}]: {index_files[0]} / {index_info} | ({model_version})")
             model_count += 1
             models.append((index_files[0][:-4], index_files[0][:-4], "", "", model_version, create_vc_fn(index_files[0], tgt_sr, net_g, vc, if_f0, version, model_index)))
         categories.append(["Models", "", models])
@@ -239,202 +245,16 @@ def load_model():
         categories = []
     return categories
 
-def download_audio(url, audio_provider):
-    logs = []
-    if url == "":
-        logs.append("URL required!")
-        yield None, "\n".join(logs)
-        return None, "\n".join(logs)
-    if not os.path.exists("dl_audio"):
-        os.mkdir("dl_audio")
-    if audio_provider == "Youtube":
-        logs.append("Downloading the audio...")
-        yield None, "\n".join(logs)
-        ydl_opts = {
-            'noplaylist': True,
-            'format': 'bestaudio/best',
-            'postprocessors': [{
-                'key': 'FFmpegExtractAudio',
-                'preferredcodec': 'wav',
-            }],
-            "outtmpl": 'dl_audio/audio',
-        }
-        audio_path = "dl_audio/audio.wav"
-        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-            ydl.download([url])
-        logs.append("Download Complete.")
-        yield audio_path, "\n".join(logs)
-
-def cut_vocal_and_inst(split_model):
-    logs = []
-    logs.append("Starting the audio splitting process...")
-    yield "\n".join(logs), None, None, None
-    command = f"demucs --two-stems=vocals -n {split_model} dl_audio/audio.wav -o output"
-    result = subprocess.Popen(command.split(), stdout=subprocess.PIPE, text=True)
-    for line in result.stdout:
-        logs.append(line)
-        yield "\n".join(logs), None, None, None
-    print(result.stdout)
-    vocal = f"output/{split_model}/audio/vocals.wav"
-    inst = f"output/{split_model}/audio/no_vocals.wav"
-    logs.append("Audio splitting complete.")
-    yield "\n".join(logs), vocal, inst, vocal
-
-def combine_vocal_and_inst(audio_data, vocal_volume, inst_volume, split_model):
-    if not os.path.exists("output/result"):
-        os.mkdir("output/result")
-    vocal_path = "output/result/output.wav"
-    output_path = "output/result/combine.mp3"
-    inst_path = f"output/{split_model}/audio/no_vocals.wav"
-    with wave.open(vocal_path, "w") as wave_file:
-        wave_file.setnchannels(1)
-        wave_file.setsampwidth(2)
-        wave_file.setframerate(audio_data[0])
-        wave_file.writeframes(audio_data[1].tobytes())
-    command = f'ffmpeg -y -i {inst_path} -i {vocal_path} -filter_complex [0:a]volume={inst_volume}[i];[1:a]volume={vocal_volume}[v];[i][v]amix=inputs=2:duration=longest[a] -map [a] -b:a 320k -c:a libmp3lame {output_path}'
-    result = subprocess.run(command.split(), stdout=subprocess.PIPE)
-    print(result.stdout.decode())
-    return output_path
-
-def load_hubert():
-    global hubert_model
-    models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
-        ["hubert_base.pt"],
-        suffix="",
-    )
-    hubert_model = models[0]
-    hubert_model = hubert_model.to(config.device)
-    if config.is_half:
-        hubert_model = hubert_model.half()
-    else:
-        hubert_model = hubert_model.float()
-    hubert_model.eval()
-
-def change_audio_mode(vc_audio_mode):
-    if vc_audio_mode == "Input path":
-        return (
-            # Input & Upload
-            gr.Textbox.update(visible=True),
-            gr.Checkbox.update(visible=False),
-            gr.Audio.update(visible=False),
-            # Youtube
-            gr.Dropdown.update(visible=False),
-            gr.Textbox.update(visible=False),
-            gr.Textbox.update(visible=False),
-            gr.Button.update(visible=False),
-            # Splitter
-            gr.Dropdown.update(visible=False),
-            gr.Textbox.update(visible=False),
-            gr.Button.update(visible=False),
-            gr.Audio.update(visible=False),
-            gr.Audio.update(visible=False),
-            gr.Audio.update(visible=False),
-            gr.Slider.update(visible=False),
-            gr.Slider.update(visible=False),
-            gr.Audio.update(visible=False),
-            gr.Button.update(visible=False),
-            # TTS
-            gr.Textbox.update(visible=False),
-            gr.Dropdown.update(visible=False)
-        )
-    elif vc_audio_mode == "Upload audio":
-        return (
-            # Input & Upload
-            gr.Textbox.update(visible=False),
-            gr.Checkbox.update(visible=True),
-            gr.Audio.update(visible=True),
-            # Youtube
-            gr.Dropdown.update(visible=False),
-            gr.Textbox.update(visible=False),
-            gr.Textbox.update(visible=False),
-            gr.Button.update(visible=False),
-            # Splitter
-            gr.Dropdown.update(visible=False),
-            gr.Textbox.update(visible=False),
-            gr.Button.update(visible=False),
-            gr.Audio.update(visible=False),
-            gr.Audio.update(visible=False),
-            gr.Audio.update(visible=False),
-            gr.Slider.update(visible=False),
-            gr.Slider.update(visible=False),
-            gr.Audio.update(visible=False),
-            gr.Button.update(visible=False),
-            # TTS
-            gr.Textbox.update(visible=False),
-            gr.Dropdown.update(visible=False)
-        )
-    elif vc_audio_mode == "Youtube":
-        return (
-            # Input & Upload
-            gr.Textbox.update(visible=False),
-            gr.Checkbox.update(visible=False),
-            gr.Audio.update(visible=False),
-            # Youtube
-            gr.Dropdown.update(visible=True),
-            gr.Textbox.update(visible=True),
-            gr.Textbox.update(visible=True),
-            gr.Button.update(visible=True),
-            # Splitter
-            gr.Dropdown.update(visible=True),
-            gr.Textbox.update(visible=True),
-            gr.Button.update(visible=True),
-            gr.Audio.update(visible=True),
-            gr.Audio.update(visible=True),
-            gr.Audio.update(visible=True),
-            gr.Slider.update(visible=True),
-            gr.Slider.update(visible=True),
-            gr.Audio.update(visible=True),
-            gr.Button.update(visible=True),
-            # TTS
-            gr.Textbox.update(visible=False),
-            gr.Dropdown.update(visible=False)
-        )
-    elif vc_audio_mode == "TTS Audio":
-        return (
-            # Input & Upload
-            gr.Textbox.update(visible=False),
-            gr.Checkbox.update(visible=False),
-            gr.Audio.update(visible=False),
-            # Youtube
-            gr.Dropdown.update(visible=False),
-            gr.Textbox.update(visible=False),
-            gr.Textbox.update(visible=False),
-            gr.Button.update(visible=False),
-            # Splitter
-            gr.Dropdown.update(visible=False),
-            gr.Textbox.update(visible=False),
-            gr.Button.update(visible=False),
-            gr.Audio.update(visible=False),
-            gr.Audio.update(visible=False),
-            gr.Audio.update(visible=False),
-            gr.Slider.update(visible=False),
-            gr.Slider.update(visible=False),
-            gr.Audio.update(visible=False),
-            gr.Button.update(visible=False),
-            # TTS
-            gr.Textbox.update(visible=True),
-            gr.Dropdown.update(visible=True)
-        )
-
-def use_microphone(microphone):
-    if microphone == True:
-        return gr.Audio.update(source="microphone")
-    else:
-        return gr.Audio.update(source="upload")
-
 if __name__ == '__main__':
-    load_hubert()
     categories = load_model()
     tts_voice_list = asyncio.new_event_loop().run_until_complete(edge_tts.list_voices())
     voices = [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list]
     with gr.Blocks() as app:
         gr.Markdown(
             "<div align='center'>\n\n"+
-            "# RVC …
-            "…
-            "…
-            "</div>\n\n"+
-            "[![Repository](https://img.shields.io/badge/Github-Multi%20Model%20RVC%20Inference-blue?style=for-the-badge&logo=github)](https://github.com/ArkanDash/Multi-Model-RVC-Inference)"
+            "# Multi Model RVC Inference\n\n"+
+            "[![Repository](https://img.shields.io/badge/Github-Multi%20Model%20RVC%20Inference-blue?style=for-the-badge&logo=github)](https://github.com/ArkanDash/Multi-Model-RVC-Inference)\n\n"+
+            "</div>"
         )
         if categories == []:
             gr.Markdown(
@@ -471,8 +291,7 @@ if __name__ == '__main__':
                     # Input
                     vc_input = gr.Textbox(label="Input audio path", visible=False)
                     # Upload
-
-                    vc_upload = gr.Audio(label="Upload audio file", source="upload", visible=True, interactive=True)
+                    vc_upload = gr.Audio(label="Upload audio file", sources=["upload", "microphone"], visible=True, interactive=True)
                     # Youtube
                     vc_download_audio = gr.Dropdown(label="Provider", choices=["Youtube"], allow_custom_value=False, visible=False, value="Youtube", info="Select provider (Default: Youtube)")
                     vc_link = gr.Textbox(label="Youtube URL", visible=False, info="Example: https://www.youtube.com/watch?v=Nc0sB1Bmf-A", placeholder="https://www.youtube.com/watch?v=...")
@@ -574,7 +393,6 @@ if __name__ == '__main__':
                     # Input
                     vc_input = gr.Textbox(label="Input audio path", visible=False)
                     # Upload
-                    vc_microphone_mode = gr.Checkbox(label="Use Microphone", value=False, visible=True, interactive=True)
                     vc_upload = gr.Audio(label="Upload audio file", source="upload", visible=True, interactive=True)
                     # Youtube
                     vc_download_audio = gr.Dropdown(label="Provider", choices=["Youtube"], allow_custom_value=False, visible=False, value="Youtube", info="Select provider (Default: Youtube)")
@@ -702,17 +520,11 @@ if __name__ == '__main__':
                         inputs=[vc_output, vc_vocal_volume, vc_inst_volume, vc_split_model],
                         outputs=[vc_combined_output]
                     )
-                    vc_microphone_mode.change(
-                        fn=use_microphone,
-                        inputs=vc_microphone_mode,
-                        outputs=vc_upload
-                    )
                    vc_audio_mode.change(
                        fn=change_audio_mode,
                        inputs=[vc_audio_mode],
                        outputs=[
                            vc_input,
-                            vc_microphone_mode,
                            vc_upload,
                            vc_download_audio,
                            vc_link,
@@ -732,4 +544,10 @@ if __name__ == '__main__':
                            tts_voice
                        ]
                    )
-    app.queue(…
+    app.queue(
+        max_size=20,
+        api_open=config.api,
+    ).launch(
+        share=config.share,
+        max_threads=1,
+    )
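One caveat worth flagging in the hunk above: the condition `if vc_audio_mode == "Input path" or "Youtube" and vc_input != "":` is carried over verbatim from the old code (new line 85), and Python's operator precedence means it is not the membership test it looks like. A minimal sketch of the pitfall and a hypothetical fix (not part of this commit):

```python
# The condition parses as:
#   (vc_audio_mode == "Input path") or ("Youtube" and vc_input != "")
# "Youtube" is a non-empty string and therefore always truthy, so the
# branch can fire for any mode whenever vc_input is non-empty.
vc_audio_mode = "Upload audio"  # example value
vc_input = "some/path.wav"      # example value

buggy = vc_audio_mode == "Input path" or "Youtube" and vc_input != ""
fixed = vc_audio_mode in ("Input path", "Youtube") and vc_input != ""

print(buggy)  # True  -- fires even though the mode is "Upload audio"
print(fixed)  # False -- only the two intended modes pass
```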
hubert_base.pt → assets/hubert/hubert_base.pt
RENAMED
File without changes
assets/hubert/req-hubert.txt
ADDED
@@ -0,0 +1 @@
+put hubert_base.pt here
assets/rvmpe/req-rvmpe.txt
ADDED
@@ -0,0 +1,2 @@
+this is optional for the pitch extraction algorithm
+put rmvpe.pt here
rmvpe.pt → assets/rvmpe/rmvpe.pt
RENAMED
File without changes
config.py → lib/config/config.py
RENAMED
@@ -13,7 +13,7 @@ class Config:
         (
             self.share,
             self.api,
-            self.unsupported
+            self.unsupported,
         ) = self.arg_parse()
         self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
 
@@ -28,7 +28,7 @@ class Config:
         return (
             cmd_opts.share,
             cmd_opts.api,
-            cmd_opts.unsupported
+            cmd_opts.unsupported,
         )
 
     # has_mps is only available in nightly pytorch (for now) and macOS 12.3+.
lib/vc/audio.py
ADDED
@@ -0,0 +1,73 @@
+import os
+import traceback
+
+import librosa
+import numpy as np
+import av
+from io import BytesIO
+
+
+def wav2(i, o, format):
+    inp = av.open(i, "rb")
+    if format == "m4a":
+        format = "mp4"
+    out = av.open(o, "wb", format=format)
+    if format == "ogg":
+        format = "libvorbis"
+    if format == "mp4":
+        format = "aac"
+
+    ostream = out.add_stream(format)
+
+    for frame in inp.decode(audio=0):
+        for p in ostream.encode(frame):
+            out.mux(p)
+
+    for p in ostream.encode(None):
+        out.mux(p)
+
+    out.close()
+    inp.close()
+
+
+def audio2(i, o, format, sr):
+    inp = av.open(i, "rb")
+    out = av.open(o, "wb", format=format)
+    if format == "ogg":
+        format = "libvorbis"
+    if format == "f32le":
+        format = "pcm_f32le"
+
+    ostream = out.add_stream(format, channels=1)
+    ostream.sample_rate = sr
+
+    for frame in inp.decode(audio=0):
+        for p in ostream.encode(frame):
+            out.mux(p)
+
+    out.close()
+    inp.close()
+
+
+def load_audio(file, sr):
+    file = (
+        file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
+    )  # guard against copied paths with stray spaces, quotes, or newlines
+    if os.path.exists(file) == False:
+        raise RuntimeError(
+            "You input a wrong audio path that does not exists, please fix it!"
+        )
+    try:
+        with open(file, "rb") as f:
+            with BytesIO() as out:
+                audio2(f, out, "f32le", sr)
+                return np.frombuffer(out.getvalue(), np.float32).flatten()
+
+    except AttributeError:
+        audio = file[1] / 32768.0
+        if len(audio.shape) == 2:
+            audio = np.mean(audio, -1)
+        return librosa.resample(audio, orig_sr=file[0], target_sr=16000)
+
+    except:
+        raise RuntimeError(traceback.format_exc())
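For context, a minimal usage sketch of the new `load_audio` helper, mirroring the normalization that app.py applies after it (the `test.wav` path is an assumption for illustration):

```python
import numpy as np
from lib.vc.audio import load_audio

# load_audio decodes any ffmpeg-readable file through PyAV into a mono
# float32 array; 16000 Hz is the rate the HuBERT feature extractor expects.
audio = load_audio("test.wav", 16000)  # "test.wav" is a hypothetical local file

# Peak-normalize with ~5% headroom, mirroring the check added in app.py.
audio_max = np.abs(audio).max() / 0.95
if audio_max > 1:
    audio = audio / audio_max
```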
rmvpe.py → lib/vc/rmvpe.py
RENAMED
File without changes
lib/vc/settings.py
ADDED
@@ -0,0 +1,103 @@
+import gradio as gr
+
+def change_audio_mode(vc_audio_mode):
+    if vc_audio_mode == "Input path":
+        return (
+            # Input & Upload
+            gr.Textbox(visible=True),
+            gr.Audio(visible=False),
+            # Youtube
+            gr.Dropdown(visible=False),
+            gr.Textbox(visible=False),
+            gr.Textbox(visible=False),
+            gr.Button(visible=False),
+            # Splitter
+            gr.Dropdown(visible=False),
+            gr.Textbox(visible=False),
+            gr.Button(visible=False),
+            gr.Audio(visible=False),
+            gr.Audio(visible=False),
+            gr.Audio(visible=False),
+            gr.Slider(visible=False),
+            gr.Slider(visible=False),
+            gr.Audio(visible=False),
+            gr.Button(visible=False),
+            # TTS
+            gr.Textbox(visible=False),
+            gr.Dropdown(visible=False)
+        )
+    elif vc_audio_mode == "Upload audio":
+        return (
+            # Input & Upload
+            gr.Textbox(visible=False),
+            gr.Audio(visible=True),
+            # Youtube
+            gr.Dropdown(visible=False),
+            gr.Textbox(visible=False),
+            gr.Textbox(visible=False),
+            gr.Button(visible=False),
+            # Splitter
+            gr.Dropdown(visible=False),
+            gr.Textbox(visible=False),
+            gr.Button(visible=False),
+            gr.Audio(visible=False),
+            gr.Audio(visible=False),
+            gr.Audio(visible=False),
+            gr.Slider(visible=False),
+            gr.Slider(visible=False),
+            gr.Audio(visible=False),
+            gr.Button(visible=False),
+            # TTS
+            gr.Textbox(visible=False),
+            gr.Dropdown(visible=False)
+        )
+    elif vc_audio_mode == "Youtube":
+        return (
+            # Input & Upload
+            gr.Textbox(visible=False),
+            gr.Audio(visible=False),
+            # Youtube
+            gr.Dropdown(visible=True),
+            gr.Textbox(visible=True),
+            gr.Textbox(visible=True),
+            gr.Button(visible=True),
+            # Splitter
+            gr.Dropdown(visible=True),
+            gr.Textbox(visible=True),
+            gr.Button(visible=True),
+            gr.Audio(visible=True),
+            gr.Audio(visible=True),
+            gr.Audio(visible=True),
+            gr.Slider(visible=True),
+            gr.Slider(visible=True),
+            gr.Audio(visible=True),
+            gr.Button(visible=True),
+            # TTS
+            gr.Textbox(visible=False),
+            gr.Dropdown(visible=False)
+        )
+    elif vc_audio_mode == "TTS Audio":
+        return (
+            # Input & Upload
+            gr.Textbox(visible=False),
+            gr.Audio(visible=False),
+            # Youtube
+            gr.Dropdown(visible=False),
+            gr.Textbox(visible=False),
+            gr.Textbox(visible=False),
+            gr.Button(visible=False),
+            # Splitter
+            gr.Dropdown(visible=False),
+            gr.Textbox(visible=False),
+            gr.Button(visible=False),
+            gr.Audio(visible=False),
+            gr.Audio(visible=False),
+            gr.Audio(visible=False),
+            gr.Slider(visible=False),
+            gr.Slider(visible=False),
+            gr.Audio(visible=False),
+            gr.Button(visible=False),
+            # TTS
+            gr.Textbox(visible=True),
+            gr.Dropdown(visible=True)
+        )
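The `gr.Component.update(...)` calls from the old app.py become plain component constructors here: in Gradio 4, returning a component instance from an event handler updates the bound output's properties, and `sources=["upload", "microphone"]` on `gr.Audio` replaces the old `source=...` plus the `use_microphone` toggle. A minimal, self-contained sketch of the pattern (hypothetical demo, not part of the commit):

```python
import gradio as gr

def toggle(mode):
    # In Gradio 4, returning component instances from a handler updates
    # the matching outputs' properties (here: visibility).
    return (
        gr.Textbox(visible=(mode == "Input path")),
        gr.Audio(visible=(mode == "Upload audio")),
    )

with gr.Blocks() as demo:
    mode = gr.Radio(["Input path", "Upload audio"], value="Input path", label="Mode")
    path_box = gr.Textbox(label="Input audio path")
    upload = gr.Audio(label="Upload audio file", sources=["upload", "microphone"])
    mode.change(fn=toggle, inputs=mode, outputs=[path_box, upload])

demo.launch()
```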
lib/vc/utils.py
ADDED
@@ -0,0 +1,84 @@
+import os
+import wave
+import subprocess
+import yt_dlp
+import ffmpeg
+import logging
+from fairseq import checkpoint_utils
+logger = logging.getLogger(__name__)
+
+def load_hubert(config):
+    path_check = os.path.exists("assets/hubert/hubert_base.pt")
+    if path_check is False:
+        logger.warn("hubert_base.pt is missing. Please check the documentation to get it.")
+    else:
+        logger.info("hubert_base.pt found.")
+    models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
+        [os.path.join("assets", "hubert", "hubert_base.pt")],
+        suffix="",
+    )
+    hubert_model = models[0]
+    hubert_model = hubert_model.to(config.device)
+    if config.is_half:
+        hubert_model = hubert_model.half()
+    else:
+        hubert_model = hubert_model.float()
+    hubert_model.eval()
+    return hubert_model
+
+def download_audio(url, audio_provider):
+    logs = []
+    if url == "":
+        logs.append("URL required!")
+        yield None, "\n".join(logs)
+        return None, "\n".join(logs)
+    if not os.path.exists("yt"):
+        os.mkdir("yt")
+    if audio_provider == "Youtube":
+        logs.append("Downloading the audio...")
+        yield None, "\n".join(logs)
+        ydl_opts = {
+            'noplaylist': True,
+            'format': 'bestaudio/best',
+            'postprocessors': [{
+                'key': 'FFmpegExtractAudio',
+                'preferredcodec': 'wav',
+            }],
+            "outtmpl": 'yt/audio',
+        }
+        audio_path = "yt/audio.wav"
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            ydl.download([url])
+        logs.append("Download Complete.")
+        yield audio_path, "\n".join(logs)
+
+def cut_vocal_and_inst(split_model):
+    logs = []
+    logs.append("Starting the audio splitting process...")
+    yield "\n".join(logs), None, None, None
+    command = f"demucs --two-stems=vocals -n {split_model} yt/audio.wav -o output"
+    result = subprocess.Popen(command.split(), stdout=subprocess.PIPE, text=True)
+    for line in result.stdout:
+        logs.append(line)
+        yield "\n".join(logs), None, None, None
+    logger.info(result.stdout)
+    vocal = f"output/{split_model}/audio/vocals.wav"
+    inst = f"output/{split_model}/audio/no_vocals.wav"
+    logs.append("Audio splitting complete.")
+    yield "\n".join(logs), vocal, inst, vocal
+
+def combine_vocal_and_inst(audio_data, vocal_volume, inst_volume, split_model):
+    if not os.path.exists("output/result"):
+        os.mkdir("output/result")
+    vocal_path = "output/result/output.wav"
+    output_path = "output/result/combine.mp3"
+    inst_path = f"output/{split_model}/audio/no_vocals.wav"
+    with wave.open(vocal_path, "w") as wave_file:
+        wave_file.setnchannels(1)
+        wave_file.setsampwidth(2)
+        wave_file.setframerate(audio_data[0])
+        wave_file.writeframes(audio_data[1].tobytes())
+    command = f'ffmpeg -y -i {inst_path} -i {vocal_path} -filter_complex [0:a]volume={inst_volume}[i];[1:a]volume={vocal_volume}[v];[i][v]amix=inputs=2:duration=longest[a] -map [a] -b:a 320k -c:a libmp3lame {output_path}'
+    result = subprocess.run(command.split(), stdout=subprocess.PIPE)
+    logger.info(result.stdout.decode())
+    return output_path
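One pitfall carried over into `cut_vocal_and_inst` and `combine_vocal_and_inst`: building the command line with `command.split()` breaks as soon as any path contains a space. A hypothetical rewrite of the mix step using an explicit argument list (not part of this commit):

```python
import subprocess

def mix(inst_path, vocal_path, output_path, inst_volume=1.0, vocal_volume=1.0):
    # An explicit argv list survives spaces in file names, unlike str.split().
    args = [
        "ffmpeg", "-y",
        "-i", inst_path,
        "-i", vocal_path,
        "-filter_complex",
        f"[0:a]volume={inst_volume}[i];[1:a]volume={vocal_volume}[v];"
        "[i][v]amix=inputs=2:duration=longest[a]",
        "-map", "[a]", "-b:a", "320k", "-c:a", "libmp3lame",
        output_path,
    ]
    subprocess.run(args, check=True)
```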
vc_infer_pipeline.py → lib/vc/vc_infer_pipeline.py
RENAMED
@@ -133,7 +133,7 @@ class VC(object):
 
             print("loading rmvpe model")
             self.model_rmvpe = RMVPE(
-                "rmvpe.pt", is_half=self.is_half, device=self.device
+                os.path.join("assets", "rvmpe", "rmvpe.pt"), is_half=self.is_half, device=self.device
             )
         f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
         f0 *= pow(2, f0_up_key / 12)
requirements.txt
CHANGED
@@ -7,7 +7,7 @@ scipy==1.9.3
 librosa==0.9.1
 fairseq==0.12.2
 faiss-cpu==1.7.3
-gradio
+gradio>=4.19.2
 pyworld==0.3.2
 soundfile>=0.12.1
 praat-parselmouth>=0.4.2
@@ -19,3 +19,5 @@ onnxruntime
 demucs
 edge-tts
 yt_dlp
+pytube
+av