import json
import os
import re

import requests
import torch

# Run the separation models on the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Default headers for search requests (browser-like user agent).
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
}

# Matches a protocol-relative video link ("//www.bilibili.com/video...") up to
# the next double quote.
pattern = r'//www\.bilibili\.com/video[^"]*'

# Seconds to wait for a server before giving up; without a timeout a stalled
# connection would block the caller forever.
REQUEST_TIMEOUT = 30


def get_bilibili_video_id(url):
    """Return the alphanumeric video ID that follows ``/video/`` in *url*.

    The ID does not need to be followed by a slash, so both
    ``.../video/BVxxxx/`` and ``.../video/BVxxxx?p=1`` are accepted.

    Raises:
        ValueError: if *url* contains no ``/video/<id>`` segment.
    """
    match = re.search(r'/video/([a-zA-Z0-9]+)', url)
    if match is None:
        raise ValueError(f"No bilibili video ID found in URL: {url!r}")
    return match.group(1)


# Get bilibili audio

def find_first_appearance_with_neighborhood(text, pattern):
    """Return the first substring of *text* matching *pattern*, or ``None``."""
    match = re.search(pattern, text)
    return match.group() if match else None


def search_bilibili(keyword):
    """Search bilibili for *keyword* and return the first video link found.

    Keywords starting with ``"BV"`` are searched with ``duration=1`` only;
    any other keyword additionally sends ``tids=3`` and ``page=1``.

    Returns:
        The first video URL on the results page, prefixed with ``https:``.

    Raises:
        ValueError: if the results page contains no video link.
    """
    # Passing the query through ``params`` lets requests URL-encode keywords
    # containing spaces, '&', '#', etc.
    params = {"keyword": keyword, "duration": 1}
    if not keyword.startswith("BV"):
        params.update({"tids": 3, "page": 1})
    html = requests.get(
        "https://search.bilibili.com/all",
        params=params,
        headers=headers,
        timeout=REQUEST_TIMEOUT,
    ).text
    link = find_first_appearance_with_neighborhood(html, pattern)
    if link is None:
        raise ValueError(f"No bilibili video found for keyword: {keyword!r}")
    return "https:" + link


def get_response(html_url):
    """GET *html_url* with a bilibili referer and browser user agent.

    Returns:
        The ``requests.Response`` object (status is not checked here).
    """
    request_headers = {
        "referer": "https://www.bilibili.com/",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
    }
    return requests.get(html_url, headers=request_headers, timeout=REQUEST_TIMEOUT)


def get_video_info(html_url):
    """Extract the audio and video stream URLs from a bilibili video page.

    Returns:
        ``(audio_url, video_url)``. The audio URL is the first ``backupUrl``
        of the first audio stream when one is listed, otherwise its
        ``baseUrl``; the video URL is the ``baseUrl`` of the first video stream.

    Raises:
        ValueError: if the page carries no embedded player info.
        KeyError / IndexError: if the player info lacks the expected streams.
    """
    response = get_response(html_url)
    # NOTE(review): the pattern here was an empty string, which always yields
    # '' and makes json.loads fail. It is restored to the player-info script
    # tag this kind of page is expected to embed -- confirm against a live page.
    matches = re.findall(
        r'<script>window\.__playinfo__=(.*?)</script>', response.text
    )
    if not matches:
        raise ValueError(f"No player info found in page: {html_url!r}")
    json_data = json.loads(matches[0])

    dash = json_data['data']['dash']
    first_audio = dash['audio'][0]
    # A missing, null or empty backup list all fall back to the base URL.
    backup_urls = first_audio.get('backupUrl')
    audio_url = backup_urls[0] if backup_urls else first_audio['baseUrl']
    video_url = dash['video'][0]['baseUrl']
    return audio_url, video_url


def save_audio(title, html_url):
    """Download the audio stream of the video page and save it as ``<title>.mp3``.

    The bytes are written exactly as received; only the file name carries
    the ``.mp3`` suffix. Downloading the video stream is intentionally disabled.
    """
    audio_url = get_video_info(html_url)[0]
    audio_content = get_response(audio_url).content
    with open(title + '.mp3', mode='wb') as f:
        f.write(audio_content)
    print("音乐内容保存完成")
#with open(title + '.mp4', mode='wb') as f: # f.write(video_content) #print("视频内容保存完成" from uvr5.vr import AudioPre weight_uvr5_root = "uvr5/uvr_model" uvr5_names = [] for name in os.listdir(weight_uvr5_root): if name.endswith(".pth") or "onnx" in name: uvr5_names.append(name.replace(".pth", "")) func = AudioPre pre_fun_hp2 = func( agg=int(10), model_path=os.path.join(weight_uvr5_root, "UVR-HP2.pth"), device=device, is_half=True, ) pre_fun_hp5 = func( agg=int(10), model_path=os.path.join(weight_uvr5_root, "UVR-HP5.pth"), device=device, is_half=True, ) import webrtcvad from pydub import AudioSegment from pydub.utils import make_chunks import os import librosa import soundfile import gradio as gr def vad(audio_name): audio = AudioSegment.from_file(audio_name, format="wav") # Set the desired sample rate (WebRTC VAD supports only 8000, 16000, 32000, or 48000 Hz) audio = audio.set_frame_rate(48000) # Set single channel (mono) audio = audio.set_channels(1) # Initialize VAD vad = webrtcvad.Vad() # Set aggressiveness mode (an integer between 0 and 3, 3 is the most aggressive) vad.set_mode(3) # Convert pydub audio to bytes frame_duration = 30 # Duration of a frame in ms frame_width = int(audio.frame_rate * frame_duration / 1000) # width of a frame in samples frames = make_chunks(audio, frame_duration) # Perform voice activity detection voiced_frames = [] for frame in frames: if len(frame.raw_data) < frame_width * 2: # Ensure frame is correct length break is_speech = vad.is_speech(frame.raw_data, audio.frame_rate) if is_speech: voiced_frames.append(frame) # Combine voiced frames back to an audio segment voiced_audio = sum(voiced_frames, AudioSegment.silent(duration=0)) voiced_audio.export("voiced_audio.wav", format="wav") def youtube_downloader( video_identifier, filename, split_model, start_time ): print(video_identifier) video_info = get_video_info(video_identifier)[0] print(video_info) audio_content = get_response(video_info).content with open(filename.strip() + ".wav", 
mode="wb") as f: f.write(audio_content) audio_path = filename.strip() + ".wav" start_ms = start_time * 1000 end_ms = start_ms + 45000 # make dir output os.makedirs("output", exist_ok=True) if split_model=="UVR-HP2": pre_fun = pre_fun_hp2 else: pre_fun = pre_fun_hp5 audio_orig = AudioSegment.from_file(audio_path) if len(audio_orig) > end_ms: # Extract the segment segment = audio_orig[start_ms:end_ms] segment.export(filename.strip() + ".wav", format="wav") pre_fun._path_audio_(filename.strip() + ".wav", f"./output/{split_model}/{filename}/", f"./output/{split_model}/{filename}/", "wav") os.remove(filename.strip()+".wav") else: segment = audio_orig[start_ms:len(audio_orig)] segment.export(filename.strip() + ".wav", format="wav") pre_fun._path_audio_(filename.strip() + ".wav", f"./output/{split_model}/{filename}/", f"./output/{split_model}/{filename}/", "wav") os.remove(filename.strip()+".wav") return f"./output/{split_model}/{filename}/vocal_{filename}.wav_10.wav", f"./output/{split_model}/{filename}/instrument_{filename}.wav_10.wav" def youtube_downloader_100s( video_identifier, filename, split_model ): print(video_identifier) video_info = get_video_info(video_identifier)[0] print(video_info) audio_content = get_response(video_info).content with open(filename.strip() + ".wav", mode="wb") as f: f.write(audio_content) audio_path = filename.strip() + ".wav" if split_model=="UVR-HP2": pre_fun = pre_fun_hp2 else: pre_fun = pre_fun_hp5 os.makedirs("output", exist_ok=True) audio_orig = AudioSegment.from_file(audio_path) if len(audio_orig) > 200000: start_ms = 30000 end_ms = start_ms + 170000 # Extract the segment segment = audio_orig[start_ms:end_ms] segment.export(filename.strip() + ".wav", format="wav") pre_fun._path_audio_(filename.strip() + ".wav", f"./output/{split_model}/{filename}/", f"./output/{split_model}/{filename}/", "wav") os.remove(filename.strip()+".wav") else: pre_fun._path_audio_(filename.strip() + ".wav", f"./output/{split_model}/{filename}/", 
f"./output/{split_model}/{filename}/", "wav") os.remove(filename.strip()+".wav") return f"./output/{split_model}/{filename}/vocal_{filename}.wav_10.wav", f"./output/{split_model}/{filename}/instrument_{filename}.wav_10.wav" def convert(start_time, song_name_src, song_name_ref, src_audio, ref_audio, check_song, auto_key, key_shift, vocal_vol, inst_vol): split_model = "UVR-HP5" #song_name_ref = song_name_ref.strip().replace(" ", "") #video_identifier = search_bilibili(song_name_ref) #song_id = get_bilibili_video_id(video_identifier) if src_audio is None: song_name_src = song_name_src.strip().replace(" ", "") video_identifier_src = search_bilibili(song_name_src) song_id_src = get_bilibili_video_id(video_identifier_src) audio_src, sr_src = librosa.load(youtube_downloader(video_identifier_src, song_id_src, split_model, start_time)[0], sr=24000, mono=True) soundfile.write("audio_src.wav", audio_src, sr_src) else: src_audio_orig = AudioSegment.from_file(src_audio) if len(src_audio_orig) > 45000: segment = src_audio_orig[0:45000] segment.export("segment.wav", format="wav") multi_channel_audio = AudioSegment.from_file("segment.wav", format="wav") mono_audio = multi_channel_audio.set_channels(1) mono_audio.export("audio_src.wav", format="wav") else: multi_channel_audio = AudioSegment.from_file(src_audio, format="wav") mono_audio = multi_channel_audio.set_channels(1) mono_audio.export("audio_src.wav", format="wav") if ref_audio is None: song_name_ref = song_name_ref.strip().replace(" ", "") video_identifier = search_bilibili(song_name_ref) song_id = get_bilibili_video_id(video_identifier) if os.path.isdir(f"./output/{split_model}/{song_id}")==False: audio, sr = librosa.load(youtube_downloader_100s(video_identifier, song_id, split_model)[0], sr=24000, mono=True) soundfile.write("audio_ref.wav", audio, sr) else: audio, sr = librosa.load(f"./output/{split_model}/{song_id}/vocal_{song_id}.wav_10.wav", sr=24000, mono=True) soundfile.write("audio_ref.wav", audio, sr) 
vad("audio_ref.wav") else: vad(ref_audio) #if os.path.isdir(f"./output/{split_model}/{song_id_src}")==False: #audio_src, sr_src = librosa.load(youtube_downloader(video_identifier_src, song_id_src, split_model, start_time)[0], sr=24000, mono=True) #soundfile.write("audio_src.wav", audio_src, sr_src) #else: # audio_src, sr_src = librosa.load(f"./output/{split_model}/{song_id_src}/vocal_{song_id_src}.wav_10.wav", sr=24000, mono=True) # soundfile.write("audio_src.wav", audio_src, sr_src) if os.path.isfile("output_svc/NeuCoSVCv2.wav"): os.remove("output_svc/NeuCoSVCv2.wav") if check_song == True: if auto_key == True: os.system(f"python inference.py --src_wav_path audio_src.wav --ref_wav_path voiced_audio.wav") else: os.system(f"python inference.py --src_wav_path audio_src.wav --ref_wav_path voiced_audio.wav --key_shift {key_shift}") else: if auto_key == True: os.system(f"python inference.py --src_wav_path audio_src.wav --ref_wav_path voiced_audio.wav --speech_enroll") else: os.system(f"python inference.py --src_wav_path audio_src.wav --ref_wav_path voiced_audio.wav --key_shift {key_shift} --speech_enroll") if src_audio is None: audio_vocal = AudioSegment.from_file("output_svc/NeuCoSVCv2.wav", format="wav") # Load the second audio file audio_inst = AudioSegment.from_file(f"output/{split_model}/{song_id_src}/instrument_{song_id_src}.wav_10.wav", format="wav") audio_vocal = audio_vocal + vocal_vol # Increase volume of the first audio by 5 dB audio_inst = audio_inst + inst_vol # Decrease volume of the second audio by 5 dB # Concatenate audio files combined_audio = audio_vocal.overlay(audio_inst) # Export the concatenated audio to a new file combined_audio.export(f"{song_name_src}-AI翻唱.wav", format="wav") return f"{song_name_src}-AI翻唱.wav" else: return "output_svc/NeuCoSVCv2.wav" app = gr.Blocks() with app: gr.Markdown("#