Faridmaruf committed on
Commit f38d8df
1 Parent(s): ba8ae36

Upload 4 files

Files changed (4):
  1. app.py +516 -0
  2. config.py +117 -0
  3. requirements.txt +21 -0
  4. vc_infer_pipeline.py +443 -0
app.py ADDED
@@ -0,0 +1,516 @@
import os
import glob
import json
import traceback
import logging
import gradio as gr
import numpy as np
import librosa
import torch
import asyncio
import edge_tts
import yt_dlp
import ffmpeg
import subprocess
import sys
import io
import wave
from datetime import datetime
from fairseq import checkpoint_utils
from lib.infer_pack.models import (
    SynthesizerTrnMs256NSFsid,
    SynthesizerTrnMs256NSFsid_nono,
    SynthesizerTrnMs768NSFsid,
    SynthesizerTrnMs768NSFsid_nono,
)
from vc_infer_pipeline import VC
from config import Config
config = Config()
logging.getLogger("numba").setLevel(logging.WARNING)
limitation = os.getenv("SYSTEM") == "spaces"

audio_mode = []
f0method_mode = []
f0method_info = ""
if limitation is True:
    audio_mode = ["Upload audio", "TTS Audio"]
    f0method_mode = ["pm", "harvest"]
    f0method_info = "PM is fast, Harvest is good but extremely slow. (Default: PM)"
else:
    audio_mode = ["Input path", "Upload audio", "Youtube", "TTS Audio"]
    f0method_mode = ["pm", "harvest", "crepe"]
    f0method_info = "PM is fast, Harvest is good but extremely slow, and Crepe effect is good but requires GPU (Default: PM)"

def create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, file_index):
    def vc_fn(
        vc_audio_mode,
        vc_input,
        vc_upload,
        tts_text,
        tts_voice,
        f0_up_key,
        f0_method,
        index_rate,
        filter_radius,
        resample_sr,
        rms_mix_rate,
        protect,
    ):
        try:
            print(f"Converting using {model_name}...")
            if vc_audio_mode in ("Input path", "Youtube") and vc_input != "":
                audio, sr = librosa.load(vc_input, sr=16000, mono=True)
            elif vc_audio_mode == "Upload audio":
                if vc_upload is None:
                    return "You need to upload an audio", None
                sampling_rate, audio = vc_upload
                duration = audio.shape[0] / sampling_rate
                if duration > 20 and limitation:
                    return "Please upload an audio file that is less than 20 seconds. If you need to generate a longer audio file, please use Colab.", None
                audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
                if len(audio.shape) > 1:
                    audio = librosa.to_mono(audio.transpose(1, 0))
                if sampling_rate != 16000:
                    audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
            elif vc_audio_mode == "TTS Audio":
                if len(tts_text) > 100 and limitation:
                    return "Text is too long", None
                if tts_text is None or tts_voice is None:
                    return "You need to enter text and select a voice", None
                asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save("tts.mp3"))
                audio, sr = librosa.load("tts.mp3", sr=16000, mono=True)
                vc_input = "tts.mp3"
            times = [0, 0, 0]
            f0_up_key = int(f0_up_key)
            audio_opt = vc.pipeline(
                hubert_model,
                net_g,
                0,
                audio,
                vc_input,
                times,
                f0_up_key,
                f0_method,
                file_index,
                # file_big_npy,
                index_rate,
                if_f0,
                filter_radius,
                tgt_sr,
                resample_sr,
                rms_mix_rate,
                version,
                protect,
                f0_file=None,
            )
            info = f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
            print(f"{model_name} | {info}")
            return info, (tgt_sr, audio_opt)
        except:
            info = traceback.format_exc()
            print(info)
            return info, None
    return vc_fn

def load_model():
    categories = []
    with open("weights/folder_info.json", "r", encoding="utf-8") as f:
        folder_info = json.load(f)
    for category_name, category_info in folder_info.items():
        if not category_info['enable']:
            continue
        category_title = category_info['title']
        category_folder = category_info['folder_path']
        description = category_info['description']
        models = []
        with open(f"weights/{category_folder}/model_info.json", "r", encoding="utf-8") as f:
            models_info = json.load(f)
        for character_name, info in models_info.items():
            if not info['enable']:
                continue
            model_title = info['title']
            model_name = info['model_path']
            model_author = info.get("author", None)
            model_cover = f"weights/{category_folder}/{character_name}/{info['cover']}"
            model_index = f"weights/{category_folder}/{character_name}/{info['feature_retrieval_library']}"
            cpt = torch.load(f"weights/{category_folder}/{character_name}/{model_name}", map_location="cpu")
            tgt_sr = cpt["config"][-1]
            cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
            if_f0 = cpt.get("f0", 1)
            version = cpt.get("version", "v1")
            if version == "v1":
                if if_f0 == 1:
                    net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
                else:
                    net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
                model_version = "V1"
            elif version == "v2":
                if if_f0 == 1:
                    net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
                else:
                    net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
                model_version = "V2"
            del net_g.enc_q
            print(net_g.load_state_dict(cpt["weight"], strict=False))
            net_g.eval().to(config.device)
            if config.is_half:
                net_g = net_g.half()
            else:
                net_g = net_g.float()
            vc = VC(tgt_sr, config)
            print(f"Model loaded: {character_name} / {info['feature_retrieval_library']} | ({model_version})")
            models.append((character_name, model_title, model_author, model_cover, model_version, create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, model_index)))
        categories.append([category_title, category_folder, description, models])
    return categories

def cut_vocal_and_inst(url, audio_provider, split_model):
    if url != "":
        if not os.path.exists("dl_audio"):
            os.mkdir("dl_audio")
        if audio_provider == "Youtube":
            ydl_opts = {
                'noplaylist': True,
                'format': 'bestaudio/best',
                'postprocessors': [{
                    'key': 'FFmpegExtractAudio',
                    'preferredcodec': 'wav',
                }],
                "outtmpl": 'dl_audio/youtube_audio',
            }
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])
            audio_path = "dl_audio/youtube_audio.wav"
        if split_model == "htdemucs":
            command = f"demucs --two-stems=vocals {audio_path} -o output"
            result = subprocess.run(command.split(), stdout=subprocess.PIPE)
            print(result.stdout.decode())
            return "output/htdemucs/youtube_audio/vocals.wav", "output/htdemucs/youtube_audio/no_vocals.wav", audio_path, "output/htdemucs/youtube_audio/vocals.wav"
        else:
            command = f"demucs --two-stems=vocals -n mdx_extra_q {audio_path} -o output"
            result = subprocess.run(command.split(), stdout=subprocess.PIPE)
            print(result.stdout.decode())
            return "output/mdx_extra_q/youtube_audio/vocals.wav", "output/mdx_extra_q/youtube_audio/no_vocals.wav", audio_path, "output/mdx_extra_q/youtube_audio/vocals.wav"
    else:
        raise gr.Error("URL Required!")
        return None, None, None, None

def combine_vocal_and_inst(audio_data, audio_volume, split_model):
    if not os.path.exists("output/result"):
        os.mkdir("output/result")
    vocal_path = "output/result/output.wav"
    output_path = "output/result/combine.mp3"
    if split_model == "htdemucs":
        inst_path = "output/htdemucs/youtube_audio/no_vocals.wav"
    else:
        inst_path = "output/mdx_extra_q/youtube_audio/no_vocals.wav"
    with wave.open(vocal_path, "w") as wave_file:
        wave_file.setnchannels(1)
        wave_file.setsampwidth(2)
        wave_file.setframerate(audio_data[0])
        wave_file.writeframes(audio_data[1].tobytes())
    command = f'ffmpeg -y -i {inst_path} -i {vocal_path} -filter_complex [1:a]volume={audio_volume}dB[v];[0:a][v]amix=inputs=2:duration=longest -b:a 320k -c:a libmp3lame {output_path}'
    result = subprocess.run(command.split(), stdout=subprocess.PIPE)
    print(result.stdout.decode())
    return output_path

def load_hubert():
    global hubert_model
    models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
        ["hubert_base.pt"],
        suffix="",
    )
    hubert_model = models[0]
    hubert_model = hubert_model.to(config.device)
    if config.is_half:
        hubert_model = hubert_model.half()
    else:
        hubert_model = hubert_model.float()
    hubert_model.eval()

def change_audio_mode(vc_audio_mode):
    if vc_audio_mode == "Input path":
        return (
            # Input & Upload
            gr.Textbox.update(visible=True),
            gr.Checkbox.update(visible=False),
            gr.Audio.update(visible=False),
            # Youtube
            gr.Dropdown.update(visible=False),
            gr.Textbox.update(visible=False),
            gr.Dropdown.update(visible=False),
            gr.Button.update(visible=False),
            gr.Audio.update(visible=False),
            gr.Audio.update(visible=False),
            gr.Audio.update(visible=False),
            gr.Slider.update(visible=False),
            gr.Audio.update(visible=False),
            gr.Button.update(visible=False),
            # TTS
            gr.Textbox.update(visible=False),
            gr.Dropdown.update(visible=False)
        )
    elif vc_audio_mode == "Upload audio":
        return (
            # Input & Upload
            gr.Textbox.update(visible=False),
            gr.Checkbox.update(visible=True),
            gr.Audio.update(visible=True),
            # Youtube
            gr.Dropdown.update(visible=False),
            gr.Textbox.update(visible=False),
            gr.Dropdown.update(visible=False),
            gr.Button.update(visible=False),
            gr.Audio.update(visible=False),
            gr.Audio.update(visible=False),
            gr.Audio.update(visible=False),
            gr.Slider.update(visible=False),
            gr.Audio.update(visible=False),
            gr.Button.update(visible=False),
            # TTS
            gr.Textbox.update(visible=False),
            gr.Dropdown.update(visible=False)
        )
    elif vc_audio_mode == "Youtube":
        return (
            # Input & Upload
            gr.Textbox.update(visible=False),
            gr.Checkbox.update(visible=False),
            gr.Audio.update(visible=False),
            # Youtube
            gr.Dropdown.update(visible=True),
            gr.Textbox.update(visible=True),
            gr.Dropdown.update(visible=True),
            gr.Button.update(visible=True),
            gr.Audio.update(visible=True),
            gr.Audio.update(visible=True),
            gr.Audio.update(visible=True),
            gr.Slider.update(visible=True),
            gr.Audio.update(visible=True),
            gr.Button.update(visible=True),
            # TTS
            gr.Textbox.update(visible=False),
            gr.Dropdown.update(visible=False)
        )
    elif vc_audio_mode == "TTS Audio":
        return (
            # Input & Upload
            gr.Textbox.update(visible=False),
            gr.Checkbox.update(visible=False),
            gr.Audio.update(visible=False),
            # Youtube
            gr.Dropdown.update(visible=False),
            gr.Textbox.update(visible=False),
            gr.Dropdown.update(visible=False),
            gr.Button.update(visible=False),
            gr.Audio.update(visible=False),
            gr.Audio.update(visible=False),
            gr.Audio.update(visible=False),
            gr.Slider.update(visible=False),
            gr.Audio.update(visible=False),
            gr.Button.update(visible=False),
            # TTS
            gr.Textbox.update(visible=True),
            gr.Dropdown.update(visible=True)
        )
    else:
        return (
            # Input & Upload
            gr.Textbox.update(visible=False),
            gr.Checkbox.update(visible=True),
            gr.Audio.update(visible=True),
            # Youtube
            gr.Dropdown.update(visible=False),
            gr.Textbox.update(visible=False),
            gr.Dropdown.update(visible=False),
            gr.Button.update(visible=False),
            gr.Audio.update(visible=False),
            gr.Audio.update(visible=False),
            gr.Audio.update(visible=False),
            gr.Slider.update(visible=False),
            gr.Audio.update(visible=False),
            gr.Button.update(visible=False),
            # TTS
            gr.Textbox.update(visible=False),
            gr.Dropdown.update(visible=False)
        )

def use_microphone(microphone):
    if microphone == True:
        return gr.Audio.update(source="microphone")
    else:
        return gr.Audio.update(source="upload")

if __name__ == '__main__':
    load_hubert()
    categories = load_model()
    tts_voice_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())
    voices = [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list]
    with gr.Blocks() as app:
        gr.Markdown(
            "<div align='center'>\n\n"+
            "# Multi Model RVC Inference\n\n"+
            "[![Repository](https://img.shields.io/badge/Github-Multi%20Model%20RVC%20Inference-blue?style=for-the-badge&logo=github)](https://github.com/ArkanDash/Multi-Model-RVC-Inference)\n\n"+
            "</div>"
        )
        for (folder_title, folder, description, models) in categories:
            with gr.TabItem(folder_title):
                if description:
                    gr.Markdown(f"### <center> {description}")
                with gr.Tabs():
                    if not models:
                        gr.Markdown("# <center> No Model Loaded.")
                        gr.Markdown("## <center> Please add model or fix your model path.")
                        continue
                    for (name, title, author, cover, model_version, vc_fn) in models:
                        with gr.TabItem(name):
                            with gr.Row():
                                gr.Markdown(
                                    '<div align="center">'
                                    f'<div>{title}</div>\n'+
                                    f'<div>RVC {model_version} Model</div>\n'+
                                    (f'<div>Model author: {author}</div>' if author else "")+
                                    (f'<img style="width:auto;height:300px;" src="file/{cover}">' if cover else "")+
                                    '</div>'
                                )
                            with gr.Row():
                                with gr.Column():
                                    vc_audio_mode = gr.Dropdown(label="Input voice", choices=audio_mode, allow_custom_value=False, value="Upload audio")
                                    # Input
                                    vc_input = gr.Textbox(label="Input audio path", visible=False)
                                    # Upload
                                    vc_microphone_mode = gr.Checkbox(label="Use Microphone", value=False, visible=True, interactive=True)
                                    vc_upload = gr.Audio(label="Upload audio file", source="upload", visible=True, interactive=True)
                                    # Youtube
                                    vc_download_audio = gr.Dropdown(label="Provider", choices=["Youtube"], allow_custom_value=False, visible=False, value="Youtube", info="Select provider (Default: Youtube)")
                                    vc_link = gr.Textbox(label="Youtube URL", visible=False, info="Example: https://www.youtube.com/watch?v=Nc0sB1Bmf-A", placeholder="https://www.youtube.com/watch?v=...")
                                    vc_split_model = gr.Dropdown(label="Splitter Model", choices=["htdemucs", "mdx_extra_q"], allow_custom_value=False, visible=False, value="htdemucs", info="Select the splitter model (Default: htdemucs)")
                                    vc_split = gr.Button("Split Audio", variant="primary", visible=False)
                                    vc_vocal_preview = gr.Audio(label="Vocal Preview", visible=False)
                                    vc_inst_preview = gr.Audio(label="Instrumental Preview", visible=False)
                                    vc_audio_preview = gr.Audio(label="Audio Preview", visible=False)
                                    # TTS
                                    tts_text = gr.Textbox(visible=False, label="TTS text", info="Text to speech input")
                                    tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female")
                                with gr.Column():
                                    vc_transform0 = gr.Number(label="Transpose", value=0, info='Type "12" to change from male to female voice. Type "-12" to change female to male voice')
                                    f0method0 = gr.Radio(
                                        label="Pitch extraction algorithm",
                                        info=f0method_info,
                                        choices=f0method_mode,
                                        value="pm",
                                        interactive=True
                                    )
                                    index_rate1 = gr.Slider(
                                        minimum=0,
                                        maximum=1,
                                        label="Retrieval feature ratio",
                                        info="(Default: 0.7)",
                                        value=0.7,
                                        interactive=True,
                                    )
                                    filter_radius0 = gr.Slider(
                                        minimum=0,
                                        maximum=7,
                                        label="Apply Median Filtering",
                                        info="The value represents the filter radius and can reduce breathiness.",
                                        value=3,
                                        step=1,
                                        interactive=True,
                                    )
                                    resample_sr0 = gr.Slider(
                                        minimum=0,
                                        maximum=48000,
                                        label="Resample the output audio",
                                        info="Resample the output audio in post-processing to the final sample rate. Set to 0 for no resampling",
                                        value=0,
                                        step=1,
                                        interactive=True,
                                    )
                                    rms_mix_rate0 = gr.Slider(
                                        minimum=0,
                                        maximum=1,
                                        label="Volume Envelope",
                                        info="Use the volume envelope of the input to replace or mix with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is used",
                                        value=1,
                                        interactive=True,
                                    )
                                    protect0 = gr.Slider(
                                        minimum=0,
                                        maximum=0.5,
                                        label="Voice Protection",
                                        info="Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy",
                                        value=0.5,
                                        step=0.01,
                                        interactive=True,
                                    )
                                with gr.Column():
                                    vc_log = gr.Textbox(label="Output Information", interactive=False)
                                    vc_output = gr.Audio(label="Output Audio", interactive=False)
                                    vc_convert = gr.Button("Convert", variant="primary")
                                    vc_volume = gr.Slider(
                                        minimum=0,
                                        maximum=10,
                                        label="Vocal volume",
                                        value=4,
                                        interactive=True,
                                        step=1,
                                        info="Adjust vocal volume (Default: 4)",
                                        visible=False
                                    )
                                    vc_combined_output = gr.Audio(label="Output Combined Audio", visible=False)
                                    vc_combine = gr.Button("Combine", variant="primary", visible=False)
                            vc_convert.click(
                                fn=vc_fn,
                                inputs=[
                                    vc_audio_mode,
                                    vc_input,
                                    vc_upload,
                                    tts_text,
                                    tts_voice,
                                    vc_transform0,
                                    f0method0,
                                    index_rate1,
                                    filter_radius0,
                                    resample_sr0,
                                    rms_mix_rate0,
                                    protect0,
                                ],
                                outputs=[vc_log, vc_output]
                            )
                            vc_split.click(
                                fn=cut_vocal_and_inst,
                                inputs=[vc_link, vc_download_audio, vc_split_model],
                                outputs=[vc_vocal_preview, vc_inst_preview, vc_audio_preview, vc_input]
                            )
                            vc_combine.click(
                                fn=combine_vocal_and_inst,
                                inputs=[vc_output, vc_volume, vc_split_model],
                                outputs=[vc_combined_output]
                            )
                            vc_microphone_mode.change(
                                fn=use_microphone,
                                inputs=vc_microphone_mode,
                                outputs=vc_upload
                            )
                            vc_audio_mode.change(
                                fn=change_audio_mode,
                                inputs=[vc_audio_mode],
                                outputs=[
                                    vc_input,
                                    vc_microphone_mode,
                                    vc_upload,
                                    vc_download_audio,
                                    vc_link,
                                    vc_split_model,
                                    vc_split,
                                    vc_vocal_preview,
                                    vc_inst_preview,
                                    vc_audio_preview,
                                    vc_volume,
                                    vc_combined_output,
                                    vc_combine,
                                    tts_text,
                                    tts_voice
                                ]
                            )
    app.queue(concurrency_count=1, max_size=20, api_open=config.api).launch(share=config.colab)
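
For reference, a sketch of the weights/ layout that load_model() above reads. The directory, category, and file names here are made up for illustration; the JSON keys (enable, title, folder_path, description, model_path, author, cover, feature_retrieval_library) are the ones the loader actually accesses.

weights/
    folder_info.json
    example-category/
        model_info.json
        example-character/
            example-character.pth
            cover.png
            added_IVF256_Flat_nprobe_1_example_v2.index

weights/folder_info.json (hypothetical):
{
    "example-category": {
        "enable": true,
        "title": "Example Category",
        "folder_path": "example-category",
        "description": "Demo category"
    }
}

weights/example-category/model_info.json (hypothetical):
{
    "example-character": {
        "enable": true,
        "title": "Example Character",
        "author": "unknown",
        "model_path": "example-character.pth",
        "cover": "cover.png",
        "feature_retrieval_library": "added_IVF256_Flat_nprobe_1_example_v2.index"
    }
}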
config.py ADDED
@@ -0,0 +1,117 @@
import argparse
import sys
import torch
from multiprocessing import cpu_count

class Config:
    def __init__(self):
        self.device = "cuda:0"
        self.is_half = True
        self.n_cpu = 0
        self.gpu_name = None
        self.gpu_mem = None
        (
            self.python_cmd,
            self.listen_port,
            self.colab,
            self.noparallel,
            self.noautoopen,
            self.api
        ) = self.arg_parse()
        self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()

    @staticmethod
    def arg_parse() -> tuple:
        exe = sys.executable or "python"
        parser = argparse.ArgumentParser()
        parser.add_argument("--port", type=int, default=7865, help="Listen port")
        parser.add_argument("--pycmd", type=str, default=exe, help="Python command")
        parser.add_argument("--colab", action="store_true", help="Launch in colab")
        parser.add_argument(
            "--noparallel", action="store_true", help="Disable parallel processing"
        )
        parser.add_argument(
            "--noautoopen",
            action="store_true",
            help="Do not open in browser automatically",
        )
        parser.add_argument("--api", action="store_true", help="Launch with api")
        cmd_opts = parser.parse_args()

        cmd_opts.port = cmd_opts.port if 0 <= cmd_opts.port <= 65535 else 7865

        return (
            cmd_opts.pycmd,
            cmd_opts.port,
            cmd_opts.colab,
            cmd_opts.noparallel,
            cmd_opts.noautoopen,
            cmd_opts.api
        )

    # has_mps is only available in nightly pytorch (for now) and macOS 12.3+.
    # check `getattr` and try it for compatibility
    @staticmethod
    def has_mps() -> bool:
        if not torch.backends.mps.is_available():
            return False
        try:
            torch.zeros(1).to(torch.device("mps"))
            return True
        except Exception:
            return False

    def device_config(self) -> tuple:
        if torch.cuda.is_available():
            i_device = int(self.device.split(":")[-1])
            self.gpu_name = torch.cuda.get_device_name(i_device)
            if (
                ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
                or "P40" in self.gpu_name.upper()
                or "1060" in self.gpu_name
                or "1070" in self.gpu_name
                or "1080" in self.gpu_name
            ):
                print("Found GPU", self.gpu_name, ", force to fp32")
                self.is_half = False
            else:
                print("Found GPU", self.gpu_name)
            self.gpu_mem = int(
                torch.cuda.get_device_properties(i_device).total_memory
                / 1024
                / 1024
                / 1024
                + 0.4
            )
        elif self.has_mps():
            print("No supported Nvidia GPU found, use MPS instead")
            self.device = "mps"
            self.is_half = False
        else:
            print("No supported Nvidia GPU found, use CPU instead")
            self.device = "cpu"
            self.is_half = False

        if self.n_cpu == 0:
            self.n_cpu = cpu_count()

        if self.is_half:
            # settings for 6 GB of VRAM
            x_pad = 3
            x_query = 10
            x_center = 60
            x_max = 65
        else:
            # settings for 5 GB of VRAM
            x_pad = 1
            x_query = 6
            x_center = 38
            x_max = 41

        if self.gpu_mem != None and self.gpu_mem <= 4:
            x_pad = 1
            x_query = 5
            x_center = 30
            x_max = 32

        return x_pad, x_query, x_center, x_max
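
Config.arg_parse() above defines the full command-line surface. A usage sketch (flag names are taken verbatim from the parser; note that app.py currently only passes colab and api through to Gradio, while --port, --pycmd, --noparallel, and --noautoopen are parsed and stored on the Config object without further use in this commit):

python app.py                      # defaults: port 7865, fp16 when a supported CUDA GPU is found
python app.py --colab              # launches Gradio with share=True
python app.py --api --port 7860    # opens the Gradio API; the port is stored as Config.listen_port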
requirements.txt ADDED
@@ -0,0 +1,21 @@
wheel
setuptools
ffmpeg
numba==0.56.4
numpy==1.23.5
scipy==1.9.3
librosa==0.9.1
fairseq==0.12.2
faiss-cpu==1.7.3
gradio==3.36.1
pyworld==0.3.2
soundfile>=0.12.1
praat-parselmouth>=0.4.2
httpx==0.23.0
tensorboard
tensorboardX
torchcrepe
onnxruntime
demucs
edge-tts
yt_dlp
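
A minimal setup sketch, assuming the layout that app.py expects: besides these packages, load_hubert() looks for a fairseq checkpoint named hubert_base.pt in the working directory, cut_vocal_and_inst() and combine_vocal_and_inst() shell out to the demucs and ffmpeg executables, and load_model() needs a populated weights/ folder as sketched above.

pip install -r requirements.txt
# place hubert_base.pt and the weights/ folder next to app.py, make sure ffmpeg is on PATH, then:
python app.py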
vc_infer_pipeline.py ADDED
@@ -0,0 +1,443 @@
import numpy as np, parselmouth, torch, pdb, sys, os
from time import time as ttime
import torch.nn.functional as F
import scipy.signal as signal
import pyworld, os, traceback, faiss, librosa, torchcrepe
from scipy import signal
from functools import lru_cache

now_dir = os.getcwd()
sys.path.append(now_dir)

bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)

input_audio_path2wav = {}


@lru_cache
def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
    audio = input_audio_path2wav[input_audio_path]
    f0, t = pyworld.harvest(
        audio,
        fs=fs,
        f0_ceil=f0max,
        f0_floor=f0min,
        frame_period=frame_period,
    )
    f0 = pyworld.stonemask(audio, f0, t, fs)
    return f0


def change_rms(data1, sr1, data2, sr2, rate):  # data1 is the input audio, data2 is the output audio, rate is the weight given to data2
    # print(data1.max(), data2.max())
    rms1 = librosa.feature.rms(
        y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2
    )  # one RMS point every half second
    rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2)
    rms1 = torch.from_numpy(rms1)
    rms1 = F.interpolate(
        rms1.unsqueeze(0), size=data2.shape[0], mode="linear"
    ).squeeze()
    rms2 = torch.from_numpy(rms2)
    rms2 = F.interpolate(
        rms2.unsqueeze(0), size=data2.shape[0], mode="linear"
    ).squeeze()
    rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)
    data2 *= (
        torch.pow(rms1, torch.tensor(1 - rate))
        * torch.pow(rms2, torch.tensor(rate - 1))
    ).numpy()
    return data2


class VC(object):
    def __init__(self, tgt_sr, config):
        self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
            config.x_pad,
            config.x_query,
            config.x_center,
            config.x_max,
            config.is_half,
        )
        self.sr = 16000  # HuBERT input sampling rate
        self.window = 160  # samples per frame
        self.t_pad = self.sr * self.x_pad  # padding before and after each segment
        self.t_pad_tgt = tgt_sr * self.x_pad
        self.t_pad2 = self.t_pad * 2
        self.t_query = self.sr * self.x_query  # search window around each candidate cut point
        self.t_center = self.sr * self.x_center  # spacing of candidate cut points
        self.t_max = self.sr * self.x_max  # max duration processed without splitting
        self.device = config.device

    def get_f0(
        self,
        input_audio_path,
        x,
        p_len,
        f0_up_key,
        f0_method,
        filter_radius,
        inp_f0=None,
    ):
        global input_audio_path2wav
        time_step = self.window / self.sr * 1000
        f0_min = 50
        f0_max = 1100
        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
        if f0_method == "pm":
            f0 = (
                parselmouth.Sound(x, self.sr)
                .to_pitch_ac(
                    time_step=time_step / 1000,
                    voicing_threshold=0.6,
                    pitch_floor=f0_min,
                    pitch_ceiling=f0_max,
                )
                .selected_array["frequency"]
            )
            pad_size = (p_len - len(f0) + 1) // 2
            if pad_size > 0 or p_len - len(f0) - pad_size > 0:
                f0 = np.pad(
                    f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
                )
        elif f0_method == "harvest":
            input_audio_path2wav[input_audio_path] = x.astype(np.double)
            f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
            if filter_radius > 2:
                f0 = signal.medfilt(f0, 3)
        elif f0_method == "crepe":
            model = "full"
            # Pick a batch size that doesn't cause memory errors on your gpu
            batch_size = 512
            # Compute pitch using first gpu
            audio = torch.tensor(np.copy(x))[None].float()
            f0, pd = torchcrepe.predict(
                audio,
                self.sr,
                self.window,
                f0_min,
                f0_max,
                model,
                batch_size=batch_size,
                device=self.device,
                return_periodicity=True,
            )
            pd = torchcrepe.filter.median(pd, 3)
            f0 = torchcrepe.filter.mean(f0, 3)
            f0[pd < 0.1] = 0
            f0 = f0[0].cpu().numpy()
        elif f0_method == "rmvpe":
            if hasattr(self, "model_rmvpe") == False:
                from rmvpe import RMVPE

                print("loading rmvpe model")
                self.model_rmvpe = RMVPE(
                    "rmvpe.pt", is_half=self.is_half, device=self.device
                )
            f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
        f0 *= pow(2, f0_up_key / 12)
        # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
        tf0 = self.sr // self.window  # f0 points per second
        if inp_f0 is not None:
            delta_t = np.round(
                (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
            ).astype("int16")
            replace_f0 = np.interp(
                list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
            )
            shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
            f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
                :shape
            ]
        # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
        f0bak = f0.copy()
        f0_mel = 1127 * np.log(1 + f0 / 700)
        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
            f0_mel_max - f0_mel_min
        ) + 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > 255] = 255
        f0_coarse = np.rint(f0_mel).astype(int)
        return f0_coarse, f0bak  # 1-0

    def vc(
        self,
        model,
        net_g,
        sid,
        audio0,
        pitch,
        pitchf,
        times,
        index,
        big_npy,
        index_rate,
        version,
        protect,
    ):  # ,file_index,file_big_npy
        feats = torch.from_numpy(audio0)
        if self.is_half:
            feats = feats.half()
        else:
            feats = feats.float()
        if feats.dim() == 2:  # double channels
            feats = feats.mean(-1)
        assert feats.dim() == 1, feats.dim()
        feats = feats.view(1, -1)
        padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)

        inputs = {
            "source": feats.to(self.device),
            "padding_mask": padding_mask,
            "output_layer": 9 if version == "v1" else 12,
        }
        t0 = ttime()
        with torch.no_grad():
            logits = model.extract_features(**inputs)
            feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
        if protect < 0.5 and pitch != None and pitchf != None:
            feats0 = feats.clone()
        if (
            isinstance(index, type(None)) == False
            and isinstance(big_npy, type(None)) == False
            and index_rate != 0
        ):
            npy = feats[0].cpu().numpy()
            if self.is_half:
                npy = npy.astype("float32")

            # _, I = index.search(npy, 1)
            # npy = big_npy[I.squeeze()]

            score, ix = index.search(npy, k=8)
            weight = np.square(1 / score)
            weight /= weight.sum(axis=1, keepdims=True)
            npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)

            if self.is_half:
                npy = npy.astype("float16")
            feats = (
                torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
                + (1 - index_rate) * feats
            )

        feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
        if protect < 0.5 and pitch != None and pitchf != None:
            feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
                0, 2, 1
            )
        t1 = ttime()
        p_len = audio0.shape[0] // self.window
        if feats.shape[1] < p_len:
            p_len = feats.shape[1]
            if pitch != None and pitchf != None:
                pitch = pitch[:, :p_len]
                pitchf = pitchf[:, :p_len]

        if protect < 0.5 and pitch != None and pitchf != None:
            pitchff = pitchf.clone()
            pitchff[pitchf > 0] = 1
            pitchff[pitchf < 1] = protect
            pitchff = pitchff.unsqueeze(-1)
            feats = feats * pitchff + feats0 * (1 - pitchff)
            feats = feats.to(feats0.dtype)
        p_len = torch.tensor([p_len], device=self.device).long()
        with torch.no_grad():
            if pitch != None and pitchf != None:
                audio1 = (
                    (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
                    .data.cpu()
                    .float()
                    .numpy()
                )
            else:
                audio1 = (
                    (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()
                )
        del feats, p_len, padding_mask
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        t2 = ttime()
        times[0] += t1 - t0
        times[2] += t2 - t1
        return audio1

    def pipeline(
        self,
        model,
        net_g,
        sid,
        audio,
        input_audio_path,
        times,
        f0_up_key,
        f0_method,
        file_index,
        # file_big_npy,
        index_rate,
        if_f0,
        filter_radius,
        tgt_sr,
        resample_sr,
        rms_mix_rate,
        version,
        protect,
        f0_file=None,
    ):
        if (
            file_index != ""
            # and file_big_npy != ""
            # and os.path.exists(file_big_npy) == True
            and os.path.exists(file_index) == True
            and index_rate != 0
        ):
            try:
                index = faiss.read_index(file_index)
                # big_npy = np.load(file_big_npy)
                big_npy = index.reconstruct_n(0, index.ntotal)
            except:
                traceback.print_exc()
                index = big_npy = None
        else:
            index = big_npy = None
        audio = signal.filtfilt(bh, ah, audio)
        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
        opt_ts = []
        if audio_pad.shape[0] > self.t_max:
            audio_sum = np.zeros_like(audio)
            for i in range(self.window):
                audio_sum += audio_pad[i : i - self.window]
            for t in range(self.t_center, audio.shape[0], self.t_center):
                opt_ts.append(
                    t
                    - self.t_query
                    + np.where(
                        np.abs(audio_sum[t - self.t_query : t + self.t_query])
                        == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
                    )[0][0]
                )
        s = 0
        audio_opt = []
        t = None
        t1 = ttime()
        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
        p_len = audio_pad.shape[0] // self.window
        inp_f0 = None
        if hasattr(f0_file, "name") == True:
            try:
                with open(f0_file.name, "r") as f:
                    lines = f.read().strip("\n").split("\n")
                inp_f0 = []
                for line in lines:
                    inp_f0.append([float(i) for i in line.split(",")])
                inp_f0 = np.array(inp_f0, dtype="float32")
            except:
                traceback.print_exc()
        sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
        pitch, pitchf = None, None
        if if_f0 == 1:
            pitch, pitchf = self.get_f0(
                input_audio_path,
                audio_pad,
                p_len,
                f0_up_key,
                f0_method,
                filter_radius,
                inp_f0,
            )
            pitch = pitch[:p_len]
            pitchf = pitchf[:p_len]
            if self.device == "mps":
                pitchf = pitchf.astype(np.float32)
            pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
            pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
        t2 = ttime()
        times[1] += t2 - t1
        for t in opt_ts:
            t = t // self.window * self.window
            if if_f0 == 1:
                audio_opt.append(
                    self.vc(
                        model,
                        net_g,
                        sid,
                        audio_pad[s : t + self.t_pad2 + self.window],
                        pitch[:, s // self.window : (t + self.t_pad2) // self.window],
                        pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
                        times,
                        index,
                        big_npy,
                        index_rate,
                        version,
                        protect,
                    )[self.t_pad_tgt : -self.t_pad_tgt]
                )
            else:
                audio_opt.append(
                    self.vc(
                        model,
                        net_g,
                        sid,
                        audio_pad[s : t + self.t_pad2 + self.window],
                        None,
                        None,
                        times,
                        index,
                        big_npy,
                        index_rate,
                        version,
                        protect,
                    )[self.t_pad_tgt : -self.t_pad_tgt]
                )
            s = t
        if if_f0 == 1:
            audio_opt.append(
                self.vc(
                    model,
                    net_g,
                    sid,
                    audio_pad[t:],
                    pitch[:, t // self.window :] if t is not None else pitch,
                    pitchf[:, t // self.window :] if t is not None else pitchf,
                    times,
                    index,
                    big_npy,
                    index_rate,
                    version,
                    protect,
                )[self.t_pad_tgt : -self.t_pad_tgt]
            )
        else:
            audio_opt.append(
                self.vc(
                    model,
                    net_g,
                    sid,
                    audio_pad[t:],
                    None,
                    None,
                    times,
                    index,
                    big_npy,
                    index_rate,
                    version,
                    protect,
                )[self.t_pad_tgt : -self.t_pad_tgt]
            )
        audio_opt = np.concatenate(audio_opt)
        if rms_mix_rate != 1:
            audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
        if resample_sr >= 16000 and tgt_sr != resample_sr:
            audio_opt = librosa.resample(
                audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
            )
        audio_max = np.abs(audio_opt).max() / 0.99
        max_int16 = 32768
        if audio_max > 1:
            max_int16 /= audio_max
        audio_opt = (audio_opt * max_int16).astype(np.int16)
        del pitch, pitchf, sid
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return audio_opt
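
As a quick sanity check of the transpose handling in get_f0() above: f0_up_key is applied as a semitone shift, f0 *= 2 ** (f0_up_key / 12), so +12 doubles the pitch curve (one octave up) and -12 halves it. A minimal standalone example:

# Semitone-to-frequency-ratio mapping used by VC.get_f0()
for f0_up_key in (-12, 0, 7, 12):
    ratio = 2 ** (f0_up_key / 12)
    print(f"{f0_up_key:+d} semitones -> f0 scaled by {ratio:.3f}")
# -12 -> 0.500, 0 -> 1.000, +7 -> 1.498, +12 -> 2.000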