jhj0517 committed
Commit 602c60a
Parents: 47a36e3 633c360

Merge from master
app.py CHANGED

```diff
@@ -4,13 +4,15 @@ import gradio as gr
 import yaml
 
 from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, WHISPER_MODELS_DIR,
-                                 INSANELY_FAST_WHISPER_MODELS_DIR, NLLB_MODELS_DIR, DEFAULT_PARAMETERS_CONFIG_PATH)
+                                 INSANELY_FAST_WHISPER_MODELS_DIR, NLLB_MODELS_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
+                                 UVR_MODELS_DIR)
 from modules.utils.files_manager import load_yaml
 from modules.whisper.whisper_factory import WhisperFactory
 from modules.whisper.faster_whisper_inference import FasterWhisperInference
 from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
 from modules.translation.nllb_inference import NLLBInference
 from modules.ui.htmls import *
+from modules.utils.cli_manager import str2bool
 from modules.utils.youtube_manager import get_ytmetas
 from modules.translation.deepl_api import DeepLAPI
 from modules.whisper.whisper_parameter import *
@@ -25,10 +27,9 @@ class App:
             whisper_model_dir=self.args.whisper_model_dir,
             faster_whisper_model_dir=self.args.faster_whisper_model_dir,
             insanely_fast_whisper_model_dir=self.args.insanely_fast_whisper_model_dir,
+            uvr_model_dir=self.args.uvr_model_dir,
             output_dir=self.args.output_dir,
         )
-        print(f"Use \"{self.args.whisper_type}\" implementation")
-        print(f"Device \"{self.whisper_inf.device}\" is detected")
         self.nllb_inf = NLLBInference(
             model_dir=self.args.nllb_model_dir,
             output_dir=os.path.join(self.args.output_dir, "translations")
@@ -37,11 +38,14 @@ class App:
             output_dir=os.path.join(self.args.output_dir, "translations")
         )
         self.default_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
+        print(f"Use \"{self.args.whisper_type}\" implementation")
+        print(f"Device \"{self.whisper_inf.device}\" is detected")
 
     def create_whisper_parameters(self):
         whisper_params = self.default_params["whisper"]
         vad_params = self.default_params["vad"]
         diarization_params = self.default_params["diarization"]
+        uvr_params = self.default_params["bgm_separation"]
 
         with gr.Row():
             dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value=whisper_params["model_size"],
@@ -127,6 +131,16 @@ class App:
                                                                precision=0)
                     nb_batch_size = gr.Number(label="Batch Size", value=whisper_params["batch_size"], precision=0)
 
+                with gr.Accordion("BGM Separation", open=False):
+                    cb_bgm_separation = gr.Checkbox(label="Enable BGM Separation Filter", value=uvr_params["is_separate_bgm"],
+                                                    interactive=True)
+                    dd_uvr_device = gr.Dropdown(label="Device", value=self.whisper_inf.music_separator.device,
+                                                choices=self.whisper_inf.music_separator.available_devices)
+                    dd_uvr_model_size = gr.Dropdown(label="Model", value=uvr_params["model_size"],
+                                                    choices=self.whisper_inf.music_separator.available_models)
+                    nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], precision=0)
+                    cb_uvr_save_file = gr.Checkbox(label="Save separated files to output", value=uvr_params["save_file"])
+
                 with gr.Accordion("VAD", open=False):
                     cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=vad_params["vad_filter"],
                                                 interactive=True)
@@ -173,7 +187,9 @@ class App:
                 hallucination_silence_threshold=nb_hallucination_silence_threshold, hotwords=tb_hotwords,
                 language_detection_threshold=nb_language_detection_threshold,
                 language_detection_segments=nb_language_detection_segments,
-                prompt_reset_on_temperature=sld_prompt_reset_on_temperature
+                prompt_reset_on_temperature=sld_prompt_reset_on_temperature, is_bgm_separate=cb_bgm_separation,
+                uvr_device=dd_uvr_device, uvr_model_size=dd_uvr_model_size, uvr_segment_size=nb_uvr_segment_size,
+                uvr_save_file=cb_uvr_save_file
             ),
             dd_file_format,
             cb_timestamp
@@ -183,6 +199,7 @@ class App:
         translation_params = self.default_params["translation"]
         deepl_params = translation_params["deepl"]
         nllb_params = translation_params["nllb"]
+        uvr_params = self.default_params["bgm_separation"]
 
         with self.app:
             with gr.Row():
@@ -254,7 +271,7 @@ class App:
                     files_subtitles = gr.Files(label="Downloadable output file", scale=3)
                     btn_openfolder = gr.Button('📂', scale=1)
 
-                params = [mic_input, dd_file_format]
+                params = [mic_input, dd_file_format, cb_timestamp]
 
                 btn_run.click(fn=self.whisper_inf.transcribe_mic,
                               inputs=params + whisper_params.as_list(),
@@ -328,6 +345,39 @@ class App:
                                     inputs=None,
                                     outputs=None)
 
+            with gr.TabItem("BGM Separation"):
+                files_audio = gr.Files(type="filepath", label="Upload Audio Files to separate background music")
+                dd_uvr_device = gr.Dropdown(label="Device", value=self.whisper_inf.music_separator.device,
+                                            choices=self.whisper_inf.music_separator.available_devices)
+                dd_uvr_model_size = gr.Dropdown(label="Model", value=uvr_params["model_size"],
+                                                choices=self.whisper_inf.music_separator.available_models)
+                nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], precision=0)
+                cb_uvr_save_file = gr.Checkbox(label="Save separated files to output",
+                                               value=True, visible=False)
+                btn_run = gr.Button("SEPARATE BACKGROUND MUSIC", variant="primary")
+                with gr.Column():
+                    with gr.Row():
+                        ad_instrumental = gr.Audio(label="Instrumental", scale=8)
+                        btn_open_instrumental_folder = gr.Button('📂', scale=1)
+                    with gr.Row():
+                        ad_vocals = gr.Audio(label="Vocals", scale=8)
+                        btn_open_vocals_folder = gr.Button('📂', scale=1)
+
+                btn_run.click(fn=self.whisper_inf.music_separator.separate_files,
+                              inputs=[files_audio, dd_uvr_model_size, dd_uvr_device, nb_uvr_segment_size,
+                                      cb_uvr_save_file],
+                              outputs=[ad_instrumental, ad_vocals])
+                btn_open_instrumental_folder.click(inputs=None,
+                                                   outputs=None,
+                                                   fn=lambda: self.open_folder(os.path.join(
+                                                       self.args.output_dir, "UVR", "instrumental"
+                                                   )))
+                btn_open_vocals_folder.click(inputs=None,
+                                             outputs=None,
+                                             fn=lambda: self.open_folder(os.path.join(
+                                                 self.args.output_dir, "UVR", "vocals"
+                                             )))
+
         # Launch the app with optional gradio settings
         args = self.args
 
@@ -347,7 +397,8 @@ class App:
         if os.path.exists(folder_path):
             os.system(f"start {folder_path}")
         else:
-            print(f"The folder {folder_path} does not exist.")
+            os.makedirs(folder_path, exist_ok=True)
+            print(f"The directory path {folder_path} has newly created.")
 
     @staticmethod
     def on_change_models(model_size: str):
@@ -362,16 +413,16 @@ class App:
     parser = argparse.ArgumentParser()
     parser.add_argument('--whisper_type', type=str, default="faster-whisper",
                         help='A type of the whisper implementation between: ["whisper", "faster-whisper", "insanely-fast-whisper"]')
-    parser.add_argument('--share', type=bool, default=False, nargs='?', const=True, help='Gradio share value')
+    parser.add_argument('--share', type=str2bool, default=False, nargs='?', const=True, help='Gradio share value')
     parser.add_argument('--server_name', type=str, default=None, help='Gradio server host')
     parser.add_argument('--server_port', type=int, default=None, help='Gradio server port')
     parser.add_argument('--root_path', type=str, default=None, help='Gradio root path')
     parser.add_argument('--username', type=str, default=None, help='Gradio authentication username')
     parser.add_argument('--password', type=str, default=None, help='Gradio authentication password')
     parser.add_argument('--theme', type=str, default=None, help='Gradio Blocks theme')
-    parser.add_argument('--colab', type=bool, default=False, nargs='?', const=True, help='Is colab user or not')
-    parser.add_argument('--api_open', type=bool, default=False, nargs='?', const=True, help='Enable api or not in Gradio')
-    parser.add_argument('--inbrowser', type=bool, default=True, nargs='?', const=True, help='Whether to automatically start Gradio app or not')
+    parser.add_argument('--colab', type=str2bool, default=False, nargs='?', const=True, help='Is colab user or not')
+    parser.add_argument('--api_open', type=str2bool, default=False, nargs='?', const=True, help='Enable api or not in Gradio')
+    parser.add_argument('--inbrowser', type=str2bool, default=True, nargs='?', const=True, help='Whether to automatically start Gradio app or not')
     parser.add_argument('--whisper_model_dir', type=str, default=WHISPER_MODELS_DIR,
                         help='Directory path of the whisper model')
     parser.add_argument('--faster_whisper_model_dir', type=str, default=FASTER_WHISPER_MODELS_DIR,
@@ -383,6 +434,8 @@ parser.add_argument('--diarization_model_dir', type=str, default=DIARIZATION_MOD
                         help='Directory path of the diarization model')
     parser.add_argument('--nllb_model_dir', type=str, default=NLLB_MODELS_DIR,
                         help='Directory path of the Facebook NLLB model')
+    parser.add_argument('--uvr_model_dir', type=str, default=UVR_MODELS_DIR,
+                        help='Directory path of the UVR model')
     parser.add_argument('--output_dir', type=str, default=OUTPUT_DIR, help='Directory path of the outputs')
     _args = parser.parse_args()
```
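Note on the `params` change above: Gradio passes the `inputs` list to the callback positionally, so the appended `cb_timestamp` lines up with the new `add_timestamp` argument of `transcribe_mic` (see `modules/whisper/whisper_base.py` below). A minimal sketch of that positional mapping, with illustrative values standing in for the real component outputs:

```python
# Minimal sketch of Gradio's positional argument mapping (values are illustrative).
def transcribe_mic(mic_audio: str, file_format: str, add_timestamp: bool, *whisper_params):
    print(mic_audio, file_format, add_timestamp, len(whisper_params))

params = ["mic_recording.wav", "SRT", True]  # mic_input, dd_file_format, cb_timestamp
transcribe_mic(*params, *range(10))          # trailing values stand in for WhisperParameters.as_list()
```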
configs/default_parameters.yaml CHANGED

```diff
@@ -44,6 +44,12 @@ diarization:
   is_diarize: false
   hf_token: ""
 
+bgm_separation:
+  is_separate_bgm: false
+  model_size: "UVR-MDX-NET-Inst_HQ_4"
+  segment_size: 256
+  save_file: false
+
 translation:
   deepl:
     api_key: ""
```
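The new `bgm_separation` block is read back in `app.py` above via `self.default_params["bgm_separation"]`. A minimal sketch of that lookup, using the helpers this commit touches:

```python
# Minimal sketch: read the new defaults the same way App.create_whisper_parameters() does.
from modules.utils.files_manager import load_yaml
from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH

uvr_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)["bgm_separation"]
print(uvr_params["model_size"])    # "UVR-MDX-NET-Inst_HQ_4"
print(uvr_params["segment_size"])  # 256
```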
docker-compose.yaml CHANGED

```diff
@@ -1,5 +1,3 @@
-version: '3.8'
-
 services:
   app:
     build: .
```
modules/ui/htmls.py CHANGED

```diff
@@ -38,7 +38,7 @@ CSS = """
 """
 
 MARKDOWN = """
-### [Whisper Web-UI](https://github.com/jhj0517/Whsiper-WebUI)
+### [Whisper-WebUI](https://github.com/jhj0517/Whsiper-WebUI)
 """
 
 
```
modules/utils/cli_manager.py ADDED

```diff
@@ -0,0 +1,12 @@
+import argparse
+
+
+def str2bool(v):
+    if isinstance(v, bool):
+        return v
+    if v.lower() in ('yes', 'true', 't', 'y', '1'):
+        return True
+    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+        return False
+    else:
+        raise argparse.ArgumentTypeError('Boolean value expected.')
```
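This helper exists because `type=bool` in argparse treats any non-empty string as `True` (`bool("false") is True`), so flags like `--share false` could never actually be switched off. A quick sketch of the difference, using the same argument definition `app.py` now uses:

```python
# Sketch: str2bool vs. the old type=bool behavior.
import argparse
from modules.utils.cli_manager import str2bool

parser = argparse.ArgumentParser()
parser.add_argument('--share', type=str2bool, default=False, nargs='?', const=True)

print(parser.parse_args([]).share)                    # False (default)
print(parser.parse_args(['--share']).share)           # True  (bare flag -> const=True)
print(parser.parse_args(['--share', 'false']).share)  # False; with type=bool this was True
```

`app.py` above switches `--share`, `--colab`, `--api_open`, and `--inbrowser` over to this parser.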
modules/utils/files_manager.py CHANGED

```diff
@@ -29,7 +29,8 @@ def save_yaml(data: dict, path: str = DEFAULT_PARAMETERS_CONFIG_PATH):
 
 
 def get_media_files(folder_path, include_sub_directory=False):
-    video_extensions = ['*.mp4', '*.mkv', '*.flv', '*.avi', '*.mov', '*.wmv']
+    video_extensions = ['*.mp4', '*.mkv', '*.flv', '*.avi', '*.mov', '*.wmv', '*.webm', '*.m4v', '*.mpeg', '*.mpg',
+                        '*.3gp', '*.f4v', '*.ogv', '*.vob', '*.mts', '*.m2ts', '*.divx', '*.mxf', '*.rm', '*.rmvb']
     audio_extensions = ['*.mp3', '*.wav', '*.aac', '*.flac', '*.ogg', '*.m4a']
     media_extensions = video_extensions + audio_extensions
 
@@ -61,3 +62,8 @@ def format_gradio_files(files: list):
         gradio_files.append(NamedString(file))
     return gradio_files
 
+
+def is_video(file_path):
+    video_extensions = ['.mp4', '.mkv', '.avi', '.mov', '.flv', '.wmv', '.webm', '.m4v', '.mpeg', '.mpg', '.3gp']
+    extension = os.path.splitext(file_path)[1].lower()
+    return extension in video_extensions
```
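`is_video` is a plain extension check; `MusicSeparator.separate()` below uses it to decide between decoding a video with `load_audio` and probing an audio file with `torchaudio.info`. A quick sketch:

```python
# Sketch of the new helper: a case-insensitive extension check, no file I/O.
from modules.utils.files_manager import is_video

print(is_video("movie.MKV"))  # True (the extension is lower-cased before matching)
print(is_video("song.mp3"))   # False
```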
modules/utils/paths.py CHANGED

```diff
@@ -7,10 +7,14 @@ FASTER_WHISPER_MODELS_DIR = os.path.join(WHISPER_MODELS_DIR, "faster-whisper")
 INSANELY_FAST_WHISPER_MODELS_DIR = os.path.join(WHISPER_MODELS_DIR, "insanely-fast-whisper")
 NLLB_MODELS_DIR = os.path.join(MODELS_DIR, "NLLB")
 DIARIZATION_MODELS_DIR = os.path.join(MODELS_DIR, "Diarization")
+UVR_MODELS_DIR = os.path.join(MODELS_DIR, "UVR", "MDX_Net_Models")
 CONFIGS_DIR = os.path.join(WEBUI_DIR, "configs")
 DEFAULT_PARAMETERS_CONFIG_PATH = os.path.join(CONFIGS_DIR, "default_parameters.yaml")
 OUTPUT_DIR = os.path.join(WEBUI_DIR, "outputs")
 TRANSLATION_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "translations")
+UVR_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "UVR")
+UVR_INSTRUMENTAL_OUTPUT_DIR = os.path.join(UVR_OUTPUT_DIR, "instrumental")
+UVR_VOCALS_OUTPUT_DIR = os.path.join(UVR_OUTPUT_DIR, "vocals")
 
 for dir_path in [MODELS_DIR,
                  WHISPER_MODELS_DIR,
@@ -18,7 +22,10 @@ for dir_path in [MODELS_DIR,
                  INSANELY_FAST_WHISPER_MODELS_DIR,
                  NLLB_MODELS_DIR,
                  DIARIZATION_MODELS_DIR,
+                 UVR_MODELS_DIR,
                  CONFIGS_DIR,
                  OUTPUT_DIR,
-                 TRANSLATION_OUTPUT_DIR]:
+                 TRANSLATION_OUTPUT_DIR,
+                 UVR_INSTRUMENTAL_OUTPUT_DIR,
+                 UVR_VOCALS_OUTPUT_DIR]:
     os.makedirs(dir_path, exist_ok=True)
```
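Because the `os.makedirs` loop runs at module level, importing the module is enough to materialize the new UVR directories. A sketch:

```python
# Sketch: the directory tree is created as an import side effect.
import os
from modules.utils import paths

print(os.path.isdir(paths.UVR_MODELS_DIR))               # True
print(os.path.isdir(paths.UVR_INSTRUMENTAL_OUTPUT_DIR))  # True
print(os.path.isdir(paths.UVR_VOCALS_OUTPUT_DIR))        # True
```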
modules/uvr/music_separator.py ADDED

```diff
@@ -0,0 +1,183 @@
+from typing import Optional, Union, List, Dict
+import numpy as np
+import torchaudio
+import soundfile as sf
+import os
+import torch
+import gc
+import gradio as gr
+from datetime import datetime
+
+from uvr.models import MDX, Demucs, VrNetwork, MDXC
+from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH
+from modules.utils.files_manager import load_yaml, save_yaml, is_video
+from modules.diarize.audio_loader import load_audio
+
+class MusicSeparator:
+    def __init__(self,
+                 model_dir: Optional[str] = None,
+                 output_dir: Optional[str] = None):
+        self.model = None
+        self.device = self.get_device()
+        self.available_devices = ["cpu", "cuda"]
+        self.model_dir = model_dir
+        self.output_dir = output_dir
+        instrumental_output_dir = os.path.join(self.output_dir, "instrumental")
+        vocals_output_dir = os.path.join(self.output_dir, "vocals")
+        os.makedirs(instrumental_output_dir, exist_ok=True)
+        os.makedirs(vocals_output_dir, exist_ok=True)
+        self.audio_info = None
+        self.available_models = ["UVR-MDX-NET-Inst_HQ_4", "UVR-MDX-NET-Inst_3"]
+        self.default_model = self.available_models[0]
+        self.current_model_size = self.default_model
+        self.model_config = {
+            "segment": 256,
+            "split": True
+        }
+
+    def update_model(self,
+                     model_name: str = "UVR-MDX-NET-Inst_1",
+                     device: Optional[str] = None,
+                     segment_size: int = 256):
+        """
+        Update model with the given model name
+
+        Args:
+            model_name (str): Model name.
+            device (str): Device to use for the model.
+            segment_size (int): Segment size for the prediction.
+        """
+        if device is None:
+            device = self.device
+
+        self.device = device
+        self.model_config = {
+            "segment": segment_size,
+            "split": True
+        }
+        self.model = MDX(name=model_name,
+                         other_metadata=self.model_config,
+                         device=self.device,
+                         logger=None,
+                         model_dir=self.model_dir)
+
+    def separate(self,
+                 audio: Union[str, np.ndarray],
+                 model_name: str,
+                 device: Optional[str] = None,
+                 segment_size: int = 256,
+                 save_file: bool = False,
+                 progress: gr.Progress = gr.Progress()) -> tuple[np.ndarray, np.ndarray, List]:
+        """
+        Separate the background music from the audio.
+
+        Args:
+            audio (Union[str, np.ndarray]): Audio path or numpy array.
+            model_name (str): Model name.
+            device (str): Device to use for the model.
+            segment_size (int): Segment size for the prediction.
+            save_file (bool): Whether to save the separated audio to output path or not.
+            progress (gr.Progress): Gradio progress indicator.
+
+        Returns:
+            A Tuple of
+            np.ndarray: Instrumental numpy arrays.
+            np.ndarray: Vocals numpy arrays.
+            file_paths: List of file paths where the separated audio is saved. Return empty when save_file is False.
+        """
+        if isinstance(audio, str):
+            output_filename, ext = os.path.basename(audio), ".wav"
+            output_filename, orig_ext = os.path.splitext(output_filename)
+
+            if is_video(audio):
+                audio = load_audio(audio)
+                sample_rate = 16000
+            else:
+                self.audio_info = torchaudio.info(audio)
+                sample_rate = self.audio_info.sample_rate
+        else:
+            timestamp = datetime.now().strftime("%m%d%H%M%S")
+            output_filename, ext = f"UVR-{timestamp}", ".wav"
+            sample_rate = 16000
+
+        model_config = {
+            "segment": segment_size,
+            "split": True
+        }
+
+        if (self.model is None or
+                self.current_model_size != model_name or
+                self.model_config != model_config or
+                self.model.sample_rate != sample_rate or
+                self.device != device):
+            progress(0, desc="Initializing UVR Model..")
+            self.update_model(
+                model_name=model_name,
+                device=device,
+                segment_size=segment_size
+            )
+            self.model.sample_rate = sample_rate
+
+        progress(0, desc="Separating background music from the audio..")
+        result = self.model(audio)
+        instrumental, vocals = result["instrumental"].T, result["vocals"].T
+
+        file_paths = []
+        if save_file:
+            instrumental_output_path = os.path.join(self.output_dir, "instrumental", f"{output_filename}-instrumental{ext}")
+            vocals_output_path = os.path.join(self.output_dir, "vocals", f"{output_filename}-vocals{ext}")
+            sf.write(instrumental_output_path, instrumental, sample_rate, format="WAV")
+            sf.write(vocals_output_path, vocals, sample_rate, format="WAV")
+            file_paths += [instrumental_output_path, vocals_output_path]
+
+        return instrumental, vocals, file_paths
+
+    def separate_files(self,
+                       files: List,
+                       model_name: str,
+                       device: Optional[str] = None,
+                       segment_size: int = 256,
+                       save_file: bool = True,
+                       progress: gr.Progress = gr.Progress()) -> List[str]:
+        """Separate the background music from the audio files. Returns only last Instrumental and vocals file paths
+        to display into gr.Audio()"""
+        self.cache_parameters(model_size=model_name, segment_size=segment_size)
+
+        for file_path in files:
+            instrumental, vocals, file_paths = self.separate(
+                audio=file_path,
+                model_name=model_name,
+                device=device,
+                segment_size=segment_size,
+                save_file=save_file,
+                progress=progress
+            )
+        return file_paths
+
+    @staticmethod
+    def get_device():
+        """Get device for the model"""
+        return "cuda" if torch.cuda.is_available() else "cpu"
+
+    def offload(self):
+        """Offload the model and free up the memory"""
+        if self.model is not None:
+            del self.model
+            self.model = None
+        if self.device == "cuda":
+            torch.cuda.empty_cache()
+            gc.collect()
+        self.audio_info = None
+
+    @staticmethod
+    def cache_parameters(model_size: str,
+                         segment_size: int):
+        cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
+        cached_uvr_params = cached_params["bgm_separation"]
+        uvr_params_to_cache = {
+            "model_size": model_size,
+            "segment_size": segment_size
+        }
+        cached_uvr_params = {**cached_uvr_params, **uvr_params_to_cache}
+        cached_params["bgm_separation"] = cached_uvr_params
+        save_yaml(cached_params, DEFAULT_PARAMETERS_CONFIG_PATH)
```
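A minimal standalone usage sketch of the new class. The paths are illustrative; model weights are resolved by the `uvr` package from `model_dir`:

```python
# Sketch: separate one file and free the model afterwards (paths illustrative).
from modules.uvr.music_separator import MusicSeparator

separator = MusicSeparator(model_dir="models/UVR/MDX_Net_Models",
                           output_dir="outputs/UVR")
instrumental, vocals, file_paths = separator.separate(
    audio="podcast.mp3",                  # str path or np.ndarray
    model_name="UVR-MDX-NET-Inst_HQ_4",
    device=separator.device,              # "cuda" if available, else "cpu"
    segment_size=256,
    save_file=True,                       # writes *-instrumental.wav / *-vocals.wav
)
separator.offload()                       # release VRAM between runs
```

Note that `separate_files()` deliberately returns only the last pair of paths, as its docstring says; its output feeds a single pair of `gr.Audio` components in the new tab.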
modules/whisper/faster_whisper_inference.py CHANGED

```diff
@@ -11,7 +11,7 @@ import whisper
 import gradio as gr
 from argparse import Namespace
 
-from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR)
+from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
 from modules.whisper.whisper_parameter import *
 from modules.whisper.whisper_base import WhisperBase
 
@@ -20,11 +20,13 @@ class FasterWhisperInference(WhisperBase):
     def __init__(self,
                  model_dir: str = FASTER_WHISPER_MODELS_DIR,
                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
+                 uvr_model_dir: str = UVR_MODELS_DIR,
                  output_dir: str = OUTPUT_DIR,
                  ):
         super().__init__(
             model_dir=model_dir,
             diarization_model_dir=diarization_model_dir,
+            uvr_model_dir=uvr_model_dir,
             output_dir=output_dir
         )
         self.model_dir = model_dir
```
modules/whisper/insanely_fast_whisper_inference.py CHANGED

```diff
@@ -11,7 +11,7 @@ import whisper
 from rich.progress import Progress, TimeElapsedColumn, BarColumn, TextColumn
 from argparse import Namespace
 
-from modules.utils.paths import (INSANELY_FAST_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR)
+from modules.utils.paths import (INSANELY_FAST_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
 from modules.whisper.whisper_parameter import *
 from modules.whisper.whisper_base import WhisperBase
 
@@ -20,12 +20,14 @@ class InsanelyFastWhisperInference(WhisperBase):
     def __init__(self,
                  model_dir: str = INSANELY_FAST_WHISPER_MODELS_DIR,
                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
+                 uvr_model_dir: str = UVR_MODELS_DIR,
                  output_dir: str = OUTPUT_DIR,
                  ):
         super().__init__(
             model_dir=model_dir,
             output_dir=output_dir,
-            diarization_model_dir=diarization_model_dir
+            diarization_model_dir=diarization_model_dir,
+            uvr_model_dir=uvr_model_dir
         )
         self.model_dir = model_dir
         os.makedirs(self.model_dir, exist_ok=True)
```
modules/whisper/whisper_Inference.py CHANGED

```diff
@@ -7,7 +7,7 @@ import torch
 import os
 from argparse import Namespace
 
-from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR)
+from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, UVR_MODELS_DIR)
 from modules.whisper.whisper_base import WhisperBase
 from modules.whisper.whisper_parameter import *
 
@@ -16,12 +16,14 @@ class WhisperInference(WhisperBase):
     def __init__(self,
                  model_dir: str = WHISPER_MODELS_DIR,
                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
+                 uvr_model_dir: str = UVR_MODELS_DIR,
                  output_dir: str = OUTPUT_DIR,
                  ):
         super().__init__(
             model_dir=model_dir,
             output_dir=output_dir,
-            diarization_model_dir=diarization_model_dir
+            diarization_model_dir=diarization_model_dir,
+            uvr_model_dir=uvr_model_dir
         )
 
     def transcribe(self,
```
modules/whisper/whisper_base.py CHANGED

```diff
@@ -2,6 +2,7 @@ import os
 import torch
 import whisper
 import gradio as gr
+import torchaudio
 from abc import ABC, abstractmethod
 from typing import BinaryIO, Union, Tuple, List
 import numpy as np
@@ -9,7 +10,9 @@ from datetime import datetime
 from faster_whisper.vad import VadOptions
 from dataclasses import astuple
 
-from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH)
+from modules.uvr.music_separator import MusicSeparator
+from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
+                                 UVR_MODELS_DIR)
 from modules.utils.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
 from modules.utils.youtube_manager import get_ytdata, get_ytaudio
 from modules.utils.files_manager import get_media_files, format_gradio_files, load_yaml, save_yaml
@@ -22,6 +25,7 @@ class WhisperBase(ABC):
     def __init__(self,
                  model_dir: str = WHISPER_MODELS_DIR,
                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
+                 uvr_model_dir: str = UVR_MODELS_DIR,
                  output_dir: str = OUTPUT_DIR,
                  ):
         self.model_dir = model_dir
@@ -32,6 +36,10 @@ class WhisperBase(ABC):
             model_dir=diarization_model_dir
         )
         self.vad = SileroVAD()
+        self.music_separator = MusicSeparator(
+            model_dir=uvr_model_dir,
+            output_dir=os.path.join(output_dir, "UVR")
+        )
 
         self.model = None
         self.current_model_size = None
@@ -102,7 +110,26 @@ class WhisperBase(ABC):
             language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
             params.lang = language_code_dict[params.lang]
 
-        speech_chunks = None
+        if params.is_bgm_separate:
+            music, audio, _ = self.music_separator.separate(
+                audio=audio,
+                model_name=params.uvr_model_size,
+                device=params.uvr_device,
+                segment_size=params.uvr_segment_size,
+                save_file=params.uvr_save_file,
+                progress=progress
+            )
+
+            if audio.ndim >= 2:
+                audio = audio.mean(axis=1)
+            if self.music_separator.audio_info is None:
+                origin_sample_rate = 16000
+            else:
+                origin_sample_rate = self.music_separator.audio_info.sample_rate
+            audio = self.resample_audio(audio=audio, original_sample_rate=origin_sample_rate)
+
+            self.music_separator.offload()
+
         if params.vad_filter:
             # Explicit value set for float('inf') from gr.Number()
             if params.max_speech_duration_s >= 9999:
@@ -224,6 +251,7 @@ class WhisperBase(ABC):
     def transcribe_mic(self,
                        mic_audio: str,
                        file_format: str,
+                       add_timestamp: bool,
                        progress=gr.Progress(),
                        *whisper_params,
                        ) -> list:
@@ -236,6 +264,8 @@ class WhisperBase(ABC):
             Audio file path from gr.Microphone()
         file_format: str
             Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
+        add_timestamp: bool
+            Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
         *whisper_params: tuple
@@ -253,6 +283,7 @@ class WhisperBase(ABC):
         transcribed_segments, time_for_task = self.run(
             mic_audio,
             progress,
+            add_timestamp,
             *whisper_params,
         )
         progress(1, desc="Completed!")
@@ -260,7 +291,7 @@ class WhisperBase(ABC):
         subtitle, result_file_path = self.generate_and_write_file(
             file_name="Mic",
             transcribed_segments=transcribed_segments,
-            add_timestamp=True,
+            add_timestamp=add_timestamp,
             file_format=file_format,
             output_dir=self.output_dir
         )
@@ -427,18 +458,40 @@ class WhisperBase(ABC):
         if torch.cuda.is_available():
             return "cuda"
         elif torch.backends.mps.is_available():
+            if not WhisperBase.is_sparse_api_supported():
+                # Device `SparseMPS` is not supported for now. See : https://github.com/pytorch/pytorch/issues/87886
+                return "cpu"
             return "mps"
         else:
             return "cpu"
 
+    @staticmethod
+    def is_sparse_api_supported():
+        if not torch.backends.mps.is_available():
+            return False
+
+        try:
+            device = torch.device("mps")
+            sparse_tensor = torch.sparse_coo_tensor(
+                indices=torch.tensor([[0, 1], [2, 3]]),
+                values=torch.tensor([1, 2]),
+                size=(4, 4),
+                device=device
+            )
+            return True
+        except RuntimeError:
+            return False
+
     @staticmethod
     def release_cuda_memory():
+        """Release memory"""
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
             torch.cuda.reset_max_memory_allocated()
 
     @staticmethod
     def remove_input_files(file_paths: List[str]):
+        """Remove gradio cached files"""
         if not file_paths:
             return
 
@@ -451,9 +504,25 @@ class WhisperBase(ABC):
                          whisper_params: WhisperValues,
                          add_timestamp: bool
                          ):
+        """cache parameters to the yaml file"""
         cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
         cached_whisper_param = whisper_params.to_yaml()
         cached_yaml = {**cached_params, **cached_whisper_param}
         cached_yaml["whisper"]["add_timestamp"] = add_timestamp
 
         save_yaml(cached_yaml, DEFAULT_PARAMETERS_CONFIG_PATH)
+
+    @staticmethod
+    def resample_audio(audio: Union[str, np.ndarray],
+                       new_sample_rate: int = 16000,
+                       original_sample_rate: Optional[int] = None,) -> np.ndarray:
+        """Resamples audio to 16k sample rate, standard on Whisper model"""
+        if isinstance(audio, str):
+            audio, original_sample_rate = torchaudio.load(audio)
+        else:
+            if original_sample_rate is None:
+                raise ValueError("original_sample_rate must be provided when audio is numpy array.")
+            audio = torch.from_numpy(audio)
+        resampler = torchaudio.transforms.Resample(orig_freq=original_sample_rate, new_freq=new_sample_rate)
+        resampled_audio = resampler(audio).numpy()
+        return resampled_audio
```
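When BGM separation is enabled, `run()` receives the vocals array at the source sample rate, so the new `resample_audio` helper brings it back to the 16 kHz Whisper expects. A sketch with a synthetic signal:

```python
# Sketch: resample a synthetic 44.1 kHz signal down to Whisper's 16 kHz.
import numpy as np
from modules.whisper.whisper_base import WhisperBase  # abstract, but the helper is a @staticmethod

audio_44k = np.random.randn(44100).astype(np.float32)  # 1 second at 44.1 kHz
audio_16k = WhisperBase.resample_audio(audio=audio_44k, original_sample_rate=44100)
print(audio_16k.shape)  # (16000,), i.e. 1 second at 16 kHz
```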
modules/whisper/whisper_factory.py CHANGED

```diff
@@ -2,7 +2,7 @@ from typing import Optional
 import os
 
 from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR,
-                                 INSANELY_FAST_WHISPER_MODELS_DIR, WHISPER_MODELS_DIR)
+                                 INSANELY_FAST_WHISPER_MODELS_DIR, WHISPER_MODELS_DIR, UVR_MODELS_DIR)
 from modules.whisper.faster_whisper_inference import FasterWhisperInference
 from modules.whisper.whisper_Inference import WhisperInference
 from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
@@ -17,6 +17,7 @@ class WhisperFactory:
         faster_whisper_model_dir: str = FASTER_WHISPER_MODELS_DIR,
         insanely_fast_whisper_model_dir: str = INSANELY_FAST_WHISPER_MODELS_DIR,
         diarization_model_dir: str = DIARIZATION_MODELS_DIR,
+        uvr_model_dir: str = UVR_MODELS_DIR,
         output_dir: str = OUTPUT_DIR,
     ) -> "WhisperBase":
         """
@@ -37,6 +38,8 @@ class WhisperFactory:
             Directory path for the Insanely Fast Whisper model.
         diarization_model_dir : str
             Directory path for the diarization model.
+        uvr_model_dir : str
+            Directory path for the UVR model.
         output_dir : str
             Directory path where output files will be saved.
 
@@ -61,23 +64,27 @@ class WhisperFactory:
             return FasterWhisperInference(
                 model_dir=faster_whisper_model_dir,
                 output_dir=output_dir,
-                diarization_model_dir=diarization_model_dir
+                diarization_model_dir=diarization_model_dir,
+                uvr_model_dir=uvr_model_dir
             )
         elif whisper_type in whisper_typos:
             return WhisperInference(
                 model_dir=whisper_model_dir,
                 output_dir=output_dir,
-                diarization_model_dir=diarization_model_dir
+                diarization_model_dir=diarization_model_dir,
+                uvr_model_dir=uvr_model_dir
             )
         elif whisper_type in insanely_fast_whisper_typos:
             return InsanelyFastWhisperInference(
                 model_dir=insanely_fast_whisper_model_dir,
                 output_dir=output_dir,
-                diarization_model_dir=diarization_model_dir
+                diarization_model_dir=diarization_model_dir,
+                uvr_model_dir=uvr_model_dir
            )
         else:
             return FasterWhisperInference(
                 model_dir=faster_whisper_model_dir,
                 output_dir=output_dir,
-                diarization_model_dir=diarization_model_dir
+                diarization_model_dir=diarization_model_dir,
+                uvr_model_dir=uvr_model_dir
             )
```
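The factory now threads `uvr_model_dir` through to every backend, so any `WhisperBase` subclass it builds carries a ready `music_separator`. A sketch (the directory is illustrative):

```python
# Sketch: every backend built by the factory now carries a MusicSeparator.
from modules.whisper.whisper_factory import WhisperFactory

whisper_inf = WhisperFactory.create_whisper_inference(
    whisper_type="faster-whisper",
    uvr_model_dir="models/UVR/MDX_Net_Models",  # illustrative path
)
print(type(whisper_inf).__name__)                    # FasterWhisperInference
print(whisper_inf.music_separator.available_models)  # ['UVR-MDX-NET-Inst_HQ_4', 'UVR-MDX-NET-Inst_3']
```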
modules/whisper/whisper_parameter.py CHANGED

```diff
@@ -47,6 +47,11 @@ class WhisperParameters:
     hotwords: gr.Textbox
     language_detection_threshold: gr.Number
     language_detection_segments: gr.Number
+    is_bgm_separate: gr.Checkbox
+    uvr_model_size: gr.Dropdown
+    uvr_device: gr.Dropdown
+    uvr_segment_size: gr.Number
+    uvr_save_file: gr.Checkbox
     """
     A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
     This data class is used to mitigate the key-value problem between Gradio components and function parameters.
@@ -148,61 +153,76 @@ class WhisperParameters:
         diarization_device: gr.Dropdown
             This parameter is related with whisperx. Device to run diarization model
 
-        length_penalty:
+        length_penalty: gr.Number
             This parameter is related to faster-whisper. Exponential length penalty constant.
 
-        repetition_penalty:
+        repetition_penalty: gr.Number
             This parameter is related to faster-whisper. Penalty applied to the score of previously generated tokens
             (set > 1 to penalize).
 
-        no_repeat_ngram_size:
+        no_repeat_ngram_size: gr.Number
             This parameter is related to faster-whisper. Prevent repetitions of n-grams with this size (set 0 to disable).
 
-        prefix:
+        prefix: gr.Textbox
             This parameter is related to faster-whisper. Optional text to provide as a prefix for the first window.
 
-        suppress_blank:
+        suppress_blank: gr.Checkbox
             This parameter is related to faster-whisper. Suppress blank outputs at the beginning of the sampling.
 
-        suppress_tokens:
+        suppress_tokens: gr.Textbox
             This parameter is related to faster-whisper. List of token IDs to suppress. -1 will suppress a default set
             of symbols as defined in the model config.json file.
 
-        max_initial_timestamp:
+        max_initial_timestamp: gr.Number
             This parameter is related to faster-whisper. The initial timestamp cannot be later than this.
 
-        word_timestamps:
+        word_timestamps: gr.Checkbox
             This parameter is related to faster-whisper. Extract word-level timestamps using the cross-attention pattern
             and dynamic time warping, and include the timestamps for each word in each segment.
 
-        prepend_punctuations:
+        prepend_punctuations: gr.Textbox
             This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
             with the next word.
 
-        append_punctuations:
+        append_punctuations: gr.Textbox
             This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
             with the previous word.
 
-        max_new_tokens:
+        max_new_tokens: gr.Number
             This parameter is related to faster-whisper. Maximum number of new tokens to generate per-chunk. If not set,
             the maximum will be set by the default max_length.
 
-        chunk_length:
+        chunk_length: gr.Number
             This parameter is related to faster-whisper. The length of audio segments. If it is not None, it will overwrite the
             default chunk_length of the FeatureExtractor.
 
-        hallucination_silence_threshold:
+        hallucination_silence_threshold: gr.Number
             This parameter is related to faster-whisper. When word_timestamps is True, skip silent periods longer than this threshold
             (in seconds) when a possible hallucination is detected.
 
-        hotwords:
+        hotwords: gr.Textbox
             This parameter is related to faster-whisper. Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.
 
-        language_detection_threshold:
+        language_detection_threshold: gr.Number
             This parameter is related to faster-whisper. If the maximum probability of the language tokens is higher than this value, the language is detected.
 
-        language_detection_segments:
+        language_detection_segments: gr.Number
             This parameter is related to faster-whisper. Number of segments to consider for the language detection.
+
+        is_separate_bgm: gr.Checkbox
+            This parameter is related to UVR. Boolean value that determines whether to separate bgm or not.
+
+        uvr_model_size: gr.Dropdown
+            This parameter is related to UVR. UVR model size.
+
+        uvr_device: gr.Dropdown
+            This parameter is related to UVR. Device to run UVR model.
+
+        uvr_segment_size: gr.Number
+            This parameter is related to UVR. Segment size for UVR model.
+
+        uvr_save_file: gr.Checkbox
+            This parameter is related to UVR. Boolean value that determines whether to save the file or not.
         """
 
     def as_list(self) -> list:
@@ -273,6 +293,11 @@ class WhisperValues:
     hotwords: Optional[str]
     language_detection_threshold: Optional[float]
     language_detection_segments: int
+    is_bgm_separate: bool
+    uvr_model_size: str
+    uvr_device: str
+    uvr_segment_size: int
+    uvr_save_file: bool
     """
     A data class to use Whisper parameters.
     """
@@ -323,6 +348,12 @@ class WhisperValues:
             "diarization": {
                 "is_diarize": self.is_diarize,
                 "hf_token": self.hf_token
-            }
+            },
+            "bgm_separation": {
+                "is_separate_bgm": self.is_bgm_separate,
+                "model_size": self.uvr_model_size,
+                "segment_size": self.uvr_segment_size,
+                "save_file": self.uvr_save_file,
+            },
         }
         return data
```
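Field order is load-bearing here: `whisper_base.py` imports `astuple`, and values coming back from Gradio are matched to fields by position, which is why the five UVR fields are appended in the same order to both classes. A sketch of that invariant, assuming both classes are dataclasses as their docstrings describe:

```python
# Sketch: the appended UVR fields must sit at the same positions in both dataclasses.
from dataclasses import fields
from modules.whisper.whisper_parameter import WhisperParameters, WhisperValues

new_fields = ["is_bgm_separate", "uvr_model_size", "uvr_device", "uvr_segment_size", "uvr_save_file"]
assert [f.name for f in fields(WhisperParameters)][-5:] == new_fields
assert [f.name for f in fields(WhisperValues)][-5:] == new_fields
```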
notebook/whisper-webui.ipynb CHANGED

```diff
@@ -58,7 +58,8 @@
     "# Temporal bug fix from https://github.com/jhj0517/Whisper-WebUI/issues/256\n",
     "!pip install git+https://github.com/JuanBindez/pytubefix.git\n",
     "!pip install tokenizers==0.19.1\n",
-    "!pip install pyannote.audio==3.3.1"
+    "!pip install pyannote.audio==3.3.1\n",
+    "!pip install git+https://github.com/jhj0517/ultimatevocalremover_api.git"
    ]
   },
   {
@@ -96,7 +97,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {
     "id": "PQroYRRZzQiN",
     "cellView": "form"
```
requirements.txt CHANGED

```diff
@@ -12,4 +12,6 @@ transformers==4.42.3
 gradio==4.43.0
 pytubefix
 ruamel.yaml==0.18.6
-pyannote.audio==3.3.1
+pyannote.audio==3.3.1
+git+https://github.com/jhj0517/ultimatevocalremover_api.git
+git+https://github.com/jhj0517/pyrubberband.git
```