jhj0517 committed
Commit 04fe334 • 2 Parent(s): 10a154c e3a6426

Merge pull request #134 from jhj0517/feature/more-parameters

app.py CHANGED
@@ -8,6 +8,8 @@ from modules.nllb_inference import NLLBInference
8
  from ui.htmls import *
9
  from modules.youtube_manager import get_ytmetas
10
  from modules.deepl_api import DeepLAPI
 
 
11
 
12
  class App:
13
  def __init__(self, args):
@@ -61,6 +63,8 @@ class App:
61
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
62
  nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
63
  dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
 
 
64
  with gr.Row():
65
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
66
  with gr.Row():
@@ -68,10 +72,18 @@ class App:
68
  files_subtitles = gr.Files(label="Downloadable output file", scale=4, interactive=False)
69
  btn_openfolder = gr.Button('📂', scale=1)
70
 
71
- params = [input_file, dd_model, dd_lang, dd_file_format, cb_translate, cb_timestamp]
72
- advanced_params = [nb_beam_size, nb_log_prob_threshold, nb_no_speech_threshold, dd_compute_type]
 
73
  btn_run.click(fn=self.whisper_inf.transcribe_file,
74
- inputs=params + advanced_params,
75
  outputs=[tb_indicator, files_subtitles])
76
  btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
77
  dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
@@ -101,6 +113,8 @@ class App:
101
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
102
  nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
103
  dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
 
 
104
  with gr.Row():
105
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
106
  with gr.Row():
@@ -108,10 +122,18 @@ class App:
108
  files_subtitles = gr.Files(label="Downloadable output file", scale=4)
109
  btn_openfolder = gr.Button('📂', scale=1)
110
 
111
- params = [tb_youtubelink, dd_model, dd_lang, dd_file_format, cb_translate, cb_timestamp]
112
- advanced_params = [nb_beam_size, nb_log_prob_threshold, nb_no_speech_threshold, dd_compute_type]
 
113
  btn_run.click(fn=self.whisper_inf.transcribe_youtube,
114
- inputs=params + advanced_params,
115
  outputs=[tb_indicator, files_subtitles])
116
  tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink],
117
  outputs=[img_thumbnail, tb_title, tb_description])
@@ -134,6 +156,8 @@ class App:
134
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
135
  nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
136
  dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
 
 
137
  with gr.Row():
138
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
139
  with gr.Row():
@@ -141,10 +165,18 @@ class App:
141
  files_subtitles = gr.Files(label="Downloadable output file", scale=4)
142
  btn_openfolder = gr.Button('📂', scale=1)
143
 
144
- params = [mic_input, dd_model, dd_lang, dd_file_format, cb_translate]
145
- advanced_params = [nb_beam_size, nb_log_prob_threshold, nb_no_speech_threshold, dd_compute_type]
 
146
  btn_run.click(fn=self.whisper_inf.transcribe_mic,
147
- inputs=params + advanced_params,
148
  outputs=[tb_indicator, files_subtitles])
149
  btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
150
  dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
 
8
  from ui.htmls import *
9
  from modules.youtube_manager import get_ytmetas
10
  from modules.deepl_api import DeepLAPI
11
+ from modules.whisper_data_class import *
12
+
13
 
14
  class App:
15
  def __init__(self, args):
 
63
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
64
  nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
65
  dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
66
+ nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
67
+ nb_patience = gr.Number(label="Patience", value=1, interactive=True)
68
  with gr.Row():
69
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
70
  with gr.Row():
 
72
  files_subtitles = gr.Files(label="Downloadable output file", scale=4, interactive=False)
73
  btn_openfolder = gr.Button('📂', scale=1)
74
 
75
+ params = [input_file, dd_file_format, cb_timestamp]
76
+ whisper_params = WhisperGradioComponents(model_size=dd_model,
77
+ lang=dd_lang,
78
+ is_translate=cb_translate,
79
+ beam_size=nb_beam_size,
80
+ log_prob_threshold=nb_log_prob_threshold,
81
+ no_speech_threshold=nb_no_speech_threshold,
82
+ compute_type=dd_compute_type,
83
+ best_of=nb_best_of,
84
+ patience=nb_patience)
85
  btn_run.click(fn=self.whisper_inf.transcribe_file,
86
+ inputs=params + whisper_params.to_list(),
87
  outputs=[tb_indicator, files_subtitles])
88
  btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
89
  dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
 
113
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
114
  nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
115
  dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
116
+ nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
117
+ nb_patience = gr.Number(label="Patience", value=1, interactive=True)
118
  with gr.Row():
119
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
120
  with gr.Row():
 
122
  files_subtitles = gr.Files(label="Downloadable output file", scale=4)
123
  btn_openfolder = gr.Button('📂', scale=1)
124
 
125
+ params = [tb_youtubelink, dd_file_format, cb_timestamp]
126
+ whisper_params = WhisperGradioComponents(model_size=dd_model,
127
+ lang=dd_lang,
128
+ is_translate=cb_translate,
129
+ beam_size=nb_beam_size,
130
+ log_prob_threshold=nb_log_prob_threshold,
131
+ no_speech_threshold=nb_no_speech_threshold,
132
+ compute_type=dd_compute_type,
133
+ best_of=nb_best_of,
134
+ patience=nb_patience)
135
  btn_run.click(fn=self.whisper_inf.transcribe_youtube,
136
+ inputs=params + whisper_params.to_list(),
137
  outputs=[tb_indicator, files_subtitles])
138
  tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink],
139
  outputs=[img_thumbnail, tb_title, tb_description])
 
156
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
157
  nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
158
  dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
159
+ nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
160
+ nb_patience = gr.Number(label="Patience", value=1, interactive=True)
161
  with gr.Row():
162
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
163
  with gr.Row():
 
165
  files_subtitles = gr.Files(label="Downloadable output file", scale=4)
166
  btn_openfolder = gr.Button('📂', scale=1)
167
 
168
+ params = [mic_input, dd_file_format]
169
+ whisper_params = WhisperGradioComponents(model_size=dd_model,
170
+ lang=dd_lang,
171
+ is_translate=cb_translate,
172
+ beam_size=nb_beam_size,
173
+ log_prob_threshold=nb_log_prob_threshold,
174
+ no_speech_threshold=nb_no_speech_threshold,
175
+ compute_type=dd_compute_type,
176
+ best_of=nb_best_of,
177
+ patience=nb_patience)
178
  btn_run.click(fn=self.whisper_inf.transcribe_mic,
179
+ inputs=params + whisper_params.to_list(),
180
  outputs=[tb_indicator, files_subtitles])
181
  btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
182
  dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
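In the app.py changes above, the flat `params + advanced_params` input lists are replaced by a short `params` list plus a `WhisperGradioComponents` instance whose `to_list()` output is appended to `inputs`. Gradio delivers the `inputs` values to the handler positionally and injects `gr.Progress()` on its own, so everything after the fixed arguments is collected into `*whisper_params`. A minimal, self-contained sketch of that pattern (the `echo_params` handler and the reduced component set are illustrative stand-ins, not part of this commit):

import gradio as gr

def echo_params(input_text, file_format, add_timestamp, progress=gr.Progress(), *whisper_params):
    # The first three values come from the fixed `params` list. gr.Progress() is
    # injected by Gradio and never appears in `inputs`. Every remaining input value
    # is collected positionally into *whisper_params, in the order produced by
    # WhisperGradioComponents.to_list().
    return f"{input_text}, {file_format}, {add_timestamp}, whisper_params={whisper_params}"

with gr.Blocks() as demo:
    tb_input = gr.Textbox(label="Input")
    dd_file_format = gr.Dropdown(label="File Format", choices=["SRT", "WebVTT", "txt"], value="SRT")
    cb_timestamp = gr.Checkbox(label="Add a timestamp to the end of filename", value=True)
    dd_model = gr.Dropdown(label="Model", choices=["base", "large-v3"], value="base")
    nb_beam_size = gr.Number(label="Beam Size", value=1)
    tb_indicator = gr.Textbox(label="Output")
    btn_run = gr.Button("RUN", variant="primary")
    # Same shape as the diff: fixed params first, Whisper-related components appended last.
    btn_run.click(fn=echo_params,
                  inputs=[tb_input, dd_file_format, cb_timestamp, dd_model, nb_beam_size],
                  outputs=[tb_indicator])

if __name__ == "__main__":
    demo.launch()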
modules/faster_whisper_inference.py CHANGED
@@ -1,10 +1,9 @@
1
  import os
2
 
3
- import tqdm
4
  import time
5
  import numpy as np
6
  from typing import BinaryIO, Union, Tuple, List
7
- from datetime import datetime, timedelta
8
 
9
  import faster_whisper
10
  import ctranslate2
@@ -15,6 +14,7 @@ import gradio as gr
15
  from .base_interface import BaseInterface
16
  from modules.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
17
  from modules.youtube_manager import get_ytdata, get_ytaudio
 
18
 
19
 
20
  class FasterWhisperInference(BaseInterface):
@@ -26,78 +26,51 @@ class FasterWhisperInference(BaseInterface):
26
  self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
27
  self.translatable_models = ["large", "large-v1", "large-v2", "large-v3"]
28
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
29
- self.available_compute_types = ctranslate2.get_supported_compute_types("cuda") if self.device == "cuda" else ctranslate2.get_supported_compute_types("cpu")
 
30
  self.current_compute_type = "float16" if self.device == "cuda" else "float32"
31
  self.default_beam_size = 1
32
 
33
  def transcribe_file(self,
34
- fileobjs: list,
35
- model_size: str,
36
- lang: str,
37
  file_format: str,
38
- istranslate: bool,
39
  add_timestamp: bool,
40
- beam_size: int,
41
- log_prob_threshold: float,
42
- no_speech_threshold: float,
43
- compute_type: str,
44
- progress=gr.Progress()
45
  ) -> list:
46
  """
47
  Write subtitle file from Files
48
 
49
  Parameters
50
  ----------
51
- fileobjs: list
52
  List of files to transcribe from gr.Files()
53
- model_size: str
54
- Whisper model size from gr.Dropdown()
55
- lang: str
56
- Source language of the file to transcribe from gr.Dropdown()
57
  file_format: str
58
- File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
59
- istranslate: bool
60
- Boolean value from gr.Checkbox() that determines whether to translate to English.
61
- It's Whisper's feature to translate speech from another language directly into English end-to-end.
62
  add_timestamp: bool
63
- Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
64
- beam_size: int
65
- Int value from gr.Number() that is used for decoding option.
66
- log_prob_threshold: float
67
- float value from gr.Number(). If the average log probability over sampled tokens is
68
- below this value, treat as failed.
69
- no_speech_threshold: float
70
- float value from gr.Number(). If the no_speech probability is higher than this value AND
71
- the average log probability over sampled tokens is below `log_prob_threshold`,
72
- consider the segment as silent.
73
- compute_type: str
74
- compute type from gr.Dropdown().
75
- see more info : https://opennmt.net/CTranslate2/quantization.html
76
  progress: gr.Progress
77
  Indicator to show progress directly in gradio.
 
 
78
 
79
  Returns
80
  ----------
81
- A List of
82
- String to return to gr.Textbox()
83
- Files to return to gr.Files()
 
84
  """
85
  try:
86
- self.update_model_if_needed(model_size=model_size, compute_type=compute_type, progress=progress)
87
-
88
  files_info = {}
89
- for fileobj in fileobjs:
90
  transcribed_segments, time_for_task = self.transcribe(
91
- audio=fileobj.name,
92
- lang=lang,
93
- istranslate=istranslate,
94
- beam_size=beam_size,
95
- log_prob_threshold=log_prob_threshold,
96
- no_speech_threshold=no_speech_threshold,
97
- progress=progress
98
  )
99
 
100
- file_name, file_ext = os.path.splitext(os.path.basename(fileobj.name))
101
  file_name = safe_filename(file_name)
102
  subtitle, file_path = self.generate_and_write_file(
103
  file_name=file_name,
@@ -105,7 +78,7 @@ class FasterWhisperInference(BaseInterface):
105
  add_timestamp=add_timestamp,
106
  file_format=file_format
107
  )
108
- files_info[file_name] = {"subtitle": subtitle, "time_for_task": time_for_task, "path": file_path}
109
 
110
  total_result = ''
111
  total_time = 0
@@ -115,106 +88,78 @@ class FasterWhisperInference(BaseInterface):
115
  total_result += f'{info["subtitle"]}'
116
  total_time += info["time_for_task"]
117
 
118
- gr_str = f"Done in {self.format_time(total_time)}! Subtitle is in the outputs folder.\n\n{total_result}"
119
- gr_file_path = [info['path'] for info in files_info.values()]
120
 
121
- return [gr_str, gr_file_path]
122
 
123
  except Exception as e:
124
- print(f"Error transcribing file on line {e}")
125
  finally:
126
  self.release_cuda_memory()
127
- if not fileobjs:
128
- self.remove_input_files([fileobj.name for fileobj in fileobjs])
129
 
130
  def transcribe_youtube(self,
131
- youtubelink: str,
132
- model_size: str,
133
- lang: str,
134
  file_format: str,
135
- istranslate: bool,
136
  add_timestamp: bool,
137
- beam_size: int,
138
- log_prob_threshold: float,
139
- no_speech_threshold: float,
140
- compute_type: str,
141
- progress=gr.Progress()
142
  ) -> list:
143
  """
144
  Write subtitle file from Youtube
145
 
146
  Parameters
147
  ----------
148
- youtubelink: str
149
- Link of Youtube to transcribe from gr.Textbox()
150
- model_size: str
151
- Whisper model size from gr.Dropdown()
152
- lang: str
153
- Source language of the file to transcribe from gr.Dropdown()
154
  file_format: str
155
- File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
156
- istranslate: bool
157
- Boolean value from gr.Checkbox() that determines whether to translate to English.
158
- It's Whisper's feature to translate speech from another language directly into English end-to-end.
159
  add_timestamp: bool
160
  Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
161
- beam_size: int
162
- Int value from gr.Number() that is used for decoding option.
163
- log_prob_threshold: float
164
- float value from gr.Number(). If the average log probability over sampled tokens is
165
- below this value, treat as failed.
166
- no_speech_threshold: float
167
- float value from gr.Number(). If the no_speech probability is higher than this value AND
168
- the average log probability over sampled tokens is below `log_prob_threshold`,
169
- consider the segment as silent.
170
- compute_type: str
171
- compute type from gr.Dropdown().
172
- see more info : https://opennmt.net/CTranslate2/quantization.html
173
  progress: gr.Progress
174
  Indicator to show progress directly in gradio.
 
 
175
 
176
  Returns
177
  ----------
178
- A List of
179
- String to return to gr.Textbox()
180
- Files to return to gr.Files()
 
181
  """
182
  try:
183
- self.update_model_if_needed(model_size=model_size, compute_type=compute_type, progress=progress)
184
-
185
  progress(0, desc="Loading Audio from Youtube..")
186
- yt = get_ytdata(youtubelink)
187
  audio = get_ytaudio(yt)
188
 
189
  transcribed_segments, time_for_task = self.transcribe(
190
- audio=audio,
191
- lang=lang,
192
- istranslate=istranslate,
193
- beam_size=beam_size,
194
- log_prob_threshold=log_prob_threshold,
195
- no_speech_threshold=no_speech_threshold,
196
- progress=progress
197
  )
198
 
199
  progress(1, desc="Completed!")
200
 
201
  file_name = safe_filename(yt.title)
202
- subtitle, file_path = self.generate_and_write_file(
203
  file_name=file_name,
204
  transcribed_segments=transcribed_segments,
205
  add_timestamp=add_timestamp,
206
  file_format=file_format
207
  )
208
- gr_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
209
 
210
- return [gr_str, file_path]
211
 
212
  except Exception as e:
213
- print(f"Error transcribing file on line {e}")
214
  finally:
215
  try:
216
  if 'yt' not in locals():
217
- yt = get_ytdata(youtubelink)
218
  file_path = get_ytaudio(yt)
219
  else:
220
  file_path = get_ytaudio(yt)
@@ -225,93 +170,60 @@ class FasterWhisperInference(BaseInterface):
225
  pass
226
 
227
  def transcribe_mic(self,
228
- micaudio: str,
229
- model_size: str,
230
- lang: str,
231
  file_format: str,
232
- istranslate: bool,
233
- beam_size: int,
234
- log_prob_threshold: float,
235
- no_speech_threshold: float,
236
- compute_type: str,
237
- progress=gr.Progress()
238
  ) -> list:
239
  """
240
  Write subtitle file from microphone
241
 
242
  Parameters
243
  ----------
244
- micaudio: str
245
  Audio file path from gr.Microphone()
246
- model_size: str
247
- Whisper model size from gr.Dropdown()
248
- lang: str
249
- Source language of the file to transcribe from gr.Dropdown()
250
  file_format: str
251
- File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
252
- istranslate: bool
253
- Boolean value from gr.Checkbox() that determines whether to translate to English.
254
- It's Whisper's feature to translate speech from another language directly into English end-to-end.
255
- beam_size: int
256
- Int value from gr.Number() that is used for decoding option.
257
- log_prob_threshold: float
258
- float value from gr.Number(). If the average log probability over sampled tokens is
259
- below this value, treat as failed.
260
- no_speech_threshold: float
261
- float value from gr.Number(). If the no_speech probability is higher than this value AND
262
- the average log probability over sampled tokens is below `log_prob_threshold`,
263
- compute_type: str
264
- compute type from gr.Dropdown().
265
- see more info : https://opennmt.net/CTranslate2/quantization.html
266
- consider the segment as silent.
267
  progress: gr.Progress
268
  Indicator to show progress directly in gradio.
 
 
269
 
270
  Returns
271
  ----------
272
- A List of
273
- String to return to gr.Textbox()
274
- Files to return to gr.Files()
 
275
  """
276
  try:
277
- self.update_model_if_needed(model_size=model_size, compute_type=compute_type, progress=progress)
278
-
279
  progress(0, desc="Loading Audio..")
280
-
281
  transcribed_segments, time_for_task = self.transcribe(
282
- audio=micaudio,
283
- lang=lang,
284
- istranslate=istranslate,
285
- beam_size=beam_size,
286
- log_prob_threshold=log_prob_threshold,
287
- no_speech_threshold=no_speech_threshold,
288
- progress=progress
289
  )
290
  progress(1, desc="Completed!")
291
 
292
- subtitle, file_path = self.generate_and_write_file(
293
  file_name="Mic",
294
  transcribed_segments=transcribed_segments,
295
  add_timestamp=True,
296
  file_format=file_format
297
  )
298
 
299
- gr_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
300
- return [gr_str, file_path]
301
  except Exception as e:
302
- print(f"Error transcribing file on line {e}")
303
  finally:
304
  self.release_cuda_memory()
305
- self.remove_input_files([micaudio])
306
 
307
  def transcribe(self,
308
  audio: Union[str, BinaryIO, np.ndarray],
309
- lang: str,
310
- istranslate: bool,
311
- beam_size: int,
312
- log_prob_threshold: float,
313
- no_speech_threshold: float,
314
- progress: gr.Progress
315
  ) -> Tuple[List[dict], float]:
316
  """
317
  transcribe method for faster-whisper.
@@ -320,22 +232,10 @@ class FasterWhisperInference(BaseInterface):
320
  ----------
321
  audio: Union[str, BinaryIO, np.ndarray]
322
  Audio path or file binary or Audio numpy array
323
- lang: str
324
- Source language of the file to transcribe from gr.Dropdown()
325
- istranslate: bool
326
- Boolean value from gr.Checkbox() that determines whether to translate to English.
327
- It's Whisper's feature to translate speech from another language directly into English end-to-end.
328
- beam_size: int
329
- Int value from gr.Number() that is used for decoding option.
330
- log_prob_threshold: float
331
- float value from gr.Number(). If the average log probability over sampled tokens is
332
- below this value, treat as failed.
333
- no_speech_threshold: float
334
- float value from gr.Number(). If the no_speech probability is higher than this value AND
335
- the average log probability over sampled tokens is below `log_prob_threshold`,
336
- consider the segment as silent.
337
  progress: gr.Progress
338
  Indicator to show progress directly in gradio.
 
 
339
 
340
  Returns
341
  ----------
@@ -346,18 +246,26 @@ class FasterWhisperInference(BaseInterface):
346
  """
347
  start_time = time.time()
348
 
349
- if lang == "Automatic Detection":
350
- lang = None
 
 
 
 
 
351
  else:
352
  language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
353
- lang = language_code_dict[lang]
 
354
  segments, info = self.model.transcribe(
355
  audio=audio,
356
- language=lang,
357
- task="translate" if istranslate and self.current_model_size in self.translatable_models else "transcribe",
358
- beam_size=beam_size,
359
- log_prob_threshold=log_prob_threshold,
360
- no_speech_threshold=no_speech_threshold,
 
 
361
  )
362
  progress(0, desc="Loading audio..")
363
 
@@ -373,24 +281,33 @@ class FasterWhisperInference(BaseInterface):
373
  elapsed_time = time.time() - start_time
374
  return segments_result, elapsed_time
375
 
376
- def update_model_if_needed(self,
377
- model_size: str,
378
- compute_type: str,
379
- progress: gr.Progress
380
- ):
381
  """
382
- Initialize model if it doesn't match with current model setting
 
 
 
 
 
 
 
 
 
 
383
  """
384
- if model_size != self.current_model_size or self.model is None or self.current_compute_type != compute_type:
385
- progress(0, desc="Initializing Model..")
386
- self.current_model_size = model_size
387
- self.current_compute_type = compute_type
388
- self.model = faster_whisper.WhisperModel(
389
- device=self.device,
390
- model_size_or_path=model_size,
391
- download_root=os.path.join("models", "Whisper", "faster-whisper"),
392
- compute_type=self.current_compute_type
393
- )
394
 
395
  @staticmethod
396
  def generate_and_write_file(file_name: str,
@@ -399,7 +316,25 @@ class FasterWhisperInference(BaseInterface):
399
  file_format: str,
400
  ) -> str:
401
  """
402
- This method writes subtitle file and returns str to gr.Textbox
 
403
  """
404
  timestamp = datetime.now().strftime("%m%d%H%M%S")
405
  if add_timestamp:
@@ -425,6 +360,18 @@ class FasterWhisperInference(BaseInterface):
425
 
426
  @staticmethod
427
  def format_time(elapsed_time: float) -> str:
 
428
  hours, rem = divmod(elapsed_time, 3600)
429
  minutes, seconds = divmod(rem, 60)
430
 
 
1
  import os
2
 
 
3
  import time
4
  import numpy as np
5
  from typing import BinaryIO, Union, Tuple, List
6
+ from datetime import datetime
7
 
8
  import faster_whisper
9
  import ctranslate2
 
14
  from .base_interface import BaseInterface
15
  from modules.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
16
  from modules.youtube_manager import get_ytdata, get_ytaudio
17
+ from modules.whisper_data_class import *
18
 
19
 
20
  class FasterWhisperInference(BaseInterface):
 
26
  self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
27
  self.translatable_models = ["large", "large-v1", "large-v2", "large-v3"]
28
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
29
+ self.available_compute_types = ctranslate2.get_supported_compute_types(
30
+ "cuda") if self.device == "cuda" else ctranslate2.get_supported_compute_types("cpu")
31
  self.current_compute_type = "float16" if self.device == "cuda" else "float32"
32
  self.default_beam_size = 1
33
 
34
  def transcribe_file(self,
35
+ files: list,
 
 
36
  file_format: str,
 
37
  add_timestamp: bool,
38
+ progress=gr.Progress(),
39
+ *whisper_params,
 
 
 
40
  ) -> list:
41
  """
42
  Write subtitle file from Files
43
 
44
  Parameters
45
  ----------
46
+ files: list
47
  List of files to transcribe from gr.Files()
 
 
 
 
48
  file_format: str
49
+ Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
 
 
 
50
  add_timestamp: bool
51
+ Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the subtitle filename.
 
52
  progress: gr.Progress
53
  Indicator to show progress directly in gradio.
54
+ *whisper_params: tuple
55
+ Gradio components related to Whisper. see whisper_data_class.py for details.
56
 
57
  Returns
58
  ----------
59
+ result_str:
60
+ Result of transcription to return to gr.Textbox()
61
+ result_file_path:
62
+ Output file path to return to gr.Files()
63
  """
64
  try:
 
 
65
  files_info = {}
66
+ for file in files:
67
  transcribed_segments, time_for_task = self.transcribe(
68
+ file.name,
69
+ progress,
70
+ *whisper_params,
 
 
 
 
71
  )
72
 
73
+ file_name, file_ext = os.path.splitext(os.path.basename(file.name))
74
  file_name = safe_filename(file_name)
75
  subtitle, file_path = self.generate_and_write_file(
76
  file_name=file_name,
 
78
  add_timestamp=add_timestamp,
79
  file_format=file_format
80
  )
81
+ files_info[file_name] = {"subtitle": subtitle, "time_for_task": time_for_task, "path": file_path}
82
 
83
  total_result = ''
84
  total_time = 0
 
88
  total_result += f'{info["subtitle"]}'
89
  total_time += info["time_for_task"]
90
 
91
+ result_str = f"Done in {self.format_time(total_time)}! Subtitle is in the outputs folder.\n\n{total_result}"
92
+ result_file_path = [info['path'] for info in files_info.values()]
93
 
94
+ return [result_str, result_file_path]
95
 
96
  except Exception as e:
97
+ print(f"Error transcribing file: {e}")
98
  finally:
99
  self.release_cuda_memory()
100
+ if not files:
101
+ self.remove_input_files([file.name for file in files])
102
 
103
  def transcribe_youtube(self,
104
+ youtube_link: str,
 
 
105
  file_format: str,
 
106
  add_timestamp: bool,
107
+ progress=gr.Progress(),
108
+ *whisper_params,
 
 
 
109
  ) -> list:
110
  """
111
  Write subtitle file from Youtube
112
 
113
  Parameters
114
  ----------
115
+ youtube_link: str
116
+ URL of the Youtube video to transcribe from gr.Textbox()
 
 
 
 
117
  file_format: str
118
+ Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
 
 
 
119
  add_timestamp: bool
120
  Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
 
121
  progress: gr.Progress
122
  Indicator to show progress directly in gradio.
123
+ *whisper_params: tuple
124
+ Gradio components related to Whisper. see whisper_data_class.py for details.
125
 
126
  Returns
127
  ----------
128
+ result_str:
129
+ Result of transcription to return to gr.Textbox()
130
+ result_file_path:
131
+ Output file path to return to gr.Files()
132
  """
133
  try:
 
 
134
  progress(0, desc="Loading Audio from Youtube..")
135
+ yt = get_ytdata(youtube_link)
136
  audio = get_ytaudio(yt)
137
 
138
  transcribed_segments, time_for_task = self.transcribe(
139
+ audio,
140
+ progress,
141
+ *whisper_params,
 
 
 
 
142
  )
143
 
144
  progress(1, desc="Completed!")
145
 
146
  file_name = safe_filename(yt.title)
147
+ subtitle, result_file_path = self.generate_and_write_file(
148
  file_name=file_name,
149
  transcribed_segments=transcribed_segments,
150
  add_timestamp=add_timestamp,
151
  file_format=file_format
152
  )
153
+ result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
154
 
155
+ return [result_str, result_file_path]
156
 
157
  except Exception as e:
158
+ print(f"Error transcribing file: {e}")
159
  finally:
160
  try:
161
  if 'yt' not in locals():
162
+ yt = get_ytdata(youtube_link)
163
  file_path = get_ytaudio(yt)
164
  else:
165
  file_path = get_ytaudio(yt)
 
170
  pass
171
 
172
  def transcribe_mic(self,
173
+ mic_audio: str,
 
 
174
  file_format: str,
175
+ progress=gr.Progress(),
176
+ *whisper_params,
 
 
 
 
177
  ) -> list:
178
  """
179
  Write subtitle file from microphone
180
 
181
  Parameters
182
  ----------
183
+ mic_audio: str
184
  Audio file path from gr.Microphone()
 
 
 
 
185
  file_format: str
186
+ Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
 
187
  progress: gr.Progress
188
  Indicator to show progress directly in gradio.
189
+ *whisper_params: tuple
190
+ Gradio components related to Whisper. see whisper_data_class.py for details.
191
 
192
  Returns
193
  ----------
194
+ result_str:
195
+ Result of transcription to return to gr.Textbox()
196
+ result_file_path:
197
+ Output file path to return to gr.Files()
198
  """
199
  try:
 
 
200
  progress(0, desc="Loading Audio..")
 
201
  transcribed_segments, time_for_task = self.transcribe(
202
+ mic_audio,
203
+ progress,
204
+ *whisper_params,
 
 
 
 
205
  )
206
  progress(1, desc="Completed!")
207
 
208
+ subtitle, result_file_path = self.generate_and_write_file(
209
  file_name="Mic",
210
  transcribed_segments=transcribed_segments,
211
  add_timestamp=True,
212
  file_format=file_format
213
  )
214
 
215
+ result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
216
+ return [result_str, result_file_path]
217
  except Exception as e:
218
+ print(f"Error transcribing file: {e}")
219
  finally:
220
  self.release_cuda_memory()
221
+ self.remove_input_files([mic_audio])
222
 
223
  def transcribe(self,
224
  audio: Union[str, BinaryIO, np.ndarray],
225
+ progress: gr.Progress,
226
+ *whisper_params,
 
 
 
 
227
  ) -> Tuple[List[dict], float]:
228
  """
229
  transcribe method for faster-whisper.
 
232
  ----------
233
  audio: Union[str, BinaryIO, np.ndarray]
234
  Audio path or file binary or Audio numpy array
 
235
  progress: gr.Progress
236
  Indicator to show progress directly in gradio.
237
+ *whisper_params: tuple
238
+ Gradio components related to Whisper. see whisper_data_class.py for details.
239
 
240
  Returns
241
  ----------
 
246
  """
247
  start_time = time.time()
248
 
249
+ params = WhisperGradioComponents.to_values(*whisper_params)
250
+
251
+ if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
252
+ self.update_model(params.model_size, params.compute_type, progress)
253
+
254
+ if params.lang == "Automatic Detection":
255
+ params.lang = None
256
  else:
257
  language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
258
+ params.lang = language_code_dict[params.lang]
259
+
260
  segments, info = self.model.transcribe(
261
  audio=audio,
262
+ language=params.lang,
263
+ task="translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
264
+ beam_size=params.beam_size,
265
+ log_prob_threshold=params.log_prob_threshold,
266
+ no_speech_threshold=params.no_speech_threshold,
267
+ best_of=params.best_of,
268
+ patience=params.patience
269
  )
270
  progress(0, desc="Loading audio..")
271
 
 
281
  elapsed_time = time.time() - start_time
282
  return segments_result, elapsed_time
283
 
284
+ def update_model(self,
285
+ model_size: str,
286
+ compute_type: str,
287
+ progress: gr.Progress
288
+ ):
289
  """
290
+ Update current model setting
291
+
292
+ Parameters
293
+ ----------
294
+ model_size: str
295
+ Size of whisper model
296
+ compute_type: str
297
+ Compute type for transcription.
298
+ see more info : https://opennmt.net/CTranslate2/quantization.html
299
+ progress: gr.Progress
300
+ Indicator to show progress directly in gradio.
301
  """
302
+ progress(0, desc="Initializing Model..")
303
+ self.current_model_size = model_size
304
+ self.current_compute_type = compute_type
305
+ self.model = faster_whisper.WhisperModel(
306
+ device=self.device,
307
+ model_size_or_path=model_size,
308
+ download_root=os.path.join("models", "Whisper", "faster-whisper"),
309
+ compute_type=self.current_compute_type
310
+ )
 
311
 
312
  @staticmethod
313
  def generate_and_write_file(file_name: str,
 
316
  file_format: str,
317
  ) -> str:
318
  """
319
+ Writes subtitle file
320
+
321
+ Parameters
322
+ ----------
323
+ file_name: str
324
+ Output file name
325
+ transcribed_segments: list
326
+ Text segments transcribed from audio
327
+ add_timestamp: bool
328
+ Determines whether to add a timestamp to the end of the filename.
329
+ file_format: str
330
+ File format to write. Supported formats: [SRT, WebVTT, txt]
331
+
332
+ Returns
333
+ ----------
334
+ content: str
335
+ Result of the transcription
336
+ output_path: str
337
+ output file path
338
  """
339
  timestamp = datetime.now().strftime("%m%d%H%M%S")
340
  if add_timestamp:
 
360
 
361
  @staticmethod
362
  def format_time(elapsed_time: float) -> str:
363
+ """
364
+ Get {hours} {minutes} {seconds} time format string
365
+
366
+ Parameters
367
+ ----------
368
+ elapsed_time: float
369
+ Elapsed time for transcription
370
+
371
+ Returns
372
+ ----------
373
+ Time format string
374
+ """
375
  hours, rem = divmod(elapsed_time, 3600)
376
  minutes, seconds = divmod(rem, 60)
377
 
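On the backend side, `transcribe()` no longer receives each decoding option as a named parameter. It takes the raw `*whisper_params` tuple, converts it with `WhisperGradioComponents.to_values()` into a `WhisperValues` dataclass, reloads the model only when the requested size or compute type differs from the loaded one, and maps the language display name to its Whisper code. A short sketch of just the conversion and language-mapping step, outside the class (the sample tuple values are illustrative):

import whisper  # openai-whisper, used here only for its language table
from modules.whisper_data_class import WhisperGradioComponents

# After Gradio pre-processing, *whisper_params arrives as plain values in the
# field order declared on WhisperGradioComponents / WhisperValues.
whisper_params = ("large-v3", "english", False, 1, -1.0, 0.6, "float16", 5, 1)
params = WhisperGradioComponents.to_values(*whisper_params)

# "Automatic Detection" becomes None so Whisper detects the language itself;
# otherwise the display name is mapped back to its code, e.g. "english" -> "en".
if params.lang == "Automatic Detection":
    params.lang = None
else:
    language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
    params.lang = language_code_dict[params.lang]

print(params.model_size, params.lang, params.beam_size, params.best_of, params.patience)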
modules/whisper_Inference.py CHANGED
@@ -10,6 +10,7 @@ import torch
10
  from .base_interface import BaseInterface
11
  from modules.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
12
  from modules.youtube_manager import get_ytdata, get_ytaudio
 
13
 
14
  DEFAULT_MODEL_SIZE = "large-v3"
15
 
@@ -21,82 +22,54 @@ class WhisperInference(BaseInterface):
21
  self.model = None
22
  self.available_models = whisper.available_models()
23
  self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
 
24
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
25
  self.available_compute_types = ["float16", "float32"]
26
  self.current_compute_type = "float16" if self.device == "cuda" else "float32"
27
  self.default_beam_size = 1
28
 
29
  def transcribe_file(self,
30
- fileobjs: list,
31
- model_size: str,
32
- lang: str,
33
  file_format: str,
34
- istranslate: bool,
35
  add_timestamp: bool,
36
- beam_size: int,
37
- log_prob_threshold: float,
38
- no_speech_threshold: float,
39
- compute_type: str,
40
- progress=gr.Progress()) -> list:
41
  """
42
  Write subtitle file from Files
43
 
44
  Parameters
45
  ----------
46
- fileobjs: list
47
  List of files to transcribe from gr.Files()
48
- model_size: str
49
- Whisper model size from gr.Dropdown()
50
- lang: str
51
- Source language of the file to transcribe from gr.Dropdown()
52
  file_format: str
53
- File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
54
- istranslate: bool
55
- Boolean value from gr.Checkbox() that determines whether to translate to English.
56
- It's Whisper's feature to translate speech from another language directly into English end-to-end.
57
  add_timestamp: bool
58
- Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
59
- beam_size: int
60
- Int value from gr.Number() that is used for decoding option.
61
- log_prob_threshold: float
62
- float value from gr.Number(). If the average log probability over sampled tokens is
63
- below this value, treat as failed.
64
- no_speech_threshold: float
65
- float value from gr.Number(). If the no_speech probability is higher than this value AND
66
- the average log probability over sampled tokens is below `log_prob_threshold`,
67
- consider the segment as silent.
68
- compute_type: str
69
- compute type from gr.Dropdown().
70
  progress: gr.Progress
71
  Indicator to show progress directly in gradio.
72
- I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
 
73
 
74
  Returns
75
  ----------
76
- A List of
77
- String to return to gr.Textbox()
78
- Files to return to gr.Files()
 
79
  """
80
  try:
81
- self.update_model_if_needed(model_size=model_size, compute_type=compute_type, progress=progress)
82
-
83
  files_info = {}
84
- for fileobj in fileobjs:
85
  progress(0, desc="Loading Audio..")
86
- audio = whisper.load_audio(fileobj.name)
87
-
88
- result, elapsed_time = self.transcribe(audio=audio,
89
- lang=lang,
90
- istranslate=istranslate,
91
- beam_size=beam_size,
92
- log_prob_threshold=log_prob_threshold,
93
- no_speech_threshold=no_speech_threshold,
94
- compute_type=compute_type,
95
- progress=progress
96
- )
97
  progress(1, desc="Completed!")
98
 
99
- file_name, file_ext = os.path.splitext(os.path.basename(fileobj.name))
100
  file_name = safe_filename(file_name)
101
  subtitle, file_path = self.generate_and_write_file(
102
  file_name=file_name,
@@ -104,7 +77,7 @@ class WhisperInference(BaseInterface):
104
  add_timestamp=add_timestamp,
105
  file_format=file_format
106
  )
107
- files_info[file_name] = {"subtitle": subtitle, "elapsed_time": elapsed_time, "path": file_path}
108
 
109
  total_result = ''
110
  total_time = 0
@@ -114,100 +87,71 @@ class WhisperInference(BaseInterface):
114
  total_result += f"{info['subtitle']}"
115
  total_time += info["elapsed_time"]
116
 
117
- gr_str = f"Done in {self.format_time(total_time)}! Subtitle is in the outputs folder.\n\n{total_result}"
118
- gr_file_path = [info['path'] for info in files_info.values()]
119
 
120
- return [gr_str, gr_file_path]
121
  except Exception as e:
122
  print(f"Error transcribing file: {str(e)}")
123
  finally:
124
  self.release_cuda_memory()
125
- self.remove_input_files([fileobj.name for fileobj in fileobjs])
126
 
127
  def transcribe_youtube(self,
128
- youtubelink: str,
129
- model_size: str,
130
- lang: str,
131
  file_format: str,
132
- istranslate: bool,
133
  add_timestamp: bool,
134
- beam_size: int,
135
- log_prob_threshold: float,
136
- no_speech_threshold: float,
137
- compute_type: str,
138
- progress=gr.Progress()) -> list:
139
  """
140
  Write subtitle file from Youtube
141
 
142
  Parameters
143
  ----------
144
- youtubelink: str
145
- Link of Youtube to transcribe from gr.Textbox()
146
- model_size: str
147
- Whisper model size from gr.Dropdown()
148
- lang: str
149
- Source language of the file to transcribe from gr.Dropdown()
150
  file_format: str
151
- File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
152
- istranslate: bool
153
- Boolean value from gr.Checkbox() that determines whether to translate to English.
154
- It's Whisper's feature to translate speech from another language directly into English end-to-end.
155
  add_timestamp: bool
156
  Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
157
- beam_size: int
158
- Int value from gr.Number() that is used for decoding option.
159
- log_prob_threshold: float
160
- float value from gr.Number(). If the average log probability over sampled tokens is
161
- below this value, treat as failed.
162
- no_speech_threshold: float
163
- float value from gr.Number(). If the no_speech probability is higher than this value AND
164
- the average log probability over sampled tokens is below `log_prob_threshold`,
165
- consider the segment as silent.
166
- compute_type: str
167
- compute type from gr.Dropdown().
168
  progress: gr.Progress
169
  Indicator to show progress directly in gradio.
170
- I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
 
171
 
172
  Returns
173
  ----------
174
- A List of
175
- String to return to gr.Textbox()
176
- Files to return to gr.Files()
 
177
  """
178
  try:
179
- self.update_model_if_needed(model_size=model_size, compute_type=compute_type, progress=progress)
180
-
181
  progress(0, desc="Loading Audio from Youtube..")
182
- yt = get_ytdata(youtubelink)
183
  audio = whisper.load_audio(get_ytaudio(yt))
184
 
185
- result, elapsed_time = self.transcribe(audio=audio,
186
- lang=lang,
187
- istranslate=istranslate,
188
- beam_size=beam_size,
189
- log_prob_threshold=log_prob_threshold,
190
- no_speech_threshold=no_speech_threshold,
191
- compute_type=compute_type,
192
- progress=progress)
193
  progress(1, desc="Completed!")
194
 
195
  file_name = safe_filename(yt.title)
196
- subtitle, file_path = self.generate_and_write_file(
197
  file_name=file_name,
198
  transcribed_segments=result,
199
  add_timestamp=add_timestamp,
200
  file_format=file_format
201
  )
202
 
203
- gr_str = f"Done in {self.format_time(elapsed_time)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
204
- return [gr_str, file_path]
205
  except Exception as e:
206
  print(f"Error transcribing youtube video: {str(e)}")
207
  finally:
208
  try:
209
  if 'yt' not in locals():
210
- yt = get_ytdata(youtubelink)
211
  file_path = get_ytaudio(yt)
212
  else:
213
  file_path = get_ytaudio(yt)
@@ -218,116 +162,71 @@ class WhisperInference(BaseInterface):
218
  pass
219
 
220
  def transcribe_mic(self,
221
- micaudio: str,
222
- model_size: str,
223
- lang: str,
224
  file_format: str,
225
- istranslate: bool,
226
- beam_size: int,
227
- log_prob_threshold: float,
228
- no_speech_threshold: float,
229
- compute_type: str,
230
- progress=gr.Progress()) -> list:
231
  """
232
  Write subtitle file from microphone
233
 
234
  Parameters
235
  ----------
236
- micaudio: str
237
  Audio file path from gr.Microphone()
238
- model_size: str
239
- Whisper model size from gr.Dropdown()
240
- lang: str
241
- Source language of the file to transcribe from gr.Dropdown()
242
  file_format: str
243
- Subtitle format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
244
- istranslate: bool
245
- Boolean value from gr.Checkbox() that determines whether to translate to English.
246
- It's Whisper's feature to translate speech from another language directly into English end-to-end.
247
- beam_size: int
248
- Int value from gr.Number() that is used for decoding option.
249
- log_prob_threshold: float
250
- float value from gr.Number(). If the average log probability over sampled tokens is
251
- below this value, treat as failed.
252
- no_speech_threshold: float
253
- float value from gr.Number(). If the no_speech probability is higher than this value AND
254
- the average log probability over sampled tokens is below `log_prob_threshold`,
255
- consider the segment as silent.
256
- compute_type: str
257
- compute type from gr.Dropdown().
258
  progress: gr.Progress
259
  Indicator to show progress directly in gradio.
260
- I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
 
261
 
262
  Returns
263
  ----------
264
- A List of
265
- String to return to gr.Textbox()
266
- Files to return to gr.Files()
 
267
  """
268
  try:
269
- self.update_model_if_needed(model_size=model_size, compute_type=compute_type, progress=progress)
270
-
271
- result, elapsed_time = self.transcribe(audio=micaudio,
272
- lang=lang,
273
- istranslate=istranslate,
274
- beam_size=beam_size,
275
- log_prob_threshold=log_prob_threshold,
276
- no_speech_threshold=no_speech_threshold,
277
- compute_type=compute_type,
278
- progress=progress)
279
  progress(1, desc="Completed!")
280
 
281
- subtitle, file_path = self.generate_and_write_file(
282
  file_name="Mic",
283
  transcribed_segments=result,
284
  add_timestamp=True,
285
  file_format=file_format
286
  )
287
 
288
- gr_str = f"Done in {self.format_time(elapsed_time)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
289
- return [gr_str, file_path]
290
  except Exception as e:
291
  print(f"Error transcribing mic: {str(e)}")
292
  finally:
293
  self.release_cuda_memory()
294
- self.remove_input_files([micaudio])
295
 
296
  def transcribe(self,
297
  audio: Union[str, np.ndarray, torch.Tensor],
298
- lang: str,
299
- istranslate: bool,
300
- beam_size: int,
301
- log_prob_threshold: float,
302
- no_speech_threshold: float,
303
- compute_type: str,
304
- progress: gr.Progress
305
  ) -> Tuple[List[dict], float]:
306
  """
307
- transcribe method for OpenAI's Whisper implementation.
308
 
309
  Parameters
310
  ----------
311
- audio: Union[str, BinaryIO, torch.Tensor]
312
  Audio path or file binary or Audio numpy array
313
- lang: str
314
- Source language of the file to transcribe from gr.Dropdown()
315
- istranslate: bool
316
- Boolean value from gr.Checkbox() that determines whether to translate to English.
317
- It's Whisper's feature to translate speech from another language directly into English end-to-end.
318
- beam_size: int
319
- Int value from gr.Number() that is used for decoding option.
320
- log_prob_threshold: float
321
- float value from gr.Number(). If the average log probability over sampled tokens is
322
- below this value, treat as failed.
323
- no_speech_threshold: float
324
- float value from gr.Number(). If the no_speech probability is higher than this value AND
325
- the average log probability over sampled tokens is below `log_prob_threshold`,
326
- consider the segment as silent.
327
- compute_type: str
328
- compute type from gr.Dropdown().
329
  progress: gr.Progress
330
  Indicator to show progress directly in gradio.
 
 
331
 
332
  Returns
333
  ----------
@@ -337,45 +236,58 @@ class WhisperInference(BaseInterface):
337
  elapsed time for transcription
338
  """
339
  start_time = time.time()
 
340
 
341
  def progress_callback(progress_value):
342
  progress(progress_value, desc="Transcribing..")
343
 
344
- if lang == "Automatic Detection":
345
- lang = None
346
-
347
- translatable_model = ["large", "large-v1", "large-v2", "large-v3"]
348
  segments_result = self.model.transcribe(audio=audio,
349
- language=lang,
350
  verbose=False,
351
- beam_size=beam_size,
352
- logprob_threshold=log_prob_threshold,
353
- no_speech_threshold=no_speech_threshold,
354
- task="translate" if istranslate and self.current_model_size in translatable_model else "transcribe",
355
- fp16=True if compute_type == "float16" else False,
 
 
356
  progress_callback=progress_callback)["segments"]
357
  elapsed_time = time.time() - start_time
358
 
359
  return segments_result, elapsed_time
360
 
361
- def update_model_if_needed(self,
362
- model_size: str,
363
- compute_type: str,
364
- progress: gr.Progress,
365
- ):
366
  """
367
- Initialize model if it doesn't match with current model setting
 
 
 
 
 
 
 
 
 
 
368
  """
369
- if compute_type != self.current_compute_type:
370
- self.current_compute_type = compute_type
371
- if model_size != self.current_model_size or self.model is None:
372
- progress(0, desc="Initializing Model..")
373
- self.current_model_size = model_size
374
- self.model = whisper.load_model(
375
- name=model_size,
376
- device=self.device,
377
- download_root=os.path.join("models", "Whisper")
378
- )
379
 
380
  @staticmethod
381
  def generate_and_write_file(file_name: str,
@@ -384,7 +296,25 @@ class WhisperInference(BaseInterface):
384
  file_format: str,
385
  ) -> str:
386
  """
387
- This method writes subtitle file and returns str to gr.Textbox
 
388
  """
389
  timestamp = datetime.now().strftime("%m%d%H%M%S")
390
  if add_timestamp:
@@ -410,6 +340,18 @@ class WhisperInference(BaseInterface):
410
 
411
  @staticmethod
412
  def format_time(elapsed_time: float) -> str:
413
  hours, rem = divmod(elapsed_time, 3600)
414
  minutes, seconds = divmod(rem, 60)
415
 
 
10
  from .base_interface import BaseInterface
11
  from modules.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
12
  from modules.youtube_manager import get_ytdata, get_ytaudio
13
+ from modules.whisper_data_class import *
14
 
15
  DEFAULT_MODEL_SIZE = "large-v3"
16
 
 
22
  self.model = None
23
  self.available_models = whisper.available_models()
24
  self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
25
+ self.translatable_model = ["large", "large-v1", "large-v2", "large-v3"]
26
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
27
  self.available_compute_types = ["float16", "float32"]
28
  self.current_compute_type = "float16" if self.device == "cuda" else "float32"
29
  self.default_beam_size = 1
30
 
31
  def transcribe_file(self,
32
+ files: list,
 
 
33
  file_format: str,
 
34
  add_timestamp: bool,
35
+ progress=gr.Progress(),
36
+ *whisper_params
37
+ ) -> list:
 
 
38
  """
39
  Write subtitle file from Files
40
 
41
  Parameters
42
  ----------
43
+ files: list
44
  List of files to transcribe from gr.Files()
 
 
 
 
45
  file_format: str
46
+ Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
 
 
 
47
  add_timestamp: bool
48
+ Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the subtitle filename.
 
 
 
 
 
 
 
 
 
 
 
49
  progress: gr.Progress
50
  Indicator to show progress directly in gradio.
51
+ *whisper_params: tuple
52
+ Gradio components related to Whisper. see whisper_data_class.py for details.
53
 
54
  Returns
55
  ----------
56
+ result_str:
57
+ Result of transcription to return to gr.Textbox()
58
+ result_file_path:
59
+ Output file path to return to gr.Files()
60
  """
61
  try:
 
 
62
  files_info = {}
63
+ for file in files:
64
  progress(0, desc="Loading Audio..")
65
+ audio = whisper.load_audio(file.name)
66
+
67
+ result, elapsed_time = self.transcribe(audio,
68
+ progress,
69
+ *whisper_params)
 
 
 
 
 
 
70
  progress(1, desc="Completed!")
71
 
72
+ file_name, file_ext = os.path.splitext(os.path.basename(file.name))
73
  file_name = safe_filename(file_name)
74
  subtitle, file_path = self.generate_and_write_file(
75
  file_name=file_name,
 
77
  add_timestamp=add_timestamp,
78
  file_format=file_format
79
  )
80
+ files_info[file_name] = {"subtitle": subtitle, "elapsed_time": elapsed_time, "path": file_path}
81
 
82
  total_result = ''
83
  total_time = 0
 
87
  total_result += f"{info['subtitle']}"
88
  total_time += info["elapsed_time"]
89
 
90
+ result_str = f"Done in {self.format_time(total_time)}! Subtitle is in the outputs folder.\n\n{total_result}"
91
+ result_file_path = [info['path'] for info in files_info.values()]
92
 
93
+ return [result_str, result_file_path]
94
  except Exception as e:
95
  print(f"Error transcribing file: {str(e)}")
96
  finally:
97
  self.release_cuda_memory()
98
+ self.remove_input_files([file.name for file in files])
99
 
100
  def transcribe_youtube(self,
101
+ youtube_link: str,
 
 
102
  file_format: str,
 
103
  add_timestamp: bool,
104
+ progress=gr.Progress(),
105
+ *whisper_params) -> list:
 
 
 
106
  """
107
  Write subtitle file from Youtube
108
 
109
  Parameters
110
  ----------
111
+ youtube_link: str
112
+ URL of the Youtube video to transcribe from gr.Textbox()
 
 
 
 
113
  file_format: str
114
+ Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
 
 
 
115
  add_timestamp: bool
116
  Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
 
 
 
 
 
 
 
 
 
 
 
117
  progress: gr.Progress
118
  Indicator to show progress directly in gradio.
119
+ *whisper_params: tuple
120
+ Gradio components related to Whisper. see whisper_data_class.py for details.
121
 
122
  Returns
123
  ----------
124
+ result_str:
125
+ Result of transcription to return to gr.Textbox()
126
+ result_file_path:
127
+ Output file path to return to gr.Files()
128
  """
129
  try:
 
 
130
  progress(0, desc="Loading Audio from Youtube..")
131
+ yt = get_ytdata(youtube_link)
132
  audio = whisper.load_audio(get_ytaudio(yt))
133
 
134
+ result, elapsed_time = self.transcribe(audio,
135
+ progress,
136
+ *whisper_params)
 
 
 
 
 
137
  progress(1, desc="Completed!")
138
 
139
  file_name = safe_filename(yt.title)
140
+ subtitle, result_file_path = self.generate_and_write_file(
141
  file_name=file_name,
142
  transcribed_segments=result,
143
  add_timestamp=add_timestamp,
144
  file_format=file_format
145
  )
146
 
147
+ result_str = f"Done in {self.format_time(elapsed_time)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
148
+ return [result_str, result_file_path]
149
  except Exception as e:
150
  print(f"Error transcribing youtube video: {str(e)}")
151
  finally:
152
  try:
153
  if 'yt' not in locals():
154
+ yt = get_ytdata(youtube_link)
155
  file_path = get_ytaudio(yt)
156
  else:
157
  file_path = get_ytaudio(yt)
 
162
  pass
163
 
164
  def transcribe_mic(self,
165
+ mic_audio: str,
 
 
166
  file_format: str,
167
+ progress=gr.Progress(),
168
+ *whisper_params) -> list:
 
 
 
 
169
  """
170
  Write subtitle file from microphone
171
 
172
  Parameters
173
  ----------
174
+ mic_audio: str
175
  Audio file path from gr.Microphone()
 
 
 
 
176
  file_format: str
177
+ Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  progress: gr.Progress
179
  Indicator to show progress directly in gradio.
180
+ *whisper_params: tuple
181
+ Gradio components related to Whisper. see whisper_data_class.py for details.
182
 
183
  Returns
184
  ----------
185
+ result_str:
186
+ Result of transcription to return to gr.Textbox()
187
+ result_file_path:
188
+ Output file path to return to gr.Files()
189
  """
190
  try:
191
+ progress(0, desc="Loading Audio..")
192
+ result, elapsed_time = self.transcribe(
193
+ mic_audio,
194
+ progress,
195
+ *whisper_params,
196
+ )
 
 
 
 
197
  progress(1, desc="Completed!")
198
 
199
+ subtitle, result_file_path = self.generate_and_write_file(
200
  file_name="Mic",
201
  transcribed_segments=result,
202
  add_timestamp=True,
203
  file_format=file_format
204
  )
205
 
206
+ result_str = f"Done in {self.format_time(elapsed_time)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
207
+ return [result_str, result_file_path]
208
  except Exception as e:
209
  print(f"Error transcribing mic: {str(e)}")
210
  finally:
211
  self.release_cuda_memory()
212
+ self.remove_input_files([mic_audio])
213
 
214
  def transcribe(self,
215
  audio: Union[str, np.ndarray, torch.Tensor],
216
+ progress: gr.Progress,
217
+ *whisper_params,
 
 
 
 
 
218
  ) -> Tuple[List[dict], float]:
219
  """
220
+ transcribe method for OpenAI's Whisper implementation.
221
 
222
  Parameters
223
  ----------
224
+ audio: Union[str, np.ndarray, torch.Tensor]
225
  Audio path or file binary or Audio numpy array
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
  progress: gr.Progress
227
  Indicator to show progress directly in gradio.
228
+ *whisper_params: tuple
229
+ Gradio components related to Whisper. see whisper_data_class.py for details.
230
 
231
  Returns
232
  ----------
 
236
  elapsed time for transcription
237
  """
238
  start_time = time.time()
239
+ params = WhisperGradioComponents.to_values(*whisper_params)
240
+
241
+ if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
242
+ self.update_model(params.model_size, params.compute_type, progress)
243
+
244
+ if params.lang == "Automatic Detection":
245
+ params.lang = None
246
 
247
  def progress_callback(progress_value):
248
  progress(progress_value, desc="Transcribing..")
249
 
 
 
 
 
250
  segments_result = self.model.transcribe(audio=audio,
251
+ language=params.lang,
252
  verbose=False,
253
+ beam_size=params.beam_size,
254
+ logprob_threshold=params.log_prob_threshold,
255
+ no_speech_threshold=params.no_speech_threshold,
256
+ task="translate" if params.is_translate and self.current_model_size in self.translatable_model else "transcribe",
257
+ fp16=True if params.compute_type == "float16" else False,
258
+ best_of=params.best_of,
259
+ patience=params.patience,
260
  progress_callback=progress_callback)["segments"]
261
  elapsed_time = time.time() - start_time
262
 
263
  return segments_result, elapsed_time
264
 
265
+ def update_model(self,
266
+ model_size: str,
267
+ compute_type: str,
268
+ progress: gr.Progress,
269
+ ):
270
  """
271
+ Update current model setting
272
+
273
+ Parameters
274
+ ----------
275
+ model_size: str
276
+ Size of whisper model
277
+ compute_type: str
278
+ Compute type for transcription.
279
+ see more info : https://opennmt.net/CTranslate2/quantization.html
280
+ progress: gr.Progress
281
+ Indicator to show progress directly in gradio.
282
  """
283
+ progress(0, desc="Initializing Model..")
284
+ self.current_compute_type = compute_type
285
+ self.current_model_size = model_size
286
+ self.model = whisper.load_model(
287
+ name=model_size,
288
+ device=self.device,
289
+ download_root=os.path.join("models", "Whisper")
290
+ )
 
 
291
 
292
  @staticmethod
293
  def generate_and_write_file(file_name: str,
 
296
  file_format: str,
297
  ) -> str:
298
  """
299
+ Writes subtitle file
300
+
301
+ Parameters
302
+ ----------
303
+ file_name: str
304
+ Output file name
305
+ transcribed_segments: list
306
+ Text segments transcribed from audio
307
+ add_timestamp: bool
308
+ Determines whether to add a timestamp to the end of the filename.
309
+ file_format: str
310
+ File format to write. Supported formats: [SRT, WebVTT, txt]
311
+
312
+ Returns
313
+ ----------
314
+ content: str
315
+ Result of the transcription
316
+ output_path: str
317
+ output file path
318
  """
319
  timestamp = datetime.now().strftime("%m%d%H%M%S")
320
  if add_timestamp:
 
340
 
341
  @staticmethod
342
  def format_time(elapsed_time: float) -> str:
343
+ """
344
+ Get {hours} {minutes} {seconds} time format string
345
+
346
+ Parameters
347
+ ----------
348
+ elapsed_time: float
349
+ Elapsed time for transcription
350
+
351
+ Returns
352
+ ----------
353
+ Time format string
354
+ """
355
  hours, rem = divmod(elapsed_time, 3600)
356
  minutes, seconds = divmod(rem, 60)
357
 
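Both inference classes now share the same lazy-reload policy: `transcribe()` compares the requested model size and compute type with what is currently loaded and calls `update_model()` only when they differ (or when no model is loaded yet). A condensed, backend-agnostic sketch of that check, with a `load_fn` callback standing in for `faster_whisper.WhisperModel(...)` or `whisper.load_model(...)`:

from typing import Callable, Optional

class LazyModelHolder:
    """Illustrative stand-in for the model-caching state kept on the inference classes."""

    def __init__(self, default_compute_type: str = "float32"):
        self.model = None
        self.current_model_size: Optional[str] = None
        self.current_compute_type = default_compute_type

    def ensure_model(self, model_size: str, compute_type: str, load_fn: Callable):
        # Same condition as the diff: reload only when the requested size or compute
        # type differs from the loaded one, or when nothing is loaded yet.
        if model_size != self.current_model_size or self.model is None \
                or self.current_compute_type != compute_type:
            self.current_model_size = model_size
            self.current_compute_type = compute_type
            self.model = load_fn(model_size, compute_type)
        return self.model

holder = LazyModelHolder()
holder.ensure_model("base", "float32", lambda size, ct: f"loaded {size}/{ct}")  # loads
holder.ensure_model("base", "float32", lambda size, ct: f"loaded {size}/{ct}")  # cached, no reload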
modules/whisper_data_class.py ADDED
@@ -0,0 +1,88 @@
1
+ from dataclasses import dataclass, fields
2
+ import gradio as gr
3
+
4
+
5
+ @dataclass
6
+ class WhisperGradioComponents:
7
+ model_size: gr.Dropdown
8
+ lang: gr.Dropdown
9
+ is_translate: gr.Checkbox
10
+ beam_size: gr.Number
11
+ log_prob_threshold: gr.Number
12
+ no_speech_threshold: gr.Number
13
+ compute_type: gr.Dropdown
14
+ best_of: gr.Number
15
+ patience: gr.Number
16
+ """
17
+ A data class to pass Gradio components to the function before Gradio pre-processing.
18
+ See this documentation for more information about Gradio pre-processing: https://www.gradio.app/docs/components
19
+
20
+ Attributes
21
+ ----------
22
+ model_size: gr.Dropdown
23
+ Whisper model size.
24
+ lang: gr.Dropdown
25
+ Source language of the file to transcribe.
26
+ is_translate: gr.Checkbox
27
+ Boolean value that determines whether to translate to English.
28
+ It's Whisper's feature to translate speech from another language directly into English end-to-end.
29
+ beam_size: gr.Number
30
+ Int value that is used for decoding option.
31
+ log_prob_threshold: gr.Number
32
+ If the average log probability over sampled tokens is below this value, treat as failed.
33
+ no_speech_threshold: gr.Number
34
+ If the no_speech probability is higher than this value AND
35
+ the average log probability over sampled tokens is below `log_prob_threshold`,
36
+ consider the segment as silent.
37
+ compute_type: gr.Dropdown
38
+ compute type for transcription.
39
+ see more info : https://opennmt.net/CTranslate2/quantization.html
40
+ best_of: gr.Number
41
+ Number of candidates when sampling with non-zero temperature.
42
+ patience: gr.Number
43
+ Beam search patience factor.
44
+ """
45
+
46
+ def to_list(self) -> list:
47
+ """
48
+ Converts the data class attributes into a list, to pass parameters to a function before Gradio pre-processing.
49
+
50
+ Returns
51
+ ----------
52
+ A list of Gradio components
53
+ """
54
+ return [getattr(self, f.name) for f in fields(self)]
55
+
56
+ @staticmethod
57
+ def to_values(*params):
58
+ """
59
+ Convert a tuple of parameters into a WhisperValues data class, to use parameters in a function after Gradio pre-processing.
60
+
61
+ Parameters
62
+ ----------
63
+ *params: tuple
64
+ This is provided as a tuple because Gradio does not support arbitrary **kwargs.
65
+ Reference : https://discuss.huggingface.co/t/passing-an-additional-argument-to-a-function/25140/2
66
+
67
+ Returns
68
+ ----------
69
+ A WhisperValues data class
70
+ """
71
+ return WhisperValues(*params)
72
+
73
+
74
+ @dataclass
75
+ class WhisperValues:
76
+ model_size: str
77
+ lang: str
78
+ is_translate: bool
79
+ beam_size: int
80
+ log_prob_threshold: float
81
+ no_speech_threshold: float
82
+ compute_type: str
83
+ best_of: int
84
+ patience: float
85
+ """
86
+ A data class to use Whisper parameters in the function after Gradio pre-processing.
87
+ See this documentation for more information about Gradio pre-processing: https://www.gradio.app/docs/components
88
+ """
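The two dataclasses are mirror images of each other: `WhisperGradioComponents` holds the Gradio components and `to_list()` hands them to an event listener in a fixed field order, while `to_values()` rebuilds that same order into a `WhisperValues` of plain Python values after Gradio pre-processing. A small round-trip check that can run without launching a UI (the component choices and sample values are arbitrary):

import gradio as gr
from modules.whisper_data_class import WhisperGradioComponents, WhisperValues

components = WhisperGradioComponents(
    model_size=gr.Dropdown(choices=["base", "large-v3"], value="large-v3"),
    lang=gr.Dropdown(choices=["Automatic Detection", "english"], value="english"),
    is_translate=gr.Checkbox(value=False),
    beam_size=gr.Number(value=1),
    log_prob_threshold=gr.Number(value=-1.0),
    no_speech_threshold=gr.Number(value=0.6),
    compute_type=gr.Dropdown(choices=["float16", "float32"], value="float16"),
    best_of=gr.Number(value=5),
    patience=gr.Number(value=1),
)

# to_list() preserves the declaration order, which is what makes passing the
# values back positionally as *whisper_params safe.
assert len(components.to_list()) == 9

# After Gradio pre-processing the handler receives plain values in that same order:
values = WhisperGradioComponents.to_values(
    "large-v3", "english", False, 1, -1.0, 0.6, "float16", 5, 1)
assert isinstance(values, WhisperValues)
assert values.model_size == "large-v3" and values.best_of == 5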