Spaces:

jhj0517
/

Whisper-WebUI

Running

App Files Community

jhj0517 committed on Apr 8

Commit

b065a65

•

1 Parent(s): 1f8abba

refactoring to use data class

Browse files

Files changed (2) hide show

app.py +29 -9
modules/faster_whisper_inference.py +72 -170

app.py CHANGED Viewed

@@ -8,6 +8,8 @@ from modules.nllb_inference import NLLBInference
 from ui.htmls import *
 from modules.youtube_manager import get_ytmetas
 from modules.deepl_api import DeepLAPI
 class App:
     def __init__(self, args):
@@ -68,10 +70,16 @@ class App:
                         files_subtitles = gr.Files(label="Downloadable output file", scale=4, interactive=False)
                         btn_openfolder = gr.Button('📂', scale=1)
-                    params = [input_file, dd_model, dd_lang, dd_file_format, cb_translate, cb_timestamp]
-                    advanced_params = [nb_beam_size, nb_log_prob_threshold, nb_no_speech_threshold, dd_compute_type]
                     btn_run.click(fn=self.whisper_inf.transcribe_file,
-                                  inputs=params + advanced_params,
                                   outputs=[tb_indicator, files_subtitles])
                     btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
                     dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
@@ -108,10 +116,16 @@ class App:
                         files_subtitles = gr.Files(label="Downloadable output file", scale=4)
                         btn_openfolder = gr.Button('📂', scale=1)
-                    params = [tb_youtubelink, dd_model, dd_lang, dd_file_format, cb_translate, cb_timestamp]
-                    advanced_params = [nb_beam_size, nb_log_prob_threshold, nb_no_speech_threshold, dd_compute_type]
                     btn_run.click(fn=self.whisper_inf.transcribe_youtube,
-                                  inputs=params + advanced_params,
                                   outputs=[tb_indicator, files_subtitles])
                     tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink],
                                           outputs=[img_thumbnail, tb_title, tb_description])
@@ -141,10 +155,16 @@ class App:
                         files_subtitles = gr.Files(label="Downloadable output file", scale=4)
                         btn_openfolder = gr.Button('📂', scale=1)
-                    params = [mic_input, dd_model, dd_lang, dd_file_format, cb_translate]
-                    advanced_params = [nb_beam_size, nb_log_prob_threshold, nb_no_speech_threshold, dd_compute_type]
                     btn_run.click(fn=self.whisper_inf.transcribe_mic,
-                                  inputs=params + advanced_params,
                                   outputs=[tb_indicator, files_subtitles])
                     btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
                     dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])

 from ui.htmls import *
 from modules.youtube_manager import get_ytmetas
 from modules.deepl_api import DeepLAPI
+from modules.whisper_data_class import *
 class App:
     def __init__(self, args):
                         files_subtitles = gr.Files(label="Downloadable output file", scale=4, interactive=False)
                         btn_openfolder = gr.Button('📂', scale=1)
+                    params = [input_file, dd_file_format, cb_timestamp]
+                    whisper_params = WhisperGradioComponents(model_size=dd_model,
+                                                             lang=dd_lang,
+                                                             is_translate=cb_translate,
+                                                             beam_size=nb_beam_size,
+                                                             log_prob_threshold=nb_log_prob_threshold,
+                                                             no_speech_threshold=nb_no_speech_threshold,
+                                                             compute_type=dd_compute_type)
                     btn_run.click(fn=self.whisper_inf.transcribe_file,
+                                  inputs=params + whisper_params.to_list(),
                                   outputs=[tb_indicator, files_subtitles])
                     btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
                     dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
                         files_subtitles = gr.Files(label="Downloadable output file", scale=4)
                         btn_openfolder = gr.Button('📂', scale=1)
+                    params = [tb_youtubelink, dd_file_format, cb_timestamp]
+                    whisper_params = WhisperGradioComponents(model_size=dd_model,
+                                                             lang=dd_lang,
+                                                             is_translate=cb_translate,
+                                                             beam_size=nb_beam_size,
+                                                             log_prob_threshold=nb_log_prob_threshold,
+                                                             no_speech_threshold=nb_no_speech_threshold,
+                                                             compute_type=dd_compute_type)
                     btn_run.click(fn=self.whisper_inf.transcribe_youtube,
+                                  inputs=params + whisper_params.to_list(),
                                   outputs=[tb_indicator, files_subtitles])
                     tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink],
                                           outputs=[img_thumbnail, tb_title, tb_description])
                         files_subtitles = gr.Files(label="Downloadable output file", scale=4)
                         btn_openfolder = gr.Button('📂', scale=1)
+                    params = [mic_input, dd_file_format]
+                    whisper_params = WhisperGradioComponents(model_size=dd_model,
+                                                             lang=dd_lang,
+                                                             is_translate=cb_translate,
+                                                             beam_size=nb_beam_size,
+                                                             log_prob_threshold=nb_log_prob_threshold,
+                                                             no_speech_threshold=nb_no_speech_threshold,
+                                                             compute_type=dd_compute_type)
                     btn_run.click(fn=self.whisper_inf.transcribe_mic,
+                                  inputs=params + whisper_params.to_list(),
                                   outputs=[tb_indicator, files_subtitles])
                     btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
                     dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])

modules/faster_whisper_inference.py CHANGED Viewed

@@ -1,10 +1,9 @@
 import os
-import tqdm
 import time
 import numpy as np
 from typing import BinaryIO, Union, Tuple, List
-from datetime import datetime, timedelta
 import faster_whisper
 import ctranslate2
@@ -15,6 +14,7 @@ import gradio as gr
 from .base_interface import BaseInterface
 from modules.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
 from modules.youtube_manager import get_ytdata, get_ytaudio
 class FasterWhisperInference(BaseInterface):
@@ -26,22 +26,17 @@ class FasterWhisperInference(BaseInterface):
         self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
         self.translatable_models = ["large", "large-v1", "large-v2", "large-v3"]
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        self.available_compute_types = ctranslate2.get_supported_compute_types("cuda") if self.device == "cuda" else ctranslate2.get_supported_compute_types("cpu")
         self.current_compute_type = "float16" if self.device == "cuda" else "float32"
         self.default_beam_size = 1
     def transcribe_file(self,
                         fileobjs: list,
-                        model_size: str,
-                        lang: str,
                         file_format: str,
-                        istranslate: bool,
                         add_timestamp: bool,
-                        beam_size: int,
-                        log_prob_threshold: float,
-                        no_speech_threshold: float,
-                        compute_type: str,
-                        progress=gr.Progress()
                         ) -> list:
         """
         Write subtitle file from Files
@@ -50,31 +45,14 @@ class FasterWhisperInference(BaseInterface):
         ----------
         fileobjs: list
             List of files to transcribe from gr.Files()
-        model_size: str
-            Whisper model size from gr.Dropdown()
-        lang: str
-            Source language of the file to transcribe from gr.Dropdown()
         file_format: str
-            File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
-        istranslate: bool
-            Boolean value from gr.Checkbox() that determines whether to translate to English.
-            It's Whisper's feature to translate speech from another language directly into English end-to-end.
         add_timestamp: bool
-            Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
-        beam_size: int
-            Int value from gr.Number() that is used for decoding option.
-        log_prob_threshold: float
-            float value from gr.Number(). If the average log probability over sampled tokens is
-            below this value, treat as failed.
-        no_speech_threshold: float
-            float value from gr.Number(). If the no_speech probability is higher than this value AND
-            the average log probability over sampled tokens is below `log_prob_threshold`,
-            consider the segment as silent.
-        compute_type: str
-            compute type from gr.Dropdown().
-            see more info : https://opennmt.net/CTranslate2/quantization.html
         progress: gr.Progress
             Indicator to show progress directly in gradio.
         Returns
         ----------
@@ -83,18 +61,12 @@ class FasterWhisperInference(BaseInterface):
         Files to return to gr.Files()
         """
         try:
-            self.update_model_if_needed(model_size=model_size, compute_type=compute_type, progress=progress)
             files_info = {}
             for fileobj in fileobjs:
                 transcribed_segments, time_for_task = self.transcribe(
-                    audio=fileobj.name,
-                    lang=lang,
-                    istranslate=istranslate,
-                    beam_size=beam_size,
-                    log_prob_threshold=log_prob_threshold,
-                    no_speech_threshold=no_speech_threshold,
-                    progress=progress
                 )
                 file_name, file_ext = os.path.splitext(os.path.basename(fileobj.name))
@@ -105,7 +77,7 @@ class FasterWhisperInference(BaseInterface):
                     add_timestamp=add_timestamp,
                     file_format=file_format
                 )
-                files_info[file_name] = {"subtitle": subtitle, "time_for_task": time_for_task, "path":  file_path}
             total_result = ''
             total_time = 0
@@ -115,10 +87,10 @@ class FasterWhisperInference(BaseInterface):
                 total_result += f'{info["subtitle"]}'
                 total_time += info["time_for_task"]
-            gr_str = f"Done in {self.format_time(total_time)}! Subtitle is in the outputs folder.\n\n{total_result}"
-            gr_file_path = [info['path'] for info in files_info.values()]
-            return [gr_str, gr_file_path]
         except Exception as e:
             print(f"Error transcribing file on line {e}")
@@ -128,50 +100,27 @@ class FasterWhisperInference(BaseInterface):
                 self.remove_input_files([fileobj.name for fileobj in fileobjs])
     def transcribe_youtube(self,
-                           youtubelink: str,
-                           model_size: str,
-                           lang: str,
                            file_format: str,
-                           istranslate: bool,
                            add_timestamp: bool,
-                           beam_size: int,
-                           log_prob_threshold: float,
-                           no_speech_threshold: float,
-                           compute_type: str,
-                           progress=gr.Progress()
                            ) -> list:
         """
         Write subtitle file from Youtube
         Parameters
         ----------
-        youtubelink: str
-            Link of Youtube to transcribe from gr.Textbox()
-        model_size: str
-            Whisper model size from gr.Dropdown()
-        lang: str
-            Source language of the file to transcribe from gr.Dropdown()
         file_format: str
-            File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
-        istranslate: bool
-            Boolean value from gr.Checkbox() that determines whether to translate to English.
-            It's Whisper's feature to translate speech from another language directly into English end-to-end.
         add_timestamp: bool
             Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
-        beam_size: int
-            Int value from gr.Number() that is used for decoding option.
-        log_prob_threshold: float
-            float value from gr.Number(). If the average log probability over sampled tokens is
-            below this value, treat as failed.
-        no_speech_threshold: float
-            float value from gr.Number(). If the no_speech probability is higher than this value AND
-            the average log probability over sampled tokens is below `log_prob_threshold`,
-            consider the segment as silent.
-        compute_type: str
-            compute type from gr.Dropdown().
-            see more info : https://opennmt.net/CTranslate2/quantization.html
         progress: gr.Progress
             Indicator to show progress directly in gradio.
         Returns
         ----------
@@ -180,20 +129,14 @@ class FasterWhisperInference(BaseInterface):
         Files to return to gr.Files()
         """
         try:
-            self.update_model_if_needed(model_size=model_size, compute_type=compute_type, progress=progress)
             progress(0, desc="Loading Audio from Youtube..")
-            yt = get_ytdata(youtubelink)
             audio = get_ytaudio(yt)
             transcribed_segments, time_for_task = self.transcribe(
-                audio=audio,
-                lang=lang,
-                istranslate=istranslate,
-                beam_size=beam_size,
-                log_prob_threshold=log_prob_threshold,
-                no_speech_threshold=no_speech_threshold,
-                progress=progress
             )
             progress(1, desc="Completed!")
@@ -214,7 +157,7 @@ class FasterWhisperInference(BaseInterface):
         finally:
             try:
                 if 'yt' not in locals():
-                    yt = get_ytdata(youtubelink)
                     file_path = get_ytaudio(yt)
                 else:
                     file_path = get_ytaudio(yt)
@@ -225,47 +168,24 @@ class FasterWhisperInference(BaseInterface):
                 pass
     def transcribe_mic(self,
-                       micaudio: str,
-                       model_size: str,
-                       lang: str,
                        file_format: str,
-                       istranslate: bool,
-                       beam_size: int,
-                       log_prob_threshold: float,
-                       no_speech_threshold: float,
-                       compute_type: str,
-                       progress=gr.Progress()
                        ) -> list:
         """
         Write subtitle file from microphone
         Parameters
         ----------
-        micaudio: str
             Audio file path from gr.Microphone()
-        model_size: str
-            Whisper model size from gr.Dropdown()
-        lang: str
-            Source language of the file to transcribe from gr.Dropdown()
         file_format: str
-            File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
-        istranslate: bool
-            Boolean value from gr.Checkbox() that determines whether to translate to English.
-            It's Whisper's feature to translate speech from another language directly into English end-to-end.
-        beam_size: int
-            Int value from gr.Number() that is used for decoding option.
-        log_prob_threshold: float
-            float value from gr.Number(). If the average log probability over sampled tokens is
-            below this value, treat as failed.
-        no_speech_threshold: float
-            float value from gr.Number(). If the no_speech probability is higher than this value AND
-            the average log probability over sampled tokens is below `log_prob_threshold`,
-        compute_type: str
-            compute type from gr.Dropdown().
-            see more info : https://opennmt.net/CTranslate2/quantization.html
-            consider the segment as silent.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
         Returns
         ----------
@@ -274,18 +194,11 @@ class FasterWhisperInference(BaseInterface):
         Files to return to gr.Files()
         """
         try:
-            self.update_model_if_needed(model_size=model_size, compute_type=compute_type, progress=progress)
             progress(0, desc="Loading Audio..")
             transcribed_segments, time_for_task = self.transcribe(
-                audio=micaudio,
-                lang=lang,
-                istranslate=istranslate,
-                beam_size=beam_size,
-                log_prob_threshold=log_prob_threshold,
-                no_speech_threshold=no_speech_threshold,
-                progress=progress
             )
             progress(1, desc="Completed!")
@@ -302,16 +215,12 @@ class FasterWhisperInference(BaseInterface):
             print(f"Error transcribing file on line {e}")
         finally:
             self.release_cuda_memory()
-            self.remove_input_files([micaudio])
     def transcribe(self,
                    audio: Union[str, BinaryIO, np.ndarray],
-                   lang: str,
-                   istranslate: bool,
-                   beam_size: int,
-                   log_prob_threshold: float,
-                   no_speech_threshold: float,
-                   progress: gr.Progress
                    ) -> Tuple[List[dict], float]:
         """
         transcribe method for faster-whisper.
@@ -320,22 +229,10 @@ class FasterWhisperInference(BaseInterface):
         ----------
         audio: Union[str, BinaryIO, np.ndarray]
             Audio path or file binary or Audio numpy array
-        lang: str
-            Source language of the file to transcribe from gr.Dropdown()
-        istranslate: bool
-            Boolean value from gr.Checkbox() that determines whether to translate to English.
-            It's Whisper's feature to translate speech from another language directly into English end-to-end.
-        beam_size: int
-            Int value from gr.Number() that is used for decoding option.
-        log_prob_threshold: float
-            float value from gr.Number(). If the average log probability over sampled tokens is
-            below this value, treat as failed.
-        no_speech_threshold: float
-            float value from gr.Number(). If the no_speech probability is higher than this value AND
-            the average log probability over sampled tokens is below `log_prob_threshold`,
-            consider the segment as silent.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
         Returns
         ----------
@@ -346,18 +243,24 @@ class FasterWhisperInference(BaseInterface):
         """
         start_time = time.time()
-        if lang == "Automatic Detection":
             lang = None
         else:
             language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
-            lang = language_code_dict[lang]
         segments, info = self.model.transcribe(
             audio=audio,
             language=lang,
-            task="translate" if istranslate and self.current_model_size in self.translatable_models else "transcribe",
-            beam_size=beam_size,
-            log_prob_threshold=log_prob_threshold,
-            no_speech_threshold=no_speech_threshold,
         )
         progress(0, desc="Loading audio..")
@@ -373,24 +276,23 @@ class FasterWhisperInference(BaseInterface):
         elapsed_time = time.time() - start_time
         return segments_result, elapsed_time
-    def update_model_if_needed(self,
-                               model_size: str,
-                               compute_type: str,
-                               progress: gr.Progress
-                               ):
         """
-        Initialize model if it doesn't match with current model setting
         """
-        if model_size != self.current_model_size or self.model is None or self.current_compute_type != compute_type:
-            progress(0, desc="Initializing Model..")
-            self.current_model_size = model_size
-            self.current_compute_type = compute_type
-            self.model = faster_whisper.WhisperModel(
-                device=self.device,
-                model_size_or_path=model_size,
-                download_root=os.path.join("models", "Whisper", "faster-whisper"),
-                compute_type=self.current_compute_type
-            )
     @staticmethod
     def generate_and_write_file(file_name: str,

 import os
 import time
 import numpy as np
 from typing import BinaryIO, Union, Tuple, List
+from datetime import datetime
 import faster_whisper
 import ctranslate2
 from .base_interface import BaseInterface
 from modules.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
 from modules.youtube_manager import get_ytdata, get_ytaudio
+from modules.whisper_data_class import *
 class FasterWhisperInference(BaseInterface):
         self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
         self.translatable_models = ["large", "large-v1", "large-v2", "large-v3"]
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.available_compute_types = ctranslate2.get_supported_compute_types(
+            "cuda") if self.device == "cuda" else ctranslate2.get_supported_compute_types("cpu")
         self.current_compute_type = "float16" if self.device == "cuda" else "float32"
         self.default_beam_size = 1
     def transcribe_file(self,
                         fileobjs: list,
                         file_format: str,
                         add_timestamp: bool,
+                        progress=gr.Progress(),
+                        *whisper_params,
                         ) -> list:
         """
         Write subtitle file from Files
         ----------
         fileobjs: list
             List of files to transcribe from gr.Files()
         file_format: str
+            Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
         add_timestamp: bool
+            Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the subtitle filename.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
+        *whisper_params: tuple
+            Gradio components related to Whisper. see whisper_data_class.py for details.
         Returns
         ----------
         Files to return to gr.Files()
         """
         try:
             files_info = {}
             for fileobj in fileobjs:
                 transcribed_segments, time_for_task = self.transcribe(
+                    fileobj.name,
+                    progress,
+                    *whisper_params,
                 )
                 file_name, file_ext = os.path.splitext(os.path.basename(fileobj.name))
                     add_timestamp=add_timestamp,
                     file_format=file_format
                 )
+                files_info[file_name] = {"subtitle": subtitle, "time_for_task": time_for_task, "path": file_path}
             total_result = ''
             total_time = 0
                 total_result += f'{info["subtitle"]}'
                 total_time += info["time_for_task"]
+            result_str = f"Done in {self.format_time(total_time)}! Subtitle is in the outputs folder.\n\n{total_result}"
+            result_file_path = [info['path'] for info in files_info.values()]
+            return [result_str, result_file_path]
         except Exception as e:
             print(f"Error transcribing file on line {e}")
                 self.remove_input_files([fileobj.name for fileobj in fileobjs])
     def transcribe_youtube(self,
+                           youtube_link: str,
                            file_format: str,
                            add_timestamp: bool,
+                           progress=gr.Progress(),
+                           *whisper_params,
                            ) -> list:
         """
         Write subtitle file from Youtube
         Parameters
         ----------
+        youtube_link: str
+            URL of the Youtube video to transcribe from gr.Textbox()
         file_format: str
+            Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
         add_timestamp: bool
             Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
+        *whisper_params: tuple
+            Gradio components related to Whisper. see whisper_data_class.py for details.
         Returns
         ----------
         Files to return to gr.Files()
         """
         try:
             progress(0, desc="Loading Audio from Youtube..")
+            yt = get_ytdata(youtube_link)
             audio = get_ytaudio(yt)
             transcribed_segments, time_for_task = self.transcribe(
+                audio,
+                progress,
+                *whisper_params,
             )
             progress(1, desc="Completed!")
         finally:
             try:
                 if 'yt' not in locals():
+                    yt = get_ytdata(youtube_link)
                     file_path = get_ytaudio(yt)
                 else:
                     file_path = get_ytaudio(yt)
                 pass
     def transcribe_mic(self,
+                       mic_audio: str,
                        file_format: str,
+                       progress=gr.Progress(),
+                       *whisper_params,
                        ) -> list:
         """
         Write subtitle file from microphone
         Parameters
         ----------
+        mic_audio: str
             Audio file path from gr.Microphone()
         file_format: str
+            Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
         progress: gr.Progress
             Indicator to show progress directly in gradio.
+        *whisper_params: tuple
+            Gradio components related to Whisper. see whisper_data_class.py for details.
         Returns
         ----------
         Files to return to gr.Files()
         """
         try:
             progress(0, desc="Loading Audio..")
             transcribed_segments, time_for_task = self.transcribe(
+                mic_audio,
+                progress,
+                *whisper_params,
             )
             progress(1, desc="Completed!")
             print(f"Error transcribing file on line {e}")
         finally:
             self.release_cuda_memory()
+            self.remove_input_files([mic_audio])
     def transcribe(self,
                    audio: Union[str, BinaryIO, np.ndarray],
+                   progress: gr.Progress,
+                   *whisper_params,
                    ) -> Tuple[List[dict], float]:
         """
         transcribe method for faster-whisper.
         ----------
         audio: Union[str, BinaryIO, np.ndarray]
             Audio path or file binary or Audio numpy array
         progress: gr.Progress
             Indicator to show progress directly in gradio.
+        *whisper_params: tuple
+            Gradio components related to Whisper. see whisper_data_class.py for details.
         Returns
         ----------
         """
         start_time = time.time()
+        params = WhisperGradioComponents.to_values(*whisper_params)
+        if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
+            self.update_model(params.model_size, params.compute_type, progress)
+        if params.lang == "Automatic Detection":
             lang = None
         else:
             language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
+            lang = language_code_dict[params.lang]
         segments, info = self.model.transcribe(
             audio=audio,
             language=lang,
+            task="translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
+            beam_size=params.beam_size,
+            log_prob_threshold=params.log_prob_threshold,
+            no_speech_threshold=params.no_speech_threshold,
         )
         progress(0, desc="Loading audio..")
         elapsed_time = time.time() - start_time
         return segments_result, elapsed_time
+    def update_model(self,
+                     model_size: str,
+                     compute_type: str,
+                     progress: gr.Progress
+                     ):
         """
+        update current model setting
         """
+        progress(0, desc="Initializing Model..")
+        self.current_model_size = model_size
+        self.current_compute_type = compute_type
+        self.model = faster_whisper.WhisperModel(
+            device=self.device,
+            model_size_or_path=model_size,
+            download_root=os.path.join("models", "Whisper", "faster-whisper"),
+            compute_type=self.current_compute_type
+        )
     @staticmethod
     def generate_and_write_file(file_name: str,