Spaces:

jhj0517
/

Whisper-WebUI

Running

App Files Files Community

jhj0517 committed on May 17

Commit

abc6224

•

1 Parent(s): 46f2826

add `vad_filter`

Browse files

Files changed (3) hide show

app.py +9 -3
modules/faster_whisper_inference.py +1 -0
modules/whisper_parameter.py +10 -3

app.py CHANGED Viewed

@@ -59,6 +59,7 @@ class App:
                     with gr.Row():
                         cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
                     with gr.Accordion("Advanced_Parameters", open=False):
                         nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
                         nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
                         nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
@@ -89,7 +90,8 @@ class App:
                                                              condition_on_previous_text=cb_condition_on_previous_text,
                                                              initial_prompt=tb_initial_prompt,
                                                              temperature=sd_temperature,
-                                                             compression_ratio_threshold=nb_compression_ratio_threshold)
                     btn_run.click(fn=self.whisper_inf.transcribe_file,
                                   inputs=params + whisper_params.to_list(),
                                   outputs=[tb_indicator, files_subtitles])
@@ -117,6 +119,7 @@ class App:
                         cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
                                                    interactive=True)
                     with gr.Accordion("Advanced_Parameters", open=False):
                         nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
                         nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
                         nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
@@ -147,7 +150,8 @@ class App:
                                                              condition_on_previous_text=cb_condition_on_previous_text,
                                                              initial_prompt=tb_initial_prompt,
                                                              temperature=sd_temperature,
-                                                             compression_ratio_threshold=nb_compression_ratio_threshold)
                     btn_run.click(fn=self.whisper_inf.transcribe_youtube,
                                   inputs=params + whisper_params.to_list(),
                                   outputs=[tb_indicator, files_subtitles])
@@ -168,6 +172,7 @@ class App:
                     with gr.Row():
                         cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
                     with gr.Accordion("Advanced_Parameters", open=False):
                         nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
                         nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
                         nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
@@ -197,7 +202,8 @@ class App:
                                                              condition_on_previous_text=cb_condition_on_previous_text,
                                                              initial_prompt=tb_initial_prompt,
                                                              temperature=sd_temperature,
-                                                             compression_ratio_threshold=nb_compression_ratio_threshold)
                     btn_run.click(fn=self.whisper_inf.transcribe_mic,
                                   inputs=params + whisper_params.to_list(),
                                   outputs=[tb_indicator, files_subtitles])

                     with gr.Row():
                         cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
                     with gr.Accordion("Advanced_Parameters", open=False):
+                        cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
                         nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
                         nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
                         nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
                                                              condition_on_previous_text=cb_condition_on_previous_text,
                                                              initial_prompt=tb_initial_prompt,
                                                              temperature=sd_temperature,
+                                                             compression_ratio_threshold=nb_compression_ratio_threshold,
+                                                             vad_filter=cb_vad_filter)
                     btn_run.click(fn=self.whisper_inf.transcribe_file,
                                   inputs=params + whisper_params.to_list(),
                                   outputs=[tb_indicator, files_subtitles])
                         cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
                                                    interactive=True)
                     with gr.Accordion("Advanced_Parameters", open=False):
+                        cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
                         nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
                         nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
                         nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
                                                              condition_on_previous_text=cb_condition_on_previous_text,
                                                              initial_prompt=tb_initial_prompt,
                                                              temperature=sd_temperature,
+                                                             compression_ratio_threshold=nb_compression_ratio_threshold,
+                                                             vad_filter=cb_vad_filter)
                     btn_run.click(fn=self.whisper_inf.transcribe_youtube,
                                   inputs=params + whisper_params.to_list(),
                                   outputs=[tb_indicator, files_subtitles])
                     with gr.Row():
                         cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
                     with gr.Accordion("Advanced_Parameters", open=False):
+                        cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
                         nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
                         nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
                         nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
                                                              condition_on_previous_text=cb_condition_on_previous_text,
                                                              initial_prompt=tb_initial_prompt,
                                                              temperature=sd_temperature,
+                                                             compression_ratio_threshold=nb_compression_ratio_threshold,
+                                                             vad_filter=cb_vad_filter)
                     btn_run.click(fn=self.whisper_inf.transcribe_mic,
                                   inputs=params + whisper_params.to_list(),
                                   outputs=[tb_indicator, files_subtitles])

modules/faster_whisper_inference.py CHANGED Viewed

@@ -271,6 +271,7 @@ class FasterWhisperInference(BaseInterface):
             patience=params.patience,
             temperature=params.temperature,
             compression_ratio_threshold=params.compression_ratio_threshold,
         )
         progress(0, desc="Loading audio..")

             patience=params.patience,
             temperature=params.temperature,
             compression_ratio_threshold=params.compression_ratio_threshold,
+            vad_filter=params.vad_filter,
         )
         progress(0, desc="Loading audio..")

modules/whisper_parameter.py CHANGED Viewed

@@ -18,6 +18,7 @@ class WhisperGradioComponents:
     initial_prompt: gr.Textbox
     temperature: gr.Slider
     compression_ratio_threshold: gr.Number
     """
     A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
     See more about Gradio pre-processing: https://www.gradio.app/docs/components
@@ -66,12 +67,17 @@ class WhisperGradioComponents:
         to make it more likely to predict those words correctly.
     temperature: gr.Slider
-            Temperature for sampling. It can be a tuple of temperatures,
-            which will be successively used upon failures according to either
-            `compression_ratio_threshold` or `log_prob_threshold`.
     compression_ratio_threshold: gr.Number
         If the gzip compression ratio is above this value, treat as failed
     """
     def to_list(self) -> list:
@@ -101,6 +107,7 @@ class WhisperValues:
     initial_prompt: Optional[str]
     temperature: float
     compression_ratio_threshold: float
     """
     A data class to use Whisper parameters. Use "after" Gradio pre-processing.
     See more about Gradio pre-processing: https://www.gradio.app/docs/components

     initial_prompt: gr.Textbox
     temperature: gr.Slider
     compression_ratio_threshold: gr.Number
+    vad_filter: gr.Checkbox
     """
     A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
     See more about Gradio pre-processing: https://www.gradio.app/docs/components
         to make it more likely to predict those words correctly.
     temperature: gr.Slider
+        Temperature for sampling. It can be a tuple of temperatures,
+        which will be successively used upon failures according to either
+        `compression_ratio_threshold` or `log_prob_threshold`.
     compression_ratio_threshold: gr.Number
         If the gzip compression ratio is above this value, treat as failed
+    vad_filter: gr.Checkbox
+        Enable the voice activity detection (VAD) to filter out parts of the audio
+        without speech. This step is using the Silero VAD model
+        https://github.com/snakers4/silero-vad.
     """
     def to_list(self) -> list:
     initial_prompt: Optional[str]
     temperature: float
     compression_ratio_threshold: float
+    vad_filter: bool
     """
     A data class to use Whisper parameters. Use "after" Gradio pre-processing.
     See more about Gradio pre-processing: https://www.gradio.app/docs/components