jhj0517 committed on
Commit
abc6224
1 Parent(s): 46f2826

add `vad_filter`

Browse files
app.py CHANGED
@@ -59,6 +59,7 @@ class App:
59
  with gr.Row():
60
  cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
61
  with gr.Accordion("Advanced_Parameters", open=False):
 
62
  nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
63
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
64
  nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
@@ -89,7 +90,8 @@ class App:
89
  condition_on_previous_text=cb_condition_on_previous_text,
90
  initial_prompt=tb_initial_prompt,
91
  temperature=sd_temperature,
92
- compression_ratio_threshold=nb_compression_ratio_threshold)
 
93
  btn_run.click(fn=self.whisper_inf.transcribe_file,
94
  inputs=params + whisper_params.to_list(),
95
  outputs=[tb_indicator, files_subtitles])
@@ -117,6 +119,7 @@ class App:
117
  cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
118
  interactive=True)
119
  with gr.Accordion("Advanced_Parameters", open=False):
 
120
  nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
121
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
122
  nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
@@ -147,7 +150,8 @@ class App:
147
  condition_on_previous_text=cb_condition_on_previous_text,
148
  initial_prompt=tb_initial_prompt,
149
  temperature=sd_temperature,
150
- compression_ratio_threshold=nb_compression_ratio_threshold)
 
151
  btn_run.click(fn=self.whisper_inf.transcribe_youtube,
152
  inputs=params + whisper_params.to_list(),
153
  outputs=[tb_indicator, files_subtitles])
@@ -168,6 +172,7 @@ class App:
168
  with gr.Row():
169
  cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
170
  with gr.Accordion("Advanced_Parameters", open=False):
 
171
  nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
172
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
173
  nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
@@ -197,7 +202,8 @@ class App:
197
  condition_on_previous_text=cb_condition_on_previous_text,
198
  initial_prompt=tb_initial_prompt,
199
  temperature=sd_temperature,
200
- compression_ratio_threshold=nb_compression_ratio_threshold)
 
201
  btn_run.click(fn=self.whisper_inf.transcribe_mic,
202
  inputs=params + whisper_params.to_list(),
203
  outputs=[tb_indicator, files_subtitles])
 
59
  with gr.Row():
60
  cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
61
  with gr.Accordion("Advanced_Parameters", open=False):
62
+ cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
63
  nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
64
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
65
  nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
 
90
  condition_on_previous_text=cb_condition_on_previous_text,
91
  initial_prompt=tb_initial_prompt,
92
  temperature=sd_temperature,
93
+ compression_ratio_threshold=nb_compression_ratio_threshold,
94
+ vad_filter=cb_vad_filter)
95
  btn_run.click(fn=self.whisper_inf.transcribe_file,
96
  inputs=params + whisper_params.to_list(),
97
  outputs=[tb_indicator, files_subtitles])
 
119
  cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
120
  interactive=True)
121
  with gr.Accordion("Advanced_Parameters", open=False):
122
+ cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
123
  nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
124
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
125
  nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
 
150
  condition_on_previous_text=cb_condition_on_previous_text,
151
  initial_prompt=tb_initial_prompt,
152
  temperature=sd_temperature,
153
+ compression_ratio_threshold=nb_compression_ratio_threshold,
154
+ vad_filter=cb_vad_filter)
155
  btn_run.click(fn=self.whisper_inf.transcribe_youtube,
156
  inputs=params + whisper_params.to_list(),
157
  outputs=[tb_indicator, files_subtitles])
 
172
  with gr.Row():
173
  cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
174
  with gr.Accordion("Advanced_Parameters", open=False):
175
+ cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
176
  nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
177
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
178
  nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
 
202
  condition_on_previous_text=cb_condition_on_previous_text,
203
  initial_prompt=tb_initial_prompt,
204
  temperature=sd_temperature,
205
+ compression_ratio_threshold=nb_compression_ratio_threshold,
206
+ vad_filter=cb_vad_filter)
207
  btn_run.click(fn=self.whisper_inf.transcribe_mic,
208
  inputs=params + whisper_params.to_list(),
209
  outputs=[tb_indicator, files_subtitles])
modules/faster_whisper_inference.py CHANGED
@@ -271,6 +271,7 @@ class FasterWhisperInference(BaseInterface):
271
  patience=params.patience,
272
  temperature=params.temperature,
273
  compression_ratio_threshold=params.compression_ratio_threshold,
 
274
  )
275
  progress(0, desc="Loading audio..")
276
 
 
271
  patience=params.patience,
272
  temperature=params.temperature,
273
  compression_ratio_threshold=params.compression_ratio_threshold,
274
+ vad_filter=params.vad_filter,
275
  )
276
  progress(0, desc="Loading audio..")
277
 
modules/whisper_parameter.py CHANGED
@@ -18,6 +18,7 @@ class WhisperGradioComponents:
18
  initial_prompt: gr.Textbox
19
  temperature: gr.Slider
20
  compression_ratio_threshold: gr.Number
 
21
  """
22
  A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
23
  See more about Gradio pre-processing: https://www.gradio.app/docs/components
@@ -66,12 +67,17 @@ class WhisperGradioComponents:
66
  to make it more likely to predict those words correctly.
67
 
68
  temperature: gr.Slider
69
- Temperature for sampling. It can be a tuple of temperatures,
70
- which will be successively used upon failures according to either
71
- `compression_ratio_threshold` or `log_prob_threshold`.
72
 
73
  compression_ratio_threshold: gr.Number
74
  If the gzip compression ratio is above this value, treat as failed
 
 
 
 
 
75
  """
76
 
77
  def to_list(self) -> list:
@@ -101,6 +107,7 @@ class WhisperValues:
101
  initial_prompt: Optional[str]
102
  temperature: float
103
  compression_ratio_threshold: float
 
104
  """
105
  A data class to use Whisper parameters. Use "after" Gradio pre-processing.
106
  See more about Gradio pre-processing: https://www.gradio.app/docs/components
 
18
  initial_prompt: gr.Textbox
19
  temperature: gr.Slider
20
  compression_ratio_threshold: gr.Number
21
+ vad_filter: gr.Checkbox
22
  """
23
  A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
24
  See more about Gradio pre-processing: https://www.gradio.app/docs/components
 
67
  to make it more likely to predict those words correctly.
68
 
69
  temperature: gr.Slider
70
+ Temperature for sampling. It can be a tuple of temperatures,
71
+ which will be successively used upon failures according to either
72
+ `compression_ratio_threshold` or `log_prob_threshold`.
73
 
74
  compression_ratio_threshold: gr.Number
75
  If the gzip compression ratio is above this value, treat as failed
76
+
77
+ vad_filter: gr.Checkbox
78
+ Enable the voice activity detection (VAD) to filter out parts of the audio
79
+ without speech. This step is using the Silero VAD model
80
+ https://github.com/snakers4/silero-vad.
81
  """
82
 
83
  def to_list(self) -> list:
 
107
  initial_prompt: Optional[str]
108
  temperature: float
109
  compression_ratio_threshold: float
110
+ vad_filter: bool
111
  """
112
  A data class to use Whisper parameters. Use "after" Gradio pre-processing.
113
  See more about Gradio pre-processing: https://www.gradio.app/docs/components