Spaces:
Running
Running
jhj0517
committed on
Commit
•
abc6224
1
Parent(s):
46f2826
add `vad_filter`
Browse files
- app.py +9 -3
- modules/faster_whisper_inference.py +1 -0
- modules/whisper_parameter.py +10 -3
app.py
CHANGED
@@ -59,6 +59,7 @@ class App:
|
|
59 |
with gr.Row():
|
60 |
cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
|
61 |
with gr.Accordion("Advanced_Parameters", open=False):
|
|
|
62 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
63 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
64 |
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
@@ -89,7 +90,8 @@ class App:
|
|
89 |
condition_on_previous_text=cb_condition_on_previous_text,
|
90 |
initial_prompt=tb_initial_prompt,
|
91 |
temperature=sd_temperature,
|
92 |
-
compression_ratio_threshold=nb_compression_ratio_threshold
|
|
|
93 |
btn_run.click(fn=self.whisper_inf.transcribe_file,
|
94 |
inputs=params + whisper_params.to_list(),
|
95 |
outputs=[tb_indicator, files_subtitles])
|
@@ -117,6 +119,7 @@ class App:
|
|
117 |
cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
|
118 |
interactive=True)
|
119 |
with gr.Accordion("Advanced_Parameters", open=False):
|
|
|
120 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
121 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
122 |
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
@@ -147,7 +150,8 @@ class App:
|
|
147 |
condition_on_previous_text=cb_condition_on_previous_text,
|
148 |
initial_prompt=tb_initial_prompt,
|
149 |
temperature=sd_temperature,
|
150 |
-
compression_ratio_threshold=nb_compression_ratio_threshold
|
|
|
151 |
btn_run.click(fn=self.whisper_inf.transcribe_youtube,
|
152 |
inputs=params + whisper_params.to_list(),
|
153 |
outputs=[tb_indicator, files_subtitles])
|
@@ -168,6 +172,7 @@ class App:
|
|
168 |
with gr.Row():
|
169 |
cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
|
170 |
with gr.Accordion("Advanced_Parameters", open=False):
|
|
|
171 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
172 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
173 |
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
@@ -197,7 +202,8 @@ class App:
|
|
197 |
condition_on_previous_text=cb_condition_on_previous_text,
|
198 |
initial_prompt=tb_initial_prompt,
|
199 |
temperature=sd_temperature,
|
200 |
-
compression_ratio_threshold=nb_compression_ratio_threshold
|
|
|
201 |
btn_run.click(fn=self.whisper_inf.transcribe_mic,
|
202 |
inputs=params + whisper_params.to_list(),
|
203 |
outputs=[tb_indicator, files_subtitles])
|
|
|
59 |
with gr.Row():
|
60 |
cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
|
61 |
with gr.Accordion("Advanced_Parameters", open=False):
|
62 |
+
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
|
63 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
64 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
65 |
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
|
|
90 |
condition_on_previous_text=cb_condition_on_previous_text,
|
91 |
initial_prompt=tb_initial_prompt,
|
92 |
temperature=sd_temperature,
|
93 |
+
compression_ratio_threshold=nb_compression_ratio_threshold,
|
94 |
+
vad_filter=cb_vad_filter)
|
95 |
btn_run.click(fn=self.whisper_inf.transcribe_file,
|
96 |
inputs=params + whisper_params.to_list(),
|
97 |
outputs=[tb_indicator, files_subtitles])
|
|
|
119 |
cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
|
120 |
interactive=True)
|
121 |
with gr.Accordion("Advanced_Parameters", open=False):
|
122 |
+
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
|
123 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
124 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
125 |
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
|
|
150 |
condition_on_previous_text=cb_condition_on_previous_text,
|
151 |
initial_prompt=tb_initial_prompt,
|
152 |
temperature=sd_temperature,
|
153 |
+
compression_ratio_threshold=nb_compression_ratio_threshold,
|
154 |
+
vad_filter=cb_vad_filter)
|
155 |
btn_run.click(fn=self.whisper_inf.transcribe_youtube,
|
156 |
inputs=params + whisper_params.to_list(),
|
157 |
outputs=[tb_indicator, files_subtitles])
|
|
|
172 |
with gr.Row():
|
173 |
cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
|
174 |
with gr.Accordion("Advanced_Parameters", open=False):
|
175 |
+
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
|
176 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
177 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
178 |
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
|
|
202 |
condition_on_previous_text=cb_condition_on_previous_text,
|
203 |
initial_prompt=tb_initial_prompt,
|
204 |
temperature=sd_temperature,
|
205 |
+
compression_ratio_threshold=nb_compression_ratio_threshold,
|
206 |
+
vad_filter=cb_vad_filter)
|
207 |
btn_run.click(fn=self.whisper_inf.transcribe_mic,
|
208 |
inputs=params + whisper_params.to_list(),
|
209 |
outputs=[tb_indicator, files_subtitles])
|
modules/faster_whisper_inference.py
CHANGED
@@ -271,6 +271,7 @@ class FasterWhisperInference(BaseInterface):
|
|
271 |
patience=params.patience,
|
272 |
temperature=params.temperature,
|
273 |
compression_ratio_threshold=params.compression_ratio_threshold,
|
|
|
274 |
)
|
275 |
progress(0, desc="Loading audio..")
|
276 |
|
|
|
271 |
patience=params.patience,
|
272 |
temperature=params.temperature,
|
273 |
compression_ratio_threshold=params.compression_ratio_threshold,
|
274 |
+
vad_filter=params.vad_filter,
|
275 |
)
|
276 |
progress(0, desc="Loading audio..")
|
277 |
|
modules/whisper_parameter.py
CHANGED
@@ -18,6 +18,7 @@ class WhisperGradioComponents:
|
|
18 |
initial_prompt: gr.Textbox
|
19 |
temperature: gr.Slider
|
20 |
compression_ratio_threshold: gr.Number
|
|
|
21 |
"""
|
22 |
A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
|
23 |
See more about Gradio pre-processing: https://www.gradio.app/docs/components
|
@@ -66,12 +67,17 @@ class WhisperGradioComponents:
|
|
66 |
to make it more likely to predict those word correctly.
|
67 |
|
68 |
temperature: gr.Slider
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
|
73 |
compression_ratio_threshold: gr.Number
|
74 |
If the gzip compression ratio is above this value, treat as failed
|
|
|
|
|
|
|
|
|
|
|
75 |
"""
|
76 |
|
77 |
def to_list(self) -> list:
|
@@ -101,6 +107,7 @@ class WhisperValues:
|
|
101 |
initial_prompt: Optional[str]
|
102 |
temperature: float
|
103 |
compression_ratio_threshold: float
|
|
|
104 |
"""
|
105 |
A data class to use Whisper parameters. Use "after" Gradio pre-processing.
|
106 |
See more about Gradio pre-processing: : https://www.gradio.app/docs/components
|
|
|
18 |
initial_prompt: gr.Textbox
|
19 |
temperature: gr.Slider
|
20 |
compression_ratio_threshold: gr.Number
|
21 |
+
vad_filter: gr.Checkbox
|
22 |
"""
|
23 |
A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
|
24 |
See more about Gradio pre-processing: https://www.gradio.app/docs/components
|
|
|
67 |
to make it more likely to predict those word correctly.
|
68 |
|
69 |
temperature: gr.Slider
|
70 |
+
Temperature for sampling. It can be a tuple of temperatures,
|
71 |
+
which will be successively used upon failures according to either
|
72 |
+
`compression_ratio_threshold` or `log_prob_threshold`.
|
73 |
|
74 |
compression_ratio_threshold: gr.Number
|
75 |
If the gzip compression ratio is above this value, treat as failed
|
76 |
+
|
77 |
+
vad_filter: gr.Checkbox
|
78 |
+
Enable the voice activity detection (VAD) to filter out parts of the audio
|
79 |
+
without speech. This step is using the Silero VAD model
|
80 |
+
https://github.com/snakers4/silero-vad.
|
81 |
"""
|
82 |
|
83 |
def to_list(self) -> list:
|
|
|
107 |
initial_prompt: Optional[str]
|
108 |
temperature: float
|
109 |
compression_ratio_threshold: float
|
110 |
+
vad_filter: bool
|
111 |
"""
|
112 |
A data class to use Whisper parameters. Use "after" Gradio pre-processing.
|
113 |
See more about Gradio pre-processing: : https://www.gradio.app/docs/components
|