Spaces:
Running
Running
Make it easier to use the old segmentation strategy
Browse files
- app.py +24 -20
- cli.py +3 -3
- src/vad.py +58 -22
app.py
CHANGED
@@ -14,7 +14,7 @@ import gradio as gr
|
|
14 |
|
15 |
from src.download import ExceededMaximumDuration, download_url
|
16 |
from src.utils import slugify, write_srt, write_vtt
|
17 |
-
from src.vad import VadPeriodicTranscription, VadSileroTranscription
|
18 |
|
19 |
# Limitations (set to -1 to disable)
|
20 |
DEFAULT_INPUT_AUDIO_MAX_DURATION = 600 # seconds
|
@@ -94,25 +94,17 @@ class WhisperTranscriber:
|
|
94 |
|
95 |
# The results
|
96 |
if (vad == 'silero-vad'):
|
97 |
-
#
|
98 |
-
|
99 |
-
self.vad_model = VadSileroTranscription()
|
100 |
-
|
101 |
-
process_gaps = VadSileroTranscription(transcribe_non_speech = True,
|
102 |
-
max_silent_period=vadMergeWindow, max_merge_size=vadMaxMergeSize,
|
103 |
-
segment_padding_left=vadPadding, segment_padding_right=vadPadding,
|
104 |
-
max_prompt_window=vadPromptWindow, copy=self.vad_model)
|
105 |
result = process_gaps.transcribe(audio_path, whisperCallable)
|
106 |
elif (vad == 'silero-vad-skip-gaps'):
|
107 |
-
#
|
108 |
-
|
109 |
-
self.vad_model = VadSileroTranscription()
|
110 |
-
|
111 |
-
skip_gaps = VadSileroTranscription(transcribe_non_speech = False,
|
112 |
-
max_silent_period=vadMergeWindow, max_merge_size=vadMaxMergeSize,
|
113 |
-
segment_padding_left=vadPadding, segment_padding_right=vadPadding,
|
114 |
-
max_prompt_window=vadPromptWindow, copy=self.vad_model)
|
115 |
result = skip_gaps.transcribe(audio_path, whisperCallable)
|
|
|
|
|
|
|
|
|
116 |
elif (vad == 'periodic-vad'):
|
117 |
# Very simple VAD - mark every 5 minutes as speech. This makes it less likely that Whisper enters an infinite loop, but
|
118 |
# it may create a break in the middle of a sentence, causing some artifacts.
|
@@ -124,6 +116,18 @@ class WhisperTranscriber:
|
|
124 |
|
125 |
return result
|
126 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
def write_result(self, result: dict, source_name: str, output_dir: str):
|
128 |
if not os.path.exists(output_dir):
|
129 |
os.makedirs(output_dir)
|
@@ -218,11 +222,11 @@ def create_ui(inputAudioMaxDuration, share=False, server_name: str = None):
|
|
218 |
gr.Audio(source="upload", type="filepath", label="Upload Audio"),
|
219 |
gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
|
220 |
gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
|
221 |
-
gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "periodic-vad"], label="VAD"),
|
222 |
-
gr.Number(label="VAD - Merge Window (s)", precision=0, value=
|
223 |
gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=30),
|
224 |
gr.Number(label="VAD - Padding (s)", precision=None, value=1),
|
225 |
-
gr.Number(label="VAD - Prompt Window (s)", precision=None, value=
|
226 |
], outputs=[
|
227 |
gr.File(label="Download"),
|
228 |
gr.Text(label="Transcription"),
|
|
|
14 |
|
15 |
from src.download import ExceededMaximumDuration, download_url
|
16 |
from src.utils import slugify, write_srt, write_vtt
|
17 |
+
from src.vad import NonSpeechStrategy, VadPeriodicTranscription, VadSileroTranscription
|
18 |
|
19 |
# Limitations (set to -1 to disable)
|
20 |
DEFAULT_INPUT_AUDIO_MAX_DURATION = 600 # seconds
|
|
|
94 |
|
95 |
# The results
|
96 |
if (vad == 'silero-vad'):
|
97 |
+
# Silero VAD where non-speech gaps are transcribed
|
98 |
+
process_gaps = self._create_silero_vad(NonSpeechStrategy.CREATE_SEGMENT, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
result = process_gaps.transcribe(audio_path, whisperCallable)
|
100 |
elif (vad == 'silero-vad-skip-gaps'):
|
101 |
+
# Silero VAD where non-speech gaps are simply ignored
|
102 |
+
skip_gaps = self._create_silero_vad(NonSpeechStrategy.SKIP, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
result = skip_gaps.transcribe(audio_path, whisperCallable)
|
104 |
+
elif (vad == 'silero-vad-expand-into-gaps'):
|
105 |
+
# Use Silero VAD where speech-segments are expanded into non-speech gaps
|
106 |
+
expand_gaps = self._create_silero_vad(NonSpeechStrategy.EXPAND_SEGMENT, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow)
|
107 |
+
result = expand_gaps.transcribe(audio_path, whisperCallable)
|
108 |
elif (vad == 'periodic-vad'):
|
109 |
# Very simple VAD - mark every 5 minutes as speech. This makes it less likely that Whisper enters an infinite loop, but
|
110 |
# it may create a break in the middle of a sentence, causing some artifacts.
|
|
|
116 |
|
117 |
return result
|
118 |
|
119 |
+
def _create_silero_vad(self, non_speech_strategy: NonSpeechStrategy, vadMergeWindow: float = 5, vadMaxMergeSize: float = 150, vadPadding: float = 1, vadPromptWindow: float = 1):
|
120 |
+
# Use Silero VAD
|
121 |
+
if (self.vad_model is None):
|
122 |
+
self.vad_model = VadSileroTranscription()
|
123 |
+
|
124 |
+
result = VadSileroTranscription(non_speech_strategy = non_speech_strategy,
|
125 |
+
max_silent_period=vadMergeWindow, max_merge_size=vadMaxMergeSize,
|
126 |
+
segment_padding_left=vadPadding, segment_padding_right=vadPadding,
|
127 |
+
max_prompt_window=vadPromptWindow, copy=self.vad_model)
|
128 |
+
|
129 |
+
return result
|
130 |
+
|
131 |
def write_result(self, result: dict, source_name: str, output_dir: str):
|
132 |
if not os.path.exists(output_dir):
|
133 |
os.makedirs(output_dir)
|
|
|
222 |
gr.Audio(source="upload", type="filepath", label="Upload Audio"),
|
223 |
gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
|
224 |
gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
|
225 |
+
gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], label="VAD"),
|
226 |
+
gr.Number(label="VAD - Merge Window (s)", precision=0, value=5),
|
227 |
gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=30),
|
228 |
gr.Number(label="VAD - Padding (s)", precision=None, value=1),
|
229 |
+
gr.Number(label="VAD - Prompt Window (s)", precision=None, value=3)
|
230 |
], outputs=[
|
231 |
gr.File(label="Download"),
|
232 |
gr.Text(label="Transcription"),
|
cli.py
CHANGED
@@ -26,11 +26,11 @@ def cli():
|
|
26 |
parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
|
27 |
parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES), help="language spoken in the audio, specify None to perform language detection")
|
28 |
|
29 |
-
parser.add_argument("--vad", type=str, default="none", choices=["none", "silero-vad", "silero-vad-skip-gaps", "periodic-vad"], help="The voice activity detection algorithm to use")
|
30 |
parser.add_argument("--vad_merge_window", type=optional_float, default=5, help="The window size (in seconds) to merge voice segments")
|
31 |
-
parser.add_argument("--vad_max_merge_size", type=optional_float, default=
|
32 |
parser.add_argument("--vad_padding", type=optional_float, default=1, help="The padding (in seconds) to add to each voice segment")
|
33 |
-
parser.add_argument("--vad_prompt_window", type=optional_float, default=
|
34 |
|
35 |
parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
|
36 |
parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")
|
|
|
26 |
parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
|
27 |
parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES), help="language spoken in the audio, specify None to perform language detection")
|
28 |
|
29 |
+
parser.add_argument("--vad", type=str, default="none", choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], help="The voice activity detection algorithm to use")
|
30 |
parser.add_argument("--vad_merge_window", type=optional_float, default=5, help="The window size (in seconds) to merge voice segments")
|
31 |
+
parser.add_argument("--vad_max_merge_size", type=optional_float, default=30, help="The maximum size (in seconds) of a voice segment")
|
32 |
parser.add_argument("--vad_padding", type=optional_float, default=1, help="The padding (in seconds) to add to each voice segment")
|
33 |
+
parser.add_argument("--vad_prompt_window", type=optional_float, default=3, help="The window size of the prompt to pass to Whisper")
|
34 |
|
35 |
parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
|
36 |
parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")
|
src/vad.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
from abc import ABC, abstractmethod
|
2 |
from collections import Counter, deque
|
3 |
-
|
|
|
4 |
|
5 |
from pprint import pprint
|
6 |
|
@@ -19,6 +20,20 @@ import numpy as np
|
|
19 |
from src.utils import format_timestamp
|
20 |
from enum import Enum
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
# Defaults for Silero
|
23 |
SPEECH_TRESHOLD = 0.3
|
24 |
MAX_SILENT_PERIOD = 10 # seconds
|
@@ -28,9 +43,6 @@ MAX_MERGE_SIZE = 150 # Do not create segments larger than 2.5 minutes
|
|
28 |
SEGMENT_PADDING_LEFT = 1 # Start detected text segment early
|
29 |
SEGMENT_PADDING_RIGHT = 1 # End detected segments late
|
30 |
|
31 |
-
# Whether to attempt to transcribe non-speech
|
32 |
-
TRANSCRIBE_NON_SPEECH = False
|
33 |
-
|
34 |
# Minimum size of segments to process
|
35 |
MIN_SEGMENT_DURATION = 1
|
36 |
|
@@ -46,13 +58,13 @@ VAD_MAX_PROCESSING_CHUNK = 60 * 60 # 60 minutes of audio
|
|
46 |
|
47 |
class AbstractTranscription(ABC):
|
48 |
def __init__(self, segment_padding_left: float = None, segment_padding_right = None, max_silent_period: float = None,
|
49 |
-
max_merge_size: float = None,
|
50 |
self.sampling_rate = 16000
|
51 |
self.segment_padding_left = segment_padding_left
|
52 |
self.segment_padding_right = segment_padding_right
|
53 |
self.max_silent_period = max_silent_period
|
54 |
self.max_merge_size = max_merge_size
|
55 |
-
self.
|
56 |
self.max_prompt_window = max_prompt_window
|
57 |
|
58 |
self.min_force_merge_gap = MIN_FORCE_MERGE_GAP
|
@@ -107,16 +119,18 @@ class AbstractTranscription(ABC):
|
|
107 |
print("Timestamps:")
|
108 |
pprint(merged)
|
109 |
|
110 |
-
if self.
|
111 |
max_audio_duration = get_audio_duration(audio)
|
112 |
|
113 |
# Expand segments to include the gaps between them
|
114 |
-
if (self.
|
115 |
# When we have a prompt window, we create speech segments between each segment if we exceed the merge size
|
116 |
merged = self.fill_gaps(merged, total_duration=max_audio_duration, max_expand_size=self.max_merge_size)
|
117 |
-
|
118 |
-
# With no prompt window, it is better to expand the segments
|
119 |
merged = self.expand_gaps(merged, total_duration=max_audio_duration)
|
|
|
|
|
120 |
|
121 |
print("Transcribing non-speech:")
|
122 |
pprint(merged)
|
@@ -150,6 +164,17 @@ class AbstractTranscription(ABC):
|
|
150 |
|
151 |
adjusted_segments = self.adjust_timestamp(segment_result["segments"], adjust_seconds=segment_start, max_source_time=segment_duration)
|
152 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
153 |
# Append to output
|
154 |
result['text'] += segment_result['text']
|
155 |
result['segments'].extend(adjusted_segments)
|
@@ -158,20 +183,30 @@ class AbstractTranscription(ABC):
|
|
158 |
languageCounter[segment_result['language']] += 1
|
159 |
|
160 |
# Update prompt window
|
161 |
-
|
162 |
-
|
163 |
-
for segment in adjusted_segments:
|
164 |
-
if segment.get('no_speech_prob', 0) <= PROMPT_NO_SPEECH_PROB:
|
165 |
-
prompt_window.append(segment)
|
166 |
-
|
167 |
-
while (len(prompt_window) > 0 and prompt_window[0]['end'] < segment_end - self.max_prompt_window):
|
168 |
-
prompt_window.popleft()
|
169 |
-
|
170 |
if len(languageCounter) > 0:
|
171 |
result['language'] = languageCounter.most_common(1)[0][0]
|
172 |
|
173 |
return result
|
174 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
175 |
def include_gaps(self, segments: Iterator[dict], min_gap_length: float, total_duration: float):
|
176 |
result = []
|
177 |
last_end_time = 0
|
@@ -360,7 +395,8 @@ class AbstractTranscription(ABC):
|
|
360 |
if distance <= max_merge_gap and (max_merge_size is None or current_entry_size <= max_merge_size):
|
361 |
# Regular merge
|
362 |
current_entry['end'] = entry['end']
|
363 |
-
elif min_force_merge_gap is not None and distance <= min_force_merge_gap and
|
|
|
364 |
# Force merge if the distance is small (up to a certain maximum size)
|
365 |
current_entry['end'] = entry['end']
|
366 |
else:
|
@@ -389,10 +425,10 @@ class AbstractTranscription(ABC):
|
|
389 |
|
390 |
class VadSileroTranscription(AbstractTranscription):
|
391 |
def __init__(self, segment_padding_left=SEGMENT_PADDING_LEFT, segment_padding_right=SEGMENT_PADDING_RIGHT,
|
392 |
-
max_silent_period=MAX_SILENT_PERIOD, max_merge_size=MAX_MERGE_SIZE,
|
393 |
max_prompt_window=MAX_PROMPT_WINDOW, copy = None):
|
394 |
super().__init__(segment_padding_left=segment_padding_left, segment_padding_right=segment_padding_right,
|
395 |
-
max_silent_period=max_silent_period, max_merge_size=max_merge_size,
|
396 |
|
397 |
if copy:
|
398 |
self.model = copy.model
|
|
|
1 |
from abc import ABC, abstractmethod
|
2 |
from collections import Counter, deque
|
3 |
+
|
4 |
+
from typing import Any, Deque, Iterator, List, Dict
|
5 |
|
6 |
from pprint import pprint
|
7 |
|
|
|
20 |
from src.utils import format_timestamp
|
21 |
from enum import Enum
|
22 |
|
23 |
+
class NonSpeechStrategy(Enum):
|
24 |
+
"""
|
25 |
+
Ignore non-speech segments.
|
26 |
+
"""
|
27 |
+
SKIP = 1
|
28 |
+
"""
|
29 |
+
Just treat non-speech segments as speech.
|
30 |
+
"""
|
31 |
+
CREATE_SEGMENT = 2
|
32 |
+
"""
|
33 |
+
Expand speech segments into subsequent non-speech segments.
|
34 |
+
"""
|
35 |
+
EXPAND_SEGMENT = 3
|
36 |
+
|
37 |
# Defaults for Silero
|
38 |
SPEECH_TRESHOLD = 0.3
|
39 |
MAX_SILENT_PERIOD = 10 # seconds
|
|
|
43 |
SEGMENT_PADDING_LEFT = 1 # Start detected text segment early
|
44 |
SEGMENT_PADDING_RIGHT = 1 # End detected segments late
|
45 |
|
|
|
|
|
|
|
46 |
# Minimum size of segments to process
|
47 |
MIN_SEGMENT_DURATION = 1
|
48 |
|
|
|
58 |
|
59 |
class AbstractTranscription(ABC):
|
60 |
def __init__(self, segment_padding_left: float = None, segment_padding_right = None, max_silent_period: float = None,
|
61 |
+
max_merge_size: float = None, non_speech_strategy: NonSpeechStrategy = NonSpeechStrategy.SKIP, max_prompt_window: float = None):
|
62 |
self.sampling_rate = 16000
|
63 |
self.segment_padding_left = segment_padding_left
|
64 |
self.segment_padding_right = segment_padding_right
|
65 |
self.max_silent_period = max_silent_period
|
66 |
self.max_merge_size = max_merge_size
|
67 |
+
self.non_speech_strategy = non_speech_strategy
|
68 |
self.max_prompt_window = max_prompt_window
|
69 |
|
70 |
self.min_force_merge_gap = MIN_FORCE_MERGE_GAP
|
|
|
119 |
print("Timestamps:")
|
120 |
pprint(merged)
|
121 |
|
122 |
+
if self.non_speech_strategy != NonSpeechStrategy.SKIP:
|
123 |
max_audio_duration = get_audio_duration(audio)
|
124 |
|
125 |
# Expand segments to include the gaps between them
|
126 |
+
if (self.non_speech_strategy == NonSpeechStrategy.CREATE_SEGMENT):
|
127 |
# When we have a prompt window, we create speech segments between each segment if we exceed the merge size
|
128 |
merged = self.fill_gaps(merged, total_duration=max_audio_duration, max_expand_size=self.max_merge_size)
|
129 |
+
elif self.non_speech_strategy == NonSpeechStrategy.EXPAND_SEGMENT:
|
130 |
+
# With no prompt window, it is better to just expand the segments (this effectively passes the prompt to the next segment)
|
131 |
merged = self.expand_gaps(merged, total_duration=max_audio_duration)
|
132 |
+
else:
|
133 |
+
raise Exception("Unknown non-speech strategy: " + str(self.non_speech_strategy))
|
134 |
|
135 |
print("Transcribing non-speech:")
|
136 |
pprint(merged)
|
|
|
164 |
|
165 |
adjusted_segments = self.adjust_timestamp(segment_result["segments"], adjust_seconds=segment_start, max_source_time=segment_duration)
|
166 |
|
167 |
+
# Propagate expand amount to the segments
|
168 |
+
if (segment_expand_amount > 0):
|
169 |
+
segment_without_expansion = segment_duration - segment_expand_amount
|
170 |
+
|
171 |
+
for adjusted_segment in adjusted_segments:
|
172 |
+
adjusted_segment_end = adjusted_segment['end']
|
173 |
+
|
174 |
+
# Add expand amount if the segment got expanded
|
175 |
+
if (adjusted_segment_end > segment_without_expansion):
|
176 |
+
adjusted_segment["expand_amount"] = adjusted_segment_end - segment_without_expansion
|
177 |
+
|
178 |
# Append to output
|
179 |
result['text'] += segment_result['text']
|
180 |
result['segments'].extend(adjusted_segments)
|
|
|
183 |
languageCounter[segment_result['language']] += 1
|
184 |
|
185 |
# Update prompt window
|
186 |
+
self.__update_prompt_window(prompt_window, adjusted_segments, segment_end)
|
187 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
188 |
if len(languageCounter) > 0:
|
189 |
result['language'] = languageCounter.most_common(1)[0][0]
|
190 |
|
191 |
return result
|
192 |
|
193 |
+
def __update_prompt_window(self, prompt_window: Deque, adjusted_segments: List, segment_end: float):
|
194 |
+
if (self.max_prompt_window is not None and self.max_prompt_window > 0):
|
195 |
+
# Add segments to the current prompt window
|
196 |
+
for segment in adjusted_segments:
|
197 |
+
if segment.get('no_speech_prob', 0) <= PROMPT_NO_SPEECH_PROB:
|
198 |
+
prompt_window.append(segment)
|
199 |
+
|
200 |
+
while (len(prompt_window) > 0):
|
201 |
+
first_end_time = prompt_window[0].get('end', 0)
|
202 |
+
# Time expanded in the segments should be discounted from the prompt window
|
203 |
+
first_expand_time = prompt_window[0].get('expand_amount', 0)
|
204 |
+
|
205 |
+
if (first_end_time - first_expand_time < segment_end - self.max_prompt_window):
|
206 |
+
prompt_window.popleft()
|
207 |
+
else:
|
208 |
+
break
|
209 |
+
|
210 |
def include_gaps(self, segments: Iterator[dict], min_gap_length: float, total_duration: float):
|
211 |
result = []
|
212 |
last_end_time = 0
|
|
|
395 |
if distance <= max_merge_gap and (max_merge_size is None or current_entry_size <= max_merge_size):
|
396 |
# Regular merge
|
397 |
current_entry['end'] = entry['end']
|
398 |
+
elif min_force_merge_gap is not None and distance <= min_force_merge_gap and \
|
399 |
+
(max_force_merge_size is None or current_entry_size <= max_force_merge_size):
|
400 |
# Force merge if the distance is small (up to a certain maximum size)
|
401 |
current_entry['end'] = entry['end']
|
402 |
else:
|
|
|
425 |
|
426 |
class VadSileroTranscription(AbstractTranscription):
|
427 |
def __init__(self, segment_padding_left=SEGMENT_PADDING_LEFT, segment_padding_right=SEGMENT_PADDING_RIGHT,
|
428 |
+
max_silent_period=MAX_SILENT_PERIOD, max_merge_size=MAX_MERGE_SIZE, non_speech_strategy: NonSpeechStrategy = NonSpeechStrategy.SKIP,
|
429 |
max_prompt_window=MAX_PROMPT_WINDOW, copy = None):
|
430 |
super().__init__(segment_padding_left=segment_padding_left, segment_padding_right=segment_padding_right,
|
431 |
+
max_silent_period=max_silent_period, max_merge_size=max_merge_size, non_speech_strategy=non_speech_strategy, max_prompt_window=max_prompt_window)
|
432 |
|
433 |
if copy:
|
434 |
self.model = copy.model
|