from dataclasses import dataclass, field | |
@dataclass
class VADHandlerArguments:
    """Configuration values for the voice activity detection (VAD) handler.

    Every attribute is declared with ``dataclasses.field`` and carries a
    ``"help"`` entry in its metadata describing the option. All attributes
    have defaults, so the class can be instantiated without arguments.

    NOTE(review): the ``help`` metadata is presumably read by a command-line
    argument parser elsewhere in the project -- confirm against the caller.
    """

    # Detection confidence threshold.
    thresh: float = field(
        default=0.3,
        metadata={
            "help": "The threshold value for voice activity detection (VAD). Values typically range from 0 to 1, with higher values requiring higher confidence in speech detection."
        },
    )
    # Audio sample rate, in Hertz.
    sample_rate: int = field(
        default=16000,
        metadata={
            "help": "The sample rate of the audio in Hertz. Default is 16000 Hz, which is a common setting for voice audio."
        },
    )
    # Minimum silence length used to segment speech, in milliseconds.
    min_silence_ms: int = field(
        default=250,
        metadata={
            "help": "Minimum length of silence intervals to be used for segmenting speech. Measured in milliseconds. Default is 250 ms."
        },
    )
    # Minimum speech segment length considered valid, in milliseconds.
    min_speech_ms: int = field(
        default=500,
        metadata={
            "help": "Minimum length of speech segments to be considered valid speech. Measured in milliseconds. Default is 500 ms."
        },
    )
    # Annotated as float (not int) because the default is infinity.
    max_speech_ms: float = field(
        default=float("inf"),
        metadata={
            "help": "Maximum length of continuous speech before forcing a split. Default is infinite, allowing for uninterrupted speech segments."
        },
    )
    # Padding added around detected speech, in milliseconds. The help text
    # previously advertised 250 ms although the default is 500.
    speech_pad_ms: int = field(
        default=500,
        metadata={
            "help": "Amount of padding added to the beginning and end of detected speech segments. Measured in milliseconds. Default is 500 ms."
        },
    )
    # Toggle for audio enhancement; off by default.
    audio_enhancement: bool = field(
        default=False,
        metadata={
            "help": "improves sound quality by applying techniques like noise reduction, equalization, and echo cancellation. Default is False."
        },
    )