Spaces:
Build error
Build error
import librosa
import numpy as np
import pyloudnorm as pyln

from text_to_speech.utils.audio.vad import trim_long_silences
def librosa_pad_lr(x, fsize, fshift, pad_sides=1):
    """Compute the padding that extends ``x`` to the next multiple of ``fshift``.

    The padding is either placed entirely on the right (final frame) or
    split across both sides (first and final frames).

    Args:
        x: Object exposing ``shape``; only ``x.shape[0]`` (its length) is read.
        fsize: Frame size. Unused by the computation; kept so the signature
            stays compatible with existing callers.
        fshift: Frame shift (hop size) in samples.
        pad_sides: ``1`` for right-only padding, ``2`` for both sides.

    Returns:
        A ``(left_pad, right_pad)`` tuple of sample counts.

    Raises:
        AssertionError: If ``pad_sides`` is neither 1 nor 2.
    """
    assert pad_sides in (1, 2), "pad_sides must be 1 or 2"
    length = x.shape[0]
    # Round up to the next multiple of ``fshift``. When ``length`` is already
    # a multiple, this yields a full extra ``fshift`` rather than zero.
    pad = (length // fshift + 1) * fshift - length
    if pad_sides == 1:
        return 0, pad
    # Split across both sides; the odd leftover sample goes to the right.
    return pad // 2, pad // 2 + pad % 2
def amp_to_db(x):
    """Convert linear amplitude to decibels (``20 * log10``).

    Values below ``1e-5`` are clamped to ``1e-5`` before taking the
    logarithm, so the result never drops below roughly -100 dB.

    Args:
        x: Scalar or array of linear amplitudes.

    Returns:
        The amplitude(s) in dB, as produced by NumPy for the input.
    """
    return 20 * np.log10(np.maximum(1e-5, x))
def db_to_amp(x):
    """Convert decibels to linear amplitude (``10 ** (x / 20)``).

    Args:
        x: Scalar or array of values in dB.

    Returns:
        The corresponding linear amplitude(s).
    """
    # Multiplying by 0.05 is the same scaling as dividing by 20.
    return 10.0 ** (x * 0.05)
def normalize(S, min_level_db):
    """Linearly rescale ``S`` so ``min_level_db`` maps to 0 and 0 maps to 1.

    Args:
        S: Scalar or array of levels to rescale.
        min_level_db: Level that maps to 0. It also sets the scale through
            division by ``-min_level_db``, so it must be non-zero.

    Returns:
        ``(S - min_level_db) / -min_level_db``.
    """
    return (S - min_level_db) / -min_level_db
def denormalize(D, min_level_db):
    """Undo the linear rescaling ``(S - min_level_db) / -min_level_db``.

    Maps 0 back to ``min_level_db`` and 1 back to 0.

    Args:
        D: Scalar or array of rescaled values.
        min_level_db: The same reference level used when rescaling.

    Returns:
        ``D * -min_level_db + min_level_db``.
    """
    return (D * -min_level_db) + min_level_db
def librosa_wav2spec(wav_path,
                     fft_size=1024,
                     hop_size=256,
                     win_length=1024,
                     window="hann",
                     num_mels=80,
                     fmin=80,
                     fmax=-1,
                     eps=1e-6,
                     sample_rate=22050,
                     loud_norm=False,
                     trim_long_sil=False):
    """Compute log-mel and log-linear spectrograms for a waveform.

    Args:
        wav_path: Either a path string to an audio file, or an already
            loaded waveform (anything that is not a ``str`` is used as-is).
        fft_size: FFT size passed to the STFT and the mel filterbank.
        hop_size: Hop length in samples.
        win_length: Analysis window length in samples.
        window: Window specification forwarded to ``librosa.stft``.
        num_mels: Number of mel bands.
        fmin: Lowest mel frequency; ``-1`` means 0.
        fmax: Highest mel frequency; ``-1`` means ``sample_rate / 2``.
        eps: Floor applied before ``log10`` to avoid ``log(0)``.
        sample_rate: Sample rate used for loading, the loudness meter and
            the mel filterbank.
        loud_norm: If true, loudness-normalize the waveform to -22.0 with
            ``pyloudnorm`` before analysis.
        trim_long_sil: If true and ``wav_path`` is a string, load the audio
            through ``trim_long_silences`` instead of ``librosa.load``.

    Returns:
        A dict with keys:
            ``'wav'``: waveform, right-padded with zeros and then truncated
                to ``n_frames * hop_size`` samples;
            ``'mel'``: log10 mel spectrogram, transposed to ``(T, n_mel_bins)``;
            ``'linear'``: log10 amplitude spectrogram, transposed to
                ``(T, n_bins)``;
            ``'mel_basis'``: the mel filterbank matrix.
    """
    if isinstance(wav_path, str):
        if trim_long_sil:
            # Only the first returned value (the waveform) is used here.
            wav, _, _ = trim_long_silences(wav_path, sample_rate)
        else:
            wav, _ = librosa.load(wav_path, sr=sample_rate)
    else:
        wav = wav_path

    if loud_norm:
        meter = pyln.Meter(sample_rate)  # create BS.1770 meter
        loudness = meter.integrated_loudness(wav)
        wav = pyln.normalize.loudness(wav, loudness, -22.0)
        # NOTE(review): this peak rescale is assumed to belong to the
        # loud_norm branch; the flattened source did not show its nesting.
        peak = np.abs(wav).max()
        if peak > 1:
            wav = wav / peak

    # get amplitude spectrogram
    x_stft = librosa.stft(wav, n_fft=fft_size, hop_length=hop_size,
                          win_length=win_length, window=window,
                          pad_mode="constant")
    linear_spc = np.abs(x_stft)  # (n_bins, T)

    # get mel basis
    fmin = 0 if fmin == -1 else fmin
    fmax = sample_rate / 2 if fmax == -1 else fmax
    # Keyword arguments are required: recent librosa releases reject
    # positional arguments here, while older releases accept keywords too.
    mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=fft_size,
                                    n_mels=num_mels, fmin=fmin, fmax=fmax)

    # calculate mel spec
    mel = mel_basis @ linear_spc
    mel = np.log10(np.maximum(eps, mel))  # (n_mel_bins, T)

    # Pad on the right, then trim so the waveform length equals
    # n_frames * hop_size.
    l_pad, r_pad = librosa_pad_lr(wav, fft_size, hop_size, 1)
    wav = np.pad(wav, (l_pad, r_pad), mode='constant', constant_values=0.0)
    wav = wav[:mel.shape[1] * hop_size]

    # log linear spec
    linear_spc = np.log10(np.maximum(eps, linear_spc))
    return {'wav': wav, 'mel': mel.T, 'linear': linear_spc.T, 'mel_basis': mel_basis}