|
import torch |
|
import torchaudio |
|
import numpy as np |
|
from scipy.io.wavfile import read |
|
|
|
|
|
def load_wav_to_torch(full_path):
    """Read a WAV file and return its samples as a float32 tensor.

    The file is decoded with ``scipy.io.wavfile.read``. Integer PCM data is
    divided by the magnitude of its most negative value (``2 ** 31`` for
    int32, ``2 ** 15`` for int16); float16/float32 data is returned unscaled.

    Args:
        full_path: Path (or file-like object) accepted by
            ``scipy.io.wavfile.read``.

    Returns:
        A ``(audio, sampling_rate)`` tuple where ``audio`` is a
        ``torch.FloatTensor`` with the same shape as the decoded array and
        ``sampling_rate`` is the rate reported by the file.

    Raises:
        NotImplementedError: If the decoded array has a dtype other than
            int32, int16, float16 or float32.
    """
    sampling_rate, data = read(full_path)

    if data.dtype == np.int32:
        norm_fix = 2 ** 31
    elif data.dtype == np.int16:
        norm_fix = 2 ** 15
    elif data.dtype == np.float16 or data.dtype == np.float32:
        # Float data is passed through without rescaling.
        norm_fix = 1.
    else:
        # NotImplementedError is the exception class; the NotImplemented
        # constant is not callable and would raise a confusing TypeError.
        raise NotImplementedError(f"Provided data dtype not supported: {data.dtype}")

    return (torch.FloatTensor(data.astype(np.float32)) / norm_fix, sampling_rate)
|
|
|
|
|
def load_audio(audiopath, sampling_rate):
    """Load a ``.wav`` or ``.mp3`` file as a single-channel, clamped tensor.

    The file is decoded, reduced to one channel, resampled to
    ``sampling_rate`` when the file's own rate differs, clamped to
    ``[-1, 1]`` and returned with a leading dimension of size 1.

    Args:
        audiopath: String path to the audio file. The last four characters
            select the decoder and are matched case-insensitively.
        sampling_rate: Target sampling rate of the returned audio.

    Returns:
        The result of ``audio.unsqueeze(0)`` on the processed tensor.

    Raises:
        ValueError: If the extension is neither ``.wav`` nor ``.mp3``, or if
            multi-dimensional audio has no axis shorter than 5.
    """
    extension = audiopath[-4:].lower()
    if extension == '.wav':
        audio, lsr = load_wav_to_torch(audiopath)
    elif extension == '.mp3':
        # Imported lazily so the decoder is only required for mp3 input.
        from pyfastmp3decoder.mp3decoder import load_mp3
        audio, lsr = load_mp3(audiopath, sampling_rate)
        audio = torch.FloatTensor(audio)
    else:
        # Without this branch `audio`/`lsr` would be unbound below and the
        # caller would see an UnboundLocalError instead of a clear message.
        raise ValueError(f"Unsupported audio format: {audiopath}")

    # Keep only the first channel. The axis shorter than 5 is presumably the
    # channel axis -- NOTE(review): confirm against the decoders' layouts.
    if len(audio.shape) > 1:
        if audio.shape[0] < 5:
            audio = audio[0]
        elif audio.shape[1] < 5:
            audio = audio[:, 0]
        else:
            # Explicit raise rather than `assert`, which is stripped under -O.
            raise ValueError(
                f"Cannot determine channel axis for {audiopath}: "
                f"shape={tuple(audio.shape)}"
            )

    if lsr != sampling_rate:
        audio = torchaudio.functional.resample(audio, lsr, sampling_rate)

    # Sanity check: report data that exceeds 2 or contains no negative
    # samples; this only prints, it does not abort.
    if torch.any(audio > 2) or not torch.any(audio < 0):
        print(f"Error with {audiopath}. Max={audio.max()} min={audio.min()}")
    audio.clip_(-1, 1)

    return audio.unsqueeze(0)