|
import importlib |
|
|
|
from inspect import isfunction |
|
|
|
import os |
|
import soundfile as sf |
|
|
|
def seed_everything(seed): |
|
import random, os |
|
import numpy as np |
|
import torch |
|
|
|
random.seed(seed) |
|
os.environ['PYTHONHASHSEED'] = str(seed) |
|
np.random.seed(seed) |
|
torch.manual_seed(seed) |
|
torch.cuda.manual_seed(seed) |
|
torch.backends.cudnn.deterministic = True |
|
torch.backends.cudnn.benchmark = True |
|
|
|
def save_wave(waveform, savepath, name="outwav"): |
|
if type(name) is not list: |
|
name = [name] * waveform.shape[0] |
|
|
|
for i in range(waveform.shape[0]): |
|
path = os.path.join( |
|
savepath, |
|
"%s_%s.wav" |
|
% ( |
|
os.path.basename(name[i]) |
|
if (not ".wav" in name[i]) |
|
else os.path.basename(name[i]).split(".")[0], |
|
i, |
|
), |
|
) |
|
sf.write(path, waveform[i, 0], samplerate=16000) |
|
|
|
def exists(x): |
|
return x is not None |
|
|
|
|
|
def default(val, d): |
|
if exists(val): |
|
return val |
|
return d() if isfunction(d) else d |
|
|
|
|
|
def count_params(model, verbose=False): |
|
total_params = sum(p.numel() for p in model.parameters()) |
|
if verbose: |
|
print(f"{model.__class__.__name__} has {total_params * 1.e-6:.2f} M params.") |
|
return total_params |
|
|
|
|
|
def get_obj_from_str(string, reload=False): |
|
module, cls = string.rsplit(".", 1) |
|
if reload: |
|
module_imp = importlib.import_module(module) |
|
importlib.reload(module_imp) |
|
return getattr(importlib.import_module(module, package=None), cls) |
|
|
|
|
|
def instantiate_from_config(config): |
|
if not "target" in config: |
|
if config == "__is_first_stage__": |
|
return None |
|
elif config == "__is_unconditional__": |
|
return None |
|
raise KeyError("Expected key `target` to instantiate.") |
|
return get_obj_from_str(config["target"])(**config.get("params", dict())) |
|
|
|
def default_audioldm_config(model_name="audioldm-s-full"): |
|
basic_config = { |
|
"wave_file_save_path": "./output", |
|
"id": { |
|
"version": "v1", |
|
"name": "default", |
|
"root": "/mnt/fast/nobackup/users/hl01486/projects/general_audio_generation/AudioLDM-python/config/default/latent_diffusion.yaml", |
|
}, |
|
"preprocessing": { |
|
"audio": {"sampling_rate": 16000, "max_wav_value": 32768}, |
|
"stft": {"filter_length": 1024, "hop_length": 160, "win_length": 1024}, |
|
"mel": { |
|
"n_mel_channels": 64, |
|
"mel_fmin": 0, |
|
"mel_fmax": 8000, |
|
"freqm": 0, |
|
"timem": 0, |
|
"blur": False, |
|
"mean": -4.63, |
|
"std": 2.74, |
|
"target_length": 1024, |
|
}, |
|
}, |
|
"model": { |
|
"device": "cuda", |
|
"target": "audioldm.pipline.LatentDiffusion", |
|
"params": { |
|
"base_learning_rate": 5e-06, |
|
"linear_start": 0.0015, |
|
"linear_end": 0.0195, |
|
"num_timesteps_cond": 1, |
|
"log_every_t": 200, |
|
"timesteps": 1000, |
|
"first_stage_key": "fbank", |
|
"cond_stage_key": "waveform", |
|
"latent_t_size": 256, |
|
"latent_f_size": 16, |
|
"channels": 8, |
|
"cond_stage_trainable": True, |
|
"conditioning_key": "film", |
|
"monitor": "val/loss_simple_ema", |
|
"scale_by_std": True, |
|
"unet_config": { |
|
"target": "audioldm.latent_diffusion.openaimodel.UNetModel", |
|
"params": { |
|
"image_size": 64, |
|
"extra_film_condition_dim": 512, |
|
"extra_film_use_concat": True, |
|
"in_channels": 8, |
|
"out_channels": 8, |
|
"model_channels": 128, |
|
"attention_resolutions": [8, 4, 2], |
|
"num_res_blocks": 2, |
|
"channel_mult": [1, 2, 3, 5], |
|
"num_head_channels": 32, |
|
"use_spatial_transformer": True, |
|
}, |
|
}, |
|
"first_stage_config": { |
|
"base_learning_rate": 4.5e-05, |
|
"target": "audioldm.variational_autoencoder.autoencoder.AutoencoderKL", |
|
"params": { |
|
"monitor": "val/rec_loss", |
|
"image_key": "fbank", |
|
"subband": 1, |
|
"embed_dim": 8, |
|
"time_shuffle": 1, |
|
"ddconfig": { |
|
"double_z": True, |
|
"z_channels": 8, |
|
"resolution": 256, |
|
"downsample_time": False, |
|
"in_channels": 1, |
|
"out_ch": 1, |
|
"ch": 128, |
|
"ch_mult": [1, 2, 4], |
|
"num_res_blocks": 2, |
|
"attn_resolutions": [], |
|
"dropout": 0.0, |
|
}, |
|
}, |
|
}, |
|
"cond_stage_config": { |
|
"target": "audioldm.clap.encoders.CLAPAudioEmbeddingClassifierFreev2", |
|
"params": { |
|
"key": "waveform", |
|
"sampling_rate": 16000, |
|
"embed_mode": "audio", |
|
"unconditional_prob": 0.1, |
|
}, |
|
}, |
|
}, |
|
}, |
|
} |
|
|
|
if("-l-" in model_name): |
|
basic_config["model"]["params"]["unet_config"]["params"]["model_channels"] = 256 |
|
basic_config["model"]["params"]["unet_config"]["params"]["num_head_channels"] = 64 |
|
elif("-m-" in model_name): |
|
basic_config["model"]["params"]["unet_config"]["params"]["model_channels"] = 192 |
|
basic_config["model"]["params"]["cond_stage_config"]["params"]["amodel"] = "HTSAT-base" |
|
|
|
return basic_config |