# xcodec / config_wavlm.yaml
# Uploaded by ZhenYe234 ("Upload config_wavlm.yaml", commit fee43c7, verified).
---
# Generator section: a `name` plus a `config` mapping of hyper-parameters.
# NOTE(review): nesting was reconstructed from a copy whose indentation had
# been stripped; confirm against the consumer.
generator:
  name: SoundStream
  config:
    n_filters: 32
    D: 256
    target_bandwidths:
      - 0.5
      - 1
      - 1.5
      - 2
      - 4
    # The product of the ratios is 8 * 5 * 4 * 2 = 320; presumably the total
    # downsampling factor of the encoder - TODO confirm.
    ratios:
      - 8
      - 5
      - 4
      - 2
    sample_rate: 16000
    bins: 1024
    # NOTE(review): the key is spelled "semantic_techer" (sic). It is left
    # unchanged because the consumer presumably looks it up by this exact name.
    semantic_techer: wavlm_base_plus
# List of discriminator section names; "mfd" matches the `mfd` key in this
# file. NOTE(review): `mpd` and `msd` are also defined in this file but are not
# listed here - presumably they are not instantiated; confirm.
d_list:
  - mfd
# Multi-frequency discriminator section (`name` plus `config`).
# `hop_lengths` and `hidden_channels` both have six entries; presumably one
# pair per sub-discriminator - TODO confirm.
mfd:
  name: MultiFrequencyDiscriminator
  config:
    hop_lengths:
      - 32
      - 64
      - 128
      - 256
      - 512
      - 1024
    hidden_channels:
      - 64
      - 128
      - 256
      - 512
      - 512
      - 512
    domain: double
    mel_scale: true
    sample_rate: 16000
# Multi-period discriminator section (`name` plus `config`).
mpd:
  name: MultiPeriodDiscriminator
  config:
    period_sizes:
      - 2
      - 3
      - 5
      - 7
      - 11
    period_kernel_size: 5
# Multi-scale discriminator section (`name` plus `config`).
msd:
  name: MultiScaleDiscriminator
  config:
    num_scales: 3
    pool_kernel_size: 4
    pool_stride: 2
# Optimizer settings, one entry per sub-key: `g` and `d` (presumably generator
# and discriminator - confirm). Both entries currently carry identical values.
optimizer:
  g:
    name: AdamW
    config:
      lr: 0.0002
      betas:
        - 0.8
        - 0.99
      eps: 1.0e-06
  d:
    name: AdamW
    config:
      lr: 0.0002
      betas:
        - 0.8
        - 0.99
      eps: 1.0e-06
# Learning-rate scheduler settings, keyed `g` and `d` like the `optimizer`
# section. Both entries use the same `gamma`.
lr_scheduler:
  g:
    name: ExponentialLR
    config:
      gamma: 0.999
  d:
    name: ExponentialLR
    config:
      gamma: 0.999
# Loss configuration: `g_criterion` and `d_criterion`, each a `name` plus a
# `config`. NOTE(review): nesting was reconstructed from a copy whose
# indentation had been stripped; in particular, the three `*_loss` mappings are
# assumed to live inside `g_criterion.config` - confirm against the consumer.
criterion:
  g_criterion:
    name: losses.generator_loss.GeneratorSTFTLoss
    config:
      # `use_mel_loss` is false while `mel_loss_weight` and `mel_scale_loss`
      # are still set; presumably those are ignored in that case - confirm.
      use_mel_loss: false
      adv_criterion: MSEGLoss
      mel_loss_weight: 45
      use_feature_match: true
      feat_match_loss_weight: 20
      use_full_stft_loss: true
      use_sub_stft_loss: true
      full_stft_loss_weight: 1
      sub_stft_loss_weight: 1
      mel_scale_loss:
        sampling_rate: 16000
        n_fft: 1024
        num_mels: 80
        hop_size: 160
        win_size: 800
        fmin: 0
      # The three lists below have three entries each; presumably one
      # (fft, win, hop) triple per STFT resolution - TODO confirm.
      full_multi_scale_stft_loss:
        fft_sizes:
          - 512
          - 1024
          - 2048
        win_sizes:
          - 480
          - 960
          - 1200
        hop_sizes:
          - 120
          - 240
          - 300
      sub_multi_scale_stft_loss:
        num_bands: 6
        fft_sizes:
          - 128
          - 256
          - 256
        win_sizes:
          - 80
          - 120
          - 200
        hop_sizes:
          - 20
          - 40
          - 50
  d_criterion:
    name: losses.discriminator_loss.MSEDiscriminatorLoss
    config: null
# NOTE(review): the nesting level of these three keys cannot be determined from
# this copy of the file. They follow an explicit `config: null`, so they are
# not children of that key. They are kept at the top level here; confirm
# whether the consumer expects them under `criterion` instead.
commit_loss_weight: 1.0
codebook_loss_weight: 100
audio_norm_scale: 0.95