|
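############################################################################
# Inference hyperparameters for a zero-shot multi-speaker Tacotron2 TTS
# model (SpeechBrain), with ECAPA speaker embeddings for voice cloning.
############################################################################

################################
# Audio Parameters
################################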
sample_rate: 22050
hop_length: 256
win_length: 1024
n_mel_channels: 80
n_fft: 1024
mel_fmin: 0.0
mel_fmax: 8000.0
mel_normalized: False
power: 1
norm: "slaney"
mel_scale: "slaney"
dynamic_range_compression: True |
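
################################
# Speaker Embedding Parameters
################################
# The speaker encoder (an ECAPA-TDNN model operating on mel spectrograms)
# runs at 16 kHz, independently of the 22.05 kHz synthesis sample rate.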
spk_emb_size: 192
spk_emb_sample_rate: 16000
custom_mel_spec_encoder: True
spk_emb_encoder: speechbrain/spkrec-ecapa-voxceleb-mel-spec
random_speaker_sampler: random_speaker_sampler.ckpt
random_speaker_sampler_source: speechbrain/tts-mstacotron2-libritts |
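
################################
# Model Parameters
################################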
mask_padding: True |
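
# Symbol (text input) embedding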
n_symbols: 148
symbols_embedding_dim: 1024 |
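
# Encoder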
encoder_kernel_size: 5
encoder_n_convolutions: 6
encoder_embedding_dim: 1024 |
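
# Decoder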
n_frames_per_step: 1
decoder_rnn_dim: 2048
prenet_dim: 512
max_decoder_steps: 1500
gate_threshold: 0.5
p_attention_dropout: 0.1
p_decoder_dropout: 0.1
decoder_no_early_stopping: False |
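
# Attention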
attention_rnn_dim: 2048
attention_dim: 256 |
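
# Location-sensitive attention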
attention_location_n_filters: 32
attention_location_kernel_size: 31 |
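
# Mel-spectrogram postnet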
postnet_embedding_dim: 1024
postnet_kernel_size: 5
postnet_n_convolutions: 10 |
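
# Mel-spectrogram extraction function.
# Note: "mel_spectogram" matches the spelling used in
# speechbrain.lobes.models.Tacotron2, so it is kept as-is here.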
mel_spectogram: !name:speechbrain.lobes.models.Tacotron2.mel_spectogram
    sample_rate: !ref <sample_rate>
    hop_length: !ref <hop_length>
    win_length: !ref <win_length>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mel_channels>
    f_min: !ref <mel_fmin>
    f_max: !ref <mel_fmax>
    power: !ref <power>
    normalized: !ref <mel_normalized>
    norm: !ref <norm>
    mel_scale: !ref <mel_scale>
    compression: !ref <dynamic_range_compression>
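
# Multi-speaker Tacotron2 acoustic model, conditioned on speaker
# embeddings of size <spk_emb_size>.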
model: !new:speechbrain.lobes.models.MSTacotron2.Tacotron2
    mask_padding: !ref <mask_padding>
    n_mel_channels: !ref <n_mel_channels>
    n_symbols: !ref <n_symbols>
    symbols_embedding_dim: !ref <symbols_embedding_dim>
    encoder_kernel_size: !ref <encoder_kernel_size>
    encoder_n_convolutions: !ref <encoder_n_convolutions>
    encoder_embedding_dim: !ref <encoder_embedding_dim>
    attention_rnn_dim: !ref <attention_rnn_dim>
    attention_dim: !ref <attention_dim>
    attention_location_n_filters: !ref <attention_location_n_filters>
    attention_location_kernel_size: !ref <attention_location_kernel_size>
    n_frames_per_step: !ref <n_frames_per_step>
    decoder_rnn_dim: !ref <decoder_rnn_dim>
    prenet_dim: !ref <prenet_dim>
    max_decoder_steps: !ref <max_decoder_steps>
    gate_threshold: !ref <gate_threshold>
    p_attention_dropout: !ref <p_attention_dropout>
    p_decoder_dropout: !ref <p_decoder_dropout>
    postnet_embedding_dim: !ref <postnet_embedding_dim>
    postnet_kernel_size: !ref <postnet_kernel_size>
    postnet_n_convolutions: !ref <postnet_n_convolutions>
    decoder_no_early_stopping: !ref <decoder_no_early_stopping>
    spk_emb_size: !ref <spk_emb_size>
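
# Modules made available to the pretrained/inference interface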
modules:
    model: !ref <model>
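
# Grapheme-to-phoneme model (HuggingFace source) used to phonemize input text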
g2p: speechbrain/soundchoice-g2p |
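
# Pretrainer: loads the pretrained acoustic model weights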
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        model: !ref <model>
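
# Example usage (a sketch, not part of the configuration). The import path,
# class, and method names below are assumptions based on SpeechBrain's
# MSTacotron2 pretrained interface and may differ between SpeechBrain
# versions; a vocoder (e.g. HiFi-GAN at 22.05 kHz) is still needed to turn
# the predicted mel spectrograms into audio.
#
#   from speechbrain.inference.TTS import MSTacotron2
#
#   tts = MSTacotron2.from_hparams(source="speechbrain/tts-mstacotron2-libritts")
#
#   # Clone the voice of a reference recording for the given text
#   mel_outputs, mel_lengths, alignments = tts.clone_voice(
#       "Mary had a little lamb.", "reference_speaker.wav"
#   )
#
#   # Or synthesize with a randomly sampled speaker embedding
#   mel_outputs, mel_lengths, alignments = tts.generate_random_voice(
#       "Mary had a little lamb."
#   )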
|
|