|
{ |
|
"_name_or_path": "hahunavth/emofs2-base", |
|
"architectures": [ |
|
"ESSModelForPretraining" |
|
], |
|
"freeze": [], |
|
"model_config": { |
|
"conformer": { |
|
"attention_dropout_p": 0.2, |
|
"conv_dropout_p": 0.2, |
|
"conv_expansion_factor": 2, |
|
"conv_kernel_size": 7, |
|
"decoder_dim": 256, |
|
"encoder_dim": 256, |
|
"feed_forward_dropout_p": 0.2, |
|
"feed_forward_expansion_factor": 4, |
|
"half_step_residual": true, |
|
"num_attention_heads": 2, |
|
"num_decode_layers": 6, |
|
"num_encode_layers": 4 |
|
}, |
|
"max_seq_len": 1000, |
|
"mode": "train", |
|
"num_emotion": 5, |
|
"reference_encoder": { |
|
"dropout": 0.2, |
|
"encoder_dim": 128 |
|
}, |
|
"variance_embedding": { |
|
"energy_quantization": "linear", |
|
"n_bins": 256, |
|
"pitch_quantization": "linear" |
|
}, |
|
"variance_predictor": { |
|
"dropout": 0.5, |
|
"filter_size": 256, |
|
"kernel_size": 3 |
|
}, |
|
"vocoder": { |
|
"model": "HiFi-GAN", |
|
"speaker": "tth" |
|
} |
|
}, |
|
"model_type": "emofs2", |
|
"preprocess_config": { |
|
"dataset": "vlsp2023emo", |
|
"emotion2id": { |
|
"angry": 3, |
|
"happy": 1, |
|
"neutral": 0, |
|
"sad": 2, |
|
"surprise": 4 |
|
}, |
|
"id2emotion": { |
|
"0": "neutral", |
|
"1": "happy", |
|
"2": "sad", |
|
"3": "angry", |
|
"4": "surprise" |
|
}, |
|
"path": { |
|
"corpus_path": "./data/pretrained_tts_dataset/tuyendv.dict", |
|
"lexicon_path": "../datasets/ess-vlsp2023-lexicon/lexicon.dict", |
|
"preprocessed_path": "../datasets/ess-vlsp2023-emo-processed-phoneme-level", |
|
"raw_path": "./data/pretrained_tts_dataset_raw" |
|
}, |
|
"preprocessing": { |
|
"audio": { |
|
"max_wav_value": 32768.0, |
|
"sampling_rate": 22050 |
|
}, |
|
"energy": { |
|
"feature": "phoneme_level", |
|
"normalization": true |
|
}, |
|
"mel": { |
|
"mel_fmax": 8000, |
|
"mel_fmin": 0, |
|
"n_mel_channels": 80 |
|
}, |
|
"pitch": { |
|
"feature": "phoneme_level", |
|
"normalization": true |
|
}, |
|
"stft": { |
|
"filter_length": 1024, |
|
"hop_length": 256, |
|
"win_length": 1024 |
|
}, |
|
"text": { |
|
"language": "en", |
|
"text_cleaners": [] |
|
}, |
|
"val_size": 512 |
|
}, |
|
"smoothing_label": 0.1 |
|
}, |
|
"torch_dtype": "float32", |
|
"transformers_version": "4.35.2" |
|
} |
|
|