diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..441062e481ec29f9c71e288a6fc0a213ccbdfb21 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +imgs/vocoder/gan/MSSBCQTD.png filter=lfs diff=lfs merge=lfs -text diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..3a6f0e15d39b6e1ad73c9291a2f868b3220db6fd --- /dev/null +++ b/Dockerfile @@ -0,0 +1,64 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +# Other version: https://hub.docker.com/r/nvidia/cuda/tags +FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu18.04 + +ARG DEBIAN_FRONTEND=noninteractive +ARG PYTORCH='2.0.0' +ARG CUDA='cu118' +ARG SHELL='/bin/bash' +ARG MINICONDA='Miniconda3-py39_23.3.1-0-Linux-x86_64.sh' + +ENV LANG=en_US.UTF-8 PYTHONIOENCODING=utf-8 PYTHONDONTWRITEBYTECODE=1 CUDA_HOME=/usr/local/cuda CONDA_HOME=/opt/conda SHELL=${SHELL} +ENV PATH=$CONDA_HOME/bin:$CUDA_HOME/bin:$PATH \ + LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH \ + LIBRARY_PATH=$CUDA_HOME/lib64:$LIBRARY_PATH \ + CONDA_PREFIX=$CONDA_HOME \ + NCCL_HOME=$CUDA_HOME + +# Install ubuntu packages +RUN sed -i 's/archive.ubuntu.com/mirrors.cloud.tencent.com/g' /etc/apt/sources.list \ + && sed -i 's/security.ubuntu.com/mirrors.cloud.tencent.com/g' /etc/apt/sources.list \ + && rm /etc/apt/sources.list.d/cuda.list \ + && apt-get update \ + && apt-get -y install \ + python3-pip ffmpeg git less wget libsm6 libxext6 libxrender-dev \ + build-essential cmake pkg-config libx11-dev libatlas-base-dev \ + libgtk-3-dev libboost-python-dev vim libgl1-mesa-glx \ + libaio-dev software-properties-common tmux \ + espeak-ng + +# Install miniconda with python 3.9 +USER root +# COPY Miniconda3-py39_23.3.1-0-Linux-x86_64.sh /root/anaconda.sh +RUN wget -t 0 -c -O /tmp/anaconda.sh https://repo.anaconda.com/miniconda/${MINICONDA} \ + && mv /tmp/anaconda.sh /root/anaconda.sh \ + && ${SHELL} /root/anaconda.sh -b -p $CONDA_HOME \ + && rm /root/anaconda.sh + +RUN conda create -y --name amphion python=3.9.15 + +WORKDIR /app +COPY env.sh env.sh +RUN chmod +x ./env.sh + +RUN ["conda", "run", "-n", "amphion", "-vvv", "--no-capture-output", "./env.sh"] + +RUN conda init \ + && echo "\nconda activate amphion\n" >> ~/.bashrc + +CMD ["/bin/bash"] + +# *** Build *** +# docker build -t realamphion/amphion . 
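+# The ARGs declared above (e.g. MINICONDA, SHELL) can be overridden at build time.
+# Illustrative example only -- the alternative installer filename is an assumption and
+# must exist under https://repo.anaconda.com/miniconda/ :
+# docker build --build-arg MINICONDA='Miniconda3-py39_23.5.2-0-Linux-x86_64.sh' -t realamphion/amphion .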
+ +# *** Run *** +# cd Amphion +# docker run --runtime=nvidia --gpus all -it -v .:/app -v /mnt:/mnt_host realamphion/amphion + +# *** Push and release *** +# docker login +# docker push realamphion/amphion diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..68c66d23fb4c573b2fe039374d2f0a561a36f892 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Amphion + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/bins/calc_metrics.py b/bins/calc_metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..9c83afe2611ac93e67e2a4436457e1450a1cd1b1 --- /dev/null +++ b/bins/calc_metrics.py @@ -0,0 +1,268 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
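+# Illustrative usage of this script (paths are placeholders; the flags are those
+# defined in the argument parser below, and the metric names are keys of METRIC_FUNC):
+#
+#   python bins/calc_metrics.py \
+#       --ref_dir <path/to/reference_wavs> \
+#       --deg_dir <path/to/generated_wavs> \
+#       --dump_dir <path/to/output_dir> \
+#       --metrics fpc mcd pesq \
+#       --fs 24000
+#
+# The per-metric scores are written to <dump_dir>/result.json.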
+ +import os +import sys +import numpy as np +import json +import argparse +import whisper +import torch + +from glob import glob +from tqdm import tqdm +from collections import defaultdict + + +from evaluation.metrics.energy.energy_rmse import extract_energy_rmse +from evaluation.metrics.energy.energy_pearson_coefficients import ( + extract_energy_pearson_coeffcients, +) +from evaluation.metrics.f0.f0_pearson_coefficients import extract_fpc +from evaluation.metrics.f0.f0_periodicity_rmse import extract_f0_periodicity_rmse +from evaluation.metrics.f0.f0_rmse import extract_f0rmse +from evaluation.metrics.f0.v_uv_f1 import extract_f1_v_uv +from evaluation.metrics.intelligibility.character_error_rate import extract_cer +from evaluation.metrics.intelligibility.word_error_rate import extract_wer +from evaluation.metrics.similarity.speaker_similarity import extract_similarity +from evaluation.metrics.spectrogram.frechet_distance import extract_fad +from evaluation.metrics.spectrogram.mel_cepstral_distortion import extract_mcd +from evaluation.metrics.spectrogram.multi_resolution_stft_distance import extract_mstft +from evaluation.metrics.spectrogram.pesq import extract_pesq +from evaluation.metrics.spectrogram.scale_invariant_signal_to_distortion_ratio import ( + extract_si_sdr, +) +from evaluation.metrics.spectrogram.scale_invariant_signal_to_noise_ratio import ( + extract_si_snr, +) +from evaluation.metrics.spectrogram.short_time_objective_intelligibility import ( + extract_stoi, +) + +METRIC_FUNC = { + "energy_rmse": extract_energy_rmse, + "energy_pc": extract_energy_pearson_coeffcients, + "fpc": extract_fpc, + "f0_periodicity_rmse": extract_f0_periodicity_rmse, + "f0rmse": extract_f0rmse, + "v_uv_f1": extract_f1_v_uv, + "cer": extract_cer, + "wer": extract_wer, + "similarity": extract_similarity, + "fad": extract_fad, + "mcd": extract_mcd, + "mstft": extract_mstft, + "pesq": extract_pesq, + "si_sdr": extract_si_sdr, + "si_snr": extract_si_snr, + "stoi": extract_stoi, +} + + +def calc_metric( + ref_dir, + deg_dir, + dump_dir, + metrics, + **kwargs, +): + result = defaultdict() + + for metric in tqdm(metrics): + if metric in ["fad", "similarity"]: + result[metric] = str(METRIC_FUNC[metric](ref_dir, deg_dir, kwargs=kwargs)) + continue + + audios_ref = [] + audios_deg = [] + + files = glob(deg_dir + "/*.wav") + + for file in files: + audios_deg.append(file) + uid = file.split("/")[-1].split(".wav")[0] + file_gt = ref_dir + "/{}.wav".format(uid) + audios_ref.append(file_gt) + + if metric in ["wer", "cer"] and kwargs["intelligibility_mode"] == "gt_content": + ltr_path = kwargs["ltr_path"] + tmpltrs = {} + with open(ltr_path, "r") as f: + for line in f: + paras = line.replace("\n", "").split("|") + paras[1] = paras[1].replace(" ", "") + paras[1] = paras[1].replace(".", "") + paras[1] = paras[1].replace("'", "") + paras[1] = paras[1].replace("-", "") + paras[1] = paras[1].replace(",", "") + paras[1] = paras[1].replace("!", "") + paras[1] = paras[1].lower() + tmpltrs[paras[0]] = paras[1] + ltrs = [] + files = glob(ref_dir + "/*.wav") + for file in files: + ltrs.append(tmpltrs[os.path.basename(file)]) + + if metric in ["v_uv_f1"]: + tp_total = 0 + fp_total = 0 + fn_total = 0 + + for i in tqdm(range(len(audios_ref))): + audio_ref = audios_ref[i] + audio_deg = audios_deg[i] + tp, fp, fn = METRIC_FUNC[metric](audio_ref, audio_deg, kwargs=kwargs) + tp_total += tp + fp_total += fp + fn_total += fn + + result[metric] = str(tp_total / (tp_total + (fp_total + fn_total) / 2)) + else: + scores = [] + for i in 
tqdm(range(len(audios_ref))): + audio_ref = audios_ref[i] + audio_deg = audios_deg[i] + + if metric in ["wer", "cer"]: + model = whisper.load_model("large") + mode = kwargs["intelligibility_mode"] + if torch.cuda.is_available(): + device = torch.device("cuda") + model = model.to(device) + + if mode == "gt_audio": + kwargs["audio_ref"] = audio_ref + kwargs["audio_deg"] = audio_deg + score = METRIC_FUNC[metric]( + model, + kwargs=kwargs, + ) + elif mode == "gt_content": + kwargs["content_gt"] = ltrs[i] + kwargs["audio_deg"] = audio_deg + score = METRIC_FUNC[metric]( + model, + kwargs=kwargs, + ) + else: + score = METRIC_FUNC[metric]( + audio_ref, + audio_deg, + kwargs=kwargs, + ) + if not np.isnan(score): + scores.append(score) + + scores = np.array(scores) + result["{}".format(metric)] = str(np.mean(scores)) + + data = json.dumps(result, indent=4) + + with open(os.path.join(dump_dir, "result.json"), "w", newline="\n") as f: + f.write(data) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--ref_dir", + type=str, + help="Path to the reference audio folder.", + ) + parser.add_argument( + "--deg_dir", + type=str, + help="Path to the test audio folder.", + ) + parser.add_argument( + "--dump_dir", + type=str, + help="Path to dump the results.", + ) + parser.add_argument( + "--metrics", + nargs="+", + help="Metrics used to evaluate.", + ) + parser.add_argument( + "--fs", + type=str, + default="None", + help="(Optional) Sampling rate", + ) + parser.add_argument( + "--align_method", + type=str, + default="dtw", + help="(Optional) Method for aligning feature length. ['cut', 'dtw']", + ) + + parser.add_argument( + "--db_scale", + type=str, + default="True", + help="(Optional) Wether or not computing energy related metrics in db scale.", + ) + parser.add_argument( + "--f0_subtract_mean", + type=str, + default="True", + help="(Optional) Wether or not computing f0 related metrics with mean value subtracted.", + ) + + parser.add_argument( + "--similarity_model", + type=str, + default="wavlm", + help="(Optional)The model for computing speaker similarity. 
['rawnet', 'wavlm', 'resemblyzer']", + ) + parser.add_argument( + "--similarity_mode", + type=str, + default="pairwith", + help="(Optional)The method of calculating similarity, where set to overall means computing \ + the speaker similarity between two folder of audios content freely, and set to pairwith means \ + computing the speaker similarity between a seires of paired gt/pred audios", + ) + + parser.add_argument( + "--ltr_path", + type=str, + default="None", + help="(Optional)Path to the transcription file,Note that the format in the transcription \ + file is 'file name|transcription'", + ) + parser.add_argument( + "--intelligibility_mode", + type=str, + default="gt_audio", + help="(Optional)The method of calculating WER and CER, where set to gt_audio means selecting \ + the recognition content of the reference audio as the target, and set to gt_content means \ + using transcription as the target", + ) + parser.add_argument( + "--language", + type=str, + default="english", + help="(Optional)['english','chinese']", + ) + + args = parser.parse_args() + + calc_metric( + args.ref_dir, + args.deg_dir, + args.dump_dir, + args.metrics, + fs=int(args.fs) if args.fs != "None" else None, + method=args.align_method, + db_scale=True if args.db_scale == "True" else False, + need_mean=True if args.f0_subtract_mean == "True" else False, + model_name=args.similarity_model, + similarity_mode=args.similarity_mode, + ltr_path=args.ltr_path, + intelligibility_mode=args.intelligibility_mode, + language=args.language, + ) diff --git a/bins/codec/inference.py b/bins/codec/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..7c44215dae756a9f21981e1a94401672fdbe1dbb --- /dev/null +++ b/bins/codec/inference.py @@ -0,0 +1,99 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +from argparse import ArgumentParser +import os + +from models.codec.facodec.facodec_inference import FAcodecInference +from utils.util import load_config +import torch + + +def build_inference(args, cfg): + supported_inference = { + "FAcodec": FAcodecInference, + } + + inference_class = supported_inference[cfg.model_type] + inference = inference_class(args, cfg) + return inference + + +def cuda_relevant(deterministic=False): + torch.cuda.empty_cache() + # TF32 on Ampere and above + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.enabled = True + torch.backends.cudnn.allow_tf32 = True + # Deterministic + torch.backends.cudnn.deterministic = deterministic + torch.backends.cudnn.benchmark = not deterministic + torch.use_deterministic_algorithms(deterministic) + + +def build_parser(): + parser = argparse.ArgumentParser() + + parser.add_argument( + "--config", + type=str, + required=True, + help="JSON/YAML file for configurations.", + ) + parser.add_argument( + "--checkpoint_path", + type=str, + default=None, + help="Acoustic model checkpoint directory. If a directory is given, " + "search for the latest checkpoint dir in the directory. 
If a specific " + "checkpoint dir is given, directly load the checkpoint.", + ) + parser.add_argument( + "--source", + type=str, + required=True, + help="Path to the source audio file", + ) + parser.add_argument( + "--reference", + type=str, + default=None, + help="Path to the reference audio file, passing an", + ) + parser.add_argument( + "--output_dir", + type=str, + default=None, + help="Output dir for saving generated results", + ) + return parser + + +def main(): + # Parse arguments + parser = build_parser() + args = parser.parse_args() + print(args) + + # Parse config + cfg = load_config(args.config) + + # CUDA settings + cuda_relevant() + + # Build inference + inferencer = build_inference(args, cfg) + + # Run inference + _ = inferencer.inference(args.source, args.output_dir) + + # Run voice conversion + if args.reference is not None: + _ = inferencer.voice_conversion(args.source, args.reference, args.output_dir) + + +if __name__ == "__main__": + main() diff --git a/bins/codec/train.py b/bins/codec/train.py new file mode 100644 index 0000000000000000000000000000000000000000..aff9da177265f26b4d9200724c0ae05b930f076e --- /dev/null +++ b/bins/codec/train.py @@ -0,0 +1,79 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse + +import torch + +from models.codec.facodec.facodec_trainer import FAcodecTrainer + +from utils.util import load_config + + +def build_trainer(args, cfg): + supported_trainer = { + "FAcodec": FAcodecTrainer, + } + + trainer_class = supported_trainer[cfg.model_type] + trainer = trainer_class(args, cfg) + return trainer + + +def cuda_relevant(deterministic=False): + torch.cuda.empty_cache() + # TF32 on Ampere and above + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.enabled = True + torch.backends.cudnn.benchmark = False + torch.backends.cudnn.allow_tf32 = True + # Deterministic + torch.backends.cudnn.deterministic = deterministic + torch.backends.cudnn.benchmark = not deterministic + torch.use_deterministic_algorithms(deterministic) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", + default="config.json", + help="json files for configurations.", + required=True, + ) + parser.add_argument( + "--exp_name", + type=str, + default="exp_name", + help="A specific name to note the experiment", + required=True, + ) + parser.add_argument( + "--resume_type", + type=str, + help="resume for continue to train, finetune for finetuning", + ) + parser.add_argument( + "--checkpoint", + type=str, + help="checkpoint to resume", + ) + parser.add_argument( + "--log_level", default="warning", help="logging level (debug, info, warning)" + ) + args = parser.parse_args() + cfg = load_config(args.config) + + # CUDA settings + cuda_relevant() + + # Build trainer + trainer = build_trainer(args, cfg) + + trainer.train_loop() + + +if __name__ == "__main__": + main() diff --git a/bins/svc/inference.py b/bins/svc/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..da0031d13696566b121f6a376a32f7ad4af4e67a --- /dev/null +++ b/bins/svc/inference.py @@ -0,0 +1,265 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
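+# Illustrative usage (flags as defined in build_parser() below; paths and the
+# singer id are placeholders):
+#
+#   python bins/svc/inference.py \
+#       --config <path/to/exp_config.json> \
+#       --acoustics_dir <path/to/acoustic_ckpt_dir> \
+#       --vocoder_dir <path/to/vocoder_ckpt_dir> \
+#       --target_singer <singer_id> \
+#       --trans_key autoshift \
+#       --source <path/to/source_audio_dir> \
+#       --output_dir conversion_results
+#
+# When --source is a directory, each wav/flac/mp3 file in it is split into ~10 s
+# segments, converted, and merged back into <audio_name>_<target_singer>.wav.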
+ +import argparse +import os +import glob +from tqdm import tqdm +import json +import torch +import time + +from models.svc.diffusion.diffusion_inference import DiffusionInference +from models.svc.comosvc.comosvc_inference import ComoSVCInference +from models.svc.transformer.transformer_inference import TransformerInference +from models.svc.vits.vits_inference import VitsInference +from utils.util import load_config +from utils.audio_slicer import split_audio, merge_segments_encodec +from processors import acoustic_extractor, content_extractor + + +def build_inference(args, cfg, infer_type="from_dataset"): + supported_inference = { + "DiffWaveNetSVC": DiffusionInference, + "DiffComoSVC": ComoSVCInference, + "TransformerSVC": TransformerInference, + "VitsSVC": VitsInference, + } + + inference_class = supported_inference[cfg.model_type] + return inference_class(args, cfg, infer_type) + + +def prepare_for_audio_file(args, cfg, num_workers=1): + preprocess_path = cfg.preprocess.processed_dir + audio_name = cfg.inference.source_audio_name + temp_audio_dir = os.path.join(preprocess_path, audio_name) + + ### eval file + t = time.time() + eval_file = prepare_source_eval_file(cfg, temp_audio_dir, audio_name) + args.source = eval_file + with open(eval_file, "r") as f: + metadata = json.load(f) + print("Prepare for meta eval data: {:.1f}s".format(time.time() - t)) + + ### acoustic features + t = time.time() + acoustic_extractor.extract_utt_acoustic_features_serial( + metadata, temp_audio_dir, cfg + ) + if cfg.preprocess.use_min_max_norm_mel == True: + acoustic_extractor.cal_mel_min_max( + dataset=audio_name, output_path=preprocess_path, cfg=cfg, metadata=metadata + ) + acoustic_extractor.cal_pitch_statistics_svc( + dataset=audio_name, output_path=preprocess_path, cfg=cfg, metadata=metadata + ) + print("Prepare for acoustic features: {:.1f}s".format(time.time() - t)) + + ### content features + t = time.time() + content_extractor.extract_utt_content_features_dataloader( + cfg, metadata, num_workers + ) + print("Prepare for content features: {:.1f}s".format(time.time() - t)) + return args, cfg, temp_audio_dir + + +def merge_for_audio_segments(audio_files, args, cfg): + audio_name = cfg.inference.source_audio_name + target_singer_name = args.target_singer + + merge_segments_encodec( + wav_files=audio_files, + fs=cfg.preprocess.sample_rate, + output_path=os.path.join( + args.output_dir, "{}_{}.wav".format(audio_name, target_singer_name) + ), + overlap_duration=cfg.inference.segments_overlap_duration, + ) + + for tmp_file in audio_files: + os.remove(tmp_file) + + +def prepare_source_eval_file(cfg, temp_audio_dir, audio_name): + """ + Prepare the eval file (json) for an audio + """ + + audio_chunks_results = split_audio( + wav_file=cfg.inference.source_audio_path, + target_sr=cfg.preprocess.sample_rate, + output_dir=os.path.join(temp_audio_dir, "wavs"), + max_duration_of_segment=cfg.inference.segments_max_duration, + overlap_duration=cfg.inference.segments_overlap_duration, + ) + + metadata = [] + for i, res in enumerate(audio_chunks_results): + res["index"] = i + res["Dataset"] = audio_name + res["Singer"] = audio_name + res["Uid"] = "{}_{}".format(audio_name, res["Uid"]) + metadata.append(res) + + eval_file = os.path.join(temp_audio_dir, "eval.json") + with open(eval_file, "w") as f: + json.dump(metadata, f, indent=4, ensure_ascii=False, sort_keys=True) + + return eval_file + + +def cuda_relevant(deterministic=False): + torch.cuda.empty_cache() + # TF32 on Ampere and above + 
torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.enabled = True + torch.backends.cudnn.allow_tf32 = True + # Deterministic + torch.backends.cudnn.deterministic = deterministic + torch.backends.cudnn.benchmark = not deterministic + torch.use_deterministic_algorithms(deterministic) + + +def infer(args, cfg, infer_type): + # Build inference + t = time.time() + trainer = build_inference(args, cfg, infer_type) + print("Model Init: {:.1f}s".format(time.time() - t)) + + # Run inference + t = time.time() + output_audio_files = trainer.inference() + print("Model inference: {:.1f}s".format(time.time() - t)) + return output_audio_files + + +def build_parser(): + r"""Build argument parser for inference.py. + Anything else should be put in an extra config YAML file. + """ + + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", + type=str, + required=True, + help="JSON/YAML file for configurations.", + ) + parser.add_argument( + "--acoustics_dir", + type=str, + help="Acoustics model checkpoint directory. If a directory is given, " + "search for the latest checkpoint dir in the directory. If a specific " + "checkpoint dir is given, directly load the checkpoint.", + ) + parser.add_argument( + "--vocoder_dir", + type=str, + required=True, + help="Vocoder checkpoint directory. Searching behavior is the same as " + "the acoustics one.", + ) + parser.add_argument( + "--target_singer", + type=str, + required=True, + help="convert to a specific singer (e.g. --target_singers singer_id).", + ) + parser.add_argument( + "--trans_key", + default=0, + help="0: no pitch shift; autoshift: pitch shift; int: key shift.", + ) + parser.add_argument( + "--source", + type=str, + default="source_audio", + help="Source audio file or directory. If a JSON file is given, " + "inference from dataset is applied. If a directory is given, " + "inference from all wav/flac/mp3 audio files in the directory is applied. " + "Default: inference from all wav/flac/mp3 audio files in ./source_audio", + ) + parser.add_argument( + "--output_dir", + type=str, + default="conversion_results", + help="Output directory. Default: ./conversion_results", + ) + parser.add_argument( + "--log_level", + type=str, + default="warning", + help="Logging level. Default: warning", + ) + parser.add_argument( + "--keep_cache", + action="store_true", + default=True, + help="Keep cache files. Only applicable to inference from files.", + ) + parser.add_argument( + "--diffusion_inference_steps", + type=int, + default=1000, + help="Number of inference steps. 
Only applicable to diffusion inference.", + ) + return parser + + +def main(): + ### Parse arguments and config + args = build_parser().parse_args() + cfg = load_config(args.config) + + # CUDA settings + cuda_relevant() + + if os.path.isdir(args.source): + ### Infer from file + + # Get all the source audio files (.wav, .flac, .mp3) + source_audio_dir = args.source + audio_list = [] + for suffix in ["wav", "flac", "mp3"]: + audio_list += glob.glob( + os.path.join(source_audio_dir, "**/*.{}".format(suffix)), recursive=True + ) + print("There are {} source audios: ".format(len(audio_list))) + + # Infer for every file as dataset + output_root_path = args.output_dir + for audio_path in tqdm(audio_list): + audio_name = audio_path.split("/")[-1].split(".")[0] + args.output_dir = os.path.join(output_root_path, audio_name) + print("\n{}\nConversion for {}...\n".format("*" * 10, audio_name)) + + cfg.inference.source_audio_path = audio_path + cfg.inference.source_audio_name = audio_name + cfg.inference.segments_max_duration = 10.0 + cfg.inference.segments_overlap_duration = 1.0 + + # Prepare metadata and features + args, cfg, cache_dir = prepare_for_audio_file(args, cfg) + + # Infer from file + output_audio_files = infer(args, cfg, infer_type="from_file") + + # Merge the split segments + merge_for_audio_segments(output_audio_files, args, cfg) + + # Keep or remove caches + if not args.keep_cache: + os.removedirs(cache_dir) + + else: + ### Infer from dataset + infer(args, cfg, infer_type="from_dataset") + + +if __name__ == "__main__": + main() diff --git a/bins/svc/preprocess.py b/bins/svc/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..453b5001463dff8782c753090e66cd9ac227a066 --- /dev/null +++ b/bins/svc/preprocess.py @@ -0,0 +1,183 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import faulthandler + +faulthandler.enable() + +import os +import argparse +import json +from multiprocessing import cpu_count + + +from utils.util import load_config +from preprocessors.processor import preprocess_dataset +from preprocessors.metadata import cal_metadata +from processors import acoustic_extractor, content_extractor, data_augment + + +def extract_acoustic_features(dataset, output_path, cfg, n_workers=1): + """Extract acoustic features of utterances in the dataset + + Args: + dataset (str): name of dataset, e.g. opencpop + output_path (str): directory that stores train, test and feature files of datasets + cfg (dict): dictionary that stores configurations + n_workers (int, optional): num of processes to extract features in parallel. Defaults to 1. + """ + types = ["train", "test"] if "eval" not in dataset else ["test"] + metadata = [] + dataset_output = os.path.join(output_path, dataset) + + for dataset_type in types: + dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type)) + with open(dataset_file, "r") as f: + metadata.extend(json.load(f)) + + # acoustic_extractor.extract_utt_acoustic_features_parallel( + # metadata, dataset_output, cfg, n_workers=n_workers + # ) + acoustic_extractor.extract_utt_acoustic_features_serial( + metadata, dataset_output, cfg + ) + + +def extract_content_features(dataset, output_path, cfg, num_workers=1): + """Extract content features of utterances in the dataset + + Args: + dataset (str): name of dataset, e.g. 
opencpop + output_path (str): directory that stores train, test and feature files of datasets + cfg (dict): dictionary that stores configurations + """ + types = ["train", "test"] if "eval" not in dataset else ["test"] + metadata = [] + for dataset_type in types: + dataset_output = os.path.join(output_path, dataset) + dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type)) + with open(dataset_file, "r") as f: + metadata.extend(json.load(f)) + + content_extractor.extract_utt_content_features_dataloader( + cfg, metadata, num_workers + ) + + +def preprocess(cfg, args): + """Proprocess raw data of single or multiple datasets (in cfg.dataset) + + Args: + cfg (dict): dictionary that stores configurations + args (ArgumentParser): specify the configuration file and num_workers + """ + # Specify the output root path to save the processed data + output_path = cfg.preprocess.processed_dir + os.makedirs(output_path, exist_ok=True) + + ## Split train and test sets + for dataset in cfg.dataset: + print("Preprocess {}...".format(dataset)) + preprocess_dataset( + dataset, + cfg.dataset_path[dataset], + output_path, + cfg.preprocess, + cfg.task_type, + is_custom_dataset=dataset in cfg.use_custom_dataset, + ) + + # Data augmentation: create new wav files with pitch shift, formant shift, equalizer, time stretch + try: + assert isinstance( + cfg.preprocess.data_augment, list + ), "Please provide a list of datasets need to be augmented." + if len(cfg.preprocess.data_augment) > 0: + new_datasets_list = [] + for dataset in cfg.preprocess.data_augment: + new_datasets = data_augment.augment_dataset(cfg, dataset) + new_datasets_list.extend(new_datasets) + cfg.dataset.extend(new_datasets_list) + print("Augmentation datasets: ", cfg.dataset) + except: + print("No Data Augmentation.") + + # Dump metadata of datasets (singers, train/test durations, etc.) 
+ cal_metadata(cfg) + + ## Prepare the acoustic features + for dataset in cfg.dataset: + # Skip augmented datasets which do not need to extract acoustic features + # We will copy acoustic features from the original dataset later + if ( + "pitch_shift" in dataset + or "formant_shift" in dataset + or "equalizer" in dataset in dataset + ): + continue + print( + "Extracting acoustic features for {} using {} workers ...".format( + dataset, args.num_workers + ) + ) + extract_acoustic_features(dataset, output_path, cfg, args.num_workers) + # Calculate the statistics of acoustic features + if cfg.preprocess.mel_min_max_norm: + acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg) + + if cfg.preprocess.extract_pitch: + acoustic_extractor.cal_pitch_statistics_svc(dataset, output_path, cfg) + + # Copy acoustic features for augmented datasets by creating soft-links + for dataset in cfg.dataset: + if "pitch_shift" in dataset: + src_dataset = dataset.replace("_pitch_shift", "") + src_dataset_dir = os.path.join(output_path, src_dataset) + elif "formant_shift" in dataset: + src_dataset = dataset.replace("_formant_shift", "") + src_dataset_dir = os.path.join(output_path, src_dataset) + elif "equalizer" in dataset: + src_dataset = dataset.replace("_equalizer", "") + src_dataset_dir = os.path.join(output_path, src_dataset) + else: + continue + dataset_dir = os.path.join(output_path, dataset) + metadata = [] + for split in ["train", "test"] if not "eval" in dataset else ["test"]: + metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split)) + with open(metadata_file_path, "r") as f: + metadata.extend(json.load(f)) + print("Copying acoustic features for {}...".format(dataset)) + acoustic_extractor.copy_acoustic_features( + metadata, dataset_dir, src_dataset_dir, cfg + ) + if cfg.preprocess.mel_min_max_norm: + acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg) + + if cfg.preprocess.extract_pitch: + acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg) + + # Prepare the content features + for dataset in cfg.dataset: + print("Extracting content features for {}...".format(dataset)) + extract_content_features(dataset, output_path, cfg, args.num_workers) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", default="config.json", help="json files for configurations." + ) + parser.add_argument("--num_workers", type=int, default=int(cpu_count())) + parser.add_argument("--prepare_alignment", type=bool, default=False) + + args = parser.parse_args() + cfg = load_config(args.config) + + preprocess(cfg, args) + + +if __name__ == "__main__": + main() diff --git a/bins/svc/train.py b/bins/svc/train.py new file mode 100644 index 0000000000000000000000000000000000000000..0c20d5b44fdb62c5d2cf2b2d44c10f2f8db687aa --- /dev/null +++ b/bins/svc/train.py @@ -0,0 +1,111 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
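+# Illustrative usage (flags as defined below; the config path is a placeholder):
+#
+#   python bins/svc/train.py \
+#       --config <path/to/exp_config.json> \
+#       --exp_name my_svc_exp \
+#       --log_level info
+#
+# Add --resume (optionally with --resume_from_ckpt_path) to continue an interrupted
+# run; use --resume_type finetune to load only the model weights.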
+ +import argparse + +import torch + +from models.svc.diffusion.diffusion_trainer import DiffusionTrainer +from models.svc.comosvc.comosvc_trainer import ComoSVCTrainer +from models.svc.transformer.transformer_trainer import TransformerTrainer +from models.svc.vits.vits_trainer import VitsSVCTrainer +from utils.util import load_config + + +def build_trainer(args, cfg): + supported_trainer = { + "DiffWaveNetSVC": DiffusionTrainer, + "DiffComoSVC": ComoSVCTrainer, + "TransformerSVC": TransformerTrainer, + "VitsSVC": VitsSVCTrainer, + } + + trainer_class = supported_trainer[cfg.model_type] + trainer = trainer_class(args, cfg) + return trainer + + +def cuda_relevant(deterministic=False): + torch.cuda.empty_cache() + # TF32 on Ampere and above + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.enabled = True + torch.backends.cudnn.allow_tf32 = True + # Deterministic + torch.backends.cudnn.deterministic = deterministic + torch.backends.cudnn.benchmark = not deterministic + torch.use_deterministic_algorithms(deterministic) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", + default="config.json", + help="json files for configurations.", + required=True, + ) + parser.add_argument( + "--exp_name", + type=str, + default="exp_name", + help="A specific name to note the experiment", + required=True, + ) + parser.add_argument( + "--resume", + action="store_true", + help="If specified, to resume from the existing checkpoint.", + ) + parser.add_argument( + "--resume_from_ckpt_path", + type=str, + default="", + help="The specific checkpoint path that you want to resume from.", + ) + parser.add_argument( + "--resume_type", + type=str, + default="", + help="`resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights", + ) + + parser.add_argument( + "--log_level", default="warning", help="logging level (debug, info, warning)" + ) + args = parser.parse_args() + cfg = load_config(args.config) + + # Data Augmentation + if ( + type(cfg.preprocess.data_augment) == list + and len(cfg.preprocess.data_augment) > 0 + ): + new_datasets_list = [] + for dataset in cfg.preprocess.data_augment: + new_datasets = [ + f"{dataset}_pitch_shift" if cfg.preprocess.use_pitch_shift else None, + ( + f"{dataset}_formant_shift" + if cfg.preprocess.use_formant_shift + else None + ), + f"{dataset}_equalizer" if cfg.preprocess.use_equalizer else None, + f"{dataset}_time_stretch" if cfg.preprocess.use_time_stretch else None, + ] + new_datasets_list.extend(filter(None, new_datasets)) + cfg.dataset.extend(new_datasets_list) + + # CUDA settings + cuda_relevant() + + # Build trainer + trainer = build_trainer(args, cfg) + + trainer.train_loop() + + +if __name__ == "__main__": + main() diff --git a/bins/tta/inference.py b/bins/tta/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..85179049c27d2f7c7d6d258afc333bec203dd207 --- /dev/null +++ b/bins/tta/inference.py @@ -0,0 +1,94 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
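+# Illustrative usage (flags as defined in build_parser() below; paths are placeholders):
+#
+#   python bins/tta/inference.py \
+#       --config <path/to/exp_config.json> \
+#       --checkpoint_path <path/to/audioldm_checkpoint> \
+#       --vocoder_path <path/to/vocoder_checkpoint> \
+#       --vocoder_config_path <path/to/vocoder_config.json> \
+#       --text "A dog is barking in the distance." \
+#       --num_steps 200 --guidance_scale 4.0 \
+#       --output_dir <path/to/output_dir>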
+ +import argparse +from argparse import ArgumentParser +import os + +from models.tta.ldm.audioldm_inference import AudioLDMInference +from utils.util import save_config, load_model_config, load_config +import numpy as np +import torch + + +def build_inference(args, cfg): + supported_inference = { + "AudioLDM": AudioLDMInference, + } + + inference_class = supported_inference[cfg.model_type] + inference = inference_class(args, cfg) + return inference + + +def build_parser(): + parser = argparse.ArgumentParser() + + parser.add_argument( + "--config", + type=str, + required=True, + help="JSON/YAML file for configurations.", + ) + parser.add_argument( + "--text", + help="Text to be synthesized", + type=str, + default="Text to be synthesized.", + ) + parser.add_argument( + "--checkpoint_path", + type=str, + ) + parser.add_argument( + "--vocoder_path", type=str, help="Checkpoint path of the vocoder" + ) + parser.add_argument( + "--vocoder_config_path", type=str, help="Config path of the vocoder" + ) + parser.add_argument( + "--output_dir", + type=str, + default=None, + help="Output dir for saving generated results", + ) + parser.add_argument( + "--num_steps", + type=int, + default=200, + help="The total number of denosing steps", + ) + parser.add_argument( + "--guidance_scale", + type=float, + default=4.0, + help="The scale of classifer free guidance", + ) + parser.add_argument("--local_rank", default=-1, type=int) + return parser + + +def main(): + # Parse arguments + args = build_parser().parse_args() + # args, infer_type = formulate_parser(args) + + # Parse config + cfg = load_config(args.config) + if torch.cuda.is_available(): + args.local_rank = torch.device("cuda") + else: + args.local_rank = torch.device("cpu") + print("args: ", args) + + # Build inference + inferencer = build_inference(args, cfg) + + # Run inference + inferencer.inference() + + +if __name__ == "__main__": + main() diff --git a/bins/tta/preprocess.py b/bins/tta/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..58c73bf7c60fdc8d5762313504ba7e470817d16c --- /dev/null +++ b/bins/tta/preprocess.py @@ -0,0 +1,195 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import faulthandler + +faulthandler.enable() + +import os +import argparse +import json +import pyworld as pw +from multiprocessing import cpu_count + + +from utils.util import load_config +from preprocessors.processor import preprocess_dataset, prepare_align +from preprocessors.metadata import cal_metadata +from processors import acoustic_extractor, content_extractor, data_augment + + +def extract_acoustic_features(dataset, output_path, cfg, n_workers=1): + """Extract acoustic features of utterances in the dataset + + Args: + dataset (str): name of dataset, e.g. opencpop + output_path (str): directory that stores train, test and feature files of datasets + cfg (dict): dictionary that stores configurations + n_workers (int, optional): num of processes to extract features in parallel. Defaults to 1. 
+ """ + types = ["train", "test"] if "eval" not in dataset else ["test"] + metadata = [] + for dataset_type in types: + dataset_output = os.path.join(output_path, dataset) + dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type)) + with open(dataset_file, "r") as f: + metadata.extend(json.load(f)) + + # acoustic_extractor.extract_utt_acoustic_features_parallel( + # metadata, dataset_output, cfg, n_workers=n_workers + # ) + acoustic_extractor.extract_utt_acoustic_features_serial( + metadata, dataset_output, cfg + ) + + +def extract_content_features(dataset, output_path, cfg, num_workers=1): + """Extract content features of utterances in the dataset + + Args: + dataset (str): name of dataset, e.g. opencpop + output_path (str): directory that stores train, test and feature files of datasets + cfg (dict): dictionary that stores configurations + """ + types = ["train", "test"] if "eval" not in dataset else ["test"] + metadata = [] + for dataset_type in types: + dataset_output = os.path.join(output_path, dataset) + dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type)) + with open(dataset_file, "r") as f: + metadata.extend(json.load(f)) + + content_extractor.extract_utt_content_features_dataloader( + cfg, metadata, num_workers + ) + + +def preprocess(cfg, args): + """Proprocess raw data of single or multiple datasets (in cfg.dataset) + + Args: + cfg (dict): dictionary that stores configurations + args (ArgumentParser): specify the configuration file and num_workers + """ + # Specify the output root path to save the processed data + output_path = cfg.preprocess.processed_dir + os.makedirs(output_path, exist_ok=True) + + ## Split train and test sets + for dataset in cfg.dataset: + print("Preprocess {}...".format(dataset)) + + if args.prepare_alignment: + ## Prepare alignment with MFA + print("Prepare alignment {}...".format(dataset)) + prepare_align( + dataset, cfg.dataset_path[dataset], cfg.preprocess, output_path + ) + preprocess_dataset( + dataset, + cfg.dataset_path[dataset], + output_path, + cfg.preprocess, + cfg.task_type, + is_custom_dataset=dataset in cfg.use_custom_dataset, + ) + + # Data augmentation: create new wav files with pitch shift, formant shift, equalizer, time stretch + try: + assert isinstance( + cfg.preprocess.data_augment, list + ), "Please provide a list of datasets need to be augmented." + if len(cfg.preprocess.data_augment) > 0: + new_datasets_list = [] + for dataset in cfg.preprocess.data_augment: + new_datasets = data_augment.augment_dataset(cfg, dataset) + new_datasets_list.extend(new_datasets) + cfg.dataset.extend(new_datasets_list) + print("Augmentation datasets: ", cfg.dataset) + except: + print("No Data Augmentation.") + + # Dump metadata of datasets (singers, train/test durations, etc.) 
+ cal_metadata(cfg) + + ## Prepare the acoustic features + for dataset in cfg.dataset: + # Skip augmented datasets which do not need to extract acoustic features + # We will copy acoustic features from the original dataset later + if ( + "pitch_shift" in dataset + or "formant_shift" in dataset + or "equalizer" in dataset in dataset + ): + continue + print( + "Extracting acoustic features for {} using {} workers ...".format( + dataset, args.num_workers + ) + ) + extract_acoustic_features(dataset, output_path, cfg, args.num_workers) + # Calculate the statistics of acoustic features + if cfg.preprocess.mel_min_max_norm: + acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg) + + if cfg.preprocess.extract_pitch: + acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg) + if cfg.preprocess.extract_energy: + acoustic_extractor.cal_energy_statistics(dataset, output_path, cfg) + + if cfg.preprocess.align_mel_duration: + acoustic_extractor.align_duration_mel(dataset, output_path, cfg) + + # Copy acoustic features for augmented datasets by creating soft-links + for dataset in cfg.dataset: + if "pitch_shift" in dataset: + src_dataset = dataset.replace("_pitch_shift", "") + src_dataset_dir = os.path.join(output_path, src_dataset) + elif "formant_shift" in dataset: + src_dataset = dataset.replace("_formant_shift", "") + src_dataset_dir = os.path.join(output_path, src_dataset) + elif "equalizer" in dataset: + src_dataset = dataset.replace("_equalizer", "") + src_dataset_dir = os.path.join(output_path, src_dataset) + else: + continue + dataset_dir = os.path.join(output_path, dataset) + metadata = [] + for split in ["train", "test"] if not "eval" in dataset else ["test"]: + metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split)) + with open(metadata_file_path, "r") as f: + metadata.extend(json.load(f)) + print("Copying acoustic features for {}...".format(dataset)) + acoustic_extractor.copy_acoustic_features( + metadata, dataset_dir, src_dataset_dir, cfg + ) + if cfg.preprocess.mel_min_max_norm: + acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg) + + if cfg.preprocess.extract_pitch: + acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg) + + # Prepare the content features + for dataset in cfg.dataset: + print("Extracting content features for {}...".format(dataset)) + extract_content_features(dataset, output_path, cfg, args.num_workers) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", default="config.json", help="json files for configurations." + ) + parser.add_argument("--num_workers", type=int, default=int(cpu_count())) + parser.add_argument("--prepare_alignment", type=bool, default=False) + + args = parser.parse_args() + cfg = load_config(args.config) + + preprocess(cfg, args) + + +if __name__ == "__main__": + main() diff --git a/bins/tta/train_tta.py b/bins/tta/train_tta.py new file mode 100644 index 0000000000000000000000000000000000000000..42d8210b37b8a91db873e23e4fc40263acf0f06b --- /dev/null +++ b/bins/tta/train_tta.py @@ -0,0 +1,77 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
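+# Illustrative usage (flags as defined below; the config path is a placeholder):
+#
+#   python bins/tta/train_tta.py \
+#       --config <path/to/autoencoderkl_or_audioldm_config.json> \
+#       --exp_name my_tta_exp \
+#       --num_workers 6 \
+#       --log_level info
+#
+# Pass --resume <model_name> to restore a previously saved model before training.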
+ +import argparse +import os +import torch + +from models.tta.autoencoder.autoencoder_trainer import AutoencoderKLTrainer +from models.tta.ldm.audioldm_trainer import AudioLDMTrainer +from utils.util import load_config + + +def build_trainer(args, cfg): + supported_trainer = { + "AutoencoderKL": AutoencoderKLTrainer, + "AudioLDM": AudioLDMTrainer, + } + + trainer_class = supported_trainer[cfg.model_type] + trainer = trainer_class(args, cfg) + return trainer + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", + default="config.json", + help="json files for configurations.", + required=True, + ) + parser.add_argument( + "--num_workers", type=int, default=6, help="Number of dataloader workers." + ) + parser.add_argument( + "--exp_name", + type=str, + default="exp_name", + help="A specific name to note the experiment", + required=True, + ) + parser.add_argument( + "--resume", + type=str, + default=None, + # action="store_true", + help="The model name to restore", + ) + parser.add_argument( + "--log_level", default="info", help="logging level (info, debug, warning)" + ) + parser.add_argument("--stdout_interval", default=5, type=int) + parser.add_argument("--local_rank", default=-1, type=int) + args = parser.parse_args() + cfg = load_config(args.config) + cfg.exp_name = args.exp_name + + # Model saving dir + args.log_dir = os.path.join(cfg.log_dir, args.exp_name) + os.makedirs(args.log_dir, exist_ok=True) + + if not cfg.train.ddp: + args.local_rank = torch.device("cuda") + + # Build trainer + trainer = build_trainer(args, cfg) + + # Restore models + if args.resume: + trainer.restore() + trainer.train() + + +if __name__ == "__main__": + main() diff --git a/bins/tts/inference.py b/bins/tts/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..f9a5c08f8c20852f4874bce4dbb20dbed24f1f68 --- /dev/null +++ b/bins/tts/inference.py @@ -0,0 +1,169 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
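+# Illustrative single-sentence usage (flags as defined in build_parser() below;
+# paths are placeholders):
+#
+#   python bins/tts/inference.py \
+#       --config <path/to/exp_config.json> \
+#       --checkpoint_path <path/to/acoustic_checkpoint> \
+#       --vocoder_dir <path/to/vocoder_ckpt_dir> \
+#       --mode single \
+#       --text "Hello world." \
+#       --output_dir <path/to/output_dir>
+#
+# Use --mode batch together with --dataset and --testing_set to synthesize a test set.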
+ +import argparse +from argparse import ArgumentParser +import os + +from models.tts.fastspeech2.fs2_inference import FastSpeech2Inference +from models.tts.vits.vits_inference import VitsInference +from models.tts.valle.valle_inference import VALLEInference +from models.tts.naturalspeech2.ns2_inference import NS2Inference +from models.tts.jets.jets_inference import JetsInference +from utils.util import load_config +import torch + + +def build_inference(args, cfg): + supported_inference = { + "FastSpeech2": FastSpeech2Inference, + "VITS": VitsInference, + "VALLE": VALLEInference, + "NaturalSpeech2": NS2Inference, + "Jets": JetsInference, + } + + inference_class = supported_inference[cfg.model_type] + inference = inference_class(args, cfg) + return inference + + +def cuda_relevant(deterministic=False): + torch.cuda.empty_cache() + # TF32 on Ampere and above + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.enabled = True + torch.backends.cudnn.allow_tf32 = True + # Deterministic + torch.backends.cudnn.deterministic = deterministic + torch.backends.cudnn.benchmark = not deterministic + torch.use_deterministic_algorithms(deterministic) + + +def build_parser(): + parser = argparse.ArgumentParser() + + parser.add_argument( + "--config", + type=str, + required=True, + help="JSON/YAML file for configurations.", + ) + parser.add_argument( + "--dataset", + type=str, + help="convert from the source data", + default=None, + ) + parser.add_argument( + "--testing_set", + type=str, + help="train, test, golden_test", + default="test", + ) + parser.add_argument( + "--test_list_file", + type=str, + help="convert from the test list file", + default=None, + ) + parser.add_argument( + "--speaker_name", + type=str, + default=None, + help="speaker name for multi-speaker synthesis, for single-sentence mode only", + ) + parser.add_argument( + "--text", + help="Text to be synthesized.", + type=str, + default="", + ) + parser.add_argument( + "--vocoder_dir", + type=str, + default=None, + help="Vocoder checkpoint directory. Searching behavior is the same as " + "the acoustics one.", + ) + parser.add_argument( + "--acoustics_dir", + type=str, + default=None, + help="Acoustic model checkpoint directory. If a directory is given, " + "search for the latest checkpoint dir in the directory. If a specific " + "checkpoint dir is given, directly load the checkpoint.", + ) + parser.add_argument( + "--checkpoint_path", + type=str, + default=None, + help="Acoustic model checkpoint directory. If a directory is given, " + "search for the latest checkpoint dir in the directory. If a specific " + "checkpoint dir is given, directly load the checkpoint.", + ) + parser.add_argument( + "--mode", + type=str, + choices=["batch", "single"], + required=True, + help="Synthesize a whole dataset or a single sentence", + ) + parser.add_argument( + "--log_level", + type=str, + default="warning", + help="Logging level. 
Default: warning", + ) + parser.add_argument( + "--pitch_control", + type=float, + default=1.0, + help="control the pitch of the whole utterance, larger value for higher pitch", + ) + parser.add_argument( + "--energy_control", + type=float, + default=1.0, + help="control the energy of the whole utterance, larger value for larger volume", + ) + parser.add_argument( + "--duration_control", + type=float, + default=1.0, + help="control the speed of the whole utterance, larger value for slower speaking rate", + ) + parser.add_argument( + "--output_dir", + type=str, + default=None, + help="Output dir for saving generated results", + ) + return parser + + +def main(): + # Parse arguments + parser = build_parser() + VALLEInference.add_arguments(parser) + NS2Inference.add_arguments(parser) + args = parser.parse_args() + print(args) + + # Parse config + cfg = load_config(args.config) + + # CUDA settings + cuda_relevant() + + # Build inference + inferencer = build_inference(args, cfg) + + # Run inference + inferencer.inference() + + +if __name__ == "__main__": + main() diff --git a/bins/tts/preprocess.py b/bins/tts/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..914c0b446b4ce46fd50bd95e54b6624b6eedfec1 --- /dev/null +++ b/bins/tts/preprocess.py @@ -0,0 +1,244 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import faulthandler + +faulthandler.enable() + +import os +import argparse +import json +import pyworld as pw +from multiprocessing import cpu_count + + +from utils.util import load_config +from preprocessors.processor import preprocess_dataset, prepare_align +from preprocessors.metadata import cal_metadata +from processors import ( + acoustic_extractor, + content_extractor, + data_augment, + phone_extractor, +) + + +def extract_acoustic_features(dataset, output_path, cfg, dataset_types, n_workers=1): + """Extract acoustic features of utterances in the dataset + + Args: + dataset (str): name of dataset, e.g. opencpop + output_path (str): directory that stores train, test and feature files of datasets + cfg (dict): dictionary that stores configurations + n_workers (int, optional): num of processes to extract features in parallel. Defaults to 1. + """ + + metadata = [] + for dataset_type in dataset_types: + dataset_output = os.path.join(output_path, dataset) + dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type)) + with open(dataset_file, "r") as f: + metadata.extend(json.load(f)) + + # acoustic_extractor.extract_utt_acoustic_features_parallel( + # metadata, dataset_output, cfg, n_workers=n_workers + # ) + acoustic_extractor.extract_utt_acoustic_features_serial( + metadata, dataset_output, cfg + ) + + +def extract_content_features(dataset, output_path, cfg, dataset_types, num_workers=1): + """Extract content features of utterances in the dataset + + Args: + dataset (str): name of dataset, e.g. 
opencpop + output_path (str): directory that stores train, test and feature files of datasets + cfg (dict): dictionary that stores configurations + """ + + metadata = [] + for dataset_type in dataset_types: + dataset_output = os.path.join(output_path, dataset) + # dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type)) + dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type)) + with open(dataset_file, "r") as f: + metadata.extend(json.load(f)) + + content_extractor.extract_utt_content_features_dataloader( + cfg, metadata, num_workers + ) + + +def extract_phonme_sequences(dataset, output_path, cfg, dataset_types): + """Extract phoneme features of utterances in the dataset + + Args: + dataset (str): name of dataset, e.g. opencpop + output_path (str): directory that stores train, test and feature files of datasets + cfg (dict): dictionary that stores configurations + + """ + + metadata = [] + for dataset_type in dataset_types: + dataset_output = os.path.join(output_path, dataset) + dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type)) + with open(dataset_file, "r") as f: + metadata.extend(json.load(f)) + phone_extractor.extract_utt_phone_sequence(dataset, cfg, metadata) + + +def preprocess(cfg, args): + """Preprocess raw data of single or multiple datasets (in cfg.dataset) + + Args: + cfg (dict): dictionary that stores configurations + args (ArgumentParser): specify the configuration file and num_workers + """ + # Specify the output root path to save the processed data + output_path = cfg.preprocess.processed_dir + os.makedirs(output_path, exist_ok=True) + + # Split train and test sets + for dataset in cfg.dataset: + print("Preprocess {}...".format(dataset)) + + if args.prepare_alignment: + # Prepare alignment with MFA + print("Prepare alignment {}...".format(dataset)) + prepare_align( + dataset, cfg.dataset_path[dataset], cfg.preprocess, output_path + ) + + preprocess_dataset( + dataset, + cfg.dataset_path[dataset], + output_path, + cfg.preprocess, + cfg.task_type, + is_custom_dataset=dataset in cfg.use_custom_dataset, + ) + + # Data augmentation: create new wav files with pitch shift, formant shift, equalizer, time stretch + try: + assert isinstance( + cfg.preprocess.data_augment, list + ), "Please provide a list of datasets need to be augmented." + if len(cfg.preprocess.data_augment) > 0: + new_datasets_list = [] + for dataset in cfg.preprocess.data_augment: + new_datasets = data_augment.augment_dataset(cfg, dataset) + new_datasets_list.extend(new_datasets) + cfg.dataset.extend(new_datasets_list) + print("Augmentation datasets: ", cfg.dataset) + except: + print("No Data Augmentation.") + + # json files + dataset_types = list() + dataset_types.append((cfg.preprocess.train_file).split(".")[0]) + dataset_types.append((cfg.preprocess.valid_file).split(".")[0]) + if "test" not in dataset_types: + dataset_types.append("test") + if "eval" in dataset: + dataset_types = ["test"] + + # Dump metadata of datasets (singers, train/test durations, etc.) 
+ cal_metadata(cfg, dataset_types) + + # Prepare the acoustic features + for dataset in cfg.dataset: + # Skip augmented datasets which do not need to extract acoustic features + # We will copy acoustic features from the original dataset later + if ( + "pitch_shift" in dataset + or "formant_shift" in dataset + or "equalizer" in dataset in dataset + ): + continue + print( + "Extracting acoustic features for {} using {} workers ...".format( + dataset, args.num_workers + ) + ) + extract_acoustic_features( + dataset, output_path, cfg, dataset_types, args.num_workers + ) + # Calculate the statistics of acoustic features + if cfg.preprocess.mel_min_max_norm: + acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg) + + if cfg.preprocess.extract_pitch: + acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg) + + if cfg.preprocess.extract_energy: + acoustic_extractor.cal_energy_statistics(dataset, output_path, cfg) + + if cfg.preprocess.pitch_norm: + acoustic_extractor.normalize(dataset, cfg.preprocess.pitch_dir, cfg) + + if cfg.preprocess.energy_norm: + acoustic_extractor.normalize(dataset, cfg.preprocess.energy_dir, cfg) + + # Copy acoustic features for augmented datasets by creating soft-links + for dataset in cfg.dataset: + if "pitch_shift" in dataset: + src_dataset = dataset.replace("_pitch_shift", "") + src_dataset_dir = os.path.join(output_path, src_dataset) + elif "formant_shift" in dataset: + src_dataset = dataset.replace("_formant_shift", "") + src_dataset_dir = os.path.join(output_path, src_dataset) + elif "equalizer" in dataset: + src_dataset = dataset.replace("_equalizer", "") + src_dataset_dir = os.path.join(output_path, src_dataset) + else: + continue + dataset_dir = os.path.join(output_path, dataset) + metadata = [] + for split in ["train", "test"] if not "eval" in dataset else ["test"]: + metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split)) + with open(metadata_file_path, "r") as f: + metadata.extend(json.load(f)) + print("Copying acoustic features for {}...".format(dataset)) + acoustic_extractor.copy_acoustic_features( + metadata, dataset_dir, src_dataset_dir, cfg + ) + if cfg.preprocess.mel_min_max_norm: + acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg) + + if cfg.preprocess.extract_pitch: + acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg) + + # Prepare the content features + for dataset in cfg.dataset: + print("Extracting content features for {}...".format(dataset)) + extract_content_features( + dataset, output_path, cfg, dataset_types, args.num_workers + ) + + # Prepare the phenome squences + if cfg.preprocess.extract_phone: + for dataset in cfg.dataset: + print("Extracting phoneme sequence for {}...".format(dataset)) + extract_phonme_sequences(dataset, output_path, cfg, dataset_types) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", default="config.json", help="json files for configurations." + ) + parser.add_argument("--num_workers", type=int, default=int(cpu_count())) + parser.add_argument("--prepare_alignment", type=bool, default=False) + + args = parser.parse_args() + cfg = load_config(args.config) + + preprocess(cfg, args) + + +if __name__ == "__main__": + main() diff --git a/bins/tts/train.py b/bins/tts/train.py new file mode 100644 index 0000000000000000000000000000000000000000..b3a34e46644782da962e1afadaaa5fdf215dded4 --- /dev/null +++ b/bins/tts/train.py @@ -0,0 +1,152 @@ +# Copyright (c) 2023 Amphion. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse + +import torch + +from models.tts.fastspeech2.fs2_trainer import FastSpeech2Trainer +from models.tts.vits.vits_trainer import VITSTrainer +from models.tts.valle.valle_trainer import VALLETrainer +from models.tts.naturalspeech2.ns2_trainer import NS2Trainer +from models.tts.valle_v2.valle_ar_trainer import ValleARTrainer as VALLE_V2_AR +from models.tts.valle_v2.valle_nar_trainer import ValleNARTrainer as VALLE_V2_NAR +from models.tts.jets.jets_trainer import JetsTrainer + +from utils.util import load_config + + +def build_trainer(args, cfg): + supported_trainer = { + "FastSpeech2": FastSpeech2Trainer, + "VITS": VITSTrainer, + "VALLE": VALLETrainer, + "NaturalSpeech2": NS2Trainer, + "VALLE_V2_AR": VALLE_V2_AR, + "VALLE_V2_NAR": VALLE_V2_NAR, + "Jets": JetsTrainer, + } + + trainer_class = supported_trainer[cfg.model_type] + trainer = trainer_class(args, cfg) + return trainer + + +def cuda_relevant(deterministic=False): + torch.cuda.empty_cache() + # TF32 on Ampere and above + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.enabled = True + torch.backends.cudnn.benchmark = False + torch.backends.cudnn.allow_tf32 = True + # Deterministic + torch.backends.cudnn.deterministic = deterministic + torch.backends.cudnn.benchmark = not deterministic + torch.use_deterministic_algorithms(deterministic) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", + default="config.json", + help="json files for configurations.", + required=True, + ) + parser.add_argument( + "--seed", + type=int, + default=1234, + help="random seed", + required=False, + ) + parser.add_argument( + "--exp_name", + type=str, + default="exp_name", + help="A specific name to note the experiment", + required=True, + ) + parser.add_argument( + "--resume", action="store_true", help="The model name to restore" + ) + parser.add_argument( + "--test", action="store_true", default=False, help="Test the model" + ) + parser.add_argument( + "--log_level", default="warning", help="logging level (debug, info, warning)" + ) + parser.add_argument( + "--resume_type", + type=str, + default="resume", + help="Resume training or finetuning.", + ) + parser.add_argument( + "--checkpoint_path", + type=str, + default=None, + help="Checkpoint for resume training or finetuning.", + ) + parser.add_argument( + "--resume_from_ckpt_path", + type=str, + default="", + help="Checkpoint for resume training or finetuning.", + ) + # VALLETrainer.add_arguments(parser) + args = parser.parse_args() + cfg = load_config(args.config) + + # Data Augmentation + if hasattr(cfg, "preprocess"): + if hasattr(cfg.preprocess, "data_augment"): + if ( + type(cfg.preprocess.data_augment) == list + and len(cfg.preprocess.data_augment) > 0 + ): + new_datasets_list = [] + for dataset in cfg.preprocess.data_augment: + new_datasets = [ + ( + f"{dataset}_pitch_shift" + if cfg.preprocess.use_pitch_shift + else None + ), + ( + f"{dataset}_formant_shift" + if cfg.preprocess.use_formant_shift + else None + ), + ( + f"{dataset}_equalizer" + if cfg.preprocess.use_equalizer + else None + ), + ( + f"{dataset}_time_stretch" + if cfg.preprocess.use_time_stretch + else None + ), + ] + new_datasets_list.extend(filter(None, new_datasets)) + cfg.dataset.extend(new_datasets_list) + + print("experiment name: ", args.exp_name) + # # CUDA settings + cuda_relevant() + + # Build trainer + print(f"Building 
{cfg.model_type} trainer") + trainer = build_trainer(args, cfg) + print(f"Start training {cfg.model_type} model") + if args.test: + trainer.test_loop() + else: + trainer.train_loop() + + +if __name__ == "__main__": + main() diff --git a/bins/vocoder/inference.py b/bins/vocoder/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..25bbb59ddf2beaa314accbaed3fc5154e57cbac7 --- /dev/null +++ b/bins/vocoder/inference.py @@ -0,0 +1,115 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os + +import torch + +from models.vocoders.vocoder_inference import VocoderInference +from utils.util import load_config + + +def build_inference(args, cfg, infer_type="infer_from_dataset"): + supported_inference = { + "GANVocoder": VocoderInference, + "DiffusionVocoder": VocoderInference, + } + + inference_class = supported_inference[cfg.model_type] + return inference_class(args, cfg, infer_type) + + +def cuda_relevant(deterministic=False): + torch.cuda.empty_cache() + # TF32 on Ampere and above + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.enabled = True + torch.backends.cudnn.allow_tf32 = True + # Deterministic + torch.backends.cudnn.deterministic = deterministic + torch.backends.cudnn.benchmark = not deterministic + torch.use_deterministic_algorithms(deterministic) + + +def build_parser(): + r"""Build argument parser for inference.py. + Anything else should be put in an extra config YAML file. + """ + + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", + type=str, + required=True, + help="JSON/YAML file for configurations.", + ) + parser.add_argument( + "--infer_mode", + type=str, + required=None, + ) + parser.add_argument( + "--infer_datasets", + nargs="+", + default=None, + ) + parser.add_argument( + "--feature_folder", + type=str, + default=None, + ) + parser.add_argument( + "--audio_folder", + type=str, + default=None, + ) + parser.add_argument( + "--vocoder_dir", + type=str, + required=True, + help="Vocoder checkpoint directory. Searching behavior is the same as " + "the acoustics one.", + ) + parser.add_argument( + "--output_dir", + type=str, + default="result", + help="Output directory. Default: ./result", + ) + parser.add_argument( + "--log_level", + type=str, + default="warning", + help="Logging level. Default: warning", + ) + parser.add_argument( + "--keep_cache", + action="store_true", + default=False, + help="Keep cache files. Only applicable to inference from files.", + ) + return parser + + +def main(): + # Parse arguments + args = build_parser().parse_args() + + # Parse config + cfg = load_config(args.config) + + # CUDA settings + cuda_relevant() + + # Build inference + trainer = build_inference(args, cfg, args.infer_mode) + + # Run inference + trainer.inference() + + +if __name__ == "__main__": + main() diff --git a/bins/vocoder/preprocess.py b/bins/vocoder/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..23f756ddcbe58a9183aed57b2b9c81552ce00cd9 --- /dev/null +++ b/bins/vocoder/preprocess.py @@ -0,0 +1,151 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import faulthandler + +faulthandler.enable() + +import os +import argparse +import json +import pyworld as pw +from multiprocessing import cpu_count + + +from utils.util import load_config +from preprocessors.processor import preprocess_dataset, prepare_align +from preprocessors.metadata import cal_metadata +from processors import acoustic_extractor, content_extractor, data_augment + + +def extract_acoustic_features(dataset, output_path, cfg, n_workers=1): + """Extract acoustic features of utterances in the dataset + + Args: + dataset (str): name of dataset, e.g. opencpop + output_path (str): directory that stores train, test and feature files of datasets + cfg (dict): dictionary that stores configurations + n_workers (int, optional): num of processes to extract features in parallel. Defaults to 1. + """ + types = ["train", "test"] if "eval" not in dataset else ["test"] + metadata = [] + for dataset_type in types: + dataset_output = os.path.join(output_path, dataset) + dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type)) + with open(dataset_file, "r") as f: + metadata.extend(json.load(f)) + + acoustic_extractor.extract_utt_acoustic_features_serial( + metadata, dataset_output, cfg + ) + + +def preprocess(cfg, args): + """Preprocess raw data of single or multiple datasets (in cfg.dataset) + + Args: + cfg (dict): dictionary that stores configurations + args (ArgumentParser): specify the configuration file and num_workers + """ + # Specify the output root path to save the processed data + output_path = cfg.preprocess.processed_dir + os.makedirs(output_path, exist_ok=True) + + ## Split train and test sets + for dataset in cfg.dataset: + print("Preprocess {}...".format(dataset)) + + preprocess_dataset( + dataset, + cfg.dataset_path[dataset], + output_path, + cfg.preprocess, + cfg.task_type, + is_custom_dataset=dataset in cfg.use_custom_dataset, + ) + + # Data augmentation: create new wav files with pitch shift, formant shift, equalizer, time stretch + try: + assert isinstance( + cfg.preprocess.data_augment, list + ), "Please provide a list of datasets that need to be augmented." + if len(cfg.preprocess.data_augment) > 0: + new_datasets_list = [] + for dataset in cfg.preprocess.data_augment: + new_datasets = data_augment.augment_dataset(cfg, dataset) + new_datasets_list.extend(new_datasets) + cfg.dataset.extend(new_datasets_list) + print("Augmentation datasets: ", cfg.dataset) + except: + print("No Data Augmentation.") + + # Dump metadata of datasets (singers, train/test durations, etc.)
+ cal_metadata(cfg) + + ## Prepare the acoustic features + for dataset in cfg.dataset: + # Skip augmented datasets which do not need to extract acoustic features + # We will copy acoustic features from the original dataset later + if ( + "pitch_shift" in dataset + or "formant_shift" in dataset + or "equalizer" in dataset + ): + continue + print( + "Extracting acoustic features for {} using {} workers ...".format( + dataset, args.num_workers + ) + ) + extract_acoustic_features(dataset, output_path, cfg, args.num_workers) + # Calculate the statistics of acoustic features + if cfg.preprocess.mel_min_max_norm: + acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg) + + # Copy acoustic features for augmented datasets by creating soft-links + for dataset in cfg.dataset: + if "pitch_shift" in dataset: + src_dataset = dataset.replace("_pitch_shift", "") + src_dataset_dir = os.path.join(output_path, src_dataset) + elif "formant_shift" in dataset: + src_dataset = dataset.replace("_formant_shift", "") + src_dataset_dir = os.path.join(output_path, src_dataset) + elif "equalizer" in dataset: + src_dataset = dataset.replace("_equalizer", "") + src_dataset_dir = os.path.join(output_path, src_dataset) + else: + continue + dataset_dir = os.path.join(output_path, dataset) + metadata = [] + for split in ["train", "test"] if not "eval" in dataset else ["test"]: + metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split)) + with open(metadata_file_path, "r") as f: + metadata.extend(json.load(f)) + print("Copying acoustic features for {}...".format(dataset)) + acoustic_extractor.copy_acoustic_features( + metadata, dataset_dir, src_dataset_dir, cfg + ) + if cfg.preprocess.mel_min_max_norm: + acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg) + + if cfg.preprocess.extract_pitch: + acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", default="config.json", help="json files for configurations." + ) + parser.add_argument("--num_workers", type=int, default=int(cpu_count())) + + args = parser.parse_args() + cfg = load_config(args.config) + + preprocess(cfg, args) + + +if __name__ == "__main__": + main() diff --git a/bins/vocoder/train.py b/bins/vocoder/train.py new file mode 100644 index 0000000000000000000000000000000000000000..51dc9c0fa537b1e5d0dafa6f837d9dd5093a5563 --- /dev/null +++ b/bins/vocoder/train.py @@ -0,0 +1,93 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree.
+ +import argparse + +import torch + +from models.vocoders.gan.gan_vocoder_trainer import GANVocoderTrainer +from models.vocoders.diffusion.diffusion_vocoder_trainer import DiffusionVocoderTrainer + +from utils.util import load_config + + +def build_trainer(args, cfg): + supported_trainer = { + "GANVocoder": GANVocoderTrainer, + "DiffusionVocoder": DiffusionVocoderTrainer, + } + + trainer_class = supported_trainer[cfg.model_type] + trainer = trainer_class(args, cfg) + return trainer + + +def cuda_relevant(deterministic=False): + torch.cuda.empty_cache() + # TF32 on Ampere and above + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.enabled = True + torch.backends.cudnn.allow_tf32 = True + # Deterministic + torch.backends.cudnn.deterministic = deterministic + torch.backends.cudnn.benchmark = not deterministic + torch.use_deterministic_algorithms(deterministic) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", + default="config.json", + help="json files for configurations.", + required=True, + ) + parser.add_argument( + "--exp_name", + type=str, + default="exp_name", + help="A specific name to note the experiment", + required=True, + ) + parser.add_argument( + "--resume_type", + type=str, + help="resume for continue to train, finetune for finetuning", + ) + parser.add_argument( + "--checkpoint", + type=str, + help="checkpoint to resume", + ) + parser.add_argument( + "--log_level", default="warning", help="logging level (debug, info, warning)" + ) + args = parser.parse_args() + cfg = load_config(args.config) + + # Data Augmentation + if cfg.preprocess.data_augment: + new_datasets_list = [] + for dataset in cfg.preprocess.data_augment: + new_datasets = [ + # f"{dataset}_pitch_shift", + # f"{dataset}_formant_shift", + f"{dataset}_equalizer", + f"{dataset}_time_stretch", + ] + new_datasets_list.extend(new_datasets) + cfg.dataset.extend(new_datasets_list) + + # CUDA settings + cuda_relevant() + + # Build trainer + trainer = build_trainer(args, cfg) + + trainer.train_loop() + + +if __name__ == "__main__": + main() diff --git a/config/audioldm.json b/config/audioldm.json new file mode 100644 index 0000000000000000000000000000000000000000..c0cab48cda2f00ac6c1687da5ab3b6d6a263e317 --- /dev/null +++ b/config/audioldm.json @@ -0,0 +1,92 @@ +{ + "base_config": "config/base.json", + "model_type": "AudioLDM", + "task_type": "tta", + "dataset": [ + "AudioCaps" + ], + "preprocess": { + // feature used for model training + "use_spkid": false, + "use_uv": false, + "use_frame_pitch": false, + "use_phone_pitch": false, + "use_frame_energy": false, + "use_phone_energy": false, + "use_mel": false, + "use_audio": false, + "use_label": false, + "use_one_hot": false, + "cond_mask_prob": 0.1 + }, + // model + "model": { + "audioldm": { + "image_size": 32, + "in_channels": 4, + "out_channels": 4, + "model_channels": 256, + "attention_resolutions": [ + 4, + 2, + 1 + ], + "num_res_blocks": 2, + "channel_mult": [ + 1, + 2, + 4 + ], + "num_heads": 8, + "use_spatial_transformer": true, + "transformer_depth": 1, + "context_dim": 768, + "use_checkpoint": true, + "legacy": false + }, + "autoencoderkl": { + "ch": 128, + "ch_mult": [ + 1, + 1, + 2, + 2, + 4 + ], + "num_res_blocks": 2, + "in_channels": 1, + "z_channels": 4, + "out_ch": 1, + "double_z": true + }, + "noise_scheduler": { + "num_train_timesteps": 1000, + "beta_start": 0.00085, + "beta_end": 0.012, + "beta_schedule": "scaled_linear", + "clip_sample": false, + "steps_offset": 1, + "set_alpha_to_one": false, + 
"skip_prk_steps": true, + "prediction_type": "epsilon" + } + }, + // train + "train": { + "lronPlateau": { + "factor": 0.9, + "patience": 100, + "min_lr": 4.0e-5, + "verbose": true + }, + "adam": { + "lr": 5.0e-5, + "betas": [ + 0.9, + 0.999 + ], + "weight_decay": 1.0e-2, + "eps": 1.0e-8 + } + } +} \ No newline at end of file diff --git a/config/autoencoderkl.json b/config/autoencoderkl.json new file mode 100644 index 0000000000000000000000000000000000000000..72e677770da689af33d4c7bd6f3506927fb24380 --- /dev/null +++ b/config/autoencoderkl.json @@ -0,0 +1,69 @@ +{ + "base_config": "config/base.json", + "model_type": "AutoencoderKL", + "task_type": "tta", + "dataset": [ + "AudioCaps" + ], + "preprocess": { + // feature used for model training + "use_spkid": false, + "use_uv": false, + "use_frame_pitch": false, + "use_phone_pitch": false, + "use_frame_energy": false, + "use_phone_energy": false, + "use_mel": false, + "use_audio": false, + "use_label": false, + "use_one_hot": false + }, + // model + "model": { + "autoencoderkl": { + "ch": 128, + "ch_mult": [ + 1, + 1, + 2, + 2, + 4 + ], + "num_res_blocks": 2, + "in_channels": 1, + "z_channels": 4, + "out_ch": 1, + "double_z": true + }, + "loss": { + "kl_weight": 1e-8, + "disc_weight": 0.5, + "disc_factor": 1.0, + "logvar_init": 0.0, + "min_adapt_d_weight": 0.0, + "max_adapt_d_weight": 10.0, + "disc_start": 50001, + "disc_in_channels": 1, + "disc_num_layers": 3, + "use_actnorm": false + } + }, + // train + "train": { + "lronPlateau": { + "factor": 0.9, + "patience": 100, + "min_lr": 4.0e-5, + "verbose": true + }, + "adam": { + "lr": 4.0e-4, + "betas": [ + 0.9, + 0.999 + ], + "weight_decay": 1.0e-2, + "eps": 1.0e-8 + } + } +} \ No newline at end of file diff --git a/config/base.json b/config/base.json new file mode 100644 index 0000000000000000000000000000000000000000..144d07596d674145b28bb26e422bcec63fee2ff4 --- /dev/null +++ b/config/base.json @@ -0,0 +1,185 @@ +{ + "supported_model_type": [ + "GANVocoder", + "Fastspeech2", + "DiffSVC", + "Transformer", + "EDM", + "CD" + ], + "task_type": "", + "dataset": [], + "use_custom_dataset": [], + "preprocess": { + "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon" + // trim audio silence + "data_augment": false, + "trim_silence": false, + "num_silent_frames": 8, + "trim_fft_size": 512, // fft size used in trimming + "trim_hop_size": 128, // hop size used in trimming + "trim_top_db": 30, // top db used in trimming sensitive to each dataset + // acoustic features + "extract_mel": false, + "mel_extract_mode": "", + "extract_linear_spec": false, + "extract_mcep": false, + "extract_pitch": false, + "extract_acoustic_token": false, + "pitch_remove_outlier": false, + "extract_uv": false, + "pitch_norm": false, + "extract_audio": false, + "extract_label": false, + "pitch_extractor": "parselmouth", // pyin, dio, pyworld, pyreaper, parselmouth, CWT (Continuous Wavelet Transform) + "extract_energy": false, + "energy_remove_outlier": false, + "energy_norm": false, + "energy_extract_mode": "from_mel", + "extract_duration": false, + "extract_amplitude_phase": false, + "mel_min_max_norm": false, + // lingusitic features + "extract_phone": false, + "lexicon_path": "./text/lexicon/librispeech-lexicon.txt", + // content features + "extract_whisper_feature": false, + "extract_contentvec_feature": false, + "extract_mert_feature": false, + "extract_wenet_feature": false, + // Settings for data preprocessing + "n_mel": 80, + "win_size": 480, + "hop_size": 120, + "sample_rate": 24000, + 
"n_fft": 1024, + "fmin": 0, + "fmax": 12000, + "min_level_db": -115, + "ref_level_db": 20, + "bits": 8, + // Directory names of processed data or extracted features + "processed_dir": "processed_data", + "trimmed_wav_dir": "trimmed_wavs", // directory name of silence trimed wav + "raw_data": "raw_data", + "phone_dir": "phones", + "wav_dir": "wavs", // directory name of processed wav (such as downsampled waveform) + "audio_dir": "audios", + "log_amplitude_dir": "log_amplitudes", + "phase_dir": "phases", + "real_dir": "reals", + "imaginary_dir": "imaginarys", + "label_dir": "labels", + "linear_dir": "linears", + "mel_dir": "mels", // directory name of extraced mel features + "mcep_dir": "mcep", // directory name of extraced mcep features + "dur_dir": "durs", + "symbols_dict": "symbols.dict", + "lab_dir": "labs", // directory name of extraced label features + "wenet_dir": "wenet", // directory name of extraced wenet features + "contentvec_dir": "contentvec", // directory name of extraced wenet features + "pitch_dir": "pitches", // directory name of extraced pitch features + "energy_dir": "energys", // directory name of extracted energy features + "phone_pitch_dir": "phone_pitches", // directory name of extraced pitch features + "phone_energy_dir": "phone_energys", // directory name of extracted energy features + "uv_dir": "uvs", // directory name of extracted unvoiced features + "duration_dir": "duration", // ground-truth duration file + "phone_seq_file": "phone_seq_file", // phoneme sequence file + "file_lst": "file.lst", + "train_file": "train.json", // training set, the json file contains detailed information about the dataset, including dataset name, utterance id, duration of the utterance + "valid_file": "valid.json", // validattion set + "spk2id": "spk2id.json", // used for multi-speaker dataset + "utt2spk": "utt2spk", // used for multi-speaker dataset + "emo2id": "emo2id.json", // used for multi-emotion dataset + "utt2emo": "utt2emo", // used for multi-emotion dataset + // Features used for model training + "use_text": false, + "use_phone": false, + "use_phn_seq": false, + "use_lab": false, + "use_linear": false, + "use_mel": false, + "use_min_max_norm_mel": false, + "use_wav": false, + "use_phone_pitch": false, + "use_log_scale_pitch": false, + "use_phone_energy": false, + "use_phone_duration": false, + "use_log_scale_energy": false, + "use_wenet": false, + "use_dur": false, + "use_spkid": false, // True: use speaker id for multi-speaker dataset + "use_emoid": false, // True: use emotion id for multi-emotion dataset + "use_frame_pitch": false, + "use_uv": false, + "use_frame_energy": false, + "use_frame_duration": false, + "use_audio": false, + "use_label": false, + "use_one_hot": false, + "use_amplitude_phase": false, + "align_mel_duration": false + }, + "train": { + "ddp": true, + "batch_size": 16, + "max_steps": 1000000, + // Trackers + "tracker": [ + "tensorboard" + // "wandb", + // "cometml", + // "mlflow", + ], + "max_epoch": -1, + // -1 means no limit + "save_checkpoint_stride": [ + 5, + 20 + ], + // unit is epoch + "keep_last": [ + 3, + -1 + ], + // -1 means infinite, if one number will broadcast + "run_eval": [ + false, + true + ], + // if one number will broadcast + // Fix the random seed + "random_seed": 10086, + // Optimizer + "optimizer": "AdamW", + "adamw": { + "lr": 4.0e-4 + // nn model lr + }, + // LR Scheduler + "scheduler": "ReduceLROnPlateau", + "reducelronplateau": { + "factor": 0.8, + "patience": 10, + // unit is epoch + "min_lr": 1.0e-4 + }, + // Batchsampler + 
"sampler": { + "holistic_shuffle": true, + "drop_last": true + }, + // Dataloader + "dataloader": { + "num_worker": 32, + "pin_memory": true + }, + "gradient_accumulation_step": 1, + "total_training_steps": 50000, + "save_summary_steps": 500, + "save_checkpoints_steps": 10000, + "valid_interval": 10000, + "keep_checkpoint_max": 5, + "multi_speaker_training": false // True: train multi-speaker model; False: training single-speaker model; + } +} \ No newline at end of file diff --git a/config/comosvc.json b/config/comosvc.json new file mode 100644 index 0000000000000000000000000000000000000000..c1af0e58981f09b9b74c2baf89dce3b45627026b --- /dev/null +++ b/config/comosvc.json @@ -0,0 +1,215 @@ +{ + "base_config": "config/svc/base.json", + "model_type": "DiffComoSVC", + "task_type": "svc", + "preprocess": { + // data augmentations + "use_pitch_shift": false, + "use_formant_shift": false, + "use_time_stretch": false, + "use_equalizer": false, + // acoustic features + "extract_mel": true, + "mel_min_max_norm": true, + "extract_pitch": true, + "pitch_extractor": "parselmouth", + "extract_uv": true, + "extract_energy": true, + // content features + "extract_whisper_feature": false, + "whisper_sample_rate": 16000, + "extract_contentvec_feature": false, + "contentvec_sample_rate": 16000, + "extract_wenet_feature": false, + "wenet_sample_rate": 16000, + "extract_mert_feature": false, + "mert_sample_rate": 16000, + // Default config for whisper + "whisper_frameshift": 0.01, + "whisper_downsample_rate": 2, + // Default config for content vector + "contentvec_frameshift": 0.02, + // Default config for mert + "mert_model": "m-a-p/MERT-v1-330M", + "mert_feature_layer": -1, + "mert_hop_size": 320, + // 24k + "mert_frameshit": 0.01333, + // 10ms + "wenet_frameshift": 0.01, + // wenetspeech is 4, gigaspeech is 6 + "wenet_downsample_rate": 4, + // Default config + "n_mel": 100, + "win_size": 1024, + // todo + "hop_size": 256, + "sample_rate": 24000, + "n_fft": 1024, + // todo + "fmin": 0, + "fmax": 12000, + // todo + "f0_min": 50, + // ~C2 + "f0_max": 1100, + //1100, // ~C6(1100), ~G5(800) + "pitch_bin": 256, + "pitch_max": 1100.0, + "pitch_min": 50.0, + "is_label": true, + "is_mu_law": true, + "bits": 8, + "mel_min_max_stats_dir": "mel_min_max_stats", + "whisper_dir": "whisper", + "contentvec_dir": "contentvec", + "wenet_dir": "wenet", + "mert_dir": "mert", + // Extract content features using dataloader + "pin_memory": true, + "num_workers": 8, + "content_feature_batch_size": 16, + // Features used for model training + "use_mel": true, + "use_min_max_norm_mel": true, + "use_frame_pitch": true, + "use_uv": true, + "use_frame_energy": true, + "use_log_scale_pitch": false, + "use_log_scale_energy": false, + "use_spkid": true, + // Meta file + "train_file": "train.json", + "valid_file": "test.json", + "spk2id": "singers.json", + "utt2spk": "utt2singer" + }, + "model": { + "teacher_model_path": "[Your Teacher Model Path].bin", + "condition_encoder": { + "merge_mode": "add", + "input_melody_dim": 1, + "use_log_f0": true, + "n_bins_melody": 256, + //# Quantization (0 for not quantization) + "output_melody_dim": 384, + "input_loudness_dim": 1, + "use_log_loudness": true, + "n_bins_loudness": 256, + "output_loudness_dim": 384, + "use_whisper": false, + "use_contentvec": false, + "use_wenet": false, + "use_mert": false, + "whisper_dim": 1024, + "contentvec_dim": 256, + "mert_dim": 256, + "wenet_dim": 512, + "content_encoder_dim": 384, + "output_singer_dim": 384, + "singer_table_size": 512, + "output_content_dim": 384, 
+ "use_spkid": true + }, + "comosvc": { + "distill": false, + // conformer encoder + "input_dim": 384, + "output_dim": 100, + "n_heads": 2, + "n_layers": 6, + "filter_channels": 512, + "dropout": 0.1, + // karras diffusion + "P_mean": -1.2, + "P_std": 1.2, + "sigma_data": 0.5, + "sigma_min": 0.002, + "sigma_max": 80, + "rho": 7, + "n_timesteps": 18, + }, + "diffusion": { + // Diffusion steps encoder + "step_encoder": { + "dim_raw_embedding": 128, + "dim_hidden_layer": 512, + "activation": "SiLU", + "num_layer": 2, + "max_period": 10000 + }, + // Diffusion decoder + "model_type": "bidilconv", + // bidilconv, unet2d, TODO: unet1d + "bidilconv": { + "base_channel": 384, + "n_res_block": 20, + "conv_kernel_size": 3, + "dilation_cycle_length": 4, + // specially, 1 means no dilation + "conditioner_size": 100 + } + }, + }, + "train": { + // Basic settings + "fast_steps": 0, + "batch_size": 64, + "gradient_accumulation_step": 1, + "max_epoch": -1, + // -1 means no limit + "save_checkpoint_stride": [ + 10, + 100 + ], + // unit is epoch + "keep_last": [ + 3, + -1 + ], + // -1 means infinite, if one number will broadcast + "run_eval": [ + false, + true + ], + // if one number will broadcast + // Fix the random seed + "random_seed": 10086, + // Batchsampler + "sampler": { + "holistic_shuffle": true, + "drop_last": true + }, + // Dataloader + "dataloader": { + "num_worker": 32, + "pin_memory": true + }, + // Trackers + "tracker": [ + "tensorboard" + // "wandb", + // "cometml", + // "mlflow", + ], + // Optimizer + "optimizer": "AdamW", + "adamw": { + "lr": 5.0e-5 + // nn model lr + }, + // LR Scheduler + "scheduler": "ReduceLROnPlateau", + "reducelronplateau": { + "factor": 0.8, + "patience": 10, + // unit is epoch + "min_lr": 5.0e-6 + } + }, + "inference": { + "comosvc": { + "inference_steps": 40 + } + } +} \ No newline at end of file diff --git a/config/facodec.json b/config/facodec.json new file mode 100644 index 0000000000000000000000000000000000000000..c0cdab06a769be2a800200575415dee795a14e4d --- /dev/null +++ b/config/facodec.json @@ -0,0 +1,67 @@ +{ + "exp_name": "facodec", + "model_type": "FAcodec", + "log_dir": "./runs/", + "log_interval": 10, + "save_interval": 1000, + "device": "cuda", + "epochs": 1000, + "batch_size": 4, + "batch_length": 100, + "max_len": 80, + "pretrained_model": "", + "load_only_params": false, + "F0_path": "modules/JDC/bst.t7", + "dataset": "dummy", + "preprocess_params": { + "sr": 24000, + "frame_rate": 80, + "duration_range": [1.0, 25.0], + "spect_params": { + "n_fft": 2048, + "win_length": 1200, + "hop_length": 300, + "n_mels": 80, + }, + }, + "train": { + "gradient_accumulation_step": 1, + "batch_size": 1, + "save_checkpoint_stride": [20], + "random_seed": 1234, + "max_epoch": -1, + "max_frame_len": 80, + "tracker": ["tensorboard"], + "run_eval": [false], + "sampler": {"holistic_shuffle": true, "drop_last": true}, + "dataloader": {"num_worker": 0, "pin_memory": true}, + }, + "model_params": { + "causal": true, + "lstm": 2, + "norm_f0": true, + "use_gr_content_f0": false, + "use_gr_prosody_phone": false, + "use_gr_timbre_prosody": false, + "separate_prosody_encoder": true, + "n_c_codebooks": 2, + "timbre_norm": true, + "use_gr_content_global_f0": true, + "DAC": { + "encoder_dim": 64, + "encoder_rates": [2, 5, 5, 6], + "decoder_dim": 1536, + "decoder_rates": [6, 5, 5, 2], + "sr": 24000, + }, + }, + "loss_params": { + "base_lr": 0.0001, + "warmup_steps": 200, + "discriminator_iter_start": 2000, + "lambda_spk": 1.0, + "lambda_mel": 45, + "lambda_f0": 1.0, + "lambda_uv": 
1.0, + }, +} diff --git a/config/fs2.json b/config/fs2.json new file mode 100644 index 0000000000000000000000000000000000000000..9dbc41bb2237524bbf744c56ed3aa0cebbceef81 --- /dev/null +++ b/config/fs2.json @@ -0,0 +1,120 @@ +{ + "base_config": "config/tts.json", + "model_type": "FastSpeech2", + "task_type": "tts", + "dataset": ["LJSpeech"], + "preprocess": { + // acoustic features + "extract_audio": true, + "extract_mel": true, + "mel_extract_mode": "taco", + "mel_min_max_norm": false, + "extract_pitch": true, + "extract_uv": false, + "pitch_extractor": "dio", + "extract_energy": true, + "energy_extract_mode": "from_tacotron_stft", + "extract_duration": true, + "use_phone": false, + "pitch_norm": true, + "energy_norm": true, + "pitch_remove_outlier": true, + "energy_remove_outlier": true, + + // Default config + "n_mel": 80, + "win_size": 1024, // todo + "hop_size": 256, + "sample_rate": 22050, + "n_fft": 1024, // todo + "fmin": 0, + "fmax": 8000, // todo + "raw_data": "raw_data", + "text_cleaners": ["english_cleaners"], + "f0_min": 71, // ~C2 + "f0_max": 800, //1100, // ~C6(1100), ~G5(800) + "pitch_bin": 256, + "pitch_max": 1100.0, + "pitch_min": 50.0, + "is_label": true, + "is_mu_law": true, + "bits": 8, + + "mel_min_max_stats_dir": "mel_min_max_stats", + "whisper_dir": "whisper", + "content_vector_dir": "content_vector", + "wenet_dir": "wenet", + "mert_dir": "mert", + "spk2id":"spk2id.json", + "utt2spk":"utt2spk", + "valid_file": "test.json", + + // Features used for model training + "use_mel": true, + "use_min_max_norm_mel": false, + "use_frame_pitch": false, + "use_frame_energy": false, + "use_phone_pitch": true, + "use_phone_energy": true, + "use_log_scale_pitch": false, + "use_log_scale_energy": false, + "use_spkid": false, + "align_mel_duration": true, + "text_cleaners": ["english_cleaners"], + "phone_extractor": "lexicon", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)" + }, + "model": { + // Settings for transformer + "transformer": { + "encoder_layer": 4, + "encoder_head": 2, + "encoder_hidden": 256, + "decoder_layer": 6, + "decoder_head": 2, + "decoder_hidden": 256, + "conv_filter_size": 1024, + "conv_kernel_size": [9, 1], + "encoder_dropout": 0.2, + "decoder_dropout": 0.2 + }, + + // Settings for variance_predictor + "variance_predictor":{ + "filter_size": 256, + "kernel_size": 3, + "dropout": 0.5 + }, + "variance_embedding":{ + "pitch_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the pitch values are not normalized during preprocessing + "energy_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the energy values are not normalized during preprocessing + "n_bins": 256 + }, + "max_seq_len": 1000 + }, + "train":{ + "batch_size": 16, + "max_epoch": 100, + "sort_sample": true, + "drop_last": true, + "group_size": 4, + "grad_clip_thresh": 1.0, + "dataloader": { + "num_worker": 8, + "pin_memory": true + }, + "lr_scheduler":{ + "num_warmup": 4000 + }, + // LR Scheduler + "scheduler": "NoamLR", + // Optimizer + "optimizer": "Adam", + "adam": { + "lr": 0.0625, + "betas": [0.9, 0.98], + "eps": 0.000000001, + "weight_decay": 0.0 + }, + } + +} diff --git a/config/jets.json b/config/jets.json new file mode 100644 index 0000000000000000000000000000000000000000..3a3a437ddf79d455068320f2f0da61afa66969a5 --- /dev/null +++ b/config/jets.json @@ -0,0 +1,120 @@ +{ + "base_config": "config/tts.json", + "model_type": "Jets", + "task_type": "tts", + "dataset": ["LJSpeech"], + "preprocess": { 
+ // acoustic features + "extract_audio": true, + "extract_mel": true, + "mel_extract_mode": "taco", + "mel_min_max_norm": false, + "extract_pitch": true, + "extract_uv": false, + "pitch_extractor": "dio", + "extract_energy": true, + "energy_extract_mode": "from_tacotron_stft", + "extract_duration": true, + "use_phone": false, + "pitch_norm": true, + "energy_norm": true, + "pitch_remove_outlier": true, + "energy_remove_outlier": true, + + // Default config + "n_mel": 80, + "win_size": 1024, // todo + "hop_size": 256, + "sample_rate": 22050, + "n_fft": 1024, // todo + "fmin": 0, + "fmax": 8000, // todo + "raw_data": "raw_data", + "text_cleaners": ["english_cleaners"], + "f0_min": 71, // ~C2 + "f0_max": 800, //1100, // ~C6(1100), ~G5(800) + "pitch_bin": 256, + "pitch_max": 1100.0, + "pitch_min": 50.0, + "is_label": true, + "is_mu_law": true, + "bits": 8, + + "mel_min_max_stats_dir": "mel_min_max_stats", + "whisper_dir": "whisper", + "content_vector_dir": "content_vector", + "wenet_dir": "wenet", + "mert_dir": "mert", + "spk2id":"spk2id.json", + "utt2spk":"utt2spk", + "valid_file": "test.json", + + // Features used for model training + "use_mel": true, + "use_min_max_norm_mel": false, + "use_frame_pitch": true, + "use_frame_energy": true, + "use_phone_pitch": false, + "use_phone_energy": false, + "use_log_scale_pitch": false, + "use_log_scale_energy": false, + "use_spkid": false, + "align_mel_duration": true, + "text_cleaners": ["english_cleaners"], + "phone_extractor": "lexicon", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)" + }, + "model": { + // Settings for transformer + "transformer": { + "encoder_layer": 4, + "encoder_head": 2, + "encoder_hidden": 256, + "decoder_layer": 6, + "decoder_head": 2, + "decoder_hidden": 256, + "conv_filter_size": 1024, + "conv_kernel_size": [9, 1], + "encoder_dropout": 0.2, + "decoder_dropout": 0.2 + }, + + // Settings for variance_predictor + "variance_predictor":{ + "filter_size": 256, + "kernel_size": 3, + "dropout": 0.5 + }, + "variance_embedding":{ + "pitch_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the pitch values are not normalized during preprocessing + "energy_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the energy values are not normalized during preprocessing + "n_bins": 256 + }, + "max_seq_len": 1000 + }, + "train":{ + "batch_size": 16, + "max_epoch": 100, + "sort_sample": true, + "drop_last": true, + "group_size": 4, + "grad_clip_thresh": 1.0, + "dataloader": { + "num_worker": 8, + "pin_memory": true + }, + "lr_scheduler":{ + "num_warmup": 4000 + }, + // LR Scheduler + "scheduler": "NoamLR", + // Optimizer + "optimizer": "Adam", + "adam": { + "lr": 0.0625, + "betas": [0.9, 0.98], + "eps": 0.000000001, + "weight_decay": 0.0 + }, + } + +} diff --git a/config/ns2.json b/config/ns2.json new file mode 100644 index 0000000000000000000000000000000000000000..56dbf8a7f17e5908f1dc06b32627af576516d9f1 --- /dev/null +++ b/config/ns2.json @@ -0,0 +1,88 @@ +{ + "base_config": "config/base.json", + "model_type": "NaturalSpeech2", + "dataset": ["libritts"], + "preprocess": { + "use_mel": false, + "use_code": true, + "use_spkid": true, + "use_pitch": true, + "use_duration": true, + "use_phone": true, + "use_len": true, + "use_cross_reference": true, + "train_file": "train.json", + "melspec_dir": "mel", + "code_dir": "code", + "pitch_dir": "pitch", + "duration_dir": "duration", + "clip_mode": "start" + }, + "model": { + "latent_dim": 128, + 
"prior_encoder": { + "vocab_size": 100, + "pitch_min": 50, + "pitch_max": 1100, + "pitch_bins_num": 512, + "encoder": { + "encoder_layer": 6, + "encoder_hidden": 512, + "encoder_head": 8, + "conv_filter_size": 2048, + "conv_kernel_size": 9, + "encoder_dropout": 0.2, + "use_cln": true + }, + "duration_predictor": { + "input_size": 512, + "filter_size": 512, + "kernel_size": 3, + "conv_layers": 30, + "cross_attn_per_layer": 3, + "attn_head": 8, + "drop_out": 0.5 + }, + "pitch_predictor": { + "input_size": 512, + "filter_size": 512, + "kernel_size": 5, + "conv_layers": 30, + "cross_attn_per_layer": 3, + "attn_head": 8, + "drop_out": 0.5 + } + }, + "diffusion": { + "wavenet": { + "input_size": 128, + "hidden_size": 512, + "out_size": 128, + "num_layers": 40, + "cross_attn_per_layer": 3, + "dilation_cycle": 2, + "attn_head": 8, + "drop_out": 0.2 + }, + "beta_min": 0.05, + "beta_max": 20, + "sigma": 1.0, + "noise_factor": 1.0, + "ode_solver": "euler" + }, + "prompt_encoder": { + "encoder_layer": 6, + "encoder_hidden": 512, + "encoder_head": 8, + "conv_filter_size": 2048, + "conv_kernel_size": 9, + "encoder_dropout": 0.2, + "use_cln": false + }, + "query_emb": { + "query_token_num": 32, + "hidden_size": 512, + "head_num": 8 + } + } +} \ No newline at end of file diff --git a/config/svc/base.json b/config/svc/base.json new file mode 100644 index 0000000000000000000000000000000000000000..c36fb1b8d3d4239883415f39d8a95015690c3107 --- /dev/null +++ b/config/svc/base.json @@ -0,0 +1,119 @@ +{ + "base_config": "config/base.json", + "task_type": "svc", + "preprocess": { + // data augmentations + "use_pitch_shift": false, + "use_formant_shift": false, + "use_time_stretch": false, + "use_equalizer": false, + // Online or offline features extraction ("offline" or "online") + "features_extraction_mode": "offline", + // acoustic features + "extract_mel": true, + "mel_min_max_norm": true, + "extract_pitch": true, + "pitch_extractor": "parselmouth", + "extract_uv": true, + "extract_energy": true, + // content features + "extract_whisper_feature": false, + "whisper_sample_rate": 16000, + "extract_contentvec_feature": false, + "contentvec_sample_rate": 16000, + "extract_wenet_feature": false, + "wenet_sample_rate": 16000, + "extract_mert_feature": false, + "mert_sample_rate": 16000, + // Default config for whisper + "whisper_frameshift": 0.01, + "whisper_downsample_rate": 2, + // Default config for content vector + "contentvec_frameshift": 0.02, + // Default config for mert + "mert_model": "m-a-p/MERT-v1-330M", + "mert_feature_layer": -1, + "mert_hop_size": 320, + // 24k + "mert_frameshit": 0.01333, + // 10ms + "wenet_frameshift": 0.01, + // wenetspeech is 4, gigaspeech is 6 + "wenet_downsample_rate": 4, + // Default config + "n_mel": 100, + "win_size": 1024, + // todo + "hop_size": 256, + "sample_rate": 24000, + "n_fft": 1024, + // todo + "fmin": 0, + "fmax": 12000, + // todo + "f0_min": 50, + // ~C2 + "f0_max": 1100, + //1100, // ~C6(1100), ~G5(800) + "pitch_bin": 256, + "pitch_max": 1100.0, + "pitch_min": 50.0, + "is_label": true, + "is_mu_law": true, + "bits": 8, + "mel_min_max_stats_dir": "mel_min_max_stats", + "whisper_dir": "whisper", + "contentvec_dir": "contentvec", + "wenet_dir": "wenet", + "mert_dir": "mert", + // Extract content features using dataloader + "pin_memory": true, + "num_workers": 8, + "content_feature_batch_size": 16, + // Features used for model training + "use_mel": true, + "use_min_max_norm_mel": true, + "use_frame_pitch": true, + "use_uv": true, + "use_interpolation_for_uv": false, + 
"use_frame_energy": true, + "use_log_scale_pitch": false, + "use_log_scale_energy": false, + "use_spkid": true, + // Meta file + "train_file": "train.json", + "valid_file": "test.json", + "spk2id": "singers.json", + "utt2spk": "utt2singer" + }, + "model": { + "condition_encoder": { + "merge_mode": "add", + // Prosody Features + "use_f0": true, + "use_uv": true, + "use_energy": true, + // Quantization (0 for not quantization) + "input_melody_dim": 1, + "n_bins_melody": 256, + "output_melody_dim": 384, + "input_loudness_dim": 1, + "n_bins_loudness": 256, + "output_loudness_dim": 384, + // Semantic Features + "use_whisper": false, + "use_contentvec": false, + "use_wenet": false, + "use_mert": false, + "whisper_dim": 1024, + "contentvec_dim": 256, + "mert_dim": 256, + "wenet_dim": 512, + "content_encoder_dim": 384, + // Speaker Features + "output_singer_dim": 384, + "singer_table_size": 512, + "use_spkid": true + } + }, +} \ No newline at end of file diff --git a/config/svc/diffusion.json b/config/svc/diffusion.json new file mode 100644 index 0000000000000000000000000000000000000000..d9e538b754e504e51070636f7380949b61632311 --- /dev/null +++ b/config/svc/diffusion.json @@ -0,0 +1,142 @@ +{ + "base_config": "config/svc/base.json", + "model": { + "condition_encoder": { + "merge_mode": "add", + // Prosody Features + "use_f0": true, + "use_uv": true, + "use_energy": true, + // Quantization (0 for not quantization) + "input_melody_dim": 1, + "n_bins_melody": 256, + "output_melody_dim": 384, + "input_loudness_dim": 1, + "n_bins_loudness": 256, + "output_loudness_dim": 384, + // Semantic Features + "use_whisper": false, + "use_contentvec": false, + "use_wenet": false, + "use_mert": false, + "whisper_dim": 1024, + "contentvec_dim": 256, + "mert_dim": 256, + "wenet_dim": 512, + "content_encoder_dim": 384, + // Speaker Features + "output_singer_dim": 384, + "singer_table_size": 512, + "use_spkid": true + }, + "diffusion": { + "scheduler": "ddpm", + "scheduler_settings": { + "num_train_timesteps": 1000, + "beta_start": 1.0e-4, + "beta_end": 0.02, + "beta_schedule": "linear" + }, + // Diffusion steps encoder + "step_encoder": { + "dim_raw_embedding": 128, + "dim_hidden_layer": 512, + "activation": "SiLU", + "num_layer": 2, + "max_period": 10000 + }, + // Diffusion decoder + "model_type": "bidilconv", + // bidilconv, unet2d, TODO: unet1d + "bidilconv": { + "base_channel": 384, + "n_res_block": 20, + "conv_kernel_size": 3, + "dilation_cycle_length": 4, + // specially, 1 means no dilation + "conditioner_size": 384 + }, + "unet2d": { + "in_channels": 1, + "out_channels": 1, + "down_block_types": [ + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "DownBlock2D" + ], + "mid_block_type": "UNetMidBlock2DCrossAttn", + "up_block_types": [ + "UpBlock2D", + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D" + ], + "only_cross_attention": false + } + } + }, + "train": { + // Basic settings + "batch_size": 64, + "gradient_accumulation_step": 1, + "max_epoch": -1, + // -1 means no limit + "save_checkpoint_stride": [ + 5, + 20 + ], + // unit is epoch + "keep_last": [ + 3, + -1 + ], + // -1 means infinite, if one number will broadcast + "run_eval": [ + false, + true + ], + // if one number will broadcast + // Fix the random seed + "random_seed": 10086, + // Batchsampler + "sampler": { + "holistic_shuffle": true, + "drop_last": true + }, + // Dataloader + "dataloader": { + "num_worker": 32, + "pin_memory": true + }, + // Trackers + "tracker": [ + "tensorboard" + // "wandb", + 
// "cometml", + // "mlflow", + ], + // Optimizer + "optimizer": "AdamW", + "adamw": { + "lr": 4.0e-4 + // nn model lr + }, + // LR Scheduler + "scheduler": "ReduceLROnPlateau", + "reducelronplateau": { + "factor": 0.8, + "patience": 10, + // unit is epoch + "min_lr": 1.0e-4 + } + }, + "inference": { + "diffusion": { + "scheduler": "pndm", + "scheduler_settings": { + "num_inference_timesteps": 1000 + } + } + } +} \ No newline at end of file diff --git a/config/transformer.json b/config/transformer.json new file mode 100644 index 0000000000000000000000000000000000000000..be3514e9c36c5bd5bb618528eb044c9f68805ffc --- /dev/null +++ b/config/transformer.json @@ -0,0 +1,179 @@ +{ + "base_config": "config/svc/base.json", + "model_type": "Transformer", + "task_type": "svc", + "preprocess": { + // data augmentations + "use_pitch_shift": false, + "use_formant_shift": false, + "use_time_stretch": false, + "use_equalizer": false, + // acoustic features + "extract_mel": true, + "mel_min_max_norm": true, + "extract_pitch": true, + "pitch_extractor": "parselmouth", + "extract_uv": true, + "extract_energy": true, + // content features + "extract_whisper_feature": false, + "whisper_sample_rate": 16000, + "extract_contentvec_feature": false, + "contentvec_sample_rate": 16000, + "extract_wenet_feature": false, + "wenet_sample_rate": 16000, + "extract_mert_feature": false, + "mert_sample_rate": 16000, + // Default config for whisper + "whisper_frameshift": 0.01, + "whisper_downsample_rate": 2, + // Default config for content vector + "contentvec_frameshift": 0.02, + // Default config for mert + "mert_model": "m-a-p/MERT-v1-330M", + "mert_feature_layer": -1, + "mert_hop_size": 320, + // 24k + "mert_frameshit": 0.01333, + // 10ms + "wenet_frameshift": 0.01, + // wenetspeech is 4, gigaspeech is 6 + "wenet_downsample_rate": 4, + // Default config + "n_mel": 100, + "win_size": 1024, + // todo + "hop_size": 256, + "sample_rate": 24000, + "n_fft": 1024, + // todo + "fmin": 0, + "fmax": 12000, + // todo + "f0_min": 50, + // ~C2 + "f0_max": 1100, + //1100, // ~C6(1100), ~G5(800) + "pitch_bin": 256, + "pitch_max": 1100.0, + "pitch_min": 50.0, + "is_label": true, + "is_mu_law": true, + "bits": 8, + "mel_min_max_stats_dir": "mel_min_max_stats", + "whisper_dir": "whisper", + "contentvec_dir": "contentvec", + "wenet_dir": "wenet", + "mert_dir": "mert", + // Extract content features using dataloader + "pin_memory": true, + "num_workers": 8, + "content_feature_batch_size": 16, + // Features used for model training + "use_mel": true, + "use_min_max_norm_mel": true, + "use_frame_pitch": true, + "use_uv": true, + "use_frame_energy": true, + "use_log_scale_pitch": false, + "use_log_scale_energy": false, + "use_spkid": true, + // Meta file + "train_file": "train.json", + "valid_file": "test.json", + "spk2id": "singers.json", + "utt2spk": "utt2singer" + }, + "model": { + "condition_encoder": { + "merge_mode": "add", + "input_melody_dim": 1, + "use_log_f0": true, + "n_bins_melody": 256, + //# Quantization (0 for not quantization) + "output_melody_dim": 384, + "input_loudness_dim": 1, + "use_log_loudness": true, + "n_bins_loudness": 256, + "output_loudness_dim": 384, + "use_whisper": false, + "use_contentvec": true, + "use_wenet": false, + "use_mert": false, + "whisper_dim": 1024, + "contentvec_dim": 256, + "mert_dim": 256, + "wenet_dim": 512, + "content_encoder_dim": 384, + "output_singer_dim": 384, + "singer_table_size": 512, + "output_content_dim": 384, + "use_spkid": true + }, + "transformer": { + "type": "conformer", + // 
'conformer' or 'transformer' + "input_dim": 384, + "output_dim": 100, + "n_heads": 2, + "n_layers": 6, + "filter_channels": 512, + "dropout": 0.1, + } + }, + "train": { + // Basic settings + "batch_size": 64, + "gradient_accumulation_step": 1, + "max_epoch": -1, + // -1 means no limit + "save_checkpoint_stride": [ + 10, + 100 + ], + // unit is epoch + "keep_last": [ + 3, + -1 + ], + // -1 means infinite, if one number will broadcast + "run_eval": [ + false, + true + ], + // if one number will broadcast + // Fix the random seed + "random_seed": 10086, + // Batchsampler + "sampler": { + "holistic_shuffle": true, + "drop_last": true + }, + // Dataloader + "dataloader": { + "num_worker": 32, + "pin_memory": true + }, + // Trackers + "tracker": [ + "tensorboard" + // "wandb", + // "cometml", + // "mlflow", + ], + // Optimizer + "optimizer": "AdamW", + "adamw": { + "lr": 4.0e-4 + // nn model lr + }, + // LR Scheduler + "scheduler": "ReduceLROnPlateau", + "reducelronplateau": { + "factor": 0.8, + "patience": 10, + // unit is epoch + "min_lr": 1.0e-4 + } + } +} \ No newline at end of file diff --git a/config/tts.json b/config/tts.json new file mode 100644 index 0000000000000000000000000000000000000000..882726dbfb2748167af1fa936e9da980acb971c4 --- /dev/null +++ b/config/tts.json @@ -0,0 +1,25 @@ +{ + "base_config": "config/base.json", + "supported_model_type": [ + "Fastspeech2", + "VITS", + "VALLE", + "NaturalSpeech2" + ], + "task_type": "tts", + "preprocess": { + "language": "en-us", // espeak supports 100 languages https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md + // linguistic features + "extract_phone": true, + "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)" + "lexicon_path": "./text/lexicon/librispeech-lexicon.txt", + // Directory names of processed data or extracted features + "phone_dir": "phones", + "use_phone": true, + "add_blank": true + }, + "model": { + "text_token_num": 512, + } + +} diff --git a/config/valle.json b/config/valle.json new file mode 100644 index 0000000000000000000000000000000000000000..5fa9329d1df54b2b1fbac1790829c53d7caca7b0 --- /dev/null +++ b/config/valle.json @@ -0,0 +1,55 @@ +{ + "base_config": "config/tts.json", + "model_type": "VALLE", + "task_type": "tts", + "dataset": [ + "libritts" + ], + "preprocess": { + "extract_phone": true, + "phone_extractor": "espeak", // phoneme extractor: espeak, pypinyin, pypinyin_initials_finals or lexicon + "extract_acoustic_token": true, + "acoustic_token_extractor": "Encodec", // acoustic token extractor: encodec, dac(todo) + "acoustic_token_dir": "acoutic_tokens", + "use_text": false, + "use_phone": true, + "use_acoustic_token": true, + "symbols_dict": "symbols.dict", + "min_duration": 0.5, // the duration lowerbound to filter the audio with duration < min_duration + "max_duration": 14, // the duration uperbound to filter the audio with duration > max_duration. + "sample_rate": 24000, + "codec_hop_size": 320 + }, + "model": { + "text_token_num": 512, + "audio_token_num": 1024, + "decoder_dim": 1024, // embedding dimension of the decoder model + "nhead": 16, // number of attention heads in the decoder layers + "num_decoder_layers": 12, // number of decoder layers + "norm_first": true, // pre or post Normalization. 
+ "add_prenet": false, // whether add PreNet after Inputs + "prefix_mode": 0, // mode for how to prefix VALL-E NAR Decoder, 0: no prefix, 1: 0 to random, 2: random to random, 4: chunk of pre or post utterance + "share_embedding": true, // share the parameters of the output projection layer with the parameters of the acoustic embedding + "nar_scale_factor": 1, // model scale factor which will be assigned different meanings in different models + "prepend_bos": false, // whether prepend to the acoustic tokens -> AR Decoder inputs + "num_quantizers": 8, // numbert of the audio quantization layers + // "scaling_xformers": false, // Apply Reworked Conformer scaling on Transformers + }, + "train": { + "use_dynamic_batchsize": false, // If use dynamic batch size + "ddp": false, + "train_stage": 1, // 0: train all modules, For VALL_E, support 1: AR Decoder 2: NAR Decoder(s) + "max_epoch": 20, + "optimizer": "AdamW", + "scheduler": "cosine", + "warmup_steps": 16000, // number of steps that affects how rapidly the learning rate decreases + "total_training_steps": 800000, + "base_lr": 1e-4, // base learning rate." + "valid_interval": 1000, + "log_epoch_step": 1000, + "save_checkpoint_stride": [ + 1, + 1 + ] + } +} diff --git a/config/vits.json b/config/vits.json new file mode 100644 index 0000000000000000000000000000000000000000..d8de83ab4641f5c4efef7752e7017d0733a7ddac --- /dev/null +++ b/config/vits.json @@ -0,0 +1,101 @@ +{ + "base_config": "config/tts.json", + "model_type": "VITS", + "task_type": "tts", + "preprocess": { + "extract_phone": true, + "extract_mel": true, + "n_mel": 80, + "fmin": 0, + "fmax": null, + "extract_linear_spec": true, + "extract_audio": true, + "use_linear": true, + "use_mel": true, + "use_audio": true, + "use_text": false, + "use_phone": true, + "lexicon_path": "./text/lexicon/librispeech-lexicon.txt", + "n_fft": 1024, + "win_size": 1024, + "hop_size": 256, + "segment_size": 8192, + "text_cleaners": [ + "english_cleaners" + ] + }, + "model": { + "text_token_num": 512, + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "resblock": "1", + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "upsample_rates": [ + 8, + 8, + 2, + 2 + ], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [ + 16, + 16, + 4, + 4 + ], + "n_layers_q": 3, + "use_spectral_norm": false, + "n_speakers": 0, // number of speakers, while be automatically set if n_speakers is 0 and multi_speaker_training is true + "gin_channels": 256, + "use_sdp": true + }, + "train": { + "fp16_run": true, + "learning_rate": 2e-4, + "betas": [ + 0.8, + 0.99 + ], + "eps": 1e-9, + "batch_size": 16, + "lr_decay": 0.999875, + // "segment_size": 8192, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0, + "AdamW": { + "betas": [ + 0.8, + 0.99 + ], + "eps": 1e-9, + } + } +} \ No newline at end of file diff --git a/config/vitssvc.json b/config/vitssvc.json new file mode 100644 index 0000000000000000000000000000000000000000..53aa164254de8bc1278a8016510920fea7b87574 --- /dev/null +++ b/config/vitssvc.json @@ -0,0 +1,306 @@ +{ + "base_config": "config/svc/base.json", + "model_type": "VITS", + "task_type": "svc", + "preprocess": { + // Config for features extraction + "extract_mel": true, + "extract_pitch": true, + "pitch_extractor": "parselmouth", + "extract_energy": true, + "extract_uv": true, + 
"extract_linear_spec": true, + "extract_audio": true, + "mel_min_max_norm": true, + // Config for features usage + "use_linear": true, + "use_mel": true, + "use_min_max_norm_mel": false, + "use_audio": true, + "use_frame_pitch": true, + "use_uv": true, + "use_spkid": true, + "use_contentvec": false, + "use_whisper": false, + "use_wenet": false, + "use_text": false, + "use_phone": false, + "fmin": 0, + "fmax": 12000, + "f0_min": 50, + "f0_max": 1100, + // f0_bin in sovits + "pitch_bin": 256, + // filter_length in sovits + "n_fft": 1024, + // hop_length in sovits + "hop_size": 256, + // win_length in sovits + "win_size": 1024, + "segment_size": 8192, + "n_mel": 100, + "sample_rate": 24000, + "mel_min_max_stats_dir": "mel_min_max_stats", + "whisper_dir": "whisper", + "contentvec_dir": "contentvec", + "wenet_dir": "wenet", + "mert_dir": "mert", + // Meta file + "train_file": "train.json", + "valid_file": "test.json", + "spk2id": "singers.json", + "utt2spk": "utt2singer" + }, + "model": { + "condition_encoder": { + "merge_mode": "add", + "input_melody_dim": 1, + "use_log_f0": true, + "n_bins_melody": 256, + "output_melody_dim": 384, + "input_loudness_dim": 1, + "use_log_loudness": true, + "n_bins_loudness": 256, + "output_loudness_dim": 384, + "use_whisper": false, + "use_contentvec": false, + "use_wenet": false, + "use_mert": false, + "whisper_dim": 1024, + "contentvec_dim": 256, + "mert_dim": 256, + "wenet_dim": 512, + "content_encoder_dim": 384, + "singer_table_size": 512, + "output_singer_dim": 384, + "output_content_dim": 384, + "use_spkid": true, + "pitch_max": 1100.0, + "pitch_min": 50.0, + }, + "vits": { + "filter_channels": 256, + "gin_channels": 256, + "hidden_channels": 384, + "inter_channels": 384, + "kernel_size": 3, + "n_flow_layer": 4, + "n_heads": 2, + "n_layers": 6, + "n_layers_q": 3, + "n_speakers": 512, + "p_dropout": 0.1, + "use_spectral_norm": false, + }, + "generator": "hifigan", + "generator_config": { + "hifigan": { + "resblock": "1", + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "upsample_rates": [ + 8, + 8, + 2, + 2 + ], + "upsample_kernel_sizes": [ + 16, + 16, + 4, + 4 + ], + "upsample_initial_channel": 512, + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ] + }, + "melgan": { + "ratios": [ + 8, + 8, + 2, + 2 + ], + "ngf": 32, + "n_residual_layers": 3, + "num_D": 3, + "ndf": 16, + "n_layers": 4, + "downsampling_factor": 4 + }, + "bigvgan": { + "resblock": "1", + "activation": "snakebeta", + "snake_logscale": true, + "upsample_rates": [ + 8, + 8, + 2, + 2 + ], + "upsample_kernel_sizes": [ + 16, + 16, + 4, + 4 + ], + "upsample_initial_channel": 512, + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ] + }, + "nsfhifigan": { + "resblock": "1", + "harmonic_num": 8, + "upsample_rates": [ + 8, + 8, + 2, + 2 + ], + "upsample_kernel_sizes": [ + 16, + 16, + 4, + 4 + ], + "upsample_initial_channel": 768, + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ] + }, + "apnet": { + "ASP_channel": 512, + "ASP_resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "ASP_resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "ASP_input_conv_kernel_size": 7, + "ASP_output_conv_kernel_size": 7, + "PSP_channel": 512, + "PSP_resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + 
"PSP_resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "PSP_input_conv_kernel_size": 7, + "PSP_output_R_conv_kernel_size": 7, + "PSP_output_I_conv_kernel_size": 7, + } + }, + }, + "train": { + "fp16_run": true, + "learning_rate": 2e-4, + "betas": [ + 0.8, + 0.99 + ], + "eps": 1e-9, + "batch_size": 16, + "lr_decay": 0.999875, + // "segment_size": 8192, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0, + "AdamW": { + "betas": [ + 0.8, + 0.99 + ], + "eps": 1e-9, + } + } +} \ No newline at end of file diff --git a/config/vocoder.json b/config/vocoder.json new file mode 100644 index 0000000000000000000000000000000000000000..4fbce166a59e5b23bac37bff7ebfda2c3c7653c6 --- /dev/null +++ b/config/vocoder.json @@ -0,0 +1,84 @@ +{ + "base_config": "config/base.json", + "dataset": [ + "LJSpeech", + "LibriTTS", + "opencpop", + "m4singer", + "svcc", + "svcceval", + "pjs", + "opensinger", + "popbutfy", + "nus48e", + "popcs", + "kising", + "csd", + "opera", + "vctk", + "lijian", + "cdmusiceval" + ], + "task_type": "vocoder", + "preprocess": { + // acoustic features + "extract_mel": true, + "extract_pitch": false, + "extract_uv": false, + "extract_audio": true, + "extract_label": false, + "extract_one_hot": false, + "extract_amplitude_phase": false, + "pitch_extractor": "parselmouth", + // Settings for data preprocessing + "n_mel": 100, + "win_size": 1024, + "hop_size": 256, + "sample_rate": 24000, + "n_fft": 1024, + "fmin": 0, + "fmax": 12000, + "f0_min": 50, + "f0_max": 1100, + "pitch_bin": 256, + "pitch_max": 1100.0, + "pitch_min": 50.0, + "is_mu_law": false, + "bits": 8, + "cut_mel_frame": 32, + // Directory names of processed data or extracted features + "spk2id": "singers.json", + // Features used for model training + "use_mel": true, + "use_frame_pitch": false, + "use_uv": false, + "use_audio": true, + "use_label": false, + "use_one_hot": false, + "train_file": "train.json", + "valid_file": "test.json" + }, + "train": { + "random_seed": 114514, + "batch_size": 64, + "gradient_accumulation_step": 1, + "max_epoch": 1000000, + "save_checkpoint_stride": [ + 20 + ], + "run_eval": [ + true + ], + "sampler": { + "holistic_shuffle": true, + "drop_last": true + }, + "dataloader": { + "num_worker": 16, + "pin_memory": true + }, + "tracker": [ + "tensorboard" + ], + } +} \ No newline at end of file diff --git a/egs/codec/FAcodec/README.md b/egs/codec/FAcodec/README.md new file mode 100644 index 0000000000000000000000000000000000000000..560f8bbea97cf1133635be41cafbba0a8a84c945 --- /dev/null +++ b/egs/codec/FAcodec/README.md @@ -0,0 +1,51 @@ +# FAcodec + +Pytorch implementation for the training of FAcodec, which was proposed in paper [NaturalSpeech 3: Zero-Shot Speech Synthesis +with Factorized Codec and Diffusion Models](https://arxiv.org/pdf/2403.03100) + +A dedicated repository for the FAcodec model can also be find [here](https://github.com/Plachtaa/FAcodec). + +This implementation made some key improvements to the training pipeline, so that the requirements of any form of annotations, including +transcripts, phoneme alignments, and speaker labels, are eliminated. All you need are simply raw speech files. +With the new training pipeline, it is possible to train the model on more languages with more diverse timbre distributions. +We release the code for training and inference, including a pretrained checkpoint on 50k hours speech data with over 1 million speakers. 
+
+## Model storage
+We provide pretrained checkpoints trained on 50k hours of speech data.
+
+| Model type        | Link                                                                                                                      |
+|-------------------|---------------------------------------------------------------------------------------------------------------------------------------|
+| FAcodec           | [![Hugging Face](https://img.shields.io/badge/🤗%20Hugging%20Face-FAcodec-blue)](https://huggingface.co/Plachta/FAcodec)                |
+
+## Demo
+Try our model on [![Hugging Face](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-blue)](https://huggingface.co/spaces/Plachta/FAcodecV2)!
+
+## Training
+Prepare your data and put it under one folder; the internal file structure does not matter.
+Then, change the `dataset` in `./egs/codec/FAcodec/exp_custom_data.json` to the path of your data folder.
+Finally, run the following command:
+```bash
+sh ./egs/codec/FAcodec/train.sh
+```
+
+## Inference
+To reconstruct a speech file, run:
+```bash
+python ./bins/codec/inference.py --source [source audio path] --output_dir [output directory] --checkpoint_path [checkpoint path]
+```
+To use zero-shot voice conversion, run:
+```bash
+python ./bins/codec/inference.py --source [source audio path] --reference [reference audio path] --output_dir [output directory] --checkpoint_path [checkpoint path]
+```
+
+## Feature extraction
+When running `./bins/codec/inference.py`, check the results returned by the `FAcodecInference` class: a tuple of `(quantized, codes)`.
+- `quantized` is the quantized representation of the input speech file.
+- `quantized[0]` is the quantized representation of prosody.
+- `quantized[1]` is the quantized representation of content.
+
+- `codes` is the discrete code representation of the input speech file.
+- `codes[0]` is the discrete code representation of prosody.
+- `codes[1]` is the discrete code representation of content.
+
+For the cleanest content representation without any timbre, we suggest using `codes[1][:, 0, :]`, which is the first layer of the content codebooks.
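+
+Below is a minimal sketch of how one could slice out the timbre-free content codes once the `(quantized, codes)` tuple has been obtained. It is only an illustration: the tensor shapes and the dummy codebook size are assumptions for this sketch, not guarantees about the actual `FAcodecInference` outputs.
+
+```python
+# Illustrative sketch only. `codes` stands in for the tuple element described above,
+# assumed to be shaped (batch, n_codebook_layers, n_frames); the codebook size of
+# 1024 is a made-up value for this example.
+import torch
+
+batch, n_content_layers, n_frames = 1, 2, 250
+codes = (
+    torch.randint(0, 1024, (batch, 1, n_frames)),                 # prosody codes
+    torch.randint(0, 1024, (batch, n_content_layers, n_frames)),  # content codes
+)
+
+content_codes = codes[1]           # all content codebook layers
+clean_content = codes[1][:, 0, :]  # first content layer, the timbre-free representation suggested above
+print(clean_content.shape)         # torch.Size([1, 250])
+```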
\ No newline at end of file diff --git a/egs/codec/FAcodec/exp_custom_data.json b/egs/codec/FAcodec/exp_custom_data.json new file mode 100644 index 0000000000000000000000000000000000000000..2215bae728a9884549d0dc7708a32bf8b7ccb71f --- /dev/null +++ b/egs/codec/FAcodec/exp_custom_data.json @@ -0,0 +1,80 @@ +{ + "exp_name": "facodec", + "model_type": "FAcodec", + + "log_dir": "./runs/", + "log_interval": 10, + "save_interval": 1000, + "device": "cuda", + "epochs": 1000, + "batch_size": 4, + "batch_length": 100, + "max_len": 80, + "pretrained_model": "", + "load_only_params": false, + "F0_path": "modules/JDC/bst.t7", + "dataset": "/path/to/dataset", + "preprocess_params": { + "sr": 24000, + "frame_rate": 80, + "duration_range": [1.0, 25.0], + "spect_params": { + "n_fft": 2048, + "win_length": 1200, + "hop_length": 300, + "n_mels": 80 + } + }, + "train": { + "gradient_accumulation_step": 1, + "batch_size": 1, + "save_checkpoint_stride": [ + 20 + ], + "random_seed": 1234, + "max_epoch": -1, + "max_frame_len": 80, + "tracker": [ + "tensorboard" + ], + "run_eval": [ + false + ], + "sampler": { + "holistic_shuffle": true, + "drop_last": true + }, + "dataloader": { + "num_worker": 0, + "pin_memory": true + } + }, + "model_params": { + "causal": true, + "lstm": 2, + "norm_f0": true, + "use_gr_content_f0": false, + "use_gr_prosody_phone": false, + "use_gr_timbre_prosody": false, + "separate_prosody_encoder": true, + "n_c_codebooks": 2, + "timbre_norm": true, + "use_gr_content_global_f0": true, + "DAC": { + "encoder_dim": 64, + "encoder_rates": [2, 5, 5, 6], + "decoder_dim": 1536, + "decoder_rates": [6, 5, 5, 2], + "sr": 24000 + } + }, + "loss_params": { + "base_lr": 0.0001, + "warmup_steps": 200, + "discriminator_iter_start": 2000, + "lambda_spk": 1.0, + "lambda_mel": 45, + "lambda_f0": 1.0, + "lambda_uv": 1.0 + } +} \ No newline at end of file diff --git a/egs/codec/FAcodec/train.sh b/egs/codec/FAcodec/train.sh new file mode 100644 index 0000000000000000000000000000000000000000..88d5411f0740671114977a9dafa24149866cf904 --- /dev/null +++ b/egs/codec/FAcodec/train.sh @@ -0,0 +1,27 @@ +export PYTHONPATH="./" + +######## Build Experiment Environment ########### +exp_dir="./egs/codecs/FAcodec" +echo exp_dir: $exp_dir +work_dir="./" # Amphion root folder +echo work_dir: $work_dir + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Set Config File Dir ############## +if [ -z "$exp_config" ]; then + exp_config="${exp_dir}"/exp_libritts.json +fi +echo "Exprimental Configuration File: $exp_config" + +######## Set the experiment name ########## +exp_name="facodec" + +port=53333 # a random number for port + +######## Train Model ########### +echo "Experiment Name: $exp_name" +accelerate launch --main_process_port $port "${work_dir}"/bins/codec/train.py --config $exp_config \ +--exp_name $exp_name --log_level debug $1 diff --git a/egs/datasets/README.md b/egs/datasets/README.md new file mode 100644 index 0000000000000000000000000000000000000000..426a4754822f3fa74d24e02050b57f791c1d91a2 --- /dev/null +++ b/egs/datasets/README.md @@ -0,0 +1,458 @@ +# Datasets Format + +Amphion support the following academic datasets (sort alphabetically): + +- [Datasets Format](#datasets-format) + - [AudioCaps](#audiocaps) + - [CSD](#csd) + - [CustomSVCDataset](#customsvcdataset) + - [Hi-Fi TTS](#hifitts) + - [KiSing](#kising) + - [LibriLight](#librilight) + - [LibriTTS](#libritts) + - [LJSpeech](#ljspeech) + - [M4Singer](#m4singer) + - [NUS-48E](#nus-48e) + - [Opencpop](#opencpop) 
+ - [OpenSinger](#opensinger) + - [Opera](#opera) + - [PopBuTFy](#popbutfy) + - [PopCS](#popcs) + - [PJS](#pjs) + - [SVCC](#svcc) + - [VCTK](#vctk) + +The downloading link and the file structure tree of each dataset is displayed as follows. + +> **Note:** When using Docker to run Amphion, mount the dataset to the container is necessary after downloading. Check [Mount dataset in Docker container](./docker.md) for more details. + +## AudioCaps + +AudioCaps is a dataset of around 44K audio-caption pairs, where each audio clip corresponds to a caption with rich semantic information. + +Download AudioCaps dataset [here](https://github.com/cdjkim/audiocaps). The file structure looks like below: + +```plaintext +[AudioCaps dataset path] +┣ AudioCpas +┃ ┣ wav +┃ ┃ ┣ ---1_cCGK4M_0_10000.wav +┃ ┃ ┣ ---lTs1dxhU_30000_40000.wav +┃ ┃ ┣ ... +``` + +## CSD + +Download the official CSD dataset [here](https://zenodo.org/records/4785016). The file structure looks like below: + +```plaintext +[CSD dataset path] + ┣ english + ┣ korean + ┣ utterances + ┃ ┣ en001a + ┃ ┃ ┣ {UtterenceID}.wav + ┃ ┣ en001b + ┃ ┣ en002a + ┃ ┣ en002b + ┃ ┣ ... + ┣ README +``` + +## CustomSVCDataset + +We support custom dataset for Singing Voice Conversion. Organize your data in the following structure to construct your own dataset: + +```plaintext +[Your Custom Dataset Path] + ┣ singer1 + ┃ ┣ song1 + ┃ ┃ ┣ utterance1.wav + ┃ ┃ ┣ utterance2.wav + ┃ ┃ ┣ ... + ┃ ┣ song2 + ┃ ┣ ... + ┣ singer2 + ┣ ... +``` + + +## Hi-Fi TTS + +Download the official Hi-Fi TTS dataset [here](https://www.openslr.org/109/). The file structure looks like below: + +```plaintext +[Hi-Fi TTS dataset path] + ┣ audio + ┃ ┣ 11614_other {Speaker_ID}_{SNR_subset} + ┃ ┃ ┣ 10547 {Book_ID} + ┃ ┃ ┃ ┣ thousandnights8_04_anonymous_0001.flac + ┃ ┃ ┃ ┣ thousandnights8_04_anonymous_0003.flac + ┃ ┃ ┃ ┣ thousandnights8_04_anonymous_0004.flac + ┃ ┃ ┃ ┣ ... + ┃ ┃ ┣ ... + ┃ ┣ ... + ┣ 92_manifest_clean_dev.json + ┣ 92_manifest_clean_test.json + ┣ 92_manifest_clean_train.json + ┣ ... + ┣ {Speaker_ID}_manifest_{SNR_subset}_{dataset_split}.json + ┣ ... + ┣ books_bandwidth.tsv + ┣ LICENSE.txt + ┣ readers_books_clean.txt + ┣ readers_books_other.txt + ┣ README.txt + +``` + +## KiSing + +Download the official KiSing dataset [here](http://shijt.site/index.php/2021/05/16/kising-the-first-open-source-mandarin-singing-voice-synthesis-corpus/). The file structure looks like below: + +```plaintext +[KiSing dataset path] + ┣ clean + ┃ ┣ 421 + ┃ ┣ 422 + ┃ ┣ ... +``` + +## LibriLight + +Download the official LibriLight dataset [here](https://github.com/facebookresearch/libri-light). The file structure looks like below: + +```plaintext +[LibriTTS dataset path] + ┣ small (Subset) + ┃ ┣ 100 {Speaker_ID} + ┃ ┃ ┣ sea_fairies_0812_librivox_64kb_mp3 {Chapter_ID} + ┃ ┃ ┃ ┣ 01_baum_sea_fairies_64kb.flac + ┃ ┃ ┃ ┣ 02_baum_sea_fairies_64kb.flac + ┃ ┃ ┃ ┣ 03_baum_sea_fairies_64kb.flac + ┃ ┃ ┃ ┣ 22_baum_sea_fairies_64kb.flac + ┃ ┃ ┃ ┣ 01_baum_sea_fairies_64kb.json + ┃ ┃ ┃ ┣ 02_baum_sea_fairies_64kb.json + ┃ ┃ ┃ ┣ 03_baum_sea_fairies_64kb.json + ┃ ┃ ┃ ┣ 22_baum_sea_fairies_64kb.json + ┃ ┃ ┃ ┣ ... + ┃ ┃ ┣ ... + ┃ ┣ ... + ┣ medium (Subset) + ┣ ... +``` + +## LibriTTS + +Download the official LibriTTS dataset [here](https://www.openslr.org/60/). 
The file structure looks like below: + +```plaintext +[LibriTTS dataset path] + ┣ BOOKS.txt + ┣ CHAPTERS.txt + ┣ eval_sentences10.tsv + ┣ LICENSE.txt + ┣ NOTE.txt + ┣ reader_book.tsv + ┣ README_librispeech.txt + ┣ README_libritts.txt + ┣ speakers.tsv + ┣ SPEAKERS.txt + ┣ dev-clean (Subset) + ┃ ┣ 1272{Speaker_ID} + ┃ ┃ ┣ 128104 {Chapter_ID} + ┃ ┃ ┃ ┣ 1272_128104_000001_000000.normalized.txt + ┃ ┃ ┃ ┣ 1272_128104_000001_000000.original.txt + ┃ ┃ ┃ ┣ 1272_128104_000001_000000.wav + ┃ ┃ ┃ ┣ ... + ┃ ┃ ┃ ┣ 1272_128104.book.tsv + ┃ ┃ ┃ ┣ 1272_128104.trans.tsv + ┃ ┃ ┣ ... + ┃ ┣ ... + ┣ dev-other (Subset) + ┃ ┣ 116 (Speaker) + ┃ ┃ ┣ 288045 {Chapter_ID} + ┃ ┃ ┃ ┣ 116_288045_000003_000000.normalized.txt + ┃ ┃ ┃ ┣ 116_288045_000003_000000.original.txt + ┃ ┃ ┃ ┣ 116_288045_000003_000000.wav + ┃ ┃ ┃ ┣ ... + ┃ ┃ ┃ ┣ 116_288045.book.tsv + ┃ ┃ ┃ ┣ 116_288045.trans.tsv + ┃ ┃ ┣ ... + ┃ ┣ ... + ┃ ┣ ... + ┣ test-clean (Subset) + ┃ ┣ {Speaker_ID} + ┃ ┃ ┣ {Chapter_ID} + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav + ┃ ┃ ┃ ┣ ... + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv + ┃ ┃ ┣ ... + ┃ ┣ ... + ┣ test-other + ┃ ┣ {Speaker_ID} + ┃ ┃ ┣ {Chapter_ID} + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav + ┃ ┃ ┃ ┣ ... + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv + ┃ ┃ ┣ ... + ┃ ┣ ... + ┣ train-clean-100 + ┃ ┣ {Speaker_ID} + ┃ ┃ ┣ {Chapter_ID} + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav + ┃ ┃ ┃ ┣ ... + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv + ┃ ┃ ┣ ... + ┃ ┣ ... + ┣ train-clean-360 + ┃ ┣ {Speaker_ID} + ┃ ┃ ┣ {Chapter_ID} + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav + ┃ ┃ ┃ ┣ ... + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv + ┃ ┃ ┣ ... + ┃ ┣ ... + ┣ train-other-500 + ┃ ┣ {Speaker_ID} + ┃ ┃ ┣ {Chapter_ID} + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav + ┃ ┃ ┃ ┣ ... + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv + ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv + ┃ ┃ ┣ ... + ┃ ┣ ... +``` + +## LJSpeech + +Download the official LJSpeech dataset [here](https://keithito.com/LJ-Speech-Dataset/). The file structure looks like below: + +```plaintext +[LJSpeech dataset path] + ┣ metadata.csv + ┣ wavs + ┃ ┣ LJ001-0001.wav + ┃ ┣ LJ001-0002.wav + ┃ ┣ ... + ┣ README +``` + +## M4Singer + +Download the official M4Singer dataset [here](https://drive.google.com/file/d/1xC37E59EWRRFFLdG3aJkVqwtLDgtFNqW/view). The file structure looks like below: + +```plaintext +[M4Singer dataset path] + ┣ {Singer_1}#{Song_1} + ┃ ┣ 0000.mid + ┃ ┣ 0000.TextGrid + ┃ ┣ 0000.wav + ┃ ┣ ... + ┣ {Singer_1}#{Song_2} + ┣ ... + ┣ {Singer_2}#{Song_1} + ┣ {Singer_2}#{Song_2} + ┣ ... 
+ ┗ meta.json +``` + +## NUS-48E + +Download the official NUS-48E dataset [here](https://drive.google.com/drive/folders/12pP9uUl0HTVANU3IPLnumTJiRjPtVUMx). The file structure looks like below: + +```plaintext +[NUS-48E dataset path] + ┣ {SpeakerID} + ┃ ┣ read + ┃ ┃ ┣ {SongID}.txt + ┃ ┃ ┣ {SongID}.wav + ┃ ┃ ┣ ... + ┃ ┣ sing + ┃ ┃ ┣ {SongID}.txt + ┃ ┃ ┣ {SongID}.wav + ┃ ┃ ┣ ... + ┣ ... + ┣ README.txt + +``` + +## Opencpop + +Download the official Opencpop dataset [here](https://wenet.org.cn/opencpop/). The file structure looks like below: + +```plaintext +[Opencpop dataset path] + ┣ midis + ┃ ┣ 2001.midi + ┃ ┣ 2002.midi + ┃ ┣ 2003.midi + ┃ ┣ ... + ┣ segments + ┃ ┣ wavs + ┃ ┃ ┣ 2001000001.wav + ┃ ┃ ┣ 2001000002.wav + ┃ ┃ ┣ 2001000003.wav + ┃ ┃ ┣ ... + ┃ ┣ test.txt + ┃ ┣ train.txt + ┃ ┗ transcriptions.txt + ┣ textgrids + ┃ ┣ 2001.TextGrid + ┃ ┣ 2002.TextGrid + ┃ ┣ 2003.TextGrid + ┃ ┣ ... + ┣ wavs + ┃ ┣ 2001.wav + ┃ ┣ 2002.wav + ┃ ┣ 2003.wav + ┃ ┣ ... + ┣ TERMS_OF_ACCESS + ┗ readme.md +``` + +## OpenSinger + +Download the official OpenSinger dataset [here](https://drive.google.com/file/d/1EofoZxvalgMjZqzUEuEdleHIZ6SHtNuK/view). The file structure looks like below: + +```plaintext +[OpenSinger dataset path] + ┣ ManRaw + ┃ ┣ {Singer_1}_{Song_1} + ┃ ┃ ┣ {Singer_1}_{Song_1}_0.lab + ┃ ┃ ┣ {Singer_1}_{Song_1}_0.txt + ┃ ┃ ┣ {Singer_1}_{Song_1}_0.wav + ┃ ┃ ┣ ... + ┃ ┣ {Singer_1}_{Song_2} + ┃ ┣ ... + ┣ WomanRaw + ┣ LICENSE + ┗ README.md +``` + +## Opera + +Download the official Opera dataset [here](http://isophonics.net/SingingVoiceDataset). The file structure looks like below: + +```plaintext +[Opera dataset path] + ┣ monophonic + ┃ ┣ chinese + ┃ ┃ ┣ {Gender}_{SingerID} + ┃ ┃ ┃ ┣ {Emotion}_{SongID}.wav + ┃ ┃ ┃ ┣ ... + ┃ ┃ ┣ ... + ┃ ┣ western + ┣ polyphonic + ┃ ┣ chinese + ┃ ┣ western + ┣ CrossculturalDataSet.xlsx +``` + +## PopBuTFy + +Download the official PopBuTFy dataset [here](https://github.com/MoonInTheRiver/NeuralSVB). The file structure looks like below: + +```plaintext +[PopBuTFy dataset path] + ┣ data + ┃ ┣ {SingerID}#singing#{SongName}_Amateur + ┃ ┃ ┣ {SingerID}#singing#{SongName}_Amateur_{UtteranceID}.mp3 + ┃ ┃ ┣ ... + ┃ ┣ {SingerID}#singing#{SongName}_Professional + ┃ ┃ ┣ {SingerID}#singing#{SongName}_Professional_{UtteranceID}.mp3 + ┃ ┃ ┣ ... + ┣ text_labels + ┗ TERMS_OF_ACCESS +``` + +## PopCS + +Download the official PopCS dataset [here](https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/apply_form.md). The file structure looks like below: + +```plaintext +[PopCS dataset path] + ┣ popcs + ┃ ┣ popcs-{SongName} + ┃ ┃ ┣ {UtteranceID}_ph.txt + ┃ ┃ ┣ {UtteranceID}_wf0.wav + ┃ ┃ ┣ {UtteranceID}.TextGrid + ┃ ┃ ┣ {UtteranceID}.txt + ┃ ┃ ┣ ... + ┃ ┣ ... + ┗ TERMS_OF_ACCESS +``` + +## PJS + +Download the official PJS dataset [here](https://sites.google.com/site/shinnosuketakamichi/research-topics/pjs_corpus). The file structure looks like below: + +```plaintext +[PJS dataset path] + ┣ PJS_corpus_ver1.1 + ┃ ┣ background_noise + ┃ ┣ pjs{SongID} + ┃ ┃ ┣ pjs{SongID}_song.wav + ┃ ┃ ┣ pjs{SongID}_speech.wav + ┃ ┃ ┣ pjs{SongID}.lab + ┃ ┃ ┣ pjs{SongID}.mid + ┃ ┃ ┣ pjs{SongID}.musicxml + ┃ ┃ ┣ pjs{SongID}.txt + ┃ ┣ ... +``` + +## SVCC + +Download the official SVCC dataset [here](https://github.com/lesterphillip/SVCC23_FastSVC/tree/main/egs/generate_dataset). The file structure looks like below: + +```plaintext +[SVCC dataset path] + ┣ Data + ┃ ┣ CDF1 + ┃ ┃ ┣ 10001.wav + ┃ ┃ ┣ 10002.wav + ┃ ┃ ┣ ... 
+ ┃ ┣ CDM1 + ┃ ┣ IDF1 + ┃ ┣ IDM1 + ┗ README.md +``` + +## VCTK + +Download the official VCTK dataset [here](https://datashare.ed.ac.uk/handle/10283/3443). The file structure looks like below: + +```plaintext +[VCTK dataset path] + ┣ txt + ┃ ┣ {Speaker_1} + ┃ ┃ ┣ {Speaker_1}_001.txt + ┃ ┃ ┣ {Speaker_1}_002.txt + ┃ ┃ ┣ ... + ┃ ┣ {Speaker_2} + ┃ ┣ ... + ┣ wav48_silence_trimmed + ┃ ┣ {Speaker_1} + ┃ ┃ ┣ {Speaker_1}_001_mic1.flac + ┃ ┃ ┣ {Speaker_1}_001_mic2.flac + ┃ ┃ ┣ {Speaker_1}_002_mic1.flac + ┃ ┃ ┣ ... + ┃ ┣ {Speaker_2} + ┃ ┣ ... + ┣ speaker-info.txt + ┗ update.txt +``` diff --git a/egs/datasets/docker.md b/egs/datasets/docker.md new file mode 100644 index 0000000000000000000000000000000000000000..47d285077cc8f0864acf1b2e656261056f8101c1 --- /dev/null +++ b/egs/datasets/docker.md @@ -0,0 +1,19 @@ +# Mount dataset in Docker container + +When using Docker to run Amphion, mount the dataset to the container first is needed. It is recommend to mounte dataset to `/mnt/` in the container, where `` is the name of the dataset. + +When configuring the dataset in `exp_config.json`, you should use the path `/mnt/` as the dataset path instead of the actual path on your host machine. Otherwise, the dataset will not be found in the container. + +## Mount Example + +```bash +docker run --runtime=nvidia --gpus all -it -v .:/app -v :/mnt/ -v :/mnt/ amphion +``` + +For example, if you want to use the `LJSpeech` dataset, you can mount the dataset to `/mnt/LJSpeech` in the container. + +```bash +docker run --runtime=nvidia --gpus all -it -v .:/app -v /home/username/datasets/LJSpeech:/mnt/LJSpeech amphion +``` + +If you want to use multiple datasets, you can mount them to different directories in the container by adding more `-v` options. diff --git a/egs/metrics/README.md b/egs/metrics/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d41515464f7ea3b31b1b96edde7d41bd75e31f4f --- /dev/null +++ b/egs/metrics/README.md @@ -0,0 +1,174 @@ +# Amphion Evaluation Recipe + +## Supported Evaluation Metrics + +Until now, Amphion Evaluation has supported the following objective metrics: + +- **F0 Modeling**: + - F0 Pearson Coefficients (FPC) + - F0 Periodicity Root Mean Square Error (PeriodicityRMSE) + - F0 Root Mean Square Error (F0RMSE) + - Voiced/Unvoiced F1 Score (V/UV F1) +- **Energy Modeling**: + - Energy Root Mean Square Error (EnergyRMSE) + - Energy Pearson Coefficients (EnergyPC) +- **Intelligibility**: + - Character Error Rate (CER) based on [Whipser](https://github.com/openai/whisper) + - Word Error Rate (WER) based on [Whipser](https://github.com/openai/whisper) +- **Spectrogram Distortion**: + - Frechet Audio Distance (FAD) + - Mel Cepstral Distortion (MCD) + - Multi-Resolution STFT Distance (MSTFT) + - Perceptual Evaluation of Speech Quality (PESQ) + - Short Time Objective Intelligibility (STOI) + - Scale Invariant Signal to Distortion Ratio (SISDR) + - Scale Invariant Signal to Noise Ratio (SISNR) +- **Speaker Similarity**: + - Cosine similarity based on: + - [Rawnet3](https://github.com/Jungjee/RawNet) + - [Resemblyzer](https://github.com/resemble-ai/Resemblyzer) + - [WavLM](https://huggingface.co/microsoft/wavlm-base-plus-sv) + +We provide a recipe to demonstrate how to objectively evaluate your generated audios. There are three steps in total: + +1. Pretrained Models Preparation +2. Audio Data Preparation +3. Evaluation + +## 1. 
Pretrained Models Preparation + +If you want to calculate `RawNet3` based speaker similarity, you need to download the pretrained model first, as illustrated [here](../../pretrained/README.md). + +## 2. Audio Data Preparation + +Prepare reference audios and generated audios in two folders, the `ref_dir` contains the reference audio and the `gen_dir` contains the generated audio. Here is an example. + +```plaintext + ┣ {ref_dir} + ┃ ┣ sample1.wav + ┃ ┣ sample2.wav + ┣ {gen_dir} + ┃ ┣ sample1.wav + ┃ ┣ sample2.wav +``` + +You have to make sure that the pairwise **reference audio and generated audio are named the same**, as illustrated above (sample1 to sample1, sample2 to sample2). + +## 3. Evaluation + +Run the `run.sh` with specified refenrece folder, generated folder, dump folder and metrics. + +```bash +cd Amphion +sh egs/metrics/run.sh \ + --reference_folder [Your path to the reference audios] \ + --generated_folder [Your path to the generated audios] \ + --dump_folder [Your path to dump the objective results] \ + --metrics [The metrics you need] \ + --fs [Optional. To calculate all metrics in the specified sampling rate] \ + --similarity_model [Optional. To choose the model for calculating the speaker similarity. Currently "rawnet", "wavlm" and "resemblyzer" are available. Default to "wavlm"] \ + --similarity_mode [Optional. To choose the mode for calculating the speaker similarity. "pairwith" for calculating a series of ground truth / prediction audio pairs to obtain the speaker similarity, and "overall" for computing the average score with all possible pairs between the refernece folder and generated folder. Default to "pairwith"] \ + --intelligibility_mode [Optionoal. To choose the mode for computing CER and WER. "gt_audio" means selecting the recognition content of the reference audio as the target, "gt_content" means using transcription as the target. Default to "gt_audio"] \ + --ltr_path [Optional. Path to the transcription file] \ + --language [Optional. Language for computing CER and WER. Default to "english"] +``` + +As for the metrics, an example is provided below: + +```bash +--metrics "mcd pesq fad" +``` + +All currently available metrics keywords are listed below: + +| Keys | Description | +| ------------------------- | ------------------------------------------ | +| `fpc` | F0 Pearson Coefficients | +| `f0_periodicity_rmse` | F0 Periodicity Root Mean Square Error | +| `f0rmse` | F0 Root Mean Square Error | +| `v_uv_f1` | Voiced/Unvoiced F1 Score | +| `energy_rmse` | Energy Root Mean Square Error | +| `energy_pc` | Energy Pearson Coefficients | +| `cer` | Character Error Rate | +| `wer` | Word Error Rate | +| `similarity` | Speaker Similarity +| `fad` | Frechet Audio Distance | +| `mcd` | Mel Cepstral Distortion | +| `mstft` | Multi-Resolution STFT Distance | +| `pesq` | Perceptual Evaluation of Speech Quality | +| `si_sdr` | Scale Invariant Signal to Distortion Ratio | +| `si_snr` | Scale Invariant Signal to Noise Ratio | +| `stoi` | Short Time Objective Intelligibility | + +For example, if want to calculate the speaker similarity between the synthesized audio and the reference audio with the same content, run: + +```bash +sh egs/metrics/run.sh \ + --reference_folder [Your path to the reference audios] \ + --generated_folder [Your path to the generated audios] \ + --dump_folder [Your path to dump the objective results] \ + --metrics "similarity" \ + --similarity_model [Optional. To choose the model for calculating the speaker similarity. 
Currently "rawnet", "wavlm" and "resemblyzer" are available. Default to "wavlm"] \ + --similarity_mode "pairwith" \ +``` + +If you don't have the reference audio with the same content, run the following to get the conteng-free similarity score: + +```bash +sh egs/metrics/run.sh \ + --reference_folder [Your path to the reference audios] \ + --generated_folder [Your path to the generated audios] \ + --dump_folder [Your path to dump the objective results] \ + --metrics "similarity" \ + --similarity_model [Optional. To choose the model for calculating the speaker similarity. Currently "rawnet", "wavlm" and "resemblyzer" are available. Default to "wavlm"] \ + --similarity_mode "overall" \ +``` + +## Troubleshooting +### FAD (Using Offline Models) +If your system is unable to access huggingface.co from the terminal, you might run into an error like "OSError: Can't load tokenizer for ...". To work around this, follow these steps to use local models: + +1. Download the [bert-base-uncased](https://huggingface.co/bert-base-uncased), [roberta-base](https://huggingface.co/roberta-base), and [facebook/bart-base](https://huggingface.co/facebook/bart-base) models from `huggingface.co`. Ensure that the models are complete and uncorrupted. Place these directories within `Amphion/pretrained`. For a detailed file structure reference, see [This README](../../pretrained/README.md#optional-model-dependencies-for-evaluation) under `Amphion/pretrained`. +2. Inside the `Amphion/pretrained` directory, create a bash script with the content outlined below. This script will automatically update the tokenizer paths used by your system: + ```bash + #!/bin/bash + + BERT_DIR="bert-base-uncased" + ROBERTA_DIR="roberta-base" + BART_DIR="facebook/bart-base" + PYTHON_SCRIPT="[YOUR ENV PATH]/lib/python3.9/site-packages/laion_clap/training/data.py" + + update_tokenizer_path() { + local dir_name=$1 + local tokenizer_variable=$2 + local full_path + + if [ -d "$dir_name" ]; then + full_path=$(realpath "$dir_name") + if [ -f "$PYTHON_SCRIPT" ]; then + sed -i "s|${tokenizer_variable}.from_pretrained(\".*\")|${tokenizer_variable}.from_pretrained(\"$full_path\")|" "$PYTHON_SCRIPT" + echo "Updated ${tokenizer_variable} path to $full_path." + else + echo "Error: The specified Python script does not exist." + exit 1 + fi + else + echo "Error: The directory $dir_name does not exist in the current directory." + exit 1 + fi + } + + update_tokenizer_path "$BERT_DIR" "BertTokenizer" + update_tokenizer_path "$ROBERTA_DIR" "RobertaTokenizer" + update_tokenizer_path "$BART_DIR" "BartTokenizer" + + echo "BERT, BART and RoBERTa Python script paths have been updated." + + ``` + +3. The script provided is intended to adjust the tokenizer paths in the `data.py` file, found under `/lib/python3.9/site-packages/laion_clap/training/`, within your specific environment. For those utilizing conda, you can determine your environment path by running `conda info --envs`. Then, substitute `[YOUR ENV PATH]` in the script with this path. If your environment is configured differently, you'll need to update the `PYTHON_SCRIPT` variable to correctly point to the `data.py` file. +4. Run the script. If it executes successfully, the tokenizer paths will be updated, allowing them to be loaded locally. 
+ +### WavLM-based Speaker Similarity (Using Offline Models) + +If your system is unable to access huggingface.co from the terminal and you want to calculate `WavLM` based speaker similarity, you need to download the pretrained model first, as illustrated [here](../../pretrained/README.md). \ No newline at end of file diff --git a/egs/metrics/run.sh b/egs/metrics/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..47d735ef577c90175b8a9e9055bc9fd5d94311de --- /dev/null +++ b/egs/metrics/run.sh @@ -0,0 +1,132 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $exp_dir)) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Parse the Given Parameters from the Commond ########### +options=$(getopt -o c:n:s --long gpu:,reference_folder:,generated_folder:,dump_folder:,metrics:,fs:,align_method:,energy_db_scale:,f0_subtract_mean:,similarity_model:,similarity_mode:,ltr_path:,intelligibility_mode:,language: -- "$@") +eval set -- "$options" + +while true; do + case $1 in + # Visible GPU machines. The default value is "0". + --gpu) shift; gpu=$1 ; shift ;; + # Reference Audio Folder + --reference_folder) shift; ref_dir=$1 ; shift ;; + # Generated Audio Folder + --generated_folder) shift; deg_dir=$1 ; shift ;; + # Result Dumping Folder + --dump_folder) shift; dump_dir=$1 ; shift ;; + # Metrics to Compute + --metrics) shift; metrics=$1 ; shift ;; + # Sampling Rate + --fs) shift; fs=$1 ; shift ;; + + # Method for aligning F0. The default value is "cut" + --align_method) shift; align_method=$1 ; shift ;; + # Method for normalizing F0. The default value is "True" + --f0_subtract_mean) shift; f0_subtract_mean=$1 ; shift ;; + # Method for normalizing Energy. The default value is "True" + --energy_db_scale) shift; energy_db_scale=$1 ; shift ;; + + # Model for computing speaker similarity. The default value is "wavlm" + --similarity_model) shift; similarity_model=$1 ; shift ;; + # Mode for computing speaker similarity. The default value is "pairwith" + --similarity_mode) shift; similarity_mode=$1 ; shift ;; + + # Path for the transcript. + --ltr_path) shift; ltr_path=$1 ; shift ;; + # Mode for computing CER and WER. The default value is "gt_audio" + --intelligibility_mode) shift; intelligibility_mode=$1 ; shift ;; + # Language for computing CER and WER. 
The default value is "english" + --language) shift; language=$1 ; shift ;; + + --) shift ; break ;; + *) echo "Invalid option: $1" exit 1 ;; + esac +done + +### Value check ### +if [ -z "$ref_dir" ]; then + echo "[Error] Please specify the reference_folder" + exit 1 +fi + +if [ -z "$deg_dir" ]; then + echo "[Error] Please specify the generated_folder" + exit 1 +fi + +if [ -z "$dump_dir" ]; then + echo "[Error] Please specify the dump_folder" + exit 1 +fi + +if [ -z "$metrics" ]; then + echo "[Error] Please specify the metrics" + exit 1 +fi + +if [ -z "$gpu" ]; then + gpu="0" +fi + +if [ -z "$fs" ]; then + fs="None" +fi + +if [ -z "$align_method" ]; then + align_method="dtw" +fi + +if [ -z "$energy_db_scale" ]; then + energy_db_scale="True" +fi + +if [ -z "$f0_subtract_mean" ]; then + f0_subtract_mean="True" +fi + +if [ -z "$similarity_model" ]; then + similarity_model="wavlm" +fi + +if [ -z "$similarity_mode" ]; then + similarity_mode="pairwith" +fi + +if [ -z "$ltr_path" ]; then + ltr_path="None" +fi + +if [ -z "$intelligibility_mode" ]; then + intelligibility_mode="gt_audio" +fi + +if [ -z "$language" ]; then + language="english" +fi + +######## Calculate Objective Metrics ########### +CUDA_VISIBLE_DEVICES=$gpu python "$work_dir"/bins/calc_metrics.py \ + --ref_dir $ref_dir \ + --deg_dir $deg_dir \ + --dump_dir $dump_dir \ + --metrics $metrics \ + --fs $fs \ + --align_method $align_method \ + --db_scale $energy_db_scale \ + --f0_subtract_mean $f0_subtract_mean \ + --similarity_model $similarity_model \ + --similarity_mode $similarity_mode \ + --ltr_path $ltr_path \ + --intelligibility_mode $intelligibility_mode \ + --language $language \ No newline at end of file diff --git a/egs/svc/DiffComoSVC/README.md b/egs/svc/DiffComoSVC/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e421d9dcb3071e0763ee75298a5b4f2babfb8ecf --- /dev/null +++ b/egs/svc/DiffComoSVC/README.md @@ -0,0 +1,234 @@ +# Accelerating Diffusion-based Singing Voice Conversion through Consistency Distillation +
+*(Architecture overview figure)*
+
+This is an implementation of [Consistency Models](https://arxiv.org/abs/2303.01469) for accelerating diffusion-based singing voice conversion. The overall architecture follows "[Leveraging Diverse Semantic-based Audio Pretrained Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (2024 IEEE Spoken Language Technology Workshop), with only a slight modification applied to the acoustic model. Specifically,
+
+* The acoustic model consists of a conformer, which generates a coarse spectrogram, and a diffusion decoder based on a Bidirectional Non-Causal Dilated CNN, which refines the coarse spectrogram. This is similar to [CoMoSpeech: One-Step Speech and Singing Voice Synthesis via Consistency Model](https://comospeech.github.io/)
+* To accelerate the diffusion model, we apply consistency distillation from [Consistency Models](https://arxiv.org/abs/2303.01469). For the teacher model, the diffusion schedule of the diffusion decoder follows [Karras diffusion](https://arxiv.org/abs/2206.00364). When distilling the teacher model, the condition encoder and the conformer part of the acoustic model are frozen, while the diffusion decoder is updated via exponential moving average. See the figure above for details.
+
+There are five stages in total:
+
+1. Data preparation
+2. Features extraction
+3. Teacher Model Training
+4. Consistency Distillation
+5. Inference/conversion
+
+## 1. Data Preparation
+
+### Dataset Download
+
+By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
+
+### Configuration
+
+Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
+
+```json
+    "dataset": [
+        "m4singer",
+        "opencpop",
+        "opensinger",
+        "svcc",
+        "vctk"
+    ],
+    "dataset_path": {
+        // TODO: Fill in your dataset path
+        "m4singer": "[M4Singer dataset path]",
+        "opencpop": "[Opencpop dataset path]",
+        "opensinger": "[OpenSinger dataset path]",
+        "svcc": "[SVCC dataset path]",
+        "vctk": "[VCTK dataset path]"
+    },
+```
+
+## 2. Features Extraction
+
+### Content-based Pretrained Models Download
+
+By default, we utilize Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
+
+### Configuration
+
+Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
+
+```json
+    // TODO: Fill in the output log path
+    "log_dir": "[Your path to save logs and checkpoints]",
+    "preprocess": {
+        // TODO: Fill in the output data path
+        "processed_dir": "[Your path to save processed data]",
+        ...
+    },
+```
+
+### Run
+
+Run the `run.sh` as the preprocessing stage (set `--stage 1`).
+
+```bash
+cd Amphion
+sh egs/svc/DiffComoSVC/run.sh --stage 1
+```
+
+Note: The `CUDA_VISIBLE_DEVICES` is set as `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.
+
+## 3.
Teacher Model Training + +### Configuration + +Set the `distill` in `config/comosvc.json` to `false` for teacher model training, you can also specify the detailed configuration for conformer encoder and diffusion process here: + +```JSON +"comosvc":{ + "distill": false, + // conformer encoder + "input_dim": 384, + "output_dim": 100, + "n_heads": 2, + "n_layers": 6, + "filter_channels":512, + // karras diffusion + "P_mean": -1.2, + "P_std": 1.2, + "sigma_data": 0.5, + "sigma_min": 0.002, + "sigma_max": 80, + "rho": 7, + "n_timesteps": 40, + }, +``` + +We provide the default hyparameters in the `exp_config.json`. They can work on single NVIDIA-24g GPU. You can adjust them based on you GPU machines. + +```json +"train": { + "batch_size": 32, + ... + "adamw": { + "lr": 2.0e-4 + }, + ... + } +``` + +### Run + +Run the `run.sh` as the training stage (set `--stage 2`). Specify a experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `[Your path to save logs and checkpoints]/[YourExptName]`. + +```bash +cd Amphion +sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName] +``` + +Note: The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can specify it when running `run.sh` such as: + +```bash +cd Amphion +sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName] --gpu "0,1,2,3" +``` + +## 4. Consistency Distillation + +### Configuration + +Set the `distill` in `config/comosvc.json` to `true` for teacher model training, and specify the `teacher_model_path` for consistency distillation. You can also specify the detailed configuration for conformer encoder and diffusion process here: + +```JSON +"model": { + "teacher_model_path":"[Your_teacher_model_checkpoint].bin", + ... + "comosvc":{ + "distill": true, + // conformer encoder + "input_dim": 384, + "output_dim": 100, + "n_heads": 2, + "n_layers": 6, + "filter_channels":512, + // karras diffusion + "P_mean": -1.2, + "P_std": 1.2, + "sigma_data": 0.5, + "sigma_min": 0.002, + "sigma_max": 80, + "rho": 7, + "n_timesteps": 40, + }, +``` + +We provide the default hyparameters in the `exp_config.json`. They can work on single NVIDIA-24g GPU. You can adjust them based on you GPU machines. + +```json +"train": { + "batch_size": 32, + ... + "adamw": { + "lr": 2.0e-4 + }, + ... + } +``` + +### Run + +Run the `run.sh` as the training stage (set `--stage 2`). Specify a experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `[Your path to save logs and checkpoints]/[YourExptName]`. + +```bash +cd Amphion +sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName] +``` + +Note: The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can specify it when running `run.sh` such as: + +```bash +cd Amphion +sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName] --gpu "0,1,2,3" +``` + +## 5. Inference/Conversion + +### Pretrained Vocoder Download + +We fine-tune the official BigVGAN pretrained model with over 120 hours singing voice data. The benifits of fine-tuning has been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`). 
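+
+As a conceptual aside before running conversion: the speed-up from consistency distillation comes from the number of sampling steps at inference time. The sketch below is **not** the Amphion implementation (`denoise_fn` and `consistency_fn` are placeholder callables); it only illustrates why the teacher needs `inference_steps` (e.g. 40) Karras-schedule steps while the distilled student generates in a single call.
+
+```python
+# Conceptual sketch only: the teacher iterates over a Karras et al. (2022) noise
+# schedule, while the distilled consistency model maps noise to data in one step.
+import torch
+
+def karras_sigmas(n_steps, sigma_min=0.002, sigma_max=80.0, rho=7.0):
+    # Defaults mirror the sigma_min / sigma_max / rho values in the config above.
+    ramp = torch.linspace(0, 1, n_steps)
+    return (sigma_max ** (1 / rho) + ramp * (sigma_min ** (1 / rho) - sigma_max ** (1 / rho))) ** rho
+
+@torch.no_grad()
+def teacher_sample(denoise_fn, shape, steps=40):
+    sigmas = karras_sigmas(steps)
+    x = torch.randn(shape) * sigmas[0]
+    for i in range(steps - 1):
+        d = (x - denoise_fn(x, sigmas[i])) / sigmas[i]   # Euler step toward the denoised estimate
+        x = x + d * (sigmas[i + 1] - sigmas[i])
+    return x
+
+@torch.no_grad()
+def student_sample(consistency_fn, shape, sigma_max=80.0):
+    # One-step generation: the consistency model maps heavily-noised input directly to a sample.
+    return consistency_fn(torch.randn(shape) * sigma_max, sigma_max)
+```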
+ +### Run + +For inference/conversion, you need to specify the following configurations when running `run.sh`: + +| Parameters | Description | Example | +| --------------------------------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | +| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `[Your path to save logs and checkpoints]/[YourExptName]` | +| `--infer_output_dir` | The output directory to save inferred audios. | `[Your path to save logs and checkpoints]/[YourExptName]/result` | +| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `[Your path to save processed data]/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). | +| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `[Your path to save logs and checkpoints]/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. | +| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshfit"` (by default), `3`, `-3`, etc. | + +For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run: + +```bash +cd Amphion +sh egs/svc/DiffComoSVC/run.sh --stage 3 --gpu "0" \ + --infer_expt_dir [Your path to save logs and checkpoints]/[YourExptName] \ + --infer_output_dir [Your path to save logs and checkpoints]/[YourExptName]/result \ + --infer_source_audio_dir [Your Audios Folder] \ + --infer_target_speaker "opencpop_female1" \ + --infer_key_shift "autoshift" +``` +Specially, you can configurate the inference steps for teacher model by setting `inference` at `exp_config`(student model is always one-step sampling): +```json + "inference": { + "comosvc": { + "inference_steps": 40 + } + } +``` + +# Reference +https://github.com/zhenye234/CoMoSpeech + +https://github.com/openai/consistency_models \ No newline at end of file diff --git a/egs/svc/DiffComoSVC/exp_config.json b/egs/svc/DiffComoSVC/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..37f977f21a09ea64eb218388b38fe1c1f7350a04 --- /dev/null +++ b/egs/svc/DiffComoSVC/exp_config.json @@ -0,0 +1,143 @@ +{ + "base_config": "config/comosvc.json", + "model_type": "DiffComoSVC", + "dataset": [ + "m4singer", + "opencpop", + "opensinger", + "svcc", + "vctk" + ], + "dataset_path": { + // TODO: Fill in your dataset path + "m4singer": "[M4Singer dataset path]", + "opencpop": "[Opencpop dataset path]", + "opensinger": "[OpenSinger dataset path]", + "svcc": "[SVCC dataset path]", + "vctk": "[VCTK dataset path]" + }, + // TODO: Fill in the output log path + "log_dir": "[Your path to save logs and checkpoints]", + "preprocess": { + // TODO: Fill in the output data path + "processed_dir": "[Your path to save processed data]", + // Config for features extraction + "extract_mel": true, + "extract_pitch": true, + "extract_energy": true, + "extract_whisper_feature": true, + "extract_contentvec_feature": true, + "extract_wenet_feature": false, + "whisper_batch_size": 30, // decrease it if your GPU is out of memory + "contentvec_batch_size": 1, + // Fill in the content-based pretrained model's path + "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt", + "wenet_model_path": 
"pretrained/wenet/20220506_u2pp_conformer_exp/final.pt", + "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml", + "whisper_model": "medium", + "whisper_model_path": "pretrained/whisper/medium.pt", + // Config for features usage + "use_mel": true, + "use_min_max_norm_mel": true, + "use_frame_pitch": true, + "use_frame_energy": true, + "use_spkid": true, + "use_whisper": true, + "use_contentvec": true, + "use_wenet": false, + "n_mel": 100, + "sample_rate": 24000 + }, + "model": { + "teacher_model_path":"[Your_teacher_model_checkpoint].bin", + "condition_encoder": { + // Config for features usage + "use_whisper": true, + "use_contentvec": true, + "use_wenet": false, + "whisper_dim": 1024, + "contentvec_dim": 256, + "wenet_dim": 512, + "use_singer_encoder": false, + "pitch_min": 50, + "pitch_max": 1100 + }, + "comosvc":{ + "distill": false, + // conformer encoder + "input_dim": 384, + "output_dim": 100, + "n_heads": 2, + "n_layers": 6, + "filter_channels":512, + "dropout":0.1, + // karras diffusion + "P_mean": -1.2, + "P_std": 1.2, + "sigma_data": 0.5, + "sigma_min": 0.002, + "sigma_max": 80, + "rho": 7, + "n_timesteps": 40, + }, + "diffusion": { + // Diffusion steps encoder + "step_encoder": { + "dim_raw_embedding": 128, + "dim_hidden_layer": 512, + "activation": "SiLU", + "num_layer": 2, + "max_period": 10000 + }, + // Diffusion decoder + "model_type": "bidilconv", + // bidilconv, unet2d, TODO: unet1d + "bidilconv": { + "base_channel": 384, + "n_res_block": 20, + "conv_kernel_size": 3, + "dilation_cycle_length": 4, + // specially, 1 means no dilation + "conditioner_size": 100 + } + } + }, + "train": { + "batch_size": 64, + "gradient_accumulation_step": 1, + "max_epoch": -1, // -1 means no limit + "save_checkpoint_stride": [ + 50, + 50 + ], + "keep_last": [ + 5, + -1 + ], + "run_eval": [ + false, + true + ], + "adamw": { + "lr": 4.0e-4 + }, + "reducelronplateau": { + "factor": 0.8, + "patience": 10, + "min_lr": 1.0e-4 + }, + "dataloader": { + "num_worker": 8, + "pin_memory": true + }, + "sampler": { + "holistic_shuffle": false, + "drop_last": true + } + }, + "inference": { + "comosvc": { + "inference_steps": 40 + } + } +} \ No newline at end of file diff --git a/egs/svc/MultipleContentsSVC/README.md b/egs/svc/MultipleContentsSVC/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6865a2158bf1057a76f07eaea1805bbf40944a63 --- /dev/null +++ b/egs/svc/MultipleContentsSVC/README.md @@ -0,0 +1,248 @@ +# Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion + +[![arXiv](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/abs/2310.11160) +[![demo](https://img.shields.io/badge/SVC-Demo-red)](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html) +[![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Models-pink)](https://huggingface.co/amphion/singing_voice_conversion) +[![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Spaces-yellow)](https://huggingface.co/spaces/amphion/singing_voice_conversion) +[![openxlab](https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg)](https://openxlab.org.cn/apps/detail/Amphion/singing_voice_conversion) + +
+
+This is the official implementation of the paper "[Leveraging Diverse Semantic-based Audio Pretrained Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (2024 IEEE Spoken Language Technology Workshop). Specifically,
+
+- The multiple content features are from [Whisper](https://github.com/openai/whisper) and [ContentVec](https://github.com/auspicious3000/contentvec).
+- The acoustic model is based on a Bidirectional Non-Causal Dilated CNN (called `DiffWaveNetSVC` in Amphion), which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219).
+- The vocoder uses the [BigVGAN](https://github.com/NVIDIA/BigVGAN) architecture, and we fine-tuned it on over 120 hours of singing voice data.
+
+## A Little Taste Before Getting Started
+
+Before you delve into the code, we suggest exploring the interactive DEMO we've provided for a comprehensive overview. There are several ways you can engage with it:
+
+1. **Online DEMO**
+
+   | HuggingFace | OpenXLab |
+   | :----------------------------------------------------------: | :----------------------------------------------------------: |
+   | [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Spaces-yellow)](https://huggingface.co/spaces/amphion/singing_voice_conversion)
(Worldwide) | [![openxlab](https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg)](https://openxlab.org.cn/apps/detail/Amphion/singing_voice_conversion)
(Suitable for Mainland China Users) | + +2. **Run Local Gradio DEMO** + + | Run with Docker | Duplicate Space with Private GPU | + | :----------------------------------------------------------: | :----------------------------------------------------------: | + | [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Spaces-yellow)](https://huggingface.co/spaces/amphion/singing_voice_conversion?docker=true) | [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Spaces-yellow)](https://huggingface.co/spaces/amphion/singing_voice_conversion?duplicate=true) | + +3. **Run with the Extended Colab** + + You can check out [this repo](https://github.com/camenduru/singing-voice-conversion-colab) to run it with Colab. Thanks to [@camenduru](https://x.com/camenduru?s=20) and the community for their support! + +## Usage Overview + +To train a `DiffWaveNetSVC` model, there are four stages in total: + +1. Data preparation +2. Features extraction +3. Training +4. Inference/conversion + +> **NOTE:** You need to run every command of this recipe in the `Amphion` root path: +> ```bash +> cd Amphion +> ``` + +## 1. Data Preparation + +### Dataset Download + +By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md). + +### Configuration + +Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets. + +```json + "dataset": [ + "m4singer", + "opencpop", + "opensinger", + "svcc", + "vctk" + ], + "dataset_path": { + // TODO: Fill in your dataset path + "m4singer": "[M4Singer dataset path]", + "opencpop": "[Opencpop dataset path]", + "opensinger": "[OpenSinger dataset path]", + "svcc": "[SVCC dataset path]", + "vctk": "[VCTK dataset path]" + }, +``` + +### Custom Dataset + +We support custom dataset, see [here](../../datasets/README.md#customsvcdataset) for the file structure to follow. + +After constructing proper file structure, specify your dataset name in `dataset` and its path in `dataset_path`, also add its name in `use_custom_dataset`: + +```json + "dataset": [ + "[Exisiting Dataset Name]", + //... + "[Your Custom Dataset Name]" + ], + "dataset_path": { + "[Exisiting Dataset Name]": "[Exisiting Dataset Path]", + //... + "[Your Custom Dataset Name]": "[Your Custom Dataset Path]" + }, + "use_custom_dataset": [ + "[Your Custom Dataset Name]" + ], +``` + +> **NOTE:** Custom dataset name does not have to be the same as the folder name. But it needs to satisfy these rules: +> 1. It can not be the same as the exisiting dataset name. +> 2. It can not contain any space or underline(`_`). +> 3. It must be a valid folder name for operating system. +> +> Some examples of valid custom dataset names are `mydataset`, `myDataset`, `my-dataset`, `mydataset1`, `my-dataset-1`, etc. + +## 2. Features Extraction + +### Content-based Pretrained Models Download + +By default, we utilize the Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md). + +### Configuration + +Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`: + +```json + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc" + "log_dir": "ckpts/svc", + "preprocess": { + // TODO: Fill in the output data path. The default value is "Amphion/data" + "processed_dir": "data", + ... 
+ }, +``` + +### Run + +Run the `run.sh` as the preproces stage (set `--stage 1`). + +```bash +sh egs/svc/MultipleContentsSVC/run.sh --stage 1 +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "1"`. + +## 3. Training + +### Configuration + +We provide the default hyparameters in the `exp_config.json`. They can work on single NVIDIA-24g GPU. You can adjust them based on you GPU machines. + +```json +"train": { + "batch_size": 32, + ... + "adamw": { + "lr": 2.0e-4 + }, + ... + } +``` + +### Train From Scratch + +Run the `run.sh` as the training stage (set `--stage 2`). Specify a experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`. + +```bash +sh egs/svc/MultipleContentsSVC/run.sh --stage 2 --name [YourExptName] +``` + +### Train From Existing Source + +We support training from existing source for various purposes. You can resume training the model from a checkpoint or fine-tune a model from another checkpoint. + +Setting `--resume true`, the training will resume from the **latest checkpoint** by default. For example, if you want to resume training from the latest checkpoint in `Amphion/ckpts/svc/[YourExptName]/checkpoint`, run: + +```bash +sh egs/svc/MultipleContentsSVC/run.sh --stage 2 --name [YourExptName] \ + --resume true +``` + +You can choose a **specific checkpoint** for retraining by `--resume_from_ckpt_path` argument. For example, if you want to fine-tune from the checkpoint `Amphion/ckpts/svc/[YourExptName]/checkpoint/[SpecificCheckpoint]`, run: + +```bash +sh egs/svc/MultipleContentsSVC/run.sh --stage 2 --name [YourExptName] \ + --resume true + --resume_from_ckpt_path "Amphion/ckpts/svc/[YourExptName]/checkpoint/[SpecificCheckpoint]" \ +``` + +If you want to **fine-tune from another checkpoint**, just use `--resume_type` and set it to `"finetune"`. For example, If you want to fine-tune from the checkpoint `Amphion/ckpts/svc/[AnotherExperiment]/checkpoint/[SpecificCheckpoint]`, run: + +```bash +sh egs/svc/MultipleContentsSVC/run.sh --stage 2 --name [YourExptName] \ + --resume true + --resume_from_ckpt_path "Amphion/ckpts/svc/[AnotherExperiment]/checkpoint/[SpecificCheckpoint]" \ + --resume_type "finetune" +``` + +> **NOTE:** The `--resume_type` is set as `"resume"` in default. It's not necessary to specify it when resuming training. +> +> The difference between `"resume"` and `"finetune"` is that the `"finetune"` will **only** load the pretrained model weights from the checkpoint, while the `"resume"` will load all the training states (including optimizer, scheduler, etc.) from the checkpoint. + +Here are some example scenarios to better understand how to use these arguments: +| Scenario | `--resume` | `--resume_from_ckpt_path` | `--resume_type` | +| ------ | -------- | ----------------------- | ------------- | +| You want to train from scratch | no | no | no | +| The machine breaks down during training and you want to resume training from the latest checkpoint | `true` | no | no | +| You find the latest model is overfitting and you want to re-train from the checkpoint before | `true` | `SpecificCheckpoint Path` | no | +| You want to fine-tune a model from another checkpoint | `true` | `SpecificCheckpoint Path` | `"finetune"` | + + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "0,1,2,3"`. + +## 4. 
Inference/Conversion + +### Pretrained Vocoder Download + +We fine-tune the official BigVGAN pretrained model with over 120 hours singing voice data. The benifits of fine-tuning has been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`). + +### Run + +For inference/conversion, you need to specify the following configurations when running `run.sh`: + +| Parameters | Description | Example | +| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/svc/[YourExptName]` | +| `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/svc/[YourExptName]/result` | +| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `Amphion/data/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). | +| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `Amphion/ckpts/svc/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. | +| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshfit"` (by default), `3`, `-3`, etc. | + +For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run: + +```bash +sh egs/svc/MultipleContentsSVC/run.sh --stage 3 --gpu "0" \ + --infer_expt_dir ckpts/svc/[YourExptName] \ + --infer_output_dir ckpts/svc/[YourExptName]/result \ + --infer_source_audio_dir [Your Audios Folder] \ + --infer_target_speaker "opencpop_female1" \ + --infer_key_shift "autoshift" +``` + +## Citations + +```bibtex +@inproceedings{zhang2024leveraging, + author={Zhang, Xueyao and Fang, Zihao and Gu, Yicheng and Chen, Haopeng and Zou, Lexiao and Zhang, Junan and Xue, Liumeng and Wu, Zhizheng}, + title={Leveraging Diverse Semantic-based Audio Pretrained Models for Singing Voice Conversion}, + booktitle={{IEEE} Spoken Language Technology Workshop, {SLT} 2024}, + year={2024} +} +``` diff --git a/egs/svc/MultipleContentsSVC/exp_config.json b/egs/svc/MultipleContentsSVC/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b607494cbb58f803475a61471ec02313ad03bae4 --- /dev/null +++ b/egs/svc/MultipleContentsSVC/exp_config.json @@ -0,0 +1,127 @@ +{ + "base_config": "config/svc/diffusion.json", + "model_type": "DiffWaveNetSVC", + "dataset": [ + "m4singer", + "opencpop", + "opensinger", + "svcc", + "vctk" + ], + "dataset_path": { + // TODO: Fill in your dataset path + "m4singer": "[M4Singer dataset path]", + "opencpop": "[Opencpop dataset path]", + "opensinger": "[OpenSinger dataset path]", + "svcc": "[SVCC dataset path]", + "vctk": "[VCTK dataset path]" + }, + // TODO: Fill in the output log path. 
The default value is "Amphion/ckpts/svc" + "log_dir": "ckpts/svc", + "preprocess": { + // TODO: Fill in the output data path. The default value is "Amphion/data" + "processed_dir": "data", + // Config for features extraction + "features_extraction_mode": "offline", // Online or offline features extraction ("offline" or "online") + "extract_mel": true, + "extract_pitch": true, + "extract_energy": true, + "extract_whisper_feature": true, + "extract_contentvec_feature": true, + "extract_wenet_feature": false, + "whisper_batch_size": 30, // decrease it if your GPU is out of memory + "contentvec_batch_size": 1, + // Fill in the content-based pretrained model's path + "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt", + "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt", + "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml", + "whisper_model": "medium", + "whisper_model_path": "pretrained/whisper/medium.pt", + // Config for features usage + "use_mel": true, + "use_min_max_norm_mel": true, + "use_frame_pitch": true, + "use_frame_energy": true, + "use_spkid": true, + "use_whisper": true, + "use_contentvec": true, + "use_wenet": false, + "n_mel": 100, + "sample_rate": 24000 + }, + "model": { + "condition_encoder": { + // Config for features usage + "use_whisper": true, + "use_contentvec": true, + "use_wenet": false, + "whisper_dim": 1024, + "contentvec_dim": 256, + "wenet_dim": 512, + "use_singer_encoder": false, + "pitch_min": 50, + "pitch_max": 1100 + }, + "diffusion": { + "scheduler": "ddpm", + "scheduler_settings": { + "num_train_timesteps": 1000, + "beta_start": 1.0e-4, + "beta_end": 0.02, + "beta_schedule": "linear" + }, + // Diffusion steps encoder + "step_encoder": { + "dim_raw_embedding": 128, + "dim_hidden_layer": 512, + "activation": "SiLU", + "num_layer": 2, + "max_period": 10000 + }, + // Diffusion decoder + "model_type": "bidilconv", + // bidilconv, unet2d, TODO: unet1d + "bidilconv": { + "base_channel": 512, + "n_res_block": 40, + "conv_kernel_size": 3, + "dilation_cycle_length": 4, + // specially, 1 means no dilation + "conditioner_size": 384 + } + } + }, + "train": { + "batch_size": 32, + "gradient_accumulation_step": 1, + "max_epoch": -1, // -1 means no limit + "save_checkpoint_stride": [ + 3, + 50 + ], + "keep_last": [ + 3, + 2 + ], + "run_eval": [ + true, + true + ], + "adamw": { + "lr": 2.0e-4 + }, + "reducelronplateau": { + "factor": 0.8, + "patience": 30, + "min_lr": 1.0e-4 + }, + "dataloader": { + "num_worker": 8, + "pin_memory": true + }, + "sampler": { + "holistic_shuffle": false, + "drop_last": true + } + } +} \ No newline at end of file diff --git a/egs/svc/README.md b/egs/svc/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e85fed0c5b66fa4e1b5fb11998b4aee812219c9e --- /dev/null +++ b/egs/svc/README.md @@ -0,0 +1,34 @@ +# Amphion Singing Voice Conversion (SVC) Recipe + +## Quick Start + +We provide a **[beginner recipe](MultipleContentsSVC)** to demonstrate how to train a cutting edge SVC model. Specifically, it is also an official implementation of the paper "[Leveraging Diverse Semantic-based Audio Pretrained Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (2024 IEEE Spoken Language Technology Workshop). Some demos can be seen [here](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html). 
+ +## Supported Model Architectures + +The main idea of SVC is to first disentangle the speaker-agnostic representations from the source audio, and then inject the desired speaker information to synthesize the target, which usually utilizes an acoustic decoder and a subsequent waveform synthesizer (vocoder): + +
+
+ +
+
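+
+As a rough sketch of this pipeline (purely illustrative pseudocode, not Amphion's actual API; every callable below is a placeholder supplied by the caller):
+
+```python
+def convert(source_wav, target_speaker_id, content_encoder, prosody_extractor,
+            speaker_table, acoustic_decoder, vocoder):
+    """Illustrative SVC pipeline sketch; all components are placeholders."""
+    content = content_encoder(source_wav)                 # speaker-agnostic content features
+    f0, energy = prosody_extractor(source_wav)            # prosody features (F0, energy)
+    spk_emb = speaker_table[target_speaker_id]            # desired speaker information
+    mel = acoustic_decoder(content, f0, energy, spk_emb)  # predict the target mel-spectrogram
+    return vocoder(mel)                                   # synthesize the waveform
+```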
+ +Until now, Amphion SVC has supported the following features and models: + +- **Speaker-agnostic Representations**: + - Content Features: Sourcing from [WeNet](https://github.com/wenet-e2e/wenet), [Whisper](https://github.com/openai/whisper), and [ContentVec](https://github.com/auspicious3000/contentvec). + - Prosody Features: F0 and energy. +- **Speaker Embeddings**: + - Speaker Look-Up Table. + - Reference Encoder (👨‍💻 developing): It can be used for zero-shot SVC. +- **Acoustic Decoders**: + - Diffusion-based models: + - **[DiffWaveNetSVC](MultipleContentsSVC)**: The encoder is based on Bidirectional Non-Causal Dilated CNN, which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219). + - **[DiffComoSVC](DiffComoSVC)** (👨‍💻 developing): The diffusion framework is based on [Consistency Model](https://proceedings.mlr.press/v202/song23a.html). It can significantly accelerate the inference process of the diffusion model. + - Transformer-based models: + - **[TransformerSVC](TransformerSVC)**: Encoder-only and Non-autoregressive Transformer Architecture. + - VAE- and Flow-based models: + - **[VitsSVC](VitsSVC)**: It is designed as a [VITS](https://arxiv.org/abs/2106.06103)-like model whose textual input is replaced by the content features, which is similar to [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc). +- **Waveform Synthesizers (Vocoders)**: + - The supported vocoders can be seen in [Amphion Vocoder Recipe](../vocoder/README.md). diff --git a/egs/svc/TransformerSVC/README.md b/egs/svc/TransformerSVC/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1797e32f310994ce4c0a1ff3e7789b397b358907 --- /dev/null +++ b/egs/svc/TransformerSVC/README.md @@ -0,0 +1,164 @@ +# Transformer for Singing Voice Conversion + +This is an implementation of **vanilla transformer encoder**/**conformer** as acoustic model for singing voice conversion. + +There are four stages in total: + +1. Data preparation +2. Features extraction +3. Training +4. Inference/conversion + +> **NOTE:** You need to run every command of this recipe in the `Amphion` root path: +> ```bash +> cd Amphion +> ``` + +## 1. Data Preparation + +### Dataset Download + +By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md). + +### Configuration + +Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets. + +```json + "dataset": [ + "m4singer", + "opencpop", + "opensinger", + "svcc", + "vctk" + ], + "dataset_path": { + // TODO: Fill in your dataset path + "m4singer": "[M4Singer dataset path]", + "opencpop": "[Opencpop dataset path]", + "opensinger": "[OpenSinger dataset path]", + "svcc": "[SVCC dataset path]", + "vctk": "[VCTK dataset path]" + }, +``` + +## 2. Features Extraction + +### Content-based Pretrained Models Download + +By default, we utilize the Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md). + +### Configuration + +Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`: + +```json + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc" + "log_dir": "ckpts/svc", + "preprocess": { + // TODO: Fill in the output data path. 
The default value is "Amphion/data"
+        "processed_dir": "data",
+        ...
+    },
+```
+
+### Run
+
+Run the `run.sh` as the preprocess stage (set `--stage 1`).
+
+```bash
+sh egs/svc/TransformerSVC/run.sh --stage 1
+```
+
+> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.
+
+## 3. Training
+
+### Configuration
+
+Specify the detailed configuration for the transformer block in `exp_config.json`. For the key `type`, both `conformer` and `transformer` are supported:
+
+```json
+"model": {
+    ...
+    "transformer": {
+        // 'conformer' or 'transformer'
+        "type": "conformer",
+        "input_dim": 384,
+        "output_dim": 100,
+        "n_heads": 2,
+        "n_layers": 6,
+        "filter_channels": 512,
+        "dropout": 0.1,
+    }
+}
+```
+
+We provide the default hyperparameters in `exp_config.json`. They work on a single NVIDIA 24GB GPU. You can adjust them based on your GPU machines.
+
+```json
+"train": {
+        "batch_size": 32,
+        ...
+        "adamw": {
+            "lr": 2.0e-4
+        },
+        ...
+    }
+```
+
+### Run
+
+Run the `run.sh` as the training stage (set `--stage 2`). Specify an experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.
+
+```bash
+sh egs/svc/TransformerSVC/run.sh --stage 2 --name [YourExptName]
+```
+
+> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.
+
+## 4. Inference/Conversion
+
+### Pretrained Vocoder Download
+
+We fine-tune the official BigVGAN pretrained model with over 120 hours of singing voice data. The benefits of fine-tuning have been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).
+
+### Run
+
+For inference/conversion, you need to specify the following configurations when running `run.sh`:
+
+| Parameters | Description | Example |
+| --- | --- | --- |
+| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/svc/[YourExptName]` |
+| `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/svc/[YourExptName]/result` |
+| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `Amphion/data/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
+| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `Amphion/ckpts/svc/[YourExptName]/singers.json` to choose a trained speaker. | For the Opencpop dataset, the speaker name would be `opencpop_female1`. |
+| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. 
| + +For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run: + +```bash +cd Amphion +sh egs/svc/TransformerSVC/run.sh --stage 3 --gpu "0" \ + --infer_expt_dir Amphion/ckpts/svc/[YourExptName] \ + --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \ + --infer_source_audio_dir [Your Audios Folder] \ + --infer_target_speaker "opencpop_female1" \ + --infer_key_shift "autoshift" +``` + +## Citations + +```bibtex +@inproceedings{transformer, + author = {Ashish Vaswani and + Noam Shazeer and + Niki Parmar and + Jakob Uszkoreit and + Llion Jones and + Aidan N. Gomez and + Lukasz Kaiser and + Illia Polosukhin}, + title = {Attention is All you Need}, + booktitle = {{NIPS}}, + pages = {5998--6008}, + year = {2017} +} +``` \ No newline at end of file diff --git a/egs/svc/TransformerSVC/exp_config.json b/egs/svc/TransformerSVC/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4ad85fbe47dd4945e579200a7e5d82a03a02457b --- /dev/null +++ b/egs/svc/TransformerSVC/exp_config.json @@ -0,0 +1,108 @@ +{ + "base_config": "config/transformer.json", + "model_type": "TransformerSVC", + "dataset": [ + "m4singer", + "opencpop", + "opensinger", + "svcc", + "vctk" + ], + "dataset_path": { + // TODO: Fill in your dataset path + "m4singer": "[M4Singer dataset path]", + "opencpop": "[Opencpop dataset path]", + "opensinger": "[OpenSinger dataset path]", + "svcc": "[SVCC dataset path]", + "vctk": "[VCTK dataset path]" + }, + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc" + "log_dir": "ckpts/svc", + "preprocess": { + // TODO: Fill in the output data path. The default value is "Amphion/data" + "processed_dir": "data", + // Config for features extraction + "extract_mel": true, + "extract_pitch": true, + "extract_energy": true, + "extract_whisper_feature": true, + "extract_contentvec_feature": true, + "extract_wenet_feature": false, + "whisper_batch_size": 30, // decrease it if your GPU is out of memory + "contentvec_batch_size": 1, + // Fill in the content-based pretrained model's path + "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt", + "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt", + "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml", + "whisper_model": "medium", + "whisper_model_path": "pretrained/whisper/medium.pt", + // Config for features usage + "use_mel": true, + "use_min_max_norm_mel": true, + "use_frame_pitch": true, + "use_frame_energy": true, + "use_spkid": true, + "use_whisper": true, + "use_contentvec": true, + "use_wenet": false, + "n_mel": 100, + "sample_rate": 24000 + }, + "model": { + "condition_encoder": { + // Config for features usage + "use_whisper": true, + "use_contentvec": true, + "use_wenet": false, + "whisper_dim": 1024, + "contentvec_dim": 256, + "wenet_dim": 512, + "use_singer_encoder": false, + "pitch_min": 50, + "pitch_max": 1100 + }, + "transformer": { + // 'conformer' or 'transformer' + "type": "conformer", + "input_dim": 384, + "output_dim": 100, + "n_heads": 2, + "n_layers": 6, + "filter_channels": 512, + "dropout": 0.1, + } + }, + "train": { + "batch_size": 64, + "gradient_accumulation_step": 1, + "max_epoch": -1, // -1 means no limit + "save_checkpoint_stride": [ + 50, + 50 + ], + "keep_last": [ + 5, + -1 + ], + "run_eval": [ + false, + true + ], + "adamw": { + "lr": 4.0e-4 + }, + "reducelronplateau": { + "factor": 0.8, + "patience": 10, + "min_lr": 1.0e-4 + }, + "dataloader": { + 
"num_worker": 8, + "pin_memory": true + }, + "sampler": { + "holistic_shuffle": false, + "drop_last": true + } + } +} \ No newline at end of file diff --git a/egs/svc/VitsSVC/README.md b/egs/svc/VitsSVC/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6dc81ee9486a9a926bd132ff4afc68881ba39895 --- /dev/null +++ b/egs/svc/VitsSVC/README.md @@ -0,0 +1,125 @@ +# VITS for Singing Voice Conversion + +This is an implementation of VITS as acoustic model for end-to-end singing voice conversion. Adapted from [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc), SoftVC content encoder is used to extract content features from the source audio. These feature vectors are directly fed into VITS without the need for conversion to a text-based intermediate representation. + +There are four stages in total: + +1. Data preparation +2. Features extraction +3. Training +4. Inference/conversion + +> **NOTE:** You need to run every command of this recipe in the `Amphion` root path: +> ```bash +> cd Amphion +> ``` + +## 1. Data Preparation + +### Dataset Download + +By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md). + +### Configuration + +Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets. + +```json + "dataset": [ + "m4singer", + "opencpop", + "opensinger", + "svcc", + "vctk" + ], + "dataset_path": { + // TODO: Fill in your dataset path + "m4singer": "[M4Singer dataset path]", + "opencpop": "[Opencpop dataset path]", + "opensinger": "[OpenSinger dataset path]", + "svcc": "[SVCC dataset path]", + "vctk": "[VCTK dataset path]" + }, +``` + +## 2. Features Extraction + +### Content-based Pretrained Models Download + +By default, we utilize ContentVec and Whisper to extract content features. How to download them is detailed [here](../../../pretrained/README.md). + +### Configuration + +Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`: + +```json + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc" + "log_dir": "ckpts/svc", + "preprocess": { + // TODO: Fill in the output data path. The default value is "Amphion/data" + "processed_dir": "data", + ... + }, +``` + +### Run + +Run the `run.sh` as the preproces stage (set `--stage 1`). + +```bash +sh egs/svc/VitsSVC/run.sh --stage 1 +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "1"`. + +## 3. Training + +### Configuration + +We provide the default hyparameters in the `exp_config.json`. They can work on single NVIDIA-24g GPU. You can adjust them based on you GPU machines. + +```json +"train": { + "batch_size": 32, + ... + "adamw": { + "lr": 2.0e-4 + }, + ... + } +``` + +### Run + +Run the `run.sh` as the training stage (set `--stage 2`). Specify a experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`. + +```bash +sh egs/svc/VitsSVC/run.sh --stage 2 --name [YourExptName] +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "0,1,2,3"`. + +## 4. 
Inference/Conversion + +### Run + +For inference/conversion, you need to specify the following configurations when running `run.sh`: + +| Parameters | Description | Example | +| --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `[Your path to save logs and checkpoints]/[YourExptName]` | +| `--infer_output_dir` | The output directory to save inferred audios. | `[Your path to save logs and checkpoints]/[YourExptName]/result` | +| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `[Your path to save processed data]/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). | +| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `[Your path to save logs and checkpoints]/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. | +| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshfit"` (by default), `3`, `-3`, etc. | + +For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run: + +```bash +sh egs/svc/VitsSVC/run.sh --stage 3 --gpu "0" \ + --infer_expt_dir Amphion/ckpts/svc/[YourExptName] \ + --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \ + --infer_source_audio_dir [Your Audios Folder] \ + --infer_target_speaker "opencpop_female1" \ + --infer_key_shift "autoshift" +``` \ No newline at end of file diff --git a/egs/svc/VitsSVC/exp_config.json b/egs/svc/VitsSVC/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3310f0b345b615e59ff91345cd69b90336b63a96 --- /dev/null +++ b/egs/svc/VitsSVC/exp_config.json @@ -0,0 +1,106 @@ +{ + "base_config": "config/vitssvc.json", + "model_type": "VitsSVC", + "dataset": [ + "m4singer", + "opencpop", + "opensinger", + "svcc", + "vctk" + ], + "dataset_path": { + // TODO: Fill in your dataset path + "m4singer": "[M4Singer dataset path]", + "opencpop": "[Opencpop dataset path]", + "opensinger": "[OpenSinger dataset path]", + "svcc": "[SVCC dataset path]", + "vctk": "[VCTK dataset path]" + }, + "use_custom_dataset": [], + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc" + "log_dir": "ckpts/svc", + "preprocess": { + // TODO: Fill in the output data path. 
The default value is "Amphion/data" + "processed_dir": "data", + + "n_mel": 100, + "sample_rate": 24000, + + // contentvec + "extract_contentvec_feature": true, + "contentvec_sample_rate": 16000, + "contentvec_batch_size": 1, + "contentvec_frameshift": 0.02, + // whisper + "extract_whisper_feature": true, + "whisper_sample_rate": 16000, + "whisper_frameshift": 0.01, + "whisper_downsample_rate": 2, + // wenet + "extract_wenet_feature": true, + "wenet_downsample_rate": 4, + "wenet_frameshift": 0.01, + "wenet_sample_rate": 16000, + // Fill in the content-based pretrained model's path + "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt", + "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt", + "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml", + "whisper_model": "medium", + "whisper_model_path": "pretrained/whisper/medium.pt", + + "use_contentvec": true, + "use_whisper": true, + "use_wenet": false, + + // Extract content features using dataloader + "pin_memory": true, + "num_workers": 8, + "content_feature_batch_size": 16, + + }, + "model": { + "condition_encoder": { + // Config for features usage + "merge_mode": "add", + "use_log_loudness": true, + "use_contentvec": true, + "use_whisper": true, + "use_wenet": false, + "whisper_dim": 1024, + "contentvec_dim": 256, + "wenet_dim": 512, + }, + "vits": { + "inter_channels": 384, + "hidden_channels": 384, + "filter_channels": 256, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0.1, + "n_flow_layer": 4, + "n_layers_q": 3, + "gin_channels": 256, + "n_speakers": 512, + "use_spectral_norm": false, + }, + "generator": "nsfhifigan", + }, + "train": { + "batch_size": 32, + "learning_rate": 2e-4, + "gradient_accumulation_step": 1, + "max_epoch": -1, // -1 means no limit + "save_checkpoint_stride": [ + 3, + 50 + ], + "keep_last": [ + 3, + 2 + ], + }, + "inference": { + "batch_size": 1, + } +} \ No newline at end of file diff --git a/egs/svc/_template/run.sh b/egs/svc/_template/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..cec22f48b22aa3b7d3ffef738bb24be80ea3f7fc --- /dev/null +++ b/egs/svc/_template/run.sh @@ -0,0 +1,160 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $exp_dir))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Parse the Given Parameters from the Commond ########### +options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,resume_from_ckpt_path:,resume_type:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir: -- "$@") +eval set -- "$options" + +while true; do + case $1 in + # Experimental Configuration File + -c | --config) shift; exp_config=$1 ; shift ;; + # Experimental Name + -n | --name) shift; exp_name=$1 ; shift ;; + # Running Stage + -s | --stage) shift; running_stage=$1 ; shift ;; + # Visible GPU machines. The default value is "0". + --gpu) shift; gpu=$1 ; shift ;; + + # [Only for Training] Resume configuration + --resume) shift; resume=$1 ; shift ;; + # [Only for Training] The specific checkpoint path that you want to resume from. 
+ --resume_from_ckpt_path) shift; resume_from_ckpt_path=$1 ; shift ;; + # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights. + --resume_type) shift; resume_type=$1 ; shift ;; + + # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]" + --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;; + # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result" + --infer_output_dir) shift; infer_output_dir=$1 ; shift ;; + # [Only for Inference] The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir can be "$work_dir/source_audio" which includes several audio files (*.wav, *.mp3 or *.flac). + --infer_source_file) shift; infer_source_file=$1 ; shift ;; + --infer_source_audio_dir) shift; infer_source_audio_dir=$1 ; shift ;; + # [Only for Inference] Specify the target speaker you want to convert into. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1". + --infer_target_speaker) shift; infer_target_speaker=$1 ; shift ;; + # [Only for Inference] For advanced users, you can modify the trans_key parameters into an integer (which means the semitones you want to transpose). Its default value is "autoshift". + --infer_key_shift) shift; infer_key_shift=$1 ; shift ;; + # [Only for Inference] The vocoder dir. Its default value is Amphion/pretrained/bigvgan. See Amphion/pretrained/README.md to download the pretrained BigVGAN vocoders. + --infer_vocoder_dir) shift; infer_vocoder_dir=$1 ; shift ;; + + --) shift ; break ;; + *) echo "Invalid option: $1" exit 1 ;; + esac +done + + +### Value check ### +if [ -z "$running_stage" ]; then + echo "[Error] Please specify the running stage" + exit 1 +fi + +if [ -z "$exp_config" ]; then + exp_config="${exp_dir}"/exp_config.json +fi +echo "Exprimental Configuration File: $exp_config" + +if [ -z "$gpu" ]; then + gpu="0" +fi + +######## Features Extraction ########### +if [ $running_stage -eq 1 ]; then + CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/svc/preprocess.py \ + --config $exp_config \ + --num_workers 4 +fi + +######## Training ########### +if [ $running_stage -eq 2 ]; then + if [ -z "$exp_name" ]; then + echo "[Error] Please specify the experiments name" + exit 1 + fi + echo "Exprimental Name: $exp_name" + + # add default value + if [ -z "$resume_from_ckpt_path" ]; then + resume_from_ckpt_path="" + fi + + if [ -z "$resume_type" ]; then + resume_type="resume" + fi + + if [ "$resume" = true ]; then + echo "Resume from the existing experiment..." + CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/svc/train.py \ + --config "$exp_config" \ + --exp_name "$exp_name" \ + --log_level info \ + --resume \ + --resume_from_ckpt_path "$resume_from_ckpt_path" \ + --resume_type "$resume_type" + else + echo "Start a new experiment..." 
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/svc/train.py \ + --config "$exp_config" \ + --exp_name "$exp_name" \ + --log_level info + fi +fi + +######## Inference/Conversion ########### +if [ $running_stage -eq 3 ]; then + if [ -z "$infer_expt_dir" ]; then + echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]" + exit 1 + fi + + if [ -z "$infer_output_dir" ]; then + infer_output_dir="$expt_dir/result" + fi + + if [ -z "$infer_source_file" ] && [ -z "$infer_source_audio_dir" ]; then + echo "[Error] Please specify the source file/dir. The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir should include several audio files (*.wav, *.mp3 or *.flac)." + exit 1 + fi + + if [ -z "$infer_source_file" ]; then + infer_source=$infer_source_audio_dir + fi + + if [ -z "$infer_source_audio_dir" ]; then + infer_source=$infer_source_file + fi + + if [ -z "$infer_target_speaker" ]; then + echo "[Error] Please specify the target speaker. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1"" + exit 1 + fi + + if [ -z "$infer_key_shift" ]; then + infer_key_shift="autoshift" + fi + + if [ -z "$infer_vocoder_dir" ]; then + infer_vocoder_dir="$work_dir"/pretrained/bigvgan + echo "[Warning] You don't specify the infer_vocoder_dir. It is set $infer_vocoder_dir by default. Make sure that you have followed Amphoion/pretrained/README.md to download the pretrained BigVGAN vocoder checkpoint." + fi + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/svc/inference.py \ + --config $exp_config \ + --acoustics_dir $infer_expt_dir \ + --vocoder_dir $infer_vocoder_dir \ + --target_singer $infer_target_speaker \ + --trans_key $infer_key_shift \ + --source $infer_source \ + --output_dir $infer_output_dir \ + --log_level debug +fi \ No newline at end of file diff --git a/egs/tta/README.md b/egs/tta/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1d944688768697da9e0cee60445410d6ce115694 --- /dev/null +++ b/egs/tta/README.md @@ -0,0 +1,19 @@ +# Amphion Text-to-Audio (TTA) Recipe + +## Quick Start + +We provide a **[beginner recipe](RECIPE.md)** to demonstrate how to train a cutting edge TTA model. Specifically, it is designed as a latent diffusion model like [AudioLDM](https://arxiv.org/abs/2301.12503), [Make-an-Audio](https://arxiv.org/abs/2301.12661), and [AUDIT](https://arxiv.org/abs/2304.00830). + +## Supported Model Architectures + +Until now, Amphion has supported a latent diffusion based text-to-audio model: + +
+
+ +
+
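+
+At inference time, such a model works roughly as sketched below (illustrative pseudocode only; every component is a placeholder passed in by the caller, not Amphion's actual class or API, and the latent shape is an assumption for the example):
+
+```python
+import torch
+
+
+def generate(caption, text_encoder, unet, scheduler, vae_decoder, vocoder,
+             latent_shape=(1, 4, 32, 32)):
+    """Illustrative latent-diffusion TTA sampling loop; all components are placeholders."""
+    cond = text_encoder(caption)               # e.g. T5 embeddings of the text prompt
+    latents = torch.randn(latent_shape)        # start from Gaussian noise in the latent space
+    for t in scheduler.timesteps:              # iterative denoising
+        noise_pred = unet(latents, t, cond)    # U-Net predicts the noise, conditioned on the text
+        latents = scheduler.step(noise_pred, t, latents).prev_sample
+    mel = vae_decoder(latents)                 # the VAE decoder maps latents back to a mel-spectrogram
+    return vocoder(mel)                        # a vocoder renders the waveform
+```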
+ +Similar to [AUDIT](https://arxiv.org/abs/2304.00830), we implement it in two-stage training: +1. Training the VAE which is called `AutoencoderKL` in Amphion. +2. Training the conditional latent diffusion model which is called `AudioLDM` in Amphion. \ No newline at end of file diff --git a/egs/tta/RECIPE.md b/egs/tta/RECIPE.md new file mode 100644 index 0000000000000000000000000000000000000000..5a6c0ce4451f3fed9387cf20039f2131293ca8d8 --- /dev/null +++ b/egs/tta/RECIPE.md @@ -0,0 +1,160 @@ +# Text-to-Audio with Latent Diffusion Model + +[![arXiv](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/abs/2304.00830) +[![demo](https://img.shields.io/badge/SVC-Demo-red)](https://audit-demo.github.io/) +[![model](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Models-pink)](https://huggingface.co/amphion/text_to_audio) +[![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Spaces-yellow)](https://huggingface.co/spaces/amphion/Text-to-Audio) +[![openxlab](https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg)](https://openxlab.org.cn/apps/detail/Amphion/Text-to-Audio) + +This is the quicktour for training a text-to-audio model with the popular and powerful generative model: [Latent Diffusion Model](https://arxiv.org/abs/2112.10752). Specially, this recipe is also the official implementation of the text-to-audio generation part of our NeurIPS 2023 paper "[AUDIT: Audio Editing by Following Instructions with Latent Diffusion Models](https://arxiv.org/abs/2304.00830)". You can check the last part of [AUDIT demos](https://audit-demo.github.io/) to see same text-to-audio examples. + +
+
+ +
+
+
+We train this latent diffusion model in two stages:
+1. In the first stage, we aim to obtain a high-quality VAE (called `AutoencoderKL` in Amphion), so that we can project the input mel-spectrograms into an efficient, low-dimensional latent space. Specifically, we train the VAE with a GAN loss to improve the reconstruction quality.
+2. In the second stage, we aim to obtain a text-controllable diffusion model (called `AudioLDM` in Amphion). We use a U-Net-based diffusion model and a T5 encoder as the text encoder.
+
+There are four stages in total for training the text-to-audio model:
+
+1. Data preparation and processing
+2. Train the VAE model
+3. Train the latent diffusion model
+4. Inference
+
+> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
+> ```bash
+> cd Amphion
+> ```
+
+## Overview
+
+```sh
+# Train the VAE model
+sh egs/tta/autoencoderkl/run_train.sh
+
+# Train the latent diffusion model
+sh egs/tta/audioldm/run_train.sh
+
+# Inference
+sh egs/tta/audioldm/run_inference.sh
+```
+
+## 1. Data preparation and processing
+
+### Dataset Download
+
+We take [AudioCaps](https://audiocaps.github.io/) as an example. AudioCaps is a dataset of around 44K audio-caption pairs, where each audio clip corresponds to a caption with rich semantic information. We have already processed the dataset. You can download the dataset [here](https://openxlab.org.cn/datasets/Amphion/AudioCaps).
+
+
+### Data Processing
+
+- Download the AudioCaps dataset to `[Your path to save tta dataset]` and modify `preprocess.processed_dir` in `egs/tta/.../exp_config.json`.
+
+```json
+{
+    "dataset": [
+        "AudioCaps"
+    ],
+    "preprocess": {
+        // Specify the output root path to save the processed data
+        "processed_dir": "[Your path to save tta dataset]",
+        ...
+    }
+}
+```
+
+The folder structure of your downloaded data should be similar to:
+
+```plaintext
+.../[Your path to save tta dataset]
+┣ AudioCaps
+┃   ┣ wav
+┃   ┃   ┣ ---1_cCGK4M_0_10000.wav
+┃   ┃   ┣ ---lTs1dxhU_30000_40000.wav
+┃   ┃   ┣ ...
+```
+
+- Then you may process the data into mel-spectrograms and save them in `.npy` format. If you use the data we provide, all the wav data has already been processed.
+
+- Generate a json file to save the metadata; the json file looks like:
+
+```json
+[
+    {
+        "Dataset": "AudioCaps",
+        "Uid": "---1_cCGK4M_0_10000",
+        "Caption": "Idling car, train blows horn and passes"
+    },
+    {
+        "Dataset": "AudioCaps",
+        "Uid": "---lTs1dxhU_30000_40000",
+        "Caption": "A racing vehicle engine is heard passing by"
+    },
+    ...
+]
+```
+- Finally, the folder structure looks like:
+
+```plaintext
+.../[Your path to save tta dataset]
+┣ AudioCaps
+┃   ┣ wav
+┃   ┃   ┣ ---1_cCGK4M_0_10000.wav
+┃   ┃   ┣ ---lTs1dxhU_30000_40000.wav
+┃   ┃   ┣ ...
+┃   ┣ mel
+┃   ┃   ┣ ---1_cCGK4M_0_10000.npy
+┃   ┃   ┣ ---lTs1dxhU_30000_40000.npy
+┃   ┃   ┣ ...
+┃   ┣ train.json
+┃   ┣ valid.json
+┃   ┣ ...
+```
+
+## 2. Training the VAE Model
+
+The first-stage model is a VAE trained with a GAN loss (called `AutoencoderKL` in Amphion). Run the following command:
+
+```sh
+sh egs/tta/autoencoderkl/run_train.sh
+```
+
+## 3. Training the Latent Diffusion Model
+
+The second-stage model is a conditional diffusion model with a T5 text encoder (called `AudioLDM` in Amphion). Run the following command:
+
+```sh
+sh egs/tta/audioldm/run_train.sh
+```
+
+## 4. Inference
+
+Now you can generate audio with your pre-trained latent diffusion model. Run the following command and modify the `text` argument as needed. 
+ +```sh +sh egs/tta/audioldm/run_inference.sh \ +--text "A man is whistling" +``` + +## Citations + +```bibtex +@article{wang2023audit, + title={AUDIT: Audio Editing by Following Instructions with Latent Diffusion Models}, + author={Wang, Yuancheng and Ju, Zeqian and Tan, Xu and He, Lei and Wu, Zhizheng and Bian, Jiang and Zhao, Sheng}, + journal={NeurIPS 2023}, + year={2023} +} + +@article{liu2023audioldm, + title={{AudioLDM}: Text-to-Audio Generation with Latent Diffusion Models}, + author={Liu, Haohe and Chen, Zehua and Yuan, Yi and Mei, Xinhao and Liu, Xubo and Mandic, Danilo and Wang, Wenwu and Plumbley, Mark D}, + journal={Proceedings of the International Conference on Machine Learning}, + year={2023} +} +``` \ No newline at end of file diff --git a/egs/tta/audioldm/exp_config.json b/egs/tta/audioldm/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5b57a771593094401fd38747c66951afe6d6d1f3 --- /dev/null +++ b/egs/tta/audioldm/exp_config.json @@ -0,0 +1,90 @@ +{ + "base_config": "egs/tta/audioldm/exp_config_base.json", + "dataset": [ + "AudioCaps" + ], + "preprocess": { + // Specify the output root path to save the processed data + "processed_dir": "data", + // For example: "/home/TTADataset/processed_data" + + // feature + "use_spkid": false, + "use_uv": false, + "use_frame_pitch": false, + "use_phone_pitch": false, + "use_frame_energy": false, + "use_phone_energy": false, + "use_mel": false, + "use_audio": false, + "use_label": false, + "use_one_hot": false, + // feature for text to audio + "use_caption": true, + "use_melspec": true, + "use_wav": false, + // feature dir + "melspec_dir": "mel", + "wav_dir": "wav" + }, + // Specify the output root path to save model ckpts and logs + "log_dir": "ckpts/tta", + // For example: "/home/TTADataset/processed_data/logs" + + // model + "model": { + "audioldm": { + "image_size": 32, + "in_channels": 4, + "out_channels": 4, + "model_channels": 256, + "attention_resolutions": [4, 2, 1], + "num_res_blocks": 2, + "channel_mult": [1, 2, 4], + "num_heads": 8, + "use_spatial_transformer": true, + "transformer_depth": 1, + "context_dim": 768, + "use_checkpoint": true, + "legacy": false + }, + "autoencoderkl": { + "ch": 128, + "ch_mult": [1,1,2,2,4], + "num_res_blocks": 2, + "in_channels": 1, + "z_channels": 4, + "out_ch": 1, + "double_z": true + }, + "noise_scheduler": { + "num_train_timesteps": 1000, + "beta_start": 0.00085, + "beta_end": 0.012, + "beta_schedule": "scaled_linear", + "clip_sample": false, + "steps_offset": 1, + "set_alpha_to_one": false, + "skip_prk_steps": true, + "prediction_type": "epsilon" + }, + "autoencoder_path": "ckpts/tta/autoencoder_kl_debug/checkpoints/step-0445000_loss-0.3306.pt" + }, + + // train + "train": { + "adam": { + "lr": 5.0e-5 + }, + "ddp": false, + "random_seed": 12345, + "batch_size": 12, + "epochs": 50000, + "max_steps": 1000000, + "total_training_steps": 800000, + "save_summary_steps": 1000, + "save_checkpoints_steps": 5000, + "valid_interval": 5000, + "keep_checkpoint_max": 100 + } + } \ No newline at end of file diff --git a/egs/tta/audioldm/exp_config_base.json b/egs/tta/audioldm/exp_config_base.json new file mode 100644 index 0000000000000000000000000000000000000000..e3f30bacbe57655f8fed3452077e52bd93d46c9f --- /dev/null +++ b/egs/tta/audioldm/exp_config_base.json @@ -0,0 +1,11 @@ +{ + "base_config": "config/audioldm.json", + "model_type": "AudioLDM", + "dataset": [ + "AudioCaps" + ], + "preprocess": { + "train_file": "train.json", + "valid_file": "valid.json" + } +} \ 
No newline at end of file diff --git a/egs/tta/audioldm/exp_config_latent_4_10_78.json b/egs/tta/audioldm/exp_config_latent_4_10_78.json new file mode 100644 index 0000000000000000000000000000000000000000..09ae7c38cf332e74ecb4bc485d44a5f8985d43c5 --- /dev/null +++ b/egs/tta/audioldm/exp_config_latent_4_10_78.json @@ -0,0 +1,88 @@ +{ + "base_config": "egs/tta/audioldm/exp_config_base.json", + "dataset": [ + "AudioCaps" + ], + "preprocess": { + // Specify the output root path to save the processed data + "processed_dir": "data", + + // feature + "use_spkid": false, + "use_uv": false, + "use_frame_pitch": false, + "use_phone_pitch": false, + "use_frame_energy": false, + "use_phone_energy": false, + "use_mel": false, + "use_audio": false, + "use_label": false, + "use_one_hot": false, + // feature for text to audio + "use_caption": true, + "use_melspec": true, + "use_wav": false, + // feature dir + "melspec_dir": "mel", + "wav_dir": "wav" + }, + // Specify the output root path to save model ckpts and logs + "log_dir": "ckpts/tta", + + // model + "model": { + "audioldm": { + "image_size": 32, + "in_channels": 4, + "out_channels": 4, + "model_channels": 256, + "attention_resolutions": [4, 2, 1], + "num_res_blocks": 2, + "channel_mult": [1, 2, 4], + "num_heads": 8, + "use_spatial_transformer": true, + "transformer_depth": 1, + "context_dim": 768, + "use_checkpoint": true, + "legacy": false + }, + "autoencoderkl": { + "ch": 128, + "ch_mult": [1,2,2,4], + "num_res_blocks": 2, + "in_channels": 1, + "z_channels": 4, + "out_ch": 1, + "double_z": true + }, + "noise_scheduler": { + "num_train_timesteps": 1000, + "beta_start": 0.00085, + "beta_end": 0.012, + "beta_schedule": "scaled_linear", + "clip_sample": false, + "steps_offset": 1, + "set_alpha_to_one": false, + "skip_prk_steps": true, + "prediction_type": "epsilon" + }, + "autoencoder_path": "ckpts/tta/autoencoder_kl_debug_latent_size_4_10_78/checkpoints/step-0390000_loss-0.2876.pt" + }, + + // train + "train": { + "adam": { + "lr": 2.0e-5 + }, + "ddp": false, + "random_seed": 12345, + "batch_size": 12, + "epochs": 50000, + "max_steps": 1000000, + "total_training_steps": 800000, + "save_summary_steps": 1000, + "save_checkpoints_steps": 5000, + "valid_interval": 5000, + "keep_checkpoint_max": 100 + } + } \ No newline at end of file diff --git a/egs/tta/audioldm/run_inference.sh b/egs/tta/audioldm/run_inference.sh new file mode 100644 index 0000000000000000000000000000000000000000..181244efece8e164eb44fb04f051dbb80721c248 --- /dev/null +++ b/egs/tta/audioldm/run_inference.sh @@ -0,0 +1,52 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $exp_dir))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Set Experiment Configuration ########### +exp_config="$exp_dir/exp_config.json" +exp_name="audioldm_debug_latent_size_4_5_39" +checkpoint_path="$work_dir/ckpts/tta/audioldm_debug_latent_size_4_5_39/checkpoints/step-0570000_loss-0.2521.pt" +output_dir="$work_dir/temp" +vocoder_config_path="$work_dir/ckpts/tta/hifigan_checkpoints/config.json" +vocoder_path="$work_dir/ckpts/tta/hifigan_checkpoints/g_01250000" +num_steps=200 +guidance_scale=4.0 + +export CUDA_VISIBLE_DEVICES="0" + +######## Parse Command Line Arguments ########### +while [[ $# -gt 0 ]] +do +key="$1" + +case $key in + --text) + text="$2" + shift # past argument + shift # past value + ;; + *) # unknown option + shift # past argument + ;; +esac +done + +######## Run inference ########### +python "${work_dir}"/bins/tta/inference.py \ + --config=$exp_config \ + --checkpoint_path=$checkpoint_path \ + --text="$text" \ + --vocoder_path=$vocoder_path \ + --vocoder_config_path=$vocoder_config_path \ + --num_steps=$num_steps \ + --guidance_scale=$guidance_scale \ + --output_dir=$output_dir diff --git a/egs/tta/audioldm/run_inference_latent_4_10_78.sh b/egs/tta/audioldm/run_inference_latent_4_10_78.sh new file mode 100644 index 0000000000000000000000000000000000000000..3c247e9e6d2919e14e395deb83397439bb22efc5 --- /dev/null +++ b/egs/tta/audioldm/run_inference_latent_4_10_78.sh @@ -0,0 +1,52 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $exp_dir))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Set Experiment Configuration ########### +exp_config="$exp_dir/exp_config_v2.json" +exp_name="audioldm_debug_latent_size_4_10_78" +checkpoint_path="$work_dir/ckpts/tta/audioldm_debug_latent_size_4_10_78/checkpoints/step-0325000_loss-0.1936.pt" +output_dir="$work_dir/temp" +vocoder_config_path="$work_dir/ckpts/tta/hifigan_checkpoints/config.json" +vocoder_path="$work_dir/ckpts/tta/hifigan_checkpoints/g_01250000" +num_steps=200 +guidance_scale=4.0 + +export CUDA_VISIBLE_DEVICES="0" + +######## Parse Command Line Arguments ########### +while [[ $# -gt 0 ]] +do +key="$1" + +case $key in + --text) + text="$2" + shift # past argument + shift # past value + ;; + *) # unknown option + shift # past argument + ;; +esac +done + +######## Run inference ########### +python "${work_dir}"/bins/tta/inference.py \ + --config=$exp_config \ + --checkpoint_path=$checkpoint_path \ + --text="A man is whistling" \ + --vocoder_path=$vocoder_path \ + --vocoder_config_path=$vocoder_config_path \ + --num_steps=$num_steps \ + --guidance_scale=$guidance_scale \ + --output_dir=$output_dir \ \ No newline at end of file diff --git a/egs/tta/audioldm/run_train.sh b/egs/tta/audioldm/run_train.sh new file mode 100644 index 0000000000000000000000000000000000000000..b1060db6cf02ccd46bdc8ecac304e26233cea354 --- /dev/null +++ b/egs/tta/audioldm/run_train.sh @@ -0,0 +1,26 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $exp_dir))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Set Experiment Configuration ########### +exp_config="$exp_dir/exp_config.json" +exp_name="audioldm_debug_latent_size_4_5_39" + +num_workers=8 +export CUDA_VISIBLE_DEVICES="0" + +######## Train Model ########### +python "${work_dir}"/bins/tta/train_tta.py \ + --config=$exp_config \ + --num_workers=$num_workers \ + --exp_name=$exp_name \ + --stdout_interval=25 \ \ No newline at end of file diff --git a/egs/tta/audioldm/run_train_latent_4_10_78.sh b/egs/tta/audioldm/run_train_latent_4_10_78.sh new file mode 100644 index 0000000000000000000000000000000000000000..f61c0de52983ba6d4976b115fdc22c65696cbc8d --- /dev/null +++ b/egs/tta/audioldm/run_train_latent_4_10_78.sh @@ -0,0 +1,26 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $exp_dir))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Set Experiment Configuration ########### +exp_config="$exp_dir/exp_config_latent_4_10_78.json" +exp_name="audioldm_debug_latent_size_4_10_78" + +num_workers=8 +export CUDA_VISIBLE_DEVICES="0" + +######## Train Model ########### +python "${work_dir}"/bins/tta/train_tta.py \ + --config=$exp_config \ + --num_workers=$num_workers \ + --exp_name=$exp_name \ + --stdout_interval=25 \ \ No newline at end of file diff --git a/egs/tta/autoencoderkl/exp_config.json b/egs/tta/autoencoderkl/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e0d401051fea5920ade4ad7d765e816fdbec5e9d --- /dev/null +++ b/egs/tta/autoencoderkl/exp_config.json @@ -0,0 +1,49 @@ +{ + "base_config": "egs/tta/autoencoderkl/exp_config_base.json", + "dataset": [ + "AudioCaps" + ], + "preprocess": { + // Specify the output root path to save the processed data + "processed_dir": "data", + + // feature + "use_spk": false, + "use_spkid": false, + "use_uv": false, + "use_frame_pitch": false, + "use_phone_pitch": false, + "use_frame_energy": false, + "use_phone_energy": false, + "use_mel": false, + "use_audio": false, + "use_label": false, + "use_one_hot": false, + // feature for text to audio + "use_caption": true, + "use_melspec": true, + "use_wav": false, + // feature dir + "melspec_dir": "mel", + "wav_dir": "wav" + }, + // Specify the output root path to save model ckpts and logs + "log_dir": "ckpts/tta", + + // train + "train": { + "adam": { + "lr": 4.0e-5 + }, + "ddp": false, + "random_seed": 12345, + "batch_size": 12, + "epochs": 50000, + "max_steps": 1000000, + "total_training_steps": 800000, + "save_summary_steps": 1000, + "save_checkpoints_steps": 5000, + "valid_interval": 5000, + "keep_checkpoint_max": 100 + } + } \ No newline at end of file diff --git a/egs/tta/autoencoderkl/exp_config_base.json b/egs/tta/autoencoderkl/exp_config_base.json new file mode 100644 index 0000000000000000000000000000000000000000..87c0b263fd0e446c2b5f8295fad19c5a11d9b0a2 --- /dev/null +++ b/egs/tta/autoencoderkl/exp_config_base.json @@ -0,0 +1,11 @@ +{ + "base_config": "config/autoencoderkl.json", + "model_type": "AutoencoderKL", + "dataset": [ + "AudioCaps" + ], + "preprocess": { + "train_file": "train.json", + 
"valid_file": "valid.json" + } +} \ No newline at end of file diff --git a/egs/tta/autoencoderkl/exp_config_latent_4_10_78.json b/egs/tta/autoencoderkl/exp_config_latent_4_10_78.json new file mode 100644 index 0000000000000000000000000000000000000000..911018a28c8e2d6a471a6505261e8050151e4d60 --- /dev/null +++ b/egs/tta/autoencoderkl/exp_config_latent_4_10_78.json @@ -0,0 +1,59 @@ +{ + "base_config": "egs/tta/autoencoderkl/exp_config_base.json", + "dataset": [ + "AudioCaps" + ], + "preprocess": { + // Specify the output root path to save the processed data + "processed_dir": "data", + + // feature + "use_spkid": false, + "use_uv": false, + "use_frame_pitch": false, + "use_phone_pitch": false, + "use_frame_energy": false, + "use_phone_energy": false, + "use_mel": false, + "use_audio": false, + "use_label": false, + "use_one_hot": false, + // feature for text to audio + "use_caption": true, + "use_melspec": true, + "use_wav": false, + // feature dir + "melspec_dir": "mel", + "wav_dir": "wav" + }, + // Specify the output root path to save model ckpts and logs + "log_dir": "ckpts/tta", + + "model": { + "autoencoderkl": { + "ch": 128, + "ch_mult": [1,2,2,4], + "num_res_blocks": 2, + "in_channels": 1, + "z_channels": 4, + "out_ch": 1, + "double_z": true + } + }, + // train + "train": { + "adam": { + "lr": 4.0e-5 + }, + "ddp": false, + "random_seed": 12345, + "batch_size": 12, + "epochs": 50000, + "max_steps": 1000000, + "total_training_steps": 800000, + "save_summary_steps": 1000, + "save_checkpoints_steps": 5000, + "valid_interval": 5000, + "keep_checkpoint_max": 100 + } + } \ No newline at end of file diff --git a/egs/tta/autoencoderkl/run_train.sh b/egs/tta/autoencoderkl/run_train.sh new file mode 100644 index 0000000000000000000000000000000000000000..00a25693166dcee2d7b96dc6aa957ce96f8ef872 --- /dev/null +++ b/egs/tta/autoencoderkl/run_train.sh @@ -0,0 +1,26 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $exp_dir))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Set Experiment Configuration ########### +exp_config="$exp_dir/exp_config.json" +exp_name="autoencoder_kl_debug" + +num_workers=8 +export CUDA_VISIBLE_DEVICES="0" + +######## Train Model ########### +python "${work_dir}"/bins/tta/train_tta.py \ + --config=$exp_config \ + --num_workers=$num_workers \ + --exp_name=$exp_name \ + --stdout_interval=25 \ \ No newline at end of file diff --git a/egs/tta/autoencoderkl/run_train_latent_4_10_78.sh b/egs/tta/autoencoderkl/run_train_latent_4_10_78.sh new file mode 100644 index 0000000000000000000000000000000000000000..041627d9c43b56ee4f1657733062f18eb313e8b0 --- /dev/null +++ b/egs/tta/autoencoderkl/run_train_latent_4_10_78.sh @@ -0,0 +1,26 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $exp_dir))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Set Experiment Configuration ########### +exp_config="$exp_dir/exp_config_latent_4_10_78.json" +exp_name="autoencoder_kl_debug_latent_size_4_10_78" + +num_workers=8 +export CUDA_VISIBLE_DEVICES="0" + +######## Train Model ########### +python "${work_dir}"/bins/tta/train_tta.py \ + --config=$exp_config \ + --num_workers=$num_workers \ + --exp_name=$exp_name \ + --stdout_interval=25 \ \ No newline at end of file diff --git a/egs/tts/FastSpeech2/README.md b/egs/tts/FastSpeech2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..de7d95c8c285e54918cd0f7342b27f15dc96900c --- /dev/null +++ b/egs/tts/FastSpeech2/README.md @@ -0,0 +1,158 @@ + +# FastSpeech2 Recipe + +In this recipe, we will show how to train [FastSpeech2](https://openreview.net/forum?id=piLPYqxtWuA) using Amphion's infrastructure. FastSpeech2 is a non-autoregressive TTS architecture that utilizes feed-forward Transformer blocks. + +There are four stages in total: + +1. Data preparation +2. Features extraction +3. Training +4. Inference + +> **NOTE:** You need to run every command of this recipe in the `Amphion` root path: +> ```bash +> cd Amphion +> ``` + +## 1. Data Preparation + +### Dataset Download +You can use the commonly used TTS dataset to train TTS model, e.g., LJSpeech, VCTK, LibriTTS, etc. We strongly recommend you use LJSpeech to train TTS model for the first time. How to download dataset is detailed [here](../../datasets/README.md). + +### Configuration + +After downloading the dataset, you can set the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets. + +```json + "dataset": [ + "LJSpeech", + ], + "dataset_path": { + // TODO: Fill in your dataset path + "LJSpeech": "[LJSpeech dataset path]", + }, +``` + +## 2. Features Extraction + +### Configuration + +Specify the `processed_dir` and the `log_dir` and for saving the processed data and the checkpoints in `exp_config.json`: + +```json + // TODO: Fill in the output log path + "log_dir": "ckpts/tts", + "preprocess": { + // TODO: Fill in the output data path + "processed_dir": "data", + ... + }, +``` + +### Run + +Run the `run.sh` as the preproces stage (set `--stage 1`): + +```bash +sh egs/tts/FastSpeech2/run.sh --stage 1 +``` + +## 3. Training + +### Configuration + +We provide the default hyparameters in the `exp_config.json`. They can work on single NVIDIA-24g GPU. You can adjust them based on your GPU machines. + +``` +"train": { + "batch_size": 16, + } +``` + +### Run + +Run the `run.sh` as the training stage (set `--stage 2`). Specify a experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `ckpts/tts/[YourExptName]`. + +```bash +sh egs/tts/FastSpeech2/run.sh --stage 2 --name [YourExptName] +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "0,1,2,3"`. + + +## 4. Inference + +### Pre-trained Fastspeech 2 and HiFi-GAN Download + +We released a pre-trained Amphion [Fastspeech 2](https://huggingface.co/amphion/fastspeech2_ljspeech) model and [HiFi-GAN](https://huggingface.co/amphion/hifigan_ljspeech) trained on LJSpeech. 
So you can download the them and generate speech according to the following inference instruction. + + +### Configuration + +For inference, you need to specify the following configurations when running `run.sh`: + + +| Parameters | Description | Example | +| --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `ckpts/tts/[YourExptName]` | +| `--infer_output_dir` | The output directory to save inferred audios. | `ckpts/tts/[YourExptName]/result` | +| `--infer_mode` | The inference mode, e.g., "`single`", "`batch`". | "`single`" to generate a clip of speech, "`batch`" to generate a batch of speech at a time. | +| `--infer_dataset` | The dataset used for inference. | For LJSpeech dataset, the inference dataset would be `LJSpeech`. | +| `--infer_testing_set` | The subset of the inference dataset used for inference, e.g., train, test, golden_test | For LJSpeech dataset, the testing set would be  "`test`" split from LJSpeech at the feature extraction, or "`golden_test`" cherry-picked from test set as template testing set. | +| `--infer_text` | The text to be synthesized. | "`This is a clip of generated speech with the given text from a TTS model.`" | +| `--vocoder_dir` | The directory for the vocoder. | "`ckpts/vocoder/hifigan_ljspeech`" | + + +### Run +For example, if you want to generate speech of all testing set split from LJSpeech, just run: + +```bash +sh egs/tts/FastSpeech2/run.sh --stage 3 \ + --infer_expt_dir ckpts/tts/[YourExptName] \ + --infer_output_dir ckpts/tts/[YourExptName]/result \ + --infer_mode "batch" \ + --infer_dataset "LJSpeech" \ + --infer_testing_set "test" \ + --vocoder_dir ckpts/vocoder/hifigan_ljspeech/checkpoints +``` + +Or, if you want to generate a single clip of speech from a given text, just run: + +```bash +sh egs/tts/FastSpeech2/run.sh --stage 3 \ + --infer_expt_dir ckpts/tts/[YourExptName] \ + --infer_output_dir ckpts/tts/[YourExptName]/result \ + --infer_mode "single" \ + --infer_text "This is a clip of generated speech with the given text from a TTS model." \ + --vocoder_dir ckpts/vocoder/hifigan_ljspeech +``` + +### ISSUES and Solutions + +``` +NotImplementedError: Using RTX 3090 or 4000 series doesn't support faster communication broadband via P2P or IB. Please set `NCCL_P2P_DISABLE="1"` and `NCCL_IB_DISABLE="1" or use `accelerate launch` which will do this automatically. +2024-02-24 10:57:49 | INFO | torch.distributed.distributed_c10d | Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. +``` +The error message is related to an incompatibility issue with the NVIDIA RTX 3090 or 4000 series GPUs when trying to use peer-to-peer (P2P) communication or InfiniBand (IB) for faster communication. This incompatibility arises within the PyTorch accelerate library, which facilitates distributed training and inference. 
+ +To fix this issue, before running your script, you can set the environment variables in your terminal: +``` +export NCCL_P2P_DISABLE=1 +export NCCL_IB_DISABLE=1 +``` + +### Noted +Extensive logging messages related to `torch._subclasses.fake_tensor` and `torch._dynamo.output_graph` may be observed during inference. Despite attempts to ignore these logs, no effective solution has been found. However, it does not impact the inference process. + + + + +```bibtex +@inproceedings{ren2020fastspeech, + title={FastSpeech 2: Fast and High-Quality End-to-End Text to Speech}, + author={Ren, Yi and Hu, Chenxu and Tan, Xu and Qin, Tao and Zhao, Sheng and Zhao, Zhou and Liu, Tie-Yan}, + booktitle={International Conference on Learning Representations}, + year={2020} +} +``` diff --git a/egs/tts/FastSpeech2/exp_config.json b/egs/tts/FastSpeech2/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a9ff1ee6fa60dc05c6bff8eecac96fa1e5aeb44a --- /dev/null +++ b/egs/tts/FastSpeech2/exp_config.json @@ -0,0 +1,22 @@ +{ + "base_config": "config/fs2.json", + "model_type": "FastSpeech2", + "dataset": [ + "LJSpeech" + ], + "dataset_path": { + // TODO: Fill in your dataset path + "LJSpeech": "[LJSpeech dataset path]" + }, + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts" + "log_dir": "ckpts/tts", + "preprocess": { + // TODO: Fill in the output data path. The default value is "Amphion/data" + "processed_dir": "data", + "sample_rate": 22050, + }, + "train": { + "batch_size": 16, + "max_epoch": 100, + } +} diff --git a/egs/tts/FastSpeech2/prepare_mfa.sh b/egs/tts/FastSpeech2/prepare_mfa.sh new file mode 100644 index 0000000000000000000000000000000000000000..1e9759e10458af5b0933e054a35990402826cbc9 --- /dev/null +++ b/egs/tts/FastSpeech2/prepare_mfa.sh @@ -0,0 +1,29 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#!/bin/bash + +# Navigate to the 'pretrained' directory +cd pretrained || { echo "Failed to change directory to 'pretrained'"; exit 1; } + +# Create and navigate to the 'mfa' directory +mkdir -p mfa && cd mfa || { echo "Failed to create or change directory to 'mfa'"; exit 1; } + +# Define the MFA file URL and the file name +mfa_url="https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.1.0-beta.2/montreal-forced-aligner_linux.tar.gz" +mfa_file="montreal-forced-aligner_linux.tar.gz" + +# Download MFA if it doesn't exist +if [ ! -f "$mfa_file" ]; then + wget "$mfa_url" || { echo "Failed to download MFA"; exit 1; } +fi + +# Extract MFA +tar -zxvf "$mfa_file" || { echo "Failed to extract MFA"; exit 1; } + +# Optionally, remove the tar.gz file after extraction +rm "$mfa_file" + +echo "MFA setup completed successfully." diff --git a/egs/tts/FastSpeech2/run.sh b/egs/tts/FastSpeech2/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..bd27a12ad581bbc391523a748c72b7a148fbc0fc --- /dev/null +++ b/egs/tts/FastSpeech2/run.sh @@ -0,0 +1,155 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $exp_dir))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +cd $work_dir/modules/monotonic_align +mkdir -p monotonic_align +python setup.py build_ext --inplace +cd $work_dir + +mfa_dir=$work_dir/pretrained/mfa +echo $mfa_dir + +######## Parse the Given Parameters from the Commond ########### +# options=$(getopt -o c:n:s --long gpu:,config:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir:,name:,stage: -- "$@") +options=$(getopt -o c:n:s --long gpu:,config:,infer_expt_dir:,infer_output_dir:,infer_mode:,infer_dataset:,infer_testing_set:,infer_text:,name:,stage:,vocoder_dir: -- "$@") +eval set -- "$options" + +while true; do + case $1 in + # Experimental Configuration File + -c | --config) shift; exp_config=$1 ; shift ;; + # Experimental Name + -n | --name) shift; exp_name=$1 ; shift ;; + # Running Stage + -s | --stage) shift; running_stage=$1 ; shift ;; + # Visible GPU machines. The default value is "0". + --gpu) shift; gpu=$1 ; shift ;; + + # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]" + --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;; + # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result" + --infer_output_dir) shift; infer_output_dir=$1 ; shift ;; + # [Only for Inference] The inference mode. It can be "batch" to generate speech by batch, or "single" to generage a single clip of speech. + --infer_mode) shift; infer_mode=$1 ; shift ;; + # [Only for Inference] The inference dataset. It is only used when the inference model is "batch". + --infer_dataset) shift; infer_dataset=$1 ; shift ;; + # [Only for Inference] The inference testing set. It is only used when the inference model is "batch". It can be "test" set split from the dataset, or "golden_test" carefully selected from the testing set. + --infer_testing_set) shift; infer_testing_set=$1 ; shift ;; + # [Only for Inference] The text to be synthesized from. It is only used when the inference model is "single". + --infer_text) shift; infer_text=$1 ; shift ;; + # [Only for Inference] The output dir to the vocoder. + --vocoder_dir) shift; vocoder_dir=$1 ; shift ;; + + --) shift ; break ;; + *) echo "Invalid option: $1" exit 1 ;; + esac +done + + +### Value check ### +if [ -z "$running_stage" ]; then + echo "[Error] Please specify the running stage" + exit 1 +fi + +if [ -z "$exp_config" ]; then + exp_config="${exp_dir}"/exp_config.json +fi +echo "Exprimental Configuration File: $exp_config" + +if [ -z "$gpu" ]; then + gpu="0" +fi + +######## Features Extraction ########### +if [ $running_stage -eq 1 ]; then + if [ ! 
-d "$mfa_dir/montreal-forced-aligner" ]; then + bash ${exp_dir}/prepare_mfa.sh + fi + CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/tts/preprocess.py \ + --config=$exp_config \ + --num_workers=4 \ + --prepare_alignment=true +fi + +######## Training ########### +if [ $running_stage -eq 2 ]; then + if [ -z "$exp_name" ]; then + echo "[Error] Please specify the experiments name" + exit 1 + fi + echo "Exprimental Name: $exp_name" + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/tts/train.py \ + --config $exp_config \ + --exp_name $exp_name \ + --log_level debug +fi + +######## Inference ########### +if [ $running_stage -eq 3 ]; then + if [ -z "$infer_expt_dir" ]; then + echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]" + exit 1 + fi + + if [ -z "$infer_output_dir" ]; then + infer_output_dir="$expt_dir/result" + fi + + if [ -z "$vocoder_dir" ]; then + echo "[Error] Please specify the vocoder directory to reconstruct waveform from mel spectrogram." + exit 1 + fi + + if [ -z "$infer_mode" ]; then + echo "[Error] Please specify the inference mode, e.g., "batch", "single"" + exit 1 + fi + + if [ "$infer_mode" = "batch" ] && [ -z "$infer_dataset" ]; then + echo "[Error] Please specify the dataset used in inference when the inference mode is batch" + exit 1 + fi + + if [ "$infer_mode" = "batch" ] && [ -z "$infer_testing_set" ]; then + echo "[Error] Please specify the testing set used in inference when the inference mode is batch" + exit 1 + fi + + if [ "$infer_mode" = "single" ] && [ -z "$infer_text" ]; then + echo "[Error] Please specify the text to be synthesized when the inference mode is single" + exit 1 + fi + + if [ "$infer_mode" = "single" ]; then + echo 'Text: ' ${infer_text} + infer_dataset=None + infer_testing_set=None + elif [ "$infer_mode" = "batch" ]; then + infer_text='' + fi + + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/tts/inference.py \ + --config $exp_config \ + --acoustics_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --mode $infer_mode \ + --dataset $infer_dataset \ + --testing_set $infer_testing_set \ + --text "$infer_text" \ + --log_level debug \ + --vocoder_dir $vocoder_dir + +fi \ No newline at end of file diff --git a/egs/tts/Jets/README.md b/egs/tts/Jets/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e1cc16cc59621bcdbdca2d4c4f081415c6248d77 --- /dev/null +++ b/egs/tts/Jets/README.md @@ -0,0 +1,138 @@ +# Jets Recipe + +In this recipe, we will show how to train [Jets](https://arxiv.org/abs/2203.16852) using Amphion's infrastructure. Jets is an end-to-end text-to-speech (E2E-TTS) model which jointly trains FastSpeech2 and HiFi-GAN. + +There are four stages in total: + +1. Data preparation +2. Features extraction +3. Training +4. Inference + +> **NOTE:** You need to run every command of this recipe in the `Amphion` root path: +> +> ```bash +> cd Amphion +> ``` + +## 1. Data Preparation + +### Dataset Download + +You can use LJSpeech to train TTS model. How to download dataset is detailed [here](../../datasets/README.md). + +### Configuration + +After downloading the dataset, you can set the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets. + +```json + "dataset": [ + "LJSpeech", + ], + "dataset_path": { + // TODO: Fill in your dataset path + "LJSpeech": "[LJSpeech dataset path]", + }, +``` + +## 2. 
Features Extraction + +### Configuration + +Specify the `processed_dir` and the `log_dir` and for saving the processed data and the checkpoints in `exp_config.json`: + +```json + // TODO: Fill in the output log path + "log_dir": "ckpts/tts", + "preprocess": { + // TODO: Fill in the output data path + "processed_dir": "data", + ... + }, +``` + +### Run + +Run the `run.sh` as the preproces stage (set `--stage 1`): + +```bash +sh egs/tts/Jets/run.sh --stage 1 +``` + +## 3. Training + +### Configuration + +We provide the default hyparameters in the `exp_config.json`. They can work on single NVIDIA-24g GPU. You can adjust them based on your GPU machines. + +``` +"train": { + "batch_size": 16, + } +``` + +### Run + +Run the `run.sh` as the training stage (set `--stage 2`). Specify a experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `ckpts/tts/[YourExptName]`. + +```bash +sh egs/tts/Jets/run.sh --stage 2 --name [YourExptName] +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. We recommend you to only use one GPU for training. + +## 4. Inference + +### Configuration + +For inference, you need to specify the following configurations when running `run.sh`: + +| Parameters | Description | Example | +| ----------------------- | ------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------- | +| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `ckpts/tts/[YourExptName]` | +| `--infer_output_dir` | The output directory to save inferred audios. | `ckpts/tts/[YourExptName]/result` | +| `--infer_mode` | The inference mode, e.g., "`batch`". | `batch`" to generate a batch of speech at a time. | +| `--infer_dataset` | The dataset used for inference. | For LJSpeech dataset, the inference dataset would be `LJSpeech`. | +| `--infer_testing_set` | The subset of the inference dataset used for inference, e.g., test | For LJSpeech dataset, the testing set would be  "`test`" split from LJSpeech at the feature extraction | + +### Run + +For example, if you want to generate speech of all testing set split from LJSpeech, just run: + +```bash +sh egs/tts/Jets/run.sh --stage 3 \ + --infer_expt_dir ckpts/tts/[YourExptName] \ + --infer_output_dir ckpts/tts/[YourExptName]/result \ + --infer_mode "batch" \ + --infer_dataset "LJSpeech" \ + --infer_testing_set "test" +``` + +### ISSUES and Solutions + +``` +NotImplementedError: Using RTX 3090 or 4000 series doesn't support faster communication broadband via P2P or IB. Please set `NCCL_P2P_DISABLE="1"` and `NCCL_IB_DISABLE="1" or use `accelerate launch` which will do this automatically. +2024-02-24 10:57:49 | INFO | torch.distributed.distributed_c10d | Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes. +``` + +The error message is related to an incompatibility issue with the NVIDIA RTX 3090 or 4000 series GPUs when trying to use peer-to-peer (P2P) communication or InfiniBand (IB) for faster communication. This incompatibility arises within the PyTorch accelerate library, which facilitates distributed training and inference. 
+ +To fix this issue, before running your script, you can set the environment variables in your terminal: + +``` +export NCCL_P2P_DISABLE=1 +export NCCL_IB_DISABLE=1 +``` + +### Noted + +Extensive logging messages related to `torch._subclasses.fake_tensor` and `torch._dynamo.output_graph` may be observed during inference. Despite attempts to ignore these logs, no effective solution has been found. However, it does not impact the inference process. + +```bibtex +@article{lim2022jets, + title={JETS: Jointly training FastSpeech2 and HiFi-GAN for end to end text to speech}, + author={Lim, Dan and Jung, Sunghee and Kim, Eesung}, + journal={arXiv preprint arXiv:2203.16852}, + year={2022} +} +``` diff --git a/egs/tts/Jets/exp_config.json b/egs/tts/Jets/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8f0dabb0b7609267b04e48ba8eb35d87b62d86c --- /dev/null +++ b/egs/tts/Jets/exp_config.json @@ -0,0 +1,32 @@ +{ + "base_config": "config/jets.json", + "model_type": "Jets", + "dataset": [ + "LJSpeech" + ], + "dataset_path": { + "LJSpeech": "../LJSpeech-1.1" + }, + "log_dir": "ckpts/tts", + "preprocess": { + "processed_dir": "data", + "sample_rate": 22050, + "use_audios": true, + "extract_audio": true, +}, + "train": { + "batch_size": 16, + "max_epoch": 100, + "learning_rate": 2e-4, + "AdamW": { + "betas": [ + 0.8, + 0.99 + ], + "eps": 1e-9, + }, + "lr_decay": 0.999875, + "segment_size": 64, + "upsample_factor": 256 + } +} diff --git a/egs/tts/Jets/prepare_mfa.sh b/egs/tts/Jets/prepare_mfa.sh new file mode 100644 index 0000000000000000000000000000000000000000..1e9759e10458af5b0933e054a35990402826cbc9 --- /dev/null +++ b/egs/tts/Jets/prepare_mfa.sh @@ -0,0 +1,29 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#!/bin/bash + +# Navigate to the 'pretrained' directory +cd pretrained || { echo "Failed to change directory to 'pretrained'"; exit 1; } + +# Create and navigate to the 'mfa' directory +mkdir -p mfa && cd mfa || { echo "Failed to create or change directory to 'mfa'"; exit 1; } + +# Define the MFA file URL and the file name +mfa_url="https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.1.0-beta.2/montreal-forced-aligner_linux.tar.gz" +mfa_file="montreal-forced-aligner_linux.tar.gz" + +# Download MFA if it doesn't exist +if [ ! -f "$mfa_file" ]; then + wget "$mfa_url" || { echo "Failed to download MFA"; exit 1; } +fi + +# Extract MFA +tar -zxvf "$mfa_file" || { echo "Failed to extract MFA"; exit 1; } + +# Optionally, remove the tar.gz file after extraction +rm "$mfa_file" + +echo "MFA setup completed successfully." diff --git a/egs/tts/Jets/run.sh b/egs/tts/Jets/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..02fe60f50f1e545f26da8f26964a7513c481d4df --- /dev/null +++ b/egs/tts/Jets/run.sh @@ -0,0 +1,147 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $exp_dir))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +cd $work_dir/modules/monotonic_align +mkdir -p monotonic_align +python setup.py build_ext --inplace +cd $work_dir + +mfa_dir=$work_dir/pretrained/mfa +echo $mfa_dir + +######## Parse the Given Parameters from the Commond ########### +# options=$(getopt -o c:n:s --long gpu:,config:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,name:,stage: -- "$@") +options=$(getopt -o c:n:s --long gpu:,config:,infer_expt_dir:,infer_output_dir:,infer_mode:,infer_dataset:,infer_testing_set:,infer_text:,name:,stage:, -- "$@") +eval set -- "$options" + +while true; do + case $1 in + # Experimental Configuration File + -c | --config) shift; exp_config=$1 ; shift ;; + # Experimental Name + -n | --name) shift; exp_name=$1 ; shift ;; + # Running Stage + -s | --stage) shift; running_stage=$1 ; shift ;; + # Visible GPU machines. The default value is "0". + --gpu) shift; gpu=$1 ; shift ;; + + # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]" + --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;; + # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result" + --infer_output_dir) shift; infer_output_dir=$1 ; shift ;; + # [Only for Inference] The inference mode. It can be "batch" to generate speech by batch, or "single" to generage a single clip of speech. + --infer_mode) shift; infer_mode=$1 ; shift ;; + # [Only for Inference] The inference dataset. It is only used when the inference model is "batch". + --infer_dataset) shift; infer_dataset=$1 ; shift ;; + # [Only for Inference] The inference testing set. It is only used when the inference model is "batch". It can be "test" set split from the dataset, or "golden_test" carefully selected from the testing set. + --infer_testing_set) shift; infer_testing_set=$1 ; shift ;; + # [Only for Inference] The text to be synthesized from. It is only used when the inference model is "single". + --infer_text) shift; infer_text=$1 ; shift ;; + + --) shift ; break ;; + *) echo "Invalid option: $1" exit 1 ;; + esac +done + + +### Value check ### +if [ -z "$running_stage" ]; then + echo "[Error] Please specify the running stage" + exit 1 +fi + +if [ -z "$exp_config" ]; then + exp_config="${exp_dir}"/exp_config.json +fi +echo "Exprimental Configuration File: $exp_config" + +if [ -z "$gpu" ]; then + gpu="0" +fi + +######## Features Extraction ########### +if [ $running_stage -eq 1 ]; then + if [ ! -d "$mfa_dir/montreal-forced-aligner" ]; then + bash ${exp_dir}/prepare_mfa.sh + fi + CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/tts/preprocess.py \ + --config=$exp_config \ + --num_workers=4 \ + --prepare_alignment=true +fi + +######## Training ########### +if [ $running_stage -eq 2 ]; then + if [ -z "$exp_name" ]; then + echo "[Error] Please specify the experiments name" + exit 1 + fi + echo "Exprimental Name: $exp_name" + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/tts/train.py \ + --config $exp_config \ + --exp_name $exp_name \ + --log_level debug +fi + +######## Inference ########### +if [ $running_stage -eq 3 ]; then + if [ -z "$infer_expt_dir" ]; then + echo "[Error] Please specify the experimental directionary. 
The value is like [Your path to save logs and checkpoints]/[YourExptName]" + exit 1 + fi + + if [ -z "$infer_output_dir" ]; then + infer_output_dir="$expt_dir/result" + fi + + if [ -z "$infer_mode" ]; then + echo "[Error] Please specify the inference mode, e.g., "batch", "single"" + exit 1 + fi + + if [ "$infer_mode" = "batch" ] && [ -z "$infer_dataset" ]; then + echo "[Error] Please specify the dataset used in inference when the inference mode is batch" + exit 1 + fi + + if [ "$infer_mode" = "batch" ] && [ -z "$infer_testing_set" ]; then + echo "[Error] Please specify the testing set used in inference when the inference mode is batch" + exit 1 + fi + + if [ "$infer_mode" = "single" ] && [ -z "$infer_text" ]; then + echo "[Error] Please specify the text to be synthesized when the inference mode is single" + exit 1 + fi + + if [ "$infer_mode" = "single" ]; then + echo 'Text: ' ${infer_text} + infer_dataset=None + infer_testing_set=None + elif [ "$infer_mode" = "batch" ]; then + infer_text='' + fi + + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/tts/inference.py \ + --config $exp_config \ + --acoustics_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --mode $infer_mode \ + --dataset $infer_dataset \ + --testing_set $infer_testing_set \ + --text "$infer_text" \ + --log_level debug \ + +fi \ No newline at end of file diff --git a/egs/tts/NaturalSpeech2/README.md b/egs/tts/NaturalSpeech2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..105e9c7f4627f40632ef9a09b17ad24aeeb24b68 --- /dev/null +++ b/egs/tts/NaturalSpeech2/README.md @@ -0,0 +1,47 @@ +# NaturalSpeech2 Recipe + +[![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Spaces-yellow)](https://huggingface.co/spaces/amphion/NaturalSpeech2) + +In this recipe, we will show how to train [NaturalSpeech2](https://arxiv.org/abs/2304.09116) using Amphion's infrastructure. NaturalSpeech2 is a zero-shot TTS architecture that predicts latent representations of a neural audio codec. + +There are three stages in total: + +1. Data processing +2. Training +3. Inference + +> **NOTE:** You need to run every command of this recipe in the `Amphion` root path: +> ```bash +> cd Amphion +> ``` + +## 1. Data processing + +You can use the commonly used TTS dataset to train NaturalSpeech2 model, e.g., LibriTTS, etc. We strongly recommend you use LibriTTS to train NaturalSpeech2 model for the first time. How to download dataset is detailed [here](../../datasets/README.md). + +You can follow other Amphion TTS recipes for the data processing. + +## 3. Training + +```bash +sh egs/tts/NaturalSpeech2/run_train.sh +``` + +## 4. Inference + +```bash +bash egs/tts/NaturalSpeech2/run_inference.sh --text "[The text you want to generate]" +``` + +We released a pre-trained Amphion NatrualSpeech2 model. So you can download the pre-trained model [here](https://huggingface.co/amphion/naturalspeech2_libritts) and generate speech following the above inference instruction. + +We also provided an online [demo](https://huggingface.co/spaces/amphion/NaturalSpeech2), feel free to try it! 
+ +```bibtex +@article{shen2023naturalspeech, + title={Naturalspeech 2: Latent diffusion models are natural and zero-shot speech and singing synthesizers}, + author={Shen, Kai and Ju, Zeqian and Tan, Xu and Liu, Yanqing and Leng, Yichong and He, Lei and Qin, Tao and Zhao, Sheng and Bian, Jiang}, + journal={arXiv preprint arXiv:2304.09116}, + year={2023} +} +``` diff --git a/egs/tts/NaturalSpeech2/exp_config.json b/egs/tts/NaturalSpeech2/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6e840f3090e85ab6b9e88f07b2826b2fef9a5ba8 --- /dev/null +++ b/egs/tts/NaturalSpeech2/exp_config.json @@ -0,0 +1,39 @@ +{ + "base_config": "egs/tts/NaturalSpeech2/exp_config_base.json", + "dataset": [ + "libritts" + ], + "preprocess": { + // Specify the output root path to save the processed data + "processed_dir": "data", + "train_file": "train.json", + "valid_file": "test.json", + "read_metadata": true, + "metadata_dir": "metadata" + }, + // Specify the output root path to save model ckpts and logs + "log_dir": "ckpts/tts", + "train": { + // New trainer and Accelerator + "gradient_accumulation_step": 1, + "tracker": ["tensorboard"], + "max_epoch": 5000, + "save_checkpoint_stride": [1], + "keep_last": [1000], + "run_eval": [true], + "dataloader": { + "num_worker": 16, + "pin_memory": true + }, + "adam": { + "lr": 1.0e-4 + }, + "use_dynamic_batchsize": true, + "batch_size": 8, + "max_tokens": 7500, + "max_sentences": 32, + "lr_warmup_steps": 5000, + "lr_scheduler": "cosine", + "num_train_steps": 800000 + } + } \ No newline at end of file diff --git a/egs/tts/NaturalSpeech2/exp_config_base.json b/egs/tts/NaturalSpeech2/exp_config_base.json new file mode 100644 index 0000000000000000000000000000000000000000..ac3842e2d51002aaa0ada6c49bc2d7c2abd7323e --- /dev/null +++ b/egs/tts/NaturalSpeech2/exp_config_base.json @@ -0,0 +1,118 @@ +{ + "base_config": "config/ns2.json", + "model_type": "NaturalSpeech2", + "dataset": [ + "libritts" + ], + "preprocess": { + "use_mel": false, + "use_code": true, + "use_spkid": true, + "use_pitch": true, + "use_duration": true, + "use_phone": true, + "use_len": true, + "use_cross_reference": true, + "train_file": "train.json", + "valid_file": "test.json", + "melspec_dir": "mel", + "code_dir": "code", + "pitch_dir": "pitch", + "duration_dir": "duration", + "metadata_dir": "metadata", + "read_metadata": true, + "clip_mode": "start" + }, + "model": { + "latent_dim": 128, + "prior_encoder": { + "vocab_size": 100, + "pitch_min": 50, + "pitch_max": 1100, + "pitch_bins_num": 512, + "encoder": { + "encoder_layer": 6, + "encoder_hidden": 512, + "encoder_head": 8, + "conv_filter_size": 2048, + "conv_kernel_size": 9, + "encoder_dropout": 0.2, + "use_cln": true + }, + "duration_predictor": { + "input_size": 512, + "filter_size": 512, + "kernel_size": 3, + "conv_layers": 30, + "cross_attn_per_layer": 3, + "attn_head": 8, + "drop_out": 0.5 + }, + "pitch_predictor": { + "input_size": 512, + "filter_size": 512, + "kernel_size": 5, + "conv_layers": 30, + "cross_attn_per_layer": 3, + "attn_head": 8, + "drop_out": 0.5 + } + }, + "diffusion": { + "wavenet": { + "input_size": 128, + "hidden_size": 512, + "out_size": 128, + "num_layers": 40, + "cross_attn_per_layer": 3, + "dilation_cycle": 2, + "attn_head": 8, + "drop_out": 0.2 + }, + "beta_min": 0.05, + "beta_max": 20, + "sigma": 1.0, + "noise_factor": 1.0, + "ode_solver": "euler", + "diffusion_type": "diffusion" + }, + "prompt_encoder": { + "encoder_layer": 6, + "encoder_hidden": 512, + "encoder_head": 8, + 
"conv_filter_size": 2048, + "conv_kernel_size": 9, + "encoder_dropout": 0.2, + "use_cln": false + }, + "query_emb": { + "query_token_num": 32, + "hidden_size": 512, + "head_num": 8 + }, + "inference_step": 500 + }, + "train": { + "use_dynamic_batchsize": true, + "max_tokens": 7500, + "max_sentences": 32, + "lr_warmup_steps": 5000, + "lr_scheduler": "cosine", + "num_train_steps": 800000, + "adam": { + "lr": 7.5e-5 + }, + "diff_ce_loss_lambda": 0.5, + "diff_noise_loss_lambda": 1.0, + "ddp": false, + "random_seed": 114, + "batch_size": 32, + "epochs": 5000, + "max_steps": 1000000, + "total_training_steps": 800000, + "save_summary_steps": 500, + "save_checkpoints_steps": 2000, + "valid_interval": 2000, + "keep_checkpoint_max": 100 + } +} \ No newline at end of file diff --git a/egs/tts/NaturalSpeech2/run_inference.sh b/egs/tts/NaturalSpeech2/run_inference.sh new file mode 100644 index 0000000000000000000000000000000000000000..5e04cec07bccfd7a964ca18fb789115deb97a5ef --- /dev/null +++ b/egs/tts/NaturalSpeech2/run_inference.sh @@ -0,0 +1,49 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $exp_dir))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Set Experiment Configuration ########### +exp_config="$exp_dir/exp_config.json" +exp_name="ns2_libritts" +ref_audio="$work_dir/egs/tts/NaturalSpeech2/prompt_example/ref_audio.wav" +checkpoint_path="$work_dir/ckpts/tts/naturalspeech2_libritts/checkpoint/epoch-0089_step-0512912_loss-6.367693" +output_dir="$work_dir/output" +mode="single" + +export CUDA_VISIBLE_DEVICES="0" + +######## Parse Command Line Arguments ########### +while [[ $# -gt 0 ]] +do +key="$1" + +case $key in + --text) + text="$2" + shift # past argument + shift # past value + ;; + *) # unknown option + shift # past argument + ;; +esac +done + +######## Train Model ########### +python "${work_dir}"/bins/tts/inference.py \ + --config=$exp_config \ + --text="$text" \ + --mode=$mode \ + --checkpoint_path=$checkpoint_path \ + --ref_audio=$ref_audio \ + --output_dir=$output_dir \ \ No newline at end of file diff --git a/egs/tts/NaturalSpeech2/run_train.sh b/egs/tts/NaturalSpeech2/run_train.sh new file mode 100644 index 0000000000000000000000000000000000000000..365c160105300c552b0ca6dee09a3631b8d14e09 --- /dev/null +++ b/egs/tts/NaturalSpeech2/run_train.sh @@ -0,0 +1,24 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $exp_dir))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Set Experiment Configuration ########### +exp_config="$exp_dir/exp_config.json" +exp_name="ns2_libritts" + +######## Train Model ########### +CUDA_VISIBLE_DEVICES="0" accelerate \ + "${work_dir}"/bins/tts/train.py \ + --config=$exp_config \ + --exp_name=$exp_name \ + --log_level debug \ \ No newline at end of file diff --git a/egs/tts/README.md b/egs/tts/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4af96cf04ef2fc5ad6a6260c882cbf696bfee5de --- /dev/null +++ b/egs/tts/README.md @@ -0,0 +1,18 @@ + +# Amphion Text-to-Speech (TTS) Recipe + +## Quick Start + +We provide a **[beginner recipe](VALLE_V2/)** to demonstrate how to train a cutting edge TTS model. Specifically, it is Amphion's re-implementation for [VALL-E](https://arxiv.org/abs/2301.02111), which is a zero-shot TTS architecture that uses a neural codec language model with discrete codes. + +## Supported Model Architectures + +Until now, Amphion TTS supports the following models or architectures, +- **[FastSpeech2](FastSpeech2)**: A non-autoregressive TTS architecture that utilizes feed-forward Transformer blocks. +- **[VITS](VITS)**: An end-to-end TTS architecture that utilizes conditional variational autoencoder with adversarial learning +- **[VALL-E](VALLE_V2)**: A zero-shot TTS architecture that uses a neural codec language model with discrete codes. This model is our updated VALL-E implementation as of June 2024 which uses Llama as its underlying architecture. The previous version of VALL-E release can be found [here](VALLE) +- **[NaturalSpeech2](NaturalSpeech2)** (👨‍💻 developing): An architecture for TTS that utilizes a latent diffusion model to generate natural-sounding voices. +- **[Jets](Jets)**: An end-to-end TTS model that jointly trains FastSpeech2 and HiFi-GAN with an alignment module. + +## Amphion TTS Demo +Here are some [TTS samples](https://openhlt.github.io/Amphion_TTS_Demo/) from Amphion. diff --git a/egs/tts/VALLE/README.md b/egs/tts/VALLE/README.md new file mode 100644 index 0000000000000000000000000000000000000000..192e1fb0c89e4d73e453429237645784d2ed9650 --- /dev/null +++ b/egs/tts/VALLE/README.md @@ -0,0 +1,207 @@ +# VALL-E Recipe + +In this recipe, we will show how to train [VALL-E](https://arxiv.org/abs/2301.02111) using Amphion's infrastructure. VALL-E is a zero-shot TTS architecture that uses a neural codec language model with discrete codes. + +There are four stages in total: + +1. Data preparation +2. Features extraction +3. Training +4. Inference + +> **NOTE:** You need to run every command of this recipe in the `Amphion` root path: +> ```bash +> cd Amphion +> ``` + +## 1. Data Preparation + +### Dataset Download +You can use the commonly used TTS dataset to train the VALL-E model, e.g., LibriTTS, etc. We strongly recommend you use LibriTTS to train the VALL-E model for the first time. How to download the dataset is detailed [here](../../datasets/README.md). + +### Configuration + +After downloading the dataset, you can set the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets. + +```json + "dataset": [ + "libritts", + ], + "dataset_path": { + // TODO: Fill in your dataset path + "libritts": "[LibriTTS dataset path]", + }, +``` + +## 2. 
Features Extraction + +### Configuration + +Specify the `processed_dir` and the `log_dir` and for saving the processed data and the checkpoints in `exp_config.json`: + +```json + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts" + "log_dir": "ckpts/tts", + "preprocess": { + // TODO: Fill in the output data path. The default value is "Amphion/data" + "processed_dir": "data", + ... + }, +``` + +### Run + +Run the `run.sh` as the preprocess stage (set `--stage 1`): + +```bash +sh egs/tts/VALLE/run.sh --stage 1 +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "1"`. + + +## 3. Training + +### Configuration + +We provide the default hyperparameters in the `exp_config.json`. They can work on a single NVIDIA-24g GPU. You can adjust them based on your GPU machines. + +```json +"train": { + "batch_size": 4, + } +``` + +### Train From Scratch + +Run the `run.sh` as the training stage (set `--stage 2`). Specify an experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/tts/[YourExptName]`. + +Specifically, VALL-E needs to train an autoregressive (AR) model and then a non-autoregressive (NAR) model. So, you can set `--model_train_stage 1` to train AR model, and set `--model_train_stage 2` to train NAR model, where `--ar_model_ckpt_dir` should be set as the checkpoint path to the trained AR model. + + +Train an AR model, just run: + +```bash +sh egs/tts/VALLE/run.sh --stage 2 --model_train_stage 1 --name [YourExptName] +``` + +Train a NAR model, just run: +```bash +sh egs/tts/VALLE/run.sh --stage 2 --model_train_stage 2 --ar_model_ckpt_dir [ARModelPath] --name [YourExptName] +``` + + + +### Train From Existing Source + +We support training from existing sources for various purposes. You can resume training the model from a checkpoint or fine-tune a model from another checkpoint. + +By setting `--resume true`, the training will resume from the **latest checkpoint** from the current `[YourExptName]` by default. For example, if you want to resume training from the latest checkpoint in `Amphion/ckpts/tts/[YourExptName]/checkpoint`, + +Train an AR model, just run: + +```bash +sh egs/tts/VALLE/run.sh --stage 2 --model_train_stage 1 --name [YourExptName] \ + --resume true +``` + +Train a NAR model, just run: +```bash +sh egs/tts/VALLE/run.sh --stage 2 --model_train_stage 2 --ar_model_ckpt_dir [ARModelPath] --name [YourExptName] \ + --resume true +``` + + + +You can also choose a **specific checkpoint** for retraining by `--resume_from_ckpt_path` argument. For example, if you want to resume training from the checkpoint `Amphion/ckpts/tts/[YourExptName]/checkpoint/[SpecificCheckpoint]`, + +Train an AR model, just run: + +```bash +sh egs/tts/VALLE/run.sh --stage 2 --model_train_stage 1 --name [YourExptName] \ + --resume true \ + --resume_from_ckpt_path "Amphion/ckpts/tts/[YourExptName]/checkpoint/[SpecificARCheckpoint]" +``` + +Train a NAR model, just run: +```bash +sh egs/tts/VALLE/run.sh --stage 2 --model_train_stage 2 --ar_model_ckpt_dir [ARModelPath] --name [YourExptName] \ + --resume true \ + --resume_from_ckpt_path "Amphion/ckpts/tts/[YourExptName]/checkpoint/[SpecificNARCheckpoint]" +``` + + +If you want to **fine-tune from another checkpoint**, just use `--resume_type` and set it to `"finetune"`. 
For example, If you want to fine-tune the model from the checkpoint `Amphion/ckpts/tts/[AnotherExperiment]/checkpoint/[SpecificCheckpoint]`, + + +Train an AR model, just run: + +```bash +sh egs/tts/VALLE/run.sh --stage 2 --model_train_stage 1 --name [YourExptName] \ + --resume true \ + --resume_from_ckpt_path "Amphion/ckpts/tts/[YourExptName]/checkpoint/[SpecificARCheckpoint]" \ + --resume_type "finetune" +``` + +Train a NAR model, just run: +```bash +sh egs/tts/VALLE/run.sh --stage 2 --model_train_stage 2 --ar_model_ckpt_dir [ARModelPath] --name [YourExptName] \ + --resume true \ + --resume_from_ckpt_path "Amphion/ckpts/tts/[YourExptName]/checkpoint/[SpecificNARCheckpoint]" \ + --resume_type "finetune" +``` + +> **NOTE:** The `--resume_type` is set as `"resume"` in default. It's not necessary to specify it when resuming training. +> +> The difference between `"resume"` and `"finetune"` is that the `"finetune"` will **only** load the pretrained model weights from the checkpoint, while the `"resume"` will load all the training states (including optimizer, scheduler, etc.) from the checkpoint. + + + + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "0,1,2,3"`. + + +## 4. Inference + +### Configuration + +For inference, you need to specify the following configurations when running `run.sh`: + + + +| Parameters | Description | Example | +| --------------------- | -------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `--infer_expt_dir` | The experimental directory of NAR model which contains `checkpoint` | `Amphion/ckpts/tts/[YourExptName]` | +| `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/tts/[YourExptName]/result` | +| `--infer_mode` | The inference mode, e.g., "`single`", "`batch`". | "`single`" to generate a clip of speech, "`batch`" to generate a batch of speech at a time. | +| `--infer_text` | The text to be synthesized. | "`This is a clip of generated speech with the given text from a TTS model.`" | +| `--infer_text_prompt` | The text prompt for inference. | The text prompt should be aligned with the audio prompt. | +| `--infer_audio_prompt` | The audio prompt for inference. | The audio prompt should be aligned with text prompt.| +| `--test_list_file` | The test list file used for batch inference. | The format of test list file is `text\|text_prompt\|audio_prompt`.| + + +### Run +For example, if you want to generate a single clip of speech, just run: + +```bash +sh egs/tts/VALLE/run.sh --stage 3 --gpu "0" \ + --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \ + --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \ + --infer_mode "single" \ + --infer_text "This is a clip of generated speech with the given text from a TTS model." \ + --infer_text_prompt "But even the unsuccessful dramatist has his moments." \ + --infer_audio_prompt egs/tts/VALLE/prompt_examples/7176_92135_000004_000000.wav +``` + +We have released pre-trained VALL-E models, so you can download the pre-trained model and then generate speech following the above inference instruction. Specifically, +1. 
The pre-trained VALL-E trained on [LibriTTS](https://github.com/open-mmlab/Amphion/tree/main/egs/datasets#libritts) can be downloaded [here](https://huggingface.co/amphion/valle-libritts). +2. The pre-trained VALL-E trained on the part of [Libri-light](https://ai.meta.com/tools/libri-light/) (about 6k hours) can be downloaded [here](https://huggingface.co/amphion/valle_librilight_6k). + +```bibtex +@article{wang2023neural, + title={Neural codec language models are zero-shot text to speech synthesizers}, + author={Wang, Chengyi and Chen, Sanyuan and Wu, Yu and Zhang, Ziqiang and Zhou, Long and Liu, Shujie and Chen, Zhuo and Liu, Yanqing and Wang, Huaming and Li, Jinyu and others}, + journal={arXiv preprint arXiv:2301.02111}, + year={2023} +} +``` \ No newline at end of file diff --git a/egs/tts/VALLE/exp_config.json b/egs/tts/VALLE/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6ad1e894a1be3dd1e15594102275f1280fa827b5 --- /dev/null +++ b/egs/tts/VALLE/exp_config.json @@ -0,0 +1,33 @@ +{ + "base_config": "config/valle.json", + "model_type": "VALLE", + "dataset": [ + "libritts" + ], + "dataset_path": { + "libritts": "[LibriTTS dataset path]" + }, + "preprocess": { + "extract_phone": true, + "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)" + "extract_acoustic_token": true, + "use_phone": true, + "use_acoustic_token": true, + "processed_dir": "Amphion/data/", + "sample_rate": 24000, // "Audio sampling rate." + "codec_hop_size": 320, // "Audio codec hop size." + "valid_file": "test.json", + }, + "model": { + "prefix_mode": 1, // "The mode for how to prefix VALL-E NAR Decoder, 0: no prefix, 1: 0 to random, 2: random to random, 4: chunk of pre or post utterance.", + }, + "log_dir": "Amphion/ckpts/tts/valle", + "train": { + "batch_size": 4, + "train_stage": 1, // 0: train all modules, For VALL_E, support 1: AR Decoder 2: NAR Decoder(s) + "max_epoch": 20, // "Number of epochs to train." + "use_dynamic_batchsize": true, // If use dynamic batch size + "max_tokens": 4000, // If use dynamic batch size + "max_sentences": 10 // If use dynamic batch size + } +} diff --git a/egs/tts/VALLE/prompt_examples/260_123440_000010_000004.normalized.txt b/egs/tts/VALLE/prompt_examples/260_123440_000010_000004.normalized.txt new file mode 100644 index 0000000000000000000000000000000000000000..ed9d2b859d3c7bbb7bcc2476323d1a219ee8d53f --- /dev/null +++ b/egs/tts/VALLE/prompt_examples/260_123440_000010_000004.normalized.txt @@ -0,0 +1 @@ +I almost think I can remember feeling a little different. \ No newline at end of file diff --git a/egs/tts/VALLE/prompt_examples/260_123440_000010_000004.wav b/egs/tts/VALLE/prompt_examples/260_123440_000010_000004.wav new file mode 100644 index 0000000000000000000000000000000000000000..fda92ee7b5e25572ec547d42f90b0cd2552b583d Binary files /dev/null and b/egs/tts/VALLE/prompt_examples/260_123440_000010_000004.wav differ diff --git a/egs/tts/VALLE/prompt_examples/5142_33396_000002_000004.normalized.txt b/egs/tts/VALLE/prompt_examples/5142_33396_000002_000004.normalized.txt new file mode 100644 index 0000000000000000000000000000000000000000..4dab0ebd446cb814acccb2b97fee31a0d5b7444d --- /dev/null +++ b/egs/tts/VALLE/prompt_examples/5142_33396_000002_000004.normalized.txt @@ -0,0 +1 @@ +Ten sons sat at meat with him, and I was the youngest. 
\ No newline at end of file diff --git a/egs/tts/VALLE/prompt_examples/5142_33396_000002_000004.wav b/egs/tts/VALLE/prompt_examples/5142_33396_000002_000004.wav new file mode 100644 index 0000000000000000000000000000000000000000..c8eb2fa7b792dc2963a3d50018ef598652400592 Binary files /dev/null and b/egs/tts/VALLE/prompt_examples/5142_33396_000002_000004.wav differ diff --git a/egs/tts/VALLE/prompt_examples/6829_68771_000027_000000.normalized.txt b/egs/tts/VALLE/prompt_examples/6829_68771_000027_000000.normalized.txt new file mode 100644 index 0000000000000000000000000000000000000000..04db64ff2711c35a78cbb210d11fce06865d831c --- /dev/null +++ b/egs/tts/VALLE/prompt_examples/6829_68771_000027_000000.normalized.txt @@ -0,0 +1 @@ +The girl entered, and gave an involuntary cry of surprise. \ No newline at end of file diff --git a/egs/tts/VALLE/prompt_examples/6829_68771_000027_000000.wav b/egs/tts/VALLE/prompt_examples/6829_68771_000027_000000.wav new file mode 100644 index 0000000000000000000000000000000000000000..b0be22f88911a0050a2ed62d20bb3a846ba34d64 Binary files /dev/null and b/egs/tts/VALLE/prompt_examples/6829_68771_000027_000000.wav differ diff --git a/egs/tts/VALLE/prompt_examples/7176_92135_000004_000000.normalized.txt b/egs/tts/VALLE/prompt_examples/7176_92135_000004_000000.normalized.txt new file mode 100644 index 0000000000000000000000000000000000000000..6400078bd5d5fd4043a0019fc1e8b276b408cd39 --- /dev/null +++ b/egs/tts/VALLE/prompt_examples/7176_92135_000004_000000.normalized.txt @@ -0,0 +1 @@ +But even the unsuccessful dramatist has his moments. \ No newline at end of file diff --git a/egs/tts/VALLE/prompt_examples/7176_92135_000004_000000.wav b/egs/tts/VALLE/prompt_examples/7176_92135_000004_000000.wav new file mode 100644 index 0000000000000000000000000000000000000000..be07ba97e132ae124dc57226ab2044c042a3ceb3 Binary files /dev/null and b/egs/tts/VALLE/prompt_examples/7176_92135_000004_000000.wav differ diff --git a/egs/tts/VALLE/run.sh b/egs/tts/VALLE/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..703b699bef8459896ce7d7fcf1f1bba3dbb874c2 --- /dev/null +++ b/egs/tts/VALLE/run.sh @@ -0,0 +1,190 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $exp_dir))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +cd $work_dir/modules/monotonic_align +mkdir -p monotonic_align +python setup.py build_ext --inplace +cd $work_dir + +######## Parse the Given Parameters from the Commond ########### +options=$(getopt -o c:n:s --long gpu:,config:,infer_expt_dir:,ar_model_ckpt_dir:,infer_output_dir:,infer_mode:,infer_test_list_file:,infer_text:,infer_text_prompt:,infer_audio_prompt:,model_train_stage:,name:,stage:,resume:,resume_from_ckpt_path:,resume_type: -- "$@") +eval set -- "$options" + +while true; do + case $1 in + # Experimental Configuration File + -c | --config) shift; exp_config=$1 ; shift ;; + # Experimental Name + -n | --name) shift; exp_name=$1 ; shift ;; + # Running Stage + -s | --stage) shift; running_stage=$1 ; shift ;; + # Visible GPU machines. The default value is "0". + --gpu) shift; gpu=$1 ; shift ;; + + # [Only for Training] Model training stage. + --model_train_stage) shift; model_train_stage=$1 ; shift ;; + # [Only for Training] The stage1 ckpt dir. 
The value is like "[Your path to save logs and checkpoints]/[YourExptName]" + --ar_model_ckpt_dir) shift; ar_model_ckpt_dir=$1 ; shift ;; + + # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]" + --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;; + # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result" + --infer_output_dir) shift; infer_output_dir=$1 ; shift ;; + + # [Only for Inference] The inference mode. It can be "batch" to generate speech by batch, or "single" to generage a single clip of speech. + --infer_mode) shift; infer_mode=$1 ; shift ;; + # [Only for Inference] The inference test list file. It is only used when the inference model is "batch". + --infer_test_list_file) shift; infer_test_list_file=$1 ; shift ;; + # [Only for Inference] The text to be synthesized from. It is only used when the inference model is "single". + --infer_text) shift; infer_text=$1 ; shift ;; + # [Only for Inference] The inference text prompt. It is only used when the inference model is "single". + --infer_text_prompt) shift; infer_text_prompt=$1 ; shift ;; + # [Only for Inference] The inference audio prompt. It is only used when the inference model is "single". + --infer_audio_prompt) shift; infer_audio_prompt=$1 ; shift ;; + + # [Only for Training] Resume configuration + --resume) shift; resume=$1 ; shift ;; + # [Only for Training] The specific checkpoint path that you want to resume from. + --resume_from_ckpt_path) shift; resume_from_ckpt_path=$1 ; shift ;; + # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights. + --resume_type) shift; resume_type=$1 ; shift ;; + + --) shift ; break ;; + *) echo "Invalid option: $1" exit 1 ;; + esac +done + + +### Value check ### +if [ -z "$running_stage" ]; then + echo "[Error] Please specify the running stage" + exit 1 +fi + +if [ -z "$exp_config" ]; then + exp_config="${exp_dir}"/exp_config.json +fi +echo "Exprimental Configuration File: $exp_config" + +if [ -z "$gpu" ]; then + gpu="0" +fi + +######## Features Extraction ########### +if [ $running_stage -eq 1 ]; then + CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/tts/preprocess.py \ + --config=$exp_config \ + --num_workers=4 +fi + +######## Training ########### +if [ $running_stage -eq 2 ]; then + if [ -z "$exp_name" ]; then + echo "[Error] Please specify the experiments name" + exit 1 + fi + + if [ "$model_train_stage" = "2" ] && [ -z "$ar_model_ckpt_dir" ]; then + echo "[Error] Please specify the ckeckpoint path to the trained model in stage1." + exit 1 + fi + + if [ "$model_train_stage" = "1" ]; then + ar_model_ckpt_dir=None + fi + + echo "Exprimental Name: $exp_name" + + # Add default value + if [ -z "$resume_from_ckpt_path" ]; then + resume_from_ckpt_path="" + fi + + if [ -z "$resume_type" ]; then + resume_type="resume" + fi + + + if [ "$resume" = true ]; then + echo "Resume from the existing experiment..." + CUDA_VISIBLE_DEVICES=$gpu accelerate launch --main_process_port 29510 \ + "${work_dir}"/bins/tts/train.py \ + --config $exp_config \ + --exp_name $exp_name \ + --log_level debug \ + --train_stage $model_train_stage \ + --ar_model_ckpt_dir $ar_model_ckpt_dir \ + --resume \ + --checkpoint_path "$resume_from_ckpt_path" \ + --resume_type "$resume_type" + else + echo "Start a new experiment..." 
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch --main_process_port 29510 \ + "${work_dir}"/bins/tts/train.py \ + --config $exp_config \ + --exp_name $exp_name \ + --log_level debug \ + --train_stage $model_train_stage \ + --ar_model_ckpt_dir $ar_model_ckpt_dir + fi +fi + + +######## Inference ########### +if [ $running_stage -eq 3 ]; then + if [ -z "$infer_expt_dir" ]; then + echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]" + exit 1 + fi + + if [ -z "$infer_output_dir" ]; then + infer_output_dir="$expt_dir/result" + fi + + if [ -z "$infer_mode" ]; then + echo "[Error] Please specify the inference mode, e.g., "batch", "single"" + exit 1 + fi + + if [ "$infer_mode" = "batch" ] && [ -z "$infer_test_list_file" ]; then + echo "[Error] Please specify the test list file used in inference when the inference mode is batch" + exit 1 + fi + + if [ "$infer_mode" = "single" ] && [ -z "$infer_text" ]; then + echo "[Error] Please specify the text to be synthesized when the inference mode is single" + exit 1 + fi + + if [ "$infer_mode" = "single" ]; then + echo 'Text: ' ${infer_text} + infer_test_list_file=None + elif [ "$infer_mode" = "batch" ]; then + infer_text="" + infer_text_prompt="" + infer_audio_prompt="" + fi + + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/tts/inference.py \ + --config $exp_config \ + --log_level debug \ + --acoustics_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --mode $infer_mode \ + --text "$infer_text" \ + --text_prompt "$infer_text_prompt" \ + --audio_prompt $infer_audio_prompt\ + --test_list_file $infer_test_list_file \ + +fi diff --git a/egs/tts/VALLE_V2/README.md b/egs/tts/VALLE_V2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7ee26b0937591b14031c5cf843d29932ab558394 --- /dev/null +++ b/egs/tts/VALLE_V2/README.md @@ -0,0 +1,150 @@ +# VALL-E +## Introduction +This is an unofficial PyTorch implementation of VALL-E, a zero-shot voice cloning model via neural codec language modeling ([paper link](https://arxiv.org/abs/2301.02111)). +If trained properly, this model could match the performance specified in the original paper. + +## Change notes +This is a refined version compared to the first version of VALL-E in Amphion, we have changed the underlying implementation to Llama +to provide better model performance, faster training speed, and more readable codes. +This can be a great tool if you want to learn speech language models and its implementation. + +## Installation requirement + +Set up your environemnt as in Amphion README (you'll need a conda environment, and we recommend using Linux). A GPU is recommended if you want to train this model yourself. +For inferencing our pretrained models, you could generate samples even without a GPU. +To ensure your transformers library can run the code, we recommend additionally running: +```bash +pip install -U transformers==4.41.2 +``` + +## Inferencing pretrained VALL-E models +### Download pretrained weights +You need to download our pretrained weights from huggingface. 
+
+Script to download the AR and NAR model checkpoints:
+```bash
+huggingface-cli download amphion/valle valle_ar_mls_196000.bin valle_nar_mls_164000.bin --local-dir ckpts
+```
+Script to download the codec model (SpeechTokenizer) checkpoint:
+```bash
+mkdir -p ckpts/speechtokenizer_hubert_avg && huggingface-cli download amphion/valle SpeechTokenizer.pt config.json --local-dir ckpts/speechtokenizer_hubert_avg
+```
+
+If you cannot access Hugging Face, consider using the Hugging Face mirror to download:
+```bash
+HF_ENDPOINT=https://hf-mirror.com huggingface-cli download amphion/valle valle_ar_mls_196000.bin valle_nar_mls_164000.bin --local-dir ckpts
+```
+```bash
+mkdir -p ckpts/speechtokenizer_hubert_avg && HF_ENDPOINT=https://hf-mirror.com huggingface-cli download amphion/valle SpeechTokenizer.pt config.json --local-dir ckpts/speechtokenizer_hubert_avg
+```
+
+
+### Inference in IPython notebook
+
+We provide a pretrained VALL-E model trained on the 45k-hour MLS dataset, which contains 10-20s English speech clips.
+The `demo.ipynb` file provides a working example of running inference with our pretrained VALL-E model. Give it a try!
+
+## Examining the model files
+Examining the model files of VALL-E is a great way to learn how it works.
+We provide examples that allow you to overfit a single batch (so no dataset downloading is required).
+
+The AR model is essentially a causal language model that "continues" a speech prompt. The NAR model is a modification of the AR model that allows bidirectional attention.
+
+
+The files `valle_ar.py` and `valle_nar.py` in the `models/tts/valle_v2` folder are the model files; they can be run directly via `python -m models.tts.valle_v2.valle_ar` (or `python -m models.tts.valle_v2.valle_nar`).
+Running them invokes a test that overfits the model to a single example.
+
+## Training VALL-E from scratch
+### Preparing LibriTTS or LibriTTS-R dataset files
+
+We have tested our training script on LibriTTS and LibriTTS-R.
+You can download LibriTTS-R at [this link](https://www.openslr.org/141/) and LibriTTS at [this link](https://www.openslr.org/60).
+The "train-clean-360" split is currently used by our configuration.
+You can test the dataset code by running `python -m models.tts.valle_v2.libritts_dataset`.
+
+For your reference, our unzipped dataset files have a file structure like this:
+```
+/path/to/LibriTTS_R
+├── BOOKS.txt
+├── CHAPTERS.txt
+├── dev-clean
+│   ├── 2412
+│   │   ├── 153947
+│   │   │   ├── 2412_153947_000014_000000.normalized.txt
+│   │   │   ├── 2412_153947_000014_000000.original.txt
+│   │   │   ├── 2412_153947_000014_000000.wav
+│   │   │   ├── 2412_153947_000017_000001.normalized.txt
+│   │   │   ├── 2412_153947_000017_000001.original.txt
+│   │   │   ├── 2412_153947_000017_000001.wav
+│   │   │   ├── 2412_153947_000017_000005.normalized.txt
+├── train-clean-360
+│   ├── 422
+│   │   └── 122949
+│   │       ├── 422_122949_000009_000007.normalized.txt
+│   │       ├── 422_122949_000009_000007.original.txt
+│   │       ├── 422_122949_000009_000007.wav
+│   │       ├── 422_122949_000013_000010.normalized.txt
+│   │       ├── 422_122949_000013_000010.original.txt
+│   │       ├── 422_122949_000013_000010.wav
+│   │       ├── 422_122949.book.tsv
+│   │       └── 422_122949.trans.tsv
+```
+
+
+Alternatively, you can write your own dataloader for your dataset.
+You can reference the `__getitem__` method in `models/tts/VALLE_V2/mls_dataset.py`.
+It should return a dict containing a 1-dimensional tensor 'speech', which is the 16 kHz speech waveform, and a 1-dimensional tensor 'phone', which is the phoneme sequence of the speech.
+As long as your dataset returns this from `__getitem__`, it should work.
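+
+To make this interface concrete, here is a minimal sketch of such a custom dataset. It is an illustration only: the class name and the `metadata` list of `(wav_path, transcript)` pairs are hypothetical placeholders, while the audio loading and phoneme extraction mirror the usage shown in `demo.ipynb`.
+
+```python
+import librosa
+import torch
+from torch.utils.data import Dataset
+
+from models.tts.valle_v2.g2p_processor import G2pProcessor
+
+
+class MyValleDataset(Dataset):
+    """Hypothetical custom dataset returning 'speech' and 'phone' tensors."""
+
+    def __init__(self, metadata):
+        # `metadata` is a hypothetical list of (wav_path, transcript) pairs.
+        self.metadata = metadata
+        self.g2p = G2pProcessor()
+
+    def __len__(self):
+        return len(self.metadata)
+
+    def __getitem__(self, idx):
+        wav_path, transcript = self.metadata[idx]
+        # 1-D float tensor of 16 kHz speech
+        speech, _ = librosa.load(wav_path, sr=16000)
+        speech = torch.tensor(speech, dtype=torch.float32)
+        # 1-D long tensor of phoneme ids
+        phone = torch.tensor(self.g2p(transcript, "en")[1], dtype=torch.long)
+        return {"speech": speech, "phone": phone}
+```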
+
+### Changing batch size and dataset path in configuration file
+Our configuration file for training the VALL-E AR model is at `egs/tts/VALLE_V2/exp_ar_libritts.json`, and the one for the NAR model is at `egs/tts/VALLE_V2/exp_nar_libritts.json`.
+
+To train your model, you need to modify the `dataset` entry in the JSON configurations.
+Currently it is at line 40; you should set "data_dir" to your dataset's root directory.
+```
+    "dataset": {
+      "dataset_list":["train-clean-360"], // You can also change to other splits like "dev-clean"
+      "data_dir": "/path/to/your/LibriTTS_R",
+    },
+```
+
+You should also select a reasonable batch size at the "batch_size" entry (currently it is set to 5).
+
+
+You can change other experiment settings in `egs/tts/VALLE_V2/exp_ar_libritts.json`, such as the learning rate, the optimizer, and the dataset.
+
+### Run the command to Train AR model
+(Make sure your current directory is the Amphion root directory.)
+Run:
+```sh
+sh egs/tts/VALLE_V2/train_ar_libritts.sh
+```
+Your initial model checkpoints can be found at a path such as `ckpt/VALLE_V2/ar_libritts/checkpoint/epoch-0000_step-0000000_loss-7.397293/pytorch_model.bin`.
+
+
+### Resume from existing checkpoint
+Our framework supports resuming from an existing checkpoint.
+
+Run:
+```sh
+sh egs/tts/VALLE_V2/train_ar_libritts.sh --resume
+```
+
+### Finetuning based on our AR model
+We provide the optimizer and random_states checkpoints of our AR model to support finetuning (no need to download these files if you are only running inference with the pretrained model). First rename the downloaded files to "pytorch_model.bin", "optimizer.bin", and "random_states_0.pkl"; then you can resume from these checkpoints. [Link to AR optimizer checkpoint](https://huggingface.co/amphion/valle/blob/main/optimizer_valle_ar_mls_196000.bin) and [Link to random_states.pkl](https://huggingface.co/amphion/valle/blob/main/random_states_0.pkl).
+
+
+### Run the command to Train NAR model
+(Make sure your current directory is the Amphion root directory.)
+Run:
+```sh
+sh egs/tts/VALLE_V2/train_nar_libritts.sh
+```
+
+### Inference your models
+Since our inference script is already given, you can change the checkpoint paths
+from our pretrained model to your newly trained models and run inference (see the short sketch at the end of this README).
+
+## Future plans
+- [ ] Support more languages
+- [ ] More are coming...
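+
+For reference, the snippet below sketches how the inference entry point from `demo.ipynb` can be pointed at your own checkpoints. The bracketed paths are placeholders for wherever your trained AR/NAR checkpoints were saved; only the `ValleInference` call itself follows the demo notebook.
+
+```python
+from models.tts.valle_v2.valle_inference import ValleInference
+
+# Replace these placeholder paths with your own trained checkpoints.
+model = ValleInference(
+    ar_path="ckpt/VALLE_V2/ar_libritts/checkpoint/[YourARCheckpoint]/pytorch_model.bin",
+    nar_path="ckpt/VALLE_V2/nar_libritts/checkpoint/[YourNARCheckpoint]/pytorch_model.bin",
+    speechtokenizer_path="ckpts/speechtokenizer_hubert_avg",
+    device="cuda",  # or "cpu" if no GPU is available
+)
+```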
diff --git a/egs/tts/VALLE_V2/demo.ipynb b/egs/tts/VALLE_V2/demo.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..01b272914f67b8861753d06971c6fc95cab68a70 --- /dev/null +++ b/egs/tts/VALLE_V2/demo.ipynb @@ -0,0 +1,263 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.chdir('../../..')\n", + "print(os.getcwd()) # Ensure this is you Amphion root path, otherwise change the above path to you amphion root path\n", + "assert os.path.isfile('./README.md') # make sure the current path is Amphion root path\n", + "import sys\n", + "sys.path.append('.')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# put your cheackpoint file (.bin) in the root path of AmphionVALLEv2\n", + "# or use your own pretrained weights\n", + "ar_model_path = 'ckpts/valle_ar_mls_196000.bin' # huggingface-cli download amphion/valle valle_ar_mls_196000.bin valle_nar_mls_164000.bin --local-dir ckpts\n", + "nar_model_path = 'ckpts/valle_nar_mls_164000.bin'\n", + "speechtokenizer_path = 'ckpts/speechtokenizer_hubert_avg' # huggingface-cli download amphion/valle speechtokenizer_hubert_avg/SpeechTokenizer.pt speechtokenizer_hubert_avg/config.json --local-dir ckpts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "device = 'cpu' # change to 'cuda' if you have gpu" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from models.tts.valle_v2.valle_inference import ValleInference\n", + "# change to device='cuda' to use CUDA GPU for fast inference\n", + "# change \"use_vocos\" to True would give better sound quality\n", + "# If you meet problem with network, you could set \"use_vocos=False\", though would give bad quality\n", + "model = ValleInference(ar_path=ar_model_path, nar_path=nar_model_path, speechtokenizer_path=speechtokenizer_path, device=device)\n", + "# model = ValleInference(use_vocos=False, ar_path=ar_model_path, nar_path=nar_model_path, device='cuda')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# prepare inference data\n", + "import librosa\n", + "import torch\n", + "wav, _ = librosa.load('./egs/tts/VALLE_V2/example.wav', sr=16000)\n", + "wav = torch.tensor(wav, dtype=torch.float32)\n", + "from IPython.display import Audio\n", + "Audio(wav, rate = 16000)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# The transcript of the prompt part\n", + "prompt_transcript_text = 'and keeping eternity before the eyes'\n", + "\n", + "# Here are the words you want the model to output\n", + "target_transcript_text = 'It presents a unified framework that is inclusive of diverse generation tasks and models with the added bonus of being easily extendable for new applications'\n", + "from models.tts.valle_v2.g2p_processor import G2pProcessor\n", + "g2p = G2pProcessor()\n", + "prompt_transcript = g2p(prompt_transcript_text, 'en')[1]\n", + "target_transcript = g2p(target_transcript_text, 'en')[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "prompt_transcript = 
torch.tensor(prompt_transcript).long()\n", + "target_transcript = torch.tensor(target_transcript).long()\n", + "transcript = torch.cat([prompt_transcript, target_transcript], dim=-1)\n", + "batch = {\n", + " 'speech': wav.unsqueeze(0),\n", + " 'phone_ids': transcript.unsqueeze(0),\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'speech': tensor([[ 3.0518e-05, 3.0518e-05, 3.0518e-05, ..., -3.0518e-05,\n", + " -3.0518e-05, 3.0518e-05]]),\n", + " 'phone_ids': tensor([[ 5, 28, 149, 72, 219, 134, 127, 170, 115, 147, 219, 113, 185, 91,\n", + " 149, 30, 185, 123, 219, 65, 115, 106, 43, 172, 219, 73, 29, 219,\n", + " 59, 214, 6, 5, 116, 181, 219, 168, 173, 124, 218, 82, 149, 185,\n", + " 175, 219, 28, 219, 210, 200, 149, 30, 106, 64, 72, 219, 104, 173,\n", + " 100, 143, 209, 94, 135, 219, 73, 24, 181, 219, 116, 214, 219, 113,\n", + " 149, 136, 140, 200, 179, 115, 205, 219, 31, 205, 219, 71, 58, 206,\n", + " 91, 175, 219, 131, 85, 149, 88, 100, 178, 30, 145, 219, 180, 24,\n", + " 179, 136, 175, 219, 28, 149, 72, 219, 141, 15, 76, 30, 140, 214,\n", + " 219, 207, 118, 74, 219, 73, 29, 219, 22, 76, 30, 72, 219, 65,\n", + " 155, 149, 30, 175, 219, 31, 205, 219, 65, 127, 115, 147, 219, 125,\n", + " 218, 30, 140, 123, 219, 83, 136, 179, 185, 82, 149, 76, 30, 67,\n", + " 30, 139, 219, 104, 43, 172, 219, 144, 199, 219, 25, 170, 140, 30,\n", + " 136, 100, 178, 30, 149, 214, 6]])}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# print the contents of the model input\n", + "# `phone_ids` contains a concatenation of `prompt_transcript` and `target_transcript` \n", + "batch" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "configs = [dict(\n", + " top_p=0.9,\n", + " top_k=5,\n", + " temperature=0.95,\n", + " repeat_penalty=1.0,\n", + " max_length=2000,\n", + " num_beams=1,\n", + ")] # model inference hyperparameters\n", + "output_wav = model(batch, configs)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[[-1.2337e-06, -1.2981e-05, -4.0130e-05, ..., -4.1360e-05,\n", + " 1.1917e-05, -4.2949e-05]]])" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_wav # The output wav is a tensor of shape [1,1,T]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "prompt_transcript : and keeping eternity before the eyes\n", + "target_transcript : It presents a unified framework that is inclusive of diverse generation tasks and models with the added bonus of being easily extendable for new applications\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(f'prompt_transcript : {prompt_transcript_text}')\n", + "print(f'target_transcript : {target_transcript_text}')\n", + "Audio(output_wav.squeeze(0), rate = 16000)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "import torchaudio\n", + "torchaudio.save('out.wav', output_wav.squeeze(0), 16000)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "amphion", + 
"language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/egs/tts/VALLE_V2/example.wav b/egs/tts/VALLE_V2/example.wav new file mode 100644 index 0000000000000000000000000000000000000000..0484674dd6ec2b377480dd29c29d86206d0c61f6 Binary files /dev/null and b/egs/tts/VALLE_V2/example.wav differ diff --git a/egs/tts/VALLE_V2/exp_ar_libritts.json b/egs/tts/VALLE_V2/exp_ar_libritts.json new file mode 100644 index 0000000000000000000000000000000000000000..81bd8a878723572388fa93bd6e608748183390ab --- /dev/null +++ b/egs/tts/VALLE_V2/exp_ar_libritts.json @@ -0,0 +1,55 @@ +{ + "model_type": "VALLE_V2_AR", + "log_dir": "./ckpt/VALLE_V2", + "use_speechtokenizer": true, + "train": { + "gradient_accumulation_step": 1, + "find_unused_parameters": false, + "tracker": ["tensorboard"], + "max_epoch": 1000, + "save_checkpoint_stride": [500], + "keep_last": [1], + "run_eval": [true], + "dataloader": { + "num_worker": 4, + "pin_memory": true, + "persistent_workers": true + }, + "dataset": { + "use_dynamic_batchsize": false, + "name": "libritts" + }, + "optimizer": "adamW", + "adamw": { + "lr": 1e-4 + }, + "scheduler": { + "warmup_steps": 25000, + "total_steps": 800000, + "min_lr": 1e-5 + }, + "exponentiallr": { + "gamma": 0.999999 + }, + "batch_size": 5, + "max_tokens": 5000, + "max_sentences": 64, + "random_seed": 0 + }, + "dataset": { + "dataset_list":["train-clean-360"], + "data_dir": "/path/to/your/libritts" // You can also change to other splits like "dev-clean" + }, + "model": { + "phone_vocab_size": 300, + "target_vocab_size": 1024, + "pad_token_id": 1324, + "bos_target_id": 1325, + "eos_target_id": 1326, + "bos_phone_id": 1327, + "eos_phone_id": 1328, + "bos_prompt_id": 1329, + "eos_prompt_id": 1330, + "num_hidden_layers": 16 + } + } diff --git a/egs/tts/VALLE_V2/exp_nar_libritts.json b/egs/tts/VALLE_V2/exp_nar_libritts.json new file mode 100644 index 0000000000000000000000000000000000000000..bba7beb79f886e1ac7d61fc66fea833c9155505f --- /dev/null +++ b/egs/tts/VALLE_V2/exp_nar_libritts.json @@ -0,0 +1,55 @@ +{ + "model_type": "VALLE_V2_NAR", + "log_dir": "./ckpt/VALLE_V2", + "use_speechtokenizer": true, + "train": { + "gradient_accumulation_step": 1, + "find_unused_parameters": true, + "tracker": ["tensorboard"], + "max_epoch": 1000, + "save_checkpoint_stride": [500], + "keep_last": [1], + "run_eval": [true], + "dataloader": { + "num_worker": 4, + "pin_memory": true, + "persistent_workers": true + }, + "dataset": { + "use_dynamic_batchsize": false, + "name": "libritts" + }, + "optimizer": "adamw", + "adamw": { + "lr": 1e-4 + }, + "scheduler": { + "warmup_steps": 25000, + "total_steps": 800000, + "min_lr": 1e-5 + }, + "exponentiallr": { + "gamma": 0.999999 + }, + "batch_size": 5, + "max_tokens": 10500, + "max_sentences": 64, + "random_seed": 0 + }, + "dataset": { + "dataset_list":["train-clean-360"], + "data_dir": "/path/to/your/libritts" // You can also change to other splits like "dev-clean" + }, + "model": { + "phone_vocab_size": 300, + "target_vocab_size": 1024, + "pad_token_id": 1324, + "bos_target_id": 1325, + "eos_target_id": 1326, + "bos_phone_id": 1327, + "eos_phone_id": 1328, + "bos_prompt_id": 1329, + "eos_prompt_id": 1330, + "num_hidden_layers": 16 + } + } diff --git 
a/egs/tts/VALLE_V2/train_ar_libritts.sh b/egs/tts/VALLE_V2/train_ar_libritts.sh new file mode 100644 index 0000000000000000000000000000000000000000..2166551e253ef0665106d77c585a4c41338fcfd9 --- /dev/null +++ b/egs/tts/VALLE_V2/train_ar_libritts.sh @@ -0,0 +1,27 @@ +export PYTHONPATH="./" + +######## Build Experiment Environment ########### +exp_dir="./egs/tts/VALLE_V2" +echo exp_dir: $exp_dir +work_dir="./" # Amphion root folder +echo work_dir: $work_dir + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Set Config File Dir ############## +if [ -z "$exp_config" ]; then + exp_config="${exp_dir}"/exp_ar_libritts.json +fi +echo "Exprimental Configuration File: $exp_config" + +######## Set the experiment name ########## +exp_name="ar_libritts" + +port=53333 # a random number for port + +######## Train Model ########### +echo "Experiment Name: $exp_name" +accelerate launch --main_process_port $port "${work_dir}"/bins/tts/train.py --config $exp_config \ +--exp_name $exp_name --log_level debug $1 diff --git a/egs/tts/VALLE_V2/train_nar_libritts.sh b/egs/tts/VALLE_V2/train_nar_libritts.sh new file mode 100644 index 0000000000000000000000000000000000000000..d753854cce7e4dcef8097381fe333e7b70dc3e67 --- /dev/null +++ b/egs/tts/VALLE_V2/train_nar_libritts.sh @@ -0,0 +1,27 @@ +export PYTHONPATH="./" + +######## Build Experiment Environment ########### +exp_dir="./egs/tts/VALLE_V2" +echo exp_dir: $exp_dir +work_dir="./" # Amphion root folder +echo work_dir: $work_dir + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Set Config File Dir ############## +if [ -z "$exp_config" ]; then + exp_config="${exp_dir}"/exp_nar_libritts.json +fi +echo "Exprimental Configuration File: $exp_config" + +######## Set the experiment name ########## +exp_name="nar_libritts" + +port=17004 # a random number for port + +######## Train Model ########### +echo "Experimental Name: $exp_name" +accelerate launch --main_process_port $port "${work_dir}"/bins/tts/train.py --config $exp_config \ +--exp_name $exp_name --log_level debug $1 diff --git a/egs/tts/VITS/README.md b/egs/tts/VITS/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5fed54928aca12fcc6a41295f1329a6c63275575 --- /dev/null +++ b/egs/tts/VITS/README.md @@ -0,0 +1,221 @@ +# VITS Recipe + +[![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Spaces-yellow)](https://huggingface.co/spaces/amphion/Text-to-Speech) +[![openxlab](https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg)](https://openxlab.org.cn/apps/detail/Amphion/Text-to-Speech) + +In this recipe, we will show how to train VITS using Amphion's infrastructure. [VITS](https://arxiv.org/abs/2106.06103) is an end-to-end TTS architecture that utilizes a conditional variational autoencoder with adversarial learning. + +There are four stages in total: + +1. Data preparation +2. Features extraction +3. Training +4. Inference + +> **NOTE:** You need to run every command of this recipe in the `Amphion` root path: +> ```bash +> cd Amphion +> ``` + +## 1. Data Preparation + +### Dataset Download +You can use the commonly used TTS dataset to train the TTS model, e.g., LJSpeech, VCTK, Hi-Fi TTS, LibriTTS, etc. We strongly recommend using LJSpeech to train the single-speaker TTS model for the first time. While training the multi-speaker TTS model for the first time, we recommend using Hi-Fi TTS. 
The process of downloading the dataset has been detailed [here](../../datasets/README.md). + +### Configuration + +After downloading the dataset, you can set the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets. + +```json + "dataset": [ + "LJSpeech", + //"hifitts" + ], + "dataset_path": { + // TODO: Fill in your dataset path + "LJSpeech": "[LJSpeech dataset path]", + //"hifitts": "[Hi-Fi TTS dataset path] + }, +``` + +## 2. Features Extraction + +### Configuration + +In `exp_config.json`, specify the `log_dir` for saving the checkpoints and logs, and specify the `processed_dir` for saving processed data. For preprocessing the multi-speaker TTS dataset, set `extract_audio` and `use_spkid` to `true`: + +```json + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts" + "log_dir": "ckpts/tts", + "preprocess": { + //"extract_audio": true, + "use_phone": true, + // linguistic features + "extract_phone": true, + "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)" + // TODO: Fill in the output data path. The default value is "Amphion/data" + "processed_dir": "data", + "sample_rate": 22050, //target sampling rate + "valid_file": "valid.json", //validation set + //"use_spkid": true, //use speaker ID to train multi-speaker TTS model + }, +``` + +### Run + +Run the `run.sh` as the preprocess stage (set `--stage 1`): + +```bash +sh egs/tts/VITS/run.sh --stage 1 +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "1"`. + +## 3. Training + +### Configuration + +We provide the default hyperparameters in the `exp_config.json`. They can work on a single NVIDIA-24g GPU. You can adjust them based on your GPU machines. +For training the multi-speaker TTS model, specify the `n_speakers` value to be greater (used for new speaker fine-tuning) than or equal to the number of speakers in your dataset(s) and set `multi_speaker_training` to `true`. + +```json + "model": { + //"n_speakers": 10 //Number of speakers in the dataset(s) used. The default value is 0 if not specified. + }, + "train": { + "batch_size": 16, + //"multi_speaker_training": true, + } +``` + +### Train From Scratch + +Run the `run.sh` as the training stage (set `--stage 2`). Specify an experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/tts/[YourExptName]`. + +```bash +sh egs/tts/VITS/run.sh --stage 2 --name [YourExptName] +``` + +### Train From Existing Source + +We support training from existing sources for various purposes. You can resume training the model from a checkpoint or fine-tune a model from another checkpoint. + +By setting `--resume true`, the training will resume from the **latest checkpoint** from the current `[YourExptName]` by default. For example, if you want to resume training from the latest checkpoint in `Amphion/ckpts/tts/[YourExptName]/checkpoint`, run: + +```bash +sh egs/tts/VITS/run.sh --stage 2 --name [YourExptName] \ + --resume true +``` + +You can also choose a **specific checkpoint** for retraining by `--resume_from_ckpt_path` argument. 
For example, if you want to resume training from the checkpoint `Amphion/ckpts/tts/[YourExptName]/checkpoint/[SpecificCheckpoint]`, run:
+
+```bash
+sh egs/tts/VITS/run.sh --stage 2 --name [YourExptName] \
+    --resume true \
+    --resume_from_ckpt_path "Amphion/ckpts/tts/[YourExptName]/checkpoint/[SpecificCheckpoint]"
+```
+
+If you want to **fine-tune from another checkpoint**, just use `--resume_type` and set it to `"finetune"`. For example, if you want to fine-tune the model from the checkpoint `Amphion/ckpts/tts/[AnotherExperiment]/checkpoint/[SpecificCheckpoint]`, run:
+
+
+```bash
+sh egs/tts/VITS/run.sh --stage 2 --name [YourExptName] \
+    --resume true \
+    --resume_from_ckpt_path "Amphion/ckpts/tts/[AnotherExperiment]/checkpoint/[SpecificCheckpoint]" \
+    --resume_type "finetune"
+```
+
+> **NOTE:** The `--resume_type` is set to `"resume"` by default, so it is not necessary to specify it when resuming training.
+>
+> The difference between `"resume"` and `"finetune"` is that `"finetune"` will **only** load the pretrained model weights from the checkpoint, while `"resume"` will load all the training states (including optimizer, scheduler, etc.) from the checkpoint.
+
+Here are some example scenarios to better understand how to use these arguments:
+| Scenario | `--resume` | `--resume_from_ckpt_path` | `--resume_type` |
+| ------ | -------- | ----------------------- | ------------- |
+| You want to train from scratch | no | no | no |
+| The machine breaks down during training and you want to resume training from the latest checkpoint | `true` | no | no |
+| You find the latest model is overfitting and you want to re-train from an earlier checkpoint | `true` | `SpecificCheckpoint Path` | no |
+| You want to fine-tune a model from another checkpoint | `true` | `SpecificCheckpoint Path` | `"finetune"` |
+
+
+> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.
+
+
+## 4. Inference
+
+### Pre-trained Model Download
+
+We have released a pre-trained Amphion VITS model trained on LJSpeech, so you can download the pre-trained model [here](https://huggingface.co/amphion/vits-ljspeech) and generate speech according to the following inference instructions.
+
+
+### Configuration
+
+For inference, you need to specify the following configurations when running `run.sh`:
+
+
+| Parameters | Description | Example |
+| --------------------- | ------------------------------------------------ | ----------------------------------------------------------------- |
+| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/tts/[YourExptName]` |
+| `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/tts/[YourExptName]/result` |
+| `--infer_mode` | The inference mode, e.g., "`single`", "`batch`". | "`single`" to generate a clip of speech, "`batch`" to generate a batch of speech at a time. |
+| `--infer_dataset` | The dataset used for inference. | For LJSpeech dataset, the inference dataset would be `LJSpeech`.
For Hi-Fi TTS dataset, the inference dataset would be `hifitts`. | +| `--infer_testing_set` | The subset of the inference dataset used for inference, e.g., train, test, golden_test | For LJSpeech dataset, the testing set would be  "`test`" split from LJSpeech at the feature extraction, or "`golden_test`" cherry-picked from the test set as template testing set.
For Hi-Fi TTS dataset, the testing set would be "`test`" split from Hi-Fi TTS during the feature extraction process. | +| `--infer_text` | The text to be synthesized. | "`This is a clip of generated speech with the given text from a TTS model.`" | +| `--infer_speaker_name` | The target speaker's voice is to be synthesized.
(***Note: only applicable to multi-speaker TTS model***) | For Hi-Fi TTS dataset, the list of available speakers includes: "`hifitts_11614`", "`hifitts_11697`", "`hifitts_12787`", "`hifitts_6097`", "`hifitts_6670`", "`hifitts_6671`", "`hifitts_8051`", "`hifitts_9017`", "`hifitts_9136`", "`hifitts_92`".
You may find the list of available speakers from `spk2id.json` file generated in ```log_dir/[YourExptName]``` that you have specified in `exp_config.json`. | + +### Run +#### Single text inference: +For the single-speaker TTS model, if you want to generate a single clip of speech from a given text, just run: + +```bash +sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \ + --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \ + --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \ + --infer_mode "single" \ + --infer_text "This is a clip of generated speech with the given text from a TTS model." +``` + +For the multi-speaker TTS model, in addition to the above-mentioned arguments, you need to add ```infer_speaker_name``` argument, and run: +```bash +sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \ + --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \ + --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \ + --infer_mode "single" \ + --infer_text "This is a clip of generated speech with the given text from a TTS model." \ + --infer_speaker_name "hifitts_92" +``` + +#### Batch inference: +For the single-speaker TTS model, if you want to generate speech of all testing sets split from LJSpeech, just run: + +```bash +sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \ + --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \ + --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \ + --infer_mode "batch" \ + --infer_dataset "LJSpeech" \ + --infer_testing_set "test" +``` +For the multi-speaker TTS model, if you want to generate speech of all testing sets split from Hi-Fi TTS, the same procedure follows from above, with ```LJSpeech``` replaced by ```hifitts```. +```bash +sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \ + --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \ + --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \ + --infer_mode "batch" \ + --infer_dataset "hifitts" \ + --infer_testing_set "test" +``` + + +We released a pre-trained Amphion VITS model trained on LJSpeech. So, you can download the pre-trained model [here](https://huggingface.co/amphion/vits-ljspeech) and generate speech following the above inference instructions. Meanwhile, the pre-trained multi-speaker VITS model trained on Hi-Fi TTS will be released soon. Stay tuned. + + +```bibtex +@inproceedings{kim2021conditional, + title={Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech}, + author={Kim, Jaehyeon and Kong, Jungil and Son, Juhee}, + booktitle={International Conference on Machine Learning}, + pages={5530--5540}, + year={2021}, +} +``` diff --git a/egs/tts/VITS/exp_config.json b/egs/tts/VITS/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3a2332f23abb4ba5bf16523237e9124d397da62e --- /dev/null +++ b/egs/tts/VITS/exp_config.json @@ -0,0 +1,34 @@ +{ + "base_config": "config/vits.json", + "model_type": "VITS", + "dataset": [ + "LJSpeech", + //"hifitts" + ], + "dataset_path": { + // TODO: Fill in your dataset path + "LJSpeech": "[LJSpeech dataset path]", + //"hifitts": "[Hi-Fi TTS dataset path] + }, + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts" + "log_dir": "ckpts/tts", + "preprocess": { + //"extract_audio":true, + "use_phone": true, + // linguistic features + "extract_phone": true, + "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)" + // TODO: Fill in the output data path. 
The default value is "Amphion/data" + "processed_dir": "data", + "sample_rate": 22050, // target sampling rate + "valid_file": "valid.json", // validation set + //"use_spkid": true // use speaker ID to train multi-speaker TTS model + }, + "model":{ + //"n_speakers": 10 // number of speakers, greater than or equal to the number of speakers in the dataset(s) used. The default value is 0 if not specified. + }, + "train": { + "batch_size": 16, + //"multi_speaker_training": true + } +} diff --git a/egs/tts/VITS/run.sh b/egs/tts/VITS/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..dd702795f909e923d6b8dff902e4853c7e8c5f8d --- /dev/null +++ b/egs/tts/VITS/run.sh @@ -0,0 +1,179 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $exp_dir))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +cd $work_dir/modules/monotonic_align +mkdir -p monotonic_align +python setup.py build_ext --inplace +cd $work_dir + +######## Parse the Given Parameters from the Commond ########### +# options=$(getopt -o c:n:s --long gpu:,config:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir:,name:,stage: -- "$@") +options=$(getopt -o c:n:s --long gpu:,config:,resume:,resume_from_ckpt_path:,resume_type:,infer_expt_dir:,infer_output_dir:,infer_mode:,infer_dataset:,infer_testing_set:,infer_text:,infer_speaker_name:,name:,stage: -- "$@") +eval set -- "$options" + +while true; do + case $1 in + # Experimental Configuration File + -c | --config) shift; exp_config=$1 ; shift ;; + # Experimental Name + -n | --name) shift; exp_name=$1 ; shift ;; + # Running Stage + -s | --stage) shift; running_stage=$1 ; shift ;; + # Visible GPU machines. The default value is "0". + --gpu) shift; gpu=$1 ; shift ;; + + # [Only for Training] Resume configuration + --resume) shift; resume=$1 ; shift ;; + # [Only for Training] The specific checkpoint path that you want to resume from. + --resume_from_ckpt_path) shift; resume_from_ckpt_path=$1 ; shift ;; + # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights. + --resume_type) shift; resume_type=$1 ; shift ;; + + # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]" + --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;; + # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result" + --infer_output_dir) shift; infer_output_dir=$1 ; shift ;; + # [Only for Inference] The inference mode. It can be "batch" to generate speech by batch, or "single" to generate a single clip of speech. + --infer_mode) shift; infer_mode=$1 ; shift ;; + # [Only for Inference] The inference dataset. It is only used when the inference mode is "batch". + --infer_dataset) shift; infer_dataset=$1 ; shift ;; + # [Only for Inference] The inference testing set. It is only used when the inference mode is "batch". It can be "test" set split from the dataset, or "golden_test" carefully selected from the testing set. + --infer_testing_set) shift; infer_testing_set=$1 ; shift ;; + # [Only for Inference] The text to be synthesized from. 
It is only used when the inference mode is "single". + --infer_text) shift; infer_text=$1 ; shift ;; + # [Only for Inference] The chosen speaker's voice to be synthesized. It is only used when the inference mode is "single" for multi-speaker VITS. + --infer_speaker_name) shift; infer_speaker_name=$1 ; shift ;; + + --) shift ; break ;; + *) echo "Invalid option: $1" exit 1 ;; + esac +done + + +### Value check ### +if [ -z "$running_stage" ]; then + echo "[Error] Please specify the running stage" + exit 1 +fi + +if [ -z "$exp_config" ]; then + exp_config="${exp_dir}"/exp_config.json +fi +echo "Experimental Configuration File: $exp_config" + +if [ -z "$gpu" ]; then + gpu="0" +fi + +######## Features Extraction ########### +if [ $running_stage -eq 1 ]; then + CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/tts/preprocess.py \ + --config=$exp_config \ + --num_workers=4 +fi + +######## Training ########### +if [ $running_stage -eq 2 ]; then + if [ -z "$exp_name" ]; then + echo "[Error] Please specify the experiments name" + exit 1 + fi + echo "Experimental Name: $exp_name" + + # add default value + if [ -z "$resume_from_ckpt_path" ]; then + resume_from_ckpt_path="" + fi + + if [ -z "$resume_type" ]; then + resume_type="resume" + fi + + if [ "$resume" = true ]; then + echo "Resume from the existing experiment..." + CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/tts/train.py \ + --config "$exp_config" \ + --exp_name "$exp_name" \ + --log_level info \ + --resume \ + --checkpoint_path "$resume_from_ckpt_path" \ + --resume_type "$resume_type" + else + echo "Start a new experiment..." + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/tts/train.py \ + --config $exp_config \ + --exp_name $exp_name \ + --log_level debug + fi +fi + +######## Inference ########### +if [ $running_stage -eq 3 ]; then + if [ -z "$infer_expt_dir" ]; then + echo "[Error] Please specify the experimental directionary. 
The value is like [Your path to save logs and checkpoints]/[YourExptName]" + exit 1 + fi + + if [ -z "$infer_output_dir" ]; then + infer_output_dir="$expt_dir/result" + fi + + if [ -z "$infer_mode" ]; then + echo "[Error] Please specify the inference mode, e.g., "batch", "single"" + exit 1 + fi + + if [ "$infer_mode" = "batch" ] && [ -z "$infer_dataset" ]; then + echo "[Error] Please specify the dataset used in inference when the inference mode is batch" + exit 1 + fi + + if [ "$infer_mode" = "batch" ] && [ -z "$infer_testing_set" ]; then + echo "[Error] Please specify the testing set used in inference when the inference mode is batch" + exit 1 + fi + + if [ "$infer_mode" = "single" ] && [ -z "$infer_text" ]; then + echo "[Error] Please specify the text to be synthesized when the inference mode is single" + exit 1 + fi + + if [ "$infer_mode" = "single" ]; then + echo 'Text: ' ${infer_text} + infer_dataset=None + infer_testing_set=None + elif [ "$infer_mode" = "batch" ]; then + infer_text='' + fi + + if [ -z "$infer_speaker_name" ]; then + infer_speaker_name=None + fi + + + + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/tts/inference.py \ + --config $exp_config \ + --acoustics_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --mode $infer_mode \ + --dataset $infer_dataset \ + --testing_set $infer_testing_set \ + --text "$infer_text" \ + --speaker_name $infer_speaker_name \ + --log_level debug + + + +fi diff --git a/egs/visualization/README.md b/egs/visualization/README.md new file mode 100644 index 0000000000000000000000000000000000000000..08fc4b4b4fec31ae3ab9abdbf0d844b8138dfe9f --- /dev/null +++ b/egs/visualization/README.md @@ -0,0 +1,19 @@ +# Amphion Visualization Recipe + +## Quick Start + +We provides a **[beginner recipe](SingVisio/)** to demonstrate how to implement interactive visualization for classic audio, music and speech generative models. Specifically, it is also an official implementation of the paper "SingVisio: Visual Analytics of the Diffusion Model for Singing Voice Conversion", which can be accessed via [arXiv](https://arxiv.org/abs/2402.12660) or [Computers & Graphics](https://www.sciencedirect.com/science/article/pii/S0097849324001936). The **SingVisio** can be experienced [here](https://openxlab.org.cn/apps/detail/Amphion/SingVisio). + +## Supported Models + +As the unique feature of Amphion, visualization aims to introduce interactive visual analysis of some classical models for educational purposes, helping newcomers understand their inner workings. + +Until now, Amphion has supported the visualization tool for the following models: + +- **SVC**: + - **[MultipleContentsSVC](../svc/MultipleContentsSVC)**: A diffusion-based model for sining voice conversion +- **TTS**: + - **[FastSpeech 2](../tts/FastSpeech2/)** (👨‍💻 developing): A typical transformer-based TTS model. + - **[VITS](../tts/VITS/)** (👨‍💻 developing): A typical flow-based end-to-end TTS model. 
+ + diff --git a/egs/visualization/SingVisio/README.md b/egs/visualization/SingVisio/README.md new file mode 100644 index 0000000000000000000000000000000000000000..daeb77feb5a6b8a6026ba7d6d57fa2c0816a4e4e --- /dev/null +++ b/egs/visualization/SingVisio/README.md @@ -0,0 +1,90 @@ +# SingVisio: Visual Analytics of the Diffusion Model for Singing Voice Conversion + +[![arXiv](https://img.shields.io/badge/arXiv-Paper-COLOR.svg)](https://arxiv.org/abs/2402.12660) +[![openxlab](https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg)](https://openxlab.org.cn/apps/detail/Amphion/SingVisio) +[![Video](https://img.shields.io/badge/Video-Demo-orange)](https://drive.google.com/file/d/15097SGhQh-SwUNbdWDYNyWEP--YGLba5/view) + +
+ +
+
+This is the official implementation of the paper "SingVisio: Visual Analytics of the Diffusion Model for Singing Voice Conversion", which can be accessed via [arXiv](https://arxiv.org/abs/2402.12660) or [Computers & Graphics](https://www.sciencedirect.com/science/article/pii/S0097849324001936).
+
+The online **SingVisio** system can be experienced [here](https://openxlab.org.cn/apps/detail/Amphion/SingVisio).
+
+The **SingVisio** system comprises two main components: a web-based front-end user interface and a back-end generation model.
+
+- The web-based user interface was developed using [D3.js](https://d3js.org/), a JavaScript library designed for creating dynamic and interactive data visualizations. The code can be accessed [here](../../../visualization/SingVisio/webpage/).
+- The core generative model, [MultipleContentsSVC](https://arxiv.org/abs/2310.11160), is a diffusion-based model tailored for singing voice conversion (SVC). The code for this model is available in Amphion, with the recipe accessible [here](../../svc/MultipleContentsSVC/).
+
+## Development Workflow for Visualization Systems
+
+The process of developing a visualization system encompasses seven key steps:
+
+1. **Identify the Model for Visualization**: Begin by selecting the model you wish to visualize.
+
+2. **Task Analysis**: Analyze the specific tasks that the visualization system needs to support through discussions with experts, model builders, and potential users. That is, determine what you want to visualize, such as the classical denoising generation process in diffusion models.
+
+3. **Data and Feature Generation**: Produce the data and features necessary for visualization based on the selected model. Alternatively, you can also generate and visualize them in real time.
+
+4. **Design the User Interface**: Design and develop the user interface to effectively display the model structure, data, and features.
+
+5. **Iterative Refinement**: Iteratively refine the user interface design for a better visualization experience.
+
+6. **User Study Preparation**: Design questionnaires for a user study to evaluate the system in terms of system design, functionality, explainability, and user-friendliness.
+
+7. **Evaluation and Improvement**: Conduct comprehensive evaluations through a user study, case study, and expert study to evaluate, analyze, and improve the system.
+
+
+## Tasks Supported in SingVisio
+
+There are five tasks in the **SingVisio** system.
+- To investigate the evolution and quality of the converted SVC results from each step in the diffusion generation process, **SingVisio** supports the following two tasks:
+  - **T1: Step-wise Diffusion Generation Comparison:** Investigate the evolution and quality of results converted at each step of the diffusion process.
+  - **T2: Step-wise Metric Comparison:** Examine changes in metrics throughout the diffusion steps.
+
+- To explore how various factors (content, melody, singer timbre) influence the SVC results, **SingVisio** supports the following three tasks:
+  - **T3: Pair-wise SVC Comparison with Different Target Singers**
+  - **T4: Pair-wise SVC Comparison with Different Source Singers**
+  - **T5: Pair-wise SVC Comparison with Different Songs**
+
+## View Design in SingVisio
+
+The user interface of **SingVisio** comprises five views:
+- **A: Control Panel:** Enables users to adjust the display mode and select data for visual analysis.
+- **B: Step View:** Offers an overview of the diffusion generation process.
+- **C: Comparison View:** Facilitates easy comparison of conversion results under different conditions. +- **D: Projection View:** Assists in observing the diffusion steps' trajectory with or without conditions. +- **E: Metric View:** Displays objective metrics evaluated on the diffusion-based SVC model, allowing for interactive examination of metric trends across diffusion steps. + +## Detailed System Introduction of SingVisio + +For a detailed introduction to **SingVisio** and user instructions, please refer to [this document](../../../visualization/SingVisio/System_Introduction_of_SingVisio_V2.pdf). + +Additionally, explore the SingVisio demo to see the system's functionalities and usage in action. + +## User Study of SingVisio + +Participate in the [user study](https://www.wjx.cn/vm/wkIH372.aspx#) of **SingVisio** if you're interested. We encourage you to conduct the study after experiencing the **SingVisio** system. Your valuable feedback is greatly appreciated. + +## Citations 📖 + +Please cite the following papers if you use **SingVisio** in your research: + +```bibtex +@article{singvisio, + author={Xue, Liumeng and Wang, Chaoren and Wang, Mingxuan and Zhang, Xueyao and Han, Jun and Wu, Zhizheng}, + title={SingVisio: Visual Analytics of the Diffusion Model for Singing Voice Conversion}, + journal={Computers & Graphics}, + year={2024} +} +``` + +```bibtex +@inproceedings{amphion, + author={Zhang, Xueyao and Xue, Liumeng and Gu, Yicheng and Wang, Yuancheng and Li, Jiaqi and He, Haorui and Wang, Chaoren and Song, Ting and Chen, Xi and Fang, Zihao and Chen, Haopeng and Zhang, Junan and Tang, Tze Ying and Zou, Lexiao and Wang, Mingxuan and Han, Jun and Chen, Kai and Li, Haizhou and Wu, Zhizheng}, + title={Amphion: An Open-Source Audio, Music and Speech Generation Toolkit}, + booktitle={{IEEE} Spoken Language Technology Workshop, {SLT} 2024}, + year={2024} +} +``` diff --git a/egs/vocoder/README.md b/egs/vocoder/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9d247d2f1341c4eba405c50d6fe504cc47dbc2a6 --- /dev/null +++ b/egs/vocoder/README.md @@ -0,0 +1,23 @@ +# Amphion Vocoder Recipe + +## Quick Start + +We provide a [**beginner recipe**](gan/tfr_enhanced_hifigan/README.md) to demonstrate how to train a high quality HiFi-GAN speech vocoder. Specially, it is also an official implementation of our paper "[Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder](https://arxiv.org/abs/2311.14957)". Some demos can be seen [here](https://vocodexelysium.github.io/MS-SB-CQTD/). + +## Supported Models + +Neural vocoder generates audible waveforms from acoustic representations, which is one of the key parts for current audio generation systems. 
Until now, Amphion has supported various widely-used vocoders according to different vocoder types, including: + +- **GAN-based vocoders**, which we have provided [**a unified recipe**](gan/README.md) : + - [MelGAN](https://arxiv.org/abs/1910.06711) + - [HiFi-GAN](https://arxiv.org/abs/2010.05646) + - [NSF-HiFiGAN](https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts) + - [BigVGAN](https://arxiv.org/abs/2206.04658) + - [APNet](https://arxiv.org/abs/2305.07952) +- **Flow-based vocoders** (👨‍💻 developing): + - [WaveGlow](https://arxiv.org/abs/1811.00002) +- **Diffusion-based vocoders**, which we have provided [**a unified recipe**](diffusion/README.md): + - [Diffwave](https://arxiv.org/abs/2009.09761) +- **Auto-regressive based vocoders** (👨‍💻 developing): + - [WaveNet](https://arxiv.org/abs/1609.03499) + - [WaveRNN](https://arxiv.org/abs/1802.08435v1) \ No newline at end of file diff --git a/egs/vocoder/diffusion/README.md b/egs/vocoder/diffusion/README.md new file mode 100644 index 0000000000000000000000000000000000000000..466d4fd28f9bd5dc95437bccec7f3fe7fe3c808c --- /dev/null +++ b/egs/vocoder/diffusion/README.md @@ -0,0 +1,212 @@ +# Amphion Diffusion-based Vocoder Recipe + +## Supported Model Architectures + +Diffusion-based Vocoders utilize the diffusion process for audio generation, as illustrated below: + +
+
+ +
+
+ +Until now, Amphion Diffusion-based Vocoder has supported the following models and training strategies. + +- **Models** + - [DiffWave](https://arxiv.org/pdf/2009.09761) +- **Training and Inference Strategy** + - [DDPM](https://proceedings.neurips.cc/paper/2020/hash/4c5bcfec8584af0d967f1ab10179ca4b-Abstract.html) + +You can use any vocoder architecture with any dataset you want. There are four steps in total: + +1. Data preparation +2. Feature extraction +3. Training +4. Inference + +> **NOTE:** You need to run every command of this recipe in the `Amphion` root path: +> ```bash +> cd Amphion +> ``` + +## 1. Data Preparation + +You can train the vocoder with any datasets. Amphion's supported open-source datasets are detailed [here](../../../datasets/README.md). + +### Configuration + +Specify the dataset path in `exp_config_base.json`. Note that you can change the `dataset` list to use your preferred datasets. + +```json +"dataset": [ + "csd", + "kising", + "m4singer", + "nus48e", + "opencpop", + "opensinger", + "opera", + "pjs", + "popbutfy", + "popcs", + "ljspeech", + "vctk", + "libritts", +], +"dataset_path": { + // TODO: Fill in your dataset path + "csd": "[dataset path]", + "kising": "[dataset path]", + "m4singer": "[dataset path]", + "nus48e": "[dataset path]", + "opencpop": "[dataset path]", + "opensinger": "[dataset path]", + "opera": "[dataset path]", + "pjs": "[dataset path]", + "popbutfy": "[dataset path]", + "popcs": "[dataset path]", + "ljspeech": "[dataset path]", + "vctk": "[dataset path]", + "libritts": "[dataset path]", +}, +``` + +### 2. Feature Extraction + +The needed features are speficied in the individual vocoder direction so it doesn't require any modification. + +### Configuration + +Specify the dataset path and the output path for saving the processed data and the training model in `exp_config_base.json`: + +```json + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/vocoder" + "log_dir": "ckpts/vocoder", + "preprocess": { + // TODO: Fill in the output data path. The default value is "Amphion/data" + "processed_dir": "data", + ... + }, +``` + +### Run + +Run the `run.sh` as the preproces stage (set `--stage 1`). + +```bash +sh egs/vocoder/diffusion/{vocoder_name}/run.sh --stage 1 +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "1"`. + +## 3. Training + +### Configuration + +We provide the default hyparameters in the `exp_config_base.json`. They can work on single NVIDIA-24g GPU. You can adjust them based on you GPU machines. + +```json +"train": { + "batch_size": 32, + "max_epoch": 1000000, + "save_checkpoint_stride": [20], + "adamw": { + "lr": 2.0e-4, + "adam_b1": 0.8, + "adam_b2": 0.99 + }, + "exponential_lr": { + "lr_decay": 0.999 + }, +} +``` + +### Run + +Run the `run.sh` as the training stage (set `--stage 2`). Specify a experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/vocoder/[YourExptName]`. + +```bash +sh egs/vocoder/diffusion/{vocoder_name}/run.sh --stage 2 --name [YourExptName] +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "0,1,2,3"`. 
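+
+For orientation only: assuming the `adamw` and `exponential_lr` entries above map onto `torch.optim.AdamW` and `torch.optim.lr_scheduler.ExponentialLR` (Amphion's trainer builds these internally, so you never write this yourself), the hyperparameters roughly correspond to the following PyTorch sketch.
+
+```python
+# Rough PyTorch equivalent of the "adamw" / "exponential_lr" config entries.
+# Illustrative only: mapping adam_b1/adam_b2 to betas and lr_decay to gamma is
+# an assumption about how the trainer consumes these fields.
+import torch
+
+generator = torch.nn.Linear(100, 100)  # stand-in for the vocoder generator
+optimizer = torch.optim.AdamW(generator.parameters(), lr=2.0e-4, betas=(0.8, 0.99))
+scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.999)
+
+for epoch in range(3):
+    # ... forward/backward passes and optimizer.step() would happen here ...
+    scheduler.step()                    # lr <- lr * 0.999 after each interval
+    print(epoch, scheduler.get_last_lr())
+```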
+ +If you want to resume or finetune from a pretrained model, run: + +```bash +sh egs/vocoder/diffusion/{vocoder_name}/run.sh --stage 2 \ + --name [YourExptName] \ + --resume_type ["resume" for resuming training and "finetune" for loading parameters only] \ + --checkpoint Amphion/ckpts/vocoder/[YourExptName]/checkpoint \ +``` + +> **NOTE:** For multi-gpu training, the `main_process_port` is set as `29500` in default. You can change it when running `run.sh` by specifying such as `--main_process_port 29501`. + +## 4. Inference + +### Run + +Run the `run.sh` as the training stage (set `--stage 3`), we provide three different inference modes, including `infer_from_dataset`, `infer_from_feature`, `and infer_from_audio`. + +```bash +sh egs/vocoder/diffusion/{vocoder_name}/run.sh --stage 3 \ + --infer_mode [Your chosen inference mode] \ + --infer_datasets [Datasets you want to inference, needed when infer_from_dataset] \ + --infer_feature_dir [Your path to your predicted acoustic features, needed when infer_from_feature] \ + --infer_audio_dir [Your path to your audio files, needed when infer_form_audio] \ + --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \ + --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \ +``` + +#### a. Inference from Dataset + +Run the `run.sh` with specified datasets, here is an example. + +```bash +sh egs/vocoder/diffusion/{vocoder_name}/run.sh --stage 3 \ + --infer_mode infer_from_dataset \ + --infer_datasets "libritts vctk ljspeech" \ + --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \ + --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \ +``` + +#### b. Inference from Features + +If you want to inference from your generated acoustic features, you should first prepare your acoustic features into the following structure: + +```plaintext + ┣ {infer_feature_dir} + ┃ ┣ mels + ┃ ┃ ┣ sample1.npy + ┃ ┃ ┣ sample2.npy +``` + +Then run the `run.sh` with specificed folder direction, here is an example. + +```bash +sh egs/vocoder/diffusion/{vocoder_name}/run.sh --stage 3 \ + --infer_mode infer_from_feature \ + --infer_feature_dir [Your path to your predicted acoustic features] \ + --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \ + --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \ +``` + +#### c. Inference from Audios + +If you want to inference from audios for quick analysis synthesis, you should first prepare your audios into the following structure: + +```plaintext + ┣ audios + ┃ ┣ sample1.wav + ┃ ┣ sample2.wav +``` + +Then run the `run.sh` with specificed folder direction, here is an example. + +```bash +sh egs/vocoder/diffusion/{vocoder_name}/run.sh --stage 3 \ + --infer_mode infer_from_audio \ + --infer_audio_dir [Your path to your audio files] \ + --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \ + --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \ +``` diff --git a/egs/vocoder/diffusion/_template/run.sh b/egs/vocoder/diffusion/_template/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..9d7a29276b2ea648f62b079f4d7d5daa871484f9 --- /dev/null +++ b/egs/vocoder/diffusion/_template/run.sh @@ -0,0 +1,141 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir)))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Parse the Given Parameters from the Commond ########### +options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,checkpoint:,resume_type:,main_process_port:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@") +eval set -- "$options" + +while true; do + case $1 in + # Experimental Configuration File + -c | --config) shift; exp_config=$1 ; shift ;; + # Experimental Name + -n | --name) shift; exp_name=$1 ; shift ;; + # Running Stage + -s | --stage) shift; running_stage=$1 ; shift ;; + # Visible GPU machines. The default value is "0". + --gpu) shift; gpu=$1 ; shift ;; + + # [Only for Training] The specific checkpoint path that you want to resume from. + --checkpoint) shift; checkpoint=$1 ; shift ;; + # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights. + --resume_type) shift; resume_type=$1 ; shift ;; + # [Only for Traiing] `main_process_port` for multi gpu training + --main_process_port) shift; main_process_port=$1 ; shift ;; + + # [Only for Inference] The inference mode + --infer_mode) shift; infer_mode=$1 ; shift ;; + # [Only for Inference] The inferenced datasets + --infer_datasets) shift; infer_datasets=$1 ; shift ;; + # [Only for Inference] The feature dir for inference + --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;; + # [Only for Inference] The audio dir for inference + --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;; + # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]" + --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;; + # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result" + --infer_output_dir) shift; infer_output_dir=$1 ; shift ;; + + --) shift ; break ;; + *) echo "Invalid option: $1" exit 1 ;; + esac +done + + +### Value check ### +if [ -z "$running_stage" ]; then + echo "[Error] Please specify the running stage" + exit 1 +fi + +if [ -z "$exp_config" ]; then + exp_config="${exp_dir}"/exp_config.json +fi +echo "Exprimental Configuration File: $exp_config" + +if [ -z "$gpu" ]; then + gpu="0" +fi + +if [ -z "$main_process_port" ]; then + main_process_port=29500 +fi +echo "Main Process Port: $main_process_port" + +######## Features Extraction ########### +if [ $running_stage -eq 1 ]; then + CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \ + --config $exp_config \ + --num_workers 8 +fi + +######## Training ########### +if [ $running_stage -eq 2 ]; then + if [ -z "$exp_name" ]; then + echo "[Error] Please specify the experiments name" + exit 1 + fi + echo "Exprimental Name: $exp_name" + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch \ + --main_process_port "$main_process_port" \ + "${work_dir}"/bins/vocoder/train.py \ + --config "$exp_config" \ + --exp_name "$exp_name" \ + --log_level info \ + --checkpoint "$checkpoint" \ + --resume_type "$resume_type" +fi + +######## Inference/Conversion ########### +if [ $running_stage -eq 3 ]; then + if [ -z "$infer_expt_dir" ]; then + echo "[Error] Please specify the experimental directionary. 
The value is like [Your path to save logs and checkpoints]/[YourExptName]" + exit 1 + fi + + if [ -z "$infer_output_dir" ]; then + infer_output_dir="$infer_expt_dir/result" + fi + + if [ $infer_mode = "infer_from_dataset" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --infer_datasets $infer_datasets \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_feature" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --feature_folder $infer_feature_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_audio" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --audio_folder $infer_audio_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + +fi \ No newline at end of file diff --git a/egs/vocoder/diffusion/diffwave/exp_config.json b/egs/vocoder/diffusion/diffwave/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1e429c628b854247b7e92dba6e1d24d870f09d47 --- /dev/null +++ b/egs/vocoder/diffusion/diffwave/exp_config.json @@ -0,0 +1,26 @@ +{ + "base_config": "egs/vocoder/diffusion/exp_config_base.json", + "preprocess": { + // acoustic features + "extract_mel": true, + "extract_audio": true, + + // Features used for model training + "use_mel": true, + "use_audio": true, + }, + "model": { + "generator": "diffwave", + "diffwave": { + "residual_channels": 64, + "residual_layers": 30, + "dilation_cycle_length": 10, + "noise_schedule_factors": [1.0e-4, 0.05, 50], + "inference_noise_schedule": [0.0001, 0.001, 0.01, 0.05, 0.2, 0.5], + "upsample_factors": [16, 16], + } + }, + "inference": { + "batch_size": 1, + } +} diff --git a/egs/vocoder/diffusion/diffwave/run.sh b/egs/vocoder/diffusion/diffwave/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..9d7a29276b2ea648f62b079f4d7d5daa871484f9 --- /dev/null +++ b/egs/vocoder/diffusion/diffwave/run.sh @@ -0,0 +1,141 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir)))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Parse the Given Parameters from the Commond ########### +options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,checkpoint:,resume_type:,main_process_port:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@") +eval set -- "$options" + +while true; do + case $1 in + # Experimental Configuration File + -c | --config) shift; exp_config=$1 ; shift ;; + # Experimental Name + -n | --name) shift; exp_name=$1 ; shift ;; + # Running Stage + -s | --stage) shift; running_stage=$1 ; shift ;; + # Visible GPU machines. The default value is "0". + --gpu) shift; gpu=$1 ; shift ;; + + # [Only for Training] The specific checkpoint path that you want to resume from. 
+ --checkpoint) shift; checkpoint=$1 ; shift ;; + # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights. + --resume_type) shift; resume_type=$1 ; shift ;; + # [Only for Traiing] `main_process_port` for multi gpu training + --main_process_port) shift; main_process_port=$1 ; shift ;; + + # [Only for Inference] The inference mode + --infer_mode) shift; infer_mode=$1 ; shift ;; + # [Only for Inference] The inferenced datasets + --infer_datasets) shift; infer_datasets=$1 ; shift ;; + # [Only for Inference] The feature dir for inference + --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;; + # [Only for Inference] The audio dir for inference + --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;; + # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]" + --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;; + # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result" + --infer_output_dir) shift; infer_output_dir=$1 ; shift ;; + + --) shift ; break ;; + *) echo "Invalid option: $1" exit 1 ;; + esac +done + + +### Value check ### +if [ -z "$running_stage" ]; then + echo "[Error] Please specify the running stage" + exit 1 +fi + +if [ -z "$exp_config" ]; then + exp_config="${exp_dir}"/exp_config.json +fi +echo "Exprimental Configuration File: $exp_config" + +if [ -z "$gpu" ]; then + gpu="0" +fi + +if [ -z "$main_process_port" ]; then + main_process_port=29500 +fi +echo "Main Process Port: $main_process_port" + +######## Features Extraction ########### +if [ $running_stage -eq 1 ]; then + CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \ + --config $exp_config \ + --num_workers 8 +fi + +######## Training ########### +if [ $running_stage -eq 2 ]; then + if [ -z "$exp_name" ]; then + echo "[Error] Please specify the experiments name" + exit 1 + fi + echo "Exprimental Name: $exp_name" + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch \ + --main_process_port "$main_process_port" \ + "${work_dir}"/bins/vocoder/train.py \ + --config "$exp_config" \ + --exp_name "$exp_name" \ + --log_level info \ + --checkpoint "$checkpoint" \ + --resume_type "$resume_type" +fi + +######## Inference/Conversion ########### +if [ $running_stage -eq 3 ]; then + if [ -z "$infer_expt_dir" ]; then + echo "[Error] Please specify the experimental directionary. 
The value is like [Your path to save logs and checkpoints]/[YourExptName]" + exit 1 + fi + + if [ -z "$infer_output_dir" ]; then + infer_output_dir="$infer_expt_dir/result" + fi + + if [ $infer_mode = "infer_from_dataset" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --infer_datasets $infer_datasets \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_feature" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --feature_folder $infer_feature_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_audio" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --audio_folder $infer_audio_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + +fi \ No newline at end of file diff --git a/egs/vocoder/diffusion/exp_config_base.json b/egs/vocoder/diffusion/exp_config_base.json new file mode 100644 index 0000000000000000000000000000000000000000..efc958be1f41bed99d1d1724b1e0248c4fe0f7e9 --- /dev/null +++ b/egs/vocoder/diffusion/exp_config_base.json @@ -0,0 +1,71 @@ +{ + "base_config": "config/vocoder.json", + "model_type": "DiffusionVocoder", + // TODO: Choose your needed datasets + "dataset": [ + "csd", + "kising", + "m4singer", + "nus48e", + "opencpop", + "opensinger", + "opera", + "pjs", + "popbutfy", + "popcs", + "ljspeech", + "vctk", + "libritts", + ], + "dataset_path": { + // TODO: Fill in your dataset path + "csd": "[dataset path]", + "kising": "[dataset path]", + "m4singer": "[dataset path]", + "nus48e": "[dataset path]", + "opencpop": "[dataset path]", + "opensinger": "[dataset path]", + "opera": "[dataset path]", + "pjs": "[dataset path]", + "popbutfy": "[dataset path]", + "popcs": "[dataset path]", + "ljspeech": "[dataset path]", + "vctk": "[dataset path]", + "libritts": "[dataset path]", + }, + // TODO: Fill in the output log path + "log_dir": "ckpts/vocoder", + "preprocess": { + // Acoustic features + "extract_mel": true, + "extract_audio": true, + "extract_pitch": false, + "extract_uv": false, + "pitch_extractor": "parselmouth", + + // Features used for model training + "use_mel": true, + "use_frame_pitch": false, + "use_uv": false, + "use_audio": true, + + // TODO: Fill in the output data path + "processed_dir": "data/", + "n_mel": 100, + "sample_rate": 24000 + }, + "train": { + // TODO: Choose a suitable batch size, training epoch, and save stride + "batch_size": 32, + "max_epoch": 1000000, + "save_checkpoint_stride": [20], + "adamw": { + "lr": 2.0e-4, + "adam_b1": 0.8, + "adam_b2": 0.99 + }, + "exponential_lr": { + "lr_decay": 0.999 + }, + } +} \ No newline at end of file diff --git a/egs/vocoder/gan/README.md b/egs/vocoder/gan/README.md new file mode 100644 index 0000000000000000000000000000000000000000..dcefd84d6b9c02223efac119d29dcfd88cd0c026 --- /dev/null +++ b/egs/vocoder/gan/README.md @@ -0,0 +1,234 @@ +# Amphion GAN-based Vocoder Recipe + +## Supported Model Architectures + +GAN-based Vocoder consists of a generator and multiple discriminators, as illustrated below: + +
+[Figure: GAN-based vocoder pipeline, a generator trained against multiple discriminators]
+ +Until now, Amphion GAN-based Vocoder has supported the following generators and discriminators. + +- **Generators** + - [MelGAN](https://arxiv.org/abs/1910.06711) + - [HiFi-GAN](https://arxiv.org/abs/2010.05646) + - [NSF-HiFiGAN](https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts) + - [BigVGAN](https://arxiv.org/abs/2206.04658) + - [APNet](https://arxiv.org/abs/2305.07952) +- **Discriminators** + - [Multi-Scale Discriminator](https://arxiv.org/abs/2010.05646) + - [Multi-Period Discriminator](https://arxiv.org/abs/2010.05646) + - [Multi-Resolution Discriminator](https://arxiv.org/abs/2011.09631) + - [Multi-Scale Short-Time Fourier Transform Discriminator](https://arxiv.org/abs/2210.13438) + - [**Multi-Scale Constant-Q Transfrom Discriminator (ours)**](https://arxiv.org/abs/2311.14957) + +You can use any vocoder architecture with any dataset you want. There are four steps in total: + +1. Data preparation +2. Feature extraction +3. Training +4. Inference + +> **NOTE:** You need to run every command of this recipe in the `Amphion` root path: +> ```bash +> cd Amphion +> ``` + +## 1. Data Preparation + +You can train the vocoder with any datasets. Amphion's supported open-source datasets are detailed [here](../../../datasets/README.md). + +### Configuration + +Specify the dataset path in `exp_config_base.json`. Note that you can change the `dataset` list to use your preferred datasets. + +```json +"dataset": [ + "csd", + "kising", + "m4singer", + "nus48e", + "opencpop", + "opensinger", + "opera", + "pjs", + "popbutfy", + "popcs", + "ljspeech", + "vctk", + "libritts", +], +"dataset_path": { + // TODO: Fill in your dataset path + "csd": "[dataset path]", + "kising": "[dataset path]", + "m4singer": "[dataset path]", + "nus48e": "[dataset path]", + "opencpop": "[dataset path]", + "opensinger": "[dataset path]", + "opera": "[dataset path]", + "pjs": "[dataset path]", + "popbutfy": "[dataset path]", + "popcs": "[dataset path]", + "ljspeech": "[dataset path]", + "vctk": "[dataset path]", + "libritts": "[dataset path]", +}, +``` + +### 2. Feature Extraction + +The needed features are speficied in the individual vocoder direction so it doesn't require any modification. + +### Configuration + +Specify the dataset path and the output path for saving the processed data and the training model in `exp_config_base.json`: + +```json + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/vocoder" + "log_dir": "ckpts/vocoder", + "preprocess": { + // TODO: Fill in the output data path. The default value is "Amphion/data" + "processed_dir": "data", + ... + }, +``` + +### Run + +Run the `run.sh` as the preproces stage (set `--stage 1`). + +```bash +sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 1 +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "1"`. + +## 3. Training + +### Configuration + +We provide the default hyparameters in the `exp_config_base.json`. They can work on single NVIDIA-24g GPU. You can adjust them based on you GPU machines. + +```json +"train": { + "batch_size": 32, + "max_epoch": 1000000, + "save_checkpoint_stride": [20], + "adamw": { + "lr": 2.0e-4, + "adam_b1": 0.8, + "adam_b2": 0.99 + }, + "exponential_lr": { + "lr_decay": 0.999 + }, +} +``` + +You can also choose any amount of prefered discriminators for training in the `exp_config_base.json`. 
+ +```json +"discriminators": [ + "msd", + "mpd", + "msstftd", + "mssbcqtd", +], +``` + +### Run + +Run the `run.sh` as the training stage (set `--stage 2`). Specify a experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/vocoder/[YourExptName]`. + +```bash +sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 2 --name [YourExptName] +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "0,1,2,3"`. + +If you want to resume or finetune from a pretrained model, run: + +```bash +sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 2 \ + --name [YourExptName] \ + --resume_type ["resume" for resuming training and "finetune" for loading parameters only] \ + --checkpoint Amphion/ckpts/vocoder/[YourExptName]/checkpoint \ +``` + +> **NOTE:** For multi-gpu training, the `main_process_port` is set as `29500` in default. You can change it when running `run.sh` by specifying such as `--main_process_port 29501`. + +## 4. Inference + +### Run + +Run the `run.sh` as the training stage (set `--stage 3`), we provide three different inference modes, including `infer_from_dataset`, `infer_from_feature`, `and infer_from_audio`. + +```bash +sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \ + --infer_mode [Your chosen inference mode] \ + --infer_datasets [Datasets you want to inference, needed when infer_from_dataset] \ + --infer_feature_dir [Your path to your predicted acoustic features, needed when infer_from_feature] \ + --infer_audio_dir [Your path to your audio files, needed when infer_form_audio] \ + --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \ + --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \ +``` + +#### a. Inference from Dataset + +Run the `run.sh` with specified datasets, here is an example. + +```bash +sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \ + --infer_mode infer_from_dataset \ + --infer_datasets "libritts vctk ljspeech" \ + --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \ + --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \ +``` + +#### b. Inference from Features + +If you want to inference from your generated acoustic features, you should first prepare your acoustic features into the following structure: + +```plaintext + ┣ {infer_feature_dir} + ┃ ┣ mels + ┃ ┃ ┣ sample1.npy + ┃ ┃ ┣ sample2.npy + ┃ ┣ f0s (required if you use NSF-HiFiGAN) + ┃ ┃ ┣ sample1.npy + ┃ ┃ ┣ sample2.npy +``` + +Then run the `run.sh` with specificed folder direction, here is an example. + +```bash +sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \ + --infer_mode infer_from_feature \ + --infer_feature_dir [Your path to your predicted acoustic features] \ + --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \ + --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \ +``` + +#### c. Inference from Audios + +If you want to inference from audios for quick analysis synthesis, you should first prepare your audios into the following structure: + +```plaintext + ┣ audios + ┃ ┣ sample1.wav + ┃ ┣ sample2.wav +``` + +Then run the `run.sh` with specificed folder direction, here is an example. 
+ +```bash +sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \ + --infer_mode infer_from_audio \ + --infer_audio_dir [Your path to your audio files] \ + --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \ + --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \ +``` diff --git a/egs/vocoder/gan/_template/run.sh b/egs/vocoder/gan/_template/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..9d7a29276b2ea648f62b079f4d7d5daa871484f9 --- /dev/null +++ b/egs/vocoder/gan/_template/run.sh @@ -0,0 +1,141 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir)))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Parse the Given Parameters from the Commond ########### +options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,checkpoint:,resume_type:,main_process_port:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@") +eval set -- "$options" + +while true; do + case $1 in + # Experimental Configuration File + -c | --config) shift; exp_config=$1 ; shift ;; + # Experimental Name + -n | --name) shift; exp_name=$1 ; shift ;; + # Running Stage + -s | --stage) shift; running_stage=$1 ; shift ;; + # Visible GPU machines. The default value is "0". + --gpu) shift; gpu=$1 ; shift ;; + + # [Only for Training] The specific checkpoint path that you want to resume from. + --checkpoint) shift; checkpoint=$1 ; shift ;; + # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights. + --resume_type) shift; resume_type=$1 ; shift ;; + # [Only for Traiing] `main_process_port` for multi gpu training + --main_process_port) shift; main_process_port=$1 ; shift ;; + + # [Only for Inference] The inference mode + --infer_mode) shift; infer_mode=$1 ; shift ;; + # [Only for Inference] The inferenced datasets + --infer_datasets) shift; infer_datasets=$1 ; shift ;; + # [Only for Inference] The feature dir for inference + --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;; + # [Only for Inference] The audio dir for inference + --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;; + # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]" + --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;; + # [Only for Inference] The output dir to save inferred audios. 
Its default value is "$expt_dir/result" + --infer_output_dir) shift; infer_output_dir=$1 ; shift ;; + + --) shift ; break ;; + *) echo "Invalid option: $1" exit 1 ;; + esac +done + + +### Value check ### +if [ -z "$running_stage" ]; then + echo "[Error] Please specify the running stage" + exit 1 +fi + +if [ -z "$exp_config" ]; then + exp_config="${exp_dir}"/exp_config.json +fi +echo "Exprimental Configuration File: $exp_config" + +if [ -z "$gpu" ]; then + gpu="0" +fi + +if [ -z "$main_process_port" ]; then + main_process_port=29500 +fi +echo "Main Process Port: $main_process_port" + +######## Features Extraction ########### +if [ $running_stage -eq 1 ]; then + CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \ + --config $exp_config \ + --num_workers 8 +fi + +######## Training ########### +if [ $running_stage -eq 2 ]; then + if [ -z "$exp_name" ]; then + echo "[Error] Please specify the experiments name" + exit 1 + fi + echo "Exprimental Name: $exp_name" + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch \ + --main_process_port "$main_process_port" \ + "${work_dir}"/bins/vocoder/train.py \ + --config "$exp_config" \ + --exp_name "$exp_name" \ + --log_level info \ + --checkpoint "$checkpoint" \ + --resume_type "$resume_type" +fi + +######## Inference/Conversion ########### +if [ $running_stage -eq 3 ]; then + if [ -z "$infer_expt_dir" ]; then + echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]" + exit 1 + fi + + if [ -z "$infer_output_dir" ]; then + infer_output_dir="$infer_expt_dir/result" + fi + + if [ $infer_mode = "infer_from_dataset" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --infer_datasets $infer_datasets \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_feature" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --feature_folder $infer_feature_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_audio" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --audio_folder $infer_audio_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + +fi \ No newline at end of file diff --git a/egs/vocoder/gan/apnet/exp_config.json b/egs/vocoder/gan/apnet/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..520d3da3ea36201b8fa71e0fb1fd0d1ef9bbfdc0 --- /dev/null +++ b/egs/vocoder/gan/apnet/exp_config.json @@ -0,0 +1,45 @@ +{ + "base_config": "egs/vocoder/gan/exp_config_base.json", + "preprocess": { + // acoustic features + "extract_mel": true, + "extract_audio": true, + "extract_amplitude_phase": true, + + // Features used for model training + "use_mel": true, + "use_audio": true, + "use_amplitude_phase": true + }, + "model": { + "generator": "apnet", + "apnet": { + "ASP_channel": 512, + "ASP_resblock_kernel_sizes": [3,7,11], + "ASP_resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "ASP_input_conv_kernel_size": 7, + "ASP_output_conv_kernel_size": 7, + + "PSP_channel": 512, + "PSP_resblock_kernel_sizes": [3,7,11], + "PSP_resblock_dilation_sizes": 
[[1,3,5], [1,3,5], [1,3,5]], + "PSP_input_conv_kernel_size": 7, + "PSP_output_R_conv_kernel_size": 7, + "PSP_output_I_conv_kernel_size": 7, + } + }, + "train": { + "criterions": [ + "feature", + "discriminator", + "generator", + "mel", + "phase", + "amplitude", + "consistency" + ] + }, + "inference": { + "batch_size": 1, + } +} diff --git a/egs/vocoder/gan/apnet/run.sh b/egs/vocoder/gan/apnet/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..9d7a29276b2ea648f62b079f4d7d5daa871484f9 --- /dev/null +++ b/egs/vocoder/gan/apnet/run.sh @@ -0,0 +1,141 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir)))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Parse the Given Parameters from the Commond ########### +options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,checkpoint:,resume_type:,main_process_port:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@") +eval set -- "$options" + +while true; do + case $1 in + # Experimental Configuration File + -c | --config) shift; exp_config=$1 ; shift ;; + # Experimental Name + -n | --name) shift; exp_name=$1 ; shift ;; + # Running Stage + -s | --stage) shift; running_stage=$1 ; shift ;; + # Visible GPU machines. The default value is "0". + --gpu) shift; gpu=$1 ; shift ;; + + # [Only for Training] The specific checkpoint path that you want to resume from. + --checkpoint) shift; checkpoint=$1 ; shift ;; + # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights. + --resume_type) shift; resume_type=$1 ; shift ;; + # [Only for Traiing] `main_process_port` for multi gpu training + --main_process_port) shift; main_process_port=$1 ; shift ;; + + # [Only for Inference] The inference mode + --infer_mode) shift; infer_mode=$1 ; shift ;; + # [Only for Inference] The inferenced datasets + --infer_datasets) shift; infer_datasets=$1 ; shift ;; + # [Only for Inference] The feature dir for inference + --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;; + # [Only for Inference] The audio dir for inference + --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;; + # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]" + --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;; + # [Only for Inference] The output dir to save inferred audios. 
Its default value is "$expt_dir/result" + --infer_output_dir) shift; infer_output_dir=$1 ; shift ;; + + --) shift ; break ;; + *) echo "Invalid option: $1" exit 1 ;; + esac +done + + +### Value check ### +if [ -z "$running_stage" ]; then + echo "[Error] Please specify the running stage" + exit 1 +fi + +if [ -z "$exp_config" ]; then + exp_config="${exp_dir}"/exp_config.json +fi +echo "Exprimental Configuration File: $exp_config" + +if [ -z "$gpu" ]; then + gpu="0" +fi + +if [ -z "$main_process_port" ]; then + main_process_port=29500 +fi +echo "Main Process Port: $main_process_port" + +######## Features Extraction ########### +if [ $running_stage -eq 1 ]; then + CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \ + --config $exp_config \ + --num_workers 8 +fi + +######## Training ########### +if [ $running_stage -eq 2 ]; then + if [ -z "$exp_name" ]; then + echo "[Error] Please specify the experiments name" + exit 1 + fi + echo "Exprimental Name: $exp_name" + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch \ + --main_process_port "$main_process_port" \ + "${work_dir}"/bins/vocoder/train.py \ + --config "$exp_config" \ + --exp_name "$exp_name" \ + --log_level info \ + --checkpoint "$checkpoint" \ + --resume_type "$resume_type" +fi + +######## Inference/Conversion ########### +if [ $running_stage -eq 3 ]; then + if [ -z "$infer_expt_dir" ]; then + echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]" + exit 1 + fi + + if [ -z "$infer_output_dir" ]; then + infer_output_dir="$infer_expt_dir/result" + fi + + if [ $infer_mode = "infer_from_dataset" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --infer_datasets $infer_datasets \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_feature" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --feature_folder $infer_feature_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_audio" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --audio_folder $infer_audio_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + +fi \ No newline at end of file diff --git a/egs/vocoder/gan/bigvgan/exp_config.json b/egs/vocoder/gan/bigvgan/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..cbed8a4bcd082f8a4cfa368a372f35c7e1a94973 --- /dev/null +++ b/egs/vocoder/gan/bigvgan/exp_config.json @@ -0,0 +1,66 @@ +{ + "base_config": "egs/vocoder/gan/exp_config_base.json", + "preprocess": { + // acoustic features + "extract_mel": true, + "extract_audio": true, + + // Features used for model training + "use_mel": true, + "use_audio": true + }, + "model": { + "generator": "bigvgan", + "bigvgan": { + "resblock": "1", + "activation": "snakebeta", + "snake_logscale": true, + "upsample_rates": [ + 8, + 8, + 2, + 2, + ], + "upsample_kernel_sizes": [ + 16, + 16, + 4, + 4 + ], + "upsample_initial_channel": 512, + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + 
] + ] + } + }, + "train": { + "criterions": [ + "feature", + "discriminator", + "generator", + "mel", + ] + }, + "inference": { + "batch_size": 1, + } +} diff --git a/egs/vocoder/gan/bigvgan/run.sh b/egs/vocoder/gan/bigvgan/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..9d7a29276b2ea648f62b079f4d7d5daa871484f9 --- /dev/null +++ b/egs/vocoder/gan/bigvgan/run.sh @@ -0,0 +1,141 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir)))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Parse the Given Parameters from the Commond ########### +options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,checkpoint:,resume_type:,main_process_port:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@") +eval set -- "$options" + +while true; do + case $1 in + # Experimental Configuration File + -c | --config) shift; exp_config=$1 ; shift ;; + # Experimental Name + -n | --name) shift; exp_name=$1 ; shift ;; + # Running Stage + -s | --stage) shift; running_stage=$1 ; shift ;; + # Visible GPU machines. The default value is "0". + --gpu) shift; gpu=$1 ; shift ;; + + # [Only for Training] The specific checkpoint path that you want to resume from. + --checkpoint) shift; checkpoint=$1 ; shift ;; + # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights. + --resume_type) shift; resume_type=$1 ; shift ;; + # [Only for Traiing] `main_process_port` for multi gpu training + --main_process_port) shift; main_process_port=$1 ; shift ;; + + # [Only for Inference] The inference mode + --infer_mode) shift; infer_mode=$1 ; shift ;; + # [Only for Inference] The inferenced datasets + --infer_datasets) shift; infer_datasets=$1 ; shift ;; + # [Only for Inference] The feature dir for inference + --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;; + # [Only for Inference] The audio dir for inference + --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;; + # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]" + --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;; + # [Only for Inference] The output dir to save inferred audios. 
Its default value is "$expt_dir/result" + --infer_output_dir) shift; infer_output_dir=$1 ; shift ;; + + --) shift ; break ;; + *) echo "Invalid option: $1" exit 1 ;; + esac +done + + +### Value check ### +if [ -z "$running_stage" ]; then + echo "[Error] Please specify the running stage" + exit 1 +fi + +if [ -z "$exp_config" ]; then + exp_config="${exp_dir}"/exp_config.json +fi +echo "Exprimental Configuration File: $exp_config" + +if [ -z "$gpu" ]; then + gpu="0" +fi + +if [ -z "$main_process_port" ]; then + main_process_port=29500 +fi +echo "Main Process Port: $main_process_port" + +######## Features Extraction ########### +if [ $running_stage -eq 1 ]; then + CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \ + --config $exp_config \ + --num_workers 8 +fi + +######## Training ########### +if [ $running_stage -eq 2 ]; then + if [ -z "$exp_name" ]; then + echo "[Error] Please specify the experiments name" + exit 1 + fi + echo "Exprimental Name: $exp_name" + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch \ + --main_process_port "$main_process_port" \ + "${work_dir}"/bins/vocoder/train.py \ + --config "$exp_config" \ + --exp_name "$exp_name" \ + --log_level info \ + --checkpoint "$checkpoint" \ + --resume_type "$resume_type" +fi + +######## Inference/Conversion ########### +if [ $running_stage -eq 3 ]; then + if [ -z "$infer_expt_dir" ]; then + echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]" + exit 1 + fi + + if [ -z "$infer_output_dir" ]; then + infer_output_dir="$infer_expt_dir/result" + fi + + if [ $infer_mode = "infer_from_dataset" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --infer_datasets $infer_datasets \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_feature" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --feature_folder $infer_feature_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_audio" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --audio_folder $infer_audio_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + +fi \ No newline at end of file diff --git a/egs/vocoder/gan/bigvgan_large/exp_config.json b/egs/vocoder/gan/bigvgan_large/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a15dc3573839cd05085ac4540b651b98f94a1bf4 --- /dev/null +++ b/egs/vocoder/gan/bigvgan_large/exp_config.json @@ -0,0 +1,70 @@ +{ + "base_config": "egs/vocoder/gan/exp_config_base.json", + "preprocess": { + // acoustic features + "extract_mel": true, + "extract_audio": true, + + // Features used for model training + "use_mel": true, + "use_audio": true + }, + "model": { + "generator": "bigvgan", + "bigvgan": { + "resblock": "1", + "activation": "snakebeta", + "snake_logscale": true, + "upsample_rates": [ + 4, + 4, + 2, + 2, + 2, + 2 + ], + "upsample_kernel_sizes": [ + 8, + 8, + 4, + 4, + 4, + 4 + ], + "upsample_initial_channel": 1536, + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 
1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ] + }, + }, + "train": { + "criterions": [ + "feature", + "discriminator", + "generator", + "mel", + ] + }, + "inference": { + "batch_size": 1, + } +} diff --git a/egs/vocoder/gan/bigvgan_large/run.sh b/egs/vocoder/gan/bigvgan_large/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..9d7a29276b2ea648f62b079f4d7d5daa871484f9 --- /dev/null +++ b/egs/vocoder/gan/bigvgan_large/run.sh @@ -0,0 +1,141 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir)))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Parse the Given Parameters from the Commond ########### +options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,checkpoint:,resume_type:,main_process_port:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@") +eval set -- "$options" + +while true; do + case $1 in + # Experimental Configuration File + -c | --config) shift; exp_config=$1 ; shift ;; + # Experimental Name + -n | --name) shift; exp_name=$1 ; shift ;; + # Running Stage + -s | --stage) shift; running_stage=$1 ; shift ;; + # Visible GPU machines. The default value is "0". + --gpu) shift; gpu=$1 ; shift ;; + + # [Only for Training] The specific checkpoint path that you want to resume from. + --checkpoint) shift; checkpoint=$1 ; shift ;; + # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights. + --resume_type) shift; resume_type=$1 ; shift ;; + # [Only for Traiing] `main_process_port` for multi gpu training + --main_process_port) shift; main_process_port=$1 ; shift ;; + + # [Only for Inference] The inference mode + --infer_mode) shift; infer_mode=$1 ; shift ;; + # [Only for Inference] The inferenced datasets + --infer_datasets) shift; infer_datasets=$1 ; shift ;; + # [Only for Inference] The feature dir for inference + --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;; + # [Only for Inference] The audio dir for inference + --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;; + # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]" + --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;; + # [Only for Inference] The output dir to save inferred audios. 
Its default value is "$expt_dir/result" + --infer_output_dir) shift; infer_output_dir=$1 ; shift ;; + + --) shift ; break ;; + *) echo "Invalid option: $1" exit 1 ;; + esac +done + + +### Value check ### +if [ -z "$running_stage" ]; then + echo "[Error] Please specify the running stage" + exit 1 +fi + +if [ -z "$exp_config" ]; then + exp_config="${exp_dir}"/exp_config.json +fi +echo "Exprimental Configuration File: $exp_config" + +if [ -z "$gpu" ]; then + gpu="0" +fi + +if [ -z "$main_process_port" ]; then + main_process_port=29500 +fi +echo "Main Process Port: $main_process_port" + +######## Features Extraction ########### +if [ $running_stage -eq 1 ]; then + CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \ + --config $exp_config \ + --num_workers 8 +fi + +######## Training ########### +if [ $running_stage -eq 2 ]; then + if [ -z "$exp_name" ]; then + echo "[Error] Please specify the experiments name" + exit 1 + fi + echo "Exprimental Name: $exp_name" + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch \ + --main_process_port "$main_process_port" \ + "${work_dir}"/bins/vocoder/train.py \ + --config "$exp_config" \ + --exp_name "$exp_name" \ + --log_level info \ + --checkpoint "$checkpoint" \ + --resume_type "$resume_type" +fi + +######## Inference/Conversion ########### +if [ $running_stage -eq 3 ]; then + if [ -z "$infer_expt_dir" ]; then + echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]" + exit 1 + fi + + if [ -z "$infer_output_dir" ]; then + infer_output_dir="$infer_expt_dir/result" + fi + + if [ $infer_mode = "infer_from_dataset" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --infer_datasets $infer_datasets \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_feature" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --feature_folder $infer_feature_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_audio" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --audio_folder $infer_audio_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + +fi \ No newline at end of file diff --git a/egs/vocoder/gan/exp_config_base.json b/egs/vocoder/gan/exp_config_base.json new file mode 100644 index 0000000000000000000000000000000000000000..84dbf7bcc0f5044c1e1f1d67b59cf26a338dff3a --- /dev/null +++ b/egs/vocoder/gan/exp_config_base.json @@ -0,0 +1,111 @@ +{ + "base_config": "config/vocoder.json", + "model_type": "GANVocoder", + // TODO: Choose your needed datasets + "dataset": [ + "csd", + "kising", + "m4singer", + "nus48e", + "opencpop", + "opensinger", + "opera", + "pjs", + "popbutfy", + "popcs", + "ljspeech", + "vctk", + "libritts", + ], + "dataset_path": { + // TODO: Fill in your dataset path + "csd": "[dataset path]", + "kising": "[dataset path]", + "m4singer": "[dataset path]", + "nus48e": "[dataset path]", + "opencpop": "[dataset path]", + "opensinger": "[dataset path]", + "opera": "[dataset path]", + "pjs": "[dataset path]", + "popbutfy": "[dataset path]", + "popcs": "[dataset path]", 
+ "ljspeech": "[dataset path]", + "vctk": "[dataset path]", + "libritts": "[dataset path]", + }, + // TODO: Fill in the output log path + "log_dir": "ckpts/vocoder", + "preprocess": { + // Acoustic features + "extract_mel": true, + "extract_audio": true, + "extract_pitch": false, + "extract_uv": false, + "pitch_extractor": "parselmouth", + + // Features used for model training + "use_mel": true, + "use_frame_pitch": false, + "use_uv": false, + "use_audio": true, + + // TODO: Fill in the output data path + "processed_dir": "data/", + "n_mel": 100, + "sample_rate": 24000 + }, + "model": { + // TODO: Choose your needed discriminators + "discriminators": [ + "msd", + "mpd", + "msstftd", + "mssbcqtd", + ], + "mpd": { + "mpd_reshapes": [ + 2, + 3, + 5, + 7, + 11 + ], + "use_spectral_norm": false, + "discriminator_channel_mult_factor": 1 + }, + "mrd": { + "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]], + "use_spectral_norm": false, + "discriminator_channel_mult_factor": 1, + "mrd_override": false + }, + "msstftd": { + "filters": 32 + }, + "mssbcqtd": { + hop_lengths: [512, 256, 256], + filters: 32, + max_filters: 1024, + filters_scale: 1, + dilations: [1, 2, 4], + in_channels: 1, + out_channels: 1, + n_octaves: [9, 9, 9], + bins_per_octaves: [24, 36, 48] + }, + }, + "train": { + // TODO: Choose a suitable batch size, training epoch, and save stride + "batch_size": 32, + "max_epoch": 1000000, + "save_checkpoint_stride": [20], + "adamw": { + "lr": 2.0e-4, + "adam_b1": 0.8, + "adam_b2": 0.99 + }, + "exponential_lr": { + "lr_decay": 0.999 + }, + } +} \ No newline at end of file diff --git a/egs/vocoder/gan/hifigan/exp_config.json b/egs/vocoder/gan/hifigan/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b06712eac472a8452d7cfbfcf8bdf9bf3b232514 --- /dev/null +++ b/egs/vocoder/gan/hifigan/exp_config.json @@ -0,0 +1,59 @@ +{ + "base_config": "egs/vocoder/gan/exp_config_base.json", + "preprocess": { + // acoustic features + "extract_mel": true, + "extract_audio": true, + + // Features used for model training + "use_mel": true, + "use_audio": true + }, + "model": { + "generator": "hifigan", + "hifigan": { + "resblock": "2", + "upsample_rates": [ + 8, + 8, + 4 + ], + "upsample_kernel_sizes": [ + 16, + 16, + 8 + ], + "upsample_initial_channel": 256, + "resblock_kernel_sizes": [ + 3, + 5, + 7 + ], + "resblock_dilation_sizes": [ + [ + 1, + 2 + ], + [ + 2, + 6 + ], + [ + 3, + 12 + ] + ] + } + }, + "train": { + "criterions": [ + "feature", + "discriminator", + "generator", + "mel", + ] + }, + "inference": { + "batch_size": 1, + } +} diff --git a/egs/vocoder/gan/hifigan/run.sh b/egs/vocoder/gan/hifigan/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..9d7a29276b2ea648f62b079f4d7d5daa871484f9 --- /dev/null +++ b/egs/vocoder/gan/hifigan/run.sh @@ -0,0 +1,141 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir)))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Parse the Given Parameters from the Commond ########### +options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,checkpoint:,resume_type:,main_process_port:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@") +eval set -- "$options" + +while true; do + case $1 in + # Experimental Configuration File + -c | --config) shift; exp_config=$1 ; shift ;; + # Experimental Name + -n | --name) shift; exp_name=$1 ; shift ;; + # Running Stage + -s | --stage) shift; running_stage=$1 ; shift ;; + # Visible GPU machines. The default value is "0". + --gpu) shift; gpu=$1 ; shift ;; + + # [Only for Training] The specific checkpoint path that you want to resume from. + --checkpoint) shift; checkpoint=$1 ; shift ;; + # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights. + --resume_type) shift; resume_type=$1 ; shift ;; + # [Only for Traiing] `main_process_port` for multi gpu training + --main_process_port) shift; main_process_port=$1 ; shift ;; + + # [Only for Inference] The inference mode + --infer_mode) shift; infer_mode=$1 ; shift ;; + # [Only for Inference] The inferenced datasets + --infer_datasets) shift; infer_datasets=$1 ; shift ;; + # [Only for Inference] The feature dir for inference + --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;; + # [Only for Inference] The audio dir for inference + --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;; + # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]" + --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;; + # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result" + --infer_output_dir) shift; infer_output_dir=$1 ; shift ;; + + --) shift ; break ;; + *) echo "Invalid option: $1" exit 1 ;; + esac +done + + +### Value check ### +if [ -z "$running_stage" ]; then + echo "[Error] Please specify the running stage" + exit 1 +fi + +if [ -z "$exp_config" ]; then + exp_config="${exp_dir}"/exp_config.json +fi +echo "Exprimental Configuration File: $exp_config" + +if [ -z "$gpu" ]; then + gpu="0" +fi + +if [ -z "$main_process_port" ]; then + main_process_port=29500 +fi +echo "Main Process Port: $main_process_port" + +######## Features Extraction ########### +if [ $running_stage -eq 1 ]; then + CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \ + --config $exp_config \ + --num_workers 8 +fi + +######## Training ########### +if [ $running_stage -eq 2 ]; then + if [ -z "$exp_name" ]; then + echo "[Error] Please specify the experiments name" + exit 1 + fi + echo "Exprimental Name: $exp_name" + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch \ + --main_process_port "$main_process_port" \ + "${work_dir}"/bins/vocoder/train.py \ + --config "$exp_config" \ + --exp_name "$exp_name" \ + --log_level info \ + --checkpoint "$checkpoint" \ + --resume_type "$resume_type" +fi + +######## Inference/Conversion ########### +if [ $running_stage -eq 3 ]; then + if [ -z "$infer_expt_dir" ]; then + echo "[Error] Please specify the experimental directionary. 
The value is like [Your path to save logs and checkpoints]/[YourExptName]" + exit 1 + fi + + if [ -z "$infer_output_dir" ]; then + infer_output_dir="$infer_expt_dir/result" + fi + + if [ $infer_mode = "infer_from_dataset" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --infer_datasets $infer_datasets \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_feature" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --feature_folder $infer_feature_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_audio" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --audio_folder $infer_audio_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + +fi \ No newline at end of file diff --git a/egs/vocoder/gan/melgan/exp_config.json b/egs/vocoder/gan/melgan/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9470600bc0d15afd07107f816995e8c67ad52d9a --- /dev/null +++ b/egs/vocoder/gan/melgan/exp_config.json @@ -0,0 +1,34 @@ +{ + "base_config": "egs/vocoder/gan/exp_config_base.json", + "preprocess": { + // acoustic features + "extract_mel": true, + "extract_audio": true, + + // Features used for model training + "use_mel": true, + "use_audio": true + }, + "model": { + "generator": "melgan", + "melgan": { + "ratios": [8, 8, 2, 2], + "ngf": 32, + "n_residual_layers": 3, + "num_D": 3, + "ndf": 16, + "n_layers": 4, + "downsampling_factor": 4 + }, + }, + "train": { + "criterions": [ + "feature", + "discriminator", + "generator", + ] + }, + "inference": { + "batch_size": 1, + } +} diff --git a/egs/vocoder/gan/melgan/run.sh b/egs/vocoder/gan/melgan/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..9d7a29276b2ea648f62b079f4d7d5daa871484f9 --- /dev/null +++ b/egs/vocoder/gan/melgan/run.sh @@ -0,0 +1,141 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir)))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Parse the Given Parameters from the Commond ########### +options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,checkpoint:,resume_type:,main_process_port:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@") +eval set -- "$options" + +while true; do + case $1 in + # Experimental Configuration File + -c | --config) shift; exp_config=$1 ; shift ;; + # Experimental Name + -n | --name) shift; exp_name=$1 ; shift ;; + # Running Stage + -s | --stage) shift; running_stage=$1 ; shift ;; + # Visible GPU machines. The default value is "0". + --gpu) shift; gpu=$1 ; shift ;; + + # [Only for Training] The specific checkpoint path that you want to resume from. 
+ --checkpoint) shift; checkpoint=$1 ; shift ;; + # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights. + --resume_type) shift; resume_type=$1 ; shift ;; + # [Only for Traiing] `main_process_port` for multi gpu training + --main_process_port) shift; main_process_port=$1 ; shift ;; + + # [Only for Inference] The inference mode + --infer_mode) shift; infer_mode=$1 ; shift ;; + # [Only for Inference] The inferenced datasets + --infer_datasets) shift; infer_datasets=$1 ; shift ;; + # [Only for Inference] The feature dir for inference + --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;; + # [Only for Inference] The audio dir for inference + --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;; + # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]" + --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;; + # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result" + --infer_output_dir) shift; infer_output_dir=$1 ; shift ;; + + --) shift ; break ;; + *) echo "Invalid option: $1" exit 1 ;; + esac +done + + +### Value check ### +if [ -z "$running_stage" ]; then + echo "[Error] Please specify the running stage" + exit 1 +fi + +if [ -z "$exp_config" ]; then + exp_config="${exp_dir}"/exp_config.json +fi +echo "Exprimental Configuration File: $exp_config" + +if [ -z "$gpu" ]; then + gpu="0" +fi + +if [ -z "$main_process_port" ]; then + main_process_port=29500 +fi +echo "Main Process Port: $main_process_port" + +######## Features Extraction ########### +if [ $running_stage -eq 1 ]; then + CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \ + --config $exp_config \ + --num_workers 8 +fi + +######## Training ########### +if [ $running_stage -eq 2 ]; then + if [ -z "$exp_name" ]; then + echo "[Error] Please specify the experiments name" + exit 1 + fi + echo "Exprimental Name: $exp_name" + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch \ + --main_process_port "$main_process_port" \ + "${work_dir}"/bins/vocoder/train.py \ + --config "$exp_config" \ + --exp_name "$exp_name" \ + --log_level info \ + --checkpoint "$checkpoint" \ + --resume_type "$resume_type" +fi + +######## Inference/Conversion ########### +if [ $running_stage -eq 3 ]; then + if [ -z "$infer_expt_dir" ]; then + echo "[Error] Please specify the experimental directionary. 
The value is like [Your path to save logs and checkpoints]/[YourExptName]" + exit 1 + fi + + if [ -z "$infer_output_dir" ]; then + infer_output_dir="$infer_expt_dir/result" + fi + + if [ $infer_mode = "infer_from_dataset" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --infer_datasets $infer_datasets \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_feature" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --feature_folder $infer_feature_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_audio" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --audio_folder $infer_audio_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + +fi \ No newline at end of file diff --git a/egs/vocoder/gan/nsfhifigan/exp_config.json b/egs/vocoder/gan/nsfhifigan/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ead9dc8909b6a5283dd12b721cfb4519cc3953ca --- /dev/null +++ b/egs/vocoder/gan/nsfhifigan/exp_config.json @@ -0,0 +1,83 @@ +{ + "base_config": "egs/vocoder/gan/exp_config_base.json", + "preprocess": { + // acoustic features + "extract_mel": true, + "extract_audio": true, + "extract_pitch": true, + + // Features used for model training + "use_mel": true, + "use_audio": true, + "use_frame_pitch": true + }, + "model": { + "generator": "nsfhifigan", + "nsfhifigan": { + "resblock": "1", + "harmonic_num": 8, + "upsample_rates": [ + 8, + 4, + 2, + 2, + 2 + ], + "upsample_kernel_sizes": [ + 16, + 8, + 4, + 4, + 4 + ], + "upsample_initial_channel": 768, + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ] + }, + "mpd": { + "mpd_reshapes": [ + 2, + 3, + 5, + 7, + 11, + 17, + 23, + 37 + ], + "use_spectral_norm": false, + "discriminator_channel_multi": 1 + } + }, + "train": { + "criterions": [ + "feature", + "discriminator", + "generator", + "mel", + ] + }, + "inference": { + "batch_size": 1, + } +} diff --git a/egs/vocoder/gan/nsfhifigan/run.sh b/egs/vocoder/gan/nsfhifigan/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..9d7a29276b2ea648f62b079f4d7d5daa871484f9 --- /dev/null +++ b/egs/vocoder/gan/nsfhifigan/run.sh @@ -0,0 +1,141 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
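+
+# Note: unlike the other GAN recipes here, NSF-HiFiGAN also conditions on frame-level F0
+# (extract_pitch / use_frame_pitch are enabled in exp_config.json). When running
+# --infer_mode infer_from_feature, the feature folder therefore needs an "f0s" subfolder
+# alongside "mels" (see egs/vocoder/gan/README.md).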
+ +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir)))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Parse the Given Parameters from the Commond ########### +options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,checkpoint:,resume_type:,main_process_port:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@") +eval set -- "$options" + +while true; do + case $1 in + # Experimental Configuration File + -c | --config) shift; exp_config=$1 ; shift ;; + # Experimental Name + -n | --name) shift; exp_name=$1 ; shift ;; + # Running Stage + -s | --stage) shift; running_stage=$1 ; shift ;; + # Visible GPU machines. The default value is "0". + --gpu) shift; gpu=$1 ; shift ;; + + # [Only for Training] The specific checkpoint path that you want to resume from. + --checkpoint) shift; checkpoint=$1 ; shift ;; + # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights. + --resume_type) shift; resume_type=$1 ; shift ;; + # [Only for Traiing] `main_process_port` for multi gpu training + --main_process_port) shift; main_process_port=$1 ; shift ;; + + # [Only for Inference] The inference mode + --infer_mode) shift; infer_mode=$1 ; shift ;; + # [Only for Inference] The inferenced datasets + --infer_datasets) shift; infer_datasets=$1 ; shift ;; + # [Only for Inference] The feature dir for inference + --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;; + # [Only for Inference] The audio dir for inference + --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;; + # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]" + --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;; + # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result" + --infer_output_dir) shift; infer_output_dir=$1 ; shift ;; + + --) shift ; break ;; + *) echo "Invalid option: $1" exit 1 ;; + esac +done + + +### Value check ### +if [ -z "$running_stage" ]; then + echo "[Error] Please specify the running stage" + exit 1 +fi + +if [ -z "$exp_config" ]; then + exp_config="${exp_dir}"/exp_config.json +fi +echo "Exprimental Configuration File: $exp_config" + +if [ -z "$gpu" ]; then + gpu="0" +fi + +if [ -z "$main_process_port" ]; then + main_process_port=29500 +fi +echo "Main Process Port: $main_process_port" + +######## Features Extraction ########### +if [ $running_stage -eq 1 ]; then + CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \ + --config $exp_config \ + --num_workers 8 +fi + +######## Training ########### +if [ $running_stage -eq 2 ]; then + if [ -z "$exp_name" ]; then + echo "[Error] Please specify the experiments name" + exit 1 + fi + echo "Exprimental Name: $exp_name" + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch \ + --main_process_port "$main_process_port" \ + "${work_dir}"/bins/vocoder/train.py \ + --config "$exp_config" \ + --exp_name "$exp_name" \ + --log_level info \ + --checkpoint "$checkpoint" \ + --resume_type "$resume_type" +fi + +######## Inference/Conversion ########### +if [ $running_stage -eq 3 ]; then + if [ -z "$infer_expt_dir" ]; then + echo "[Error] Please specify the experimental directionary. 
The value is like [Your path to save logs and checkpoints]/[YourExptName]" + exit 1 + fi + + if [ -z "$infer_output_dir" ]; then + infer_output_dir="$infer_expt_dir/result" + fi + + if [ $infer_mode = "infer_from_dataset" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --infer_datasets $infer_datasets \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_feature" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --feature_folder $infer_feature_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_audio" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --audio_folder $infer_audio_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + +fi \ No newline at end of file diff --git a/egs/vocoder/gan/tfr_enhanced_hifigan/README.md b/egs/vocoder/gan/tfr_enhanced_hifigan/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7b4e3769175953ceaff5d9a2e464138aa8e66a7f --- /dev/null +++ b/egs/vocoder/gan/tfr_enhanced_hifigan/README.md @@ -0,0 +1,197 @@ +# Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fedility Vocoder + +[![arXiv](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/abs/2311.14957) +[![demo](https://img.shields.io/badge/Vocoder-Demo-red)](https://vocodexelysium.github.io/MS-SB-CQTD/) +[![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Models-pink)](https://huggingface.co/amphion/hifigan_speech_bigdata) + +
+[Figure: overview of the Multi-Scale Sub-Band Constant-Q Transform Discriminator (MS-SB-CQTD) for vocoder training]
+ +This is the official implementation of the paper "[Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder](https://arxiv.org/abs/2311.14957)". In this recipe, we will illustrate how to train a high quality HiFi-GAN on LibriTTS, VCTK and LJSpeech via utilizing multiple Time-Frequency-Representation-based Discriminators. + +There are four stages in total: + +1. Data preparation +2. Feature extraction +3. Training +4. Inference + +> **NOTE:** You need to run every command of this recipe in the `Amphion` root path: +> ```bash +> cd Amphion +> ``` + +## 1. Data Preparation + +### Dataset Download + +By default, we utilize the three datasets for training: LibriTTS, VCTK and LJSpeech. How to download them is detailed in [here](../../../datasets/README.md). + +### Configuration + +Specify the dataset path in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets. + +```json +"dataset": [ + "ljspeech", + "vctk", + "libritts", +], +"dataset_path": { + // TODO: Fill in your dataset path + "ljspeech": "[LJSpeech dataset path]", + "vctk": "[VCTK dataset path]", + "libritts": "[LibriTTS dataset path]", +}, +``` + +## 2. Features Extraction + +For HiFiGAN, only the Mel-Spectrogram and the Output Audio are needed for training. + +### Configuration + +Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`: + +```json + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/vocoder" + "log_dir": "ckpts/vocoder", + "preprocess": { + // TODO: Fill in the output data path. The default value is "Amphion/data" + "processed_dir": "data", + ... + }, +``` + +### Run + +Run the `run.sh` as the preproces stage (set `--stage 1`). + +```bash +sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 1 +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "1"`. + +## 3. Training + +### Configuration + +We provide the default hyparameters in the `exp_config.json`. They can work on single NVIDIA-24g GPU. You can adjust them based on you GPU machines. + +```json +"train": { + "batch_size": 32, + ... +} +``` + +### Run + +Run the `run.sh` as the training stage (set `--stage 2`). Specify a experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/vocoder/[YourExptName]`. + +```bash +sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 2 --name [YourExptName] +``` + +> **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "0,1,2,3"`. + +If you want to resume or finetune from a pretrained model, run: + +```bash +sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 2 \ + --name [YourExptName] \ + --resume_type ["resume" for resuming training and "finetune" for loading parameters only] \ + --checkpoint Amphion/ckpts/vocoder/[YourExptName]/checkpoint \ +``` + +> **NOTE:** For multi-gpu training, the `main_process_port` is set as `29500` in default. You can change it when running `run.sh` by specifying such as `--main_process_port 29501`. + +## 4. Inference + +### Pretrained Vocoder Download + +We trained a HiFiGAN checkpoint with around 685 hours Speech data. The final pretrained checkpoint is released [here](../../../../pretrained/hifigan/README.md). 
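+
+As a quick sanity check with the downloaded pretrained vocoder, you can run analysis-synthesis directly on a folder of audios (a usage sketch; point `--infer_expt_dir` to wherever you placed the pretrained checkpoint directory):
+
+```bash
+sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
+    --infer_mode infer_from_audio \
+    --infer_audio_dir [Your path to your audio files] \
+    --infer_expt_dir [Your path to the downloaded pretrained vocoder] \
+    --infer_output_dir [Your path to save the synthesized audios]
+```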
+ +### Run + +Run `run.sh` with the inference stage (set `--stage 3`). We provide three different inference modes: `infer_from_dataset`, `infer_from_feature`, and `infer_from_audio`. + +```bash +sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \ + --infer_mode [Your chosen inference mode] \ + --infer_datasets [Datasets you want to infer from, needed when infer_from_dataset] \ + --infer_feature_dir [Your path to your predicted acoustic features, needed when infer_from_feature] \ + --infer_audio_dir [Your path to your audio files, needed when infer_from_audio] \ + --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \ + --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \ +``` + +#### a. Inference from Dataset + +Run `run.sh` with the specified datasets; here is an example. + +```bash +sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \ + --infer_mode infer_from_dataset \ + --infer_datasets "libritts vctk ljspeech" \ + --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \ + --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \ +``` + +#### b. Inference from Features + +If you want to run inference from your generated acoustic features, first organize them into the following structure: + +```plaintext + ┣ {infer_feature_dir} + ┃ ┣ mels + ┃ ┃ ┣ sample1.npy + ┃ ┃ ┣ sample2.npy +``` + +Then run `run.sh` with the specified folder path; here is an example. + +```bash +sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \ + --infer_mode infer_from_feature \ + --infer_feature_dir [Your path to your predicted acoustic features] \ + --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \ + --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \ +``` + +#### c. Inference from Audios + +If you want to run inference from audio files for a quick analysis-synthesis check, first organize them into the following structure: + +```plaintext + ┣ audios + ┃ ┣ sample1.wav + ┃ ┣ sample2.wav +``` + +Then run `run.sh` with the specified folder path; here is an example. + +```bash +sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \ + --infer_mode infer_from_audio \ + --infer_audio_dir [Your path to your audio files] \ + --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \ + --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \ +``` + +## Citations + +```bibtex +@misc{gu2023cqt, + title={Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder}, + author={Yicheng Gu and Xueyao Zhang and Liumeng Xue and Zhizheng Wu}, + year={2023}, + eprint={2311.14957}, + archivePrefix={arXiv}, + primaryClass={cs.SD} +} +``` \ No newline at end of file diff --git a/egs/vocoder/gan/tfr_enhanced_hifigan/exp_config.json b/egs/vocoder/gan/tfr_enhanced_hifigan/exp_config.json new file mode 100644 index 0000000000000000000000000000000000000000..854c575aace19e0da5991e2a29321fb8c07a76ca --- /dev/null +++ b/egs/vocoder/gan/tfr_enhanced_hifigan/exp_config.json @@ -0,0 +1,118 @@ +{ + "base_config": "egs/vocoder/gan/exp_config_base.json", + "model_type": "GANVocoder", + "dataset": [ + "ljspeech", + "vctk", + "libritts", + ], + "dataset_path": { + // TODO: Fill in your dataset path + "ljspeech": "[dataset path]", + "vctk": "[dataset path]", + "libritts": "[dataset path]", + }, + // TODO: Fill in the output log path. The default value is "Amphion/ckpts/vocoder" + "log_dir": "ckpts/vocoder", + "preprocess": { + // TODO: Fill in the output data path.
The default value is "Amphion/data" + "processed_dir": "data", + // acoustic features + "extract_mel": true, + "extract_audio": true, + "extract_pitch": false, + "extract_uv": false, + "extract_amplitude_phase": false, + "pitch_extractor": "parselmouth", + // Features used for model training + "use_mel": true, + "use_frame_pitch": false, + "use_uv": false, + "use_audio": true, + "n_mel": 100, + "sample_rate": 24000 + }, + "model": { + "generator": "hifigan", + "discriminators": [ + "msd", + "mpd", + "mssbcqtd", + "msstftd", + ], + "hifigan": { + "resblock": "1", + "upsample_rates": [ + 8, + 4, + 2, + 2, + 2 + ], + "upsample_kernel_sizes": [ + 16, + 8, + 4, + 4, + 4 + ], + "upsample_initial_channel": 768, + "resblock_kernel_sizes": [ + 3, + 5, + 7 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ] + }, + "mpd": { + "mpd_reshapes": [ + 2, + 3, + 5, + 7, + 11, + 17, + 23, + 37 + ], + "use_spectral_norm": false, + "discriminator_channel_multi": 1 + } + }, + "train": { + "batch_size": 32, + "adamw": { + "lr": 2.0e-4, + "adam_b1": 0.8, + "adam_b2": 0.99 + }, + "exponential_lr": { + "lr_decay": 0.999 + }, + "criterions": [ + "feature", + "discriminator", + "generator", + "mel", + ] + }, + "inference": { + "batch_size": 1, + } +} \ No newline at end of file diff --git a/egs/vocoder/gan/tfr_enhanced_hifigan/run.sh b/egs/vocoder/gan/tfr_enhanced_hifigan/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..9d7a29276b2ea648f62b079f4d7d5daa871484f9 --- /dev/null +++ b/egs/vocoder/gan/tfr_enhanced_hifigan/run.sh @@ -0,0 +1,141 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +######## Build Experiment Environment ########### +exp_dir=$(cd `dirname $0`; pwd) +work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir)))) + +export WORK_DIR=$work_dir +export PYTHONPATH=$work_dir +export PYTHONIOENCODING=UTF-8 + +######## Parse the Given Parameters from the Commond ########### +options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,checkpoint:,resume_type:,main_process_port:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@") +eval set -- "$options" + +while true; do + case $1 in + # Experimental Configuration File + -c | --config) shift; exp_config=$1 ; shift ;; + # Experimental Name + -n | --name) shift; exp_name=$1 ; shift ;; + # Running Stage + -s | --stage) shift; running_stage=$1 ; shift ;; + # Visible GPU machines. The default value is "0". + --gpu) shift; gpu=$1 ; shift ;; + + # [Only for Training] The specific checkpoint path that you want to resume from. + --checkpoint) shift; checkpoint=$1 ; shift ;; + # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights. 
+ --resume_type) shift; resume_type=$1 ; shift ;; + # [Only for Training] `main_process_port` for multi-GPU training + --main_process_port) shift; main_process_port=$1 ; shift ;; + + # [Only for Inference] The inference mode + --infer_mode) shift; infer_mode=$1 ; shift ;; + # [Only for Inference] The datasets to run inference on + --infer_datasets) shift; infer_datasets=$1 ; shift ;; + # [Only for Inference] The feature dir for inference + --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;; + # [Only for Inference] The audio dir for inference + --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;; + # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]" + --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;; + # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result" + --infer_output_dir) shift; infer_output_dir=$1 ; shift ;; + + --) shift ; break ;; + *) echo "Invalid option: $1"; exit 1 ;; + esac +done + + +### Value check ### +if [ -z "$running_stage" ]; then + echo "[Error] Please specify the running stage" + exit 1 +fi + +if [ -z "$exp_config" ]; then + exp_config="${exp_dir}"/exp_config.json +fi +echo "Experimental Configuration File: $exp_config" + +if [ -z "$gpu" ]; then + gpu="0" +fi + +if [ -z "$main_process_port" ]; then + main_process_port=29500 +fi +echo "Main Process Port: $main_process_port" + +######## Features Extraction ########### +if [ $running_stage -eq 1 ]; then + CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \ + --config $exp_config \ + --num_workers 8 +fi + +######## Training ########### +if [ $running_stage -eq 2 ]; then + if [ -z "$exp_name" ]; then + echo "[Error] Please specify the experiment name" + exit 1 + fi + echo "Experimental Name: $exp_name" + + CUDA_VISIBLE_DEVICES=$gpu accelerate launch \ + --main_process_port "$main_process_port" \ + "${work_dir}"/bins/vocoder/train.py \ + --config "$exp_config" \ + --exp_name "$exp_name" \ + --log_level info \ + --checkpoint "$checkpoint" \ + --resume_type "$resume_type" +fi + +######## Inference/Conversion ########### +if [ $running_stage -eq 3 ]; then + if [ -z "$infer_expt_dir" ]; then + echo "[Error] Please specify the experiment directory.
The value is like [Your path to save logs and checkpoints]/[YourExptName]" + exit 1 + fi + + if [ -z "$infer_output_dir" ]; then + infer_output_dir="$infer_expt_dir/result" + fi + + if [ $infer_mode = "infer_from_dataset" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --infer_datasets $infer_datasets \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_feature" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --feature_folder $infer_feature_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + + if [ $infer_mode = "infer_from_audio" ]; then + CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \ + --config $exp_config \ + --infer_mode $infer_mode \ + --audio_folder $infer_audio_dir \ + --vocoder_dir $infer_expt_dir \ + --output_dir $infer_output_dir \ + --log_level debug + fi + +fi \ No newline at end of file diff --git a/env.sh b/env.sh new file mode 100644 index 0000000000000000000000000000000000000000..9da2113ba570ad71f184bdc3ecff0ea38804c8de --- /dev/null +++ b/env.sh @@ -0,0 +1,34 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +# Raise error if any command fails +set -e + +# Install ffmpeg in Linux +conda install -c conda-forge ffmpeg + +# Pip packages +pip install setuptools ruamel.yaml tqdm colorama easydict tabulate loguru json5 Cython unidecode inflect argparse g2p_en tgt librosa==0.9.1 matplotlib typeguard einops omegaconf hydra-core humanfriendly pandas munch + +pip install tensorboard tensorboardX torch==2.0.1 torchaudio==2.0.2 torchvision==0.15.2 accelerate==0.24.1 transformers==4.41.2 diffusers praat-parselmouth audiomentations pedalboard ffmpeg-python==0.2.0 pyworld diffsptk==1.0.1 nnAudio unidecode inflect ptwt + +pip install encodec vocos speechtokenizer g2p_en descript-audio-codec + +pip install torchmetrics pymcd openai-whisper frechet_audio_distance asteroid resemblyzer vector-quantize-pytorch==1.12.5 + +pip install https://github.com/vBaiCai/python-pesq/archive/master.zip + +pip install fairseq + +pip install git+https://github.com/lhotse-speech/lhotse + +pip install -U encodec + +pip install phonemizer==3.2.1 pypinyin==0.48.0 + +pip install black==24.1.1 + +# Uninstall nvidia-cublas-cu11 if there exist some bugs about CUDA version +# pip uninstall nvidia-cublas-cu11 diff --git a/evaluation/__init__.py b/evaluation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/evaluation/features/__init__.py b/evaluation/features/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/evaluation/features/long_term_average_spectrum.py b/evaluation/features/long_term_average_spectrum.py new file mode 100644 index 0000000000000000000000000000000000000000..b6724eecf0748a25e1b9afeae853064e2f26fa1e --- /dev/null +++ b/evaluation/features/long_term_average_spectrum.py @@ -0,0 +1,19 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import librosa +from scipy import signal + + +def extract_ltas(audio, fs=None, n_fft=1024, hop_length=256): + """Extract Long-Term Average Spectrum for a given audio.""" + if fs != None: + y, _ = librosa.load(audio, sr=fs) + else: + y, fs = librosa.load(audio) + frequency, density = signal.welch( + x=y, fs=fs, window="hann", nperseg=hop_length, nfft=n_fft + ) + return frequency, density diff --git a/evaluation/features/signal_to_noise_ratio.py b/evaluation/features/signal_to_noise_ratio.py new file mode 100644 index 0000000000000000000000000000000000000000..42abf0e1e721a680029fc57c743dfe06c3629fe9 --- /dev/null +++ b/evaluation/features/signal_to_noise_ratio.py @@ -0,0 +1,133 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import numpy as np +import scipy.signal as sig +import copy +import librosa + + +def bandpower(ps, mode="time"): + """ + estimate bandpower, see https://de.mathworks.com/help/signal/ref/bandpower.html + """ + if mode == "time": + x = ps + l2norm = np.linalg.norm(x) ** 2.0 / len(x) + return l2norm + elif mode == "psd": + return sum(ps) + + +def getIndizesAroundPeak(arr, peakIndex, searchWidth=1000): + peakBins = [] + magMax = arr[peakIndex] + curVal = magMax + for i in range(searchWidth): + newBin = peakIndex + i + if newBin >= len(arr): + break + newVal = arr[newBin] + if newVal > curVal: + break + else: + peakBins.append(int(newBin)) + curVal = newVal + curVal = magMax + for i in range(searchWidth): + newBin = peakIndex - i + if newBin < 0: + break + newVal = arr[newBin] + if newVal > curVal: + break + else: + peakBins.append(int(newBin)) + curVal = newVal + return np.array(list(set(peakBins))) + + +def freqToBin(fAxis, Freq): + return np.argmin(abs(fAxis - Freq)) + + +def getPeakInArea(psd, faxis, estimation, searchWidthHz=10): + """ + returns bin and frequency of the maximum in an area + """ + binLow = freqToBin(faxis, estimation - searchWidthHz) + binHi = freqToBin(faxis, estimation + searchWidthHz) + peakbin = binLow + np.argmax(psd[binLow : binHi + 1]) + return peakbin, faxis[peakbin] + + +def getHarmonics(fund, sr, nHarmonics=6, aliased=False): + harmonicMultipliers = np.arange(2, nHarmonics + 2) + harmonicFs = fund * harmonicMultipliers + if not aliased: + harmonicFs[harmonicFs > sr / 2] = -1 + harmonicFs = np.delete(harmonicFs, harmonicFs == -1) + else: + nyqZone = np.floor(harmonicFs / (sr / 2)) + oddEvenNyq = nyqZone % 2 + harmonicFs = np.mod(harmonicFs, sr / 2) + harmonicFs[oddEvenNyq == 1] = (sr / 2) - harmonicFs[oddEvenNyq == 1] + return harmonicFs + + +def extract_snr(audio, sr=None): + """Extract Signal-to-Noise Ratio for a given audio.""" + if sr != None: + audio, _ = librosa.load(audio, sr=sr) + else: + audio, sr = librosa.load(audio, sr=sr) + faxis, ps = sig.periodogram( + audio, fs=sr, window=("kaiser", 38) + ) # get periodogram, parametrized like in matlab + fundBin = np.argmax( + ps + ) # estimate fundamental at maximum amplitude, get the bin number + fundIndizes = getIndizesAroundPeak( + ps, fundBin + ) # get bin numbers around fundamental peak + fundFrequency = faxis[fundBin] # frequency of fundamental + + nHarmonics = 18 + harmonicFs = getHarmonics( + fundFrequency, sr, nHarmonics=nHarmonics, aliased=True + ) # get harmonic frequencies + + harmonicBorders = np.zeros([2, nHarmonics], dtype=np.int16).T + fullHarmonicBins = np.array([], dtype=np.int16) + fullHarmonicBinList = [] + harmPeakFreqs = [] + harmPeaks = [] + for i, 
harmonic in enumerate(harmonicFs): + searcharea = 0.1 * fundFrequency + estimation = harmonic + + binNum, freq = getPeakInArea(ps, faxis, estimation, searcharea) + harmPeakFreqs.append(freq) + harmPeaks.append(ps[binNum]) + allBins = getIndizesAroundPeak(ps, binNum, searchWidth=1000) + fullHarmonicBins = np.append(fullHarmonicBins, allBins) + fullHarmonicBinList.append(allBins) + harmonicBorders[i, :] = [allBins[0], allBins[-1]] + + fundIndizes.sort() + pFund = bandpower(ps[fundIndizes[0] : fundIndizes[-1]]) # get power of fundamental + + noisePrepared = copy.copy(ps) + noisePrepared[fundIndizes] = 0 + noisePrepared[fullHarmonicBins] = 0 + noiseMean = np.median(noisePrepared[noisePrepared != 0]) + noisePrepared[fundIndizes] = noiseMean + noisePrepared[fullHarmonicBins] = noiseMean + + noisePower = bandpower(noisePrepared) + + r = 10 * np.log10(pFund / noisePower) + + return r, 10 * np.log10(noisePower) diff --git a/evaluation/features/singing_power_ratio.py b/evaluation/features/singing_power_ratio.py new file mode 100644 index 0000000000000000000000000000000000000000..600510167b7b6e4972b047a58bae15651bcb477a --- /dev/null +++ b/evaluation/features/singing_power_ratio.py @@ -0,0 +1,108 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import librosa + +from utils.util import JsonHParams +from utils.f0 import get_f0_features_using_parselmouth, get_pitch_sub_median +from utils.mel import extract_mel_features + + +def extract_spr( + audio, + fs=None, + hop_length=256, + win_length=1024, + n_fft=1024, + n_mels=128, + f0_min=37, + f0_max=1000, + pitch_bin=256, + pitch_max=1100.0, + pitch_min=50.0, +): + """Compute Singing Power Ratio (SPR) from a given audio. + audio: path to the audio. + fs: sampling rate. + hop_length: hop length. + win_length: window length. + n_mels: number of mel filters. + f0_min: lower limit for f0. + f0_max: upper limit for f0. + pitch_bin: number of bins for f0 quantization. + pitch_max: upper limit for f0 quantization. + pitch_min: lower limit for f0 quantization. 
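+
+    Example (an illustrative call; "song.wav" is a placeholder path):
+        spr = extract_spr("song.wav", fs=24000)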
+ """ + # Load audio + if fs != None: + audio, _ = librosa.load(audio, sr=fs) + else: + audio, fs = librosa.load(audio) + audio = torch.from_numpy(audio) + + # Initialize config + cfg = JsonHParams() + cfg.sample_rate = fs + cfg.hop_size = hop_length + cfg.win_size = win_length + cfg.n_fft = n_fft + cfg.n_mel = n_mels + cfg.f0_min = f0_min + cfg.f0_max = f0_max + cfg.pitch_bin = pitch_bin + cfg.pitch_max = pitch_max + cfg.pitch_min = pitch_min + + # Extract mel spectrograms + + cfg.fmin = 2000 + cfg.fmax = 4000 + + mel1 = extract_mel_features( + y=audio.unsqueeze(0), + cfg=cfg, + ).squeeze(0) + + cfg.fmin = 0 + cfg.fmax = 2000 + + mel2 = extract_mel_features( + y=audio.unsqueeze(0), + cfg=cfg, + ).squeeze(0) + + f0 = get_f0_features_using_parselmouth( + audio, + cfg, + ) + + # Mel length alignment + length = min(len(f0), mel1.shape[-1]) + f0 = f0[:length] + mel1 = mel1[:, :length] + mel2 = mel2[:, :length] + + # Compute SPR + res = [] + + for i in range(mel1.shape[-1]): + if f0[i] <= 1: + continue + + chunk1 = mel1[:, i] + chunk2 = mel2[:, i] + + max1 = max(chunk1.numpy()) + max2 = max(chunk2.numpy()) + + tmp_res = max2 - max1 + + res.append(tmp_res) + + if len(res) == 0: + return False + else: + return sum(res) / len(res) diff --git a/evaluation/metrics/__init__.py b/evaluation/metrics/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/evaluation/metrics/energy/__init__.py b/evaluation/metrics/energy/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/evaluation/metrics/energy/energy_pearson_coefficients.py b/evaluation/metrics/energy/energy_pearson_coefficients.py new file mode 100644 index 0000000000000000000000000000000000000000..55df77e95142bc997d0a883637ffbcc1fc4c9c37 --- /dev/null +++ b/evaluation/metrics/energy/energy_pearson_coefficients.py @@ -0,0 +1,101 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math +import librosa +import torch + +import numpy as np +from numpy import linalg as LA + +from torchmetrics import PearsonCorrCoef + + +def extract_energy_pearson_coeffcients( + audio_ref, + audio_deg, + n_fft=1024, + hop_length=256, + win_length=1024, + **kwargs, +): + """Compute Energy Pearson Coefficients between the predicted and the ground truth audio. + audio_ref: path to the ground truth audio. + audio_deg: path to the predicted audio. + fs: sampling rate. + n_fft: fft size. + hop_length: hop length. + win_length: window length. + method: "dtw" will use dtw algorithm to align the length of the ground truth and predicted audio. + "cut" will cut both audios into a same length according to the one with the shorter length. + db_scale: the ground truth and predicted audio will be converted to db_scale if "True". 
+ """ + # Load hyperparameters + kwargs = kwargs["kwargs"] + fs = kwargs["fs"] + method = kwargs["method"] + db_scale = kwargs["db_scale"] + + # Initialize method + pearson = PearsonCorrCoef() + + # Load audio + if fs != None: + audio_ref, _ = librosa.load(audio_ref, sr=fs) + audio_deg, _ = librosa.load(audio_deg, sr=fs) + else: + audio_ref, fs = librosa.load(audio_ref) + audio_deg, fs = librosa.load(audio_deg) + + # STFT + spec_ref = librosa.stft( + y=audio_ref, n_fft=n_fft, hop_length=hop_length, win_length=win_length + ) + spec_deg = librosa.stft( + y=audio_deg, n_fft=n_fft, hop_length=hop_length, win_length=win_length + ) + + # Get magnitudes + mag_ref = np.abs(spec_ref).T + mag_deg = np.abs(spec_deg).T + + # Convert spectrogram to energy + energy_ref = LA.norm(mag_ref, axis=1) + energy_deg = LA.norm(mag_deg, axis=1) + + # Convert to db_scale + if db_scale: + energy_ref = 20 * np.log10(energy_ref) + energy_deg = 20 * np.log10(energy_deg) + + # Audio length alignment + if method == "cut": + length = min(len(energy_ref), len(energy_deg)) + energy_ref = energy_ref[:length] + energy_deg = energy_deg[:length] + elif method == "dtw": + _, wp = librosa.sequence.dtw(energy_ref, energy_deg, backtrack=True) + energy_gt_new = [] + energy_pred_new = [] + for i in range(wp.shape[0]): + gt_index = wp[i][0] + pred_index = wp[i][1] + energy_gt_new.append(energy_ref[gt_index]) + energy_pred_new.append(energy_deg[pred_index]) + energy_ref = np.array(energy_gt_new) + energy_deg = np.array(energy_pred_new) + assert len(energy_ref) == len(energy_deg) + + # Convert to tensor + energy_ref = torch.from_numpy(energy_ref) + energy_deg = torch.from_numpy(energy_deg) + + if torch.cuda.is_available(): + device = torch.device("cuda") + energy_ref = energy_ref.to(device) + energy_deg = energy_deg.to(device) + pearson = pearson.to(device) + + return pearson(energy_ref, energy_deg).detach().cpu().numpy().tolist() diff --git a/evaluation/metrics/energy/energy_rmse.py b/evaluation/metrics/energy/energy_rmse.py new file mode 100644 index 0000000000000000000000000000000000000000..92e1f67e0ab3aab4252b21d050d9cdf1f885e062 --- /dev/null +++ b/evaluation/metrics/energy/energy_rmse.py @@ -0,0 +1,90 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math +import librosa +import torch + +import numpy as np +from numpy import linalg as LA + + +def extract_energy_rmse( + audio_ref, + audio_deg, + n_fft=1024, + hop_length=256, + win_length=1024, + **kwargs, +): + """Compute Energy Root Mean Square Error (RMSE) between the predicted and the ground truth audio. + audio_ref: path to the ground truth audio. + audio_deg: path to the predicted audio. + fs: sampling rate. + n_fft: fft size. + hop_length: hop length. + win_length: window length. + method: "dtw" will use dtw algorithm to align the length of the ground truth and predicted audio. + "cut" will cut both audios into a same length according to the one with the shorter length. + db_scale: the ground truth and predicted audio will be converted to db_scale if "True". 
+ """ + # Load hyperparameters + kwargs = kwargs["kwargs"] + fs = kwargs["fs"] + method = kwargs["method"] + db_scale = kwargs["db_scale"] + + # Load audio + if fs != None: + audio_ref, _ = librosa.load(audio_ref, sr=fs) + audio_deg, _ = librosa.load(audio_deg, sr=fs) + else: + audio_ref, fs = librosa.load(audio_ref) + audio_deg, fs = librosa.load(audio_deg) + + # STFT + spec_ref = librosa.stft( + y=audio_ref, n_fft=n_fft, hop_length=hop_length, win_length=win_length + ) + spec_deg = librosa.stft( + y=audio_deg, n_fft=n_fft, hop_length=hop_length, win_length=win_length + ) + + # Get magnitudes + mag_ref = np.abs(spec_ref).T + mag_deg = np.abs(spec_deg).T + + # Convert spectrogram to energy + energy_ref = LA.norm(mag_ref, axis=1) + energy_deg = LA.norm(mag_deg, axis=1) + + # Convert to db_scale + if db_scale: + energy_ref = 20 * np.log10(energy_ref) + energy_deg = 20 * np.log10(energy_deg) + + # Audio length alignment + if method == "cut": + length = min(len(energy_ref), len(energy_deg)) + energy_ref = energy_ref[:length] + energy_deg = energy_deg[:length] + elif method == "dtw": + _, wp = librosa.sequence.dtw(energy_ref, energy_deg, backtrack=True) + energy_gt_new = [] + energy_pred_new = [] + for i in range(wp.shape[0]): + gt_index = wp[i][0] + pred_index = wp[i][1] + energy_gt_new.append(energy_ref[gt_index]) + energy_pred_new.append(energy_deg[pred_index]) + energy_ref = np.array(energy_gt_new) + energy_deg = np.array(energy_pred_new) + assert len(energy_ref) == len(energy_deg) + + # Compute RMSE + energy_mse = np.square(np.subtract(energy_ref, energy_deg)).mean() + energy_rmse = math.sqrt(energy_mse) + + return energy_rmse diff --git a/evaluation/metrics/f0/__init__.py b/evaluation/metrics/f0/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/evaluation/metrics/f0/f0_pearson_coefficients.py b/evaluation/metrics/f0/f0_pearson_coefficients.py new file mode 100644 index 0000000000000000000000000000000000000000..6ab3c065e7c712ef2666f998e902f7498e7915ad --- /dev/null +++ b/evaluation/metrics/f0/f0_pearson_coefficients.py @@ -0,0 +1,118 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import librosa + +import numpy as np + +from torchmetrics import PearsonCorrCoef + +from utils.util import JsonHParams +from utils.f0 import get_f0_features_using_parselmouth, get_pitch_sub_median + + +def extract_fpc( + audio_ref, + audio_deg, + hop_length=256, + f0_min=50, + f0_max=1100, + **kwargs, +): + """Compute F0 Pearson Distance (FPC) between the predicted and the ground truth audio. + audio_ref: path to the ground truth audio. + audio_deg: path to the predicted audio. + fs: sampling rate. + hop_length: hop length. + f0_min: lower limit for f0. + f0_max: upper limit for f0. + pitch_bin: number of bins for f0 quantization. + pitch_max: upper limit for f0 quantization. + pitch_min: lower limit for f0 quantization. + need_mean: subtract the mean value from f0 if "True". + method: "dtw" will use dtw algorithm to align the length of the ground truth and predicted audio. + "cut" will cut both audios into a same length according to the one with the shorter length. 
+ """ + # Load hyperparameters + kwargs = kwargs["kwargs"] + fs = kwargs["fs"] + method = kwargs["method"] + need_mean = kwargs["need_mean"] + + # Initialize method + pearson = PearsonCorrCoef() + + # Load audio + if fs != None: + audio_ref, _ = librosa.load(audio_ref, sr=fs) + audio_deg, _ = librosa.load(audio_deg, sr=fs) + else: + audio_ref, fs = librosa.load(audio_ref) + audio_deg, fs = librosa.load(audio_deg) + + # Initialize config + cfg = JsonHParams() + cfg.sample_rate = fs + cfg.hop_size = hop_length + cfg.f0_min = f0_min + cfg.f0_max = f0_max + cfg.pitch_bin = 256 + cfg.pitch_max = f0_max + cfg.pitch_min = f0_min + + # Compute f0 + f0_ref = get_f0_features_using_parselmouth( + audio_ref, + cfg, + ) + + f0_deg = get_f0_features_using_parselmouth( + audio_deg, + cfg, + ) + + # Subtract mean value from f0 + if need_mean: + f0_ref = torch.from_numpy(f0_ref) + f0_deg = torch.from_numpy(f0_deg) + + f0_ref = get_pitch_sub_median(f0_ref).numpy() + f0_deg = get_pitch_sub_median(f0_deg).numpy() + + # Avoid silence + min_length = min(len(f0_ref), len(f0_deg)) + if min_length <= 1: + return 1 + + # F0 length alignment + if method == "cut": + length = min(len(f0_ref), len(f0_deg)) + f0_ref = f0_ref[:length] + f0_deg = f0_deg[:length] + elif method == "dtw": + _, wp = librosa.sequence.dtw(f0_ref, f0_deg, backtrack=True) + f0_gt_new = [] + f0_pred_new = [] + for i in range(wp.shape[0]): + gt_index = wp[i][0] + pred_index = wp[i][1] + f0_gt_new.append(f0_ref[gt_index]) + f0_pred_new.append(f0_deg[pred_index]) + f0_ref = np.array(f0_gt_new) + f0_deg = np.array(f0_pred_new) + assert len(f0_ref) == len(f0_deg) + + # Convert to tensor + f0_ref = torch.from_numpy(f0_ref) + f0_deg = torch.from_numpy(f0_deg) + + if torch.cuda.is_available(): + device = torch.device("cuda") + f0_ref = f0_ref.to(device) + f0_deg = f0_deg.to(device) + pearson = pearson.to(device) + + return pearson(f0_ref, f0_deg).detach().cpu().numpy().tolist() diff --git a/evaluation/metrics/f0/f0_periodicity_rmse.py b/evaluation/metrics/f0/f0_periodicity_rmse.py new file mode 100644 index 0000000000000000000000000000000000000000..3f1db492a59956f7d9782e018c6774fc2abc5d8b --- /dev/null +++ b/evaluation/metrics/f0/f0_periodicity_rmse.py @@ -0,0 +1,121 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torchcrepe +import math +import librosa +import torch + +import numpy as np + + +def extract_f0_periodicity_rmse( + audio_ref, + audio_deg, + hop_length=256, + **kwargs, +): + """Compute f0 periodicity Root Mean Square Error (RMSE) between the predicted and the ground truth audio. + audio_ref: path to the ground truth audio. + audio_deg: path to the predicted audio. + fs: sampling rate. + hop_length: hop length. + method: "dtw" will use dtw algorithm to align the length of the ground truth and predicted audio. + "cut" will cut both audios into a same length according to the one with the shorter length. 
+ """ + # Load hyperparameters + kwargs = kwargs["kwargs"] + fs = kwargs["fs"] + method = kwargs["method"] + + # Load audio + if fs != None: + audio_ref, _ = librosa.load(audio_ref, sr=fs) + audio_deg, _ = librosa.load(audio_deg, sr=fs) + else: + audio_ref, fs = librosa.load(audio_ref) + audio_deg, fs = librosa.load(audio_deg) + + # Convert to torch + audio_ref = torch.from_numpy(audio_ref).unsqueeze(0) + audio_deg = torch.from_numpy(audio_deg).unsqueeze(0) + + if torch.cuda.is_available(): + device = torch.device("cuda") + else: + device = torch.device("cpu") + + # Get periodicity + _, periodicity_ref = torchcrepe.predict( + audio_ref, + sample_rate=fs, + hop_length=hop_length, + fmin=0, + fmax=1500, + model="full", + return_periodicity=True, + device=device, + ) + _, periodicity_deg = torchcrepe.predict( + audio_deg, + sample_rate=fs, + hop_length=hop_length, + fmin=0, + fmax=1500, + model="full", + return_periodicity=True, + device=device, + ) + + # Cut silence + periodicity_ref = ( + torchcrepe.threshold.Silence()( + periodicity_ref, + audio_ref, + fs, + hop_length=hop_length, + ) + .squeeze(0) + .numpy() + ) + periodicity_deg = ( + torchcrepe.threshold.Silence()( + periodicity_deg, + audio_deg, + fs, + hop_length=hop_length, + ) + .squeeze(0) + .numpy() + ) + + # Avoid silence audio + min_length = min(len(periodicity_ref), len(periodicity_deg)) + if min_length <= 1: + return 0 + + # Periodicity length alignment + if method == "cut": + length = min(len(periodicity_ref), len(periodicity_deg)) + periodicity_ref = periodicity_ref[:length] + periodicity_deg = periodicity_deg[:length] + elif method == "dtw": + _, wp = librosa.sequence.dtw(periodicity_ref, periodicity_deg, backtrack=True) + periodicity_ref_new = [] + periodicity_deg_new = [] + for i in range(wp.shape[0]): + ref_index = wp[i][0] + deg_index = wp[i][1] + periodicity_ref_new.append(periodicity_ref[ref_index]) + periodicity_deg_new.append(periodicity_deg[deg_index]) + periodicity_ref = np.array(periodicity_ref_new) + periodicity_deg = np.array(periodicity_deg_new) + assert len(periodicity_ref) == len(periodicity_deg) + + # Compute RMSE + periodicity_mse = np.square(np.subtract(periodicity_ref, periodicity_deg)).mean() + periodicity_rmse = math.sqrt(periodicity_mse) + + return periodicity_rmse diff --git a/evaluation/metrics/f0/f0_rmse.py b/evaluation/metrics/f0/f0_rmse.py new file mode 100644 index 0000000000000000000000000000000000000000..337e9ae3862ba2dd9d0aabfe655ba02bfee18e7d --- /dev/null +++ b/evaluation/metrics/f0/f0_rmse.py @@ -0,0 +1,111 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math +import librosa +import torch + +import numpy as np + +from utils.util import JsonHParams +from utils.f0 import get_f0_features_using_parselmouth, get_pitch_sub_median + + +ZERO = 1e-8 + + +def extract_f0rmse( + audio_ref, + audio_deg, + hop_length=256, + f0_min=50, + f0_max=1100, + **kwargs, +): + """Compute F0 Root Mean Square Error (RMSE) between the predicted and the ground truth audio. + audio_ref: path to the ground truth audio. + audio_deg: path to the predicted audio. + fs: sampling rate. + hop_length: hop length. + f0_min: lower limit for f0. + f0_max: upper limit for f0. + pitch_bin: number of bins for f0 quantization. + pitch_max: upper limit for f0 quantization. + pitch_min: lower limit for f0 quantization. + need_mean: subtract the mean value from f0 if "True". 
+ method: "dtw" will use dtw algorithm to align the length of the ground truth and predicted audio. + "cut" will cut both audios into a same length according to the one with the shorter length. + """ + # Load hyperparameters + kwargs = kwargs["kwargs"] + fs = kwargs["fs"] + method = kwargs["method"] + need_mean = kwargs["need_mean"] + + # Load audio + if fs != None: + audio_ref, _ = librosa.load(audio_ref, sr=fs) + audio_deg, _ = librosa.load(audio_deg, sr=fs) + else: + audio_ref, fs = librosa.load(audio_ref) + audio_deg, fs = librosa.load(audio_deg) + + # Initialize config for f0 extraction + cfg = JsonHParams() + cfg.sample_rate = fs + cfg.hop_size = hop_length + cfg.f0_min = f0_min + cfg.f0_max = f0_max + cfg.pitch_bin = 256 + cfg.pitch_max = f0_max + cfg.pitch_min = f0_min + + # Extract f0 + f0_ref = get_f0_features_using_parselmouth( + audio_ref, + cfg, + ) + + f0_deg = get_f0_features_using_parselmouth( + audio_deg, + cfg, + ) + + # Subtract mean value from f0 + if need_mean: + f0_ref = torch.from_numpy(f0_ref) + f0_deg = torch.from_numpy(f0_deg) + + f0_ref = get_pitch_sub_median(f0_ref).numpy() + f0_deg = get_pitch_sub_median(f0_deg).numpy() + + # Avoid silence + min_length = min(len(f0_ref), len(f0_deg)) + if min_length <= 1: + return 0 + + # F0 length alignment + if method == "cut": + length = min(len(f0_ref), len(f0_deg)) + f0_ref = f0_ref[:length] + f0_deg = f0_deg[:length] + elif method == "dtw": + _, wp = librosa.sequence.dtw(f0_ref, f0_deg, backtrack=True) + f0_gt_new = [] + f0_pred_new = [] + for i in range(wp.shape[0]): + gt_index = wp[i][0] + pred_index = wp[i][1] + f0_gt_new.append(f0_ref[gt_index]) + f0_pred_new.append(f0_deg[pred_index]) + f0_ref = np.array(f0_gt_new) + f0_deg = np.array(f0_pred_new) + assert len(f0_ref) == len(f0_deg) + + # Compute RMSE + f0_mse = np.square(np.subtract(f0_ref, f0_deg)).mean() + f0_rmse = math.sqrt(f0_mse) + + return f0_rmse diff --git a/evaluation/metrics/f0/v_uv_f1.py b/evaluation/metrics/f0/v_uv_f1.py new file mode 100644 index 0000000000000000000000000000000000000000..c81c0c84cc3f341062ab8d6eab9339d39d978959 --- /dev/null +++ b/evaluation/metrics/f0/v_uv_f1.py @@ -0,0 +1,111 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math +import librosa +import torch + +import numpy as np + +from utils.util import JsonHParams +from utils.f0 import get_f0_features_using_parselmouth + + +ZERO = 1e-8 + + +def extract_f1_v_uv( + audio_ref, + audio_deg, + hop_length=256, + f0_min=50, + f0_max=1100, + **kwargs, +): + """Compute F1 socre of voiced/unvoiced accuracy between the predicted and the ground truth audio. + audio_ref: path to the ground truth audio. + audio_deg: path to the predicted audio. + fs: sampling rate. + hop_length: hop length. + f0_min: lower limit for f0. + f0_max: upper limit for f0. + pitch_bin: number of bins for f0 quantization. + pitch_max: upper limit for f0 quantization. + pitch_min: lower limit for f0 quantization. + need_mean: subtract the mean value from f0 if "True". + method: "dtw" will use dtw algorithm to align the length of the ground truth and predicted audio. + "cut" will cut both audios into a same length according to the one with the shorter length. 
+ """ + # Load hyperparameters + kwargs = kwargs["kwargs"] + fs = kwargs["fs"] + method = kwargs["method"] + + # Load audio + if fs != None: + audio_ref, _ = librosa.load(audio_ref, sr=fs) + audio_deg, _ = librosa.load(audio_deg, sr=fs) + else: + audio_ref, fs = librosa.load(audio_ref) + audio_deg, fs = librosa.load(audio_deg) + + # Initialize config + cfg = JsonHParams() + cfg.sample_rate = fs + cfg.hop_size = hop_length + cfg.f0_min = f0_min + cfg.f0_max = f0_max + cfg.pitch_bin = 256 + cfg.pitch_max = f0_max + cfg.pitch_min = f0_min + + # Compute f0 + f0_ref = get_f0_features_using_parselmouth( + audio_ref, + cfg, + ) + + f0_deg = get_f0_features_using_parselmouth( + audio_deg, + cfg, + ) + + # Avoid silence + min_length = min(len(f0_ref), len(f0_deg)) + if min_length <= 1: + return 0, 0, 0 + + # F0 length alignment + if method == "cut": + length = min(len(f0_ref), len(f0_deg)) + f0_ref = f0_ref[:length] + f0_deg = f0_deg[:length] + elif method == "dtw": + _, wp = librosa.sequence.dtw(f0_ref, f0_deg, backtrack=True) + f0_gt_new = [] + f0_pred_new = [] + for i in range(wp.shape[0]): + gt_index = wp[i][0] + pred_index = wp[i][1] + f0_gt_new.append(f0_ref[gt_index]) + f0_pred_new.append(f0_deg[pred_index]) + f0_ref = np.array(f0_gt_new) + f0_deg = np.array(f0_pred_new) + assert len(f0_ref) == len(f0_deg) + + # Get voiced/unvoiced parts + ref_voiced = torch.Tensor([f0_ref != 0]).bool() + deg_voiced = torch.Tensor([f0_deg != 0]).bool() + + # Compute TP, FP, FN + true_postives = (ref_voiced & deg_voiced).sum() + false_postives = (~ref_voiced & deg_voiced).sum() + false_negatives = (ref_voiced & ~deg_voiced).sum() + + return ( + true_postives.detach().cpu().numpy().tolist(), + false_postives.detach().cpu().numpy().tolist(), + false_negatives.detach().cpu().numpy().tolist(), + ) diff --git a/evaluation/metrics/intelligibility/__init__.py b/evaluation/metrics/intelligibility/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/evaluation/metrics/intelligibility/character_error_rate.py b/evaluation/metrics/intelligibility/character_error_rate.py new file mode 100644 index 0000000000000000000000000000000000000000..a9403d1564a00ecc99cef9aa82f293101beda077 --- /dev/null +++ b/evaluation/metrics/intelligibility/character_error_rate.py @@ -0,0 +1,81 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch + +from torchmetrics import CharErrorRate + + +def extract_cer( + model, + **kwargs, +): + """Compute Character Error Rate (CER) between the predicted and the ground truth audio. + content_gt: the ground truth content. + audio_ref: path to the ground truth audio. + audio_deg: path to the predicted audio. + mode: "gt_content" computes the CER between the predicted content obtained from the whisper model and the ground truth content. + both content_gt and audio_deg are needed. + "gt_audio" computes the CER between the extracted ground truth and predicted contents obtained from the whisper model. + both audio_ref and audio_deg are needed. 
+ """ + kwargs = kwargs["kwargs"] + mode = kwargs["intelligibility_mode"] + language = kwargs["language"] + cer = CharErrorRate() + + if torch.cuda.is_available(): + device = torch.device("cuda") + cer = cer.to(device) + + # Get ground truth content + if mode == "gt_content": + content_gt = kwargs["content_gt"] + audio_deg = kwargs["audio_deg"] + + if language == "chinese": + prompt = "以下是普通话的句子" + result_deg = model.transcribe( + audio_deg, language="zh", verbose=True, initial_prompt=prompt + ) + else: + result_deg = model.transcribe(audio_deg, verbose=True) + elif mode == "gt_audio": + audio_ref = kwargs["audio_ref"] + audio_deg = kwargs["audio_deg"] + + if language == "chinese": + prompt = "以下是普通话的句子" + result_ref = model.transcribe( + audio_ref, language="zh", verbose=True, initial_prompt=prompt + ) + result_deg = model.transcribe( + audio_deg, language="zh", verbose=True, initial_prompt=prompt + ) + else: + result_ref = model.transcribe(audio_deg, verbose=True) + result_deg = model.transcribe(audio_deg, verbose=True) + + content_gt = result_ref["text"] + + content_gt = content_gt.replace(" ", "") + content_gt = content_gt.replace(".", "") + content_gt = content_gt.replace("'", "") + content_gt = content_gt.replace("-", "") + content_gt = content_gt.replace(",", "") + content_gt = content_gt.replace("!", "") + content_gt = content_gt.lower() + + # Get predicted truth content + content_pred = result_deg["text"] + content_pred = content_pred.replace(" ", "") + content_pred = content_pred.replace(".", "") + content_pred = content_pred.replace("'", "") + content_pred = content_pred.replace("-", "") + content_pred = content_pred.replace(",", "") + content_pred = content_pred.replace("!", "") + content_pred = content_pred.lower() + + return cer(content_pred, content_gt).detach().cpu().numpy().tolist() diff --git a/evaluation/metrics/intelligibility/word_error_rate.py b/evaluation/metrics/intelligibility/word_error_rate.py new file mode 100644 index 0000000000000000000000000000000000000000..e13ed0db0a66dcd0fb533ab16b264b4e9f28b172 --- /dev/null +++ b/evaluation/metrics/intelligibility/word_error_rate.py @@ -0,0 +1,81 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch + +from torchmetrics import WordErrorRate + + +def extract_wer( + model, + **kwargs, +): + """Compute Word Error Rate (WER) between the predicted and the ground truth audio. + content_gt: the ground truth content. + audio_ref: path to the ground truth audio. + audio_deg: path to the predicted audio. + mode: "gt_content" computes the WER between the predicted content obtained from the whisper model and the ground truth content. + both content_gt and audio_deg are needed. + "gt_audio" computes the WER between the extracted ground truth and predicted contents obtained from the whisper model. + both audio_ref and audio_deg are needed. 
+ """ + kwargs = kwargs["kwargs"] + mode = kwargs["intelligibility_mode"] + language = kwargs["language"] + wer = WordErrorRate() + + if torch.cuda.is_available(): + device = torch.device("cuda") + wer = wer.to(device) + + # Get ground truth content + if mode == "gt_content": + content_gt = kwargs["content_gt"] + audio_deg = kwargs["audio_deg"] + + if language == "chinese": + prompt = "以下是普通话的句子" + result_deg = model.transcribe( + audio_deg, language="zh", verbose=True, initial_prompt=prompt + ) + else: + result_deg = model.transcribe(audio_deg, verbose=True) + elif mode == "gt_audio": + audio_ref = kwargs["audio_ref"] + audio_deg = kwargs["audio_deg"] + + if language == "chinese": + prompt = "以下是普通话的句子" + result_ref = model.transcribe( + audio_ref, language="zh", verbose=True, initial_prompt=prompt + ) + result_deg = model.transcribe( + audio_deg, language="zh", verbose=True, initial_prompt=prompt + ) + else: + result_ref = model.transcribe(audio_deg, verbose=True) + result_deg = model.transcribe(audio_deg, verbose=True) + + content_gt = result_ref["text"] + + content_gt = content_gt.replace(" ", "") + content_gt = content_gt.replace(".", "") + content_gt = content_gt.replace("'", "") + content_gt = content_gt.replace("-", "") + content_gt = content_gt.replace(",", "") + content_gt = content_gt.replace("!", "") + content_gt = content_gt.lower() + + # Get predicted truth content + content_pred = result_deg["text"] + content_pred = content_pred.replace(" ", "") + content_pred = content_pred.replace(".", "") + content_pred = content_pred.replace("'", "") + content_pred = content_pred.replace("-", "") + content_pred = content_pred.replace(",", "") + content_pred = content_pred.replace("!", "") + content_pred = content_pred.lower() + + return wer(content_pred, content_gt).detach().cpu().numpy().tolist() diff --git a/evaluation/metrics/similarity/__init__.py b/evaluation/metrics/similarity/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/evaluation/metrics/similarity/models/RawNetBasicBlock.py b/evaluation/metrics/similarity/models/RawNetBasicBlock.py new file mode 100644 index 0000000000000000000000000000000000000000..c52c50c226604ee688a5208e19cf6dcd649a3420 --- /dev/null +++ b/evaluation/metrics/similarity/models/RawNetBasicBlock.py @@ -0,0 +1,146 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class PreEmphasis(torch.nn.Module): + def __init__(self, coef: float = 0.97) -> None: + super().__init__() + self.coef = coef + # make kernel + # In pytorch, the convolution operation uses cross-correlation. So, filter is flipped. + self.register_buffer( + "flipped_filter", + torch.FloatTensor([-self.coef, 1.0]).unsqueeze(0).unsqueeze(0), + ) + + def forward(self, input: torch.tensor) -> torch.tensor: + assert ( + len(input.size()) == 2 + ), "The number of dimensions of input tensor must be 2!" + # reflect padding to match lengths of in/out + input = input.unsqueeze(1) + input = F.pad(input, (1, 0), "reflect") + return F.conv1d(input, self.flipped_filter) + + +class AFMS(nn.Module): + """ + Alpha-Feature map scaling, added to the output of each residual block[1,2]. 
+ + Reference: + [1] RawNet2 : https://www.isca-speech.org/archive/Interspeech_2020/pdfs/1011.pdf + [2] AMFS : https://www.koreascience.or.kr/article/JAKO202029757857763.page + """ + + def __init__(self, nb_dim: int) -> None: + super().__init__() + self.alpha = nn.Parameter(torch.ones((nb_dim, 1))) + self.fc = nn.Linear(nb_dim, nb_dim) + self.sig = nn.Sigmoid() + + def forward(self, x): + y = F.adaptive_avg_pool1d(x, 1).view(x.size(0), -1) + y = self.sig(self.fc(y)).view(x.size(0), x.size(1), -1) + + x = x + self.alpha + x = x * y + return x + + +class Bottle2neck(nn.Module): + def __init__( + self, + inplanes, + planes, + kernel_size=None, + dilation=None, + scale=4, + pool=False, + ): + super().__init__() + + width = int(math.floor(planes / scale)) + + self.conv1 = nn.Conv1d(inplanes, width * scale, kernel_size=1) + self.bn1 = nn.BatchNorm1d(width * scale) + + self.nums = scale - 1 + + convs = [] + bns = [] + + num_pad = math.floor(kernel_size / 2) * dilation + + for i in range(self.nums): + convs.append( + nn.Conv1d( + width, + width, + kernel_size=kernel_size, + dilation=dilation, + padding=num_pad, + ) + ) + bns.append(nn.BatchNorm1d(width)) + + self.convs = nn.ModuleList(convs) + self.bns = nn.ModuleList(bns) + + self.conv3 = nn.Conv1d(width * scale, planes, kernel_size=1) + self.bn3 = nn.BatchNorm1d(planes) + + self.relu = nn.ReLU() + + self.width = width + + self.mp = nn.MaxPool1d(pool) if pool else False + self.afms = AFMS(planes) + + if inplanes != planes: # if change in number of filters + self.residual = nn.Sequential( + nn.Conv1d(inplanes, planes, kernel_size=1, stride=1, bias=False) + ) + else: + self.residual = nn.Identity() + + def forward(self, x): + residual = self.residual(x) + + out = self.conv1(x) + out = self.relu(out) + out = self.bn1(out) + + spx = torch.split(out, self.width, 1) + for i in range(self.nums): + if i == 0: + sp = spx[i] + else: + sp = sp + spx[i] + sp = self.convs[i](sp) + sp = self.relu(sp) + sp = self.bns[i](sp) + if i == 0: + out = sp + else: + out = torch.cat((out, sp), 1) + + out = torch.cat((out, spx[self.nums]), 1) + + out = self.conv3(out) + out = self.relu(out) + out = self.bn3(out) + + out += residual + if self.mp: + out = self.mp(out) + out = self.afms(out) + + return out diff --git a/evaluation/metrics/similarity/models/RawNetModel.py b/evaluation/metrics/similarity/models/RawNetModel.py new file mode 100644 index 0000000000000000000000000000000000000000..cfe8a55573c44bd9f7d3f5ce600d7360a45d1981 --- /dev/null +++ b/evaluation/metrics/similarity/models/RawNetModel.py @@ -0,0 +1,140 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +# -*- encoding: utf-8 -*- + +import torch +import torch.nn as nn +from asteroid_filterbanks import Encoder, ParamSincFB + +from .RawNetBasicBlock import Bottle2neck, PreEmphasis + + +class RawNet3(nn.Module): + def __init__(self, block, model_scale, context, summed, C=1024, **kwargs): + super().__init__() + + nOut = kwargs["nOut"] + + self.context = context + self.encoder_type = kwargs["encoder_type"] + self.log_sinc = kwargs["log_sinc"] + self.norm_sinc = kwargs["norm_sinc"] + self.out_bn = kwargs["out_bn"] + self.summed = summed + + self.preprocess = nn.Sequential( + PreEmphasis(), nn.InstanceNorm1d(1, eps=1e-4, affine=True) + ) + self.conv1 = Encoder( + ParamSincFB( + C // 4, + 251, + stride=kwargs["sinc_stride"], + ) + ) + self.relu = nn.ReLU() + self.bn1 = nn.BatchNorm1d(C // 4) + + self.layer1 = block( + C // 4, C, kernel_size=3, dilation=2, scale=model_scale, pool=5 + ) + self.layer2 = block(C, C, kernel_size=3, dilation=3, scale=model_scale, pool=3) + self.layer3 = block(C, C, kernel_size=3, dilation=4, scale=model_scale) + self.layer4 = nn.Conv1d(3 * C, 1536, kernel_size=1) + + if self.context: + attn_input = 1536 * 3 + else: + attn_input = 1536 + print("self.encoder_type", self.encoder_type) + if self.encoder_type == "ECA": + attn_output = 1536 + elif self.encoder_type == "ASP": + attn_output = 1 + else: + raise ValueError("Undefined encoder") + + self.attention = nn.Sequential( + nn.Conv1d(attn_input, 128, kernel_size=1), + nn.ReLU(), + nn.BatchNorm1d(128), + nn.Conv1d(128, attn_output, kernel_size=1), + nn.Softmax(dim=2), + ) + + self.bn5 = nn.BatchNorm1d(3072) + + self.fc6 = nn.Linear(3072, nOut) + self.bn6 = nn.BatchNorm1d(nOut) + + self.mp3 = nn.MaxPool1d(3) + + def forward(self, x): + """ + :param x: input mini-batch (bs, samp) + """ + + with torch.cuda.amp.autocast(enabled=False): + x = self.preprocess(x) + x = torch.abs(self.conv1(x)) + if self.log_sinc: + x = torch.log(x + 1e-6) + if self.norm_sinc == "mean": + x = x - torch.mean(x, dim=-1, keepdim=True) + elif self.norm_sinc == "mean_std": + m = torch.mean(x, dim=-1, keepdim=True) + s = torch.std(x, dim=-1, keepdim=True) + s[s < 0.001] = 0.001 + x = (x - m) / s + + if self.summed: + x1 = self.layer1(x) + x2 = self.layer2(x1) + x3 = self.layer3(self.mp3(x1) + x2) + else: + x1 = self.layer1(x) + x2 = self.layer2(x1) + x3 = self.layer3(x2) + + x = self.layer4(torch.cat((self.mp3(x1), x2, x3), dim=1)) + x = self.relu(x) + + t = x.size()[-1] + + if self.context: + global_x = torch.cat( + ( + x, + torch.mean(x, dim=2, keepdim=True).repeat(1, 1, t), + torch.sqrt( + torch.var(x, dim=2, keepdim=True).clamp(min=1e-4, max=1e4) + ).repeat(1, 1, t), + ), + dim=1, + ) + else: + global_x = x + + w = self.attention(global_x) + + mu = torch.sum(x * w, dim=2) + sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-4, max=1e4)) + + x = torch.cat((mu, sg), 1) + + x = self.bn5(x) + + x = self.fc6(x) + + if self.out_bn: + x = self.bn6(x) + + return x + + +def MainModel(**kwargs): + model = RawNet3(Bottle2neck, model_scale=8, context=True, summed=True, **kwargs) + return model diff --git a/evaluation/metrics/similarity/models/__init__.py b/evaluation/metrics/similarity/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/evaluation/metrics/similarity/speaker_similarity.py b/evaluation/metrics/similarity/speaker_similarity.py new file mode 100644 index 0000000000000000000000000000000000000000..ce90779f81eefc9cccd54415013b3141e0a1f608 --- 
/dev/null +++ b/evaluation/metrics/similarity/speaker_similarity.py @@ -0,0 +1,184 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import os + +import numpy as np +import soundfile as sf +import torch +import torch.nn.functional as F +from tqdm import tqdm +import librosa + +from evaluation.metrics.similarity.models.RawNetModel import RawNet3 +from evaluation.metrics.similarity.models.RawNetBasicBlock import Bottle2neck + +from transformers import Wav2Vec2FeatureExtractor, WavLMForXVector +from resemblyzer import VoiceEncoder, preprocess_wav + + +def extract_rawnet_speaker_embd( + model, fn: str, n_samples: int, n_segments: int = 10, gpu: bool = False +) -> np.ndarray: + audio, sample_rate = sf.read(fn) + if len(audio.shape) > 1: + raise ValueError( + f"RawNet3 supports mono input only. Input data has a shape of {audio.shape}." + ) + + if sample_rate != 16000: + audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000) + if len(audio) < n_samples: + shortage = n_samples - len(audio) + 1 + audio = np.pad(audio, (0, shortage), "wrap") + + audios = [] + startframe = np.linspace(0, len(audio) - n_samples, num=n_segments) + for asf in startframe: + audios.append(audio[int(asf) : int(asf) + n_samples]) + + audios = torch.from_numpy(np.stack(audios, axis=0).astype(np.float32)) + if gpu: + audios = audios.to("cuda") + with torch.no_grad(): + output = model(audios) + + return output + + +def extract_similarity(path_ref, path_deg, **kwargs): + kwargs = kwargs["kwargs"] + model_name = kwargs["model_name"] + + ref_embds = [] + deg_embds = [] + + if torch.cuda.is_available(): + device = torch.device("cuda") + else: + device = torch.device("cpu") + + if model_name == "rawnet": + model = RawNet3( + Bottle2neck, + model_scale=8, + context=True, + summed=True, + encoder_type="ECA", + nOut=256, + out_bn=False, + sinc_stride=10, + log_sinc=True, + norm_sinc="mean", + grad_mult=1, + ) + model.load_state_dict( + torch.load( + "pretrained/rawnet3/model.pt", + map_location=lambda storage, loc: storage, + )["model"] + ) + model.eval() + model = model.to(device) + + for file in tqdm(os.listdir(path_ref)): + output = extract_rawnet_speaker_embd( + model, + fn=os.path.join(path_ref, file), + n_samples=48000, + n_segments=10, + gpu=torch.cuda.is_available(), + ).mean(0) + ref_embds.append(output) + + for file in tqdm(os.listdir(path_deg)): + output = extract_rawnet_speaker_embd( + model, + fn=os.path.join(path_deg, file), + n_samples=48000, + n_segments=10, + gpu=torch.cuda.is_available(), + ).mean(0) + deg_embds.append(output) + elif model_name == "wavlm": + try: + feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( + "microsoft/wavlm-base-plus-sv" + ) + model = WavLMForXVector.from_pretrained("microsoft/wavlm-base-plus-sv") + except: + feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( + "pretrained/wavlm", sampling_rate=16000 + ) + model = WavLMForXVector.from_pretrained("pretrained/wavlm") + model = model.to(device) + + for file in tqdm(os.listdir(path_ref)): + wav_path = os.path.join(path_ref, file) + wav, _ = librosa.load(wav_path, sr=16000) + + inputs = feature_extractor( + [wav], padding=True, return_tensors="pt", sampling_rate=16000 + ) + if torch.cuda.is_available(): + for key in inputs.keys(): + inputs[key] = inputs[key].to(device) + + with torch.no_grad(): + embds = model(**inputs).embeddings + embds = embds + ref_embds.append(embds[0]) + + for file in 
tqdm(os.listdir(path_deg)): + wav_path = os.path.join(path_deg, file) + wav, _ = librosa.load(wav_path, sr=16000) + + inputs = feature_extractor( + [wav], padding=True, return_tensors="pt", sampling_rate=16000 + ) + if torch.cuda.is_available(): + for key in inputs.keys(): + inputs[key] = inputs[key].to(device) + + with torch.no_grad(): + embds = model(**inputs).embeddings + embds = embds + deg_embds.append(embds[0]) + elif model_name == "resemblyzer": + encoder = VoiceEncoder().to(device) + + for file in tqdm(os.listdir(path_ref)): + wav_path = os.path.join(path_ref, file) + wav = preprocess_wav(wav_path) + + output = encoder.embed_utterance(wav) + ref_embds.append(torch.from_numpy(output).to(device)) + + for file in tqdm(os.listdir(path_deg)): + wav_path = os.path.join(path_deg, file) + wav = preprocess_wav(wav_path) + + output = encoder.embed_utterance(wav) + deg_embds.append(torch.from_numpy(output).to(device)) + + similarity_mode = kwargs["similarity_mode"] + scores = [] + + if similarity_mode == "pairwith": + for ref_embd, deg_embd in zip(ref_embds, deg_embds): + scores.append( + F.cosine_similarity(ref_embd, deg_embd, dim=-1).detach().cpu().numpy() + ) + elif similarity_mode == "overall": + for ref_embd in ref_embds: + for deg_embd in deg_embds: + scores.append( + F.cosine_similarity(ref_embd, deg_embd, dim=-1) + .detach() + .cpu() + .numpy() + ) + + return np.mean(scores) diff --git a/evaluation/metrics/spectrogram/__init__.py b/evaluation/metrics/spectrogram/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/evaluation/metrics/spectrogram/frechet_distance.py b/evaluation/metrics/spectrogram/frechet_distance.py new file mode 100644 index 0000000000000000000000000000000000000000..c91e8cb9bdb3b4154e05a9afcf1a3a5112d0a78f --- /dev/null +++ b/evaluation/metrics/spectrogram/frechet_distance.py @@ -0,0 +1,28 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from frechet_audio_distance import FrechetAudioDistance + + +def extract_fad( + audio_dir1, + audio_dir2, + **kwargs, +): + """Extract Frechet Audio Distance for two given audio folders. + audio_dir1: path to the ground truth audio folder. + audio_dir2: path to the predicted audio folder. + mode: "vggish", "pann", "clap" for different models. + """ + frechet = FrechetAudioDistance( + model_name="vggish", + use_pca=False, + use_activation=False, + verbose=False, + ) + + fad_score = frechet.score(audio_dir1, audio_dir2) + + return fad_score diff --git a/evaluation/metrics/spectrogram/mel_cepstral_distortion.py b/evaluation/metrics/spectrogram/mel_cepstral_distortion.py new file mode 100644 index 0000000000000000000000000000000000000000..d4e4825ff11014c998b37025c4d29843c19417a4 --- /dev/null +++ b/evaluation/metrics/spectrogram/mel_cepstral_distortion.py @@ -0,0 +1,24 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from pymcd.mcd import Calculate_MCD + + +def extract_mcd(audio_ref, audio_deg, **kwargs): + """Extract Mel-Cepstral Distance for a two given audio. + Args: + audio_ref: The given reference audio. It is an audio path. + audio_deg: The given synthesized audio. It is an audio path. 
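+
+        Example (an illustrative call; fs may also be None to keep the toolbox default):
+            mcd = extract_mcd("ref.wav", "deg.wav", kwargs={"fs": 24000})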
+ """ + # Load hyperparameters + kwargs = kwargs["kwargs"] + fs = kwargs["fs"] + + mcd_toolbox = Calculate_MCD(MCD_mode="dtw_sl") + if fs != None: + mcd_toolbox.SAMPLING_RATE = fs + mcd_value = mcd_toolbox.calculate_mcd(audio_ref, audio_deg) + + return mcd_value diff --git a/evaluation/metrics/spectrogram/multi_resolution_stft_distance.py b/evaluation/metrics/spectrogram/multi_resolution_stft_distance.py new file mode 100644 index 0000000000000000000000000000000000000000..2cbece73e49045bb8f53bb66f0da97dfcd6d75ae --- /dev/null +++ b/evaluation/metrics/spectrogram/multi_resolution_stft_distance.py @@ -0,0 +1,109 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import librosa +import torch + +import numpy as np + + +def extract_mstft( + audio_ref, + audio_deg, + **kwargs, +): + """Compute Multi-Scale STFT Distance (mstft) between the predicted and the ground truth audio. + audio_ref: path to the ground truth audio. + audio_deg: path to the predicted audio. + fs: sampling rate. + med_freq: division frequency for mid frequency parts. + high_freq: division frequency for high frequency parts. + method: "dtw" will use dtw algorithm to align the length of the ground truth and predicted audio. + "cut" will cut both audios into a same length according to the one with the shorter length. + """ + # Load hyperparameters + kwargs = kwargs["kwargs"] + fs = kwargs["fs"] + method = kwargs["method"] + + # Load audio + if fs != None: + audio_ref, _ = librosa.load(audio_ref, sr=fs) + audio_deg, _ = librosa.load(audio_deg, sr=fs) + else: + audio_ref, fs = librosa.load(audio_ref) + audio_deg, fs = librosa.load(audio_deg) + + # Audio length alignment + if len(audio_ref) != len(audio_deg): + if method == "cut": + length = min(len(audio_ref), len(audio_deg)) + audio_ref = audio_ref[:length] + audio_deg = audio_deg[:length] + elif method == "dtw": + _, wp = librosa.sequence.dtw(audio_ref, audio_deg, backtrack=True) + audio_ref_new = [] + audio_deg_new = [] + for i in range(wp.shape[0]): + ref_index = wp[i][0] + deg_index = wp[i][1] + audio_ref_new.append(audio_ref[ref_index]) + audio_deg_new.append(audio_deg[deg_index]) + audio_ref = np.array(audio_ref_new) + audio_deg = np.array(audio_deg_new) + assert len(audio_ref) == len(audio_deg) + + # Define loss function + l1Loss = torch.nn.L1Loss(reduction="mean") + + # Compute distance + fft_sizes = [1024, 2048, 512] + hop_sizes = [120, 240, 50] + win_sizes = [600, 1200, 240] + + audio_ref = torch.from_numpy(audio_ref) + audio_deg = torch.from_numpy(audio_deg) + + if torch.cuda.is_available(): + device = torch.device("cuda") + audio_ref = audio_ref.to(device) + audio_deg = audio_deg.to(device) + + mstft_sc = 0 + mstft_mag = 0 + + for n_fft, hop_length, win_length in zip(fft_sizes, hop_sizes, win_sizes): + spec_ref = torch.stft( + audio_ref, n_fft, hop_length, win_length, return_complex=False + ) + spec_deg = torch.stft( + audio_deg, n_fft, hop_length, win_length, return_complex=False + ) + + real_ref = spec_ref[..., 0] + imag_ref = spec_ref[..., 1] + real_deg = spec_deg[..., 0] + imag_deg = spec_deg[..., 1] + + mag_ref = torch.sqrt( + torch.clamp(real_ref**2 + imag_ref**2, min=1e-7) + ).transpose(1, 0) + mag_deg = torch.sqrt( + torch.clamp(real_deg**2 + imag_deg**2, min=1e-7) + ).transpose(1, 0) + sc_loss = torch.norm(mag_ref - mag_deg, p="fro") / torch.norm(mag_ref, p="fro") + mag_loss = l1Loss(torch.log(mag_ref), torch.log(mag_deg)) + + mstft_sc += sc_loss 
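+        # Accumulate the log-magnitude L1 term as well; both terms are averaged over the three STFT resolutions below.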
+ mstft_mag += mag_loss + + # Normalize distances + mstft_sc /= len(fft_sizes) + mstft_mag /= len(fft_sizes) + + return ( + mstft_sc.detach().cpu().numpy().tolist() + + mstft_mag.detach().cpu().numpy().tolist() + ) diff --git a/evaluation/metrics/spectrogram/pesq.py b/evaluation/metrics/spectrogram/pesq.py new file mode 100644 index 0000000000000000000000000000000000000000..5c71d16af2a791cfcc571f5b2fc59a7752e24877 --- /dev/null +++ b/evaluation/metrics/spectrogram/pesq.py @@ -0,0 +1,61 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import librosa + +import numpy as np + +from pypesq import pesq + + +def extract_pesq(audio_ref, audio_deg, **kwargs): + """Extract PESQ between two given audio files. + audio_ref: the given reference audio. It is an audio path. + audio_deg: the given synthesized audio. It is an audio path. + fs: sampling rate. + method: "dtw" will use dtw algorithm to align the length of the ground truth and predicted audio. + "cut" will cut both audios into a same length according to the one with the shorter length. + """ + # Load hyperparameters + kwargs = kwargs["kwargs"] + fs = kwargs["fs"] + method = kwargs["method"] + + # Load audio + if fs != None: + audio_ref, _ = librosa.load(audio_ref, sr=fs) + audio_deg, _ = librosa.load(audio_deg, sr=fs) + else: + audio_ref, fs = librosa.load(audio_ref) + audio_deg, fs = librosa.load(audio_deg) + + # Resample + if fs != 16000: + audio_ref = librosa.resample(audio_ref, orig_sr=fs, target_sr=16000) + audio_deg = librosa.resample(audio_deg, orig_sr=fs, target_sr=16000) + fs = 16000 + + # Audio length alignment + if len(audio_ref) != len(audio_deg): + if method == "cut": + length = min(len(audio_ref), len(audio_deg)) + audio_ref = audio_ref[:length] + audio_deg = audio_deg[:length] + elif method == "dtw": + _, wp = librosa.sequence.dtw(audio_ref, audio_deg, backtrack=True) + audio_ref_new = [] + audio_deg_new = [] + for i in range(wp.shape[0]): + ref_index = wp[i][0] + deg_index = wp[i][1] + audio_ref_new.append(audio_ref[ref_index]) + audio_deg_new.append(audio_deg[deg_index]) + audio_ref = np.array(audio_ref_new) + audio_deg = np.array(audio_deg_new) + assert len(audio_ref) == len(audio_deg) + + # Compute pesq + score = pesq(audio_ref, audio_deg, fs) + return score diff --git a/evaluation/metrics/spectrogram/scale_invariant_signal_to_distortion_ratio.py b/evaluation/metrics/spectrogram/scale_invariant_signal_to_distortion_ratio.py new file mode 100644 index 0000000000000000000000000000000000000000..3a16f8c2bbc746186b4196a66382aaa3c27b7ebb --- /dev/null +++ b/evaluation/metrics/spectrogram/scale_invariant_signal_to_distortion_ratio.py @@ -0,0 +1,56 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree.
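+# Computes the scale-invariant signal-to-distortion ratio (SI-SDR) between a reference and a degraded waveform with torchmetrics, after optional "cut" or "dtw" length alignment.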
+ +import torch +import librosa + +import numpy as np + +from torchmetrics import ScaleInvariantSignalDistortionRatio + + +def extract_si_sdr(audio_ref, audio_deg, **kwargs): + # Load hyperparameters + kwargs = kwargs["kwargs"] + fs = kwargs["fs"] + method = kwargs["method"] + + si_sdr = ScaleInvariantSignalDistortionRatio() + + if fs != None: + audio_ref, _ = librosa.load(audio_ref, sr=fs) + audio_deg, _ = librosa.load(audio_deg, sr=fs) + else: + audio_ref, fs = librosa.load(audio_ref) + audio_deg, fs = librosa.load(audio_deg) + + if len(audio_ref) != len(audio_deg): + if method == "cut": + length = min(len(audio_ref), len(audio_deg)) + audio_ref = audio_ref[:length] + audio_deg = audio_deg[:length] + elif method == "dtw": + _, wp = librosa.sequence.dtw(audio_ref, audio_deg, backtrack=True) + audio_ref_new = [] + audio_deg_new = [] + for i in range(wp.shape[0]): + ref_index = wp[i][0] + deg_index = wp[i][1] + audio_ref_new.append(audio_ref[ref_index]) + audio_deg_new.append(audio_deg[deg_index]) + audio_ref = np.array(audio_ref_new) + audio_deg = np.array(audio_deg_new) + assert len(audio_ref) == len(audio_deg) + + audio_ref = torch.from_numpy(audio_ref) + audio_deg = torch.from_numpy(audio_deg) + + if torch.cuda.is_available(): + device = torch.device("cuda") + audio_ref = audio_ref.to(device) + audio_deg = audio_deg.to(device) + si_sdr = si_sdr.to(device) + + return si_sdr(audio_deg, audio_ref).detach().cpu().numpy().tolist() diff --git a/evaluation/metrics/spectrogram/scale_invariant_signal_to_noise_ratio.py b/evaluation/metrics/spectrogram/scale_invariant_signal_to_noise_ratio.py new file mode 100644 index 0000000000000000000000000000000000000000..2748021e65b874a705b1be42e386aca2d728ca43 --- /dev/null +++ b/evaluation/metrics/spectrogram/scale_invariant_signal_to_noise_ratio.py @@ -0,0 +1,56 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
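+# Computes the scale-invariant signal-to-noise ratio (SI-SNR) between a reference and a degraded waveform with torchmetrics, after optional "cut" or "dtw" length alignment.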
+ +import torch +import librosa + +import numpy as np + +from torchmetrics import ScaleInvariantSignalNoiseRatio + + +def extract_si_snr(audio_ref, audio_deg, **kwargs): + # Load hyperparameters + kwargs = kwargs["kwargs"] + fs = kwargs["fs"] + method = kwargs["method"] + + si_snr = ScaleInvariantSignalNoiseRatio() + + if fs != None: + audio_ref, _ = librosa.load(audio_ref, sr=fs) + audio_deg, _ = librosa.load(audio_deg, sr=fs) + else: + audio_ref, fs = librosa.load(audio_ref) + audio_deg, fs = librosa.load(audio_deg) + + if len(audio_ref) != len(audio_deg): + if method == "cut": + length = min(len(audio_ref), len(audio_deg)) + audio_ref = audio_ref[:length] + audio_deg = audio_deg[:length] + elif method == "dtw": + _, wp = librosa.sequence.dtw(audio_ref, audio_deg, backtrack=True) + audio_ref_new = [] + audio_deg_new = [] + for i in range(wp.shape[0]): + ref_index = wp[i][0] + deg_index = wp[i][1] + audio_ref_new.append(audio_ref[ref_index]) + audio_deg_new.append(audio_deg[deg_index]) + audio_ref = np.array(audio_ref_new) + audio_deg = np.array(audio_deg_new) + assert len(audio_ref) == len(audio_deg) + + audio_ref = torch.from_numpy(audio_ref) + audio_deg = torch.from_numpy(audio_deg) + + if torch.cuda.is_available(): + device = torch.device("cuda") + audio_ref = audio_ref.to(device) + audio_deg = audio_deg.to(device) + si_snr = si_snr.to(device) + + return si_snr(audio_deg, audio_ref).detach().cpu().numpy().tolist() diff --git a/evaluation/metrics/spectrogram/short_time_objective_intelligibility.py b/evaluation/metrics/spectrogram/short_time_objective_intelligibility.py new file mode 100644 index 0000000000000000000000000000000000000000..e493ec437a5697c435c1483708204b0a41313453 --- /dev/null +++ b/evaluation/metrics/spectrogram/short_time_objective_intelligibility.py @@ -0,0 +1,67 @@ +# Copyright (c) 2023 Amphion. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import librosa + +import numpy as np + +from torchmetrics.audio.stoi import ShortTimeObjectiveIntelligibility + + +def extract_stoi(audio_ref, audio_deg, **kwargs): + """Compute Short-Time Objective Intelligibility between the predicted and the ground truth audio. + audio_ref: path to the ground truth audio. + audio_deg: path to the predicted audio. + fs: sampling rate. + method: "dtw" will use dtw algorithm to align the length of the ground truth and predicted audio. + "cut" will cut both audios into a same length according to the one with the shorter length. 
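+    Returns the STOI score as a float; higher values indicate better predicted intelligibility.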
+ """ + # Load hyperparameters + kwargs = kwargs["kwargs"] + fs = kwargs["fs"] + method = kwargs["method"] + + # Load audio + if fs != None: + audio_ref, _ = librosa.load(audio_ref, sr=fs) + audio_deg, _ = librosa.load(audio_deg, sr=fs) + else: + audio_ref, fs = librosa.load(audio_ref) + audio_deg, fs = librosa.load(audio_deg) + + # Initialize method + stoi = ShortTimeObjectiveIntelligibility(fs, extended=False) + + # Audio length alignment + if len(audio_ref) != len(audio_deg): + if method == "cut": + length = min(len(audio_ref), len(audio_deg)) + audio_ref = audio_ref[:length] + audio_deg = audio_deg[:length] + elif method == "dtw": + _, wp = librosa.sequence.dtw(audio_ref, audio_deg, backtrack=True) + audio_ref_new = [] + audio_deg_new = [] + for i in range(wp.shape[0]): + ref_index = wp[i][0] + deg_index = wp[i][1] + audio_ref_new.append(audio_ref[ref_index]) + audio_deg_new.append(audio_deg[deg_index]) + audio_ref = np.array(audio_ref_new) + audio_deg = np.array(audio_deg_new) + assert len(audio_ref) == len(audio_deg) + + # Convert to tensor + audio_ref = torch.from_numpy(audio_ref) + audio_deg = torch.from_numpy(audio_deg) + + if torch.cuda.is_available(): + device = torch.device("cuda") + audio_ref = audio_ref.to(device) + audio_deg = audio_deg.to(device) + stoi = stoi.to(device) + + return stoi(audio_deg, audio_ref).detach().cpu().numpy().tolist() diff --git a/imgs/maskgct/maskgct.png b/imgs/maskgct/maskgct.png new file mode 100644 index 0000000000000000000000000000000000000000..ba0d99f10ebf0d983610e1059fd4fad67f23483f Binary files /dev/null and b/imgs/maskgct/maskgct.png differ diff --git a/imgs/ns3/ns3_facodec.png b/imgs/ns3/ns3_facodec.png new file mode 100644 index 0000000000000000000000000000000000000000..978ad8cb90348a5de79c29cddc0e02318ffa9f02 Binary files /dev/null and b/imgs/ns3/ns3_facodec.png differ diff --git a/imgs/ns3/ns3_overview.png b/imgs/ns3/ns3_overview.png new file mode 100644 index 0000000000000000000000000000000000000000..f484fbe75083c1f184bad203267637845bee4eec Binary files /dev/null and b/imgs/ns3/ns3_overview.png differ diff --git a/imgs/svc/DiffComoSVC.png b/imgs/svc/DiffComoSVC.png new file mode 100644 index 0000000000000000000000000000000000000000..40eddeec8384e0d5416febd556376f5ed46c8f4a Binary files /dev/null and b/imgs/svc/DiffComoSVC.png differ diff --git a/imgs/svc/MultipleContentsSVC.png b/imgs/svc/MultipleContentsSVC.png new file mode 100644 index 0000000000000000000000000000000000000000..6d5cc04f639f5d8e3b4c8cecad0c350c96bc092f Binary files /dev/null and b/imgs/svc/MultipleContentsSVC.png differ diff --git a/imgs/svc/pipeline.png b/imgs/svc/pipeline.png new file mode 100644 index 0000000000000000000000000000000000000000..eb5aafa2ed457f611c13dc45f40f7b005c3598e8 Binary files /dev/null and b/imgs/svc/pipeline.png differ diff --git a/imgs/tta/DiffusionTTA.png b/imgs/tta/DiffusionTTA.png new file mode 100644 index 0000000000000000000000000000000000000000..a55bd92323315398a7d91b410f4f3225b54179f1 Binary files /dev/null and b/imgs/tta/DiffusionTTA.png differ diff --git a/imgs/visualization/SingVisio_system.jpg b/imgs/visualization/SingVisio_system.jpg new file mode 100644 index 0000000000000000000000000000000000000000..fe94fa379728cfcea36ca01511db1f4d8c8c81f9 Binary files /dev/null and b/imgs/visualization/SingVisio_system.jpg differ diff --git a/imgs/vocoder/diffusion/pipeline.png b/imgs/vocoder/diffusion/pipeline.png new file mode 100644 index 0000000000000000000000000000000000000000..6a3690e26f99b8704e8e80a27415887c9c636692 Binary files /dev/null 
and b/imgs/vocoder/diffusion/pipeline.png differ diff --git a/imgs/vocoder/gan/MSSBCQTD.png b/imgs/vocoder/gan/MSSBCQTD.png new file mode 100644 index 0000000000000000000000000000000000000000..417e20fbaa8997ac37141d5068029ad02a97cbac --- /dev/null +++ b/imgs/vocoder/gan/MSSBCQTD.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98706f1abc595abf10513cd05a1ba2177c4ee991f6786a9c3ca81513eea95c5a +size 1773360 diff --git a/imgs/vocoder/gan/pipeline.png b/imgs/vocoder/gan/pipeline.png new file mode 100644 index 0000000000000000000000000000000000000000..274828515ad906905d6c267d636995277a2b4caf Binary files /dev/null and b/imgs/vocoder/gan/pipeline.png differ
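The metric helpers added in this patch share one calling convention: options are passed under a single "kwargs" keyword argument. The snippet below is a minimal usage sketch, not part of the diff itself; the wav paths and folder names are placeholders, and it assumes the repository root is on PYTHONPATH and the optional dependencies (pymcd, torchmetrics, pypesq, resemblyzer) are installed.

# Hypothetical example paths; replace with real reference/synthesized audio.
from evaluation.metrics.spectrogram.mel_cepstral_distortion import extract_mcd
from evaluation.metrics.spectrogram.short_time_objective_intelligibility import extract_stoi
from evaluation.metrics.similarity.speaker_similarity import extract_similarity

ref_wav = "ref_wavs/0001.wav"
deg_wav = "deg_wavs/0001.wav"

# Per-utterance metrics: "cut" truncates both signals to the shorter one,
# while "dtw" aligns them with dynamic time warping before scoring.
mcd = extract_mcd(ref_wav, deg_wav, kwargs={"fs": 16000})
stoi = extract_stoi(ref_wav, deg_wav, kwargs={"fs": 16000, "method": "cut"})

# Folder-level speaker similarity using Resemblyzer embeddings; "pairwith"
# averages cosine similarity over corresponding file pairs (listing order
# must match), while "overall" averages over all reference/degraded pairs.
sim = extract_similarity(
    "ref_wavs",
    "deg_wavs",
    kwargs={"model_name": "resemblyzer", "similarity_mode": "pairwith"},
)

print(f"MCD: {mcd:.3f}  STOI: {stoi:.3f}  Speaker similarity: {sim:.3f}")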