Spaces:

NorHsangPha
/

IMS-ToucanTTS

Sleeping

App Files Files Community

IMS-ToucanTTS / Utility /path_to_transcript_dicts.py

NorHsangPha

Initial commit

de6e35f verified 4 months ago

raw

history blame contribute delete

97 kB

	import glob
	import os
	import random

	import torch


	# HELPER FUNCTIONS


	def split_dictionary_into_chunks(input_dict, split_n):
	res = []
	new_dict = {}
	elements_per_dict = (len(input_dict.keys()) // split_n) + 1
	for k, v in input_dict.items():
	if len(new_dict) < elements_per_dict:
	new_dict[k] = v
	else:
	res.append(new_dict)
	new_dict = {k: v}
	res.append(new_dict)
	return res


	def limit_to_n(path_to_transcript_dict, n=40000):
	# deprecated, we now just use the whole thing always, because there's a critical mass of data
	limited_dict = dict()
	if len(path_to_transcript_dict.keys()) > n:
	for key in random.sample(list(path_to_transcript_dict.keys()), n):
	limited_dict[key] = path_to_transcript_dict[key]
	return limited_dict
	else:
	return path_to_transcript_dict


	def build_path_to_transcript_dict_multi_ling_librispeech_template(root):
	"""
	https://arxiv.org/abs/2012.03411
	"""
	path_to_transcript = dict()
	with open(os.path.join(root, "transcripts.txt"), "r", encoding="utf8") as file:
	lookup = file.read()
	for line in lookup.split("\n"):
	if line.strip() != "":
	fields = line.split("\t")
	wav_folders = fields[0].split("_")
	wav_path = (
	f"{root}/audio/{wav_folders[0]}/{wav_folders[1]}/{fields[0]}.flac"
	)
	path_to_transcript[wav_path] = fields[1]
	return path_to_transcript


	def build_path_to_transcript_dict_hui_template(root):
	"""
	https://arxiv.org/abs/2106.06309
	"""
	path_to_transcript = dict()
	for el in os.listdir(root):
	if os.path.isdir(os.path.join(root, el)):
	with open(
	os.path.join(root, el, "metadata.csv"), "r", encoding="utf8"
	) as file:
	lookup = file.read()
	for line in lookup.split("\n"):
	if line.strip() != "":
	norm_transcript = line.split("\|")[1]
	wav_path = os.path.join(
	root, el, "wavs", line.split("\|")[0] + ".wav"
	)
	if os.path.exists(wav_path):
	path_to_transcript[wav_path] = norm_transcript
	return path_to_transcript


	# ENGLISH


	def build_path_to_transcript_dict_mls_english(re_cache=False):
	lang = "english"
	root = f"/mount/resources/speech/corpora/MultiLingLibriSpeech/mls_{lang}/train"
	cache_path = f"/mount/resources/speech/corpora/MultiLingLibriSpeech/mls_{lang}/train/pttd_cache.pt"
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = (
	build_path_to_transcript_dict_multi_ling_librispeech_template(root=root)
	)
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_gigaspeech(re_cache=False):
	root = "/mount/resources/speech/corpora/GigaSpeech/"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = dict()
	with open(
	os.path.join(root, "transcripts_only_clean_samples.txt"),
	"r",
	encoding="utf8",
	) as file:
	lookup = file.read()
	for line in lookup.split("\n"):
	if line.strip() != "":
	fields = line.split("\t")
	norm_transcript = fields[1]
	wav_path = fields[0]
	path_to_transcript[wav_path] = norm_transcript
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_elizabeth(re_cache=False):
	root = "/mount/resources/speech/corpora/MAILabs_british_single_speaker_elizabeth"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = dict()
	for el in os.listdir(root):
	if os.path.isdir(os.path.join(root, el)):
	with open(
	os.path.join(root, el, "metadata.csv"), "r", encoding="utf8"
	) as file:
	lookup = file.read()
	for line in lookup.split("\n"):
	if line.strip() != "":
	norm_transcript = line.split("\|")[2]
	wav_path = os.path.join(
	root, el, "wavs", line.split("\|")[0] + ".wav"
	)
	if os.path.exists(wav_path):
	path_to_transcript[wav_path] = norm_transcript
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_nancy(re_cache=False):
	root = "/mount/resources/speech/corpora/NancyKrebs"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = dict()
	with open(os.path.join(root, "metadata.csv"), "r", encoding="utf8") as file:
	lookup = file.read()
	for line in lookup.split("\n"):
	if line.strip() != "":
	norm_transcript = line.split("\|")[1]
	wav_path = os.path.join(root, "wav", line.split("\|")[0] + ".wav")
	if os.path.exists(wav_path):
	path_to_transcript[wav_path] = norm_transcript
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_integration_test(re_cache=True):
	root = "/mount/resources/speech/corpora/NancyKrebs"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = dict()
	with open(os.path.join(root, "metadata.csv"), "r", encoding="utf8") as file:
	lookup = file.read()
	for line in lookup.split("\n")[:500]:
	if line.strip() != "":
	norm_transcript = line.split("\|")[1]
	wav_path = os.path.join(root, "wav", line.split("\|")[0] + ".wav")
	if os.path.exists(wav_path):
	path_to_transcript[wav_path] = norm_transcript
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_CREMA_D(re_cache=False):
	root = "/mount/resources/speech/corpora/CREMA_D/"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	identifier_to_sent = {
	"IEO": "It's eleven o'clock.",
	"TIE": "That is exactly what happened.",
	"IOM": "I'm on my way to the meeting.",
	"IWW": "I wonder what this is about.",
	"TAI": "The airplane is almost full.",
	"MTI": "Maybe tomorrow it will be cold.",
	"IWL": "I would like a new alarm clock.",
	"ITH": "I think, I have a doctor's appointment.",
	"DFA": "Don't forget a jacket.",
	"ITS": "I think, I've seen this before.",
	"TSI": "The surface is slick.",
	"WSI": "We'll stop in a couple of minutes.",
	}
	path_to_transcript = dict()
	for file in os.listdir(root):
	if file.endswith(".wav"):
	path_to_transcript[root + file] = identifier_to_sent[file.split("_")[1]]
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_EmoV_DB(re_cache=False):
	root = "/mount/resources/speech/corpora/EmoV_DB/"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = dict()
	with open(os.path.join(root, "labels.txt"), "r", encoding="utf8") as file:
	lookup = file.read()
	identifier_to_sent = dict()
	for line in lookup.split("\n"):
	if line.strip() != "":
	identifier_to_sent[line.split()[0]] = " ".join(line.split()[1:])
	for file in os.listdir(root):
	if file.endswith(".wav"):
	path_to_transcript[root + file] = identifier_to_sent[file[-14:-10]]
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_ryanspeech(re_cache=False):
	root = "/mount/resources/speech/corpora/RyanSpeech"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript_dict = dict()
	with open(root + "/metadata.csv", mode="r", encoding="utf8") as f:
	transcripts = f.read().split("\n")
	for transcript in transcripts:
	if transcript.strip() != "":
	parsed_line = transcript.split("\|")
	audio_file = f"{root}/wavs/{parsed_line[0]}.wav"
	path_to_transcript_dict[audio_file] = parsed_line[2]
	torch.save(path_to_transcript_dict, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_RAVDESS(re_cache=False):
	root = "/mount/resources/speech/corpora/RAVDESS"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript_dict = dict()
	for speaker_dir in os.listdir(root):
	for audio_file in os.listdir(os.path.join(root, speaker_dir)):
	if audio_file.split("-")[4] == "01":
	path_to_transcript_dict[
	os.path.join(root, speaker_dir, audio_file)
	] = "Kids are talking by the door."
	else:
	path_to_transcript_dict[
	os.path.join(root, speaker_dir, audio_file)
	] = "Dogs are sitting by the door."
	torch.save(path_to_transcript_dict, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_ESDS(re_cache=False):
	root = "/mount/resources/speech/corpora/Emotional_Speech_Dataset_Singapore"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript_dict = dict()
	for speaker_dir in os.listdir(root):
	if speaker_dir.startswith("00"):
	if int(speaker_dir) > 10:
	with open(
	f"{root}/{speaker_dir}/fixed_unicode.txt",
	mode="r",
	encoding="utf8",
	) as f:
	transcripts = f.read()
	for line in (
	transcripts.replace("\n\n", "\n").replace(",", ", ").split("\n")
	):
	if line.strip() != "":
	filename, text, emo_dir = line.split("\t")
	filename = speaker_dir + "_" + filename.split("_")[1]
	path_to_transcript_dict[
	f"{root}/{speaker_dir}/{emo_dir}/{filename}.wav"
	] = text
	torch.save(path_to_transcript_dict, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_nvidia_hifitts(re_cache=False):
	root = "/mount/resources/speech/corpora/hi_fi_tts_v0"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = dict()
	transcripts = list()
	import json

	for jpath in [
	f"{root}/6097_manifest_clean_dev.json",
	f"{root}/6097_manifest_clean_test.json",
	f"{root}/6097_manifest_clean_train.json",
	f"{root}/9017_manifest_clean_dev.json",
	f"{root}/9017_manifest_clean_test.json",
	f"{root}/9017_manifest_clean_train.json",
	f"{root}/92_manifest_clean_dev.json",
	f"{root}/92_manifest_clean_test.json",
	f"{root}/92_manifest_clean_train.json",
	]:
	with open(jpath, encoding="utf-8", mode="r") as jfile:
	for line in jfile.read().split("\n"):
	if line.strip() != "":
	transcripts.append(json.loads(line))
	for transcript in transcripts:
	path = transcript["audio_filepath"]
	norm_text = transcript["text_normalized"]
	path_to_transcript[f"{root}/{path}"] = norm_text
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_blizzard_2013(re_cache=False):
	root = "/mount/resources/speech/corpora/Blizzard2013/train/segmented/"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = dict()
	with open(root + "prompts.gui", encoding="utf8") as f:
	transcriptions = f.read()
	blocks = transcriptions.split("\|\|\n")
	for block in blocks:
	trans_lines = block.split("\n")
	if trans_lines[0].strip() != "":
	transcript = (
	trans_lines[1]
	.replace("@", "")
	.replace("#", ",")
	.replace("\|", "")
	.replace(";", ",")
	.replace(":", ",")
	.replace(" 's", "'s")
	.replace(", ,", ",")
	.replace(" ", " ")
	.replace(" ,", ",")
	.replace(" .", ".")
	.replace(" ?", "?")
	.replace(" !", "!")
	.rstrip(" ,")
	)
	path_to_transcript[root + "wavn/" + trans_lines[0] + ".wav"] = (
	transcript
	)
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_vctk(re_cache=False):
	root = "/mount/resources/speech/corpora/VCTK"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = dict()
	for transcript_dir in os.listdir("/mount/resources/speech/corpora/VCTK/txt"):
	for transcript_file in os.listdir(
	f"/mount/resources/speech/corpora/VCTK/txt/{transcript_dir}"
	):
	if transcript_file.endswith(".txt"):
	with open(
	f"/mount/resources/speech/corpora/VCTK/txt/{transcript_dir}/"
	+ transcript_file,
	"r",
	encoding="utf8",
	) as tf:
	transcript = tf.read()
	wav_path = (
	f"/mount/resources/speech/corpora/VCTK/wav48_silence_trimmed/{transcript_dir}/"
	+ transcript_file.rstrip(".txt")
	+ "_mic2.flac"
	)
	if os.path.exists(wav_path):
	path_to_transcript[wav_path] = transcript
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_libritts_all_clean(re_cache=False):
	root = "/mount/resources/speech/corpora/LibriTTS_R/"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_train = "/mount/resources/speech/corpora/LibriTTS_R/" # using all files from the "clean" subsets from LibriTTS-R https://arxiv.org/abs/2305.18802
	path_to_transcript = dict()
	for speaker in os.listdir(path_train):
	for chapter in os.listdir(os.path.join(path_train, speaker)):
	for file in os.listdir(os.path.join(path_train, speaker, chapter)):
	if file.endswith("normalized.txt"):
	with open(
	os.path.join(path_train, speaker, chapter, file),
	"r",
	encoding="utf8",
	) as tf:
	transcript = tf.read()
	wav_file = file.split(".")[0] + ".wav"
	path_to_transcript[
	os.path.join(path_train, speaker, chapter, wav_file)
	] = transcript
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_libritts_other500(re_cache=False):
	root = "/mount/resources/asr-data/LibriTTS/train-other-500"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_train = "/mount/resources/asr-data/LibriTTS/train-other-500"
	path_to_transcript = dict()
	for speaker in os.listdir(path_train):
	for chapter in os.listdir(os.path.join(path_train, speaker)):
	for file in os.listdir(os.path.join(path_train, speaker, chapter)):
	if file.endswith("normalized.txt"):
	with open(
	os.path.join(path_train, speaker, chapter, file),
	"r",
	encoding="utf8",
	) as tf:
	transcript = tf.read()
	wav_file = file.split(".")[0] + ".wav"
	path_to_transcript[
	os.path.join(path_train, speaker, chapter, wav_file)
	] = transcript
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_ljspeech(re_cache=False):
	root = "/mount/resources/speech/corpora/LJSpeech/"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = dict()
	for transcript_file in os.listdir(
	"/mount/resources/speech/corpora/LJSpeech/16kHz/txt"
	):
	with open(
	"/mount/resources/speech/corpora/LJSpeech/16kHz/txt/" + transcript_file,
	"r",
	encoding="utf8",
	) as tf:
	transcript = tf.read()
	wav_path = (
	"/mount/resources/speech/corpora/LJSpeech/16kHz/wav/"
	+ transcript_file.rstrip(".txt")
	+ ".wav"
	)
	path_to_transcript[wav_path] = transcript
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_jenny(re_cache=False):
	"""
	https://www.kaggle.com/datasets/noml4u/jenny-tts-dataset
	https://github.com/dioco-group/jenny-tts-dataset

	Dataset of Speaker Jenny (Dioco) with an Irish accent
	"""
	root = "/mount/resources/speech/corpora/Jenny/"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = dict()
	with open(
	"/mount/resources/speech/corpora/Jenny/metadata.csv", encoding="utf8"
	) as f:
	transcriptions = f.read()
	trans_lines = transcriptions.split("\n")
	for line in trans_lines:
	if line.strip() != "":
	path_to_transcript[
	"/mount/resources/speech/corpora/Jenny/"
	+ line.split("\|")[0]
	+ "_silence.flac"
	] = line.split("\|")[1]
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	# GERMAN


	def build_path_to_transcript_dict_mls_german(re_cache=False):
	lang = "german"
	root = f"/mount/resources/speech/corpora/MultiLingLibriSpeech/mls_{lang}/train"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = (
	build_path_to_transcript_dict_multi_ling_librispeech_template(root=root)
	)
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_karlsson(re_cache=False):
	root = "/mount/resources/speech/corpora/HUI_German/Karlsson"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = build_path_to_transcript_dict_hui_template(root=root)
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_eva(re_cache=False):
	root = "/mount/resources/speech/corpora/HUI_German/Eva"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = build_path_to_transcript_dict_hui_template(root=root)
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_bernd(re_cache=False):
	root = "/mount/resources/speech/corpora/HUI_German/Bernd"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = build_path_to_transcript_dict_hui_template(root=root)
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_friedrich(re_cache=False):
	root = "/mount/resources/speech/corpora/HUI_German/Friedrich"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = build_path_to_transcript_dict_hui_template(root=root)
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_hokus(re_cache=False):
	root = "/mount/resources/speech/corpora/HUI_German/Hokus"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = build_path_to_transcript_dict_hui_template(root=root)
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_hui_others(re_cache=False):
	root = "/mount/resources/speech/corpora/HUI_German/others"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	pttd = dict()
	for speaker in os.listdir(root):
	pttd.update(
	build_path_to_transcript_dict_hui_template(root=f"{root}/{speaker}")
	)
	torch.save(pttd, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_thorsten_neutral(re_cache=False):
	root = "/mount/resources/speech/corpora/ThorstenDatasets/thorsten-de_v03"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = dict()
	with open(root + "/metadata_train.csv", encoding="utf8") as f:
	transcriptions = f.read()
	with open(root + "/metadata_val.csv", encoding="utf8") as f:
	transcriptions += "\n" + f.read()
	trans_lines = transcriptions.split("\n")
	for line in trans_lines:
	if line.strip() != "":
	path_to_transcript[root + "/wavs/" + line.split("\|")[0] + ".wav"] = (
	line.split("\|")[1]
	)
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_thorsten_2022_10(re_cache=False):
	root = (
	"/mount/resources/speech/corpora/ThorstenDatasets/ThorstenVoice-Dataset_2022.10"
	)
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = dict()
	with open(root + "/metadata_train.csv", encoding="utf8") as f:
	transcriptions = f.read()
	with open(root + "/metadata_dev.csv", encoding="utf8") as f:
	transcriptions += "\n" + f.read()
	with open(root + "/metadata_test.csv", encoding="utf8") as f:
	transcriptions += "\n" + f.read()
	trans_lines = transcriptions.split("\n")
	for line in trans_lines:
	if line.strip() != "":
	path_to_transcript[root + "/wavs/" + line.split("\|")[0] + ".wav"] = (
	line.split("\|")[1]
	)
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_thorsten_emotional(re_cache=False):
	root = "/mount/resources/speech/corpora/ThorstenDatasets/thorsten-emotional_v02"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = dict()
	with open(root + "/thorsten-emotional-metadata.csv", encoding="utf8") as f:
	transcriptions = f.read()
	trans_lines = transcriptions.split("\n")
	for line in trans_lines:
	if line.strip() != "":
	path_to_transcript[root + "/amused/" + line.split("\|")[0] + ".wav"] = (
	line.split("\|")[1]
	)
	path_to_transcript[root + "/angry/" + line.split("\|")[0] + ".wav"] = (
	line.split("\|")[1]
	)
	path_to_transcript[
	root + "/disgusted/" + line.split("\|")[0] + ".wav"
	] = line.split("\|")[1]
	path_to_transcript[root + "/neutral/" + line.split("\|")[0] + ".wav"] = (
	line.split("\|")[1]
	)
	path_to_transcript[root + "/sleepy/" + line.split("\|")[0] + ".wav"] = (
	line.split("\|")[1]
	)
	path_to_transcript[
	root + "/surprised/" + line.split("\|")[0] + ".wav"
	] = line.split("\|")[1]
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	# FRENCH


	def build_path_to_transcript_dict_mls_french(re_cache=False):
	lang = "french"
	root = f"/mount/resources/speech/corpora/MultiLingLibriSpeech/mls_{lang}/train"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = (
	build_path_to_transcript_dict_multi_ling_librispeech_template(root=root)
	)
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_blizzard2023_ad_silence_removed(re_cache=False):
	root = "/mount/resources/speech/corpora/Blizzard2023/AD_silence_removed"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = dict()
	with open(os.path.join(root, "transcript.tsv"), "r", encoding="utf8") as file:
	lookup = file.read()
	for line in lookup.split("\n"):
	if line.strip() != "":
	norm_transcript = line.split("\t")[1]
	wav_path = os.path.join(root, line.split("\t")[0].split("/")[-1])
	if os.path.exists(wav_path):
	path_to_transcript[wav_path] = (
	norm_transcript.replace("§", "")
	.replace("#", "")
	.replace("~", "")
	.replace(" »", '"')
	.replace("« ", '"')
	.replace("»", '"')
	.replace("«", '"')
	)
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_blizzard2023_neb_silence_removed(re_cache=False):
	root = "/mount/resources/speech/corpora/Blizzard2023/NEB_silence_removed"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = dict()
	with open(os.path.join(root, "transcript.tsv"), "r", encoding="utf8") as file:
	lookup = file.read()
	for line in lookup.split("\n"):
	if line.strip() != "":
	norm_transcript = line.split("\t")[1]
	wav_path = os.path.join(root, line.split("\t")[0].split("/")[-1])
	if os.path.exists(wav_path):
	path_to_transcript[wav_path] = (
	norm_transcript.replace("§", "")
	.replace("#", "")
	.replace("~", "")
	.replace(" »", '"')
	.replace("« ", '"')
	.replace("»", '"')
	.replace("«", '"')
	)
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_blizzard2023_neb_e_silence_removed(re_cache=False):
	root = "/mount/resources/speech/corpora/Blizzard2023/enhanced_NEB_subset_silence_removed"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = dict()
	with open(os.path.join(root, "transcript.tsv"), "r", encoding="utf8") as file:
	lookup = file.read()
	for line in lookup.split("\n"):
	if line.strip() != "":
	norm_transcript = line.split("\t")[1]
	wav_path = os.path.join(root, line.split("\t")[0].split("/")[-1])
	if os.path.exists(wav_path):
	path_to_transcript[wav_path] = (
	norm_transcript.replace("§", "")
	.replace("#", "")
	.replace("~", "")
	.replace(" »", '"')
	.replace("« ", '"')
	.replace("»", '"')
	.replace("«", '"')
	)
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_synpaflex_norm_subset(re_cache=False):
	"""
	Contributed by https://github.com/tomschelsen
	"""
	root = "/mount/resources/speech/corpora/synpaflex-corpus/5/v0.1/"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = dict()
	for text_path in glob.iglob(
	os.path.join(root, "*/_norm.txt"), recursive=True
	):
	with open(text_path, "r", encoding="utf8") as file:
	norm_transcript = file.read()
	path_obj = Path(text_path)
	wav_path = str(
	(path_obj.parent.parent / path_obj.name[:-9]).with_suffix(".wav")
	)
	if Path(wav_path).exists():
	path_to_transcript[wav_path] = norm_transcript
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_siwis_subset(re_cache=False):
	"""
	Contributed by https://github.com/tomschelsen
	"""
	root = "/mount/resources/speech/corpora/SiwisFrenchSpeechSynthesisDatabase/"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	# part4 and part5 are not segmented
	sub_dirs = ["part1", "part2", "part3"]
	path_to_transcript = dict()
	for sd in sub_dirs:
	for text_path in glob.iglob(os.path.join(root, "text", sd, "*.txt")):
	with open(text_path, "r", encoding="utf8") as file:
	norm_transcript = file.read()
	path_obj = Path(text_path)
	wav_path = str(
	(
	path_obj.parent.parent.parent / "wavs" / sd / path_obj.stem
	).with_suffix(".wav")
	)
	if Path(wav_path).exists():
	path_to_transcript[wav_path] = norm_transcript
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_css10fr(re_cache=False):
	language = "french"
	root = f"/mount/resources/speech/corpora/CSS10/{language}"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = dict()
	with open(
	f"/mount/resources/speech/corpora/CSS10/{language}/transcript.txt",
	encoding="utf8",
	) as f:
	transcriptions = f.read()
	trans_lines = transcriptions.split("\n")
	for line in trans_lines:
	if line.strip() != "":
	path_to_transcript[
	f"/mount/resources/speech/corpora/CSS10/{language}/{line.split('\|')[0]}"
	] = line.split("\|")[2]
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	# SPANISH


	def build_path_to_transcript_dict_mls_spanish(re_cache=False):
	lang = "spanish"
	root = f"/mount/resources/speech/corpora/MultiLingLibriSpeech/mls_{lang}/train"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = (
	build_path_to_transcript_dict_multi_ling_librispeech_template(root=root)
	)
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_css10es(re_cache=False):
	language = "spanish"
	root = f"/mount/resources/speech/corpora/CSS10/{language}"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = dict()
	with open(
	f"/mount/resources/speech/corpora/CSS10/{language}/transcript.txt",
	encoding="utf8",
	) as f:
	transcriptions = f.read()
	trans_lines = transcriptions.split("\n")
	for line in trans_lines:
	if line.strip() != "":
	path_to_transcript[
	f"/mount/resources/speech/corpora/CSS10/{language}/{line.split('\|')[0]}"
	] = line.split("\|")[2]
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_spanish_blizzard_train(re_cache=False):
	root = "/mount/resources/speech/corpora/Blizzard2021/spanish_blizzard_release_2021_v2/hub"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = dict()
	with open(os.path.join(root, "train_text.txt"), "r", encoding="utf8") as file:
	lookup = file.read()
	for line in lookup.split("\n"):
	if line.strip() != "":
	norm_transcript = line.split("\t")[1]
	wav_path = os.path.join(root, "train_wav", line.split("\t")[0] + ".wav")
	if os.path.exists(wav_path):
	path_to_transcript[wav_path] = norm_transcript
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	# PORTUGUESE


	def build_path_to_transcript_dict_mls_portuguese(re_cache=False):
	lang = "portuguese"
	root = f"/mount/resources/speech/corpora/MultiLingLibriSpeech/mls_{lang}/train"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = (
	build_path_to_transcript_dict_multi_ling_librispeech_template(root=root)
	)
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	# POLISH


	def build_path_to_transcript_dict_mls_polish(re_cache=False):
	lang = "polish"
	root = f"/mount/resources/speech/corpora/MultiLingLibriSpeech/mls_{lang}/train"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = (
	build_path_to_transcript_dict_multi_ling_librispeech_template(root=root)
	)
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	# ITALIAN


	def build_path_to_transcript_dict_mls_italian(re_cache=False):
	lang = "italian"
	root = f"/mount/resources/speech/corpora/MultiLingLibriSpeech/mls_{lang}/train"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = (
	build_path_to_transcript_dict_multi_ling_librispeech_template(root=root)
	)
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	# DUTCH


	def build_path_to_transcript_dict_mls_dutch(re_cache=False):
	lang = "dutch"
	root = f"/mount/resources/speech/corpora/MultiLingLibriSpeech/mls_{lang}/train"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = (
	build_path_to_transcript_dict_multi_ling_librispeech_template(root=root)
	)
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_css10nl(re_cache=False):
	language = "dutch"
	root = f"/mount/resources/speech/corpora/CSS10/{language}"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = dict()
	with open(
	f"/mount/resources/speech/corpora/CSS10/{language}/transcript.txt",
	encoding="utf8",
	) as f:
	transcriptions = f.read()
	trans_lines = transcriptions.split("\n")
	for line in trans_lines:
	if line.strip() != "":
	path_to_transcript[
	f"/mount/resources/speech/corpora/CSS10/{language}/{line.split('\|')[0]}"
	] = line.split("\|")[2]
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	# GREEK


	def build_path_to_transcript_dict_css10el(re_cache=False):
	language = "greek"
	root = f"/mount/resources/speech/corpora/CSS10/{language}"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = dict()
	with open(
	f"/mount/resources/speech/corpora/CSS10/{language}/transcript.txt",
	encoding="utf8",
	) as f:
	transcriptions = f.read()
	trans_lines = transcriptions.split("\n")
	for line in trans_lines:
	if line.strip() != "":
	path_to_transcript[
	f"/mount/resources/speech/corpora/CSS10/{language}/{line.split('\|')[0]}"
	] = line.split("\|")[2]
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	# FINNISH


	def build_path_to_transcript_dict_css10fi(re_cache=False):
	language = "finnish"
	root = f"/mount/resources/speech/corpora/CSS10/{language}"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = dict()
	with open(
	f"/mount/resources/speech/corpora/CSS10/{language}/transcript.txt",
	encoding="utf8",
	) as f:
	transcriptions = f.read()
	trans_lines = transcriptions.split("\n")
	for line in trans_lines:
	if line.strip() != "":
	path_to_transcript[
	f"/mount/resources/speech/corpora/CSS10/{language}/{line.split('\|')[0]}"
	] = line.split("\|")[2]
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	# VIETNAMESE


	def build_path_to_transcript_dict_VIVOS_viet(re_cache=False):
	root = "/mount/resources/speech/corpora/VIVOS_vietnamese/train"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript_dict = dict()
	with open(root + "/prompts.txt", mode="r", encoding="utf8") as f:
	transcripts = f.read().split("\n")
	for transcript in transcripts:
	if transcript.strip() != "":
	parsed_line = transcript.split(" ")
	audio_file = f"{root}/waves/{parsed_line[0][:10]}/{parsed_line[0]}.wav"
	path_to_transcript_dict[audio_file] = " ".join(parsed_line[1:]).lower()
	torch.save(path_to_transcript_dict, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_vietTTS(re_cache=False):
	root = "/mount/resources/speech/corpora/VietTTS"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = dict()
	with open(root + "/meta_data.tsv", encoding="utf8") as f:
	transcriptions = f.read()
	for line in transcriptions.split("\n"):
	if line.strip() != "":
	parsed_line = line.split(".wav")
	audio_path = parsed_line[0]
	transcript = parsed_line[1]
	path_to_transcript[os.path.join(root, audio_path + ".wav")] = (
	transcript.strip()
	)
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	# CHINESE


	def build_path_to_transcript_dict_aishell3(re_cache=False):
	root = "/mount/resources/speech/corpora/aishell3/train"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript_dict = dict()
	with open(root + "/label_train-set.txt", mode="r", encoding="utf8") as f:
	transcripts = f.read().replace("$", "").replace("%", " ").split("\n")
	for transcript in transcripts:
	if transcript.strip() != "" and not transcript.startswith("#"):
	parsed_line = transcript.split("\|")
	audio_file = f"{root}/wav/{parsed_line[0][:7]}/{parsed_line[0]}.wav"
	kanji = parsed_line[2]
	path_to_transcript_dict[audio_file] = kanji
	torch.save(path_to_transcript_dict, cache_path)
	return torch.load(cache_path)


	def build_path_to_transcript_dict_css10cmn(re_cache=False):
	language = "chinese"
	root = f"/mount/resources/speech/corpora/CSS10/{language}"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = dict()
	with open(
	"/mount/resources/speech/corpora/CSS10/chinese/transcript.txt",
	encoding="utf8",
	) as f:
	transcriptions = f.read()
	trans_lines = transcriptions.split("\n")
	for line in trans_lines:
	if line.strip() != "":
	path_to_transcript[
	"/mount/resources/speech/corpora/CSS10/chinese/"
	+ line.split("\|")[0]
	] = line.split("\|")[2]
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	# RUSSIAN


	def build_path_to_transcript_dict_css10ru(re_cache=False):
	language = "russian"
	root = f"/mount/resources/speech/corpora/CSS10/{language}"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = dict()
	with open(
	f"/mount/resources/speech/corpora/CSS10/{language}/transcript.txt",
	encoding="utf8",
	) as f:
	transcriptions = f.read()
	trans_lines = transcriptions.split("\n")
	for line in trans_lines:
	if line.strip() != "":
	path_to_transcript[
	f"/mount/resources/speech/corpora/CSS10/{language}/{line.split('\|')[0]}"
	] = line.split("\|")[2]
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	# HUNGARIAN


	def build_path_to_transcript_dict_css10hu(re_cache=False):
	language = "hungarian"
	root = f"/mount/resources/speech/corpora/CSS10/{language}"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	path_to_transcript = dict()
	language = "hungarian"
	with open(
	f"/mount/resources/speech/corpora/CSS10/{language}/transcript.txt",
	encoding="utf8",
	) as f:
	transcriptions = f.read()
	trans_lines = transcriptions.split("\n")
	for line in trans_lines:
	if line.strip() != "":
	path_to_transcript[
	f"/mount/resources/speech/corpora/CSS10/{language}/{line.split('\|')[0]}"
	] = line.split("\|")[2]
	torch.save(path_to_transcript, cache_path)
	return torch.load(cache_path)


	# OTHER


	def build_file_list_singing_voice_audio_database(re_cache=False):
	root = "/mount/resources/speech/corpora/singing_voice_audio_dataset/monophonic"
	cache_path = os.path.join(root, "pttd_cache.pt")
	if not os.path.exists(cache_path) or re_cache:
	file_list = list()
	for corw in os.listdir(root):
	for singer in os.listdir(os.path.join(root, corw)):
	for audio in os.listdir(os.path.join(root, corw, singer)):
	file_list.append(os.path.join(root, corw, singer, audio))
	torch.save(file_list, cache_path)
	return torch.load(cache_path)


	from pathlib import Path
	import xml.etree.ElementTree as ET
	from csv import DictReader
	import json


	def build_path_to_transcript_dict_nst_norwegian():
	root = "/resources/speech/corpora/NST_norwegian/pcm/cs"
	path_to_transcript = dict()
	audio_paths = sorted(list(Path(root).glob("*.pcm")))
	i = 0
	with open(Path(root, "SCRIPTS/CTTS_core"), encoding="latin-1") as f:
	for line in f:
	transcript = line.strip().replace("\xad", "")
	path = str(audio_paths[i].absolute())
	path_to_transcript[path] = transcript
	i += 1
	return path_to_transcript


	def build_path_to_transcript_dict_nst_swedish():
	root = "/resources/speech/corpora/NST_swedish/sw_pcms"
	path_to_transcript = dict()
	audio_paths = sorted(list(Path(root, "mf").glob("*.pcm")))
	audio_paths.insert(4154, None)
	audio_paths.insert(5144, None)
	i = 0
	with open(Path(root, "scripts/mf/sw_all"), encoding="latin-1") as f:
	for line in f:
	if i == 4154 or i == 5144:
	i += 1
	continue
	transcript = line.strip().replace("\xad", "")
	path = str(audio_paths[i].absolute())
	path_to_transcript[path] = transcript
	i += 1
	return path_to_transcript


	def build_path_to_transcript_dict_nchlt_afr():
	root = "/resources/speech/corpora/nchlt_afr"
	return build_path_to_transcript_dict_nchlt_template(root, lang_code="afr")


	def build_path_to_transcript_dict_nchlt_nbl():
	root = "/resources/speech/corpora/nchlt_nbl"
	return build_path_to_transcript_dict_nchlt_template(root, lang_code="nbl")


	def build_path_to_transcript_dict_nchlt_nso():
	root = "/resources/speech/corpora/nchlt_nso"
	return build_path_to_transcript_dict_nchlt_template(root, lang_code="nso")


	def build_path_to_transcript_dict_nchlt_sot():
	root = "/resources/speech/corpora/nchlt_sot"
	return build_path_to_transcript_dict_nchlt_template(root, lang_code="sot")


	def build_path_to_transcript_dict_nchlt_ssw():
	root = "/resources/speech/corpora/nchlt_ssw"
	return build_path_to_transcript_dict_nchlt_template(root, lang_code="ssw")


	def build_path_to_transcript_dict_nchlt_tsn():
	root = "/resources/speech/corpora/nchlt_tsn"
	return build_path_to_transcript_dict_nchlt_template(root, lang_code="tsn")


	def build_path_to_transcript_dict_nchlt_tso():
	root = "/resources/speech/corpora/nchlt_tso"
	return build_path_to_transcript_dict_nchlt_template(root, lang_code="tso")


	def build_path_to_transcript_dict_nchlt_ven():
	root = "/resources/speech/corpora/nchlt_ven"
	return build_path_to_transcript_dict_nchlt_template(root, lang_code="ven")


	def build_path_to_transcript_dict_nchlt_xho():
	root = "/resources/speech/corpora/nchlt_xho"
	return build_path_to_transcript_dict_nchlt_template(root, lang_code="xho")


	def build_path_to_transcript_dict_nchlt_zul():
	root = "/resources/speech/corpora/nchlt_zul"
	return build_path_to_transcript_dict_nchlt_template(root, lang_code="zul")


	def build_path_to_transcript_dict_nchlt_template(root, lang_code):
	path_to_transcript = dict()
	base_dir = Path(root).parent

	for split in ["trn", "tst"]:
	tree = ET.parse(f"{root}/transcriptions/nchlt_{lang_code}.{split}.xml")
	tree_root = tree.getroot()
	for rec in tree_root.iter("recording"):
	transcript = rec.find("orth").text
	if "[s]" in transcript:
	continue
	path = str(base_dir / rec.get("audio"))
	path_to_transcript[path] = transcript

	return path_to_transcript


	def build_path_to_transcript_dict_bibletts_akuapem_twi():
	path_to_transcript = dict()
	root = "/resources/speech/corpora/BibleTTS/akuapem-twi"
	for split in ["train", "dev", "test"]:
	for book in Path(root, split).glob("*"):
	for textfile in book.glob("*.txt"):
	with open(textfile, "r", encoding="utf-8") as f:
	text = " ".join(
	[line.strip() for line in f]
	) # should usually be only one line anyway
	path_to_transcript[textfile.with_suffix(".flac")] = text

	return path_to_transcript


	def build_path_to_transcript_dict_bembaspeech():
	root = "/resources/speech/corpora/BembaSpeech/bem"
	path_to_transcript = dict()

	for split in ["train", "dev", "test"]:
	with open(Path(root, f"{split}.tsv"), "r", encoding="utf-8") as f:
	reader = DictReader(f, delimiter="\t")
	for row in reader:
	path_to_transcript[str(Path(root, "audio", row["audio"]))] = row[
	"sentence"
	]

	return path_to_transcript


	def build_path_to_transcript_dict_alffa_sw():
	root = "/resources/speech/corpora/ALFFA/data_broadcastnews_sw/data"

	path_to_transcript = build_path_to_transcript_dict_kaldi_template(
	root=root, split="train", replace_in_path=("asr_swahili/data/", "")
	)
	path_to_transcript.update(
	build_path_to_transcript_dict_kaldi_template(
	root=root, split="test", replace_in_path=("/my_dir/wav", "test/wav5")
	)
	)
	return path_to_transcript


	def build_path_to_transcript_dict_alffa_am():
	root = "/resources/speech/corpora/ALFFA/data_readspeech_am/data"

	path_to_transcript = build_path_to_transcript_dict_kaldi_template(
	root=root, split="train", replace_in_path=("/home/melese/kaldi/data/", "")
	)
	path_to_transcript.update(
	build_path_to_transcript_dict_kaldi_template(
	root=root, split="test", replace_in_path=("/home/melese/kaldi/data/", "")
	)
	)

	return path_to_transcript


	def build_path_to_transcript_dict_alffa_wo():
	root = "/resources/speech/corpora/ALFFA/data_readspeech_wo/data"
	path_to_transcript = dict()

	for split in ["train", "dev", "test"]:
	with open(Path(root, split, "text"), "r", encoding="utf-8") as f:
	for line in f:
	line = line.strip().split()
	file = line[0]
	text = " ".join(line[1:])
	number = file.split("_")[1]
	path_to_transcript[str(Path(root, split, number, f"{file}.wav"))] = text

	return path_to_transcript


	def build_path_to_transcript_dict_malayalam():
	root = "/resources/speech/corpora/malayalam"
	path_to_transcript = dict()

	for gender in ["female", "male"]:
	with open(Path(root, f"line_index_{gender}.tsv"), "r", encoding="utf-8") as f:
	for line in f:
	file, text = line.strip().split("\t")
	path_to_transcript[str(Path(root, gender, f"{file}.wav"))] = text

	return path_to_transcript


	def build_path_to_transcript_dict_msc():
	root = "/resources/speech/corpora/msc_reviewed_speech"
	path_to_transcript = dict()

	with open(Path(root, f"metadata.tsv"), "r", encoding="utf-8") as f:
	reader = DictReader(f, delimiter="\t")
	for row in reader:
	path_to_transcript[str(Path(root, row["speechpath"]))] = row["transcript"]

	return path_to_transcript


	def build_path_to_transcript_dict_chuvash():
	root = "/resources/speech/corpora/chuvash"
	path_to_transcript = dict()

	for textfile in Path(root, "transcripts", "txt").glob("*.txt"):
	with open(textfile, "r", encoding="utf-8") as f:
	for line in f:
	line = line.strip().split()
	text = " ".join(line[1:]).replace("«", "").replace("»", "")
	path = Path(
	root, "audio", "split", f"trim_clean_{textfile.stem}.{line[0]}.flac"
	)
	if path.exists():
	path_to_transcript[str(path)] = text

	return path_to_transcript


	def build_path_to_transcript_dict_iban():
	root = "/resources/speech/corpora/iban/data"
	path_to_transcript = build_path_to_transcript_dict_kaldi_template(
	root, "train", replace_in_path=("asr_iban/data/", "")
	)
	path_to_transcript.update(
	build_path_to_transcript_dict_kaldi_template(
	root, "dev", replace_in_path=("asr_iban/data/", "")
	)
	)
	return path_to_transcript


	def build_path_to_transcript_dict_kaldi_template(root, split, replace_in_path=None):
	path_to_transcript = dict()

	wav_scp = {}
	with open(Path(root, split, "wav.scp"), "r") as f:
	for line in f:
	wav_id, wav_path = line.split()
	if replace_in_path:
	wav_path = wav_path.replace(replace_in_path[0], replace_in_path[1])
	wav_scp[wav_id] = str(Path(root, wav_path))

	with open(Path(root, split, "text"), "r", encoding="utf-8") as f:
	for line in f:
	line = line.split()
	wav_id = line[0]
	text = " ".join(line[1:])
	if "<" in text: # ignore all <UNK> utterance etc.
	continue
	path_to_transcript[wav_scp[wav_id]] = text

	return path_to_transcript


	def build_path_to_transcript_dict_sundanese_speech():
	root = "/resources/speech/corpora/sundanese_speech/asr_sundanese"
	return build_path_to_transcript_dict_south_asian_languages_template(root)


	def build_path_to_transcript_dict_sinhala_speech():
	root = "/resources/speech/corpora/sinhala_speech/asr_sinhala"
	return build_path_to_transcript_dict_south_asian_languages_template(root)


	def build_path_to_transcript_dict_bengali_speech():
	root = "/resources/speech/corpora/bengali_speech/asr_bengali"
	return build_path_to_transcript_dict_south_asian_languages_template(root)


	def build_path_to_transcript_dict_nepali_speech():
	root = "/resources/speech/corpora/nepali_speech/asr_nepali"
	return build_path_to_transcript_dict_south_asian_languages_template(root)


	def build_path_to_transcript_dict_javanese_speech():
	root = "/resources/speech/corpora/javanese_speech/asr_javanese"
	return build_path_to_transcript_dict_south_asian_languages_template(root)


	def build_path_to_transcript_dict_south_asian_languages_template(root):
	path_to_transcript = dict()

	with open(Path(root, "utt_spk_text.tsv"), "r", encoding="utf-8") as f:
	for line in f:
	utt, spk, text = line.strip().split("\t")
	dir_tag = utt[:2]
	path_to_transcript[str(Path(root, "data", dir_tag, f"{utt}.flac"))] = text

	return path_to_transcript


	def build_path_to_transcript_dict_african_voices_kenyan_afv():
	root = "/resources/speech/corpora/AfricanVoices/afv_enke"
	return build_path_to_transcript_dict_african_voices_template(root)


	def build_path_to_transcript_dict_african_voices_fon_alf():
	root = "/resources/speech/corpora/AfricanVoices/fon_alf"
	return build_path_to_transcript_dict_african_voices_template(root)


	def build_path_to_transcript_dict_african_voices_hausa_cmv():
	main_root = "/resources/speech/corpora/AfricanVoices"
	path_to_transcript = build_path_to_transcript_dict_african_voices_template(
	f"{main_root}/hau_cmv_f"
	)
	path_to_transcript.update(
	build_path_to_transcript_dict_african_voices_template(f"{main_root}/hau_cmv_m")
	)
	return path_to_transcript


	def build_path_to_transcript_dict_african_voices_ibibio_lst():
	root = "/resources/speech/corpora/AfricanVoices/ibb_lst"
	return build_path_to_transcript_dict_african_voices_template(root)


	def build_path_to_transcript_dict_african_voices_kikuyu_opb():
	root = "/resources/speech/corpora/AfricanVoices/kik_opb"
	return build_path_to_transcript_dict_african_voices_template(root)


	def build_path_to_transcript_dict_african_voices_lingala_opb():
	root = "/resources/speech/corpora/AfricanVoices/lin_opb"
	return build_path_to_transcript_dict_african_voices_template(root)


	def build_path_to_transcript_dict_african_voices_ganda_cmv():
	root = "/resources/speech/corpora/AfricanVoices/lug_cmv"
	return build_path_to_transcript_dict_african_voices_template(root)


	def build_path_to_transcript_dict_african_voices_luo_afv():
	root = "/resources/speech/corpora/AfricanVoices/luo_afv"
	return build_path_to_transcript_dict_african_voices_template(root)


	def build_path_to_transcript_dict_african_voices_luo_opb():
	root = "/resources/speech/corpora/AfricanVoices/luo_opb"
	return build_path_to_transcript_dict_african_voices_template(root)


	def build_path_to_transcript_dict_african_voices_swahili_llsti():
	root = "/resources/speech/corpora/AfricanVoices/swa_llsti"
	return build_path_to_transcript_dict_african_voices_template(root)


	def build_path_to_transcript_dict_african_voices_suba_afv():
	root = "/resources/speech/corpora/AfricanVoices/sxb_afv"
	return build_path_to_transcript_dict_african_voices_template(root)


	def build_path_to_transcript_dict_african_voices_wolof_alf():
	root = "/resources/speech/corpora/AfricanVoices/wol_alf"
	return build_path_to_transcript_dict_african_voices_template(root)


	def build_path_to_transcript_dict_african_voices_yoruba_opb():
	root = "/resources/speech/corpora/AfricanVoices/yor_opb"
	return build_path_to_transcript_dict_african_voices_template(root)


	def build_path_to_transcript_dict_african_voices_template(root):
	path_to_transcript = dict()

	with open(Path(root, "txt.done.data"), "r", encoding="utf-8") as f:
	for line in f:
	line = line.replace('\\"', "'").split('"')
	text = line[1]
	file = line[0].split()[-1]
	path_to_transcript[str(Path(root, "wav", f"{file}.wav"))] = text

	return path_to_transcript


	def build_path_to_transcript_dict_zambezi_voice_nyanja():
	root = "/resources/speech/corpora/ZambeziVoice/nyanja/nya"
	return build_path_to_transcript_dict_zambezi_voice_template(root)


	def build_path_to_transcript_dict_zambezi_voice_lozi():
	root = "/resources/speech/corpora/ZambeziVoice/lozi/loz"
	return build_path_to_transcript_dict_zambezi_voice_template(root)


	def build_path_to_transcript_dict_zambezi_voice_tonga():
	root = "/resources/speech/corpora/ZambeziVoice/tonga/toi"
	return build_path_to_transcript_dict_zambezi_voice_template(root)


	def build_path_to_transcript_dict_zambezi_voice_template(root):
	path_to_transcript = dict()

	for split in ["train", "dev", "test"]:
	with open(Path(root, f"{split}.tsv"), "r", encoding="utf-8") as f:
	reader = DictReader(f, delimiter="\t")
	for row in reader:
	path_to_transcript[str(Path(root, "audio", row["audio_id"]))] = row[
	"sentence"
	].strip()

	return path_to_transcript


	def build_path_to_transcript_dict_fleurs_template(root):
	path_to_transcript = dict()

	for split in ["train", "dev", "test"]:
	with open(Path(root, f"{split}.tsv"), "r", encoding="utf-8") as f:
	reader = DictReader(
	f,
	delimiter="\t",
	fieldnames=[
	"id",
	"filename",
	"transcription_raw",
	"transcription",
	"words",
	"speaker",
	"gender",
	],
	)
	for row in reader:
	path_to_transcript[str(Path(root, "audio", split, row["filename"]))] = (
	row["transcription_raw"].strip()
	)

	return path_to_transcript


	def build_path_to_transcript_dict_fleurs_afrikaans():
	root = "/resources/speech/corpora/fleurs/af_za"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_amharic():
	root = "/resources/speech/corpora/fleurs/am_et"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_arabic():
	root = "/resources/speech/corpora/fleurs/ar_eg"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_assamese():
	root = "/resources/speech/corpora/fleurs/as_in"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_asturian():
	root = "/resources/speech/corpora/fleurs/ast_es"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_azerbaijani():
	root = "/resources/speech/corpora/fleurs/az_az"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_belarusian():
	root = "/resources/speech/corpora/fleurs/be_by"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_bulgarian():
	root = "/resources/speech/corpora/fleurs/bg_bg"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_bengali():
	root = "/resources/speech/corpora/fleurs/bn_in"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_bosnian():
	root = "/resources/speech/corpora/fleurs/bs_ba"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_catalan():
	root = "/resources/speech/corpora/fleurs/ca_es"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_cebuano():
	root = "/resources/speech/corpora/fleurs/ceb_ph"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_sorani_kurdish():
	root = "/resources/speech/corpora/fleurs/ckb_iq"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_mandarin():
	root = "/resources/speech/corpora/fleurs/cmn_hans_cn"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_czech():
	root = "/resources/speech/corpora/fleurs/cs_cz"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_welsh():
	root = "/resources/speech/corpora/fleurs/cy_gb"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_danish():
	root = "/resources/speech/corpora/fleurs/da_dk"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_german():
	root = "/resources/speech/corpora/fleurs/de_de"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_greek():
	root = "/resources/speech/corpora/fleurs/el_gr"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_english():
	root = "/resources/speech/corpora/fleurs/en_us"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_spanish():
	root = "/resources/speech/corpora/fleurs/es_419"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_estonian():
	root = "/resources/speech/corpora/fleurs/et_ee"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_persian():
	root = "/resources/speech/corpora/fleurs/fa_ir"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_fula():
	root = "/resources/speech/corpora/fleurs/ff_sn"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_finnish():
	root = "/resources/speech/corpora/fleurs/fi_fi"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_filipino():
	root = "/resources/speech/corpora/fleurs/fil_ph"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_french():
	root = "/resources/speech/corpora/fleurs/fr_fr"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_irish():
	root = "/resources/speech/corpora/fleurs/ga_ie"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_galician():
	root = "/resources/speech/corpora/fleurs/gl_es"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_gujarati():
	root = "/resources/speech/corpora/fleurs/gu_in"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_hausa():
	root = "/resources/speech/corpora/fleurs/ha_ng"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_hebrew():
	root = "/resources/speech/corpora/fleurs/he_il"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_hindi():
	root = "/resources/speech/corpora/fleurs/hi_in"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_croatian():
	root = "/resources/speech/corpora/fleurs/hr_hr"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_hungarian():
	root = "/resources/speech/corpora/fleurs/hu_hu"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_armenian():
	root = "/resources/speech/corpora/fleurs/hy_am"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_indonesian():
	root = "/resources/speech/corpora/fleurs/id_id"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_igbo():
	root = "/resources/speech/corpora/fleurs/ig_ng"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_icelandic():
	root = "/resources/speech/corpora/fleurs/is_is"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_italian():
	root = "/resources/speech/corpora/fleurs/it_it"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_japanese():
	root = "/resources/speech/corpora/fleurs/ja_jp"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_javanese():
	root = "/resources/speech/corpora/fleurs/jv_id"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_georgian():
	root = "/resources/speech/corpora/fleurs/ka_ge"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_kamba():
	root = "/resources/speech/corpora/fleurs/kam_ke"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_kabuverdianu():
	root = "/resources/speech/corpora/fleurs/kea_cv"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_kazakh():
	root = "/resources/speech/corpora/fleurs/kk_kz"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_khmer():
	root = "/resources/speech/corpora/fleurs/km_kh"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_kannada():
	root = "/resources/speech/corpora/fleurs/kn_in"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_korean():
	root = "/resources/speech/corpora/fleurs/ko_kr"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_kyrgyz():
	root = "/resources/speech/corpora/fleurs/ky_kg"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_luxembourgish():
	root = "/resources/speech/corpora/fleurs/lb_lu"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_ganda():
	root = "/resources/speech/corpora/fleurs/lg_ug"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_lingala():
	root = "/resources/speech/corpora/fleurs/ln_cd"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_lao():
	root = "/resources/speech/corpora/fleurs/lo_la"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_lithuanian():
	root = "/resources/speech/corpora/fleurs/lt_lt"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_luo():
	root = "/resources/speech/corpora/fleurs/luo_ke"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_latvian():
	root = "/resources/speech/corpora/fleurs/lv_lv"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_maori():
	root = "/resources/speech/corpora/fleurs/mi_nz"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_macedonian():
	root = "/resources/speech/corpora/fleurs/mk_mk"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_malayalam():
	root = "/resources/speech/corpora/fleurs/ml_in"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_mongolian():
	root = "/resources/speech/corpora/fleurs/mn_mn"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_marathi():
	root = "/resources/speech/corpora/fleurs/mr_in"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_malay():
	root = "/resources/speech/corpora/fleurs/ms_my"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_maltese():
	root = "/resources/speech/corpora/fleurs/mt_mt"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_burmese():
	root = "/resources/speech/corpora/fleurs/my_mm"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_norwegian():
	root = "/resources/speech/corpora/fleurs/nb_no"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_nepali():
	root = "/resources/speech/corpora/fleurs/ne_np"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_dutch():
	root = "/resources/speech/corpora/fleurs/nl_nl"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_northern_sotho():
	root = "/resources/speech/corpora/fleurs/nso_za"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_nyanja():
	root = "/resources/speech/corpora/fleurs/ny_mw"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_occitan():
	root = "/resources/speech/corpora/fleurs/oc_fr"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_oroma():
	root = "/resources/speech/corpora/fleurs/om_et"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_oriya():
	root = "/resources/speech/corpora/fleurs/or_in"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_punjabi():
	root = "/resources/speech/corpora/fleurs/pa_in"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_polish():
	root = "/resources/speech/corpora/fleurs/pl_pl"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_pashto():
	root = "/resources/speech/corpora/fleurs/ps_af"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_portuguese():
	root = "/resources/speech/corpora/fleurs/pt_br"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_romanian():
	root = "/resources/speech/corpora/fleurs/ro_ro"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_russian():
	root = "/resources/speech/corpora/fleurs/ru_ru"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_sindhi():
	root = "/resources/speech/corpora/fleurs/sd_in"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_slovak():
	root = "/resources/speech/corpora/fleurs/sk_sk"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_slovenian():
	root = "/resources/speech/corpora/fleurs/sl_si"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_shona():
	root = "/resources/speech/corpora/fleurs/sn_zw"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_somali():
	root = "/resources/speech/corpora/fleurs/so_so"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_serbian():
	root = "/resources/speech/corpora/fleurs/sr_rs"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_swedish():
	root = "/resources/speech/corpora/fleurs/sv_se"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_swahili():
	root = "/resources/speech/corpora/fleurs/sw_ke"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_tamil():
	root = "/resources/speech/corpora/fleurs/ta_in"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_telugu():
	root = "/resources/speech/corpora/fleurs/te_in"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_tajik():
	root = "/resources/speech/corpora/fleurs/tg_tj"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_thai():
	root = "/resources/speech/corpora/fleurs/th_th"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_turkish():
	root = "/resources/speech/corpora/fleurs/tr_tr"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_ukrainian():
	root = "/resources/speech/corpora/fleurs/uk_ua"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_umbundu():
	root = "/resources/speech/corpora/fleurs/umb_ao"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_urdu():
	root = "/resources/speech/corpora/fleurs/ur_pk"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_uzbek():
	root = "/resources/speech/corpora/fleurs/uz_uz"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_vietnamese():
	root = "/resources/speech/corpora/fleurs/vi_vn"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_wolof():
	root = "/resources/speech/corpora/fleurs/wo_sn"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_xhosa():
	root = "/resources/speech/corpora/fleurs/xh_za"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_yoruba():
	root = "/resources/speech/corpora/fleurs/yo_ng"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_cantonese():
	root = "/resources/speech/corpora/fleurs/yue_hant_hk"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_fleurs_zulu():
	root = "/resources/speech/corpora/fleurs/zu_za"
	return build_path_to_transcript_dict_fleurs_template(root)


	def build_path_to_transcript_dict_living_audio_dataset_template(root):
	path_to_transcript = dict()
	tree = ET.parse(f"{root}/text.xml")
	tree_root = tree.getroot()

	for rec in tree_root.iter("recording_script"):
	for file in rec.iter("fileid"):
	path_to_transcript[
	str(Path(root, "48000_orig", f'{file.get("id")}.wav'))
	] = file.text.strip()

	return path_to_transcript


	def build_path_to_transcript_dict_living_audio_dataset_irish():
	root = "/resources/speech/corpora/LivingAudioDataset/ga"
	return build_path_to_transcript_dict_living_audio_dataset_template(root)


	def build_path_to_transcript_dict_living_audio_dataset_dutch():
	root = "/resources/speech/corpora/LivingAudioDataset/nl"
	return build_path_to_transcript_dict_living_audio_dataset_template(root)


	def build_path_to_transcript_dict_living_audio_dataset_russian():
	root = "/resources/speech/corpora/LivingAudioDataset/ru"
	return build_path_to_transcript_dict_living_audio_dataset_template(root)


	def build_path_to_transcript_dict_romanian_db():
	root = "/resources/speech/corpora/RomanianDB"
	path_to_transcript = dict()

	for split in ["training", "testing", "elena", "georgiana"]:
	for transcript in Path(root, split, "text").glob("*.txt"):
	subset = transcript.stem
	with open(transcript, "r", encoding="utf-8") as f:
	for line in f:
	fileid = line.strip()[:2]
	if len(fileid) == 2:
	fileid = "0" + fileid
	text = line.strip()[5:]
	if split == "elena":
	path = f"ele_{subset}_{fileid}.wav"
	elif split == "georgiana":
	path = f"geo_{subset}_{fileid}.wav"
	else:
	path = f"adr_{subset}_{fileid}.wav"
	path_to_transcript[str(Path(root, split, "wav", subset, path))] = (
	text
	)

	return path_to_transcript


	def build_path_to_transcript_dict_shemo():
	root = "/resources/speech/corpora/ShEMO"
	path_to_transcript = dict()

	with open("/resources/speech/corpora/ShEMO/shemo.json", "r", encoding="utf-8") as f:
	data = json.load(f)

	for fileid, file_info in data.items():
	path = Path(root, file_info["gender"], f"{fileid}.wav")
	if path.exists():
	path_to_transcript[str(path)] = file_info["transcript"]

	return path_to_transcript


	def build_path_to_transcript_dict_mslt_template(root, lang="en"):
	path_to_transcript = dict()

	for split in Path(root).glob("*"):
	if split.is_dir():
	for audio_file in split.glob("*.wav"):
	text_file = str(audio_file).replace(f"T0.{lang}.wav", f"T1.{lang}.snt")
	with open(text_file, "r", encoding="utf-16") as f:
	for line in f:
	text = line.strip() # should have only one line
	if "<" in text or "[" in text:
	# ignore all utterances with special parts like [laughter] or <UNIN/>
	continue
	path_to_transcript[str(audio_file)] = text
	break

	return path_to_transcript


	def build_path_to_transcript_dict_mslt_english():
	root = "/resources/speech/corpora/MSLT/Data/EN"
	return build_path_to_transcript_dict_mslt_template(root, lang="en")


	def build_path_to_transcript_dict_mslt_japanese():
	root = "/resources/speech/corpora/MSLT/Data/JA"
	return build_path_to_transcript_dict_mslt_template(root, lang="jp")


	def build_path_to_transcript_dict_mslt_chinese():
	root = "/resources/speech/corpora/MSLT/Data/ZH"
	return build_path_to_transcript_dict_mslt_template(root, lang="ch")


	def build_path_to_transcript_dict_rajasthani_hindi_speech():
	root = "/resources/speech/corpora/Rajasthani_Hindi_Speech/Hindi-Speech-Data"
	path_to_transcript = dict()

	for audio_file in Path(root).glob("*.3gp"):
	with open(audio_file.with_suffix(".txt"), "r", encoding="utf-8") as f:
	for line in f: # should only be one line
	text = line.strip()
	path_to_transcript[str(audio_file)] = text

	return path_to_transcript


	def build_path_to_transcript_dict_cmu_arctic():
	root = "/resources/speech/corpora/cmu_arctic"
	path_to_transcript = dict()

	for speaker_dir in Path(root).glob("*"):
	if speaker_dir.is_dir():
	with open(
	Path(speaker_dir, "etc", "txt.done.data"), "r", encoding="utf-8"
	) as f:
	for line in f:
	line = line.replace('\\"', "'").split('"')
	text = line[1]
	file = line[0].split()[-1]
	path_to_transcript[str(Path(speaker_dir, "wav", f"{file}.wav"))] = (
	text
	)

	return path_to_transcript


	def build_path_to_transcript_dict_sevil_tatar():
	root = "/resources/speech/corpora/sevil_tatar/sevil"
	path_to_transcript = dict()

	with open(Path(root, "metadata.jsonl"), "r", encoding="utf-8") as f:
	for line in f:
	meta = json.loads(line)
	path_to_transcript[str(Path(root, meta["file"]))] = (
	meta["orig_text"].strip().replace("\xad", "")
	)

	return path_to_transcript


	def build_path_to_transcript_dict_clartts():
	root = "/resources/speech/corpora/ClArTTS"
	path_to_transcript = dict()

	with open(Path(root, "training.txt"), "r", encoding="utf-16") as f:
	for line in f:
	fileid, transcript = line.strip().split("\|")
	path_to_transcript[str(Path(root, "wav", "train", f"{fileid}.wav"))] = (
	transcript
	)

	with open(Path(root, "validation.txt"), "r", encoding="utf-16") as f:
	for line in f:
	fileid, transcript = line.strip().split("\|")
	path_to_transcript[str(Path(root, "wav", "val", f"{fileid}.wav"))] = (
	transcript
	)

	return path_to_transcript


	def build_path_to_transcript_dict_snow_mountain_template(root, lang):
	path_to_transcript = dict()

	for split in ["train_full", "val_full", "test_common"]:
	with open(
	Path(root, "experiments", lang, f"{split}.csv"), "r", encoding="utf-8"
	) as f:
	reader = DictReader(f, delimiter=",")
	for row in reader:
	path = row["path"].replace("data/", f"{root}/")
	path_to_transcript[path] = row["sentence"].strip()

	return path_to_transcript


	def build_path_to_transcript_dict_snow_mountain_bhadrawahi():
	root = "/resources/speech/corpora/snow_mountain"
	language = "bhadrawahi"
	return build_path_to_transcript_dict_snow_mountain_template(root, language)


	def build_path_to_transcript_dict_snow_mountain_bilaspuri():
	root = "/resources/speech/corpora/snow_mountain"
	language = "bilaspuri"
	return build_path_to_transcript_dict_snow_mountain_template(root, language)


	def build_path_to_transcript_dict_snow_mountain_dogri():
	root = "/resources/speech/corpora/snow_mountain"
	language = "dogri"
	return build_path_to_transcript_dict_snow_mountain_template(root, language)


	def build_path_to_transcript_dict_snow_mountain_gaddi():
	root = "/resources/speech/corpora/snow_mountain"
	language = "gaddi"
	return build_path_to_transcript_dict_snow_mountain_template(root, language)


	def build_path_to_transcript_dict_snow_mountain_haryanvi():
	root = "/resources/speech/corpora/snow_mountain"
	language = "haryanvi"
	return build_path_to_transcript_dict_snow_mountain_template(root, language)


	def build_path_to_transcript_dict_snow_mountain_hindi():
	root = "/resources/speech/corpora/snow_mountain"
	language = "hindi"
	return build_path_to_transcript_dict_snow_mountain_template(root, language)


	def build_path_to_transcript_dict_snow_mountain_kangri():
	root = "/resources/speech/corpora/snow_mountain"
	language = "kangri"
	return build_path_to_transcript_dict_snow_mountain_template(root, language)


	def build_path_to_transcript_dict_snow_mountain_kannada():
	root = "/resources/speech/corpora/snow_mountain"
	language = "kannada"
	return build_path_to_transcript_dict_snow_mountain_template(root, language)


	def build_path_to_transcript_dict_snow_mountain_kulvi():
	root = "/resources/speech/corpora/snow_mountain"
	language = "kulvi"
	return build_path_to_transcript_dict_snow_mountain_template(root, language)


	def build_path_to_transcript_dict_snow_mountain_kulvi_outer_seraji():
	root = "/resources/speech/corpora/snow_mountain"
	language = "kulvi_outer_seraji"
	return build_path_to_transcript_dict_snow_mountain_template(root, language)


	def build_path_to_transcript_dict_snow_mountain_malayalam():
	root = "/resources/speech/corpora/snow_mountain"
	language = "malayalam"
	return build_path_to_transcript_dict_snow_mountain_template(root, language)


	def build_path_to_transcript_dict_snow_mountain_mandeali():
	root = "/resources/speech/corpora/snow_mountain"
	language = "mandeali"
	return build_path_to_transcript_dict_snow_mountain_template(root, language)


	def build_path_to_transcript_dict_snow_mountain_pahari_mahasui():
	root = "/resources/speech/corpora/snow_mountain"
	language = "pahari_mahasui"
	return build_path_to_transcript_dict_snow_mountain_template(root, language)


	def build_path_to_transcript_dict_snow_mountain_tamil():
	root = "/resources/speech/corpora/snow_mountain"
	language = "tamil"
	return build_path_to_transcript_dict_snow_mountain_template(root, language)


	def build_path_to_transcript_dict_snow_mountain_telugu():
	root = "/resources/speech/corpora/snow_mountain"
	language = "telugu"
	return build_path_to_transcript_dict_snow_mountain_template(root, language)


	def build_path_to_transcript_dict_ukrainian_lada():
	root = "/resources/speech/corpora/ukrainian_lada/dataset_lada/accept"
	path_to_transcript = dict()

	with open(Path(root, "metadata.jsonl"), "r", encoding="utf-8") as f:
	for line in f:
	meta = json.loads(line)
	path_to_transcript[str(Path(root, meta["file"]).with_suffix(".wav"))] = (
	meta["orig_text"].strip().replace("\xad", "")
	)

	return path_to_transcript


	def build_path_to_transcript_dict_m_ailabs_template(root):
	path_to_transcript = dict()

	for gender_dir in Path(root).glob("*"):
	if not gender_dir.is_dir():
	continue
	for speaker_dir in gender_dir.glob("*"):
	if not speaker_dir.is_dir():
	continue
	if (speaker_dir / "wavs").exists():
	with open(
	Path(speaker_dir, "metadata.csv"), "r", encoding="utf-8"
	) as f:
	for line in f:
	fileid, text, text_norm = line.strip().split("\|")
	path = Path(speaker_dir, "wavs", f"{fileid}.wav")
	if path.exists():
	path_to_transcript[str(path)] = text_norm
	else:

	for session_dir in speaker_dir.glob("*"):
	if not session_dir.is_dir():
	continue
	with open(
	Path(session_dir, "metadata.csv"), "r", encoding="utf-8"
	) as f:
	for line in f:
	fileid, text, text_norm = line.strip().split("\|")
	path = Path(session_dir, "wavs", f"{fileid}.wav")
	if path.exists():
	path_to_transcript[str(path)] = text_norm

	return path_to_transcript


	def build_path_to_transcript_dict_m_ailabs_german():
	root = "/resources/speech/corpora/m-ailabs-speech/de_DE"
	return build_path_to_transcript_dict_m_ailabs_template(root)


	def build_path_to_transcript_dict_m_ailabs_uk_english():
	root = "/resources/speech/corpora/m-ailabs-speech/en_UK"
	return build_path_to_transcript_dict_m_ailabs_template(root)


	def build_path_to_transcript_dict_m_ailabs_us_english():
	root = "/resources/speech/corpora/m-ailabs-speech/en_US"
	return build_path_to_transcript_dict_m_ailabs_template(root)


	def build_path_to_transcript_dict_m_ailabs_spanish():
	root = "/resources/speech/corpora/m-ailabs-speech/es_ES"
	return build_path_to_transcript_dict_m_ailabs_template(root)


	def build_path_to_transcript_dict_m_ailabs_french():
	root = "/resources/speech/corpora/m-ailabs-speech/fr_FR"
	return build_path_to_transcript_dict_m_ailabs_template(root)


	def build_path_to_transcript_dict_m_ailabs_italian():
	root = "/resources/speech/corpora/m-ailabs-speech/it_IT"
	return build_path_to_transcript_dict_m_ailabs_template(root)


	def build_path_to_transcript_dict_m_ailabs_polish():
	root = "/resources/speech/corpora/m-ailabs-speech/pl_PL"
	return build_path_to_transcript_dict_m_ailabs_template(root)


	def build_path_to_transcript_dict_m_ailabs_russian():
	root = "/resources/speech/corpora/m-ailabs-speech/ru_RU"
	return build_path_to_transcript_dict_m_ailabs_template(root)


	def build_path_to_transcript_dict_m_ailabs_ukrainian():
	root = "/resources/speech/corpora/m-ailabs-speech/uk_UK"
	return build_path_to_transcript_dict_m_ailabs_template(root)


	def build_path_to_transcript_dict_cml_tts_template(root):
	path_to_transcript = dict()

	for split in ["train", "dev", "test"]:
	with open(Path(root, f"{split}.csv"), "r", encoding="utf-8") as f:
	reader = DictReader(f, delimiter="\|")
	for row in reader:
	path_to_transcript[str(Path(root, row["wav_filename"]))] = row[
	"transcript"
	].strip()

	return path_to_transcript


	def build_path_to_transcript_dict_cml_tts_dutch():
	root = "/resources/speech/corpora/cml_tts/cml_tts_dataset_dutch_v0.1"
	return build_path_to_transcript_dict_cml_tts_template(root)


	def build_path_to_transcript_dict_cml_tts_french():
	root = "/resources/speech/corpora/cml_tts/cml_tts_dataset_french_v0.1"
	return build_path_to_transcript_dict_cml_tts_template(root)


	def build_path_to_transcript_dict_cml_tts_german():
	root = "/resources/speech/corpora/cml_tts/cml_tts_dataset_german_v0.1"
	return build_path_to_transcript_dict_cml_tts_template(root)


	def build_path_to_transcript_dict_cml_tts_italian():
	root = "/resources/speech/corpora/cml_tts/cml_tts_dataset_italian_v0.1"
	return build_path_to_transcript_dict_cml_tts_template(root)


	def build_path_to_transcript_dict_cml_tts_polish():
	root = "/resources/speech/corpora/cml_tts/cml_tts_dataset_polish_v0.1"
	return build_path_to_transcript_dict_cml_tts_template(root)


	def build_path_to_transcript_dict_cml_tts_portuguese():
	root = "/resources/speech/corpora/cml_tts/cml_tts_dataset_portuguese_v0.1"
	return build_path_to_transcript_dict_cml_tts_template(root)


	def build_path_to_transcript_dict_cml_tts_spanish():
	root = "/resources/speech/corpora/cml_tts/cml_tts_dataset_spanish_v0.1"
	return build_path_to_transcript_dict_cml_tts_template(root)


	def build_path_to_transcript_dict_mms_template(
	lang, root="/resources/speech/corpora/mms_synthesized_bible_speech"
	):
	path_to_transcript = dict()

	i = 0
	with open(Path(root, "bible_texts", f"{lang}.txt"), "r", encoding="utf-8") as f:
	for line in f:
	path = Path(root, "bible_audios", lang, f"{i}.wav")
	if path.exists():
	path_to_transcript[str(path)] = line.strip()
	i += 1

	return path_to_transcript


	def build_path_to_transcript_dict_shan():
	root = "D:\\Work\\Developer\\ShanTTS\Datasets\\asr-dataset"
	path_to_transcript_dict = dict()

	with open(Path(root, "metadata.csv"), mode="r", encoding="utf-8") as f:
	reader = DictReader(
	f,
	delimiter=",",
	fieldnames=[
	"file_name",
	"transcription",
	],
	)
	for row in reader:
	path_to_transcript_dict[str(Path(root, row["file_name"]))] = row[
	"transcription"
	].strip()

	return path_to_transcript_dict


	if __name__ == "__main__":
	pass