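"""
Builders that map audio file paths to their transcripts for a large collection of
speech corpora, grouped by language. Most builders cache their result as a
pttd_cache.pt file inside the corpus root (via torch.save / torch.load) so that
repeated calls are cheap. The corpus locations assume the local
/mount/resources/... and /resources/... directory layouts.
"""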
import glob
import json
import os
import random
import xml.etree.ElementTree as ET
from csv import DictReader
from pathlib import Path

import torch

# HELPER FUNCTIONS


def split_dictionary_into_chunks(input_dict, split_n):
    res = []
    new_dict = {}
    elements_per_dict = (len(input_dict.keys()) // split_n) + 1
    for k, v in input_dict.items():
        if len(new_dict) < elements_per_dict:
            new_dict[k] = v
        else:
            res.append(new_dict)
            new_dict = {k: v}
    res.append(new_dict)
    return res
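
# Example (illustrative): splitting 10 items into 3 chunks gives chunks of at most 4:
#     split_dictionary_into_chunks({i: i for i in range(10)}, split_n=3)
#     -> [{0: 0, 1: 1, 2: 2, 3: 3}, {4: 4, 5: 5, 6: 6, 7: 7}, {8: 8, 9: 9}]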


def limit_to_n(path_to_transcript_dict, n=40000):
    # Deprecated: we now always use the whole dataset, because there is a critical mass of data.
    limited_dict = dict()
    if len(path_to_transcript_dict.keys()) > n:
        for key in random.sample(list(path_to_transcript_dict.keys()), n):
            limited_dict[key] = path_to_transcript_dict[key]
        return limited_dict
    else:
        return path_to_transcript_dict


def build_path_to_transcript_dict_multi_ling_librispeech_template(root):
    """
    https://arxiv.org/abs/2012.03411
    """
    path_to_transcript = dict()
    with open(os.path.join(root, "transcripts.txt"), "r", encoding="utf8") as file:
        lookup = file.read()
    for line in lookup.split("\n"):
        if line.strip() != "":
            fields = line.split("\t")
            wav_folders = fields[0].split("_")
            wav_path = f"{root}/audio/{wav_folders[0]}/{wav_folders[1]}/{fields[0]}.flac"
            path_to_transcript[wav_path] = fields[1]
    return path_to_transcript
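
# build_path_to_transcript_dict_multi_ling_librispeech_template assumes the MLS layout:
# transcripts.txt holds tab-separated lines "<speaker>_<chapter>_<utt>\t<transcript>",
# and the underscore-separated id doubles as the relative audio location
# audio/<speaker>/<chapter>/<id>.flac.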


def build_path_to_transcript_dict_hui_template(root):
    """
    https://arxiv.org/abs/2106.06309
    """
    path_to_transcript = dict()
    for el in os.listdir(root):
        if os.path.isdir(os.path.join(root, el)):
            with open(os.path.join(root, el, "metadata.csv"), "r", encoding="utf8") as file:
                lookup = file.read()
            for line in lookup.split("\n"):
                if line.strip() != "":
                    norm_transcript = line.split("|")[1]
                    wav_path = os.path.join(root, el, "wavs", line.split("|")[0] + ".wav")
                    if os.path.exists(wav_path):
                        path_to_transcript[wav_path] = norm_transcript
    return path_to_transcript
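
# Most corpus-specific builders below follow the same caching convention: the finished
# dict is written to <root>/pttd_cache.pt with torch.save on the first call and simply
# re-loaded on later calls, unless re_cache=True forces a rebuild. A minimal sketch of
# the pattern (illustrative helper, not part of this module's API):
#
#     def cached(root, build_fn, re_cache=False):
#         cache_path = os.path.join(root, "pttd_cache.pt")
#         if not os.path.exists(cache_path) or re_cache:
#             torch.save(build_fn(root), cache_path)
#         return torch.load(cache_path)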


# ENGLISH


def build_path_to_transcript_dict_mls_english(re_cache=False):
    lang = "english"
    root = f"/mount/resources/speech/corpora/MultiLingLibriSpeech/mls_{lang}/train"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = build_path_to_transcript_dict_multi_ling_librispeech_template(root=root)
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_gigaspeech(re_cache=False):
    root = "/mount/resources/speech/corpora/GigaSpeech/"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        with open(os.path.join(root, "transcripts_only_clean_samples.txt"), "r", encoding="utf8") as file:
            lookup = file.read()
        for line in lookup.split("\n"):
            if line.strip() != "":
                fields = line.split("\t")
                norm_transcript = fields[1]
                wav_path = fields[0]
                path_to_transcript[wav_path] = norm_transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_elizabeth(re_cache=False):
    root = "/mount/resources/speech/corpora/MAILabs_british_single_speaker_elizabeth"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        for el in os.listdir(root):
            if os.path.isdir(os.path.join(root, el)):
                with open(os.path.join(root, el, "metadata.csv"), "r", encoding="utf8") as file:
                    lookup = file.read()
                for line in lookup.split("\n"):
                    if line.strip() != "":
                        norm_transcript = line.split("|")[2]
                        wav_path = os.path.join(root, el, "wavs", line.split("|")[0] + ".wav")
                        if os.path.exists(wav_path):
                            path_to_transcript[wav_path] = norm_transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_nancy(re_cache=False):
    root = "/mount/resources/speech/corpora/NancyKrebs"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        with open(os.path.join(root, "metadata.csv"), "r", encoding="utf8") as file:
            lookup = file.read()
        for line in lookup.split("\n"):
            if line.strip() != "":
                norm_transcript = line.split("|")[1]
                wav_path = os.path.join(root, "wav", line.split("|")[0] + ".wav")
                if os.path.exists(wav_path):
                    path_to_transcript[wav_path] = norm_transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_integration_test(re_cache=True):
    root = "/mount/resources/speech/corpora/NancyKrebs"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        with open(os.path.join(root, "metadata.csv"), "r", encoding="utf8") as file:
            lookup = file.read()
        for line in lookup.split("\n")[:500]:
            if line.strip() != "":
                norm_transcript = line.split("|")[1]
                wav_path = os.path.join(root, "wav", line.split("|")[0] + ".wav")
                if os.path.exists(wav_path):
                    path_to_transcript[wav_path] = norm_transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_CREMA_D(re_cache=False):
    root = "/mount/resources/speech/corpora/CREMA_D/"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        identifier_to_sent = {
            "IEO": "It's eleven o'clock.",
            "TIE": "That is exactly what happened.",
            "IOM": "I'm on my way to the meeting.",
            "IWW": "I wonder what this is about.",
            "TAI": "The airplane is almost full.",
            "MTI": "Maybe tomorrow it will be cold.",
            "IWL": "I would like a new alarm clock.",
            "ITH": "I think, I have a doctor's appointment.",
            "DFA": "Don't forget a jacket.",
            "ITS": "I think, I've seen this before.",
            "TSI": "The surface is slick.",
            "WSI": "We'll stop in a couple of minutes.",
        }
        path_to_transcript = dict()
        for file in os.listdir(root):
            if file.endswith(".wav"):
                path_to_transcript[root + file] = identifier_to_sent[file.split("_")[1]]
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)
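
# CREMA-D file names follow the pattern <ActorID>_<SentenceID>_<Emotion>_<Level>.wav,
# so the second underscore-separated field selects one of the twelve sentences above.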


def build_path_to_transcript_dict_EmoV_DB(re_cache=False):
    root = "/mount/resources/speech/corpora/EmoV_DB/"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        with open(os.path.join(root, "labels.txt"), "r", encoding="utf8") as file:
            lookup = file.read()
        identifier_to_sent = dict()
        for line in lookup.split("\n"):
            if line.strip() != "":
                identifier_to_sent[line.split()[0]] = " ".join(line.split()[1:])
        for file in os.listdir(root):
            if file.endswith(".wav"):
                path_to_transcript[root + file] = identifier_to_sent[file[-14:-10]]
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_ryanspeech(re_cache=False):
    root = "/mount/resources/speech/corpora/RyanSpeech"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript_dict = dict()
        with open(root + "/metadata.csv", mode="r", encoding="utf8") as f:
            transcripts = f.read().split("\n")
        for transcript in transcripts:
            if transcript.strip() != "":
                parsed_line = transcript.split("|")
                audio_file = f"{root}/wavs/{parsed_line[0]}.wav"
                path_to_transcript_dict[audio_file] = parsed_line[2]
        torch.save(path_to_transcript_dict, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_RAVDESS(re_cache=False):
    root = "/mount/resources/speech/corpora/RAVDESS"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript_dict = dict()
        for speaker_dir in os.listdir(root):
            for audio_file in os.listdir(os.path.join(root, speaker_dir)):
                if audio_file.split("-")[4] == "01":
                    path_to_transcript_dict[os.path.join(root, speaker_dir, audio_file)] = "Kids are talking by the door."
                else:
                    path_to_transcript_dict[os.path.join(root, speaker_dir, audio_file)] = "Dogs are sitting by the door."
        torch.save(path_to_transcript_dict, cache_path)
    return torch.load(cache_path)
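
# RAVDESS file names consist of hyphen-separated numeric identifiers; the fifth field
# is the statement id, where "01" means "Kids are talking by the door." and "02" means
# "Dogs are sitting by the door.".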


def build_path_to_transcript_dict_ESDS(re_cache=False):
    root = "/mount/resources/speech/corpora/Emotional_Speech_Dataset_Singapore"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript_dict = dict()
        for speaker_dir in os.listdir(root):
            if speaker_dir.startswith("00"):
                if int(speaker_dir) > 10:
                    with open(f"{root}/{speaker_dir}/fixed_unicode.txt", mode="r", encoding="utf8") as f:
                        transcripts = f.read()
                    for line in transcripts.replace("\n\n", "\n").replace(",", ", ").split("\n"):
                        if line.strip() != "":
                            filename, text, emo_dir = line.split("\t")
                            filename = speaker_dir + "_" + filename.split("_")[1]
                            path_to_transcript_dict[f"{root}/{speaker_dir}/{emo_dir}/{filename}.wav"] = text
        torch.save(path_to_transcript_dict, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_nvidia_hifitts(re_cache=False):
    root = "/mount/resources/speech/corpora/hi_fi_tts_v0"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        transcripts = list()
        for jpath in [
            f"{root}/6097_manifest_clean_dev.json",
            f"{root}/6097_manifest_clean_test.json",
            f"{root}/6097_manifest_clean_train.json",
            f"{root}/9017_manifest_clean_dev.json",
            f"{root}/9017_manifest_clean_test.json",
            f"{root}/9017_manifest_clean_train.json",
            f"{root}/92_manifest_clean_dev.json",
            f"{root}/92_manifest_clean_test.json",
            f"{root}/92_manifest_clean_train.json",
        ]:
            with open(jpath, encoding="utf-8", mode="r") as jfile:
                for line in jfile.read().split("\n"):
                    if line.strip() != "":
                        transcripts.append(json.loads(line))
        for transcript in transcripts:
            path = transcript["audio_filepath"]
            norm_text = transcript["text_normalized"]
            path_to_transcript[f"{root}/{path}"] = norm_text
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_blizzard_2013(re_cache=False):
    root = "/mount/resources/speech/corpora/Blizzard2013/train/segmented/"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        with open(root + "prompts.gui", encoding="utf8") as f:
            transcriptions = f.read()
        blocks = transcriptions.split("||\n")
        for block in blocks:
            trans_lines = block.split("\n")
            if trans_lines[0].strip() != "":
                transcript = (
                    trans_lines[1]
                    .replace("@", "")
                    .replace("#", ",")
                    .replace("|", "")
                    .replace(";", ",")
                    .replace(":", ",")
                    .replace(" 's", "'s")
                    .replace(", ,", ",")
                    .replace("  ", " ")
                    .replace(" ,", ",")
                    .replace(" .", ".")
                    .replace(" ?", "?")
                    .replace(" !", "!")
                    .rstrip(" ,")
                )
                path_to_transcript[root + "wavn/" + trans_lines[0] + ".wav"] = transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_vctk(re_cache=False):
    root = "/mount/resources/speech/corpora/VCTK"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        for transcript_dir in os.listdir(f"{root}/txt"):
            for transcript_file in os.listdir(f"{root}/txt/{transcript_dir}"):
                if transcript_file.endswith(".txt"):
                    with open(f"{root}/txt/{transcript_dir}/" + transcript_file, "r", encoding="utf8") as tf:
                        transcript = tf.read()
                    # os.path.splitext drops the ".txt" extension reliably
                    # (str.rstrip strips characters, not a suffix)
                    file_stem = os.path.splitext(transcript_file)[0]
                    wav_path = f"{root}/wav48_silence_trimmed/{transcript_dir}/{file_stem}_mic2.flac"
                    if os.path.exists(wav_path):
                        path_to_transcript[wav_path] = transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_libritts_all_clean(re_cache=False):
    root = "/mount/resources/speech/corpora/LibriTTS_R/"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        # using all files from the "clean" subsets of LibriTTS-R https://arxiv.org/abs/2305.18802
        path_train = "/mount/resources/speech/corpora/LibriTTS_R/"
        path_to_transcript = dict()
        for speaker in os.listdir(path_train):
            for chapter in os.listdir(os.path.join(path_train, speaker)):
                for file in os.listdir(os.path.join(path_train, speaker, chapter)):
                    if file.endswith("normalized.txt"):
                        with open(os.path.join(path_train, speaker, chapter, file), "r", encoding="utf8") as tf:
                            transcript = tf.read()
                        wav_file = file.split(".")[0] + ".wav"
                        path_to_transcript[os.path.join(path_train, speaker, chapter, wav_file)] = transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_libritts_other500(re_cache=False):
    root = "/mount/resources/asr-data/LibriTTS/train-other-500"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_train = "/mount/resources/asr-data/LibriTTS/train-other-500"
        path_to_transcript = dict()
        for speaker in os.listdir(path_train):
            for chapter in os.listdir(os.path.join(path_train, speaker)):
                for file in os.listdir(os.path.join(path_train, speaker, chapter)):
                    if file.endswith("normalized.txt"):
                        with open(os.path.join(path_train, speaker, chapter, file), "r", encoding="utf8") as tf:
                            transcript = tf.read()
                        wav_file = file.split(".")[0] + ".wav"
                        path_to_transcript[os.path.join(path_train, speaker, chapter, wav_file)] = transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_ljspeech(re_cache=False):
    root = "/mount/resources/speech/corpora/LJSpeech/"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        for transcript_file in os.listdir("/mount/resources/speech/corpora/LJSpeech/16kHz/txt"):
            with open("/mount/resources/speech/corpora/LJSpeech/16kHz/txt/" + transcript_file, "r", encoding="utf8") as tf:
                transcript = tf.read()
            # os.path.splitext drops the ".txt" extension reliably
            # (str.rstrip strips characters, not a suffix)
            wav_path = "/mount/resources/speech/corpora/LJSpeech/16kHz/wav/" + os.path.splitext(transcript_file)[0] + ".wav"
            path_to_transcript[wav_path] = transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_jenny(re_cache=False):
    """
    https://www.kaggle.com/datasets/noml4u/jenny-tts-dataset
    https://github.com/dioco-group/jenny-tts-dataset

    Dataset of Speaker Jenny (Dioco) with an Irish accent
    """
    root = "/mount/resources/speech/corpora/Jenny/"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        with open("/mount/resources/speech/corpora/Jenny/metadata.csv", encoding="utf8") as f:
            transcriptions = f.read()
        trans_lines = transcriptions.split("\n")
        for line in trans_lines:
            if line.strip() != "":
                path_to_transcript["/mount/resources/speech/corpora/Jenny/" + line.split("|")[0] + "_silence.flac"] = line.split("|")[1]
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


# GERMAN


def build_path_to_transcript_dict_mls_german(re_cache=False):
    lang = "german"
    root = f"/mount/resources/speech/corpora/MultiLingLibriSpeech/mls_{lang}/train"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = build_path_to_transcript_dict_multi_ling_librispeech_template(root=root)
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_karlsson(re_cache=False):
    root = "/mount/resources/speech/corpora/HUI_German/Karlsson"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = build_path_to_transcript_dict_hui_template(root=root)
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_eva(re_cache=False):
    root = "/mount/resources/speech/corpora/HUI_German/Eva"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = build_path_to_transcript_dict_hui_template(root=root)
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_bernd(re_cache=False):
    root = "/mount/resources/speech/corpora/HUI_German/Bernd"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = build_path_to_transcript_dict_hui_template(root=root)
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_friedrich(re_cache=False):
    root = "/mount/resources/speech/corpora/HUI_German/Friedrich"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = build_path_to_transcript_dict_hui_template(root=root)
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_hokus(re_cache=False):
    root = "/mount/resources/speech/corpora/HUI_German/Hokus"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = build_path_to_transcript_dict_hui_template(root=root)
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_hui_others(re_cache=False):
    root = "/mount/resources/speech/corpora/HUI_German/others"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        pttd = dict()
        for speaker in os.listdir(root):
            pttd.update(build_path_to_transcript_dict_hui_template(root=f"{root}/{speaker}"))
        torch.save(pttd, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_thorsten_neutral(re_cache=False):
    root = "/mount/resources/speech/corpora/ThorstenDatasets/thorsten-de_v03"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        with open(root + "/metadata_train.csv", encoding="utf8") as f:
            transcriptions = f.read()
        with open(root + "/metadata_val.csv", encoding="utf8") as f:
            transcriptions += "\n" + f.read()
        trans_lines = transcriptions.split("\n")
        for line in trans_lines:
            if line.strip() != "":
                path_to_transcript[root + "/wavs/" + line.split("|")[0] + ".wav"] = line.split("|")[1]
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_thorsten_2022_10(re_cache=False):
    root = "/mount/resources/speech/corpora/ThorstenDatasets/ThorstenVoice-Dataset_2022.10"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        with open(root + "/metadata_train.csv", encoding="utf8") as f:
            transcriptions = f.read()
        with open(root + "/metadata_dev.csv", encoding="utf8") as f:
            transcriptions += "\n" + f.read()
        with open(root + "/metadata_test.csv", encoding="utf8") as f:
            transcriptions += "\n" + f.read()
        trans_lines = transcriptions.split("\n")
        for line in trans_lines:
            if line.strip() != "":
                path_to_transcript[root + "/wavs/" + line.split("|")[0] + ".wav"] = line.split("|")[1]
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_thorsten_emotional(re_cache=False):
    root = "/mount/resources/speech/corpora/ThorstenDatasets/thorsten-emotional_v02"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        with open(root + "/thorsten-emotional-metadata.csv", encoding="utf8") as f:
            transcriptions = f.read()
        trans_lines = transcriptions.split("\n")
        for line in trans_lines:
            if line.strip() != "":
                for emotion in ["amused", "angry", "disgusted", "neutral", "sleepy", "surprised"]:
                    path_to_transcript[f"{root}/{emotion}/" + line.split("|")[0] + ".wav"] = line.split("|")[1]
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


# FRENCH


def build_path_to_transcript_dict_mls_french(re_cache=False):
    lang = "french"
    root = f"/mount/resources/speech/corpora/MultiLingLibriSpeech/mls_{lang}/train"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = build_path_to_transcript_dict_multi_ling_librispeech_template(root=root)
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_blizzard2023_ad_silence_removed(re_cache=False):
    root = "/mount/resources/speech/corpora/Blizzard2023/AD_silence_removed"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        with open(os.path.join(root, "transcript.tsv"), "r", encoding="utf8") as file:
            lookup = file.read()
        for line in lookup.split("\n"):
            if line.strip() != "":
                norm_transcript = line.split("\t")[1]
                wav_path = os.path.join(root, line.split("\t")[0].split("/")[-1])
                if os.path.exists(wav_path):
                    path_to_transcript[wav_path] = (
                        norm_transcript.replace("§", "")
                        .replace("#", "")
                        .replace("~", "")
                        .replace(" »", '"')
                        .replace("« ", '"')
                        .replace("»", '"')
                        .replace("«", '"')
                    )
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_blizzard2023_neb_silence_removed(re_cache=False):
    root = "/mount/resources/speech/corpora/Blizzard2023/NEB_silence_removed"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        with open(os.path.join(root, "transcript.tsv"), "r", encoding="utf8") as file:
            lookup = file.read()
        for line in lookup.split("\n"):
            if line.strip() != "":
                norm_transcript = line.split("\t")[1]
                wav_path = os.path.join(root, line.split("\t")[0].split("/")[-1])
                if os.path.exists(wav_path):
                    path_to_transcript[wav_path] = (
                        norm_transcript.replace("§", "")
                        .replace("#", "")
                        .replace("~", "")
                        .replace(" »", '"')
                        .replace("« ", '"')
                        .replace("»", '"')
                        .replace("«", '"')
                    )
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_blizzard2023_neb_e_silence_removed(re_cache=False):
    root = "/mount/resources/speech/corpora/Blizzard2023/enhanced_NEB_subset_silence_removed"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        with open(os.path.join(root, "transcript.tsv"), "r", encoding="utf8") as file:
            lookup = file.read()
        for line in lookup.split("\n"):
            if line.strip() != "":
                norm_transcript = line.split("\t")[1]
                wav_path = os.path.join(root, line.split("\t")[0].split("/")[-1])
                if os.path.exists(wav_path):
                    path_to_transcript[wav_path] = (
                        norm_transcript.replace("§", "")
                        .replace("#", "")
                        .replace("~", "")
                        .replace(" »", '"')
                        .replace("« ", '"')
                        .replace("»", '"')
                        .replace("«", '"')
                    )
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_synpaflex_norm_subset(re_cache=False):
    """
    Contributed by https://github.com/tomschelsen
    """
    root = "/mount/resources/speech/corpora/synpaflex-corpus/5/v0.1/"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        for text_path in glob.iglob(os.path.join(root, "**/*_norm.txt"), recursive=True):
            with open(text_path, "r", encoding="utf8") as file:
                norm_transcript = file.read()
            path_obj = Path(text_path)
            wav_path = str((path_obj.parent.parent / path_obj.name[:-9]).with_suffix(".wav"))
            if Path(wav_path).exists():
                path_to_transcript[wav_path] = norm_transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_siwis_subset(re_cache=False):
    """
    Contributed by https://github.com/tomschelsen
    """
    root = "/mount/resources/speech/corpora/SiwisFrenchSpeechSynthesisDatabase/"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        # part4 and part5 are not segmented
        sub_dirs = ["part1", "part2", "part3"]
        path_to_transcript = dict()
        for sd in sub_dirs:
            for text_path in glob.iglob(os.path.join(root, "text", sd, "*.txt")):
                with open(text_path, "r", encoding="utf8") as file:
                    norm_transcript = file.read()
                path_obj = Path(text_path)
                wav_path = str((path_obj.parent.parent.parent / "wavs" / sd / path_obj.stem).with_suffix(".wav"))
                if Path(wav_path).exists():
                    path_to_transcript[wav_path] = norm_transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_css10fr(re_cache=False):
    language = "french"
    root = f"/mount/resources/speech/corpora/CSS10/{language}"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        with open(f"{root}/transcript.txt", encoding="utf8") as f:
            transcriptions = f.read()
        trans_lines = transcriptions.split("\n")
        for line in trans_lines:
            if line.strip() != "":
                path_to_transcript[f"{root}/{line.split('|')[0]}"] = line.split("|")[2]
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


# SPANISH


def build_path_to_transcript_dict_mls_spanish(re_cache=False):
    lang = "spanish"
    root = f"/mount/resources/speech/corpora/MultiLingLibriSpeech/mls_{lang}/train"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = build_path_to_transcript_dict_multi_ling_librispeech_template(root=root)
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_css10es(re_cache=False):
    language = "spanish"
    root = f"/mount/resources/speech/corpora/CSS10/{language}"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        with open(f"{root}/transcript.txt", encoding="utf8") as f:
            transcriptions = f.read()
        trans_lines = transcriptions.split("\n")
        for line in trans_lines:
            if line.strip() != "":
                path_to_transcript[f"{root}/{line.split('|')[0]}"] = line.split("|")[2]
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_spanish_blizzard_train(re_cache=False):
    root = "/mount/resources/speech/corpora/Blizzard2021/spanish_blizzard_release_2021_v2/hub"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        with open(os.path.join(root, "train_text.txt"), "r", encoding="utf8") as file:
            lookup = file.read()
        for line in lookup.split("\n"):
            if line.strip() != "":
                norm_transcript = line.split("\t")[1]
                wav_path = os.path.join(root, "train_wav", line.split("\t")[0] + ".wav")
                if os.path.exists(wav_path):
                    path_to_transcript[wav_path] = norm_transcript
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


# PORTUGUESE


def build_path_to_transcript_dict_mls_portuguese(re_cache=False):
    lang = "portuguese"
    root = f"/mount/resources/speech/corpora/MultiLingLibriSpeech/mls_{lang}/train"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = build_path_to_transcript_dict_multi_ling_librispeech_template(root=root)
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


# POLISH


def build_path_to_transcript_dict_mls_polish(re_cache=False):
    lang = "polish"
    root = f"/mount/resources/speech/corpora/MultiLingLibriSpeech/mls_{lang}/train"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = build_path_to_transcript_dict_multi_ling_librispeech_template(root=root)
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


# ITALIAN


def build_path_to_transcript_dict_mls_italian(re_cache=False):
    lang = "italian"
    root = f"/mount/resources/speech/corpora/MultiLingLibriSpeech/mls_{lang}/train"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = build_path_to_transcript_dict_multi_ling_librispeech_template(root=root)
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


# DUTCH


def build_path_to_transcript_dict_mls_dutch(re_cache=False):
    lang = "dutch"
    root = f"/mount/resources/speech/corpora/MultiLingLibriSpeech/mls_{lang}/train"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = build_path_to_transcript_dict_multi_ling_librispeech_template(root=root)
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_css10nl(re_cache=False):
    language = "dutch"
    root = f"/mount/resources/speech/corpora/CSS10/{language}"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        with open(f"{root}/transcript.txt", encoding="utf8") as f:
            transcriptions = f.read()
        trans_lines = transcriptions.split("\n")
        for line in trans_lines:
            if line.strip() != "":
                path_to_transcript[f"{root}/{line.split('|')[0]}"] = line.split("|")[2]
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


# GREEK


def build_path_to_transcript_dict_css10el(re_cache=False):
    language = "greek"
    root = f"/mount/resources/speech/corpora/CSS10/{language}"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        with open(f"{root}/transcript.txt", encoding="utf8") as f:
            transcriptions = f.read()
        trans_lines = transcriptions.split("\n")
        for line in trans_lines:
            if line.strip() != "":
                path_to_transcript[f"{root}/{line.split('|')[0]}"] = line.split("|")[2]
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


# FINNISH


def build_path_to_transcript_dict_css10fi(re_cache=False):
    language = "finnish"
    root = f"/mount/resources/speech/corpora/CSS10/{language}"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        with open(f"{root}/transcript.txt", encoding="utf8") as f:
            transcriptions = f.read()
        trans_lines = transcriptions.split("\n")
        for line in trans_lines:
            if line.strip() != "":
                path_to_transcript[f"{root}/{line.split('|')[0]}"] = line.split("|")[2]
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


# VIETNAMESE


def build_path_to_transcript_dict_VIVOS_viet(re_cache=False):
    root = "/mount/resources/speech/corpora/VIVOS_vietnamese/train"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript_dict = dict()
        with open(root + "/prompts.txt", mode="r", encoding="utf8") as f:
            transcripts = f.read().split("\n")
        for transcript in transcripts:
            if transcript.strip() != "":
                parsed_line = transcript.split(" ")
                audio_file = f"{root}/waves/{parsed_line[0][:10]}/{parsed_line[0]}.wav"
                path_to_transcript_dict[audio_file] = " ".join(parsed_line[1:]).lower()
        torch.save(path_to_transcript_dict, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_vietTTS(re_cache=False):
    root = "/mount/resources/speech/corpora/VietTTS"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        with open(root + "/meta_data.tsv", encoding="utf8") as f:
            transcriptions = f.read()
        for line in transcriptions.split("\n"):
            if line.strip() != "":
                parsed_line = line.split(".wav")
                audio_path = parsed_line[0]
                transcript = parsed_line[1]
                path_to_transcript[os.path.join(root, audio_path + ".wav")] = transcript.strip()
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


# CHINESE


def build_path_to_transcript_dict_aishell3(re_cache=False):
    root = "/mount/resources/speech/corpora/aishell3/train"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript_dict = dict()
        with open(root + "/label_train-set.txt", mode="r", encoding="utf8") as f:
            transcripts = f.read().replace("$", "").replace("%", " ").split("\n")
        for transcript in transcripts:
            if transcript.strip() != "" and not transcript.startswith("#"):
                parsed_line = transcript.split("|")
                audio_file = f"{root}/wav/{parsed_line[0][:7]}/{parsed_line[0]}.wav"
                kanji = parsed_line[2]
                path_to_transcript_dict[audio_file] = kanji
        torch.save(path_to_transcript_dict, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_css10cmn(re_cache=False):
    language = "chinese"
    root = f"/mount/resources/speech/corpora/CSS10/{language}"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        with open(f"{root}/transcript.txt", encoding="utf8") as f:
            transcriptions = f.read()
        trans_lines = transcriptions.split("\n")
        for line in trans_lines:
            if line.strip() != "":
                path_to_transcript[f"{root}/{line.split('|')[0]}"] = line.split("|")[2]
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


# RUSSIAN


def build_path_to_transcript_dict_css10ru(re_cache=False):
    language = "russian"
    root = f"/mount/resources/speech/corpora/CSS10/{language}"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        with open(f"{root}/transcript.txt", encoding="utf8") as f:
            transcriptions = f.read()
        trans_lines = transcriptions.split("\n")
        for line in trans_lines:
            if line.strip() != "":
                path_to_transcript[f"{root}/{line.split('|')[0]}"] = line.split("|")[2]
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


# HUNGARIAN


def build_path_to_transcript_dict_css10hu(re_cache=False):
    language = "hungarian"
    root = f"/mount/resources/speech/corpora/CSS10/{language}"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        path_to_transcript = dict()
        with open(f"{root}/transcript.txt", encoding="utf8") as f:
            transcriptions = f.read()
        trans_lines = transcriptions.split("\n")
        for line in trans_lines:
            if line.strip() != "":
                path_to_transcript[f"{root}/{line.split('|')[0]}"] = line.split("|")[2]
        torch.save(path_to_transcript, cache_path)
    return torch.load(cache_path)


# OTHER


def build_file_list_singing_voice_audio_database(re_cache=False):
    root = "/mount/resources/speech/corpora/singing_voice_audio_dataset/monophonic"
    cache_path = os.path.join(root, "pttd_cache.pt")
    if not os.path.exists(cache_path) or re_cache:
        file_list = list()
        for corw in os.listdir(root):
            for singer in os.listdir(os.path.join(root, corw)):
                for audio in os.listdir(os.path.join(root, corw, singer)):
                    file_list.append(os.path.join(root, corw, singer, audio))
        torch.save(file_list, cache_path)
    return torch.load(cache_path)


def build_path_to_transcript_dict_nst_norwegian():
    root = "/resources/speech/corpora/NST_norwegian/pcm/cs"
    path_to_transcript = dict()
    audio_paths = sorted(list(Path(root).glob("*.pcm")))
    i = 0
    with open(Path(root, "SCRIPTS/CTTS_core"), encoding="latin-1") as f:
        for line in f:
            transcript = line.strip().replace("\xad", "")
            path = str(audio_paths[i].absolute())
            path_to_transcript[path] = transcript
            i += 1
    return path_to_transcript


def build_path_to_transcript_dict_nst_swedish():
    root = "/resources/speech/corpora/NST_swedish/sw_pcms"
    path_to_transcript = dict()
    audio_paths = sorted(list(Path(root, "mf").glob("*.pcm")))
    audio_paths.insert(4154, None)
    audio_paths.insert(5144, None)
    i = 0
    with open(Path(root, "scripts/mf/sw_all"), encoding="latin-1") as f:
        for line in f:
            if i == 4154 or i == 5144:
                i += 1
                continue
            transcript = line.strip().replace("\xad", "")
            path = str(audio_paths[i].absolute())
            path_to_transcript[path] = transcript
            i += 1
    return path_to_transcript
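
# The two NST builders above align the sorted audio files with the script lines purely
# by index; for the Swedish data, None placeholders are inserted at positions 4154 and
# 5144 (prompts without a corresponding recording) and those line indices are skipped,
# which keeps the alignment intact.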


def build_path_to_transcript_dict_nchlt_afr():
    root = "/resources/speech/corpora/nchlt_afr"
    return build_path_to_transcript_dict_nchlt_template(root, lang_code="afr")


def build_path_to_transcript_dict_nchlt_nbl():
    root = "/resources/speech/corpora/nchlt_nbl"
    return build_path_to_transcript_dict_nchlt_template(root, lang_code="nbl")


def build_path_to_transcript_dict_nchlt_nso():
    root = "/resources/speech/corpora/nchlt_nso"
    return build_path_to_transcript_dict_nchlt_template(root, lang_code="nso")


def build_path_to_transcript_dict_nchlt_sot():
    root = "/resources/speech/corpora/nchlt_sot"
    return build_path_to_transcript_dict_nchlt_template(root, lang_code="sot")


def build_path_to_transcript_dict_nchlt_ssw():
    root = "/resources/speech/corpora/nchlt_ssw"
    return build_path_to_transcript_dict_nchlt_template(root, lang_code="ssw")


def build_path_to_transcript_dict_nchlt_tsn():
    root = "/resources/speech/corpora/nchlt_tsn"
    return build_path_to_transcript_dict_nchlt_template(root, lang_code="tsn")


def build_path_to_transcript_dict_nchlt_tso():
    root = "/resources/speech/corpora/nchlt_tso"
    return build_path_to_transcript_dict_nchlt_template(root, lang_code="tso")


def build_path_to_transcript_dict_nchlt_ven():
    root = "/resources/speech/corpora/nchlt_ven"
    return build_path_to_transcript_dict_nchlt_template(root, lang_code="ven")


def build_path_to_transcript_dict_nchlt_xho():
    root = "/resources/speech/corpora/nchlt_xho"
    return build_path_to_transcript_dict_nchlt_template(root, lang_code="xho")


def build_path_to_transcript_dict_nchlt_zul():
    root = "/resources/speech/corpora/nchlt_zul"
    return build_path_to_transcript_dict_nchlt_template(root, lang_code="zul")


def build_path_to_transcript_dict_nchlt_template(root, lang_code):
    path_to_transcript = dict()
    base_dir = Path(root).parent
    for split in ["trn", "tst"]:
        tree = ET.parse(f"{root}/transcriptions/nchlt_{lang_code}.{split}.xml")
        tree_root = tree.getroot()
        for rec in tree_root.iter("recording"):
            transcript = rec.find("orth").text
            if "[s]" in transcript:
                continue
            path = str(base_dir / rec.get("audio"))
            path_to_transcript[path] = transcript
    return path_to_transcript
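
# The NCHLT transcription XMLs are expected to contain <recording> elements whose
# "audio" attribute is a path relative to the corpora directory and whose <orth> child
# holds the transcript; utterances containing the "[s]" marker are skipped.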


def build_path_to_transcript_dict_bibletts_akuapem_twi():
    path_to_transcript = dict()
    root = "/resources/speech/corpora/BibleTTS/akuapem-twi"
    for split in ["train", "dev", "test"]:
        for book in Path(root, split).glob("*"):
            for textfile in book.glob("*.txt"):
                with open(textfile, "r", encoding="utf-8") as f:
                    text = " ".join([line.strip() for line in f])  # should usually be only one line anyway
                # cast to str so the keys match the other builders, which use string paths
                path_to_transcript[str(textfile.with_suffix(".flac"))] = text
    return path_to_transcript


def build_path_to_transcript_dict_bembaspeech():
    root = "/resources/speech/corpora/BembaSpeech/bem"
    path_to_transcript = dict()
    for split in ["train", "dev", "test"]:
        with open(Path(root, f"{split}.tsv"), "r", encoding="utf-8") as f:
            reader = DictReader(f, delimiter="\t")
            for row in reader:
                path_to_transcript[str(Path(root, "audio", row["audio"]))] = row["sentence"]
    return path_to_transcript


def build_path_to_transcript_dict_alffa_sw():
    root = "/resources/speech/corpora/ALFFA/data_broadcastnews_sw/data"
    path_to_transcript = build_path_to_transcript_dict_kaldi_template(root=root, split="train", replace_in_path=("asr_swahili/data/", ""))
    path_to_transcript.update(build_path_to_transcript_dict_kaldi_template(root=root, split="test", replace_in_path=("/my_dir/wav", "test/wav5")))
    return path_to_transcript


def build_path_to_transcript_dict_alffa_am():
    root = "/resources/speech/corpora/ALFFA/data_readspeech_am/data"
    path_to_transcript = build_path_to_transcript_dict_kaldi_template(root=root, split="train", replace_in_path=("/home/melese/kaldi/data/", ""))
    path_to_transcript.update(build_path_to_transcript_dict_kaldi_template(root=root, split="test", replace_in_path=("/home/melese/kaldi/data/", "")))
    return path_to_transcript


def build_path_to_transcript_dict_alffa_wo():
    root = "/resources/speech/corpora/ALFFA/data_readspeech_wo/data"
    path_to_transcript = dict()
    for split in ["train", "dev", "test"]:
        with open(Path(root, split, "text"), "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip().split()
                file = line[0]
                text = " ".join(line[1:])
                number = file.split("_")[1]
                path_to_transcript[str(Path(root, split, number, f"{file}.wav"))] = text
    return path_to_transcript


def build_path_to_transcript_dict_malayalam():
    root = "/resources/speech/corpora/malayalam"
    path_to_transcript = dict()
    for gender in ["female", "male"]:
        with open(Path(root, f"line_index_{gender}.tsv"), "r", encoding="utf-8") as f:
            for line in f:
                file, text = line.strip().split("\t")
                path_to_transcript[str(Path(root, gender, f"{file}.wav"))] = text
    return path_to_transcript


def build_path_to_transcript_dict_msc():
    root = "/resources/speech/corpora/msc_reviewed_speech"
    path_to_transcript = dict()
    with open(Path(root, "metadata.tsv"), "r", encoding="utf-8") as f:
        reader = DictReader(f, delimiter="\t")
        for row in reader:
            path_to_transcript[str(Path(root, row["speechpath"]))] = row["transcript"]
    return path_to_transcript


def build_path_to_transcript_dict_chuvash():
    root = "/resources/speech/corpora/chuvash"
    path_to_transcript = dict()
    for textfile in Path(root, "transcripts", "txt").glob("*.txt"):
        with open(textfile, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip().split()
                text = " ".join(line[1:]).replace("«", "").replace("»", "")
                path = Path(root, "audio", "split", f"trim_clean_{textfile.stem}.{line[0]}.flac")
                if path.exists():
                    path_to_transcript[str(path)] = text
    return path_to_transcript


def build_path_to_transcript_dict_iban():
    root = "/resources/speech/corpora/iban/data"
    path_to_transcript = build_path_to_transcript_dict_kaldi_template(root, "train", replace_in_path=("asr_iban/data/", ""))
    path_to_transcript.update(build_path_to_transcript_dict_kaldi_template(root, "dev", replace_in_path=("asr_iban/data/", "")))
    return path_to_transcript


def build_path_to_transcript_dict_kaldi_template(root, split, replace_in_path=None):
    path_to_transcript = dict()
    wav_scp = {}
    with open(Path(root, split, "wav.scp"), "r") as f:
        for line in f:
            wav_id, wav_path = line.split()
            if replace_in_path:
                wav_path = wav_path.replace(replace_in_path[0], replace_in_path[1])
            wav_scp[wav_id] = str(Path(root, wav_path))
    with open(Path(root, split, "text"), "r", encoding="utf-8") as f:
        for line in f:
            line = line.split()
            wav_id = line[0]
            text = " ".join(line[1:])
            if "<" in text:  # ignore all <UNK> utterances etc.
                continue
            path_to_transcript[wav_scp[wav_id]] = text
    return path_to_transcript
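
# This follows the standard Kaldi data-directory layout: each split contains a wav.scp
# file ("<utterance-id> <wav-path>" per line) and a text file
# ("<utterance-id> <transcript>" per line). replace_in_path lets callers rewrite the
# absolute path prefixes that some corpora bake into wav.scp.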


def build_path_to_transcript_dict_sundanese_speech():
    root = "/resources/speech/corpora/sundanese_speech/asr_sundanese"
    return build_path_to_transcript_dict_south_asian_languages_template(root)


def build_path_to_transcript_dict_sinhala_speech():
    root = "/resources/speech/corpora/sinhala_speech/asr_sinhala"
    return build_path_to_transcript_dict_south_asian_languages_template(root)


def build_path_to_transcript_dict_bengali_speech():
    root = "/resources/speech/corpora/bengali_speech/asr_bengali"
    return build_path_to_transcript_dict_south_asian_languages_template(root)


def build_path_to_transcript_dict_nepali_speech():
    root = "/resources/speech/corpora/nepali_speech/asr_nepali"
    return build_path_to_transcript_dict_south_asian_languages_template(root)


def build_path_to_transcript_dict_javanese_speech():
    root = "/resources/speech/corpora/javanese_speech/asr_javanese"
    return build_path_to_transcript_dict_south_asian_languages_template(root)


def build_path_to_transcript_dict_south_asian_languages_template(root):
    path_to_transcript = dict()
    with open(Path(root, "utt_spk_text.tsv"), "r", encoding="utf-8") as f:
        for line in f:
            utt, spk, text = line.strip().split("\t")
            dir_tag = utt[:2]
            path_to_transcript[str(Path(root, "data", dir_tag, f"{utt}.flac"))] = text
    return path_to_transcript
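
# utt_spk_text.tsv holds tab-separated utterance id, speaker id, and transcript; the
# audio files are sharded into data/<first two characters of the utterance id>/<utt>.flac.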


def build_path_to_transcript_dict_african_voices_kenyan_afv():
    root = "/resources/speech/corpora/AfricanVoices/afv_enke"
    return build_path_to_transcript_dict_african_voices_template(root)


def build_path_to_transcript_dict_african_voices_fon_alf():
    root = "/resources/speech/corpora/AfricanVoices/fon_alf"
    return build_path_to_transcript_dict_african_voices_template(root)


def build_path_to_transcript_dict_african_voices_hausa_cmv():
    main_root = "/resources/speech/corpora/AfricanVoices"
    path_to_transcript = build_path_to_transcript_dict_african_voices_template(f"{main_root}/hau_cmv_f")
    path_to_transcript.update(build_path_to_transcript_dict_african_voices_template(f"{main_root}/hau_cmv_m"))
    return path_to_transcript


def build_path_to_transcript_dict_african_voices_ibibio_lst():
    root = "/resources/speech/corpora/AfricanVoices/ibb_lst"
    return build_path_to_transcript_dict_african_voices_template(root)


def build_path_to_transcript_dict_african_voices_kikuyu_opb():
    root = "/resources/speech/corpora/AfricanVoices/kik_opb"
    return build_path_to_transcript_dict_african_voices_template(root)


def build_path_to_transcript_dict_african_voices_lingala_opb():
    root = "/resources/speech/corpora/AfricanVoices/lin_opb"
    return build_path_to_transcript_dict_african_voices_template(root)


def build_path_to_transcript_dict_african_voices_ganda_cmv():
    root = "/resources/speech/corpora/AfricanVoices/lug_cmv"
    return build_path_to_transcript_dict_african_voices_template(root)


def build_path_to_transcript_dict_african_voices_luo_afv():
    root = "/resources/speech/corpora/AfricanVoices/luo_afv"
    return build_path_to_transcript_dict_african_voices_template(root)


def build_path_to_transcript_dict_african_voices_luo_opb():
    root = "/resources/speech/corpora/AfricanVoices/luo_opb"
    return build_path_to_transcript_dict_african_voices_template(root)


def build_path_to_transcript_dict_african_voices_swahili_llsti():
    root = "/resources/speech/corpora/AfricanVoices/swa_llsti"
    return build_path_to_transcript_dict_african_voices_template(root)


def build_path_to_transcript_dict_african_voices_suba_afv():
    root = "/resources/speech/corpora/AfricanVoices/sxb_afv"
    return build_path_to_transcript_dict_african_voices_template(root)


def build_path_to_transcript_dict_african_voices_wolof_alf():
    root = "/resources/speech/corpora/AfricanVoices/wol_alf"
    return build_path_to_transcript_dict_african_voices_template(root)


def build_path_to_transcript_dict_african_voices_yoruba_opb():
    root = "/resources/speech/corpora/AfricanVoices/yor_opb"
    return build_path_to_transcript_dict_african_voices_template(root)


def build_path_to_transcript_dict_african_voices_template(root):
    path_to_transcript = dict()
    with open(Path(root, "txt.done.data"), "r", encoding="utf-8") as f:
        for line in f:
            line = line.replace('\\"', "'").split('"')
            text = line[1]
            file = line[0].split()[-1]
            path_to_transcript[str(Path(root, "wav", f"{file}.wav"))] = text
    return path_to_transcript
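
# txt.done.data uses the Festival prompt format, e.g. ( utt0001 "Some sentence." ), so
# the last token before the first quote is the file id and the quoted span is the
# transcript; escaped quotes are rewritten to apostrophes before splitting.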


def build_path_to_transcript_dict_zambezi_voice_nyanja():
    root = "/resources/speech/corpora/ZambeziVoice/nyanja/nya"
    return build_path_to_transcript_dict_zambezi_voice_template(root)


def build_path_to_transcript_dict_zambezi_voice_lozi():
    root = "/resources/speech/corpora/ZambeziVoice/lozi/loz"
    return build_path_to_transcript_dict_zambezi_voice_template(root)


def build_path_to_transcript_dict_zambezi_voice_tonga():
    root = "/resources/speech/corpora/ZambeziVoice/tonga/toi"
    return build_path_to_transcript_dict_zambezi_voice_template(root)


def build_path_to_transcript_dict_zambezi_voice_template(root):
    path_to_transcript = dict()
    for split in ["train", "dev", "test"]:
        with open(Path(root, f"{split}.tsv"), "r", encoding="utf-8") as f:
            reader = DictReader(f, delimiter="\t")
            for row in reader:
                path_to_transcript[str(Path(root, "audio", row["audio_id"]))] = row["sentence"].strip()
    return path_to_transcript


def build_path_to_transcript_dict_fleurs_template(root):
    path_to_transcript = dict()
    for split in ["train", "dev", "test"]:
        with open(Path(root, f"{split}.tsv"), "r", encoding="utf-8") as f:
            reader = DictReader(
                f,
                delimiter="\t",
                fieldnames=["id", "filename", "transcription_raw", "transcription", "words", "speaker", "gender"],
            )
            for row in reader:
                path_to_transcript[str(Path(root, "audio", split, row["filename"]))] = row["transcription_raw"].strip()
    return path_to_transcript
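
# The FLEURS TSVs are read headerless, hence the explicit fieldnames; the raw
# transcription (original casing and punctuation) is kept rather than the normalized
# column, presumably because casing and punctuation are useful for TTS.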


def build_path_to_transcript_dict_fleurs_afrikaans():
    root = "/resources/speech/corpora/fleurs/af_za"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_amharic():
    root = "/resources/speech/corpora/fleurs/am_et"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_arabic():
    root = "/resources/speech/corpora/fleurs/ar_eg"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_assamese():
    root = "/resources/speech/corpora/fleurs/as_in"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_asturian():
    root = "/resources/speech/corpora/fleurs/ast_es"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_azerbaijani():
    root = "/resources/speech/corpora/fleurs/az_az"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_belarusian():
    root = "/resources/speech/corpora/fleurs/be_by"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_bulgarian():
    root = "/resources/speech/corpora/fleurs/bg_bg"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_bengali():
    root = "/resources/speech/corpora/fleurs/bn_in"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_bosnian():
    root = "/resources/speech/corpora/fleurs/bs_ba"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_catalan():
    root = "/resources/speech/corpora/fleurs/ca_es"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_cebuano():
    root = "/resources/speech/corpora/fleurs/ceb_ph"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_sorani_kurdish():
    root = "/resources/speech/corpora/fleurs/ckb_iq"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_mandarin():
    root = "/resources/speech/corpora/fleurs/cmn_hans_cn"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_czech():
    root = "/resources/speech/corpora/fleurs/cs_cz"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_welsh():
    root = "/resources/speech/corpora/fleurs/cy_gb"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_danish():
    root = "/resources/speech/corpora/fleurs/da_dk"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_german():
    root = "/resources/speech/corpora/fleurs/de_de"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_greek():
    root = "/resources/speech/corpora/fleurs/el_gr"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_english():
    root = "/resources/speech/corpora/fleurs/en_us"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_spanish():
    root = "/resources/speech/corpora/fleurs/es_419"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_estonian():
    root = "/resources/speech/corpora/fleurs/et_ee"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_persian():
    root = "/resources/speech/corpora/fleurs/fa_ir"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_fula():
    root = "/resources/speech/corpora/fleurs/ff_sn"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_finnish():
    root = "/resources/speech/corpora/fleurs/fi_fi"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_filipino():
    root = "/resources/speech/corpora/fleurs/fil_ph"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_french():
    root = "/resources/speech/corpora/fleurs/fr_fr"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_irish():
    root = "/resources/speech/corpora/fleurs/ga_ie"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_galician():
    root = "/resources/speech/corpora/fleurs/gl_es"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_gujarati():
    root = "/resources/speech/corpora/fleurs/gu_in"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_hausa():
    root = "/resources/speech/corpora/fleurs/ha_ng"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_hebrew():
    root = "/resources/speech/corpora/fleurs/he_il"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_hindi():
    root = "/resources/speech/corpora/fleurs/hi_in"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_croatian():
    root = "/resources/speech/corpora/fleurs/hr_hr"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_hungarian():
    root = "/resources/speech/corpora/fleurs/hu_hu"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_armenian():
    root = "/resources/speech/corpora/fleurs/hy_am"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_indonesian():
    root = "/resources/speech/corpora/fleurs/id_id"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_igbo():
    root = "/resources/speech/corpora/fleurs/ig_ng"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_icelandic():
    root = "/resources/speech/corpora/fleurs/is_is"
    return build_path_to_transcript_dict_fleurs_template(root)


def build_path_to_transcript_dict_fleurs_italian():
root = "/resources/speech/corpora/fleurs/it_it" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_japanese(): | |
root = "/resources/speech/corpora/fleurs/ja_jp" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_javanese(): | |
root = "/resources/speech/corpora/fleurs/jv_id" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_georgian(): | |
root = "/resources/speech/corpora/fleurs/ka_ge" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_kamba(): | |
root = "/resources/speech/corpora/fleurs/kam_ke" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_kabuverdianu(): | |
root = "/resources/speech/corpora/fleurs/kea_cv" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_kazakh(): | |
root = "/resources/speech/corpora/fleurs/kk_kz" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_khmer(): | |
root = "/resources/speech/corpora/fleurs/km_kh" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_kannada(): | |
root = "/resources/speech/corpora/fleurs/kn_in" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_korean(): | |
root = "/resources/speech/corpora/fleurs/ko_kr" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_kyrgyz(): | |
root = "/resources/speech/corpora/fleurs/ky_kg" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_luxembourgish(): | |
root = "/resources/speech/corpora/fleurs/lb_lu" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_ganda(): | |
root = "/resources/speech/corpora/fleurs/lg_ug" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_lingala(): | |
root = "/resources/speech/corpora/fleurs/ln_cd" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_lao(): | |
root = "/resources/speech/corpora/fleurs/lo_la" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_lithuanian(): | |
root = "/resources/speech/corpora/fleurs/lt_lt" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_luo(): | |
root = "/resources/speech/corpora/fleurs/luo_ke" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_latvian(): | |
root = "/resources/speech/corpora/fleurs/lv_lv" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_maori(): | |
root = "/resources/speech/corpora/fleurs/mi_nz" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_macedonian(): | |
root = "/resources/speech/corpora/fleurs/mk_mk" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_malayalam(): | |
root = "/resources/speech/corpora/fleurs/ml_in" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_mongolian(): | |
root = "/resources/speech/corpora/fleurs/mn_mn" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_marathi(): | |
root = "/resources/speech/corpora/fleurs/mr_in" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_malay(): | |
root = "/resources/speech/corpora/fleurs/ms_my" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_maltese(): | |
root = "/resources/speech/corpora/fleurs/mt_mt" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_burmese(): | |
root = "/resources/speech/corpora/fleurs/my_mm" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_norwegian(): | |
root = "/resources/speech/corpora/fleurs/nb_no" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_nepali(): | |
root = "/resources/speech/corpora/fleurs/ne_np" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_dutch(): | |
root = "/resources/speech/corpora/fleurs/nl_nl" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_northern_sotho(): | |
root = "/resources/speech/corpora/fleurs/nso_za" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_nyanja(): | |
root = "/resources/speech/corpora/fleurs/ny_mw" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_occitan(): | |
root = "/resources/speech/corpora/fleurs/oc_fr" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_oroma(): | |
root = "/resources/speech/corpora/fleurs/om_et" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_oriya(): | |
root = "/resources/speech/corpora/fleurs/or_in" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_punjabi(): | |
root = "/resources/speech/corpora/fleurs/pa_in" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_polish(): | |
root = "/resources/speech/corpora/fleurs/pl_pl" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_pashto(): | |
root = "/resources/speech/corpora/fleurs/ps_af" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_portuguese(): | |
root = "/resources/speech/corpora/fleurs/pt_br" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_romanian(): | |
root = "/resources/speech/corpora/fleurs/ro_ro" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_russian(): | |
root = "/resources/speech/corpora/fleurs/ru_ru" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_sindhi(): | |
root = "/resources/speech/corpora/fleurs/sd_in" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_slovak(): | |
root = "/resources/speech/corpora/fleurs/sk_sk" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_slovenian(): | |
root = "/resources/speech/corpora/fleurs/sl_si" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_shona(): | |
root = "/resources/speech/corpora/fleurs/sn_zw" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_somali(): | |
root = "/resources/speech/corpora/fleurs/so_so" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_serbian(): | |
root = "/resources/speech/corpora/fleurs/sr_rs" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_swedish(): | |
root = "/resources/speech/corpora/fleurs/sv_se" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_swahili(): | |
root = "/resources/speech/corpora/fleurs/sw_ke" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_tamil(): | |
root = "/resources/speech/corpora/fleurs/ta_in" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_telugu(): | |
root = "/resources/speech/corpora/fleurs/te_in" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_tajik(): | |
root = "/resources/speech/corpora/fleurs/tg_tj" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_thai(): | |
root = "/resources/speech/corpora/fleurs/th_th" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_turkish(): | |
root = "/resources/speech/corpora/fleurs/tr_tr" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_ukrainian(): | |
root = "/resources/speech/corpora/fleurs/uk_ua" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_umbundu(): | |
root = "/resources/speech/corpora/fleurs/umb_ao" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_urdu(): | |
root = "/resources/speech/corpora/fleurs/ur_pk" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_uzbek(): | |
root = "/resources/speech/corpora/fleurs/uz_uz" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_vietnamese(): | |
root = "/resources/speech/corpora/fleurs/vi_vn" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_wolof(): | |
root = "/resources/speech/corpora/fleurs/wo_sn" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_xhosa(): | |
root = "/resources/speech/corpora/fleurs/xh_za" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_yoruba(): | |
root = "/resources/speech/corpora/fleurs/yo_ng" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_cantonese(): | |
root = "/resources/speech/corpora/fleurs/yue_hant_hk" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
def build_path_to_transcript_dict_fleurs_zulu(): | |
root = "/resources/speech/corpora/fleurs/zu_za" | |
return build_path_to_transcript_dict_fleurs_template(root) | |
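# The wrappers above are intentionally mechanical. As a minimal sketch (the | |
# generic entry point below is hypothetical and not used elsewhere in this | |
# file), the same mapping can be derived from the locale code alone, assuming | |
# the fleurs/<locale> directory layout used above: | |
def build_path_to_transcript_dict_fleurs(locale): | |
    # locale is the FLEURS directory name, e.g. "de_de" or "yue_hant_hk" | |
    return build_path_to_transcript_dict_fleurs_template( | |
        f"/resources/speech/corpora/fleurs/{locale}" | |
    ) | |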
def build_path_to_transcript_dict_living_audio_dataset_template(root): | |
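    """ | |
    Transcripts are read from text.xml and matched to the 48 kHz original | |
    recordings via the id attribute of each fileid element. | |
    """ | |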
path_to_transcript = dict() | |
tree = ET.parse(f"{root}/text.xml") | |
tree_root = tree.getroot() | |
for rec in tree_root.iter("recording_script"): | |
for file in rec.iter("fileid"): | |
path_to_transcript[ | |
str(Path(root, "48000_orig", f'{file.get("id")}.wav')) | |
] = file.text.strip() | |
return path_to_transcript | |
def build_path_to_transcript_dict_living_audio_dataset_irish(): | |
root = "/resources/speech/corpora/LivingAudioDataset/ga" | |
return build_path_to_transcript_dict_living_audio_dataset_template(root) | |
def build_path_to_transcript_dict_living_audio_dataset_dutch(): | |
root = "/resources/speech/corpora/LivingAudioDataset/nl" | |
return build_path_to_transcript_dict_living_audio_dataset_template(root) | |
def build_path_to_transcript_dict_living_audio_dataset_russian(): | |
root = "/resources/speech/corpora/LivingAudioDataset/ru" | |
return build_path_to_transcript_dict_living_audio_dataset_template(root) | |
def build_path_to_transcript_dict_romanian_db(): | |
root = "/resources/speech/corpora/RomanianDB" | |
path_to_transcript = dict() | |
for split in ["training", "testing", "elena", "georgiana"]: | |
for transcript in Path(root, split, "text").glob("*.txt"): | |
subset = transcript.stem | |
with open(transcript, "r", encoding="utf-8") as f: | |
for line in f: | |
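                    # the first two characters of a line are the file id; it | |
                    # is zero-padded to three digits, presumably to match the | |
                    # wav filenames (e.g. adr_<subset>_012.wav) | |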
fileid = line.strip()[:2] | |
if len(fileid) == 2: | |
fileid = "0" + fileid | |
text = line.strip()[5:] | |
if split == "elena": | |
path = f"ele_{subset}_{fileid}.wav" | |
elif split == "georgiana": | |
path = f"geo_{subset}_{fileid}.wav" | |
else: | |
path = f"adr_{subset}_{fileid}.wav" | |
path_to_transcript[str(Path(root, split, "wav", subset, path))] = ( | |
text | |
) | |
return path_to_transcript | |
def build_path_to_transcript_dict_shemo(): | |
root = "/resources/speech/corpora/ShEMO" | |
path_to_transcript = dict() | |
with open("/resources/speech/corpora/ShEMO/shemo.json", "r", encoding="utf-8") as f: | |
data = json.load(f) | |
for fileid, file_info in data.items(): | |
path = Path(root, file_info["gender"], f"{fileid}.wav") | |
if path.exists(): | |
path_to_transcript[str(path)] = file_info["transcript"] | |
return path_to_transcript | |
def build_path_to_transcript_dict_mslt_template(root, lang="en"): | |
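    """ | |
    MSLT pairs each *T0.<lang>.wav recording with a *T1.<lang>.snt transcript | |
    (UTF-16, single line); utterances containing markup such as [laughter] or | |
    <UNIN/> are skipped. | |
    """ | |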
path_to_transcript = dict() | |
for split in Path(root).glob("*"): | |
if split.is_dir(): | |
for audio_file in split.glob("*.wav"): | |
text_file = str(audio_file).replace(f"T0.{lang}.wav", f"T1.{lang}.snt") | |
with open(text_file, "r", encoding="utf-16") as f: | |
for line in f: | |
text = line.strip() # should have only one line | |
if "<" in text or "[" in text: | |
# ignore all utterances with special parts like [laughter] or <UNIN/> | |
continue | |
path_to_transcript[str(audio_file)] = text | |
break | |
return path_to_transcript | |
def build_path_to_transcript_dict_mslt_english(): | |
root = "/resources/speech/corpora/MSLT/Data/EN" | |
return build_path_to_transcript_dict_mslt_template(root, lang="en") | |
def build_path_to_transcript_dict_mslt_japanese(): | |
root = "/resources/speech/corpora/MSLT/Data/JA" | |
return build_path_to_transcript_dict_mslt_template(root, lang="jp") | |
def build_path_to_transcript_dict_mslt_chinese(): | |
root = "/resources/speech/corpora/MSLT/Data/ZH" | |
return build_path_to_transcript_dict_mslt_template(root, lang="ch") | |
def build_path_to_transcript_dict_rajasthani_hindi_speech(): | |
root = "/resources/speech/corpora/Rajasthani_Hindi_Speech/Hindi-Speech-Data" | |
path_to_transcript = dict() | |
for audio_file in Path(root).glob("*.3gp"): | |
with open(audio_file.with_suffix(".txt"), "r", encoding="utf-8") as f: | |
for line in f: # should only be one line | |
text = line.strip() | |
path_to_transcript[str(audio_file)] = text | |
return path_to_transcript | |
def build_path_to_transcript_dict_cmu_arctic(): | |
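    """ | |
    CMU ARCTIC prompts are stored in etc/txt.done.data as lines of the form | |
    ( arctic_a0001 "text" ); the file id and the quoted text are extracted. | |
    """ | |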
root = "/resources/speech/corpora/cmu_arctic" | |
path_to_transcript = dict() | |
for speaker_dir in Path(root).glob("*"): | |
if speaker_dir.is_dir(): | |
with open( | |
Path(speaker_dir, "etc", "txt.done.data"), "r", encoding="utf-8" | |
) as f: | |
for line in f: | |
                    parts = line.replace('\\"', "'").split('"') | |
                    text = parts[1] | |
                    file = parts[0].split()[-1] | |
path_to_transcript[str(Path(speaker_dir, "wav", f"{file}.wav"))] = ( | |
text | |
) | |
return path_to_transcript | |
def build_path_to_transcript_dict_sevil_tatar(): | |
root = "/resources/speech/corpora/sevil_tatar/sevil" | |
path_to_transcript = dict() | |
with open(Path(root, "metadata.jsonl"), "r", encoding="utf-8") as f: | |
for line in f: | |
meta = json.loads(line) | |
path_to_transcript[str(Path(root, meta["file"]))] = ( | |
meta["orig_text"].strip().replace("\xad", "") | |
) | |
return path_to_transcript | |
def build_path_to_transcript_dict_clartts(): | |
root = "/resources/speech/corpora/ClArTTS" | |
path_to_transcript = dict() | |
with open(Path(root, "training.txt"), "r", encoding="utf-16") as f: | |
for line in f: | |
fileid, transcript = line.strip().split("|") | |
path_to_transcript[str(Path(root, "wav", "train", f"{fileid}.wav"))] = ( | |
transcript | |
) | |
with open(Path(root, "validation.txt"), "r", encoding="utf-16") as f: | |
for line in f: | |
fileid, transcript = line.strip().split("|") | |
path_to_transcript[str(Path(root, "wav", "val", f"{fileid}.wav"))] = ( | |
transcript | |
) | |
return path_to_transcript | |
def build_path_to_transcript_dict_snow_mountain_template(root, lang): | |
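    """ | |
    Snow Mountain ships per-language csv splits under experiments/<lang>; the | |
    'path' column is rooted at data/ and is remapped onto the corpus root. | |
    """ | |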
path_to_transcript = dict() | |
for split in ["train_full", "val_full", "test_common"]: | |
with open( | |
Path(root, "experiments", lang, f"{split}.csv"), "r", encoding="utf-8" | |
) as f: | |
reader = DictReader(f, delimiter=",") | |
for row in reader: | |
path = row["path"].replace("data/", f"{root}/") | |
path_to_transcript[path] = row["sentence"].strip() | |
return path_to_transcript | |
def build_path_to_transcript_dict_snow_mountain_bhadrawahi(): | |
root = "/resources/speech/corpora/snow_mountain" | |
language = "bhadrawahi" | |
return build_path_to_transcript_dict_snow_mountain_template(root, language) | |
def build_path_to_transcript_dict_snow_mountain_bilaspuri(): | |
root = "/resources/speech/corpora/snow_mountain" | |
language = "bilaspuri" | |
return build_path_to_transcript_dict_snow_mountain_template(root, language) | |
def build_path_to_transcript_dict_snow_mountain_dogri(): | |
root = "/resources/speech/corpora/snow_mountain" | |
language = "dogri" | |
return build_path_to_transcript_dict_snow_mountain_template(root, language) | |
def build_path_to_transcript_dict_snow_mountain_gaddi(): | |
root = "/resources/speech/corpora/snow_mountain" | |
language = "gaddi" | |
return build_path_to_transcript_dict_snow_mountain_template(root, language) | |
def build_path_to_transcript_dict_snow_mountain_haryanvi(): | |
root = "/resources/speech/corpora/snow_mountain" | |
language = "haryanvi" | |
return build_path_to_transcript_dict_snow_mountain_template(root, language) | |
def build_path_to_transcript_dict_snow_mountain_hindi(): | |
root = "/resources/speech/corpora/snow_mountain" | |
language = "hindi" | |
return build_path_to_transcript_dict_snow_mountain_template(root, language) | |
def build_path_to_transcript_dict_snow_mountain_kangri(): | |
root = "/resources/speech/corpora/snow_mountain" | |
language = "kangri" | |
return build_path_to_transcript_dict_snow_mountain_template(root, language) | |
def build_path_to_transcript_dict_snow_mountain_kannada(): | |
root = "/resources/speech/corpora/snow_mountain" | |
language = "kannada" | |
return build_path_to_transcript_dict_snow_mountain_template(root, language) | |
def build_path_to_transcript_dict_snow_mountain_kulvi(): | |
root = "/resources/speech/corpora/snow_mountain" | |
language = "kulvi" | |
return build_path_to_transcript_dict_snow_mountain_template(root, language) | |
def build_path_to_transcript_dict_snow_mountain_kulvi_outer_seraji(): | |
root = "/resources/speech/corpora/snow_mountain" | |
language = "kulvi_outer_seraji" | |
return build_path_to_transcript_dict_snow_mountain_template(root, language) | |
def build_path_to_transcript_dict_snow_mountain_malayalam(): | |
root = "/resources/speech/corpora/snow_mountain" | |
language = "malayalam" | |
return build_path_to_transcript_dict_snow_mountain_template(root, language) | |
def build_path_to_transcript_dict_snow_mountain_mandeali(): | |
root = "/resources/speech/corpora/snow_mountain" | |
language = "mandeali" | |
return build_path_to_transcript_dict_snow_mountain_template(root, language) | |
def build_path_to_transcript_dict_snow_mountain_pahari_mahasui(): | |
root = "/resources/speech/corpora/snow_mountain" | |
language = "pahari_mahasui" | |
return build_path_to_transcript_dict_snow_mountain_template(root, language) | |
def build_path_to_transcript_dict_snow_mountain_tamil(): | |
root = "/resources/speech/corpora/snow_mountain" | |
language = "tamil" | |
return build_path_to_transcript_dict_snow_mountain_template(root, language) | |
def build_path_to_transcript_dict_snow_mountain_telugu(): | |
root = "/resources/speech/corpora/snow_mountain" | |
language = "telugu" | |
return build_path_to_transcript_dict_snow_mountain_template(root, language) | |
def build_path_to_transcript_dict_ukrainian_lada(): | |
root = "/resources/speech/corpora/ukrainian_lada/dataset_lada/accept" | |
path_to_transcript = dict() | |
with open(Path(root, "metadata.jsonl"), "r", encoding="utf-8") as f: | |
for line in f: | |
meta = json.loads(line) | |
path_to_transcript[str(Path(root, meta["file"]).with_suffix(".wav"))] = ( | |
meta["orig_text"].strip().replace("\xad", "") | |
) | |
return path_to_transcript | |
def build_path_to_transcript_dict_m_ailabs_template(root): | |
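    """ | |
    M-AILABS metadata.csv files sit either directly in a speaker directory | |
    (next to wavs/) or one level deeper per session; both layouts are handled. | |
    """ | |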
path_to_transcript = dict() | |
for gender_dir in Path(root).glob("*"): | |
if not gender_dir.is_dir(): | |
continue | |
for speaker_dir in gender_dir.glob("*"): | |
if not speaker_dir.is_dir(): | |
continue | |
if (speaker_dir / "wavs").exists(): | |
with open( | |
Path(speaker_dir, "metadata.csv"), "r", encoding="utf-8" | |
) as f: | |
for line in f: | |
fileid, text, text_norm = line.strip().split("|") | |
path = Path(speaker_dir, "wavs", f"{fileid}.wav") | |
if path.exists(): | |
path_to_transcript[str(path)] = text_norm | |
else: | |
for session_dir in speaker_dir.glob("*"): | |
if not session_dir.is_dir(): | |
continue | |
with open( | |
Path(session_dir, "metadata.csv"), "r", encoding="utf-8" | |
) as f: | |
for line in f: | |
fileid, text, text_norm = line.strip().split("|") | |
path = Path(session_dir, "wavs", f"{fileid}.wav") | |
if path.exists(): | |
path_to_transcript[str(path)] = text_norm | |
return path_to_transcript | |
def build_path_to_transcript_dict_m_ailabs_german(): | |
root = "/resources/speech/corpora/m-ailabs-speech/de_DE" | |
return build_path_to_transcript_dict_m_ailabs_template(root) | |
def build_path_to_transcript_dict_m_ailabs_uk_english(): | |
root = "/resources/speech/corpora/m-ailabs-speech/en_UK" | |
return build_path_to_transcript_dict_m_ailabs_template(root) | |
def build_path_to_transcript_dict_m_ailabs_us_english(): | |
root = "/resources/speech/corpora/m-ailabs-speech/en_US" | |
return build_path_to_transcript_dict_m_ailabs_template(root) | |
def build_path_to_transcript_dict_m_ailabs_spanish(): | |
root = "/resources/speech/corpora/m-ailabs-speech/es_ES" | |
return build_path_to_transcript_dict_m_ailabs_template(root) | |
def build_path_to_transcript_dict_m_ailabs_french(): | |
root = "/resources/speech/corpora/m-ailabs-speech/fr_FR" | |
return build_path_to_transcript_dict_m_ailabs_template(root) | |
def build_path_to_transcript_dict_m_ailabs_italian(): | |
root = "/resources/speech/corpora/m-ailabs-speech/it_IT" | |
return build_path_to_transcript_dict_m_ailabs_template(root) | |
def build_path_to_transcript_dict_m_ailabs_polish(): | |
root = "/resources/speech/corpora/m-ailabs-speech/pl_PL" | |
return build_path_to_transcript_dict_m_ailabs_template(root) | |
def build_path_to_transcript_dict_m_ailabs_russian(): | |
root = "/resources/speech/corpora/m-ailabs-speech/ru_RU" | |
return build_path_to_transcript_dict_m_ailabs_template(root) | |
def build_path_to_transcript_dict_m_ailabs_ukrainian(): | |
root = "/resources/speech/corpora/m-ailabs-speech/uk_UK" | |
return build_path_to_transcript_dict_m_ailabs_template(root) | |
def build_path_to_transcript_dict_cml_tts_template(root): | |
path_to_transcript = dict() | |
for split in ["train", "dev", "test"]: | |
with open(Path(root, f"{split}.csv"), "r", encoding="utf-8") as f: | |
reader = DictReader(f, delimiter="|") | |
for row in reader: | |
path_to_transcript[str(Path(root, row["wav_filename"]))] = row[ | |
"transcript" | |
].strip() | |
return path_to_transcript | |
def build_path_to_transcript_dict_cml_tts_dutch(): | |
root = "/resources/speech/corpora/cml_tts/cml_tts_dataset_dutch_v0.1" | |
return build_path_to_transcript_dict_cml_tts_template(root) | |
def build_path_to_transcript_dict_cml_tts_french(): | |
root = "/resources/speech/corpora/cml_tts/cml_tts_dataset_french_v0.1" | |
return build_path_to_transcript_dict_cml_tts_template(root) | |
def build_path_to_transcript_dict_cml_tts_german(): | |
root = "/resources/speech/corpora/cml_tts/cml_tts_dataset_german_v0.1" | |
return build_path_to_transcript_dict_cml_tts_template(root) | |
def build_path_to_transcript_dict_cml_tts_italian(): | |
root = "/resources/speech/corpora/cml_tts/cml_tts_dataset_italian_v0.1" | |
return build_path_to_transcript_dict_cml_tts_template(root) | |
def build_path_to_transcript_dict_cml_tts_polish(): | |
root = "/resources/speech/corpora/cml_tts/cml_tts_dataset_polish_v0.1" | |
return build_path_to_transcript_dict_cml_tts_template(root) | |
def build_path_to_transcript_dict_cml_tts_portuguese(): | |
root = "/resources/speech/corpora/cml_tts/cml_tts_dataset_portuguese_v0.1" | |
return build_path_to_transcript_dict_cml_tts_template(root) | |
def build_path_to_transcript_dict_cml_tts_spanish(): | |
root = "/resources/speech/corpora/cml_tts/cml_tts_dataset_spanish_v0.1" | |
return build_path_to_transcript_dict_cml_tts_template(root) | |
def build_path_to_transcript_dict_mms_template( | |
lang, root="/resources/speech/corpora/mms_synthesized_bible_speech" | |
): | |
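    """ | |
    The bible text of each language is one utterance per line; line index i | |
    is assumed to correspond to bible_audios/<lang>/<i>.wav. | |
    """ | |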
path_to_transcript = dict() | |
i = 0 | |
with open(Path(root, "bible_texts", f"{lang}.txt"), "r", encoding="utf-8") as f: | |
for line in f: | |
path = Path(root, "bible_audios", lang, f"{i}.wav") | |
if path.exists(): | |
path_to_transcript[str(path)] = line.strip() | |
i += 1 | |
return path_to_transcript | |
def build_path_to_transcript_dict_shan(): | |
root = "D:\\Work\\Developer\\ShanTTS\Datasets\\asr-dataset" | |
path_to_transcript_dict = dict() | |
with open(Path(root, "metadata.csv"), mode="r", encoding="utf-8") as f: | |
reader = DictReader( | |
f, | |
delimiter=",", | |
fieldnames=[ | |
"file_name", | |
"transcription", | |
], | |
) | |
for row in reader: | |
path_to_transcript_dict[str(Path(root, row["file_name"]))] = row[ | |
"transcription" | |
].strip() | |
return path_to_transcript_dict | |
if __name__ == "__main__": | |
pass | |
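    # e.g., a quick sanity check (hypothetical; assumes the corpora are | |
    # mounted at the paths above): | |
    # print(len(build_path_to_transcript_dict_fleurs_english())) | |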