File size: 3,565 Bytes
0233e7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import argparse
import os

from torchaudio.datasets import CMUARCTIC
from tqdm import tqdm


# Contiguous utterance-index ranges of the train/valid/test partitions
# (932 / 100 / 100 utterances, 1132 in total), keyed by split name.
SPLITS = {
    name: list(range(start, stop))
    for name, start, stop in (
        ("train", 0, 932),
        ("valid", 932, 1032),
        ("test", 1032, 1132),
    )
}


def get_parser():
    """Build the command-line parser for this script.

    Returns:
        argparse.ArgumentParser: parser exposing the positional ``root``
        directory, the optional ``--dest``, ``--source``, ``--target``,
        ``--splits`` and ``--wav-root`` options, and the required
        ``--spkemb-npy-dir`` option.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "root",
        metavar="DIR",
        help="root directory containing wav files to index",
    )
    parser.add_argument(
        "--dest",
        default=".",
        type=str,
        metavar="DIR",
        help="output directory",
    )
    # --source / --target take comma-separated speaker lists (see the defaults).
    parser.add_argument(
        "--source",
        default="bdl,clb,slt,rms",
        type=str,
        help="Comma-separated source voices, chosen from slt, clb, bdl, rms.",
    )
    parser.add_argument(
        "--target",
        default="bdl,clb,slt,rms",
        type=str,
        help="Comma-separated target voices, chosen from slt, clb, bdl, rms.",
    )
    parser.add_argument(
        "--splits",
        default="932,100,100",
        type=str,
        help="Sizes of the train,valid,test splits, separated by commas.",
    )
    parser.add_argument(
        "--wav-root",
        default=None,
        type=str,
        metavar="DIR",
        help="saved waveform root directory for tsv",
    )
    parser.add_argument(
        "--spkemb-npy-dir",
        required=True,
        type=str,
        help="speaker embedding directory",
    )
    return parser

def _split_indices(splits_arg, split_names, total=1132):
    """Turn a comma-separated size string into contiguous index lists per split."""
    sizes = [int(size) for size in splits_arg.split(",")]
    if len(sizes) != len(split_names):
        raise ValueError(
            f"Expected {len(split_names)} split sizes "
            f"({','.join(split_names)}), got {len(sizes)}"
        )
    if any(size < 0 for size in sizes):
        raise ValueError(f"Split sizes must be non-negative: {sizes}")
    # Kept as an assert so the exception type matches the original check.
    assert sum(sizes) == total, f"Missing utterances: {sum(sizes)} != {total}"

    indices = {}
    start = 0
    for name, size in zip(split_names, sizes):
        indices[name] = list(range(start, start + size))
        start += size
    return indices


def _wav_relpath(dataset, utt_no):
    """Return ``<speaker dir>/<audio folder>/arctic_<utt_no>.wav`` for a dataset."""
    return os.path.join(
        os.path.basename(dataset._path), dataset._folder_audio, f"arctic_{utt_no}.wav"
    )


def _spkemb_path(spkemb_dir, dataset, utt_no):
    """Return the speaker-embedding ``.npy`` path matching one utterance."""
    name = f"{os.path.basename(dataset._path)}-{dataset._folder_audio}-arctic_{utt_no}.npy"
    return os.path.join(spkemb_dir, name)


def main(args):
    """Write ``train.tsv``, ``valid.tsv`` and ``test.tsv`` manifests of speaker pairs.

    Each manifest starts with a line holding ``args.wav_root``. Every following
    line is tab-separated: source wav path, source sample count, target wav
    path, target sample count, target speaker-embedding path. One line is
    written per (source, target) pair of distinct speakers and per utterance
    index of the split.

    Args:
        args: parsed arguments providing ``root``, ``dest``, ``source``,
            ``target``, ``splits``, ``wav_root`` and ``spkemb_npy_dir``.

    Raises:
        ValueError: if ``args.splits`` is not three non-negative integers.
        AssertionError: if the split sizes do not sum to 1132, or if the source
            and target items at the same index disagree on sample rate or
            utterance number.
    """
    dest_dir = args.dest
    # NOTE(review): when --wav-root is omitted this is None and the header line
    # of every tsv becomes the literal "None" -- confirm the intended fallback.
    wav_root = args.wav_root
    os.makedirs(dest_dir, exist_ok=True)

    source = args.source.split(",")
    target = args.target.split(",")
    spks = sorted(set(source + target))

    # "slt" is always loaded because its utterance list is the reference one:
    # some text sentences are missing for the other speakers.
    reference = CMUARCTIC(args.root, url="slt", folder_in_archive="ARCTIC", download=False)
    datasets = {}
    for spk in spks:
        if spk == "slt":
            datasets[spk] = reference
            continue
        dataset = CMUARCTIC(args.root, url=spk, folder_in_archive="ARCTIC", download=False)
        dataset._walker = list(reference._walker)
        datasets[spk] = dataset

    # Derive the partition from --splits so the option actually takes effect;
    # the default "932,100,100" reproduces the ranges stored in SPLITS.
    split_indices = _split_indices(args.splits, list(SPLITS))

    progress_desc = f"[{'-'.join(spks)}]tsv/wav/spk"
    tsv = {}
    try:
        for split in split_indices:
            tsv[split] = open(os.path.join(dest_dir, f"{split}.tsv"), "w")
            print(wav_root, file=tsv[split])

        for split, indices in split_indices.items():
            out = tsv[split]
            for i in tqdm(indices, desc=progress_desc):
                # Load each speaker's utterance at most once per index instead
                # of once per (source, target) pair.
                loaded = {}
                for src_spk in source:
                    for tgt_spk in target:
                        if src_spk == tgt_spk:
                            continue
                        for spk in (src_spk, tgt_spk):
                            if spk not in loaded:
                                loaded[spk] = datasets[spk][i]
                        # items are (wav, sample_rate, utterance, utt_no)
                        src_i = loaded[src_spk]
                        tgt_i = loaded[tgt_spk]
                        assert src_i[1] == tgt_i[1], f"{src_i[1]}-{tgt_i[1]}"
                        assert src_i[3] == tgt_i[3], f"{src_i[3]}-{tgt_i[3]}"
                        src_wav = _wav_relpath(datasets[src_spk], src_i[3])
                        src_nframes = src_i[0].shape[-1]
                        tgt_wav = _wav_relpath(datasets[tgt_spk], tgt_i[3])
                        tgt_nframes = tgt_i[0].shape[-1]
                        tgt_spkemb = _spkemb_path(
                            args.spkemb_npy_dir, datasets[tgt_spk], tgt_i[3]
                        )
                        print(
                            f"{src_wav}\t{src_nframes}\t{tgt_wav}\t{tgt_nframes}\t{tgt_spkemb}",
                            file=out,
                        )
    finally:
        # Close whatever was opened, even if a load or an assertion failed.
        for handle in tsv.values():
            handle.close()


if __name__ == "__main__":
    # Script entry point: parse the command line and generate the manifests.
    main(get_parser().parse_args())